Commit 8207649c41bf5c28a987be47d66545fa9d2994d8

Authored by Linus Torvalds

Merge branch 'akpm' (fixes from Andrew Morton)

Merge fixes from Andrew Morton:
 "9 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: softdirty: keep bit when zapping file pte
  fs/cachefiles: add missing \n to kerror conversions
  genalloc: fix device node resource counter
  drivers/rtc/rtc-efi.c: add missing module alias
  mm, slab: initialize object alignment on cache creation
  mm: softdirty: addresses before VMAs in PTE holes aren't softdirty
  ocfs2/dlm: do not get resource spinlock if lockres is new
  nilfs2: fix data loss with mmap()
  ocfs2: free vol_label in ocfs2_delete_osb()

Showing 14 changed files. Lines removed by the patches are prefixed with "-" and lines added with "+"; unchanged context is shown unmarked.

drivers/rtc/rtc-efi.c
/*
 * rtc-efi: RTC Class Driver for EFI-based systems
 *
 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
 *
 * Author: dann frazier <dannf@hp.com>
 * Based on efirtc.c by Stephane Eranian
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/time.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
#include <linux/efi.h>

#define EFI_ISDST (EFI_TIME_ADJUST_DAYLIGHT|EFI_TIME_IN_DAYLIGHT)
/*
 * EFI Epoch is 1/1/1998
 */
#define EFI_RTC_EPOCH 1998

/*
 * returns day of the year [0-365]
 */
static inline int
compute_yday(efi_time_t *eft)
{
	/* efi_time_t.month is in the [1-12] so, we need -1 */
	return rtc_year_days(eft->day, eft->month - 1, eft->year);
}
/*
 * returns day of the week [0-6] 0=Sunday
 *
 * Don't try to provide a year that's before 1998, please !
 */
static int
compute_wday(efi_time_t *eft)
{
	int y;
	int ndays = 0;

	if (eft->year < EFI_RTC_EPOCH) {
		pr_err("EFI year < " __stringify(EFI_RTC_EPOCH) ", invalid date\n");
		return -1;
	}

	for (y = EFI_RTC_EPOCH; y < eft->year; y++)
		ndays += 365 + (is_leap_year(y) ? 1 : 0);

	ndays += compute_yday(eft);

	/*
	 * 4=1/1/1998 was a Thursday
	 */
	return (ndays + 4) % 7;
}

static void
convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft)
{
	eft->year = wtime->tm_year + 1900;
	eft->month = wtime->tm_mon + 1;
	eft->day = wtime->tm_mday;
	eft->hour = wtime->tm_hour;
	eft->minute = wtime->tm_min;
	eft->second = wtime->tm_sec;
	eft->nanosecond = 0;
	eft->daylight = wtime->tm_isdst ? EFI_ISDST : 0;
	eft->timezone = EFI_UNSPECIFIED_TIMEZONE;
}

static bool
convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime)
{
	memset(wtime, 0, sizeof(*wtime));

	if (eft->second >= 60)
		return false;
	wtime->tm_sec = eft->second;

	if (eft->minute >= 60)
		return false;
	wtime->tm_min = eft->minute;

	if (eft->hour >= 24)
		return false;
	wtime->tm_hour = eft->hour;

	if (!eft->day || eft->day > 31)
		return false;
	wtime->tm_mday = eft->day;

	if (!eft->month || eft->month > 12)
		return false;
	wtime->tm_mon = eft->month - 1;
	wtime->tm_year = eft->year - 1900;

	/* day of the week [0-6], Sunday=0 */
	wtime->tm_wday = compute_wday(eft);
	if (wtime->tm_wday < 0)
		return false;

	/* day in the year [1-365]*/
	wtime->tm_yday = compute_yday(eft);


	switch (eft->daylight & EFI_ISDST) {
	case EFI_ISDST:
		wtime->tm_isdst = 1;
		break;
	case EFI_TIME_ADJUST_DAYLIGHT:
		wtime->tm_isdst = 0;
		break;
	default:
		wtime->tm_isdst = -1;
	}

	return true;
}

static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	/*
	 * As of EFI v1.10, this call always returns an unsupported status
	 */
	status = efi.get_wakeup_time((efi_bool_t *)&wkalrm->enabled,
				     (efi_bool_t *)&wkalrm->pending, &eft);

	if (status != EFI_SUCCESS)
		return -EINVAL;

	if (!convert_from_efi_time(&eft, &wkalrm->time))
		return -EIO;

	return rtc_valid_tm(&wkalrm->time);
}

static int efi_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	convert_to_efi_time(&wkalrm->time, &eft);

	/*
	 * XXX Fixme:
	 * As of EFI 0.92 with the firmware I have on my
	 * machine this call does not seem to work quite
	 * right
	 *
	 * As of v1.10, this call always returns an unsupported status
	 */
	status = efi.set_wakeup_time((efi_bool_t)wkalrm->enabled, &eft);

	dev_warn(dev, "write status is %d\n", (int)status);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static int efi_read_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;
	efi_time_cap_t cap;

	status = efi.get_time(&eft, &cap);

	if (status != EFI_SUCCESS) {
		/* should never happen */
		dev_err(dev, "can't read time\n");
		return -EINVAL;
	}

	if (!convert_from_efi_time(&eft, tm))
		return -EIO;

	return rtc_valid_tm(tm);
}

static int efi_set_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;

	convert_to_efi_time(tm, &eft);

	status = efi.set_time(&eft);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static const struct rtc_class_ops efi_rtc_ops = {
	.read_time = efi_read_time,
	.set_time = efi_set_time,
	.read_alarm = efi_read_alarm,
	.set_alarm = efi_set_alarm,
};

static int __init efi_rtc_probe(struct platform_device *dev)
{
	struct rtc_device *rtc;

	rtc = devm_rtc_device_register(&dev->dev, "rtc-efi", &efi_rtc_ops,
				       THIS_MODULE);
	if (IS_ERR(rtc))
		return PTR_ERR(rtc);

	platform_set_drvdata(dev, rtc);

	return 0;
}

static struct platform_driver efi_rtc_driver = {
	.driver = {
		.name = "rtc-efi",
		.owner = THIS_MODULE,
	},
};

module_platform_driver_probe(efi_rtc_driver, efi_rtc_probe);

+MODULE_ALIAS("platform:rtc-efi");
MODULE_AUTHOR("dann frazier <dannf@hp.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("EFI RTC driver");
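The only change above is the added MODULE_ALIAS() line. When rtc-efi is built as a module, the platform bus describes its device to userspace as MODALIAS=platform:rtc-efi; without a matching alias compiled into the module, modprobe cannot resolve that string and the driver is never loaded automatically. The sketch below shows the device side of that match under stated assumptions: the actual registration site for the "rtc-efi" platform device is not part of this diff, and the function name is illustrative only.

#include <linux/init.h>
#include <linux/efi.h>
#include <linux/platform_device.h>

/*
 * Illustrative only: a platform device whose name, "rtc-efi", is what the
 * platform bus turns into the MODALIAS=platform:rtc-efi uevent that the new
 * MODULE_ALIAS("platform:rtc-efi") line satisfies.
 */
static struct platform_device rtc_efi_dev = {
	.name = "rtc-efi",
	.id = -1,
};

static int __init rtc_efi_dev_init(void)
{
	/* Only meaningful when EFI runtime services are available. */
	if (!efi_enabled(EFI_RUNTIME_SERVICES))
		return -ENODEV;
	return platform_device_register(&rtc_efi_dev);
}
device_initcall(rtc_efi_dev_init);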
fs/cachefiles/bind.c
/* Bind and unbind a cache from the filesystem backing it
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include "internal.h"

static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);

/*
 * bind a directory as a cache
 */
int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
{
	_enter("{%u,%u,%u,%u,%u,%u},%s",
	       cache->frun_percent,
	       cache->fcull_percent,
	       cache->fstop_percent,
	       cache->brun_percent,
	       cache->bcull_percent,
	       cache->bstop_percent,
	       args);

	/* start by checking things over */
	ASSERT(cache->fstop_percent >= 0 &&
	       cache->fstop_percent < cache->fcull_percent &&
	       cache->fcull_percent < cache->frun_percent &&
	       cache->frun_percent < 100);

	ASSERT(cache->bstop_percent >= 0 &&
	       cache->bstop_percent < cache->bcull_percent &&
	       cache->bcull_percent < cache->brun_percent &&
	       cache->brun_percent < 100);

	if (*args) {
-		pr_err("'bind' command doesn't take an argument");
+		pr_err("'bind' command doesn't take an argument\n");
		return -EINVAL;
	}

	if (!cache->rootdirname) {
-		pr_err("No cache directory specified");
+		pr_err("No cache directory specified\n");
		return -EINVAL;
	}

	/* don't permit already bound caches to be re-bound */
	if (test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("Cache already bound");
+		pr_err("Cache already bound\n");
		return -EBUSY;
	}

	/* make sure we have copies of the tag and dirname strings */
	if (!cache->tag) {
		/* the tag string is released by the fops->release()
		 * function, so we don't release it on error here */
		cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
		if (!cache->tag)
			return -ENOMEM;
	}

	/* add the cache */
	return cachefiles_daemon_add_cache(cache);
}

/*
 * add a cache
 */
static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
{
	struct cachefiles_object *fsdef;
	struct path path;
	struct kstatfs stats;
	struct dentry *graveyard, *cachedir, *root;
	const struct cred *saved_cred;
	int ret;

	_enter("");

	/* we want to work under the module's security ID */
	ret = cachefiles_get_security_ID(cache);
	if (ret < 0)
		return ret;

	cachefiles_begin_secure(cache, &saved_cred);

	/* allocate the root index object */
	ret = -ENOMEM;

	fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
	if (!fsdef)
		goto error_root_object;

	ASSERTCMP(fsdef->backer, ==, NULL);

	atomic_set(&fsdef->usage, 1);
	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;

	_debug("- fsdef %p", fsdef);

	/* look up the directory at the root of the cache */
	ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
	if (ret < 0)
		goto error_open_root;

	cache->mnt = path.mnt;
	root = path.dentry;

	/* check parameters */
	ret = -EOPNOTSUPP;
	if (!root->d_inode ||
	    !root->d_inode->i_op->lookup ||
	    !root->d_inode->i_op->mkdir ||
	    !root->d_inode->i_op->setxattr ||
	    !root->d_inode->i_op->getxattr ||
	    !root->d_sb->s_op->statfs ||
	    !root->d_sb->s_op->sync_fs)
		goto error_unsupported;

	ret = -EROFS;
	if (root->d_sb->s_flags & MS_RDONLY)
		goto error_unsupported;

	/* determine the security of the on-disk cache as this governs
	 * security ID of files we create */
	ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
	if (ret < 0)
		goto error_unsupported;

	/* get the cache size and blocksize */
	ret = vfs_statfs(&path, &stats);
	if (ret < 0)
		goto error_unsupported;

	ret = -ERANGE;
	if (stats.f_bsize <= 0)
		goto error_unsupported;

	ret = -EOPNOTSUPP;
	if (stats.f_bsize > PAGE_SIZE)
		goto error_unsupported;

	cache->bsize = stats.f_bsize;
	cache->bshift = 0;
	if (stats.f_bsize < PAGE_SIZE)
		cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);

	_debug("blksize %u (shift %u)",
	       cache->bsize, cache->bshift);

	_debug("size %llu, avail %llu",
	       (unsigned long long) stats.f_blocks,
	       (unsigned long long) stats.f_bavail);

	/* set up caching limits */
	do_div(stats.f_files, 100);
	cache->fstop = stats.f_files * cache->fstop_percent;
	cache->fcull = stats.f_files * cache->fcull_percent;
	cache->frun = stats.f_files * cache->frun_percent;

	_debug("limits {%llu,%llu,%llu} files",
	       (unsigned long long) cache->frun,
	       (unsigned long long) cache->fcull,
	       (unsigned long long) cache->fstop);

	stats.f_blocks >>= cache->bshift;
	do_div(stats.f_blocks, 100);
	cache->bstop = stats.f_blocks * cache->bstop_percent;
	cache->bcull = stats.f_blocks * cache->bcull_percent;
	cache->brun = stats.f_blocks * cache->brun_percent;

	_debug("limits {%llu,%llu,%llu} blocks",
	       (unsigned long long) cache->brun,
	       (unsigned long long) cache->bcull,
	       (unsigned long long) cache->bstop);

	/* get the cache directory and check its type */
	cachedir = cachefiles_get_directory(cache, root, "cache");
	if (IS_ERR(cachedir)) {
		ret = PTR_ERR(cachedir);
		goto error_unsupported;
	}

	fsdef->dentry = cachedir;
	fsdef->fscache.cookie = NULL;

	ret = cachefiles_check_object_type(fsdef);
	if (ret < 0)
		goto error_unsupported;

	/* get the graveyard directory */
	graveyard = cachefiles_get_directory(cache, root, "graveyard");
	if (IS_ERR(graveyard)) {
		ret = PTR_ERR(graveyard);
		goto error_unsupported;
	}

	cache->graveyard = graveyard;

	/* publish the cache */
	fscache_init_cache(&cache->cache,
			   &cachefiles_cache_ops,
			   "%s",
			   fsdef->dentry->d_sb->s_id);

	fscache_object_init(&fsdef->fscache, NULL, &cache->cache);

	ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
	if (ret < 0)
		goto error_add_cache;

	/* done */
	set_bit(CACHEFILES_READY, &cache->flags);
	dput(root);

	pr_info("File cache on %s registered\n", cache->cache.identifier);

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);
	cachefiles_end_secure(cache, saved_cred);
	return 0;

error_add_cache:
	dput(cache->graveyard);
	cache->graveyard = NULL;
error_unsupported:
	mntput(cache->mnt);
	cache->mnt = NULL;
	dput(fsdef->dentry);
	fsdef->dentry = NULL;
	dput(root);
error_open_root:
	kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
	cachefiles_end_secure(cache, saved_cred);
-	pr_err("Failed to register: %d", ret);
+	pr_err("Failed to register: %d\n", ret);
	return ret;
}

/*
 * unbind a cache on fd release
 */
void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
{
	_enter("");

	if (test_bit(CACHEFILES_READY, &cache->flags)) {
		pr_info("File cache on %s unregistering\n",
			cache->cache.identifier);

		fscache_withdraw_cache(&cache->cache);
	}

	dput(cache->graveyard);
	mntput(cache->mnt);

	kfree(cache->rootdirname);
	kfree(cache->secctx);
	kfree(cache->tag);

	_leave("");
}
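Every change in this file, and in daemon.c below, restores the trailing "\n" that was dropped when the subsystem's old kerror() wrapper (which appended its own newline) was converted to pr_err(). Without the newline, printk does not terminate the log line and the next message can end up appended to it. A minimal sketch of the convention these fixes restore, assuming the "CacheFiles: " prefix that the subsystem sets via pr_fmt() in its internal.h; the helper name below is made up for illustration:

/* Assumed prefix; in the real code it comes from "internal.h". */
#define pr_fmt(fmt) "CacheFiles: " fmt

#include <linux/printk.h>

static void cachefiles_report_bind_error(void)
{
	/*
	 * Logs "CacheFiles: Cache already bound" as one complete line;
	 * the explicit \n is what these patches put back.
	 */
	pr_err("Cache already bound\n");
}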
fs/cachefiles/daemon.c
/* Daemon interface
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/fs_struct.h>
#include "internal.h"

static int cachefiles_daemon_open(struct inode *, struct file *);
static int cachefiles_daemon_release(struct inode *, struct file *);
static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
				      loff_t *);
static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
				       size_t, loff_t *);
static unsigned int cachefiles_daemon_poll(struct file *,
					   struct poll_table_struct *);
static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);

static unsigned long cachefiles_open;

const struct file_operations cachefiles_daemon_fops = {
	.owner = THIS_MODULE,
	.open = cachefiles_daemon_open,
	.release = cachefiles_daemon_release,
	.read = cachefiles_daemon_read,
	.write = cachefiles_daemon_write,
	.poll = cachefiles_daemon_poll,
	.llseek = noop_llseek,
};

struct cachefiles_daemon_cmd {
	char name[8];
	int (*handler)(struct cachefiles_cache *cache, char *args);
};

static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
	{ "bind", cachefiles_daemon_bind },
	{ "brun", cachefiles_daemon_brun },
	{ "bcull", cachefiles_daemon_bcull },
	{ "bstop", cachefiles_daemon_bstop },
	{ "cull", cachefiles_daemon_cull },
	{ "debug", cachefiles_daemon_debug },
	{ "dir", cachefiles_daemon_dir },
	{ "frun", cachefiles_daemon_frun },
	{ "fcull", cachefiles_daemon_fcull },
	{ "fstop", cachefiles_daemon_fstop },
	{ "inuse", cachefiles_daemon_inuse },
	{ "secctx", cachefiles_daemon_secctx },
	{ "tag", cachefiles_daemon_tag },
	{ "", NULL }
};


/*
 * do various checks
 */
static int cachefiles_daemon_open(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache;

	_enter("");

	/* only the superuser may do this */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* the cachefiles device may only be open once at a time */
	if (xchg(&cachefiles_open, 1) == 1)
		return -EBUSY;

	/* allocate a cache record */
	cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
	if (!cache) {
		cachefiles_open = 0;
		return -ENOMEM;
	}

	mutex_init(&cache->daemon_mutex);
	cache->active_nodes = RB_ROOT;
	rwlock_init(&cache->active_lock);
	init_waitqueue_head(&cache->daemon_pollwq);

	/* set default caching limits
	 * - limit at 1% free space and/or free files
	 * - cull below 5% free space and/or free files
	 * - cease culling above 7% free space and/or free files
	 */
	cache->frun_percent = 7;
	cache->fcull_percent = 5;
	cache->fstop_percent = 1;
	cache->brun_percent = 7;
	cache->bcull_percent = 5;
	cache->bstop_percent = 1;

	file->private_data = cache;
	cache->cachefilesd = file;
	return 0;
}

/*
 * release a cache
 */
static int cachefiles_daemon_release(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache = file->private_data;

	_enter("");

	ASSERT(cache);

	set_bit(CACHEFILES_DEAD, &cache->flags);

	cachefiles_daemon_unbind(cache);

	ASSERT(!cache->active_nodes.rb_node);

	/* clean up the control file interface */
	cache->cachefilesd = NULL;
	file->private_data = NULL;
	cachefiles_open = 0;

	kfree(cache);

	_leave("");
	return 0;
}

/*
 * read the cache state
 */
static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
				      size_t buflen, loff_t *pos)
{
	struct cachefiles_cache *cache = file->private_data;
	char buffer[256];
	int n;

	//_enter(",,%zu,", buflen);

	if (!test_bit(CACHEFILES_READY, &cache->flags))
		return 0;

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);

	/* summarise */
	clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);

	n = snprintf(buffer, sizeof(buffer),
		     "cull=%c"
		     " frun=%llx"
		     " fcull=%llx"
		     " fstop=%llx"
		     " brun=%llx"
		     " bcull=%llx"
		     " bstop=%llx",
		     test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
		     (unsigned long long) cache->frun,
		     (unsigned long long) cache->fcull,
		     (unsigned long long) cache->fstop,
		     (unsigned long long) cache->brun,
		     (unsigned long long) cache->bcull,
		     (unsigned long long) cache->bstop
		     );

	if (n > buflen)
		return -EMSGSIZE;

	if (copy_to_user(_buffer, buffer, n) != 0)
		return -EFAULT;

	return n;
}

/*
 * command the cache
 */
static ssize_t cachefiles_daemon_write(struct file *file,
				       const char __user *_data,
				       size_t datalen,
				       loff_t *pos)
{
	const struct cachefiles_daemon_cmd *cmd;
	struct cachefiles_cache *cache = file->private_data;
	ssize_t ret;
	char *data, *args, *cp;

	//_enter(",,%zu,", datalen);

	ASSERT(cache);

	if (test_bit(CACHEFILES_DEAD, &cache->flags))
		return -EIO;

	if (datalen < 0 || datalen > PAGE_SIZE - 1)
		return -EOPNOTSUPP;

	/* drag the command string into the kernel so we can parse it */
	data = kmalloc(datalen + 1, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(data, _data, datalen) != 0)
		goto error;

	data[datalen] = '\0';

	ret = -EINVAL;
	if (memchr(data, '\0', datalen))
		goto error;

	/* strip any newline */
	cp = memchr(data, '\n', datalen);
	if (cp) {
		if (cp == data)
			goto error;

		*cp = '\0';
	}

	/* parse the command */
	ret = -EOPNOTSUPP;

	for (args = data; *args; args++)
		if (isspace(*args))
			break;
	if (*args) {
		if (args == data)
			goto error;
		*args = '\0';
		args = skip_spaces(++args);
	}

	/* run the appropriate command handler */
	for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
		if (strcmp(cmd->name, data) == 0)
			goto found_command;

error:
	kfree(data);
	//_leave(" = %zd", ret);
	return ret;

found_command:
	mutex_lock(&cache->daemon_mutex);

	ret = -EIO;
	if (!test_bit(CACHEFILES_DEAD, &cache->flags))
		ret = cmd->handler(cache, args);

	mutex_unlock(&cache->daemon_mutex);

	if (ret == 0)
		ret = datalen;
	goto error;
}

/*
 * poll for culling state
 * - use POLLOUT to indicate culling state
 */
static unsigned int cachefiles_daemon_poll(struct file *file,
					   struct poll_table_struct *poll)
{
	struct cachefiles_cache *cache = file->private_data;
	unsigned int mask;

	poll_wait(file, &cache->daemon_pollwq, poll);
	mask = 0;

	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
		mask |= POLLIN;

	if (test_bit(CACHEFILES_CULLING, &cache->flags))
		mask |= POLLOUT;

	return mask;
}

/*
 * give a range error for cache space constraints
 * - can be tail-called
 */
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
					 char *args)
{
-	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");

	return -EINVAL;
}

/*
 * set the percentage of files at which to stop culling
 * - command: "frun <N>%"
 */
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
{
	unsigned long frun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	frun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (frun <= cache->fcull_percent || frun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->frun_percent = frun;
	return 0;
}

/*
 * set the percentage of files at which to start culling
 * - command: "fcull <N>%"
 */
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long fcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fcull_percent = fcull;
	return 0;
}

/*
 * set the percentage of files at which to stop allocating
 * - command: "fstop <N>%"
 */
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long fstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fstop < 0 || fstop >= cache->fcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fstop_percent = fstop;
	return 0;
}

/*
 * set the percentage of blocks at which to stop culling
 * - command: "brun <N>%"
 */
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
{
	unsigned long brun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	brun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (brun <= cache->bcull_percent || brun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->brun_percent = brun;
	return 0;
}

/*
 * set the percentage of blocks at which to start culling
 * - command: "bcull <N>%"
 */
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long bcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bcull_percent = bcull;
	return 0;
}

/*
 * set the percentage of blocks at which to stop allocating
 * - command: "bstop <N>%"
 */
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long bstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bstop < 0 || bstop >= cache->bcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bstop_percent = bstop;
	return 0;
}

/*
 * set the cache directory
 * - command: "dir <name>"
 */
static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
{
	char *dir;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty directory specified");
+		pr_err("Empty directory specified\n");
		return -EINVAL;
	}

	if (cache->rootdirname) {
-		pr_err("Second cache directory specified");
+		pr_err("Second cache directory specified\n");
		return -EEXIST;
	}

	dir = kstrdup(args, GFP_KERNEL);
	if (!dir)
		return -ENOMEM;

	cache->rootdirname = dir;
	return 0;
}

/*
 * set the cache security context
 * - command: "secctx <ctx>"
 */
static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
{
	char *secctx;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty security context specified");
+		pr_err("Empty security context specified\n");
		return -EINVAL;
	}

	if (cache->secctx) {
-		pr_err("Second security context specified");
+		pr_err("Second security context specified\n");
		return -EINVAL;
	}

	secctx = kstrdup(args, GFP_KERNEL);
	if (!secctx)
		return -ENOMEM;

	cache->secctx = secctx;
	return 0;
}

/*
 * set the cache tag
 * - command: "tag <name>"
 */
static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
{
	char *tag;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty tag specified");
+		pr_err("Empty tag specified\n");
		return -EINVAL;
	}

	if (cache->tag)
		return -EEXIST;

	tag = kstrdup(args, GFP_KERNEL);
	if (!tag)
		return -ENOMEM;

	cache->tag = tag;
	return 0;
}

/*
 * request a node in the cache be culled from the current working directory
 * - command: "cull <name>"
 */
static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
{
	struct path path;
	const struct cred *saved_cred;
	int ret;

	_enter(",%s", args);

	if (strchr(args, '/'))
		goto inval;

	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("cull applied to unready cache");
+		pr_err("cull applied to unready cache\n");
		return -EIO;
	}

	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("cull applied to dead cache");
+		pr_err("cull applied to dead cache\n");
		return -EIO;
	}

	/* extract the directory dentry from the cwd */
	get_fs_pwd(current->fs, &path);

	if (!S_ISDIR(path.dentry->d_inode->i_mode))
		goto notdir;

	cachefiles_begin_secure(cache, &saved_cred);
	ret = cachefiles_cull(cache, path.dentry, args);
	cachefiles_end_secure(cache, saved_cred);

	path_put(&path);
	_leave(" = %d", ret);
	return ret;

notdir:
	path_put(&path);
-	pr_err("cull command requires dirfd to be a directory");
+	pr_err("cull command requires dirfd to be a directory\n");
	return -ENOTDIR;

inval:
-	pr_err("cull command requires dirfd and filename");
+	pr_err("cull command requires dirfd and filename\n");
	return -EINVAL;
}

/*
 * set debugging mode
 * - command: "debug <mask>"
 */
static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
{
	unsigned long mask;

	_enter(",%s", args);

	mask = simple_strtoul(args, &args, 0);
	if (args[0] != '\0')
		goto inval;

	cachefiles_debug = mask;
	_leave(" = 0");
	return 0;

inval:
-	pr_err("debug command requires mask");
+	pr_err("debug command requires mask\n");
	return -EINVAL;
}

/*
 * find out whether an object in the current working directory is in use or not
 * - command: "inuse <name>"
 */
static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
{
	struct path path;
	const struct cred *saved_cred;
	int ret;

	//_enter(",%s", args);

	if (strchr(args, '/'))
		goto inval;

	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("inuse applied to unready cache");
+		pr_err("inuse applied to unready cache\n");
		return -EIO;
	}

	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
-		pr_err("inuse applied to dead cache");
+		pr_err("inuse applied to dead cache\n");
		return -EIO;
	}

	/* extract the directory dentry from the cwd */
	get_fs_pwd(current->fs, &path);

	if (!S_ISDIR(path.dentry->d_inode->i_mode))
		goto notdir;

	cachefiles_begin_secure(cache, &saved_cred);
	ret = cachefiles_check_in_use(cache, path.dentry, args);
	cachefiles_end_secure(cache, saved_cred);

	path_put(&path);
	//_leave(" = %d", ret);
	return ret;

notdir:
	path_put(&path);
662 pr_err("inuse command requires dirfd to be a directory"); 662 pr_err("inuse command requires dirfd to be a directory\n");
663 return -ENOTDIR; 663 return -ENOTDIR;
664 664
665 inval: 665 inval:
666 pr_err("inuse command requires dirfd and filename"); 666 pr_err("inuse command requires dirfd and filename\n");
667 return -EINVAL; 667 return -EINVAL;
668 } 668 }
669 669
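The five command handlers above (secctx, tag, cull, debug, inuse) all follow the same shape: validate a single argument, then update the cache state, with their pr_err() strings now carrying an explicit "\n". They are driven from user space, normally by the cachefilesd daemon, which writes command strings to the cachefiles character device registered in main.c further down. The sketch below is illustrative only: the /dev/cachefiles path follows from that misc device name, but the cache directory, the object name and the one-command-per-write assumption are example choices, not taken from this patch.

/*
 * Illustrative user-space sketch, not part of this patch: how a daemon such
 * as cachefilesd might drive the handlers above.  The cache directory and
 * object name below are invented for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int cachefiles_command(int fd, const char *cmd)
{
        if (write(fd, cmd, strlen(cmd)) < 0) {   /* assume one command per write */
                perror(cmd);
                return -1;
        }
        return 0;
}

int main(void)
{
        int fd = open("/dev/cachefiles", O_RDWR);

        if (fd < 0) {
                perror("open /dev/cachefiles");
                return 1;
        }

        /* cull and inuse resolve the name against the writer's cwd */
        if (chdir("/var/cache/fscache/cache") < 0) {   /* example path */
                perror("chdir");
                return 1;
        }

        cachefiles_command(fd, "debug 7");                 /* KENTER|KLEAVE|KDEBUG */
        cachefiles_command(fd, "inuse Es0g00og0_Nd_XCYe"); /* made-up name */
        cachefiles_command(fd, "cull Es0g00og0_Nd_XCYe");

        close(fd);
        return 0;
}

The bare filenames work because, as cachefiles_daemon_cull() and cachefiles_daemon_inuse() show, the target is looked up under the caller's current working directory and any '/' in the argument is rejected outright.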
670 /* 670 /*
671 * see if we have space for a number of pages and/or a number of files in the 671 * see if we have space for a number of pages and/or a number of files in the
672 * cache 672 * cache
673 */ 673 */
674 int cachefiles_has_space(struct cachefiles_cache *cache, 674 int cachefiles_has_space(struct cachefiles_cache *cache,
675 unsigned fnr, unsigned bnr) 675 unsigned fnr, unsigned bnr)
676 { 676 {
677 struct kstatfs stats; 677 struct kstatfs stats;
678 struct path path = { 678 struct path path = {
679 .mnt = cache->mnt, 679 .mnt = cache->mnt,
680 .dentry = cache->mnt->mnt_root, 680 .dentry = cache->mnt->mnt_root,
681 }; 681 };
682 int ret; 682 int ret;
683 683
684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", 684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
685 // (unsigned long long) cache->frun, 685 // (unsigned long long) cache->frun,
686 // (unsigned long long) cache->fcull, 686 // (unsigned long long) cache->fcull,
687 // (unsigned long long) cache->fstop, 687 // (unsigned long long) cache->fstop,
688 // (unsigned long long) cache->brun, 688 // (unsigned long long) cache->brun,
689 // (unsigned long long) cache->bcull, 689 // (unsigned long long) cache->bcull,
690 // (unsigned long long) cache->bstop, 690 // (unsigned long long) cache->bstop,
691 // fnr, bnr); 691 // fnr, bnr);
692 692
693 /* find out how many pages of blockdev are available */ 693 /* find out how many pages of blockdev are available */
694 memset(&stats, 0, sizeof(stats)); 694 memset(&stats, 0, sizeof(stats));
695 695
696 ret = vfs_statfs(&path, &stats); 696 ret = vfs_statfs(&path, &stats);
697 if (ret < 0) { 697 if (ret < 0) {
698 if (ret == -EIO) 698 if (ret == -EIO)
699 cachefiles_io_error(cache, "statfs failed"); 699 cachefiles_io_error(cache, "statfs failed");
700 _leave(" = %d", ret); 700 _leave(" = %d", ret);
701 return ret; 701 return ret;
702 } 702 }
703 703
704 stats.f_bavail >>= cache->bshift; 704 stats.f_bavail >>= cache->bshift;
705 705
706 //_debug("avail %llu,%llu", 706 //_debug("avail %llu,%llu",
707 // (unsigned long long) stats.f_ffree, 707 // (unsigned long long) stats.f_ffree,
708 // (unsigned long long) stats.f_bavail); 708 // (unsigned long long) stats.f_bavail);
709 709
710 /* see if there is sufficient space */ 710 /* see if there is sufficient space */
711 if (stats.f_ffree > fnr) 711 if (stats.f_ffree > fnr)
712 stats.f_ffree -= fnr; 712 stats.f_ffree -= fnr;
713 else 713 else
714 stats.f_ffree = 0; 714 stats.f_ffree = 0;
715 715
716 if (stats.f_bavail > bnr) 716 if (stats.f_bavail > bnr)
717 stats.f_bavail -= bnr; 717 stats.f_bavail -= bnr;
718 else 718 else
719 stats.f_bavail = 0; 719 stats.f_bavail = 0;
720 720
721 ret = -ENOBUFS; 721 ret = -ENOBUFS;
722 if (stats.f_ffree < cache->fstop || 722 if (stats.f_ffree < cache->fstop ||
723 stats.f_bavail < cache->bstop) 723 stats.f_bavail < cache->bstop)
724 goto begin_cull; 724 goto begin_cull;
725 725
726 ret = 0; 726 ret = 0;
727 if (stats.f_ffree < cache->fcull || 727 if (stats.f_ffree < cache->fcull ||
728 stats.f_bavail < cache->bcull) 728 stats.f_bavail < cache->bcull)
729 goto begin_cull; 729 goto begin_cull;
730 730
731 if (test_bit(CACHEFILES_CULLING, &cache->flags) && 731 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
732 stats.f_ffree >= cache->frun && 732 stats.f_ffree >= cache->frun &&
733 stats.f_bavail >= cache->brun && 733 stats.f_bavail >= cache->brun &&
734 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) 734 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
735 ) { 735 ) {
736 _debug("cease culling"); 736 _debug("cease culling");
737 cachefiles_state_changed(cache); 737 cachefiles_state_changed(cache);
738 } 738 }
739 739
740 //_leave(" = 0"); 740 //_leave(" = 0");
741 return 0; 741 return 0;
742 742
743 begin_cull: 743 begin_cull:
744 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { 744 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
745 _debug("### CULL CACHE ###"); 745 _debug("### CULL CACHE ###");
746 cachefiles_state_changed(cache); 746 cachefiles_state_changed(cache);
747 } 747 }
748 748
749 _leave(" = %d", ret); 749 _leave(" = %d", ret);
750 return ret; 750 return ret;
751 } 751 }
752 752
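cachefiles_has_space() above applies three pairs of thresholds to the post-allocation free counts obtained from vfs_statfs(): below fstop/bstop the request fails with -ENOBUFS and culling is triggered, below fcull/bcull the request still succeeds but culling is triggered, and once both counts are back at or above frun/brun an engaged cull is ceased. A user-space restatement of that decision for a single resource, with a plain bool standing in for the CACHEFILES_CULLING bit, might look like this (illustration only, not kernel code):

/*
 * Simplified restatement of the decision in cachefiles_has_space() for a
 * single resource (files); blocks are handled the same way.
 */
#include <errno.h>
#include <stdbool.h>

struct space_limits {
        unsigned long long frun;   /* stop culling at or above this */
        unsigned long long fcull;  /* start culling below this */
        unsigned long long fstop;  /* refuse new allocations below this */
};

int has_space(const struct space_limits *lim, bool *culling,
              unsigned long long free_now, unsigned long long wanted)
{
        unsigned long long left = free_now > wanted ? free_now - wanted : 0;

        if (left < lim->fstop) {            /* hard stop: -ENOBUFS, begin culling */
                *culling = true;
                return -ENOBUFS;
        }
        if (left < lim->fcull) {            /* allocation ok, but begin culling */
                *culling = true;
                return 0;
        }
        if (*culling && left >= lim->frun)  /* recovered: cease culling */
                *culling = false;
        return 0;
}

In the kernel function the file and block checks are combined with '||', so either resource crossing its stop or cull threshold is enough to take the corresponding branch, and culling only ceases once both f_ffree >= frun and f_bavail >= brun.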
fs/cachefiles/internal.h
1 /* General netfs cache on cache files internal defs 1 /* General netfs cache on cache files internal defs
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #ifdef pr_fmt 12 #ifdef pr_fmt
13 #undef pr_fmt 13 #undef pr_fmt
14 #endif 14 #endif
15 15
16 #define pr_fmt(fmt) "CacheFiles: " fmt 16 #define pr_fmt(fmt) "CacheFiles: " fmt
17 17
18 18
19 #include <linux/fscache-cache.h> 19 #include <linux/fscache-cache.h>
20 #include <linux/timer.h> 20 #include <linux/timer.h>
21 #include <linux/wait.h> 21 #include <linux/wait.h>
22 #include <linux/workqueue.h> 22 #include <linux/workqueue.h>
23 #include <linux/security.h> 23 #include <linux/security.h>
24 24
25 struct cachefiles_cache; 25 struct cachefiles_cache;
26 struct cachefiles_object; 26 struct cachefiles_object;
27 27
28 extern unsigned cachefiles_debug; 28 extern unsigned cachefiles_debug;
29 #define CACHEFILES_DEBUG_KENTER 1 29 #define CACHEFILES_DEBUG_KENTER 1
30 #define CACHEFILES_DEBUG_KLEAVE 2 30 #define CACHEFILES_DEBUG_KLEAVE 2
31 #define CACHEFILES_DEBUG_KDEBUG 4 31 #define CACHEFILES_DEBUG_KDEBUG 4
32 32
33 #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) 33 #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
34 34
35 /* 35 /*
36 * node records 36 * node records
37 */ 37 */
38 struct cachefiles_object { 38 struct cachefiles_object {
39 struct fscache_object fscache; /* fscache handle */ 39 struct fscache_object fscache; /* fscache handle */
40 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ 40 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
41 struct dentry *dentry; /* the file/dir representing this object */ 41 struct dentry *dentry; /* the file/dir representing this object */
42 struct dentry *backer; /* backing file */ 42 struct dentry *backer; /* backing file */
43 loff_t i_size; /* object size */ 43 loff_t i_size; /* object size */
44 unsigned long flags; 44 unsigned long flags;
45 #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 45 #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
46 #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ 46 #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
47 atomic_t usage; /* object usage count */ 47 atomic_t usage; /* object usage count */
48 uint8_t type; /* object type */ 48 uint8_t type; /* object type */
49 uint8_t new; /* T if object new */ 49 uint8_t new; /* T if object new */
50 spinlock_t work_lock; 50 spinlock_t work_lock;
51 struct rb_node active_node; /* link in active tree (dentry is key) */ 51 struct rb_node active_node; /* link in active tree (dentry is key) */
52 }; 52 };
53 53
54 extern struct kmem_cache *cachefiles_object_jar; 54 extern struct kmem_cache *cachefiles_object_jar;
55 55
56 /* 56 /*
57 * Cache files cache definition 57 * Cache files cache definition
58 */ 58 */
59 struct cachefiles_cache { 59 struct cachefiles_cache {
60 struct fscache_cache cache; /* FS-Cache record */ 60 struct fscache_cache cache; /* FS-Cache record */
61 struct vfsmount *mnt; /* mountpoint holding the cache */ 61 struct vfsmount *mnt; /* mountpoint holding the cache */
62 struct dentry *graveyard; /* directory into which dead objects go */ 62 struct dentry *graveyard; /* directory into which dead objects go */
63 struct file *cachefilesd; /* manager daemon handle */ 63 struct file *cachefilesd; /* manager daemon handle */
64 const struct cred *cache_cred; /* security override for accessing cache */ 64 const struct cred *cache_cred; /* security override for accessing cache */
65 struct mutex daemon_mutex; /* command serialisation mutex */ 65 struct mutex daemon_mutex; /* command serialisation mutex */
66 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ 66 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
67 struct rb_root active_nodes; /* active nodes (can't be culled) */ 67 struct rb_root active_nodes; /* active nodes (can't be culled) */
68 rwlock_t active_lock; /* lock for active_nodes */ 68 rwlock_t active_lock; /* lock for active_nodes */
69 atomic_t gravecounter; /* graveyard uniquifier */ 69 atomic_t gravecounter; /* graveyard uniquifier */
70 unsigned frun_percent; /* when to stop culling (% files) */ 70 unsigned frun_percent; /* when to stop culling (% files) */
71 unsigned fcull_percent; /* when to start culling (% files) */ 71 unsigned fcull_percent; /* when to start culling (% files) */
72 unsigned fstop_percent; /* when to stop allocating (% files) */ 72 unsigned fstop_percent; /* when to stop allocating (% files) */
73 unsigned brun_percent; /* when to stop culling (% blocks) */ 73 unsigned brun_percent; /* when to stop culling (% blocks) */
74 unsigned bcull_percent; /* when to start culling (% blocks) */ 74 unsigned bcull_percent; /* when to start culling (% blocks) */
75 unsigned bstop_percent; /* when to stop allocating (% blocks) */ 75 unsigned bstop_percent; /* when to stop allocating (% blocks) */
76 unsigned bsize; /* cache's block size */ 76 unsigned bsize; /* cache's block size */
77 unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ 77 unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */
78 uint64_t frun; /* when to stop culling */ 78 uint64_t frun; /* when to stop culling */
79 uint64_t fcull; /* when to start culling */ 79 uint64_t fcull; /* when to start culling */
80 uint64_t fstop; /* when to stop allocating */ 80 uint64_t fstop; /* when to stop allocating */
81 sector_t brun; /* when to stop culling */ 81 sector_t brun; /* when to stop culling */
82 sector_t bcull; /* when to start culling */ 82 sector_t bcull; /* when to start culling */
83 sector_t bstop; /* when to stop allocating */ 83 sector_t bstop; /* when to stop allocating */
84 unsigned long flags; 84 unsigned long flags;
85 #define CACHEFILES_READY 0 /* T if cache prepared */ 85 #define CACHEFILES_READY 0 /* T if cache prepared */
86 #define CACHEFILES_DEAD 1 /* T if cache dead */ 86 #define CACHEFILES_DEAD 1 /* T if cache dead */
87 #define CACHEFILES_CULLING 2 /* T if cull engaged */ 87 #define CACHEFILES_CULLING 2 /* T if cull engaged */
88 #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ 88 #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
89 char *rootdirname; /* name of cache root directory */ 89 char *rootdirname; /* name of cache root directory */
90 char *secctx; /* LSM security context */ 90 char *secctx; /* LSM security context */
91 char *tag; /* cache binding tag */ 91 char *tag; /* cache binding tag */
92 }; 92 };
93 93
94 /* 94 /*
95 * backing file read tracking 95 * backing file read tracking
96 */ 96 */
97 struct cachefiles_one_read { 97 struct cachefiles_one_read {
98 wait_queue_t monitor; /* link into monitored waitqueue */ 98 wait_queue_t monitor; /* link into monitored waitqueue */
99 struct page *back_page; /* backing file page we're waiting for */ 99 struct page *back_page; /* backing file page we're waiting for */
100 struct page *netfs_page; /* netfs page we're going to fill */ 100 struct page *netfs_page; /* netfs page we're going to fill */
101 struct fscache_retrieval *op; /* retrieval op covering this */ 101 struct fscache_retrieval *op; /* retrieval op covering this */
102 struct list_head op_link; /* link in op's todo list */ 102 struct list_head op_link; /* link in op's todo list */
103 }; 103 };
104 104
105 /* 105 /*
106 * backing file write tracking 106 * backing file write tracking
107 */ 107 */
108 struct cachefiles_one_write { 108 struct cachefiles_one_write {
109 struct page *netfs_page; /* netfs page to copy */ 109 struct page *netfs_page; /* netfs page to copy */
110 struct cachefiles_object *object; 110 struct cachefiles_object *object;
111 struct list_head obj_link; /* link in object's lists */ 111 struct list_head obj_link; /* link in object's lists */
112 fscache_rw_complete_t end_io_func; 112 fscache_rw_complete_t end_io_func;
113 void *context; 113 void *context;
114 }; 114 };
115 115
116 /* 116 /*
117 * auxiliary data xattr buffer 117 * auxiliary data xattr buffer
118 */ 118 */
119 struct cachefiles_xattr { 119 struct cachefiles_xattr {
120 uint16_t len; 120 uint16_t len;
121 uint8_t type; 121 uint8_t type;
122 uint8_t data[]; 122 uint8_t data[];
123 }; 123 };
124 124
125 /* 125 /*
126 * note change of state for daemon 126 * note change of state for daemon
127 */ 127 */
128 static inline void cachefiles_state_changed(struct cachefiles_cache *cache) 128 static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
129 { 129 {
130 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); 130 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
131 wake_up_all(&cache->daemon_pollwq); 131 wake_up_all(&cache->daemon_pollwq);
132 } 132 }
133 133
134 /* 134 /*
135 * bind.c 135 * bind.c
136 */ 136 */
137 extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); 137 extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
138 extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); 138 extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
139 139
140 /* 140 /*
141 * daemon.c 141 * daemon.c
142 */ 142 */
143 extern const struct file_operations cachefiles_daemon_fops; 143 extern const struct file_operations cachefiles_daemon_fops;
144 144
145 extern int cachefiles_has_space(struct cachefiles_cache *cache, 145 extern int cachefiles_has_space(struct cachefiles_cache *cache,
146 unsigned fnr, unsigned bnr); 146 unsigned fnr, unsigned bnr);
147 147
148 /* 148 /*
149 * interface.c 149 * interface.c
150 */ 150 */
151 extern const struct fscache_cache_ops cachefiles_cache_ops; 151 extern const struct fscache_cache_ops cachefiles_cache_ops;
152 152
153 /* 153 /*
154 * key.c 154 * key.c
155 */ 155 */
156 extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); 156 extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
157 157
158 /* 158 /*
159 * namei.c 159 * namei.c
160 */ 160 */
161 extern int cachefiles_delete_object(struct cachefiles_cache *cache, 161 extern int cachefiles_delete_object(struct cachefiles_cache *cache,
162 struct cachefiles_object *object); 162 struct cachefiles_object *object);
163 extern int cachefiles_walk_to_object(struct cachefiles_object *parent, 163 extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
164 struct cachefiles_object *object, 164 struct cachefiles_object *object,
165 const char *key, 165 const char *key,
166 struct cachefiles_xattr *auxdata); 166 struct cachefiles_xattr *auxdata);
167 extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, 167 extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
168 struct dentry *dir, 168 struct dentry *dir,
169 const char *name); 169 const char *name);
170 170
171 extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, 171 extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
172 char *filename); 172 char *filename);
173 173
174 extern int cachefiles_check_in_use(struct cachefiles_cache *cache, 174 extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
175 struct dentry *dir, char *filename); 175 struct dentry *dir, char *filename);
176 176
177 /* 177 /*
178 * proc.c 178 * proc.c
179 */ 179 */
180 #ifdef CONFIG_CACHEFILES_HISTOGRAM 180 #ifdef CONFIG_CACHEFILES_HISTOGRAM
181 extern atomic_t cachefiles_lookup_histogram[HZ]; 181 extern atomic_t cachefiles_lookup_histogram[HZ];
182 extern atomic_t cachefiles_mkdir_histogram[HZ]; 182 extern atomic_t cachefiles_mkdir_histogram[HZ];
183 extern atomic_t cachefiles_create_histogram[HZ]; 183 extern atomic_t cachefiles_create_histogram[HZ];
184 184
185 extern int __init cachefiles_proc_init(void); 185 extern int __init cachefiles_proc_init(void);
186 extern void cachefiles_proc_cleanup(void); 186 extern void cachefiles_proc_cleanup(void);
187 static inline 187 static inline
188 void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) 188 void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
189 { 189 {
190 unsigned long jif = jiffies - start_jif; 190 unsigned long jif = jiffies - start_jif;
191 if (jif >= HZ) 191 if (jif >= HZ)
192 jif = HZ - 1; 192 jif = HZ - 1;
193 atomic_inc(&histogram[jif]); 193 atomic_inc(&histogram[jif]);
194 } 194 }
195 195
196 #else 196 #else
197 #define cachefiles_proc_init() (0) 197 #define cachefiles_proc_init() (0)
198 #define cachefiles_proc_cleanup() do {} while (0) 198 #define cachefiles_proc_cleanup() do {} while (0)
199 #define cachefiles_hist(hist, start_jif) do {} while (0) 199 #define cachefiles_hist(hist, start_jif) do {} while (0)
200 #endif 200 #endif
201 201
202 /* 202 /*
203 * rdwr.c 203 * rdwr.c
204 */ 204 */
205 extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, 205 extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
206 struct page *, gfp_t); 206 struct page *, gfp_t);
207 extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, 207 extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
208 struct list_head *, unsigned *, 208 struct list_head *, unsigned *,
209 gfp_t); 209 gfp_t);
210 extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, 210 extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
211 gfp_t); 211 gfp_t);
212 extern int cachefiles_allocate_pages(struct fscache_retrieval *, 212 extern int cachefiles_allocate_pages(struct fscache_retrieval *,
213 struct list_head *, unsigned *, gfp_t); 213 struct list_head *, unsigned *, gfp_t);
214 extern int cachefiles_write_page(struct fscache_storage *, struct page *); 214 extern int cachefiles_write_page(struct fscache_storage *, struct page *);
215 extern void cachefiles_uncache_page(struct fscache_object *, struct page *); 215 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
216 216
217 /* 217 /*
218 * security.c 218 * security.c
219 */ 219 */
220 extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); 220 extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
221 extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 221 extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
222 struct dentry *root, 222 struct dentry *root,
223 const struct cred **_saved_cred); 223 const struct cred **_saved_cred);
224 224
225 static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, 225 static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
226 const struct cred **_saved_cred) 226 const struct cred **_saved_cred)
227 { 227 {
228 *_saved_cred = override_creds(cache->cache_cred); 228 *_saved_cred = override_creds(cache->cache_cred);
229 } 229 }
230 230
231 static inline void cachefiles_end_secure(struct cachefiles_cache *cache, 231 static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
232 const struct cred *saved_cred) 232 const struct cred *saved_cred)
233 { 233 {
234 revert_creds(saved_cred); 234 revert_creds(saved_cred);
235 } 235 }
236 236
237 /* 237 /*
238 * xattr.c 238 * xattr.c
239 */ 239 */
240 extern int cachefiles_check_object_type(struct cachefiles_object *object); 240 extern int cachefiles_check_object_type(struct cachefiles_object *object);
241 extern int cachefiles_set_object_xattr(struct cachefiles_object *object, 241 extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
242 struct cachefiles_xattr *auxdata); 242 struct cachefiles_xattr *auxdata);
243 extern int cachefiles_update_object_xattr(struct cachefiles_object *object, 243 extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
244 struct cachefiles_xattr *auxdata); 244 struct cachefiles_xattr *auxdata);
245 extern int cachefiles_check_auxdata(struct cachefiles_object *object); 245 extern int cachefiles_check_auxdata(struct cachefiles_object *object);
246 extern int cachefiles_check_object_xattr(struct cachefiles_object *object, 246 extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
247 struct cachefiles_xattr *auxdata); 247 struct cachefiles_xattr *auxdata);
248 extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, 248 extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
249 struct dentry *dentry); 249 struct dentry *dentry);
250 250
251 251
252 /* 252 /*
253 * error handling 253 * error handling
254 */ 254 */
255 255
256 #define cachefiles_io_error(___cache, FMT, ...) \ 256 #define cachefiles_io_error(___cache, FMT, ...) \
257 do { \ 257 do { \
258 pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ 258 pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
259 fscache_io_error(&(___cache)->cache); \ 259 fscache_io_error(&(___cache)->cache); \
260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ 260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
261 } while (0) 261 } while (0)
262 262
263 #define cachefiles_io_error_obj(object, FMT, ...) \ 263 #define cachefiles_io_error_obj(object, FMT, ...) \
264 do { \ 264 do { \
265 struct cachefiles_cache *___cache; \ 265 struct cachefiles_cache *___cache; \
266 \ 266 \
267 ___cache = container_of((object)->fscache.cache, \ 267 ___cache = container_of((object)->fscache.cache, \
268 struct cachefiles_cache, cache); \ 268 struct cachefiles_cache, cache); \
269 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ 269 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
270 } while (0) 270 } while (0)
271 271
272 272
273 /* 273 /*
274 * debug tracing 274 * debug tracing
275 */ 275 */
276 #define dbgprintk(FMT, ...) \ 276 #define dbgprintk(FMT, ...) \
277 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 277 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
278 278
279 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 279 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
280 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 280 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
281 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 281 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
282 282
283 283
284 #if defined(__KDEBUG) 284 #if defined(__KDEBUG)
285 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) 285 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
286 #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) 286 #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
287 #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) 287 #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
288 288
289 #elif defined(CONFIG_CACHEFILES_DEBUG) 289 #elif defined(CONFIG_CACHEFILES_DEBUG)
290 #define _enter(FMT, ...) \ 290 #define _enter(FMT, ...) \
291 do { \ 291 do { \
292 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ 292 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
293 kenter(FMT, ##__VA_ARGS__); \ 293 kenter(FMT, ##__VA_ARGS__); \
294 } while (0) 294 } while (0)
295 295
296 #define _leave(FMT, ...) \ 296 #define _leave(FMT, ...) \
297 do { \ 297 do { \
298 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ 298 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
299 kleave(FMT, ##__VA_ARGS__); \ 299 kleave(FMT, ##__VA_ARGS__); \
300 } while (0) 300 } while (0)
301 301
302 #define _debug(FMT, ...) \ 302 #define _debug(FMT, ...) \
303 do { \ 303 do { \
304 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ 304 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
305 kdebug(FMT, ##__VA_ARGS__); \ 305 kdebug(FMT, ##__VA_ARGS__); \
306 } while (0) 306 } while (0)
307 307
308 #else 308 #else
309 #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) 309 #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
310 #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 310 #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
311 #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) 311 #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
312 #endif 312 #endif
313 313
314 #if 1 /* defined(__KDEBUGALL) */ 314 #if 1 /* defined(__KDEBUGALL) */
315 315
316 #define ASSERT(X) \ 316 #define ASSERT(X) \
317 do { \ 317 do { \
318 if (unlikely(!(X))) { \ 318 if (unlikely(!(X))) { \
319 pr_err("\n"); \ 319 pr_err("\n"); \
320 pr_err("Assertion failed\n"); \ 320 pr_err("Assertion failed\n"); \
321 BUG(); \ 321 BUG(); \
322 } \ 322 } \
323 } while (0) 323 } while (0)
324 324
325 #define ASSERTCMP(X, OP, Y) \ 325 #define ASSERTCMP(X, OP, Y) \
326 do { \ 326 do { \
327 if (unlikely(!((X) OP (Y)))) { \ 327 if (unlikely(!((X) OP (Y)))) { \
328 pr_err("\n"); \ 328 pr_err("\n"); \
329 pr_err("Assertion failed\n"); \ 329 pr_err("Assertion failed\n"); \
330 pr_err("%lx " #OP " %lx is false\n", \ 330 pr_err("%lx " #OP " %lx is false\n", \
331 (unsigned long)(X), (unsigned long)(Y)); \ 331 (unsigned long)(X), (unsigned long)(Y)); \
332 BUG(); \ 332 BUG(); \
333 } \ 333 } \
334 } while (0) 334 } while (0)
335 335
336 #define ASSERTIF(C, X) \ 336 #define ASSERTIF(C, X) \
337 do { \ 337 do { \
338 if (unlikely((C) && !(X))) { \ 338 if (unlikely((C) && !(X))) { \
339 pr_err("\n"); \ 339 pr_err("\n"); \
340 pr_err("Assertion failed\n"); \ 340 pr_err("Assertion failed\n"); \
341 BUG(); \ 341 BUG(); \
342 } \ 342 } \
343 } while (0) 343 } while (0)
344 344
345 #define ASSERTIFCMP(C, X, OP, Y) \ 345 #define ASSERTIFCMP(C, X, OP, Y) \
346 do { \ 346 do { \
347 if (unlikely((C) && !((X) OP (Y)))) { \ 347 if (unlikely((C) && !((X) OP (Y)))) { \
348 pr_err("\n"); \ 348 pr_err("\n"); \
349 pr_err("Assertion failed\n"); \ 349 pr_err("Assertion failed\n"); \
350 pr_err("%lx " #OP " %lx is false\n", \ 350 pr_err("%lx " #OP " %lx is false\n", \
351 (unsigned long)(X), (unsigned long)(Y)); \ 351 (unsigned long)(X), (unsigned long)(Y)); \
352 BUG(); \ 352 BUG(); \
353 } \ 353 } \
354 } while (0) 354 } while (0)
355 355
356 #else 356 #else
357 357
358 #define ASSERT(X) do {} while (0) 358 #define ASSERT(X) do {} while (0)
359 #define ASSERTCMP(X, OP, Y) do {} while (0) 359 #define ASSERTCMP(X, OP, Y) do {} while (0)
360 #define ASSERTIF(C, X) do {} while (0) 360 #define ASSERTIF(C, X) do {} while (0)
361 #define ASSERTIFCMP(C, X, OP, Y) do {} while (0) 361 #define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
362 362
363 #endif 363 #endif
364 364
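The cachefiles_io_error() change above is where this patch handles the missing newlines centrally: the macro now appends "\n" to the caller's format, so the many call sites in daemon.c and namei.c that pass strings such as "statfs failed" or "Unlink failed" need no edit. Combined with the "CacheFiles: " pr_fmt prefix defined at the top of this header, a call roughly expands as follows after preprocessing (an illustration of the expansion, not verbatim preprocessor output):

/* call site in daemon.c, unchanged by this patch: */
cachefiles_io_error(cache, "statfs failed");

/* approximate expansion via pr_err() and the pr_fmt() defined above: */
do {
        printk(KERN_ERR "CacheFiles: " "I/O Error: " "statfs failed" "\n");
        fscache_io_error(&(cache)->cache);
        set_bit(CACHEFILES_DEAD, &(cache)->flags);
} while (0);

Without that trailing "\n" the log line is left open and output from another context can end up appended to it, which is the behaviour the "\n" additions throughout this patch avoid.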
fs/cachefiles/main.c
1 /* Network filesystem caching backend to use cache files on a premounted 1 /* Network filesystem caching backend to use cache files on a premounted
2 * filesystem 2 * filesystem
3 * 3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence 8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version 9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version. 10 * 2 of the Licence, or (at your option) any later version.
11 */ 11 */
12 12
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/completion.h> 16 #include <linux/completion.h>
17 #include <linux/slab.h> 17 #include <linux/slab.h>
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 #include <linux/file.h> 19 #include <linux/file.h>
20 #include <linux/namei.h> 20 #include <linux/namei.h>
21 #include <linux/mount.h> 21 #include <linux/mount.h>
22 #include <linux/statfs.h> 22 #include <linux/statfs.h>
23 #include <linux/sysctl.h> 23 #include <linux/sysctl.h>
24 #include <linux/miscdevice.h> 24 #include <linux/miscdevice.h>
25 #include "internal.h" 25 #include "internal.h"
26 26
27 unsigned cachefiles_debug; 27 unsigned cachefiles_debug;
28 module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); 28 module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29 MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); 29 MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
30 30
31 MODULE_DESCRIPTION("Mounted-filesystem based cache"); 31 MODULE_DESCRIPTION("Mounted-filesystem based cache");
32 MODULE_AUTHOR("Red Hat, Inc."); 32 MODULE_AUTHOR("Red Hat, Inc.");
33 MODULE_LICENSE("GPL"); 33 MODULE_LICENSE("GPL");
34 34
35 struct kmem_cache *cachefiles_object_jar; 35 struct kmem_cache *cachefiles_object_jar;
36 36
37 static struct miscdevice cachefiles_dev = { 37 static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR, 38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles", 39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops, 40 .fops = &cachefiles_daemon_fops,
41 }; 41 };
42 42
43 static void cachefiles_object_init_once(void *_object) 43 static void cachefiles_object_init_once(void *_object)
44 { 44 {
45 struct cachefiles_object *object = _object; 45 struct cachefiles_object *object = _object;
46 46
47 memset(object, 0, sizeof(*object)); 47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock); 48 spin_lock_init(&object->work_lock);
49 } 49 }
50 50
51 /* 51 /*
52 * initialise the fs caching module 52 * initialise the fs caching module
53 */ 53 */
54 static int __init cachefiles_init(void) 54 static int __init cachefiles_init(void)
55 { 55 {
56 int ret; 56 int ret;
57 57
58 ret = misc_register(&cachefiles_dev); 58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0) 59 if (ret < 0)
60 goto error_dev; 60 goto error_dev;
61 61
62 /* create an object jar */ 62 /* create an object jar */
63 ret = -ENOMEM; 63 ret = -ENOMEM;
64 cachefiles_object_jar = 64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar", 65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object), 66 sizeof(struct cachefiles_object),
67 0, 67 0,
68 SLAB_HWCACHE_ALIGN, 68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once); 69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) { 70 if (!cachefiles_object_jar) {
71 pr_notice("Failed to allocate an object jar\n"); 71 pr_notice("Failed to allocate an object jar\n");
72 goto error_object_jar; 72 goto error_object_jar;
73 } 73 }
74 74
75 ret = cachefiles_proc_init(); 75 ret = cachefiles_proc_init();
76 if (ret < 0) 76 if (ret < 0)
77 goto error_proc; 77 goto error_proc;
78 78
79 pr_info("Loaded\n"); 79 pr_info("Loaded\n");
80 return 0; 80 return 0;
81 81
82 error_proc: 82 error_proc:
83 kmem_cache_destroy(cachefiles_object_jar); 83 kmem_cache_destroy(cachefiles_object_jar);
84 error_object_jar: 84 error_object_jar:
85 misc_deregister(&cachefiles_dev); 85 misc_deregister(&cachefiles_dev);
86 error_dev: 86 error_dev:
87 pr_err("failed to register: %d", ret); 87 pr_err("failed to register: %d\n", ret);
88 return ret; 88 return ret;
89 } 89 }
90 90
91 fs_initcall(cachefiles_init); 91 fs_initcall(cachefiles_init);
92 92
93 /* 93 /*
94 * clean up on module removal 94 * clean up on module removal
95 */ 95 */
96 static void __exit cachefiles_exit(void) 96 static void __exit cachefiles_exit(void)
97 { 97 {
98 pr_info("Unloading\n"); 98 pr_info("Unloading\n");
99 99
100 cachefiles_proc_cleanup(); 100 cachefiles_proc_cleanup();
101 kmem_cache_destroy(cachefiles_object_jar); 101 kmem_cache_destroy(cachefiles_object_jar);
102 misc_deregister(&cachefiles_dev); 102 misc_deregister(&cachefiles_dev);
103 } 103 }
104 104
105 module_exit(cachefiles_exit); 105 module_exit(cachefiles_exit);
106 106
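cachefiles_init() above uses the usual staged-unwind idiom: each step that can fail jumps to a label that tears down only the steps that already succeeded, in reverse order, before the final "failed to register" report (which also gains its "\n" in this patch). A generic user-space restatement of the idiom, with placeholder step names rather than kernel interfaces:

/*
 * Generic restatement of the unwind pattern in cachefiles_init().  The
 * setup/undo functions are placeholders, not kernel interfaces.
 */
#include <stdio.h>

static int setup_device(void)     { return 0; }   /* e.g. register a device */
static int setup_object_jar(void) { return -1; }  /* e.g. create an object cache */
static void undo_device(void)     { puts("device deregistered"); }

int init(void)
{
        int ret;

        ret = setup_device();
        if (ret < 0)
                goto error_dev;

        ret = setup_object_jar();
        if (ret < 0)
                goto error_object_jar;

        puts("loaded");
        return 0;

error_object_jar:
        undo_device();          /* tear down only what already succeeded */
error_dev:
        fprintf(stderr, "failed to register: %d\n", ret);
        return ret;
}

int main(void) { return init() ? 1 : 0; }

Because the debug mask is exported with module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO), it can also be adjusted at run time through the module's parameters directory in sysfs, in addition to the daemon's "debug <mask>" command shown earlier.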
fs/cachefiles/namei.c
1 /* CacheFiles path walking and related routines 1 /* CacheFiles path walking and related routines
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/file.h> 14 #include <linux/file.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/fsnotify.h> 16 #include <linux/fsnotify.h>
17 #include <linux/quotaops.h> 17 #include <linux/quotaops.h>
18 #include <linux/xattr.h> 18 #include <linux/xattr.h>
19 #include <linux/mount.h> 19 #include <linux/mount.h>
20 #include <linux/namei.h> 20 #include <linux/namei.h>
21 #include <linux/security.h> 21 #include <linux/security.h>
22 #include <linux/slab.h> 22 #include <linux/slab.h>
23 #include "internal.h" 23 #include "internal.h"
24 24
25 #define CACHEFILES_KEYBUF_SIZE 512 25 #define CACHEFILES_KEYBUF_SIZE 512
26 26
27 /* 27 /*
28 * dump debugging info about an object 28 * dump debugging info about an object
29 */ 29 */
30 static noinline 30 static noinline
31 void __cachefiles_printk_object(struct cachefiles_object *object, 31 void __cachefiles_printk_object(struct cachefiles_object *object,
32 const char *prefix, 32 const char *prefix,
33 u8 *keybuf) 33 u8 *keybuf)
34 { 34 {
35 struct fscache_cookie *cookie; 35 struct fscache_cookie *cookie;
36 unsigned keylen, loop; 36 unsigned keylen, loop;
37 37
38 pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); 38 pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
39 pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", 39 pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
40 prefix, object->fscache.state->name, 40 prefix, object->fscache.state->name,
41 object->fscache.flags, work_busy(&object->fscache.work), 41 object->fscache.flags, work_busy(&object->fscache.work),
42 object->fscache.events, object->fscache.event_mask); 42 object->fscache.events, object->fscache.event_mask);
43 pr_err("%sops=%u inp=%u exc=%u\n", 43 pr_err("%sops=%u inp=%u exc=%u\n",
44 prefix, object->fscache.n_ops, object->fscache.n_in_progress, 44 prefix, object->fscache.n_ops, object->fscache.n_in_progress,
45 object->fscache.n_exclusive); 45 object->fscache.n_exclusive);
46 pr_err("%sparent=%p\n", 46 pr_err("%sparent=%p\n",
47 prefix, object->fscache.parent); 47 prefix, object->fscache.parent);
48 48
49 spin_lock(&object->fscache.lock); 49 spin_lock(&object->fscache.lock);
50 cookie = object->fscache.cookie; 50 cookie = object->fscache.cookie;
51 if (cookie) { 51 if (cookie) {
52 pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", 52 pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
53 prefix, 53 prefix,
54 object->fscache.cookie, 54 object->fscache.cookie,
55 object->fscache.cookie->parent, 55 object->fscache.cookie->parent,
56 object->fscache.cookie->netfs_data, 56 object->fscache.cookie->netfs_data,
57 object->fscache.cookie->flags); 57 object->fscache.cookie->flags);
58 if (keybuf && cookie->def) 58 if (keybuf && cookie->def)
59 keylen = cookie->def->get_key(cookie->netfs_data, keybuf, 59 keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
60 CACHEFILES_KEYBUF_SIZE); 60 CACHEFILES_KEYBUF_SIZE);
61 else 61 else
62 keylen = 0; 62 keylen = 0;
63 } else { 63 } else {
64 pr_err("%scookie=NULL\n", prefix); 64 pr_err("%scookie=NULL\n", prefix);
65 keylen = 0; 65 keylen = 0;
66 } 66 }
67 spin_unlock(&object->fscache.lock); 67 spin_unlock(&object->fscache.lock);
68 68
69 if (keylen) { 69 if (keylen) {
70 pr_err("%skey=[%u] '", prefix, keylen); 70 pr_err("%skey=[%u] '", prefix, keylen);
71 for (loop = 0; loop < keylen; loop++) 71 for (loop = 0; loop < keylen; loop++)
72 pr_cont("%02x", keybuf[loop]); 72 pr_cont("%02x", keybuf[loop]);
73 pr_cont("'\n"); 73 pr_cont("'\n");
74 } 74 }
75 } 75 }
76 76
77 /* 77 /*
78 * dump debugging info about a pair of objects 78 * dump debugging info about a pair of objects
79 */ 79 */
80 static noinline void cachefiles_printk_object(struct cachefiles_object *object, 80 static noinline void cachefiles_printk_object(struct cachefiles_object *object,
81 struct cachefiles_object *xobject) 81 struct cachefiles_object *xobject)
82 { 82 {
83 u8 *keybuf; 83 u8 *keybuf;
84 84
85 keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); 85 keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
86 if (object) 86 if (object)
87 __cachefiles_printk_object(object, "", keybuf); 87 __cachefiles_printk_object(object, "", keybuf);
88 if (xobject) 88 if (xobject)
89 __cachefiles_printk_object(xobject, "x", keybuf); 89 __cachefiles_printk_object(xobject, "x", keybuf);
90 kfree(keybuf); 90 kfree(keybuf);
91 } 91 }
92 92
93 /* 93 /*
94 * mark the owner of a dentry, if there is one, to indicate that that dentry 94 * mark the owner of a dentry, if there is one, to indicate that that dentry
95 * has been preemptively deleted 95 * has been preemptively deleted
96 * - the caller must hold the i_mutex on the dentry's parent as required to 96 * - the caller must hold the i_mutex on the dentry's parent as required to
97 * call vfs_unlink(), vfs_rmdir() or vfs_rename() 97 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
98 */ 98 */
99 static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, 99 static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
100 struct dentry *dentry) 100 struct dentry *dentry)
101 { 101 {
102 struct cachefiles_object *object; 102 struct cachefiles_object *object;
103 struct rb_node *p; 103 struct rb_node *p;
104 104
105 _enter(",'%*.*s'", 105 _enter(",'%*.*s'",
106 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); 106 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
107 107
108 write_lock(&cache->active_lock); 108 write_lock(&cache->active_lock);
109 109
110 p = cache->active_nodes.rb_node; 110 p = cache->active_nodes.rb_node;
111 while (p) { 111 while (p) {
112 object = rb_entry(p, struct cachefiles_object, active_node); 112 object = rb_entry(p, struct cachefiles_object, active_node);
113 if (object->dentry > dentry) 113 if (object->dentry > dentry)
114 p = p->rb_left; 114 p = p->rb_left;
115 else if (object->dentry < dentry) 115 else if (object->dentry < dentry)
116 p = p->rb_right; 116 p = p->rb_right;
117 else 117 else
118 goto found_dentry; 118 goto found_dentry;
119 } 119 }
120 120
121 write_unlock(&cache->active_lock); 121 write_unlock(&cache->active_lock);
122 _leave(" [no owner]"); 122 _leave(" [no owner]");
123 return; 123 return;
124 124
125 /* found the dentry for */ 125 /* found the dentry for */
126 found_dentry: 126 found_dentry:
127 kdebug("preemptive burial: OBJ%x [%s] %p", 127 kdebug("preemptive burial: OBJ%x [%s] %p",
128 object->fscache.debug_id, 128 object->fscache.debug_id,
129 object->fscache.state->name, 129 object->fscache.state->name,
130 dentry); 130 dentry);
131 131
132 if (fscache_object_is_live(&object->fscache)) { 132 if (fscache_object_is_live(&object->fscache)) {
133 pr_err("\n"); 133 pr_err("\n");
134 pr_err("Error: Can't preemptively bury live object\n"); 134 pr_err("Error: Can't preemptively bury live object\n");
135 cachefiles_printk_object(object, NULL); 135 cachefiles_printk_object(object, NULL);
136 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { 136 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
137 pr_err("Error: Object already preemptively buried\n"); 137 pr_err("Error: Object already preemptively buried\n");
138 } 138 }
139 139
140 write_unlock(&cache->active_lock); 140 write_unlock(&cache->active_lock);
141 _leave(" [owner marked]"); 141 _leave(" [owner marked]");
142 } 142 }
143 143
144 /* 144 /*
145 * record the fact that an object is now active 145 * record the fact that an object is now active
146 */ 146 */
147 static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 147 static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
148 struct cachefiles_object *object) 148 struct cachefiles_object *object)
149 { 149 {
150 struct cachefiles_object *xobject; 150 struct cachefiles_object *xobject;
151 struct rb_node **_p, *_parent = NULL; 151 struct rb_node **_p, *_parent = NULL;
152 struct dentry *dentry; 152 struct dentry *dentry;
153 153
154 _enter(",%p", object); 154 _enter(",%p", object);
155 155
156 try_again: 156 try_again:
157 write_lock(&cache->active_lock); 157 write_lock(&cache->active_lock);
158 158
159 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { 159 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
160 pr_err("Error: Object already active\n"); 160 pr_err("Error: Object already active\n");
161 cachefiles_printk_object(object, NULL); 161 cachefiles_printk_object(object, NULL);
162 BUG(); 162 BUG();
163 } 163 }
164 164
165 dentry = object->dentry; 165 dentry = object->dentry;
166 _p = &cache->active_nodes.rb_node; 166 _p = &cache->active_nodes.rb_node;
167 while (*_p) { 167 while (*_p) {
168 _parent = *_p; 168 _parent = *_p;
169 xobject = rb_entry(_parent, 169 xobject = rb_entry(_parent,
170 struct cachefiles_object, active_node); 170 struct cachefiles_object, active_node);
171 171
172 ASSERT(xobject != object); 172 ASSERT(xobject != object);
173 173
174 if (xobject->dentry > dentry) 174 if (xobject->dentry > dentry)
175 _p = &(*_p)->rb_left; 175 _p = &(*_p)->rb_left;
176 else if (xobject->dentry < dentry) 176 else if (xobject->dentry < dentry)
177 _p = &(*_p)->rb_right; 177 _p = &(*_p)->rb_right;
178 else 178 else
179 goto wait_for_old_object; 179 goto wait_for_old_object;
180 } 180 }
181 181
182 rb_link_node(&object->active_node, _parent, _p); 182 rb_link_node(&object->active_node, _parent, _p);
183 rb_insert_color(&object->active_node, &cache->active_nodes); 183 rb_insert_color(&object->active_node, &cache->active_nodes);
184 184
185 write_unlock(&cache->active_lock); 185 write_unlock(&cache->active_lock);
186 _leave(" = 0"); 186 _leave(" = 0");
187 return 0; 187 return 0;
188 188
189 /* an old object from a previous incarnation is hogging the slot - we 189 /* an old object from a previous incarnation is hogging the slot - we
190 * need to wait for it to be destroyed */ 190 * need to wait for it to be destroyed */
191 wait_for_old_object: 191 wait_for_old_object:
192 if (fscache_object_is_live(&object->fscache)) { 192 if (fscache_object_is_live(&object->fscache)) {
193 pr_err("\n"); 193 pr_err("\n");
194 pr_err("Error: Unexpected object collision\n"); 194 pr_err("Error: Unexpected object collision\n");
195 cachefiles_printk_object(object, xobject); 195 cachefiles_printk_object(object, xobject);
196 BUG(); 196 BUG();
197 } 197 }
198 atomic_inc(&xobject->usage); 198 atomic_inc(&xobject->usage);
199 write_unlock(&cache->active_lock); 199 write_unlock(&cache->active_lock);
200 200
201 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { 201 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
202 wait_queue_head_t *wq; 202 wait_queue_head_t *wq;
203 203
204 signed long timeout = 60 * HZ; 204 signed long timeout = 60 * HZ;
205 wait_queue_t wait; 205 wait_queue_t wait;
206 bool requeue; 206 bool requeue;
207 207
208 /* if the object we're waiting for is queued for processing, 208 /* if the object we're waiting for is queued for processing,
209 * then just put ourselves on the queue behind it */ 209 * then just put ourselves on the queue behind it */
210 if (work_pending(&xobject->fscache.work)) { 210 if (work_pending(&xobject->fscache.work)) {
211 _debug("queue OBJ%x behind OBJ%x immediately", 211 _debug("queue OBJ%x behind OBJ%x immediately",
212 object->fscache.debug_id, 212 object->fscache.debug_id,
213 xobject->fscache.debug_id); 213 xobject->fscache.debug_id);
214 goto requeue; 214 goto requeue;
215 } 215 }
216 216
217 /* otherwise we sleep until either the object we're waiting for 217 /* otherwise we sleep until either the object we're waiting for
218 * is done, or the fscache_object is congested */ 218 * is done, or the fscache_object is congested */
219 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); 219 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
220 init_wait(&wait); 220 init_wait(&wait);
221 requeue = false; 221 requeue = false;
222 do { 222 do {
223 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 223 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
224 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) 224 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
225 break; 225 break;
226 226
227 requeue = fscache_object_sleep_till_congested(&timeout); 227 requeue = fscache_object_sleep_till_congested(&timeout);
228 } while (timeout > 0 && !requeue); 228 } while (timeout > 0 && !requeue);
229 finish_wait(wq, &wait); 229 finish_wait(wq, &wait);
230 230
231 if (requeue && 231 if (requeue &&
232 test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { 232 test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
233 _debug("queue OBJ%x behind OBJ%x after wait", 233 _debug("queue OBJ%x behind OBJ%x after wait",
234 object->fscache.debug_id, 234 object->fscache.debug_id,
235 xobject->fscache.debug_id); 235 xobject->fscache.debug_id);
236 goto requeue; 236 goto requeue;
237 } 237 }
238 238
239 if (timeout <= 0) { 239 if (timeout <= 0) {
240 pr_err("\n"); 240 pr_err("\n");
241 pr_err("Error: Overlong wait for old active object to go away\n"); 241 pr_err("Error: Overlong wait for old active object to go away\n");
242 cachefiles_printk_object(object, xobject); 242 cachefiles_printk_object(object, xobject);
243 goto requeue; 243 goto requeue;
244 } 244 }
245 } 245 }
246 246
247 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); 247 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
248 248
249 cache->cache.ops->put_object(&xobject->fscache); 249 cache->cache.ops->put_object(&xobject->fscache);
250 goto try_again; 250 goto try_again;
251 251
252 requeue: 252 requeue:
253 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); 253 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
254 cache->cache.ops->put_object(&xobject->fscache); 254 cache->cache.ops->put_object(&xobject->fscache);
255 _leave(" = -ETIMEDOUT"); 255 _leave(" = -ETIMEDOUT");
256 return -ETIMEDOUT; 256 return -ETIMEDOUT;
257 } 257 }
258 258
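cachefiles_mark_object_active() above keys cache->active_nodes purely on the dentry pointer value: the rbtree walk compares xobject->dentry against the new object's dentry, and an exact match means an object from a previous incarnation still owns the slot, which is what the wait_for_old_object path handles. A toy user-space sketch of that pointer-keyed insertion (a plain unbalanced tree for illustration, not the kernel rbtree API):

/*
 * Nodes are keyed on pointer identity alone, so equality means "same
 * backing dentry".  No rebalancing is done here; the kernel uses an rbtree.
 */
#include <stddef.h>
#include <stdint.h>

struct active_node {
        const void *dentry;                     /* key: pointer identity */
        struct active_node *left, *right;
};

/* returns NULL on success, or the clashing node if the key already exists */
struct active_node *active_insert(struct active_node **root,
                                  struct active_node *node)
{
        struct active_node **p = root;

        while (*p) {
                if ((uintptr_t)(*p)->dentry > (uintptr_t)node->dentry)
                        p = &(*p)->left;
                else if ((uintptr_t)(*p)->dentry < (uintptr_t)node->dentry)
                        p = &(*p)->right;
                else
                        return *p;              /* old object owns the slot */
        }
        node->left = node->right = NULL;
        *p = node;
        return NULL;
}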
259 /* 259 /*
260 * delete an object representation from the cache 260 * delete an object representation from the cache
261 * - file backed objects are unlinked 261 * - file backed objects are unlinked
262 * - directory backed objects are stuffed into the graveyard for userspace to 262 * - directory backed objects are stuffed into the graveyard for userspace to
263 * delete 263 * delete
264 * - unlocks the directory mutex 264 * - unlocks the directory mutex
265 */ 265 */
266 static int cachefiles_bury_object(struct cachefiles_cache *cache, 266 static int cachefiles_bury_object(struct cachefiles_cache *cache,
267 struct dentry *dir, 267 struct dentry *dir,
268 struct dentry *rep, 268 struct dentry *rep,
269 bool preemptive) 269 bool preemptive)
270 { 270 {
271 struct dentry *grave, *trap; 271 struct dentry *grave, *trap;
272 struct path path, path_to_graveyard; 272 struct path path, path_to_graveyard;
273 char nbuffer[8 + 8 + 1]; 273 char nbuffer[8 + 8 + 1];
274 int ret; 274 int ret;
275 275
276 _enter(",'%*.*s','%*.*s'", 276 _enter(",'%*.*s','%*.*s'",
277 dir->d_name.len, dir->d_name.len, dir->d_name.name, 277 dir->d_name.len, dir->d_name.len, dir->d_name.name,
278 rep->d_name.len, rep->d_name.len, rep->d_name.name); 278 rep->d_name.len, rep->d_name.len, rep->d_name.name);
279 279
280 _debug("remove %p from %p", rep, dir); 280 _debug("remove %p from %p", rep, dir);
281 281
282 /* non-directories can just be unlinked */ 282 /* non-directories can just be unlinked */
283 if (!S_ISDIR(rep->d_inode->i_mode)) { 283 if (!S_ISDIR(rep->d_inode->i_mode)) {
284 _debug("unlink stale object"); 284 _debug("unlink stale object");
285 285
286 path.mnt = cache->mnt; 286 path.mnt = cache->mnt;
287 path.dentry = dir; 287 path.dentry = dir;
288 ret = security_path_unlink(&path, rep); 288 ret = security_path_unlink(&path, rep);
289 if (ret < 0) { 289 if (ret < 0) {
290 cachefiles_io_error(cache, "Unlink security error"); 290 cachefiles_io_error(cache, "Unlink security error");
291 } else { 291 } else {
292 ret = vfs_unlink(dir->d_inode, rep, NULL); 292 ret = vfs_unlink(dir->d_inode, rep, NULL);
293 293
294 if (preemptive) 294 if (preemptive)
295 cachefiles_mark_object_buried(cache, rep); 295 cachefiles_mark_object_buried(cache, rep);
296 } 296 }
297 297
298 mutex_unlock(&dir->d_inode->i_mutex); 298 mutex_unlock(&dir->d_inode->i_mutex);
299 299
300 if (ret == -EIO) 300 if (ret == -EIO)
301 cachefiles_io_error(cache, "Unlink failed"); 301 cachefiles_io_error(cache, "Unlink failed");
302 302
303 _leave(" = %d", ret); 303 _leave(" = %d", ret);
304 return ret; 304 return ret;
305 } 305 }
306 306
307 /* directories have to be moved to the graveyard */ 307 /* directories have to be moved to the graveyard */
308 _debug("move stale object to graveyard"); 308 _debug("move stale object to graveyard");
309 mutex_unlock(&dir->d_inode->i_mutex); 309 mutex_unlock(&dir->d_inode->i_mutex);
310 310
311 try_again: 311 try_again:
312 /* first step is to make up a grave dentry in the graveyard */ 312 /* first step is to make up a grave dentry in the graveyard */
313 sprintf(nbuffer, "%08x%08x", 313 sprintf(nbuffer, "%08x%08x",
314 (uint32_t) get_seconds(), 314 (uint32_t) get_seconds(),
315 (uint32_t) atomic_inc_return(&cache->gravecounter)); 315 (uint32_t) atomic_inc_return(&cache->gravecounter));
316 316
317 /* do the multiway lock magic */ 317 /* do the multiway lock magic */
318 trap = lock_rename(cache->graveyard, dir); 318 trap = lock_rename(cache->graveyard, dir);
319 319
320 /* do some checks before getting the grave dentry */ 320 /* do some checks before getting the grave dentry */
321 if (rep->d_parent != dir) { 321 if (rep->d_parent != dir) {
322 /* the entry was probably culled when we dropped the parent dir 322 /* the entry was probably culled when we dropped the parent dir
323 * lock */ 323 * lock */
324 unlock_rename(cache->graveyard, dir); 324 unlock_rename(cache->graveyard, dir);
325 _leave(" = 0 [culled?]"); 325 _leave(" = 0 [culled?]");
326 return 0; 326 return 0;
327 } 327 }
328 328
329 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { 329 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
330 unlock_rename(cache->graveyard, dir); 330 unlock_rename(cache->graveyard, dir);
331 cachefiles_io_error(cache, "Graveyard no longer a directory"); 331 cachefiles_io_error(cache, "Graveyard no longer a directory");
332 return -EIO; 332 return -EIO;
333 } 333 }
334 334
335 if (trap == rep) { 335 if (trap == rep) {
336 unlock_rename(cache->graveyard, dir); 336 unlock_rename(cache->graveyard, dir);
337 cachefiles_io_error(cache, "May not make directory loop"); 337 cachefiles_io_error(cache, "May not make directory loop");
338 return -EIO; 338 return -EIO;
339 } 339 }
340 340
341 if (d_mountpoint(rep)) { 341 if (d_mountpoint(rep)) {
342 unlock_rename(cache->graveyard, dir); 342 unlock_rename(cache->graveyard, dir);
343 cachefiles_io_error(cache, "Mountpoint in cache"); 343 cachefiles_io_error(cache, "Mountpoint in cache");
344 return -EIO; 344 return -EIO;
345 } 345 }
346 346
347 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); 347 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
348 if (IS_ERR(grave)) { 348 if (IS_ERR(grave)) {
349 unlock_rename(cache->graveyard, dir); 349 unlock_rename(cache->graveyard, dir);
350 350
351 if (PTR_ERR(grave) == -ENOMEM) { 351 if (PTR_ERR(grave) == -ENOMEM) {
352 _leave(" = -ENOMEM"); 352 _leave(" = -ENOMEM");
353 return -ENOMEM; 353 return -ENOMEM;
354 } 354 }
355 355
356 cachefiles_io_error(cache, "Lookup error %ld", 356 cachefiles_io_error(cache, "Lookup error %ld",
357 PTR_ERR(grave)); 357 PTR_ERR(grave));
358 return -EIO; 358 return -EIO;
359 } 359 }
360 360
361 if (grave->d_inode) { 361 if (grave->d_inode) {
362 unlock_rename(cache->graveyard, dir); 362 unlock_rename(cache->graveyard, dir);
363 dput(grave); 363 dput(grave);
364 grave = NULL; 364 grave = NULL;
365 cond_resched(); 365 cond_resched();
366 goto try_again; 366 goto try_again;
367 } 367 }
368 368
369 if (d_mountpoint(grave)) { 369 if (d_mountpoint(grave)) {
370 unlock_rename(cache->graveyard, dir); 370 unlock_rename(cache->graveyard, dir);
371 dput(grave); 371 dput(grave);
372 cachefiles_io_error(cache, "Mountpoint in graveyard"); 372 cachefiles_io_error(cache, "Mountpoint in graveyard");
373 return -EIO; 373 return -EIO;
374 } 374 }
375 375
376 /* target should not be an ancestor of source */ 376 /* target should not be an ancestor of source */
377 if (trap == grave) { 377 if (trap == grave) {
378 unlock_rename(cache->graveyard, dir); 378 unlock_rename(cache->graveyard, dir);
379 dput(grave); 379 dput(grave);
380 cachefiles_io_error(cache, "May not make directory loop"); 380 cachefiles_io_error(cache, "May not make directory loop");
381 return -EIO; 381 return -EIO;
382 } 382 }
383 383
384 /* attempt the rename */ 384 /* attempt the rename */
385 path.mnt = cache->mnt; 385 path.mnt = cache->mnt;
386 path.dentry = dir; 386 path.dentry = dir;
387 path_to_graveyard.mnt = cache->mnt; 387 path_to_graveyard.mnt = cache->mnt;
388 path_to_graveyard.dentry = cache->graveyard; 388 path_to_graveyard.dentry = cache->graveyard;
389 ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); 389 ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
390 if (ret < 0) { 390 if (ret < 0) {
391 cachefiles_io_error(cache, "Rename security error %d", ret); 391 cachefiles_io_error(cache, "Rename security error %d", ret);
392 } else { 392 } else {
393 ret = vfs_rename(dir->d_inode, rep, 393 ret = vfs_rename(dir->d_inode, rep,
394 cache->graveyard->d_inode, grave, NULL, 0); 394 cache->graveyard->d_inode, grave, NULL, 0);
395 if (ret != 0 && ret != -ENOMEM) 395 if (ret != 0 && ret != -ENOMEM)
396 cachefiles_io_error(cache, 396 cachefiles_io_error(cache,
397 "Rename failed with error %d", ret); 397 "Rename failed with error %d", ret);
398 398
399 if (preemptive) 399 if (preemptive)
400 cachefiles_mark_object_buried(cache, rep); 400 cachefiles_mark_object_buried(cache, rep);
401 } 401 }
402 402
403 unlock_rename(cache->graveyard, dir); 403 unlock_rename(cache->graveyard, dir);
404 dput(grave); 404 dput(grave);
405 _leave(" = 0"); 405 _leave(" = 0");
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* 409 /*
410 * delete an object representation from the cache 410 * delete an object representation from the cache
411 */ 411 */
412 int cachefiles_delete_object(struct cachefiles_cache *cache, 412 int cachefiles_delete_object(struct cachefiles_cache *cache,
413 struct cachefiles_object *object) 413 struct cachefiles_object *object)
414 { 414 {
415 struct dentry *dir; 415 struct dentry *dir;
416 int ret; 416 int ret;
417 417
418 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); 418 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
419 419
420 ASSERT(object->dentry); 420 ASSERT(object->dentry);
421 ASSERT(object->dentry->d_inode); 421 ASSERT(object->dentry->d_inode);
422 ASSERT(object->dentry->d_parent); 422 ASSERT(object->dentry->d_parent);
423 423
424 dir = dget_parent(object->dentry); 424 dir = dget_parent(object->dentry);
425 425
426 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 426 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
427 427
428 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { 428 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
429 /* object allocation for the same key preemptively deleted this 429 /* object allocation for the same key preemptively deleted this
430 * object's file so that it could create its own file */ 430 * object's file so that it could create its own file */
431 _debug("object preemptively buried"); 431 _debug("object preemptively buried");
432 mutex_unlock(&dir->d_inode->i_mutex); 432 mutex_unlock(&dir->d_inode->i_mutex);
433 ret = 0; 433 ret = 0;
434 } else { 434 } else {
435 /* we need to check that our parent is _still_ our parent - it 435 /* we need to check that our parent is _still_ our parent - it
436 * may have been renamed */ 436 * may have been renamed */
437 if (dir == object->dentry->d_parent) { 437 if (dir == object->dentry->d_parent) {
438 ret = cachefiles_bury_object(cache, dir, 438 ret = cachefiles_bury_object(cache, dir,
439 object->dentry, false); 439 object->dentry, false);
440 } else { 440 } else {
441 /* it got moved, presumably by cachefilesd culling it, 441 /* it got moved, presumably by cachefilesd culling it,
442 * so it's no longer in the key path and we can ignore 442 * so it's no longer in the key path and we can ignore
443 * it */ 443 * it */
444 mutex_unlock(&dir->d_inode->i_mutex); 444 mutex_unlock(&dir->d_inode->i_mutex);
445 ret = 0; 445 ret = 0;
446 } 446 }
447 } 447 }
448 448
449 dput(dir); 449 dput(dir);
450 _leave(" = %d", ret); 450 _leave(" = %d", ret);
451 return ret; 451 return ret;
452 } 452 }
453 453
454 /* 454 /*
455 * walk from the parent object to the child object through the backing 455 * walk from the parent object to the child object through the backing
456 * filesystem, creating directories as we go 456 * filesystem, creating directories as we go
457 */ 457 */
458 int cachefiles_walk_to_object(struct cachefiles_object *parent, 458 int cachefiles_walk_to_object(struct cachefiles_object *parent,
459 struct cachefiles_object *object, 459 struct cachefiles_object *object,
460 const char *key, 460 const char *key,
461 struct cachefiles_xattr *auxdata) 461 struct cachefiles_xattr *auxdata)
462 { 462 {
463 struct cachefiles_cache *cache; 463 struct cachefiles_cache *cache;
464 struct dentry *dir, *next = NULL; 464 struct dentry *dir, *next = NULL;
465 struct path path; 465 struct path path;
466 unsigned long start; 466 unsigned long start;
467 const char *name; 467 const char *name;
468 int ret, nlen; 468 int ret, nlen;
469 469
470 _enter("OBJ%x{%p},OBJ%x,%s,", 470 _enter("OBJ%x{%p},OBJ%x,%s,",
471 parent->fscache.debug_id, parent->dentry, 471 parent->fscache.debug_id, parent->dentry,
472 object->fscache.debug_id, key); 472 object->fscache.debug_id, key);
473 473
474 cache = container_of(parent->fscache.cache, 474 cache = container_of(parent->fscache.cache,
475 struct cachefiles_cache, cache); 475 struct cachefiles_cache, cache);
476 path.mnt = cache->mnt; 476 path.mnt = cache->mnt;
477 477
478 ASSERT(parent->dentry); 478 ASSERT(parent->dentry);
479 ASSERT(parent->dentry->d_inode); 479 ASSERT(parent->dentry->d_inode);
480 480
481 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { 481 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
482 // TODO: convert file to dir 482 // TODO: convert file to dir
483 _leave("looking up in none directory"); 483 _leave("looking up in none directory");
484 return -ENOBUFS; 484 return -ENOBUFS;
485 } 485 }
486 486
487 dir = dget(parent->dentry); 487 dir = dget(parent->dentry);
488 488
489 advance: 489 advance:
490 /* attempt to transit the first directory component */ 490 /* attempt to transit the first directory component */
491 name = key; 491 name = key;
492 nlen = strlen(key); 492 nlen = strlen(key);
493 493
494 /* key ends in a double NUL */ 494 /* key ends in a double NUL */
495 key = key + nlen + 1; 495 key = key + nlen + 1;
496 if (!*key) 496 if (!*key)
497 key = NULL; 497 key = NULL;
498 498
499 lookup_again: 499 lookup_again:
500 /* search the current directory for the element name */ 500 /* search the current directory for the element name */
501 _debug("lookup '%s'", name); 501 _debug("lookup '%s'", name);
502 502
503 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 503 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
504 504
505 start = jiffies; 505 start = jiffies;
506 next = lookup_one_len(name, dir, nlen); 506 next = lookup_one_len(name, dir, nlen);
507 cachefiles_hist(cachefiles_lookup_histogram, start); 507 cachefiles_hist(cachefiles_lookup_histogram, start);
508 if (IS_ERR(next)) 508 if (IS_ERR(next))
509 goto lookup_error; 509 goto lookup_error;
510 510
511 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); 511 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
512 512
513 if (!key) 513 if (!key)
514 object->new = !next->d_inode; 514 object->new = !next->d_inode;
515 515
516 /* if this element of the path doesn't exist, then the lookup phase 516 /* if this element of the path doesn't exist, then the lookup phase
517 * failed, and we can release any readers in the certain knowledge that 517 * failed, and we can release any readers in the certain knowledge that
518 * there's nothing for them to actually read */ 518 * there's nothing for them to actually read */
519 if (!next->d_inode) 519 if (!next->d_inode)
520 fscache_object_lookup_negative(&object->fscache); 520 fscache_object_lookup_negative(&object->fscache);
521 521
522 /* we need to create the object if it's negative */ 522 /* we need to create the object if it's negative */
523 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { 523 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
524 /* index objects and intervening tree levels must be subdirs */ 524 /* index objects and intervening tree levels must be subdirs */
525 if (!next->d_inode) { 525 if (!next->d_inode) {
526 ret = cachefiles_has_space(cache, 1, 0); 526 ret = cachefiles_has_space(cache, 1, 0);
527 if (ret < 0) 527 if (ret < 0)
528 goto create_error; 528 goto create_error;
529 529
530 path.dentry = dir; 530 path.dentry = dir;
531 ret = security_path_mkdir(&path, next, 0); 531 ret = security_path_mkdir(&path, next, 0);
532 if (ret < 0) 532 if (ret < 0)
533 goto create_error; 533 goto create_error;
534 start = jiffies; 534 start = jiffies;
535 ret = vfs_mkdir(dir->d_inode, next, 0); 535 ret = vfs_mkdir(dir->d_inode, next, 0);
536 cachefiles_hist(cachefiles_mkdir_histogram, start); 536 cachefiles_hist(cachefiles_mkdir_histogram, start);
537 if (ret < 0) 537 if (ret < 0)
538 goto create_error; 538 goto create_error;
539 539
540 ASSERT(next->d_inode); 540 ASSERT(next->d_inode);
541 541
542 _debug("mkdir -> %p{%p{ino=%lu}}", 542 _debug("mkdir -> %p{%p{ino=%lu}}",
543 next, next->d_inode, next->d_inode->i_ino); 543 next, next->d_inode, next->d_inode->i_ino);
544 544
545 } else if (!S_ISDIR(next->d_inode->i_mode)) { 545 } else if (!S_ISDIR(next->d_inode->i_mode)) {
546 pr_err("inode %lu is not a directory", 546 pr_err("inode %lu is not a directory\n",
547 next->d_inode->i_ino); 547 next->d_inode->i_ino);
548 ret = -ENOBUFS; 548 ret = -ENOBUFS;
549 goto error; 549 goto error;
550 } 550 }
551 551
552 } else { 552 } else {
553 /* non-index objects start out life as files */ 553 /* non-index objects start out life as files */
554 if (!next->d_inode) { 554 if (!next->d_inode) {
555 ret = cachefiles_has_space(cache, 1, 0); 555 ret = cachefiles_has_space(cache, 1, 0);
556 if (ret < 0) 556 if (ret < 0)
557 goto create_error; 557 goto create_error;
558 558
559 path.dentry = dir; 559 path.dentry = dir;
560 ret = security_path_mknod(&path, next, S_IFREG, 0); 560 ret = security_path_mknod(&path, next, S_IFREG, 0);
561 if (ret < 0) 561 if (ret < 0)
562 goto create_error; 562 goto create_error;
563 start = jiffies; 563 start = jiffies;
564 ret = vfs_create(dir->d_inode, next, S_IFREG, true); 564 ret = vfs_create(dir->d_inode, next, S_IFREG, true);
565 cachefiles_hist(cachefiles_create_histogram, start); 565 cachefiles_hist(cachefiles_create_histogram, start);
566 if (ret < 0) 566 if (ret < 0)
567 goto create_error; 567 goto create_error;
568 568
569 ASSERT(next->d_inode); 569 ASSERT(next->d_inode);
570 570
571 _debug("create -> %p{%p{ino=%lu}}", 571 _debug("create -> %p{%p{ino=%lu}}",
572 next, next->d_inode, next->d_inode->i_ino); 572 next, next->d_inode, next->d_inode->i_ino);
573 573
574 } else if (!S_ISDIR(next->d_inode->i_mode) && 574 } else if (!S_ISDIR(next->d_inode->i_mode) &&
575 !S_ISREG(next->d_inode->i_mode) 575 !S_ISREG(next->d_inode->i_mode)
576 ) { 576 ) {
577 pr_err("inode %lu is not a file or directory", 577 pr_err("inode %lu is not a file or directory\n",
578 next->d_inode->i_ino); 578 next->d_inode->i_ino);
579 ret = -ENOBUFS; 579 ret = -ENOBUFS;
580 goto error; 580 goto error;
581 } 581 }
582 } 582 }
583 583
584 /* process the next component */ 584 /* process the next component */
585 if (key) { 585 if (key) {
586 _debug("advance"); 586 _debug("advance");
587 mutex_unlock(&dir->d_inode->i_mutex); 587 mutex_unlock(&dir->d_inode->i_mutex);
588 dput(dir); 588 dput(dir);
589 dir = next; 589 dir = next;
590 next = NULL; 590 next = NULL;
591 goto advance; 591 goto advance;
592 } 592 }
593 593
594 /* we've found the object we were looking for */ 594 /* we've found the object we were looking for */
595 object->dentry = next; 595 object->dentry = next;
596 596
597 /* if we've found that the terminal object exists, then we need to 597 /* if we've found that the terminal object exists, then we need to
598 * check its attributes and delete it if it's out of date */ 598 * check its attributes and delete it if it's out of date */
599 if (!object->new) { 599 if (!object->new) {
600 _debug("validate '%*.*s'", 600 _debug("validate '%*.*s'",
601 next->d_name.len, next->d_name.len, next->d_name.name); 601 next->d_name.len, next->d_name.len, next->d_name.name);
602 602
603 ret = cachefiles_check_object_xattr(object, auxdata); 603 ret = cachefiles_check_object_xattr(object, auxdata);
604 if (ret == -ESTALE) { 604 if (ret == -ESTALE) {
605 /* delete the object (the deleter drops the directory 605 /* delete the object (the deleter drops the directory
606 * mutex) */ 606 * mutex) */
607 object->dentry = NULL; 607 object->dentry = NULL;
608 608
609 ret = cachefiles_bury_object(cache, dir, next, true); 609 ret = cachefiles_bury_object(cache, dir, next, true);
610 dput(next); 610 dput(next);
611 next = NULL; 611 next = NULL;
612 612
613 if (ret < 0) 613 if (ret < 0)
614 goto delete_error; 614 goto delete_error;
615 615
616 _debug("redo lookup"); 616 _debug("redo lookup");
617 goto lookup_again; 617 goto lookup_again;
618 } 618 }
619 } 619 }
620 620
621 /* note that we're now using this object */ 621 /* note that we're now using this object */
622 ret = cachefiles_mark_object_active(cache, object); 622 ret = cachefiles_mark_object_active(cache, object);
623 623
624 mutex_unlock(&dir->d_inode->i_mutex); 624 mutex_unlock(&dir->d_inode->i_mutex);
625 dput(dir); 625 dput(dir);
626 dir = NULL; 626 dir = NULL;
627 627
628 if (ret == -ETIMEDOUT) 628 if (ret == -ETIMEDOUT)
629 goto mark_active_timed_out; 629 goto mark_active_timed_out;
630 630
631 _debug("=== OBTAINED_OBJECT ==="); 631 _debug("=== OBTAINED_OBJECT ===");
632 632
633 if (object->new) { 633 if (object->new) {
634 /* attach data to a newly constructed terminal object */ 634 /* attach data to a newly constructed terminal object */
635 ret = cachefiles_set_object_xattr(object, auxdata); 635 ret = cachefiles_set_object_xattr(object, auxdata);
636 if (ret < 0) 636 if (ret < 0)
637 goto check_error; 637 goto check_error;
638 } else { 638 } else {
639 /* always update the atime on an object we've just looked up 639 /* always update the atime on an object we've just looked up
640 * (this is used to keep track of culling, and atimes are only 640 * (this is used to keep track of culling, and atimes are only
641 * updated by read, write and readdir but not lookup or 641 * updated by read, write and readdir but not lookup or
642 * open) */ 642 * open) */
643 path.dentry = next; 643 path.dentry = next;
644 touch_atime(&path); 644 touch_atime(&path);
645 } 645 }
646 646
647 /* open a file interface onto a data file */ 647 /* open a file interface onto a data file */
648 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { 648 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
649 if (S_ISREG(object->dentry->d_inode->i_mode)) { 649 if (S_ISREG(object->dentry->d_inode->i_mode)) {
650 const struct address_space_operations *aops; 650 const struct address_space_operations *aops;
651 651
652 ret = -EPERM; 652 ret = -EPERM;
653 aops = object->dentry->d_inode->i_mapping->a_ops; 653 aops = object->dentry->d_inode->i_mapping->a_ops;
654 if (!aops->bmap) 654 if (!aops->bmap)
655 goto check_error; 655 goto check_error;
656 656
657 object->backer = object->dentry; 657 object->backer = object->dentry;
658 } else { 658 } else {
659 BUG(); // TODO: open file in data-class subdir 659 BUG(); // TODO: open file in data-class subdir
660 } 660 }
661 } 661 }
662 662
663 object->new = 0; 663 object->new = 0;
664 fscache_obtained_object(&object->fscache); 664 fscache_obtained_object(&object->fscache);
665 665
666 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); 666 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
667 return 0; 667 return 0;
668 668
669 create_error: 669 create_error:
670 _debug("create error %d", ret); 670 _debug("create error %d", ret);
671 if (ret == -EIO) 671 if (ret == -EIO)
672 cachefiles_io_error(cache, "Create/mkdir failed"); 672 cachefiles_io_error(cache, "Create/mkdir failed");
673 goto error; 673 goto error;
674 674
675 mark_active_timed_out: 675 mark_active_timed_out:
676 _debug("mark active timed out"); 676 _debug("mark active timed out");
677 goto release_dentry; 677 goto release_dentry;
678 678
679 check_error: 679 check_error:
680 _debug("check error %d", ret); 680 _debug("check error %d", ret);
681 write_lock(&cache->active_lock); 681 write_lock(&cache->active_lock);
682 rb_erase(&object->active_node, &cache->active_nodes); 682 rb_erase(&object->active_node, &cache->active_nodes);
683 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); 683 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
684 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); 684 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
685 write_unlock(&cache->active_lock); 685 write_unlock(&cache->active_lock);
686 release_dentry: 686 release_dentry:
687 dput(object->dentry); 687 dput(object->dentry);
688 object->dentry = NULL; 688 object->dentry = NULL;
689 goto error_out; 689 goto error_out;
690 690
691 delete_error: 691 delete_error:
692 _debug("delete error %d", ret); 692 _debug("delete error %d", ret);
693 goto error_out2; 693 goto error_out2;
694 694
695 lookup_error: 695 lookup_error:
696 _debug("lookup error %ld", PTR_ERR(next)); 696 _debug("lookup error %ld", PTR_ERR(next));
697 ret = PTR_ERR(next); 697 ret = PTR_ERR(next);
698 if (ret == -EIO) 698 if (ret == -EIO)
699 cachefiles_io_error(cache, "Lookup failed"); 699 cachefiles_io_error(cache, "Lookup failed");
700 next = NULL; 700 next = NULL;
701 error: 701 error:
702 mutex_unlock(&dir->d_inode->i_mutex); 702 mutex_unlock(&dir->d_inode->i_mutex);
703 dput(next); 703 dput(next);
704 error_out2: 704 error_out2:
705 dput(dir); 705 dput(dir);
706 error_out: 706 error_out:
707 _leave(" = error %d", -ret); 707 _leave(" = error %d", -ret);
708 return ret; 708 return ret;
709 } 709 }
710 710
711 /* 711 /*
712 * get a subdirectory 712 * get a subdirectory
713 */ 713 */
714 struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, 714 struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
715 struct dentry *dir, 715 struct dentry *dir,
716 const char *dirname) 716 const char *dirname)
717 { 717 {
718 struct dentry *subdir; 718 struct dentry *subdir;
719 unsigned long start; 719 unsigned long start;
720 struct path path; 720 struct path path;
721 int ret; 721 int ret;
722 722
723 _enter(",,%s", dirname); 723 _enter(",,%s", dirname);
724 724
725 /* search the current directory for the element name */ 725 /* search the current directory for the element name */
726 mutex_lock(&dir->d_inode->i_mutex); 726 mutex_lock(&dir->d_inode->i_mutex);
727 727
728 start = jiffies; 728 start = jiffies;
729 subdir = lookup_one_len(dirname, dir, strlen(dirname)); 729 subdir = lookup_one_len(dirname, dir, strlen(dirname));
730 cachefiles_hist(cachefiles_lookup_histogram, start); 730 cachefiles_hist(cachefiles_lookup_histogram, start);
731 if (IS_ERR(subdir)) { 731 if (IS_ERR(subdir)) {
732 if (PTR_ERR(subdir) == -ENOMEM) 732 if (PTR_ERR(subdir) == -ENOMEM)
733 goto nomem_d_alloc; 733 goto nomem_d_alloc;
734 goto lookup_error; 734 goto lookup_error;
735 } 735 }
736 736
737 _debug("subdir -> %p %s", 737 _debug("subdir -> %p %s",
738 subdir, subdir->d_inode ? "positive" : "negative"); 738 subdir, subdir->d_inode ? "positive" : "negative");
739 739
740 /* we need to create the subdir if it doesn't exist yet */ 740 /* we need to create the subdir if it doesn't exist yet */
741 if (!subdir->d_inode) { 741 if (!subdir->d_inode) {
742 ret = cachefiles_has_space(cache, 1, 0); 742 ret = cachefiles_has_space(cache, 1, 0);
743 if (ret < 0) 743 if (ret < 0)
744 goto mkdir_error; 744 goto mkdir_error;
745 745
746 _debug("attempt mkdir"); 746 _debug("attempt mkdir");
747 747
748 path.mnt = cache->mnt; 748 path.mnt = cache->mnt;
749 path.dentry = dir; 749 path.dentry = dir;
750 ret = security_path_mkdir(&path, subdir, 0700); 750 ret = security_path_mkdir(&path, subdir, 0700);
751 if (ret < 0) 751 if (ret < 0)
752 goto mkdir_error; 752 goto mkdir_error;
753 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 753 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
754 if (ret < 0) 754 if (ret < 0)
755 goto mkdir_error; 755 goto mkdir_error;
756 756
757 ASSERT(subdir->d_inode); 757 ASSERT(subdir->d_inode);
758 758
759 _debug("mkdir -> %p{%p{ino=%lu}}", 759 _debug("mkdir -> %p{%p{ino=%lu}}",
760 subdir, 760 subdir,
761 subdir->d_inode, 761 subdir->d_inode,
762 subdir->d_inode->i_ino); 762 subdir->d_inode->i_ino);
763 } 763 }
764 764
765 mutex_unlock(&dir->d_inode->i_mutex); 765 mutex_unlock(&dir->d_inode->i_mutex);
766 766
767 /* we need to make sure the subdir is a directory */ 767 /* we need to make sure the subdir is a directory */
768 ASSERT(subdir->d_inode); 768 ASSERT(subdir->d_inode);
769 769
770 if (!S_ISDIR(subdir->d_inode->i_mode)) { 770 if (!S_ISDIR(subdir->d_inode->i_mode)) {
771 pr_err("%s is not a directory", dirname); 771 pr_err("%s is not a directory\n", dirname);
772 ret = -EIO; 772 ret = -EIO;
773 goto check_error; 773 goto check_error;
774 } 774 }
775 775
776 ret = -EPERM; 776 ret = -EPERM;
777 if (!subdir->d_inode->i_op->setxattr || 777 if (!subdir->d_inode->i_op->setxattr ||
778 !subdir->d_inode->i_op->getxattr || 778 !subdir->d_inode->i_op->getxattr ||
779 !subdir->d_inode->i_op->lookup || 779 !subdir->d_inode->i_op->lookup ||
780 !subdir->d_inode->i_op->mkdir || 780 !subdir->d_inode->i_op->mkdir ||
781 !subdir->d_inode->i_op->create || 781 !subdir->d_inode->i_op->create ||
782 (!subdir->d_inode->i_op->rename && 782 (!subdir->d_inode->i_op->rename &&
783 !subdir->d_inode->i_op->rename2) || 783 !subdir->d_inode->i_op->rename2) ||
784 !subdir->d_inode->i_op->rmdir || 784 !subdir->d_inode->i_op->rmdir ||
785 !subdir->d_inode->i_op->unlink) 785 !subdir->d_inode->i_op->unlink)
786 goto check_error; 786 goto check_error;
787 787
788 _leave(" = [%lu]", subdir->d_inode->i_ino); 788 _leave(" = [%lu]", subdir->d_inode->i_ino);
789 return subdir; 789 return subdir;
790 790
791 check_error: 791 check_error:
792 dput(subdir); 792 dput(subdir);
793 _leave(" = %d [check]", ret); 793 _leave(" = %d [check]", ret);
794 return ERR_PTR(ret); 794 return ERR_PTR(ret);
795 795
796 mkdir_error: 796 mkdir_error:
797 mutex_unlock(&dir->d_inode->i_mutex); 797 mutex_unlock(&dir->d_inode->i_mutex);
798 dput(subdir); 798 dput(subdir);
799 pr_err("mkdir %s failed with error %d", dirname, ret); 799 pr_err("mkdir %s failed with error %d\n", dirname, ret);
800 return ERR_PTR(ret); 800 return ERR_PTR(ret);
801 801
802 lookup_error: 802 lookup_error:
803 mutex_unlock(&dir->d_inode->i_mutex); 803 mutex_unlock(&dir->d_inode->i_mutex);
804 ret = PTR_ERR(subdir); 804 ret = PTR_ERR(subdir);
805 pr_err("Lookup %s failed with error %d", dirname, ret); 805 pr_err("Lookup %s failed with error %d\n", dirname, ret);
806 return ERR_PTR(ret); 806 return ERR_PTR(ret);
807 807
808 nomem_d_alloc: 808 nomem_d_alloc:
809 mutex_unlock(&dir->d_inode->i_mutex); 809 mutex_unlock(&dir->d_inode->i_mutex);
810 _leave(" = -ENOMEM"); 810 _leave(" = -ENOMEM");
811 return ERR_PTR(-ENOMEM); 811 return ERR_PTR(-ENOMEM);
812 } 812 }
813 813
814 /* 814 /*
815 * find out if an object is in use or not 815 * find out if an object is in use or not
816 * - if finds object and it's not in use: 816 * - if finds object and it's not in use:
817 * - returns a pointer to the object and a reference on it 817 * - returns a pointer to the object and a reference on it
818 * - returns with the directory locked 818 * - returns with the directory locked
819 */ 819 */
820 static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, 820 static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
821 struct dentry *dir, 821 struct dentry *dir,
822 char *filename) 822 char *filename)
823 { 823 {
824 struct cachefiles_object *object; 824 struct cachefiles_object *object;
825 struct rb_node *_n; 825 struct rb_node *_n;
826 struct dentry *victim; 826 struct dentry *victim;
827 unsigned long start; 827 unsigned long start;
828 int ret; 828 int ret;
829 829
830 //_enter(",%*.*s/,%s", 830 //_enter(",%*.*s/,%s",
831 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 831 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
832 832
833 /* look up the victim */ 833 /* look up the victim */
834 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 834 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
835 835
836 start = jiffies; 836 start = jiffies;
837 victim = lookup_one_len(filename, dir, strlen(filename)); 837 victim = lookup_one_len(filename, dir, strlen(filename));
838 cachefiles_hist(cachefiles_lookup_histogram, start); 838 cachefiles_hist(cachefiles_lookup_histogram, start);
839 if (IS_ERR(victim)) 839 if (IS_ERR(victim))
840 goto lookup_error; 840 goto lookup_error;
841 841
842 //_debug("victim -> %p %s", 842 //_debug("victim -> %p %s",
843 // victim, victim->d_inode ? "positive" : "negative"); 843 // victim, victim->d_inode ? "positive" : "negative");
844 844
845 /* if the object is no longer there then we probably retired the object 845 /* if the object is no longer there then we probably retired the object
846 * at the netfs's request whilst the cull was in progress 846 * at the netfs's request whilst the cull was in progress
847 */ 847 */
848 if (!victim->d_inode) { 848 if (!victim->d_inode) {
849 mutex_unlock(&dir->d_inode->i_mutex); 849 mutex_unlock(&dir->d_inode->i_mutex);
850 dput(victim); 850 dput(victim);
851 _leave(" = -ENOENT [absent]"); 851 _leave(" = -ENOENT [absent]");
852 return ERR_PTR(-ENOENT); 852 return ERR_PTR(-ENOENT);
853 } 853 }
854 854
855 /* check to see if we're using this object */ 855 /* check to see if we're using this object */
856 read_lock(&cache->active_lock); 856 read_lock(&cache->active_lock);
857 857
858 _n = cache->active_nodes.rb_node; 858 _n = cache->active_nodes.rb_node;
859 859
860 while (_n) { 860 while (_n) {
861 object = rb_entry(_n, struct cachefiles_object, active_node); 861 object = rb_entry(_n, struct cachefiles_object, active_node);
862 862
863 if (object->dentry > victim) 863 if (object->dentry > victim)
864 _n = _n->rb_left; 864 _n = _n->rb_left;
865 else if (object->dentry < victim) 865 else if (object->dentry < victim)
866 _n = _n->rb_right; 866 _n = _n->rb_right;
867 else 867 else
868 goto object_in_use; 868 goto object_in_use;
869 } 869 }
870 870
871 read_unlock(&cache->active_lock); 871 read_unlock(&cache->active_lock);
872 872
873 //_leave(" = %p", victim); 873 //_leave(" = %p", victim);
874 return victim; 874 return victim;
875 875
876 object_in_use: 876 object_in_use:
877 read_unlock(&cache->active_lock); 877 read_unlock(&cache->active_lock);
878 mutex_unlock(&dir->d_inode->i_mutex); 878 mutex_unlock(&dir->d_inode->i_mutex);
879 dput(victim); 879 dput(victim);
880 //_leave(" = -EBUSY [in use]"); 880 //_leave(" = -EBUSY [in use]");
881 return ERR_PTR(-EBUSY); 881 return ERR_PTR(-EBUSY);
882 882
883 lookup_error: 883 lookup_error:
884 mutex_unlock(&dir->d_inode->i_mutex); 884 mutex_unlock(&dir->d_inode->i_mutex);
885 ret = PTR_ERR(victim); 885 ret = PTR_ERR(victim);
886 if (ret == -ENOENT) { 886 if (ret == -ENOENT) {
887 /* file or dir now absent - probably retired by netfs */ 887 /* file or dir now absent - probably retired by netfs */
888 _leave(" = -ESTALE [absent]"); 888 _leave(" = -ESTALE [absent]");
889 return ERR_PTR(-ESTALE); 889 return ERR_PTR(-ESTALE);
890 } 890 }
891 891
892 if (ret == -EIO) { 892 if (ret == -EIO) {
893 cachefiles_io_error(cache, "Lookup failed"); 893 cachefiles_io_error(cache, "Lookup failed");
894 } else if (ret != -ENOMEM) { 894 } else if (ret != -ENOMEM) {
895 pr_err("Internal error: %d", ret); 895 pr_err("Internal error: %d\n", ret);
896 ret = -EIO; 896 ret = -EIO;
897 } 897 }
898 898
899 _leave(" = %d", ret); 899 _leave(" = %d", ret);
900 return ERR_PTR(ret); 900 return ERR_PTR(ret);
901 } 901 }
902 902
903 /* 903 /*
904 * cull an object if it's not in use 904 * cull an object if it's not in use
905 * - called only by cache manager daemon 905 * - called only by cache manager daemon
906 */ 906 */
907 int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, 907 int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
908 char *filename) 908 char *filename)
909 { 909 {
910 struct dentry *victim; 910 struct dentry *victim;
911 int ret; 911 int ret;
912 912
913 _enter(",%*.*s/,%s", 913 _enter(",%*.*s/,%s",
914 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 914 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
915 915
916 victim = cachefiles_check_active(cache, dir, filename); 916 victim = cachefiles_check_active(cache, dir, filename);
917 if (IS_ERR(victim)) 917 if (IS_ERR(victim))
918 return PTR_ERR(victim); 918 return PTR_ERR(victim);
919 919
920 _debug("victim -> %p %s", 920 _debug("victim -> %p %s",
921 victim, victim->d_inode ? "positive" : "negative"); 921 victim, victim->d_inode ? "positive" : "negative");
922 922
923 /* okay... the victim is not being used so we can cull it 923 /* okay... the victim is not being used so we can cull it
924 * - start by marking it as stale 924 * - start by marking it as stale
925 */ 925 */
926 _debug("victim is cullable"); 926 _debug("victim is cullable");
927 927
928 ret = cachefiles_remove_object_xattr(cache, victim); 928 ret = cachefiles_remove_object_xattr(cache, victim);
929 if (ret < 0) 929 if (ret < 0)
930 goto error_unlock; 930 goto error_unlock;
931 931
932 /* actually remove the victim (drops the dir mutex) */ 932 /* actually remove the victim (drops the dir mutex) */
933 _debug("bury"); 933 _debug("bury");
934 934
935 ret = cachefiles_bury_object(cache, dir, victim, false); 935 ret = cachefiles_bury_object(cache, dir, victim, false);
936 if (ret < 0) 936 if (ret < 0)
937 goto error; 937 goto error;
938 938
939 dput(victim); 939 dput(victim);
940 _leave(" = 0"); 940 _leave(" = 0");
941 return 0; 941 return 0;
942 942
943 error_unlock: 943 error_unlock:
944 mutex_unlock(&dir->d_inode->i_mutex); 944 mutex_unlock(&dir->d_inode->i_mutex);
945 error: 945 error:
946 dput(victim); 946 dput(victim);
947 if (ret == -ENOENT) { 947 if (ret == -ENOENT) {
948 /* file or dir now absent - probably retired by netfs */ 948 /* file or dir now absent - probably retired by netfs */
949 _leave(" = -ESTALE [absent]"); 949 _leave(" = -ESTALE [absent]");
950 return -ESTALE; 950 return -ESTALE;
951 } 951 }
952 952
953 if (ret != -ENOMEM) { 953 if (ret != -ENOMEM) {
954 pr_err("Internal error: %d", ret); 954 pr_err("Internal error: %d\n", ret);
955 ret = -EIO; 955 ret = -EIO;
956 } 956 }
957 957
958 _leave(" = %d", ret); 958 _leave(" = %d", ret);
959 return ret; 959 return ret;
960 } 960 }
961 961
962 /* 962 /*
963 * find out if an object is in use or not 963 * find out if an object is in use or not
964 * - called only by cache manager daemon 964 * - called only by cache manager daemon
965 * - returns -EBUSY or 0 to indicate whether an object is in use or not 965 * - returns -EBUSY or 0 to indicate whether an object is in use or not
966 */ 966 */
967 int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, 967 int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
968 char *filename) 968 char *filename)
969 { 969 {
970 struct dentry *victim; 970 struct dentry *victim;
971 971
972 //_enter(",%*.*s/,%s", 972 //_enter(",%*.*s/,%s",
973 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 973 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
974 974
975 victim = cachefiles_check_active(cache, dir, filename); 975 victim = cachefiles_check_active(cache, dir, filename);
976 if (IS_ERR(victim)) 976 if (IS_ERR(victim))
977 return PTR_ERR(victim); 977 return PTR_ERR(victim);
978 978
979 mutex_unlock(&dir->d_inode->i_mutex); 979 mutex_unlock(&dir->d_inode->i_mutex);
980 dput(victim); 980 dput(victim);
981 //_leave(" = 0"); 981 //_leave(" = 0");
982 return 0; 982 return 0;
983 } 983 }
984 984
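Nearly every cachefiles hunk in this commit only appends a "\n" to a pr_err() format string. printk() treats an unterminated message as a candidate for continuation, so a missing newline can leave the next, unrelated message glued onto the same log line. A minimal, hypothetical sketch of the pattern being fixed (the function name and module context are illustrative, not part of the patch):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

/* Illustrative sketch only: contrasts the pre- and post-patch calls. */
static void log_example(unsigned long ino)
{
	/* before: no terminator, so a later printk may be appended here */
	pr_err("inode %lu is not a directory", ino);

	/* after: explicitly terminated, as in the conversions above */
	pr_err("inode %lu is not a directory\n", ino);
}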
fs/cachefiles/xattr.c
1 /* CacheFiles extended attribute management 1 /* CacheFiles extended attribute management
2 * 2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/file.h> 14 #include <linux/file.h>
15 #include <linux/fs.h> 15 #include <linux/fs.h>
16 #include <linux/fsnotify.h> 16 #include <linux/fsnotify.h>
17 #include <linux/quotaops.h> 17 #include <linux/quotaops.h>
18 #include <linux/xattr.h> 18 #include <linux/xattr.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include "internal.h" 20 #include "internal.h"
21 21
22 static const char cachefiles_xattr_cache[] = 22 static const char cachefiles_xattr_cache[] =
23 XATTR_USER_PREFIX "CacheFiles.cache"; 23 XATTR_USER_PREFIX "CacheFiles.cache";
24 24
25 /* 25 /*
26 * check the type label on an object 26 * check the type label on an object
27 * - done using xattrs 27 * - done using xattrs
28 */ 28 */
29 int cachefiles_check_object_type(struct cachefiles_object *object) 29 int cachefiles_check_object_type(struct cachefiles_object *object)
30 { 30 {
31 struct dentry *dentry = object->dentry; 31 struct dentry *dentry = object->dentry;
32 char type[3], xtype[3]; 32 char type[3], xtype[3];
33 int ret; 33 int ret;
34 34
35 ASSERT(dentry); 35 ASSERT(dentry);
36 ASSERT(dentry->d_inode); 36 ASSERT(dentry->d_inode);
37 37
38 if (!object->fscache.cookie) 38 if (!object->fscache.cookie)
39 strcpy(type, "C3"); 39 strcpy(type, "C3");
40 else 40 else
41 snprintf(type, 3, "%02x", object->fscache.cookie->def->type); 41 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
42 42
43 _enter("%p{%s}", object, type); 43 _enter("%p{%s}", object, type);
44 44
45 /* attempt to install a type label directly */ 45 /* attempt to install a type label directly */
46 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, 46 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
47 XATTR_CREATE); 47 XATTR_CREATE);
48 if (ret == 0) { 48 if (ret == 0) {
49 _debug("SET"); /* we succeeded */ 49 _debug("SET"); /* we succeeded */
50 goto error; 50 goto error;
51 } 51 }
52 52
53 if (ret != -EEXIST) { 53 if (ret != -EEXIST) {
54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)", 54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
55 dentry->d_name.len, dentry->d_name.len, 55 dentry->d_name.len, dentry->d_name.len,
56 dentry->d_name.name, dentry->d_inode->i_ino, 56 dentry->d_name.name, dentry->d_inode->i_ino,
57 -ret); 57 -ret);
58 goto error; 58 goto error;
59 } 59 }
60 60
61 /* read the current type label */ 61 /* read the current type label */
62 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); 62 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
63 if (ret < 0) { 63 if (ret < 0) {
64 if (ret == -ERANGE) 64 if (ret == -ERANGE)
65 goto bad_type_length; 65 goto bad_type_length;
66 66
67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)", 67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
68 dentry->d_name.len, dentry->d_name.len, 68 dentry->d_name.len, dentry->d_name.len,
69 dentry->d_name.name, dentry->d_inode->i_ino, 69 dentry->d_name.name, dentry->d_inode->i_ino,
70 -ret); 70 -ret);
71 goto error; 71 goto error;
72 } 72 }
73 73
74 /* check the type is what we're expecting */ 74 /* check the type is what we're expecting */
75 if (ret != 2) 75 if (ret != 2)
76 goto bad_type_length; 76 goto bad_type_length;
77 77
78 if (xtype[0] != type[0] || xtype[1] != type[1]) 78 if (xtype[0] != type[0] || xtype[1] != type[1])
79 goto bad_type; 79 goto bad_type;
80 80
81 ret = 0; 81 ret = 0;
82 82
83 error: 83 error:
84 _leave(" = %d", ret); 84 _leave(" = %d", ret);
85 return ret; 85 return ret;
86 86
87 bad_type_length: 87 bad_type_length:
88 pr_err("Cache object %lu type xattr length incorrect", 88 pr_err("Cache object %lu type xattr length incorrect\n",
89 dentry->d_inode->i_ino); 89 dentry->d_inode->i_ino);
90 ret = -EIO; 90 ret = -EIO;
91 goto error; 91 goto error;
92 92
93 bad_type: 93 bad_type:
94 xtype[2] = 0; 94 xtype[2] = 0;
95 pr_err("Cache object %*.*s [%lu] type %s not %s", 95 pr_err("Cache object %*.*s [%lu] type %s not %s\n",
96 dentry->d_name.len, dentry->d_name.len, 96 dentry->d_name.len, dentry->d_name.len,
97 dentry->d_name.name, dentry->d_inode->i_ino, 97 dentry->d_name.name, dentry->d_inode->i_ino,
98 xtype, type); 98 xtype, type);
99 ret = -EIO; 99 ret = -EIO;
100 goto error; 100 goto error;
101 } 101 }
102 102
103 /* 103 /*
104 * set the state xattr on a cache file 104 * set the state xattr on a cache file
105 */ 105 */
106 int cachefiles_set_object_xattr(struct cachefiles_object *object, 106 int cachefiles_set_object_xattr(struct cachefiles_object *object,
107 struct cachefiles_xattr *auxdata) 107 struct cachefiles_xattr *auxdata)
108 { 108 {
109 struct dentry *dentry = object->dentry; 109 struct dentry *dentry = object->dentry;
110 int ret; 110 int ret;
111 111
112 ASSERT(dentry); 112 ASSERT(dentry);
113 113
114 _enter("%p,#%d", object, auxdata->len); 114 _enter("%p,#%d", object, auxdata->len);
115 115
116 /* attempt to install the cache metadata directly */ 116 /* attempt to install the cache metadata directly */
117 _debug("SET #%u", auxdata->len); 117 _debug("SET #%u", auxdata->len);
118 118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len, 120 &auxdata->type, auxdata->len,
121 XATTR_CREATE); 121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM) 122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj( 123 cachefiles_io_error_obj(
124 object, 124 object,
125 "Failed to set xattr with error %d", ret); 125 "Failed to set xattr with error %d", ret);
126 126
127 _leave(" = %d", ret); 127 _leave(" = %d", ret);
128 return ret; 128 return ret;
129 } 129 }
130 130
131 /* 131 /*
132 * update the state xattr on a cache file 132 * update the state xattr on a cache file
133 */ 133 */
134 int cachefiles_update_object_xattr(struct cachefiles_object *object, 134 int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata) 135 struct cachefiles_xattr *auxdata)
136 { 136 {
137 struct dentry *dentry = object->dentry; 137 struct dentry *dentry = object->dentry;
138 int ret; 138 int ret;
139 139
140 ASSERT(dentry); 140 ASSERT(dentry);
141 141
142 _enter("%p,#%d", object, auxdata->len); 142 _enter("%p,#%d", object, auxdata->len);
143 143
144 /* attempt to install the cache metadata directly */ 144 /* attempt to install the cache metadata directly */
145 _debug("SET #%u", auxdata->len); 145 _debug("SET #%u", auxdata->len);
146 146
147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
148 &auxdata->type, auxdata->len, 148 &auxdata->type, auxdata->len,
149 XATTR_REPLACE); 149 XATTR_REPLACE);
150 if (ret < 0 && ret != -ENOMEM) 150 if (ret < 0 && ret != -ENOMEM)
151 cachefiles_io_error_obj( 151 cachefiles_io_error_obj(
152 object, 152 object,
153 "Failed to update xattr with error %d", ret); 153 "Failed to update xattr with error %d", ret);
154 154
155 _leave(" = %d", ret); 155 _leave(" = %d", ret);
156 return ret; 156 return ret;
157 } 157 }
158 158
159 /* 159 /*
160 * check the consistency between the backing cache and the FS-Cache cookie 160 * check the consistency between the backing cache and the FS-Cache cookie
161 */ 161 */
162 int cachefiles_check_auxdata(struct cachefiles_object *object) 162 int cachefiles_check_auxdata(struct cachefiles_object *object)
163 { 163 {
164 struct cachefiles_xattr *auxbuf; 164 struct cachefiles_xattr *auxbuf;
165 enum fscache_checkaux validity; 165 enum fscache_checkaux validity;
166 struct dentry *dentry = object->dentry; 166 struct dentry *dentry = object->dentry;
167 ssize_t xlen; 167 ssize_t xlen;
168 int ret; 168 int ret;
169 169
170 ASSERT(dentry); 170 ASSERT(dentry);
171 ASSERT(dentry->d_inode); 171 ASSERT(dentry->d_inode);
172 ASSERT(object->fscache.cookie->def->check_aux); 172 ASSERT(object->fscache.cookie->def->check_aux);
173 173
174 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); 174 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
175 if (!auxbuf) 175 if (!auxbuf)
176 return -ENOMEM; 176 return -ENOMEM;
177 177
178 xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, 178 xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
179 &auxbuf->type, 512 + 1); 179 &auxbuf->type, 512 + 1);
180 ret = -ESTALE; 180 ret = -ESTALE;
181 if (xlen < 1 || 181 if (xlen < 1 ||
182 auxbuf->type != object->fscache.cookie->def->type) 182 auxbuf->type != object->fscache.cookie->def->type)
183 goto error; 183 goto error;
184 184
185 xlen--; 185 xlen--;
186 validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); 186 validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
187 if (validity != FSCACHE_CHECKAUX_OKAY) 187 if (validity != FSCACHE_CHECKAUX_OKAY)
188 goto error; 188 goto error;
189 189
190 ret = 0; 190 ret = 0;
191 error: 191 error:
192 kfree(auxbuf); 192 kfree(auxbuf);
193 return ret; 193 return ret;
194 } 194 }
195 195
196 /* 196 /*
197 * check the state xattr on a cache file 197 * check the state xattr on a cache file
198 * - return -ESTALE if the object should be deleted 198 * - return -ESTALE if the object should be deleted
199 */ 199 */
200 int cachefiles_check_object_xattr(struct cachefiles_object *object, 200 int cachefiles_check_object_xattr(struct cachefiles_object *object,
201 struct cachefiles_xattr *auxdata) 201 struct cachefiles_xattr *auxdata)
202 { 202 {
203 struct cachefiles_xattr *auxbuf; 203 struct cachefiles_xattr *auxbuf;
204 struct dentry *dentry = object->dentry; 204 struct dentry *dentry = object->dentry;
205 int ret; 205 int ret;
206 206
207 _enter("%p,#%d", object, auxdata->len); 207 _enter("%p,#%d", object, auxdata->len);
208 208
209 ASSERT(dentry); 209 ASSERT(dentry);
210 ASSERT(dentry->d_inode); 210 ASSERT(dentry->d_inode);
211 211
212 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); 212 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
213 if (!auxbuf) { 213 if (!auxbuf) {
214 _leave(" = -ENOMEM"); 214 _leave(" = -ENOMEM");
215 return -ENOMEM; 215 return -ENOMEM;
216 } 216 }
217 217
218 /* read the current type label */ 218 /* read the current type label */
219 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, 219 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
220 &auxbuf->type, 512 + 1); 220 &auxbuf->type, 512 + 1);
221 if (ret < 0) { 221 if (ret < 0) {
222 if (ret == -ENODATA) 222 if (ret == -ENODATA)
223 goto stale; /* no attribute - power went off 223 goto stale; /* no attribute - power went off
224 * mid-cull? */ 224 * mid-cull? */
225 225
226 if (ret == -ERANGE) 226 if (ret == -ERANGE)
227 goto bad_type_length; 227 goto bad_type_length;
228 228
229 cachefiles_io_error_obj(object, 229 cachefiles_io_error_obj(object,
230 "Can't read xattr on %lu (err %d)", 230 "Can't read xattr on %lu (err %d)",
231 dentry->d_inode->i_ino, -ret); 231 dentry->d_inode->i_ino, -ret);
232 goto error; 232 goto error;
233 } 233 }
234 234
235 /* check the on-disk object */ 235 /* check the on-disk object */
236 if (ret < 1) 236 if (ret < 1)
237 goto bad_type_length; 237 goto bad_type_length;
238 238
239 if (auxbuf->type != auxdata->type) 239 if (auxbuf->type != auxdata->type)
240 goto stale; 240 goto stale;
241 241
242 auxbuf->len = ret; 242 auxbuf->len = ret;
243 243
244 /* consult the netfs */ 244 /* consult the netfs */
245 if (object->fscache.cookie->def->check_aux) { 245 if (object->fscache.cookie->def->check_aux) {
246 enum fscache_checkaux result; 246 enum fscache_checkaux result;
247 unsigned int dlen; 247 unsigned int dlen;
248 248
249 dlen = auxbuf->len - 1; 249 dlen = auxbuf->len - 1;
250 250
251 _debug("checkaux %s #%u", 251 _debug("checkaux %s #%u",
252 object->fscache.cookie->def->name, dlen); 252 object->fscache.cookie->def->name, dlen);
253 253
254 result = fscache_check_aux(&object->fscache, 254 result = fscache_check_aux(&object->fscache,
255 &auxbuf->data, dlen); 255 &auxbuf->data, dlen);
256 256
257 switch (result) { 257 switch (result) {
258 /* entry okay as is */ 258 /* entry okay as is */
259 case FSCACHE_CHECKAUX_OKAY: 259 case FSCACHE_CHECKAUX_OKAY:
260 goto okay; 260 goto okay;
261 261
262 /* entry requires update */ 262 /* entry requires update */
263 case FSCACHE_CHECKAUX_NEEDS_UPDATE: 263 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
264 break; 264 break;
265 265
266 /* entry requires deletion */ 266 /* entry requires deletion */
267 case FSCACHE_CHECKAUX_OBSOLETE: 267 case FSCACHE_CHECKAUX_OBSOLETE:
268 goto stale; 268 goto stale;
269 269
270 default: 270 default:
271 BUG(); 271 BUG();
272 } 272 }
273 273
274 /* update the current label */ 274 /* update the current label */
275 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 275 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
276 &auxdata->type, auxdata->len, 276 &auxdata->type, auxdata->len,
277 XATTR_REPLACE); 277 XATTR_REPLACE);
278 if (ret < 0) { 278 if (ret < 0) {
279 cachefiles_io_error_obj(object, 279 cachefiles_io_error_obj(object,
280 "Can't update xattr on %lu" 280 "Can't update xattr on %lu"
281 " (error %d)", 281 " (error %d)",
282 dentry->d_inode->i_ino, -ret); 282 dentry->d_inode->i_ino, -ret);
283 goto error; 283 goto error;
284 } 284 }
285 } 285 }
286 286
287 okay: 287 okay:
288 ret = 0; 288 ret = 0;
289 289
290 error: 290 error:
291 kfree(auxbuf); 291 kfree(auxbuf);
292 _leave(" = %d", ret); 292 _leave(" = %d", ret);
293 return ret; 293 return ret;
294 294
295 bad_type_length: 295 bad_type_length:
296 pr_err("Cache object %lu xattr length incorrect", 296 pr_err("Cache object %lu xattr length incorrect\n",
297 dentry->d_inode->i_ino); 297 dentry->d_inode->i_ino);
298 ret = -EIO; 298 ret = -EIO;
299 goto error; 299 goto error;
300 300
301 stale: 301 stale:
302 ret = -ESTALE; 302 ret = -ESTALE;
303 goto error; 303 goto error;
304 } 304 }
305 305
306 /* 306 /*
307 * remove the object's xattr to mark it stale 307 * remove the object's xattr to mark it stale
308 */ 308 */
309 int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, 309 int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
310 struct dentry *dentry) 310 struct dentry *dentry)
311 { 311 {
312 int ret; 312 int ret;
313 313
314 ret = vfs_removexattr(dentry, cachefiles_xattr_cache); 314 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
315 if (ret < 0) { 315 if (ret < 0) {
316 if (ret == -ENOENT || ret == -ENODATA) 316 if (ret == -ENOENT || ret == -ENODATA)
317 ret = 0; 317 ret = 0;
318 else if (ret != -ENOMEM) 318 else if (ret != -ENOMEM)
319 cachefiles_io_error(cache, 319 cachefiles_io_error(cache,
320 "Can't remove xattr from %lu" 320 "Can't remove xattr from %lu"
321 " (error %d)", 321 " (error %d)",
322 dentry->d_inode->i_ino, -ret); 322 dentry->d_inode->i_ino, -ret);
323 } 323 }
324 324
325 _leave(" = %d", ret); 325 _leave(" = %d", ret);
326 return ret; 326 return ret;
327 } 327 }
328 328
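The helpers above read the whole "user.CacheFiles.cache" value into a buffer starting at &auxbuf->type with a 512 + 1 byte limit, then treat byte 0 as the cookie type and the rest as netfs auxiliary data (hence dlen = len - 1); only the type byte plus the data are stored on disk, while len is in-memory bookkeeping. A hedged sketch of the implied layout, using an illustrative struct name rather than the kernel's own cachefiles_xattr definition:

#include <linux/types.h>

/* Illustrative layout only; the real type is struct cachefiles_xattr. */
struct cachefiles_xattr_layout {
	uint16_t	len;	/* in-memory only: bytes read/written (type + data) */
	uint8_t		type;	/* FS-Cache cookie type label */
	uint8_t		data[];	/* netfs-supplied auxiliary data */
};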
fs/nilfs2/inode.c
1 /* 1 /*
2 * inode.c - NILFS inode operations. 2 * inode.c - NILFS inode operations.
3 * 3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net> 20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/buffer_head.h> 24 #include <linux/buffer_head.h>
25 #include <linux/gfp.h> 25 #include <linux/gfp.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/pagemap.h>
27 #include <linux/writeback.h> 28 #include <linux/writeback.h>
28 #include <linux/aio.h> 29 #include <linux/aio.h>
29 #include "nilfs.h" 30 #include "nilfs.h"
30 #include "btnode.h" 31 #include "btnode.h"
31 #include "segment.h" 32 #include "segment.h"
32 #include "page.h" 33 #include "page.h"
33 #include "mdt.h" 34 #include "mdt.h"
34 #include "cpfile.h" 35 #include "cpfile.h"
35 #include "ifile.h" 36 #include "ifile.h"
36 37
37 /** 38 /**
38 * struct nilfs_iget_args - arguments used during comparison between inodes 39 * struct nilfs_iget_args - arguments used during comparison between inodes
39 * @ino: inode number 40 * @ino: inode number
40 * @cno: checkpoint number 41 * @cno: checkpoint number
41 * @root: pointer on NILFS root object (mounted checkpoint) 42 * @root: pointer on NILFS root object (mounted checkpoint)
42 * @for_gc: inode for GC flag 43 * @for_gc: inode for GC flag
43 */ 44 */
44 struct nilfs_iget_args { 45 struct nilfs_iget_args {
45 u64 ino; 46 u64 ino;
46 __u64 cno; 47 __u64 cno;
47 struct nilfs_root *root; 48 struct nilfs_root *root;
48 int for_gc; 49 int for_gc;
49 }; 50 };
50 51
51 void nilfs_inode_add_blocks(struct inode *inode, int n) 52 void nilfs_inode_add_blocks(struct inode *inode, int n)
52 { 53 {
53 struct nilfs_root *root = NILFS_I(inode)->i_root; 54 struct nilfs_root *root = NILFS_I(inode)->i_root;
54 55
55 inode_add_bytes(inode, (1 << inode->i_blkbits) * n); 56 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
56 if (root) 57 if (root)
57 atomic64_add(n, &root->blocks_count); 58 atomic64_add(n, &root->blocks_count);
58 } 59 }
59 60
60 void nilfs_inode_sub_blocks(struct inode *inode, int n) 61 void nilfs_inode_sub_blocks(struct inode *inode, int n)
61 { 62 {
62 struct nilfs_root *root = NILFS_I(inode)->i_root; 63 struct nilfs_root *root = NILFS_I(inode)->i_root;
63 64
64 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); 65 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
65 if (root) 66 if (root)
66 atomic64_sub(n, &root->blocks_count); 67 atomic64_sub(n, &root->blocks_count);
67 } 68 }
68 69
69 /** 70 /**
70 * nilfs_get_block() - get a file block on the filesystem (callback function) 71 * nilfs_get_block() - get a file block on the filesystem (callback function)
71 * @inode - inode struct of the target file 72 * @inode - inode struct of the target file
72 * @blkoff - file block number 73 * @blkoff - file block number
73 * @bh_result - buffer head to be mapped on 74 * @bh_result - buffer head to be mapped on
74 * @create - indicate whether allocating the block or not when it has not 75 * @create - indicate whether allocating the block or not when it has not
75 * been allocated yet. 76 * been allocated yet.
76 * 77 *
77 * This function does not issue actual read request of the specified data 78 * This function does not issue actual read request of the specified data
78 * block. It is done by VFS. 79 * block. It is done by VFS.
79 */ 80 */
80 int nilfs_get_block(struct inode *inode, sector_t blkoff, 81 int nilfs_get_block(struct inode *inode, sector_t blkoff,
81 struct buffer_head *bh_result, int create) 82 struct buffer_head *bh_result, int create)
82 { 83 {
83 struct nilfs_inode_info *ii = NILFS_I(inode); 84 struct nilfs_inode_info *ii = NILFS_I(inode);
84 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 85 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
85 __u64 blknum = 0; 86 __u64 blknum = 0;
86 int err = 0, ret; 87 int err = 0, ret;
87 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 88 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
88 89
89 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 90 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
90 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); 91 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
91 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 92 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
92 if (ret >= 0) { /* found */ 93 if (ret >= 0) { /* found */
93 map_bh(bh_result, inode->i_sb, blknum); 94 map_bh(bh_result, inode->i_sb, blknum);
94 if (ret > 0) 95 if (ret > 0)
95 bh_result->b_size = (ret << inode->i_blkbits); 96 bh_result->b_size = (ret << inode->i_blkbits);
96 goto out; 97 goto out;
97 } 98 }
98 /* data block was not found */ 99 /* data block was not found */
99 if (ret == -ENOENT && create) { 100 if (ret == -ENOENT && create) {
100 struct nilfs_transaction_info ti; 101 struct nilfs_transaction_info ti;
101 102
102 bh_result->b_blocknr = 0; 103 bh_result->b_blocknr = 0;
103 err = nilfs_transaction_begin(inode->i_sb, &ti, 1); 104 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
104 if (unlikely(err)) 105 if (unlikely(err))
105 goto out; 106 goto out;
106 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, 107 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
107 (unsigned long)bh_result); 108 (unsigned long)bh_result);
108 if (unlikely(err != 0)) { 109 if (unlikely(err != 0)) {
109 if (err == -EEXIST) { 110 if (err == -EEXIST) {
110 /* 111 /*
111 * The get_block() function could be called 112 * The get_block() function could be called
112 * from multiple callers for an inode. 113 * from multiple callers for an inode.
113 * However, the page having this block must 114 * However, the page having this block must
114 * be locked in this case. 115 * be locked in this case.
115 */ 116 */
116 printk(KERN_WARNING 117 printk(KERN_WARNING
117 "nilfs_get_block: a race condition " 118 "nilfs_get_block: a race condition "
118 "while inserting a data block. " 119 "while inserting a data block. "
119 "(inode number=%lu, file block " 120 "(inode number=%lu, file block "
120 "offset=%llu)\n", 121 "offset=%llu)\n",
121 inode->i_ino, 122 inode->i_ino,
122 (unsigned long long)blkoff); 123 (unsigned long long)blkoff);
123 err = 0; 124 err = 0;
124 } 125 }
125 nilfs_transaction_abort(inode->i_sb); 126 nilfs_transaction_abort(inode->i_sb);
126 goto out; 127 goto out;
127 } 128 }
128 nilfs_mark_inode_dirty(inode); 129 nilfs_mark_inode_dirty(inode);
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 130 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 /* Error handling should be detailed */ 131 /* Error handling should be detailed */
131 set_buffer_new(bh_result); 132 set_buffer_new(bh_result);
132 set_buffer_delay(bh_result); 133 set_buffer_delay(bh_result);
133 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 134 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
134 to proper value */ 135 to proper value */
135 } else if (ret == -ENOENT) { 136 } else if (ret == -ENOENT) {
136 /* not found is not error (e.g. hole); must return without 137 /* not found is not error (e.g. hole); must return without
137 the mapped state flag. */ 138 the mapped state flag. */
138 ; 139 ;
139 } else { 140 } else {
140 err = ret; 141 err = ret;
141 } 142 }
142 143
143 out: 144 out:
144 return err; 145 return err;
145 } 146 }
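nilfs_get_block() follows the usual get_block_t contract: on entry, bh_result->b_size says how many bytes the caller would like mapped, which bounds the bmap lookup (maxblocks); when several contiguous blocks are found, b_size is shrunk back to what was actually mapped, and a freshly allocated block is flagged new and delayed because its disk address is not final until the segment constructor writes it. A minimal standalone sketch of that size arithmetic, assuming an illustrative 4 KiB block size and a 64 KiB request (neither value comes from this diff):

#include <stdio.h>

/* Illustrative geometry only: 4 KiB filesystem blocks and a 64 KiB request,
 * such as the mpage read path might pass in through bh_result->b_size. */
#define BLKBITS 12

int main(void)
{
	unsigned long b_size = 64 * 1024;           /* caller's requested mapping size */
	unsigned maxblocks = b_size >> BLKBITS;     /* lookup bounded to 16 blocks */
	int ret = 5;                                /* pretend the bmap found 5 contiguous blocks */

	if (ret > 0)
		b_size = (unsigned long)ret << BLKBITS; /* trimmed to the 20 KiB actually mapped */

	printf("maxblocks=%u, mapped b_size=%lu bytes\n", maxblocks, b_size);
	return 0;
}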
146 147
147 /** 148 /**
148 * nilfs_readpage() - implement readpage() method of nilfs_aops {} 149 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
149 * address_space_operations. 150 * address_space_operations.
150 * @file - file struct of the file to be read 151 * @file - file struct of the file to be read
151 * @page - the page to be read 152 * @page - the page to be read
152 */ 153 */
153 static int nilfs_readpage(struct file *file, struct page *page) 154 static int nilfs_readpage(struct file *file, struct page *page)
154 { 155 {
155 return mpage_readpage(page, nilfs_get_block); 156 return mpage_readpage(page, nilfs_get_block);
156 } 157 }
157 158
158 /** 159 /**
159 * nilfs_readpages() - implement readpages() method of nilfs_aops {} 160 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
160 * address_space_operations. 161 * address_space_operations.
161 * @file - file struct of the file to be read 162 * @file - file struct of the file to be read
162 * @mapping - address_space struct used for reading multiple pages 163 * @mapping - address_space struct used for reading multiple pages
163 * @pages - the pages to be read 164 * @pages - the pages to be read
164 * @nr_pages - number of pages to be read 165 * @nr_pages - number of pages to be read
165 */ 166 */
166 static int nilfs_readpages(struct file *file, struct address_space *mapping, 167 static int nilfs_readpages(struct file *file, struct address_space *mapping,
167 struct list_head *pages, unsigned nr_pages) 168 struct list_head *pages, unsigned nr_pages)
168 { 169 {
169 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); 170 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
170 } 171 }
171 172
172 static int nilfs_writepages(struct address_space *mapping, 173 static int nilfs_writepages(struct address_space *mapping,
173 struct writeback_control *wbc) 174 struct writeback_control *wbc)
174 { 175 {
175 struct inode *inode = mapping->host; 176 struct inode *inode = mapping->host;
176 int err = 0; 177 int err = 0;
177 178
178 if (inode->i_sb->s_flags & MS_RDONLY) { 179 if (inode->i_sb->s_flags & MS_RDONLY) {
179 nilfs_clear_dirty_pages(mapping, false); 180 nilfs_clear_dirty_pages(mapping, false);
180 return -EROFS; 181 return -EROFS;
181 } 182 }
182 183
183 if (wbc->sync_mode == WB_SYNC_ALL) 184 if (wbc->sync_mode == WB_SYNC_ALL)
184 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 185 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
185 wbc->range_start, 186 wbc->range_start,
186 wbc->range_end); 187 wbc->range_end);
187 return err; 188 return err;
188 } 189 }
189 190
190 static int nilfs_writepage(struct page *page, struct writeback_control *wbc) 191 static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
191 { 192 {
192 struct inode *inode = page->mapping->host; 193 struct inode *inode = page->mapping->host;
193 int err; 194 int err;
194 195
195 if (inode->i_sb->s_flags & MS_RDONLY) { 196 if (inode->i_sb->s_flags & MS_RDONLY) {
196 /* 197 /*
197 * The filesystem was remounted read-only because of an 198 * The filesystem was remounted read-only because of an
198 * error or metadata corruption, but dirty pages are still 199 * error or metadata corruption, but dirty pages are still
199 * being flushed in the background, so simply discard 200 * being flushed in the background, so simply discard
200 * this dirty page here. 201 * this dirty page here.
201 */ 202 */
202 nilfs_clear_dirty_page(page, false); 203 nilfs_clear_dirty_page(page, false);
203 unlock_page(page); 204 unlock_page(page);
204 return -EROFS; 205 return -EROFS;
205 } 206 }
206 207
207 redirty_page_for_writepage(wbc, page); 208 redirty_page_for_writepage(wbc, page);
208 unlock_page(page); 209 unlock_page(page);
209 210
210 if (wbc->sync_mode == WB_SYNC_ALL) { 211 if (wbc->sync_mode == WB_SYNC_ALL) {
211 err = nilfs_construct_segment(inode->i_sb); 212 err = nilfs_construct_segment(inode->i_sb);
212 if (unlikely(err)) 213 if (unlikely(err))
213 return err; 214 return err;
214 } else if (wbc->for_reclaim) 215 } else if (wbc->for_reclaim)
215 nilfs_flush_segment(inode->i_sb, inode->i_ino); 216 nilfs_flush_segment(inode->i_sb, inode->i_ino);
216 217
217 return 0; 218 return 0;
218 } 219 }
219 220
220 static int nilfs_set_page_dirty(struct page *page) 221 static int nilfs_set_page_dirty(struct page *page)
221 { 222 {
223 struct inode *inode = page->mapping->host;
222 int ret = __set_page_dirty_nobuffers(page); 224 int ret = __set_page_dirty_nobuffers(page);
223 225
224 if (page_has_buffers(page)) { 226 if (page_has_buffers(page)) {
225 struct inode *inode = page->mapping->host;
226 unsigned nr_dirty = 0; 227 unsigned nr_dirty = 0;
227 struct buffer_head *bh, *head; 228 struct buffer_head *bh, *head;
228 229
229 /* 230 /*
230 * This page is locked by callers, and no other thread 231 * This page is locked by callers, and no other thread
231 * concurrently marks its buffers dirty since they are 232 * concurrently marks its buffers dirty since they are
232 * only dirtied through routines in fs/buffer.c in 233 * only dirtied through routines in fs/buffer.c in
233 * which call sites of mark_buffer_dirty are protected 234 * which call sites of mark_buffer_dirty are protected
234 * by page lock. 235 * by page lock.
235 */ 236 */
236 bh = head = page_buffers(page); 237 bh = head = page_buffers(page);
237 do { 238 do {
238 /* Do not mark hole blocks dirty */ 239 /* Do not mark hole blocks dirty */
239 if (buffer_dirty(bh) || !buffer_mapped(bh)) 240 if (buffer_dirty(bh) || !buffer_mapped(bh))
240 continue; 241 continue;
241 242
242 set_buffer_dirty(bh); 243 set_buffer_dirty(bh);
243 nr_dirty++; 244 nr_dirty++;
244 } while (bh = bh->b_this_page, bh != head); 245 } while (bh = bh->b_this_page, bh != head);
245 246
246 if (nr_dirty) 247 if (nr_dirty)
247 nilfs_set_file_dirty(inode, nr_dirty); 248 nilfs_set_file_dirty(inode, nr_dirty);
249 } else if (ret) {
250 unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
251
252 nilfs_set_file_dirty(inode, nr_dirty);
248 } 253 }
249 return ret; 254 return ret;
250 } 255 }
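The new else-if branch above handles pages dirtied through the mmap write-fault path, which may have no buffer heads attached: previously nothing called nilfs_set_file_dirty() for such a page, the inode was never queued on the dirty-files list, and the segment constructor could skip the page, losing the written data. With the change, when __set_page_dirty_nobuffers() reports that the page was newly dirtied and the page has no buffers, a full page worth of blocks is accounted. A trivial standalone check of that per-page block count, assuming illustrative 4 KiB page-cache pages and 1 KiB filesystem blocks (the constants below are not taken from the diff):

#include <stdio.h>

/* Illustrative values only: 4 KiB page-cache pages, 1 KiB filesystem blocks. */
#define PAGE_CACHE_SHIFT 12
#define BLKBITS          10

int main(void)
{
	/* Number of filesystem blocks covered by one page-cache page. */
	unsigned nr_dirty = 1u << (PAGE_CACHE_SHIFT - BLKBITS);

	printf("blocks accounted per mmap-dirtied page = %u\n", nr_dirty); /* 4 */
	return 0;
}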
251 256
252 void nilfs_write_failed(struct address_space *mapping, loff_t to) 257 void nilfs_write_failed(struct address_space *mapping, loff_t to)
253 { 258 {
254 struct inode *inode = mapping->host; 259 struct inode *inode = mapping->host;
255 260
256 if (to > inode->i_size) { 261 if (to > inode->i_size) {
257 truncate_pagecache(inode, inode->i_size); 262 truncate_pagecache(inode, inode->i_size);
258 nilfs_truncate(inode); 263 nilfs_truncate(inode);
259 } 264 }
260 } 265 }
261 266
262 static int nilfs_write_begin(struct file *file, struct address_space *mapping, 267 static int nilfs_write_begin(struct file *file, struct address_space *mapping,
263 loff_t pos, unsigned len, unsigned flags, 268 loff_t pos, unsigned len, unsigned flags,
264 struct page **pagep, void **fsdata) 269 struct page **pagep, void **fsdata)
265 270
266 { 271 {
267 struct inode *inode = mapping->host; 272 struct inode *inode = mapping->host;
268 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); 273 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
269 274
270 if (unlikely(err)) 275 if (unlikely(err))
271 return err; 276 return err;
272 277
273 err = block_write_begin(mapping, pos, len, flags, pagep, 278 err = block_write_begin(mapping, pos, len, flags, pagep,
274 nilfs_get_block); 279 nilfs_get_block);
275 if (unlikely(err)) { 280 if (unlikely(err)) {
276 nilfs_write_failed(mapping, pos + len); 281 nilfs_write_failed(mapping, pos + len);
277 nilfs_transaction_abort(inode->i_sb); 282 nilfs_transaction_abort(inode->i_sb);
278 } 283 }
279 return err; 284 return err;
280 } 285 }
281 286
282 static int nilfs_write_end(struct file *file, struct address_space *mapping, 287 static int nilfs_write_end(struct file *file, struct address_space *mapping,
283 loff_t pos, unsigned len, unsigned copied, 288 loff_t pos, unsigned len, unsigned copied,
284 struct page *page, void *fsdata) 289 struct page *page, void *fsdata)
285 { 290 {
286 struct inode *inode = mapping->host; 291 struct inode *inode = mapping->host;
287 unsigned start = pos & (PAGE_CACHE_SIZE - 1); 292 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
288 unsigned nr_dirty; 293 unsigned nr_dirty;
289 int err; 294 int err;
290 295
291 nr_dirty = nilfs_page_count_clean_buffers(page, start, 296 nr_dirty = nilfs_page_count_clean_buffers(page, start,
292 start + copied); 297 start + copied);
293 copied = generic_write_end(file, mapping, pos, len, copied, page, 298 copied = generic_write_end(file, mapping, pos, len, copied, page,
294 fsdata); 299 fsdata);
295 nilfs_set_file_dirty(inode, nr_dirty); 300 nilfs_set_file_dirty(inode, nr_dirty);
296 err = nilfs_transaction_commit(inode->i_sb); 301 err = nilfs_transaction_commit(inode->i_sb);
297 return err ? : copied; 302 return err ? : copied;
298 } 303 }
299 304
300 static ssize_t 305 static ssize_t
301 nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, 306 nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
302 loff_t offset) 307 loff_t offset)
303 { 308 {
304 struct file *file = iocb->ki_filp; 309 struct file *file = iocb->ki_filp;
305 struct address_space *mapping = file->f_mapping; 310 struct address_space *mapping = file->f_mapping;
306 struct inode *inode = file->f_mapping->host; 311 struct inode *inode = file->f_mapping->host;
307 size_t count = iov_iter_count(iter); 312 size_t count = iov_iter_count(iter);
308 ssize_t size; 313 ssize_t size;
309 314
310 if (rw == WRITE) 315 if (rw == WRITE)
311 return 0; 316 return 0;
312 317
313 /* Needs synchronization with the cleaner */ 318 /* Needs synchronization with the cleaner */
314 size = blockdev_direct_IO(rw, iocb, inode, iter, offset, 319 size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
315 nilfs_get_block); 320 nilfs_get_block);
316 321
317 /* 322 /*
318 * In case of error extending write may have instantiated a few 323 * In case of error extending write may have instantiated a few
319 * blocks outside i_size. Trim these off again. 324 * blocks outside i_size. Trim these off again.
320 */ 325 */
321 if (unlikely((rw & WRITE) && size < 0)) { 326 if (unlikely((rw & WRITE) && size < 0)) {
322 loff_t isize = i_size_read(inode); 327 loff_t isize = i_size_read(inode);
323 loff_t end = offset + count; 328 loff_t end = offset + count;
324 329
325 if (end > isize) 330 if (end > isize)
326 nilfs_write_failed(mapping, end); 331 nilfs_write_failed(mapping, end);
327 } 332 }
328 333
329 return size; 334 return size;
330 } 335 }
331 336
332 const struct address_space_operations nilfs_aops = { 337 const struct address_space_operations nilfs_aops = {
333 .writepage = nilfs_writepage, 338 .writepage = nilfs_writepage,
334 .readpage = nilfs_readpage, 339 .readpage = nilfs_readpage,
335 .writepages = nilfs_writepages, 340 .writepages = nilfs_writepages,
336 .set_page_dirty = nilfs_set_page_dirty, 341 .set_page_dirty = nilfs_set_page_dirty,
337 .readpages = nilfs_readpages, 342 .readpages = nilfs_readpages,
338 .write_begin = nilfs_write_begin, 343 .write_begin = nilfs_write_begin,
339 .write_end = nilfs_write_end, 344 .write_end = nilfs_write_end,
340 /* .releasepage = nilfs_releasepage, */ 345 /* .releasepage = nilfs_releasepage, */
341 .invalidatepage = block_invalidatepage, 346 .invalidatepage = block_invalidatepage,
342 .direct_IO = nilfs_direct_IO, 347 .direct_IO = nilfs_direct_IO,
343 .is_partially_uptodate = block_is_partially_uptodate, 348 .is_partially_uptodate = block_is_partially_uptodate,
344 }; 349 };
345 350
346 struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) 351 struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
347 { 352 {
348 struct super_block *sb = dir->i_sb; 353 struct super_block *sb = dir->i_sb;
349 struct the_nilfs *nilfs = sb->s_fs_info; 354 struct the_nilfs *nilfs = sb->s_fs_info;
350 struct inode *inode; 355 struct inode *inode;
351 struct nilfs_inode_info *ii; 356 struct nilfs_inode_info *ii;
352 struct nilfs_root *root; 357 struct nilfs_root *root;
353 int err = -ENOMEM; 358 int err = -ENOMEM;
354 ino_t ino; 359 ino_t ino;
355 360
356 inode = new_inode(sb); 361 inode = new_inode(sb);
357 if (unlikely(!inode)) 362 if (unlikely(!inode))
358 goto failed; 363 goto failed;
359 364
360 mapping_set_gfp_mask(inode->i_mapping, 365 mapping_set_gfp_mask(inode->i_mapping,
361 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 366 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
362 367
363 root = NILFS_I(dir)->i_root; 368 root = NILFS_I(dir)->i_root;
364 ii = NILFS_I(inode); 369 ii = NILFS_I(inode);
365 ii->i_state = 1 << NILFS_I_NEW; 370 ii->i_state = 1 << NILFS_I_NEW;
366 ii->i_root = root; 371 ii->i_root = root;
367 372
368 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); 373 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
369 if (unlikely(err)) 374 if (unlikely(err))
370 goto failed_ifile_create_inode; 375 goto failed_ifile_create_inode;
371 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 376 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
372 377
373 atomic64_inc(&root->inodes_count); 378 atomic64_inc(&root->inodes_count);
374 inode_init_owner(inode, dir, mode); 379 inode_init_owner(inode, dir, mode);
375 inode->i_ino = ino; 380 inode->i_ino = ino;
376 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 381 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
377 382
378 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 383 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
379 err = nilfs_bmap_read(ii->i_bmap, NULL); 384 err = nilfs_bmap_read(ii->i_bmap, NULL);
380 if (err < 0) 385 if (err < 0)
381 goto failed_bmap; 386 goto failed_bmap;
382 387
383 set_bit(NILFS_I_BMAP, &ii->i_state); 388 set_bit(NILFS_I_BMAP, &ii->i_state);
384 /* No lock is needed; iget() ensures it. */ 389 /* No lock is needed; iget() ensures it. */
385 } 390 }
386 391
387 ii->i_flags = nilfs_mask_flags( 392 ii->i_flags = nilfs_mask_flags(
388 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); 393 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
389 394
390 /* ii->i_file_acl = 0; */ 395 /* ii->i_file_acl = 0; */
391 /* ii->i_dir_acl = 0; */ 396 /* ii->i_dir_acl = 0; */
392 ii->i_dir_start_lookup = 0; 397 ii->i_dir_start_lookup = 0;
393 nilfs_set_inode_flags(inode); 398 nilfs_set_inode_flags(inode);
394 spin_lock(&nilfs->ns_next_gen_lock); 399 spin_lock(&nilfs->ns_next_gen_lock);
395 inode->i_generation = nilfs->ns_next_generation++; 400 inode->i_generation = nilfs->ns_next_generation++;
396 spin_unlock(&nilfs->ns_next_gen_lock); 401 spin_unlock(&nilfs->ns_next_gen_lock);
397 insert_inode_hash(inode); 402 insert_inode_hash(inode);
398 403
399 err = nilfs_init_acl(inode, dir); 404 err = nilfs_init_acl(inode, dir);
400 if (unlikely(err)) 405 if (unlikely(err))
401 goto failed_acl; /* never occur. When supporting 406 goto failed_acl; /* never occur. When supporting
402 nilfs_init_acl(), proper cancellation of 407 nilfs_init_acl(), proper cancellation of
403 above jobs should be considered */ 408 above jobs should be considered */
404 409
405 return inode; 410 return inode;
406 411
407 failed_acl: 412 failed_acl:
408 failed_bmap: 413 failed_bmap:
409 clear_nlink(inode); 414 clear_nlink(inode);
410 iput(inode); /* raw_inode will be deleted through 415 iput(inode); /* raw_inode will be deleted through
411 generic_delete_inode() */ 416 generic_delete_inode() */
412 goto failed; 417 goto failed;
413 418
414 failed_ifile_create_inode: 419 failed_ifile_create_inode:
415 make_bad_inode(inode); 420 make_bad_inode(inode);
416 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be 421 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
417 called */ 422 called */
418 failed: 423 failed:
419 return ERR_PTR(err); 424 return ERR_PTR(err);
420 } 425 }
421 426
422 void nilfs_set_inode_flags(struct inode *inode) 427 void nilfs_set_inode_flags(struct inode *inode)
423 { 428 {
424 unsigned int flags = NILFS_I(inode)->i_flags; 429 unsigned int flags = NILFS_I(inode)->i_flags;
425 430
426 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 431 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
427 S_DIRSYNC); 432 S_DIRSYNC);
428 if (flags & FS_SYNC_FL) 433 if (flags & FS_SYNC_FL)
429 inode->i_flags |= S_SYNC; 434 inode->i_flags |= S_SYNC;
430 if (flags & FS_APPEND_FL) 435 if (flags & FS_APPEND_FL)
431 inode->i_flags |= S_APPEND; 436 inode->i_flags |= S_APPEND;
432 if (flags & FS_IMMUTABLE_FL) 437 if (flags & FS_IMMUTABLE_FL)
433 inode->i_flags |= S_IMMUTABLE; 438 inode->i_flags |= S_IMMUTABLE;
434 if (flags & FS_NOATIME_FL) 439 if (flags & FS_NOATIME_FL)
435 inode->i_flags |= S_NOATIME; 440 inode->i_flags |= S_NOATIME;
436 if (flags & FS_DIRSYNC_FL) 441 if (flags & FS_DIRSYNC_FL)
437 inode->i_flags |= S_DIRSYNC; 442 inode->i_flags |= S_DIRSYNC;
438 mapping_set_gfp_mask(inode->i_mapping, 443 mapping_set_gfp_mask(inode->i_mapping,
439 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 444 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
440 } 445 }
441 446
442 int nilfs_read_inode_common(struct inode *inode, 447 int nilfs_read_inode_common(struct inode *inode,
443 struct nilfs_inode *raw_inode) 448 struct nilfs_inode *raw_inode)
444 { 449 {
445 struct nilfs_inode_info *ii = NILFS_I(inode); 450 struct nilfs_inode_info *ii = NILFS_I(inode);
446 int err; 451 int err;
447 452
448 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 453 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
449 i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); 454 i_uid_write(inode, le32_to_cpu(raw_inode->i_uid));
450 i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); 455 i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
451 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 456 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
452 inode->i_size = le64_to_cpu(raw_inode->i_size); 457 inode->i_size = le64_to_cpu(raw_inode->i_size);
453 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 458 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
454 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 459 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
455 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 460 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
456 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 461 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
457 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 462 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
458 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 463 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
459 if (inode->i_nlink == 0 && inode->i_mode == 0) 464 if (inode->i_nlink == 0 && inode->i_mode == 0)
460 return -EINVAL; /* this inode is deleted */ 465 return -EINVAL; /* this inode is deleted */
461 466
462 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); 467 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
463 ii->i_flags = le32_to_cpu(raw_inode->i_flags); 468 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
464 #if 0 469 #if 0
465 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 470 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
466 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 471 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
467 0 : le32_to_cpu(raw_inode->i_dir_acl); 472 0 : le32_to_cpu(raw_inode->i_dir_acl);
468 #endif 473 #endif
469 ii->i_dir_start_lookup = 0; 474 ii->i_dir_start_lookup = 0;
470 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 475 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
471 476
472 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 477 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
473 S_ISLNK(inode->i_mode)) { 478 S_ISLNK(inode->i_mode)) {
474 err = nilfs_bmap_read(ii->i_bmap, raw_inode); 479 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
475 if (err < 0) 480 if (err < 0)
476 return err; 481 return err;
477 set_bit(NILFS_I_BMAP, &ii->i_state); 482 set_bit(NILFS_I_BMAP, &ii->i_state);
478 /* No lock is needed; iget() ensures it. */ 483 /* No lock is needed; iget() ensures it. */
479 } 484 }
480 return 0; 485 return 0;
481 } 486 }
482 487
483 static int __nilfs_read_inode(struct super_block *sb, 488 static int __nilfs_read_inode(struct super_block *sb,
484 struct nilfs_root *root, unsigned long ino, 489 struct nilfs_root *root, unsigned long ino,
485 struct inode *inode) 490 struct inode *inode)
486 { 491 {
487 struct the_nilfs *nilfs = sb->s_fs_info; 492 struct the_nilfs *nilfs = sb->s_fs_info;
488 struct buffer_head *bh; 493 struct buffer_head *bh;
489 struct nilfs_inode *raw_inode; 494 struct nilfs_inode *raw_inode;
490 int err; 495 int err;
491 496
492 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 497 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
493 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 498 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
494 if (unlikely(err)) 499 if (unlikely(err))
495 goto bad_inode; 500 goto bad_inode;
496 501
497 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); 502 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
498 503
499 err = nilfs_read_inode_common(inode, raw_inode); 504 err = nilfs_read_inode_common(inode, raw_inode);
500 if (err) 505 if (err)
501 goto failed_unmap; 506 goto failed_unmap;
502 507
503 if (S_ISREG(inode->i_mode)) { 508 if (S_ISREG(inode->i_mode)) {
504 inode->i_op = &nilfs_file_inode_operations; 509 inode->i_op = &nilfs_file_inode_operations;
505 inode->i_fop = &nilfs_file_operations; 510 inode->i_fop = &nilfs_file_operations;
506 inode->i_mapping->a_ops = &nilfs_aops; 511 inode->i_mapping->a_ops = &nilfs_aops;
507 } else if (S_ISDIR(inode->i_mode)) { 512 } else if (S_ISDIR(inode->i_mode)) {
508 inode->i_op = &nilfs_dir_inode_operations; 513 inode->i_op = &nilfs_dir_inode_operations;
509 inode->i_fop = &nilfs_dir_operations; 514 inode->i_fop = &nilfs_dir_operations;
510 inode->i_mapping->a_ops = &nilfs_aops; 515 inode->i_mapping->a_ops = &nilfs_aops;
511 } else if (S_ISLNK(inode->i_mode)) { 516 } else if (S_ISLNK(inode->i_mode)) {
512 inode->i_op = &nilfs_symlink_inode_operations; 517 inode->i_op = &nilfs_symlink_inode_operations;
513 inode->i_mapping->a_ops = &nilfs_aops; 518 inode->i_mapping->a_ops = &nilfs_aops;
514 } else { 519 } else {
515 inode->i_op = &nilfs_special_inode_operations; 520 inode->i_op = &nilfs_special_inode_operations;
516 init_special_inode( 521 init_special_inode(
517 inode, inode->i_mode, 522 inode, inode->i_mode,
518 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 523 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
519 } 524 }
520 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 525 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
521 brelse(bh); 526 brelse(bh);
522 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 527 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
523 nilfs_set_inode_flags(inode); 528 nilfs_set_inode_flags(inode);
524 return 0; 529 return 0;
525 530
526 failed_unmap: 531 failed_unmap:
527 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 532 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
528 brelse(bh); 533 brelse(bh);
529 534
530 bad_inode: 535 bad_inode:
531 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 536 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
532 return err; 537 return err;
533 } 538 }
534 539
535 static int nilfs_iget_test(struct inode *inode, void *opaque) 540 static int nilfs_iget_test(struct inode *inode, void *opaque)
536 { 541 {
537 struct nilfs_iget_args *args = opaque; 542 struct nilfs_iget_args *args = opaque;
538 struct nilfs_inode_info *ii; 543 struct nilfs_inode_info *ii;
539 544
540 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) 545 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
541 return 0; 546 return 0;
542 547
543 ii = NILFS_I(inode); 548 ii = NILFS_I(inode);
544 if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) 549 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
545 return !args->for_gc; 550 return !args->for_gc;
546 551
547 return args->for_gc && args->cno == ii->i_cno; 552 return args->for_gc && args->cno == ii->i_cno;
548 } 553 }
549 554
550 static int nilfs_iget_set(struct inode *inode, void *opaque) 555 static int nilfs_iget_set(struct inode *inode, void *opaque)
551 { 556 {
552 struct nilfs_iget_args *args = opaque; 557 struct nilfs_iget_args *args = opaque;
553 558
554 inode->i_ino = args->ino; 559 inode->i_ino = args->ino;
555 if (args->for_gc) { 560 if (args->for_gc) {
556 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; 561 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
557 NILFS_I(inode)->i_cno = args->cno; 562 NILFS_I(inode)->i_cno = args->cno;
558 NILFS_I(inode)->i_root = NULL; 563 NILFS_I(inode)->i_root = NULL;
559 } else { 564 } else {
560 if (args->root && args->ino == NILFS_ROOT_INO) 565 if (args->root && args->ino == NILFS_ROOT_INO)
561 nilfs_get_root(args->root); 566 nilfs_get_root(args->root);
562 NILFS_I(inode)->i_root = args->root; 567 NILFS_I(inode)->i_root = args->root;
563 } 568 }
564 return 0; 569 return 0;
565 } 570 }
566 571
567 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, 572 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
568 unsigned long ino) 573 unsigned long ino)
569 { 574 {
570 struct nilfs_iget_args args = { 575 struct nilfs_iget_args args = {
571 .ino = ino, .root = root, .cno = 0, .for_gc = 0 576 .ino = ino, .root = root, .cno = 0, .for_gc = 0
572 }; 577 };
573 578
574 return ilookup5(sb, ino, nilfs_iget_test, &args); 579 return ilookup5(sb, ino, nilfs_iget_test, &args);
575 } 580 }
576 581
577 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, 582 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
578 unsigned long ino) 583 unsigned long ino)
579 { 584 {
580 struct nilfs_iget_args args = { 585 struct nilfs_iget_args args = {
581 .ino = ino, .root = root, .cno = 0, .for_gc = 0 586 .ino = ino, .root = root, .cno = 0, .for_gc = 0
582 }; 587 };
583 588
584 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 589 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
585 } 590 }
586 591
587 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, 592 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
588 unsigned long ino) 593 unsigned long ino)
589 { 594 {
590 struct inode *inode; 595 struct inode *inode;
591 int err; 596 int err;
592 597
593 inode = nilfs_iget_locked(sb, root, ino); 598 inode = nilfs_iget_locked(sb, root, ino);
594 if (unlikely(!inode)) 599 if (unlikely(!inode))
595 return ERR_PTR(-ENOMEM); 600 return ERR_PTR(-ENOMEM);
596 if (!(inode->i_state & I_NEW)) 601 if (!(inode->i_state & I_NEW))
597 return inode; 602 return inode;
598 603
599 err = __nilfs_read_inode(sb, root, ino, inode); 604 err = __nilfs_read_inode(sb, root, ino, inode);
600 if (unlikely(err)) { 605 if (unlikely(err)) {
601 iget_failed(inode); 606 iget_failed(inode);
602 return ERR_PTR(err); 607 return ERR_PTR(err);
603 } 608 }
604 unlock_new_inode(inode); 609 unlock_new_inode(inode);
605 return inode; 610 return inode;
606 } 611 }
607 612
608 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, 613 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
609 __u64 cno) 614 __u64 cno)
610 { 615 {
611 struct nilfs_iget_args args = { 616 struct nilfs_iget_args args = {
612 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 617 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
613 }; 618 };
614 struct inode *inode; 619 struct inode *inode;
615 int err; 620 int err;
616 621
617 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 622 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
618 if (unlikely(!inode)) 623 if (unlikely(!inode))
619 return ERR_PTR(-ENOMEM); 624 return ERR_PTR(-ENOMEM);
620 if (!(inode->i_state & I_NEW)) 625 if (!(inode->i_state & I_NEW))
621 return inode; 626 return inode;
622 627
623 err = nilfs_init_gcinode(inode); 628 err = nilfs_init_gcinode(inode);
624 if (unlikely(err)) { 629 if (unlikely(err)) {
625 iget_failed(inode); 630 iget_failed(inode);
626 return ERR_PTR(err); 631 return ERR_PTR(err);
627 } 632 }
628 unlock_new_inode(inode); 633 unlock_new_inode(inode);
629 return inode; 634 return inode;
630 } 635 }
631 636
632 void nilfs_write_inode_common(struct inode *inode, 637 void nilfs_write_inode_common(struct inode *inode,
633 struct nilfs_inode *raw_inode, int has_bmap) 638 struct nilfs_inode *raw_inode, int has_bmap)
634 { 639 {
635 struct nilfs_inode_info *ii = NILFS_I(inode); 640 struct nilfs_inode_info *ii = NILFS_I(inode);
636 641
637 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 642 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
638 raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); 643 raw_inode->i_uid = cpu_to_le32(i_uid_read(inode));
639 raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); 644 raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
640 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 645 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
641 raw_inode->i_size = cpu_to_le64(inode->i_size); 646 raw_inode->i_size = cpu_to_le64(inode->i_size);
642 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 647 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
643 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 648 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
644 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 649 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
645 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 650 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
646 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); 651 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
647 652
648 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 653 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
649 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 654 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
650 655
651 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { 656 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
652 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 657 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
653 658
654 /* zero-fill unused portion in the case of super root block */ 659 /* zero-fill unused portion in the case of super root block */
655 raw_inode->i_xattr = 0; 660 raw_inode->i_xattr = 0;
656 raw_inode->i_pad = 0; 661 raw_inode->i_pad = 0;
657 memset((void *)raw_inode + sizeof(*raw_inode), 0, 662 memset((void *)raw_inode + sizeof(*raw_inode), 0,
658 nilfs->ns_inode_size - sizeof(*raw_inode)); 663 nilfs->ns_inode_size - sizeof(*raw_inode));
659 } 664 }
660 665
661 if (has_bmap) 666 if (has_bmap)
662 nilfs_bmap_write(ii->i_bmap, raw_inode); 667 nilfs_bmap_write(ii->i_bmap, raw_inode);
663 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 668 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
664 raw_inode->i_device_code = 669 raw_inode->i_device_code =
665 cpu_to_le64(huge_encode_dev(inode->i_rdev)); 670 cpu_to_le64(huge_encode_dev(inode->i_rdev));
666 /* When extending inode, nilfs->ns_inode_size should be checked 671 /* When extending inode, nilfs->ns_inode_size should be checked
667 for substitutions of appended fields */ 672 for substitutions of appended fields */
668 } 673 }
669 674
670 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) 675 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
671 { 676 {
672 ino_t ino = inode->i_ino; 677 ino_t ino = inode->i_ino;
673 struct nilfs_inode_info *ii = NILFS_I(inode); 678 struct nilfs_inode_info *ii = NILFS_I(inode);
674 struct inode *ifile = ii->i_root->ifile; 679 struct inode *ifile = ii->i_root->ifile;
675 struct nilfs_inode *raw_inode; 680 struct nilfs_inode *raw_inode;
676 681
677 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); 682 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
678 683
679 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 684 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
680 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); 685 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
681 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 686 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
682 687
683 nilfs_write_inode_common(inode, raw_inode, 0); 688 nilfs_write_inode_common(inode, raw_inode, 0);
684 /* XXX: call with has_bmap = 0 is a workaround to avoid 689 /* XXX: call with has_bmap = 0 is a workaround to avoid
685 deadlock of bmap. This delays update of i_bmap to just 690 deadlock of bmap. This delays update of i_bmap to just
686 before writing */ 691 before writing */
687 nilfs_ifile_unmap_inode(ifile, ino, ibh); 692 nilfs_ifile_unmap_inode(ifile, ino, ibh);
688 } 693 }
689 694
690 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 695 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
691 696
692 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, 697 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
693 unsigned long from) 698 unsigned long from)
694 { 699 {
695 unsigned long b; 700 unsigned long b;
696 int ret; 701 int ret;
697 702
698 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 703 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
699 return; 704 return;
700 repeat: 705 repeat:
701 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 706 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
702 if (ret == -ENOENT) 707 if (ret == -ENOENT)
703 return; 708 return;
704 else if (ret < 0) 709 else if (ret < 0)
705 goto failed; 710 goto failed;
706 711
707 if (b < from) 712 if (b < from)
708 return; 713 return;
709 714
710 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); 715 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
711 ret = nilfs_bmap_truncate(ii->i_bmap, b); 716 ret = nilfs_bmap_truncate(ii->i_bmap, b);
712 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); 717 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
713 if (!ret || (ret == -ENOMEM && 718 if (!ret || (ret == -ENOMEM &&
714 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 719 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
715 goto repeat; 720 goto repeat;
716 721
717 failed: 722 failed:
718 nilfs_warning(ii->vfs_inode.i_sb, __func__, 723 nilfs_warning(ii->vfs_inode.i_sb, __func__,
719 "failed to truncate bmap (ino=%lu, err=%d)", 724 "failed to truncate bmap (ino=%lu, err=%d)",
720 ii->vfs_inode.i_ino, ret); 725 ii->vfs_inode.i_ino, ret);
721 } 726 }
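nilfs_truncate_bmap() walks the bmap down toward the requested starting block in bounded passes rather than in a single call: each pass drops at most NILFS_MAX_TRUNCATE_BLOCKS keys, nilfs_relax_pressure_in_lock() then lets the segment constructor relieve memory pressure, and an -ENOMEM failure is retried once before the warning is emitted. As the comment on the define above notes, the cap corresponds to 64 MB per pass with 4 KB blocks; a trivial standalone check of that arithmetic (the block size is an assumption, not read from the diff):

#include <stdio.h>

#define NILFS_MAX_TRUNCATE_BLOCKS 16384UL /* per-pass cap, as defined above */
#define BLOCK_SIZE                 4096UL /* illustrative 4 KiB block size */

int main(void)
{
	unsigned long per_pass = NILFS_MAX_TRUNCATE_BLOCKS * BLOCK_SIZE;

	printf("at most %lu MiB truncated per pass\n", per_pass >> 20); /* 64 */
	return 0;
}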
722 727
723 void nilfs_truncate(struct inode *inode) 728 void nilfs_truncate(struct inode *inode)
724 { 729 {
725 unsigned long blkoff; 730 unsigned long blkoff;
726 unsigned int blocksize; 731 unsigned int blocksize;
727 struct nilfs_transaction_info ti; 732 struct nilfs_transaction_info ti;
728 struct super_block *sb = inode->i_sb; 733 struct super_block *sb = inode->i_sb;
729 struct nilfs_inode_info *ii = NILFS_I(inode); 734 struct nilfs_inode_info *ii = NILFS_I(inode);
730 735
731 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 736 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
732 return; 737 return;
733 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 738 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
734 return; 739 return;
735 740
736 blocksize = sb->s_blocksize; 741 blocksize = sb->s_blocksize;
737 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; 742 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
738 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 743 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
739 744
740 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); 745 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
741 746
742 nilfs_truncate_bmap(ii, blkoff); 747 nilfs_truncate_bmap(ii, blkoff);
743 748
744 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 749 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
745 if (IS_SYNC(inode)) 750 if (IS_SYNC(inode))
746 nilfs_set_transaction_flag(NILFS_TI_SYNC); 751 nilfs_set_transaction_flag(NILFS_TI_SYNC);
747 752
748 nilfs_mark_inode_dirty(inode); 753 nilfs_mark_inode_dirty(inode);
749 nilfs_set_file_dirty(inode, 0); 754 nilfs_set_file_dirty(inode, 0);
750 nilfs_transaction_commit(sb); 755 nilfs_transaction_commit(sb);
751 /* May construct a logical segment and may fail in sync mode. 756 /* May construct a logical segment and may fail in sync mode.
752 But truncate has no return value. */ 757 But truncate has no return value. */
753 } 758 }
754 759
755 static void nilfs_clear_inode(struct inode *inode) 760 static void nilfs_clear_inode(struct inode *inode)
756 { 761 {
757 struct nilfs_inode_info *ii = NILFS_I(inode); 762 struct nilfs_inode_info *ii = NILFS_I(inode);
758 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 763 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
759 764
760 /* 765 /*
761 * Free resources allocated in nilfs_read_inode(), here. 766 * Free resources allocated in nilfs_read_inode(), here.
762 */ 767 */
763 BUG_ON(!list_empty(&ii->i_dirty)); 768 BUG_ON(!list_empty(&ii->i_dirty));
764 brelse(ii->i_bh); 769 brelse(ii->i_bh);
765 ii->i_bh = NULL; 770 ii->i_bh = NULL;
766 771
767 if (mdi && mdi->mi_palloc_cache) 772 if (mdi && mdi->mi_palloc_cache)
768 nilfs_palloc_destroy_cache(inode); 773 nilfs_palloc_destroy_cache(inode);
769 774
770 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 775 if (test_bit(NILFS_I_BMAP, &ii->i_state))
771 nilfs_bmap_clear(ii->i_bmap); 776 nilfs_bmap_clear(ii->i_bmap);
772 777
773 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 778 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
774 779
775 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) 780 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
776 nilfs_put_root(ii->i_root); 781 nilfs_put_root(ii->i_root);
777 } 782 }
778 783
779 void nilfs_evict_inode(struct inode *inode) 784 void nilfs_evict_inode(struct inode *inode)
780 { 785 {
781 struct nilfs_transaction_info ti; 786 struct nilfs_transaction_info ti;
782 struct super_block *sb = inode->i_sb; 787 struct super_block *sb = inode->i_sb;
783 struct nilfs_inode_info *ii = NILFS_I(inode); 788 struct nilfs_inode_info *ii = NILFS_I(inode);
784 int ret; 789 int ret;
785 790
786 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 791 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
787 truncate_inode_pages_final(&inode->i_data); 792 truncate_inode_pages_final(&inode->i_data);
788 clear_inode(inode); 793 clear_inode(inode);
789 nilfs_clear_inode(inode); 794 nilfs_clear_inode(inode);
790 return; 795 return;
791 } 796 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 797 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 798
794 truncate_inode_pages_final(&inode->i_data); 799 truncate_inode_pages_final(&inode->i_data);
795 800
796 /* TODO: some of the following operations may fail. */ 801 /* TODO: some of the following operations may fail. */
797 nilfs_truncate_bmap(ii, 0); 802 nilfs_truncate_bmap(ii, 0);
798 nilfs_mark_inode_dirty(inode); 803 nilfs_mark_inode_dirty(inode);
799 clear_inode(inode); 804 clear_inode(inode);
800 805
801 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 806 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
802 if (!ret) 807 if (!ret)
803 atomic64_dec(&ii->i_root->inodes_count); 808 atomic64_dec(&ii->i_root->inodes_count);
804 809
805 nilfs_clear_inode(inode); 810 nilfs_clear_inode(inode);
806 811
807 if (IS_SYNC(inode)) 812 if (IS_SYNC(inode))
808 nilfs_set_transaction_flag(NILFS_TI_SYNC); 813 nilfs_set_transaction_flag(NILFS_TI_SYNC);
809 nilfs_transaction_commit(sb); 814 nilfs_transaction_commit(sb);
810 /* May construct a logical segment and may fail in sync mode. 815 /* May construct a logical segment and may fail in sync mode.
811 But delete_inode has no return value. */ 816 But delete_inode has no return value. */
812 } 817 }
813 818
814 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) 819 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
815 { 820 {
816 struct nilfs_transaction_info ti; 821 struct nilfs_transaction_info ti;
817 struct inode *inode = dentry->d_inode; 822 struct inode *inode = dentry->d_inode;
818 struct super_block *sb = inode->i_sb; 823 struct super_block *sb = inode->i_sb;
819 int err; 824 int err;
820 825
821 err = inode_change_ok(inode, iattr); 826 err = inode_change_ok(inode, iattr);
822 if (err) 827 if (err)
823 return err; 828 return err;
824 829
825 err = nilfs_transaction_begin(sb, &ti, 0); 830 err = nilfs_transaction_begin(sb, &ti, 0);
826 if (unlikely(err)) 831 if (unlikely(err))
827 return err; 832 return err;
828 833
829 if ((iattr->ia_valid & ATTR_SIZE) && 834 if ((iattr->ia_valid & ATTR_SIZE) &&
830 iattr->ia_size != i_size_read(inode)) { 835 iattr->ia_size != i_size_read(inode)) {
831 inode_dio_wait(inode); 836 inode_dio_wait(inode);
832 truncate_setsize(inode, iattr->ia_size); 837 truncate_setsize(inode, iattr->ia_size);
833 nilfs_truncate(inode); 838 nilfs_truncate(inode);
834 } 839 }
835 840
836 setattr_copy(inode, iattr); 841 setattr_copy(inode, iattr);
837 mark_inode_dirty(inode); 842 mark_inode_dirty(inode);
838 843
839 if (iattr->ia_valid & ATTR_MODE) { 844 if (iattr->ia_valid & ATTR_MODE) {
840 err = nilfs_acl_chmod(inode); 845 err = nilfs_acl_chmod(inode);
841 if (unlikely(err)) 846 if (unlikely(err))
842 goto out_err; 847 goto out_err;
843 } 848 }
844 849
845 return nilfs_transaction_commit(sb); 850 return nilfs_transaction_commit(sb);
846 851
847 out_err: 852 out_err:
848 nilfs_transaction_abort(sb); 853 nilfs_transaction_abort(sb);
849 return err; 854 return err;
850 } 855 }
851 856
852 int nilfs_permission(struct inode *inode, int mask) 857 int nilfs_permission(struct inode *inode, int mask)
853 { 858 {
854 struct nilfs_root *root = NILFS_I(inode)->i_root; 859 struct nilfs_root *root = NILFS_I(inode)->i_root;
855 if ((mask & MAY_WRITE) && root && 860 if ((mask & MAY_WRITE) && root &&
856 root->cno != NILFS_CPTREE_CURRENT_CNO) 861 root->cno != NILFS_CPTREE_CURRENT_CNO)
857 return -EROFS; /* snapshot is not writable */ 862 return -EROFS; /* snapshot is not writable */
858 863
859 return generic_permission(inode, mask); 864 return generic_permission(inode, mask);
860 } 865 }
861 866
862 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 867 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
863 { 868 {
864 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 869 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
865 struct nilfs_inode_info *ii = NILFS_I(inode); 870 struct nilfs_inode_info *ii = NILFS_I(inode);
866 int err; 871 int err;
867 872
868 spin_lock(&nilfs->ns_inode_lock); 873 spin_lock(&nilfs->ns_inode_lock);
869 if (ii->i_bh == NULL) { 874 if (ii->i_bh == NULL) {
870 spin_unlock(&nilfs->ns_inode_lock); 875 spin_unlock(&nilfs->ns_inode_lock);
871 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 876 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
872 inode->i_ino, pbh); 877 inode->i_ino, pbh);
873 if (unlikely(err)) 878 if (unlikely(err))
874 return err; 879 return err;
875 spin_lock(&nilfs->ns_inode_lock); 880 spin_lock(&nilfs->ns_inode_lock);
876 if (ii->i_bh == NULL) 881 if (ii->i_bh == NULL)
877 ii->i_bh = *pbh; 882 ii->i_bh = *pbh;
878 else { 883 else {
879 brelse(*pbh); 884 brelse(*pbh);
880 *pbh = ii->i_bh; 885 *pbh = ii->i_bh;
881 } 886 }
882 } else 887 } else
883 *pbh = ii->i_bh; 888 *pbh = ii->i_bh;
884 889
885 get_bh(*pbh); 890 get_bh(*pbh);
886 spin_unlock(&nilfs->ns_inode_lock); 891 spin_unlock(&nilfs->ns_inode_lock);
887 return 0; 892 return 0;
888 } 893 }
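nilfs_load_inode_block() caches the inode's ifile buffer head in ii->i_bh with a check, unlock, recheck pattern: the blocking nilfs_ifile_get_inode_block() call must not run under the ns_inode_lock spinlock, so the lock is dropped for the read and retaken afterwards, and if another task installed a buffer in the meantime the freshly read one is released with brelse() and the winner's buffer is used instead. The sketch below shows the same idiom in plain userspace C; the names (slot_get, load_object, put_object) and the pthread mutex standing in for the spinlock are hypothetical, so treat it as an illustration of the pattern rather than nilfs code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical cache slot guarded by a lock.  load_object() stands in for the
 * blocking nilfs_ifile_get_inode_block() call, put_object() for brelse(). */
struct slot {
	pthread_mutex_t lock;
	void *cached;
};

static void *load_object(void)      /* may block; never call it under the lock */
{
	return malloc(1);
}

static void put_object(void *obj)   /* drop the reference we no longer need */
{
	free(obj);
}

static void *slot_get(struct slot *s)
{
	void *obj;

	pthread_mutex_lock(&s->lock);
	if (!s->cached) {
		pthread_mutex_unlock(&s->lock);
		obj = load_object();            /* blocking work outside the lock */
		pthread_mutex_lock(&s->lock);
		if (!s->cached) {
			s->cached = obj;        /* we won the race: install ours */
		} else {
			put_object(obj);        /* lost the race: discard ours */
			obj = s->cached;
		}
	} else {
		obj = s->cached;
	}
	pthread_mutex_unlock(&s->lock);
	return obj;
}

int main(void)
{
	struct slot s = { PTHREAD_MUTEX_INITIALIZER, NULL };

	printf("cached object at %p\n", slot_get(&s));
	free(s.cached);
	return 0;
}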
889 894
890 int nilfs_inode_dirty(struct inode *inode) 895 int nilfs_inode_dirty(struct inode *inode)
891 { 896 {
892 struct nilfs_inode_info *ii = NILFS_I(inode); 897 struct nilfs_inode_info *ii = NILFS_I(inode);
893 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 898 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
894 int ret = 0; 899 int ret = 0;
895 900
896 if (!list_empty(&ii->i_dirty)) { 901 if (!list_empty(&ii->i_dirty)) {
897 spin_lock(&nilfs->ns_inode_lock); 902 spin_lock(&nilfs->ns_inode_lock);
898 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 903 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
899 test_bit(NILFS_I_BUSY, &ii->i_state); 904 test_bit(NILFS_I_BUSY, &ii->i_state);
900 spin_unlock(&nilfs->ns_inode_lock); 905 spin_unlock(&nilfs->ns_inode_lock);
901 } 906 }
902 return ret; 907 return ret;
903 } 908 }
904 909
905 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 910 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
906 { 911 {
907 struct nilfs_inode_info *ii = NILFS_I(inode); 912 struct nilfs_inode_info *ii = NILFS_I(inode);
908 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 913 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
909 914
910 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); 915 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
911 916
912 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 917 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
913 return 0; 918 return 0;
914 919
915 spin_lock(&nilfs->ns_inode_lock); 920 spin_lock(&nilfs->ns_inode_lock);
916 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 921 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
917 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 922 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
918 /* Because this routine may race with nilfs_dispose_list(), 923 /* Because this routine may race with nilfs_dispose_list(),
919 we have to check NILFS_I_QUEUED here, too. */ 924 we have to check NILFS_I_QUEUED here, too. */
920 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 925 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
921 /* This will happen when somebody is freeing 926 /* This will happen when somebody is freeing
922 this inode. */ 927 this inode. */
923 nilfs_warning(inode->i_sb, __func__, 928 nilfs_warning(inode->i_sb, __func__,
924 "cannot get inode (ino=%lu)\n", 929 "cannot get inode (ino=%lu)\n",
925 inode->i_ino); 930 inode->i_ino);
926 spin_unlock(&nilfs->ns_inode_lock); 931 spin_unlock(&nilfs->ns_inode_lock);
927 return -EINVAL; /* NILFS_I_DIRTY may remain for 932 return -EINVAL; /* NILFS_I_DIRTY may remain for
928 freeing inode */ 933 freeing inode */
929 } 934 }
930 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); 935 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
931 set_bit(NILFS_I_QUEUED, &ii->i_state); 936 set_bit(NILFS_I_QUEUED, &ii->i_state);
932 } 937 }
933 spin_unlock(&nilfs->ns_inode_lock); 938 spin_unlock(&nilfs->ns_inode_lock);
934 return 0; 939 return 0;
935 } 940 }
936 941
937 int nilfs_mark_inode_dirty(struct inode *inode) 942 int nilfs_mark_inode_dirty(struct inode *inode)
938 { 943 {
939 struct buffer_head *ibh; 944 struct buffer_head *ibh;
940 int err; 945 int err;
941 946
942 err = nilfs_load_inode_block(inode, &ibh); 947 err = nilfs_load_inode_block(inode, &ibh);
943 if (unlikely(err)) { 948 if (unlikely(err)) {
944 nilfs_warning(inode->i_sb, __func__, 949 nilfs_warning(inode->i_sb, __func__,
945 "failed to reget inode block.\n"); 950 "failed to reget inode block.\n");
946 return err; 951 return err;
947 } 952 }
948 nilfs_update_inode(inode, ibh); 953 nilfs_update_inode(inode, ibh);
949 mark_buffer_dirty(ibh); 954 mark_buffer_dirty(ibh);
950 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); 955 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
951 brelse(ibh); 956 brelse(ibh);
952 return 0; 957 return 0;
953 } 958 }
954 959
955 /** 960 /**
956 * nilfs_dirty_inode - reflect changes on given inode to an inode block. 961 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
957 * @inode: inode of the file to be registered. 962 * @inode: inode of the file to be registered.
958 * 963 *
959 * nilfs_dirty_inode() loads an inode block containing the specified 964 * nilfs_dirty_inode() loads an inode block containing the specified
960 * @inode and copies data from a nilfs_inode to a corresponding inode 965 * @inode and copies data from a nilfs_inode to a corresponding inode
961 * entry in the inode block. This operation is excluded from the segment 966 * entry in the inode block. This operation is excluded from the segment
962 * construction. This function can be called both as a single operation 967 * construction. This function can be called both as a single operation
963 * and as a part of indivisible file operations. 968 * and as a part of indivisible file operations.
964 */ 969 */
965 void nilfs_dirty_inode(struct inode *inode, int flags) 970 void nilfs_dirty_inode(struct inode *inode, int flags)
966 { 971 {
967 struct nilfs_transaction_info ti; 972 struct nilfs_transaction_info ti;
968 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 973 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
969 974
970 if (is_bad_inode(inode)) { 975 if (is_bad_inode(inode)) {
971 nilfs_warning(inode->i_sb, __func__, 976 nilfs_warning(inode->i_sb, __func__,
972 "tried to mark bad_inode dirty. ignored.\n"); 977 "tried to mark bad_inode dirty. ignored.\n");
973 dump_stack(); 978 dump_stack();
974 return; 979 return;
975 } 980 }
976 if (mdi) { 981 if (mdi) {
977 nilfs_mdt_mark_dirty(inode); 982 nilfs_mdt_mark_dirty(inode);
978 return; 983 return;
979 } 984 }
980 nilfs_transaction_begin(inode->i_sb, &ti, 0); 985 nilfs_transaction_begin(inode->i_sb, &ti, 0);
981 nilfs_mark_inode_dirty(inode); 986 nilfs_mark_inode_dirty(inode);
982 nilfs_transaction_commit(inode->i_sb); /* never fails */ 987 nilfs_transaction_commit(inode->i_sb); /* never fails */
983 } 988 }
984 989
985 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 990 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
986 __u64 start, __u64 len) 991 __u64 start, __u64 len)
987 { 992 {
988 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 993 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
989 __u64 logical = 0, phys = 0, size = 0; 994 __u64 logical = 0, phys = 0, size = 0;
990 __u32 flags = 0; 995 __u32 flags = 0;
991 loff_t isize; 996 loff_t isize;
992 sector_t blkoff, end_blkoff; 997 sector_t blkoff, end_blkoff;
993 sector_t delalloc_blkoff; 998 sector_t delalloc_blkoff;
994 unsigned long delalloc_blklen; 999 unsigned long delalloc_blklen;
995 unsigned int blkbits = inode->i_blkbits; 1000 unsigned int blkbits = inode->i_blkbits;
996 int ret, n; 1001 int ret, n;
997 1002
998 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); 1003 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
999 if (ret) 1004 if (ret)
1000 return ret; 1005 return ret;
1001 1006
1002 mutex_lock(&inode->i_mutex); 1007 mutex_lock(&inode->i_mutex);
1003 1008
1004 isize = i_size_read(inode); 1009 isize = i_size_read(inode);
1005 1010
1006 blkoff = start >> blkbits; 1011 blkoff = start >> blkbits;
1007 end_blkoff = (start + len - 1) >> blkbits; 1012 end_blkoff = (start + len - 1) >> blkbits;
1008 1013
1009 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, 1014 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
1010 &delalloc_blkoff); 1015 &delalloc_blkoff);
1011 1016
1012 do { 1017 do {
1013 __u64 blkphy; 1018 __u64 blkphy;
1014 unsigned int maxblocks; 1019 unsigned int maxblocks;
1015 1020
1016 if (delalloc_blklen && blkoff == delalloc_blkoff) { 1021 if (delalloc_blklen && blkoff == delalloc_blkoff) {
1017 if (size) { 1022 if (size) {
1018 /* End of the current extent */ 1023 /* End of the current extent */
1019 ret = fiemap_fill_next_extent( 1024 ret = fiemap_fill_next_extent(
1020 fieinfo, logical, phys, size, flags); 1025 fieinfo, logical, phys, size, flags);
1021 if (ret) 1026 if (ret)
1022 break; 1027 break;
1023 } 1028 }
1024 if (blkoff > end_blkoff) 1029 if (blkoff > end_blkoff)
1025 break; 1030 break;
1026 1031
1027 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; 1032 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
1028 logical = blkoff << blkbits; 1033 logical = blkoff << blkbits;
1029 phys = 0; 1034 phys = 0;
1030 size = delalloc_blklen << blkbits; 1035 size = delalloc_blklen << blkbits;
1031 1036
1032 blkoff = delalloc_blkoff + delalloc_blklen; 1037 blkoff = delalloc_blkoff + delalloc_blklen;
1033 delalloc_blklen = nilfs_find_uncommitted_extent( 1038 delalloc_blklen = nilfs_find_uncommitted_extent(
1034 inode, blkoff, &delalloc_blkoff); 1039 inode, blkoff, &delalloc_blkoff);
1035 continue; 1040 continue;
1036 } 1041 }
1037 1042
1038 /* 1043 /*
1039 * Limit the number of blocks that we look up so as 1044 * Limit the number of blocks that we look up so as
1040 * not to get into the next delayed allocation extent. 1045 * not to get into the next delayed allocation extent.
1041 */ 1046 */
1042 maxblocks = INT_MAX; 1047 maxblocks = INT_MAX;
1043 if (delalloc_blklen) 1048 if (delalloc_blklen)
1044 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, 1049 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
1045 maxblocks); 1050 maxblocks);
1046 blkphy = 0; 1051 blkphy = 0;
1047 1052
1048 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1053 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1049 n = nilfs_bmap_lookup_contig( 1054 n = nilfs_bmap_lookup_contig(
1050 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); 1055 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
1051 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1056 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1052 1057
1053 if (n < 0) { 1058 if (n < 0) {
1054 int past_eof; 1059 int past_eof;
1055 1060
1056 if (unlikely(n != -ENOENT)) 1061 if (unlikely(n != -ENOENT))
1057 break; /* error */ 1062 break; /* error */
1058 1063
1059 /* HOLE */ 1064 /* HOLE */
1060 blkoff++; 1065 blkoff++;
1061 past_eof = ((blkoff << blkbits) >= isize); 1066 past_eof = ((blkoff << blkbits) >= isize);
1062 1067
1063 if (size) { 1068 if (size) {
1064 /* End of the current extent */ 1069 /* End of the current extent */
1065 1070
1066 if (past_eof) 1071 if (past_eof)
1067 flags |= FIEMAP_EXTENT_LAST; 1072 flags |= FIEMAP_EXTENT_LAST;
1068 1073
1069 ret = fiemap_fill_next_extent( 1074 ret = fiemap_fill_next_extent(
1070 fieinfo, logical, phys, size, flags); 1075 fieinfo, logical, phys, size, flags);
1071 if (ret) 1076 if (ret)
1072 break; 1077 break;
1073 size = 0; 1078 size = 0;
1074 } 1079 }
1075 if (blkoff > end_blkoff || past_eof) 1080 if (blkoff > end_blkoff || past_eof)
1076 break; 1081 break;
1077 } else { 1082 } else {
1078 if (size) { 1083 if (size) {
1079 if (phys && blkphy << blkbits == phys + size) { 1084 if (phys && blkphy << blkbits == phys + size) {
1080 /* The current extent goes on */ 1085 /* The current extent goes on */
1081 size += n << blkbits; 1086 size += n << blkbits;
1082 } else { 1087 } else {
1083 /* Terminate the current extent */ 1088 /* Terminate the current extent */
1084 ret = fiemap_fill_next_extent( 1089 ret = fiemap_fill_next_extent(
1085 fieinfo, logical, phys, size, 1090 fieinfo, logical, phys, size,
1086 flags); 1091 flags);
1087 if (ret || blkoff > end_blkoff) 1092 if (ret || blkoff > end_blkoff)
1088 break; 1093 break;
1089 1094
1090 /* Start another extent */ 1095 /* Start another extent */
1091 flags = FIEMAP_EXTENT_MERGED; 1096 flags = FIEMAP_EXTENT_MERGED;
1092 logical = blkoff << blkbits; 1097 logical = blkoff << blkbits;
1093 phys = blkphy << blkbits; 1098 phys = blkphy << blkbits;
1094 size = n << blkbits; 1099 size = n << blkbits;
1095 } 1100 }
1096 } else { 1101 } else {
1097 /* Start a new extent */ 1102 /* Start a new extent */
1098 flags = FIEMAP_EXTENT_MERGED; 1103 flags = FIEMAP_EXTENT_MERGED;
1099 logical = blkoff << blkbits; 1104 logical = blkoff << blkbits;
1100 phys = blkphy << blkbits; 1105 phys = blkphy << blkbits;
1101 size = n << blkbits; 1106 size = n << blkbits;
1102 } 1107 }
1103 blkoff += n; 1108 blkoff += n;
1104 } 1109 }
1105 cond_resched(); 1110 cond_resched();
1106 } while (true); 1111 } while (true);
1107 1112
1108 /* If ret is 1 then we just hit the end of the extent array */ 1113 /* If ret is 1 then we just hit the end of the extent array */
1109 if (ret == 1) 1114 if (ret == 1)
1110 ret = 0; 1115 ret = 0;
1111 1116
1112 mutex_unlock(&inode->i_mutex); 1117 mutex_unlock(&inode->i_mutex);
1113 return ret; 1118 return ret;
1114 } 1119 }
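For context only (not part of this commit): the nilfs_fiemap loop above reports delayed-allocation ranges it finds via nilfs_find_uncommitted_extent() with FIEMAP_EXTENT_DELALLOC, and merged on-disk runs with FIEMAP_EXTENT_MERGED. Below is a minimal user-space sketch that exercises this path through the generic FS_IOC_FIEMAP ioctl and prints the extents it gets back; the 32-extent buffer and the particular flags printed are arbitrary choices for illustration, not anything taken from the diff.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_FIEMAP */
#include <linux/fiemap.h>	/* struct fiemap, FIEMAP_* flags */

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Room for up to 32 extents per call (arbitrary for this sketch). */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		close(fd);
		return 1;
	}
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
	} else {
		for (i = 0; i < (int)fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu phys %llu len %llu%s%s%s\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_physical,
			       (unsigned long long)fe->fe_length,
			       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " DELALLOC" : "",
			       (fe->fe_flags & FIEMAP_EXTENT_MERGED) ? " MERGED" : "",
			       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " LAST" : "");
		}
	}
	free(fm);
	close(fd);
	return 0;
}

On a file with dirty mmap'd pages that have not yet been written back, extents flagged DELALLOC here should correspond to the uncommitted ranges the loop above discovers before falling through to nilfs_bmap_lookup_contig().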
fs/ocfs2/dlm/dlmmaster.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * dlmmod.c 4 * dlmmod.c
5 * 5 *
6 * standalone DLM module 6 * standalone DLM module
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 * 24 *
25 */ 25 */
26 26
27 27
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/fs.h> 29 #include <linux/fs.h>
30 #include <linux/types.h> 30 #include <linux/types.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/highmem.h> 32 #include <linux/highmem.h>
33 #include <linux/init.h> 33 #include <linux/init.h>
34 #include <linux/sysctl.h> 34 #include <linux/sysctl.h>
35 #include <linux/random.h> 35 #include <linux/random.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/socket.h> 37 #include <linux/socket.h>
38 #include <linux/inet.h> 38 #include <linux/inet.h>
39 #include <linux/spinlock.h> 39 #include <linux/spinlock.h>
40 #include <linux/delay.h> 40 #include <linux/delay.h>
41 41
42 42
43 #include "cluster/heartbeat.h" 43 #include "cluster/heartbeat.h"
44 #include "cluster/nodemanager.h" 44 #include "cluster/nodemanager.h"
45 #include "cluster/tcp.h" 45 #include "cluster/tcp.h"
46 46
47 #include "dlmapi.h" 47 #include "dlmapi.h"
48 #include "dlmcommon.h" 48 #include "dlmcommon.h"
49 #include "dlmdomain.h" 49 #include "dlmdomain.h"
50 #include "dlmdebug.h" 50 #include "dlmdebug.h"
51 51
52 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 52 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
53 #include "cluster/masklog.h" 53 #include "cluster/masklog.h"
54 54
55 static void dlm_mle_node_down(struct dlm_ctxt *dlm, 55 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
56 struct dlm_master_list_entry *mle, 56 struct dlm_master_list_entry *mle,
57 struct o2nm_node *node, 57 struct o2nm_node *node,
58 int idx); 58 int idx);
59 static void dlm_mle_node_up(struct dlm_ctxt *dlm, 59 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
60 struct dlm_master_list_entry *mle, 60 struct dlm_master_list_entry *mle,
61 struct o2nm_node *node, 61 struct o2nm_node *node,
62 int idx); 62 int idx);
63 63
64 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 64 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
65 static int dlm_do_assert_master(struct dlm_ctxt *dlm, 65 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res, 66 struct dlm_lock_resource *res,
67 void *nodemap, u32 flags); 67 void *nodemap, u32 flags);
68 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); 68 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
69 69
70 static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 70 static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
71 struct dlm_master_list_entry *mle, 71 struct dlm_master_list_entry *mle,
72 const char *name, 72 const char *name,
73 unsigned int namelen) 73 unsigned int namelen)
74 { 74 {
75 if (dlm != mle->dlm) 75 if (dlm != mle->dlm)
76 return 0; 76 return 0;
77 77
78 if (namelen != mle->mnamelen || 78 if (namelen != mle->mnamelen ||
79 memcmp(name, mle->mname, namelen) != 0) 79 memcmp(name, mle->mname, namelen) != 0)
80 return 0; 80 return 0;
81 81
82 return 1; 82 return 1;
83 } 83 }
84 84
85 static struct kmem_cache *dlm_lockres_cache; 85 static struct kmem_cache *dlm_lockres_cache;
86 static struct kmem_cache *dlm_lockname_cache; 86 static struct kmem_cache *dlm_lockname_cache;
87 static struct kmem_cache *dlm_mle_cache; 87 static struct kmem_cache *dlm_mle_cache;
88 88
89 static void dlm_mle_release(struct kref *kref); 89 static void dlm_mle_release(struct kref *kref);
90 static void dlm_init_mle(struct dlm_master_list_entry *mle, 90 static void dlm_init_mle(struct dlm_master_list_entry *mle,
91 enum dlm_mle_type type, 91 enum dlm_mle_type type,
92 struct dlm_ctxt *dlm, 92 struct dlm_ctxt *dlm,
93 struct dlm_lock_resource *res, 93 struct dlm_lock_resource *res,
94 const char *name, 94 const char *name,
95 unsigned int namelen); 95 unsigned int namelen);
96 static void dlm_put_mle(struct dlm_master_list_entry *mle); 96 static void dlm_put_mle(struct dlm_master_list_entry *mle);
97 static void __dlm_put_mle(struct dlm_master_list_entry *mle); 97 static void __dlm_put_mle(struct dlm_master_list_entry *mle);
98 static int dlm_find_mle(struct dlm_ctxt *dlm, 98 static int dlm_find_mle(struct dlm_ctxt *dlm,
99 struct dlm_master_list_entry **mle, 99 struct dlm_master_list_entry **mle,
100 char *name, unsigned int namelen); 100 char *name, unsigned int namelen);
101 101
102 static int dlm_do_master_request(struct dlm_lock_resource *res, 102 static int dlm_do_master_request(struct dlm_lock_resource *res,
103 struct dlm_master_list_entry *mle, int to); 103 struct dlm_master_list_entry *mle, int to);
104 104
105 105
106 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 106 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
107 struct dlm_lock_resource *res, 107 struct dlm_lock_resource *res,
108 struct dlm_master_list_entry *mle, 108 struct dlm_master_list_entry *mle,
109 int *blocked); 109 int *blocked);
110 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 110 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
111 struct dlm_lock_resource *res, 111 struct dlm_lock_resource *res,
112 struct dlm_master_list_entry *mle, 112 struct dlm_master_list_entry *mle,
113 int blocked); 113 int blocked);
114 static int dlm_add_migration_mle(struct dlm_ctxt *dlm, 114 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
115 struct dlm_lock_resource *res, 115 struct dlm_lock_resource *res,
116 struct dlm_master_list_entry *mle, 116 struct dlm_master_list_entry *mle,
117 struct dlm_master_list_entry **oldmle, 117 struct dlm_master_list_entry **oldmle,
118 const char *name, unsigned int namelen, 118 const char *name, unsigned int namelen,
119 u8 new_master, u8 master); 119 u8 new_master, u8 master);
120 120
121 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 121 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
122 struct dlm_lock_resource *res); 122 struct dlm_lock_resource *res);
123 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 123 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
124 struct dlm_lock_resource *res); 124 struct dlm_lock_resource *res);
125 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 125 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
126 struct dlm_lock_resource *res, 126 struct dlm_lock_resource *res,
127 u8 target); 127 u8 target);
128 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 128 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
129 struct dlm_lock_resource *res); 129 struct dlm_lock_resource *res);
130 130
131 131
132 int dlm_is_host_down(int errno) 132 int dlm_is_host_down(int errno)
133 { 133 {
134 switch (errno) { 134 switch (errno) {
135 case -EBADF: 135 case -EBADF:
136 case -ECONNREFUSED: 136 case -ECONNREFUSED:
137 case -ENOTCONN: 137 case -ENOTCONN:
138 case -ECONNRESET: 138 case -ECONNRESET:
139 case -EPIPE: 139 case -EPIPE:
140 case -EHOSTDOWN: 140 case -EHOSTDOWN:
141 case -EHOSTUNREACH: 141 case -EHOSTUNREACH:
142 case -ETIMEDOUT: 142 case -ETIMEDOUT:
143 case -ECONNABORTED: 143 case -ECONNABORTED:
144 case -ENETDOWN: 144 case -ENETDOWN:
145 case -ENETUNREACH: 145 case -ENETUNREACH:
146 case -ENETRESET: 146 case -ENETRESET:
147 case -ESHUTDOWN: 147 case -ESHUTDOWN:
148 case -ENOPROTOOPT: 148 case -ENOPROTOOPT:
149 case -EINVAL: /* if returned from our tcp code, 149 case -EINVAL: /* if returned from our tcp code,
150 this means there is no socket */ 150 this means there is no socket */
151 return 1; 151 return 1;
152 } 152 }
153 return 0; 153 return 0;
154 } 154 }
155 155
156 156
157 /* 157 /*
158 * MASTER LIST FUNCTIONS 158 * MASTER LIST FUNCTIONS
159 */ 159 */
160 160
161 161
162 /* 162 /*
163 * regarding master list entries and heartbeat callbacks: 163 * regarding master list entries and heartbeat callbacks:
164 * 164 *
165 * in order to avoid sleeping and allocation that occurs in 165 * in order to avoid sleeping and allocation that occurs in
166 * heartbeat, master list entries are simply attached to the 166 * heartbeat, master list entries are simply attached to the
167 * dlm's established heartbeat callbacks. the mle is attached 167 * dlm's established heartbeat callbacks. the mle is attached
168 * when it is created, and since the dlm->spinlock is held at 168 * when it is created, and since the dlm->spinlock is held at
169 * that time, any heartbeat event will be properly discovered 169 * that time, any heartbeat event will be properly discovered
170 * by the mle. the mle needs to be detached from the 170 * by the mle. the mle needs to be detached from the
171 * dlm->mle_hb_events list as soon as heartbeat events are no 171 * dlm->mle_hb_events list as soon as heartbeat events are no
172 * longer useful to the mle, and before the mle is freed. 172 * longer useful to the mle, and before the mle is freed.
173 * 173 *
174 * as a general rule, heartbeat events are no longer needed by 174 * as a general rule, heartbeat events are no longer needed by
175 * the mle once an "answer" regarding the lock master has been 175 * the mle once an "answer" regarding the lock master has been
176 * received. 176 * received.
177 */ 177 */
178 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, 178 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
179 struct dlm_master_list_entry *mle) 179 struct dlm_master_list_entry *mle)
180 { 180 {
181 assert_spin_locked(&dlm->spinlock); 181 assert_spin_locked(&dlm->spinlock);
182 182
183 list_add_tail(&mle->hb_events, &dlm->mle_hb_events); 183 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
184 } 184 }
185 185
186 186
187 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 187 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
188 struct dlm_master_list_entry *mle) 188 struct dlm_master_list_entry *mle)
189 { 189 {
190 if (!list_empty(&mle->hb_events)) 190 if (!list_empty(&mle->hb_events))
191 list_del_init(&mle->hb_events); 191 list_del_init(&mle->hb_events);
192 } 192 }
193 193
194 194
195 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 195 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
196 struct dlm_master_list_entry *mle) 196 struct dlm_master_list_entry *mle)
197 { 197 {
198 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 __dlm_mle_detach_hb_events(dlm, mle); 199 __dlm_mle_detach_hb_events(dlm, mle);
200 spin_unlock(&dlm->spinlock); 200 spin_unlock(&dlm->spinlock);
201 } 201 }
202 202
203 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) 203 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
204 { 204 {
205 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
206 dlm = mle->dlm; 206 dlm = mle->dlm;
207 207
208 assert_spin_locked(&dlm->spinlock); 208 assert_spin_locked(&dlm->spinlock);
209 assert_spin_locked(&dlm->master_lock); 209 assert_spin_locked(&dlm->master_lock);
210 mle->inuse++; 210 mle->inuse++;
211 kref_get(&mle->mle_refs); 211 kref_get(&mle->mle_refs);
212 } 212 }
213 213
214 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) 214 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
215 { 215 {
216 struct dlm_ctxt *dlm; 216 struct dlm_ctxt *dlm;
217 dlm = mle->dlm; 217 dlm = mle->dlm;
218 218
219 spin_lock(&dlm->spinlock); 219 spin_lock(&dlm->spinlock);
220 spin_lock(&dlm->master_lock); 220 spin_lock(&dlm->master_lock);
221 mle->inuse--; 221 mle->inuse--;
222 __dlm_put_mle(mle); 222 __dlm_put_mle(mle);
223 spin_unlock(&dlm->master_lock); 223 spin_unlock(&dlm->master_lock);
224 spin_unlock(&dlm->spinlock); 224 spin_unlock(&dlm->spinlock);
225 225
226 } 226 }
227 227
228 /* remove from list and free */ 228 /* remove from list and free */
229 static void __dlm_put_mle(struct dlm_master_list_entry *mle) 229 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
230 { 230 {
231 struct dlm_ctxt *dlm; 231 struct dlm_ctxt *dlm;
232 dlm = mle->dlm; 232 dlm = mle->dlm;
233 233
234 assert_spin_locked(&dlm->spinlock); 234 assert_spin_locked(&dlm->spinlock);
235 assert_spin_locked(&dlm->master_lock); 235 assert_spin_locked(&dlm->master_lock);
236 if (!atomic_read(&mle->mle_refs.refcount)) { 236 if (!atomic_read(&mle->mle_refs.refcount)) {
237 /* this may or may not crash, but who cares. 237 /* this may or may not crash, but who cares.
238 * it's a BUG. */ 238 * it's a BUG. */
239 mlog(ML_ERROR, "bad mle: %p\n", mle); 239 mlog(ML_ERROR, "bad mle: %p\n", mle);
240 dlm_print_one_mle(mle); 240 dlm_print_one_mle(mle);
241 BUG(); 241 BUG();
242 } else 242 } else
243 kref_put(&mle->mle_refs, dlm_mle_release); 243 kref_put(&mle->mle_refs, dlm_mle_release);
244 } 244 }
245 245
246 246
247 /* must not have any spinlocks coming in */ 247 /* must not have any spinlocks coming in */
248 static void dlm_put_mle(struct dlm_master_list_entry *mle) 248 static void dlm_put_mle(struct dlm_master_list_entry *mle)
249 { 249 {
250 struct dlm_ctxt *dlm; 250 struct dlm_ctxt *dlm;
251 dlm = mle->dlm; 251 dlm = mle->dlm;
252 252
253 spin_lock(&dlm->spinlock); 253 spin_lock(&dlm->spinlock);
254 spin_lock(&dlm->master_lock); 254 spin_lock(&dlm->master_lock);
255 __dlm_put_mle(mle); 255 __dlm_put_mle(mle);
256 spin_unlock(&dlm->master_lock); 256 spin_unlock(&dlm->master_lock);
257 spin_unlock(&dlm->spinlock); 257 spin_unlock(&dlm->spinlock);
258 } 258 }
259 259
260 static inline void dlm_get_mle(struct dlm_master_list_entry *mle) 260 static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
261 { 261 {
262 kref_get(&mle->mle_refs); 262 kref_get(&mle->mle_refs);
263 } 263 }
264 264
265 static void dlm_init_mle(struct dlm_master_list_entry *mle, 265 static void dlm_init_mle(struct dlm_master_list_entry *mle,
266 enum dlm_mle_type type, 266 enum dlm_mle_type type,
267 struct dlm_ctxt *dlm, 267 struct dlm_ctxt *dlm,
268 struct dlm_lock_resource *res, 268 struct dlm_lock_resource *res,
269 const char *name, 269 const char *name,
270 unsigned int namelen) 270 unsigned int namelen)
271 { 271 {
272 assert_spin_locked(&dlm->spinlock); 272 assert_spin_locked(&dlm->spinlock);
273 273
274 mle->dlm = dlm; 274 mle->dlm = dlm;
275 mle->type = type; 275 mle->type = type;
276 INIT_HLIST_NODE(&mle->master_hash_node); 276 INIT_HLIST_NODE(&mle->master_hash_node);
277 INIT_LIST_HEAD(&mle->hb_events); 277 INIT_LIST_HEAD(&mle->hb_events);
278 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 278 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
279 spin_lock_init(&mle->spinlock); 279 spin_lock_init(&mle->spinlock);
280 init_waitqueue_head(&mle->wq); 280 init_waitqueue_head(&mle->wq);
281 atomic_set(&mle->woken, 0); 281 atomic_set(&mle->woken, 0);
282 kref_init(&mle->mle_refs); 282 kref_init(&mle->mle_refs);
283 memset(mle->response_map, 0, sizeof(mle->response_map)); 283 memset(mle->response_map, 0, sizeof(mle->response_map));
284 mle->master = O2NM_MAX_NODES; 284 mle->master = O2NM_MAX_NODES;
285 mle->new_master = O2NM_MAX_NODES; 285 mle->new_master = O2NM_MAX_NODES;
286 mle->inuse = 0; 286 mle->inuse = 0;
287 287
288 BUG_ON(mle->type != DLM_MLE_BLOCK && 288 BUG_ON(mle->type != DLM_MLE_BLOCK &&
289 mle->type != DLM_MLE_MASTER && 289 mle->type != DLM_MLE_MASTER &&
290 mle->type != DLM_MLE_MIGRATION); 290 mle->type != DLM_MLE_MIGRATION);
291 291
292 if (mle->type == DLM_MLE_MASTER) { 292 if (mle->type == DLM_MLE_MASTER) {
293 BUG_ON(!res); 293 BUG_ON(!res);
294 mle->mleres = res; 294 mle->mleres = res;
295 memcpy(mle->mname, res->lockname.name, res->lockname.len); 295 memcpy(mle->mname, res->lockname.name, res->lockname.len);
296 mle->mnamelen = res->lockname.len; 296 mle->mnamelen = res->lockname.len;
297 mle->mnamehash = res->lockname.hash; 297 mle->mnamehash = res->lockname.hash;
298 } else { 298 } else {
299 BUG_ON(!name); 299 BUG_ON(!name);
300 mle->mleres = NULL; 300 mle->mleres = NULL;
301 memcpy(mle->mname, name, namelen); 301 memcpy(mle->mname, name, namelen);
302 mle->mnamelen = namelen; 302 mle->mnamelen = namelen;
303 mle->mnamehash = dlm_lockid_hash(name, namelen); 303 mle->mnamehash = dlm_lockid_hash(name, namelen);
304 } 304 }
305 305
306 atomic_inc(&dlm->mle_tot_count[mle->type]); 306 atomic_inc(&dlm->mle_tot_count[mle->type]);
307 atomic_inc(&dlm->mle_cur_count[mle->type]); 307 atomic_inc(&dlm->mle_cur_count[mle->type]);
308 308
309 /* copy off the node_map and register hb callbacks on our copy */ 309 /* copy off the node_map and register hb callbacks on our copy */
310 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 310 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
311 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 311 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
312 clear_bit(dlm->node_num, mle->vote_map); 312 clear_bit(dlm->node_num, mle->vote_map);
313 clear_bit(dlm->node_num, mle->node_map); 313 clear_bit(dlm->node_num, mle->node_map);
314 314
315 /* attach the mle to the domain node up/down events */ 315 /* attach the mle to the domain node up/down events */
316 __dlm_mle_attach_hb_events(dlm, mle); 316 __dlm_mle_attach_hb_events(dlm, mle);
317 } 317 }
318 318
319 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 319 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
320 { 320 {
321 assert_spin_locked(&dlm->spinlock); 321 assert_spin_locked(&dlm->spinlock);
322 assert_spin_locked(&dlm->master_lock); 322 assert_spin_locked(&dlm->master_lock);
323 323
324 if (!hlist_unhashed(&mle->master_hash_node)) 324 if (!hlist_unhashed(&mle->master_hash_node))
325 hlist_del_init(&mle->master_hash_node); 325 hlist_del_init(&mle->master_hash_node);
326 } 326 }
327 327
328 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 328 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
329 { 329 {
330 struct hlist_head *bucket; 330 struct hlist_head *bucket;
331 331
332 assert_spin_locked(&dlm->master_lock); 332 assert_spin_locked(&dlm->master_lock);
333 333
334 bucket = dlm_master_hash(dlm, mle->mnamehash); 334 bucket = dlm_master_hash(dlm, mle->mnamehash);
335 hlist_add_head(&mle->master_hash_node, bucket); 335 hlist_add_head(&mle->master_hash_node, bucket);
336 } 336 }
337 337
338 /* returns 1 if found, 0 if not */ 338 /* returns 1 if found, 0 if not */
339 static int dlm_find_mle(struct dlm_ctxt *dlm, 339 static int dlm_find_mle(struct dlm_ctxt *dlm,
340 struct dlm_master_list_entry **mle, 340 struct dlm_master_list_entry **mle,
341 char *name, unsigned int namelen) 341 char *name, unsigned int namelen)
342 { 342 {
343 struct dlm_master_list_entry *tmpmle; 343 struct dlm_master_list_entry *tmpmle;
344 struct hlist_head *bucket; 344 struct hlist_head *bucket;
345 unsigned int hash; 345 unsigned int hash;
346 346
347 assert_spin_locked(&dlm->master_lock); 347 assert_spin_locked(&dlm->master_lock);
348 348
349 hash = dlm_lockid_hash(name, namelen); 349 hash = dlm_lockid_hash(name, namelen);
350 bucket = dlm_master_hash(dlm, hash); 350 bucket = dlm_master_hash(dlm, hash);
351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) { 351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
353 continue; 353 continue;
354 dlm_get_mle(tmpmle); 354 dlm_get_mle(tmpmle);
355 *mle = tmpmle; 355 *mle = tmpmle;
356 return 1; 356 return 1;
357 } 357 }
358 return 0; 358 return 0;
359 } 359 }
360 360
361 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 361 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
362 { 362 {
363 struct dlm_master_list_entry *mle; 363 struct dlm_master_list_entry *mle;
364 364
365 assert_spin_locked(&dlm->spinlock); 365 assert_spin_locked(&dlm->spinlock);
366 366
367 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 367 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
368 if (node_up) 368 if (node_up)
369 dlm_mle_node_up(dlm, mle, NULL, idx); 369 dlm_mle_node_up(dlm, mle, NULL, idx);
370 else 370 else
371 dlm_mle_node_down(dlm, mle, NULL, idx); 371 dlm_mle_node_down(dlm, mle, NULL, idx);
372 } 372 }
373 } 373 }
374 374
375 static void dlm_mle_node_down(struct dlm_ctxt *dlm, 375 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
376 struct dlm_master_list_entry *mle, 376 struct dlm_master_list_entry *mle,
377 struct o2nm_node *node, int idx) 377 struct o2nm_node *node, int idx)
378 { 378 {
379 spin_lock(&mle->spinlock); 379 spin_lock(&mle->spinlock);
380 380
381 if (!test_bit(idx, mle->node_map)) 381 if (!test_bit(idx, mle->node_map))
382 mlog(0, "node %u already removed from nodemap!\n", idx); 382 mlog(0, "node %u already removed from nodemap!\n", idx);
383 else 383 else
384 clear_bit(idx, mle->node_map); 384 clear_bit(idx, mle->node_map);
385 385
386 spin_unlock(&mle->spinlock); 386 spin_unlock(&mle->spinlock);
387 } 387 }
388 388
389 static void dlm_mle_node_up(struct dlm_ctxt *dlm, 389 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
390 struct dlm_master_list_entry *mle, 390 struct dlm_master_list_entry *mle,
391 struct o2nm_node *node, int idx) 391 struct o2nm_node *node, int idx)
392 { 392 {
393 spin_lock(&mle->spinlock); 393 spin_lock(&mle->spinlock);
394 394
395 if (test_bit(idx, mle->node_map)) 395 if (test_bit(idx, mle->node_map))
396 mlog(0, "node %u already in node map!\n", idx); 396 mlog(0, "node %u already in node map!\n", idx);
397 else 397 else
398 set_bit(idx, mle->node_map); 398 set_bit(idx, mle->node_map);
399 399
400 spin_unlock(&mle->spinlock); 400 spin_unlock(&mle->spinlock);
401 } 401 }
402 402
403 403
404 int dlm_init_mle_cache(void) 404 int dlm_init_mle_cache(void)
405 { 405 {
406 dlm_mle_cache = kmem_cache_create("o2dlm_mle", 406 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
407 sizeof(struct dlm_master_list_entry), 407 sizeof(struct dlm_master_list_entry),
408 0, SLAB_HWCACHE_ALIGN, 408 0, SLAB_HWCACHE_ALIGN,
409 NULL); 409 NULL);
410 if (dlm_mle_cache == NULL) 410 if (dlm_mle_cache == NULL)
411 return -ENOMEM; 411 return -ENOMEM;
412 return 0; 412 return 0;
413 } 413 }
414 414
415 void dlm_destroy_mle_cache(void) 415 void dlm_destroy_mle_cache(void)
416 { 416 {
417 if (dlm_mle_cache) 417 if (dlm_mle_cache)
418 kmem_cache_destroy(dlm_mle_cache); 418 kmem_cache_destroy(dlm_mle_cache);
419 } 419 }
420 420
421 static void dlm_mle_release(struct kref *kref) 421 static void dlm_mle_release(struct kref *kref)
422 { 422 {
423 struct dlm_master_list_entry *mle; 423 struct dlm_master_list_entry *mle;
424 struct dlm_ctxt *dlm; 424 struct dlm_ctxt *dlm;
425 425
426 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 426 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
427 dlm = mle->dlm; 427 dlm = mle->dlm;
428 428
429 assert_spin_locked(&dlm->spinlock); 429 assert_spin_locked(&dlm->spinlock);
430 assert_spin_locked(&dlm->master_lock); 430 assert_spin_locked(&dlm->master_lock);
431 431
432 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, 432 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
433 mle->type); 433 mle->type);
434 434
435 /* remove from list if not already */ 435 /* remove from list if not already */
436 __dlm_unlink_mle(dlm, mle); 436 __dlm_unlink_mle(dlm, mle);
437 437
438 /* detach the mle from the domain node up/down events */ 438 /* detach the mle from the domain node up/down events */
439 __dlm_mle_detach_hb_events(dlm, mle); 439 __dlm_mle_detach_hb_events(dlm, mle);
440 440
441 atomic_dec(&dlm->mle_cur_count[mle->type]); 441 atomic_dec(&dlm->mle_cur_count[mle->type]);
442 442
443 /* NOTE: kfree under spinlock here. 443 /* NOTE: kfree under spinlock here.
444 * if this is bad, we can move this to a freelist. */ 444 * if this is bad, we can move this to a freelist. */
445 kmem_cache_free(dlm_mle_cache, mle); 445 kmem_cache_free(dlm_mle_cache, mle);
446 } 446 }
447 447
448 448
449 /* 449 /*
450 * LOCK RESOURCE FUNCTIONS 450 * LOCK RESOURCE FUNCTIONS
451 */ 451 */
452 452
453 int dlm_init_master_caches(void) 453 int dlm_init_master_caches(void)
454 { 454 {
455 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", 455 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
456 sizeof(struct dlm_lock_resource), 456 sizeof(struct dlm_lock_resource),
457 0, SLAB_HWCACHE_ALIGN, NULL); 457 0, SLAB_HWCACHE_ALIGN, NULL);
458 if (!dlm_lockres_cache) 458 if (!dlm_lockres_cache)
459 goto bail; 459 goto bail;
460 460
461 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", 461 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
462 DLM_LOCKID_NAME_MAX, 0, 462 DLM_LOCKID_NAME_MAX, 0,
463 SLAB_HWCACHE_ALIGN, NULL); 463 SLAB_HWCACHE_ALIGN, NULL);
464 if (!dlm_lockname_cache) 464 if (!dlm_lockname_cache)
465 goto bail; 465 goto bail;
466 466
467 return 0; 467 return 0;
468 bail: 468 bail:
469 dlm_destroy_master_caches(); 469 dlm_destroy_master_caches();
470 return -ENOMEM; 470 return -ENOMEM;
471 } 471 }
472 472
473 void dlm_destroy_master_caches(void) 473 void dlm_destroy_master_caches(void)
474 { 474 {
475 if (dlm_lockname_cache) { 475 if (dlm_lockname_cache) {
476 kmem_cache_destroy(dlm_lockname_cache); 476 kmem_cache_destroy(dlm_lockname_cache);
477 dlm_lockname_cache = NULL; 477 dlm_lockname_cache = NULL;
478 } 478 }
479 479
480 if (dlm_lockres_cache) { 480 if (dlm_lockres_cache) {
481 kmem_cache_destroy(dlm_lockres_cache); 481 kmem_cache_destroy(dlm_lockres_cache);
482 dlm_lockres_cache = NULL; 482 dlm_lockres_cache = NULL;
483 } 483 }
484 } 484 }
485 485
486 static void dlm_lockres_release(struct kref *kref) 486 static void dlm_lockres_release(struct kref *kref)
487 { 487 {
488 struct dlm_lock_resource *res; 488 struct dlm_lock_resource *res;
489 struct dlm_ctxt *dlm; 489 struct dlm_ctxt *dlm;
490 490
491 res = container_of(kref, struct dlm_lock_resource, refs); 491 res = container_of(kref, struct dlm_lock_resource, refs);
492 dlm = res->dlm; 492 dlm = res->dlm;
493 493
494 /* This should not happen -- all lockres' have a name 494 /* This should not happen -- all lockres' have a name
495 * associated with them at init time. */ 495 * associated with them at init time. */
496 BUG_ON(!res->lockname.name); 496 BUG_ON(!res->lockname.name);
497 497
498 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 498 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
499 res->lockname.name); 499 res->lockname.name);
500 500
501 spin_lock(&dlm->track_lock); 501 spin_lock(&dlm->track_lock);
502 if (!list_empty(&res->tracking)) 502 if (!list_empty(&res->tracking))
503 list_del_init(&res->tracking); 503 list_del_init(&res->tracking);
504 else { 504 else {
505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", 505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
506 res->lockname.len, res->lockname.name); 506 res->lockname.len, res->lockname.name);
507 dlm_print_one_lock_resource(res); 507 dlm_print_one_lock_resource(res);
508 } 508 }
509 spin_unlock(&dlm->track_lock); 509 spin_unlock(&dlm->track_lock);
510 510
511 atomic_dec(&dlm->res_cur_count); 511 atomic_dec(&dlm->res_cur_count);
512 512
513 if (!hlist_unhashed(&res->hash_node) || 513 if (!hlist_unhashed(&res->hash_node) ||
514 !list_empty(&res->granted) || 514 !list_empty(&res->granted) ||
515 !list_empty(&res->converting) || 515 !list_empty(&res->converting) ||
516 !list_empty(&res->blocked) || 516 !list_empty(&res->blocked) ||
517 !list_empty(&res->dirty) || 517 !list_empty(&res->dirty) ||
518 !list_empty(&res->recovering) || 518 !list_empty(&res->recovering) ||
519 !list_empty(&res->purge)) { 519 !list_empty(&res->purge)) {
520 mlog(ML_ERROR, 520 mlog(ML_ERROR,
521 "Going to BUG for resource %.*s." 521 "Going to BUG for resource %.*s."
522 " We're on a list! [%c%c%c%c%c%c%c]\n", 522 " We're on a list! [%c%c%c%c%c%c%c]\n",
523 res->lockname.len, res->lockname.name, 523 res->lockname.len, res->lockname.name,
524 !hlist_unhashed(&res->hash_node) ? 'H' : ' ', 524 !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
525 !list_empty(&res->granted) ? 'G' : ' ', 525 !list_empty(&res->granted) ? 'G' : ' ',
526 !list_empty(&res->converting) ? 'C' : ' ', 526 !list_empty(&res->converting) ? 'C' : ' ',
527 !list_empty(&res->blocked) ? 'B' : ' ', 527 !list_empty(&res->blocked) ? 'B' : ' ',
528 !list_empty(&res->dirty) ? 'D' : ' ', 528 !list_empty(&res->dirty) ? 'D' : ' ',
529 !list_empty(&res->recovering) ? 'R' : ' ', 529 !list_empty(&res->recovering) ? 'R' : ' ',
530 !list_empty(&res->purge) ? 'P' : ' '); 530 !list_empty(&res->purge) ? 'P' : ' ');
531 531
532 dlm_print_one_lock_resource(res); 532 dlm_print_one_lock_resource(res);
533 } 533 }
534 534
535 /* By the time we're ready to blow this guy away, we shouldn't 535 /* By the time we're ready to blow this guy away, we shouldn't
536 * be on any lists. */ 536 * be on any lists. */
537 BUG_ON(!hlist_unhashed(&res->hash_node)); 537 BUG_ON(!hlist_unhashed(&res->hash_node));
538 BUG_ON(!list_empty(&res->granted)); 538 BUG_ON(!list_empty(&res->granted));
539 BUG_ON(!list_empty(&res->converting)); 539 BUG_ON(!list_empty(&res->converting));
540 BUG_ON(!list_empty(&res->blocked)); 540 BUG_ON(!list_empty(&res->blocked));
541 BUG_ON(!list_empty(&res->dirty)); 541 BUG_ON(!list_empty(&res->dirty));
542 BUG_ON(!list_empty(&res->recovering)); 542 BUG_ON(!list_empty(&res->recovering));
543 BUG_ON(!list_empty(&res->purge)); 543 BUG_ON(!list_empty(&res->purge));
544 544
545 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); 545 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
546 546
547 kmem_cache_free(dlm_lockres_cache, res); 547 kmem_cache_free(dlm_lockres_cache, res);
548 } 548 }
549 549
550 void dlm_lockres_put(struct dlm_lock_resource *res) 550 void dlm_lockres_put(struct dlm_lock_resource *res)
551 { 551 {
552 kref_put(&res->refs, dlm_lockres_release); 552 kref_put(&res->refs, dlm_lockres_release);
553 } 553 }
554 554
555 static void dlm_init_lockres(struct dlm_ctxt *dlm, 555 static void dlm_init_lockres(struct dlm_ctxt *dlm,
556 struct dlm_lock_resource *res, 556 struct dlm_lock_resource *res,
557 const char *name, unsigned int namelen) 557 const char *name, unsigned int namelen)
558 { 558 {
559 char *qname; 559 char *qname;
560 560
561 /* If we memset here, we lose our reference to the kmalloc'd 561 /* If we memset here, we lose our reference to the kmalloc'd
562 * res->lockname.name, so be sure to init every field 562 * res->lockname.name, so be sure to init every field
563 * correctly! */ 563 * correctly! */
564 564
565 qname = (char *) res->lockname.name; 565 qname = (char *) res->lockname.name;
566 memcpy(qname, name, namelen); 566 memcpy(qname, name, namelen);
567 567
568 res->lockname.len = namelen; 568 res->lockname.len = namelen;
569 res->lockname.hash = dlm_lockid_hash(name, namelen); 569 res->lockname.hash = dlm_lockid_hash(name, namelen);
570 570
571 init_waitqueue_head(&res->wq); 571 init_waitqueue_head(&res->wq);
572 spin_lock_init(&res->spinlock); 572 spin_lock_init(&res->spinlock);
573 INIT_HLIST_NODE(&res->hash_node); 573 INIT_HLIST_NODE(&res->hash_node);
574 INIT_LIST_HEAD(&res->granted); 574 INIT_LIST_HEAD(&res->granted);
575 INIT_LIST_HEAD(&res->converting); 575 INIT_LIST_HEAD(&res->converting);
576 INIT_LIST_HEAD(&res->blocked); 576 INIT_LIST_HEAD(&res->blocked);
577 INIT_LIST_HEAD(&res->dirty); 577 INIT_LIST_HEAD(&res->dirty);
578 INIT_LIST_HEAD(&res->recovering); 578 INIT_LIST_HEAD(&res->recovering);
579 INIT_LIST_HEAD(&res->purge); 579 INIT_LIST_HEAD(&res->purge);
580 INIT_LIST_HEAD(&res->tracking); 580 INIT_LIST_HEAD(&res->tracking);
581 atomic_set(&res->asts_reserved, 0); 581 atomic_set(&res->asts_reserved, 0);
582 res->migration_pending = 0; 582 res->migration_pending = 0;
583 res->inflight_locks = 0; 583 res->inflight_locks = 0;
584 res->inflight_assert_workers = 0; 584 res->inflight_assert_workers = 0;
585 585
586 res->dlm = dlm; 586 res->dlm = dlm;
587 587
588 kref_init(&res->refs); 588 kref_init(&res->refs);
589 589
590 atomic_inc(&dlm->res_tot_count); 590 atomic_inc(&dlm->res_tot_count);
591 atomic_inc(&dlm->res_cur_count); 591 atomic_inc(&dlm->res_cur_count);
592 592
593 /* just for consistency */ 593 /* just for consistency */
594 spin_lock(&res->spinlock); 594 spin_lock(&res->spinlock);
595 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 595 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
596 spin_unlock(&res->spinlock); 596 spin_unlock(&res->spinlock);
597 597
598 res->state = DLM_LOCK_RES_IN_PROGRESS; 598 res->state = DLM_LOCK_RES_IN_PROGRESS;
599 599
600 res->last_used = 0; 600 res->last_used = 0;
601 601
602 spin_lock(&dlm->spinlock); 602 spin_lock(&dlm->spinlock);
603 list_add_tail(&res->tracking, &dlm->tracking_list); 603 list_add_tail(&res->tracking, &dlm->tracking_list);
604 spin_unlock(&dlm->spinlock); 604 spin_unlock(&dlm->spinlock);
605 605
606 memset(res->lvb, 0, DLM_LVB_LEN); 606 memset(res->lvb, 0, DLM_LVB_LEN);
607 memset(res->refmap, 0, sizeof(res->refmap)); 607 memset(res->refmap, 0, sizeof(res->refmap));
608 } 608 }
609 609
610 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 610 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
611 const char *name, 611 const char *name,
612 unsigned int namelen) 612 unsigned int namelen)
613 { 613 {
614 struct dlm_lock_resource *res = NULL; 614 struct dlm_lock_resource *res = NULL;
615 615
616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); 616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
617 if (!res) 617 if (!res)
618 goto error; 618 goto error;
619 619
620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); 620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
621 if (!res->lockname.name) 621 if (!res->lockname.name)
622 goto error; 622 goto error;
623 623
624 dlm_init_lockres(dlm, res, name, namelen); 624 dlm_init_lockres(dlm, res, name, namelen);
625 return res; 625 return res;
626 626
627 error: 627 error:
628 if (res && res->lockname.name) 628 if (res && res->lockname.name)
629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); 629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
630 630
631 if (res) 631 if (res)
632 kmem_cache_free(dlm_lockres_cache, res); 632 kmem_cache_free(dlm_lockres_cache, res);
633 return NULL; 633 return NULL;
634 } 634 }
635 635
636 void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, 636 void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
637 struct dlm_lock_resource *res, int bit) 637 struct dlm_lock_resource *res, int bit)
638 { 638 {
639 assert_spin_locked(&res->spinlock); 639 assert_spin_locked(&res->spinlock);
640 640
641 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, 641 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
642 res->lockname.name, bit, __builtin_return_address(0)); 642 res->lockname.name, bit, __builtin_return_address(0));
643 643
644 set_bit(bit, res->refmap); 644 set_bit(bit, res->refmap);
645 } 645 }
646 646
647 void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, 647 void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
648 struct dlm_lock_resource *res, int bit) 648 struct dlm_lock_resource *res, int bit)
649 { 649 {
650 assert_spin_locked(&res->spinlock); 650 assert_spin_locked(&res->spinlock);
651 651
652 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, 652 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
653 res->lockname.name, bit, __builtin_return_address(0)); 653 res->lockname.name, bit, __builtin_return_address(0));
654 654
655 clear_bit(bit, res->refmap); 655 clear_bit(bit, res->refmap);
656 } 656 }
657 657
658 658 static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
659 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
660 struct dlm_lock_resource *res) 659 struct dlm_lock_resource *res)
661 { 660 {
662 assert_spin_locked(&res->spinlock);
663
664 res->inflight_locks++; 661 res->inflight_locks++;
665 662
666 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, 663 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
667 res->lockname.len, res->lockname.name, res->inflight_locks, 664 res->lockname.len, res->lockname.name, res->inflight_locks,
668 __builtin_return_address(0)); 665 __builtin_return_address(0));
669 } 666 }
670 667
668 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
669 struct dlm_lock_resource *res)
670 {
671 assert_spin_locked(&res->spinlock);
672 __dlm_lockres_grab_inflight_ref(dlm, res);
673 }
674
671 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 675 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
672 struct dlm_lock_resource *res) 676 struct dlm_lock_resource *res)
673 { 677 {
674 assert_spin_locked(&res->spinlock); 678 assert_spin_locked(&res->spinlock);
675 679
676 BUG_ON(res->inflight_locks == 0); 680 BUG_ON(res->inflight_locks == 0);
677 681
678 res->inflight_locks--; 682 res->inflight_locks--;
679 683
680 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, 684 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
681 res->lockname.len, res->lockname.name, res->inflight_locks, 685 res->lockname.len, res->lockname.name, res->inflight_locks,
682 __builtin_return_address(0)); 686 __builtin_return_address(0));
683 687
684 wake_up(&res->wq); 688 wake_up(&res->wq);
685 } 689 }
686 690
687 void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, 691 void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
688 struct dlm_lock_resource *res) 692 struct dlm_lock_resource *res)
689 { 693 {
690 assert_spin_locked(&res->spinlock); 694 assert_spin_locked(&res->spinlock);
691 res->inflight_assert_workers++; 695 res->inflight_assert_workers++;
692 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", 696 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
693 dlm->name, res->lockname.len, res->lockname.name, 697 dlm->name, res->lockname.len, res->lockname.name,
694 res->inflight_assert_workers); 698 res->inflight_assert_workers);
695 } 699 }
696 700
697 static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, 701 static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
698 struct dlm_lock_resource *res) 702 struct dlm_lock_resource *res)
699 { 703 {
700 spin_lock(&res->spinlock); 704 spin_lock(&res->spinlock);
701 __dlm_lockres_grab_inflight_worker(dlm, res); 705 __dlm_lockres_grab_inflight_worker(dlm, res);
702 spin_unlock(&res->spinlock); 706 spin_unlock(&res->spinlock);
703 } 707 }
704 708
705 static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 709 static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
706 struct dlm_lock_resource *res) 710 struct dlm_lock_resource *res)
707 { 711 {
708 assert_spin_locked(&res->spinlock); 712 assert_spin_locked(&res->spinlock);
709 BUG_ON(res->inflight_assert_workers == 0); 713 BUG_ON(res->inflight_assert_workers == 0);
710 res->inflight_assert_workers--; 714 res->inflight_assert_workers--;
711 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", 715 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
712 dlm->name, res->lockname.len, res->lockname.name, 716 dlm->name, res->lockname.len, res->lockname.name,
713 res->inflight_assert_workers); 717 res->inflight_assert_workers);
714 } 718 }
715 719
716 static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 720 static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
717 struct dlm_lock_resource *res) 721 struct dlm_lock_resource *res)
718 { 722 {
719 spin_lock(&res->spinlock); 723 spin_lock(&res->spinlock);
720 __dlm_lockres_drop_inflight_worker(dlm, res); 724 __dlm_lockres_drop_inflight_worker(dlm, res);
721 spin_unlock(&res->spinlock); 725 spin_unlock(&res->spinlock);
722 } 726 }
723 727
724 /* 728 /*
725 * lookup a lock resource by name. 729 * lookup a lock resource by name.
726 * may already exist in the hashtable. 730 * may already exist in the hashtable.
727 * lockid is null terminated 731 * lockid is null terminated
728 * 732 *
729 * if not, allocate enough for the lockres and for 733 * if not, allocate enough for the lockres and for
730 * the temporary structure used in doing the mastering. 734 * the temporary structure used in doing the mastering.
731 * 735 *
732 * also, do a lookup in the dlm->master_list to see 736 * also, do a lookup in the dlm->master_list to see
733 * if another node has begun mastering the same lock. 737 * if another node has begun mastering the same lock.
734 * if so, there should be a block entry in there 738 * if so, there should be a block entry in there
735 * for this name, and we should *not* attempt to master 739 * for this name, and we should *not* attempt to master
736 * the lock here. need to wait around for that node 740 * the lock here. need to wait around for that node
737 * to assert_master (or die). 741 * to assert_master (or die).
738 * 742 *
739 */ 743 */
740 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 744 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
741 const char *lockid, 745 const char *lockid,
742 int namelen, 746 int namelen,
743 int flags) 747 int flags)
744 { 748 {
745 struct dlm_lock_resource *tmpres=NULL, *res=NULL; 749 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
746 struct dlm_master_list_entry *mle = NULL; 750 struct dlm_master_list_entry *mle = NULL;
747 struct dlm_master_list_entry *alloc_mle = NULL; 751 struct dlm_master_list_entry *alloc_mle = NULL;
748 int blocked = 0; 752 int blocked = 0;
749 int ret, nodenum; 753 int ret, nodenum;
750 struct dlm_node_iter iter; 754 struct dlm_node_iter iter;
751 unsigned int hash; 755 unsigned int hash;
752 int tries = 0; 756 int tries = 0;
753 int bit, wait_on_recovery = 0; 757 int bit, wait_on_recovery = 0;
754 758
755 BUG_ON(!lockid); 759 BUG_ON(!lockid);
756 760
757 hash = dlm_lockid_hash(lockid, namelen); 761 hash = dlm_lockid_hash(lockid, namelen);
758 762
759 mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 763 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
760 764
761 lookup: 765 lookup:
762 spin_lock(&dlm->spinlock); 766 spin_lock(&dlm->spinlock);
763 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 767 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
764 if (tmpres) { 768 if (tmpres) {
765 spin_unlock(&dlm->spinlock); 769 spin_unlock(&dlm->spinlock);
766 spin_lock(&tmpres->spinlock); 770 spin_lock(&tmpres->spinlock);
767 /* Wait on the thread that is mastering the resource */ 771 /* Wait on the thread that is mastering the resource */
768 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 772 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
769 __dlm_wait_on_lockres(tmpres); 773 __dlm_wait_on_lockres(tmpres);
770 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 774 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
771 spin_unlock(&tmpres->spinlock); 775 spin_unlock(&tmpres->spinlock);
772 dlm_lockres_put(tmpres); 776 dlm_lockres_put(tmpres);
773 tmpres = NULL; 777 tmpres = NULL;
774 goto lookup; 778 goto lookup;
775 } 779 }
776 780
777 /* Wait on the resource purge to complete before continuing */ 781 /* Wait on the resource purge to complete before continuing */
778 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { 782 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
779 BUG_ON(tmpres->owner == dlm->node_num); 783 BUG_ON(tmpres->owner == dlm->node_num);
780 __dlm_wait_on_lockres_flags(tmpres, 784 __dlm_wait_on_lockres_flags(tmpres,
781 DLM_LOCK_RES_DROPPING_REF); 785 DLM_LOCK_RES_DROPPING_REF);
782 spin_unlock(&tmpres->spinlock); 786 spin_unlock(&tmpres->spinlock);
783 dlm_lockres_put(tmpres); 787 dlm_lockres_put(tmpres);
784 tmpres = NULL; 788 tmpres = NULL;
785 goto lookup; 789 goto lookup;
786 } 790 }
787 791
788 /* Grab inflight ref to pin the resource */ 792 /* Grab inflight ref to pin the resource */
789 dlm_lockres_grab_inflight_ref(dlm, tmpres); 793 dlm_lockres_grab_inflight_ref(dlm, tmpres);
790 794
791 spin_unlock(&tmpres->spinlock); 795 spin_unlock(&tmpres->spinlock);
792 if (res) 796 if (res)
793 dlm_lockres_put(res); 797 dlm_lockres_put(res);
794 res = tmpres; 798 res = tmpres;
795 goto leave; 799 goto leave;
796 } 800 }
797 801
798 if (!res) { 802 if (!res) {
799 spin_unlock(&dlm->spinlock); 803 spin_unlock(&dlm->spinlock);
800 mlog(0, "allocating a new resource\n"); 804 mlog(0, "allocating a new resource\n");
801 /* nothing found and we need to allocate one. */ 805 /* nothing found and we need to allocate one. */
802 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 806 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
803 if (!alloc_mle) 807 if (!alloc_mle)
804 goto leave; 808 goto leave;
805 res = dlm_new_lockres(dlm, lockid, namelen); 809 res = dlm_new_lockres(dlm, lockid, namelen);
806 if (!res) 810 if (!res)
807 goto leave; 811 goto leave;
808 goto lookup; 812 goto lookup;
809 } 813 }
810 814
811 mlog(0, "no lockres found, allocated our own: %p\n", res); 815 mlog(0, "no lockres found, allocated our own: %p\n", res);
812 816
813 if (flags & LKM_LOCAL) { 817 if (flags & LKM_LOCAL) {
814 /* caller knows it's safe to assume it's not mastered elsewhere 818 /* caller knows it's safe to assume it's not mastered elsewhere
815 * DONE! return right away */ 819 * DONE! return right away */
816 spin_lock(&res->spinlock); 820 spin_lock(&res->spinlock);
817 dlm_change_lockres_owner(dlm, res, dlm->node_num); 821 dlm_change_lockres_owner(dlm, res, dlm->node_num);
818 __dlm_insert_lockres(dlm, res); 822 __dlm_insert_lockres(dlm, res);
819 dlm_lockres_grab_inflight_ref(dlm, res); 823 dlm_lockres_grab_inflight_ref(dlm, res);
820 spin_unlock(&res->spinlock); 824 spin_unlock(&res->spinlock);
821 spin_unlock(&dlm->spinlock); 825 spin_unlock(&dlm->spinlock);
822 /* lockres still marked IN_PROGRESS */ 826 /* lockres still marked IN_PROGRESS */
823 goto wake_waiters; 827 goto wake_waiters;
824 } 828 }
825 829
826 /* check master list to see if another node has started mastering it */ 830 /* check master list to see if another node has started mastering it */
827 spin_lock(&dlm->master_lock); 831 spin_lock(&dlm->master_lock);
828 832
829 /* if we found a block, wait for lock to be mastered by another node */ 833 /* if we found a block, wait for lock to be mastered by another node */
830 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); 834 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
831 if (blocked) { 835 if (blocked) {
832 int mig; 836 int mig;
833 if (mle->type == DLM_MLE_MASTER) { 837 if (mle->type == DLM_MLE_MASTER) {
834 mlog(ML_ERROR, "master entry for nonexistent lock!\n"); 838 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
835 BUG(); 839 BUG();
836 } 840 }
837 mig = (mle->type == DLM_MLE_MIGRATION); 841 mig = (mle->type == DLM_MLE_MIGRATION);
838 /* if there is a migration in progress, let the migration 842 /* if there is a migration in progress, let the migration
839 * finish before continuing. we can wait for the absence 843 * finish before continuing. we can wait for the absence
840 * of the MIGRATION mle: either the migrate finished or 844 * of the MIGRATION mle: either the migrate finished or
841 * one of the nodes died and the mle was cleaned up. 845 * one of the nodes died and the mle was cleaned up.
842 * if there is a BLOCK here, but it already has a master 846 * if there is a BLOCK here, but it already has a master
843 * set, we are too late. the master does not have a ref 847 * set, we are too late. the master does not have a ref
844 * for us in the refmap. detach the mle and drop it. 848 * for us in the refmap. detach the mle and drop it.
845 * either way, go back to the top and start over. */ 849 * either way, go back to the top and start over. */
846 if (mig || mle->master != O2NM_MAX_NODES) { 850 if (mig || mle->master != O2NM_MAX_NODES) {
847 BUG_ON(mig && mle->master == dlm->node_num); 851 BUG_ON(mig && mle->master == dlm->node_num);
848 /* we arrived too late. the master does not 852 /* we arrived too late. the master does not
849 * have a ref for us. retry. */ 853 * have a ref for us. retry. */
850 mlog(0, "%s:%.*s: late on %s\n", 854 mlog(0, "%s:%.*s: late on %s\n",
851 dlm->name, namelen, lockid, 855 dlm->name, namelen, lockid,
852 mig ? "MIGRATION" : "BLOCK"); 856 mig ? "MIGRATION" : "BLOCK");
853 spin_unlock(&dlm->master_lock); 857 spin_unlock(&dlm->master_lock);
854 spin_unlock(&dlm->spinlock); 858 spin_unlock(&dlm->spinlock);
855 859
856 /* master is known, detach */ 860 /* master is known, detach */
857 if (!mig) 861 if (!mig)
858 dlm_mle_detach_hb_events(dlm, mle); 862 dlm_mle_detach_hb_events(dlm, mle);
859 dlm_put_mle(mle); 863 dlm_put_mle(mle);
860 mle = NULL; 864 mle = NULL;
861 /* this is lame, but we can't wait on either 865 /* this is lame, but we can't wait on either
862 * the mle or lockres waitqueue here */ 866 * the mle or lockres waitqueue here */
863 if (mig) 867 if (mig)
864 msleep(100); 868 msleep(100);
865 goto lookup; 869 goto lookup;
866 } 870 }
867 } else { 871 } else {
868 /* go ahead and try to master lock on this node */ 872 /* go ahead and try to master lock on this node */
869 mle = alloc_mle; 873 mle = alloc_mle;
870 /* make sure this does not get freed below */ 874 /* make sure this does not get freed below */
871 alloc_mle = NULL; 875 alloc_mle = NULL;
872 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 876 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
873 set_bit(dlm->node_num, mle->maybe_map); 877 set_bit(dlm->node_num, mle->maybe_map);
874 __dlm_insert_mle(dlm, mle); 878 __dlm_insert_mle(dlm, mle);
875 879
876 /* still holding the dlm spinlock, check the recovery map 880 /* still holding the dlm spinlock, check the recovery map
877 * to see if there are any nodes that still need to be 881 * to see if there are any nodes that still need to be
878 * considered. these will not appear in the mle nodemap 882 * considered. these will not appear in the mle nodemap
879 * but they might own this lockres. wait on them. */ 883 * but they might own this lockres. wait on them. */
880 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
881 if (bit < O2NM_MAX_NODES) { 885 if (bit < O2NM_MAX_NODES) {
882 mlog(0, "%s: res %.*s, At least one node (%d) " 886 mlog(0, "%s: res %.*s, At least one node (%d) "
883 "to recover before lock mastery can begin\n", 887 "to recover before lock mastery can begin\n",
884 dlm->name, namelen, (char *)lockid, bit); 888 dlm->name, namelen, (char *)lockid, bit);
885 wait_on_recovery = 1; 889 wait_on_recovery = 1;
886 } 890 }
887 } 891 }
888 892
889 /* at this point there is either a DLM_MLE_BLOCK or a 893 /* at this point there is either a DLM_MLE_BLOCK or a
890 * DLM_MLE_MASTER on the master list, so it's safe to add the 894 * DLM_MLE_MASTER on the master list, so it's safe to add the
891 * lockres to the hashtable. anyone who finds the lock will 895 * lockres to the hashtable. anyone who finds the lock will
892 * still have to wait on the IN_PROGRESS. */ 896 * still have to wait on the IN_PROGRESS. */
893 897
894 /* finally add the lockres to its hash bucket */ 898 /* finally add the lockres to its hash bucket */
895 __dlm_insert_lockres(dlm, res); 899 __dlm_insert_lockres(dlm, res);
896 900
897 /* Grab inflight ref to pin the resource */ 901 /* since this lockres is new it doesn't require the spinlock */
898 spin_lock(&res->spinlock); 902 __dlm_lockres_grab_inflight_ref(dlm, res);
899 dlm_lockres_grab_inflight_ref(dlm, res);
900 spin_unlock(&res->spinlock);
901 903
902 /* get an extra ref on the mle in case this is a BLOCK 904 /* get an extra ref on the mle in case this is a BLOCK
903 * if so, the creator of the BLOCK may try to put the last 905 * if so, the creator of the BLOCK may try to put the last
904 * ref at this time in the assert master handler, so we 906 * ref at this time in the assert master handler, so we
905 * need an extra one to keep from a bad ptr deref. */ 907 * need an extra one to keep from a bad ptr deref. */
906 dlm_get_mle_inuse(mle); 908 dlm_get_mle_inuse(mle);
907 spin_unlock(&dlm->master_lock); 909 spin_unlock(&dlm->master_lock);
908 spin_unlock(&dlm->spinlock); 910 spin_unlock(&dlm->spinlock);
909 911
910 redo_request: 912 redo_request:
911 while (wait_on_recovery) { 913 while (wait_on_recovery) {
912 /* any cluster changes that occurred after dropping the 914 /* any cluster changes that occurred after dropping the
913 * dlm spinlock would be detectable by a change on the mle, 915 * dlm spinlock would be detectable by a change on the mle,
914 * so we only need to clear out the recovery map once. */ 916 * so we only need to clear out the recovery map once. */
915 if (dlm_is_recovery_lock(lockid, namelen)) { 917 if (dlm_is_recovery_lock(lockid, namelen)) {
916 mlog(0, "%s: Recovery map is not empty, but must " 918 mlog(0, "%s: Recovery map is not empty, but must "
917 "master $RECOVERY lock now\n", dlm->name); 919 "master $RECOVERY lock now\n", dlm->name);
918 if (!dlm_pre_master_reco_lockres(dlm, res)) 920 if (!dlm_pre_master_reco_lockres(dlm, res))
919 wait_on_recovery = 0; 921 wait_on_recovery = 0;
920 else { 922 else {
921 mlog(0, "%s: waiting 500ms for heartbeat state " 923 mlog(0, "%s: waiting 500ms for heartbeat state "
922 "change\n", dlm->name); 924 "change\n", dlm->name);
923 msleep(500); 925 msleep(500);
924 } 926 }
925 continue; 927 continue;
926 } 928 }
927 929
928 dlm_kick_recovery_thread(dlm); 930 dlm_kick_recovery_thread(dlm);
929 msleep(1000); 931 msleep(1000);
930 dlm_wait_for_recovery(dlm); 932 dlm_wait_for_recovery(dlm);
931 933
932 spin_lock(&dlm->spinlock); 934 spin_lock(&dlm->spinlock);
933 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 935 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
934 if (bit < O2NM_MAX_NODES) { 936 if (bit < O2NM_MAX_NODES) {
935 mlog(0, "%s: res %.*s, At least one node (%d) " 937 mlog(0, "%s: res %.*s, At least one node (%d) "
936 "to recover before lock mastery can begin\n", 938 "to recover before lock mastery can begin\n",
937 dlm->name, namelen, (char *)lockid, bit); 939 dlm->name, namelen, (char *)lockid, bit);
938 wait_on_recovery = 1; 940 wait_on_recovery = 1;
939 } else 941 } else
940 wait_on_recovery = 0; 942 wait_on_recovery = 0;
941 spin_unlock(&dlm->spinlock); 943 spin_unlock(&dlm->spinlock);
942 944
943 if (wait_on_recovery) 945 if (wait_on_recovery)
944 dlm_wait_for_node_recovery(dlm, bit, 10000); 946 dlm_wait_for_node_recovery(dlm, bit, 10000);
945 } 947 }
946 948
947 /* must wait for lock to be mastered elsewhere */ 949 /* must wait for lock to be mastered elsewhere */
948 if (blocked) 950 if (blocked)
949 goto wait; 951 goto wait;
950 952
951 ret = -EINVAL; 953 ret = -EINVAL;
952 dlm_node_iter_init(mle->vote_map, &iter); 954 dlm_node_iter_init(mle->vote_map, &iter);
953 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 955 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
954 ret = dlm_do_master_request(res, mle, nodenum); 956 ret = dlm_do_master_request(res, mle, nodenum);
955 if (ret < 0) 957 if (ret < 0)
956 mlog_errno(ret); 958 mlog_errno(ret);
957 if (mle->master != O2NM_MAX_NODES) { 959 if (mle->master != O2NM_MAX_NODES) {
958 /* found a master! */ 960 /* found a master! */
959 if (mle->master <= nodenum) 961 if (mle->master <= nodenum)
960 break; 962 break;
961 /* if our master request has not reached the master 963 /* if our master request has not reached the master
962 * yet, keep going until it does. this is how the 964 * yet, keep going until it does. this is how the
963 * master will know that asserts are needed back to 965 * master will know that asserts are needed back to
964 * the lower nodes. */ 966 * the lower nodes. */
965 mlog(0, "%s: res %.*s, Requests only up to %u but " 967 mlog(0, "%s: res %.*s, Requests only up to %u but "
966 "master is %u, keep going\n", dlm->name, namelen, 968 "master is %u, keep going\n", dlm->name, namelen,
967 lockid, nodenum, mle->master); 969 lockid, nodenum, mle->master);
968 } 970 }
969 } 971 }
970 972
971 wait: 973 wait:
972 /* keep going until the response map includes all nodes */ 974 /* keep going until the response map includes all nodes */
973 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 975 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
974 if (ret < 0) { 976 if (ret < 0) {
975 wait_on_recovery = 1; 977 wait_on_recovery = 1;
976 mlog(0, "%s: res %.*s, Node map changed, redo the master " 978 mlog(0, "%s: res %.*s, Node map changed, redo the master "
977 "request now, blocked=%d\n", dlm->name, res->lockname.len, 979 "request now, blocked=%d\n", dlm->name, res->lockname.len,
978 res->lockname.name, blocked); 980 res->lockname.name, blocked);
979 if (++tries > 20) { 981 if (++tries > 20) {
980 mlog(ML_ERROR, "%s: res %.*s, Spinning on " 982 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
981 "dlm_wait_for_lock_mastery, blocked = %d\n", 983 "dlm_wait_for_lock_mastery, blocked = %d\n",
982 dlm->name, res->lockname.len, 984 dlm->name, res->lockname.len,
983 res->lockname.name, blocked); 985 res->lockname.name, blocked);
984 dlm_print_one_lock_resource(res); 986 dlm_print_one_lock_resource(res);
985 dlm_print_one_mle(mle); 987 dlm_print_one_mle(mle);
986 tries = 0; 988 tries = 0;
987 } 989 }
988 goto redo_request; 990 goto redo_request;
989 } 991 }
990 992
991 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, 993 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
992 res->lockname.name, res->owner); 994 res->lockname.name, res->owner);
993 /* make sure we never continue without this */ 995 /* make sure we never continue without this */
994 BUG_ON(res->owner == O2NM_MAX_NODES); 996 BUG_ON(res->owner == O2NM_MAX_NODES);
995 997
996 /* master is known, detach if not already detached */ 998 /* master is known, detach if not already detached */
997 dlm_mle_detach_hb_events(dlm, mle); 999 dlm_mle_detach_hb_events(dlm, mle);
998 dlm_put_mle(mle); 1000 dlm_put_mle(mle);
999 /* put the extra ref */ 1001 /* put the extra ref */
1000 dlm_put_mle_inuse(mle); 1002 dlm_put_mle_inuse(mle);
1001 1003
1002 wake_waiters: 1004 wake_waiters:
1003 spin_lock(&res->spinlock); 1005 spin_lock(&res->spinlock);
1004 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1006 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1005 spin_unlock(&res->spinlock); 1007 spin_unlock(&res->spinlock);
1006 wake_up(&res->wq); 1008 wake_up(&res->wq);
1007 1009
1008 leave: 1010 leave:
1009 /* need to free the unused mle */ 1011 /* need to free the unused mle */
1010 if (alloc_mle) 1012 if (alloc_mle)
1011 kmem_cache_free(dlm_mle_cache, alloc_mle); 1013 kmem_cache_free(dlm_mle_cache, alloc_mle);
1012 1014
1013 return res; 1015 return res;
1014 } 1016 }
1015 1017
1016 1018
1017 #define DLM_MASTERY_TIMEOUT_MS 5000 1019 #define DLM_MASTERY_TIMEOUT_MS 5000
1018 1020
1019 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 1021 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
1020 struct dlm_lock_resource *res, 1022 struct dlm_lock_resource *res,
1021 struct dlm_master_list_entry *mle, 1023 struct dlm_master_list_entry *mle,
1022 int *blocked) 1024 int *blocked)
1023 { 1025 {
1024 u8 m; 1026 u8 m;
1025 int ret, bit; 1027 int ret, bit;
1026 int map_changed, voting_done; 1028 int map_changed, voting_done;
1027 int assert, sleep; 1029 int assert, sleep;
1028 1030
1029 recheck: 1031 recheck:
1030 ret = 0; 1032 ret = 0;
1031 assert = 0; 1033 assert = 0;
1032 1034
1033 /* check if another node has already become the owner */ 1035 /* check if another node has already become the owner */
1034 spin_lock(&res->spinlock); 1036 spin_lock(&res->spinlock);
1035 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1037 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1036 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, 1038 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
1037 res->lockname.len, res->lockname.name, res->owner); 1039 res->lockname.len, res->lockname.name, res->owner);
1038 spin_unlock(&res->spinlock); 1040 spin_unlock(&res->spinlock);
1039 /* this will cause the master to re-assert across 1041 /* this will cause the master to re-assert across
1040 * the whole cluster, freeing up mles */ 1042 * the whole cluster, freeing up mles */
1041 if (res->owner != dlm->node_num) { 1043 if (res->owner != dlm->node_num) {
1042 ret = dlm_do_master_request(res, mle, res->owner); 1044 ret = dlm_do_master_request(res, mle, res->owner);
1043 if (ret < 0) { 1045 if (ret < 0) {
1044 /* give recovery a chance to run */ 1046 /* give recovery a chance to run */
1045 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 1047 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
1046 msleep(500); 1048 msleep(500);
1047 goto recheck; 1049 goto recheck;
1048 } 1050 }
1049 } 1051 }
1050 ret = 0; 1052 ret = 0;
1051 goto leave; 1053 goto leave;
1052 } 1054 }
1053 spin_unlock(&res->spinlock); 1055 spin_unlock(&res->spinlock);
1054 1056
1055 spin_lock(&mle->spinlock); 1057 spin_lock(&mle->spinlock);
1056 m = mle->master; 1058 m = mle->master;
1057 map_changed = (memcmp(mle->vote_map, mle->node_map, 1059 map_changed = (memcmp(mle->vote_map, mle->node_map,
1058 sizeof(mle->vote_map)) != 0); 1060 sizeof(mle->vote_map)) != 0);
1059 voting_done = (memcmp(mle->vote_map, mle->response_map, 1061 voting_done = (memcmp(mle->vote_map, mle->response_map,
1060 sizeof(mle->vote_map)) == 0); 1062 sizeof(mle->vote_map)) == 0);
1061 1063
1062 /* restart if we hit any errors */ 1064 /* restart if we hit any errors */
1063 if (map_changed) { 1065 if (map_changed) {
1064 int b; 1066 int b;
1065 mlog(0, "%s: %.*s: node map changed, restarting\n", 1067 mlog(0, "%s: %.*s: node map changed, restarting\n",
1066 dlm->name, res->lockname.len, res->lockname.name); 1068 dlm->name, res->lockname.len, res->lockname.name);
1067 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1069 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1068 b = (mle->type == DLM_MLE_BLOCK); 1070 b = (mle->type == DLM_MLE_BLOCK);
1069 if ((*blocked && !b) || (!*blocked && b)) { 1071 if ((*blocked && !b) || (!*blocked && b)) {
1070 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1072 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1071 dlm->name, res->lockname.len, res->lockname.name, 1073 dlm->name, res->lockname.len, res->lockname.name,
1072 *blocked, b); 1074 *blocked, b);
1073 *blocked = b; 1075 *blocked = b;
1074 } 1076 }
1075 spin_unlock(&mle->spinlock); 1077 spin_unlock(&mle->spinlock);
1076 if (ret < 0) { 1078 if (ret < 0) {
1077 mlog_errno(ret); 1079 mlog_errno(ret);
1078 goto leave; 1080 goto leave;
1079 } 1081 }
1080 mlog(0, "%s:%.*s: restart lock mastery succeeded, " 1082 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
1081 "rechecking now\n", dlm->name, res->lockname.len, 1083 "rechecking now\n", dlm->name, res->lockname.len,
1082 res->lockname.name); 1084 res->lockname.name);
1083 goto recheck; 1085 goto recheck;
1084 } else { 1086 } else {
1085 if (!voting_done) { 1087 if (!voting_done) {
1086 mlog(0, "map not changed and voting not done " 1088 mlog(0, "map not changed and voting not done "
1087 "for %s:%.*s\n", dlm->name, res->lockname.len, 1089 "for %s:%.*s\n", dlm->name, res->lockname.len,
1088 res->lockname.name); 1090 res->lockname.name);
1089 } 1091 }
1090 } 1092 }
1091 1093
1092 if (m != O2NM_MAX_NODES) { 1094 if (m != O2NM_MAX_NODES) {
1093 /* another node has done an assert! 1095 /* another node has done an assert!
1094 * all done! */ 1096 * all done! */
1095 sleep = 0; 1097 sleep = 0;
1096 } else { 1098 } else {
1097 sleep = 1; 1099 sleep = 1;
1098 /* have all nodes responded? */ 1100 /* have all nodes responded? */
1099 if (voting_done && !*blocked) { 1101 if (voting_done && !*blocked) {
1100 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 1102 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
1101 if (dlm->node_num <= bit) { 1103 if (dlm->node_num <= bit) {
1102 /* my node number is lowest. 1104 /* my node number is lowest.
1103 * now tell other nodes that I am 1105 * now tell other nodes that I am
1104 * mastering this. */ 1106 * mastering this. */
1105 mle->master = dlm->node_num; 1107 mle->master = dlm->node_num;
1106 /* ref was grabbed in get_lock_resource 1108 /* ref was grabbed in get_lock_resource
1107 * will be dropped in dlmlock_master */ 1109 * will be dropped in dlmlock_master */
1108 assert = 1; 1110 assert = 1;
1109 sleep = 0; 1111 sleep = 0;
1110 } 1112 }
1111 /* if voting is done, but we have not received 1113 /* if voting is done, but we have not received
1112 * an assert master yet, we must sleep */ 1114 * an assert master yet, we must sleep */
1113 } 1115 }
1114 } 1116 }
1115 1117
1116 spin_unlock(&mle->spinlock); 1118 spin_unlock(&mle->spinlock);
1117 1119
1118 /* sleep if we haven't finished voting yet */ 1120 /* sleep if we haven't finished voting yet */
1119 if (sleep) { 1121 if (sleep) {
1120 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); 1122 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
1121 1123
1122 /* 1124 /*
1123 if (atomic_read(&mle->mle_refs.refcount) < 2) 1125 if (atomic_read(&mle->mle_refs.refcount) < 2)
1124 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, 1126 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
1125 atomic_read(&mle->mle_refs.refcount), 1127 atomic_read(&mle->mle_refs.refcount),
1126 res->lockname.len, res->lockname.name); 1128 res->lockname.len, res->lockname.name);
1127 */ 1129 */
1128 atomic_set(&mle->woken, 0); 1130 atomic_set(&mle->woken, 0);
1129 (void)wait_event_timeout(mle->wq, 1131 (void)wait_event_timeout(mle->wq,
1130 (atomic_read(&mle->woken) == 1), 1132 (atomic_read(&mle->woken) == 1),
1131 timeo); 1133 timeo);
1132 if (res->owner == O2NM_MAX_NODES) { 1134 if (res->owner == O2NM_MAX_NODES) {
1133 mlog(0, "%s:%.*s: waiting again\n", dlm->name, 1135 mlog(0, "%s:%.*s: waiting again\n", dlm->name,
1134 res->lockname.len, res->lockname.name); 1136 res->lockname.len, res->lockname.name);
1135 goto recheck; 1137 goto recheck;
1136 } 1138 }
1137 mlog(0, "done waiting, master is %u\n", res->owner); 1139 mlog(0, "done waiting, master is %u\n", res->owner);
1138 ret = 0; 1140 ret = 0;
1139 goto leave; 1141 goto leave;
1140 } 1142 }
1141 1143
1142 ret = 0; /* done */ 1144 ret = 0; /* done */
1143 if (assert) { 1145 if (assert) {
1144 m = dlm->node_num; 1146 m = dlm->node_num;
1145 mlog(0, "about to master %.*s here, this=%u\n", 1147 mlog(0, "about to master %.*s here, this=%u\n",
1146 res->lockname.len, res->lockname.name, m); 1148 res->lockname.len, res->lockname.name, m);
1147 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); 1149 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
1148 if (ret) { 1150 if (ret) {
1149 /* This is a failure in the network path, 1151 /* This is a failure in the network path,
1150 * not in the response to the assert_master 1152 * not in the response to the assert_master
1151 * (any nonzero response is a BUG on this node). 1153 * (any nonzero response is a BUG on this node).
1152 * Most likely a socket just got disconnected 1154 * Most likely a socket just got disconnected
1153 * due to node death. */ 1155 * due to node death. */
1154 mlog_errno(ret); 1156 mlog_errno(ret);
1155 } 1157 }
1156 /* no longer need to restart lock mastery. 1158 /* no longer need to restart lock mastery.
1157 * all living nodes have been contacted. */ 1159 * all living nodes have been contacted. */
1158 ret = 0; 1160 ret = 0;
1159 } 1161 }
1160 1162
1161 /* set the lockres owner */ 1163 /* set the lockres owner */
1162 spin_lock(&res->spinlock); 1164 spin_lock(&res->spinlock);
1163 /* mastery reference obtained either during 1165 /* mastery reference obtained either during
1164 * assert_master_handler or in get_lock_resource */ 1166 * assert_master_handler or in get_lock_resource */
1165 dlm_change_lockres_owner(dlm, res, m); 1167 dlm_change_lockres_owner(dlm, res, m);
1166 spin_unlock(&res->spinlock); 1168 spin_unlock(&res->spinlock);
1167 1169
1168 leave: 1170 leave:
1169 return ret; 1171 return ret;
1170 } 1172 }
1171 1173
1172 struct dlm_bitmap_diff_iter 1174 struct dlm_bitmap_diff_iter
1173 { 1175 {
1174 int curnode; 1176 int curnode;
1175 unsigned long *orig_bm; 1177 unsigned long *orig_bm;
1176 unsigned long *cur_bm; 1178 unsigned long *cur_bm;
1177 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1179 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
1178 }; 1180 };
1179 1181
1180 enum dlm_node_state_change 1182 enum dlm_node_state_change
1181 { 1183 {
1182 NODE_DOWN = -1, 1184 NODE_DOWN = -1,
1183 NODE_NO_CHANGE = 0, 1185 NODE_NO_CHANGE = 0,
1184 NODE_UP 1186 NODE_UP
1185 }; 1187 };
1186 1188
1187 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, 1189 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
1188 unsigned long *orig_bm, 1190 unsigned long *orig_bm,
1189 unsigned long *cur_bm) 1191 unsigned long *cur_bm)
1190 { 1192 {
1191 unsigned long p1, p2; 1193 unsigned long p1, p2;
1192 int i; 1194 int i;
1193 1195
1194 iter->curnode = -1; 1196 iter->curnode = -1;
1195 iter->orig_bm = orig_bm; 1197 iter->orig_bm = orig_bm;
1196 iter->cur_bm = cur_bm; 1198 iter->cur_bm = cur_bm;
1197 1199
1198 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { 1200 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1199 p1 = *(iter->orig_bm + i); 1201 p1 = *(iter->orig_bm + i);
1200 p2 = *(iter->cur_bm + i); 1202 p2 = *(iter->cur_bm + i);
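		/* symmetric difference (XOR) of the two maps: a bit is set for
		 * exactly those nodes whose up/down state changed */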
1201 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); 1203 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1202 } 1204 }
1203 } 1205 }
1204 1206
1205 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, 1207 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1206 enum dlm_node_state_change *state) 1208 enum dlm_node_state_change *state)
1207 { 1209 {
1208 int bit; 1210 int bit;
1209 1211
1210 if (iter->curnode >= O2NM_MAX_NODES) 1212 if (iter->curnode >= O2NM_MAX_NODES)
1211 return -ENOENT; 1213 return -ENOENT;
1212 1214
1213 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, 1215 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1214 iter->curnode+1); 1216 iter->curnode+1);
1215 if (bit >= O2NM_MAX_NODES) { 1217 if (bit >= O2NM_MAX_NODES) {
1216 iter->curnode = O2NM_MAX_NODES; 1218 iter->curnode = O2NM_MAX_NODES;
1217 return -ENOENT; 1219 return -ENOENT;
1218 } 1220 }
1219 1221
1220 /* if it was there in the original then this node died */ 1222 /* if it was there in the original then this node died */
1221 if (test_bit(bit, iter->orig_bm)) 1223 if (test_bit(bit, iter->orig_bm))
1222 *state = NODE_DOWN; 1224 *state = NODE_DOWN;
1223 else 1225 else
1224 *state = NODE_UP; 1226 *state = NODE_UP;
1225 1227
1226 iter->curnode = bit; 1228 iter->curnode = bit;
1227 return bit; 1229 return bit;
1228 } 1230 }
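A minimal sketch of how this iterator pair is meant to be consumed, mirroring the init-then-next loop in dlm_restart_lock_mastery() below; old_map and new_map are illustrative placeholders for whichever two node bitmaps are being compared:

	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;

	dlm_bitmap_diff_iter_init(&bdi, old_map, new_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP)
			mlog(0, "node %d came up\n", node);
		else	/* NODE_DOWN: node was set in old_map but not new_map */
			mlog(0, "node %d went down\n", node);
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}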
1229 1231
1230 1232
1231 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 1233 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1232 struct dlm_lock_resource *res, 1234 struct dlm_lock_resource *res,
1233 struct dlm_master_list_entry *mle, 1235 struct dlm_master_list_entry *mle,
1234 int blocked) 1236 int blocked)
1235 { 1237 {
1236 struct dlm_bitmap_diff_iter bdi; 1238 struct dlm_bitmap_diff_iter bdi;
1237 enum dlm_node_state_change sc; 1239 enum dlm_node_state_change sc;
1238 int node; 1240 int node;
1239 int ret = 0; 1241 int ret = 0;
1240 1242
1241 mlog(0, "something happened such that the " 1243 mlog(0, "something happened such that the "
1242 "master process may need to be restarted!\n"); 1244 "master process may need to be restarted!\n");
1243 1245
1244 assert_spin_locked(&mle->spinlock); 1246 assert_spin_locked(&mle->spinlock);
1245 1247
1246 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); 1248 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1247 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1249 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1248 while (node >= 0) { 1250 while (node >= 0) {
1249 if (sc == NODE_UP) { 1251 if (sc == NODE_UP) {
1250 /* a node came up. clear any old vote from 1252 /* a node came up. clear any old vote from
1251 * the response map and set it in the vote map 1253 * the response map and set it in the vote map
1252 * then restart the mastery. */ 1254 * then restart the mastery. */
1253 mlog(ML_NOTICE, "node %d up while restarting\n", node); 1255 mlog(ML_NOTICE, "node %d up while restarting\n", node);
1254 1256
1255 /* redo the master request, but only for the new node */ 1257 /* redo the master request, but only for the new node */
1256 mlog(0, "sending request to new node\n"); 1258 mlog(0, "sending request to new node\n");
1257 clear_bit(node, mle->response_map); 1259 clear_bit(node, mle->response_map);
1258 set_bit(node, mle->vote_map); 1260 set_bit(node, mle->vote_map);
1259 } else { 1261 } else {
1260 mlog(ML_ERROR, "node down! %d\n", node); 1262 mlog(ML_ERROR, "node down! %d\n", node);
1261 if (blocked) { 1263 if (blocked) {
1262 int lowest = find_next_bit(mle->maybe_map, 1264 int lowest = find_next_bit(mle->maybe_map,
1263 O2NM_MAX_NODES, 0); 1265 O2NM_MAX_NODES, 0);
1264 1266
1265 /* act like it was never there */ 1267 /* act like it was never there */
1266 clear_bit(node, mle->maybe_map); 1268 clear_bit(node, mle->maybe_map);
1267 1269
1268 if (node == lowest) { 1270 if (node == lowest) {
1269 mlog(0, "expected master %u died" 1271 mlog(0, "expected master %u died"
1270 " while this node was blocked " 1272 " while this node was blocked "
1271 "waiting on it!\n", node); 1273 "waiting on it!\n", node);
1272 lowest = find_next_bit(mle->maybe_map, 1274 lowest = find_next_bit(mle->maybe_map,
1273 O2NM_MAX_NODES, 1275 O2NM_MAX_NODES,
1274 lowest+1); 1276 lowest+1);
1275 if (lowest < O2NM_MAX_NODES) { 1277 if (lowest < O2NM_MAX_NODES) {
1276 mlog(0, "%s:%.*s:still " 1278 mlog(0, "%s:%.*s:still "
1277 "blocked. waiting on %u " 1279 "blocked. waiting on %u "
1278 "now\n", dlm->name, 1280 "now\n", dlm->name,
1279 res->lockname.len, 1281 res->lockname.len,
1280 res->lockname.name, 1282 res->lockname.name,
1281 lowest); 1283 lowest);
1282 } else { 1284 } else {
1283 /* mle is an MLE_BLOCK, but 1285 /* mle is an MLE_BLOCK, but
1284 * there is now nothing left to 1286 * there is now nothing left to
1285 * block on. we need to return 1287 * block on. we need to return
1286 * all the way back out and try 1288 * all the way back out and try
1287 * again with an MLE_MASTER. 1289 * again with an MLE_MASTER.
1288 * dlm_do_local_recovery_cleanup 1290 * dlm_do_local_recovery_cleanup
1289 * has already run, so the mle 1291 * has already run, so the mle
1290 * refcount is ok */ 1292 * refcount is ok */
1291 mlog(0, "%s:%.*s: no " 1293 mlog(0, "%s:%.*s: no "
1292 "longer blocking. try to " 1294 "longer blocking. try to "
1293 "master this here\n", 1295 "master this here\n",
1294 dlm->name, 1296 dlm->name,
1295 res->lockname.len, 1297 res->lockname.len,
1296 res->lockname.name); 1298 res->lockname.name);
1297 mle->type = DLM_MLE_MASTER; 1299 mle->type = DLM_MLE_MASTER;
1298 mle->mleres = res; 1300 mle->mleres = res;
1299 } 1301 }
1300 } 1302 }
1301 } 1303 }
1302 1304
1303 /* now blank out everything, as if we had never 1305 /* now blank out everything, as if we had never
1304 * contacted anyone */ 1306 * contacted anyone */
1305 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 1307 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1306 memset(mle->response_map, 0, sizeof(mle->response_map)); 1308 memset(mle->response_map, 0, sizeof(mle->response_map));
1307 /* reset the vote_map to the current node_map */ 1309 /* reset the vote_map to the current node_map */
1308 memcpy(mle->vote_map, mle->node_map, 1310 memcpy(mle->vote_map, mle->node_map,
1309 sizeof(mle->node_map)); 1311 sizeof(mle->node_map));
1310 /* put myself into the maybe map */ 1312 /* put myself into the maybe map */
1311 if (mle->type != DLM_MLE_BLOCK) 1313 if (mle->type != DLM_MLE_BLOCK)
1312 set_bit(dlm->node_num, mle->maybe_map); 1314 set_bit(dlm->node_num, mle->maybe_map);
1313 } 1315 }
1314 ret = -EAGAIN; 1316 ret = -EAGAIN;
1315 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1317 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1316 } 1318 }
1317 return ret; 1319 return ret;
1318 } 1320 }
1319 1321
1320 1322
1321 /* 1323 /*
1322 * DLM_MASTER_REQUEST_MSG 1324 * DLM_MASTER_REQUEST_MSG
1323 * 1325 *
1324 * returns: 0 on success, 1326 * returns: 0 on success,
1325 * -errno on a network error 1327 * -errno on a network error
1326 * 1328 *
1327 * on error, the caller should assume the target node is "dead" 1329 * on error, the caller should assume the target node is "dead"
1328 * 1330 *
1329 */ 1331 */
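The caller-side handling amounts to the "assume the target node is dead" rule above; dlm_get_lock_resource() earlier in this file follows this pattern when iterating the vote map. A minimal sketch of that calling pattern:

	ret = dlm_do_master_request(res, mle, nodenum);
	if (ret < 0) {
		/* network error: per the note above, treat nodenum as dead,
		 * log the error, and keep iterating the remaining nodes */
		mlog_errno(ret);
	}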
1330 1332
1331 static int dlm_do_master_request(struct dlm_lock_resource *res, 1333 static int dlm_do_master_request(struct dlm_lock_resource *res,
1332 struct dlm_master_list_entry *mle, int to) 1334 struct dlm_master_list_entry *mle, int to)
1333 { 1335 {
1334 struct dlm_ctxt *dlm = mle->dlm; 1336 struct dlm_ctxt *dlm = mle->dlm;
1335 struct dlm_master_request request; 1337 struct dlm_master_request request;
1336 int ret, response=0, resend; 1338 int ret, response=0, resend;
1337 1339
1338 memset(&request, 0, sizeof(request)); 1340 memset(&request, 0, sizeof(request));
1339 request.node_idx = dlm->node_num; 1341 request.node_idx = dlm->node_num;
1340 1342
1341 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1343 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1342 1344
1343 request.namelen = (u8)mle->mnamelen; 1345 request.namelen = (u8)mle->mnamelen;
1344 memcpy(request.name, mle->mname, request.namelen); 1346 memcpy(request.name, mle->mname, request.namelen);
1345 1347
1346 again: 1348 again:
1347 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1349 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1348 sizeof(request), to, &response); 1350 sizeof(request), to, &response);
1349 if (ret < 0) { 1351 if (ret < 0) {
1350 if (ret == -ESRCH) { 1352 if (ret == -ESRCH) {
1351 /* should never happen */ 1353 /* should never happen */
1352 mlog(ML_ERROR, "TCP stack not ready!\n"); 1354 mlog(ML_ERROR, "TCP stack not ready!\n");
1353 BUG(); 1355 BUG();
1354 } else if (ret == -EINVAL) { 1356 } else if (ret == -EINVAL) {
1355 mlog(ML_ERROR, "bad args passed to o2net!\n"); 1357 mlog(ML_ERROR, "bad args passed to o2net!\n");
1356 BUG(); 1358 BUG();
1357 } else if (ret == -ENOMEM) { 1359 } else if (ret == -ENOMEM) {
1358 mlog(ML_ERROR, "out of memory while trying to send " 1360 mlog(ML_ERROR, "out of memory while trying to send "
1359 "network message! retrying\n"); 1361 "network message! retrying\n");
1360 /* this is totally crude */ 1362 /* this is totally crude */
1361 msleep(50); 1363 msleep(50);
1362 goto again; 1364 goto again;
1363 } else if (!dlm_is_host_down(ret)) { 1365 } else if (!dlm_is_host_down(ret)) {
1364 /* not a network error. bad. */ 1366 /* not a network error. bad. */
1365 mlog_errno(ret); 1367 mlog_errno(ret);
1366 mlog(ML_ERROR, "unhandled error!"); 1368 mlog(ML_ERROR, "unhandled error!");
1367 BUG(); 1369 BUG();
1368 } 1370 }
1369 /* all other errors should be network errors, 1371 /* all other errors should be network errors,
1370 * and likely indicate node death */ 1372 * and likely indicate node death */
1371 mlog(ML_ERROR, "link to %d went down!\n", to); 1373 mlog(ML_ERROR, "link to %d went down!\n", to);
1372 goto out; 1374 goto out;
1373 } 1375 }
1374 1376
1375 ret = 0; 1377 ret = 0;
1376 resend = 0; 1378 resend = 0;
1377 spin_lock(&mle->spinlock); 1379 spin_lock(&mle->spinlock);
1378 switch (response) { 1380 switch (response) {
1379 case DLM_MASTER_RESP_YES: 1381 case DLM_MASTER_RESP_YES:
1380 set_bit(to, mle->response_map); 1382 set_bit(to, mle->response_map);
1381 mlog(0, "node %u is the master, response=YES\n", to); 1383 mlog(0, "node %u is the master, response=YES\n", to);
1382 mlog(0, "%s:%.*s: master node %u now knows I have a " 1384 mlog(0, "%s:%.*s: master node %u now knows I have a "
1383 "reference\n", dlm->name, res->lockname.len, 1385 "reference\n", dlm->name, res->lockname.len,
1384 res->lockname.name, to); 1386 res->lockname.name, to);
1385 mle->master = to; 1387 mle->master = to;
1386 break; 1388 break;
1387 case DLM_MASTER_RESP_NO: 1389 case DLM_MASTER_RESP_NO:
1388 mlog(0, "node %u not master, response=NO\n", to); 1390 mlog(0, "node %u not master, response=NO\n", to);
1389 set_bit(to, mle->response_map); 1391 set_bit(to, mle->response_map);
1390 break; 1392 break;
1391 case DLM_MASTER_RESP_MAYBE: 1393 case DLM_MASTER_RESP_MAYBE:
1392 mlog(0, "node %u not master, response=MAYBE\n", to); 1394 mlog(0, "node %u not master, response=MAYBE\n", to);
1393 set_bit(to, mle->response_map); 1395 set_bit(to, mle->response_map);
1394 set_bit(to, mle->maybe_map); 1396 set_bit(to, mle->maybe_map);
1395 break; 1397 break;
1396 case DLM_MASTER_RESP_ERROR: 1398 case DLM_MASTER_RESP_ERROR:
1397 mlog(0, "node %u hit an error, resending\n", to); 1399 mlog(0, "node %u hit an error, resending\n", to);
1398 resend = 1; 1400 resend = 1;
1399 response = 0; 1401 response = 0;
1400 break; 1402 break;
1401 default: 1403 default:
1402 mlog(ML_ERROR, "bad response! %u\n", response); 1404 mlog(ML_ERROR, "bad response! %u\n", response);
1403 BUG(); 1405 BUG();
1404 } 1406 }
1405 spin_unlock(&mle->spinlock); 1407 spin_unlock(&mle->spinlock);
1406 if (resend) { 1408 if (resend) {
1407 /* this is also totally crude */ 1409 /* this is also totally crude */
1408 msleep(50); 1410 msleep(50);
1409 goto again; 1411 goto again;
1410 } 1412 }
1411 1413
1412 out: 1414 out:
1413 return ret; 1415 return ret;
1414 } 1416 }
1415 1417
1416 /* 1418 /*
1417 * locks that can be taken here: 1419 * locks that can be taken here:
1418 * dlm->spinlock 1420 * dlm->spinlock
1419 * res->spinlock 1421 * res->spinlock
1420 * mle->spinlock 1422 * mle->spinlock
1421 * dlm->master_list 1423 * dlm->master_list
1422 * 1424 *
1423 * if possible, TRIM THIS DOWN!!! 1425 * if possible, TRIM THIS DOWN!!!
1424 */ 1426 */
1425 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, 1427 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1426 void **ret_data) 1428 void **ret_data)
1427 { 1429 {
1428 u8 response = DLM_MASTER_RESP_MAYBE; 1430 u8 response = DLM_MASTER_RESP_MAYBE;
1429 struct dlm_ctxt *dlm = data; 1431 struct dlm_ctxt *dlm = data;
1430 struct dlm_lock_resource *res = NULL; 1432 struct dlm_lock_resource *res = NULL;
1431 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 1433 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1432 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 1434 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1433 char *name; 1435 char *name;
1434 unsigned int namelen, hash; 1436 unsigned int namelen, hash;
1435 int found, ret; 1437 int found, ret;
1436 int set_maybe; 1438 int set_maybe;
1437 int dispatch_assert = 0; 1439 int dispatch_assert = 0;
1438 1440
1439 if (!dlm_grab(dlm)) 1441 if (!dlm_grab(dlm))
1440 return DLM_MASTER_RESP_NO; 1442 return DLM_MASTER_RESP_NO;
1441 1443
1442 if (!dlm_domain_fully_joined(dlm)) { 1444 if (!dlm_domain_fully_joined(dlm)) {
1443 response = DLM_MASTER_RESP_NO; 1445 response = DLM_MASTER_RESP_NO;
1444 goto send_response; 1446 goto send_response;
1445 } 1447 }
1446 1448
1447 name = request->name; 1449 name = request->name;
1448 namelen = request->namelen; 1450 namelen = request->namelen;
1449 hash = dlm_lockid_hash(name, namelen); 1451 hash = dlm_lockid_hash(name, namelen);
1450 1452
1451 if (namelen > DLM_LOCKID_NAME_MAX) { 1453 if (namelen > DLM_LOCKID_NAME_MAX) {
1452 response = DLM_IVBUFLEN; 1454 response = DLM_IVBUFLEN;
1453 goto send_response; 1455 goto send_response;
1454 } 1456 }
1455 1457
1456 way_up_top: 1458 way_up_top:
1457 spin_lock(&dlm->spinlock); 1459 spin_lock(&dlm->spinlock);
1458 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 1460 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1459 if (res) { 1461 if (res) {
1460 spin_unlock(&dlm->spinlock); 1462 spin_unlock(&dlm->spinlock);
1461 1463
1462 /* take care of the easy cases up front */ 1464 /* take care of the easy cases up front */
1463 spin_lock(&res->spinlock); 1465 spin_lock(&res->spinlock);
1464 if (res->state & (DLM_LOCK_RES_RECOVERING| 1466 if (res->state & (DLM_LOCK_RES_RECOVERING|
1465 DLM_LOCK_RES_MIGRATING)) { 1467 DLM_LOCK_RES_MIGRATING)) {
1466 spin_unlock(&res->spinlock); 1468 spin_unlock(&res->spinlock);
1467 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 1469 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1468 "being recovered/migrated\n"); 1470 "being recovered/migrated\n");
1469 response = DLM_MASTER_RESP_ERROR; 1471 response = DLM_MASTER_RESP_ERROR;
1470 if (mle) 1472 if (mle)
1471 kmem_cache_free(dlm_mle_cache, mle); 1473 kmem_cache_free(dlm_mle_cache, mle);
1472 goto send_response; 1474 goto send_response;
1473 } 1475 }
1474 1476
1475 if (res->owner == dlm->node_num) { 1477 if (res->owner == dlm->node_num) {
1476 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); 1478 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1477 spin_unlock(&res->spinlock); 1479 spin_unlock(&res->spinlock);
1478 response = DLM_MASTER_RESP_YES; 1480 response = DLM_MASTER_RESP_YES;
1479 if (mle) 1481 if (mle)
1480 kmem_cache_free(dlm_mle_cache, mle); 1482 kmem_cache_free(dlm_mle_cache, mle);
1481 1483
1482 /* this node is the owner. 1484 /* this node is the owner.
1483 * there is some extra work that needs to 1485 * there is some extra work that needs to
1484 * happen now. the requesting node has 1486 * happen now. the requesting node has
1485 * caused all nodes up to this one to 1487 * caused all nodes up to this one to
1486 * create mles. this node now needs to 1488 * create mles. this node now needs to
1487 * go back and clean those up. */ 1489 * go back and clean those up. */
1488 dispatch_assert = 1; 1490 dispatch_assert = 1;
1489 goto send_response; 1491 goto send_response;
1490 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1492 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1491 spin_unlock(&res->spinlock); 1493 spin_unlock(&res->spinlock);
1492 // mlog(0, "node %u is the master\n", res->owner); 1494 // mlog(0, "node %u is the master\n", res->owner);
1493 response = DLM_MASTER_RESP_NO; 1495 response = DLM_MASTER_RESP_NO;
1494 if (mle) 1496 if (mle)
1495 kmem_cache_free(dlm_mle_cache, mle); 1497 kmem_cache_free(dlm_mle_cache, mle);
1496 goto send_response; 1498 goto send_response;
1497 } 1499 }
1498 1500
1499 /* ok, there is no owner. either this node is 1501 /* ok, there is no owner. either this node is
1500 * being blocked, or it is actively trying to 1502 * being blocked, or it is actively trying to
1501 * master this lock. */ 1503 * master this lock. */
1502 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 1504 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1503 mlog(ML_ERROR, "lock with no owner should be " 1505 mlog(ML_ERROR, "lock with no owner should be "
1504 "in-progress!\n"); 1506 "in-progress!\n");
1505 BUG(); 1507 BUG();
1506 } 1508 }
1507 1509
1508 // mlog(0, "lockres is in progress...\n"); 1510 // mlog(0, "lockres is in progress...\n");
1509 spin_lock(&dlm->master_lock); 1511 spin_lock(&dlm->master_lock);
1510 found = dlm_find_mle(dlm, &tmpmle, name, namelen); 1512 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1511 if (!found) { 1513 if (!found) {
1512 mlog(ML_ERROR, "no mle found for this lock!\n"); 1514 mlog(ML_ERROR, "no mle found for this lock!\n");
1513 BUG(); 1515 BUG();
1514 } 1516 }
1515 set_maybe = 1; 1517 set_maybe = 1;
1516 spin_lock(&tmpmle->spinlock); 1518 spin_lock(&tmpmle->spinlock);
1517 if (tmpmle->type == DLM_MLE_BLOCK) { 1519 if (tmpmle->type == DLM_MLE_BLOCK) {
1518 // mlog(0, "this node is waiting for " 1520 // mlog(0, "this node is waiting for "
1519 // "lockres to be mastered\n"); 1521 // "lockres to be mastered\n");
1520 response = DLM_MASTER_RESP_NO; 1522 response = DLM_MASTER_RESP_NO;
1521 } else if (tmpmle->type == DLM_MLE_MIGRATION) { 1523 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1522 mlog(0, "node %u is master, but trying to migrate to " 1524 mlog(0, "node %u is master, but trying to migrate to "
1523 "node %u.\n", tmpmle->master, tmpmle->new_master); 1525 "node %u.\n", tmpmle->master, tmpmle->new_master);
1524 if (tmpmle->master == dlm->node_num) { 1526 if (tmpmle->master == dlm->node_num) {
1525 mlog(ML_ERROR, "no owner on lockres, but this " 1527 mlog(ML_ERROR, "no owner on lockres, but this "
1526 "node is trying to migrate it to %u?!\n", 1528 "node is trying to migrate it to %u?!\n",
1527 tmpmle->new_master); 1529 tmpmle->new_master);
1528 BUG(); 1530 BUG();
1529 } else { 1531 } else {
1530 /* the real master can respond on its own */ 1532 /* the real master can respond on its own */
1531 response = DLM_MASTER_RESP_NO; 1533 response = DLM_MASTER_RESP_NO;
1532 } 1534 }
1533 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { 1535 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1534 set_maybe = 0; 1536 set_maybe = 0;
1535 if (tmpmle->master == dlm->node_num) { 1537 if (tmpmle->master == dlm->node_num) {
1536 response = DLM_MASTER_RESP_YES; 1538 response = DLM_MASTER_RESP_YES;
1537 /* this node will be the owner. 1539 /* this node will be the owner.
1538 * go back and clean the mles on any 1540 * go back and clean the mles on any
1539 * other nodes */ 1541 * other nodes */
1540 dispatch_assert = 1; 1542 dispatch_assert = 1;
1541 dlm_lockres_set_refmap_bit(dlm, res, 1543 dlm_lockres_set_refmap_bit(dlm, res,
1542 request->node_idx); 1544 request->node_idx);
1543 } else 1545 } else
1544 response = DLM_MASTER_RESP_NO; 1546 response = DLM_MASTER_RESP_NO;
1545 } else { 1547 } else {
1546 // mlog(0, "this node is attempting to " 1548 // mlog(0, "this node is attempting to "
1547 // "master lockres\n"); 1549 // "master lockres\n");
1548 response = DLM_MASTER_RESP_MAYBE; 1550 response = DLM_MASTER_RESP_MAYBE;
1549 } 1551 }
1550 if (set_maybe) 1552 if (set_maybe)
1551 set_bit(request->node_idx, tmpmle->maybe_map); 1553 set_bit(request->node_idx, tmpmle->maybe_map);
1552 spin_unlock(&tmpmle->spinlock); 1554 spin_unlock(&tmpmle->spinlock);
1553 1555
1554 spin_unlock(&dlm->master_lock); 1556 spin_unlock(&dlm->master_lock);
1555 spin_unlock(&res->spinlock); 1557 spin_unlock(&res->spinlock);
1556 1558
1557 /* keep the mle attached to heartbeat events */ 1559 /* keep the mle attached to heartbeat events */
1558 dlm_put_mle(tmpmle); 1560 dlm_put_mle(tmpmle);
1559 if (mle) 1561 if (mle)
1560 kmem_cache_free(dlm_mle_cache, mle); 1562 kmem_cache_free(dlm_mle_cache, mle);
1561 goto send_response; 1563 goto send_response;
1562 } 1564 }
1563 1565
1564 /* 1566 /*
1565 * lockres doesn't exist on this node 1567 * lockres doesn't exist on this node
1566 * if there is an MLE_BLOCK, return NO 1568 * if there is an MLE_BLOCK, return NO
1567 * if there is an MLE_MASTER, return MAYBE 1569 * if there is an MLE_MASTER, return MAYBE
1568 * otherwise, add an MLE_BLOCK, return NO 1570 * otherwise, add an MLE_BLOCK, return NO
1569 */ 1571 */
1570 spin_lock(&dlm->master_lock); 1572 spin_lock(&dlm->master_lock);
1571 found = dlm_find_mle(dlm, &tmpmle, name, namelen); 1573 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1572 if (!found) { 1574 if (!found) {
1573 /* this lockid has never been seen on this node yet */ 1575 /* this lockid has never been seen on this node yet */
1574 // mlog(0, "no mle found\n"); 1576 // mlog(0, "no mle found\n");
1575 if (!mle) { 1577 if (!mle) {
1576 spin_unlock(&dlm->master_lock); 1578 spin_unlock(&dlm->master_lock);
1577 spin_unlock(&dlm->spinlock); 1579 spin_unlock(&dlm->spinlock);
1578 1580
1579 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 1581 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1580 if (!mle) { 1582 if (!mle) {
1581 response = DLM_MASTER_RESP_ERROR; 1583 response = DLM_MASTER_RESP_ERROR;
1582 mlog_errno(-ENOMEM); 1584 mlog_errno(-ENOMEM);
1583 goto send_response; 1585 goto send_response;
1584 } 1586 }
1585 goto way_up_top; 1587 goto way_up_top;
1586 } 1588 }
1587 1589
1588 // mlog(0, "this is second time thru, already allocated, " 1590 // mlog(0, "this is second time thru, already allocated, "
1589 // "add the block.\n"); 1591 // "add the block.\n");
1590 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1592 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1591 set_bit(request->node_idx, mle->maybe_map); 1593 set_bit(request->node_idx, mle->maybe_map);
1592 __dlm_insert_mle(dlm, mle); 1594 __dlm_insert_mle(dlm, mle);
1593 response = DLM_MASTER_RESP_NO; 1595 response = DLM_MASTER_RESP_NO;
1594 } else { 1596 } else {
1595 // mlog(0, "mle was found\n"); 1597 // mlog(0, "mle was found\n");
1596 set_maybe = 1; 1598 set_maybe = 1;
1597 spin_lock(&tmpmle->spinlock); 1599 spin_lock(&tmpmle->spinlock);
1598 if (tmpmle->master == dlm->node_num) { 1600 if (tmpmle->master == dlm->node_num) {
1599 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); 1601 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
1600 BUG(); 1602 BUG();
1601 } 1603 }
1602 if (tmpmle->type == DLM_MLE_BLOCK) 1604 if (tmpmle->type == DLM_MLE_BLOCK)
1603 response = DLM_MASTER_RESP_NO; 1605 response = DLM_MASTER_RESP_NO;
1604 else if (tmpmle->type == DLM_MLE_MIGRATION) { 1606 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1605 mlog(0, "migration mle was found (%u->%u)\n", 1607 mlog(0, "migration mle was found (%u->%u)\n",
1606 tmpmle->master, tmpmle->new_master); 1608 tmpmle->master, tmpmle->new_master);
1607 /* real master can respond on its own */ 1609 /* real master can respond on its own */
1608 response = DLM_MASTER_RESP_NO; 1610 response = DLM_MASTER_RESP_NO;
1609 } else 1611 } else
1610 response = DLM_MASTER_RESP_MAYBE; 1612 response = DLM_MASTER_RESP_MAYBE;
1611 if (set_maybe) 1613 if (set_maybe)
1612 set_bit(request->node_idx, tmpmle->maybe_map); 1614 set_bit(request->node_idx, tmpmle->maybe_map);
1613 spin_unlock(&tmpmle->spinlock); 1615 spin_unlock(&tmpmle->spinlock);
1614 } 1616 }
1615 spin_unlock(&dlm->master_lock); 1617 spin_unlock(&dlm->master_lock);
1616 spin_unlock(&dlm->spinlock); 1618 spin_unlock(&dlm->spinlock);
1617 1619
1618 if (found) { 1620 if (found) {
1619 /* keep the mle attached to heartbeat events */ 1621 /* keep the mle attached to heartbeat events */
1620 dlm_put_mle(tmpmle); 1622 dlm_put_mle(tmpmle);
1621 } 1623 }
1622 send_response: 1624 send_response:
1623 /* 1625 /*
1624 * __dlm_lookup_lockres() grabbed a reference to this lockres. 1626 * __dlm_lookup_lockres() grabbed a reference to this lockres.
1625 * The reference is released by dlm_assert_master_worker() under 1627 * The reference is released by dlm_assert_master_worker() under
1626 * the call to dlm_dispatch_assert_master(). If 1628 * the call to dlm_dispatch_assert_master(). If
1627 * dlm_assert_master_worker() isn't called, we drop it here. 1629 * dlm_assert_master_worker() isn't called, we drop it here.
1628 */ 1630 */
1629 if (dispatch_assert) { 1631 if (dispatch_assert) {
1630 if (response != DLM_MASTER_RESP_YES) 1632 if (response != DLM_MASTER_RESP_YES)
1631 mlog(ML_ERROR, "invalid response %d\n", response); 1633 mlog(ML_ERROR, "invalid response %d\n", response);
1632 if (!res) { 1634 if (!res) {
1633 mlog(ML_ERROR, "bad lockres while trying to assert!\n"); 1635 mlog(ML_ERROR, "bad lockres while trying to assert!\n");
1634 BUG(); 1636 BUG();
1635 } 1637 }
1636 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1638 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1637 dlm->node_num, res->lockname.len, res->lockname.name); 1639 dlm->node_num, res->lockname.len, res->lockname.name);
1638 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1640 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1639 DLM_ASSERT_MASTER_MLE_CLEANUP); 1641 DLM_ASSERT_MASTER_MLE_CLEANUP);
1640 if (ret < 0) { 1642 if (ret < 0) {
1641 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1643 mlog(ML_ERROR, "failed to dispatch assert master work\n");
1642 response = DLM_MASTER_RESP_ERROR; 1644 response = DLM_MASTER_RESP_ERROR;
1643 dlm_lockres_put(res); 1645 dlm_lockres_put(res);
1644 } else 1646 } else
1645 dlm_lockres_grab_inflight_worker(dlm, res); 1647 dlm_lockres_grab_inflight_worker(dlm, res);
1646 } else { 1648 } else {
1647 if (res) 1649 if (res)
1648 dlm_lockres_put(res); 1650 dlm_lockres_put(res);
1649 } 1651 }
1650 1652
1651 dlm_put(dlm); 1653 dlm_put(dlm);
1652 return response; 1654 return response;
1653 } 1655 }
1654 1656
1655 /* 1657 /*
1656 * DLM_ASSERT_MASTER_MSG 1658 * DLM_ASSERT_MASTER_MSG
1657 */ 1659 */
1658 1660
1659 1661
1660 /* 1662 /*
1661 * NOTE: this can be used for debugging 1663 * NOTE: this can be used for debugging
1662 * can periodically run all locks owned by this node 1664 * can periodically run all locks owned by this node
1663 * and re-assert across the cluster... 1665 * and re-assert across the cluster...
1664 */ 1666 */
1665 static int dlm_do_assert_master(struct dlm_ctxt *dlm, 1667 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
1666 struct dlm_lock_resource *res, 1668 struct dlm_lock_resource *res,
1667 void *nodemap, u32 flags) 1669 void *nodemap, u32 flags)
1668 { 1670 {
1669 struct dlm_assert_master assert; 1671 struct dlm_assert_master assert;
1670 int to, tmpret; 1672 int to, tmpret;
1671 struct dlm_node_iter iter; 1673 struct dlm_node_iter iter;
1672 int ret = 0; 1674 int ret = 0;
1673 int reassert; 1675 int reassert;
1674 const char *lockname = res->lockname.name; 1676 const char *lockname = res->lockname.name;
1675 unsigned int namelen = res->lockname.len; 1677 unsigned int namelen = res->lockname.len;
1676 1678
1677 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 1679 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1678 1680
1679 spin_lock(&res->spinlock); 1681 spin_lock(&res->spinlock);
1680 res->state |= DLM_LOCK_RES_SETREF_INPROG; 1682 res->state |= DLM_LOCK_RES_SETREF_INPROG;
1681 spin_unlock(&res->spinlock); 1683 spin_unlock(&res->spinlock);
1682 1684
1683 again: 1685 again:
1684 reassert = 0; 1686 reassert = 0;
1685 1687
1686 /* note that if this nodemap is empty, it returns 0 */ 1688 /* note that if this nodemap is empty, it returns 0 */
1687 dlm_node_iter_init(nodemap, &iter); 1689 dlm_node_iter_init(nodemap, &iter);
1688 while ((to = dlm_node_iter_next(&iter)) >= 0) { 1690 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1689 int r = 0; 1691 int r = 0;
1690 struct dlm_master_list_entry *mle = NULL; 1692 struct dlm_master_list_entry *mle = NULL;
1691 1693
1692 mlog(0, "sending assert master to %d (%.*s)\n", to, 1694 mlog(0, "sending assert master to %d (%.*s)\n", to,
1693 namelen, lockname); 1695 namelen, lockname);
1694 memset(&assert, 0, sizeof(assert)); 1696 memset(&assert, 0, sizeof(assert));
1695 assert.node_idx = dlm->node_num; 1697 assert.node_idx = dlm->node_num;
1696 assert.namelen = namelen; 1698 assert.namelen = namelen;
1697 memcpy(assert.name, lockname, namelen); 1699 memcpy(assert.name, lockname, namelen);
1698 assert.flags = cpu_to_be32(flags); 1700 assert.flags = cpu_to_be32(flags);
1699 1701
1700 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1702 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1701 &assert, sizeof(assert), to, &r); 1703 &assert, sizeof(assert), to, &r);
1702 if (tmpret < 0) { 1704 if (tmpret < 0) {
1703 mlog(ML_ERROR, "Error %d when sending message %u (key " 1705 mlog(ML_ERROR, "Error %d when sending message %u (key "
1704 "0x%x) to node %u\n", tmpret, 1706 "0x%x) to node %u\n", tmpret,
1705 DLM_ASSERT_MASTER_MSG, dlm->key, to); 1707 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1706 if (!dlm_is_host_down(tmpret)) { 1708 if (!dlm_is_host_down(tmpret)) {
1707 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1709 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1708 BUG(); 1710 BUG();
1709 } 1711 }
1710 /* a node died. finish out the rest of the nodes. */ 1712 /* a node died. finish out the rest of the nodes. */
1711 mlog(0, "link to %d went down!\n", to); 1713 mlog(0, "link to %d went down!\n", to);
1712 /* any nonzero status return will do */ 1714 /* any nonzero status return will do */
1713 ret = tmpret; 1715 ret = tmpret;
1714 r = 0; 1716 r = 0;
1715 } else if (r < 0) { 1717 } else if (r < 0) {
1716 /* ok, something is horribly messed up. kill thyself. */ 1718 /* ok, something is horribly messed up. kill thyself. */
1717 mlog(ML_ERROR,"during assert master of %.*s to %u, " 1719 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1718 "got %d.\n", namelen, lockname, to, r); 1720 "got %d.\n", namelen, lockname, to, r);
1719 spin_lock(&dlm->spinlock); 1721 spin_lock(&dlm->spinlock);
1720 spin_lock(&dlm->master_lock); 1722 spin_lock(&dlm->master_lock);
1721 if (dlm_find_mle(dlm, &mle, (char *)lockname, 1723 if (dlm_find_mle(dlm, &mle, (char *)lockname,
1722 namelen)) { 1724 namelen)) {
1723 dlm_print_one_mle(mle); 1725 dlm_print_one_mle(mle);
1724 __dlm_put_mle(mle); 1726 __dlm_put_mle(mle);
1725 } 1727 }
1726 spin_unlock(&dlm->master_lock); 1728 spin_unlock(&dlm->master_lock);
1727 spin_unlock(&dlm->spinlock); 1729 spin_unlock(&dlm->spinlock);
1728 BUG(); 1730 BUG();
1729 } 1731 }
1730 1732
1731 if (r & DLM_ASSERT_RESPONSE_REASSERT && 1733 if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1732 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { 1734 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1733 mlog(ML_ERROR, "%.*s: very strange, " 1735 mlog(ML_ERROR, "%.*s: very strange, "
1734 "master MLE but no lockres on %u\n", 1736 "master MLE but no lockres on %u\n",
1735 namelen, lockname, to); 1737 namelen, lockname, to);
1736 } 1738 }
1737 1739
1738 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1740 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1739 mlog(0, "%.*s: node %u create mles on other " 1741 mlog(0, "%.*s: node %u create mles on other "
1740 "nodes and requests a re-assert\n", 1742 "nodes and requests a re-assert\n",
1741 namelen, lockname, to); 1743 namelen, lockname, to);
1742 reassert = 1; 1744 reassert = 1;
1743 } 1745 }
1744 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { 1746 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1745 mlog(0, "%.*s: node %u has a reference to this " 1747 mlog(0, "%.*s: node %u has a reference to this "
1746 "lockres, set the bit in the refmap\n", 1748 "lockres, set the bit in the refmap\n",
1747 namelen, lockname, to); 1749 namelen, lockname, to);
1748 spin_lock(&res->spinlock); 1750 spin_lock(&res->spinlock);
1749 dlm_lockres_set_refmap_bit(dlm, res, to); 1751 dlm_lockres_set_refmap_bit(dlm, res, to);
1750 spin_unlock(&res->spinlock); 1752 spin_unlock(&res->spinlock);
1751 } 1753 }
1752 } 1754 }
1753 1755
1754 if (reassert) 1756 if (reassert)
1755 goto again; 1757 goto again;
1756 1758
1757 spin_lock(&res->spinlock); 1759 spin_lock(&res->spinlock);
1758 res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 1760 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1759 spin_unlock(&res->spinlock); 1761 spin_unlock(&res->spinlock);
1760 wake_up(&res->wq); 1762 wake_up(&res->wq);
1761 1763
1762 return ret; 1764 return ret;
1763 } 1765 }
1764 1766
1765 /* 1767 /*
1766 * locks that can be taken here: 1768 * locks that can be taken here:
1767 * dlm->spinlock 1769 * dlm->spinlock
1768 * res->spinlock 1770 * res->spinlock
1769 * mle->spinlock 1771 * mle->spinlock
1770 * dlm->master_list 1772 * dlm->master_list
1771 * 1773 *
1772 * if possible, TRIM THIS DOWN!!! 1774 * if possible, TRIM THIS DOWN!!!
1773 */ 1775 */
1774 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 1776 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1775 void **ret_data) 1777 void **ret_data)
1776 { 1778 {
1777 struct dlm_ctxt *dlm = data; 1779 struct dlm_ctxt *dlm = data;
1778 struct dlm_master_list_entry *mle = NULL; 1780 struct dlm_master_list_entry *mle = NULL;
1779 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 1781 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1780 struct dlm_lock_resource *res = NULL; 1782 struct dlm_lock_resource *res = NULL;
1781 char *name; 1783 char *name;
1782 unsigned int namelen, hash; 1784 unsigned int namelen, hash;
1783 u32 flags; 1785 u32 flags;
1784 int master_request = 0, have_lockres_ref = 0; 1786 int master_request = 0, have_lockres_ref = 0;
1785 int ret = 0; 1787 int ret = 0;
1786 1788
1787 if (!dlm_grab(dlm)) 1789 if (!dlm_grab(dlm))
1788 return 0; 1790 return 0;
1789 1791
1790 name = assert->name; 1792 name = assert->name;
1791 namelen = assert->namelen; 1793 namelen = assert->namelen;
1792 hash = dlm_lockid_hash(name, namelen); 1794 hash = dlm_lockid_hash(name, namelen);
1793 flags = be32_to_cpu(assert->flags); 1795 flags = be32_to_cpu(assert->flags);
1794 1796
1795 if (namelen > DLM_LOCKID_NAME_MAX) { 1797 if (namelen > DLM_LOCKID_NAME_MAX) {
1796 mlog(ML_ERROR, "Invalid name length!"); 1798 mlog(ML_ERROR, "Invalid name length!");
1797 goto done; 1799 goto done;
1798 } 1800 }
1799 1801
1800 spin_lock(&dlm->spinlock); 1802 spin_lock(&dlm->spinlock);
1801 1803
1802 if (flags) 1804 if (flags)
1803 mlog(0, "assert_master with flags: %u\n", flags); 1805 mlog(0, "assert_master with flags: %u\n", flags);
1804 1806
1805 /* find the MLE */ 1807 /* find the MLE */
1806 spin_lock(&dlm->master_lock); 1808 spin_lock(&dlm->master_lock);
1807 if (!dlm_find_mle(dlm, &mle, name, namelen)) { 1809 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1808 /* not an error, could be master just re-asserting */ 1810 /* not an error, could be master just re-asserting */
1809 mlog(0, "just got an assert_master from %u, but no " 1811 mlog(0, "just got an assert_master from %u, but no "
1810 "MLE for it! (%.*s)\n", assert->node_idx, 1812 "MLE for it! (%.*s)\n", assert->node_idx,
1811 namelen, name); 1813 namelen, name);
1812 } else { 1814 } else {
1813 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); 1815 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1814 if (bit >= O2NM_MAX_NODES) { 1816 if (bit >= O2NM_MAX_NODES) {
1815 /* not necessarily an error, though less likely. 1817 /* not necessarily an error, though less likely.
1816 * could be master just re-asserting. */ 1818 * could be master just re-asserting. */
1817 mlog(0, "no bits set in the maybe_map, but %u " 1819 mlog(0, "no bits set in the maybe_map, but %u "
1818 "is asserting! (%.*s)\n", assert->node_idx, 1820 "is asserting! (%.*s)\n", assert->node_idx,
1819 namelen, name); 1821 namelen, name);
1820 } else if (bit != assert->node_idx) { 1822 } else if (bit != assert->node_idx) {
1821 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 1823 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1822 mlog(0, "master %u was found, %u should " 1824 mlog(0, "master %u was found, %u should "
1823 "back off\n", assert->node_idx, bit); 1825 "back off\n", assert->node_idx, bit);
1824 } else { 1826 } else {
1825 /* with the fix for bug 569, a higher node 1827 /* with the fix for bug 569, a higher node
1826 * number winning the mastery will respond 1828 * number winning the mastery will respond
1827 * YES to mastery requests, but this node 1829 * YES to mastery requests, but this node
1828 * had no way of knowing. let it pass. */ 1830 * had no way of knowing. let it pass. */
1829 mlog(0, "%u is the lowest node, " 1831 mlog(0, "%u is the lowest node, "
1830 "%u is asserting. (%.*s) %u must " 1832 "%u is asserting. (%.*s) %u must "
1831 "have begun after %u won.\n", bit, 1833 "have begun after %u won.\n", bit,
1832 assert->node_idx, namelen, name, bit, 1834 assert->node_idx, namelen, name, bit,
1833 assert->node_idx); 1835 assert->node_idx);
1834 } 1836 }
1835 } 1837 }
1836 if (mle->type == DLM_MLE_MIGRATION) { 1838 if (mle->type == DLM_MLE_MIGRATION) {
1837 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 1839 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1838 mlog(0, "%s:%.*s: got cleanup assert" 1840 mlog(0, "%s:%.*s: got cleanup assert"
1839 " from %u for migration\n", 1841 " from %u for migration\n",
1840 dlm->name, namelen, name, 1842 dlm->name, namelen, name,
1841 assert->node_idx); 1843 assert->node_idx);
1842 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { 1844 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1843 mlog(0, "%s:%.*s: got unrelated assert" 1845 mlog(0, "%s:%.*s: got unrelated assert"
1844 " from %u for migration, ignoring\n", 1846 " from %u for migration, ignoring\n",
1845 dlm->name, namelen, name, 1847 dlm->name, namelen, name,
1846 assert->node_idx); 1848 assert->node_idx);
1847 __dlm_put_mle(mle); 1849 __dlm_put_mle(mle);
1848 spin_unlock(&dlm->master_lock); 1850 spin_unlock(&dlm->master_lock);
1849 spin_unlock(&dlm->spinlock); 1851 spin_unlock(&dlm->spinlock);
1850 goto done; 1852 goto done;
1851 } 1853 }
1852 } 1854 }
1853 } 1855 }
1854 spin_unlock(&dlm->master_lock); 1856 spin_unlock(&dlm->master_lock);
1855 1857
1856 /* ok everything checks out with the MLE 1858 /* ok everything checks out with the MLE
1857 * now check to see if there is a lockres */ 1859 * now check to see if there is a lockres */
1858 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 1860 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1859 if (res) { 1861 if (res) {
1860 spin_lock(&res->spinlock); 1862 spin_lock(&res->spinlock);
1861 if (res->state & DLM_LOCK_RES_RECOVERING) { 1863 if (res->state & DLM_LOCK_RES_RECOVERING) {
1862 mlog(ML_ERROR, "%u asserting but %.*s is " 1864 mlog(ML_ERROR, "%u asserting but %.*s is "
1863 "RECOVERING!\n", assert->node_idx, namelen, name); 1865 "RECOVERING!\n", assert->node_idx, namelen, name);
1864 goto kill; 1866 goto kill;
1865 } 1867 }
1866 if (!mle) { 1868 if (!mle) {
1867 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 1869 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1868 res->owner != assert->node_idx) { 1870 res->owner != assert->node_idx) {
1869 mlog(ML_ERROR, "DIE! Mastery assert from %u, " 1871 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1870 "but current owner is %u! (%.*s)\n", 1872 "but current owner is %u! (%.*s)\n",
1871 assert->node_idx, res->owner, namelen, 1873 assert->node_idx, res->owner, namelen,
1872 name); 1874 name);
1873 __dlm_print_one_lock_resource(res); 1875 __dlm_print_one_lock_resource(res);
1874 BUG(); 1876 BUG();
1875 } 1877 }
1876 } else if (mle->type != DLM_MLE_MIGRATION) { 1878 } else if (mle->type != DLM_MLE_MIGRATION) {
1877 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1879 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1878 /* owner is just re-asserting */ 1880 /* owner is just re-asserting */
1879 if (res->owner == assert->node_idx) { 1881 if (res->owner == assert->node_idx) {
1880 mlog(0, "owner %u re-asserting on " 1882 mlog(0, "owner %u re-asserting on "
1881 "lock %.*s\n", assert->node_idx, 1883 "lock %.*s\n", assert->node_idx,
1882 namelen, name); 1884 namelen, name);
1883 goto ok; 1885 goto ok;
1884 } 1886 }
1885 mlog(ML_ERROR, "got assert_master from " 1887 mlog(ML_ERROR, "got assert_master from "
1886 "node %u, but %u is the owner! " 1888 "node %u, but %u is the owner! "
1887 "(%.*s)\n", assert->node_idx, 1889 "(%.*s)\n", assert->node_idx,
1888 res->owner, namelen, name); 1890 res->owner, namelen, name);
1889 goto kill; 1891 goto kill;
1890 } 1892 }
1891 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 1893 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1892 mlog(ML_ERROR, "got assert from %u, but lock " 1894 mlog(ML_ERROR, "got assert from %u, but lock "
1893 "with no owner should be " 1895 "with no owner should be "
1894 "in-progress! (%.*s)\n", 1896 "in-progress! (%.*s)\n",
1895 assert->node_idx, 1897 assert->node_idx,
1896 namelen, name); 1898 namelen, name);
1897 goto kill; 1899 goto kill;
1898 } 1900 }
1899 } else /* mle->type == DLM_MLE_MIGRATION */ { 1901 } else /* mle->type == DLM_MLE_MIGRATION */ {
1900 /* should only be getting an assert from new master */ 1902 /* should only be getting an assert from new master */
1901 if (assert->node_idx != mle->new_master) { 1903 if (assert->node_idx != mle->new_master) {
1902 mlog(ML_ERROR, "got assert from %u, but " 1904 mlog(ML_ERROR, "got assert from %u, but "
1903 "new master is %u, and old master " 1905 "new master is %u, and old master "
1904 "was %u (%.*s)\n", 1906 "was %u (%.*s)\n",
1905 assert->node_idx, mle->new_master, 1907 assert->node_idx, mle->new_master,
1906 mle->master, namelen, name); 1908 mle->master, namelen, name);
1907 goto kill; 1909 goto kill;
1908 } 1910 }
1909 1911
1910 } 1912 }
1911 ok: 1913 ok:
1912 spin_unlock(&res->spinlock); 1914 spin_unlock(&res->spinlock);
1913 } 1915 }
1914 1916
1915 // mlog(0, "woo! got an assert_master from node %u!\n", 1917 // mlog(0, "woo! got an assert_master from node %u!\n",
1916 // assert->node_idx); 1918 // assert->node_idx);
1917 if (mle) { 1919 if (mle) {
1918 int extra_ref = 0; 1920 int extra_ref = 0;
1919 int nn = -1; 1921 int nn = -1;
1920 int rr, err = 0; 1922 int rr, err = 0;
1921 1923
1922 spin_lock(&mle->spinlock); 1924 spin_lock(&mle->spinlock);
1923 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1925 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1924 extra_ref = 1; 1926 extra_ref = 1;
1925 else { 1927 else {
1926 /* MASTER mle: if any bits set in the response map 1928 /* MASTER mle: if any bits set in the response map
1927 * then the calling node needs to re-assert to clear 1929 * then the calling node needs to re-assert to clear
1928 * up nodes that this node contacted */ 1930 * up nodes that this node contacted */
1929 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1931 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1930 nn+1)) < O2NM_MAX_NODES) { 1932 nn+1)) < O2NM_MAX_NODES) {
1931 if (nn != dlm->node_num && nn != assert->node_idx) { 1933 if (nn != dlm->node_num && nn != assert->node_idx) {
1932 master_request = 1; 1934 master_request = 1;
1933 break; 1935 break;
1934 } 1936 }
1935 } 1937 }
1936 } 1938 }
1937 mle->master = assert->node_idx; 1939 mle->master = assert->node_idx;
1938 atomic_set(&mle->woken, 1); 1940 atomic_set(&mle->woken, 1);
1939 wake_up(&mle->wq); 1941 wake_up(&mle->wq);
1940 spin_unlock(&mle->spinlock); 1942 spin_unlock(&mle->spinlock);
1941 1943
1942 if (res) { 1944 if (res) {
1943 int wake = 0; 1945 int wake = 0;
1944 spin_lock(&res->spinlock); 1946 spin_lock(&res->spinlock);
1945 if (mle->type == DLM_MLE_MIGRATION) { 1947 if (mle->type == DLM_MLE_MIGRATION) {
1946 mlog(0, "finishing off migration of lockres %.*s, " 1948 mlog(0, "finishing off migration of lockres %.*s, "
1947 "from %u to %u\n", 1949 "from %u to %u\n",
1948 res->lockname.len, res->lockname.name, 1950 res->lockname.len, res->lockname.name,
1949 dlm->node_num, mle->new_master); 1951 dlm->node_num, mle->new_master);
1950 res->state &= ~DLM_LOCK_RES_MIGRATING; 1952 res->state &= ~DLM_LOCK_RES_MIGRATING;
1951 wake = 1; 1953 wake = 1;
1952 dlm_change_lockres_owner(dlm, res, mle->new_master); 1954 dlm_change_lockres_owner(dlm, res, mle->new_master);
1953 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1955 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1954 } else { 1956 } else {
1955 dlm_change_lockres_owner(dlm, res, mle->master); 1957 dlm_change_lockres_owner(dlm, res, mle->master);
1956 } 1958 }
1957 spin_unlock(&res->spinlock); 1959 spin_unlock(&res->spinlock);
1958 have_lockres_ref = 1; 1960 have_lockres_ref = 1;
1959 if (wake) 1961 if (wake)
1960 wake_up(&res->wq); 1962 wake_up(&res->wq);
1961 } 1963 }
1962 1964
1963 /* master is known, detach if not already detached. 1965 /* master is known, detach if not already detached.
1964 * ensures that only one assert_master call will happen 1966 * ensures that only one assert_master call will happen
1965 * on this mle. */ 1967 * on this mle. */
1966 spin_lock(&dlm->master_lock); 1968 spin_lock(&dlm->master_lock);
1967 1969
1968 rr = atomic_read(&mle->mle_refs.refcount); 1970 rr = atomic_read(&mle->mle_refs.refcount);
1969 if (mle->inuse > 0) { 1971 if (mle->inuse > 0) {
1970 if (extra_ref && rr < 3) 1972 if (extra_ref && rr < 3)
1971 err = 1; 1973 err = 1;
1972 else if (!extra_ref && rr < 2) 1974 else if (!extra_ref && rr < 2)
1973 err = 1; 1975 err = 1;
1974 } else { 1976 } else {
1975 if (extra_ref && rr < 2) 1977 if (extra_ref && rr < 2)
1976 err = 1; 1978 err = 1;
1977 else if (!extra_ref && rr < 1) 1979 else if (!extra_ref && rr < 1)
1978 err = 1; 1980 err = 1;
1979 } 1981 }
1980 if (err) { 1982 if (err) {
1981 mlog(ML_ERROR, "%s:%.*s: got assert master from %u " 1983 mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1982 "that will mess up this node, refs=%d, extra=%d, " 1984 "that will mess up this node, refs=%d, extra=%d, "
1983 "inuse=%d\n", dlm->name, namelen, name, 1985 "inuse=%d\n", dlm->name, namelen, name,
1984 assert->node_idx, rr, extra_ref, mle->inuse); 1986 assert->node_idx, rr, extra_ref, mle->inuse);
1985 dlm_print_one_mle(mle); 1987 dlm_print_one_mle(mle);
1986 } 1988 }
1987 __dlm_unlink_mle(dlm, mle); 1989 __dlm_unlink_mle(dlm, mle);
1988 __dlm_mle_detach_hb_events(dlm, mle); 1990 __dlm_mle_detach_hb_events(dlm, mle);
1989 __dlm_put_mle(mle); 1991 __dlm_put_mle(mle);
1990 if (extra_ref) { 1992 if (extra_ref) {
1991 /* the assert master message now balances the extra 1993 /* the assert master message now balances the extra
1992 * ref given by the master / migration request message. 1994 * ref given by the master / migration request message.
1993 * if this is the last put, it will be removed 1995 * if this is the last put, it will be removed
1994 * from the list. */ 1996 * from the list. */
1995 __dlm_put_mle(mle); 1997 __dlm_put_mle(mle);
1996 } 1998 }
1997 spin_unlock(&dlm->master_lock); 1999 spin_unlock(&dlm->master_lock);
1998 } else if (res) { 2000 } else if (res) {
1999 if (res->owner != assert->node_idx) { 2001 if (res->owner != assert->node_idx) {
2000 mlog(0, "assert_master from %u, but current " 2002 mlog(0, "assert_master from %u, but current "
2001 "owner is %u (%.*s), no mle\n", assert->node_idx, 2003 "owner is %u (%.*s), no mle\n", assert->node_idx,
2002 res->owner, namelen, name); 2004 res->owner, namelen, name);
2003 } 2005 }
2004 } 2006 }
2005 spin_unlock(&dlm->spinlock); 2007 spin_unlock(&dlm->spinlock);
2006 2008
2007 done: 2009 done:
2008 ret = 0; 2010 ret = 0;
2009 if (res) { 2011 if (res) {
2010 spin_lock(&res->spinlock); 2012 spin_lock(&res->spinlock);
2011 res->state |= DLM_LOCK_RES_SETREF_INPROG; 2013 res->state |= DLM_LOCK_RES_SETREF_INPROG;
2012 spin_unlock(&res->spinlock); 2014 spin_unlock(&res->spinlock);
2013 *ret_data = (void *)res; 2015 *ret_data = (void *)res;
2014 } 2016 }
2015 dlm_put(dlm); 2017 dlm_put(dlm);
2016 if (master_request) { 2018 if (master_request) {
2017 mlog(0, "need to tell master to reassert\n"); 2019 mlog(0, "need to tell master to reassert\n");
2018 /* positive. negative would shoot down the node. */ 2020 /* positive. negative would shoot down the node. */
2019 ret |= DLM_ASSERT_RESPONSE_REASSERT; 2021 ret |= DLM_ASSERT_RESPONSE_REASSERT;
2020 if (!have_lockres_ref) { 2022 if (!have_lockres_ref) {
2021 mlog(ML_ERROR, "strange, got assert from %u, MASTER " 2023 mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2022 "mle present here for %s:%.*s, but no lockres!\n", 2024 "mle present here for %s:%.*s, but no lockres!\n",
2023 assert->node_idx, dlm->name, namelen, name); 2025 assert->node_idx, dlm->name, namelen, name);
2024 } 2026 }
2025 } 2027 }
2026 if (have_lockres_ref) { 2028 if (have_lockres_ref) {
2027 /* let the master know we have a reference to the lockres */ 2029 /* let the master know we have a reference to the lockres */
2028 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; 2030 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2029 mlog(0, "%s:%.*s: got assert from %u, need a ref\n", 2031 mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2030 dlm->name, namelen, name, assert->node_idx); 2032 dlm->name, namelen, name, assert->node_idx);
2031 } 2033 }
2032 return ret; 2034 return ret;
2033 2035
2034 kill: 2036 kill:
2035 /* kill the caller! */ 2037 /* kill the caller! */
2036 mlog(ML_ERROR, "Bad message received from another node. Dumping state " 2038 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
2037 "and killing the other node now! This node is OK and can continue.\n"); 2039 "and killing the other node now! This node is OK and can continue.\n");
2038 __dlm_print_one_lock_resource(res); 2040 __dlm_print_one_lock_resource(res);
2039 spin_unlock(&res->spinlock); 2041 spin_unlock(&res->spinlock);
2040 spin_unlock(&dlm->spinlock); 2042 spin_unlock(&dlm->spinlock);
2041 *ret_data = (void *)res; 2043 *ret_data = (void *)res;
2042 dlm_put(dlm); 2044 dlm_put(dlm);
2043 return -EINVAL; 2045 return -EINVAL;
2044 } 2046 }
2045 2047
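The handler above replies with bit flags rather than a plain status: DLM_ASSERT_RESPONSE_REASSERT asks the asserting node to broadcast again, and DLM_ASSERT_RESPONSE_MASTERY_REF records that this node holds a reference on the lockres. Below is a minimal sketch of how the asserting side might decode such a reply; the helper name and parameters are illustrative assumptions, not the actual caller in this file.

	/* Illustrative only: decode the reply bits from the remote
	 * dlm_assert_master_handler(). 'reply' is the status the remote
	 * handler returned through o2net; names are assumptions. */
	static void example_handle_assert_reply(struct dlm_ctxt *dlm,
						struct dlm_lock_resource *res,
						int to, int reply, int *reassert)
	{
		if (reply < 0)
			return;		/* network error handled elsewhere */

		if (reply & DLM_ASSERT_RESPONSE_REASSERT)
			*reassert = 1;	/* some node still needs convincing */

		if (reply & DLM_ASSERT_RESPONSE_MASTERY_REF) {
			/* the remote node holds a ref: note it in the refmap */
			spin_lock(&res->spinlock);
			dlm_lockres_set_refmap_bit(dlm, res, to);
			spin_unlock(&res->spinlock);
		}
	}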
2046 void dlm_assert_master_post_handler(int status, void *data, void *ret_data) 2048 void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2047 { 2049 {
2048 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; 2050 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2049 2051
2050 if (ret_data) { 2052 if (ret_data) {
2051 spin_lock(&res->spinlock); 2053 spin_lock(&res->spinlock);
2052 res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 2054 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2053 spin_unlock(&res->spinlock); 2055 spin_unlock(&res->spinlock);
2054 wake_up(&res->wq); 2056 wake_up(&res->wq);
2055 dlm_lockres_put(res); 2057 dlm_lockres_put(res);
2056 } 2058 }
2057 return; 2059 return;
2058 } 2060 }
2059 2061
2060 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, 2062 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2061 struct dlm_lock_resource *res, 2063 struct dlm_lock_resource *res,
2062 int ignore_higher, u8 request_from, u32 flags) 2064 int ignore_higher, u8 request_from, u32 flags)
2063 { 2065 {
2064 struct dlm_work_item *item; 2066 struct dlm_work_item *item;
2065 item = kzalloc(sizeof(*item), GFP_ATOMIC); 2067 item = kzalloc(sizeof(*item), GFP_ATOMIC);
2066 if (!item) 2068 if (!item)
2067 return -ENOMEM; 2069 return -ENOMEM;
2068 2070
2069 2071
2070 /* queue up work for dlm_assert_master_worker */ 2072 /* queue up work for dlm_assert_master_worker */
2071 dlm_grab(dlm); /* get an extra ref for the work item */ 2073 dlm_grab(dlm); /* get an extra ref for the work item */
2072 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); 2074 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
2073 item->u.am.lockres = res; /* already have a ref */ 2075 item->u.am.lockres = res; /* already have a ref */
2074 /* can optionally ignore node numbers higher than this node */ 2076 /* can optionally ignore node numbers higher than this node */
2075 item->u.am.ignore_higher = ignore_higher; 2077 item->u.am.ignore_higher = ignore_higher;
2076 item->u.am.request_from = request_from; 2078 item->u.am.request_from = request_from;
2077 item->u.am.flags = flags; 2079 item->u.am.flags = flags;
2078 2080
2079 if (ignore_higher) 2081 if (ignore_higher)
2080 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2082 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2081 res->lockname.name); 2083 res->lockname.name);
2082 2084
2083 spin_lock(&dlm->work_lock); 2085 spin_lock(&dlm->work_lock);
2084 list_add_tail(&item->list, &dlm->work_list); 2086 list_add_tail(&item->list, &dlm->work_list);
2085 spin_unlock(&dlm->work_lock); 2087 spin_unlock(&dlm->work_lock);
2086 2088
2087 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2089 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2088 return 0; 2090 return 0;
2089 } 2091 }
2090 2092
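A hypothetical call site, only to show the intended shape of a dispatch; the request_from value and flags would normally come from the incoming master request, and a real caller also owns the lockres reference that the queued work item consumes.

	/* Hypothetical usage sketch: queue a deferred assert_master broadcast
	 * instead of sending it from handler context. Values are illustrative. */
	static int example_defer_assert(struct dlm_ctxt *dlm,
					struct dlm_lock_resource *res,
					u8 request_from)
	{
		int ret;

		ret = dlm_dispatch_assert_master(dlm, res, 0 /* ignore_higher */,
						 request_from, 0 /* flags */);
		if (ret < 0) {
			mlog_errno(ret);
			/* a real caller must also drop the lockres ref it set
			 * aside for the work item, since the worker never runs */
		}
		return ret;
	}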
2091 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) 2093 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2092 { 2094 {
2093 struct dlm_ctxt *dlm = data; 2095 struct dlm_ctxt *dlm = data;
2094 int ret = 0; 2096 int ret = 0;
2095 struct dlm_lock_resource *res; 2097 struct dlm_lock_resource *res;
2096 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2098 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
2097 int ignore_higher; 2099 int ignore_higher;
2098 int bit; 2100 int bit;
2099 u8 request_from; 2101 u8 request_from;
2100 u32 flags; 2102 u32 flags;
2101 2103
2102 dlm = item->dlm; 2104 dlm = item->dlm;
2103 res = item->u.am.lockres; 2105 res = item->u.am.lockres;
2104 ignore_higher = item->u.am.ignore_higher; 2106 ignore_higher = item->u.am.ignore_higher;
2105 request_from = item->u.am.request_from; 2107 request_from = item->u.am.request_from;
2106 flags = item->u.am.flags; 2108 flags = item->u.am.flags;
2107 2109
2108 spin_lock(&dlm->spinlock); 2110 spin_lock(&dlm->spinlock);
2109 memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); 2111 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
2110 spin_unlock(&dlm->spinlock); 2112 spin_unlock(&dlm->spinlock);
2111 2113
2112 clear_bit(dlm->node_num, nodemap); 2114 clear_bit(dlm->node_num, nodemap);
2113 if (ignore_higher) { 2115 if (ignore_higher) {
2114 /* if this is just to clear up mles for nodes below 2116 /* if this is just to clear up mles for nodes below
2115 * this node, do not send the message to the original 2117 * this node, do not send the message to the original
2116 * caller or any node number higher than this */ 2118 * caller or any node number higher than this */
2117 clear_bit(request_from, nodemap); 2119 clear_bit(request_from, nodemap);
2118 bit = dlm->node_num; 2120 bit = dlm->node_num;
2119 while (1) { 2121 while (1) {
2120 bit = find_next_bit(nodemap, O2NM_MAX_NODES, 2122 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
2121 bit+1); 2123 bit+1);
2122 if (bit >= O2NM_MAX_NODES) 2124 if (bit >= O2NM_MAX_NODES)
2123 break; 2125 break;
2124 clear_bit(bit, nodemap); 2126 clear_bit(bit, nodemap);
2125 } 2127 }
2126 } 2128 }
2127 2129
2128 /* 2130 /*
2129 * If we're migrating this lock to someone else, we are no 2131 * If we're migrating this lock to someone else, we are no
2130 * longer allowed to assert our own mastery. OTOH, we need to 2132 * longer allowed to assert our own mastery. OTOH, we need to
2131 * prevent migration from starting while we're still asserting 2133 * prevent migration from starting while we're still asserting
2132 * our dominance. The reserved ast delays migration. 2134 * our dominance. The reserved ast delays migration.
2133 */ 2135 */
2134 spin_lock(&res->spinlock); 2136 spin_lock(&res->spinlock);
2135 if (res->state & DLM_LOCK_RES_MIGRATING) { 2137 if (res->state & DLM_LOCK_RES_MIGRATING) {
2136 mlog(0, "Someone asked us to assert mastery, but we're " 2138 mlog(0, "Someone asked us to assert mastery, but we're "
2137 "in the middle of migration. Skipping assert, " 2139 "in the middle of migration. Skipping assert, "
2138 "the new master will handle that.\n"); 2140 "the new master will handle that.\n");
2139 spin_unlock(&res->spinlock); 2141 spin_unlock(&res->spinlock);
2140 goto put; 2142 goto put;
2141 } else 2143 } else
2142 __dlm_lockres_reserve_ast(res); 2144 __dlm_lockres_reserve_ast(res);
2143 spin_unlock(&res->spinlock); 2145 spin_unlock(&res->spinlock);
2144 2146
2145 /* this call now finishes out the nodemap 2147 /* this call now finishes out the nodemap
2146 * even if one or more nodes die */ 2148 * even if one or more nodes die */
2147 mlog(0, "worker about to master %.*s here, this=%u\n", 2149 mlog(0, "worker about to master %.*s here, this=%u\n",
2148 res->lockname.len, res->lockname.name, dlm->node_num); 2150 res->lockname.len, res->lockname.name, dlm->node_num);
2149 ret = dlm_do_assert_master(dlm, res, nodemap, flags); 2151 ret = dlm_do_assert_master(dlm, res, nodemap, flags);
2150 if (ret < 0) { 2152 if (ret < 0) {
2151 /* no need to restart, we are done */ 2153 /* no need to restart, we are done */
2152 if (!dlm_is_host_down(ret)) 2154 if (!dlm_is_host_down(ret))
2153 mlog_errno(ret); 2155 mlog_errno(ret);
2154 } 2156 }
2155 2157
2156 /* Ok, we've asserted ourselves. Let's let migration start. */ 2158 /* Ok, we've asserted ourselves. Let's let migration start. */
2157 dlm_lockres_release_ast(dlm, res); 2159 dlm_lockres_release_ast(dlm, res);
2158 2160
2159 put: 2161 put:
2160 dlm_lockres_drop_inflight_worker(dlm, res); 2162 dlm_lockres_drop_inflight_worker(dlm, res);
2161 2163
2162 dlm_lockres_put(res); 2164 dlm_lockres_put(res);
2163 2165
2164 mlog(0, "finished with dlm_assert_master_worker\n"); 2166 mlog(0, "finished with dlm_assert_master_worker\n");
2165 } 2167 }
2166 2168
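The ignore_higher branch above trims the copied domain map down to the lower-numbered nodes that might still hold a stale mle: it drops this node, the original requester, and every node number above this node. A compact, illustrative restatement of that trimming follows (helper name assumed).

	/* Illustrative only: the same bitmap trimming done by the
	 * ignore_higher branch in dlm_assert_master_worker(), as a helper. */
	static void example_trim_nodemap(unsigned long *nodemap, u8 this_node,
					 u8 request_from)
	{
		int bit;

		clear_bit(this_node, nodemap);		/* never message ourselves */
		clear_bit(request_from, nodemap);	/* nor the original caller */

		for (bit = find_next_bit(nodemap, O2NM_MAX_NODES, this_node + 1);
		     bit < O2NM_MAX_NODES;
		     bit = find_next_bit(nodemap, O2NM_MAX_NODES, bit + 1))
			clear_bit(bit, nodemap);	/* nor any higher node */
	}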
2167 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. 2169 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
2168 * We cannot wait for node recovery to complete to begin mastering this 2170 * We cannot wait for node recovery to complete to begin mastering this
2169 * lockres because this lockres is used to kick off recovery! ;-) 2171 * lockres because this lockres is used to kick off recovery! ;-)
2170 * So, do a pre-check on all living nodes to see if any of those nodes 2172 * So, do a pre-check on all living nodes to see if any of those nodes
2171 * think that $RECOVERY is currently mastered by a dead node. If so, 2173 * think that $RECOVERY is currently mastered by a dead node. If so,
2172 * we wait a short time to allow that node to get notified by its own 2174 * we wait a short time to allow that node to get notified by its own
2173 * heartbeat stack, then check again. All $RECOVERY lock resources 2175 * heartbeat stack, then check again. All $RECOVERY lock resources
2174 * mastered by dead nodes are purged when the heartbeat callback is 2176 * mastered by dead nodes are purged when the heartbeat callback is
2175 * fired, so we can know for sure that it is safe to continue once 2177 * fired, so we can know for sure that it is safe to continue once
2176 * the node returns a live node or no node. */ 2178 * the node returns a live node or no node. */
2177 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2179 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2178 struct dlm_lock_resource *res) 2180 struct dlm_lock_resource *res)
2179 { 2181 {
2180 struct dlm_node_iter iter; 2182 struct dlm_node_iter iter;
2181 int nodenum; 2183 int nodenum;
2182 int ret = 0; 2184 int ret = 0;
2183 u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; 2185 u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
2184 2186
2185 spin_lock(&dlm->spinlock); 2187 spin_lock(&dlm->spinlock);
2186 dlm_node_iter_init(dlm->domain_map, &iter); 2188 dlm_node_iter_init(dlm->domain_map, &iter);
2187 spin_unlock(&dlm->spinlock); 2189 spin_unlock(&dlm->spinlock);
2188 2190
2189 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 2191 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2190 /* do not send to self */ 2192 /* do not send to self */
2191 if (nodenum == dlm->node_num) 2193 if (nodenum == dlm->node_num)
2192 continue; 2194 continue;
2193 ret = dlm_do_master_requery(dlm, res, nodenum, &master); 2195 ret = dlm_do_master_requery(dlm, res, nodenum, &master);
2194 if (ret < 0) { 2196 if (ret < 0) {
2195 mlog_errno(ret); 2197 mlog_errno(ret);
2196 if (!dlm_is_host_down(ret)) 2198 if (!dlm_is_host_down(ret))
2197 BUG(); 2199 BUG();
2198 /* host is down, so answer for that node would be 2200 /* host is down, so answer for that node would be
2199 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ 2201 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
2200 ret = 0; 2202 ret = 0;
2201 } 2203 }
2202 2204
2203 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { 2205 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
2204 /* check to see if this master is in the recovery map */ 2206 /* check to see if this master is in the recovery map */
2205 spin_lock(&dlm->spinlock); 2207 spin_lock(&dlm->spinlock);
2206 if (test_bit(master, dlm->recovery_map)) { 2208 if (test_bit(master, dlm->recovery_map)) {
2207 mlog(ML_NOTICE, "%s: node %u has not seen " 2209 mlog(ML_NOTICE, "%s: node %u has not seen "
2208 "node %u go down yet, and thinks the " 2210 "node %u go down yet, and thinks the "
2209 "dead node is mastering the recovery " 2211 "dead node is mastering the recovery "
2210 "lock. must wait.\n", dlm->name, 2212 "lock. must wait.\n", dlm->name,
2211 nodenum, master); 2213 nodenum, master);
2212 ret = -EAGAIN; 2214 ret = -EAGAIN;
2213 } 2215 }
2214 spin_unlock(&dlm->spinlock); 2216 spin_unlock(&dlm->spinlock);
2215 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2217 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2216 master); 2218 master);
2217 break; 2219 break;
2218 } 2220 }
2219 } 2221 }
2220 return ret; 2222 return ret;
2221 } 2223 }
2222 2224
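On -EAGAIN the comment above suggests backing off briefly so the other node's heartbeat stack can notice the death and purge the stale $RECOVERY master. A minimal, illustrative retry loop under that assumption; the real caller is not shown here and the delay value is arbitrary.

	/* Illustrative retry loop only; assumes <linux/delay.h> for msleep().
	 * The real caller may structure its retry differently. */
	while (dlm_pre_master_reco_lockres(dlm, res) == -EAGAIN) {
		/* give the remote heartbeat stack time to see the node death */
		msleep(100);
	}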
2223 /* 2225 /*
2224 * DLM_DEREF_LOCKRES_MSG 2226 * DLM_DEREF_LOCKRES_MSG
2225 */ 2227 */
2226 2228
2227 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2229 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2228 { 2230 {
2229 struct dlm_deref_lockres deref; 2231 struct dlm_deref_lockres deref;
2230 int ret = 0, r; 2232 int ret = 0, r;
2231 const char *lockname; 2233 const char *lockname;
2232 unsigned int namelen; 2234 unsigned int namelen;
2233 2235
2234 lockname = res->lockname.name; 2236 lockname = res->lockname.name;
2235 namelen = res->lockname.len; 2237 namelen = res->lockname.len;
2236 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2238 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2237 2239
2238 memset(&deref, 0, sizeof(deref)); 2240 memset(&deref, 0, sizeof(deref));
2239 deref.node_idx = dlm->node_num; 2241 deref.node_idx = dlm->node_num;
2240 deref.namelen = namelen; 2242 deref.namelen = namelen;
2241 memcpy(deref.name, lockname, namelen); 2243 memcpy(deref.name, lockname, namelen);
2242 2244
2243 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2245 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2244 &deref, sizeof(deref), res->owner, &r); 2246 &deref, sizeof(deref), res->owner, &r);
2245 if (ret < 0) 2247 if (ret < 0)
2246 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", 2248 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2247 dlm->name, namelen, lockname, ret, res->owner); 2249 dlm->name, namelen, lockname, ret, res->owner);
2248 else if (r < 0) { 2250 else if (r < 0) {
2249 /* BAD. other node says I did not have a ref. */ 2251 /* BAD. other node says I did not have a ref. */
2250 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", 2252 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2251 dlm->name, namelen, lockname, res->owner, r); 2253 dlm->name, namelen, lockname, res->owner, r);
2252 dlm_print_one_lock_resource(res); 2254 dlm_print_one_lock_resource(res);
2253 BUG(); 2255 BUG();
2254 } 2256 }
2255 return ret; 2257 return ret;
2256 } 2258 }
2257 2259
2258 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2260 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2259 void **ret_data) 2261 void **ret_data)
2260 { 2262 {
2261 struct dlm_ctxt *dlm = data; 2263 struct dlm_ctxt *dlm = data;
2262 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; 2264 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2263 struct dlm_lock_resource *res = NULL; 2265 struct dlm_lock_resource *res = NULL;
2264 char *name; 2266 char *name;
2265 unsigned int namelen; 2267 unsigned int namelen;
2266 int ret = -EINVAL; 2268 int ret = -EINVAL;
2267 u8 node; 2269 u8 node;
2268 unsigned int hash; 2270 unsigned int hash;
2269 struct dlm_work_item *item; 2271 struct dlm_work_item *item;
2270 int cleared = 0; 2272 int cleared = 0;
2271 int dispatch = 0; 2273 int dispatch = 0;
2272 2274
2273 if (!dlm_grab(dlm)) 2275 if (!dlm_grab(dlm))
2274 return 0; 2276 return 0;
2275 2277
2276 name = deref->name; 2278 name = deref->name;
2277 namelen = deref->namelen; 2279 namelen = deref->namelen;
2278 node = deref->node_idx; 2280 node = deref->node_idx;
2279 2281
2280 if (namelen > DLM_LOCKID_NAME_MAX) { 2282 if (namelen > DLM_LOCKID_NAME_MAX) {
2281 mlog(ML_ERROR, "Invalid name length!"); 2283 mlog(ML_ERROR, "Invalid name length!");
2282 goto done; 2284 goto done;
2283 } 2285 }
2284 if (deref->node_idx >= O2NM_MAX_NODES) { 2286 if (deref->node_idx >= O2NM_MAX_NODES) {
2285 mlog(ML_ERROR, "Invalid node number: %u\n", node); 2287 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2286 goto done; 2288 goto done;
2287 } 2289 }
2288 2290
2289 hash = dlm_lockid_hash(name, namelen); 2291 hash = dlm_lockid_hash(name, namelen);
2290 2292
2291 spin_lock(&dlm->spinlock); 2293 spin_lock(&dlm->spinlock);
2292 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); 2294 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2293 if (!res) { 2295 if (!res) {
2294 spin_unlock(&dlm->spinlock); 2296 spin_unlock(&dlm->spinlock);
2295 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", 2297 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2296 dlm->name, namelen, name); 2298 dlm->name, namelen, name);
2297 goto done; 2299 goto done;
2298 } 2300 }
2299 spin_unlock(&dlm->spinlock); 2301 spin_unlock(&dlm->spinlock);
2300 2302
2301 spin_lock(&res->spinlock); 2303 spin_lock(&res->spinlock);
2302 if (res->state & DLM_LOCK_RES_SETREF_INPROG) 2304 if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2303 dispatch = 1; 2305 dispatch = 1;
2304 else { 2306 else {
2305 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2307 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2306 if (test_bit(node, res->refmap)) { 2308 if (test_bit(node, res->refmap)) {
2307 dlm_lockres_clear_refmap_bit(dlm, res, node); 2309 dlm_lockres_clear_refmap_bit(dlm, res, node);
2308 cleared = 1; 2310 cleared = 1;
2309 } 2311 }
2310 } 2312 }
2311 spin_unlock(&res->spinlock); 2313 spin_unlock(&res->spinlock);
2312 2314
2313 if (!dispatch) { 2315 if (!dispatch) {
2314 if (cleared) 2316 if (cleared)
2315 dlm_lockres_calc_usage(dlm, res); 2317 dlm_lockres_calc_usage(dlm, res);
2316 else { 2318 else {
2317 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2319 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2318 "but it is already dropped!\n", dlm->name, 2320 "but it is already dropped!\n", dlm->name,
2319 res->lockname.len, res->lockname.name, node); 2321 res->lockname.len, res->lockname.name, node);
2320 dlm_print_one_lock_resource(res); 2322 dlm_print_one_lock_resource(res);
2321 } 2323 }
2322 ret = 0; 2324 ret = 0;
2323 goto done; 2325 goto done;
2324 } 2326 }
2325 2327
2326 item = kzalloc(sizeof(*item), GFP_NOFS); 2328 item = kzalloc(sizeof(*item), GFP_NOFS);
2327 if (!item) { 2329 if (!item) {
2328 ret = -ENOMEM; 2330 ret = -ENOMEM;
2329 mlog_errno(ret); 2331 mlog_errno(ret);
2330 goto done; 2332 goto done;
2331 } 2333 }
2332 2334
2333 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); 2335 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2334 item->u.dl.deref_res = res; 2336 item->u.dl.deref_res = res;
2335 item->u.dl.deref_node = node; 2337 item->u.dl.deref_node = node;
2336 2338
2337 spin_lock(&dlm->work_lock); 2339 spin_lock(&dlm->work_lock);
2338 list_add_tail(&item->list, &dlm->work_list); 2340 list_add_tail(&item->list, &dlm->work_list);
2339 spin_unlock(&dlm->work_lock); 2341 spin_unlock(&dlm->work_lock);
2340 2342
2341 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2343 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2342 return 0; 2344 return 0;
2343 2345
2344 done: 2346 done:
2345 if (res) 2347 if (res)
2346 dlm_lockres_put(res); 2348 dlm_lockres_put(res);
2347 dlm_put(dlm); 2349 dlm_put(dlm);
2348 2350
2349 return ret; 2351 return ret;
2350 } 2352 }
2351 2353
2352 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2354 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2353 { 2355 {
2354 struct dlm_ctxt *dlm; 2356 struct dlm_ctxt *dlm;
2355 struct dlm_lock_resource *res; 2357 struct dlm_lock_resource *res;
2356 u8 node; 2358 u8 node;
2357 u8 cleared = 0; 2359 u8 cleared = 0;
2358 2360
2359 dlm = item->dlm; 2361 dlm = item->dlm;
2360 res = item->u.dl.deref_res; 2362 res = item->u.dl.deref_res;
2361 node = item->u.dl.deref_node; 2363 node = item->u.dl.deref_node;
2362 2364
2363 spin_lock(&res->spinlock); 2365 spin_lock(&res->spinlock);
2364 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2366 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2365 if (test_bit(node, res->refmap)) { 2367 if (test_bit(node, res->refmap)) {
2366 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2368 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2367 dlm_lockres_clear_refmap_bit(dlm, res, node); 2369 dlm_lockres_clear_refmap_bit(dlm, res, node);
2368 cleared = 1; 2370 cleared = 1;
2369 } 2371 }
2370 spin_unlock(&res->spinlock); 2372 spin_unlock(&res->spinlock);
2371 2373
2372 if (cleared) { 2374 if (cleared) {
2373 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2375 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2374 dlm->name, res->lockname.len, res->lockname.name, node); 2376 dlm->name, res->lockname.len, res->lockname.name, node);
2375 dlm_lockres_calc_usage(dlm, res); 2377 dlm_lockres_calc_usage(dlm, res);
2376 } else { 2378 } else {
2377 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2379 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2378 "but it is already dropped!\n", dlm->name, 2380 "but it is already dropped!\n", dlm->name,
2379 res->lockname.len, res->lockname.name, node); 2381 res->lockname.len, res->lockname.name, node);
2380 dlm_print_one_lock_resource(res); 2382 dlm_print_one_lock_resource(res);
2381 } 2383 }
2382 2384
2383 dlm_lockres_put(res); 2385 dlm_lockres_put(res);
2384 } 2386 }
2385 2387
2386 /* 2388 /*
2387 * A migrateable resource is one that is: 2389 * A migrateable resource is one that is:
2388 * 1. locally mastered, and, 2390 * 1. locally mastered, and,
2389 * 2. zero local locks, and, 2391 * 2. zero local locks, and,
2390 * 3. one or more non-local locks, or, one or more references 2392 * 3. one or more non-local locks, or, one or more references
2391 * Returns 1 if yes, 0 if not. 2393 * Returns 1 if yes, 0 if not.
2392 */ 2394 */
2393 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2395 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2394 struct dlm_lock_resource *res) 2396 struct dlm_lock_resource *res)
2395 { 2397 {
2396 enum dlm_lockres_list idx; 2398 enum dlm_lockres_list idx;
2397 int nonlocal = 0, node_ref; 2399 int nonlocal = 0, node_ref;
2398 struct list_head *queue; 2400 struct list_head *queue;
2399 struct dlm_lock *lock; 2401 struct dlm_lock *lock;
2400 u64 cookie; 2402 u64 cookie;
2401 2403
2402 assert_spin_locked(&res->spinlock); 2404 assert_spin_locked(&res->spinlock);
2403 2405
2404 /* delay migration when the lockres is in MIGRATING state */ 2406 /* delay migration when the lockres is in MIGRATING state */
2405 if (res->state & DLM_LOCK_RES_MIGRATING) 2407 if (res->state & DLM_LOCK_RES_MIGRATING)
2406 return 0; 2408 return 0;
2407 2409
2408 /* delay migration when the lockres is in RECOVERING state */ 2410 /* delay migration when the lockres is in RECOVERING state */
2409 if (res->state & DLM_LOCK_RES_RECOVERING) 2411 if (res->state & DLM_LOCK_RES_RECOVERING)
2410 return 0; 2412 return 0;
2411 2413
2412 if (res->owner != dlm->node_num) 2414 if (res->owner != dlm->node_num)
2413 return 0; 2415 return 0;
2414 2416
2415 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 2417 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2416 queue = dlm_list_idx_to_ptr(res, idx); 2418 queue = dlm_list_idx_to_ptr(res, idx);
2417 list_for_each_entry(lock, queue, list) { 2419 list_for_each_entry(lock, queue, list) {
2418 if (lock->ml.node != dlm->node_num) { 2420 if (lock->ml.node != dlm->node_num) {
2419 nonlocal++; 2421 nonlocal++;
2420 continue; 2422 continue;
2421 } 2423 }
2422 cookie = be64_to_cpu(lock->ml.cookie); 2424 cookie = be64_to_cpu(lock->ml.cookie);
2423 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " 2425 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2424 "%s list\n", dlm->name, res->lockname.len, 2426 "%s list\n", dlm->name, res->lockname.len,
2425 res->lockname.name, 2427 res->lockname.name,
2426 dlm_get_lock_cookie_node(cookie), 2428 dlm_get_lock_cookie_node(cookie),
2427 dlm_get_lock_cookie_seq(cookie), 2429 dlm_get_lock_cookie_seq(cookie),
2428 dlm_list_in_text(idx)); 2430 dlm_list_in_text(idx));
2429 return 0; 2431 return 0;
2430 } 2432 }
2431 } 2433 }
2432 2434
2433 if (!nonlocal) { 2435 if (!nonlocal) {
2434 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2436 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2435 if (node_ref >= O2NM_MAX_NODES) 2437 if (node_ref >= O2NM_MAX_NODES)
2436 return 0; 2438 return 0;
2437 } 2439 }
2438 2440
2439 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, 2441 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2440 res->lockname.name); 2442 res->lockname.name);
2441 2443
2442 return 1; 2444 return 1;
2443 } 2445 }
2444 2446
2445 /* 2447 /*
2446 * DLM_MIGRATE_LOCKRES 2448 * DLM_MIGRATE_LOCKRES
2447 */ 2449 */
2448 2450
2449 2451
2450 static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2452 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2451 struct dlm_lock_resource *res, u8 target) 2453 struct dlm_lock_resource *res, u8 target)
2452 { 2454 {
2453 struct dlm_master_list_entry *mle = NULL; 2455 struct dlm_master_list_entry *mle = NULL;
2454 struct dlm_master_list_entry *oldmle = NULL; 2456 struct dlm_master_list_entry *oldmle = NULL;
2455 struct dlm_migratable_lockres *mres = NULL; 2457 struct dlm_migratable_lockres *mres = NULL;
2456 int ret = 0; 2458 int ret = 0;
2457 const char *name; 2459 const char *name;
2458 unsigned int namelen; 2460 unsigned int namelen;
2459 int mle_added = 0; 2461 int mle_added = 0;
2460 int wake = 0; 2462 int wake = 0;
2461 2463
2462 if (!dlm_grab(dlm)) 2464 if (!dlm_grab(dlm))
2463 return -EINVAL; 2465 return -EINVAL;
2464 2466
2465 BUG_ON(target == O2NM_MAX_NODES); 2467 BUG_ON(target == O2NM_MAX_NODES);
2466 2468
2467 name = res->lockname.name; 2469 name = res->lockname.name;
2468 namelen = res->lockname.len; 2470 namelen = res->lockname.len;
2469 2471
2470 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, 2472 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2471 target); 2473 target);
2472 2474
2473 /* preallocate up front. if this fails, abort */ 2475 /* preallocate up front. if this fails, abort */
2474 ret = -ENOMEM; 2476 ret = -ENOMEM;
2475 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2477 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2476 if (!mres) { 2478 if (!mres) {
2477 mlog_errno(ret); 2479 mlog_errno(ret);
2478 goto leave; 2480 goto leave;
2479 } 2481 }
2480 2482
2481 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 2483 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2482 if (!mle) { 2484 if (!mle) {
2483 mlog_errno(ret); 2485 mlog_errno(ret);
2484 goto leave; 2486 goto leave;
2485 } 2487 }
2486 ret = 0; 2488 ret = 0;
2487 2489
2488 /* 2490 /*
2489 * clear any existing master requests and 2491 * clear any existing master requests and
2490 * add the migration mle to the list 2492 * add the migration mle to the list
2491 */ 2493 */
2492 spin_lock(&dlm->spinlock); 2494 spin_lock(&dlm->spinlock);
2493 spin_lock(&dlm->master_lock); 2495 spin_lock(&dlm->master_lock);
2494 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2496 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2495 namelen, target, dlm->node_num); 2497 namelen, target, dlm->node_num);
2496 spin_unlock(&dlm->master_lock); 2498 spin_unlock(&dlm->master_lock);
2497 spin_unlock(&dlm->spinlock); 2499 spin_unlock(&dlm->spinlock);
2498 2500
2499 if (ret == -EEXIST) { 2501 if (ret == -EEXIST) {
2500 mlog(0, "another process is already migrating it\n"); 2502 mlog(0, "another process is already migrating it\n");
2501 goto fail; 2503 goto fail;
2502 } 2504 }
2503 mle_added = 1; 2505 mle_added = 1;
2504 2506
2505 /* 2507 /*
2506 * set the MIGRATING flag and flush asts 2508 * set the MIGRATING flag and flush asts
2507 * if we fail after this we need to re-dirty the lockres 2509 * if we fail after this we need to re-dirty the lockres
2508 */ 2510 */
2509 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { 2511 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
2510 mlog(ML_ERROR, "tried to migrate %.*s to %u, but " 2512 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
2511 "the target went down.\n", res->lockname.len, 2513 "the target went down.\n", res->lockname.len,
2512 res->lockname.name, target); 2514 res->lockname.name, target);
2513 spin_lock(&res->spinlock); 2515 spin_lock(&res->spinlock);
2514 res->state &= ~DLM_LOCK_RES_MIGRATING; 2516 res->state &= ~DLM_LOCK_RES_MIGRATING;
2515 wake = 1; 2517 wake = 1;
2516 spin_unlock(&res->spinlock); 2518 spin_unlock(&res->spinlock);
2517 ret = -EINVAL; 2519 ret = -EINVAL;
2518 } 2520 }
2519 2521
2520 fail: 2522 fail:
2521 if (oldmle) { 2523 if (oldmle) {
2522 /* master is known, detach if not already detached */ 2524 /* master is known, detach if not already detached */
2523 dlm_mle_detach_hb_events(dlm, oldmle); 2525 dlm_mle_detach_hb_events(dlm, oldmle);
2524 dlm_put_mle(oldmle); 2526 dlm_put_mle(oldmle);
2525 } 2527 }
2526 2528
2527 if (ret < 0) { 2529 if (ret < 0) {
2528 if (mle_added) { 2530 if (mle_added) {
2529 dlm_mle_detach_hb_events(dlm, mle); 2531 dlm_mle_detach_hb_events(dlm, mle);
2530 dlm_put_mle(mle); 2532 dlm_put_mle(mle);
2531 } else if (mle) { 2533 } else if (mle) {
2532 kmem_cache_free(dlm_mle_cache, mle); 2534 kmem_cache_free(dlm_mle_cache, mle);
2533 mle = NULL; 2535 mle = NULL;
2534 } 2536 }
2535 goto leave; 2537 goto leave;
2536 } 2538 }
2537 2539
2538 /* 2540 /*
2539 * at this point, we have a migration target, an mle 2541 * at this point, we have a migration target, an mle
2540 * in the master list, and the MIGRATING flag set on 2542 * in the master list, and the MIGRATING flag set on
2541 * the lockres 2543 * the lockres
2542 */ 2544 */
2543 2545
2544 /* now that remote nodes are spinning on the MIGRATING flag, 2546 /* now that remote nodes are spinning on the MIGRATING flag,
2545 * ensure that all assert_master work is flushed. */ 2547 * ensure that all assert_master work is flushed. */
2546 flush_workqueue(dlm->dlm_worker); 2548 flush_workqueue(dlm->dlm_worker);
2547 2549
2548 /* get an extra reference on the mle. 2550 /* get an extra reference on the mle.
2549 * otherwise the assert_master from the new 2551 * otherwise the assert_master from the new
2550 * master will destroy this. 2552 * master will destroy this.
2551 * also, make sure that all callers of dlm_get_mle 2553 * also, make sure that all callers of dlm_get_mle
2552 * take both dlm->spinlock and dlm->master_lock */ 2554 * take both dlm->spinlock and dlm->master_lock */
2553 spin_lock(&dlm->spinlock); 2555 spin_lock(&dlm->spinlock);
2554 spin_lock(&dlm->master_lock); 2556 spin_lock(&dlm->master_lock);
2555 dlm_get_mle_inuse(mle); 2557 dlm_get_mle_inuse(mle);
2556 spin_unlock(&dlm->master_lock); 2558 spin_unlock(&dlm->master_lock);
2557 spin_unlock(&dlm->spinlock); 2559 spin_unlock(&dlm->spinlock);
2558 2560
2559 /* notify new node and send all lock state */ 2561 /* notify new node and send all lock state */
2560 /* call send_one_lockres with migration flag. 2562 /* call send_one_lockres with migration flag.
2561 * this serves as notice to the target node that a 2563 * this serves as notice to the target node that a
2562 * migration is starting. */ 2564 * migration is starting. */
2563 ret = dlm_send_one_lockres(dlm, res, mres, target, 2565 ret = dlm_send_one_lockres(dlm, res, mres, target,
2564 DLM_MRES_MIGRATION); 2566 DLM_MRES_MIGRATION);
2565 2567
2566 if (ret < 0) { 2568 if (ret < 0) {
2567 mlog(0, "migration to node %u failed with %d\n", 2569 mlog(0, "migration to node %u failed with %d\n",
2568 target, ret); 2570 target, ret);
2569 /* migration failed, detach and clean up mle */ 2571 /* migration failed, detach and clean up mle */
2570 dlm_mle_detach_hb_events(dlm, mle); 2572 dlm_mle_detach_hb_events(dlm, mle);
2571 dlm_put_mle(mle); 2573 dlm_put_mle(mle);
2572 dlm_put_mle_inuse(mle); 2574 dlm_put_mle_inuse(mle);
2573 spin_lock(&res->spinlock); 2575 spin_lock(&res->spinlock);
2574 res->state &= ~DLM_LOCK_RES_MIGRATING; 2576 res->state &= ~DLM_LOCK_RES_MIGRATING;
2575 wake = 1; 2577 wake = 1;
2576 spin_unlock(&res->spinlock); 2578 spin_unlock(&res->spinlock);
2577 if (dlm_is_host_down(ret)) 2579 if (dlm_is_host_down(ret))
2578 dlm_wait_for_node_death(dlm, target, 2580 dlm_wait_for_node_death(dlm, target,
2579 DLM_NODE_DEATH_WAIT_MAX); 2581 DLM_NODE_DEATH_WAIT_MAX);
2580 goto leave; 2582 goto leave;
2581 } 2583 }
2582 2584
2583 /* at this point, the target sends a message to all nodes, 2585 /* at this point, the target sends a message to all nodes,
2584 * (using dlm_do_migrate_request). this node is skipped since 2586 * (using dlm_do_migrate_request). this node is skipped since
2585 * we had to put an mle in the list to begin the process. this 2587 * we had to put an mle in the list to begin the process. this
2586 * node now waits for target to do an assert master. this node 2588 * node now waits for target to do an assert master. this node
2587 * will be the last one notified, ensuring that the migration 2589 * will be the last one notified, ensuring that the migration
2588 * is complete everywhere. if the target dies while this is 2590 * is complete everywhere. if the target dies while this is
2589 * going on, some nodes could potentially see the target as the 2591 * going on, some nodes could potentially see the target as the
2590 * master, so it is important that my recovery finds the migration 2592 * master, so it is important that my recovery finds the migration
2591 * mle and sets the master to UNKNOWN. */ 2593 * mle and sets the master to UNKNOWN. */
2592 2594
2593 2595
2594 /* wait for new node to assert master */ 2596 /* wait for new node to assert master */
2595 while (1) { 2597 while (1) {
2596 ret = wait_event_interruptible_timeout(mle->wq, 2598 ret = wait_event_interruptible_timeout(mle->wq,
2597 (atomic_read(&mle->woken) == 1), 2599 (atomic_read(&mle->woken) == 1),
2598 msecs_to_jiffies(5000)); 2600 msecs_to_jiffies(5000));
2599 2601
2600 if (ret >= 0) { 2602 if (ret >= 0) {
2601 if (atomic_read(&mle->woken) == 1 || 2603 if (atomic_read(&mle->woken) == 1 ||
2602 res->owner == target) 2604 res->owner == target)
2603 break; 2605 break;
2604 2606
2605 mlog(0, "%s:%.*s: timed out during migration\n", 2607 mlog(0, "%s:%.*s: timed out during migration\n",
2606 dlm->name, res->lockname.len, res->lockname.name); 2608 dlm->name, res->lockname.len, res->lockname.name);
2607 /* avoid hang during shutdown when migrating lockres 2609 /* avoid hang during shutdown when migrating lockres
2608 * to a node which also goes down */ 2610 * to a node which also goes down */
2609 if (dlm_is_node_dead(dlm, target)) { 2611 if (dlm_is_node_dead(dlm, target)) {
2610 mlog(0, "%s:%.*s: expected migration " 2612 mlog(0, "%s:%.*s: expected migration "
2611 "target %u is no longer up, restarting\n", 2613 "target %u is no longer up, restarting\n",
2612 dlm->name, res->lockname.len, 2614 dlm->name, res->lockname.len,
2613 res->lockname.name, target); 2615 res->lockname.name, target);
2614 ret = -EINVAL; 2616 ret = -EINVAL;
2615 /* migration failed, detach and clean up mle */ 2617 /* migration failed, detach and clean up mle */
2616 dlm_mle_detach_hb_events(dlm, mle); 2618 dlm_mle_detach_hb_events(dlm, mle);
2617 dlm_put_mle(mle); 2619 dlm_put_mle(mle);
2618 dlm_put_mle_inuse(mle); 2620 dlm_put_mle_inuse(mle);
2619 spin_lock(&res->spinlock); 2621 spin_lock(&res->spinlock);
2620 res->state &= ~DLM_LOCK_RES_MIGRATING; 2622 res->state &= ~DLM_LOCK_RES_MIGRATING;
2621 wake = 1; 2623 wake = 1;
2622 spin_unlock(&res->spinlock); 2624 spin_unlock(&res->spinlock);
2623 goto leave; 2625 goto leave;
2624 } 2626 }
2625 } else 2627 } else
2626 mlog(0, "%s:%.*s: caught signal during migration\n", 2628 mlog(0, "%s:%.*s: caught signal during migration\n",
2627 dlm->name, res->lockname.len, res->lockname.name); 2629 dlm->name, res->lockname.len, res->lockname.name);
2628 } 2630 }
2629 2631
2630 /* all done, set the owner, clear the flag */ 2632 /* all done, set the owner, clear the flag */
2631 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2632 dlm_set_lockres_owner(dlm, res, target); 2634 dlm_set_lockres_owner(dlm, res, target);
2633 res->state &= ~DLM_LOCK_RES_MIGRATING; 2635 res->state &= ~DLM_LOCK_RES_MIGRATING;
2634 dlm_remove_nonlocal_locks(dlm, res); 2636 dlm_remove_nonlocal_locks(dlm, res);
2635 spin_unlock(&res->spinlock); 2637 spin_unlock(&res->spinlock);
2636 wake_up(&res->wq); 2638 wake_up(&res->wq);
2637 2639
2638 /* master is known, detach if not already detached */ 2640 /* master is known, detach if not already detached */
2639 dlm_mle_detach_hb_events(dlm, mle); 2641 dlm_mle_detach_hb_events(dlm, mle);
2640 dlm_put_mle_inuse(mle); 2642 dlm_put_mle_inuse(mle);
2641 ret = 0; 2643 ret = 0;
2642 2644
2643 dlm_lockres_calc_usage(dlm, res); 2645 dlm_lockres_calc_usage(dlm, res);
2644 2646
2645 leave: 2647 leave:
2646 /* re-dirty the lockres if we failed */ 2648 /* re-dirty the lockres if we failed */
2647 if (ret < 0) 2649 if (ret < 0)
2648 dlm_kick_thread(dlm, res); 2650 dlm_kick_thread(dlm, res);
2649 2651
2650 /* wake up waiters if the MIGRATING flag got set 2652 /* wake up waiters if the MIGRATING flag got set
2651 * but migration failed */ 2653 * but migration failed */
2652 if (wake) 2654 if (wake)
2653 wake_up(&res->wq); 2655 wake_up(&res->wq);
2654 2656
2655 if (mres) 2657 if (mres)
2656 free_page((unsigned long)mres); 2658 free_page((unsigned long)mres);
2657 2659
2658 dlm_put(dlm); 2660 dlm_put(dlm);
2659 2661
2660 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, 2662 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2661 name, target, ret); 2663 name, target, ret);
2662 return ret; 2664 return ret;
2663 } 2665 }
2664 2666
2665 #define DLM_MIGRATION_RETRY_MS 100 2667 #define DLM_MIGRATION_RETRY_MS 100
2666 2668
2667 /* 2669 /*
2668 * Should be called only after beginning the domain leave process. 2670 * Should be called only after beginning the domain leave process.
2669 * There should not be any remaining locks on nonlocal lock resources, 2671 * There should not be any remaining locks on nonlocal lock resources,
2670 * and there should be no local locks left on locally mastered resources. 2672 * and there should be no local locks left on locally mastered resources.
2671 * 2673 *
2672 * Called with the dlm spinlock held, may drop it to do migration, but 2674 * Called with the dlm spinlock held, may drop it to do migration, but
2673 * will re-acquire before exit. 2675 * will re-acquire before exit.
2674 * 2676 *
2675 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped 2677 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2676 */ 2678 */
2677 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2679 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2678 { 2680 {
2679 int ret; 2681 int ret;
2680 int lock_dropped = 0; 2682 int lock_dropped = 0;
2681 u8 target = O2NM_MAX_NODES; 2683 u8 target = O2NM_MAX_NODES;
2682 2684
2683 assert_spin_locked(&dlm->spinlock); 2685 assert_spin_locked(&dlm->spinlock);
2684 2686
2685 spin_lock(&res->spinlock); 2687 spin_lock(&res->spinlock);
2686 if (dlm_is_lockres_migrateable(dlm, res)) 2688 if (dlm_is_lockres_migrateable(dlm, res))
2687 target = dlm_pick_migration_target(dlm, res); 2689 target = dlm_pick_migration_target(dlm, res);
2688 spin_unlock(&res->spinlock); 2690 spin_unlock(&res->spinlock);
2689 2691
2690 if (target == O2NM_MAX_NODES) 2692 if (target == O2NM_MAX_NODES)
2691 goto leave; 2693 goto leave;
2692 2694
2693 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2695 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2694 spin_unlock(&dlm->spinlock); 2696 spin_unlock(&dlm->spinlock);
2695 lock_dropped = 1; 2697 lock_dropped = 1;
2696 ret = dlm_migrate_lockres(dlm, res, target); 2698 ret = dlm_migrate_lockres(dlm, res, target);
2697 if (ret) 2699 if (ret)
2698 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", 2700 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2699 dlm->name, res->lockname.len, res->lockname.name, 2701 dlm->name, res->lockname.len, res->lockname.name,
2700 target, ret); 2702 target, ret);
2701 spin_lock(&dlm->spinlock); 2703 spin_lock(&dlm->spinlock);
2702 leave: 2704 leave:
2703 return lock_dropped; 2705 return lock_dropped;
2704 } 2706 }
2705 2707
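Because dlm_empty_lockres() may drop and retake dlm->spinlock, a caller walking the lockres hash cannot trust its iterator once the function returns 1. An illustrative scan pattern under that contract; the bucket and iteration details are assumptions, not taken from the real domain-leave code.

	/* Illustrative only: restart the bucket scan whenever the dlm spinlock
	 * was dropped, since the bucket may have changed underneath us.
	 * 'res' and 'bucket' are assumed locals of the surrounding scan. */
	spin_lock(&dlm->spinlock);
	redo_bucket:
	hlist_for_each_entry(res, bucket, hash_node) {
		if (dlm_empty_lockres(dlm, res))
			goto redo_bucket;	/* spinlock dropped and retaken */
	}
	spin_unlock(&dlm->spinlock);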
2706 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) 2708 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2707 { 2709 {
2708 int ret; 2710 int ret;
2709 spin_lock(&dlm->ast_lock); 2711 spin_lock(&dlm->ast_lock);
2710 spin_lock(&lock->spinlock); 2712 spin_lock(&lock->spinlock);
2711 ret = (list_empty(&lock->bast_list) && !lock->bast_pending); 2713 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2712 spin_unlock(&lock->spinlock); 2714 spin_unlock(&lock->spinlock);
2713 spin_unlock(&dlm->ast_lock); 2715 spin_unlock(&dlm->ast_lock);
2714 return ret; 2716 return ret;
2715 } 2717 }
2716 2718
2717 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, 2719 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2718 struct dlm_lock_resource *res, 2720 struct dlm_lock_resource *res,
2719 u8 mig_target) 2721 u8 mig_target)
2720 { 2722 {
2721 int can_proceed; 2723 int can_proceed;
2722 spin_lock(&res->spinlock); 2724 spin_lock(&res->spinlock);
2723 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2725 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2724 spin_unlock(&res->spinlock); 2726 spin_unlock(&res->spinlock);
2725 2727
2726 /* target has died, so make the caller break out of the 2728 /* target has died, so make the caller break out of the
2727 * wait_event, but caller must recheck the domain_map */ 2729 * wait_event, but caller must recheck the domain_map */
2728 spin_lock(&dlm->spinlock); 2730 spin_lock(&dlm->spinlock);
2729 if (!test_bit(mig_target, dlm->domain_map)) 2731 if (!test_bit(mig_target, dlm->domain_map))
2730 can_proceed = 1; 2732 can_proceed = 1;
2731 spin_unlock(&dlm->spinlock); 2733 spin_unlock(&dlm->spinlock);
2732 return can_proceed; 2734 return can_proceed;
2733 } 2735 }
2734 2736
2735 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, 2737 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2736 struct dlm_lock_resource *res) 2738 struct dlm_lock_resource *res)
2737 { 2739 {
2738 int ret; 2740 int ret;
2739 spin_lock(&res->spinlock); 2741 spin_lock(&res->spinlock);
2740 ret = !!(res->state & DLM_LOCK_RES_DIRTY); 2742 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2741 spin_unlock(&res->spinlock); 2743 spin_unlock(&res->spinlock);
2742 return ret; 2744 return ret;
2743 } 2745 }
2744 2746
2745 2747
2746 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 2748 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2747 struct dlm_lock_resource *res, 2749 struct dlm_lock_resource *res,
2748 u8 target) 2750 u8 target)
2749 { 2751 {
2750 int ret = 0; 2752 int ret = 0;
2751 2753
2752 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", 2754 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2753 res->lockname.len, res->lockname.name, dlm->node_num, 2755 res->lockname.len, res->lockname.name, dlm->node_num,
2754 target); 2756 target);
2755 /* need to set MIGRATING flag on lockres. this is done by 2757 /* need to set MIGRATING flag on lockres. this is done by
2756 * ensuring that all asts have been flushed for this lockres. */ 2758 * ensuring that all asts have been flushed for this lockres. */
2757 spin_lock(&res->spinlock); 2759 spin_lock(&res->spinlock);
2758 BUG_ON(res->migration_pending); 2760 BUG_ON(res->migration_pending);
2759 res->migration_pending = 1; 2761 res->migration_pending = 1;
2760 /* strategy is to reserve an extra ast then release 2762 /* strategy is to reserve an extra ast then release
2761 * it below, letting the release do all of the work */ 2763 * it below, letting the release do all of the work */
2762 __dlm_lockres_reserve_ast(res); 2764 __dlm_lockres_reserve_ast(res);
2763 spin_unlock(&res->spinlock); 2765 spin_unlock(&res->spinlock);
2764 2766
2765 /* now flush all the pending asts */ 2767 /* now flush all the pending asts */
2766 dlm_kick_thread(dlm, res); 2768 dlm_kick_thread(dlm, res);
2767 /* before waiting on DIRTY, block processes which may 2769 /* before waiting on DIRTY, block processes which may
2768 * try to dirty the lockres before MIGRATING is set */ 2770 * try to dirty the lockres before MIGRATING is set */
2769 spin_lock(&res->spinlock); 2771 spin_lock(&res->spinlock);
2770 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); 2772 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2771 res->state |= DLM_LOCK_RES_BLOCK_DIRTY; 2773 res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2772 spin_unlock(&res->spinlock); 2774 spin_unlock(&res->spinlock);
2773 /* now wait on any pending asts and the DIRTY state */ 2775 /* now wait on any pending asts and the DIRTY state */
2774 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 2776 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2775 dlm_lockres_release_ast(dlm, res); 2777 dlm_lockres_release_ast(dlm, res);
2776 2778
2777 mlog(0, "about to wait on migration_wq, dirty=%s\n", 2779 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2778 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 2780 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2779 /* if the extra ref we just put was the final one, this 2781 /* if the extra ref we just put was the final one, this
2780 * will pass thru immediately. otherwise, we need to wait 2782 * will pass thru immediately. otherwise, we need to wait
2781 * for the last ast to finish. */ 2783 * for the last ast to finish. */
2782 again: 2784 again:
2783 ret = wait_event_interruptible_timeout(dlm->migration_wq, 2785 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2784 dlm_migration_can_proceed(dlm, res, target), 2786 dlm_migration_can_proceed(dlm, res, target),
2785 msecs_to_jiffies(1000)); 2787 msecs_to_jiffies(1000));
2786 if (ret < 0) { 2788 if (ret < 0) {
2787 mlog(0, "woken again: migrating? %s, dead? %s\n", 2789 mlog(0, "woken again: migrating? %s, dead? %s\n",
2788 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 2790 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2789 test_bit(target, dlm->domain_map) ? "no":"yes"); 2791 test_bit(target, dlm->domain_map) ? "no":"yes");
2790 } else { 2792 } else {
2791 mlog(0, "all is well: migrating? %s, dead? %s\n", 2793 mlog(0, "all is well: migrating? %s, dead? %s\n",
2792 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 2794 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2793 test_bit(target, dlm->domain_map) ? "no":"yes"); 2795 test_bit(target, dlm->domain_map) ? "no":"yes");
2794 } 2796 }
2795 if (!dlm_migration_can_proceed(dlm, res, target)) { 2797 if (!dlm_migration_can_proceed(dlm, res, target)) {
2796 mlog(0, "trying again...\n"); 2798 mlog(0, "trying again...\n");
2797 goto again; 2799 goto again;
2798 } 2800 }
2799 2801
2800 ret = 0; 2802 ret = 0;
2801 /* did the target go down or die? */ 2803 /* did the target go down or die? */
2802 spin_lock(&dlm->spinlock); 2804 spin_lock(&dlm->spinlock);
2803 if (!test_bit(target, dlm->domain_map)) { 2805 if (!test_bit(target, dlm->domain_map)) {
2804 mlog(ML_ERROR, "aha. migration target %u just went down\n", 2806 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2805 target); 2807 target);
2806 ret = -EHOSTDOWN; 2808 ret = -EHOSTDOWN;
2807 } 2809 }
2808 spin_unlock(&dlm->spinlock); 2810 spin_unlock(&dlm->spinlock);
2809 2811
2810 /* 2812 /*
2811 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for 2813 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2812 * another try; otherwise, we are sure the MIGRATING state is there, 2814 * another try; otherwise, we are sure the MIGRATING state is there,
2813 * drop the unneeded state which blocked threads trying to DIRTY 2815 * drop the unneeded state which blocked threads trying to DIRTY
2814 */ 2816 */
2815 spin_lock(&res->spinlock); 2817 spin_lock(&res->spinlock);
2816 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); 2818 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2817 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; 2819 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2818 if (!ret) 2820 if (!ret)
2819 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); 2821 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2820 spin_unlock(&res->spinlock); 2822 spin_unlock(&res->spinlock);
2821 2823
2822 /* 2824 /*
2823 * at this point: 2825 * at this point:
2824 * 2826 *
2825 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down 2827 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2826 * o there are no pending asts on this lockres 2828 * o there are no pending asts on this lockres
2827 * o all processes trying to reserve an ast on this 2829 * o all processes trying to reserve an ast on this
2828 * lockres must wait for the MIGRATING flag to clear 2830 * lockres must wait for the MIGRATING flag to clear
2829 */ 2831 */
2830 return ret; 2832 return ret;
2831 } 2833 }
2832 2834
2833 /* last step in the migration process. 2835 /* last step in the migration process.
2834 * original master calls this to free all of the dlm_lock 2836 * original master calls this to free all of the dlm_lock
2835 * structures that used to be for other nodes. */ 2837 * structures that used to be for other nodes. */
2836 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2838 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2837 struct dlm_lock_resource *res) 2839 struct dlm_lock_resource *res)
2838 { 2840 {
2839 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2840 int i, bit; 2842 int i, bit;
2841 struct dlm_lock *lock, *next; 2843 struct dlm_lock *lock, *next;
2842 2844
2843 assert_spin_locked(&res->spinlock); 2845 assert_spin_locked(&res->spinlock);
2844 2846
2845 BUG_ON(res->owner == dlm->node_num); 2847 BUG_ON(res->owner == dlm->node_num);
2846 2848
2847 for (i=0; i<3; i++) { 2849 for (i=0; i<3; i++) {
2848 list_for_each_entry_safe(lock, next, queue, list) { 2850 list_for_each_entry_safe(lock, next, queue, list) {
2849 if (lock->ml.node != dlm->node_num) { 2851 if (lock->ml.node != dlm->node_num) {
2850 mlog(0, "putting lock for node %u\n", 2852 mlog(0, "putting lock for node %u\n",
2851 lock->ml.node); 2853 lock->ml.node);
2852 /* be extra careful */ 2854 /* be extra careful */
2853 BUG_ON(!list_empty(&lock->ast_list)); 2855 BUG_ON(!list_empty(&lock->ast_list));
2854 BUG_ON(!list_empty(&lock->bast_list)); 2856 BUG_ON(!list_empty(&lock->bast_list));
2855 BUG_ON(lock->ast_pending); 2857 BUG_ON(lock->ast_pending);
2856 BUG_ON(lock->bast_pending); 2858 BUG_ON(lock->bast_pending);
2857 dlm_lockres_clear_refmap_bit(dlm, res, 2859 dlm_lockres_clear_refmap_bit(dlm, res,
2858 lock->ml.node); 2860 lock->ml.node);
2859 list_del_init(&lock->list); 2861 list_del_init(&lock->list);
2860 dlm_lock_put(lock); 2862 dlm_lock_put(lock);
2861 /* In a normal unlock, we would have added a 2863 /* In a normal unlock, we would have added a
2862 * DLM_UNLOCK_FREE_LOCK action. Force it. */ 2864 * DLM_UNLOCK_FREE_LOCK action. Force it. */
2863 dlm_lock_put(lock); 2865 dlm_lock_put(lock);
2864 } 2866 }
2865 } 2867 }
2866 queue++; 2868 queue++;
2867 } 2869 }
2868 bit = 0; 2870 bit = 0;
2869 while (1) { 2871 while (1) {
2870 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 2872 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2871 if (bit >= O2NM_MAX_NODES) 2873 if (bit >= O2NM_MAX_NODES)
2872 break; 2874 break;
2873 /* do not clear the local node reference, if there is a 2875 /* do not clear the local node reference, if there is a
2874 * process holding this, let it drop the ref itself */ 2876 * process holding this, let it drop the ref itself */
2875 if (bit != dlm->node_num) { 2877 if (bit != dlm->node_num) {
2876 mlog(0, "%s:%.*s: node %u had a ref to this " 2878 mlog(0, "%s:%.*s: node %u had a ref to this "
2877 "migrating lockres, clearing\n", dlm->name, 2879 "migrating lockres, clearing\n", dlm->name,
2878 res->lockname.len, res->lockname.name, bit); 2880 res->lockname.len, res->lockname.name, bit);
2879 dlm_lockres_clear_refmap_bit(dlm, res, bit); 2881 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2880 } 2882 }
2881 bit++; 2883 bit++;
2882 } 2884 }
2883 } 2885 }
2884 2886
2885 /* 2887 /*
2886 * Pick a node to migrate the lock resource to. This function selects a 2888 * Pick a node to migrate the lock resource to. This function selects a
2887 * potential target based first on the locks and then on refmap. It skips 2889 * potential target based first on the locks and then on refmap. It skips
2888 * nodes that are in the process of exiting the domain. 2890 * nodes that are in the process of exiting the domain.
2889 */ 2891 */
2890 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2892 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2891 struct dlm_lock_resource *res) 2893 struct dlm_lock_resource *res)
2892 { 2894 {
2893 enum dlm_lockres_list idx; 2895 enum dlm_lockres_list idx;
2894 struct list_head *queue = &res->granted; 2896 struct list_head *queue = &res->granted;
2895 struct dlm_lock *lock; 2897 struct dlm_lock *lock;
2896 int noderef; 2898 int noderef;
2897 u8 nodenum = O2NM_MAX_NODES; 2899 u8 nodenum = O2NM_MAX_NODES;
2898 2900
2899 assert_spin_locked(&dlm->spinlock); 2901 assert_spin_locked(&dlm->spinlock);
2900 assert_spin_locked(&res->spinlock); 2902 assert_spin_locked(&res->spinlock);
2901 2903
2902 /* Go through all the locks */ 2904 /* Go through all the locks */
2903 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 2905 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2904 queue = dlm_list_idx_to_ptr(res, idx); 2906 queue = dlm_list_idx_to_ptr(res, idx);
2905 list_for_each_entry(lock, queue, list) { 2907 list_for_each_entry(lock, queue, list) {
2906 if (lock->ml.node == dlm->node_num) 2908 if (lock->ml.node == dlm->node_num)
2907 continue; 2909 continue;
2908 if (test_bit(lock->ml.node, dlm->exit_domain_map)) 2910 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2909 continue; 2911 continue;
2910 nodenum = lock->ml.node; 2912 nodenum = lock->ml.node;
2911 goto bail; 2913 goto bail;
2912 } 2914 }
2913 } 2915 }
2914 2916
2915 /* Go through the refmap */ 2917 /* Go through the refmap */
2916 noderef = -1; 2918 noderef = -1;
2917 while (1) { 2919 while (1) {
2918 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, 2920 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2919 noderef + 1); 2921 noderef + 1);
2920 if (noderef >= O2NM_MAX_NODES) 2922 if (noderef >= O2NM_MAX_NODES)
2921 break; 2923 break;
2922 if (noderef == dlm->node_num) 2924 if (noderef == dlm->node_num)
2923 continue; 2925 continue;
2924 if (test_bit(noderef, dlm->exit_domain_map)) 2926 if (test_bit(noderef, dlm->exit_domain_map))
2925 continue; 2927 continue;
2926 nodenum = noderef; 2928 nodenum = noderef;
2927 goto bail; 2929 goto bail;
2928 } 2930 }
2929 2931
2930 bail: 2932 bail:
2931 return nodenum; 2933 return nodenum;
2932 } 2934 }
2933 2935
2934 /* this is called by the new master once all lockres 2936 /* this is called by the new master once all lockres
2935 * data has been received */ 2937 * data has been received */
2936 static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2938 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 struct dlm_lock_resource *res, 2939 struct dlm_lock_resource *res,
2938 u8 master, u8 new_master, 2940 u8 master, u8 new_master,
2939 struct dlm_node_iter *iter) 2941 struct dlm_node_iter *iter)
2940 { 2942 {
2941 struct dlm_migrate_request migrate; 2943 struct dlm_migrate_request migrate;
2942 int ret, skip, status = 0; 2944 int ret, skip, status = 0;
2943 int nodenum; 2945 int nodenum;
2944 2946
2945 memset(&migrate, 0, sizeof(migrate)); 2947 memset(&migrate, 0, sizeof(migrate));
2946 migrate.namelen = res->lockname.len; 2948 migrate.namelen = res->lockname.len;
2947 memcpy(migrate.name, res->lockname.name, migrate.namelen); 2949 memcpy(migrate.name, res->lockname.name, migrate.namelen);
2948 migrate.new_master = new_master; 2950 migrate.new_master = new_master;
2949 migrate.master = master; 2951 migrate.master = master;
2950 2952
2951 ret = 0; 2953 ret = 0;
2952 2954
2953 /* send message to all nodes, except the master and myself */ 2955 /* send message to all nodes, except the master and myself */
2954 while ((nodenum = dlm_node_iter_next(iter)) >= 0) { 2956 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
2955 if (nodenum == master || 2957 if (nodenum == master ||
2956 nodenum == new_master) 2958 nodenum == new_master)
2957 continue; 2959 continue;
2958 2960
2959 /* We could race exit domain. If exited, skip. */ 2961 /* We could race exit domain. If exited, skip. */
2960 spin_lock(&dlm->spinlock); 2962 spin_lock(&dlm->spinlock);
2961 skip = (!test_bit(nodenum, dlm->domain_map)); 2963 skip = (!test_bit(nodenum, dlm->domain_map));
2962 spin_unlock(&dlm->spinlock); 2964 spin_unlock(&dlm->spinlock);
2963 if (skip) { 2965 if (skip) {
2964 clear_bit(nodenum, iter->node_map); 2966 clear_bit(nodenum, iter->node_map);
2965 continue; 2967 continue;
2966 } 2968 }
2967 2969
2968 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2970 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2969 &migrate, sizeof(migrate), nodenum, 2971 &migrate, sizeof(migrate), nodenum,
2970 &status); 2972 &status);
2971 if (ret < 0) { 2973 if (ret < 0) {
2972 mlog(ML_ERROR, "%s: res %.*s, Error %d send " 2974 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2973 "MIGRATE_REQUEST to node %u\n", dlm->name, 2975 "MIGRATE_REQUEST to node %u\n", dlm->name,
2974 migrate.namelen, migrate.name, ret, nodenum); 2976 migrate.namelen, migrate.name, ret, nodenum);
2975 if (!dlm_is_host_down(ret)) { 2977 if (!dlm_is_host_down(ret)) {
2976 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2978 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2977 BUG(); 2979 BUG();
2978 } 2980 }
2979 clear_bit(nodenum, iter->node_map); 2981 clear_bit(nodenum, iter->node_map);
2980 ret = 0; 2982 ret = 0;
2981 } else if (status < 0) { 2983 } else if (status < 0) {
2982 mlog(0, "migrate request (node %u) returned %d!\n", 2984 mlog(0, "migrate request (node %u) returned %d!\n",
2983 nodenum, status); 2985 nodenum, status);
2984 ret = status; 2986 ret = status;
2985 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { 2987 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
2986 /* during the migration request we short-circuited 2988 /* during the migration request we short-circuited
2987 * the mastery of the lockres. make sure we have 2989 * the mastery of the lockres. make sure we have
2988 * a mastery ref for nodenum */ 2990 * a mastery ref for nodenum */
2989 mlog(0, "%s:%.*s: need ref for node %u\n", 2991 mlog(0, "%s:%.*s: need ref for node %u\n",
2990 dlm->name, res->lockname.len, res->lockname.name, 2992 dlm->name, res->lockname.len, res->lockname.name,
2991 nodenum); 2993 nodenum);
2992 spin_lock(&res->spinlock); 2994 spin_lock(&res->spinlock);
2993 dlm_lockres_set_refmap_bit(dlm, res, nodenum); 2995 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2994 spin_unlock(&res->spinlock); 2996 spin_unlock(&res->spinlock);
2995 } 2997 }
2996 } 2998 }
2997 2999
2998 if (ret < 0) 3000 if (ret < 0)
2999 mlog_errno(ret); 3001 mlog_errno(ret);
3000 3002
3001 mlog(0, "returning ret=%d\n", ret); 3003 mlog(0, "returning ret=%d\n", ret);
3002 return ret; 3004 return ret;
3003 } 3005 }
3004 3006
3005 3007
3006 /* if there is an existing mle for this lockres, we now know who the master is. 3008 /* if there is an existing mle for this lockres, we now know who the master is.
3007 * (the one who sent us *this* message) we can clear it up right away. 3009 * (the one who sent us *this* message) we can clear it up right away.
3008 * since the process that put the mle on the list still has a reference to it, 3010 * since the process that put the mle on the list still has a reference to it,
3009 * we can unhash it now, set the master and wake the process. as a result, 3011 * we can unhash it now, set the master and wake the process. as a result,
3010 * we will have no mle in the list to start with. now we can add an mle for 3012 * we will have no mle in the list to start with. now we can add an mle for
3011 * the migration and this should be the only one found for those scanning the 3013 * the migration and this should be the only one found for those scanning the
3012 * list. */ 3014 * list. */
3013 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 3015 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3014 void **ret_data) 3016 void **ret_data)
3015 { 3017 {
3016 struct dlm_ctxt *dlm = data; 3018 struct dlm_ctxt *dlm = data;
3017 struct dlm_lock_resource *res = NULL; 3019 struct dlm_lock_resource *res = NULL;
3018 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; 3020 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
3019 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; 3021 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
3020 const char *name; 3022 const char *name;
3021 unsigned int namelen, hash; 3023 unsigned int namelen, hash;
3022 int ret = 0; 3024 int ret = 0;
3023 3025
3024 if (!dlm_grab(dlm)) 3026 if (!dlm_grab(dlm))
3025 return -EINVAL; 3027 return -EINVAL;
3026 3028
3027 name = migrate->name; 3029 name = migrate->name;
3028 namelen = migrate->namelen; 3030 namelen = migrate->namelen;
3029 hash = dlm_lockid_hash(name, namelen); 3031 hash = dlm_lockid_hash(name, namelen);
3030 3032
3031 /* preallocate.. if this fails, abort */ 3033 /* preallocate.. if this fails, abort */
3032 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 3034 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3033 3035
3034 if (!mle) { 3036 if (!mle) {
3035 ret = -ENOMEM; 3037 ret = -ENOMEM;
3036 goto leave; 3038 goto leave;
3037 } 3039 }
3038 3040
3039 /* check for pre-existing lock */ 3041 /* check for pre-existing lock */
3040 spin_lock(&dlm->spinlock); 3042 spin_lock(&dlm->spinlock);
3041 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3043 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3042 if (res) { 3044 if (res) {
3043 spin_lock(&res->spinlock); 3045 spin_lock(&res->spinlock);
3044 if (res->state & DLM_LOCK_RES_RECOVERING) { 3046 if (res->state & DLM_LOCK_RES_RECOVERING) {
3045 /* if all is working ok, this can only mean that we got 3047 /* if all is working ok, this can only mean that we got
3046 * a migrate request from a node that we now see as 3048 * a migrate request from a node that we now see as
3047 * dead. what can we do here? drop it to the floor? */ 3049 * dead. what can we do here? drop it to the floor? */
3048 spin_unlock(&res->spinlock); 3050 spin_unlock(&res->spinlock);
3049 mlog(ML_ERROR, "Got a migrate request, but the " 3051 mlog(ML_ERROR, "Got a migrate request, but the "
3050 "lockres is marked as recovering!"); 3052 "lockres is marked as recovering!");
3051 kmem_cache_free(dlm_mle_cache, mle); 3053 kmem_cache_free(dlm_mle_cache, mle);
3052 ret = -EINVAL; /* need a better solution */ 3054 ret = -EINVAL; /* need a better solution */
3053 goto unlock; 3055 goto unlock;
3054 } 3056 }
3055 res->state |= DLM_LOCK_RES_MIGRATING; 3057 res->state |= DLM_LOCK_RES_MIGRATING;
3056 spin_unlock(&res->spinlock); 3058 spin_unlock(&res->spinlock);
3057 } 3059 }
3058 3060
3059 spin_lock(&dlm->master_lock); 3061 spin_lock(&dlm->master_lock);
3060 /* ignore status. only nonzero status would BUG. */ 3062 /* ignore status. only nonzero status would BUG. */
3061 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3063 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3062 name, namelen, 3064 name, namelen,
3063 migrate->new_master, 3065 migrate->new_master,
3064 migrate->master); 3066 migrate->master);
3065 3067
3066 spin_unlock(&dlm->master_lock); 3068 spin_unlock(&dlm->master_lock);
3067 unlock: 3069 unlock:
3068 spin_unlock(&dlm->spinlock); 3070 spin_unlock(&dlm->spinlock);
3069 3071
3070 if (oldmle) { 3072 if (oldmle) {
3071 /* master is known, detach if not already detached */ 3073 /* master is known, detach if not already detached */
3072 dlm_mle_detach_hb_events(dlm, oldmle); 3074 dlm_mle_detach_hb_events(dlm, oldmle);
3073 dlm_put_mle(oldmle); 3075 dlm_put_mle(oldmle);
3074 } 3076 }
3075 3077
3076 if (res) 3078 if (res)
3077 dlm_lockres_put(res); 3079 dlm_lockres_put(res);
3078 leave: 3080 leave:
3079 dlm_put(dlm); 3081 dlm_put(dlm);
3080 return ret; 3082 return ret;
3081 } 3083 }
3082 3084
3083 /* must be holding dlm->spinlock and dlm->master_lock 3085 /* must be holding dlm->spinlock and dlm->master_lock
3084 * when adding a migration mle, we can clear any other mles 3086 * when adding a migration mle, we can clear any other mles
3085 * in the master list because we know with certainty that 3087 * in the master list because we know with certainty that
3086 * the master is "master". so we remove any old mle from 3088 * the master is "master". so we remove any old mle from
3087 * the list after setting its master field, and then add 3089 * the list after setting its master field, and then add
3088 * the new migration mle. this way we can hold with the rule 3090 * the new migration mle. this way we can hold with the rule
3089 * of having only one mle for a given lock name at all times. */ 3091 * of having only one mle for a given lock name at all times. */
3090 static int dlm_add_migration_mle(struct dlm_ctxt *dlm, 3092 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3091 struct dlm_lock_resource *res, 3093 struct dlm_lock_resource *res,
3092 struct dlm_master_list_entry *mle, 3094 struct dlm_master_list_entry *mle,
3093 struct dlm_master_list_entry **oldmle, 3095 struct dlm_master_list_entry **oldmle,
3094 const char *name, unsigned int namelen, 3096 const char *name, unsigned int namelen,
3095 u8 new_master, u8 master) 3097 u8 new_master, u8 master)
3096 { 3098 {
3097 int found; 3099 int found;
3098 int ret = 0; 3100 int ret = 0;
3099 3101
3100 *oldmle = NULL; 3102 *oldmle = NULL;
3101 3103
3102 assert_spin_locked(&dlm->spinlock); 3104 assert_spin_locked(&dlm->spinlock);
3103 assert_spin_locked(&dlm->master_lock); 3105 assert_spin_locked(&dlm->master_lock);
3104 3106
3105 /* caller is responsible for any ref taken here on oldmle */ 3107 /* caller is responsible for any ref taken here on oldmle */
3106 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); 3108 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
3107 if (found) { 3109 if (found) {
3108 struct dlm_master_list_entry *tmp = *oldmle; 3110 struct dlm_master_list_entry *tmp = *oldmle;
3109 spin_lock(&tmp->spinlock); 3111 spin_lock(&tmp->spinlock);
3110 if (tmp->type == DLM_MLE_MIGRATION) { 3112 if (tmp->type == DLM_MLE_MIGRATION) {
3111 if (master == dlm->node_num) { 3113 if (master == dlm->node_num) {
3112 /* ah another process raced me to it */ 3114 /* ah another process raced me to it */
3113 mlog(0, "tried to migrate %.*s, but some " 3115 mlog(0, "tried to migrate %.*s, but some "
3114 "process beat me to it\n", 3116 "process beat me to it\n",
3115 namelen, name); 3117 namelen, name);
3116 ret = -EEXIST; 3118 ret = -EEXIST;
3117 } else { 3119 } else {
3118 /* bad. 2 NODES are trying to migrate! */ 3120 /* bad. 2 NODES are trying to migrate! */
3119 mlog(ML_ERROR, "migration error mle: " 3121 mlog(ML_ERROR, "migration error mle: "
3120 "master=%u new_master=%u // request: " 3122 "master=%u new_master=%u // request: "
3121 "master=%u new_master=%u // " 3123 "master=%u new_master=%u // "
3122 "lockres=%.*s\n", 3124 "lockres=%.*s\n",
3123 tmp->master, tmp->new_master, 3125 tmp->master, tmp->new_master,
3124 master, new_master, 3126 master, new_master,
3125 namelen, name); 3127 namelen, name);
3126 BUG(); 3128 BUG();
3127 } 3129 }
3128 } else { 3130 } else {
3129 /* this is essentially what assert_master does */ 3131 /* this is essentially what assert_master does */
3130 tmp->master = master; 3132 tmp->master = master;
3131 atomic_set(&tmp->woken, 1); 3133 atomic_set(&tmp->woken, 1);
3132 wake_up(&tmp->wq); 3134 wake_up(&tmp->wq);
3133 /* remove it so that only one mle will be found */ 3135 /* remove it so that only one mle will be found */
3134 __dlm_unlink_mle(dlm, tmp); 3136 __dlm_unlink_mle(dlm, tmp);
3135 __dlm_mle_detach_hb_events(dlm, tmp); 3137 __dlm_mle_detach_hb_events(dlm, tmp);
3136 if (tmp->type == DLM_MLE_MASTER) { 3138 if (tmp->type == DLM_MLE_MASTER) {
3137 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3139 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3138 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3140 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3139 "telling master to get ref " 3141 "telling master to get ref "
3140 "for cleared out mle during " 3142 "for cleared out mle during "
3141 "migration\n", dlm->name, 3143 "migration\n", dlm->name,
3142 namelen, name, master, 3144 namelen, name, master,
3143 new_master); 3145 new_master);
3144 } 3146 }
3145 } 3147 }
3146 spin_unlock(&tmp->spinlock); 3148 spin_unlock(&tmp->spinlock);
3147 } 3149 }
3148 3150
3149 /* now add a migration mle to the tail of the list */ 3151 /* now add a migration mle to the tail of the list */
3150 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); 3152 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
3151 mle->new_master = new_master; 3153 mle->new_master = new_master;
3152 /* the new master will be sending an assert master for this. 3154 /* the new master will be sending an assert master for this.
3153 * at that point we will get the refmap reference */ 3155 * at that point we will get the refmap reference */
3154 mle->master = master; 3156 mle->master = master;
3155 /* do this for consistency with other mle types */ 3157 /* do this for consistency with other mle types */
3156 set_bit(new_master, mle->maybe_map); 3158 set_bit(new_master, mle->maybe_map);
3157 __dlm_insert_mle(dlm, mle); 3159 __dlm_insert_mle(dlm, mle);
3158 3160
3159 return ret; 3161 return ret;
3160 } 3162 }
3161 3163
3162 /* 3164 /*
3163 * Sets the owner of the lockres, associated to the mle, to UNKNOWN 3165 * Sets the owner of the lockres, associated to the mle, to UNKNOWN
3164 */ 3166 */
3165 static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, 3167 static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3166 struct dlm_master_list_entry *mle) 3168 struct dlm_master_list_entry *mle)
3167 { 3169 {
3168 struct dlm_lock_resource *res; 3170 struct dlm_lock_resource *res;
3169 3171
3170 /* Find the lockres associated to the mle and set its owner to UNK */ 3172 /* Find the lockres associated to the mle and set its owner to UNK */
3171 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, 3173 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3172 mle->mnamehash); 3174 mle->mnamehash);
3173 if (res) { 3175 if (res) {
3174 spin_unlock(&dlm->master_lock); 3176 spin_unlock(&dlm->master_lock);
3175 3177
3176 /* move lockres onto recovery list */ 3178 /* move lockres onto recovery list */
3177 spin_lock(&res->spinlock); 3179 spin_lock(&res->spinlock);
3178 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 3180 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3179 dlm_move_lockres_to_recovery_list(dlm, res); 3181 dlm_move_lockres_to_recovery_list(dlm, res);
3180 spin_unlock(&res->spinlock); 3182 spin_unlock(&res->spinlock);
3181 dlm_lockres_put(res); 3183 dlm_lockres_put(res);
3182 3184
3183 /* about to get rid of mle, detach from heartbeat */ 3185 /* about to get rid of mle, detach from heartbeat */
3184 __dlm_mle_detach_hb_events(dlm, mle); 3186 __dlm_mle_detach_hb_events(dlm, mle);
3185 3187
3186 /* dump the mle */ 3188 /* dump the mle */
3187 spin_lock(&dlm->master_lock); 3189 spin_lock(&dlm->master_lock);
3188 __dlm_put_mle(mle); 3190 __dlm_put_mle(mle);
3189 spin_unlock(&dlm->master_lock); 3191 spin_unlock(&dlm->master_lock);
3190 } 3192 }
3191 3193
3192 return res; 3194 return res;
3193 } 3195 }
3194 3196
3195 static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, 3197 static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3196 struct dlm_master_list_entry *mle) 3198 struct dlm_master_list_entry *mle)
3197 { 3199 {
3198 __dlm_mle_detach_hb_events(dlm, mle); 3200 __dlm_mle_detach_hb_events(dlm, mle);
3199 3201
3200 spin_lock(&mle->spinlock); 3202 spin_lock(&mle->spinlock);
3201 __dlm_unlink_mle(dlm, mle); 3203 __dlm_unlink_mle(dlm, mle);
3202 atomic_set(&mle->woken, 1); 3204 atomic_set(&mle->woken, 1);
3203 spin_unlock(&mle->spinlock); 3205 spin_unlock(&mle->spinlock);
3204 3206
3205 wake_up(&mle->wq); 3207 wake_up(&mle->wq);
3206 } 3208 }
3207 3209
3208 static void dlm_clean_block_mle(struct dlm_ctxt *dlm, 3210 static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3209 struct dlm_master_list_entry *mle, u8 dead_node) 3211 struct dlm_master_list_entry *mle, u8 dead_node)
3210 { 3212 {
3211 int bit; 3213 int bit;
3212 3214
3213 BUG_ON(mle->type != DLM_MLE_BLOCK); 3215 BUG_ON(mle->type != DLM_MLE_BLOCK);
3214 3216
3215 spin_lock(&mle->spinlock); 3217 spin_lock(&mle->spinlock);
3216 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3218 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3217 if (bit != dead_node) { 3219 if (bit != dead_node) {
3218 mlog(0, "mle found, but dead node %u would not have been " 3220 mlog(0, "mle found, but dead node %u would not have been "
3219 "master\n", dead_node); 3221 "master\n", dead_node);
3220 spin_unlock(&mle->spinlock); 3222 spin_unlock(&mle->spinlock);
3221 } else { 3223 } else {
3222 /* Must drop the refcount by one since the assert_master will 3224 /* Must drop the refcount by one since the assert_master will
3223 * never arrive. This may result in the mle being unlinked and 3225 * never arrive. This may result in the mle being unlinked and
3224 * freed, but there may still be a process waiting in the 3226 * freed, but there may still be a process waiting in the
3225 * dlmlock path which is fine. */ 3227 * dlmlock path which is fine. */
3226 mlog(0, "node %u was expected master\n", dead_node); 3228 mlog(0, "node %u was expected master\n", dead_node);
3227 atomic_set(&mle->woken, 1); 3229 atomic_set(&mle->woken, 1);
3228 spin_unlock(&mle->spinlock); 3230 spin_unlock(&mle->spinlock);
3229 wake_up(&mle->wq); 3231 wake_up(&mle->wq);
3230 3232
3231 /* Do not need events any longer, so detach from heartbeat */ 3233 /* Do not need events any longer, so detach from heartbeat */
3232 __dlm_mle_detach_hb_events(dlm, mle); 3234 __dlm_mle_detach_hb_events(dlm, mle);
3233 __dlm_put_mle(mle); 3235 __dlm_put_mle(mle);
3234 } 3236 }
3235 } 3237 }
3236 3238
3237 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3239 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3238 { 3240 {
3239 struct dlm_master_list_entry *mle; 3241 struct dlm_master_list_entry *mle;
3240 struct dlm_lock_resource *res; 3242 struct dlm_lock_resource *res;
3241 struct hlist_head *bucket; 3243 struct hlist_head *bucket;
3242 struct hlist_node *tmp; 3244 struct hlist_node *tmp;
3243 unsigned int i; 3245 unsigned int i;
3244 3246
3245 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); 3247 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
3246 top: 3248 top:
3247 assert_spin_locked(&dlm->spinlock); 3249 assert_spin_locked(&dlm->spinlock);
3248 3250
3249 /* clean the master list */ 3251 /* clean the master list */
3250 spin_lock(&dlm->master_lock); 3252 spin_lock(&dlm->master_lock);
3251 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3253 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3252 bucket = dlm_master_hash(dlm, i); 3254 bucket = dlm_master_hash(dlm, i);
3253 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 3255 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3254 BUG_ON(mle->type != DLM_MLE_BLOCK && 3256 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3255 mle->type != DLM_MLE_MASTER && 3257 mle->type != DLM_MLE_MASTER &&
3256 mle->type != DLM_MLE_MIGRATION); 3258 mle->type != DLM_MLE_MIGRATION);
3257 3259
3258 /* MASTER mles are initiated locally. The waiting 3260 /* MASTER mles are initiated locally. The waiting
3259 * process will notice the node map change shortly. 3261 * process will notice the node map change shortly.
3260 * Let that happen as normal. */ 3262 * Let that happen as normal. */
3261 if (mle->type == DLM_MLE_MASTER) 3263 if (mle->type == DLM_MLE_MASTER)
3262 continue; 3264 continue;
3263 3265
3264 /* BLOCK mles are initiated by other nodes. Need to 3266 /* BLOCK mles are initiated by other nodes. Need to
3265 * clean up if the dead node would have been the 3267 * clean up if the dead node would have been the
3266 * master. */ 3268 * master. */
3267 if (mle->type == DLM_MLE_BLOCK) { 3269 if (mle->type == DLM_MLE_BLOCK) {
3268 dlm_clean_block_mle(dlm, mle, dead_node); 3270 dlm_clean_block_mle(dlm, mle, dead_node);
3269 continue; 3271 continue;
3270 } 3272 }
3271 3273
3272 /* Everything else is a MIGRATION mle */ 3274 /* Everything else is a MIGRATION mle */
3273 3275
3274 /* The rule for MIGRATION mles is that the master 3276 /* The rule for MIGRATION mles is that the master
3275 * becomes UNKNOWN if *either* the original or the new 3277 * becomes UNKNOWN if *either* the original or the new
3276 * master dies. All UNKNOWN lockres' are sent to 3278 * master dies. All UNKNOWN lockres' are sent to
3277 * whichever node becomes the recovery master. The new 3279 * whichever node becomes the recovery master. The new
3278 * master is responsible for determining if there is 3280 * master is responsible for determining if there is
3279 * still a master for this lockres, or if he needs to 3281 * still a master for this lockres, or if he needs to
3280 * take over mastery. Either way, this node should 3282 * take over mastery. Either way, this node should
3281 * expect another message to resolve this. */ 3283 * expect another message to resolve this. */
3282 3284
3283 if (mle->master != dead_node && 3285 if (mle->master != dead_node &&
3284 mle->new_master != dead_node) 3286 mle->new_master != dead_node)
3285 continue; 3287 continue;
3286 3288
3287 /* If we have reached this point, this mle needs to be 3289 /* If we have reached this point, this mle needs to be
3288 * removed from the list and freed. */ 3290 * removed from the list and freed. */
3289 dlm_clean_migration_mle(dlm, mle); 3291 dlm_clean_migration_mle(dlm, mle);
3290 3292
3291 mlog(0, "%s: node %u died during migration from " 3293 mlog(0, "%s: node %u died during migration from "
3292 "%u to %u!\n", dlm->name, dead_node, mle->master, 3294 "%u to %u!\n", dlm->name, dead_node, mle->master,
3293 mle->new_master); 3295 mle->new_master);
3294 3296
3295 /* If we find a lockres associated with the mle, we've 3297 /* If we find a lockres associated with the mle, we've
3296 * hit this rare case that messes up our lock ordering. 3298 * hit this rare case that messes up our lock ordering.
3297 * If so, we need to drop the master lock so that we can 3299 * If so, we need to drop the master lock so that we can
3298 * take the lockres lock, meaning that we will have to 3300 * take the lockres lock, meaning that we will have to
3299 * restart from the head of list. */ 3301 * restart from the head of list. */
3300 res = dlm_reset_mleres_owner(dlm, mle); 3302 res = dlm_reset_mleres_owner(dlm, mle);
3301 if (res) 3303 if (res)
3302 /* restart */ 3304 /* restart */
3303 goto top; 3305 goto top;
3304 3306
3305 /* This may be the last reference */ 3307 /* This may be the last reference */
3306 __dlm_put_mle(mle); 3308 __dlm_put_mle(mle);
3307 } 3309 }
3308 } 3310 }
3309 spin_unlock(&dlm->master_lock); 3311 spin_unlock(&dlm->master_lock);
3310 } 3312 }
3311 3313
3312 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3314 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3313 u8 old_master) 3315 u8 old_master)
3314 { 3316 {
3315 struct dlm_node_iter iter; 3317 struct dlm_node_iter iter;
3316 int ret = 0; 3318 int ret = 0;
3317 3319
3318 spin_lock(&dlm->spinlock); 3320 spin_lock(&dlm->spinlock);
3319 dlm_node_iter_init(dlm->domain_map, &iter); 3321 dlm_node_iter_init(dlm->domain_map, &iter);
3320 clear_bit(old_master, iter.node_map); 3322 clear_bit(old_master, iter.node_map);
3321 clear_bit(dlm->node_num, iter.node_map); 3323 clear_bit(dlm->node_num, iter.node_map);
3322 spin_unlock(&dlm->spinlock); 3324 spin_unlock(&dlm->spinlock);
3323 3325
3324 /* ownership of the lockres is changing. account for the 3326 /* ownership of the lockres is changing. account for the
3325 * mastery reference here since old_master will briefly have 3327 * mastery reference here since old_master will briefly have
3326 * a reference after the migration completes */ 3328 * a reference after the migration completes */
3327 spin_lock(&res->spinlock); 3329 spin_lock(&res->spinlock);
3328 dlm_lockres_set_refmap_bit(dlm, res, old_master); 3330 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3329 spin_unlock(&res->spinlock); 3331 spin_unlock(&res->spinlock);
3330 3332
3331 mlog(0, "now time to do a migrate request to other nodes\n"); 3333 mlog(0, "now time to do a migrate request to other nodes\n");
3332 ret = dlm_do_migrate_request(dlm, res, old_master, 3334 ret = dlm_do_migrate_request(dlm, res, old_master,
3333 dlm->node_num, &iter); 3335 dlm->node_num, &iter);
3334 if (ret < 0) { 3336 if (ret < 0) {
3335 mlog_errno(ret); 3337 mlog_errno(ret);
3336 goto leave; 3338 goto leave;
3337 } 3339 }
3338 3340
3339 mlog(0, "doing assert master of %.*s to all except the original node\n", 3341 mlog(0, "doing assert master of %.*s to all except the original node\n",
3340 res->lockname.len, res->lockname.name); 3342 res->lockname.len, res->lockname.name);
3341 /* this call now finishes out the nodemap 3343 /* this call now finishes out the nodemap
3342 * even if one or more nodes die */ 3344 * even if one or more nodes die */
3343 ret = dlm_do_assert_master(dlm, res, iter.node_map, 3345 ret = dlm_do_assert_master(dlm, res, iter.node_map,
3344 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3346 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3345 if (ret < 0) { 3347 if (ret < 0) {
3346 /* no longer need to retry. all living nodes contacted. */ 3348 /* no longer need to retry. all living nodes contacted. */
3347 mlog_errno(ret); 3349 mlog_errno(ret);
3348 ret = 0; 3350 ret = 0;
3349 } 3351 }
3350 3352
3351 memset(iter.node_map, 0, sizeof(iter.node_map)); 3353 memset(iter.node_map, 0, sizeof(iter.node_map));
3352 set_bit(old_master, iter.node_map); 3354 set_bit(old_master, iter.node_map);
3353 mlog(0, "doing assert master of %.*s back to %u\n", 3355 mlog(0, "doing assert master of %.*s back to %u\n",
3354 res->lockname.len, res->lockname.name, old_master); 3356 res->lockname.len, res->lockname.name, old_master);
3355 ret = dlm_do_assert_master(dlm, res, iter.node_map, 3357 ret = dlm_do_assert_master(dlm, res, iter.node_map,
3356 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3358 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3357 if (ret < 0) { 3359 if (ret < 0) {
3358 mlog(0, "assert master to original master failed " 3360 mlog(0, "assert master to original master failed "
3359 "with %d.\n", ret); 3361 "with %d.\n", ret);
3360 /* the only nonzero status here would be because of 3362 /* the only nonzero status here would be because of
3361 * a dead original node. we're done. */ 3363 * a dead original node. we're done. */
3362 ret = 0; 3364 ret = 0;
3363 } 3365 }
3364 3366
3365 /* all done, set the owner, clear the flag */ 3367 /* all done, set the owner, clear the flag */
3366 spin_lock(&res->spinlock); 3368 spin_lock(&res->spinlock);
3367 dlm_set_lockres_owner(dlm, res, dlm->node_num); 3369 dlm_set_lockres_owner(dlm, res, dlm->node_num);
3368 res->state &= ~DLM_LOCK_RES_MIGRATING; 3370 res->state &= ~DLM_LOCK_RES_MIGRATING;
3369 spin_unlock(&res->spinlock); 3371 spin_unlock(&res->spinlock);
3370 /* re-dirty it on the new master */ 3372 /* re-dirty it on the new master */
3371 dlm_kick_thread(dlm, res); 3373 dlm_kick_thread(dlm, res);
3372 wake_up(&res->wq); 3374 wake_up(&res->wq);
3373 leave: 3375 leave:
3374 return ret; 3376 return ret;
3375 } 3377 }
3376 3378
3377 /* 3379 /*
3378 * LOCKRES AST REFCOUNT 3380 * LOCKRES AST REFCOUNT
3379 * this is integral to migration 3381 * this is integral to migration
3380 */ 3382 */
3381 3383
3382 /* for future intent to call an ast, reserve one ahead of time. 3384 /* for future intent to call an ast, reserve one ahead of time.
3383 * this should be called only after waiting on the lockres 3385 * this should be called only after waiting on the lockres
3384 * with dlm_wait_on_lockres, and while still holding the 3386 * with dlm_wait_on_lockres, and while still holding the
3385 * spinlock after the call. */ 3387 * spinlock after the call. */
3386 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) 3388 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
3387 { 3389 {
3388 assert_spin_locked(&res->spinlock); 3390 assert_spin_locked(&res->spinlock);
3389 if (res->state & DLM_LOCK_RES_MIGRATING) { 3391 if (res->state & DLM_LOCK_RES_MIGRATING) {
3390 __dlm_print_one_lock_resource(res); 3392 __dlm_print_one_lock_resource(res);
3391 } 3393 }
3392 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 3394 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3393 3395
3394 atomic_inc(&res->asts_reserved); 3396 atomic_inc(&res->asts_reserved);
3395 } 3397 }
3396 3398
3397 /* 3399 /*
3398 * used to drop the reserved ast, either because it went unused, 3400 * used to drop the reserved ast, either because it went unused,
3399 * or because the ast/bast was actually called. 3401 * or because the ast/bast was actually called.
3400 * 3402 *
3401 * also, if there is a pending migration on this lockres, 3403 * also, if there is a pending migration on this lockres,
3402 * and this was the last pending ast on the lockres, 3404 * and this was the last pending ast on the lockres,
3403 * atomically set the MIGRATING flag before we drop the lock. 3405 * atomically set the MIGRATING flag before we drop the lock.
3404 * this is how we ensure that migration can proceed with no 3406 * this is how we ensure that migration can proceed with no
3405 * asts in progress. note that it is ok if the state of the 3407 * asts in progress. note that it is ok if the state of the
3406 * queues is such that a lock should be granted in the future 3408 * queues is such that a lock should be granted in the future
3407 * or that a bast should be fired, because the new master will 3409 * or that a bast should be fired, because the new master will
3408 * shuffle the lists on this lockres as soon as it is migrated. 3410 * shuffle the lists on this lockres as soon as it is migrated.
3409 */ 3411 */
3410 void dlm_lockres_release_ast(struct dlm_ctxt *dlm, 3412 void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3411 struct dlm_lock_resource *res) 3413 struct dlm_lock_resource *res)
3412 { 3414 {
3413 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) 3415 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
3414 return; 3416 return;
3415 3417
3416 if (!res->migration_pending) { 3418 if (!res->migration_pending) {
3417 spin_unlock(&res->spinlock); 3419 spin_unlock(&res->spinlock);
3418 return; 3420 return;
3419 } 3421 }
3420 3422
3421 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 3423 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3422 res->migration_pending = 0; 3424 res->migration_pending = 0;
3423 res->state |= DLM_LOCK_RES_MIGRATING; 3425 res->state |= DLM_LOCK_RES_MIGRATING;
3424 spin_unlock(&res->spinlock); 3426 spin_unlock(&res->spinlock);
3425 wake_up(&res->wq); 3427 wake_up(&res->wq);
3426 wake_up(&dlm->migration_wq); 3428 wake_up(&dlm->migration_wq);
3427 } 3429 }
3428 3430
3429 void dlm_force_free_mles(struct dlm_ctxt *dlm) 3431 void dlm_force_free_mles(struct dlm_ctxt *dlm)
3430 { 3432 {
3431 int i; 3433 int i;
3432 struct hlist_head *bucket; 3434 struct hlist_head *bucket;
3433 struct dlm_master_list_entry *mle; 3435 struct dlm_master_list_entry *mle;
3434 struct hlist_node *tmp; 3436 struct hlist_node *tmp;
3435 3437
3436 /* 3438 /*
3437 * We notified all other nodes that we are exiting the domain and 3439 * We notified all other nodes that we are exiting the domain and
3438 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still 3440 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3439 * around we force free them and wake any processes that are waiting 3441 * around we force free them and wake any processes that are waiting
3440 * on the mles 3442 * on the mles
3441 */ 3443 */
3442 spin_lock(&dlm->spinlock); 3444 spin_lock(&dlm->spinlock);
3443 spin_lock(&dlm->master_lock); 3445 spin_lock(&dlm->master_lock);
3444 3446
3445 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); 3447 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3446 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); 3448 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3447 3449
3448 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3450 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3449 bucket = dlm_master_hash(dlm, i); 3451 bucket = dlm_master_hash(dlm, i);
3450 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 3452 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3451 if (mle->type != DLM_MLE_BLOCK) { 3453 if (mle->type != DLM_MLE_BLOCK) {
3452 mlog(ML_ERROR, "bad mle: %p\n", mle); 3454 mlog(ML_ERROR, "bad mle: %p\n", mle);
3453 dlm_print_one_mle(mle); 3455 dlm_print_one_mle(mle);
3454 } 3456 }
3455 atomic_set(&mle->woken, 1); 3457 atomic_set(&mle->woken, 1);
3456 wake_up(&mle->wq); 3458 wake_up(&mle->wq);
3457 3459
3458 __dlm_unlink_mle(dlm, mle); 3460 __dlm_unlink_mle(dlm, mle);
3459 __dlm_mle_detach_hb_events(dlm, mle); 3461 __dlm_mle_detach_hb_events(dlm, mle);
3460 __dlm_put_mle(mle); 3462 __dlm_put_mle(mle);
3461 } 3463 }
fs/ocfs2/super.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * super.c 4 * super.c
5 * 5 *
6 * load/unload driver, mount/dismount volumes 6 * load/unload driver, mount/dismount volumes
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/init.h> 31 #include <linux/init.h>
32 #include <linux/random.h> 32 #include <linux/random.h>
33 #include <linux/statfs.h> 33 #include <linux/statfs.h>
34 #include <linux/moduleparam.h> 34 #include <linux/moduleparam.h>
35 #include <linux/blkdev.h> 35 #include <linux/blkdev.h>
36 #include <linux/socket.h> 36 #include <linux/socket.h>
37 #include <linux/inet.h> 37 #include <linux/inet.h>
38 #include <linux/parser.h> 38 #include <linux/parser.h>
39 #include <linux/crc32.h> 39 #include <linux/crc32.h>
40 #include <linux/debugfs.h> 40 #include <linux/debugfs.h>
41 #include <linux/mount.h> 41 #include <linux/mount.h>
42 #include <linux/seq_file.h> 42 #include <linux/seq_file.h>
43 #include <linux/quotaops.h> 43 #include <linux/quotaops.h>
44 #include <linux/cleancache.h> 44 #include <linux/cleancache.h>
45 45
46 #define CREATE_TRACE_POINTS 46 #define CREATE_TRACE_POINTS
47 #include "ocfs2_trace.h" 47 #include "ocfs2_trace.h"
48 48
49 #include <cluster/masklog.h> 49 #include <cluster/masklog.h>
50 50
51 #include "ocfs2.h" 51 #include "ocfs2.h"
52 52
53 /* this should be the only file to include a version 1 header */ 53 /* this should be the only file to include a version 1 header */
54 #include "ocfs1_fs_compat.h" 54 #include "ocfs1_fs_compat.h"
55 55
56 #include "alloc.h" 56 #include "alloc.h"
57 #include "aops.h" 57 #include "aops.h"
58 #include "blockcheck.h" 58 #include "blockcheck.h"
59 #include "dlmglue.h" 59 #include "dlmglue.h"
60 #include "export.h" 60 #include "export.h"
61 #include "extent_map.h" 61 #include "extent_map.h"
62 #include "heartbeat.h" 62 #include "heartbeat.h"
63 #include "inode.h" 63 #include "inode.h"
64 #include "journal.h" 64 #include "journal.h"
65 #include "localalloc.h" 65 #include "localalloc.h"
66 #include "namei.h" 66 #include "namei.h"
67 #include "slot_map.h" 67 #include "slot_map.h"
68 #include "super.h" 68 #include "super.h"
69 #include "sysfile.h" 69 #include "sysfile.h"
70 #include "uptodate.h" 70 #include "uptodate.h"
71 #include "xattr.h" 71 #include "xattr.h"
72 #include "quota.h" 72 #include "quota.h"
73 #include "refcounttree.h" 73 #include "refcounttree.h"
74 #include "suballoc.h" 74 #include "suballoc.h"
75 75
76 #include "buffer_head_io.h" 76 #include "buffer_head_io.h"
77 77
78 static struct kmem_cache *ocfs2_inode_cachep; 78 static struct kmem_cache *ocfs2_inode_cachep;
79 struct kmem_cache *ocfs2_dquot_cachep; 79 struct kmem_cache *ocfs2_dquot_cachep;
80 struct kmem_cache *ocfs2_qf_chunk_cachep; 80 struct kmem_cache *ocfs2_qf_chunk_cachep;
81 81
82 /* OCFS2 needs to schedule several different types of work which 82 /* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these 83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events 84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */ 85 * workqueue and schedule on our own. */
86 struct workqueue_struct *ocfs2_wq = NULL; 86 struct workqueue_struct *ocfs2_wq = NULL;
87 87
88 static struct dentry *ocfs2_debugfs_root; 88 static struct dentry *ocfs2_debugfs_root;
89 89
90 MODULE_AUTHOR("Oracle"); 90 MODULE_AUTHOR("Oracle");
91 MODULE_LICENSE("GPL"); 91 MODULE_LICENSE("GPL");
92 MODULE_DESCRIPTION("OCFS2 cluster file system"); 92 MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94 struct mount_options 94 struct mount_options
95 { 95 {
96 unsigned long commit_interval; 96 unsigned long commit_interval;
97 unsigned long mount_opt; 97 unsigned long mount_opt;
98 unsigned int atime_quantum; 98 unsigned int atime_quantum;
99 signed short slot; 99 signed short slot;
100 int localalloc_opt; 100 int localalloc_opt;
101 unsigned int resv_level; 101 unsigned int resv_level;
102 int dir_resv_level; 102 int dir_resv_level;
103 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 103 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
104 }; 104 };
105 105
106 static int ocfs2_parse_options(struct super_block *sb, char *options, 106 static int ocfs2_parse_options(struct super_block *sb, char *options,
107 struct mount_options *mopt, 107 struct mount_options *mopt,
108 int is_remount); 108 int is_remount);
109 static int ocfs2_check_set_options(struct super_block *sb, 109 static int ocfs2_check_set_options(struct super_block *sb,
110 struct mount_options *options); 110 struct mount_options *options);
111 static int ocfs2_show_options(struct seq_file *s, struct dentry *root); 111 static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
112 static void ocfs2_put_super(struct super_block *sb); 112 static void ocfs2_put_super(struct super_block *sb);
113 static int ocfs2_mount_volume(struct super_block *sb); 113 static int ocfs2_mount_volume(struct super_block *sb);
114 static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 114 static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
115 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); 115 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
116 static int ocfs2_initialize_mem_caches(void); 116 static int ocfs2_initialize_mem_caches(void);
117 static void ocfs2_free_mem_caches(void); 117 static void ocfs2_free_mem_caches(void);
118 static void ocfs2_delete_osb(struct ocfs2_super *osb); 118 static void ocfs2_delete_osb(struct ocfs2_super *osb);
119 119
120 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); 120 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
121 121
122 static int ocfs2_sync_fs(struct super_block *sb, int wait); 122 static int ocfs2_sync_fs(struct super_block *sb, int wait);
123 123
124 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 124 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
125 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 125 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
126 static void ocfs2_release_system_inodes(struct ocfs2_super *osb); 126 static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
127 static int ocfs2_check_volume(struct ocfs2_super *osb); 127 static int ocfs2_check_volume(struct ocfs2_super *osb);
128 static int ocfs2_verify_volume(struct ocfs2_dinode *di, 128 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
129 struct buffer_head *bh, 129 struct buffer_head *bh,
130 u32 sectsize, 130 u32 sectsize,
131 struct ocfs2_blockcheck_stats *stats); 131 struct ocfs2_blockcheck_stats *stats);
132 static int ocfs2_initialize_super(struct super_block *sb, 132 static int ocfs2_initialize_super(struct super_block *sb,
133 struct buffer_head *bh, 133 struct buffer_head *bh,
134 int sector_size, 134 int sector_size,
135 struct ocfs2_blockcheck_stats *stats); 135 struct ocfs2_blockcheck_stats *stats);
136 static int ocfs2_get_sector(struct super_block *sb, 136 static int ocfs2_get_sector(struct super_block *sb,
137 struct buffer_head **bh, 137 struct buffer_head **bh,
138 int block, 138 int block,
139 int sect_size); 139 int sect_size);
140 static struct inode *ocfs2_alloc_inode(struct super_block *sb); 140 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
141 static void ocfs2_destroy_inode(struct inode *inode); 141 static void ocfs2_destroy_inode(struct inode *inode);
142 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); 142 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
143 static int ocfs2_enable_quotas(struct ocfs2_super *osb); 143 static int ocfs2_enable_quotas(struct ocfs2_super *osb);
144 static void ocfs2_disable_quotas(struct ocfs2_super *osb); 144 static void ocfs2_disable_quotas(struct ocfs2_super *osb);
145 145
146 static const struct super_operations ocfs2_sops = { 146 static const struct super_operations ocfs2_sops = {
147 .statfs = ocfs2_statfs, 147 .statfs = ocfs2_statfs,
148 .alloc_inode = ocfs2_alloc_inode, 148 .alloc_inode = ocfs2_alloc_inode,
149 .destroy_inode = ocfs2_destroy_inode, 149 .destroy_inode = ocfs2_destroy_inode,
150 .drop_inode = ocfs2_drop_inode, 150 .drop_inode = ocfs2_drop_inode,
151 .evict_inode = ocfs2_evict_inode, 151 .evict_inode = ocfs2_evict_inode,
152 .sync_fs = ocfs2_sync_fs, 152 .sync_fs = ocfs2_sync_fs,
153 .put_super = ocfs2_put_super, 153 .put_super = ocfs2_put_super,
154 .remount_fs = ocfs2_remount, 154 .remount_fs = ocfs2_remount,
155 .show_options = ocfs2_show_options, 155 .show_options = ocfs2_show_options,
156 .quota_read = ocfs2_quota_read, 156 .quota_read = ocfs2_quota_read,
157 .quota_write = ocfs2_quota_write, 157 .quota_write = ocfs2_quota_write,
158 }; 158 };
159 159
160 enum { 160 enum {
161 Opt_barrier, 161 Opt_barrier,
162 Opt_err_panic, 162 Opt_err_panic,
163 Opt_err_ro, 163 Opt_err_ro,
164 Opt_intr, 164 Opt_intr,
165 Opt_nointr, 165 Opt_nointr,
166 Opt_hb_none, 166 Opt_hb_none,
167 Opt_hb_local, 167 Opt_hb_local,
168 Opt_hb_global, 168 Opt_hb_global,
169 Opt_data_ordered, 169 Opt_data_ordered,
170 Opt_data_writeback, 170 Opt_data_writeback,
171 Opt_atime_quantum, 171 Opt_atime_quantum,
172 Opt_slot, 172 Opt_slot,
173 Opt_commit, 173 Opt_commit,
174 Opt_localalloc, 174 Opt_localalloc,
175 Opt_localflocks, 175 Opt_localflocks,
176 Opt_stack, 176 Opt_stack,
177 Opt_user_xattr, 177 Opt_user_xattr,
178 Opt_nouser_xattr, 178 Opt_nouser_xattr,
179 Opt_inode64, 179 Opt_inode64,
180 Opt_acl, 180 Opt_acl,
181 Opt_noacl, 181 Opt_noacl,
182 Opt_usrquota, 182 Opt_usrquota,
183 Opt_grpquota, 183 Opt_grpquota,
184 Opt_coherency_buffered, 184 Opt_coherency_buffered,
185 Opt_coherency_full, 185 Opt_coherency_full,
186 Opt_resv_level, 186 Opt_resv_level,
187 Opt_dir_resv_level, 187 Opt_dir_resv_level,
188 Opt_err, 188 Opt_err,
189 }; 189 };
190 190
191 static const match_table_t tokens = { 191 static const match_table_t tokens = {
192 {Opt_barrier, "barrier=%u"}, 192 {Opt_barrier, "barrier=%u"},
193 {Opt_err_panic, "errors=panic"}, 193 {Opt_err_panic, "errors=panic"},
194 {Opt_err_ro, "errors=remount-ro"}, 194 {Opt_err_ro, "errors=remount-ro"},
195 {Opt_intr, "intr"}, 195 {Opt_intr, "intr"},
196 {Opt_nointr, "nointr"}, 196 {Opt_nointr, "nointr"},
197 {Opt_hb_none, OCFS2_HB_NONE}, 197 {Opt_hb_none, OCFS2_HB_NONE},
198 {Opt_hb_local, OCFS2_HB_LOCAL}, 198 {Opt_hb_local, OCFS2_HB_LOCAL},
199 {Opt_hb_global, OCFS2_HB_GLOBAL}, 199 {Opt_hb_global, OCFS2_HB_GLOBAL},
200 {Opt_data_ordered, "data=ordered"}, 200 {Opt_data_ordered, "data=ordered"},
201 {Opt_data_writeback, "data=writeback"}, 201 {Opt_data_writeback, "data=writeback"},
202 {Opt_atime_quantum, "atime_quantum=%u"}, 202 {Opt_atime_quantum, "atime_quantum=%u"},
203 {Opt_slot, "preferred_slot=%u"}, 203 {Opt_slot, "preferred_slot=%u"},
204 {Opt_commit, "commit=%u"}, 204 {Opt_commit, "commit=%u"},
205 {Opt_localalloc, "localalloc=%d"}, 205 {Opt_localalloc, "localalloc=%d"},
206 {Opt_localflocks, "localflocks"}, 206 {Opt_localflocks, "localflocks"},
207 {Opt_stack, "cluster_stack=%s"}, 207 {Opt_stack, "cluster_stack=%s"},
208 {Opt_user_xattr, "user_xattr"}, 208 {Opt_user_xattr, "user_xattr"},
209 {Opt_nouser_xattr, "nouser_xattr"}, 209 {Opt_nouser_xattr, "nouser_xattr"},
210 {Opt_inode64, "inode64"}, 210 {Opt_inode64, "inode64"},
211 {Opt_acl, "acl"}, 211 {Opt_acl, "acl"},
212 {Opt_noacl, "noacl"}, 212 {Opt_noacl, "noacl"},
213 {Opt_usrquota, "usrquota"}, 213 {Opt_usrquota, "usrquota"},
214 {Opt_grpquota, "grpquota"}, 214 {Opt_grpquota, "grpquota"},
215 {Opt_coherency_buffered, "coherency=buffered"}, 215 {Opt_coherency_buffered, "coherency=buffered"},
216 {Opt_coherency_full, "coherency=full"}, 216 {Opt_coherency_full, "coherency=full"},
217 {Opt_resv_level, "resv_level=%u"}, 217 {Opt_resv_level, "resv_level=%u"},
218 {Opt_dir_resv_level, "dir_resv_level=%u"}, 218 {Opt_dir_resv_level, "dir_resv_level=%u"},
219 {Opt_err, NULL} 219 {Opt_err, NULL}
220 }; 220 };
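The tokens table above feeds the generic mount-option parser from <linux/parser.h>. The in-tree ocfs2_parse_options() is not part of this hunk, so what follows is only a minimal sketch, built around a hypothetical example_parse() helper, of how such a table is typically consumed: the option string is split on commas, each fragment is matched with match_token(), and numeric arguments such as atime_quantum=%u are decoded with match_int().

/*
 * Sketch only: example_parse() is a hypothetical helper, not the real
 * ocfs2_parse_options().  It reuses the tokens table and Opt_* enum
 * defined above and follows the same 1 = ok / 0 = error convention.
 */
#include <linux/parser.h>
#include <linux/string.h>

static int example_parse(char *options)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		int token, option;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_atime_quantum:
			/* "atime_quantum=%u": pull out the numeric argument */
			if (match_int(&args[0], &option))
				return 0;
			/* option now holds the requested quantum */
			break;
		case Opt_intr:
		case Opt_nointr:
			/* flag-style options carry no argument */
			break;
		case Opt_err:
		default:
			/* unrecognised option string */
			return 0;
		}
	}
	return 1;
}

The real parser records the decoded values in a struct mount_options, which ocfs2_check_set_options() then validates; both are called from ocfs2_remount() and ocfs2_fill_super() further down.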
221 221
222 #ifdef CONFIG_DEBUG_FS 222 #ifdef CONFIG_DEBUG_FS
223 static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) 223 static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
224 { 224 {
225 struct ocfs2_cluster_connection *cconn = osb->cconn; 225 struct ocfs2_cluster_connection *cconn = osb->cconn;
226 struct ocfs2_recovery_map *rm = osb->recovery_map; 226 struct ocfs2_recovery_map *rm = osb->recovery_map;
227 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; 227 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
228 int i, out = 0; 228 int i, out = 0;
229 229
230 out += snprintf(buf + out, len - out, 230 out += snprintf(buf + out, len - out,
231 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 231 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
232 "Device", osb->dev_str, osb->uuid_str, 232 "Device", osb->dev_str, osb->uuid_str,
233 osb->fs_generation, osb->vol_label); 233 osb->fs_generation, osb->vol_label);
234 234
235 out += snprintf(buf + out, len - out, 235 out += snprintf(buf + out, len - out,
236 "%10s => State: %d Flags: 0x%lX\n", "Volume", 236 "%10s => State: %d Flags: 0x%lX\n", "Volume",
237 atomic_read(&osb->vol_state), osb->osb_flags); 237 atomic_read(&osb->vol_state), osb->osb_flags);
238 238
239 out += snprintf(buf + out, len - out, 239 out += snprintf(buf + out, len - out,
240 "%10s => Block: %lu Cluster: %d\n", "Sizes", 240 "%10s => Block: %lu Cluster: %d\n", "Sizes",
241 osb->sb->s_blocksize, osb->s_clustersize); 241 osb->sb->s_blocksize, osb->s_clustersize);
242 242
243 out += snprintf(buf + out, len - out, 243 out += snprintf(buf + out, len - out,
244 "%10s => Compat: 0x%X Incompat: 0x%X " 244 "%10s => Compat: 0x%X Incompat: 0x%X "
245 "ROcompat: 0x%X\n", 245 "ROcompat: 0x%X\n",
246 "Features", osb->s_feature_compat, 246 "Features", osb->s_feature_compat,
247 osb->s_feature_incompat, osb->s_feature_ro_compat); 247 osb->s_feature_incompat, osb->s_feature_ro_compat);
248 248
249 out += snprintf(buf + out, len - out, 249 out += snprintf(buf + out, len - out,
250 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", 250 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
251 osb->s_mount_opt, osb->s_atime_quantum); 251 osb->s_mount_opt, osb->s_atime_quantum);
252 252
253 if (cconn) { 253 if (cconn) {
254 out += snprintf(buf + out, len - out, 254 out += snprintf(buf + out, len - out,
255 "%10s => Stack: %s Name: %*s " 255 "%10s => Stack: %s Name: %*s "
256 "Version: %d.%d\n", "Cluster", 256 "Version: %d.%d\n", "Cluster",
257 (*osb->osb_cluster_stack == '\0' ? 257 (*osb->osb_cluster_stack == '\0' ?
258 "o2cb" : osb->osb_cluster_stack), 258 "o2cb" : osb->osb_cluster_stack),
259 cconn->cc_namelen, cconn->cc_name, 259 cconn->cc_namelen, cconn->cc_name,
260 cconn->cc_version.pv_major, 260 cconn->cc_version.pv_major,
261 cconn->cc_version.pv_minor); 261 cconn->cc_version.pv_minor);
262 } 262 }
263 263
264 spin_lock(&osb->dc_task_lock); 264 spin_lock(&osb->dc_task_lock);
265 out += snprintf(buf + out, len - out, 265 out += snprintf(buf + out, len - out,
266 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 266 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
267 "WorkSeq: %lu\n", "DownCnvt", 267 "WorkSeq: %lu\n", "DownCnvt",
268 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), 268 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
269 osb->blocked_lock_count, osb->dc_wake_sequence, 269 osb->blocked_lock_count, osb->dc_wake_sequence,
270 osb->dc_work_sequence); 270 osb->dc_work_sequence);
271 spin_unlock(&osb->dc_task_lock); 271 spin_unlock(&osb->dc_task_lock);
272 272
273 spin_lock(&osb->osb_lock); 273 spin_lock(&osb->osb_lock);
274 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", 274 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
275 "Recovery", 275 "Recovery",
276 (osb->recovery_thread_task ? 276 (osb->recovery_thread_task ?
277 task_pid_nr(osb->recovery_thread_task) : -1)); 277 task_pid_nr(osb->recovery_thread_task) : -1));
278 if (rm->rm_used == 0) 278 if (rm->rm_used == 0)
279 out += snprintf(buf + out, len - out, " None\n"); 279 out += snprintf(buf + out, len - out, " None\n");
280 else { 280 else {
281 for (i = 0; i < rm->rm_used; i++) 281 for (i = 0; i < rm->rm_used; i++)
282 out += snprintf(buf + out, len - out, " %d", 282 out += snprintf(buf + out, len - out, " %d",
283 rm->rm_entries[i]); 283 rm->rm_entries[i]);
284 out += snprintf(buf + out, len - out, "\n"); 284 out += snprintf(buf + out, len - out, "\n");
285 } 285 }
286 spin_unlock(&osb->osb_lock); 286 spin_unlock(&osb->osb_lock);
287 287
288 out += snprintf(buf + out, len - out, 288 out += snprintf(buf + out, len - out,
289 "%10s => Pid: %d Interval: %lu\n", "Commit", 289 "%10s => Pid: %d Interval: %lu\n", "Commit",
290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), 290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
291 osb->osb_commit_interval); 291 osb->osb_commit_interval);
292 292
293 out += snprintf(buf + out, len - out, 293 out += snprintf(buf + out, len - out,
294 "%10s => State: %d TxnId: %lu NumTxns: %d\n", 294 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
295 "Journal", osb->journal->j_state, 295 "Journal", osb->journal->j_state,
296 osb->journal->j_trans_id, 296 osb->journal->j_trans_id,
297 atomic_read(&osb->journal->j_num_trans)); 297 atomic_read(&osb->journal->j_num_trans));
298 298
299 out += snprintf(buf + out, len - out, 299 out += snprintf(buf + out, len - out,
300 "%10s => GlobalAllocs: %d LocalAllocs: %d " 300 "%10s => GlobalAllocs: %d LocalAllocs: %d "
301 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", 301 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
302 "Stats", 302 "Stats",
303 atomic_read(&osb->alloc_stats.bitmap_data), 303 atomic_read(&osb->alloc_stats.bitmap_data),
304 atomic_read(&osb->alloc_stats.local_data), 304 atomic_read(&osb->alloc_stats.local_data),
305 atomic_read(&osb->alloc_stats.bg_allocs), 305 atomic_read(&osb->alloc_stats.bg_allocs),
306 atomic_read(&osb->alloc_stats.moves), 306 atomic_read(&osb->alloc_stats.moves),
307 atomic_read(&osb->alloc_stats.bg_extends)); 307 atomic_read(&osb->alloc_stats.bg_extends));
308 308
309 out += snprintf(buf + out, len - out, 309 out += snprintf(buf + out, len - out,
310 "%10s => State: %u Descriptor: %llu Size: %u bits " 310 "%10s => State: %u Descriptor: %llu Size: %u bits "
311 "Default: %u bits\n", 311 "Default: %u bits\n",
312 "LocalAlloc", osb->local_alloc_state, 312 "LocalAlloc", osb->local_alloc_state,
313 (unsigned long long)osb->la_last_gd, 313 (unsigned long long)osb->la_last_gd,
314 osb->local_alloc_bits, osb->local_alloc_default_bits); 314 osb->local_alloc_bits, osb->local_alloc_default_bits);
315 315
316 spin_lock(&osb->osb_lock); 316 spin_lock(&osb->osb_lock);
317 out += snprintf(buf + out, len - out, 317 out += snprintf(buf + out, len - out,
318 "%10s => InodeSlot: %d StolenInodes: %d, " 318 "%10s => InodeSlot: %d StolenInodes: %d, "
319 "MetaSlot: %d StolenMeta: %d\n", "Steal", 319 "MetaSlot: %d StolenMeta: %d\n", "Steal",
320 osb->s_inode_steal_slot, 320 osb->s_inode_steal_slot,
321 atomic_read(&osb->s_num_inodes_stolen), 321 atomic_read(&osb->s_num_inodes_stolen),
322 osb->s_meta_steal_slot, 322 osb->s_meta_steal_slot,
323 atomic_read(&osb->s_num_meta_stolen)); 323 atomic_read(&osb->s_num_meta_stolen));
324 spin_unlock(&osb->osb_lock); 324 spin_unlock(&osb->osb_lock);
325 325
326 out += snprintf(buf + out, len - out, "OrphanScan => "); 326 out += snprintf(buf + out, len - out, "OrphanScan => ");
327 out += snprintf(buf + out, len - out, "Local: %u Global: %u ", 327 out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
328 os->os_count, os->os_seqno); 328 os->os_count, os->os_seqno);
329 out += snprintf(buf + out, len - out, " Last Scan: "); 329 out += snprintf(buf + out, len - out, " Last Scan: ");
330 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 330 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
331 out += snprintf(buf + out, len - out, "Disabled\n"); 331 out += snprintf(buf + out, len - out, "Disabled\n");
332 else 332 else
333 out += snprintf(buf + out, len - out, "%lu seconds ago\n", 333 out += snprintf(buf + out, len - out, "%lu seconds ago\n",
334 (get_seconds() - os->os_scantime.tv_sec)); 334 (get_seconds() - os->os_scantime.tv_sec));
335 335
336 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", 336 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
337 "Slots", "Num", "RecoGen"); 337 "Slots", "Num", "RecoGen");
338 for (i = 0; i < osb->max_slots; ++i) { 338 for (i = 0; i < osb->max_slots; ++i) {
339 out += snprintf(buf + out, len - out, 339 out += snprintf(buf + out, len - out,
340 "%10s %c %3d %10d\n", 340 "%10s %c %3d %10d\n",
341 " ", 341 " ",
342 (i == osb->slot_num ? '*' : ' '), 342 (i == osb->slot_num ? '*' : ' '),
343 i, osb->slot_recovery_generations[i]); 343 i, osb->slot_recovery_generations[i]);
344 } 344 }
345 345
346 return out; 346 return out;
347 } 347 }
348 348
349 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) 349 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
350 { 350 {
351 struct ocfs2_super *osb = inode->i_private; 351 struct ocfs2_super *osb = inode->i_private;
352 char *buf = NULL; 352 char *buf = NULL;
353 353
354 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 354 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
355 if (!buf) 355 if (!buf)
356 goto bail; 356 goto bail;
357 357
358 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); 358 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
359 359
360 file->private_data = buf; 360 file->private_data = buf;
361 361
362 return 0; 362 return 0;
363 bail: 363 bail:
364 return -ENOMEM; 364 return -ENOMEM;
365 } 365 }
366 366
367 static int ocfs2_debug_release(struct inode *inode, struct file *file) 367 static int ocfs2_debug_release(struct inode *inode, struct file *file)
368 { 368 {
369 kfree(file->private_data); 369 kfree(file->private_data);
370 return 0; 370 return 0;
371 } 371 }
372 372
373 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, 373 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
374 size_t nbytes, loff_t *ppos) 374 size_t nbytes, loff_t *ppos)
375 { 375 {
376 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, 376 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
377 i_size_read(file->f_mapping->host)); 377 i_size_read(file->f_mapping->host));
378 } 378 }
379 #else 379 #else
380 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) 380 static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
381 { 381 {
382 return 0; 382 return 0;
383 } 383 }
384 static int ocfs2_debug_release(struct inode *inode, struct file *file) 384 static int ocfs2_debug_release(struct inode *inode, struct file *file)
385 { 385 {
386 return 0; 386 return 0;
387 } 387 }
388 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, 388 static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
389 size_t nbytes, loff_t *ppos) 389 size_t nbytes, loff_t *ppos)
390 { 390 {
391 return 0; 391 return 0;
392 } 392 }
393 #endif /* CONFIG_DEBUG_FS */ 393 #endif /* CONFIG_DEBUG_FS */
394 394
395 static const struct file_operations ocfs2_osb_debug_fops = { 395 static const struct file_operations ocfs2_osb_debug_fops = {
396 .open = ocfs2_osb_debug_open, 396 .open = ocfs2_osb_debug_open,
397 .release = ocfs2_debug_release, 397 .release = ocfs2_debug_release,
398 .read = ocfs2_debug_read, 398 .read = ocfs2_debug_read,
399 .llseek = generic_file_llseek, 399 .llseek = generic_file_llseek,
400 }; 400 };
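These file_operations back the per-mount "fs_state" debugfs file created later in ocfs2_fill_super() via debugfs_create_file(): open() snapshots the superblock state into a kmalloc'ed page with ocfs2_osb_dump(), read() serves that buffer with simple_read_from_buffer(), and release() frees it; with CONFIG_DEBUG_FS disabled the stubs above turn the file into a no-op. With debugfs mounted (commonly at /sys/kernel/debug), the file is expected to appear under the per-volume directory named after osb->uuid_str, e.g. .../ocfs2/<uuid>/fs_state, though the exact path depends on where the ocfs2 debugfs root was created.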
401 401
402 static int ocfs2_sync_fs(struct super_block *sb, int wait) 402 static int ocfs2_sync_fs(struct super_block *sb, int wait)
403 { 403 {
404 int status; 404 int status;
405 tid_t target; 405 tid_t target;
406 struct ocfs2_super *osb = OCFS2_SB(sb); 406 struct ocfs2_super *osb = OCFS2_SB(sb);
407 407
408 if (ocfs2_is_hard_readonly(osb)) 408 if (ocfs2_is_hard_readonly(osb))
409 return -EROFS; 409 return -EROFS;
410 410
411 if (wait) { 411 if (wait) {
412 status = ocfs2_flush_truncate_log(osb); 412 status = ocfs2_flush_truncate_log(osb);
413 if (status < 0) 413 if (status < 0)
414 mlog_errno(status); 414 mlog_errno(status);
415 } else { 415 } else {
416 ocfs2_schedule_truncate_log_flush(osb, 0); 416 ocfs2_schedule_truncate_log_flush(osb, 0);
417 } 417 }
418 418
419 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, 419 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
420 &target)) { 420 &target)) {
421 if (wait) 421 if (wait)
422 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, 422 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
423 target); 423 target);
424 } 424 }
425 return 0; 425 return 0;
426 } 426 }
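In other words, a non-blocking sync (wait == 0) only schedules the truncate-log flush and lets jbd2 commit asynchronously, while a blocking sync flushes the truncate log directly and then waits for the commit whose tid jbd2_journal_start_commit() stored in target. jbd2_journal_start_commit() returns non-zero only when there is a running or committing transaction, so the wait is skipped entirely on an idle journal.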
427 427
428 static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) 428 static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
429 { 429 {
430 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) 430 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
431 && (ino == USER_QUOTA_SYSTEM_INODE 431 && (ino == USER_QUOTA_SYSTEM_INODE
432 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) 432 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
433 return 0; 433 return 0;
434 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 434 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
435 && (ino == GROUP_QUOTA_SYSTEM_INODE 435 && (ino == GROUP_QUOTA_SYSTEM_INODE
436 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) 436 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
437 return 0; 437 return 0;
438 return 1; 438 return 1;
439 } 439 }
440 440
441 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) 441 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
442 { 442 {
443 struct inode *new = NULL; 443 struct inode *new = NULL;
444 int status = 0; 444 int status = 0;
445 int i; 445 int i;
446 446
447 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 447 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
448 if (IS_ERR(new)) { 448 if (IS_ERR(new)) {
449 status = PTR_ERR(new); 449 status = PTR_ERR(new);
450 mlog_errno(status); 450 mlog_errno(status);
451 goto bail; 451 goto bail;
452 } 452 }
453 osb->root_inode = new; 453 osb->root_inode = new;
454 454
455 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 455 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
456 if (IS_ERR(new)) { 456 if (IS_ERR(new)) {
457 status = PTR_ERR(new); 457 status = PTR_ERR(new);
458 mlog_errno(status); 458 mlog_errno(status);
459 goto bail; 459 goto bail;
460 } 460 }
461 osb->sys_root_inode = new; 461 osb->sys_root_inode = new;
462 462
463 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; 463 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
464 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { 464 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
465 if (!ocfs2_need_system_inode(osb, i)) 465 if (!ocfs2_need_system_inode(osb, i))
466 continue; 466 continue;
467 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 467 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
468 if (!new) { 468 if (!new) {
469 ocfs2_release_system_inodes(osb); 469 ocfs2_release_system_inodes(osb);
470 status = -EINVAL; 470 status = -EINVAL;
471 mlog_errno(status); 471 mlog_errno(status);
472 /* FIXME: Should ERROR_RO_FS */ 472 /* FIXME: Should ERROR_RO_FS */
473 mlog(ML_ERROR, "Unable to load system inode %d, " 473 mlog(ML_ERROR, "Unable to load system inode %d, "
474 "possibly corrupt fs?", i); 474 "possibly corrupt fs?", i);
475 goto bail; 475 goto bail;
476 } 476 }
477 /* the array now has one ref, so drop this one */ 477 /* the array now has one ref, so drop this one */
478 iput(new); 478 iput(new);
479 } 479 }
480 480
481 bail: 481 bail:
482 if (status) 482 if (status)
483 mlog_errno(status); 483 mlog_errno(status);
484 return status; 484 return status;
485 } 485 }
486 486
487 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) 487 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
488 { 488 {
489 struct inode *new = NULL; 489 struct inode *new = NULL;
490 int status = 0; 490 int status = 0;
491 int i; 491 int i;
492 492
493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
494 i < NUM_SYSTEM_INODES; 494 i < NUM_SYSTEM_INODES;
495 i++) { 495 i++) {
496 if (!ocfs2_need_system_inode(osb, i)) 496 if (!ocfs2_need_system_inode(osb, i))
497 continue; 497 continue;
498 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 498 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
499 if (!new) { 499 if (!new) {
500 ocfs2_release_system_inodes(osb); 500 ocfs2_release_system_inodes(osb);
501 status = -EINVAL; 501 status = -EINVAL;
502 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", 502 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
503 status, i, osb->slot_num); 503 status, i, osb->slot_num);
504 goto bail; 504 goto bail;
505 } 505 }
506 /* the array now has one ref, so drop this one */ 506 /* the array now has one ref, so drop this one */
507 iput(new); 507 iput(new);
508 } 508 }
509 509
510 bail: 510 bail:
511 if (status) 511 if (status)
512 mlog_errno(status); 512 mlog_errno(status);
513 return status; 513 return status;
514 } 514 }
515 515
516 static void ocfs2_release_system_inodes(struct ocfs2_super *osb) 516 static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
517 { 517 {
518 int i; 518 int i;
519 struct inode *inode; 519 struct inode *inode;
520 520
521 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { 521 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
522 inode = osb->global_system_inodes[i]; 522 inode = osb->global_system_inodes[i];
523 if (inode) { 523 if (inode) {
524 iput(inode); 524 iput(inode);
525 osb->global_system_inodes[i] = NULL; 525 osb->global_system_inodes[i] = NULL;
526 } 526 }
527 } 527 }
528 528
529 inode = osb->sys_root_inode; 529 inode = osb->sys_root_inode;
530 if (inode) { 530 if (inode) {
531 iput(inode); 531 iput(inode);
532 osb->sys_root_inode = NULL; 532 osb->sys_root_inode = NULL;
533 } 533 }
534 534
535 inode = osb->root_inode; 535 inode = osb->root_inode;
536 if (inode) { 536 if (inode) {
537 iput(inode); 537 iput(inode);
538 osb->root_inode = NULL; 538 osb->root_inode = NULL;
539 } 539 }
540 540
541 if (!osb->local_system_inodes) 541 if (!osb->local_system_inodes)
542 return; 542 return;
543 543
544 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { 544 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
545 if (osb->local_system_inodes[i]) { 545 if (osb->local_system_inodes[i]) {
546 iput(osb->local_system_inodes[i]); 546 iput(osb->local_system_inodes[i]);
547 osb->local_system_inodes[i] = NULL; 547 osb->local_system_inodes[i] = NULL;
548 } 548 }
549 } 549 }
550 550
551 kfree(osb->local_system_inodes); 551 kfree(osb->local_system_inodes);
552 osb->local_system_inodes = NULL; 552 osb->local_system_inodes = NULL;
553 } 553 }
554 554
555 /* We're allocating fs objects, use GFP_NOFS */ 555 /* We're allocating fs objects, use GFP_NOFS */
556 static struct inode *ocfs2_alloc_inode(struct super_block *sb) 556 static struct inode *ocfs2_alloc_inode(struct super_block *sb)
557 { 557 {
558 struct ocfs2_inode_info *oi; 558 struct ocfs2_inode_info *oi;
559 559
560 oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); 560 oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0; 564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0; 565 oi->i_datasync_tid = 0;
566 566
567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
568 return &oi->vfs_inode; 568 return &oi->vfs_inode;
569 } 569 }
570 570
571 static void ocfs2_i_callback(struct rcu_head *head) 571 static void ocfs2_i_callback(struct rcu_head *head)
572 { 572 {
573 struct inode *inode = container_of(head, struct inode, i_rcu); 573 struct inode *inode = container_of(head, struct inode, i_rcu);
574 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 574 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
575 } 575 }
576 576
577 static void ocfs2_destroy_inode(struct inode *inode) 577 static void ocfs2_destroy_inode(struct inode *inode)
578 { 578 {
579 call_rcu(&inode->i_rcu, ocfs2_i_callback); 579 call_rcu(&inode->i_rcu, ocfs2_i_callback);
580 } 580 }
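The two-step teardown here follows the standard VFS pattern: lockless RCU-walk path lookups may still dereference the inode when ->destroy_inode() runs, so the actual kmem_cache_free() is deferred through call_rcu() on inode->i_rcu and only happens after a grace period, in ocfs2_i_callback().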
581 581
582 static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 582 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
583 unsigned int cbits) 583 unsigned int cbits)
584 { 584 {
585 unsigned int bytes = 1 << cbits; 585 unsigned int bytes = 1 << cbits;
586 unsigned int trim = bytes; 586 unsigned int trim = bytes;
587 unsigned int bitshift = 32; 587 unsigned int bitshift = 32;
588 588
589 /* 589 /*
590 * i_size and all block offsets in ocfs2 are always 64 bits 590 * i_size and all block offsets in ocfs2 are always 64 bits
591 * wide. i_clusters is 32 bits, in cluster-sized units. So on 591 * wide. i_clusters is 32 bits, in cluster-sized units. So on
592 * 64 bit platforms, cluster size will be the limiting factor. 592 * 64 bit platforms, cluster size will be the limiting factor.
593 */ 593 */
594 594
595 #if BITS_PER_LONG == 32 595 #if BITS_PER_LONG == 32
596 # if defined(CONFIG_LBDAF) 596 # if defined(CONFIG_LBDAF)
597 BUILD_BUG_ON(sizeof(sector_t) != 8); 597 BUILD_BUG_ON(sizeof(sector_t) != 8);
598 /* 598 /*
599 * We might be limited by page cache size. 599 * We might be limited by page cache size.
600 */ 600 */
601 if (bytes > PAGE_CACHE_SIZE) { 601 if (bytes > PAGE_CACHE_SIZE) {
602 bytes = PAGE_CACHE_SIZE; 602 bytes = PAGE_CACHE_SIZE;
603 trim = 1; 603 trim = 1;
604 /* 604 /*
605 * Shift by 31 here so that we don't get larger than 605 * Shift by 31 here so that we don't get larger than
606 * MAX_LFS_FILESIZE 606 * MAX_LFS_FILESIZE
607 */ 607 */
608 bitshift = 31; 608 bitshift = 31;
609 } 609 }
610 # else 610 # else
611 /* 611 /*
612 * We are limited by the size of sector_t. Use block size, as 612 * We are limited by the size of sector_t. Use block size, as
613 * that's what we expose to the VFS. 613 * that's what we expose to the VFS.
614 */ 614 */
615 bytes = 1 << bbits; 615 bytes = 1 << bbits;
616 trim = 1; 616 trim = 1;
617 bitshift = 31; 617 bitshift = 31;
618 # endif 618 # endif
619 #endif 619 #endif
620 620
621 /* 621 /*
622 * Trim by a whole cluster when we can actually approach the 622 * Trim by a whole cluster when we can actually approach the
623 * on-disk limits. Otherwise we can overflow i_clusters when 623 * on-disk limits. Otherwise we can overflow i_clusters when
624 * an extent start is at the max offset. 624 * an extent start is at the max offset.
625 */ 625 */
626 return (((unsigned long long)bytes) << bitshift) - trim; 626 return (((unsigned long long)bytes) << bitshift) - trim;
627 } 627 }
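Worked example: with 4 KB clusters (cbits == 12) on a 64-bit kernel, none of the #if branches apply, so bytes = 4096, bitshift = 32 and trim = 4096, and the function returns (4096ULL << 32) - 4096 = 2^44 - 2^12 bytes, i.e. 16 TiB minus one cluster. On 32-bit kernels the #if block above trims the shift to 31 when the cluster size exceeds the page cache page size (or, without CONFIG_LBDAF, unconditionally falls back to the block size), keeping the result within MAX_LFS_FILESIZE and the range of sector_t.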
628 628
629 static int ocfs2_remount(struct super_block *sb, int *flags, char *data) 629 static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
630 { 630 {
631 int incompat_features; 631 int incompat_features;
632 int ret = 0; 632 int ret = 0;
633 struct mount_options parsed_options; 633 struct mount_options parsed_options;
634 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
635 u32 tmp; 635 u32 tmp;
636 636
637 sync_filesystem(sb); 637 sync_filesystem(sb);
638 638
639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
640 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
641 ret = -EINVAL; 641 ret = -EINVAL;
642 goto out; 642 goto out;
643 } 643 }
644 644
645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
646 OCFS2_MOUNT_HB_NONE; 646 OCFS2_MOUNT_HB_NONE;
647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { 647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
648 ret = -EINVAL; 648 ret = -EINVAL;
649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
650 goto out; 650 goto out;
651 } 651 }
652 652
653 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != 653 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
654 (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { 654 (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
655 ret = -EINVAL; 655 ret = -EINVAL;
656 mlog(ML_ERROR, "Cannot change data mode on remount\n"); 656 mlog(ML_ERROR, "Cannot change data mode on remount\n");
657 goto out; 657 goto out;
658 } 658 }
659 659
660 /* Probably don't want this on remount; it might 660 /* Probably don't want this on remount; it might
661 * mess with other nodes */ 661 * mess with other nodes */
662 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && 662 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
663 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { 663 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
664 ret = -EINVAL; 664 ret = -EINVAL;
665 mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); 665 mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
666 goto out; 666 goto out;
667 } 667 }
668 668
669 /* We're going to/from readonly mode. */ 669 /* We're going to/from readonly mode. */
670 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 670 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
671 /* Disable quota accounting before remounting RO */ 671 /* Disable quota accounting before remounting RO */
672 if (*flags & MS_RDONLY) { 672 if (*flags & MS_RDONLY) {
673 ret = ocfs2_susp_quotas(osb, 0); 673 ret = ocfs2_susp_quotas(osb, 0);
674 if (ret < 0) 674 if (ret < 0)
675 goto out; 675 goto out;
676 } 676 }
677 /* Lock here so the check of HARD_RO and the potential 677 /* Lock here so the check of HARD_RO and the potential
678 * setting of SOFT_RO is atomic. */ 678 * setting of SOFT_RO is atomic. */
679 spin_lock(&osb->osb_lock); 679 spin_lock(&osb->osb_lock);
680 if (osb->osb_flags & OCFS2_OSB_HARD_RO) { 680 if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
681 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); 681 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
682 ret = -EROFS; 682 ret = -EROFS;
683 goto unlock_osb; 683 goto unlock_osb;
684 } 684 }
685 685
686 if (*flags & MS_RDONLY) { 686 if (*flags & MS_RDONLY) {
687 sb->s_flags |= MS_RDONLY; 687 sb->s_flags |= MS_RDONLY;
688 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 688 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
689 } else { 689 } else {
690 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { 690 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
691 mlog(ML_ERROR, "Cannot remount RDWR " 691 mlog(ML_ERROR, "Cannot remount RDWR "
692 "filesystem due to previous errors.\n"); 692 "filesystem due to previous errors.\n");
693 ret = -EROFS; 693 ret = -EROFS;
694 goto unlock_osb; 694 goto unlock_osb;
695 } 695 }
696 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); 696 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
697 if (incompat_features) { 697 if (incompat_features) {
698 mlog(ML_ERROR, "Cannot remount RDWR because " 698 mlog(ML_ERROR, "Cannot remount RDWR because "
699 "of unsupported optional features " 699 "of unsupported optional features "
700 "(%x).\n", incompat_features); 700 "(%x).\n", incompat_features);
701 ret = -EINVAL; 701 ret = -EINVAL;
702 goto unlock_osb; 702 goto unlock_osb;
703 } 703 }
704 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
705 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; 705 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
706 } 706 }
707 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); 707 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
708 unlock_osb: 708 unlock_osb:
709 spin_unlock(&osb->osb_lock); 709 spin_unlock(&osb->osb_lock);
710 /* Enable quota accounting after remounting RW */ 710 /* Enable quota accounting after remounting RW */
711 if (!ret && !(*flags & MS_RDONLY)) { 711 if (!ret && !(*flags & MS_RDONLY)) {
712 if (sb_any_quota_suspended(sb)) 712 if (sb_any_quota_suspended(sb))
713 ret = ocfs2_susp_quotas(osb, 1); 713 ret = ocfs2_susp_quotas(osb, 1);
714 else 714 else
715 ret = ocfs2_enable_quotas(osb); 715 ret = ocfs2_enable_quotas(osb);
716 if (ret < 0) { 716 if (ret < 0) {
717 /* Roll back the changes... */ 717 /* Roll back the changes... */
718 spin_lock(&osb->osb_lock); 718 spin_lock(&osb->osb_lock);
719 sb->s_flags |= MS_RDONLY; 719 sb->s_flags |= MS_RDONLY;
720 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 720 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
721 spin_unlock(&osb->osb_lock); 721 spin_unlock(&osb->osb_lock);
722 goto out; 722 goto out;
723 } 723 }
724 } 724 }
725 } 725 }
726 726
727 if (!ret) { 727 if (!ret) {
728 /* Only save off the new mount options in case of a successful 728 /* Only save off the new mount options in case of a successful
729 * remount. */ 729 * remount. */
730 osb->s_mount_opt = parsed_options.mount_opt; 730 osb->s_mount_opt = parsed_options.mount_opt;
731 osb->s_atime_quantum = parsed_options.atime_quantum; 731 osb->s_atime_quantum = parsed_options.atime_quantum;
732 osb->preferred_slot = parsed_options.slot; 732 osb->preferred_slot = parsed_options.slot;
733 if (parsed_options.commit_interval) 733 if (parsed_options.commit_interval)
734 osb->osb_commit_interval = parsed_options.commit_interval; 734 osb->osb_commit_interval = parsed_options.commit_interval;
735 735
736 if (!ocfs2_is_hard_readonly(osb)) 736 if (!ocfs2_is_hard_readonly(osb))
737 ocfs2_set_journal_params(osb); 737 ocfs2_set_journal_params(osb);
738 738
739 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 739 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
740 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 740 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
741 MS_POSIXACL : 0); 741 MS_POSIXACL : 0);
742 } 742 }
743 out: 743 out:
744 return ret; 744 return ret;
745 } 745 }
746 746
747 static int ocfs2_sb_probe(struct super_block *sb, 747 static int ocfs2_sb_probe(struct super_block *sb,
748 struct buffer_head **bh, 748 struct buffer_head **bh,
749 int *sector_size, 749 int *sector_size,
750 struct ocfs2_blockcheck_stats *stats) 750 struct ocfs2_blockcheck_stats *stats)
751 { 751 {
752 int status, tmpstat; 752 int status, tmpstat;
753 struct ocfs1_vol_disk_hdr *hdr; 753 struct ocfs1_vol_disk_hdr *hdr;
754 struct ocfs2_dinode *di; 754 struct ocfs2_dinode *di;
755 int blksize; 755 int blksize;
756 756
757 *bh = NULL; 757 *bh = NULL;
758 758
759 /* may be > 512 */ 759 /* may be > 512 */
760 *sector_size = bdev_logical_block_size(sb->s_bdev); 760 *sector_size = bdev_logical_block_size(sb->s_bdev);
761 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 761 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
762 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 762 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
763 *sector_size, OCFS2_MAX_BLOCKSIZE); 763 *sector_size, OCFS2_MAX_BLOCKSIZE);
764 status = -EINVAL; 764 status = -EINVAL;
765 goto bail; 765 goto bail;
766 } 766 }
767 767
768 /* Can this really happen? */ 768 /* Can this really happen? */
769 if (*sector_size < OCFS2_MIN_BLOCKSIZE) 769 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
770 *sector_size = OCFS2_MIN_BLOCKSIZE; 770 *sector_size = OCFS2_MIN_BLOCKSIZE;
771 771
772 /* check block zero for old format */ 772 /* check block zero for old format */
773 status = ocfs2_get_sector(sb, bh, 0, *sector_size); 773 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
774 if (status < 0) { 774 if (status < 0) {
775 mlog_errno(status); 775 mlog_errno(status);
776 goto bail; 776 goto bail;
777 } 777 }
778 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; 778 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
779 if (hdr->major_version == OCFS1_MAJOR_VERSION) { 779 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
780 mlog(ML_ERROR, "incompatible version: %u.%u\n", 780 mlog(ML_ERROR, "incompatible version: %u.%u\n",
781 hdr->major_version, hdr->minor_version); 781 hdr->major_version, hdr->minor_version);
782 status = -EINVAL; 782 status = -EINVAL;
783 } 783 }
784 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, 784 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
785 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { 785 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
786 mlog(ML_ERROR, "incompatible volume signature: %8s\n", 786 mlog(ML_ERROR, "incompatible volume signature: %8s\n",
787 hdr->signature); 787 hdr->signature);
788 status = -EINVAL; 788 status = -EINVAL;
789 } 789 }
790 brelse(*bh); 790 brelse(*bh);
791 *bh = NULL; 791 *bh = NULL;
792 if (status < 0) { 792 if (status < 0) {
793 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " 793 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
794 "upgraded before mounting with ocfs v2\n"); 794 "upgraded before mounting with ocfs v2\n");
795 goto bail; 795 goto bail;
796 } 796 }
797 797
798 /* 798 /*
799 * Now check at magic offset for 512, 1024, 2048, 4096 799 * Now check at magic offset for 512, 1024, 2048, 4096
800 * blocksizes. 4096 is the maximum blocksize because it is 800 * blocksizes. 4096 is the maximum blocksize because it is
801 * the minimum clustersize. 801 * the minimum clustersize.
802 */ 802 */
803 status = -EINVAL; 803 status = -EINVAL;
804 for (blksize = *sector_size; 804 for (blksize = *sector_size;
805 blksize <= OCFS2_MAX_BLOCKSIZE; 805 blksize <= OCFS2_MAX_BLOCKSIZE;
806 blksize <<= 1) { 806 blksize <<= 1) {
807 tmpstat = ocfs2_get_sector(sb, bh, 807 tmpstat = ocfs2_get_sector(sb, bh,
808 OCFS2_SUPER_BLOCK_BLKNO, 808 OCFS2_SUPER_BLOCK_BLKNO,
809 blksize); 809 blksize);
810 if (tmpstat < 0) { 810 if (tmpstat < 0) {
811 status = tmpstat; 811 status = tmpstat;
812 mlog_errno(status); 812 mlog_errno(status);
813 break; 813 break;
814 } 814 }
815 di = (struct ocfs2_dinode *) (*bh)->b_data; 815 di = (struct ocfs2_dinode *) (*bh)->b_data;
816 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); 816 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
817 spin_lock_init(&stats->b_lock); 817 spin_lock_init(&stats->b_lock);
818 tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); 818 tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
819 if (tmpstat < 0) { 819 if (tmpstat < 0) {
820 brelse(*bh); 820 brelse(*bh);
821 *bh = NULL; 821 *bh = NULL;
822 } 822 }
823 if (tmpstat != -EAGAIN) { 823 if (tmpstat != -EAGAIN) {
824 status = tmpstat; 824 status = tmpstat;
825 break; 825 break;
826 } 826 }
827 } 827 }
828 828
829 bail: 829 bail:
830 return status; 830 return status;
831 } 831 }
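The probe strategy above: block 0 is read at the device's logical sector size and rejected if it carries an old OCFS (v1) signature or major version, then the superblock block (OCFS2_SUPER_BLOCK_BLKNO) is re-read at every candidate block size from the sector size up to OCFS2_MAX_BLOCKSIZE, doubling each time. A return of -EAGAIN from ocfs2_verify_volume() is treated as "no valid ocfs2 superblock at this block size, try the next one"; any other value, success or a hard error, ends the loop and becomes the probe result, with *bh left pointing at the verified superblock buffer on success.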
832 832
833 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 833 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
834 { 834 {
835 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; 835 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
836 836
837 if (osb->s_mount_opt & hb_enabled) { 837 if (osb->s_mount_opt & hb_enabled) {
838 if (ocfs2_mount_local(osb)) { 838 if (ocfs2_mount_local(osb)) {
839 mlog(ML_ERROR, "Cannot heartbeat on a locally " 839 mlog(ML_ERROR, "Cannot heartbeat on a locally "
840 "mounted device.\n"); 840 "mounted device.\n");
841 return -EINVAL; 841 return -EINVAL;
842 } 842 }
843 if (ocfs2_userspace_stack(osb)) { 843 if (ocfs2_userspace_stack(osb)) {
844 mlog(ML_ERROR, "Userspace stack expected, but " 844 mlog(ML_ERROR, "Userspace stack expected, but "
845 "o2cb heartbeat arguments passed to mount\n"); 845 "o2cb heartbeat arguments passed to mount\n");
846 return -EINVAL; 846 return -EINVAL;
847 } 847 }
848 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && 848 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
849 !ocfs2_cluster_o2cb_global_heartbeat(osb)) || 849 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
850 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && 850 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
851 ocfs2_cluster_o2cb_global_heartbeat(osb))) { 851 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
852 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); 852 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
853 return -EINVAL; 853 return -EINVAL;
854 } 854 }
855 } 855 }
856 856
857 if (!(osb->s_mount_opt & hb_enabled)) { 857 if (!(osb->s_mount_opt & hb_enabled)) {
858 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 858 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
859 !ocfs2_userspace_stack(osb)) { 859 !ocfs2_userspace_stack(osb)) {
860 mlog(ML_ERROR, "Heartbeat has to be started to mount " 860 mlog(ML_ERROR, "Heartbeat has to be started to mount "
861 "a read-write clustered device.\n"); 861 "a read-write clustered device.\n");
862 return -EINVAL; 862 return -EINVAL;
863 } 863 }
864 } 864 }
865 865
866 return 0; 866 return 0;
867 } 867 }
868 868
869 /* 869 /*
870 * If we're using a userspace stack, mount should have passed 870 * If we're using a userspace stack, mount should have passed
871 * a name that matches the disk. If not, mount should not 871 * a name that matches the disk. If not, mount should not
872 * have passed a stack. 872 * have passed a stack.
873 */ 873 */
874 static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, 874 static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
875 struct mount_options *mopt) 875 struct mount_options *mopt)
876 { 876 {
877 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { 877 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
878 mlog(ML_ERROR, 878 mlog(ML_ERROR,
879 "cluster stack passed to mount, but this filesystem " 879 "cluster stack passed to mount, but this filesystem "
880 "does not support it\n"); 880 "does not support it\n");
881 return -EINVAL; 881 return -EINVAL;
882 } 882 }
883 883
884 if (ocfs2_userspace_stack(osb) && 884 if (ocfs2_userspace_stack(osb) &&
885 strncmp(osb->osb_cluster_stack, mopt->cluster_stack, 885 strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
886 OCFS2_STACK_LABEL_LEN)) { 886 OCFS2_STACK_LABEL_LEN)) {
887 mlog(ML_ERROR, 887 mlog(ML_ERROR,
888 "cluster stack passed to mount (\"%s\") does not " 888 "cluster stack passed to mount (\"%s\") does not "
889 "match the filesystem (\"%s\")\n", 889 "match the filesystem (\"%s\")\n",
890 mopt->cluster_stack, 890 mopt->cluster_stack,
891 osb->osb_cluster_stack); 891 osb->osb_cluster_stack);
892 return -EINVAL; 892 return -EINVAL;
893 } 893 }
894 894
895 return 0; 895 return 0;
896 } 896 }
897 897
898 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) 898 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
899 { 899 {
900 int type; 900 int type;
901 struct super_block *sb = osb->sb; 901 struct super_block *sb = osb->sb;
902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
904 int status = 0; 904 int status = 0;
905 905
906 for (type = 0; type < MAXQUOTAS; type++) { 906 for (type = 0; type < MAXQUOTAS; type++) {
907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
908 continue; 908 continue;
909 if (unsuspend) 909 if (unsuspend)
910 status = dquot_resume(sb, type); 910 status = dquot_resume(sb, type);
911 else { 911 else {
912 struct ocfs2_mem_dqinfo *oinfo; 912 struct ocfs2_mem_dqinfo *oinfo;
913 913
914 /* Cancel periodic syncing before suspending */ 914 /* Cancel periodic syncing before suspending */
915 oinfo = sb_dqinfo(sb, type)->dqi_priv; 915 oinfo = sb_dqinfo(sb, type)->dqi_priv;
916 cancel_delayed_work_sync(&oinfo->dqi_sync_work); 916 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
917 status = dquot_suspend(sb, type); 917 status = dquot_suspend(sb, type);
918 } 918 }
919 if (status < 0) 919 if (status < 0)
920 break; 920 break;
921 } 921 }
922 if (status < 0) 922 if (status < 0)
923 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " 923 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
924 "remount (error = %d).\n", status); 924 "remount (error = %d).\n", status);
925 return status; 925 return status;
926 } 926 }
927 927
928 static int ocfs2_enable_quotas(struct ocfs2_super *osb) 928 static int ocfs2_enable_quotas(struct ocfs2_super *osb)
929 { 929 {
930 struct inode *inode[MAXQUOTAS] = { NULL, NULL }; 930 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
931 struct super_block *sb = osb->sb; 931 struct super_block *sb = osb->sb;
932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
935 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 935 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
936 int status; 936 int status;
937 int type; 937 int type;
938 938
939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; 939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
940 for (type = 0; type < MAXQUOTAS; type++) { 940 for (type = 0; type < MAXQUOTAS; type++) {
941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
942 continue; 942 continue;
943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type], 943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
944 osb->slot_num); 944 osb->slot_num);
945 if (!inode[type]) { 945 if (!inode[type]) {
946 status = -ENOENT; 946 status = -ENOENT;
947 goto out_quota_off; 947 goto out_quota_off;
948 } 948 }
949 status = dquot_enable(inode[type], type, QFMT_OCFS2, 949 status = dquot_enable(inode[type], type, QFMT_OCFS2,
950 DQUOT_USAGE_ENABLED); 950 DQUOT_USAGE_ENABLED);
951 if (status < 0) 951 if (status < 0)
952 goto out_quota_off; 952 goto out_quota_off;
953 } 953 }
954 954
955 for (type = 0; type < MAXQUOTAS; type++) 955 for (type = 0; type < MAXQUOTAS; type++)
956 iput(inode[type]); 956 iput(inode[type]);
957 return 0; 957 return 0;
958 out_quota_off: 958 out_quota_off:
959 ocfs2_disable_quotas(osb); 959 ocfs2_disable_quotas(osb);
960 for (type = 0; type < MAXQUOTAS; type++) 960 for (type = 0; type < MAXQUOTAS; type++)
961 iput(inode[type]); 961 iput(inode[type]);
962 mlog_errno(status); 962 mlog_errno(status);
963 return status; 963 return status;
964 } 964 }
965 965
966 static void ocfs2_disable_quotas(struct ocfs2_super *osb) 966 static void ocfs2_disable_quotas(struct ocfs2_super *osb)
967 { 967 {
968 int type; 968 int type;
969 struct inode *inode; 969 struct inode *inode;
970 struct super_block *sb = osb->sb; 970 struct super_block *sb = osb->sb;
971 struct ocfs2_mem_dqinfo *oinfo; 971 struct ocfs2_mem_dqinfo *oinfo;
972 972
973 /* We mostly ignore errors in this function because there's not much 973 /* We mostly ignore errors in this function because there's not much
974 * we can do when we see them */ 974 * we can do when we see them */
975 for (type = 0; type < MAXQUOTAS; type++) { 975 for (type = 0; type < MAXQUOTAS; type++) {
976 if (!sb_has_quota_loaded(sb, type)) 976 if (!sb_has_quota_loaded(sb, type))
977 continue; 977 continue;
978 /* Cancel periodic syncing before we grab dqonoff_mutex */ 978 /* Cancel periodic syncing before we grab dqonoff_mutex */
979 oinfo = sb_dqinfo(sb, type)->dqi_priv; 979 oinfo = sb_dqinfo(sb, type)->dqi_priv;
980 cancel_delayed_work_sync(&oinfo->dqi_sync_work); 980 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
981 inode = igrab(sb->s_dquot.files[type]); 981 inode = igrab(sb->s_dquot.files[type]);
982 /* Turn off quotas. This will remove all dquot structures from 982 /* Turn off quotas. This will remove all dquot structures from
983 * memory and so they will be automatically synced to global 983 * memory and so they will be automatically synced to global
984 * quota files */ 984 * quota files */
985 dquot_disable(sb, type, DQUOT_USAGE_ENABLED | 985 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
986 DQUOT_LIMITS_ENABLED); 986 DQUOT_LIMITS_ENABLED);
987 if (!inode) 987 if (!inode)
988 continue; 988 continue;
989 iput(inode); 989 iput(inode);
990 } 990 }
991 } 991 }
992 992
993 /* Handle quota on quotactl */ 993 /* Handle quota on quotactl */
994 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) 994 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
995 { 995 {
996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
998 998
999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1000 return -EINVAL; 1000 return -EINVAL;
1001 1001
1002 return dquot_enable(sb_dqopt(sb)->files[type], type, 1002 return dquot_enable(sb_dqopt(sb)->files[type], type,
1003 format_id, DQUOT_LIMITS_ENABLED); 1003 format_id, DQUOT_LIMITS_ENABLED);
1004 } 1004 }
1005 1005
1006 /* Handle quota off quotactl */ 1006 /* Handle quota off quotactl */
1007 static int ocfs2_quota_off(struct super_block *sb, int type) 1007 static int ocfs2_quota_off(struct super_block *sb, int type)
1008 { 1008 {
1009 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); 1009 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1010 } 1010 }
1011 1011
1012 static const struct quotactl_ops ocfs2_quotactl_ops = { 1012 static const struct quotactl_ops ocfs2_quotactl_ops = {
1013 .quota_on_meta = ocfs2_quota_on, 1013 .quota_on_meta = ocfs2_quota_on,
1014 .quota_off = ocfs2_quota_off, 1014 .quota_off = ocfs2_quota_off,
1015 .quota_sync = dquot_quota_sync, 1015 .quota_sync = dquot_quota_sync,
1016 .get_info = dquot_get_dqinfo, 1016 .get_info = dquot_get_dqinfo,
1017 .set_info = dquot_set_dqinfo, 1017 .set_info = dquot_set_dqinfo,
1018 .get_dqblk = dquot_get_dqblk, 1018 .get_dqblk = dquot_get_dqblk,
1019 .set_dqblk = dquot_set_dqblk, 1019 .set_dqblk = dquot_set_dqblk,
1020 }; 1020 };
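Because DQUOT_QUOTA_SYS_FILE is set and the quota files are ocfs2 system inodes, usage accounting is switched on at mount time (ocfs2_enable_quotas() enables DQUOT_USAGE_ENABLED) whenever the corresponding RO-compat feature bit is present. The quotactl hooks above therefore only toggle limit enforcement, which is why ocfs2_quota_on() and ocfs2_quota_off() pass DQUOT_LIMITS_ENABLED instead of disabling usage tracking.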
1021 1021
1022 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1022 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1023 { 1023 {
1024 struct dentry *root; 1024 struct dentry *root;
1025 int status, sector_size; 1025 int status, sector_size;
1026 struct mount_options parsed_options; 1026 struct mount_options parsed_options;
1027 struct inode *inode = NULL; 1027 struct inode *inode = NULL;
1028 struct ocfs2_super *osb = NULL; 1028 struct ocfs2_super *osb = NULL;
1029 struct buffer_head *bh = NULL; 1029 struct buffer_head *bh = NULL;
1030 char nodestr[12]; 1030 char nodestr[12];
1031 struct ocfs2_blockcheck_stats stats; 1031 struct ocfs2_blockcheck_stats stats;
1032 1032
1033 trace_ocfs2_fill_super(sb, data, silent); 1033 trace_ocfs2_fill_super(sb, data, silent);
1034 1034
1035 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { 1035 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
1036 status = -EINVAL; 1036 status = -EINVAL;
1037 goto read_super_error; 1037 goto read_super_error;
1038 } 1038 }
1039 1039
1040 /* probe for superblock */ 1040 /* probe for superblock */
1041 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); 1041 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
1042 if (status < 0) { 1042 if (status < 0) {
1043 mlog(ML_ERROR, "superblock probe failed!\n"); 1043 mlog(ML_ERROR, "superblock probe failed!\n");
1044 goto read_super_error; 1044 goto read_super_error;
1045 } 1045 }
1046 1046
1047 status = ocfs2_initialize_super(sb, bh, sector_size, &stats); 1047 status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
1048 osb = OCFS2_SB(sb); 1048 osb = OCFS2_SB(sb);
1049 if (status < 0) { 1049 if (status < 0) {
1050 mlog_errno(status); 1050 mlog_errno(status);
1051 goto read_super_error; 1051 goto read_super_error;
1052 } 1052 }
1053 brelse(bh); 1053 brelse(bh);
1054 bh = NULL; 1054 bh = NULL;
1055 1055
1056 if (!ocfs2_check_set_options(sb, &parsed_options)) { 1056 if (!ocfs2_check_set_options(sb, &parsed_options)) {
1057 status = -EINVAL; 1057 status = -EINVAL;
1058 goto read_super_error; 1058 goto read_super_error;
1059 } 1059 }
1060 osb->s_mount_opt = parsed_options.mount_opt; 1060 osb->s_mount_opt = parsed_options.mount_opt;
1061 osb->s_atime_quantum = parsed_options.atime_quantum; 1061 osb->s_atime_quantum = parsed_options.atime_quantum;
1062 osb->preferred_slot = parsed_options.slot; 1062 osb->preferred_slot = parsed_options.slot;
1063 osb->osb_commit_interval = parsed_options.commit_interval; 1063 osb->osb_commit_interval = parsed_options.commit_interval;
1064 1064
1065 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); 1065 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1066 osb->osb_resv_level = parsed_options.resv_level; 1066 osb->osb_resv_level = parsed_options.resv_level;
1067 osb->osb_dir_resv_level = parsed_options.resv_level; 1067 osb->osb_dir_resv_level = parsed_options.resv_level;
1068 if (parsed_options.dir_resv_level == -1) 1068 if (parsed_options.dir_resv_level == -1)
1069 osb->osb_dir_resv_level = parsed_options.resv_level; 1069 osb->osb_dir_resv_level = parsed_options.resv_level;
1070 else 1070 else
1071 osb->osb_dir_resv_level = parsed_options.dir_resv_level; 1071 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1072 1072
1073 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1073 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1074 if (status) 1074 if (status)
1075 goto read_super_error; 1075 goto read_super_error;
1076 1076
1077 sb->s_magic = OCFS2_SUPER_MAGIC; 1077 sb->s_magic = OCFS2_SUPER_MAGIC;
1078 1078
1079 sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | 1079 sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
1080 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1080 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1081 1081
1082 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 1082 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
1083 * heartbeat=none */ 1083 * heartbeat=none */
1084 if (bdev_read_only(sb->s_bdev)) { 1084 if (bdev_read_only(sb->s_bdev)) {
1085 if (!(sb->s_flags & MS_RDONLY)) { 1085 if (!(sb->s_flags & MS_RDONLY)) {
1086 status = -EACCES; 1086 status = -EACCES;
1087 mlog(ML_ERROR, "Readonly device detected but readonly " 1087 mlog(ML_ERROR, "Readonly device detected but readonly "
1088 "mount was not specified.\n"); 1088 "mount was not specified.\n");
1089 goto read_super_error; 1089 goto read_super_error;
1090 } 1090 }
1091 1091
1092 /* You should not be able to start a local heartbeat 1092 /* You should not be able to start a local heartbeat
1093 * on a readonly device. */ 1093 * on a readonly device. */
1094 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 1094 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
1095 status = -EROFS; 1095 status = -EROFS;
1096 mlog(ML_ERROR, "Local heartbeat specified on readonly " 1096 mlog(ML_ERROR, "Local heartbeat specified on readonly "
1097 "device.\n"); 1097 "device.\n");
1098 goto read_super_error; 1098 goto read_super_error;
1099 } 1099 }
1100 1100
1101 status = ocfs2_check_journals_nolocks(osb); 1101 status = ocfs2_check_journals_nolocks(osb);
1102 if (status < 0) { 1102 if (status < 0) {
1103 if (status == -EROFS) 1103 if (status == -EROFS)
1104 mlog(ML_ERROR, "Recovery required on readonly " 1104 mlog(ML_ERROR, "Recovery required on readonly "
1105 "file system, but write access is " 1105 "file system, but write access is "
1106 "unavailable.\n"); 1106 "unavailable.\n");
1107 else 1107 else
1108 mlog_errno(status); 1108 mlog_errno(status);
1109 goto read_super_error; 1109 goto read_super_error;
1110 } 1110 }
1111 1111
1112 ocfs2_set_ro_flag(osb, 1); 1112 ocfs2_set_ro_flag(osb, 1);
1113 1113
1114 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " 1114 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1115 "Cluster services will not be used for this mount. " 1115 "Cluster services will not be used for this mount. "
1116 "Recovery will be skipped.\n", osb->dev_str); 1116 "Recovery will be skipped.\n", osb->dev_str);
1117 } 1117 }
1118 1118
1119 if (!ocfs2_is_hard_readonly(osb)) { 1119 if (!ocfs2_is_hard_readonly(osb)) {
1120 if (sb->s_flags & MS_RDONLY) 1120 if (sb->s_flags & MS_RDONLY)
1121 ocfs2_set_ro_flag(osb, 0); 1121 ocfs2_set_ro_flag(osb, 0);
1122 } 1122 }
1123 1123
1124 status = ocfs2_verify_heartbeat(osb); 1124 status = ocfs2_verify_heartbeat(osb);
1125 if (status < 0) { 1125 if (status < 0) {
1126 mlog_errno(status); 1126 mlog_errno(status);
1127 goto read_super_error; 1127 goto read_super_error;
1128 } 1128 }
1129 1129
1130 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1130 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1131 ocfs2_debugfs_root); 1131 ocfs2_debugfs_root);
1132 if (!osb->osb_debug_root) { 1132 if (!osb->osb_debug_root) {
1133 status = -EINVAL; 1133 status = -EINVAL;
1134 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1134 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1135 goto read_super_error; 1135 goto read_super_error;
1136 } 1136 }
1137 1137
1138 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, 1138 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1139 osb->osb_debug_root, 1139 osb->osb_debug_root,
1140 osb, 1140 osb,
1141 &ocfs2_osb_debug_fops); 1141 &ocfs2_osb_debug_fops);
1142 if (!osb->osb_ctxt) { 1142 if (!osb->osb_ctxt) {
1143 status = -EINVAL; 1143 status = -EINVAL;
1144 mlog_errno(status); 1144 mlog_errno(status);
1145 goto read_super_error; 1145 goto read_super_error;
1146 } 1146 }
1147 1147
1148 if (ocfs2_meta_ecc(osb)) { 1148 if (ocfs2_meta_ecc(osb)) {
1149 status = ocfs2_blockcheck_stats_debugfs_install( 1149 status = ocfs2_blockcheck_stats_debugfs_install(
1150 &osb->osb_ecc_stats, 1150 &osb->osb_ecc_stats,
1151 osb->osb_debug_root); 1151 osb->osb_debug_root);
1152 if (status) { 1152 if (status) {
1153 mlog(ML_ERROR, 1153 mlog(ML_ERROR,
1154 "Unable to create blockcheck statistics " 1154 "Unable to create blockcheck statistics "
1155 "files\n"); 1155 "files\n");
1156 goto read_super_error; 1156 goto read_super_error;
1157 } 1157 }
1158 } 1158 }
1159 1159
1160 status = ocfs2_mount_volume(sb); 1160 status = ocfs2_mount_volume(sb);
1161 if (status < 0) 1161 if (status < 0)
1162 goto read_super_error; 1162 goto read_super_error;
1163 1163
1164 if (osb->root_inode) 1164 if (osb->root_inode)
1165 inode = igrab(osb->root_inode); 1165 inode = igrab(osb->root_inode);
1166 1166
1167 if (!inode) { 1167 if (!inode) {
1168 status = -EIO; 1168 status = -EIO;
1169 mlog_errno(status); 1169 mlog_errno(status);
1170 goto read_super_error; 1170 goto read_super_error;
1171 } 1171 }
1172 1172
1173 root = d_make_root(inode); 1173 root = d_make_root(inode);
1174 if (!root) { 1174 if (!root) {
1175 status = -ENOMEM; 1175 status = -ENOMEM;
1176 mlog_errno(status); 1176 mlog_errno(status);
1177 goto read_super_error; 1177 goto read_super_error;
1178 } 1178 }
1179 1179
1180 sb->s_root = root; 1180 sb->s_root = root;
1181 1181
1182 ocfs2_complete_mount_recovery(osb); 1182 ocfs2_complete_mount_recovery(osb);
1183 1183
1184 if (ocfs2_mount_local(osb)) 1184 if (ocfs2_mount_local(osb))
1185 snprintf(nodestr, sizeof(nodestr), "local"); 1185 snprintf(nodestr, sizeof(nodestr), "local");
1186 else 1186 else
1187 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); 1187 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1188 1188
1189 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 1189 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
1190 "with %s data mode.\n", 1190 "with %s data mode.\n",
1191 osb->dev_str, nodestr, osb->slot_num, 1191 osb->dev_str, nodestr, osb->slot_num,
1192 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : 1192 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
1193 "ordered"); 1193 "ordered");
1194 1194
1195 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 1195 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
1196 wake_up(&osb->osb_mount_event); 1196 wake_up(&osb->osb_mount_event);
1197 1197
1198 /* Now we can initialize quotas, because at this point we can afford 1198 /* Now we can initialize quotas, because at this point we can afford
1199 * to wait for cluster lock recovery. That also means truncate log 1199 * to wait for cluster lock recovery. That also means truncate log
1200 * recovery can happen, but it waits for proper quota setup */ 1200 * recovery can happen, but it waits for proper quota setup */
1201 if (!(sb->s_flags & MS_RDONLY)) { 1201 if (!(sb->s_flags & MS_RDONLY)) {
1202 status = ocfs2_enable_quotas(osb); 1202 status = ocfs2_enable_quotas(osb);
1203 if (status < 0) { 1203 if (status < 0) {
1204 /* We have to err-out specially here because 1204 /* We have to err-out specially here because
1205 * s_root is already set */ 1205 * s_root is already set */
1206 mlog_errno(status); 1206 mlog_errno(status);
1207 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1207 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1208 wake_up(&osb->osb_mount_event); 1208 wake_up(&osb->osb_mount_event);
1209 return status; 1209 return status;
1210 } 1210 }
1211 } 1211 }
1212 1212
1213 ocfs2_complete_quota_recovery(osb); 1213 ocfs2_complete_quota_recovery(osb);
1214 1214
1215 /* Now we wake up again for processes waiting for quotas */ 1215 /* Now we wake up again for processes waiting for quotas */
1216 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); 1216 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
1217 wake_up(&osb->osb_mount_event); 1217 wake_up(&osb->osb_mount_event);
1218 1218
1219 /* Start this when the mount is almost sure of being successful */ 1219 /* Start this when the mount is almost sure of being successful */
1220 ocfs2_orphan_scan_start(osb); 1220 ocfs2_orphan_scan_start(osb);
1221 1221
1222 return status; 1222 return status;
1223 1223
1224 read_super_error: 1224 read_super_error:
1225 brelse(bh); 1225 brelse(bh);
1226 1226
1227 if (osb) { 1227 if (osb) {
1228 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1228 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1229 wake_up(&osb->osb_mount_event); 1229 wake_up(&osb->osb_mount_event);
1230 ocfs2_dismount_volume(sb, 1); 1230 ocfs2_dismount_volume(sb, 1);
1231 } 1231 }
1232 1232
1233 if (status) 1233 if (status)
1234 mlog_errno(status); 1234 mlog_errno(status);
1235 return status; 1235 return status;
1236 } 1236 }
1237 1237
1238 static struct dentry *ocfs2_mount(struct file_system_type *fs_type, 1238 static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1239 int flags, 1239 int flags,
1240 const char *dev_name, 1240 const char *dev_name,
1241 void *data) 1241 void *data)
1242 { 1242 {
1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1244 } 1244 }
1245 1245
1246 static struct file_system_type ocfs2_fs_type = { 1246 static struct file_system_type ocfs2_fs_type = {
1247 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1248 .name = "ocfs2", 1248 .name = "ocfs2",
1249 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1250 .kill_sb = kill_block_super, 1250 .kill_sb = kill_block_super,
1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1252 .next = NULL 1252 .next = NULL
1253 }; 1253 };
1254 MODULE_ALIAS_FS("ocfs2"); 1254 MODULE_ALIAS_FS("ocfs2");
1255 1255
1256 static int ocfs2_check_set_options(struct super_block *sb, 1256 static int ocfs2_check_set_options(struct super_block *sb,
1257 struct mount_options *options) 1257 struct mount_options *options)
1258 { 1258 {
1259 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && 1259 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
1260 !OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1260 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1261 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1261 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1262 mlog(ML_ERROR, "User quotas were requested, but this " 1262 mlog(ML_ERROR, "User quotas were requested, but this "
1263 "filesystem does not have the feature enabled.\n"); 1263 "filesystem does not have the feature enabled.\n");
1264 return 0; 1264 return 0;
1265 } 1265 }
1266 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && 1266 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1267 !OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1267 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1268 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1268 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1269 mlog(ML_ERROR, "Group quotas were requested, but this " 1269 mlog(ML_ERROR, "Group quotas were requested, but this "
1270 "filesystem does not have the feature enabled.\n"); 1270 "filesystem does not have the feature enabled.\n");
1271 return 0; 1271 return 0;
1272 } 1272 }
1273 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && 1273 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
1274 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { 1274 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
1275 mlog(ML_ERROR, "ACL support requested but extended attributes " 1275 mlog(ML_ERROR, "ACL support requested but extended attributes "
1276 "feature is not enabled\n"); 1276 "feature is not enabled\n");
1277 return 0; 1277 return 0;
1278 } 1278 }
1279 /* No ACL setting specified? Use XATTR feature... */ 1279 /* No ACL setting specified? Use XATTR feature... */
1280 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | 1280 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
1281 OCFS2_MOUNT_NO_POSIX_ACL))) { 1281 OCFS2_MOUNT_NO_POSIX_ACL))) {
1282 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) 1282 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
1283 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1283 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1284 else 1284 else
1285 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1285 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1286 } 1286 }
1287 return 1; 1287 return 1;
1288 } 1288 }
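
ocfs2_check_set_options() above gates each requested mount option on the on-disk feature bits: usrquota and grpquota need the matching RO-compat flags, acl needs the xattr incompat flag, and when neither acl nor noacl is given the default follows the xattr feature. As an illustration only, here is a minimal userspace sketch of that "option requires a feature bit" pattern; the flag names and values are invented, not the kernel's.

/* Hedged sketch of the "mount option requires an on-disk feature bit" check
 * made by ocfs2_check_set_options(); all flag values here are invented. */
#include <stdio.h>

#define MNT_USRQUOTA   0x1   /* hypothetical: user quotas requested at mount */
#define FEAT_USRQUOTA  0x1   /* hypothetical: RO-compat feature bit on disk  */

static int check_set_options(unsigned mount_opt, unsigned ro_compat_feat)
{
	if ((mount_opt & MNT_USRQUOTA) && !(ro_compat_feat & FEAT_USRQUOTA)) {
		fprintf(stderr, "User quotas were requested, but this filesystem does not have the feature enabled.\n");
		return 0;        /* reject, as the kernel function does */
	}
	return 1;
}

int main(void)
{
	printf("%d\n", check_set_options(MNT_USRQUOTA, 0));             /* 0: rejected */
	printf("%d\n", check_set_options(MNT_USRQUOTA, FEAT_USRQUOTA)); /* 1: accepted */
	return 0;
}
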
1289 1289
1290 static int ocfs2_parse_options(struct super_block *sb, 1290 static int ocfs2_parse_options(struct super_block *sb,
1291 char *options, 1291 char *options,
1292 struct mount_options *mopt, 1292 struct mount_options *mopt,
1293 int is_remount) 1293 int is_remount)
1294 { 1294 {
1295 int status, user_stack = 0; 1295 int status, user_stack = 0;
1296 char *p; 1296 char *p;
1297 u32 tmp; 1297 u32 tmp;
1298 1298
1299 trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); 1299 trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
1300 1300
1301 mopt->commit_interval = 0; 1301 mopt->commit_interval = 0;
1302 mopt->mount_opt = OCFS2_MOUNT_NOINTR; 1302 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1303 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1303 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1304 mopt->slot = OCFS2_INVALID_SLOT; 1304 mopt->slot = OCFS2_INVALID_SLOT;
1305 mopt->localalloc_opt = -1; 1305 mopt->localalloc_opt = -1;
1306 mopt->cluster_stack[0] = '\0'; 1306 mopt->cluster_stack[0] = '\0';
1307 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; 1307 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1308 mopt->dir_resv_level = -1; 1308 mopt->dir_resv_level = -1;
1309 1309
1310 if (!options) { 1310 if (!options) {
1311 status = 1; 1311 status = 1;
1312 goto bail; 1312 goto bail;
1313 } 1313 }
1314 1314
1315 while ((p = strsep(&options, ",")) != NULL) { 1315 while ((p = strsep(&options, ",")) != NULL) {
1316 int token, option; 1316 int token, option;
1317 substring_t args[MAX_OPT_ARGS]; 1317 substring_t args[MAX_OPT_ARGS];
1318 1318
1319 if (!*p) 1319 if (!*p)
1320 continue; 1320 continue;
1321 1321
1322 token = match_token(p, tokens, args); 1322 token = match_token(p, tokens, args);
1323 switch (token) { 1323 switch (token) {
1324 case Opt_hb_local: 1324 case Opt_hb_local:
1325 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1325 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1326 break; 1326 break;
1327 case Opt_hb_none: 1327 case Opt_hb_none:
1328 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; 1328 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1329 break; 1329 break;
1330 case Opt_hb_global: 1330 case Opt_hb_global:
1331 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; 1331 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1332 break; 1332 break;
1333 case Opt_barrier: 1333 case Opt_barrier:
1334 if (match_int(&args[0], &option)) { 1334 if (match_int(&args[0], &option)) {
1335 status = 0; 1335 status = 0;
1336 goto bail; 1336 goto bail;
1337 } 1337 }
1338 if (option) 1338 if (option)
1339 mopt->mount_opt |= OCFS2_MOUNT_BARRIER; 1339 mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
1340 else 1340 else
1341 mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; 1341 mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
1342 break; 1342 break;
1343 case Opt_intr: 1343 case Opt_intr:
1344 mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; 1344 mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
1345 break; 1345 break;
1346 case Opt_nointr: 1346 case Opt_nointr:
1347 mopt->mount_opt |= OCFS2_MOUNT_NOINTR; 1347 mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
1348 break; 1348 break;
1349 case Opt_err_panic: 1349 case Opt_err_panic:
1350 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 1350 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1351 break; 1351 break;
1352 case Opt_err_ro: 1352 case Opt_err_ro:
1353 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; 1353 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1354 break; 1354 break;
1355 case Opt_data_ordered: 1355 case Opt_data_ordered:
1356 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; 1356 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
1357 break; 1357 break;
1358 case Opt_data_writeback: 1358 case Opt_data_writeback:
1359 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 1359 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
1360 break; 1360 break;
1361 case Opt_user_xattr: 1361 case Opt_user_xattr:
1362 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; 1362 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
1363 break; 1363 break;
1364 case Opt_nouser_xattr: 1364 case Opt_nouser_xattr:
1365 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; 1365 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
1366 break; 1366 break;
1367 case Opt_atime_quantum: 1367 case Opt_atime_quantum:
1368 if (match_int(&args[0], &option)) { 1368 if (match_int(&args[0], &option)) {
1369 status = 0; 1369 status = 0;
1370 goto bail; 1370 goto bail;
1371 } 1371 }
1372 if (option >= 0) 1372 if (option >= 0)
1373 mopt->atime_quantum = option; 1373 mopt->atime_quantum = option;
1374 break; 1374 break;
1375 case Opt_slot: 1375 case Opt_slot:
1376 option = 0; 1376 option = 0;
1377 if (match_int(&args[0], &option)) { 1377 if (match_int(&args[0], &option)) {
1378 status = 0; 1378 status = 0;
1379 goto bail; 1379 goto bail;
1380 } 1380 }
1381 if (option) 1381 if (option)
1382 mopt->slot = (s16)option; 1382 mopt->slot = (s16)option;
1383 break; 1383 break;
1384 case Opt_commit: 1384 case Opt_commit:
1385 option = 0; 1385 option = 0;
1386 if (match_int(&args[0], &option)) { 1386 if (match_int(&args[0], &option)) {
1387 status = 0; 1387 status = 0;
1388 goto bail; 1388 goto bail;
1389 } 1389 }
1390 if (option < 0) 1390 if (option < 0)
1391 return 0; 1391 return 0;
1392 if (option == 0) 1392 if (option == 0)
1393 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1393 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1394 mopt->commit_interval = HZ * option; 1394 mopt->commit_interval = HZ * option;
1395 break; 1395 break;
1396 case Opt_localalloc: 1396 case Opt_localalloc:
1397 option = 0; 1397 option = 0;
1398 if (match_int(&args[0], &option)) { 1398 if (match_int(&args[0], &option)) {
1399 status = 0; 1399 status = 0;
1400 goto bail; 1400 goto bail;
1401 } 1401 }
1402 if (option >= 0) 1402 if (option >= 0)
1403 mopt->localalloc_opt = option; 1403 mopt->localalloc_opt = option;
1404 break; 1404 break;
1405 case Opt_localflocks: 1405 case Opt_localflocks:
1406 /* 1406 /*
1407 * Changing this during remount could race 1407 * Changing this during remount could race
1408 * flock() requests, or "unbalance" existing 1408 * flock() requests, or "unbalance" existing
1409 * ones (e.g., a lock is taken in one mode but 1409 * ones (e.g., a lock is taken in one mode but
1410 * dropped in the other). If users care enough 1410 * dropped in the other). If users care enough
1411 * to flip locking modes during remount, we 1411 * to flip locking modes during remount, we
1412 * could add a "local" flag to individual 1412 * could add a "local" flag to individual
1413 * flock structures for proper tracking of 1413 * flock structures for proper tracking of
1414 * state. 1414 * state.
1415 */ 1415 */
1416 if (!is_remount) 1416 if (!is_remount)
1417 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; 1417 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
1418 break; 1418 break;
1419 case Opt_stack: 1419 case Opt_stack:
1420 /* Check both that the option we were passed 1420 /* Check both that the option we were passed
1421 * is of the right length and that it is a proper 1421 * is of the right length and that it is a proper
1422 * string of the right length. 1422 * string of the right length.
1423 */ 1423 */
1424 if (((args[0].to - args[0].from) != 1424 if (((args[0].to - args[0].from) !=
1425 OCFS2_STACK_LABEL_LEN) || 1425 OCFS2_STACK_LABEL_LEN) ||
1426 (strnlen(args[0].from, 1426 (strnlen(args[0].from,
1427 OCFS2_STACK_LABEL_LEN) != 1427 OCFS2_STACK_LABEL_LEN) !=
1428 OCFS2_STACK_LABEL_LEN)) { 1428 OCFS2_STACK_LABEL_LEN)) {
1429 mlog(ML_ERROR, 1429 mlog(ML_ERROR,
1430 "Invalid cluster_stack option\n"); 1430 "Invalid cluster_stack option\n");
1431 status = 0; 1431 status = 0;
1432 goto bail; 1432 goto bail;
1433 } 1433 }
1434 memcpy(mopt->cluster_stack, args[0].from, 1434 memcpy(mopt->cluster_stack, args[0].from,
1435 OCFS2_STACK_LABEL_LEN); 1435 OCFS2_STACK_LABEL_LEN);
1436 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1436 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1437 /* 1437 /*
1438 * Open code the memcmp here as we don't have 1438 * Open code the memcmp here as we don't have
1439 * an osb to pass to 1439 * an osb to pass to
1440 * ocfs2_userspace_stack(). 1440 * ocfs2_userspace_stack().
1441 */ 1441 */
1442 if (memcmp(mopt->cluster_stack, 1442 if (memcmp(mopt->cluster_stack,
1443 OCFS2_CLASSIC_CLUSTER_STACK, 1443 OCFS2_CLASSIC_CLUSTER_STACK,
1444 OCFS2_STACK_LABEL_LEN)) 1444 OCFS2_STACK_LABEL_LEN))
1445 user_stack = 1; 1445 user_stack = 1;
1446 break; 1446 break;
1447 case Opt_inode64: 1447 case Opt_inode64:
1448 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1448 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
1449 break; 1449 break;
1450 case Opt_usrquota: 1450 case Opt_usrquota:
1451 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; 1451 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1452 break; 1452 break;
1453 case Opt_grpquota: 1453 case Opt_grpquota:
1454 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1454 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1455 break; 1455 break;
1456 case Opt_coherency_buffered: 1456 case Opt_coherency_buffered:
1457 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; 1457 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1458 break; 1458 break;
1459 case Opt_coherency_full: 1459 case Opt_coherency_full:
1460 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; 1460 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1461 break; 1461 break;
1462 case Opt_acl: 1462 case Opt_acl:
1463 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1463 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1464 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1464 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
1465 break; 1465 break;
1466 case Opt_noacl: 1466 case Opt_noacl:
1467 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1467 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1468 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1468 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1469 break; 1469 break;
1470 case Opt_resv_level: 1470 case Opt_resv_level:
1471 if (is_remount) 1471 if (is_remount)
1472 break; 1472 break;
1473 if (match_int(&args[0], &option)) { 1473 if (match_int(&args[0], &option)) {
1474 status = 0; 1474 status = 0;
1475 goto bail; 1475 goto bail;
1476 } 1476 }
1477 if (option >= OCFS2_MIN_RESV_LEVEL && 1477 if (option >= OCFS2_MIN_RESV_LEVEL &&
1478 option < OCFS2_MAX_RESV_LEVEL) 1478 option < OCFS2_MAX_RESV_LEVEL)
1479 mopt->resv_level = option; 1479 mopt->resv_level = option;
1480 break; 1480 break;
1481 case Opt_dir_resv_level: 1481 case Opt_dir_resv_level:
1482 if (is_remount) 1482 if (is_remount)
1483 break; 1483 break;
1484 if (match_int(&args[0], &option)) { 1484 if (match_int(&args[0], &option)) {
1485 status = 0; 1485 status = 0;
1486 goto bail; 1486 goto bail;
1487 } 1487 }
1488 if (option >= OCFS2_MIN_RESV_LEVEL && 1488 if (option >= OCFS2_MIN_RESV_LEVEL &&
1489 option < OCFS2_MAX_RESV_LEVEL) 1489 option < OCFS2_MAX_RESV_LEVEL)
1490 mopt->dir_resv_level = option; 1490 mopt->dir_resv_level = option;
1491 break; 1491 break;
1492 default: 1492 default:
1493 mlog(ML_ERROR, 1493 mlog(ML_ERROR,
1494 "Unrecognized mount option \"%s\" " 1494 "Unrecognized mount option \"%s\" "
1495 "or missing value\n", p); 1495 "or missing value\n", p);
1496 status = 0; 1496 status = 0;
1497 goto bail; 1497 goto bail;
1498 } 1498 }
1499 } 1499 }
1500 1500
1501 if (user_stack == 0) { 1501 if (user_stack == 0) {
1502 /* Ensure only one heartbeat mode */ 1502 /* Ensure only one heartbeat mode */
1503 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | 1503 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1504 OCFS2_MOUNT_HB_GLOBAL | 1504 OCFS2_MOUNT_HB_GLOBAL |
1505 OCFS2_MOUNT_HB_NONE); 1505 OCFS2_MOUNT_HB_NONE);
1506 if (hweight32(tmp) != 1) { 1506 if (hweight32(tmp) != 1) {
1507 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1507 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1508 status = 0; 1508 status = 0;
1509 goto bail; 1509 goto bail;
1510 } 1510 }
1511 } 1511 }
1512 1512
1513 status = 1; 1513 status = 1;
1514 1514
1515 bail: 1515 bail:
1516 return status; 1516 return status;
1517 } 1517 }
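
ocfs2_parse_options() walks the comma-separated mount string with strsep(), matches each token, flips bits in mopt->mount_opt, and fails on anything unrecognized; at the end it also insists on exactly one heartbeat mode unless a userspace cluster stack was named. The following is a hedged userspace sketch of the same strsep() loop, assuming glibc's strsep() and using invented option names and flag bits in place of the tokens[] table and OCFS2_MOUNT_* constants.

/* Minimal sketch of a strsep()-based option parser, after the loop above.
 * The option strings and OPT_* bits are illustrative, not the kernel's;
 * the real code dispatches through match_token() instead of strcmp(). */
#define _DEFAULT_SOURCE          /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

#define OPT_NOINTR          0x1  /* hypothetical flag bits */
#define OPT_DATA_WRITEBACK  0x2

static int parse_options(char *options, unsigned long *opts)
{
	char *p;

	*opts = OPT_NOINTR;                     /* default, like OCFS2_MOUNT_NOINTR */
	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)                        /* skip empty fields, e.g. "a,,b" */
			continue;
		if (!strcmp(p, "intr"))
			*opts &= ~OPT_NOINTR;
		else if (!strcmp(p, "nointr"))
			*opts |= OPT_NOINTR;
		else if (!strcmp(p, "data=writeback"))
			*opts |= OPT_DATA_WRITEBACK;
		else if (!strcmp(p, "data=ordered"))
			*opts &= ~OPT_DATA_WRITEBACK;
		else
			return 0;               /* unknown option: fail the parse */
	}
	return 1;
}

int main(void)
{
	char buf[] = "intr,data=writeback";     /* strsep() rewrites this buffer */
	unsigned long opts;

	if (parse_options(buf, &opts))
		printf("opts = 0x%lx\n", opts);
	return 0;
}

As in the kernel loop, the buffer is consumed in place and empty fields are skipped rather than treated as errors.
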
1518 1518
1519 static int ocfs2_show_options(struct seq_file *s, struct dentry *root) 1519 static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1520 { 1520 {
1521 struct ocfs2_super *osb = OCFS2_SB(root->d_sb); 1521 struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
1522 unsigned long opts = osb->s_mount_opt; 1522 unsigned long opts = osb->s_mount_opt;
1523 unsigned int local_alloc_megs; 1523 unsigned int local_alloc_megs;
1524 1524
1525 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { 1525 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1526 seq_printf(s, ",_netdev"); 1526 seq_printf(s, ",_netdev");
1527 if (opts & OCFS2_MOUNT_HB_LOCAL) 1527 if (opts & OCFS2_MOUNT_HB_LOCAL)
1528 seq_printf(s, ",%s", OCFS2_HB_LOCAL); 1528 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1529 else 1529 else
1530 seq_printf(s, ",%s", OCFS2_HB_GLOBAL); 1530 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1531 } else 1531 } else
1532 seq_printf(s, ",%s", OCFS2_HB_NONE); 1532 seq_printf(s, ",%s", OCFS2_HB_NONE);
1533 1533
1534 if (opts & OCFS2_MOUNT_NOINTR) 1534 if (opts & OCFS2_MOUNT_NOINTR)
1535 seq_printf(s, ",nointr"); 1535 seq_printf(s, ",nointr");
1536 1536
1537 if (opts & OCFS2_MOUNT_DATA_WRITEBACK) 1537 if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
1538 seq_printf(s, ",data=writeback"); 1538 seq_printf(s, ",data=writeback");
1539 else 1539 else
1540 seq_printf(s, ",data=ordered"); 1540 seq_printf(s, ",data=ordered");
1541 1541
1542 if (opts & OCFS2_MOUNT_BARRIER) 1542 if (opts & OCFS2_MOUNT_BARRIER)
1543 seq_printf(s, ",barrier=1"); 1543 seq_printf(s, ",barrier=1");
1544 1544
1545 if (opts & OCFS2_MOUNT_ERRORS_PANIC) 1545 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
1546 seq_printf(s, ",errors=panic"); 1546 seq_printf(s, ",errors=panic");
1547 else 1547 else
1548 seq_printf(s, ",errors=remount-ro"); 1548 seq_printf(s, ",errors=remount-ro");
1549 1549
1550 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1550 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1551 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1551 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1552 1552
1553 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1553 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1554 1554
1555 if (osb->osb_commit_interval) 1555 if (osb->osb_commit_interval)
1556 seq_printf(s, ",commit=%u", 1556 seq_printf(s, ",commit=%u",
1557 (unsigned) (osb->osb_commit_interval / HZ)); 1557 (unsigned) (osb->osb_commit_interval / HZ));
1558 1558
1559 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1559 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1560 if (local_alloc_megs != ocfs2_la_default_mb(osb)) 1560 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1561 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1561 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1562 1562
1563 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1563 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
1564 seq_printf(s, ",localflocks,"); 1564 seq_printf(s, ",localflocks,");
1565 1565
1566 if (osb->osb_cluster_stack[0]) 1566 if (osb->osb_cluster_stack[0])
1567 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1567 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1568 osb->osb_cluster_stack); 1568 osb->osb_cluster_stack);
1569 if (opts & OCFS2_MOUNT_USRQUOTA) 1569 if (opts & OCFS2_MOUNT_USRQUOTA)
1570 seq_printf(s, ",usrquota"); 1570 seq_printf(s, ",usrquota");
1571 if (opts & OCFS2_MOUNT_GRPQUOTA) 1571 if (opts & OCFS2_MOUNT_GRPQUOTA)
1572 seq_printf(s, ",grpquota"); 1572 seq_printf(s, ",grpquota");
1573 1573
1574 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) 1574 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1575 seq_printf(s, ",coherency=buffered"); 1575 seq_printf(s, ",coherency=buffered");
1576 else 1576 else
1577 seq_printf(s, ",coherency=full"); 1577 seq_printf(s, ",coherency=full");
1578 1578
1579 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1579 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1580 seq_printf(s, ",nouser_xattr"); 1580 seq_printf(s, ",nouser_xattr");
1581 else 1581 else
1582 seq_printf(s, ",user_xattr"); 1582 seq_printf(s, ",user_xattr");
1583 1583
1584 if (opts & OCFS2_MOUNT_INODE64) 1584 if (opts & OCFS2_MOUNT_INODE64)
1585 seq_printf(s, ",inode64"); 1585 seq_printf(s, ",inode64");
1586 1586
1587 if (opts & OCFS2_MOUNT_POSIX_ACL) 1587 if (opts & OCFS2_MOUNT_POSIX_ACL)
1588 seq_printf(s, ",acl"); 1588 seq_printf(s, ",acl");
1589 else 1589 else
1590 seq_printf(s, ",noacl"); 1590 seq_printf(s, ",noacl");
1591 1591
1592 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) 1592 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1593 seq_printf(s, ",resv_level=%d", osb->osb_resv_level); 1593 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1594 1594
1595 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1595 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1596 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level); 1596 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1597 1597
1598 return 0; 1598 return 0;
1599 } 1599 }
1600 1600
1601 static int __init ocfs2_init(void) 1601 static int __init ocfs2_init(void)
1602 { 1602 {
1603 int status; 1603 int status;
1604 1604
1605 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1606 if (status < 0) 1606 if (status < 0)
1607 goto out1; 1607 goto out1;
1608 1608
1609 status = ocfs2_initialize_mem_caches(); 1609 status = ocfs2_initialize_mem_caches();
1610 if (status < 0) 1610 if (status < 0)
1611 goto out2; 1611 goto out2;
1612 1612
1613 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); 1613 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1614 if (!ocfs2_wq) { 1614 if (!ocfs2_wq) {
1615 status = -ENOMEM; 1615 status = -ENOMEM;
1616 goto out3; 1616 goto out3;
1617 } 1617 }
1618 1618
1619 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1619 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1620 if (!ocfs2_debugfs_root) { 1620 if (!ocfs2_debugfs_root) {
1621 status = -EFAULT; 1621 status = -EFAULT;
1622 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1622 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1623 } 1623 }
1624 1624
1625 ocfs2_set_locking_protocol(); 1625 ocfs2_set_locking_protocol();
1626 1626
1627 status = register_quota_format(&ocfs2_quota_format); 1627 status = register_quota_format(&ocfs2_quota_format);
1628 if (status < 0) 1628 if (status < 0)
1629 goto out4; 1629 goto out4;
1630 status = register_filesystem(&ocfs2_fs_type); 1630 status = register_filesystem(&ocfs2_fs_type);
1631 if (!status) 1631 if (!status)
1632 return 0; 1632 return 0;
1633 1633
1634 unregister_quota_format(&ocfs2_quota_format); 1634 unregister_quota_format(&ocfs2_quota_format);
1635 out4: 1635 out4:
1636 destroy_workqueue(ocfs2_wq); 1636 destroy_workqueue(ocfs2_wq);
1637 debugfs_remove(ocfs2_debugfs_root); 1637 debugfs_remove(ocfs2_debugfs_root);
1638 out3: 1638 out3:
1639 ocfs2_free_mem_caches(); 1639 ocfs2_free_mem_caches();
1640 out2: 1640 out2:
1641 exit_ocfs2_uptodate_cache(); 1641 exit_ocfs2_uptodate_cache();
1642 out1: 1642 out1:
1643 mlog_errno(status); 1643 mlog_errno(status);
1644 return status; 1644 return status;
1645 } 1645 }
1646 1646
1647 static void __exit ocfs2_exit(void) 1647 static void __exit ocfs2_exit(void)
1648 { 1648 {
1649 if (ocfs2_wq) { 1649 if (ocfs2_wq) {
1650 flush_workqueue(ocfs2_wq); 1650 flush_workqueue(ocfs2_wq);
1651 destroy_workqueue(ocfs2_wq); 1651 destroy_workqueue(ocfs2_wq);
1652 } 1652 }
1653 1653
1654 unregister_quota_format(&ocfs2_quota_format); 1654 unregister_quota_format(&ocfs2_quota_format);
1655 1655
1656 debugfs_remove(ocfs2_debugfs_root); 1656 debugfs_remove(ocfs2_debugfs_root);
1657 1657
1658 ocfs2_free_mem_caches(); 1658 ocfs2_free_mem_caches();
1659 1659
1660 unregister_filesystem(&ocfs2_fs_type); 1660 unregister_filesystem(&ocfs2_fs_type);
1661 1661
1662 exit_ocfs2_uptodate_cache(); 1662 exit_ocfs2_uptodate_cache();
1663 } 1663 }
1664 1664
1665 static void ocfs2_put_super(struct super_block *sb) 1665 static void ocfs2_put_super(struct super_block *sb)
1666 { 1666 {
1667 trace_ocfs2_put_super(sb); 1667 trace_ocfs2_put_super(sb);
1668 1668
1669 ocfs2_sync_blockdev(sb); 1669 ocfs2_sync_blockdev(sb);
1670 ocfs2_dismount_volume(sb, 0); 1670 ocfs2_dismount_volume(sb, 0);
1671 } 1671 }
1672 1672
1673 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1673 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1674 { 1674 {
1675 struct ocfs2_super *osb; 1675 struct ocfs2_super *osb;
1676 u32 numbits, freebits; 1676 u32 numbits, freebits;
1677 int status; 1677 int status;
1678 struct ocfs2_dinode *bm_lock; 1678 struct ocfs2_dinode *bm_lock;
1679 struct buffer_head *bh = NULL; 1679 struct buffer_head *bh = NULL;
1680 struct inode *inode = NULL; 1680 struct inode *inode = NULL;
1681 1681
1682 trace_ocfs2_statfs(dentry->d_sb, buf); 1682 trace_ocfs2_statfs(dentry->d_sb, buf);
1683 1683
1684 osb = OCFS2_SB(dentry->d_sb); 1684 osb = OCFS2_SB(dentry->d_sb);
1685 1685
1686 inode = ocfs2_get_system_file_inode(osb, 1686 inode = ocfs2_get_system_file_inode(osb,
1687 GLOBAL_BITMAP_SYSTEM_INODE, 1687 GLOBAL_BITMAP_SYSTEM_INODE,
1688 OCFS2_INVALID_SLOT); 1688 OCFS2_INVALID_SLOT);
1689 if (!inode) { 1689 if (!inode) {
1690 mlog(ML_ERROR, "failed to get bitmap inode\n"); 1690 mlog(ML_ERROR, "failed to get bitmap inode\n");
1691 status = -EIO; 1691 status = -EIO;
1692 goto bail; 1692 goto bail;
1693 } 1693 }
1694 1694
1695 status = ocfs2_inode_lock(inode, &bh, 0); 1695 status = ocfs2_inode_lock(inode, &bh, 0);
1696 if (status < 0) { 1696 if (status < 0) {
1697 mlog_errno(status); 1697 mlog_errno(status);
1698 goto bail; 1698 goto bail;
1699 } 1699 }
1700 1700
1701 bm_lock = (struct ocfs2_dinode *) bh->b_data; 1701 bm_lock = (struct ocfs2_dinode *) bh->b_data;
1702 1702
1703 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); 1703 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
1704 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); 1704 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
1705 1705
1706 buf->f_type = OCFS2_SUPER_MAGIC; 1706 buf->f_type = OCFS2_SUPER_MAGIC;
1707 buf->f_bsize = dentry->d_sb->s_blocksize; 1707 buf->f_bsize = dentry->d_sb->s_blocksize;
1708 buf->f_namelen = OCFS2_MAX_FILENAME_LEN; 1708 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
1709 buf->f_blocks = ((sector_t) numbits) * 1709 buf->f_blocks = ((sector_t) numbits) *
1710 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 1710 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
1711 buf->f_bfree = ((sector_t) freebits) * 1711 buf->f_bfree = ((sector_t) freebits) *
1712 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 1712 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
1713 buf->f_bavail = buf->f_bfree; 1713 buf->f_bavail = buf->f_bfree;
1714 buf->f_files = numbits; 1714 buf->f_files = numbits;
1715 buf->f_ffree = freebits; 1715 buf->f_ffree = freebits;
1716 buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) 1716 buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
1717 & 0xFFFFFFFFUL; 1717 & 0xFFFFFFFFUL;
1718 buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, 1718 buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
1719 OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; 1719 OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
1720 1720
1721 brelse(bh); 1721 brelse(bh);
1722 1722
1723 ocfs2_inode_unlock(inode, 0); 1723 ocfs2_inode_unlock(inode, 0);
1724 status = 0; 1724 status = 0;
1725 bail: 1725 bail:
1726 if (inode) 1726 if (inode)
1727 iput(inode); 1727 iput(inode);
1728 1728
1729 if (status) 1729 if (status)
1730 mlog_errno(status); 1730 mlog_errno(status);
1731 1731
1732 return status; 1732 return status;
1733 } 1733 }
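
ocfs2_statfs() reports sizes by scaling the global bitmap's cluster counts into filesystem blocks: f_blocks and f_bfree are the cluster counts multiplied by s_clustersize >> s_blocksize_bits. For example, with 1 MiB clusters and 4 KiB blocks that factor is 256. A small sketch of just that arithmetic, with assumed volume geometry:

/* Sketch of the clusters-to-blocks scaling used by ocfs2_statfs();
 * the cluster size, block size, and bitmap counts below are assumptions. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t s_clustersize    = 1 << 20;  /* 1 MiB clusters (assumed)     */
	unsigned s_blocksize_bits = 12;       /* 4 KiB blocks   (assumed)     */
	uint32_t numbits  = 50000;            /* total clusters in the bitmap */
	uint32_t freebits = 12345;            /* free clusters                */

	uint64_t blocks_per_cluster = s_clustersize >> s_blocksize_bits;  /* 256 */
	uint64_t f_blocks = (uint64_t)numbits  * blocks_per_cluster;
	uint64_t f_bfree  = (uint64_t)freebits * blocks_per_cluster;

	printf("f_blocks=%llu f_bfree=%llu\n",
	       (unsigned long long)f_blocks, (unsigned long long)f_bfree);
	return 0;
}
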
1734 1734
1735 static void ocfs2_inode_init_once(void *data) 1735 static void ocfs2_inode_init_once(void *data)
1736 { 1736 {
1737 struct ocfs2_inode_info *oi = data; 1737 struct ocfs2_inode_info *oi = data;
1738 1738
1739 oi->ip_flags = 0; 1739 oi->ip_flags = 0;
1740 oi->ip_open_count = 0; 1740 oi->ip_open_count = 0;
1741 spin_lock_init(&oi->ip_lock); 1741 spin_lock_init(&oi->ip_lock);
1742 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1743 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1744 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1745 mutex_init(&oi->ip_unaligned_aio); 1745 mutex_init(&oi->ip_unaligned_aio);
1746 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1747 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1748 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
1749 1749
1750 oi->ip_blkno = 0ULL; 1750 oi->ip_blkno = 0ULL;
1751 oi->ip_clusters = 0; 1751 oi->ip_clusters = 0;
1752 1752
1753 ocfs2_resv_init_once(&oi->ip_la_data_resv); 1753 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1754 1754
1755 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1755 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1756 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1756 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1757 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1757 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1758 1758
1759 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1759 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1760 &ocfs2_inode_caching_ops); 1760 &ocfs2_inode_caching_ops);
1761 1761
1762 inode_init_once(&oi->vfs_inode); 1762 inode_init_once(&oi->vfs_inode);
1763 } 1763 }
1764 1764
1765 static int ocfs2_initialize_mem_caches(void) 1765 static int ocfs2_initialize_mem_caches(void)
1766 { 1766 {
1767 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", 1767 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
1768 sizeof(struct ocfs2_inode_info), 1768 sizeof(struct ocfs2_inode_info),
1769 0, 1769 0,
1770 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1770 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1771 SLAB_MEM_SPREAD), 1771 SLAB_MEM_SPREAD),
1772 ocfs2_inode_init_once); 1772 ocfs2_inode_init_once);
1773 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", 1773 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1774 sizeof(struct ocfs2_dquot), 1774 sizeof(struct ocfs2_dquot),
1775 0, 1775 0,
1776 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1776 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1777 SLAB_MEM_SPREAD), 1777 SLAB_MEM_SPREAD),
1778 NULL); 1778 NULL);
1779 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", 1779 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1780 sizeof(struct ocfs2_quota_chunk), 1780 sizeof(struct ocfs2_quota_chunk),
1781 0, 1781 0,
1782 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), 1782 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1783 NULL); 1783 NULL);
1784 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || 1784 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1785 !ocfs2_qf_chunk_cachep) { 1785 !ocfs2_qf_chunk_cachep) {
1786 if (ocfs2_inode_cachep) 1786 if (ocfs2_inode_cachep)
1787 kmem_cache_destroy(ocfs2_inode_cachep); 1787 kmem_cache_destroy(ocfs2_inode_cachep);
1788 if (ocfs2_dquot_cachep) 1788 if (ocfs2_dquot_cachep)
1789 kmem_cache_destroy(ocfs2_dquot_cachep); 1789 kmem_cache_destroy(ocfs2_dquot_cachep);
1790 if (ocfs2_qf_chunk_cachep) 1790 if (ocfs2_qf_chunk_cachep)
1791 kmem_cache_destroy(ocfs2_qf_chunk_cachep); 1791 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1792 return -ENOMEM; 1792 return -ENOMEM;
1793 } 1793 }
1794 1794
1795 return 0; 1795 return 0;
1796 } 1796 }
1797 1797
1798 static void ocfs2_free_mem_caches(void) 1798 static void ocfs2_free_mem_caches(void)
1799 { 1799 {
1800 /* 1800 /*
1801 * Make sure all delayed rcu free inodes are flushed before we 1801 * Make sure all delayed rcu free inodes are flushed before we
1802 * destroy cache. 1802 * destroy cache.
1803 */ 1803 */
1804 rcu_barrier(); 1804 rcu_barrier();
1805 if (ocfs2_inode_cachep) 1805 if (ocfs2_inode_cachep)
1806 kmem_cache_destroy(ocfs2_inode_cachep); 1806 kmem_cache_destroy(ocfs2_inode_cachep);
1807 ocfs2_inode_cachep = NULL; 1807 ocfs2_inode_cachep = NULL;
1808 1808
1809 if (ocfs2_dquot_cachep) 1809 if (ocfs2_dquot_cachep)
1810 kmem_cache_destroy(ocfs2_dquot_cachep); 1810 kmem_cache_destroy(ocfs2_dquot_cachep);
1811 ocfs2_dquot_cachep = NULL; 1811 ocfs2_dquot_cachep = NULL;
1812 1812
1813 if (ocfs2_qf_chunk_cachep) 1813 if (ocfs2_qf_chunk_cachep)
1814 kmem_cache_destroy(ocfs2_qf_chunk_cachep); 1814 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1815 ocfs2_qf_chunk_cachep = NULL; 1815 ocfs2_qf_chunk_cachep = NULL;
1816 } 1816 }
1817 1817
1818 static int ocfs2_get_sector(struct super_block *sb, 1818 static int ocfs2_get_sector(struct super_block *sb,
1819 struct buffer_head **bh, 1819 struct buffer_head **bh,
1820 int block, 1820 int block,
1821 int sect_size) 1821 int sect_size)
1822 { 1822 {
1823 if (!sb_set_blocksize(sb, sect_size)) { 1823 if (!sb_set_blocksize(sb, sect_size)) {
1824 mlog(ML_ERROR, "unable to set blocksize\n"); 1824 mlog(ML_ERROR, "unable to set blocksize\n");
1825 return -EIO; 1825 return -EIO;
1826 } 1826 }
1827 1827
1828 *bh = sb_getblk(sb, block); 1828 *bh = sb_getblk(sb, block);
1829 if (!*bh) { 1829 if (!*bh) {
1830 mlog_errno(-ENOMEM); 1830 mlog_errno(-ENOMEM);
1831 return -ENOMEM; 1831 return -ENOMEM;
1832 } 1832 }
1833 lock_buffer(*bh); 1833 lock_buffer(*bh);
1834 if (!buffer_dirty(*bh)) 1834 if (!buffer_dirty(*bh))
1835 clear_buffer_uptodate(*bh); 1835 clear_buffer_uptodate(*bh);
1836 unlock_buffer(*bh); 1836 unlock_buffer(*bh);
1837 ll_rw_block(READ, 1, bh); 1837 ll_rw_block(READ, 1, bh);
1838 wait_on_buffer(*bh); 1838 wait_on_buffer(*bh);
1839 if (!buffer_uptodate(*bh)) { 1839 if (!buffer_uptodate(*bh)) {
1840 mlog_errno(-EIO); 1840 mlog_errno(-EIO);
1841 brelse(*bh); 1841 brelse(*bh);
1842 *bh = NULL; 1842 *bh = NULL;
1843 return -EIO; 1843 return -EIO;
1844 } 1844 }
1845 1845
1846 return 0; 1846 return 0;
1847 } 1847 }
1848 1848
1849 static int ocfs2_mount_volume(struct super_block *sb) 1849 static int ocfs2_mount_volume(struct super_block *sb)
1850 { 1850 {
1851 int status = 0; 1851 int status = 0;
1852 int unlock_super = 0; 1852 int unlock_super = 0;
1853 struct ocfs2_super *osb = OCFS2_SB(sb); 1853 struct ocfs2_super *osb = OCFS2_SB(sb);
1854 1854
1855 if (ocfs2_is_hard_readonly(osb)) 1855 if (ocfs2_is_hard_readonly(osb))
1856 goto leave; 1856 goto leave;
1857 1857
1858 status = ocfs2_dlm_init(osb); 1858 status = ocfs2_dlm_init(osb);
1859 if (status < 0) { 1859 if (status < 0) {
1860 mlog_errno(status); 1860 mlog_errno(status);
1861 goto leave; 1861 goto leave;
1862 } 1862 }
1863 1863
1864 status = ocfs2_super_lock(osb, 1); 1864 status = ocfs2_super_lock(osb, 1);
1865 if (status < 0) { 1865 if (status < 0) {
1866 mlog_errno(status); 1866 mlog_errno(status);
1867 goto leave; 1867 goto leave;
1868 } 1868 }
1869 unlock_super = 1; 1869 unlock_super = 1;
1870 1870
1871 /* This will load up the node map and add ourselves to it. */ 1871 /* This will load up the node map and add ourselves to it. */
1872 status = ocfs2_find_slot(osb); 1872 status = ocfs2_find_slot(osb);
1873 if (status < 0) { 1873 if (status < 0) {
1874 mlog_errno(status); 1874 mlog_errno(status);
1875 goto leave; 1875 goto leave;
1876 } 1876 }
1877 1877
1878 /* load all node-local system inodes */ 1878 /* load all node-local system inodes */
1879 status = ocfs2_init_local_system_inodes(osb); 1879 status = ocfs2_init_local_system_inodes(osb);
1880 if (status < 0) { 1880 if (status < 0) {
1881 mlog_errno(status); 1881 mlog_errno(status);
1882 goto leave; 1882 goto leave;
1883 } 1883 }
1884 1884
1885 status = ocfs2_check_volume(osb); 1885 status = ocfs2_check_volume(osb);
1886 if (status < 0) { 1886 if (status < 0) {
1887 mlog_errno(status); 1887 mlog_errno(status);
1888 goto leave; 1888 goto leave;
1889 } 1889 }
1890 1890
1891 status = ocfs2_truncate_log_init(osb); 1891 status = ocfs2_truncate_log_init(osb);
1892 if (status < 0) 1892 if (status < 0)
1893 mlog_errno(status); 1893 mlog_errno(status);
1894 1894
1895 leave: 1895 leave:
1896 if (unlock_super) 1896 if (unlock_super)
1897 ocfs2_super_unlock(osb, 1); 1897 ocfs2_super_unlock(osb, 1);
1898 1898
1899 return status; 1899 return status;
1900 } 1900 }
1901 1901
1902 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1902 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1903 { 1903 {
1904 int tmp, hangup_needed = 0; 1904 int tmp, hangup_needed = 0;
1905 struct ocfs2_super *osb = NULL; 1905 struct ocfs2_super *osb = NULL;
1906 char nodestr[12]; 1906 char nodestr[12];
1907 1907
1908 trace_ocfs2_dismount_volume(sb); 1908 trace_ocfs2_dismount_volume(sb);
1909 1909
1910 BUG_ON(!sb); 1910 BUG_ON(!sb);
1911 osb = OCFS2_SB(sb); 1911 osb = OCFS2_SB(sb);
1912 BUG_ON(!osb); 1912 BUG_ON(!osb);
1913 1913
1914 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1915 1915
1916 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1917 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1918 1918
1919 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1920 1920
1921 /* All dquots should be freed by now */ 1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list)); 1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */ 1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work); 1924 cancel_work_sync(&osb->dquot_drop_work);
1925 1925
1926 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1927 1927
1928 ocfs2_truncate_log_shutdown(osb); 1928 ocfs2_truncate_log_shutdown(osb);
1929 1929
1930 /* This will disable recovery and flush any recovery work. */ 1930 /* This will disable recovery and flush any recovery work. */
1931 ocfs2_recovery_exit(osb); 1931 ocfs2_recovery_exit(osb);
1932 1932
1933 ocfs2_journal_shutdown(osb); 1933 ocfs2_journal_shutdown(osb);
1934 1934
1935 ocfs2_sync_blockdev(sb); 1935 ocfs2_sync_blockdev(sb);
1936 1936
1937 ocfs2_purge_refcount_trees(osb); 1937 ocfs2_purge_refcount_trees(osb);
1938 1938
1939 /* No cluster connection means we've failed during mount, so skip 1939 /* No cluster connection means we've failed during mount, so skip
1940 * all the steps which depended on that to complete. */ 1940 * all the steps which depended on that to complete. */
1941 if (osb->cconn) { 1941 if (osb->cconn) {
1942 tmp = ocfs2_super_lock(osb, 1); 1942 tmp = ocfs2_super_lock(osb, 1);
1943 if (tmp < 0) { 1943 if (tmp < 0) {
1944 mlog_errno(tmp); 1944 mlog_errno(tmp);
1945 return; 1945 return;
1946 } 1946 }
1947 } 1947 }
1948 1948
1949 if (osb->slot_num != OCFS2_INVALID_SLOT) 1949 if (osb->slot_num != OCFS2_INVALID_SLOT)
1950 ocfs2_put_slot(osb); 1950 ocfs2_put_slot(osb);
1951 1951
1952 if (osb->cconn) 1952 if (osb->cconn)
1953 ocfs2_super_unlock(osb, 1); 1953 ocfs2_super_unlock(osb, 1);
1954 1954
1955 ocfs2_release_system_inodes(osb); 1955 ocfs2_release_system_inodes(osb);
1956 1956
1957 /* 1957 /*
1958 * If we're dismounting due to mount error, mount.ocfs2 will clean 1958 * If we're dismounting due to mount error, mount.ocfs2 will clean
1959 * up heartbeat. If we're a local mount, there is no heartbeat. 1959 * up heartbeat. If we're a local mount, there is no heartbeat.
1960 * If we failed before we got a uuid_str yet, we can't stop 1960 * If we failed before we got a uuid_str yet, we can't stop
1961 * heartbeat. Otherwise, do it. 1961 * heartbeat. Otherwise, do it.
1962 */ 1962 */
1963 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && 1963 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
1964 !ocfs2_is_hard_readonly(osb)) 1964 !ocfs2_is_hard_readonly(osb))
1965 hangup_needed = 1; 1965 hangup_needed = 1;
1966 1966
1967 if (osb->cconn) 1967 if (osb->cconn)
1968 ocfs2_dlm_shutdown(osb, hangup_needed); 1968 ocfs2_dlm_shutdown(osb, hangup_needed);
1969 1969
1970 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); 1970 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
1971 debugfs_remove(osb->osb_debug_root); 1971 debugfs_remove(osb->osb_debug_root);
1972 1972
1973 if (hangup_needed) 1973 if (hangup_needed)
1974 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); 1974 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
1975 1975
1976 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1976 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1977 1977
1978 if (ocfs2_mount_local(osb)) 1978 if (ocfs2_mount_local(osb))
1979 snprintf(nodestr, sizeof(nodestr), "local"); 1979 snprintf(nodestr, sizeof(nodestr), "local");
1980 else 1980 else
1981 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); 1981 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1982 1982
1983 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1983 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
1984 osb->dev_str, nodestr); 1984 osb->dev_str, nodestr);
1985 1985
1986 ocfs2_delete_osb(osb); 1986 ocfs2_delete_osb(osb);
1987 kfree(osb); 1987 kfree(osb);
1988 sb->s_dev = 0; 1988 sb->s_dev = 0;
1989 sb->s_fs_info = NULL; 1989 sb->s_fs_info = NULL;
1990 } 1990 }
1991 1991
1992 static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, 1992 static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1993 unsigned uuid_bytes) 1993 unsigned uuid_bytes)
1994 { 1994 {
1995 int i, ret; 1995 int i, ret;
1996 char *ptr; 1996 char *ptr;
1997 1997
1998 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); 1998 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1999 1999
2000 osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); 2000 osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
2001 if (osb->uuid_str == NULL) 2001 if (osb->uuid_str == NULL)
2002 return -ENOMEM; 2002 return -ENOMEM;
2003 2003
2004 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { 2004 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
2005 /* print with null */ 2005 /* print with null */
2006 ret = snprintf(ptr, 3, "%02X", uuid[i]); 2006 ret = snprintf(ptr, 3, "%02X", uuid[i]);
2007 if (ret != 2) /* drop super cleans up */ 2007 if (ret != 2) /* drop super cleans up */
2008 return -EINVAL; 2008 return -EINVAL;
2009 /* then only advance past the last char */ 2009 /* then only advance past the last char */
2010 ptr += 2; 2010 ptr += 2;
2011 } 2011 }
2012 2012
2013 return 0; 2013 return 0;
2014 } 2014 }
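
ocfs2_setup_osb_uuid() expands the 16-byte on-disk UUID into a 32-character uppercase hex string: each snprintf(ptr, 3, "%02X", ...) writes two digits plus a NUL, and advancing ptr by only two lets the next pair overwrite that NUL, so the final byte's terminator ends the string. A standalone sketch of the same expansion, using a made-up UUID:

/* Sketch of the two-hex-digits-at-a-time UUID expansion performed by
 * ocfs2_setup_osb_uuid(); the 16-byte UUID value here is made up. */
#include <stdio.h>
#include <stdlib.h>

#define UUID_LEN 16

int main(void)
{
	const unsigned char uuid[UUID_LEN] = {
		0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33,
		0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb
	};
	char *str = calloc(UUID_LEN * 2 + 1, 1);  /* 32 hex chars + NUL */
	char *ptr = str;
	int i;

	if (!str)
		return 1;
	for (i = 0; i < UUID_LEN; i++) {
		/* writes "XX" plus a NUL; the next pair overwrites the NUL */
		if (snprintf(ptr, 3, "%02X", uuid[i]) != 2)
			return 1;
		ptr += 2;
	}
	printf("%s\n", str);  /* DEADBEEF00112233445566778899AABB */
	free(str);
	return 0;
}
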
2015 2015
2016 /* Make sure entire volume is addressable by our journal. Requires 2016 /* Make sure entire volume is addressable by our journal. Requires
2017 osb_clusters_at_boot to be valid and for the journal to have been 2017 osb_clusters_at_boot to be valid and for the journal to have been
2018 initialized by ocfs2_journal_init(). */ 2018 initialized by ocfs2_journal_init(). */
2019 static int ocfs2_journal_addressable(struct ocfs2_super *osb) 2019 static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2020 { 2020 {
2021 int status = 0; 2021 int status = 0;
2022 u64 max_block = 2022 u64 max_block =
2023 ocfs2_clusters_to_blocks(osb->sb, 2023 ocfs2_clusters_to_blocks(osb->sb,
2024 osb->osb_clusters_at_boot) - 1; 2024 osb->osb_clusters_at_boot) - 1;
2025 2025
2026 /* 32-bit block number is always OK. */ 2026 /* 32-bit block number is always OK. */
2027 if (max_block <= (u32)~0ULL) 2027 if (max_block <= (u32)~0ULL)
2028 goto out; 2028 goto out;
2029 2029
2030 /* Volume is "huge", so see if our journal is new enough to 2030 /* Volume is "huge", so see if our journal is new enough to
2031 support it. */ 2031 support it. */
2032 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, 2032 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2033 OCFS2_FEATURE_COMPAT_JBD2_SB) && 2033 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2034 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, 2034 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2035 JBD2_FEATURE_INCOMPAT_64BIT))) { 2035 JBD2_FEATURE_INCOMPAT_64BIT))) {
2036 mlog(ML_ERROR, "The journal cannot address the entire volume. " 2036 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2037 "Enable the 'block64' journal option with tunefs.ocfs2"); 2037 "Enable the 'block64' journal option with tunefs.ocfs2");
2038 status = -EFBIG; 2038 status = -EFBIG;
2039 goto out; 2039 goto out;
2040 } 2040 }
2041 2041
2042 out: 2042 out:
2043 return status; 2043 return status;
2044 } 2044 }
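
ocfs2_journal_addressable() only has work to do on volumes whose highest block number overflows 32 bits; below that, the (u32)~0ULL comparison succeeds and the journal's feature bits never matter. With 4 KiB blocks, 2^32 blocks is 16 TiB, so that is roughly where the 'block64'/JBD2 64-bit requirement starts. A hedged sketch of just the cutoff test, with an assumed volume geometry:

/* Sketch of the "does the highest block fit in 32 bits?" test made by
 * ocfs2_journal_addressable(); the volume geometry below is assumed. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t clusters_at_boot = 20ULL << 20;  /* ~21M clusters (assumed)          */
	unsigned cluster_to_block = 8;            /* 256 blocks per cluster (assumed) */
	uint64_t max_block = (clusters_at_boot << cluster_to_block) - 1;

	if (max_block <= (uint32_t)~0U)
		printf("32-bit block numbers are enough for this volume\n");
	else
		printf("a 64-bit capable journal (JBD2 64BIT / 'block64') is required\n");
	return 0;
}
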
2045 2045
2046 static int ocfs2_initialize_super(struct super_block *sb, 2046 static int ocfs2_initialize_super(struct super_block *sb,
2047 struct buffer_head *bh, 2047 struct buffer_head *bh,
2048 int sector_size, 2048 int sector_size,
2049 struct ocfs2_blockcheck_stats *stats) 2049 struct ocfs2_blockcheck_stats *stats)
2050 { 2050 {
2051 int status; 2051 int status;
2052 int i, cbits, bbits; 2052 int i, cbits, bbits;
2053 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2053 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2054 struct inode *inode = NULL; 2054 struct inode *inode = NULL;
2055 struct ocfs2_journal *journal; 2055 struct ocfs2_journal *journal;
2056 struct ocfs2_super *osb; 2056 struct ocfs2_super *osb;
2057 u64 total_blocks; 2057 u64 total_blocks;
2058 2058
2059 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); 2059 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
2060 if (!osb) { 2060 if (!osb) {
2061 status = -ENOMEM; 2061 status = -ENOMEM;
2062 mlog_errno(status); 2062 mlog_errno(status);
2063 goto bail; 2063 goto bail;
2064 } 2064 }
2065 2065
2066 sb->s_fs_info = osb; 2066 sb->s_fs_info = osb;
2067 sb->s_op = &ocfs2_sops; 2067 sb->s_op = &ocfs2_sops;
2068 sb->s_d_op = &ocfs2_dentry_ops; 2068 sb->s_d_op = &ocfs2_dentry_ops;
2069 sb->s_export_op = &ocfs2_export_ops; 2069 sb->s_export_op = &ocfs2_export_ops;
2070 sb->s_qcop = &ocfs2_quotactl_ops; 2070 sb->s_qcop = &ocfs2_quotactl_ops;
2071 sb->dq_op = &ocfs2_quota_operations; 2071 sb->dq_op = &ocfs2_quota_operations;
2072 sb->s_xattr = ocfs2_xattr_handlers; 2072 sb->s_xattr = ocfs2_xattr_handlers;
2073 sb->s_time_gran = 1; 2073 sb->s_time_gran = 1;
2074 sb->s_flags |= MS_NOATIME; 2074 sb->s_flags |= MS_NOATIME;
2075 /* this is needed to support O_LARGEFILE */ 2075 /* this is needed to support O_LARGEFILE */
2076 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2076 cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2077 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 2077 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
2078 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 2078 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
2079 2079
2080 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; 2080 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
2081 2081
2082 for (i = 0; i < 3; i++) 2082 for (i = 0; i < 3; i++)
2083 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); 2083 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
2084 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2084 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2085 2085
2086 osb->sb = sb; 2086 osb->sb = sb;
2087 /* Save off for ocfs2_rw_direct */ 2087 /* Save off for ocfs2_rw_direct */
2088 osb->s_sectsize_bits = blksize_bits(sector_size); 2088 osb->s_sectsize_bits = blksize_bits(sector_size);
2089 BUG_ON(!osb->s_sectsize_bits); 2089 BUG_ON(!osb->s_sectsize_bits);
2090 2090
2091 spin_lock_init(&osb->dc_task_lock); 2091 spin_lock_init(&osb->dc_task_lock);
2092 init_waitqueue_head(&osb->dc_event); 2092 init_waitqueue_head(&osb->dc_event);
2093 osb->dc_work_sequence = 0; 2093 osb->dc_work_sequence = 0;
2094 osb->dc_wake_sequence = 0; 2094 osb->dc_wake_sequence = 0;
2095 INIT_LIST_HEAD(&osb->blocked_lock_list); 2095 INIT_LIST_HEAD(&osb->blocked_lock_list);
2096 osb->blocked_lock_count = 0; 2096 osb->blocked_lock_count = 0;
2097 spin_lock_init(&osb->osb_lock); 2097 spin_lock_init(&osb->osb_lock);
2098 spin_lock_init(&osb->osb_xattr_lock); 2098 spin_lock_init(&osb->osb_xattr_lock);
2099 ocfs2_init_steal_slots(osb); 2099 ocfs2_init_steal_slots(osb);
2100 2100
2101 mutex_init(&osb->system_file_mutex); 2101 mutex_init(&osb->system_file_mutex);
2102 2102
2103 atomic_set(&osb->alloc_stats.moves, 0); 2103 atomic_set(&osb->alloc_stats.moves, 0);
2104 atomic_set(&osb->alloc_stats.local_data, 0); 2104 atomic_set(&osb->alloc_stats.local_data, 0);
2105 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2105 atomic_set(&osb->alloc_stats.bitmap_data, 0);
2106 atomic_set(&osb->alloc_stats.bg_allocs, 0); 2106 atomic_set(&osb->alloc_stats.bg_allocs, 0);
2107 atomic_set(&osb->alloc_stats.bg_extends, 0); 2107 atomic_set(&osb->alloc_stats.bg_extends, 0);
2108 2108
2109 /* Copy the blockcheck stats from the superblock probe */ 2109 /* Copy the blockcheck stats from the superblock probe */
2110 osb->osb_ecc_stats = *stats; 2110 osb->osb_ecc_stats = *stats;
2111 2111
2112 ocfs2_init_node_maps(osb); 2112 ocfs2_init_node_maps(osb);
2113 2113
2114 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2114 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2115 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2115 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2116 2116
2117 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); 2117 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2118 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { 2118 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2119 mlog(ML_ERROR, "Invalid number of node slots (%u)\n", 2119 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2120 osb->max_slots); 2120 osb->max_slots);
2121 status = -EINVAL; 2121 status = -EINVAL;
2122 goto bail; 2122 goto bail;
2123 } 2123 }
2124 2124
2125 ocfs2_orphan_scan_init(osb); 2125 ocfs2_orphan_scan_init(osb);
2126 2126
2127 status = ocfs2_recovery_init(osb); 2127 status = ocfs2_recovery_init(osb);
2128 if (status) { 2128 if (status) {
2129 mlog(ML_ERROR, "Unable to initialize recovery state\n"); 2129 mlog(ML_ERROR, "Unable to initialize recovery state\n");
2130 mlog_errno(status); 2130 mlog_errno(status);
2131 goto bail; 2131 goto bail;
2132 } 2132 }
2133 2133
2134 init_waitqueue_head(&osb->checkpoint_event); 2134 init_waitqueue_head(&osb->checkpoint_event);
2135 2135
2136 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 2136 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
2137 2137
2138 osb->slot_num = OCFS2_INVALID_SLOT; 2138 osb->slot_num = OCFS2_INVALID_SLOT;
2139 2139
2140 osb->s_xattr_inline_size = le16_to_cpu( 2140 osb->s_xattr_inline_size = le16_to_cpu(
2141 di->id2.i_super.s_xattr_inline_size); 2141 di->id2.i_super.s_xattr_inline_size);
2142 2142
2143 osb->local_alloc_state = OCFS2_LA_UNUSED; 2143 osb->local_alloc_state = OCFS2_LA_UNUSED;
2144 osb->local_alloc_bh = NULL; 2144 osb->local_alloc_bh = NULL;
2145 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); 2145 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
2146 2146
2147 init_waitqueue_head(&osb->osb_mount_event); 2147 init_waitqueue_head(&osb->osb_mount_event);
2148 2148
2149 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); 2149 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2150 if (status) { 2150 if (status) {
2151 mlog_errno(status); 2151 mlog_errno(status);
2152 goto bail; 2152 goto bail;
2153 } 2153 }
2154 2154
2155 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2155 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2156 if (!osb->vol_label) { 2156 if (!osb->vol_label) {
2157 mlog(ML_ERROR, "unable to alloc vol label\n"); 2157 mlog(ML_ERROR, "unable to alloc vol label\n");
2158 status = -ENOMEM; 2158 status = -ENOMEM;
2159 goto bail; 2159 goto bail;
2160 } 2160 }
2161 2161
2162 osb->slot_recovery_generations = 2162 osb->slot_recovery_generations =
2163 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2163 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2164 GFP_KERNEL); 2164 GFP_KERNEL);
2165 if (!osb->slot_recovery_generations) { 2165 if (!osb->slot_recovery_generations) {
2166 status = -ENOMEM; 2166 status = -ENOMEM;
2167 mlog_errno(status); 2167 mlog_errno(status);
2168 goto bail; 2168 goto bail;
2169 } 2169 }
2170 2170
2171 init_waitqueue_head(&osb->osb_wipe_event); 2171 init_waitqueue_head(&osb->osb_wipe_event);
2172 osb->osb_orphan_wipes = kcalloc(osb->max_slots, 2172 osb->osb_orphan_wipes = kcalloc(osb->max_slots,
2173 sizeof(*osb->osb_orphan_wipes), 2173 sizeof(*osb->osb_orphan_wipes),
2174 GFP_KERNEL); 2174 GFP_KERNEL);
2175 if (!osb->osb_orphan_wipes) { 2175 if (!osb->osb_orphan_wipes) {
2176 status = -ENOMEM; 2176 status = -ENOMEM;
2177 mlog_errno(status); 2177 mlog_errno(status);
2178 goto bail; 2178 goto bail;
2179 } 2179 }
2180 2180
2181 osb->osb_rf_lock_tree = RB_ROOT; 2181 osb->osb_rf_lock_tree = RB_ROOT;
2182 2182
2183 osb->s_feature_compat = 2183 osb->s_feature_compat =
2184 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 2184 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
2185 osb->s_feature_ro_compat = 2185 osb->s_feature_ro_compat =
2186 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); 2186 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
2187 osb->s_feature_incompat = 2187 osb->s_feature_incompat =
2188 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); 2188 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
2189 2189
2190 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { 2190 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
2191 mlog(ML_ERROR, "couldn't mount because of unsupported " 2191 mlog(ML_ERROR, "couldn't mount because of unsupported "
2192 "optional features (%x).\n", i); 2192 "optional features (%x).\n", i);
2193 status = -EINVAL; 2193 status = -EINVAL;
2194 goto bail; 2194 goto bail;
2195 } 2195 }
2196 if (!(osb->sb->s_flags & MS_RDONLY) && 2196 if (!(osb->sb->s_flags & MS_RDONLY) &&
2197 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { 2197 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
2198 mlog(ML_ERROR, "couldn't mount RDWR because of " 2198 mlog(ML_ERROR, "couldn't mount RDWR because of "
2199 "unsupported optional features (%x).\n", i); 2199 "unsupported optional features (%x).\n", i);
2200 status = -EINVAL; 2200 status = -EINVAL;
2201 goto bail; 2201 goto bail;
2202 } 2202 }
2203 2203
2204 if (ocfs2_clusterinfo_valid(osb)) { 2204 if (ocfs2_clusterinfo_valid(osb)) {
2205 osb->osb_stackflags = 2205 osb->osb_stackflags =
2206 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2206 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2207 strlcpy(osb->osb_cluster_stack, 2207 strlcpy(osb->osb_cluster_stack,
2208 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2208 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2209 OCFS2_STACK_LABEL_LEN + 1); 2209 OCFS2_STACK_LABEL_LEN + 1);
2210 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2210 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2211 mlog(ML_ERROR, 2211 mlog(ML_ERROR,
2212 "couldn't mount because of an invalid " 2212 "couldn't mount because of an invalid "
2213 "cluster stack label (%s) \n", 2213 "cluster stack label (%s) \n",
2214 osb->osb_cluster_stack); 2214 osb->osb_cluster_stack);
2215 status = -EINVAL; 2215 status = -EINVAL;
2216 goto bail; 2216 goto bail;
2217 } 2217 }
2218 strlcpy(osb->osb_cluster_name, 2218 strlcpy(osb->osb_cluster_name,
2219 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, 2219 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2220 OCFS2_CLUSTER_NAME_LEN + 1); 2220 OCFS2_CLUSTER_NAME_LEN + 1);
2221 } else { 2221 } else {
2222 /* The empty string is identical with classic tools that 2222 /* The empty string is identical with classic tools that
2223 * don't know about s_cluster_info. */ 2223 * don't know about s_cluster_info. */
2224 osb->osb_cluster_stack[0] = '\0'; 2224 osb->osb_cluster_stack[0] = '\0';
2225 } 2225 }
2226 2226
2227 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 2227 get_random_bytes(&osb->s_next_generation, sizeof(u32));
2228 2228
2229 /* FIXME 2229 /* FIXME
2230 * This should be done in ocfs2_journal_init(), but unknown 2230 * This should be done in ocfs2_journal_init(), but unknown
2231 * ordering issues will cause the filesystem to crash. 2231 * ordering issues will cause the filesystem to crash.
2232 * If anyone wants to figure out what part of the code 2232 * If anyone wants to figure out what part of the code
2233 * refers to osb->journal before ocfs2_journal_init() is run, 2233 * refers to osb->journal before ocfs2_journal_init() is run,
2234 * be my guest. 2234 * be my guest.
2235 */ 2235 */
2236 /* initialize our journal structure */ 2236 /* initialize our journal structure */
2237 2237
2238 journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); 2238 journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
2239 if (!journal) { 2239 if (!journal) {
2240 mlog(ML_ERROR, "unable to alloc journal\n"); 2240 mlog(ML_ERROR, "unable to alloc journal\n");
2241 status = -ENOMEM; 2241 status = -ENOMEM;
2242 goto bail; 2242 goto bail;
2243 } 2243 }
2244 osb->journal = journal; 2244 osb->journal = journal;
2245 journal->j_osb = osb; 2245 journal->j_osb = osb;
2246 2246
2247 atomic_set(&journal->j_num_trans, 0); 2247 atomic_set(&journal->j_num_trans, 0);
2248 init_rwsem(&journal->j_trans_barrier); 2248 init_rwsem(&journal->j_trans_barrier);
2249 init_waitqueue_head(&journal->j_checkpointed); 2249 init_waitqueue_head(&journal->j_checkpointed);
2250 spin_lock_init(&journal->j_lock); 2250 spin_lock_init(&journal->j_lock);
2251 journal->j_trans_id = (unsigned long) 1; 2251 journal->j_trans_id = (unsigned long) 1;
2252 INIT_LIST_HEAD(&journal->j_la_cleanups); 2252 INIT_LIST_HEAD(&journal->j_la_cleanups);
2253 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2253 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2254 journal->j_state = OCFS2_JOURNAL_FREE; 2254 journal->j_state = OCFS2_JOURNAL_FREE;
2255 2255
2256 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); 2256 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2257 init_llist_head(&osb->dquot_drop_list); 2257 init_llist_head(&osb->dquot_drop_list);
2258 2258
2259 /* get some pseudo constants for clustersize bits */ 2259 /* get some pseudo constants for clustersize bits */
2260 osb->s_clustersize_bits = 2260 osb->s_clustersize_bits =
2261 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2261 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2262 osb->s_clustersize = 1 << osb->s_clustersize_bits; 2262 osb->s_clustersize = 1 << osb->s_clustersize_bits;
2263 2263
2264 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || 2264 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
2265 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { 2265 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
2266 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", 2266 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
2267 osb->s_clustersize); 2267 osb->s_clustersize);
2268 status = -EINVAL; 2268 status = -EINVAL;
2269 goto bail; 2269 goto bail;
2270 } 2270 }
2271 2271
2272 total_blocks = ocfs2_clusters_to_blocks(osb->sb, 2272 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2273 le32_to_cpu(di->i_clusters)); 2273 le32_to_cpu(di->i_clusters));
2274 2274
2275 status = generic_check_addressable(osb->sb->s_blocksize_bits, 2275 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2276 total_blocks); 2276 total_blocks);
2277 if (status) { 2277 if (status) {
2278 mlog(ML_ERROR, "Volume too large " 2278 mlog(ML_ERROR, "Volume too large "
2279 "to mount safely on this system"); 2279 "to mount safely on this system");
2280 status = -EFBIG; 2280 status = -EFBIG;
2281 goto bail; 2281 goto bail;
2282 } 2282 }
2283 2283
2284 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, 2284 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
2285 sizeof(di->id2.i_super.s_uuid))) { 2285 sizeof(di->id2.i_super.s_uuid))) {
2286 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); 2286 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
2287 status = -ENOMEM; 2287 status = -ENOMEM;
2288 goto bail; 2288 goto bail;
2289 } 2289 }
2290 2290
2291 strlcpy(osb->vol_label, di->id2.i_super.s_label, 2291 strlcpy(osb->vol_label, di->id2.i_super.s_label,
2292 OCFS2_MAX_VOL_LABEL_LEN); 2292 OCFS2_MAX_VOL_LABEL_LEN);
2293 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2293 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
2294 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); 2294 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
2295 osb->first_cluster_group_blkno = 2295 osb->first_cluster_group_blkno =
2296 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 2296 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
2297 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 2297 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
2298 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2298 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2299 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, 2299 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
2300 (unsigned long long)osb->root_blkno, 2300 (unsigned long long)osb->root_blkno,
2301 (unsigned long long)osb->system_dir_blkno, 2301 (unsigned long long)osb->system_dir_blkno,
2302 osb->s_clustersize_bits); 2302 osb->s_clustersize_bits);
2303 2303
2304 osb->osb_dlm_debug = ocfs2_new_dlm_debug(); 2304 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
2305 if (!osb->osb_dlm_debug) { 2305 if (!osb->osb_dlm_debug) {
2306 status = -ENOMEM; 2306 status = -ENOMEM;
2307 mlog_errno(status); 2307 mlog_errno(status);
2308 goto bail; 2308 goto bail;
2309 } 2309 }
2310 2310
2311 atomic_set(&osb->vol_state, VOLUME_INIT); 2311 atomic_set(&osb->vol_state, VOLUME_INIT);
2312 2312
2313 /* load root, system_dir, and all global system inodes */ 2313 /* load root, system_dir, and all global system inodes */
2314 status = ocfs2_init_global_system_inodes(osb); 2314 status = ocfs2_init_global_system_inodes(osb);
2315 if (status < 0) { 2315 if (status < 0) {
2316 mlog_errno(status); 2316 mlog_errno(status);
2317 goto bail; 2317 goto bail;
2318 } 2318 }
2319 2319
2320 /* 2320 /*
2321 * global bitmap 2321 * global bitmap
2322 */ 2322 */
2323 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 2323 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
2324 OCFS2_INVALID_SLOT); 2324 OCFS2_INVALID_SLOT);
2325 if (!inode) { 2325 if (!inode) {
2326 status = -EINVAL; 2326 status = -EINVAL;
2327 mlog_errno(status); 2327 mlog_errno(status);
2328 goto bail; 2328 goto bail;
2329 } 2329 }
2330 2330
2331 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2331 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2332 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; 2332 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2333 iput(inode); 2333 iput(inode);
2334 2334
2335 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, 2335 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2336 osb->s_feature_incompat) * 8; 2336 osb->s_feature_incompat) * 8;
2337 2337
2338 status = ocfs2_init_slot_info(osb); 2338 status = ocfs2_init_slot_info(osb);
2339 if (status < 0) { 2339 if (status < 0) {
2340 mlog_errno(status); 2340 mlog_errno(status);
2341 goto bail; 2341 goto bail;
2342 } 2342 }
2343 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); 2343 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
2344 2344
2345 bail: 2345 bail:
2346 return status; 2346 return status;
2347 } 2347 }
2348 2348
2349 /* 2349 /*
2350 * will return: -EAGAIN if it is ok to keep searching for superblocks 2350 * will return: -EAGAIN if it is ok to keep searching for superblocks
2351 * -EINVAL if there is a bad superblock 2351 * -EINVAL if there is a bad superblock
2352 * 0 on success 2352 * 0 on success
2353 */ 2353 */
2354 static int ocfs2_verify_volume(struct ocfs2_dinode *di, 2354 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2355 struct buffer_head *bh, 2355 struct buffer_head *bh,
2356 u32 blksz, 2356 u32 blksz,
2357 struct ocfs2_blockcheck_stats *stats) 2357 struct ocfs2_blockcheck_stats *stats)
2358 { 2358 {
2359 int status = -EAGAIN; 2359 int status = -EAGAIN;
2360 2360
2361 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 2361 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
2362 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 2362 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
2363 /* We have to do a raw check of the feature here */ 2363 /* We have to do a raw check of the feature here */
2364 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & 2364 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
2365 OCFS2_FEATURE_INCOMPAT_META_ECC) { 2365 OCFS2_FEATURE_INCOMPAT_META_ECC) {
2366 status = ocfs2_block_check_validate(bh->b_data, 2366 status = ocfs2_block_check_validate(bh->b_data,
2367 bh->b_size, 2367 bh->b_size,
2368 &di->i_check, 2368 &di->i_check,
2369 stats); 2369 stats);
2370 if (status) 2370 if (status)
2371 goto out; 2371 goto out;
2372 } 2372 }
2373 status = -EINVAL; 2373 status = -EINVAL;
2374 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2374 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
2375 mlog(ML_ERROR, "found superblock with incorrect block " 2375 mlog(ML_ERROR, "found superblock with incorrect block "
2376 "size: found %u, should be %u\n", 2376 "size: found %u, should be %u\n",
2377 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), 2377 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
2378 blksz); 2378 blksz);
2379 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != 2379 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
2380 OCFS2_MAJOR_REV_LEVEL || 2380 OCFS2_MAJOR_REV_LEVEL ||
2381 le16_to_cpu(di->id2.i_super.s_minor_rev_level) != 2381 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
2382 OCFS2_MINOR_REV_LEVEL) { 2382 OCFS2_MINOR_REV_LEVEL) {
2383 mlog(ML_ERROR, "found superblock with bad version: " 2383 mlog(ML_ERROR, "found superblock with bad version: "
2384 "found %u.%u, should be %u.%u\n", 2384 "found %u.%u, should be %u.%u\n",
2385 le16_to_cpu(di->id2.i_super.s_major_rev_level), 2385 le16_to_cpu(di->id2.i_super.s_major_rev_level),
2386 le16_to_cpu(di->id2.i_super.s_minor_rev_level), 2386 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
2387 OCFS2_MAJOR_REV_LEVEL, 2387 OCFS2_MAJOR_REV_LEVEL,
2388 OCFS2_MINOR_REV_LEVEL); 2388 OCFS2_MINOR_REV_LEVEL);
2389 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { 2389 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
2390 mlog(ML_ERROR, "bad block number on superblock: " 2390 mlog(ML_ERROR, "bad block number on superblock: "
2391 "found %llu, should be %llu\n", 2391 "found %llu, should be %llu\n",
2392 (unsigned long long)le64_to_cpu(di->i_blkno), 2392 (unsigned long long)le64_to_cpu(di->i_blkno),
2393 (unsigned long long)bh->b_blocknr); 2393 (unsigned long long)bh->b_blocknr);
2394 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || 2394 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
2395 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { 2395 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
2396 mlog(ML_ERROR, "bad cluster size found: %u\n", 2396 mlog(ML_ERROR, "bad cluster size found: %u\n",
2397 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); 2397 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
2398 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { 2398 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
2399 mlog(ML_ERROR, "bad root_blkno: 0\n"); 2399 mlog(ML_ERROR, "bad root_blkno: 0\n");
2400 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { 2400 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
2401 mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); 2401 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
2402 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { 2402 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
2403 mlog(ML_ERROR, 2403 mlog(ML_ERROR,
2404 "Superblock slots found greater than file system " 2404 "Superblock slots found greater than file system "
2405 "maximum: found %u, max %u\n", 2405 "maximum: found %u, max %u\n",
2406 le16_to_cpu(di->id2.i_super.s_max_slots), 2406 le16_to_cpu(di->id2.i_super.s_max_slots),
2407 OCFS2_MAX_SLOTS); 2407 OCFS2_MAX_SLOTS);
2408 } else { 2408 } else {
2409 /* found it! */ 2409 /* found it! */
2410 status = 0; 2410 status = 0;
2411 } 2411 }
2412 } 2412 }
2413 2413
2414 out: 2414 out:
2415 if (status && status != -EAGAIN) 2415 if (status && status != -EAGAIN)
2416 mlog_errno(status); 2416 mlog_errno(status);
2417 return status; 2417 return status;
2418 } 2418 }
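
A rough, hypothetical sketch (not part of this commit) of how the -EAGAIN / -EINVAL / 0 contract documented above is meant to drive a probing caller; read_candidate_block() is an assumed helper used only for illustration:

static int probe_one_blocksize(struct super_block *sb, u32 blksz,
			       struct ocfs2_blockcheck_stats *stats)
{
	struct buffer_head *bh = read_candidate_block(sb, blksz); /* assumed helper */
	struct ocfs2_dinode *di;
	int status;

	if (!bh)
		return -EIO;

	di = (struct ocfs2_dinode *)bh->b_data;
	status = ocfs2_verify_volume(di, bh, blksz, stats);
	brelse(bh);

	if (status == -EAGAIN)	/* no signature at this block size: keep probing */
		return -EAGAIN;
	if (status < 0)		/* a superblock was found, but it is bad: stop */
		return status;
	return 0;		/* valid superblock: blksz is the right block size */
}
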
2419 2419
2420 static int ocfs2_check_volume(struct ocfs2_super *osb) 2420 static int ocfs2_check_volume(struct ocfs2_super *osb)
2421 { 2421 {
2422 int status; 2422 int status;
2423 int dirty; 2423 int dirty;
2424 int local; 2424 int local;
2425 struct ocfs2_dinode *local_alloc = NULL; /* only used if we 2425 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
2426 * recover 2426 * recover
2427 * ourselves. */ 2427 * ourselves. */
2428 2428
2429 /* Init our journal object. */ 2429 /* Init our journal object. */
2430 status = ocfs2_journal_init(osb->journal, &dirty); 2430 status = ocfs2_journal_init(osb->journal, &dirty);
2431 if (status < 0) { 2431 if (status < 0) {
2432 mlog(ML_ERROR, "Could not initialize journal!\n"); 2432 mlog(ML_ERROR, "Could not initialize journal!\n");
2433 goto finally; 2433 goto finally;
2434 } 2434 }
2435 2435
2436 /* Now that journal has been initialized, check to make sure 2436 /* Now that journal has been initialized, check to make sure
2437 entire volume is addressable. */ 2437 entire volume is addressable. */
2438 status = ocfs2_journal_addressable(osb); 2438 status = ocfs2_journal_addressable(osb);
2439 if (status) 2439 if (status)
2440 goto finally; 2440 goto finally;
2441 2441
2442 /* If the journal was unmounted cleanly then we don't want to 2442 /* If the journal was unmounted cleanly then we don't want to
2443 * recover anything. Otherwise, journal_load will do that 2443 * recover anything. Otherwise, journal_load will do that
2444 * dirty work for us :) */ 2444 * dirty work for us :) */
2445 if (!dirty) { 2445 if (!dirty) {
2446 status = ocfs2_journal_wipe(osb->journal, 0); 2446 status = ocfs2_journal_wipe(osb->journal, 0);
2447 if (status < 0) { 2447 if (status < 0) {
2448 mlog_errno(status); 2448 mlog_errno(status);
2449 goto finally; 2449 goto finally;
2450 } 2450 }
2451 } else { 2451 } else {
2452 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " 2452 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2453 "unmounted cleanly, recovering it.\n", osb->dev_str); 2453 "unmounted cleanly, recovering it.\n", osb->dev_str);
2454 } 2454 }
2455 2455
2456 local = ocfs2_mount_local(osb); 2456 local = ocfs2_mount_local(osb);
2457 2457
2458 /* will play back anything left in the journal. */ 2458 /* will play back anything left in the journal. */
2459 status = ocfs2_journal_load(osb->journal, local, dirty); 2459 status = ocfs2_journal_load(osb->journal, local, dirty);
2460 if (status < 0) { 2460 if (status < 0) {
2461 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); 2461 mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
2462 goto finally; 2462 goto finally;
2463 } 2463 }
2464 2464
2465 if (dirty) { 2465 if (dirty) {
2466 /* recover my local alloc if we didn't unmount cleanly. */ 2466 /* recover my local alloc if we didn't unmount cleanly. */
2467 status = ocfs2_begin_local_alloc_recovery(osb, 2467 status = ocfs2_begin_local_alloc_recovery(osb,
2468 osb->slot_num, 2468 osb->slot_num,
2469 &local_alloc); 2469 &local_alloc);
2470 if (status < 0) { 2470 if (status < 0) {
2471 mlog_errno(status); 2471 mlog_errno(status);
2472 goto finally; 2472 goto finally;
2473 } 2473 }
2474 /* we complete the recovery process after we've marked 2474 /* we complete the recovery process after we've marked
2475 * ourselves as mounted. */ 2475 * ourselves as mounted. */
2476 } 2476 }
2477 2477
2478 status = ocfs2_load_local_alloc(osb); 2478 status = ocfs2_load_local_alloc(osb);
2479 if (status < 0) { 2479 if (status < 0) {
2480 mlog_errno(status); 2480 mlog_errno(status);
2481 goto finally; 2481 goto finally;
2482 } 2482 }
2483 2483
2484 if (dirty) { 2484 if (dirty) {
2485 /* Recovery will be completed after we've mounted the 2485 /* Recovery will be completed after we've mounted the
2486 * rest of the volume. */ 2486 * rest of the volume. */
2487 osb->dirty = 1; 2487 osb->dirty = 1;
2488 osb->local_alloc_copy = local_alloc; 2488 osb->local_alloc_copy = local_alloc;
2489 local_alloc = NULL; 2489 local_alloc = NULL;
2490 } 2490 }
2491 2491
2492 /* Go through each journal, trylock it, and if we get the 2492 /* Go through each journal, trylock it, and if we get the
2493 * lock and it's marked as dirty, set the bit in the recovery 2493 * lock and it's marked as dirty, set the bit in the recovery
2494 * map and launch a recovery thread for it. */ 2494 * map and launch a recovery thread for it. */
2495 status = ocfs2_mark_dead_nodes(osb); 2495 status = ocfs2_mark_dead_nodes(osb);
2496 if (status < 0) { 2496 if (status < 0) {
2497 mlog_errno(status); 2497 mlog_errno(status);
2498 goto finally; 2498 goto finally;
2499 } 2499 }
2500 2500
2501 status = ocfs2_compute_replay_slots(osb); 2501 status = ocfs2_compute_replay_slots(osb);
2502 if (status < 0) 2502 if (status < 0)
2503 mlog_errno(status); 2503 mlog_errno(status);
2504 2504
2505 finally: 2505 finally:
2506 kfree(local_alloc); 2506 kfree(local_alloc);
2507 2507
2508 if (status) 2508 if (status)
2509 mlog_errno(status); 2509 mlog_errno(status);
2510 return status; 2510 return status;
2511 } 2511 }
2512 2512
2513 /* 2513 /*
2514 * This routine gets called from dismount or close whenever a dismount of 2514 * This routine gets called from dismount or close whenever a dismount of
2515 * the volume is requested and the osb open count becomes 1. 2515 * the volume is requested and the osb open count becomes 1.
2516 * It will remove the osb from the global list and also free up all the 2516 * It will remove the osb from the global list and also free up all the
2517 * initialized resources and file objects. 2517 * initialized resources and file objects.
2518 */ 2518 */
2519 static void ocfs2_delete_osb(struct ocfs2_super *osb) 2519 static void ocfs2_delete_osb(struct ocfs2_super *osb)
2520 { 2520 {
2521 /* This function assumes that the caller has the main osb resource */ 2521 /* This function assumes that the caller has the main osb resource */
2522 2522
2523 ocfs2_free_slot_info(osb); 2523 ocfs2_free_slot_info(osb);
2524 2524
2525 kfree(osb->osb_orphan_wipes); 2525 kfree(osb->osb_orphan_wipes);
2526 kfree(osb->slot_recovery_generations); 2526 kfree(osb->slot_recovery_generations);
2527 /* FIXME 2527 /* FIXME
2528 * This belongs in journal shutdown, but because we have to 2528 * This belongs in journal shutdown, but because we have to
2529 * allocate osb->journal at the start of ocfs2_initialize_osb(), 2529 * allocate osb->journal at the start of ocfs2_initialize_osb(),
2530 * we free it here. 2530 * we free it here.
2531 */ 2531 */
2532 kfree(osb->journal); 2532 kfree(osb->journal);
2533 kfree(osb->local_alloc_copy); 2533 kfree(osb->local_alloc_copy);
2534 kfree(osb->uuid_str); 2534 kfree(osb->uuid_str);
2535 kfree(osb->vol_label);
2535 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2536 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2536 memset(osb, 0, sizeof(struct ocfs2_super)); 2537 memset(osb, 0, sizeof(struct ocfs2_super));
2537 } 2538 }
2538 2539
2539 /* Put OCFS2 into a readonly state, or (if the user specifies it), 2540 /* Put OCFS2 into a readonly state, or (if the user specifies it),
2540 * panic(). We do not support continue-on-error operation. */ 2541 * panic(). We do not support continue-on-error operation. */
2541 static void ocfs2_handle_error(struct super_block *sb) 2542 static void ocfs2_handle_error(struct super_block *sb)
2542 { 2543 {
2543 struct ocfs2_super *osb = OCFS2_SB(sb); 2544 struct ocfs2_super *osb = OCFS2_SB(sb);
2544 2545
2545 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) 2546 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
2546 panic("OCFS2: (device %s): panic forced after error\n", 2547 panic("OCFS2: (device %s): panic forced after error\n",
2547 sb->s_id); 2548 sb->s_id);
2548 2549
2549 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); 2550 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
2550 2551
2551 if (sb->s_flags & MS_RDONLY && 2552 if (sb->s_flags & MS_RDONLY &&
2552 (ocfs2_is_soft_readonly(osb) || 2553 (ocfs2_is_soft_readonly(osb) ||
2553 ocfs2_is_hard_readonly(osb))) 2554 ocfs2_is_hard_readonly(osb)))
2554 return; 2555 return;
2555 2556
2556 printk(KERN_CRIT "File system is now read-only due to the potential " 2557 printk(KERN_CRIT "File system is now read-only due to the potential "
2557 "of on-disk corruption. Please run fsck.ocfs2 once the file " 2558 "of on-disk corruption. Please run fsck.ocfs2 once the file "
2558 "system is unmounted.\n"); 2559 "system is unmounted.\n");
2559 sb->s_flags |= MS_RDONLY; 2560 sb->s_flags |= MS_RDONLY;
2560 ocfs2_set_ro_flag(osb, 0); 2561 ocfs2_set_ro_flag(osb, 0);
2561 } 2562 }
2562 2563
2563 static char error_buf[1024]; 2564 static char error_buf[1024];
2564 2565
2565 void __ocfs2_error(struct super_block *sb, 2566 void __ocfs2_error(struct super_block *sb,
2566 const char *function, 2567 const char *function,
2567 const char *fmt, ...) 2568 const char *fmt, ...)
2568 { 2569 {
2569 va_list args; 2570 va_list args;
2570 2571
2571 va_start(args, fmt); 2572 va_start(args, fmt);
2572 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2573 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2573 va_end(args); 2574 va_end(args);
2574 2575
2575 /* Not using mlog here because we want to show the actual 2576 /* Not using mlog here because we want to show the actual
2576 * function the error came from. */ 2577 * function the error came from. */
2577 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", 2578 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
2578 sb->s_id, function, error_buf); 2579 sb->s_id, function, error_buf);
2579 2580
2580 ocfs2_handle_error(sb); 2581 ocfs2_handle_error(sb);
2581 } 2582 }
2582 2583
2583 /* Handle critical errors. This is intentionally more drastic than 2584 /* Handle critical errors. This is intentionally more drastic than
2584 * ocfs2_handle_error, so we only use it for things like journal errors, 2585 * ocfs2_handle_error, so we only use it for things like journal errors,
2585 * etc. */ 2586 * etc. */
2586 void __ocfs2_abort(struct super_block* sb, 2587 void __ocfs2_abort(struct super_block* sb,
2587 const char *function, 2588 const char *function,
2588 const char *fmt, ...) 2589 const char *fmt, ...)
2589 { 2590 {
2590 va_list args; 2591 va_list args;
2591 2592
2592 va_start(args, fmt); 2593 va_start(args, fmt);
2593 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 2594 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
2594 va_end(args); 2595 va_end(args);
2595 2596
2596 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", 2597 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
2597 sb->s_id, function, error_buf); 2598 sb->s_id, function, error_buf);
2598 2599
2599 /* We don't have the cluster support yet to go straight to 2600 /* We don't have the cluster support yet to go straight to
2600 * hard readonly in here. Until then, we want to keep 2601 * hard readonly in here. Until then, we want to keep
2601 * ocfs2_abort() so that we can at least mark critical 2602 * ocfs2_abort() so that we can at least mark critical
2602 * errors. 2603 * errors.
2603 * 2604 *
2604 * TODO: This should abort the journal and alert other nodes 2605 * TODO: This should abort the journal and alert other nodes
2605 * that our slot needs recovery. */ 2606 * that our slot needs recovery. */
2606 2607
2607 /* Force a panic(). This stinks, but it's better than letting 2608 /* Force a panic(). This stinks, but it's better than letting
2608 * things continue without having a proper hard readonly 2609 * things continue without having a proper hard readonly
2609 * here. */ 2610 * here. */
2610 if (!ocfs2_mount_local(OCFS2_SB(sb))) 2611 if (!ocfs2_mount_local(OCFS2_SB(sb)))
2611 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 2612 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
2612 ocfs2_handle_error(sb); 2613 ocfs2_handle_error(sb);
2613 } 2614 }
2614 2615
2615 /* 2616 /*
2616 * These signal-blocking helpers return void, because in-kernel 2617 * These signal-blocking helpers return void, because in-kernel
2617 * sigprocmask() only fails when SIG_* is wrong. 2618 * sigprocmask() only fails when SIG_* is wrong.
2618 */ 2619 */
2619 void ocfs2_block_signals(sigset_t *oldset) 2620 void ocfs2_block_signals(sigset_t *oldset)
2620 { 2621 {
2621 int rc; 2622 int rc;
2622 sigset_t blocked; 2623 sigset_t blocked;
2623 2624
2624 sigfillset(&blocked); 2625 sigfillset(&blocked);
2625 rc = sigprocmask(SIG_BLOCK, &blocked, oldset); 2626 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2626 BUG_ON(rc); 2627 BUG_ON(rc);
2627 } 2628 }
2628 2629
2629 void ocfs2_unblock_signals(sigset_t *oldset) 2630 void ocfs2_unblock_signals(sigset_t *oldset)
2630 { 2631 {
2631 int rc = sigprocmask(SIG_SETMASK, oldset, NULL); 2632 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2632 BUG_ON(rc); 2633 BUG_ON(rc);
2633 } 2634 }
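
A minimal usage sketch of the two helpers above (illustrative only; the caller is hypothetical): the saved mask must always be restored by the matching unblock call, so the pair brackets the signal-sensitive region:

static void example_signal_safe_region(void)
{
	sigset_t oldset;

	ocfs2_block_signals(&oldset);	/* block all signals, save the old mask */
	/* ... work that must not be interrupted by signal delivery ... */
	ocfs2_unblock_signals(&oldset);	/* restore the caller's original mask */
}
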
2634 2635
2635 module_init(ocfs2_init); 2636 module_init(ocfs2_init);
2636 module_exit(ocfs2_exit); 2637 module_exit(ocfs2_exit);
2637 2638
1 #include <linux/mm.h> 1 #include <linux/mm.h>
2 #include <linux/vmacache.h> 2 #include <linux/vmacache.h>
3 #include <linux/hugetlb.h> 3 #include <linux/hugetlb.h>
4 #include <linux/huge_mm.h> 4 #include <linux/huge_mm.h>
5 #include <linux/mount.h> 5 #include <linux/mount.h>
6 #include <linux/seq_file.h> 6 #include <linux/seq_file.h>
7 #include <linux/highmem.h> 7 #include <linux/highmem.h>
8 #include <linux/ptrace.h> 8 #include <linux/ptrace.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h>
11 #include <linux/mempolicy.h> 11 #include <linux/mempolicy.h>
12 #include <linux/rmap.h> 12 #include <linux/rmap.h>
13 #include <linux/swap.h> 13 #include <linux/swap.h>
14 #include <linux/swapops.h> 14 #include <linux/swapops.h>
15 #include <linux/mmu_notifier.h> 15 #include <linux/mmu_notifier.h>
16 16
17 #include <asm/elf.h> 17 #include <asm/elf.h>
18 #include <asm/uaccess.h> 18 #include <asm/uaccess.h>
19 #include <asm/tlbflush.h> 19 #include <asm/tlbflush.h>
20 #include "internal.h" 20 #include "internal.h"
21 21
22 void task_mem(struct seq_file *m, struct mm_struct *mm) 22 void task_mem(struct seq_file *m, struct mm_struct *mm)
23 { 23 {
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
28 * Note: to minimize their overhead, mm maintains hiwater_vm and 28 * Note: to minimize their overhead, mm maintains hiwater_vm and
29 * hiwater_rss only when about to *lower* total_vm or rss. Any 29 * hiwater_rss only when about to *lower* total_vm or rss. Any
30 * collector of these hiwater stats must therefore get total_vm 30 * collector of these hiwater stats must therefore get total_vm
31 * and rss too, which will usually be the higher. Barriers? not 31 * and rss too, which will usually be the higher. Barriers? not
32 * worth the effort, such snapshots can always be inconsistent. 32 * worth the effort, such snapshots can always be inconsistent.
33 */ 33 */
34 hiwater_vm = total_vm = mm->total_vm; 34 hiwater_vm = total_vm = mm->total_vm;
35 if (hiwater_vm < mm->hiwater_vm) 35 if (hiwater_vm < mm->hiwater_vm)
36 hiwater_vm = mm->hiwater_vm; 36 hiwater_vm = mm->hiwater_vm;
37 hiwater_rss = total_rss = get_mm_rss(mm); 37 hiwater_rss = total_rss = get_mm_rss(mm);
38 if (hiwater_rss < mm->hiwater_rss) 38 if (hiwater_rss < mm->hiwater_rss)
39 hiwater_rss = mm->hiwater_rss; 39 hiwater_rss = mm->hiwater_rss;
40 40
41 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 41 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 seq_printf(m, 45 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 46 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 47 "VmSize:\t%8lu kB\n"
48 "VmLck:\t%8lu kB\n" 48 "VmLck:\t%8lu kB\n"
49 "VmPin:\t%8lu kB\n" 49 "VmPin:\t%8lu kB\n"
50 "VmHWM:\t%8lu kB\n" 50 "VmHWM:\t%8lu kB\n"
51 "VmRSS:\t%8lu kB\n" 51 "VmRSS:\t%8lu kB\n"
52 "VmData:\t%8lu kB\n" 52 "VmData:\t%8lu kB\n"
53 "VmStk:\t%8lu kB\n" 53 "VmStk:\t%8lu kB\n"
54 "VmExe:\t%8lu kB\n" 54 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 55 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 56 "VmPTE:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 57 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 58 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 59 total_vm << (PAGE_SHIFT-10),
60 mm->locked_vm << (PAGE_SHIFT-10), 60 mm->locked_vm << (PAGE_SHIFT-10),
61 mm->pinned_vm << (PAGE_SHIFT-10), 61 mm->pinned_vm << (PAGE_SHIFT-10),
62 hiwater_rss << (PAGE_SHIFT-10), 62 hiwater_rss << (PAGE_SHIFT-10),
63 total_rss << (PAGE_SHIFT-10), 63 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 64 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 65 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 66 (PTRS_PER_PTE * sizeof(pte_t) *
67 atomic_long_read(&mm->nr_ptes)) >> 10, 67 atomic_long_read(&mm->nr_ptes)) >> 10,
68 swap << (PAGE_SHIFT-10)); 68 swap << (PAGE_SHIFT-10));
69 } 69 }
70 70
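
The "<< (PAGE_SHIFT-10)" expressions in task_mem() above convert page counts straight to kB: shifting left by PAGE_SHIFT would give bytes, and shifting right by 10 turns bytes into kB, so a single shift by (PAGE_SHIFT - 10) does both. A standalone userspace sketch of the arithmetic, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;		/* assume 4096-byte pages */
	unsigned long total_vm_pages = 2560;	/* example page count */

	/* 2560 pages * 4 kB/page = 10240 kB */
	printf("VmSize:\t%8lu kB\n", total_vm_pages << (page_shift - 10));
	return 0;
}
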
71 unsigned long task_vsize(struct mm_struct *mm) 71 unsigned long task_vsize(struct mm_struct *mm)
72 { 72 {
73 return PAGE_SIZE * mm->total_vm; 73 return PAGE_SIZE * mm->total_vm;
74 } 74 }
75 75
76 unsigned long task_statm(struct mm_struct *mm, 76 unsigned long task_statm(struct mm_struct *mm,
77 unsigned long *shared, unsigned long *text, 77 unsigned long *shared, unsigned long *text,
78 unsigned long *data, unsigned long *resident) 78 unsigned long *data, unsigned long *resident)
79 { 79 {
80 *shared = get_mm_counter(mm, MM_FILEPAGES); 80 *shared = get_mm_counter(mm, MM_FILEPAGES);
81 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 81 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
82 >> PAGE_SHIFT; 82 >> PAGE_SHIFT;
83 *data = mm->total_vm - mm->shared_vm; 83 *data = mm->total_vm - mm->shared_vm;
84 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); 84 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
85 return mm->total_vm; 85 return mm->total_vm;
86 } 86 }
87 87
88 #ifdef CONFIG_NUMA 88 #ifdef CONFIG_NUMA
89 /* 89 /*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * These functions are for numa_maps but called in generic **maps seq_file
91 * ->start(), ->stop() ops. 91 * ->start(), ->stop() ops.
92 * 92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy. 93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here 94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing a dead mempolicy object. 95 * is how to avoid accessing a dead mempolicy object.
96 * 96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access 97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy; no vma object will drop its ref to the mempolicy. 98 * each vma's mempolicy; no vma object will drop its ref to the mempolicy.
99 * 99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy 100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). 101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot 102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * guarantee the task never exits under us. But taking task_lock() around 103 * guarantee the task never exits under us. But taking task_lock() around
104 * get_vma_policy() causes a lock order problem. 104 * get_vma_policy() causes a lock order problem.
105 * 105 *
106 * To access task->mempolicy without lock, we hold a reference count of an 106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee 107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses. 108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 109 */
110 static void hold_task_mempolicy(struct proc_maps_private *priv) 110 static void hold_task_mempolicy(struct proc_maps_private *priv)
111 { 111 {
112 struct task_struct *task = priv->task; 112 struct task_struct *task = priv->task;
113 113
114 task_lock(task); 114 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 115 priv->task_mempolicy = task->mempolicy;
116 mpol_get(priv->task_mempolicy); 116 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 117 task_unlock(task);
118 } 118 }
119 static void release_task_mempolicy(struct proc_maps_private *priv) 119 static void release_task_mempolicy(struct proc_maps_private *priv)
120 { 120 {
121 mpol_put(priv->task_mempolicy); 121 mpol_put(priv->task_mempolicy);
122 } 122 }
123 #else 123 #else
124 static void hold_task_mempolicy(struct proc_maps_private *priv) 124 static void hold_task_mempolicy(struct proc_maps_private *priv)
125 { 125 {
126 } 126 }
127 static void release_task_mempolicy(struct proc_maps_private *priv) 127 static void release_task_mempolicy(struct proc_maps_private *priv)
128 { 128 {
129 } 129 }
130 #endif 130 #endif
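
The intended usage of the two helpers above is a hold -> walk -> release bracket around the vma scan, so that task->mempolicy stays alive for the whole walk. A simplified, hypothetical sketch (error handling omitted):

static void example_numa_walk(struct proc_maps_private *priv, struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	hold_task_mempolicy(priv);		/* pin priv->task->mempolicy */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		;				/* inspect vma / its mempolicy here */
	release_task_mempolicy(priv);		/* drop the pinned reference */
	up_read(&mm->mmap_sem);
}
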
131 131
132 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 132 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
133 { 133 {
134 if (vma && vma != priv->tail_vma) { 134 if (vma && vma != priv->tail_vma) {
135 struct mm_struct *mm = vma->vm_mm; 135 struct mm_struct *mm = vma->vm_mm;
136 release_task_mempolicy(priv); 136 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 137 up_read(&mm->mmap_sem);
138 mmput(mm); 138 mmput(mm);
139 } 139 }
140 } 140 }
141 141
142 static void *m_start(struct seq_file *m, loff_t *pos) 142 static void *m_start(struct seq_file *m, loff_t *pos)
143 { 143 {
144 struct proc_maps_private *priv = m->private; 144 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 145 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 146 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 147 struct vm_area_struct *vma, *tail_vma = NULL;
148 loff_t l = *pos; 148 loff_t l = *pos;
149 149
150 /* Clear the per syscall fields in priv */ 150 /* Clear the per syscall fields in priv */
151 priv->task = NULL; 151 priv->task = NULL;
152 priv->tail_vma = NULL; 152 priv->tail_vma = NULL;
153 153
154 /* 154 /*
155 * We remember last_addr rather than next_addr so that we hit the 155 * We remember last_addr rather than next_addr so that we hit the
156 * vmacache most of the time. We have zero last_addr at 156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr 157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas. 158 * after the end of the vmas.
159 */ 159 */
160 160
161 if (last_addr == -1UL) 161 if (last_addr == -1UL)
162 return NULL; 162 return NULL;
163 163
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
165 if (!priv->task) 165 if (!priv->task)
166 return ERR_PTR(-ESRCH); 166 return ERR_PTR(-ESRCH);
167 167
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 168 mm = mm_access(priv->task, PTRACE_MODE_READ);
169 if (!mm || IS_ERR(mm)) 169 if (!mm || IS_ERR(mm))
170 return mm; 170 return mm;
171 down_read(&mm->mmap_sem); 171 down_read(&mm->mmap_sem);
172 172
173 tail_vma = get_gate_vma(priv->task->mm); 173 tail_vma = get_gate_vma(priv->task->mm);
174 priv->tail_vma = tail_vma; 174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 175 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 176 /* Start with last addr hint */
177 vma = find_vma(mm, last_addr); 177 vma = find_vma(mm, last_addr);
178 if (last_addr && vma) { 178 if (last_addr && vma) {
179 vma = vma->vm_next; 179 vma = vma->vm_next;
180 goto out; 180 goto out;
181 } 181 }
182 182
183 /* 183 /*
184 * Check that the vma index is within range and do a 184 * Check that the vma index is within range and do a
185 * sequential scan until m_index. 185 * sequential scan until m_index.
186 */ 186 */
187 vma = NULL; 187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) { 188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap; 189 vma = mm->mmap;
190 while (l-- && vma) 190 while (l-- && vma)
191 vma = vma->vm_next; 191 vma = vma->vm_next;
192 goto out; 192 goto out;
193 } 193 }
194 194
195 if (l != mm->map_count) 195 if (l != mm->map_count)
196 tail_vma = NULL; /* After gate vma */ 196 tail_vma = NULL; /* After gate vma */
197 197
198 out: 198 out:
199 if (vma) 199 if (vma)
200 return vma; 200 return vma;
201 201
202 release_task_mempolicy(priv); 202 release_task_mempolicy(priv);
203 /* End of vmas has been reached */ 203 /* End of vmas has been reached */
204 m->version = (tail_vma != NULL)? 0: -1UL; 204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem); 205 up_read(&mm->mmap_sem);
206 mmput(mm); 206 mmput(mm);
207 return tail_vma; 207 return tail_vma;
208 } 208 }
209 209
210 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 210 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211 { 211 {
212 struct proc_maps_private *priv = m->private; 212 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 213 struct vm_area_struct *vma = v;
214 struct vm_area_struct *tail_vma = priv->tail_vma; 214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 215
216 (*pos)++; 216 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 217 if (vma && (vma != tail_vma) && vma->vm_next)
218 return vma->vm_next; 218 return vma->vm_next;
219 vma_stop(priv, vma); 219 vma_stop(priv, vma);
220 return (vma != tail_vma)? tail_vma: NULL; 220 return (vma != tail_vma)? tail_vma: NULL;
221 } 221 }
222 222
223 static void m_stop(struct seq_file *m, void *v) 223 static void m_stop(struct seq_file *m, void *v)
224 { 224 {
225 struct proc_maps_private *priv = m->private; 225 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v; 226 struct vm_area_struct *vma = v;
227 227
228 if (!IS_ERR(vma)) 228 if (!IS_ERR(vma))
229 vma_stop(priv, vma); 229 vma_stop(priv, vma);
230 if (priv->task) 230 if (priv->task)
231 put_task_struct(priv->task); 231 put_task_struct(priv->task);
232 } 232 }
233 233
234 static int do_maps_open(struct inode *inode, struct file *file, 234 static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 235 const struct seq_operations *ops)
236 { 236 {
237 struct proc_maps_private *priv; 237 struct proc_maps_private *priv;
238 int ret = -ENOMEM; 238 int ret = -ENOMEM;
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 239 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
240 if (priv) { 240 if (priv) {
241 priv->pid = proc_pid(inode); 241 priv->pid = proc_pid(inode);
242 ret = seq_open(file, ops); 242 ret = seq_open(file, ops);
243 if (!ret) { 243 if (!ret) {
244 struct seq_file *m = file->private_data; 244 struct seq_file *m = file->private_data;
245 m->private = priv; 245 m->private = priv;
246 } else { 246 } else {
247 kfree(priv); 247 kfree(priv);
248 } 248 }
249 } 249 }
250 return ret; 250 return ret;
251 } 251 }
252 252
253 static void 253 static void
254 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) 254 show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
255 { 255 {
256 struct mm_struct *mm = vma->vm_mm; 256 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 257 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 258 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task; 259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 260 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 261 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 262 unsigned long long pgoff = 0;
263 unsigned long start, end; 263 unsigned long start, end;
264 dev_t dev = 0; 264 dev_t dev = 0;
265 const char *name = NULL; 265 const char *name = NULL;
266 266
267 if (file) { 267 if (file) {
268 struct inode *inode = file_inode(vma->vm_file); 268 struct inode *inode = file_inode(vma->vm_file);
269 dev = inode->i_sb->s_dev; 269 dev = inode->i_sb->s_dev;
270 ino = inode->i_ino; 270 ino = inode->i_ino;
271 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; 271 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
272 } 272 }
273 273
274 /* We don't show the stack guard page in /proc/maps */ 274 /* We don't show the stack guard page in /proc/maps */
275 start = vma->vm_start; 275 start = vma->vm_start;
276 if (stack_guard_page_start(vma, start)) 276 if (stack_guard_page_start(vma, start))
277 start += PAGE_SIZE; 277 start += PAGE_SIZE;
278 end = vma->vm_end; 278 end = vma->vm_end;
279 if (stack_guard_page_end(vma, end)) 279 if (stack_guard_page_end(vma, end))
280 end -= PAGE_SIZE; 280 end -= PAGE_SIZE;
281 281
282 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); 282 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
283 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", 283 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
284 start, 284 start,
285 end, 285 end,
286 flags & VM_READ ? 'r' : '-', 286 flags & VM_READ ? 'r' : '-',
287 flags & VM_WRITE ? 'w' : '-', 287 flags & VM_WRITE ? 'w' : '-',
288 flags & VM_EXEC ? 'x' : '-', 288 flags & VM_EXEC ? 'x' : '-',
289 flags & VM_MAYSHARE ? 's' : 'p', 289 flags & VM_MAYSHARE ? 's' : 'p',
290 pgoff, 290 pgoff,
291 MAJOR(dev), MINOR(dev), ino); 291 MAJOR(dev), MINOR(dev), ino);
292 292
293 /* 293 /*
294 * Print the dentry name for named mappings, and a 294 * Print the dentry name for named mappings, and a
295 * special [heap] marker for the heap: 295 * special [heap] marker for the heap:
296 */ 296 */
297 if (file) { 297 if (file) {
298 seq_pad(m, ' '); 298 seq_pad(m, ' ');
299 seq_path(m, &file->f_path, "\n"); 299 seq_path(m, &file->f_path, "\n");
300 goto done; 300 goto done;
301 } 301 }
302 302
303 if (vma->vm_ops && vma->vm_ops->name) { 303 if (vma->vm_ops && vma->vm_ops->name) {
304 name = vma->vm_ops->name(vma); 304 name = vma->vm_ops->name(vma);
305 if (name) 305 if (name)
306 goto done; 306 goto done;
307 } 307 }
308 308
309 name = arch_vma_name(vma); 309 name = arch_vma_name(vma);
310 if (!name) { 310 if (!name) {
311 pid_t tid; 311 pid_t tid;
312 312
313 if (!mm) { 313 if (!mm) {
314 name = "[vdso]"; 314 name = "[vdso]";
315 goto done; 315 goto done;
316 } 316 }
317 317
318 if (vma->vm_start <= mm->brk && 318 if (vma->vm_start <= mm->brk &&
319 vma->vm_end >= mm->start_brk) { 319 vma->vm_end >= mm->start_brk) {
320 name = "[heap]"; 320 name = "[heap]";
321 goto done; 321 goto done;
322 } 322 }
323 323
324 tid = vm_is_stack(task, vma, is_pid); 324 tid = vm_is_stack(task, vma, is_pid);
325 325
326 if (tid != 0) { 326 if (tid != 0) {
327 /* 327 /*
328 * Thread stack in /proc/PID/task/TID/maps or 328 * Thread stack in /proc/PID/task/TID/maps or
329 * the main process stack. 329 * the main process stack.
330 */ 330 */
331 if (!is_pid || (vma->vm_start <= mm->start_stack && 331 if (!is_pid || (vma->vm_start <= mm->start_stack &&
332 vma->vm_end >= mm->start_stack)) { 332 vma->vm_end >= mm->start_stack)) {
333 name = "[stack]"; 333 name = "[stack]";
334 } else { 334 } else {
335 /* Thread stack in /proc/PID/maps */ 335 /* Thread stack in /proc/PID/maps */
336 seq_pad(m, ' '); 336 seq_pad(m, ' ');
337 seq_printf(m, "[stack:%d]", tid); 337 seq_printf(m, "[stack:%d]", tid);
338 } 338 }
339 } 339 }
340 } 340 }
341 341
342 done: 342 done:
343 if (name) { 343 if (name) {
344 seq_pad(m, ' '); 344 seq_pad(m, ' ');
345 seq_puts(m, name); 345 seq_puts(m, name);
346 } 346 }
347 seq_putc(m, '\n'); 347 seq_putc(m, '\n');
348 } 348 }
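
For reference, the seq_printf() in show_map_vma() above emits lines of the form "start-end perms offset major:minor inode  name". The values in this standalone sketch are invented placeholders; only the field order and formats are taken from the code:

#include <stdio.h>

int main(void)
{
	unsigned long start = 0x00400000UL, end = 0x0040c000UL, ino = 393228UL;
	unsigned long long pgoff = 0;

	/* same field layout as show_map_vma(): range, perms, offset, dev, inode, name */
	printf("%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %s\n",
	       start, end, 'r', '-', 'x', 'p', pgoff, 8, 1, ino, "/bin/cat");
	return 0;
}
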
349 349
350 static int show_map(struct seq_file *m, void *v, int is_pid) 350 static int show_map(struct seq_file *m, void *v, int is_pid)
351 { 351 {
352 struct vm_area_struct *vma = v; 352 struct vm_area_struct *vma = v;
353 struct proc_maps_private *priv = m->private; 353 struct proc_maps_private *priv = m->private;
354 struct task_struct *task = priv->task; 354 struct task_struct *task = priv->task;
355 355
356 show_map_vma(m, vma, is_pid); 356 show_map_vma(m, vma, is_pid);
357 357
358 if (m->count < m->size) /* vma is copied successfully */ 358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm)) 359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0; 360 ? vma->vm_start : 0;
361 return 0; 361 return 0;
362 } 362 }
363 363
364 static int show_pid_map(struct seq_file *m, void *v) 364 static int show_pid_map(struct seq_file *m, void *v)
365 { 365 {
366 return show_map(m, v, 1); 366 return show_map(m, v, 1);
367 } 367 }
368 368
369 static int show_tid_map(struct seq_file *m, void *v) 369 static int show_tid_map(struct seq_file *m, void *v)
370 { 370 {
371 return show_map(m, v, 0); 371 return show_map(m, v, 0);
372 } 372 }
373 373
374 static const struct seq_operations proc_pid_maps_op = { 374 static const struct seq_operations proc_pid_maps_op = {
375 .start = m_start, 375 .start = m_start,
376 .next = m_next, 376 .next = m_next,
377 .stop = m_stop, 377 .stop = m_stop,
378 .show = show_pid_map 378 .show = show_pid_map
379 }; 379 };
380 380
381 static const struct seq_operations proc_tid_maps_op = { 381 static const struct seq_operations proc_tid_maps_op = {
382 .start = m_start, 382 .start = m_start,
383 .next = m_next, 383 .next = m_next,
384 .stop = m_stop, 384 .stop = m_stop,
385 .show = show_tid_map 385 .show = show_tid_map
386 }; 386 };
387 387
388 static int pid_maps_open(struct inode *inode, struct file *file) 388 static int pid_maps_open(struct inode *inode, struct file *file)
389 { 389 {
390 return do_maps_open(inode, file, &proc_pid_maps_op); 390 return do_maps_open(inode, file, &proc_pid_maps_op);
391 } 391 }
392 392
393 static int tid_maps_open(struct inode *inode, struct file *file) 393 static int tid_maps_open(struct inode *inode, struct file *file)
394 { 394 {
395 return do_maps_open(inode, file, &proc_tid_maps_op); 395 return do_maps_open(inode, file, &proc_tid_maps_op);
396 } 396 }
397 397
398 const struct file_operations proc_pid_maps_operations = { 398 const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 399 .open = pid_maps_open,
400 .read = seq_read, 400 .read = seq_read,
401 .llseek = seq_lseek, 401 .llseek = seq_lseek,
402 .release = seq_release_private, 402 .release = seq_release_private,
403 }; 403 };
404 404
405 const struct file_operations proc_tid_maps_operations = { 405 const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 406 .open = tid_maps_open,
407 .read = seq_read, 407 .read = seq_read,
408 .llseek = seq_lseek, 408 .llseek = seq_lseek,
409 .release = seq_release_private, 409 .release = seq_release_private,
410 }; 410 };
411 411
412 /* 412 /*
413 * Proportional Set Size (PSS): my share of RSS. 413 * Proportional Set Size (PSS): my share of RSS.
414 * 414 *
415 * PSS of a process is the count of pages it has in memory, where each 415 * PSS of a process is the count of pages it has in memory, where each
416 * page is divided by the number of processes sharing it. So if a 416 * page is divided by the number of processes sharing it. So if a
417 * process has 1000 pages all to itself, and 1000 shared with one other 417 * process has 1000 pages all to itself, and 1000 shared with one other
418 * process, its PSS will be 1500. 418 * process, its PSS will be 1500.
419 * 419 *
420 * To keep (accumulated) division errors low, we adopt a 64bit 420 * To keep (accumulated) division errors low, we adopt a 64bit
421 * fixed-point pss counter to minimize division errors. So (pss >> 421 * fixed-point pss counter to minimize division errors. So (pss >>
422 * PSS_SHIFT) would be the real byte count. 422 * PSS_SHIFT) would be the real byte count.
423 * 423 *
424 * A shift of 12 before division means (assuming 4K page size): 424 * A shift of 12 before division means (assuming 4K page size):
425 * - 1M 3-user-pages add up to 8KB errors; 425 * - 1M 3-user-pages add up to 8KB errors;
426 * - supports mapcount up to 2^24, or 16M; 426 * - supports mapcount up to 2^24, or 16M;
427 * - supports PSS up to 2^52 bytes, or 4PB. 427 * - supports PSS up to 2^52 bytes, or 4PB.
428 */ 428 */
429 #define PSS_SHIFT 12 429 #define PSS_SHIFT 12
430 430
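
To make the fixed-point accounting above concrete, here is a standalone userspace sketch using the example from the comment: 1000 pages mapped only by this process plus 1000 pages shared with one other process yield a PSS of 1500 pages, i.e. 6000 kB with 4 KiB pages:

#include <stdio.h>

#define EX_PSS_SHIFT 12				/* mirrors PSS_SHIFT above */

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long pss = 0;

	pss += 1000 * (page_size << EX_PSS_SHIFT);	/* mapcount == 1: full pages */
	pss += 1000 * (page_size << EX_PSS_SHIFT) / 2;	/* mapcount == 2: half each */

	/* (pss >> EX_PSS_SHIFT) is the real byte count: 1500 pages = 6000 kB */
	printf("Pss:\t%llu kB\n", (pss >> EX_PSS_SHIFT) >> 10);
	return 0;
}
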
431 #ifdef CONFIG_PROC_PAGE_MONITOR 431 #ifdef CONFIG_PROC_PAGE_MONITOR
432 struct mem_size_stats { 432 struct mem_size_stats {
433 struct vm_area_struct *vma; 433 struct vm_area_struct *vma;
434 unsigned long resident; 434 unsigned long resident;
435 unsigned long shared_clean; 435 unsigned long shared_clean;
436 unsigned long shared_dirty; 436 unsigned long shared_dirty;
437 unsigned long private_clean; 437 unsigned long private_clean;
438 unsigned long private_dirty; 438 unsigned long private_dirty;
439 unsigned long referenced; 439 unsigned long referenced;
440 unsigned long anonymous; 440 unsigned long anonymous;
441 unsigned long anonymous_thp; 441 unsigned long anonymous_thp;
442 unsigned long swap; 442 unsigned long swap;
443 unsigned long nonlinear; 443 unsigned long nonlinear;
444 u64 pss; 444 u64 pss;
445 }; 445 };
446 446
447 447
448 static void smaps_pte_entry(pte_t ptent, unsigned long addr, 448 static void smaps_pte_entry(pte_t ptent, unsigned long addr,
449 unsigned long ptent_size, struct mm_walk *walk) 449 unsigned long ptent_size, struct mm_walk *walk)
450 { 450 {
451 struct mem_size_stats *mss = walk->private; 451 struct mem_size_stats *mss = walk->private;
452 struct vm_area_struct *vma = mss->vma; 452 struct vm_area_struct *vma = mss->vma;
453 pgoff_t pgoff = linear_page_index(vma, addr); 453 pgoff_t pgoff = linear_page_index(vma, addr);
454 struct page *page = NULL; 454 struct page *page = NULL;
455 int mapcount; 455 int mapcount;
456 456
457 if (pte_present(ptent)) { 457 if (pte_present(ptent)) {
458 page = vm_normal_page(vma, addr, ptent); 458 page = vm_normal_page(vma, addr, ptent);
459 } else if (is_swap_pte(ptent)) { 459 } else if (is_swap_pte(ptent)) {
460 swp_entry_t swpent = pte_to_swp_entry(ptent); 460 swp_entry_t swpent = pte_to_swp_entry(ptent);
461 461
462 if (!non_swap_entry(swpent)) 462 if (!non_swap_entry(swpent))
463 mss->swap += ptent_size; 463 mss->swap += ptent_size;
464 else if (is_migration_entry(swpent)) 464 else if (is_migration_entry(swpent))
465 page = migration_entry_to_page(swpent); 465 page = migration_entry_to_page(swpent);
466 } else if (pte_file(ptent)) { 466 } else if (pte_file(ptent)) {
467 if (pte_to_pgoff(ptent) != pgoff) 467 if (pte_to_pgoff(ptent) != pgoff)
468 mss->nonlinear += ptent_size; 468 mss->nonlinear += ptent_size;
469 } 469 }
470 470
471 if (!page) 471 if (!page)
472 return; 472 return;
473 473
474 if (PageAnon(page)) 474 if (PageAnon(page))
475 mss->anonymous += ptent_size; 475 mss->anonymous += ptent_size;
476 476
477 if (page->index != pgoff) 477 if (page->index != pgoff)
478 mss->nonlinear += ptent_size; 478 mss->nonlinear += ptent_size;
479 479
480 mss->resident += ptent_size; 480 mss->resident += ptent_size;
481 /* Accumulate the size in pages that have been accessed. */ 481 /* Accumulate the size in pages that have been accessed. */
482 if (pte_young(ptent) || PageReferenced(page)) 482 if (pte_young(ptent) || PageReferenced(page))
483 mss->referenced += ptent_size; 483 mss->referenced += ptent_size;
484 mapcount = page_mapcount(page); 484 mapcount = page_mapcount(page);
485 if (mapcount >= 2) { 485 if (mapcount >= 2) {
486 if (pte_dirty(ptent) || PageDirty(page)) 486 if (pte_dirty(ptent) || PageDirty(page))
487 mss->shared_dirty += ptent_size; 487 mss->shared_dirty += ptent_size;
488 else 488 else
489 mss->shared_clean += ptent_size; 489 mss->shared_clean += ptent_size;
490 mss->pss += (ptent_size << PSS_SHIFT) / mapcount; 490 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
491 } else { 491 } else {
492 if (pte_dirty(ptent) || PageDirty(page)) 492 if (pte_dirty(ptent) || PageDirty(page))
493 mss->private_dirty += ptent_size; 493 mss->private_dirty += ptent_size;
494 else 494 else
495 mss->private_clean += ptent_size; 495 mss->private_clean += ptent_size;
496 mss->pss += (ptent_size << PSS_SHIFT); 496 mss->pss += (ptent_size << PSS_SHIFT);
497 } 497 }
498 } 498 }
499 499
500 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 500 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
501 struct mm_walk *walk) 501 struct mm_walk *walk)
502 { 502 {
503 struct mem_size_stats *mss = walk->private; 503 struct mem_size_stats *mss = walk->private;
504 struct vm_area_struct *vma = mss->vma; 504 struct vm_area_struct *vma = mss->vma;
505 pte_t *pte; 505 pte_t *pte;
506 spinlock_t *ptl; 506 spinlock_t *ptl;
507 507
508 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 508 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
509 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); 509 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
510 spin_unlock(ptl); 510 spin_unlock(ptl);
511 mss->anonymous_thp += HPAGE_PMD_SIZE; 511 mss->anonymous_thp += HPAGE_PMD_SIZE;
512 return 0; 512 return 0;
513 } 513 }
514 514
515 if (pmd_trans_unstable(pmd)) 515 if (pmd_trans_unstable(pmd))
516 return 0; 516 return 0;
517 /* 517 /*
518 * The mmap_sem held all the way back in m_start() is what 518 * The mmap_sem held all the way back in m_start() is what
519 * keeps khugepaged out of here and prevents it from collapsing 519 * keeps khugepaged out of here and prevents it from collapsing
520 * things under us. 520 * things under us.
521 */ 521 */
522 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 522 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
523 for (; addr != end; pte++, addr += PAGE_SIZE) 523 for (; addr != end; pte++, addr += PAGE_SIZE)
524 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); 524 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
525 pte_unmap_unlock(pte - 1, ptl); 525 pte_unmap_unlock(pte - 1, ptl);
526 cond_resched(); 526 cond_resched();
527 return 0; 527 return 0;
528 } 528 }
529 529
530 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) 530 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
531 { 531 {
532 /* 532 /*
533 * Don't forget to update Documentation/ on changes. 533 * Don't forget to update Documentation/ on changes.
534 */ 534 */
535 static const char mnemonics[BITS_PER_LONG][2] = { 535 static const char mnemonics[BITS_PER_LONG][2] = {
536 /* 536 /*
537 * In case we meet a flag we don't know about. 537 * In case we meet a flag we don't know about.
538 */ 538 */
539 [0 ... (BITS_PER_LONG-1)] = "??", 539 [0 ... (BITS_PER_LONG-1)] = "??",
540 540
541 [ilog2(VM_READ)] = "rd", 541 [ilog2(VM_READ)] = "rd",
542 [ilog2(VM_WRITE)] = "wr", 542 [ilog2(VM_WRITE)] = "wr",
543 [ilog2(VM_EXEC)] = "ex", 543 [ilog2(VM_EXEC)] = "ex",
544 [ilog2(VM_SHARED)] = "sh", 544 [ilog2(VM_SHARED)] = "sh",
545 [ilog2(VM_MAYREAD)] = "mr", 545 [ilog2(VM_MAYREAD)] = "mr",
546 [ilog2(VM_MAYWRITE)] = "mw", 546 [ilog2(VM_MAYWRITE)] = "mw",
547 [ilog2(VM_MAYEXEC)] = "me", 547 [ilog2(VM_MAYEXEC)] = "me",
548 [ilog2(VM_MAYSHARE)] = "ms", 548 [ilog2(VM_MAYSHARE)] = "ms",
549 [ilog2(VM_GROWSDOWN)] = "gd", 549 [ilog2(VM_GROWSDOWN)] = "gd",
550 [ilog2(VM_PFNMAP)] = "pf", 550 [ilog2(VM_PFNMAP)] = "pf",
551 [ilog2(VM_DENYWRITE)] = "dw", 551 [ilog2(VM_DENYWRITE)] = "dw",
552 [ilog2(VM_LOCKED)] = "lo", 552 [ilog2(VM_LOCKED)] = "lo",
553 [ilog2(VM_IO)] = "io", 553 [ilog2(VM_IO)] = "io",
554 [ilog2(VM_SEQ_READ)] = "sr", 554 [ilog2(VM_SEQ_READ)] = "sr",
555 [ilog2(VM_RAND_READ)] = "rr", 555 [ilog2(VM_RAND_READ)] = "rr",
556 [ilog2(VM_DONTCOPY)] = "dc", 556 [ilog2(VM_DONTCOPY)] = "dc",
557 [ilog2(VM_DONTEXPAND)] = "de", 557 [ilog2(VM_DONTEXPAND)] = "de",
558 [ilog2(VM_ACCOUNT)] = "ac", 558 [ilog2(VM_ACCOUNT)] = "ac",
559 [ilog2(VM_NORESERVE)] = "nr", 559 [ilog2(VM_NORESERVE)] = "nr",
560 [ilog2(VM_HUGETLB)] = "ht", 560 [ilog2(VM_HUGETLB)] = "ht",
561 [ilog2(VM_NONLINEAR)] = "nl", 561 [ilog2(VM_NONLINEAR)] = "nl",
562 [ilog2(VM_ARCH_1)] = "ar", 562 [ilog2(VM_ARCH_1)] = "ar",
563 [ilog2(VM_DONTDUMP)] = "dd", 563 [ilog2(VM_DONTDUMP)] = "dd",
564 #ifdef CONFIG_MEM_SOFT_DIRTY 564 #ifdef CONFIG_MEM_SOFT_DIRTY
565 [ilog2(VM_SOFTDIRTY)] = "sd", 565 [ilog2(VM_SOFTDIRTY)] = "sd",
566 #endif 566 #endif
567 [ilog2(VM_MIXEDMAP)] = "mm", 567 [ilog2(VM_MIXEDMAP)] = "mm",
568 [ilog2(VM_HUGEPAGE)] = "hg", 568 [ilog2(VM_HUGEPAGE)] = "hg",
569 [ilog2(VM_NOHUGEPAGE)] = "nh", 569 [ilog2(VM_NOHUGEPAGE)] = "nh",
570 [ilog2(VM_MERGEABLE)] = "mg", 570 [ilog2(VM_MERGEABLE)] = "mg",
571 }; 571 };
572 size_t i; 572 size_t i;
573 573
574 seq_puts(m, "VmFlags: "); 574 seq_puts(m, "VmFlags: ");
575 for (i = 0; i < BITS_PER_LONG; i++) { 575 for (i = 0; i < BITS_PER_LONG; i++) {
576 if (vma->vm_flags & (1UL << i)) { 576 if (vma->vm_flags & (1UL << i)) {
577 seq_printf(m, "%c%c ", 577 seq_printf(m, "%c%c ",
578 mnemonics[i][0], mnemonics[i][1]); 578 mnemonics[i][0], mnemonics[i][1]);
579 } 579 }
580 } 580 }
581 seq_putc(m, '\n'); 581 seq_putc(m, '\n');
582 } 582 }
583 583
584 static int show_smap(struct seq_file *m, void *v, int is_pid) 584 static int show_smap(struct seq_file *m, void *v, int is_pid)
585 { 585 {
586 struct proc_maps_private *priv = m->private; 586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task; 587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 588 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 589 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 590 struct mm_walk smaps_walk = {
591 .pmd_entry = smaps_pte_range, 591 .pmd_entry = smaps_pte_range,
592 .mm = vma->vm_mm, 592 .mm = vma->vm_mm,
593 .private = &mss, 593 .private = &mss,
594 }; 594 };
595 595
596 memset(&mss, 0, sizeof mss); 596 memset(&mss, 0, sizeof mss);
597 mss.vma = vma; 597 mss.vma = vma;
598 /* mmap_sem is held in m_start */ 598 /* mmap_sem is held in m_start */
599 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 599 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
600 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 600 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
601 601
602 show_map_vma(m, vma, is_pid); 602 show_map_vma(m, vma, is_pid);
603 603
604 seq_printf(m, 604 seq_printf(m,
605 "Size: %8lu kB\n" 605 "Size: %8lu kB\n"
606 "Rss: %8lu kB\n" 606 "Rss: %8lu kB\n"
607 "Pss: %8lu kB\n" 607 "Pss: %8lu kB\n"
608 "Shared_Clean: %8lu kB\n" 608 "Shared_Clean: %8lu kB\n"
609 "Shared_Dirty: %8lu kB\n" 609 "Shared_Dirty: %8lu kB\n"
610 "Private_Clean: %8lu kB\n" 610 "Private_Clean: %8lu kB\n"
611 "Private_Dirty: %8lu kB\n" 611 "Private_Dirty: %8lu kB\n"
612 "Referenced: %8lu kB\n" 612 "Referenced: %8lu kB\n"
613 "Anonymous: %8lu kB\n" 613 "Anonymous: %8lu kB\n"
614 "AnonHugePages: %8lu kB\n" 614 "AnonHugePages: %8lu kB\n"
615 "Swap: %8lu kB\n" 615 "Swap: %8lu kB\n"
616 "KernelPageSize: %8lu kB\n" 616 "KernelPageSize: %8lu kB\n"
617 "MMUPageSize: %8lu kB\n" 617 "MMUPageSize: %8lu kB\n"
618 "Locked: %8lu kB\n", 618 "Locked: %8lu kB\n",
619 (vma->vm_end - vma->vm_start) >> 10, 619 (vma->vm_end - vma->vm_start) >> 10,
620 mss.resident >> 10, 620 mss.resident >> 10,
621 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 621 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
622 mss.shared_clean >> 10, 622 mss.shared_clean >> 10,
623 mss.shared_dirty >> 10, 623 mss.shared_dirty >> 10,
624 mss.private_clean >> 10, 624 mss.private_clean >> 10,
625 mss.private_dirty >> 10, 625 mss.private_dirty >> 10,
626 mss.referenced >> 10, 626 mss.referenced >> 10,
627 mss.anonymous >> 10, 627 mss.anonymous >> 10,
628 mss.anonymous_thp >> 10, 628 mss.anonymous_thp >> 10,
629 mss.swap >> 10, 629 mss.swap >> 10,
630 vma_kernel_pagesize(vma) >> 10, 630 vma_kernel_pagesize(vma) >> 10,
631 vma_mmu_pagesize(vma) >> 10, 631 vma_mmu_pagesize(vma) >> 10,
632 (vma->vm_flags & VM_LOCKED) ? 632 (vma->vm_flags & VM_LOCKED) ?
633 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 633 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
634 634
635 if (vma->vm_flags & VM_NONLINEAR) 635 if (vma->vm_flags & VM_NONLINEAR)
636 seq_printf(m, "Nonlinear: %8lu kB\n", 636 seq_printf(m, "Nonlinear: %8lu kB\n",
637 mss.nonlinear >> 10); 637 mss.nonlinear >> 10);
638 638
639 show_smap_vma_flags(m, vma); 639 show_smap_vma_flags(m, vma);
640 640
641 if (m->count < m->size) /* vma is copied successfully */ 641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm)) 642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0; 643 ? vma->vm_start : 0;
644 return 0; 644 return 0;
645 } 645 }
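For orientation: the block emitted above is plain "Field: value kB" lines, one block per VMA, so /proc/PID/smaps lends itself to line-oriented parsing. A minimal userspace sketch (not part of this file, illustrative only) that totals a task's proportional set size from those lines:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/smaps", "r");
	char line[256];
	unsigned long kb, total = 0;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Pss: %lu kB", &kb) == 1)
			total += kb;	/* one "Pss:" line per VMA, in kB */
	fclose(f);
	printf("Pss total: %lu kB\n", total);
	return 0;
}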
646 646
647 static int show_pid_smap(struct seq_file *m, void *v) 647 static int show_pid_smap(struct seq_file *m, void *v)
648 { 648 {
649 return show_smap(m, v, 1); 649 return show_smap(m, v, 1);
650 } 650 }
651 651
652 static int show_tid_smap(struct seq_file *m, void *v) 652 static int show_tid_smap(struct seq_file *m, void *v)
653 { 653 {
654 return show_smap(m, v, 0); 654 return show_smap(m, v, 0);
655 } 655 }
656 656
657 static const struct seq_operations proc_pid_smaps_op = { 657 static const struct seq_operations proc_pid_smaps_op = {
658 .start = m_start, 658 .start = m_start,
659 .next = m_next, 659 .next = m_next,
660 .stop = m_stop, 660 .stop = m_stop,
661 .show = show_pid_smap 661 .show = show_pid_smap
662 }; 662 };
663 663
664 static const struct seq_operations proc_tid_smaps_op = { 664 static const struct seq_operations proc_tid_smaps_op = {
665 .start = m_start, 665 .start = m_start,
666 .next = m_next, 666 .next = m_next,
667 .stop = m_stop, 667 .stop = m_stop,
668 .show = show_tid_smap 668 .show = show_tid_smap
669 }; 669 };
670 670
671 static int pid_smaps_open(struct inode *inode, struct file *file) 671 static int pid_smaps_open(struct inode *inode, struct file *file)
672 { 672 {
673 return do_maps_open(inode, file, &proc_pid_smaps_op); 673 return do_maps_open(inode, file, &proc_pid_smaps_op);
674 } 674 }
675 675
676 static int tid_smaps_open(struct inode *inode, struct file *file) 676 static int tid_smaps_open(struct inode *inode, struct file *file)
677 { 677 {
678 return do_maps_open(inode, file, &proc_tid_smaps_op); 678 return do_maps_open(inode, file, &proc_tid_smaps_op);
679 } 679 }
680 680
681 const struct file_operations proc_pid_smaps_operations = { 681 const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 682 .open = pid_smaps_open,
683 .read = seq_read, 683 .read = seq_read,
684 .llseek = seq_lseek, 684 .llseek = seq_lseek,
685 .release = seq_release_private, 685 .release = seq_release_private,
686 }; 686 };
687 687
688 const struct file_operations proc_tid_smaps_operations = { 688 const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 689 .open = tid_smaps_open,
690 .read = seq_read, 690 .read = seq_read,
691 .llseek = seq_lseek, 691 .llseek = seq_lseek,
692 .release = seq_release_private, 692 .release = seq_release_private,
693 }; 693 };
694 694
695 /* 695 /*
696 * We do not want to have constant page-shift bits sitting in 696 * We do not want to have constant page-shift bits sitting in
697 * pagemap entries and are about to reuse them some time soon. 697 * pagemap entries and are about to reuse them some time soon.
698 * 698 *
699 * Here's the "migration strategy": 699 * Here's the "migration strategy":
700 * 1. when the system boots these bits remain what they are, 700 * 1. when the system boots these bits remain what they are,
701 * but a warning about the future change is printed in the log; 701 * but a warning about the future change is printed in the log;
702 * 2. once anyone clears soft-dirty bits via the clear_refs file, 702 * 2. once anyone clears soft-dirty bits via the clear_refs file,
703 * this flag is set to denote that the user is aware of the 703 * this flag is set to denote that the user is aware of the
704 * new API and that those page-shift bits change their meaning. 704 * new API and that those page-shift bits change their meaning.
705 * The respective warning is printed in dmesg; 705 * The respective warning is printed in dmesg;
706 * 3. In a couple of releases we will remove all the mentions 706 * 3. In a couple of releases we will remove all the mentions
707 * of page-shift in pagemap entries. 707 * of page-shift in pagemap entries.
708 */ 708 */
709 709
710 static bool soft_dirty_cleared __read_mostly; 710 static bool soft_dirty_cleared __read_mostly;
711 711
712 enum clear_refs_types { 712 enum clear_refs_types {
713 CLEAR_REFS_ALL = 1, 713 CLEAR_REFS_ALL = 1,
714 CLEAR_REFS_ANON, 714 CLEAR_REFS_ANON,
715 CLEAR_REFS_MAPPED, 715 CLEAR_REFS_MAPPED,
716 CLEAR_REFS_SOFT_DIRTY, 716 CLEAR_REFS_SOFT_DIRTY,
717 CLEAR_REFS_LAST, 717 CLEAR_REFS_LAST,
718 }; 718 };
719 719
720 struct clear_refs_private { 720 struct clear_refs_private {
721 struct vm_area_struct *vma; 721 struct vm_area_struct *vma;
722 enum clear_refs_types type; 722 enum clear_refs_types type;
723 }; 723 };
724 724
725 static inline void clear_soft_dirty(struct vm_area_struct *vma, 725 static inline void clear_soft_dirty(struct vm_area_struct *vma,
726 unsigned long addr, pte_t *pte) 726 unsigned long addr, pte_t *pte)
727 { 727 {
728 #ifdef CONFIG_MEM_SOFT_DIRTY 728 #ifdef CONFIG_MEM_SOFT_DIRTY
729 /* 729 /*
730 * The soft-dirty tracker uses #PF-s to catch writes 730 * The soft-dirty tracker uses #PF-s to catch writes
731 * to pages, so write-protect the pte as well. See the 731 * to pages, so write-protect the pte as well. See the
732 * Documentation/vm/soft-dirty.txt for full description 732 * Documentation/vm/soft-dirty.txt for full description
733 * of how soft-dirty works. 733 * of how soft-dirty works.
734 */ 734 */
735 pte_t ptent = *pte; 735 pte_t ptent = *pte;
736 736
737 if (pte_present(ptent)) { 737 if (pte_present(ptent)) {
738 ptent = pte_wrprotect(ptent); 738 ptent = pte_wrprotect(ptent);
739 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 739 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
740 } else if (is_swap_pte(ptent)) { 740 } else if (is_swap_pte(ptent)) {
741 ptent = pte_swp_clear_soft_dirty(ptent); 741 ptent = pte_swp_clear_soft_dirty(ptent);
742 } else if (pte_file(ptent)) { 742 } else if (pte_file(ptent)) {
743 ptent = pte_file_clear_soft_dirty(ptent); 743 ptent = pte_file_clear_soft_dirty(ptent);
744 } 744 }
745 745
746 set_pte_at(vma->vm_mm, addr, pte, ptent); 746 set_pte_at(vma->vm_mm, addr, pte, ptent);
747 #endif 747 #endif
748 } 748 }
749 749
750 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 750 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
751 unsigned long end, struct mm_walk *walk) 751 unsigned long end, struct mm_walk *walk)
752 { 752 {
753 struct clear_refs_private *cp = walk->private; 753 struct clear_refs_private *cp = walk->private;
754 struct vm_area_struct *vma = cp->vma; 754 struct vm_area_struct *vma = cp->vma;
755 pte_t *pte, ptent; 755 pte_t *pte, ptent;
756 spinlock_t *ptl; 756 spinlock_t *ptl;
757 struct page *page; 757 struct page *page;
758 758
759 split_huge_page_pmd(vma, addr, pmd); 759 split_huge_page_pmd(vma, addr, pmd);
760 if (pmd_trans_unstable(pmd)) 760 if (pmd_trans_unstable(pmd))
761 return 0; 761 return 0;
762 762
763 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 763 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
764 for (; addr != end; pte++, addr += PAGE_SIZE) { 764 for (; addr != end; pte++, addr += PAGE_SIZE) {
765 ptent = *pte; 765 ptent = *pte;
766 766
767 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 767 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
768 clear_soft_dirty(vma, addr, pte); 768 clear_soft_dirty(vma, addr, pte);
769 continue; 769 continue;
770 } 770 }
771 771
772 if (!pte_present(ptent)) 772 if (!pte_present(ptent))
773 continue; 773 continue;
774 774
775 page = vm_normal_page(vma, addr, ptent); 775 page = vm_normal_page(vma, addr, ptent);
776 if (!page) 776 if (!page)
777 continue; 777 continue;
778 778
779 /* Clear accessed and referenced bits. */ 779 /* Clear accessed and referenced bits. */
780 ptep_test_and_clear_young(vma, addr, pte); 780 ptep_test_and_clear_young(vma, addr, pte);
781 ClearPageReferenced(page); 781 ClearPageReferenced(page);
782 } 782 }
783 pte_unmap_unlock(pte - 1, ptl); 783 pte_unmap_unlock(pte - 1, ptl);
784 cond_resched(); 784 cond_resched();
785 return 0; 785 return 0;
786 } 786 }
787 787
788 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 788 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
789 size_t count, loff_t *ppos) 789 size_t count, loff_t *ppos)
790 { 790 {
791 struct task_struct *task; 791 struct task_struct *task;
792 char buffer[PROC_NUMBUF]; 792 char buffer[PROC_NUMBUF];
793 struct mm_struct *mm; 793 struct mm_struct *mm;
794 struct vm_area_struct *vma; 794 struct vm_area_struct *vma;
795 enum clear_refs_types type; 795 enum clear_refs_types type;
796 int itype; 796 int itype;
797 int rv; 797 int rv;
798 798
799 memset(buffer, 0, sizeof(buffer)); 799 memset(buffer, 0, sizeof(buffer));
800 if (count > sizeof(buffer) - 1) 800 if (count > sizeof(buffer) - 1)
801 count = sizeof(buffer) - 1; 801 count = sizeof(buffer) - 1;
802 if (copy_from_user(buffer, buf, count)) 802 if (copy_from_user(buffer, buf, count))
803 return -EFAULT; 803 return -EFAULT;
804 rv = kstrtoint(strstrip(buffer), 10, &itype); 804 rv = kstrtoint(strstrip(buffer), 10, &itype);
805 if (rv < 0) 805 if (rv < 0)
806 return rv; 806 return rv;
807 type = (enum clear_refs_types)itype; 807 type = (enum clear_refs_types)itype;
808 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 808 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
809 return -EINVAL; 809 return -EINVAL;
810 810
811 if (type == CLEAR_REFS_SOFT_DIRTY) { 811 if (type == CLEAR_REFS_SOFT_DIRTY) {
812 soft_dirty_cleared = true; 812 soft_dirty_cleared = true;
813 pr_warn_once("The pagemap bits 55-60 has changed their meaning!" 813 pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
814 " See the linux/Documentation/vm/pagemap.txt for " 814 " See the linux/Documentation/vm/pagemap.txt for "
815 "details.\n"); 815 "details.\n");
816 } 816 }
817 817
818 task = get_proc_task(file_inode(file)); 818 task = get_proc_task(file_inode(file));
819 if (!task) 819 if (!task)
820 return -ESRCH; 820 return -ESRCH;
821 mm = get_task_mm(task); 821 mm = get_task_mm(task);
822 if (mm) { 822 if (mm) {
823 struct clear_refs_private cp = { 823 struct clear_refs_private cp = {
824 .type = type, 824 .type = type,
825 }; 825 };
826 struct mm_walk clear_refs_walk = { 826 struct mm_walk clear_refs_walk = {
827 .pmd_entry = clear_refs_pte_range, 827 .pmd_entry = clear_refs_pte_range,
828 .mm = mm, 828 .mm = mm,
829 .private = &cp, 829 .private = &cp,
830 }; 830 };
831 down_read(&mm->mmap_sem); 831 down_read(&mm->mmap_sem);
832 if (type == CLEAR_REFS_SOFT_DIRTY) 832 if (type == CLEAR_REFS_SOFT_DIRTY)
833 mmu_notifier_invalidate_range_start(mm, 0, -1); 833 mmu_notifier_invalidate_range_start(mm, 0, -1);
834 for (vma = mm->mmap; vma; vma = vma->vm_next) { 834 for (vma = mm->mmap; vma; vma = vma->vm_next) {
835 cp.vma = vma; 835 cp.vma = vma;
836 if (is_vm_hugetlb_page(vma)) 836 if (is_vm_hugetlb_page(vma))
837 continue; 837 continue;
838 /* 838 /*
839 * Writing 1 to /proc/pid/clear_refs affects all pages. 839 * Writing 1 to /proc/pid/clear_refs affects all pages.
840 * 840 *
841 * Writing 2 to /proc/pid/clear_refs only affects 841 * Writing 2 to /proc/pid/clear_refs only affects
842 * Anonymous pages. 842 * Anonymous pages.
843 * 843 *
844 * Writing 3 to /proc/pid/clear_refs only affects file 844 * Writing 3 to /proc/pid/clear_refs only affects file
845 * mapped pages. 845 * mapped pages.
846 * 846 *
847 * Writing 4 to /proc/pid/clear_refs affects all pages. 847 * Writing 4 to /proc/pid/clear_refs affects all pages.
848 */ 848 */
849 if (type == CLEAR_REFS_ANON && vma->vm_file) 849 if (type == CLEAR_REFS_ANON && vma->vm_file)
850 continue; 850 continue;
851 if (type == CLEAR_REFS_MAPPED && !vma->vm_file) 851 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
852 continue; 852 continue;
853 if (type == CLEAR_REFS_SOFT_DIRTY) { 853 if (type == CLEAR_REFS_SOFT_DIRTY) {
854 if (vma->vm_flags & VM_SOFTDIRTY) 854 if (vma->vm_flags & VM_SOFTDIRTY)
855 vma->vm_flags &= ~VM_SOFTDIRTY; 855 vma->vm_flags &= ~VM_SOFTDIRTY;
856 } 856 }
857 walk_page_range(vma->vm_start, vma->vm_end, 857 walk_page_range(vma->vm_start, vma->vm_end,
858 &clear_refs_walk); 858 &clear_refs_walk);
859 } 859 }
860 if (type == CLEAR_REFS_SOFT_DIRTY) 860 if (type == CLEAR_REFS_SOFT_DIRTY)
861 mmu_notifier_invalidate_range_end(mm, 0, -1); 861 mmu_notifier_invalidate_range_end(mm, 0, -1);
862 flush_tlb_mm(mm); 862 flush_tlb_mm(mm);
863 up_read(&mm->mmap_sem); 863 up_read(&mm->mmap_sem);
864 mmput(mm); 864 mmput(mm);
865 } 865 }
866 put_task_struct(task); 866 put_task_struct(task);
867 867
868 return count; 868 return count;
869 } 869 }
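As the comment block inside the loop above spells out, the written value selects the scope: 1 clears the referenced bits for all pages, 2 only for anonymous mappings, 3 only for file-backed mappings, and 4 (CLEAR_REFS_SOFT_DIRTY) resets soft-dirty tracking. A minimal userspace sketch of the soft-dirty reset step (hypothetical helper name, illustrative only):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Write "4" to /proc/<pid>/clear_refs; afterwards, pages the task writes
 * to show up again as soft-dirty (bit 55 of their pagemap entries). */
static int reset_soft_dirty(pid_t pid)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("4", f);
	return fclose(f);
}

int main(void)
{
	return reset_soft_dirty(getpid()) ? 1 : 0;
}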
870 870
871 const struct file_operations proc_clear_refs_operations = { 871 const struct file_operations proc_clear_refs_operations = {
872 .write = clear_refs_write, 872 .write = clear_refs_write,
873 .llseek = noop_llseek, 873 .llseek = noop_llseek,
874 }; 874 };
875 875
876 typedef struct { 876 typedef struct {
877 u64 pme; 877 u64 pme;
878 } pagemap_entry_t; 878 } pagemap_entry_t;
879 879
880 struct pagemapread { 880 struct pagemapread {
881 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 881 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
882 pagemap_entry_t *buffer; 882 pagemap_entry_t *buffer;
883 bool v2; 883 bool v2;
884 }; 884 };
885 885
886 #define PAGEMAP_WALK_SIZE (PMD_SIZE) 886 #define PAGEMAP_WALK_SIZE (PMD_SIZE)
887 #define PAGEMAP_WALK_MASK (PMD_MASK) 887 #define PAGEMAP_WALK_MASK (PMD_MASK)
888 888
889 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 889 #define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
890 #define PM_STATUS_BITS 3 890 #define PM_STATUS_BITS 3
891 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 891 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
892 #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 892 #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
893 #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) 893 #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
894 #define PM_PSHIFT_BITS 6 894 #define PM_PSHIFT_BITS 6
895 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 895 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
896 #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 896 #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
897 #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 897 #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
898 #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 898 #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
899 #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 899 #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
900 /* in "new" pagemap pshift bits are occupied with more status bits */ 900 /* in "new" pagemap pshift bits are occupied with more status bits */
901 #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) 901 #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
902 902
903 #define __PM_SOFT_DIRTY (1LL) 903 #define __PM_SOFT_DIRTY (1LL)
904 #define PM_PRESENT PM_STATUS(4LL) 904 #define PM_PRESENT PM_STATUS(4LL)
905 #define PM_SWAP PM_STATUS(2LL) 905 #define PM_SWAP PM_STATUS(2LL)
906 #define PM_FILE PM_STATUS(1LL) 906 #define PM_FILE PM_STATUS(1LL)
907 #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) 907 #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
908 #define PM_END_OF_BUFFER 1 908 #define PM_END_OF_BUFFER 1
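Working through the arithmetic of the macros above: PM_STATUS_OFFSET = 64 - 3 = 61, so PM_FILE, PM_SWAP and PM_PRESENT occupy bits 61, 62 and 63; PM_PSHIFT_OFFSET = 61 - 6 = 55, so the page-shift/flags2 field covers bits 55-60 (which is where __PM_SOFT_DIRTY lands) and PM_PFRAME_MASK covers bits 0-54, matching the layout documented further down. A standalone userspace sketch (illustrative only) that pins those positions down:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t pm_present = 4ULL << 61;		/* PM_STATUS(4) */
	const uint64_t pm_swap    = 2ULL << 61;		/* PM_STATUS(2) */
	const uint64_t pm_file    = 1ULL << 61;		/* PM_STATUS(1) */
	const uint64_t soft_dirty = 1ULL << 55;		/* __PM_PSHIFT(__PM_SOFT_DIRTY) */
	const uint64_t pfn_mask   = (1ULL << 55) - 1;	/* bits 0-54 */

	/* Prints: present 63 swap 62 file 61 soft-dirty 55 pfn-bits 55 */
	printf("present %d swap %d file %d soft-dirty %d pfn-bits %d\n",
	       63 - __builtin_clzll(pm_present), 63 - __builtin_clzll(pm_swap),
	       63 - __builtin_clzll(pm_file), 63 - __builtin_clzll(soft_dirty),
	       64 - __builtin_clzll(pfn_mask));
	return 0;
}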
909 909
910 static inline pagemap_entry_t make_pme(u64 val) 910 static inline pagemap_entry_t make_pme(u64 val)
911 { 911 {
912 return (pagemap_entry_t) { .pme = val }; 912 return (pagemap_entry_t) { .pme = val };
913 } 913 }
914 914
915 static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, 915 static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
916 struct pagemapread *pm) 916 struct pagemapread *pm)
917 { 917 {
918 pm->buffer[pm->pos++] = *pme; 918 pm->buffer[pm->pos++] = *pme;
919 if (pm->pos >= pm->len) 919 if (pm->pos >= pm->len)
920 return PM_END_OF_BUFFER; 920 return PM_END_OF_BUFFER;
921 return 0; 921 return 0;
922 } 922 }
923 923
924 static int pagemap_pte_hole(unsigned long start, unsigned long end, 924 static int pagemap_pte_hole(unsigned long start, unsigned long end,
925 struct mm_walk *walk) 925 struct mm_walk *walk)
926 { 926 {
927 struct pagemapread *pm = walk->private; 927 struct pagemapread *pm = walk->private;
928 unsigned long addr = start; 928 unsigned long addr = start;
929 int err = 0; 929 int err = 0;
930 930
931 while (addr < end) { 931 while (addr < end) {
932 struct vm_area_struct *vma = find_vma(walk->mm, addr); 932 struct vm_area_struct *vma = find_vma(walk->mm, addr);
933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
934 unsigned long vm_end; 934 /* End of address space hole, which we mark as non-present. */
935 unsigned long hole_end;
935 936
936 if (!vma) { 937 if (vma)
937 vm_end = end; 938 hole_end = min(end, vma->vm_start);
938 } else { 939 else
939 vm_end = min(end, vma->vm_end); 940 hole_end = end;
940 if (vma->vm_flags & VM_SOFTDIRTY) 941
941 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 942 for (; addr < hole_end; addr += PAGE_SIZE) {
943 err = add_to_pagemap(addr, &pme, pm);
944 if (err)
945 goto out;
942 } 946 }
943 947
944 for (; addr < vm_end; addr += PAGE_SIZE) { 948 if (!vma)
949 break;
950
951 /* Addresses in the VMA. */
952 if (vma->vm_flags & VM_SOFTDIRTY)
953 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
954 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
945 err = add_to_pagemap(addr, &pme, pm); 955 err = add_to_pagemap(addr, &pme, pm);
946 if (err) 956 if (err)
947 goto out; 957 goto out;
948 } 958 }
949 } 959 }
950
951 out: 960 out:
952 return err; 961 return err;
953 } 962 }
954 963
955 static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 964 static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
956 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 965 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
957 { 966 {
958 u64 frame, flags; 967 u64 frame, flags;
959 struct page *page = NULL; 968 struct page *page = NULL;
960 int flags2 = 0; 969 int flags2 = 0;
961 970
962 if (pte_present(pte)) { 971 if (pte_present(pte)) {
963 frame = pte_pfn(pte); 972 frame = pte_pfn(pte);
964 flags = PM_PRESENT; 973 flags = PM_PRESENT;
965 page = vm_normal_page(vma, addr, pte); 974 page = vm_normal_page(vma, addr, pte);
966 if (pte_soft_dirty(pte)) 975 if (pte_soft_dirty(pte))
967 flags2 |= __PM_SOFT_DIRTY; 976 flags2 |= __PM_SOFT_DIRTY;
968 } else if (is_swap_pte(pte)) { 977 } else if (is_swap_pte(pte)) {
969 swp_entry_t entry; 978 swp_entry_t entry;
970 if (pte_swp_soft_dirty(pte)) 979 if (pte_swp_soft_dirty(pte))
971 flags2 |= __PM_SOFT_DIRTY; 980 flags2 |= __PM_SOFT_DIRTY;
972 entry = pte_to_swp_entry(pte); 981 entry = pte_to_swp_entry(pte);
973 frame = swp_type(entry) | 982 frame = swp_type(entry) |
974 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 983 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
975 flags = PM_SWAP; 984 flags = PM_SWAP;
976 if (is_migration_entry(entry)) 985 if (is_migration_entry(entry))
977 page = migration_entry_to_page(entry); 986 page = migration_entry_to_page(entry);
978 } else { 987 } else {
979 if (vma->vm_flags & VM_SOFTDIRTY) 988 if (vma->vm_flags & VM_SOFTDIRTY)
980 flags2 |= __PM_SOFT_DIRTY; 989 flags2 |= __PM_SOFT_DIRTY;
981 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 990 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
982 return; 991 return;
983 } 992 }
984 993
985 if (page && !PageAnon(page)) 994 if (page && !PageAnon(page))
986 flags |= PM_FILE; 995 flags |= PM_FILE;
987 if ((vma->vm_flags & VM_SOFTDIRTY)) 996 if ((vma->vm_flags & VM_SOFTDIRTY))
988 flags2 |= __PM_SOFT_DIRTY; 997 flags2 |= __PM_SOFT_DIRTY;
989 998
990 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 999 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
991 } 1000 }
992 1001
993 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1002 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
994 static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1003 static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
995 pmd_t pmd, int offset, int pmd_flags2) 1004 pmd_t pmd, int offset, int pmd_flags2)
996 { 1005 {
997 /* 1006 /*
998 * Currently pmd for thp is always present because thp can not be 1007 * Currently pmd for thp is always present because thp can not be
999 * swapped-out, migrated, or HWPOISONed (split in such cases instead.) 1008 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
1000 * This if-check is just to prepare for future implementation. 1009 * This if-check is just to prepare for future implementation.
1001 */ 1010 */
1002 if (pmd_present(pmd)) 1011 if (pmd_present(pmd))
1003 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 1012 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
1004 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); 1013 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
1005 else 1014 else
1006 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); 1015 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
1007 } 1016 }
1008 #else 1017 #else
1009 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1018 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1010 pmd_t pmd, int offset, int pmd_flags2) 1019 pmd_t pmd, int offset, int pmd_flags2)
1011 { 1020 {
1012 } 1021 }
1013 #endif 1022 #endif
1014 1023
1015 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1024 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1016 struct mm_walk *walk) 1025 struct mm_walk *walk)
1017 { 1026 {
1018 struct vm_area_struct *vma; 1027 struct vm_area_struct *vma;
1019 struct pagemapread *pm = walk->private; 1028 struct pagemapread *pm = walk->private;
1020 spinlock_t *ptl; 1029 spinlock_t *ptl;
1021 pte_t *pte; 1030 pte_t *pte;
1022 int err = 0; 1031 int err = 0;
1023 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1032 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1024 1033
1025 /* find the first VMA at or above 'addr' */ 1034 /* find the first VMA at or above 'addr' */
1026 vma = find_vma(walk->mm, addr); 1035 vma = find_vma(walk->mm, addr);
1027 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1036 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1028 int pmd_flags2; 1037 int pmd_flags2;
1029 1038
1030 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1039 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
1031 pmd_flags2 = __PM_SOFT_DIRTY; 1040 pmd_flags2 = __PM_SOFT_DIRTY;
1032 else 1041 else
1033 pmd_flags2 = 0; 1042 pmd_flags2 = 0;
1034 1043
1035 for (; addr != end; addr += PAGE_SIZE) { 1044 for (; addr != end; addr += PAGE_SIZE) {
1036 unsigned long offset; 1045 unsigned long offset;
1037 1046
1038 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1047 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1039 PAGE_SHIFT; 1048 PAGE_SHIFT;
1040 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); 1049 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
1041 err = add_to_pagemap(addr, &pme, pm); 1050 err = add_to_pagemap(addr, &pme, pm);
1042 if (err) 1051 if (err)
1043 break; 1052 break;
1044 } 1053 }
1045 spin_unlock(ptl); 1054 spin_unlock(ptl);
1046 return err; 1055 return err;
1047 } 1056 }
1048 1057
1049 if (pmd_trans_unstable(pmd)) 1058 if (pmd_trans_unstable(pmd))
1050 return 0; 1059 return 0;
1051 for (; addr != end; addr += PAGE_SIZE) { 1060 for (; addr != end; addr += PAGE_SIZE) {
1052 int flags2; 1061 int flags2;
1053 1062
1054 /* check to see if we've left 'vma' behind 1063 /* check to see if we've left 'vma' behind
1055 * and need a new, higher one */ 1064 * and need a new, higher one */
1056 if (vma && (addr >= vma->vm_end)) { 1065 if (vma && (addr >= vma->vm_end)) {
1057 vma = find_vma(walk->mm, addr); 1066 vma = find_vma(walk->mm, addr);
1058 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1067 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1059 flags2 = __PM_SOFT_DIRTY; 1068 flags2 = __PM_SOFT_DIRTY;
1060 else 1069 else
1061 flags2 = 0; 1070 flags2 = 0;
1062 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1071 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1063 } 1072 }
1064 1073
1065 /* check that 'vma' actually covers this address, 1074 /* check that 'vma' actually covers this address,
1066 * and that it isn't a huge page vma */ 1075 * and that it isn't a huge page vma */
1067 if (vma && (vma->vm_start <= addr) && 1076 if (vma && (vma->vm_start <= addr) &&
1068 !is_vm_hugetlb_page(vma)) { 1077 !is_vm_hugetlb_page(vma)) {
1069 pte = pte_offset_map(pmd, addr); 1078 pte = pte_offset_map(pmd, addr);
1070 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1079 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1071 /* unmap before userspace copy */ 1080 /* unmap before userspace copy */
1072 pte_unmap(pte); 1081 pte_unmap(pte);
1073 } 1082 }
1074 err = add_to_pagemap(addr, &pme, pm); 1083 err = add_to_pagemap(addr, &pme, pm);
1075 if (err) 1084 if (err)
1076 return err; 1085 return err;
1077 } 1086 }
1078 1087
1079 cond_resched(); 1088 cond_resched();
1080 1089
1081 return err; 1090 return err;
1082 } 1091 }
1083 1092
1084 #ifdef CONFIG_HUGETLB_PAGE 1093 #ifdef CONFIG_HUGETLB_PAGE
1085 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1094 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1086 pte_t pte, int offset, int flags2) 1095 pte_t pte, int offset, int flags2)
1087 { 1096 {
1088 if (pte_present(pte)) 1097 if (pte_present(pte))
1089 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1098 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1090 PM_STATUS2(pm->v2, flags2) | 1099 PM_STATUS2(pm->v2, flags2) |
1091 PM_PRESENT); 1100 PM_PRESENT);
1092 else 1101 else
1093 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | 1102 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1094 PM_STATUS2(pm->v2, flags2)); 1103 PM_STATUS2(pm->v2, flags2));
1095 } 1104 }
1096 1105
1097 /* This function walks within one hugetlb entry in a single call */ 1106 /* This function walks within one hugetlb entry in a single call */
1098 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, 1107 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1099 unsigned long addr, unsigned long end, 1108 unsigned long addr, unsigned long end,
1100 struct mm_walk *walk) 1109 struct mm_walk *walk)
1101 { 1110 {
1102 struct pagemapread *pm = walk->private; 1111 struct pagemapread *pm = walk->private;
1103 struct vm_area_struct *vma; 1112 struct vm_area_struct *vma;
1104 int err = 0; 1113 int err = 0;
1105 int flags2; 1114 int flags2;
1106 pagemap_entry_t pme; 1115 pagemap_entry_t pme;
1107 1116
1108 vma = find_vma(walk->mm, addr); 1117 vma = find_vma(walk->mm, addr);
1109 WARN_ON_ONCE(!vma); 1118 WARN_ON_ONCE(!vma);
1110 1119
1111 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1120 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1112 flags2 = __PM_SOFT_DIRTY; 1121 flags2 = __PM_SOFT_DIRTY;
1113 else 1122 else
1114 flags2 = 0; 1123 flags2 = 0;
1115 1124
1116 for (; addr != end; addr += PAGE_SIZE) { 1125 for (; addr != end; addr += PAGE_SIZE) {
1117 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1126 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1118 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1127 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1119 err = add_to_pagemap(addr, &pme, pm); 1128 err = add_to_pagemap(addr, &pme, pm);
1120 if (err) 1129 if (err)
1121 return err; 1130 return err;
1122 } 1131 }
1123 1132
1124 cond_resched(); 1133 cond_resched();
1125 1134
1126 return err; 1135 return err;
1127 } 1136 }
1128 #endif /* HUGETLB_PAGE */ 1137 #endif /* HUGETLB_PAGE */
1129 1138
1130 /* 1139 /*
1131 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1140 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1132 * 1141 *
1133 * For each page in the address space, this file contains one 64-bit entry 1142 * For each page in the address space, this file contains one 64-bit entry
1134 * consisting of the following: 1143 * consisting of the following:
1135 * 1144 *
1136 * Bits 0-54 page frame number (PFN) if present 1145 * Bits 0-54 page frame number (PFN) if present
1137 * Bits 0-4 swap type if swapped 1146 * Bits 0-4 swap type if swapped
1138 * Bits 5-54 swap offset if swapped 1147 * Bits 5-54 swap offset if swapped
1139 * Bits 55-60 page shift (page size = 1<<page shift) 1148 * Bits 55-60 page shift (page size = 1<<page shift)
1140 * Bit 61 page is file-page or shared-anon 1149 * Bit 61 page is file-page or shared-anon
1141 * Bit 62 page swapped 1150 * Bit 62 page swapped
1142 * Bit 63 page present 1151 * Bit 63 page present
1143 * 1152 *
1144 * If the page is not present but in swap, then the PFN contains an 1153 * If the page is not present but in swap, then the PFN contains an
1145 * encoding of the swap file number and the page's offset into the 1154 * encoding of the swap file number and the page's offset into the
1146 * swap. Unmapped pages return a null PFN. This allows determining 1155 * swap. Unmapped pages return a null PFN. This allows determining
1147 * precisely which pages are mapped (or in swap) and comparing mapped 1156 * precisely which pages are mapped (or in swap) and comparing mapped
1148 * pages between processes. 1157 * pages between processes.
1149 * 1158 *
1150 * Efficient users of this interface will use /proc/pid/maps to 1159 * Efficient users of this interface will use /proc/pid/maps to
1151 * determine which areas of memory are actually mapped and llseek to 1160 * determine which areas of memory are actually mapped and llseek to
1152 * skip over unmapped regions. 1161 * skip over unmapped regions.
1153 */ 1162 */
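To make the layout above concrete, here is a hedged userspace sketch (not part of this file): the entry for a virtual address lives at offset (vaddr / page_size) * 8, and the documented bit positions are enough to pick it apart.

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

static void lookup(const void *vaddr)
{
	uint64_t entry;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = ((uintptr_t)vaddr / psize) * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return;
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
		close(fd);
		return;
	}
	close(fd);

	if (entry & (1ULL << 63))		/* bit 63: present */
		printf("%p -> pfn 0x%llx%s\n", vaddr,
		       (unsigned long long)(entry & ((1ULL << 55) - 1)),
		       (entry & (1ULL << 61)) ? ", file/shared-anon" : "");
	else if (entry & (1ULL << 62))		/* bit 62: swapped */
		printf("%p -> swap type %llu, offset %llu\n", vaddr,
		       (unsigned long long)(entry & 0x1f),
		       (unsigned long long)((entry >> 5) & ((1ULL << 50) - 1)));
	else
		printf("%p -> not present\n", vaddr);
}

int main(void)
{
	int on_stack = 0;

	lookup(&on_stack);
	return 0;
}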
1154 static ssize_t pagemap_read(struct file *file, char __user *buf, 1163 static ssize_t pagemap_read(struct file *file, char __user *buf,
1155 size_t count, loff_t *ppos) 1164 size_t count, loff_t *ppos)
1156 { 1165 {
1157 struct task_struct *task = get_proc_task(file_inode(file)); 1166 struct task_struct *task = get_proc_task(file_inode(file));
1158 struct mm_struct *mm; 1167 struct mm_struct *mm;
1159 struct pagemapread pm; 1168 struct pagemapread pm;
1160 int ret = -ESRCH; 1169 int ret = -ESRCH;
1161 struct mm_walk pagemap_walk = {}; 1170 struct mm_walk pagemap_walk = {};
1162 unsigned long src; 1171 unsigned long src;
1163 unsigned long svpfn; 1172 unsigned long svpfn;
1164 unsigned long start_vaddr; 1173 unsigned long start_vaddr;
1165 unsigned long end_vaddr; 1174 unsigned long end_vaddr;
1166 int copied = 0; 1175 int copied = 0;
1167 1176
1168 if (!task) 1177 if (!task)
1169 goto out; 1178 goto out;
1170 1179
1171 ret = -EINVAL; 1180 ret = -EINVAL;
1172 /* file position must be aligned */ 1181 /* file position must be aligned */
1173 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1182 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1174 goto out_task; 1183 goto out_task;
1175 1184
1176 ret = 0; 1185 ret = 0;
1177 if (!count) 1186 if (!count)
1178 goto out_task; 1187 goto out_task;
1179 1188
1180 pm.v2 = soft_dirty_cleared; 1189 pm.v2 = soft_dirty_cleared;
1181 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1190 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1182 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1191 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1183 ret = -ENOMEM; 1192 ret = -ENOMEM;
1184 if (!pm.buffer) 1193 if (!pm.buffer)
1185 goto out_task; 1194 goto out_task;
1186 1195
1187 mm = mm_access(task, PTRACE_MODE_READ); 1196 mm = mm_access(task, PTRACE_MODE_READ);
1188 ret = PTR_ERR(mm); 1197 ret = PTR_ERR(mm);
1189 if (!mm || IS_ERR(mm)) 1198 if (!mm || IS_ERR(mm))
1190 goto out_free; 1199 goto out_free;
1191 1200
1192 pagemap_walk.pmd_entry = pagemap_pte_range; 1201 pagemap_walk.pmd_entry = pagemap_pte_range;
1193 pagemap_walk.pte_hole = pagemap_pte_hole; 1202 pagemap_walk.pte_hole = pagemap_pte_hole;
1194 #ifdef CONFIG_HUGETLB_PAGE 1203 #ifdef CONFIG_HUGETLB_PAGE
1195 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1204 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1196 #endif 1205 #endif
1197 pagemap_walk.mm = mm; 1206 pagemap_walk.mm = mm;
1198 pagemap_walk.private = &pm; 1207 pagemap_walk.private = &pm;
1199 1208
1200 src = *ppos; 1209 src = *ppos;
1201 svpfn = src / PM_ENTRY_BYTES; 1210 svpfn = src / PM_ENTRY_BYTES;
1202 start_vaddr = svpfn << PAGE_SHIFT; 1211 start_vaddr = svpfn << PAGE_SHIFT;
1203 end_vaddr = TASK_SIZE_OF(task); 1212 end_vaddr = TASK_SIZE_OF(task);
1204 1213
1205 /* watch out for wraparound */ 1214 /* watch out for wraparound */
1206 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 1215 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
1207 start_vaddr = end_vaddr; 1216 start_vaddr = end_vaddr;
1208 1217
1209 /* 1218 /*
1210 * The odds are that this will stop walking way 1219 * The odds are that this will stop walking way
1211 * before end_vaddr, because the length of the 1220 * before end_vaddr, because the length of the
1212 * user buffer is tracked in "pm", and the walk 1221 * user buffer is tracked in "pm", and the walk
1213 * will stop when we hit the end of the buffer. 1222 * will stop when we hit the end of the buffer.
1214 */ 1223 */
1215 ret = 0; 1224 ret = 0;
1216 while (count && (start_vaddr < end_vaddr)) { 1225 while (count && (start_vaddr < end_vaddr)) {
1217 int len; 1226 int len;
1218 unsigned long end; 1227 unsigned long end;
1219 1228
1220 pm.pos = 0; 1229 pm.pos = 0;
1221 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 1230 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1222 /* overflow ? */ 1231 /* overflow ? */
1223 if (end < start_vaddr || end > end_vaddr) 1232 if (end < start_vaddr || end > end_vaddr)
1224 end = end_vaddr; 1233 end = end_vaddr;
1225 down_read(&mm->mmap_sem); 1234 down_read(&mm->mmap_sem);
1226 ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1235 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1227 up_read(&mm->mmap_sem); 1236 up_read(&mm->mmap_sem);
1228 start_vaddr = end; 1237 start_vaddr = end;
1229 1238
1230 len = min(count, PM_ENTRY_BYTES * pm.pos); 1239 len = min(count, PM_ENTRY_BYTES * pm.pos);
1231 if (copy_to_user(buf, pm.buffer, len)) { 1240 if (copy_to_user(buf, pm.buffer, len)) {
1232 ret = -EFAULT; 1241 ret = -EFAULT;
1233 goto out_mm; 1242 goto out_mm;
1234 } 1243 }
1235 copied += len; 1244 copied += len;
1236 buf += len; 1245 buf += len;
1237 count -= len; 1246 count -= len;
1238 } 1247 }
1239 *ppos += copied; 1248 *ppos += copied;
1240 if (!ret || ret == PM_END_OF_BUFFER) 1249 if (!ret || ret == PM_END_OF_BUFFER)
1241 ret = copied; 1250 ret = copied;
1242 1251
1243 out_mm: 1252 out_mm:
1244 mmput(mm); 1253 mmput(mm);
1245 out_free: 1254 out_free:
1246 kfree(pm.buffer); 1255 kfree(pm.buffer);
1247 out_task: 1256 out_task:
1248 put_task_struct(task); 1257 put_task_struct(task);
1249 out: 1258 out:
1250 return ret; 1259 return ret;
1251 } 1260 }
1252 1261
1253 static int pagemap_open(struct inode *inode, struct file *file) 1262 static int pagemap_open(struct inode *inode, struct file *file)
1254 { 1263 {
1255 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1264 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1256 "to stop being page-shift some time soon. See the " 1265 "to stop being page-shift some time soon. See the "
1257 "linux/Documentation/vm/pagemap.txt for details.\n"); 1266 "linux/Documentation/vm/pagemap.txt for details.\n");
1258 return 0; 1267 return 0;
1259 } 1268 }
1260 1269
1261 const struct file_operations proc_pagemap_operations = { 1270 const struct file_operations proc_pagemap_operations = {
1262 .llseek = mem_lseek, /* borrow this */ 1271 .llseek = mem_lseek, /* borrow this */
1263 .read = pagemap_read, 1272 .read = pagemap_read,
1264 .open = pagemap_open, 1273 .open = pagemap_open,
1265 }; 1274 };
1266 #endif /* CONFIG_PROC_PAGE_MONITOR */ 1275 #endif /* CONFIG_PROC_PAGE_MONITOR */
1267 1276
1268 #ifdef CONFIG_NUMA 1277 #ifdef CONFIG_NUMA
1269 1278
1270 struct numa_maps { 1279 struct numa_maps {
1271 struct vm_area_struct *vma; 1280 struct vm_area_struct *vma;
1272 unsigned long pages; 1281 unsigned long pages;
1273 unsigned long anon; 1282 unsigned long anon;
1274 unsigned long active; 1283 unsigned long active;
1275 unsigned long writeback; 1284 unsigned long writeback;
1276 unsigned long mapcount_max; 1285 unsigned long mapcount_max;
1277 unsigned long dirty; 1286 unsigned long dirty;
1278 unsigned long swapcache; 1287 unsigned long swapcache;
1279 unsigned long node[MAX_NUMNODES]; 1288 unsigned long node[MAX_NUMNODES];
1280 }; 1289 };
1281 1290
1282 struct numa_maps_private { 1291 struct numa_maps_private {
1283 struct proc_maps_private proc_maps; 1292 struct proc_maps_private proc_maps;
1284 struct numa_maps md; 1293 struct numa_maps md;
1285 }; 1294 };
1286 1295
1287 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 1296 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1288 unsigned long nr_pages) 1297 unsigned long nr_pages)
1289 { 1298 {
1290 int count = page_mapcount(page); 1299 int count = page_mapcount(page);
1291 1300
1292 md->pages += nr_pages; 1301 md->pages += nr_pages;
1293 if (pte_dirty || PageDirty(page)) 1302 if (pte_dirty || PageDirty(page))
1294 md->dirty += nr_pages; 1303 md->dirty += nr_pages;
1295 1304
1296 if (PageSwapCache(page)) 1305 if (PageSwapCache(page))
1297 md->swapcache += nr_pages; 1306 md->swapcache += nr_pages;
1298 1307
1299 if (PageActive(page) || PageUnevictable(page)) 1308 if (PageActive(page) || PageUnevictable(page))
1300 md->active += nr_pages; 1309 md->active += nr_pages;
1301 1310
1302 if (PageWriteback(page)) 1311 if (PageWriteback(page))
1303 md->writeback += nr_pages; 1312 md->writeback += nr_pages;
1304 1313
1305 if (PageAnon(page)) 1314 if (PageAnon(page))
1306 md->anon += nr_pages; 1315 md->anon += nr_pages;
1307 1316
1308 if (count > md->mapcount_max) 1317 if (count > md->mapcount_max)
1309 md->mapcount_max = count; 1318 md->mapcount_max = count;
1310 1319
1311 md->node[page_to_nid(page)] += nr_pages; 1320 md->node[page_to_nid(page)] += nr_pages;
1312 } 1321 }
1313 1322
1314 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 1323 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1315 unsigned long addr) 1324 unsigned long addr)
1316 { 1325 {
1317 struct page *page; 1326 struct page *page;
1318 int nid; 1327 int nid;
1319 1328
1320 if (!pte_present(pte)) 1329 if (!pte_present(pte))
1321 return NULL; 1330 return NULL;
1322 1331
1323 page = vm_normal_page(vma, addr, pte); 1332 page = vm_normal_page(vma, addr, pte);
1324 if (!page) 1333 if (!page)
1325 return NULL; 1334 return NULL;
1326 1335
1327 if (PageReserved(page)) 1336 if (PageReserved(page))
1328 return NULL; 1337 return NULL;
1329 1338
1330 nid = page_to_nid(page); 1339 nid = page_to_nid(page);
1331 if (!node_isset(nid, node_states[N_MEMORY])) 1340 if (!node_isset(nid, node_states[N_MEMORY]))
1332 return NULL; 1341 return NULL;
1333 1342
1334 return page; 1343 return page;
1335 } 1344 }
1336 1345
1337 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1346 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1338 unsigned long end, struct mm_walk *walk) 1347 unsigned long end, struct mm_walk *walk)
1339 { 1348 {
1340 struct numa_maps *md; 1349 struct numa_maps *md;
1341 spinlock_t *ptl; 1350 spinlock_t *ptl;
1342 pte_t *orig_pte; 1351 pte_t *orig_pte;
1343 pte_t *pte; 1352 pte_t *pte;
1344 1353
1345 md = walk->private; 1354 md = walk->private;
1346 1355
1347 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { 1356 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1348 pte_t huge_pte = *(pte_t *)pmd; 1357 pte_t huge_pte = *(pte_t *)pmd;
1349 struct page *page; 1358 struct page *page;
1350 1359
1351 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1360 page = can_gather_numa_stats(huge_pte, md->vma, addr);
1352 if (page) 1361 if (page)
1353 gather_stats(page, md, pte_dirty(huge_pte), 1362 gather_stats(page, md, pte_dirty(huge_pte),
1354 HPAGE_PMD_SIZE/PAGE_SIZE); 1363 HPAGE_PMD_SIZE/PAGE_SIZE);
1355 spin_unlock(ptl); 1364 spin_unlock(ptl);
1356 return 0; 1365 return 0;
1357 } 1366 }
1358 1367
1359 if (pmd_trans_unstable(pmd)) 1368 if (pmd_trans_unstable(pmd))
1360 return 0; 1369 return 0;
1361 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1370 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1362 do { 1371 do {
1363 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1372 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
1364 if (!page) 1373 if (!page)
1365 continue; 1374 continue;
1366 gather_stats(page, md, pte_dirty(*pte), 1); 1375 gather_stats(page, md, pte_dirty(*pte), 1);
1367 1376
1368 } while (pte++, addr += PAGE_SIZE, addr != end); 1377 } while (pte++, addr += PAGE_SIZE, addr != end);
1369 pte_unmap_unlock(orig_pte, ptl); 1378 pte_unmap_unlock(orig_pte, ptl);
1370 return 0; 1379 return 0;
1371 } 1380 }
1372 #ifdef CONFIG_HUGETLB_PAGE 1381 #ifdef CONFIG_HUGETLB_PAGE
1373 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1382 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1374 unsigned long addr, unsigned long end, struct mm_walk *walk) 1383 unsigned long addr, unsigned long end, struct mm_walk *walk)
1375 { 1384 {
1376 struct numa_maps *md; 1385 struct numa_maps *md;
1377 struct page *page; 1386 struct page *page;
1378 1387
1379 if (!pte_present(*pte)) 1388 if (!pte_present(*pte))
1380 return 0; 1389 return 0;
1381 1390
1382 page = pte_page(*pte); 1391 page = pte_page(*pte);
1383 if (!page) 1392 if (!page)
1384 return 0; 1393 return 0;
1385 1394
1386 md = walk->private; 1395 md = walk->private;
1387 gather_stats(page, md, pte_dirty(*pte), 1); 1396 gather_stats(page, md, pte_dirty(*pte), 1);
1388 return 0; 1397 return 0;
1389 } 1398 }
1390 1399
1391 #else 1400 #else
1392 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1401 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1393 unsigned long addr, unsigned long end, struct mm_walk *walk) 1402 unsigned long addr, unsigned long end, struct mm_walk *walk)
1394 { 1403 {
1395 return 0; 1404 return 0;
1396 } 1405 }
1397 #endif 1406 #endif
1398 1407
1399 /* 1408 /*
1400 * Display pages allocated per node and memory policy via /proc. 1409 * Display pages allocated per node and memory policy via /proc.
1401 */ 1410 */
1402 static int show_numa_map(struct seq_file *m, void *v, int is_pid) 1411 static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1403 { 1412 {
1404 struct numa_maps_private *numa_priv = m->private; 1413 struct numa_maps_private *numa_priv = m->private;
1405 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1414 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1406 struct vm_area_struct *vma = v; 1415 struct vm_area_struct *vma = v;
1407 struct numa_maps *md = &numa_priv->md; 1416 struct numa_maps *md = &numa_priv->md;
1408 struct file *file = vma->vm_file; 1417 struct file *file = vma->vm_file;
1409 struct task_struct *task = proc_priv->task; 1418 struct task_struct *task = proc_priv->task;
1410 struct mm_struct *mm = vma->vm_mm; 1419 struct mm_struct *mm = vma->vm_mm;
1411 struct mm_walk walk = {}; 1420 struct mm_walk walk = {};
1412 struct mempolicy *pol; 1421 struct mempolicy *pol;
1413 char buffer[64]; 1422 char buffer[64];
1414 int nid; 1423 int nid;
1415 1424
1416 if (!mm) 1425 if (!mm)
1417 return 0; 1426 return 0;
1418 1427
1419 /* Ensure we start with an empty set of numa_maps statistics. */ 1428 /* Ensure we start with an empty set of numa_maps statistics. */
1420 memset(md, 0, sizeof(*md)); 1429 memset(md, 0, sizeof(*md));
1421 1430
1422 md->vma = vma; 1431 md->vma = vma;
1423 1432
1424 walk.hugetlb_entry = gather_hugetbl_stats; 1433 walk.hugetlb_entry = gather_hugetbl_stats;
1425 walk.pmd_entry = gather_pte_stats; 1434 walk.pmd_entry = gather_pte_stats;
1426 walk.private = md; 1435 walk.private = md;
1427 walk.mm = mm; 1436 walk.mm = mm;
1428 1437
1429 pol = get_vma_policy(task, vma, vma->vm_start); 1438 pol = get_vma_policy(task, vma, vma->vm_start);
1430 mpol_to_str(buffer, sizeof(buffer), pol); 1439 mpol_to_str(buffer, sizeof(buffer), pol);
1431 mpol_cond_put(pol); 1440 mpol_cond_put(pol);
1432 1441
1433 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1442 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1434 1443
1435 if (file) { 1444 if (file) {
1436 seq_puts(m, " file="); 1445 seq_puts(m, " file=");
1437 seq_path(m, &file->f_path, "\n\t= "); 1446 seq_path(m, &file->f_path, "\n\t= ");
1438 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1447 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1439 seq_puts(m, " heap"); 1448 seq_puts(m, " heap");
1440 } else { 1449 } else {
1441 pid_t tid = vm_is_stack(task, vma, is_pid); 1450 pid_t tid = vm_is_stack(task, vma, is_pid);
1442 if (tid != 0) { 1451 if (tid != 0) {
1443 /* 1452 /*
1444 * Thread stack in /proc/PID/task/TID/maps or 1453 * Thread stack in /proc/PID/task/TID/maps or
1445 * the main process stack. 1454 * the main process stack.
1446 */ 1455 */
1447 if (!is_pid || (vma->vm_start <= mm->start_stack && 1456 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1448 vma->vm_end >= mm->start_stack)) 1457 vma->vm_end >= mm->start_stack))
1449 seq_puts(m, " stack"); 1458 seq_puts(m, " stack");
1450 else 1459 else
1451 seq_printf(m, " stack:%d", tid); 1460 seq_printf(m, " stack:%d", tid);
1452 } 1461 }
1453 } 1462 }
1454 1463
1455 if (is_vm_hugetlb_page(vma)) 1464 if (is_vm_hugetlb_page(vma))
1456 seq_puts(m, " huge"); 1465 seq_puts(m, " huge");
1457 1466
1458 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1467 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1459 1468
1460 if (!md->pages) 1469 if (!md->pages)
1461 goto out; 1470 goto out;
1462 1471
1463 if (md->anon) 1472 if (md->anon)
1464 seq_printf(m, " anon=%lu", md->anon); 1473 seq_printf(m, " anon=%lu", md->anon);
1465 1474
1466 if (md->dirty) 1475 if (md->dirty)
1467 seq_printf(m, " dirty=%lu", md->dirty); 1476 seq_printf(m, " dirty=%lu", md->dirty);
1468 1477
1469 if (md->pages != md->anon && md->pages != md->dirty) 1478 if (md->pages != md->anon && md->pages != md->dirty)
1470 seq_printf(m, " mapped=%lu", md->pages); 1479 seq_printf(m, " mapped=%lu", md->pages);
1471 1480
1472 if (md->mapcount_max > 1) 1481 if (md->mapcount_max > 1)
1473 seq_printf(m, " mapmax=%lu", md->mapcount_max); 1482 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1474 1483
1475 if (md->swapcache) 1484 if (md->swapcache)
1476 seq_printf(m, " swapcache=%lu", md->swapcache); 1485 seq_printf(m, " swapcache=%lu", md->swapcache);
1477 1486
1478 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 1487 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1479 seq_printf(m, " active=%lu", md->active); 1488 seq_printf(m, " active=%lu", md->active);
1480 1489
1481 if (md->writeback) 1490 if (md->writeback)
1482 seq_printf(m, " writeback=%lu", md->writeback); 1491 seq_printf(m, " writeback=%lu", md->writeback);
1483 1492
1484 for_each_node_state(nid, N_MEMORY) 1493 for_each_node_state(nid, N_MEMORY)
1485 if (md->node[nid]) 1494 if (md->node[nid])
1486 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1495 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1487 out: 1496 out:
1488 seq_putc(m, '\n'); 1497 seq_putc(m, '\n');
1489 1498
1490 if (m->count < m->size) 1499 if (m->count < m->size)
1491 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; 1500 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1492 return 0; 1501 return 0;
1493 } 1502 }
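Taken together, the seq_printf() calls above emit one line per VMA; on a single-node machine a file-backed mapping would come out roughly like the following (illustrative values only):

00400000 default file=/bin/cat mapped=7 mapmax=2 N0=7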
1494 1503
1495 static int show_pid_numa_map(struct seq_file *m, void *v) 1504 static int show_pid_numa_map(struct seq_file *m, void *v)
1496 { 1505 {
1497 return show_numa_map(m, v, 1); 1506 return show_numa_map(m, v, 1);
1498 } 1507 }
1499 1508
1500 static int show_tid_numa_map(struct seq_file *m, void *v) 1509 static int show_tid_numa_map(struct seq_file *m, void *v)
1501 { 1510 {
1502 return show_numa_map(m, v, 0); 1511 return show_numa_map(m, v, 0);
1503 } 1512 }
1504 1513
1505 static const struct seq_operations proc_pid_numa_maps_op = { 1514 static const struct seq_operations proc_pid_numa_maps_op = {
1506 .start = m_start, 1515 .start = m_start,
1507 .next = m_next, 1516 .next = m_next,
1508 .stop = m_stop, 1517 .stop = m_stop,
1509 .show = show_pid_numa_map, 1518 .show = show_pid_numa_map,
1510 }; 1519 };
1511 1520
1512 static const struct seq_operations proc_tid_numa_maps_op = { 1521 static const struct seq_operations proc_tid_numa_maps_op = {
1513 .start = m_start, 1522 .start = m_start,
1514 .next = m_next, 1523 .next = m_next,
1515 .stop = m_stop, 1524 .stop = m_stop,
1516 .show = show_tid_numa_map, 1525 .show = show_tid_numa_map,
1517 }; 1526 };
1518 1527
1519 static int numa_maps_open(struct inode *inode, struct file *file, 1528 static int numa_maps_open(struct inode *inode, struct file *file,
1520 const struct seq_operations *ops) 1529 const struct seq_operations *ops)
1521 { 1530 {
1522 struct numa_maps_private *priv; 1531 struct numa_maps_private *priv;
1523 int ret = -ENOMEM; 1532 int ret = -ENOMEM;
1524 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1533 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1525 if (priv) { 1534 if (priv) {
1526 priv->proc_maps.pid = proc_pid(inode); 1535 priv->proc_maps.pid = proc_pid(inode);
1527 ret = seq_open(file, ops); 1536 ret = seq_open(file, ops);
1528 if (!ret) { 1537 if (!ret) {
1529 struct seq_file *m = file->private_data; 1538 struct seq_file *m = file->private_data;
1530 m->private = priv; 1539 m->private = priv;
1531 } else { 1540 } else {
1532 kfree(priv); 1541 kfree(priv);
1533 } 1542 }
1534 } 1543 }
1535 return ret; 1544 return ret;
1536 } 1545 }
1537 1546
1538 static int pid_numa_maps_open(struct inode *inode, struct file *file) 1547 static int pid_numa_maps_open(struct inode *inode, struct file *file)
1539 { 1548 {
1540 return numa_maps_open(inode, file, &proc_pid_numa_maps_op); 1549 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1541 } 1550 }
1542 1551
1543 static int tid_numa_maps_open(struct inode *inode, struct file *file) 1552 static int tid_numa_maps_open(struct inode *inode, struct file *file)
1544 { 1553 {
1545 return numa_maps_open(inode, file, &proc_tid_numa_maps_op); 1554 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1546 } 1555 }
1547 1556
1548 const struct file_operations proc_pid_numa_maps_operations = { 1557 const struct file_operations proc_pid_numa_maps_operations = {
1549 .open = pid_numa_maps_open, 1558 .open = pid_numa_maps_open,
1550 .read = seq_read, 1559 .read = seq_read,
1551 .llseek = seq_lseek, 1560 .llseek = seq_lseek,
1552 .release = seq_release_private, 1561 .release = seq_release_private,
1553 }; 1562 };
1554 1563
1555 const struct file_operations proc_tid_numa_maps_operations = { 1564 const struct file_operations proc_tid_numa_maps_operations = {
1556 .open = tid_numa_maps_open, 1565 .open = tid_numa_maps_open,
1557 .read = seq_read, 1566 .read = seq_read,
1558 .llseek = seq_lseek, 1567 .llseek = seq_lseek,
1559 .release = seq_release_private, 1568 .release = seq_release_private,
1560 }; 1569 };
1561 #endif /* CONFIG_NUMA */ 1570 #endif /* CONFIG_NUMA */
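The numa_maps plumbing above is ordinary seq_file wiring: the pid and tid variants share show_numa_map() and differ only in the is_pid flag they pass to it. Below is a hedged userspace sketch of how the resulting proc file is consumed; the exact fields on each line depend on the rest of show_numa_map(), which is not shown in this hunk.

#include <stdio.h>

int main(void)
{
	char line[1024];
	FILE *f = fopen("/proc/self/numa_maps", "r");	/* one line per VMA */

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* lines end with the " N<node>=<pages>" counts */
	fclose(f);
	return 0;
}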
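Reading the file as above is enough to see which nodes back each mapping; parsing the per-node counts is left to the caller.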
1 /* 1 /*
2 * Basic general purpose allocator for managing special purpose 2 * Basic general purpose allocator for managing special purpose
3 * memory, for example, memory that is not managed by the regular 3 * memory, for example, memory that is not managed by the regular
4 * kmalloc/kfree interface. Uses for this include on-device special 4 * kmalloc/kfree interface. Uses for this include on-device special
5 * memory, uncached memory etc. 5 * memory, uncached memory etc.
6 * 6 *
7 * It is safe to use the allocator in NMI handlers and other special 7 * It is safe to use the allocator in NMI handlers and other special
8 * unblockable contexts that could otherwise deadlock on locks. This 8 * unblockable contexts that could otherwise deadlock on locks. This
9 * is implemented by using atomic operations and retries on any 9 * is implemented by using atomic operations and retries on any
10 * conflicts. The disadvantage is that there may be livelocks in 10 * conflicts. The disadvantage is that there may be livelocks in
11 * extreme cases. For better scalability, one allocator can be used 11 * extreme cases. For better scalability, one allocator can be used
12 * for each CPU. 12 * for each CPU.
13 * 13 *
14 * The lockless operation only works if there is enough memory 14 * The lockless operation only works if there is enough memory
15 * available. If new memory is added to the pool a lock has to be 15 * available. If new memory is added to the pool a lock has to be
16 * still taken. So any user relying on locklessness has to ensure 16 * still taken. So any user relying on locklessness has to ensure
17 * that sufficient memory is preallocated. 17 * that sufficient memory is preallocated.
18 * 18 *
19 * The basic atomic operation of this allocator is cmpxchg on long. 19 * The basic atomic operation of this allocator is cmpxchg on long.
20 * On architectures that don't have NMI-safe cmpxchg implementation, 20 * On architectures that don't have NMI-safe cmpxchg implementation,
21 * the allocator can NOT be used in an NMI handler. So code that uses the 21 * the allocator can NOT be used in an NMI handler. So code that uses the
22 * allocator in an NMI handler should depend on 22 * allocator in an NMI handler should depend on
23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
24 * 24 *
25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> 25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
26 * 26 *
27 * This source code is licensed under the GNU General Public License, 27 * This source code is licensed under the GNU General Public License,
28 * Version 2. See the file COPYING for more details. 28 * Version 2. See the file COPYING for more details.
29 */ 29 */
30 30
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/export.h> 32 #include <linux/export.h>
33 #include <linux/bitmap.h> 33 #include <linux/bitmap.h>
34 #include <linux/rculist.h> 34 #include <linux/rculist.h>
35 #include <linux/interrupt.h> 35 #include <linux/interrupt.h>
36 #include <linux/genalloc.h> 36 #include <linux/genalloc.h>
37 #include <linux/of_address.h> 37 #include <linux/of_address.h>
38 #include <linux/of_device.h> 38 #include <linux/of_device.h>
39 39
40 static inline size_t chunk_size(const struct gen_pool_chunk *chunk) 40 static inline size_t chunk_size(const struct gen_pool_chunk *chunk)
41 { 41 {
42 return chunk->end_addr - chunk->start_addr + 1; 42 return chunk->end_addr - chunk->start_addr + 1;
43 } 43 }
44 44
45 static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) 45 static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
46 { 46 {
47 unsigned long val, nval; 47 unsigned long val, nval;
48 48
49 nval = *addr; 49 nval = *addr;
50 do { 50 do {
51 val = nval; 51 val = nval;
52 if (val & mask_to_set) 52 if (val & mask_to_set)
53 return -EBUSY; 53 return -EBUSY;
54 cpu_relax(); 54 cpu_relax();
55 } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); 55 } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
56 56
57 return 0; 57 return 0;
58 } 58 }
59 59
60 static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) 60 static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear)
61 { 61 {
62 unsigned long val, nval; 62 unsigned long val, nval;
63 63
64 nval = *addr; 64 nval = *addr;
65 do { 65 do {
66 val = nval; 66 val = nval;
67 if ((val & mask_to_clear) != mask_to_clear) 67 if ((val & mask_to_clear) != mask_to_clear)
68 return -EBUSY; 68 return -EBUSY;
69 cpu_relax(); 69 cpu_relax();
70 } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); 70 } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
71 71
72 return 0; 72 return 0;
73 } 73 }
74 74
75 /* 75 /*
76 * bitmap_set_ll - set the specified number of bits at the specified position 76 * bitmap_set_ll - set the specified number of bits at the specified position
77 * @map: pointer to a bitmap 77 * @map: pointer to a bitmap
78 * @start: a bit position in @map 78 * @start: a bit position in @map
79 * @nr: number of bits to set 79 * @nr: number of bits to set
80 * 80 *
81 * Set @nr bits starting from @start in @map lock-lessly. Several users 81 * Set @nr bits starting from @start in @map lock-lessly. Several users
82 * can set/clear the same bitmap simultaneously without lock. If two 82 * can set/clear the same bitmap simultaneously without lock. If two
83 * users set the same bit, one user will return the number of remaining 83 * users set the same bit, one user will return the number of remaining
84 * bits, otherwise return 0. 84 * bits, otherwise return 0.
85 */ 85 */
86 static int bitmap_set_ll(unsigned long *map, int start, int nr) 86 static int bitmap_set_ll(unsigned long *map, int start, int nr)
87 { 87 {
88 unsigned long *p = map + BIT_WORD(start); 88 unsigned long *p = map + BIT_WORD(start);
89 const int size = start + nr; 89 const int size = start + nr;
90 int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); 90 int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
91 unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); 91 unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
92 92
93 while (nr - bits_to_set >= 0) { 93 while (nr - bits_to_set >= 0) {
94 if (set_bits_ll(p, mask_to_set)) 94 if (set_bits_ll(p, mask_to_set))
95 return nr; 95 return nr;
96 nr -= bits_to_set; 96 nr -= bits_to_set;
97 bits_to_set = BITS_PER_LONG; 97 bits_to_set = BITS_PER_LONG;
98 mask_to_set = ~0UL; 98 mask_to_set = ~0UL;
99 p++; 99 p++;
100 } 100 }
101 if (nr) { 101 if (nr) {
102 mask_to_set &= BITMAP_LAST_WORD_MASK(size); 102 mask_to_set &= BITMAP_LAST_WORD_MASK(size);
103 if (set_bits_ll(p, mask_to_set)) 103 if (set_bits_ll(p, mask_to_set))
104 return nr; 104 return nr;
105 } 105 }
106 106
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /* 110 /*
111 * bitmap_clear_ll - clear the specified number of bits at the specified position 111 * bitmap_clear_ll - clear the specified number of bits at the specified position
112 * @map: pointer to a bitmap 112 * @map: pointer to a bitmap
113 * @start: a bit position in @map 113 * @start: a bit position in @map
114 * @nr: number of bits to clear 114 * @nr: number of bits to clear
115 * 115 *
116 * Clear @nr bits starting from @start in @map lock-lessly. Several users 116 * Clear @nr bits starting from @start in @map lock-lessly. Several users
117 * can set/clear the same bitmap simultaneously without lock. If two 117 * can set/clear the same bitmap simultaneously without lock. If two
118 * users clear the same bit, one user will return the number of remaining 118 * users clear the same bit, one user will return the number of remaining
119 * bits, otherwise return 0. 119 * bits, otherwise return 0.
120 */ 120 */
121 static int bitmap_clear_ll(unsigned long *map, int start, int nr) 121 static int bitmap_clear_ll(unsigned long *map, int start, int nr)
122 { 122 {
123 unsigned long *p = map + BIT_WORD(start); 123 unsigned long *p = map + BIT_WORD(start);
124 const int size = start + nr; 124 const int size = start + nr;
125 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); 125 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
126 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); 126 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
127 127
128 while (nr - bits_to_clear >= 0) { 128 while (nr - bits_to_clear >= 0) {
129 if (clear_bits_ll(p, mask_to_clear)) 129 if (clear_bits_ll(p, mask_to_clear))
130 return nr; 130 return nr;
131 nr -= bits_to_clear; 131 nr -= bits_to_clear;
132 bits_to_clear = BITS_PER_LONG; 132 bits_to_clear = BITS_PER_LONG;
133 mask_to_clear = ~0UL; 133 mask_to_clear = ~0UL;
134 p++; 134 p++;
135 } 135 }
136 if (nr) { 136 if (nr) {
137 mask_to_clear &= BITMAP_LAST_WORD_MASK(size); 137 mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
138 if (clear_bits_ll(p, mask_to_clear)) 138 if (clear_bits_ll(p, mask_to_clear))
139 return nr; 139 return nr;
140 } 140 }
141 141
142 return 0; 142 return 0;
143 } 143 }
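Both lock-less helpers report a conflict by returning the number of bits that still remained to be processed; the caller is expected to roll back whatever was already changed and retry, which is exactly what gen_pool_alloc() does further down. A minimal sketch of that caller-side contract, written as if it lived next to these static helpers in lib/genalloc.c (the helper name is illustrative):

static int example_claim_bits(unsigned long *map, int start, int nr)
{
	int remain = bitmap_set_ll(map, start, nr);

	if (remain) {
		/* Lost a race: clear only the bits we did manage to set. */
		bitmap_clear_ll(map, start, nr - remain);
		return -EBUSY;	/* caller may pick another range and retry */
	}
	return 0;		/* all @nr bits now belong to this caller */
}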
144 144
145 /** 145 /**
146 * gen_pool_create - create a new special memory pool 146 * gen_pool_create - create a new special memory pool
147 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 147 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
148 * @nid: node id of the node the pool structure should be allocated on, or -1 148 * @nid: node id of the node the pool structure should be allocated on, or -1
149 * 149 *
150 * Create a new special memory pool that can be used to manage special purpose 150 * Create a new special memory pool that can be used to manage special purpose
151 * memory not managed by the regular kmalloc/kfree interface. 151 * memory not managed by the regular kmalloc/kfree interface.
152 */ 152 */
153 struct gen_pool *gen_pool_create(int min_alloc_order, int nid) 153 struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
154 { 154 {
155 struct gen_pool *pool; 155 struct gen_pool *pool;
156 156
157 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); 157 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
158 if (pool != NULL) { 158 if (pool != NULL) {
159 spin_lock_init(&pool->lock); 159 spin_lock_init(&pool->lock);
160 INIT_LIST_HEAD(&pool->chunks); 160 INIT_LIST_HEAD(&pool->chunks);
161 pool->min_alloc_order = min_alloc_order; 161 pool->min_alloc_order = min_alloc_order;
162 pool->algo = gen_pool_first_fit; 162 pool->algo = gen_pool_first_fit;
163 pool->data = NULL; 163 pool->data = NULL;
164 } 164 }
165 return pool; 165 return pool;
166 } 166 }
167 EXPORT_SYMBOL(gen_pool_create); 167 EXPORT_SYMBOL(gen_pool_create);
168 168
169 /** 169 /**
170 * gen_pool_add_virt - add a new chunk of special memory to the pool 170 * gen_pool_add_virt - add a new chunk of special memory to the pool
171 * @pool: pool to add new memory chunk to 171 * @pool: pool to add new memory chunk to
172 * @virt: virtual starting address of memory chunk to add to pool 172 * @virt: virtual starting address of memory chunk to add to pool
173 * @phys: physical starting address of memory chunk to add to pool 173 * @phys: physical starting address of memory chunk to add to pool
174 * @size: size in bytes of the memory chunk to add to pool 174 * @size: size in bytes of the memory chunk to add to pool
175 * @nid: node id of the node the chunk structure and bitmap should be 175 * @nid: node id of the node the chunk structure and bitmap should be
176 * allocated on, or -1 176 * allocated on, or -1
177 * 177 *
178 * Add a new chunk of special memory to the specified pool. 178 * Add a new chunk of special memory to the specified pool.
179 * 179 *
180 * Returns 0 on success or a -ve errno on failure. 180 * Returns 0 on success or a -ve errno on failure.
181 */ 181 */
182 int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys, 182 int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys,
183 size_t size, int nid) 183 size_t size, int nid)
184 { 184 {
185 struct gen_pool_chunk *chunk; 185 struct gen_pool_chunk *chunk;
186 int nbits = size >> pool->min_alloc_order; 186 int nbits = size >> pool->min_alloc_order;
187 int nbytes = sizeof(struct gen_pool_chunk) + 187 int nbytes = sizeof(struct gen_pool_chunk) +
188 BITS_TO_LONGS(nbits) * sizeof(long); 188 BITS_TO_LONGS(nbits) * sizeof(long);
189 189
190 chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); 190 chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
191 if (unlikely(chunk == NULL)) 191 if (unlikely(chunk == NULL))
192 return -ENOMEM; 192 return -ENOMEM;
193 193
194 chunk->phys_addr = phys; 194 chunk->phys_addr = phys;
195 chunk->start_addr = virt; 195 chunk->start_addr = virt;
196 chunk->end_addr = virt + size - 1; 196 chunk->end_addr = virt + size - 1;
197 atomic_set(&chunk->avail, size); 197 atomic_set(&chunk->avail, size);
198 198
199 spin_lock(&pool->lock); 199 spin_lock(&pool->lock);
200 list_add_rcu(&chunk->next_chunk, &pool->chunks); 200 list_add_rcu(&chunk->next_chunk, &pool->chunks);
201 spin_unlock(&pool->lock); 201 spin_unlock(&pool->lock);
202 202
203 return 0; 203 return 0;
204 } 204 }
205 EXPORT_SYMBOL(gen_pool_add_virt); 205 EXPORT_SYMBOL(gen_pool_add_virt);
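A hedged sketch of how a platform typically seeds a pool with a block of device memory using the two calls above; the SRAM base and size, the order of 5 (32-byte granules) and every identifier are illustrative assumptions, not taken from any real board support code.

#include <linux/genalloc.h>
#include <linux/init.h>
#include <linux/io.h>

#define EXAMPLE_SRAM_PHYS	0x40000000UL	/* illustrative physical base */
#define EXAMPLE_SRAM_SIZE	0x10000UL	/* illustrative 64 KiB chunk */

static struct gen_pool *example_sram_pool;

static int __init example_sram_pool_init(void)
{
	void __iomem *virt = ioremap(EXAMPLE_SRAM_PHYS, EXAMPLE_SRAM_SIZE);
	int ret;

	if (!virt)
		return -ENOMEM;

	/* One bitmap bit per 32-byte granule, no NUMA node preference. */
	example_sram_pool = gen_pool_create(5, -1);
	if (!example_sram_pool) {
		iounmap(virt);
		return -ENOMEM;
	}

	/* Record both the virtual and the physical base of the chunk. */
	ret = gen_pool_add_virt(example_sram_pool, (unsigned long)virt,
				EXAMPLE_SRAM_PHYS, EXAMPLE_SRAM_SIZE, -1);
	if (ret) {
		gen_pool_destroy(example_sram_pool);
		example_sram_pool = NULL;
		iounmap(virt);
	}
	return ret;
}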
206 206
207 /** 207 /**
208 * gen_pool_virt_to_phys - return the physical address of memory 208 * gen_pool_virt_to_phys - return the physical address of memory
209 * @pool: pool to allocate from 209 * @pool: pool to allocate from
210 * @addr: starting address of memory 210 * @addr: starting address of memory
211 * 211 *
212 * Returns the physical address on success, or -1 on error. 212 * Returns the physical address on success, or -1 on error.
213 */ 213 */
214 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) 214 phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
215 { 215 {
216 struct gen_pool_chunk *chunk; 216 struct gen_pool_chunk *chunk;
217 phys_addr_t paddr = -1; 217 phys_addr_t paddr = -1;
218 218
219 rcu_read_lock(); 219 rcu_read_lock();
220 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 220 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
221 if (addr >= chunk->start_addr && addr <= chunk->end_addr) { 221 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
222 paddr = chunk->phys_addr + (addr - chunk->start_addr); 222 paddr = chunk->phys_addr + (addr - chunk->start_addr);
223 break; 223 break;
224 } 224 }
225 } 225 }
226 rcu_read_unlock(); 226 rcu_read_unlock();
227 227
228 return paddr; 228 return paddr;
229 } 229 }
230 EXPORT_SYMBOL(gen_pool_virt_to_phys); 230 EXPORT_SYMBOL(gen_pool_virt_to_phys);
231 231
232 /** 232 /**
233 * gen_pool_destroy - destroy a special memory pool 233 * gen_pool_destroy - destroy a special memory pool
234 * @pool: pool to destroy 234 * @pool: pool to destroy
235 * 235 *
236 * Destroy the specified special memory pool. Verifies that there are no 236 * Destroy the specified special memory pool. Verifies that there are no
237 * outstanding allocations. 237 * outstanding allocations.
238 */ 238 */
239 void gen_pool_destroy(struct gen_pool *pool) 239 void gen_pool_destroy(struct gen_pool *pool)
240 { 240 {
241 struct list_head *_chunk, *_next_chunk; 241 struct list_head *_chunk, *_next_chunk;
242 struct gen_pool_chunk *chunk; 242 struct gen_pool_chunk *chunk;
243 int order = pool->min_alloc_order; 243 int order = pool->min_alloc_order;
244 int bit, end_bit; 244 int bit, end_bit;
245 245
246 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { 246 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
247 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 247 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
248 list_del(&chunk->next_chunk); 248 list_del(&chunk->next_chunk);
249 249
250 end_bit = chunk_size(chunk) >> order; 250 end_bit = chunk_size(chunk) >> order;
251 bit = find_next_bit(chunk->bits, end_bit, 0); 251 bit = find_next_bit(chunk->bits, end_bit, 0);
252 BUG_ON(bit < end_bit); 252 BUG_ON(bit < end_bit);
253 253
254 kfree(chunk); 254 kfree(chunk);
255 } 255 }
256 kfree(pool); 256 kfree(pool);
257 return; 257 return;
258 } 258 }
259 EXPORT_SYMBOL(gen_pool_destroy); 259 EXPORT_SYMBOL(gen_pool_destroy);
260 260
261 /** 261 /**
262 * gen_pool_alloc - allocate special memory from the pool 262 * gen_pool_alloc - allocate special memory from the pool
263 * @pool: pool to allocate from 263 * @pool: pool to allocate from
264 * @size: number of bytes to allocate from the pool 264 * @size: number of bytes to allocate from the pool
265 * 265 *
266 * Allocate the requested number of bytes from the specified pool. 266 * Allocate the requested number of bytes from the specified pool.
267 * Uses the pool allocation function (with first-fit algorithm by default). 267 * Uses the pool allocation function (with first-fit algorithm by default).
268 * Can not be used in NMI handler on architectures without 268 * Can not be used in NMI handler on architectures without
269 * NMI-safe cmpxchg implementation. 269 * NMI-safe cmpxchg implementation.
270 */ 270 */
271 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) 271 unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
272 { 272 {
273 struct gen_pool_chunk *chunk; 273 struct gen_pool_chunk *chunk;
274 unsigned long addr = 0; 274 unsigned long addr = 0;
275 int order = pool->min_alloc_order; 275 int order = pool->min_alloc_order;
276 int nbits, start_bit = 0, end_bit, remain; 276 int nbits, start_bit = 0, end_bit, remain;
277 277
278 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 278 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
279 BUG_ON(in_nmi()); 279 BUG_ON(in_nmi());
280 #endif 280 #endif
281 281
282 if (size == 0) 282 if (size == 0)
283 return 0; 283 return 0;
284 284
285 nbits = (size + (1UL << order) - 1) >> order; 285 nbits = (size + (1UL << order) - 1) >> order;
286 rcu_read_lock(); 286 rcu_read_lock();
287 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 287 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
288 if (size > atomic_read(&chunk->avail)) 288 if (size > atomic_read(&chunk->avail))
289 continue; 289 continue;
290 290
291 end_bit = chunk_size(chunk) >> order; 291 end_bit = chunk_size(chunk) >> order;
292 retry: 292 retry:
293 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, 293 start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits,
294 pool->data); 294 pool->data);
295 if (start_bit >= end_bit) 295 if (start_bit >= end_bit)
296 continue; 296 continue;
297 remain = bitmap_set_ll(chunk->bits, start_bit, nbits); 297 remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
298 if (remain) { 298 if (remain) {
299 remain = bitmap_clear_ll(chunk->bits, start_bit, 299 remain = bitmap_clear_ll(chunk->bits, start_bit,
300 nbits - remain); 300 nbits - remain);
301 BUG_ON(remain); 301 BUG_ON(remain);
302 goto retry; 302 goto retry;
303 } 303 }
304 304
305 addr = chunk->start_addr + ((unsigned long)start_bit << order); 305 addr = chunk->start_addr + ((unsigned long)start_bit << order);
306 size = nbits << order; 306 size = nbits << order;
307 atomic_sub(size, &chunk->avail); 307 atomic_sub(size, &chunk->avail);
308 break; 308 break;
309 } 309 }
310 rcu_read_unlock(); 310 rcu_read_unlock();
311 return addr; 311 return addr;
312 } 312 }
313 EXPORT_SYMBOL(gen_pool_alloc); 313 EXPORT_SYMBOL(gen_pool_alloc);
314 314
315 /** 315 /**
316 * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage 316 * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage
317 * @pool: pool to allocate from 317 * @pool: pool to allocate from
318 * @size: number of bytes to allocate from the pool 318 * @size: number of bytes to allocate from the pool
319 * @dma: dma-view physical address return value. Use NULL if unneeded. 319 * @dma: dma-view physical address return value. Use NULL if unneeded.
320 * 320 *
321 * Allocate the requested number of bytes from the specified pool. 321 * Allocate the requested number of bytes from the specified pool.
322 * Uses the pool allocation function (with first-fit algorithm by default). 322 * Uses the pool allocation function (with first-fit algorithm by default).
323 * Can not be used in NMI handler on architectures without 323 * Can not be used in NMI handler on architectures without
324 * NMI-safe cmpxchg implementation. 324 * NMI-safe cmpxchg implementation.
325 */ 325 */
326 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) 326 void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma)
327 { 327 {
328 unsigned long vaddr; 328 unsigned long vaddr;
329 329
330 if (!pool) 330 if (!pool)
331 return NULL; 331 return NULL;
332 332
333 vaddr = gen_pool_alloc(pool, size); 333 vaddr = gen_pool_alloc(pool, size);
334 if (!vaddr) 334 if (!vaddr)
335 return NULL; 335 return NULL;
336 336
337 if (dma) 337 if (dma)
338 *dma = gen_pool_virt_to_phys(pool, vaddr); 338 *dma = gen_pool_virt_to_phys(pool, vaddr);
339 339
340 return (void *)vaddr; 340 return (void *)vaddr;
341 } 341 }
342 EXPORT_SYMBOL(gen_pool_dma_alloc); 342 EXPORT_SYMBOL(gen_pool_dma_alloc);
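A small hedged sketch of the intended use of gen_pool_dma_alloc(): pulling a hardware descriptor out of an already-populated pool and handing its bus address to the device. The descriptor layout and function name are assumptions for illustration only.

struct example_desc {
	u32 ctrl;
	u32 next;
};

static struct example_desc *example_get_desc(struct gen_pool *pool,
					     dma_addr_t *bus)
{
	/* The CPU pointer comes back directly; *bus receives the same
	 * allocation translated through gen_pool_virt_to_phys(). */
	return gen_pool_dma_alloc(pool, sizeof(struct example_desc), bus);
}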
343 343
344 /** 344 /**
345 * gen_pool_free - free allocated special memory back to the pool 345 * gen_pool_free - free allocated special memory back to the pool
346 * @pool: pool to free to 346 * @pool: pool to free to
347 * @addr: starting address of memory to free back to pool 347 * @addr: starting address of memory to free back to pool
348 * @size: size in bytes of memory to free 348 * @size: size in bytes of memory to free
349 * 349 *
350 * Free previously allocated special memory back to the specified 350 * Free previously allocated special memory back to the specified
351 * pool. Can not be used in NMI handler on architectures without 351 * pool. Can not be used in NMI handler on architectures without
352 * NMI-safe cmpxchg implementation. 352 * NMI-safe cmpxchg implementation.
353 */ 353 */
354 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) 354 void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
355 { 355 {
356 struct gen_pool_chunk *chunk; 356 struct gen_pool_chunk *chunk;
357 int order = pool->min_alloc_order; 357 int order = pool->min_alloc_order;
358 int start_bit, nbits, remain; 358 int start_bit, nbits, remain;
359 359
360 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG 360 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
361 BUG_ON(in_nmi()); 361 BUG_ON(in_nmi());
362 #endif 362 #endif
363 363
364 nbits = (size + (1UL << order) - 1) >> order; 364 nbits = (size + (1UL << order) - 1) >> order;
365 rcu_read_lock(); 365 rcu_read_lock();
366 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { 366 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
367 if (addr >= chunk->start_addr && addr <= chunk->end_addr) { 367 if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
368 BUG_ON(addr + size - 1 > chunk->end_addr); 368 BUG_ON(addr + size - 1 > chunk->end_addr);
369 start_bit = (addr - chunk->start_addr) >> order; 369 start_bit = (addr - chunk->start_addr) >> order;
370 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); 370 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
371 BUG_ON(remain); 371 BUG_ON(remain);
372 size = nbits << order; 372 size = nbits << order;
373 atomic_add(size, &chunk->avail); 373 atomic_add(size, &chunk->avail);
374 rcu_read_unlock(); 374 rcu_read_unlock();
375 return; 375 return;
376 } 376 }
377 } 377 }
378 rcu_read_unlock(); 378 rcu_read_unlock();
379 BUG(); 379 BUG();
380 } 380 }
381 EXPORT_SYMBOL(gen_pool_free); 381 EXPORT_SYMBOL(gen_pool_free);
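A hedged sketch of the basic allocate/free round trip against a pool set up as in the earlier example; the 128-byte request is rounded up to the pool's min_alloc_order granule on both the allocation and the free path, so passing the same size back to gen_pool_free() is all that is required. The wrapper name is illustrative.

static int example_pool_roundtrip(struct gen_pool *pool)
{
	unsigned long region;

	region = gen_pool_alloc(pool, 128);	/* returns 0 when the pool is exhausted */
	if (!region)
		return -ENOMEM;

	/* ... use the 128 bytes at 'region' ... */

	gen_pool_free(pool, region, 128);	/* size must match the allocation */
	return 0;
}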
382 382
383 /** 383 /**
384 * gen_pool_for_each_chunk - call func for every chunk of generic memory pool 384 * gen_pool_for_each_chunk - call func for every chunk of generic memory pool
385 * @pool: the generic memory pool 385 * @pool: the generic memory pool
386 * @func: func to call 386 * @func: func to call
387 * @data: additional data used by @func 387 * @data: additional data used by @func
388 * 388 *
389 * Call @func for every chunk of generic memory pool. The @func is 389 * Call @func for every chunk of generic memory pool. The @func is
390 * called with rcu_read_lock held. 390 * called with rcu_read_lock held.
391 */ 391 */
392 void gen_pool_for_each_chunk(struct gen_pool *pool, 392 void gen_pool_for_each_chunk(struct gen_pool *pool,
393 void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), 393 void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data),
394 void *data) 394 void *data)
395 { 395 {
396 struct gen_pool_chunk *chunk; 396 struct gen_pool_chunk *chunk;
397 397
398 rcu_read_lock(); 398 rcu_read_lock();
399 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) 399 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk)
400 func(pool, chunk, data); 400 func(pool, chunk, data);
401 rcu_read_unlock(); 401 rcu_read_unlock();
402 } 402 }
403 EXPORT_SYMBOL(gen_pool_for_each_chunk); 403 EXPORT_SYMBOL(gen_pool_for_each_chunk);
404 404
405 /** 405 /**
406 * gen_pool_avail - get available free space of the pool 406 * gen_pool_avail - get available free space of the pool
407 * @pool: pool to get available free space 407 * @pool: pool to get available free space
408 * 408 *
409 * Return available free space of the specified pool. 409 * Return available free space of the specified pool.
410 */ 410 */
411 size_t gen_pool_avail(struct gen_pool *pool) 411 size_t gen_pool_avail(struct gen_pool *pool)
412 { 412 {
413 struct gen_pool_chunk *chunk; 413 struct gen_pool_chunk *chunk;
414 size_t avail = 0; 414 size_t avail = 0;
415 415
416 rcu_read_lock(); 416 rcu_read_lock();
417 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 417 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
418 avail += atomic_read(&chunk->avail); 418 avail += atomic_read(&chunk->avail);
419 rcu_read_unlock(); 419 rcu_read_unlock();
420 return avail; 420 return avail;
421 } 421 }
422 EXPORT_SYMBOL_GPL(gen_pool_avail); 422 EXPORT_SYMBOL_GPL(gen_pool_avail);
423 423
424 /** 424 /**
425 * gen_pool_size - get size in bytes of memory managed by the pool 425 * gen_pool_size - get size in bytes of memory managed by the pool
426 * @pool: pool to get size 426 * @pool: pool to get size
427 * 427 *
428 * Return size in bytes of memory managed by the pool. 428 * Return size in bytes of memory managed by the pool.
429 */ 429 */
430 size_t gen_pool_size(struct gen_pool *pool) 430 size_t gen_pool_size(struct gen_pool *pool)
431 { 431 {
432 struct gen_pool_chunk *chunk; 432 struct gen_pool_chunk *chunk;
433 size_t size = 0; 433 size_t size = 0;
434 434
435 rcu_read_lock(); 435 rcu_read_lock();
436 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) 436 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
437 size += chunk_size(chunk); 437 size += chunk_size(chunk);
438 rcu_read_unlock(); 438 rcu_read_unlock();
439 return size; 439 return size;
440 } 440 }
441 EXPORT_SYMBOL_GPL(gen_pool_size); 441 EXPORT_SYMBOL_GPL(gen_pool_size);
442 442
443 /** 443 /**
444 * gen_pool_set_algo - set the allocation algorithm 444 * gen_pool_set_algo - set the allocation algorithm
445 * @pool: pool to change allocation algorithm 445 * @pool: pool to change allocation algorithm
446 * @algo: custom algorithm function 446 * @algo: custom algorithm function
447 * @data: additional data used by @algo 447 * @data: additional data used by @algo
448 * 448 *
449 * Call @algo for each memory allocation in the pool. 449 * Call @algo for each memory allocation in the pool.
450 * If @algo is NULL use gen_pool_first_fit as default 450 * If @algo is NULL use gen_pool_first_fit as default
451 * memory allocation function. 451 * memory allocation function.
452 */ 452 */
453 void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data) 453 void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data)
454 { 454 {
455 rcu_read_lock(); 455 rcu_read_lock();
456 456
457 pool->algo = algo; 457 pool->algo = algo;
458 if (!pool->algo) 458 if (!pool->algo)
459 pool->algo = gen_pool_first_fit; 459 pool->algo = gen_pool_first_fit;
460 460
461 pool->data = data; 461 pool->data = data;
462 462
463 rcu_read_unlock(); 463 rcu_read_unlock();
464 } 464 }
465 EXPORT_SYMBOL(gen_pool_set_algo); 465 EXPORT_SYMBOL(gen_pool_set_algo);
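For completeness, a one-line hedged sketch of switching an existing pool from the default first-fit search to the best-fit search defined just below; the wrapper name is illustrative.

static void example_use_best_fit(struct gen_pool *pool)
{
	/* NULL data: gen_pool_best_fit() ignores its @data argument. */
	gen_pool_set_algo(pool, gen_pool_best_fit, NULL);
}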
466 466
467 /** 467 /**
468 * gen_pool_first_fit - find the first available region 468 * gen_pool_first_fit - find the first available region
469 * of memory matching the size requirement (no alignment constraint) 469 * of memory matching the size requirement (no alignment constraint)
470 * @map: The address to base the search on 470 * @map: The address to base the search on
471 * @size: The bitmap size in bits 471 * @size: The bitmap size in bits
472 * @start: The bitnumber to start searching at 472 * @start: The bitnumber to start searching at
473 * @nr: The number of zeroed bits we're looking for 473 * @nr: The number of zeroed bits we're looking for
474 * @data: additional data - unused 474 * @data: additional data - unused
475 */ 475 */
476 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, 476 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
477 unsigned long start, unsigned int nr, void *data) 477 unsigned long start, unsigned int nr, void *data)
478 { 478 {
479 return bitmap_find_next_zero_area(map, size, start, nr, 0); 479 return bitmap_find_next_zero_area(map, size, start, nr, 0);
480 } 480 }
481 EXPORT_SYMBOL(gen_pool_first_fit); 481 EXPORT_SYMBOL(gen_pool_first_fit);
482 482
483 /** 483 /**
484 * gen_pool_best_fit - find the best fitting region of memory 484 * gen_pool_best_fit - find the best fitting region of memory
485 * matching the size requirement (no alignment constraint) 485 * matching the size requirement (no alignment constraint)
486 * @map: The address to base the search on 486 * @map: The address to base the search on
487 * @size: The bitmap size in bits 487 * @size: The bitmap size in bits
488 * @start: The bitnumber to start searching at 488 * @start: The bitnumber to start searching at
489 * @nr: The number of zeroed bits we're looking for 489 * @nr: The number of zeroed bits we're looking for
490 * @data: additional data - unused 490 * @data: additional data - unused
491 * 491 *
492 * Iterate over the bitmap to find the smallest free region 492 * Iterate over the bitmap to find the smallest free region
493 * in which the memory can be allocated. 493 * in which the memory can be allocated.
494 */ 494 */
495 unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, 495 unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
496 unsigned long start, unsigned int nr, void *data) 496 unsigned long start, unsigned int nr, void *data)
497 { 497 {
498 unsigned long start_bit = size; 498 unsigned long start_bit = size;
499 unsigned long len = size + 1; 499 unsigned long len = size + 1;
500 unsigned long index; 500 unsigned long index;
501 501
502 index = bitmap_find_next_zero_area(map, size, start, nr, 0); 502 index = bitmap_find_next_zero_area(map, size, start, nr, 0);
503 503
504 while (index < size) { 504 while (index < size) {
505 int next_bit = find_next_bit(map, size, index + nr); 505 int next_bit = find_next_bit(map, size, index + nr);
506 if ((next_bit - index) < len) { 506 if ((next_bit - index) < len) {
507 len = next_bit - index; 507 len = next_bit - index;
508 start_bit = index; 508 start_bit = index;
509 if (len == nr) 509 if (len == nr)
510 return start_bit; 510 return start_bit;
511 } 511 }
512 index = bitmap_find_next_zero_area(map, size, 512 index = bitmap_find_next_zero_area(map, size,
513 next_bit + 1, nr, 0); 513 next_bit + 1, nr, 0);
514 } 514 }
515 515
516 return start_bit; 516 return start_bit;
517 } 517 }
518 EXPORT_SYMBOL(gen_pool_best_fit); 518 EXPORT_SYMBOL(gen_pool_best_fit);
519 519
520 static void devm_gen_pool_release(struct device *dev, void *res) 520 static void devm_gen_pool_release(struct device *dev, void *res)
521 { 521 {
522 gen_pool_destroy(*(struct gen_pool **)res); 522 gen_pool_destroy(*(struct gen_pool **)res);
523 } 523 }
524 524
525 /** 525 /**
526 * devm_gen_pool_create - managed gen_pool_create 526 * devm_gen_pool_create - managed gen_pool_create
527 * @dev: device that provides the gen_pool 527 * @dev: device that provides the gen_pool
528 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 528 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
529 * @nid: node id of the node the pool structure should be allocated on, or -1 529 * @nid: node id of the node the pool structure should be allocated on, or -1
530 * 530 *
531 * Create a new special memory pool that can be used to manage special purpose 531 * Create a new special memory pool that can be used to manage special purpose
532 * memory not managed by the regular kmalloc/kfree interface. The pool will be 532 * memory not managed by the regular kmalloc/kfree interface. The pool will be
533 * automatically destroyed by the device management code. 533 * automatically destroyed by the device management code.
534 */ 534 */
535 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, 535 struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
536 int nid) 536 int nid)
537 { 537 {
538 struct gen_pool **ptr, *pool; 538 struct gen_pool **ptr, *pool;
539 539
540 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); 540 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
541 541
542 pool = gen_pool_create(min_alloc_order, nid); 542 pool = gen_pool_create(min_alloc_order, nid);
543 if (pool) { 543 if (pool) {
544 *ptr = pool; 544 *ptr = pool;
545 devres_add(dev, ptr); 545 devres_add(dev, ptr);
546 } else { 546 } else {
547 devres_free(ptr); 547 devres_free(ptr);
548 } 548 }
549 549
550 return pool; 550 return pool;
551 } 551 }
552 552
553 /** 553 /**
554 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device 554 * dev_get_gen_pool - Obtain the gen_pool (if any) for a device
555 * @dev: device to retrieve the gen_pool from 555 * @dev: device to retrieve the gen_pool from
556 * 556 *
557 * Returns the gen_pool for the device if one is present, or NULL. 557 * Returns the gen_pool for the device if one is present, or NULL.
558 */ 558 */
559 struct gen_pool *dev_get_gen_pool(struct device *dev) 559 struct gen_pool *dev_get_gen_pool(struct device *dev)
560 { 560 {
561 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, 561 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
562 NULL); 562 NULL);
563 563
564 if (!p) 564 if (!p)
565 return NULL; 565 return NULL;
566 return *p; 566 return *p;
567 } 567 }
568 EXPORT_SYMBOL_GPL(dev_get_gen_pool); 568 EXPORT_SYMBOL_GPL(dev_get_gen_pool);
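A hedged sketch of the device-managed variant in a driver probe path: the pool is registered as a devres resource, so devm_gen_pool_release() destroys it automatically when the device is unbound, and other code bound to the same struct device can recover it with dev_get_gen_pool(). The probe function and the order value are assumptions for illustration.

#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct gen_pool *pool;

	/* 32-byte granules, no NUMA preference; freed automatically by devres. */
	pool = devm_gen_pool_create(&pdev->dev, 5, -1);
	if (!pool)
		return -ENOMEM;

	/* Anything else holding &pdev->dev can find the same pool later. */
	WARN_ON(dev_get_gen_pool(&pdev->dev) != pool);
	return 0;
}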
569 569
570 #ifdef CONFIG_OF 570 #ifdef CONFIG_OF
571 /** 571 /**
572 * of_get_named_gen_pool - find a pool by phandle property 572 * of_get_named_gen_pool - find a pool by phandle property
573 * @np: device node 573 * @np: device node
574 * @propname: property name containing phandle(s) 574 * @propname: property name containing phandle(s)
575 * @index: index into the phandle array 575 * @index: index into the phandle array
576 * 576 *
577 * Returns the pool that contains the chunk starting at the physical 577 * Returns the pool that contains the chunk starting at the physical
578 * address of the device tree node pointed at by the phandle property, 578 * address of the device tree node pointed at by the phandle property,
579 * or NULL if not found. 579 * or NULL if not found.
580 */ 580 */
581 struct gen_pool *of_get_named_gen_pool(struct device_node *np, 581 struct gen_pool *of_get_named_gen_pool(struct device_node *np,
582 const char *propname, int index) 582 const char *propname, int index)
583 { 583 {
584 struct platform_device *pdev; 584 struct platform_device *pdev;
585 struct device_node *np_pool; 585 struct device_node *np_pool;
586 586
587 np_pool = of_parse_phandle(np, propname, index); 587 np_pool = of_parse_phandle(np, propname, index);
588 if (!np_pool) 588 if (!np_pool)
589 return NULL; 589 return NULL;
590 pdev = of_find_device_by_node(np_pool); 590 pdev = of_find_device_by_node(np_pool);
591 of_node_put(np_pool);
591 if (!pdev) 592 if (!pdev)
592 return NULL; 593 return NULL;
593 return dev_get_gen_pool(&pdev->dev); 594 return dev_get_gen_pool(&pdev->dev);
594 } 595 }
595 EXPORT_SYMBOL_GPL(of_get_named_gen_pool); 596 EXPORT_SYMBOL_GPL(of_get_named_gen_pool);
596 #endif /* CONFIG_OF */ 597 #endif /* CONFIG_OF */
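A hedged sketch of the device-tree lookup from a client driver; the "sram" property name is purely illustrative and would be whatever binding the consuming driver defines.

static struct gen_pool *example_get_dt_pool(struct device *dev)
{
	/* Follows the first phandle in the (assumed) "sram" property and
	 * returns the pool registered against that node's device, if any. */
	struct gen_pool *pool = of_get_named_gen_pool(dev->of_node, "sram", 0);

	if (!pool)
		dev_err(dev, "no gen_pool behind the 'sram' phandle\n");
	return pool;
}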
597 598
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/ksm.h> 48 #include <linux/ksm.h>
49 #include <linux/rmap.h> 49 #include <linux/rmap.h>
50 #include <linux/export.h> 50 #include <linux/export.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/writeback.h> 53 #include <linux/writeback.h>
54 #include <linux/memcontrol.h> 54 #include <linux/memcontrol.h>
55 #include <linux/mmu_notifier.h> 55 #include <linux/mmu_notifier.h>
56 #include <linux/kallsyms.h> 56 #include <linux/kallsyms.h>
57 #include <linux/swapops.h> 57 #include <linux/swapops.h>
58 #include <linux/elf.h> 58 #include <linux/elf.h>
59 #include <linux/gfp.h> 59 #include <linux/gfp.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 #include <linux/dma-debug.h> 62 #include <linux/dma-debug.h>
63 #include <linux/debugfs.h> 63 #include <linux/debugfs.h>
64 64
65 #include <asm/io.h> 65 #include <asm/io.h>
66 #include <asm/pgalloc.h> 66 #include <asm/pgalloc.h>
67 #include <asm/uaccess.h> 67 #include <asm/uaccess.h>
68 #include <asm/tlb.h> 68 #include <asm/tlb.h>
69 #include <asm/tlbflush.h> 69 #include <asm/tlbflush.h>
70 #include <asm/pgtable.h> 70 #include <asm/pgtable.h>
71 71
72 #include "internal.h" 72 #include "internal.h"
73 73
74 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 74 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
75 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 75 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
76 #endif 76 #endif
77 77
78 #ifndef CONFIG_NEED_MULTIPLE_NODES 78 #ifndef CONFIG_NEED_MULTIPLE_NODES
79 /* use the per-pgdat data instead for discontigmem - mbligh */ 79 /* use the per-pgdat data instead for discontigmem - mbligh */
80 unsigned long max_mapnr; 80 unsigned long max_mapnr;
81 struct page *mem_map; 81 struct page *mem_map;
82 82
83 EXPORT_SYMBOL(max_mapnr); 83 EXPORT_SYMBOL(max_mapnr);
84 EXPORT_SYMBOL(mem_map); 84 EXPORT_SYMBOL(mem_map);
85 #endif 85 #endif
86 86
87 /* 87 /*
88 * A number of key systems in x86 including ioremap() rely on the assumption 88 * A number of key systems in x86 including ioremap() rely on the assumption
89 * that high_memory defines the upper bound on direct map memory, the end 89 * that high_memory defines the upper bound on direct map memory, the end
90 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 90 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
91 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 91 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
92 * and ZONE_HIGHMEM. 92 * and ZONE_HIGHMEM.
93 */ 93 */
94 void * high_memory; 94 void * high_memory;
95 95
96 EXPORT_SYMBOL(high_memory); 96 EXPORT_SYMBOL(high_memory);
97 97
98 /* 98 /*
99 * Randomize the address space (stacks, mmaps, brk, etc.). 99 * Randomize the address space (stacks, mmaps, brk, etc.).
100 * 100 *
101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, 101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
102 * as ancient (libc5 based) binaries can segfault. ) 102 * as ancient (libc5 based) binaries can segfault. )
103 */ 103 */
104 int randomize_va_space __read_mostly = 104 int randomize_va_space __read_mostly =
105 #ifdef CONFIG_COMPAT_BRK 105 #ifdef CONFIG_COMPAT_BRK
106 1; 106 1;
107 #else 107 #else
108 2; 108 2;
109 #endif 109 #endif
110 110
111 static int __init disable_randmaps(char *s) 111 static int __init disable_randmaps(char *s)
112 { 112 {
113 randomize_va_space = 0; 113 randomize_va_space = 0;
114 return 1; 114 return 1;
115 } 115 }
116 __setup("norandmaps", disable_randmaps); 116 __setup("norandmaps", disable_randmaps);
117 117
118 unsigned long zero_pfn __read_mostly; 118 unsigned long zero_pfn __read_mostly;
119 unsigned long highest_memmap_pfn __read_mostly; 119 unsigned long highest_memmap_pfn __read_mostly;
120 120
121 EXPORT_SYMBOL(zero_pfn); 121 EXPORT_SYMBOL(zero_pfn);
122 122
123 /* 123 /*
124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
125 */ 125 */
126 static int __init init_zero_pfn(void) 126 static int __init init_zero_pfn(void)
127 { 127 {
128 zero_pfn = page_to_pfn(ZERO_PAGE(0)); 128 zero_pfn = page_to_pfn(ZERO_PAGE(0));
129 return 0; 129 return 0;
130 } 130 }
131 core_initcall(init_zero_pfn); 131 core_initcall(init_zero_pfn);
132 132
133 133
134 #if defined(SPLIT_RSS_COUNTING) 134 #if defined(SPLIT_RSS_COUNTING)
135 135
136 void sync_mm_rss(struct mm_struct *mm) 136 void sync_mm_rss(struct mm_struct *mm)
137 { 137 {
138 int i; 138 int i;
139 139
140 for (i = 0; i < NR_MM_COUNTERS; i++) { 140 for (i = 0; i < NR_MM_COUNTERS; i++) {
141 if (current->rss_stat.count[i]) { 141 if (current->rss_stat.count[i]) {
142 add_mm_counter(mm, i, current->rss_stat.count[i]); 142 add_mm_counter(mm, i, current->rss_stat.count[i]);
143 current->rss_stat.count[i] = 0; 143 current->rss_stat.count[i] = 0;
144 } 144 }
145 } 145 }
146 current->rss_stat.events = 0; 146 current->rss_stat.events = 0;
147 } 147 }
148 148
149 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 149 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
150 { 150 {
151 struct task_struct *task = current; 151 struct task_struct *task = current;
152 152
153 if (likely(task->mm == mm)) 153 if (likely(task->mm == mm))
154 task->rss_stat.count[member] += val; 154 task->rss_stat.count[member] += val;
155 else 155 else
156 add_mm_counter(mm, member, val); 156 add_mm_counter(mm, member, val);
157 } 157 }
158 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) 158 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
159 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) 159 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
160 160
161 /* sync counter once per 64 page faults */ 161 /* sync counter once per 64 page faults */
162 #define TASK_RSS_EVENTS_THRESH (64) 162 #define TASK_RSS_EVENTS_THRESH (64)
163 static void check_sync_rss_stat(struct task_struct *task) 163 static void check_sync_rss_stat(struct task_struct *task)
164 { 164 {
165 if (unlikely(task != current)) 165 if (unlikely(task != current))
166 return; 166 return;
167 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 167 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
168 sync_mm_rss(task->mm); 168 sync_mm_rss(task->mm);
169 } 169 }
170 #else /* SPLIT_RSS_COUNTING */ 170 #else /* SPLIT_RSS_COUNTING */
171 171
172 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 172 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
173 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 173 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
174 174
175 static void check_sync_rss_stat(struct task_struct *task) 175 static void check_sync_rss_stat(struct task_struct *task)
176 { 176 {
177 } 177 }
178 178
179 #endif /* SPLIT_RSS_COUNTING */ 179 #endif /* SPLIT_RSS_COUNTING */
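A hedged sketch of how the fault paths later in this file use the helpers above: the fast variant only touches the cheap per-task cache when the task is operating on its own mm, and check_sync_rss_stat() folds the cache back into the mm roughly every TASK_RSS_EVENTS_THRESH events. The wrapper name is illustrative.

static void example_account_anon_page(struct mm_struct *mm)
{
	/* Cheap: no atomic op on the mm counters when mm == current->mm. */
	inc_mm_counter_fast(mm, MM_ANONPAGES);

	/* Occasionally flush the per-task deltas back into the mm. */
	check_sync_rss_stat(current);
}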
180 180
181 #ifdef HAVE_GENERIC_MMU_GATHER 181 #ifdef HAVE_GENERIC_MMU_GATHER
182 182
183 static int tlb_next_batch(struct mmu_gather *tlb) 183 static int tlb_next_batch(struct mmu_gather *tlb)
184 { 184 {
185 struct mmu_gather_batch *batch; 185 struct mmu_gather_batch *batch;
186 186
187 batch = tlb->active; 187 batch = tlb->active;
188 if (batch->next) { 188 if (batch->next) {
189 tlb->active = batch->next; 189 tlb->active = batch->next;
190 return 1; 190 return 1;
191 } 191 }
192 192
193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
194 return 0; 194 return 0;
195 195
196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
197 if (!batch) 197 if (!batch)
198 return 0; 198 return 0;
199 199
200 tlb->batch_count++; 200 tlb->batch_count++;
201 batch->next = NULL; 201 batch->next = NULL;
202 batch->nr = 0; 202 batch->nr = 0;
203 batch->max = MAX_GATHER_BATCH; 203 batch->max = MAX_GATHER_BATCH;
204 204
205 tlb->active->next = batch; 205 tlb->active->next = batch;
206 tlb->active = batch; 206 tlb->active = batch;
207 207
208 return 1; 208 return 1;
209 } 209 }
210 210
211 /* tlb_gather_mmu 211 /* tlb_gather_mmu
212 * Called to initialize an (on-stack) mmu_gather structure for page-table 212 * Called to initialize an (on-stack) mmu_gather structure for page-table
213 * tear-down from @mm. The @fullmm argument is used when @mm is without 213 * tear-down from @mm. The @fullmm argument is used when @mm is without
214 * users and we're going to destroy the full address space (exit/execve). 214 * users and we're going to destroy the full address space (exit/execve).
215 */ 215 */
216 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 216 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
217 { 217 {
218 tlb->mm = mm; 218 tlb->mm = mm;
219 219
220 /* Is it from 0 to ~0? */ 220 /* Is it from 0 to ~0? */
221 tlb->fullmm = !(start | (end+1)); 221 tlb->fullmm = !(start | (end+1));
222 tlb->need_flush_all = 0; 222 tlb->need_flush_all = 0;
223 tlb->start = start; 223 tlb->start = start;
224 tlb->end = end; 224 tlb->end = end;
225 tlb->need_flush = 0; 225 tlb->need_flush = 0;
226 tlb->local.next = NULL; 226 tlb->local.next = NULL;
227 tlb->local.nr = 0; 227 tlb->local.nr = 0;
228 tlb->local.max = ARRAY_SIZE(tlb->__pages); 228 tlb->local.max = ARRAY_SIZE(tlb->__pages);
229 tlb->active = &tlb->local; 229 tlb->active = &tlb->local;
230 tlb->batch_count = 0; 230 tlb->batch_count = 0;
231 231
232 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 232 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
233 tlb->batch = NULL; 233 tlb->batch = NULL;
234 #endif 234 #endif
235 } 235 }
236 236
237 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) 237 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
238 { 238 {
239 tlb->need_flush = 0; 239 tlb->need_flush = 0;
240 tlb_flush(tlb); 240 tlb_flush(tlb);
241 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 241 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
242 tlb_table_flush(tlb); 242 tlb_table_flush(tlb);
243 #endif 243 #endif
244 } 244 }
245 245
246 static void tlb_flush_mmu_free(struct mmu_gather *tlb) 246 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
247 { 247 {
248 struct mmu_gather_batch *batch; 248 struct mmu_gather_batch *batch;
249 249
250 for (batch = &tlb->local; batch; batch = batch->next) { 250 for (batch = &tlb->local; batch; batch = batch->next) {
251 free_pages_and_swap_cache(batch->pages, batch->nr); 251 free_pages_and_swap_cache(batch->pages, batch->nr);
252 batch->nr = 0; 252 batch->nr = 0;
253 } 253 }
254 tlb->active = &tlb->local; 254 tlb->active = &tlb->local;
255 } 255 }
256 256
257 void tlb_flush_mmu(struct mmu_gather *tlb) 257 void tlb_flush_mmu(struct mmu_gather *tlb)
258 { 258 {
259 if (!tlb->need_flush) 259 if (!tlb->need_flush)
260 return; 260 return;
261 tlb_flush_mmu_tlbonly(tlb); 261 tlb_flush_mmu_tlbonly(tlb);
262 tlb_flush_mmu_free(tlb); 262 tlb_flush_mmu_free(tlb);
263 } 263 }
264 264
265 /* tlb_finish_mmu 265 /* tlb_finish_mmu
266 * Called at the end of the shootdown operation to free up any resources 266 * Called at the end of the shootdown operation to free up any resources
267 * that were required. 267 * that were required.
268 */ 268 */
269 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 269 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
270 { 270 {
271 struct mmu_gather_batch *batch, *next; 271 struct mmu_gather_batch *batch, *next;
272 272
273 tlb_flush_mmu(tlb); 273 tlb_flush_mmu(tlb);
274 274
275 /* keep the page table cache within bounds */ 275 /* keep the page table cache within bounds */
276 check_pgt_cache(); 276 check_pgt_cache();
277 277
278 for (batch = tlb->local.next; batch; batch = next) { 278 for (batch = tlb->local.next; batch; batch = next) {
279 next = batch->next; 279 next = batch->next;
280 free_pages((unsigned long)batch, 0); 280 free_pages((unsigned long)batch, 0);
281 } 281 }
282 tlb->local.next = NULL; 282 tlb->local.next = NULL;
283 } 283 }
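A hedged sketch of the overall mmu_gather lifecycle that the unmap paths in this file follow (zap_page_range() further down has the same shape); the page-table walk itself is elided because it depends on the caller, and the function name is illustrative.

static void example_teardown_range(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, start, end);	/* set up batching state */

	/*
	 * ... walk the page tables for [start, end), handing each freed
	 * page to __tlb_remove_page() and calling tlb_flush_mmu() when
	 * it reports that no batch slots are left ...
	 */

	tlb_finish_mmu(&tlb, start, end);	/* final TLB flush + free batches */
}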
284 284
285 /* __tlb_remove_page 285 /* __tlb_remove_page
286 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while 286 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
287 * handling the additional races in SMP caused by other CPUs caching valid 287 * handling the additional races in SMP caused by other CPUs caching valid
288 * mappings in their TLBs. Returns the number of free page slots left. 288 * mappings in their TLBs. Returns the number of free page slots left.
289 * When out of page slots we must call tlb_flush_mmu(). 289 * When out of page slots we must call tlb_flush_mmu().
290 */ 290 */
291 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 291 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
292 { 292 {
293 struct mmu_gather_batch *batch; 293 struct mmu_gather_batch *batch;
294 294
295 VM_BUG_ON(!tlb->need_flush); 295 VM_BUG_ON(!tlb->need_flush);
296 296
297 batch = tlb->active; 297 batch = tlb->active;
298 batch->pages[batch->nr++] = page; 298 batch->pages[batch->nr++] = page;
299 if (batch->nr == batch->max) { 299 if (batch->nr == batch->max) {
300 if (!tlb_next_batch(tlb)) 300 if (!tlb_next_batch(tlb))
301 return 0; 301 return 0;
302 batch = tlb->active; 302 batch = tlb->active;
303 } 303 }
304 VM_BUG_ON_PAGE(batch->nr > batch->max, page); 304 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
305 305
306 return batch->max - batch->nr; 306 return batch->max - batch->nr;
307 } 307 }
308 308
309 #endif /* HAVE_GENERIC_MMU_GATHER */ 309 #endif /* HAVE_GENERIC_MMU_GATHER */
310 310
311 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 311 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
312 312
313 /* 313 /*
314 * See the comment near struct mmu_table_batch. 314 * See the comment near struct mmu_table_batch.
315 */ 315 */
316 316
317 static void tlb_remove_table_smp_sync(void *arg) 317 static void tlb_remove_table_smp_sync(void *arg)
318 { 318 {
319 /* Simply deliver the interrupt */ 319 /* Simply deliver the interrupt */
320 } 320 }
321 321
322 static void tlb_remove_table_one(void *table) 322 static void tlb_remove_table_one(void *table)
323 { 323 {
324 /* 324 /*
325 * This isn't an RCU grace period and hence the page-tables cannot be 325 * This isn't an RCU grace period and hence the page-tables cannot be
326 * assumed to be actually RCU-freed. 326 * assumed to be actually RCU-freed.
327 * 327 *
328 * It is however sufficient for software page-table walkers that rely on 328 * It is however sufficient for software page-table walkers that rely on
329 * IRQ disabling. See the comment near struct mmu_table_batch. 329 * IRQ disabling. See the comment near struct mmu_table_batch.
330 */ 330 */
331 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 331 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
332 __tlb_remove_table(table); 332 __tlb_remove_table(table);
333 } 333 }
334 334
335 static void tlb_remove_table_rcu(struct rcu_head *head) 335 static void tlb_remove_table_rcu(struct rcu_head *head)
336 { 336 {
337 struct mmu_table_batch *batch; 337 struct mmu_table_batch *batch;
338 int i; 338 int i;
339 339
340 batch = container_of(head, struct mmu_table_batch, rcu); 340 batch = container_of(head, struct mmu_table_batch, rcu);
341 341
342 for (i = 0; i < batch->nr; i++) 342 for (i = 0; i < batch->nr; i++)
343 __tlb_remove_table(batch->tables[i]); 343 __tlb_remove_table(batch->tables[i]);
344 344
345 free_page((unsigned long)batch); 345 free_page((unsigned long)batch);
346 } 346 }
347 347
348 void tlb_table_flush(struct mmu_gather *tlb) 348 void tlb_table_flush(struct mmu_gather *tlb)
349 { 349 {
350 struct mmu_table_batch **batch = &tlb->batch; 350 struct mmu_table_batch **batch = &tlb->batch;
351 351
352 if (*batch) { 352 if (*batch) {
353 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); 353 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
354 *batch = NULL; 354 *batch = NULL;
355 } 355 }
356 } 356 }
357 357
358 void tlb_remove_table(struct mmu_gather *tlb, void *table) 358 void tlb_remove_table(struct mmu_gather *tlb, void *table)
359 { 359 {
360 struct mmu_table_batch **batch = &tlb->batch; 360 struct mmu_table_batch **batch = &tlb->batch;
361 361
362 tlb->need_flush = 1; 362 tlb->need_flush = 1;
363 363
364 /* 364 /*
365 * When there are fewer than two users of this mm there cannot be a 365 * When there are fewer than two users of this mm there cannot be a
366 * concurrent page-table walk. 366 * concurrent page-table walk.
367 */ 367 */
368 if (atomic_read(&tlb->mm->mm_users) < 2) { 368 if (atomic_read(&tlb->mm->mm_users) < 2) {
369 __tlb_remove_table(table); 369 __tlb_remove_table(table);
370 return; 370 return;
371 } 371 }
372 372
373 if (*batch == NULL) { 373 if (*batch == NULL) {
374 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); 374 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
375 if (*batch == NULL) { 375 if (*batch == NULL) {
376 tlb_remove_table_one(table); 376 tlb_remove_table_one(table);
377 return; 377 return;
378 } 378 }
379 (*batch)->nr = 0; 379 (*batch)->nr = 0;
380 } 380 }
381 (*batch)->tables[(*batch)->nr++] = table; 381 (*batch)->tables[(*batch)->nr++] = table;
382 if ((*batch)->nr == MAX_TABLE_BATCH) 382 if ((*batch)->nr == MAX_TABLE_BATCH)
383 tlb_table_flush(tlb); 383 tlb_table_flush(tlb);
384 } 384 }
385 385
386 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 386 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
387 387
388 /* 388 /*
389 * Note: this doesn't free the actual pages themselves. That 389 * Note: this doesn't free the actual pages themselves. That
390 * has been handled earlier when unmapping all the memory regions. 390 * has been handled earlier when unmapping all the memory regions.
391 */ 391 */
392 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 392 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
393 unsigned long addr) 393 unsigned long addr)
394 { 394 {
395 pgtable_t token = pmd_pgtable(*pmd); 395 pgtable_t token = pmd_pgtable(*pmd);
396 pmd_clear(pmd); 396 pmd_clear(pmd);
397 pte_free_tlb(tlb, token, addr); 397 pte_free_tlb(tlb, token, addr);
398 atomic_long_dec(&tlb->mm->nr_ptes); 398 atomic_long_dec(&tlb->mm->nr_ptes);
399 } 399 }
400 400
401 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 401 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
402 unsigned long addr, unsigned long end, 402 unsigned long addr, unsigned long end,
403 unsigned long floor, unsigned long ceiling) 403 unsigned long floor, unsigned long ceiling)
404 { 404 {
405 pmd_t *pmd; 405 pmd_t *pmd;
406 unsigned long next; 406 unsigned long next;
407 unsigned long start; 407 unsigned long start;
408 408
409 start = addr; 409 start = addr;
410 pmd = pmd_offset(pud, addr); 410 pmd = pmd_offset(pud, addr);
411 do { 411 do {
412 next = pmd_addr_end(addr, end); 412 next = pmd_addr_end(addr, end);
413 if (pmd_none_or_clear_bad(pmd)) 413 if (pmd_none_or_clear_bad(pmd))
414 continue; 414 continue;
415 free_pte_range(tlb, pmd, addr); 415 free_pte_range(tlb, pmd, addr);
416 } while (pmd++, addr = next, addr != end); 416 } while (pmd++, addr = next, addr != end);
417 417
418 start &= PUD_MASK; 418 start &= PUD_MASK;
419 if (start < floor) 419 if (start < floor)
420 return; 420 return;
421 if (ceiling) { 421 if (ceiling) {
422 ceiling &= PUD_MASK; 422 ceiling &= PUD_MASK;
423 if (!ceiling) 423 if (!ceiling)
424 return; 424 return;
425 } 425 }
426 if (end - 1 > ceiling - 1) 426 if (end - 1 > ceiling - 1)
427 return; 427 return;
428 428
429 pmd = pmd_offset(pud, start); 429 pmd = pmd_offset(pud, start);
430 pud_clear(pud); 430 pud_clear(pud);
431 pmd_free_tlb(tlb, pmd, start); 431 pmd_free_tlb(tlb, pmd, start);
432 } 432 }
433 433
434 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 434 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
435 unsigned long addr, unsigned long end, 435 unsigned long addr, unsigned long end,
436 unsigned long floor, unsigned long ceiling) 436 unsigned long floor, unsigned long ceiling)
437 { 437 {
438 pud_t *pud; 438 pud_t *pud;
439 unsigned long next; 439 unsigned long next;
440 unsigned long start; 440 unsigned long start;
441 441
442 start = addr; 442 start = addr;
443 pud = pud_offset(pgd, addr); 443 pud = pud_offset(pgd, addr);
444 do { 444 do {
445 next = pud_addr_end(addr, end); 445 next = pud_addr_end(addr, end);
446 if (pud_none_or_clear_bad(pud)) 446 if (pud_none_or_clear_bad(pud))
447 continue; 447 continue;
448 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 448 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
449 } while (pud++, addr = next, addr != end); 449 } while (pud++, addr = next, addr != end);
450 450
451 start &= PGDIR_MASK; 451 start &= PGDIR_MASK;
452 if (start < floor) 452 if (start < floor)
453 return; 453 return;
454 if (ceiling) { 454 if (ceiling) {
455 ceiling &= PGDIR_MASK; 455 ceiling &= PGDIR_MASK;
456 if (!ceiling) 456 if (!ceiling)
457 return; 457 return;
458 } 458 }
459 if (end - 1 > ceiling - 1) 459 if (end - 1 > ceiling - 1)
460 return; 460 return;
461 461
462 pud = pud_offset(pgd, start); 462 pud = pud_offset(pgd, start);
463 pgd_clear(pgd); 463 pgd_clear(pgd);
464 pud_free_tlb(tlb, pud, start); 464 pud_free_tlb(tlb, pud, start);
465 } 465 }
466 466
467 /* 467 /*
468 * This function frees user-level page tables of a process. 468 * This function frees user-level page tables of a process.
469 */ 469 */
470 void free_pgd_range(struct mmu_gather *tlb, 470 void free_pgd_range(struct mmu_gather *tlb,
471 unsigned long addr, unsigned long end, 471 unsigned long addr, unsigned long end,
472 unsigned long floor, unsigned long ceiling) 472 unsigned long floor, unsigned long ceiling)
473 { 473 {
474 pgd_t *pgd; 474 pgd_t *pgd;
475 unsigned long next; 475 unsigned long next;
476 476
477 /* 477 /*
478 * The next few lines have given us lots of grief... 478 * The next few lines have given us lots of grief...
479 * 479 *
480 * Why are we testing PMD* at this top level? Because often 480 * Why are we testing PMD* at this top level? Because often
481 * there will be no work to do at all, and we'd prefer not to 481 * there will be no work to do at all, and we'd prefer not to
482 * go all the way down to the bottom just to discover that. 482 * go all the way down to the bottom just to discover that.
483 * 483 *
484 * Why all these "- 1"s? Because 0 represents both the bottom 484 * Why all these "- 1"s? Because 0 represents both the bottom
485 * of the address space and the top of it (using -1 for the 485 * of the address space and the top of it (using -1 for the
486 * top wouldn't help much: the masks would do the wrong thing). 486 * top wouldn't help much: the masks would do the wrong thing).
487 * The rule is that addr 0 and floor 0 refer to the bottom of 487 * The rule is that addr 0 and floor 0 refer to the bottom of
488 * the address space, but end 0 and ceiling 0 refer to the top. 488 * the address space, but end 0 and ceiling 0 refer to the top.
489 * Comparisons need to use "end - 1" and "ceiling - 1" (though 489 * Comparisons need to use "end - 1" and "ceiling - 1" (though
490 * that end 0 case should be mythical). 490 * that end 0 case should be mythical).
491 * 491 *
492 * Wherever addr is brought up or ceiling brought down, we must 492 * Wherever addr is brought up or ceiling brought down, we must
493 * be careful to reject "the opposite 0" before it confuses the 493 * be careful to reject "the opposite 0" before it confuses the
494 * subsequent tests. But what about where end is brought down 494 * subsequent tests. But what about where end is brought down
495 * by PMD_SIZE below? no, end can't go down to 0 there. 495 * by PMD_SIZE below? no, end can't go down to 0 there.
496 * 496 *
497 * Whereas we round start (addr) and ceiling down, by different 497 * Whereas we round start (addr) and ceiling down, by different
498 * masks at different levels, in order to test whether a table 498 * masks at different levels, in order to test whether a table
499 * now has no other vmas using it, so can be freed, we don't 499 * now has no other vmas using it, so can be freed, we don't
500 * bother to round floor or end up - the tests don't need that. 500 * bother to round floor or end up - the tests don't need that.
501 */ 501 */
502 502
503 addr &= PMD_MASK; 503 addr &= PMD_MASK;
504 if (addr < floor) { 504 if (addr < floor) {
505 addr += PMD_SIZE; 505 addr += PMD_SIZE;
506 if (!addr) 506 if (!addr)
507 return; 507 return;
508 } 508 }
509 if (ceiling) { 509 if (ceiling) {
510 ceiling &= PMD_MASK; 510 ceiling &= PMD_MASK;
511 if (!ceiling) 511 if (!ceiling)
512 return; 512 return;
513 } 513 }
514 if (end - 1 > ceiling - 1) 514 if (end - 1 > ceiling - 1)
515 end -= PMD_SIZE; 515 end -= PMD_SIZE;
516 if (addr > end - 1) 516 if (addr > end - 1)
517 return; 517 return;
518 518
519 pgd = pgd_offset(tlb->mm, addr); 519 pgd = pgd_offset(tlb->mm, addr);
520 do { 520 do {
521 next = pgd_addr_end(addr, end); 521 next = pgd_addr_end(addr, end);
522 if (pgd_none_or_clear_bad(pgd)) 522 if (pgd_none_or_clear_bad(pgd))
523 continue; 523 continue;
524 free_pud_range(tlb, pgd, addr, next, floor, ceiling); 524 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
525 } while (pgd++, addr = next, addr != end); 525 } while (pgd++, addr = next, addr != end);
526 } 526 }
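The "- 1" comparisons in the comment rely on unsigned wraparound: a ceiling of 0 stands for the very top of the address space, and subtracting 1 turns it into ULONG_MAX, which no real end can exceed; when the comparison does succeed the code trims end or returns early, depending on the level. A standalone worked example of just that arithmetic (userspace C, made-up addresses, 64-bit unsigned long assumed):

#include <stdio.h>

int main(void)
{
	unsigned long end = 0x7fff00000000UL;	/* arbitrary example end      */
	unsigned long ceiling = 0;		/* 0 means "top of the space" */

	/* ceiling - 1 wraps to ULONG_MAX, so no finite end exceeds it */
	printf("%d\n", (end - 1) > (ceiling - 1));	/* prints 0 */

	ceiling = 0x7ffe00000000UL;		/* a real ceiling below end   */
	printf("%d\n", (end - 1) > (ceiling - 1));	/* prints 1 */
	return 0;
}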
527 527
528 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, 528 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
529 unsigned long floor, unsigned long ceiling) 529 unsigned long floor, unsigned long ceiling)
530 { 530 {
531 while (vma) { 531 while (vma) {
532 struct vm_area_struct *next = vma->vm_next; 532 struct vm_area_struct *next = vma->vm_next;
533 unsigned long addr = vma->vm_start; 533 unsigned long addr = vma->vm_start;
534 534
535 /* 535 /*
536 * Hide vma from rmap and truncate_pagecache before freeing 536 * Hide vma from rmap and truncate_pagecache before freeing
537 * pgtables 537 * pgtables
538 */ 538 */
539 unlink_anon_vmas(vma); 539 unlink_anon_vmas(vma);
540 unlink_file_vma(vma); 540 unlink_file_vma(vma);
541 541
542 if (is_vm_hugetlb_page(vma)) { 542 if (is_vm_hugetlb_page(vma)) {
543 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 543 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
544 floor, next? next->vm_start: ceiling); 544 floor, next? next->vm_start: ceiling);
545 } else { 545 } else {
546 /* 546 /*
547 * Optimization: gather nearby vmas into one call down 547 * Optimization: gather nearby vmas into one call down
548 */ 548 */
549 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 549 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
550 && !is_vm_hugetlb_page(next)) { 550 && !is_vm_hugetlb_page(next)) {
551 vma = next; 551 vma = next;
552 next = vma->vm_next; 552 next = vma->vm_next;
553 unlink_anon_vmas(vma); 553 unlink_anon_vmas(vma);
554 unlink_file_vma(vma); 554 unlink_file_vma(vma);
555 } 555 }
556 free_pgd_range(tlb, addr, vma->vm_end, 556 free_pgd_range(tlb, addr, vma->vm_end,
557 floor, next? next->vm_start: ceiling); 557 floor, next? next->vm_start: ceiling);
558 } 558 }
559 vma = next; 559 vma = next;
560 } 560 }
561 } 561 }
562 562
563 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 563 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
564 pmd_t *pmd, unsigned long address) 564 pmd_t *pmd, unsigned long address)
565 { 565 {
566 spinlock_t *ptl; 566 spinlock_t *ptl;
567 pgtable_t new = pte_alloc_one(mm, address); 567 pgtable_t new = pte_alloc_one(mm, address);
568 int wait_split_huge_page; 568 int wait_split_huge_page;
569 if (!new) 569 if (!new)
570 return -ENOMEM; 570 return -ENOMEM;
571 571
572 /* 572 /*
573 * Ensure all pte setup (e.g. pte page lock and page clearing) is 573 * Ensure all pte setup (e.g. pte page lock and page clearing) is
574 * visible before the pte is made visible to other CPUs by being 574 * visible before the pte is made visible to other CPUs by being
575 * put into page tables. 575 * put into page tables.
576 * 576 *
577 * The other side of the story is the pointer chasing in the page 577 * The other side of the story is the pointer chasing in the page
578 * table walking code (when walking the page table without locking; 578 * table walking code (when walking the page table without locking;
579 * ie. most of the time). Fortunately, these data accesses consist 579 * ie. most of the time). Fortunately, these data accesses consist
580 * of a chain of data-dependent loads, meaning most CPUs (alpha 580 * of a chain of data-dependent loads, meaning most CPUs (alpha
581 * being the notable exception) will already guarantee loads are 581 * being the notable exception) will already guarantee loads are
582 * seen in-order. See the alpha page table accessors for the 582 * seen in-order. See the alpha page table accessors for the
583 * smp_read_barrier_depends() barriers in page table walking code. 583 * smp_read_barrier_depends() barriers in page table walking code.
584 */ 584 */
585 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 585 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
586 586
587 ptl = pmd_lock(mm, pmd); 587 ptl = pmd_lock(mm, pmd);
588 wait_split_huge_page = 0; 588 wait_split_huge_page = 0;
589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
590 atomic_long_inc(&mm->nr_ptes); 590 atomic_long_inc(&mm->nr_ptes);
591 pmd_populate(mm, pmd, new); 591 pmd_populate(mm, pmd, new);
592 new = NULL; 592 new = NULL;
593 } else if (unlikely(pmd_trans_splitting(*pmd))) 593 } else if (unlikely(pmd_trans_splitting(*pmd)))
594 wait_split_huge_page = 1; 594 wait_split_huge_page = 1;
595 spin_unlock(ptl); 595 spin_unlock(ptl);
596 if (new) 596 if (new)
597 pte_free(mm, new); 597 pte_free(mm, new);
598 if (wait_split_huge_page) 598 if (wait_split_huge_page)
599 wait_split_huge_page(vma->anon_vma, pmd); 599 wait_split_huge_page(vma->anon_vma, pmd);
600 return 0; 600 return 0;
601 } 601 }
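The barrier pairs with the data-dependent loads on the lockless reader side: fully initialise the new pte page, smp_wmb(), then publish it through pmd_populate(); a walker that loads the pmd entry and chases it is then guaranteed to see the initialised page. A generic, hedged sketch of that publish/consume pattern (the struct and variable names are invented; only the barrier calls are real kernel primitives):

struct example_table { int ready; };		/* stand-in for a pte page */
static struct example_table *example_published;

static void example_publish(struct example_table *t)
{
	t->ready = 1;				/* all setup of the table ... */
	smp_wmb();				/* ... ordered before ...     */
	example_published = t;			/* ... the pointer is visible */
}

static int example_consume(void)
{
	struct example_table *t = ACCESS_ONCE(example_published);

	smp_read_barrier_depends();		/* no-op everywhere but Alpha */
	return t ? t->ready : 0;		/* sees 1 whenever t != NULL  */
}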
602 602
603 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 603 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
604 { 604 {
605 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 605 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
606 if (!new) 606 if (!new)
607 return -ENOMEM; 607 return -ENOMEM;
608 608
609 smp_wmb(); /* See comment in __pte_alloc */ 609 smp_wmb(); /* See comment in __pte_alloc */
610 610
611 spin_lock(&init_mm.page_table_lock); 611 spin_lock(&init_mm.page_table_lock);
612 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 612 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
613 pmd_populate_kernel(&init_mm, pmd, new); 613 pmd_populate_kernel(&init_mm, pmd, new);
614 new = NULL; 614 new = NULL;
615 } else 615 } else
616 VM_BUG_ON(pmd_trans_splitting(*pmd)); 616 VM_BUG_ON(pmd_trans_splitting(*pmd));
617 spin_unlock(&init_mm.page_table_lock); 617 spin_unlock(&init_mm.page_table_lock);
618 if (new) 618 if (new)
619 pte_free_kernel(&init_mm, new); 619 pte_free_kernel(&init_mm, new);
620 return 0; 620 return 0;
621 } 621 }
622 622
623 static inline void init_rss_vec(int *rss) 623 static inline void init_rss_vec(int *rss)
624 { 624 {
625 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); 625 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
626 } 626 }
627 627
628 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) 628 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
629 { 629 {
630 int i; 630 int i;
631 631
632 if (current->mm == mm) 632 if (current->mm == mm)
633 sync_mm_rss(mm); 633 sync_mm_rss(mm);
634 for (i = 0; i < NR_MM_COUNTERS; i++) 634 for (i = 0; i < NR_MM_COUNTERS; i++)
635 if (rss[i]) 635 if (rss[i])
636 add_mm_counter(mm, i, rss[i]); 636 add_mm_counter(mm, i, rss[i]);
637 } 637 }
638 638
639 /* 639 /*
640 * This function is called to print an error when a bad pte 640 * This function is called to print an error when a bad pte
641 * is found. For example, we might have a PFN-mapped pte in 641 * is found. For example, we might have a PFN-mapped pte in
642 * a region that doesn't allow it. 642 * a region that doesn't allow it.
643 * 643 *
644 * The calling function must still handle the error. 644 * The calling function must still handle the error.
645 */ 645 */
646 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 646 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
647 pte_t pte, struct page *page) 647 pte_t pte, struct page *page)
648 { 648 {
649 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 649 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
650 pud_t *pud = pud_offset(pgd, addr); 650 pud_t *pud = pud_offset(pgd, addr);
651 pmd_t *pmd = pmd_offset(pud, addr); 651 pmd_t *pmd = pmd_offset(pud, addr);
652 struct address_space *mapping; 652 struct address_space *mapping;
653 pgoff_t index; 653 pgoff_t index;
654 static unsigned long resume; 654 static unsigned long resume;
655 static unsigned long nr_shown; 655 static unsigned long nr_shown;
656 static unsigned long nr_unshown; 656 static unsigned long nr_unshown;
657 657
658 /* 658 /*
659 * Allow a burst of 60 reports, then keep quiet for that minute; 659 * Allow a burst of 60 reports, then keep quiet for that minute;
660 * or allow a steady drip of one report per second. 660 * or allow a steady drip of one report per second.
661 */ 661 */
662 if (nr_shown == 60) { 662 if (nr_shown == 60) {
663 if (time_before(jiffies, resume)) { 663 if (time_before(jiffies, resume)) {
664 nr_unshown++; 664 nr_unshown++;
665 return; 665 return;
666 } 666 }
667 if (nr_unshown) { 667 if (nr_unshown) {
668 printk(KERN_ALERT 668 printk(KERN_ALERT
669 "BUG: Bad page map: %lu messages suppressed\n", 669 "BUG: Bad page map: %lu messages suppressed\n",
670 nr_unshown); 670 nr_unshown);
671 nr_unshown = 0; 671 nr_unshown = 0;
672 } 672 }
673 nr_shown = 0; 673 nr_shown = 0;
674 } 674 }
675 if (nr_shown++ == 0) 675 if (nr_shown++ == 0)
676 resume = jiffies + 60 * HZ; 676 resume = jiffies + 60 * HZ;
677 677
678 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; 678 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
679 index = linear_page_index(vma, addr); 679 index = linear_page_index(vma, addr);
680 680
681 printk(KERN_ALERT 681 printk(KERN_ALERT
682 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 682 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
683 current->comm, 683 current->comm,
684 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 684 (long long)pte_val(pte), (long long)pmd_val(*pmd));
685 if (page) 685 if (page)
686 dump_page(page, "bad pte"); 686 dump_page(page, "bad pte");
687 printk(KERN_ALERT 687 printk(KERN_ALERT
688 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 688 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
689 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 689 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 if (vma->vm_ops)
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
695 vma->vm_ops->fault); 695 vma->vm_ops->fault);
696 if (vma->vm_file) 696 if (vma->vm_file)
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
698 vma->vm_file->f_op->mmap); 698 vma->vm_file->f_op->mmap);
699 dump_stack(); 699 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701 } 701 }
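The static counters above implement a simple burst limiter: up to 60 reports may be printed inside the minute that starts with the first one, everything else in that minute is only counted, and a flow of one report per second or slower is never throttled. A worked trace with hypothetical jiffies values (HZ assumed to be 1000):

/*
 * Hypothetical trace of the burst limiter, HZ == 1000:
 *
 *   report  #1 at jiffies 10000 -> nr_shown = 1, resume = 70000
 *   reports up to #60 by 12000  -> all printed, nr_shown = 60
 *   reports before 70000        -> nr_unshown++ only (suppressed)
 *   first report at >= 70000    -> "... messages suppressed" printed,
 *                                  counters reset, new minute started
 *
 * At one report per second the 61st arrives at or after resume, so the
 * "steady drip" case never suppresses anything.
 */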
702 702
703 /* 703 /*
704 * vm_normal_page -- This function gets the "struct page" associated with a pte. 704 * vm_normal_page -- This function gets the "struct page" associated with a pte.
705 * 705 *
706 * "Special" mappings do not wish to be associated with a "struct page" (either 706 * "Special" mappings do not wish to be associated with a "struct page" (either
707 * it doesn't exist, or it exists but they don't want to touch it). In this 707 * it doesn't exist, or it exists but they don't want to touch it). In this
708 * case, NULL is returned here. "Normal" mappings do have a struct page. 708 * case, NULL is returned here. "Normal" mappings do have a struct page.
709 * 709 *
710 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 710 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
711 * pte bit, in which case this function is trivial. Secondly, an architecture 711 * pte bit, in which case this function is trivial. Secondly, an architecture
712 * may not have a spare pte bit, which requires a more complicated scheme, 712 * may not have a spare pte bit, which requires a more complicated scheme,
713 * described below. 713 * described below.
714 * 714 *
715 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 715 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
716 * special mapping (even if there are underlying and valid "struct pages"). 716 * special mapping (even if there are underlying and valid "struct pages").
717 * COWed pages of a VM_PFNMAP are always normal. 717 * COWed pages of a VM_PFNMAP are always normal.
718 * 718 *
719 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 719 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
720 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 720 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
721 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 721 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
722 * mapping will always honor the rule 722 * mapping will always honor the rule
723 * 723 *
724 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 724 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
725 * 725 *
726 * And for normal mappings this is false. 726 * And for normal mappings this is false.
727 * 727 *
728 * This restricts such mappings to be a linear translation from virtual address 728 * This restricts such mappings to be a linear translation from virtual address
729 * to pfn. To get around this restriction, we allow arbitrary mappings so long 729 * to pfn. To get around this restriction, we allow arbitrary mappings so long
730 * as the vma is not a COW mapping; in that case, we know that all ptes are 730 * as the vma is not a COW mapping; in that case, we know that all ptes are
731 * special (because none can have been COWed). 731 * special (because none can have been COWed).
732 * 732 *
733 * 733 *
734 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 734 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
735 * 735 *
736 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 736 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
737 * page" backing, however the difference is that _all_ pages with a struct 737 * page" backing, however the difference is that _all_ pages with a struct
738 * page (that is, those where pfn_valid is true) are refcounted and considered 738 * page (that is, those where pfn_valid is true) are refcounted and considered
739 * normal pages by the VM. The disadvantage is that pages are refcounted 739 * normal pages by the VM. The disadvantage is that pages are refcounted
740 * (which can be slower and simply not an option for some PFNMAP users). The 740 * (which can be slower and simply not an option for some PFNMAP users). The
741 * advantage is that we don't have to follow the strict linearity rule of 741 * advantage is that we don't have to follow the strict linearity rule of
742 * PFNMAP mappings in order to support COWable mappings. 742 * PFNMAP mappings in order to support COWable mappings.
743 * 743 *
744 */ 744 */
745 #ifdef __HAVE_ARCH_PTE_SPECIAL 745 #ifdef __HAVE_ARCH_PTE_SPECIAL
746 # define HAVE_PTE_SPECIAL 1 746 # define HAVE_PTE_SPECIAL 1
747 #else 747 #else
748 # define HAVE_PTE_SPECIAL 0 748 # define HAVE_PTE_SPECIAL 0
749 #endif 749 #endif
750 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 750 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
751 pte_t pte) 751 pte_t pte)
752 { 752 {
753 unsigned long pfn = pte_pfn(pte); 753 unsigned long pfn = pte_pfn(pte);
754 754
755 if (HAVE_PTE_SPECIAL) { 755 if (HAVE_PTE_SPECIAL) {
756 if (likely(!pte_special(pte))) 756 if (likely(!pte_special(pte)))
757 goto check_pfn; 757 goto check_pfn;
758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
759 return NULL; 759 return NULL;
760 if (!is_zero_pfn(pfn)) 760 if (!is_zero_pfn(pfn))
761 print_bad_pte(vma, addr, pte, NULL); 761 print_bad_pte(vma, addr, pte, NULL);
762 return NULL; 762 return NULL;
763 } 763 }
764 764
765 /* !HAVE_PTE_SPECIAL case follows: */ 765 /* !HAVE_PTE_SPECIAL case follows: */
766 766
767 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 767 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
768 if (vma->vm_flags & VM_MIXEDMAP) { 768 if (vma->vm_flags & VM_MIXEDMAP) {
769 if (!pfn_valid(pfn)) 769 if (!pfn_valid(pfn))
770 return NULL; 770 return NULL;
771 goto out; 771 goto out;
772 } else { 772 } else {
773 unsigned long off; 773 unsigned long off;
774 off = (addr - vma->vm_start) >> PAGE_SHIFT; 774 off = (addr - vma->vm_start) >> PAGE_SHIFT;
775 if (pfn == vma->vm_pgoff + off) 775 if (pfn == vma->vm_pgoff + off)
776 return NULL; 776 return NULL;
777 if (!is_cow_mapping(vma->vm_flags)) 777 if (!is_cow_mapping(vma->vm_flags))
778 return NULL; 778 return NULL;
779 } 779 }
780 } 780 }
781 781
782 if (is_zero_pfn(pfn)) 782 if (is_zero_pfn(pfn))
783 return NULL; 783 return NULL;
784 check_pfn: 784 check_pfn:
785 if (unlikely(pfn > highest_memmap_pfn)) { 785 if (unlikely(pfn > highest_memmap_pfn)) {
786 print_bad_pte(vma, addr, pte, NULL); 786 print_bad_pte(vma, addr, pte, NULL);
787 return NULL; 787 return NULL;
788 } 788 }
789 789
790 /* 790 /*
791 * NOTE! We still have PageReserved() pages in the page tables. 791 * NOTE! We still have PageReserved() pages in the page tables.
792 * eg. VDSO mappings can cause them to exist. 792 * eg. VDSO mappings can cause them to exist.
793 */ 793 */
794 out: 794 out:
795 return pfn_to_page(pfn); 795 return pfn_to_page(pfn);
796 } 796 }
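The linearity rule quoted in the comment is plain arithmetic: for a raw remap_pfn_range() mapping the pfn in a pte must equal vm_pgoff plus the page index of the address inside the vma, and in the !HAVE_PTE_SPECIAL path a matching pfn is treated as special while a mismatching one in a COW mapping is a normal, COWed page. A standalone check with made-up numbers (userspace C, PAGE_SHIFT assumed to be 12):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;
	unsigned long vm_pgoff = 0x90000;		/* first pfn mapped     */
	unsigned long addr     = vm_start + 5 * 4096;	/* sixth page of vma    */
	unsigned long pte_pfn  = 0x90005;		/* pfn found in the pte */

	unsigned long expect = vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT);

	/* equal -> raw PFNMAP pte (special); unequal -> COWed normal page */
	printf("expected %#lx, got %#lx: %s\n", expect, pte_pfn,
	       expect == pte_pfn ? "special" : "COWed/normal");
	return 0;
}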
797 797
798 /* 798 /*
799 * copy one vm_area from one task to the other. Assumes the page tables 799 * copy one vm_area from one task to the other. Assumes the page tables
800 * already present in the new task to be cleared in the whole range 800 * already present in the new task to be cleared in the whole range
801 * covered by this vma. 801 * covered by this vma.
802 */ 802 */
803 803
804 static inline unsigned long 804 static inline unsigned long
805 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 805 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
806 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 806 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
807 unsigned long addr, int *rss) 807 unsigned long addr, int *rss)
808 { 808 {
809 unsigned long vm_flags = vma->vm_flags; 809 unsigned long vm_flags = vma->vm_flags;
810 pte_t pte = *src_pte; 810 pte_t pte = *src_pte;
811 struct page *page; 811 struct page *page;
812 812
813 /* pte contains position in swap or file, so copy. */ 813 /* pte contains position in swap or file, so copy. */
814 if (unlikely(!pte_present(pte))) { 814 if (unlikely(!pte_present(pte))) {
815 if (!pte_file(pte)) { 815 if (!pte_file(pte)) {
816 swp_entry_t entry = pte_to_swp_entry(pte); 816 swp_entry_t entry = pte_to_swp_entry(pte);
817 817
818 if (swap_duplicate(entry) < 0) 818 if (swap_duplicate(entry) < 0)
819 return entry.val; 819 return entry.val;
820 820
821 /* make sure dst_mm is on swapoff's mmlist. */ 821 /* make sure dst_mm is on swapoff's mmlist. */
822 if (unlikely(list_empty(&dst_mm->mmlist))) { 822 if (unlikely(list_empty(&dst_mm->mmlist))) {
823 spin_lock(&mmlist_lock); 823 spin_lock(&mmlist_lock);
824 if (list_empty(&dst_mm->mmlist)) 824 if (list_empty(&dst_mm->mmlist))
825 list_add(&dst_mm->mmlist, 825 list_add(&dst_mm->mmlist,
826 &src_mm->mmlist); 826 &src_mm->mmlist);
827 spin_unlock(&mmlist_lock); 827 spin_unlock(&mmlist_lock);
828 } 828 }
829 if (likely(!non_swap_entry(entry))) 829 if (likely(!non_swap_entry(entry)))
830 rss[MM_SWAPENTS]++; 830 rss[MM_SWAPENTS]++;
831 else if (is_migration_entry(entry)) { 831 else if (is_migration_entry(entry)) {
832 page = migration_entry_to_page(entry); 832 page = migration_entry_to_page(entry);
833 833
834 if (PageAnon(page)) 834 if (PageAnon(page))
835 rss[MM_ANONPAGES]++; 835 rss[MM_ANONPAGES]++;
836 else 836 else
837 rss[MM_FILEPAGES]++; 837 rss[MM_FILEPAGES]++;
838 838
839 if (is_write_migration_entry(entry) && 839 if (is_write_migration_entry(entry) &&
840 is_cow_mapping(vm_flags)) { 840 is_cow_mapping(vm_flags)) {
841 /* 841 /*
842 * COW mappings require pages in both 842 * COW mappings require pages in both
843 * parent and child to be set to read. 843 * parent and child to be set to read.
844 */ 844 */
845 make_migration_entry_read(&entry); 845 make_migration_entry_read(&entry);
846 pte = swp_entry_to_pte(entry); 846 pte = swp_entry_to_pte(entry);
847 if (pte_swp_soft_dirty(*src_pte)) 847 if (pte_swp_soft_dirty(*src_pte))
848 pte = pte_swp_mksoft_dirty(pte); 848 pte = pte_swp_mksoft_dirty(pte);
849 set_pte_at(src_mm, addr, src_pte, pte); 849 set_pte_at(src_mm, addr, src_pte, pte);
850 } 850 }
851 } 851 }
852 } 852 }
853 goto out_set_pte; 853 goto out_set_pte;
854 } 854 }
855 855
856 /* 856 /*
857 * If it's a COW mapping, write protect it both 857 * If it's a COW mapping, write protect it both
858 * in the parent and the child 858 * in the parent and the child
859 */ 859 */
860 if (is_cow_mapping(vm_flags)) { 860 if (is_cow_mapping(vm_flags)) {
861 ptep_set_wrprotect(src_mm, addr, src_pte); 861 ptep_set_wrprotect(src_mm, addr, src_pte);
862 pte = pte_wrprotect(pte); 862 pte = pte_wrprotect(pte);
863 } 863 }
864 864
865 /* 865 /*
866 * If it's a shared mapping, mark it clean in 866 * If it's a shared mapping, mark it clean in
867 * the child 867 * the child
868 */ 868 */
869 if (vm_flags & VM_SHARED) 869 if (vm_flags & VM_SHARED)
870 pte = pte_mkclean(pte); 870 pte = pte_mkclean(pte);
871 pte = pte_mkold(pte); 871 pte = pte_mkold(pte);
872 872
873 page = vm_normal_page(vma, addr, pte); 873 page = vm_normal_page(vma, addr, pte);
874 if (page) { 874 if (page) {
875 get_page(page); 875 get_page(page);
876 page_dup_rmap(page); 876 page_dup_rmap(page);
877 if (PageAnon(page)) 877 if (PageAnon(page))
878 rss[MM_ANONPAGES]++; 878 rss[MM_ANONPAGES]++;
879 else 879 else
880 rss[MM_FILEPAGES]++; 880 rss[MM_FILEPAGES]++;
881 } 881 }
882 882
883 out_set_pte: 883 out_set_pte:
884 set_pte_at(dst_mm, addr, dst_pte, pte); 884 set_pte_at(dst_mm, addr, dst_pte, pte);
885 return 0; 885 return 0;
886 } 886 }
887 887
888 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 888 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
889 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 889 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
890 unsigned long addr, unsigned long end) 890 unsigned long addr, unsigned long end)
891 { 891 {
892 pte_t *orig_src_pte, *orig_dst_pte; 892 pte_t *orig_src_pte, *orig_dst_pte;
893 pte_t *src_pte, *dst_pte; 893 pte_t *src_pte, *dst_pte;
894 spinlock_t *src_ptl, *dst_ptl; 894 spinlock_t *src_ptl, *dst_ptl;
895 int progress = 0; 895 int progress = 0;
896 int rss[NR_MM_COUNTERS]; 896 int rss[NR_MM_COUNTERS];
897 swp_entry_t entry = (swp_entry_t){0}; 897 swp_entry_t entry = (swp_entry_t){0};
898 898
899 again: 899 again:
900 init_rss_vec(rss); 900 init_rss_vec(rss);
901 901
902 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 902 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
903 if (!dst_pte) 903 if (!dst_pte)
904 return -ENOMEM; 904 return -ENOMEM;
905 src_pte = pte_offset_map(src_pmd, addr); 905 src_pte = pte_offset_map(src_pmd, addr);
906 src_ptl = pte_lockptr(src_mm, src_pmd); 906 src_ptl = pte_lockptr(src_mm, src_pmd);
907 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 907 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
908 orig_src_pte = src_pte; 908 orig_src_pte = src_pte;
909 orig_dst_pte = dst_pte; 909 orig_dst_pte = dst_pte;
910 arch_enter_lazy_mmu_mode(); 910 arch_enter_lazy_mmu_mode();
911 911
912 do { 912 do {
913 /* 913 /*
914 * We are holding two locks at this point - either of them 914 * We are holding two locks at this point - either of them
915 * could generate latencies in another task on another CPU. 915 * could generate latencies in another task on another CPU.
916 */ 916 */
917 if (progress >= 32) { 917 if (progress >= 32) {
918 progress = 0; 918 progress = 0;
919 if (need_resched() || 919 if (need_resched() ||
920 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 920 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
921 break; 921 break;
922 } 922 }
923 if (pte_none(*src_pte)) { 923 if (pte_none(*src_pte)) {
924 progress++; 924 progress++;
925 continue; 925 continue;
926 } 926 }
927 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 927 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
928 vma, addr, rss); 928 vma, addr, rss);
929 if (entry.val) 929 if (entry.val)
930 break; 930 break;
931 progress += 8; 931 progress += 8;
932 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 932 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
933 933
934 arch_leave_lazy_mmu_mode(); 934 arch_leave_lazy_mmu_mode();
935 spin_unlock(src_ptl); 935 spin_unlock(src_ptl);
936 pte_unmap(orig_src_pte); 936 pte_unmap(orig_src_pte);
937 add_mm_rss_vec(dst_mm, rss); 937 add_mm_rss_vec(dst_mm, rss);
938 pte_unmap_unlock(orig_dst_pte, dst_ptl); 938 pte_unmap_unlock(orig_dst_pte, dst_ptl);
939 cond_resched(); 939 cond_resched();
940 940
941 if (entry.val) { 941 if (entry.val) {
942 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) 942 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
943 return -ENOMEM; 943 return -ENOMEM;
944 progress = 0; 944 progress = 0;
945 } 945 }
946 if (addr != end) 946 if (addr != end)
947 goto again; 947 goto again;
948 return 0; 948 return 0;
949 } 949 }
950 950
951 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 951 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
952 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 952 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
953 unsigned long addr, unsigned long end) 953 unsigned long addr, unsigned long end)
954 { 954 {
955 pmd_t *src_pmd, *dst_pmd; 955 pmd_t *src_pmd, *dst_pmd;
956 unsigned long next; 956 unsigned long next;
957 957
958 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 958 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
959 if (!dst_pmd) 959 if (!dst_pmd)
960 return -ENOMEM; 960 return -ENOMEM;
961 src_pmd = pmd_offset(src_pud, addr); 961 src_pmd = pmd_offset(src_pud, addr);
962 do { 962 do {
963 next = pmd_addr_end(addr, end); 963 next = pmd_addr_end(addr, end);
964 if (pmd_trans_huge(*src_pmd)) { 964 if (pmd_trans_huge(*src_pmd)) {
965 int err; 965 int err;
966 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 966 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
967 err = copy_huge_pmd(dst_mm, src_mm, 967 err = copy_huge_pmd(dst_mm, src_mm,
968 dst_pmd, src_pmd, addr, vma); 968 dst_pmd, src_pmd, addr, vma);
969 if (err == -ENOMEM) 969 if (err == -ENOMEM)
970 return -ENOMEM; 970 return -ENOMEM;
971 if (!err) 971 if (!err)
972 continue; 972 continue;
973 /* fall through */ 973 /* fall through */
974 } 974 }
975 if (pmd_none_or_clear_bad(src_pmd)) 975 if (pmd_none_or_clear_bad(src_pmd))
976 continue; 976 continue;
977 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 977 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
978 vma, addr, next)) 978 vma, addr, next))
979 return -ENOMEM; 979 return -ENOMEM;
980 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 980 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
981 return 0; 981 return 0;
982 } 982 }
983 983
984 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 984 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
985 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 985 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
986 unsigned long addr, unsigned long end) 986 unsigned long addr, unsigned long end)
987 { 987 {
988 pud_t *src_pud, *dst_pud; 988 pud_t *src_pud, *dst_pud;
989 unsigned long next; 989 unsigned long next;
990 990
991 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 991 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
992 if (!dst_pud) 992 if (!dst_pud)
993 return -ENOMEM; 993 return -ENOMEM;
994 src_pud = pud_offset(src_pgd, addr); 994 src_pud = pud_offset(src_pgd, addr);
995 do { 995 do {
996 next = pud_addr_end(addr, end); 996 next = pud_addr_end(addr, end);
997 if (pud_none_or_clear_bad(src_pud)) 997 if (pud_none_or_clear_bad(src_pud))
998 continue; 998 continue;
999 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 999 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1000 vma, addr, next)) 1000 vma, addr, next))
1001 return -ENOMEM; 1001 return -ENOMEM;
1002 } while (dst_pud++, src_pud++, addr = next, addr != end); 1002 } while (dst_pud++, src_pud++, addr = next, addr != end);
1003 return 0; 1003 return 0;
1004 } 1004 }
1005 1005
1006 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1006 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1007 struct vm_area_struct *vma) 1007 struct vm_area_struct *vma)
1008 { 1008 {
1009 pgd_t *src_pgd, *dst_pgd; 1009 pgd_t *src_pgd, *dst_pgd;
1010 unsigned long next; 1010 unsigned long next;
1011 unsigned long addr = vma->vm_start; 1011 unsigned long addr = vma->vm_start;
1012 unsigned long end = vma->vm_end; 1012 unsigned long end = vma->vm_end;
1013 unsigned long mmun_start; /* For mmu_notifiers */ 1013 unsigned long mmun_start; /* For mmu_notifiers */
1014 unsigned long mmun_end; /* For mmu_notifiers */ 1014 unsigned long mmun_end; /* For mmu_notifiers */
1015 bool is_cow; 1015 bool is_cow;
1016 int ret; 1016 int ret;
1017 1017
1018 /* 1018 /*
1019 * Don't copy ptes where a page fault will fill them correctly. 1019 * Don't copy ptes where a page fault will fill them correctly.
1020 * Fork becomes much lighter when there are big shared or private 1020 * Fork becomes much lighter when there are big shared or private
1021 * readonly mappings. The tradeoff is that copy_page_range is more 1021 * readonly mappings. The tradeoff is that copy_page_range is more
1022 * efficient than faulting. 1022 * efficient than faulting.
1023 */ 1023 */
1024 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | 1024 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1025 VM_PFNMAP | VM_MIXEDMAP))) { 1025 VM_PFNMAP | VM_MIXEDMAP))) {
1026 if (!vma->anon_vma) 1026 if (!vma->anon_vma)
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 1029
1030 if (is_vm_hugetlb_page(vma)) 1030 if (is_vm_hugetlb_page(vma))
1031 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1031 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1032 1032
1033 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 1033 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1034 /* 1034 /*
1035 * We do not free on error cases below as remove_vma 1035 * We do not free on error cases below as remove_vma
1036 * gets called on error from higher level routine 1036 * gets called on error from higher level routine
1037 */ 1037 */
1038 ret = track_pfn_copy(vma); 1038 ret = track_pfn_copy(vma);
1039 if (ret) 1039 if (ret)
1040 return ret; 1040 return ret;
1041 } 1041 }
1042 1042
1043 /* 1043 /*
1044 * We need to invalidate the secondary MMU mappings only when 1044 * We need to invalidate the secondary MMU mappings only when
1045 * there could be a permission downgrade on the ptes of the 1045 * there could be a permission downgrade on the ptes of the
1046 * parent mm. And a permission downgrade will only happen if 1046 * parent mm. And a permission downgrade will only happen if
1047 * is_cow_mapping() returns true. 1047 * is_cow_mapping() returns true.
1048 */ 1048 */
1049 is_cow = is_cow_mapping(vma->vm_flags); 1049 is_cow = is_cow_mapping(vma->vm_flags);
1050 mmun_start = addr; 1050 mmun_start = addr;
1051 mmun_end = end; 1051 mmun_end = end;
1052 if (is_cow) 1052 if (is_cow)
1053 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1053 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1054 mmun_end); 1054 mmun_end);
1055 1055
1056 ret = 0; 1056 ret = 0;
1057 dst_pgd = pgd_offset(dst_mm, addr); 1057 dst_pgd = pgd_offset(dst_mm, addr);
1058 src_pgd = pgd_offset(src_mm, addr); 1058 src_pgd = pgd_offset(src_mm, addr);
1059 do { 1059 do {
1060 next = pgd_addr_end(addr, end); 1060 next = pgd_addr_end(addr, end);
1061 if (pgd_none_or_clear_bad(src_pgd)) 1061 if (pgd_none_or_clear_bad(src_pgd))
1062 continue; 1062 continue;
1063 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 1063 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1064 vma, addr, next))) { 1064 vma, addr, next))) {
1065 ret = -ENOMEM; 1065 ret = -ENOMEM;
1066 break; 1066 break;
1067 } 1067 }
1068 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1068 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1069 1069
1070 if (is_cow) 1070 if (is_cow)
1071 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1071 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1072 return ret; 1072 return ret;
1073 } 1073 }
1074 1074
1075 static unsigned long zap_pte_range(struct mmu_gather *tlb, 1075 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1076 struct vm_area_struct *vma, pmd_t *pmd, 1076 struct vm_area_struct *vma, pmd_t *pmd,
1077 unsigned long addr, unsigned long end, 1077 unsigned long addr, unsigned long end,
1078 struct zap_details *details) 1078 struct zap_details *details)
1079 { 1079 {
1080 struct mm_struct *mm = tlb->mm; 1080 struct mm_struct *mm = tlb->mm;
1081 int force_flush = 0; 1081 int force_flush = 0;
1082 int rss[NR_MM_COUNTERS]; 1082 int rss[NR_MM_COUNTERS];
1083 spinlock_t *ptl; 1083 spinlock_t *ptl;
1084 pte_t *start_pte; 1084 pte_t *start_pte;
1085 pte_t *pte; 1085 pte_t *pte;
1086 1086
1087 again: 1087 again:
1088 init_rss_vec(rss); 1088 init_rss_vec(rss);
1089 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1089 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1090 pte = start_pte; 1090 pte = start_pte;
1091 arch_enter_lazy_mmu_mode(); 1091 arch_enter_lazy_mmu_mode();
1092 do { 1092 do {
1093 pte_t ptent = *pte; 1093 pte_t ptent = *pte;
1094 if (pte_none(ptent)) { 1094 if (pte_none(ptent)) {
1095 continue; 1095 continue;
1096 } 1096 }
1097 1097
1098 if (pte_present(ptent)) { 1098 if (pte_present(ptent)) {
1099 struct page *page; 1099 struct page *page;
1100 1100
1101 page = vm_normal_page(vma, addr, ptent); 1101 page = vm_normal_page(vma, addr, ptent);
1102 if (unlikely(details) && page) { 1102 if (unlikely(details) && page) {
1103 /* 1103 /*
1104 * unmap_shared_mapping_pages() wants to 1104 * unmap_shared_mapping_pages() wants to
1105 * invalidate cache without truncating: 1105 * invalidate cache without truncating:
1106 * unmap shared but keep private pages. 1106 * unmap shared but keep private pages.
1107 */ 1107 */
1108 if (details->check_mapping && 1108 if (details->check_mapping &&
1109 details->check_mapping != page->mapping) 1109 details->check_mapping != page->mapping)
1110 continue; 1110 continue;
1111 /* 1111 /*
1112 * Each page->index must be checked when 1112 * Each page->index must be checked when
1113 * invalidating or truncating nonlinear. 1113 * invalidating or truncating nonlinear.
1114 */ 1114 */
1115 if (details->nonlinear_vma && 1115 if (details->nonlinear_vma &&
1116 (page->index < details->first_index || 1116 (page->index < details->first_index ||
1117 page->index > details->last_index)) 1117 page->index > details->last_index))
1118 continue; 1118 continue;
1119 } 1119 }
1120 ptent = ptep_get_and_clear_full(mm, addr, pte, 1120 ptent = ptep_get_and_clear_full(mm, addr, pte,
1121 tlb->fullmm); 1121 tlb->fullmm);
1122 tlb_remove_tlb_entry(tlb, pte, addr); 1122 tlb_remove_tlb_entry(tlb, pte, addr);
1123 if (unlikely(!page)) 1123 if (unlikely(!page))
1124 continue; 1124 continue;
1125 if (unlikely(details) && details->nonlinear_vma 1125 if (unlikely(details) && details->nonlinear_vma
1126 && linear_page_index(details->nonlinear_vma, 1126 && linear_page_index(details->nonlinear_vma,
1127 addr) != page->index) { 1127 addr) != page->index) {
1128 pte_t ptfile = pgoff_to_pte(page->index); 1128 pte_t ptfile = pgoff_to_pte(page->index);
1129 if (pte_soft_dirty(ptent)) 1129 if (pte_soft_dirty(ptent))
1130 pte_file_mksoft_dirty(ptfile); 1130 ptfile = pte_file_mksoft_dirty(ptfile);
1131 set_pte_at(mm, addr, pte, ptfile); 1131 set_pte_at(mm, addr, pte, ptfile);
1132 } 1132 }
1133 if (PageAnon(page)) 1133 if (PageAnon(page))
1134 rss[MM_ANONPAGES]--; 1134 rss[MM_ANONPAGES]--;
1135 else { 1135 else {
1136 if (pte_dirty(ptent)) { 1136 if (pte_dirty(ptent)) {
1137 force_flush = 1; 1137 force_flush = 1;
1138 set_page_dirty(page); 1138 set_page_dirty(page);
1139 } 1139 }
1140 if (pte_young(ptent) && 1140 if (pte_young(ptent) &&
1141 likely(!(vma->vm_flags & VM_SEQ_READ))) 1141 likely(!(vma->vm_flags & VM_SEQ_READ)))
1142 mark_page_accessed(page); 1142 mark_page_accessed(page);
1143 rss[MM_FILEPAGES]--; 1143 rss[MM_FILEPAGES]--;
1144 } 1144 }
1145 page_remove_rmap(page); 1145 page_remove_rmap(page);
1146 if (unlikely(page_mapcount(page) < 0)) 1146 if (unlikely(page_mapcount(page) < 0))
1147 print_bad_pte(vma, addr, ptent, page); 1147 print_bad_pte(vma, addr, ptent, page);
1148 if (unlikely(!__tlb_remove_page(tlb, page))) { 1148 if (unlikely(!__tlb_remove_page(tlb, page))) {
1149 force_flush = 1; 1149 force_flush = 1;
1150 break; 1150 break;
1151 } 1151 }
1152 continue; 1152 continue;
1153 } 1153 }
1154 /* 1154 /*
1155 * If details->check_mapping, we leave swap entries; 1155 * If details->check_mapping, we leave swap entries;
1156 * if details->nonlinear_vma, we leave file entries. 1156 * if details->nonlinear_vma, we leave file entries.
1157 */ 1157 */
1158 if (unlikely(details)) 1158 if (unlikely(details))
1159 continue; 1159 continue;
1160 if (pte_file(ptent)) { 1160 if (pte_file(ptent)) {
1161 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 1161 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1162 print_bad_pte(vma, addr, ptent, NULL); 1162 print_bad_pte(vma, addr, ptent, NULL);
1163 } else { 1163 } else {
1164 swp_entry_t entry = pte_to_swp_entry(ptent); 1164 swp_entry_t entry = pte_to_swp_entry(ptent);
1165 1165
1166 if (!non_swap_entry(entry)) 1166 if (!non_swap_entry(entry))
1167 rss[MM_SWAPENTS]--; 1167 rss[MM_SWAPENTS]--;
1168 else if (is_migration_entry(entry)) { 1168 else if (is_migration_entry(entry)) {
1169 struct page *page; 1169 struct page *page;
1170 1170
1171 page = migration_entry_to_page(entry); 1171 page = migration_entry_to_page(entry);
1172 1172
1173 if (PageAnon(page)) 1173 if (PageAnon(page))
1174 rss[MM_ANONPAGES]--; 1174 rss[MM_ANONPAGES]--;
1175 else 1175 else
1176 rss[MM_FILEPAGES]--; 1176 rss[MM_FILEPAGES]--;
1177 } 1177 }
1178 if (unlikely(!free_swap_and_cache(entry))) 1178 if (unlikely(!free_swap_and_cache(entry)))
1179 print_bad_pte(vma, addr, ptent, NULL); 1179 print_bad_pte(vma, addr, ptent, NULL);
1180 } 1180 }
1181 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1181 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1182 } while (pte++, addr += PAGE_SIZE, addr != end); 1182 } while (pte++, addr += PAGE_SIZE, addr != end);
1183 1183
1184 add_mm_rss_vec(mm, rss); 1184 add_mm_rss_vec(mm, rss);
1185 arch_leave_lazy_mmu_mode(); 1185 arch_leave_lazy_mmu_mode();
1186 1186
1187 /* Do the actual TLB flush before dropping ptl */ 1187 /* Do the actual TLB flush before dropping ptl */
1188 if (force_flush) { 1188 if (force_flush) {
1189 unsigned long old_end; 1189 unsigned long old_end;
1190 1190
1191 /* 1191 /*
1192 * Flush the TLB just for the previous segment, 1192 * Flush the TLB just for the previous segment,
1193 * then update the range to be the remaining 1193 * then update the range to be the remaining
1194 * TLB range. 1194 * TLB range.
1195 */ 1195 */
1196 old_end = tlb->end; 1196 old_end = tlb->end;
1197 tlb->end = addr; 1197 tlb->end = addr;
1198 tlb_flush_mmu_tlbonly(tlb); 1198 tlb_flush_mmu_tlbonly(tlb);
1199 tlb->start = addr; 1199 tlb->start = addr;
1200 tlb->end = old_end; 1200 tlb->end = old_end;
1201 } 1201 }
1202 pte_unmap_unlock(start_pte, ptl); 1202 pte_unmap_unlock(start_pte, ptl);
1203 1203
1204 /* 1204 /*
1205 * If we forced a TLB flush (either due to running out of 1205 * If we forced a TLB flush (either due to running out of
1206 * batch buffers or because we needed to flush dirty TLB 1206 * batch buffers or because we needed to flush dirty TLB
1207 * entries before releasing the ptl), free the batched 1207 * entries before releasing the ptl), free the batched
1208 * memory too. Restart if we didn't do everything. 1208 * memory too. Restart if we didn't do everything.
1209 */ 1209 */
1210 if (force_flush) { 1210 if (force_flush) {
1211 force_flush = 0; 1211 force_flush = 0;
1212 tlb_flush_mmu_free(tlb); 1212 tlb_flush_mmu_free(tlb);
1213 1213
1214 if (addr != end) 1214 if (addr != end)
1215 goto again; 1215 goto again;
1216 } 1216 }
1217 1217
1218 return addr; 1218 return addr;
1219 } 1219 }
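The force_flush bookkeeping above flushes only the span already walked while the ptl is still held, then restores the gather range so the untouched remainder is covered by a later flush. A short hypothetical trace of the tlb->start/tlb->end juggling:

/*
 * Hypothetical values for the partial flush above:
 *
 *   gather range          tlb->start = 0x400000, tlb->end = 0x800000
 *   loop stops early at   addr = 0x600000 (batch full or dirty pte seen)
 *
 *   old_end    = 0x800000
 *   tlb->end   = 0x600000  -> tlb_flush_mmu_tlbonly() covers [0x400000, 0x600000)
 *   tlb->start = 0x600000
 *   tlb->end   = 0x800000  -> the remainder is flushed later as usual
 */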
1220 1220
1221 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1221 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1222 struct vm_area_struct *vma, pud_t *pud, 1222 struct vm_area_struct *vma, pud_t *pud,
1223 unsigned long addr, unsigned long end, 1223 unsigned long addr, unsigned long end,
1224 struct zap_details *details) 1224 struct zap_details *details)
1225 { 1225 {
1226 pmd_t *pmd; 1226 pmd_t *pmd;
1227 unsigned long next; 1227 unsigned long next;
1228 1228
1229 pmd = pmd_offset(pud, addr); 1229 pmd = pmd_offset(pud, addr);
1230 do { 1230 do {
1231 next = pmd_addr_end(addr, end); 1231 next = pmd_addr_end(addr, end);
1232 if (pmd_trans_huge(*pmd)) { 1232 if (pmd_trans_huge(*pmd)) {
1233 if (next - addr != HPAGE_PMD_SIZE) { 1233 if (next - addr != HPAGE_PMD_SIZE) {
1234 #ifdef CONFIG_DEBUG_VM 1234 #ifdef CONFIG_DEBUG_VM
1235 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1235 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1236 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", 1236 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1237 __func__, addr, end, 1237 __func__, addr, end,
1238 vma->vm_start, 1238 vma->vm_start,
1239 vma->vm_end); 1239 vma->vm_end);
1240 BUG(); 1240 BUG();
1241 } 1241 }
1242 #endif 1242 #endif
1243 split_huge_page_pmd(vma, addr, pmd); 1243 split_huge_page_pmd(vma, addr, pmd);
1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1245 goto next; 1245 goto next;
1246 /* fall through */ 1246 /* fall through */
1247 } 1247 }
1248 /* 1248 /*
1249 * Here there can be other concurrent MADV_DONTNEED or 1249 * Here there can be other concurrent MADV_DONTNEED or
1250 * trans huge page faults running, and if the pmd is 1250 * trans huge page faults running, and if the pmd is
1251 * none or trans huge it can change under us. This is 1251 * none or trans huge it can change under us. This is
1252 * because MADV_DONTNEED holds the mmap_sem in read 1252 * because MADV_DONTNEED holds the mmap_sem in read
1253 * mode. 1253 * mode.
1254 */ 1254 */
1255 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1255 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1256 goto next; 1256 goto next;
1257 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1257 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1258 next: 1258 next:
1259 cond_resched(); 1259 cond_resched();
1260 } while (pmd++, addr = next, addr != end); 1260 } while (pmd++, addr = next, addr != end);
1261 1261
1262 return addr; 1262 return addr;
1263 } 1263 }
1264 1264
1265 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1265 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1266 struct vm_area_struct *vma, pgd_t *pgd, 1266 struct vm_area_struct *vma, pgd_t *pgd,
1267 unsigned long addr, unsigned long end, 1267 unsigned long addr, unsigned long end,
1268 struct zap_details *details) 1268 struct zap_details *details)
1269 { 1269 {
1270 pud_t *pud; 1270 pud_t *pud;
1271 unsigned long next; 1271 unsigned long next;
1272 1272
1273 pud = pud_offset(pgd, addr); 1273 pud = pud_offset(pgd, addr);
1274 do { 1274 do {
1275 next = pud_addr_end(addr, end); 1275 next = pud_addr_end(addr, end);
1276 if (pud_none_or_clear_bad(pud)) 1276 if (pud_none_or_clear_bad(pud))
1277 continue; 1277 continue;
1278 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1278 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1279 } while (pud++, addr = next, addr != end); 1279 } while (pud++, addr = next, addr != end);
1280 1280
1281 return addr; 1281 return addr;
1282 } 1282 }
1283 1283
1284 static void unmap_page_range(struct mmu_gather *tlb, 1284 static void unmap_page_range(struct mmu_gather *tlb,
1285 struct vm_area_struct *vma, 1285 struct vm_area_struct *vma,
1286 unsigned long addr, unsigned long end, 1286 unsigned long addr, unsigned long end,
1287 struct zap_details *details) 1287 struct zap_details *details)
1288 { 1288 {
1289 pgd_t *pgd; 1289 pgd_t *pgd;
1290 unsigned long next; 1290 unsigned long next;
1291 1291
1292 if (details && !details->check_mapping && !details->nonlinear_vma) 1292 if (details && !details->check_mapping && !details->nonlinear_vma)
1293 details = NULL; 1293 details = NULL;
1294 1294
1295 BUG_ON(addr >= end); 1295 BUG_ON(addr >= end);
1296 tlb_start_vma(tlb, vma); 1296 tlb_start_vma(tlb, vma);
1297 pgd = pgd_offset(vma->vm_mm, addr); 1297 pgd = pgd_offset(vma->vm_mm, addr);
1298 do { 1298 do {
1299 next = pgd_addr_end(addr, end); 1299 next = pgd_addr_end(addr, end);
1300 if (pgd_none_or_clear_bad(pgd)) 1300 if (pgd_none_or_clear_bad(pgd))
1301 continue; 1301 continue;
1302 next = zap_pud_range(tlb, vma, pgd, addr, next, details); 1302 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1303 } while (pgd++, addr = next, addr != end); 1303 } while (pgd++, addr = next, addr != end);
1304 tlb_end_vma(tlb, vma); 1304 tlb_end_vma(tlb, vma);
1305 } 1305 }
1306 1306
1307 1307
1308 static void unmap_single_vma(struct mmu_gather *tlb, 1308 static void unmap_single_vma(struct mmu_gather *tlb,
1309 struct vm_area_struct *vma, unsigned long start_addr, 1309 struct vm_area_struct *vma, unsigned long start_addr,
1310 unsigned long end_addr, 1310 unsigned long end_addr,
1311 struct zap_details *details) 1311 struct zap_details *details)
1312 { 1312 {
1313 unsigned long start = max(vma->vm_start, start_addr); 1313 unsigned long start = max(vma->vm_start, start_addr);
1314 unsigned long end; 1314 unsigned long end;
1315 1315
1316 if (start >= vma->vm_end) 1316 if (start >= vma->vm_end)
1317 return; 1317 return;
1318 end = min(vma->vm_end, end_addr); 1318 end = min(vma->vm_end, end_addr);
1319 if (end <= vma->vm_start) 1319 if (end <= vma->vm_start)
1320 return; 1320 return;
1321 1321
1322 if (vma->vm_file) 1322 if (vma->vm_file)
1323 uprobe_munmap(vma, start, end); 1323 uprobe_munmap(vma, start, end);
1324 1324
1325 if (unlikely(vma->vm_flags & VM_PFNMAP)) 1325 if (unlikely(vma->vm_flags & VM_PFNMAP))
1326 untrack_pfn(vma, 0, 0); 1326 untrack_pfn(vma, 0, 0);
1327 1327
1328 if (start != end) { 1328 if (start != end) {
1329 if (unlikely(is_vm_hugetlb_page(vma))) { 1329 if (unlikely(is_vm_hugetlb_page(vma))) {
1330 /* 1330 /*
1331 * It is undesirable to test vma->vm_file as it 1331 * It is undesirable to test vma->vm_file as it
1332 * should be non-null for valid hugetlb area. 1332 * should be non-null for valid hugetlb area.
1333 * However, vm_file will be NULL in the error 1333 * However, vm_file will be NULL in the error
1334 * cleanup path of mmap_region. When 1334 * cleanup path of mmap_region. When
1335 * hugetlbfs ->mmap method fails, 1335 * hugetlbfs ->mmap method fails,
1336 * mmap_region() nullifies vma->vm_file 1336 * mmap_region() nullifies vma->vm_file
1337 * before calling this function to clean up. 1337 * before calling this function to clean up.
1338 * Since no pte has actually been set up, it is 1338 * Since no pte has actually been set up, it is
1339 * safe to do nothing in this case. 1339 * safe to do nothing in this case.
1340 */ 1340 */
1341 if (vma->vm_file) { 1341 if (vma->vm_file) {
1342 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 1342 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1343 __unmap_hugepage_range_final(tlb, vma, start, end, NULL); 1343 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1344 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 1344 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1345 } 1345 }
1346 } else 1346 } else
1347 unmap_page_range(tlb, vma, start, end, details); 1347 unmap_page_range(tlb, vma, start, end, details);
1348 } 1348 }
1349 } 1349 }
1350 1350
1351 /** 1351 /**
1352 * unmap_vmas - unmap a range of memory covered by a list of vma's 1352 * unmap_vmas - unmap a range of memory covered by a list of vma's
1353 * @tlb: address of the caller's struct mmu_gather 1353 * @tlb: address of the caller's struct mmu_gather
1354 * @vma: the starting vma 1354 * @vma: the starting vma
1355 * @start_addr: virtual address at which to start unmapping 1355 * @start_addr: virtual address at which to start unmapping
1356 * @end_addr: virtual address at which to end unmapping 1356 * @end_addr: virtual address at which to end unmapping
1357 * 1357 *
1358 * Unmap all pages in the vma list. 1358 * Unmap all pages in the vma list.
1359 * 1359 *
1360 * Only addresses between @start_addr and @end_addr will be unmapped. 1360 * Only addresses between @start_addr and @end_addr will be unmapped.
1361 * 1361 *
1362 * The VMA list must be sorted in ascending virtual address order. 1362 * The VMA list must be sorted in ascending virtual address order.
1363 * 1363 *
1364 * unmap_vmas() assumes that the caller will flush the whole unmapped address 1364 * unmap_vmas() assumes that the caller will flush the whole unmapped address
1365 * range after unmap_vmas() returns. So the only responsibility here is to 1365 * range after unmap_vmas() returns. So the only responsibility here is to
1366 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1366 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1367 * drops the lock and schedules. 1367 * drops the lock and schedules.
1368 */ 1368 */
1369 void unmap_vmas(struct mmu_gather *tlb, 1369 void unmap_vmas(struct mmu_gather *tlb,
1370 struct vm_area_struct *vma, unsigned long start_addr, 1370 struct vm_area_struct *vma, unsigned long start_addr,
1371 unsigned long end_addr) 1371 unsigned long end_addr)
1372 { 1372 {
1373 struct mm_struct *mm = vma->vm_mm; 1373 struct mm_struct *mm = vma->vm_mm;
1374 1374
1375 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1375 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1376 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1376 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1377 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1377 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1378 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1378 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1379 } 1379 }
1380 1380
1381 /** 1381 /**
1382 * zap_page_range - remove user pages in a given range 1382 * zap_page_range - remove user pages in a given range
1383 * @vma: vm_area_struct holding the applicable pages 1383 * @vma: vm_area_struct holding the applicable pages
1384 * @start: starting address of pages to zap 1384 * @start: starting address of pages to zap
1385 * @size: number of bytes to zap 1385 * @size: number of bytes to zap
1386 * @details: details of nonlinear truncation or shared cache invalidation 1386 * @details: details of nonlinear truncation or shared cache invalidation
1387 * 1387 *
1388 * Caller must protect the VMA list 1388 * Caller must protect the VMA list
1389 */ 1389 */
1390 void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1390 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1391 unsigned long size, struct zap_details *details) 1391 unsigned long size, struct zap_details *details)
1392 { 1392 {
1393 struct mm_struct *mm = vma->vm_mm; 1393 struct mm_struct *mm = vma->vm_mm;
1394 struct mmu_gather tlb; 1394 struct mmu_gather tlb;
1395 unsigned long end = start + size; 1395 unsigned long end = start + size;
1396 1396
1397 lru_add_drain(); 1397 lru_add_drain();
1398 tlb_gather_mmu(&tlb, mm, start, end); 1398 tlb_gather_mmu(&tlb, mm, start, end);
1399 update_hiwater_rss(mm); 1399 update_hiwater_rss(mm);
1400 mmu_notifier_invalidate_range_start(mm, start, end); 1400 mmu_notifier_invalidate_range_start(mm, start, end);
1401 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1401 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1402 unmap_single_vma(&tlb, vma, start, end, details); 1402 unmap_single_vma(&tlb, vma, start, end, details);
1403 mmu_notifier_invalidate_range_end(mm, start, end); 1403 mmu_notifier_invalidate_range_end(mm, start, end);
1404 tlb_finish_mmu(&tlb, start, end); 1404 tlb_finish_mmu(&tlb, start, end);
1405 } 1405 }
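For illustration only (this sketch is not part of the diff): an in-kernel caller, in the style of MADV_DONTNEED handling, might discard the pages backing part of a VMA like this. The function name is hypothetical, and the caller is assumed to hold mm->mmap_sem so the VMA list is protected, as the kerneldoc above requires.

	static void example_discard_range(struct vm_area_struct *vma,
					  unsigned long start, unsigned long len)
	{
		/* Caller holds mm->mmap_sem; NULL details means a plain zap. */
		zap_page_range(vma, start, len, NULL);
	}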
1406 1406
1407 /** 1407 /**
1408 * zap_page_range_single - remove user pages in a given range 1408 * zap_page_range_single - remove user pages in a given range
1409 * @vma: vm_area_struct holding the applicable pages 1409 * @vma: vm_area_struct holding the applicable pages
1410 * @address: starting address of pages to zap 1410 * @address: starting address of pages to zap
1411 * @size: number of bytes to zap 1411 * @size: number of bytes to zap
1412 * @details: details of nonlinear truncation or shared cache invalidation 1412 * @details: details of nonlinear truncation or shared cache invalidation
1413 * 1413 *
1414 * The range must fit into one VMA. 1414 * The range must fit into one VMA.
1415 */ 1415 */
1416 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1416 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1417 unsigned long size, struct zap_details *details) 1417 unsigned long size, struct zap_details *details)
1418 { 1418 {
1419 struct mm_struct *mm = vma->vm_mm; 1419 struct mm_struct *mm = vma->vm_mm;
1420 struct mmu_gather tlb; 1420 struct mmu_gather tlb;
1421 unsigned long end = address + size; 1421 unsigned long end = address + size;
1422 1422
1423 lru_add_drain(); 1423 lru_add_drain();
1424 tlb_gather_mmu(&tlb, mm, address, end); 1424 tlb_gather_mmu(&tlb, mm, address, end);
1425 update_hiwater_rss(mm); 1425 update_hiwater_rss(mm);
1426 mmu_notifier_invalidate_range_start(mm, address, end); 1426 mmu_notifier_invalidate_range_start(mm, address, end);
1427 unmap_single_vma(&tlb, vma, address, end, details); 1427 unmap_single_vma(&tlb, vma, address, end, details);
1428 mmu_notifier_invalidate_range_end(mm, address, end); 1428 mmu_notifier_invalidate_range_end(mm, address, end);
1429 tlb_finish_mmu(&tlb, address, end); 1429 tlb_finish_mmu(&tlb, address, end);
1430 } 1430 }
1431 1431
1432 /** 1432 /**
1433 * zap_vma_ptes - remove ptes mapping the vma 1433 * zap_vma_ptes - remove ptes mapping the vma
1434 * @vma: vm_area_struct holding ptes to be zapped 1434 * @vma: vm_area_struct holding ptes to be zapped
1435 * @address: starting address of pages to zap 1435 * @address: starting address of pages to zap
1436 * @size: number of bytes to zap 1436 * @size: number of bytes to zap
1437 * 1437 *
1438 * This function only unmaps ptes assigned to VM_PFNMAP vmas. 1438 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1439 * 1439 *
1440 * The entire address range must be fully contained within the vma. 1440 * The entire address range must be fully contained within the vma.
1441 * 1441 *
1442 * Returns 0 if successful. 1442 * Returns 0 if successful.
1443 */ 1443 */
1444 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1444 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1445 unsigned long size) 1445 unsigned long size)
1446 { 1446 {
1447 if (address < vma->vm_start || address + size > vma->vm_end || 1447 if (address < vma->vm_start || address + size > vma->vm_end ||
1448 !(vma->vm_flags & VM_PFNMAP)) 1448 !(vma->vm_flags & VM_PFNMAP))
1449 return -1; 1449 return -1;
1450 zap_page_range_single(vma, address, size, NULL); 1450 zap_page_range_single(vma, address, size, NULL);
1451 return 0; 1451 return 0;
1452 } 1452 }
1453 EXPORT_SYMBOL_GPL(zap_vma_ptes); 1453 EXPORT_SYMBOL_GPL(zap_vma_ptes);
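As a hedged usage sketch (not taken from this commit): a driver that established a VM_PFNMAP mapping could revoke the user's PTEs before releasing the backing region. my_revoke_mapping() is a hypothetical name.

	static int my_revoke_mapping(struct vm_area_struct *vma)
	{
		/* The range must lie entirely inside a VM_PFNMAP vma,
		 * otherwise zap_vma_ptes() returns -1. */
		return zap_vma_ptes(vma, vma->vm_start,
				    vma->vm_end - vma->vm_start);
	}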
1454 1454
1455 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1455 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1456 spinlock_t **ptl) 1456 spinlock_t **ptl)
1457 { 1457 {
1458 pgd_t * pgd = pgd_offset(mm, addr); 1458 pgd_t * pgd = pgd_offset(mm, addr);
1459 pud_t * pud = pud_alloc(mm, pgd, addr); 1459 pud_t * pud = pud_alloc(mm, pgd, addr);
1460 if (pud) { 1460 if (pud) {
1461 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1461 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1462 if (pmd) { 1462 if (pmd) {
1463 VM_BUG_ON(pmd_trans_huge(*pmd)); 1463 VM_BUG_ON(pmd_trans_huge(*pmd));
1464 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1464 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1465 } 1465 }
1466 } 1466 }
1467 return NULL; 1467 return NULL;
1468 } 1468 }
1469 1469
1470 /* 1470 /*
1471 * This is the old fallback for page remapping. 1471 * This is the old fallback for page remapping.
1472 * 1472 *
1473 * For historical reasons, it only allows reserved pages. Only 1473 * For historical reasons, it only allows reserved pages. Only
1474 * old drivers should use this, and they needed to mark their 1474 * old drivers should use this, and they needed to mark their
1475 * pages reserved for the old functions anyway. 1475 * pages reserved for the old functions anyway.
1476 */ 1476 */
1477 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 1477 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1478 struct page *page, pgprot_t prot) 1478 struct page *page, pgprot_t prot)
1479 { 1479 {
1480 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1481 int retval; 1481 int retval;
1482 pte_t *pte; 1482 pte_t *pte;
1483 spinlock_t *ptl; 1483 spinlock_t *ptl;
1484 1484
1485 retval = -EINVAL; 1485 retval = -EINVAL;
1486 if (PageAnon(page)) 1486 if (PageAnon(page))
1487 goto out; 1487 goto out;
1488 retval = -ENOMEM; 1488 retval = -ENOMEM;
1489 flush_dcache_page(page); 1489 flush_dcache_page(page);
1490 pte = get_locked_pte(mm, addr, &ptl); 1490 pte = get_locked_pte(mm, addr, &ptl);
1491 if (!pte) 1491 if (!pte)
1492 goto out; 1492 goto out;
1493 retval = -EBUSY; 1493 retval = -EBUSY;
1494 if (!pte_none(*pte)) 1494 if (!pte_none(*pte))
1495 goto out_unlock; 1495 goto out_unlock;
1496 1496
1497 /* Ok, finally just insert the thing.. */ 1497 /* Ok, finally just insert the thing.. */
1498 get_page(page); 1498 get_page(page);
1499 inc_mm_counter_fast(mm, MM_FILEPAGES); 1499 inc_mm_counter_fast(mm, MM_FILEPAGES);
1500 page_add_file_rmap(page); 1500 page_add_file_rmap(page);
1501 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1501 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1502 1502
1503 retval = 0; 1503 retval = 0;
1504 pte_unmap_unlock(pte, ptl); 1504 pte_unmap_unlock(pte, ptl);
1505 return retval; 1505 return retval;
1506 out_unlock: 1506 out_unlock:
1507 pte_unmap_unlock(pte, ptl); 1507 pte_unmap_unlock(pte, ptl);
1508 out: 1508 out:
1509 return retval; 1509 return retval;
1510 } 1510 }
1511 1511
1512 /** 1512 /**
1513 * vm_insert_page - insert single page into user vma 1513 * vm_insert_page - insert single page into user vma
1514 * @vma: user vma to map to 1514 * @vma: user vma to map to
1515 * @addr: target user address of this page 1515 * @addr: target user address of this page
1516 * @page: source kernel page 1516 * @page: source kernel page
1517 * 1517 *
1518 * This allows drivers to insert individual pages they've allocated 1518 * This allows drivers to insert individual pages they've allocated
1519 * into a user vma. 1519 * into a user vma.
1520 * 1520 *
1521 * The page has to be a nice clean _individual_ kernel allocation. 1521 * The page has to be a nice clean _individual_ kernel allocation.
1522 * If you allocate a compound page, you need to have marked it as 1522 * If you allocate a compound page, you need to have marked it as
1523 * such (__GFP_COMP), or manually just split the page up yourself 1523 * such (__GFP_COMP), or manually just split the page up yourself
1524 * (see split_page()). 1524 * (see split_page()).
1525 * 1525 *
1526 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1526 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1527 * took an arbitrary page protection parameter. This doesn't allow 1527 * took an arbitrary page protection parameter. This doesn't allow
1528 * that. Your vma protection will have to be set up correctly, which 1528 * that. Your vma protection will have to be set up correctly, which
1529 * means that if you want a shared writable mapping, you'd better 1529 * means that if you want a shared writable mapping, you'd better
1530 * ask for a shared writable mapping! 1530 * ask for a shared writable mapping!
1531 * 1531 *
1532 * The page does not need to be reserved. 1532 * The page does not need to be reserved.
1533 * 1533 *
1534 * Usually this function is called from f_op->mmap() handler 1534 * Usually this function is called from f_op->mmap() handler
1535 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 1535 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
1536 * Caller must set VM_MIXEDMAP on vma if it wants to call this 1536 * Caller must set VM_MIXEDMAP on vma if it wants to call this
1537 * function from other places, for example from page-fault handler. 1537 * function from other places, for example from page-fault handler.
1538 */ 1538 */
1539 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 1539 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1540 struct page *page) 1540 struct page *page)
1541 { 1541 {
1542 if (addr < vma->vm_start || addr >= vma->vm_end) 1542 if (addr < vma->vm_start || addr >= vma->vm_end)
1543 return -EFAULT; 1543 return -EFAULT;
1544 if (!page_count(page)) 1544 if (!page_count(page))
1545 return -EINVAL; 1545 return -EINVAL;
1546 if (!(vma->vm_flags & VM_MIXEDMAP)) { 1546 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1547 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); 1547 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1548 BUG_ON(vma->vm_flags & VM_PFNMAP); 1548 BUG_ON(vma->vm_flags & VM_PFNMAP);
1549 vma->vm_flags |= VM_MIXEDMAP; 1549 vma->vm_flags |= VM_MIXEDMAP;
1550 } 1550 }
1551 return insert_page(vma, addr, page, vma->vm_page_prot); 1551 return insert_page(vma, addr, page, vma->vm_page_prot);
1552 } 1552 }
1553 EXPORT_SYMBOL(vm_insert_page); 1553 EXPORT_SYMBOL(vm_insert_page);
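A minimal sketch of the call site the comment describes, an f_op->mmap handler running under the mmap_sem write-lock; my_page_mmap() and my_get_backing_page() are hypothetical names, not part of this commit.

	static int my_page_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct page *page = my_get_backing_page(file);	/* assumed helper */

		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;
		/* Under the mmap_sem write-lock, so vm_insert_page() may set
		 * VM_MIXEDMAP on the vma itself, as noted above. */
		return vm_insert_page(vma, vma->vm_start, page);
	}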
1554 1554
1555 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1555 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1556 unsigned long pfn, pgprot_t prot) 1556 unsigned long pfn, pgprot_t prot)
1557 { 1557 {
1558 struct mm_struct *mm = vma->vm_mm; 1558 struct mm_struct *mm = vma->vm_mm;
1559 int retval; 1559 int retval;
1560 pte_t *pte, entry; 1560 pte_t *pte, entry;
1561 spinlock_t *ptl; 1561 spinlock_t *ptl;
1562 1562
1563 retval = -ENOMEM; 1563 retval = -ENOMEM;
1564 pte = get_locked_pte(mm, addr, &ptl); 1564 pte = get_locked_pte(mm, addr, &ptl);
1565 if (!pte) 1565 if (!pte)
1566 goto out; 1566 goto out;
1567 retval = -EBUSY; 1567 retval = -EBUSY;
1568 if (!pte_none(*pte)) 1568 if (!pte_none(*pte))
1569 goto out_unlock; 1569 goto out_unlock;
1570 1570
1571 /* Ok, finally just insert the thing.. */ 1571 /* Ok, finally just insert the thing.. */
1572 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1572 entry = pte_mkspecial(pfn_pte(pfn, prot));
1573 set_pte_at(mm, addr, pte, entry); 1573 set_pte_at(mm, addr, pte, entry);
1574 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1574 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1575 1575
1576 retval = 0; 1576 retval = 0;
1577 out_unlock: 1577 out_unlock:
1578 pte_unmap_unlock(pte, ptl); 1578 pte_unmap_unlock(pte, ptl);
1579 out: 1579 out:
1580 return retval; 1580 return retval;
1581 } 1581 }
1582 1582
1583 /** 1583 /**
1584 * vm_insert_pfn - insert single pfn into user vma 1584 * vm_insert_pfn - insert single pfn into user vma
1585 * @vma: user vma to map to 1585 * @vma: user vma to map to
1586 * @addr: target user address of this page 1586 * @addr: target user address of this page
1587 * @pfn: source kernel pfn 1587 * @pfn: source kernel pfn
1588 * 1588 *
1589 * Similar to vm_insert_page, this allows drivers to insert individual pages 1589 * Similar to vm_insert_page, this allows drivers to insert individual pages
1590 * they've allocated into a user vma. Same comments apply. 1590 * they've allocated into a user vma. Same comments apply.
1591 * 1591 *
1592 * This function should only be called from a vm_ops->fault handler, and 1592 * This function should only be called from a vm_ops->fault handler, and
1593 * in that case the handler should return NULL. 1593 * in that case the handler should return NULL.
1594 * 1594 *
1595 * vma cannot be a COW mapping. 1595 * vma cannot be a COW mapping.
1596 * 1596 *
1597 * As this is called only for pages that do not currently exist, we 1597 * As this is called only for pages that do not currently exist, we
1598 * do not need to flush old virtual caches or the TLB. 1598 * do not need to flush old virtual caches or the TLB.
1599 */ 1599 */
1600 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1600 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1601 unsigned long pfn) 1601 unsigned long pfn)
1602 { 1602 {
1603 int ret; 1603 int ret;
1604 pgprot_t pgprot = vma->vm_page_prot; 1604 pgprot_t pgprot = vma->vm_page_prot;
1605 /* 1605 /*
1606 * Technically, architectures with pte_special can avoid all these 1606 * Technically, architectures with pte_special can avoid all these
1607 * restrictions (same for remap_pfn_range). However we would like 1607 * restrictions (same for remap_pfn_range). However we would like
1608 * consistency in testing and feature parity among all, so we should 1608 * consistency in testing and feature parity among all, so we should
1609 * try to keep these invariants in place for everybody. 1609 * try to keep these invariants in place for everybody.
1610 */ 1610 */
1611 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1611 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1612 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1612 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1613 (VM_PFNMAP|VM_MIXEDMAP)); 1613 (VM_PFNMAP|VM_MIXEDMAP));
1614 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1614 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1615 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 1615 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1616 1616
1617 if (addr < vma->vm_start || addr >= vma->vm_end) 1617 if (addr < vma->vm_start || addr >= vma->vm_end)
1618 return -EFAULT; 1618 return -EFAULT;
1619 if (track_pfn_insert(vma, &pgprot, pfn)) 1619 if (track_pfn_insert(vma, &pgprot, pfn))
1620 return -EINVAL; 1620 return -EINVAL;
1621 1621
1622 ret = insert_pfn(vma, addr, pfn, pgprot); 1622 ret = insert_pfn(vma, addr, pfn, pgprot);
1623 1623
1624 return ret; 1624 return ret;
1625 } 1625 }
1626 EXPORT_SYMBOL(vm_insert_pfn); 1626 EXPORT_SYMBOL(vm_insert_pfn);
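A hedged sketch of the intended call site, a vm_ops->fault handler for a VM_PFNMAP vma; struct my_device and its phys_base field are assumptions made purely for illustration.

	static int my_pfn_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct my_device *my_dev = vma->vm_private_data;
		unsigned long pfn = (my_dev->phys_base >> PAGE_SHIFT) + vmf->pgoff;
		int ret;

		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
		if (ret == -ENOMEM)
			return VM_FAULT_OOM;
		if (ret && ret != -EBUSY)	/* -EBUSY: another thread won the race */
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;		/* pte installed, nothing for the core MM to map */
	}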
1627 1627
1628 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1628 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1629 unsigned long pfn) 1629 unsigned long pfn)
1630 { 1630 {
1631 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1631 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1632 1632
1633 if (addr < vma->vm_start || addr >= vma->vm_end) 1633 if (addr < vma->vm_start || addr >= vma->vm_end)
1634 return -EFAULT; 1634 return -EFAULT;
1635 1635
1636 /* 1636 /*
1637 * If we don't have pte special, then we have to use the pfn_valid() 1637 * If we don't have pte special, then we have to use the pfn_valid()
1638 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1638 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1639 * refcount the page if pfn_valid is true (hence insert_page rather 1639 * refcount the page if pfn_valid is true (hence insert_page rather
1640 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1640 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1641 * without pte special, it would then be refcounted as a normal page. 1641 * without pte special, it would then be refcounted as a normal page.
1642 */ 1642 */
1643 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1643 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1644 struct page *page; 1644 struct page *page;
1645 1645
1646 page = pfn_to_page(pfn); 1646 page = pfn_to_page(pfn);
1647 return insert_page(vma, addr, page, vma->vm_page_prot); 1647 return insert_page(vma, addr, page, vma->vm_page_prot);
1648 } 1648 }
1649 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1649 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1650 } 1650 }
1651 EXPORT_SYMBOL(vm_insert_mixed); 1651 EXPORT_SYMBOL(vm_insert_mixed);
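The same fault-handler pattern works for VM_MIXEDMAP mappings where only some offsets are backed by a struct page; my_lookup_pfn() is a hypothetical helper used only to keep the sketch short.

	static int my_mixed_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		unsigned long pfn = my_lookup_pfn(vma, vmf->pgoff);	/* assumed helper */
		int ret;

		ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
		if (ret && ret != -EBUSY)
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;
	}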
1652 1652
1653 /* 1653 /*
1654 * Maps a range of physical memory into the requested pages. The old 1654 * Maps a range of physical memory into the requested pages. The old
1655 * mappings are removed. Any references to nonexistent pages result 1655 * mappings are removed. Any references to nonexistent pages result
1656 * in null mappings (currently treated as "copy-on-access"). 1656 * in null mappings (currently treated as "copy-on-access").
1657 */ 1657 */
1658 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 1658 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1659 unsigned long addr, unsigned long end, 1659 unsigned long addr, unsigned long end,
1660 unsigned long pfn, pgprot_t prot) 1660 unsigned long pfn, pgprot_t prot)
1661 { 1661 {
1662 pte_t *pte; 1662 pte_t *pte;
1663 spinlock_t *ptl; 1663 spinlock_t *ptl;
1664 1664
1665 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1665 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1666 if (!pte) 1666 if (!pte)
1667 return -ENOMEM; 1667 return -ENOMEM;
1668 arch_enter_lazy_mmu_mode(); 1668 arch_enter_lazy_mmu_mode();
1669 do { 1669 do {
1670 BUG_ON(!pte_none(*pte)); 1670 BUG_ON(!pte_none(*pte));
1671 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 1671 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1672 pfn++; 1672 pfn++;
1673 } while (pte++, addr += PAGE_SIZE, addr != end); 1673 } while (pte++, addr += PAGE_SIZE, addr != end);
1674 arch_leave_lazy_mmu_mode(); 1674 arch_leave_lazy_mmu_mode();
1675 pte_unmap_unlock(pte - 1, ptl); 1675 pte_unmap_unlock(pte - 1, ptl);
1676 return 0; 1676 return 0;
1677 } 1677 }
1678 1678
1679 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 1679 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1680 unsigned long addr, unsigned long end, 1680 unsigned long addr, unsigned long end,
1681 unsigned long pfn, pgprot_t prot) 1681 unsigned long pfn, pgprot_t prot)
1682 { 1682 {
1683 pmd_t *pmd; 1683 pmd_t *pmd;
1684 unsigned long next; 1684 unsigned long next;
1685 1685
1686 pfn -= addr >> PAGE_SHIFT; 1686 pfn -= addr >> PAGE_SHIFT;
1687 pmd = pmd_alloc(mm, pud, addr); 1687 pmd = pmd_alloc(mm, pud, addr);
1688 if (!pmd) 1688 if (!pmd)
1689 return -ENOMEM; 1689 return -ENOMEM;
1690 VM_BUG_ON(pmd_trans_huge(*pmd)); 1690 VM_BUG_ON(pmd_trans_huge(*pmd));
1691 do { 1691 do {
1692 next = pmd_addr_end(addr, end); 1692 next = pmd_addr_end(addr, end);
1693 if (remap_pte_range(mm, pmd, addr, next, 1693 if (remap_pte_range(mm, pmd, addr, next,
1694 pfn + (addr >> PAGE_SHIFT), prot)) 1694 pfn + (addr >> PAGE_SHIFT), prot))
1695 return -ENOMEM; 1695 return -ENOMEM;
1696 } while (pmd++, addr = next, addr != end); 1696 } while (pmd++, addr = next, addr != end);
1697 return 0; 1697 return 0;
1698 } 1698 }
1699 1699
1700 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1700 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1701 unsigned long addr, unsigned long end, 1701 unsigned long addr, unsigned long end,
1702 unsigned long pfn, pgprot_t prot) 1702 unsigned long pfn, pgprot_t prot)
1703 { 1703 {
1704 pud_t *pud; 1704 pud_t *pud;
1705 unsigned long next; 1705 unsigned long next;
1706 1706
1707 pfn -= addr >> PAGE_SHIFT; 1707 pfn -= addr >> PAGE_SHIFT;
1708 pud = pud_alloc(mm, pgd, addr); 1708 pud = pud_alloc(mm, pgd, addr);
1709 if (!pud) 1709 if (!pud)
1710 return -ENOMEM; 1710 return -ENOMEM;
1711 do { 1711 do {
1712 next = pud_addr_end(addr, end); 1712 next = pud_addr_end(addr, end);
1713 if (remap_pmd_range(mm, pud, addr, next, 1713 if (remap_pmd_range(mm, pud, addr, next,
1714 pfn + (addr >> PAGE_SHIFT), prot)) 1714 pfn + (addr >> PAGE_SHIFT), prot))
1715 return -ENOMEM; 1715 return -ENOMEM;
1716 } while (pud++, addr = next, addr != end); 1716 } while (pud++, addr = next, addr != end);
1717 return 0; 1717 return 0;
1718 } 1718 }
1719 1719
1720 /** 1720 /**
1721 * remap_pfn_range - remap kernel memory to userspace 1721 * remap_pfn_range - remap kernel memory to userspace
1722 * @vma: user vma to map to 1722 * @vma: user vma to map to
1723 * @addr: target user address to start at 1723 * @addr: target user address to start at
1724 * @pfn: physical address of kernel memory 1724 * @pfn: physical address of kernel memory
1725 * @size: size of map area 1725 * @size: size of map area
1726 * @prot: page protection flags for this mapping 1726 * @prot: page protection flags for this mapping
1727 * 1727 *
1728 * Note: this is only safe if the mm semaphore is held when called. 1728 * Note: this is only safe if the mm semaphore is held when called.
1729 */ 1729 */
1730 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1730 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1731 unsigned long pfn, unsigned long size, pgprot_t prot) 1731 unsigned long pfn, unsigned long size, pgprot_t prot)
1732 { 1732 {
1733 pgd_t *pgd; 1733 pgd_t *pgd;
1734 unsigned long next; 1734 unsigned long next;
1735 unsigned long end = addr + PAGE_ALIGN(size); 1735 unsigned long end = addr + PAGE_ALIGN(size);
1736 struct mm_struct *mm = vma->vm_mm; 1736 struct mm_struct *mm = vma->vm_mm;
1737 int err; 1737 int err;
1738 1738
1739 /* 1739 /*
1740 * Physically remapped pages are special. Tell the 1740 * Physically remapped pages are special. Tell the
1741 * rest of the world about it: 1741 * rest of the world about it:
1742 * VM_IO tells people not to look at these pages 1742 * VM_IO tells people not to look at these pages
1743 * (accesses can have side effects). 1743 * (accesses can have side effects).
1744 * VM_PFNMAP tells the core MM that the base pages are just 1744 * VM_PFNMAP tells the core MM that the base pages are just
1745 * raw PFN mappings, and do not have a "struct page" associated 1745 * raw PFN mappings, and do not have a "struct page" associated
1746 * with them. 1746 * with them.
1747 * VM_DONTEXPAND 1747 * VM_DONTEXPAND
1748 * Disable vma merging and expanding with mremap(). 1748 * Disable vma merging and expanding with mremap().
1749 * VM_DONTDUMP 1749 * VM_DONTDUMP
1750 * Omit vma from core dump, even when VM_IO turned off. 1750 * Omit vma from core dump, even when VM_IO turned off.
1751 * 1751 *
1752 * There's a horrible special case to handle copy-on-write 1752 * There's a horrible special case to handle copy-on-write
1753 * behaviour that some programs depend on. We mark the "original" 1753 * behaviour that some programs depend on. We mark the "original"
1754 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1754 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1755 * See vm_normal_page() for details. 1755 * See vm_normal_page() for details.
1756 */ 1756 */
1757 if (is_cow_mapping(vma->vm_flags)) { 1757 if (is_cow_mapping(vma->vm_flags)) {
1758 if (addr != vma->vm_start || end != vma->vm_end) 1758 if (addr != vma->vm_start || end != vma->vm_end)
1759 return -EINVAL; 1759 return -EINVAL;
1760 vma->vm_pgoff = pfn; 1760 vma->vm_pgoff = pfn;
1761 } 1761 }
1762 1762
1763 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); 1763 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
1764 if (err) 1764 if (err)
1765 return -EINVAL; 1765 return -EINVAL;
1766 1766
1767 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1767 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1768 1768
1769 BUG_ON(addr >= end); 1769 BUG_ON(addr >= end);
1770 pfn -= addr >> PAGE_SHIFT; 1770 pfn -= addr >> PAGE_SHIFT;
1771 pgd = pgd_offset(mm, addr); 1771 pgd = pgd_offset(mm, addr);
1772 flush_cache_range(vma, addr, end); 1772 flush_cache_range(vma, addr, end);
1773 do { 1773 do {
1774 next = pgd_addr_end(addr, end); 1774 next = pgd_addr_end(addr, end);
1775 err = remap_pud_range(mm, pgd, addr, next, 1775 err = remap_pud_range(mm, pgd, addr, next,
1776 pfn + (addr >> PAGE_SHIFT), prot); 1776 pfn + (addr >> PAGE_SHIFT), prot);
1777 if (err) 1777 if (err)
1778 break; 1778 break;
1779 } while (pgd++, addr = next, addr != end); 1779 } while (pgd++, addr = next, addr != end);
1780 1780
1781 if (err) 1781 if (err)
1782 untrack_pfn(vma, pfn, PAGE_ALIGN(size)); 1782 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
1783 1783
1784 return err; 1784 return err;
1785 } 1785 }
1786 EXPORT_SYMBOL(remap_pfn_range); 1786 EXPORT_SYMBOL(remap_pfn_range);
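A minimal mmap-handler sketch exposing a physically contiguous buffer; struct my_device with buf_phys/buf_size is assumed for illustration and is not part of this commit (offset handling via vm_pgoff is omitted for brevity).

	static int my_buf_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct my_device *my_dev = file->private_data;
		unsigned long len = vma->vm_end - vma->vm_start;

		if (len > my_dev->buf_size)
			return -EINVAL;
		/* Marks the vma VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP. */
		return remap_pfn_range(vma, vma->vm_start,
				       my_dev->buf_phys >> PAGE_SHIFT,
				       len, vma->vm_page_prot);
	}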
1787 1787
1788 /** 1788 /**
1789 * vm_iomap_memory - remap memory to userspace 1789 * vm_iomap_memory - remap memory to userspace
1790 * @vma: user vma to map to 1790 * @vma: user vma to map to
1791 * @start: start of area 1791 * @start: start of area
1792 * @len: size of area 1792 * @len: size of area
1793 * 1793 *
1794 * This is a simplified io_remap_pfn_range() for common driver use. The 1794 * This is a simplified io_remap_pfn_range() for common driver use. The
1795 * driver just needs to give us the physical memory range to be mapped, 1795 * driver just needs to give us the physical memory range to be mapped,
1796 * we'll figure out the rest from the vma information. 1796 * we'll figure out the rest from the vma information.
1797 * 1797 *
1798 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up 1798 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to set up
1799 * write-combining or similar. 1799 * write-combining or similar.
1800 */ 1800 */
1801 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 1801 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1802 { 1802 {
1803 unsigned long vm_len, pfn, pages; 1803 unsigned long vm_len, pfn, pages;
1804 1804
1805 /* Check that the physical memory area passed in looks valid */ 1805 /* Check that the physical memory area passed in looks valid */
1806 if (start + len < start) 1806 if (start + len < start)
1807 return -EINVAL; 1807 return -EINVAL;
1808 /* 1808 /*
1809 * You *really* shouldn't map things that aren't page-aligned, 1809 * You *really* shouldn't map things that aren't page-aligned,
1810 * but we've historically allowed it because IO memory might 1810 * but we've historically allowed it because IO memory might
1811 * just have smaller alignment. 1811 * just have smaller alignment.
1812 */ 1812 */
1813 len += start & ~PAGE_MASK; 1813 len += start & ~PAGE_MASK;
1814 pfn = start >> PAGE_SHIFT; 1814 pfn = start >> PAGE_SHIFT;
1815 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; 1815 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1816 if (pfn + pages < pfn) 1816 if (pfn + pages < pfn)
1817 return -EINVAL; 1817 return -EINVAL;
1818 1818
1819 /* We start the mapping 'vm_pgoff' pages into the area */ 1819 /* We start the mapping 'vm_pgoff' pages into the area */
1820 if (vma->vm_pgoff > pages) 1820 if (vma->vm_pgoff > pages)
1821 return -EINVAL; 1821 return -EINVAL;
1822 pfn += vma->vm_pgoff; 1822 pfn += vma->vm_pgoff;
1823 pages -= vma->vm_pgoff; 1823 pages -= vma->vm_pgoff;
1824 1824
1825 /* Can we fit all of the mapping? */ 1825 /* Can we fit all of the mapping? */
1826 vm_len = vma->vm_end - vma->vm_start; 1826 vm_len = vma->vm_end - vma->vm_start;
1827 if (vm_len >> PAGE_SHIFT > pages) 1827 if (vm_len >> PAGE_SHIFT > pages)
1828 return -EINVAL; 1828 return -EINVAL;
1829 1829
1830 /* Ok, let it rip */ 1830 /* Ok, let it rip */
1831 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 1831 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1832 } 1832 }
1833 EXPORT_SYMBOL(vm_iomap_memory); 1833 EXPORT_SYMBOL(vm_iomap_memory);
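A hedged sketch of the simplified interface: the handler hands over the physical window and lets vm_iomap_memory() derive the pfn, length and offset checks from the vma; my_dev->bar_start and bar_len are assumed fields.

	static int my_io_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct my_device *my_dev = file->private_data;

		/* Optional: uncached access for device registers, per the NOTE above. */
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		return vm_iomap_memory(vma, my_dev->bar_start, my_dev->bar_len);
	}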
1834 1834
1835 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 1835 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1836 unsigned long addr, unsigned long end, 1836 unsigned long addr, unsigned long end,
1837 pte_fn_t fn, void *data) 1837 pte_fn_t fn, void *data)
1838 { 1838 {
1839 pte_t *pte; 1839 pte_t *pte;
1840 int err; 1840 int err;
1841 pgtable_t token; 1841 pgtable_t token;
1842 spinlock_t *uninitialized_var(ptl); 1842 spinlock_t *uninitialized_var(ptl);
1843 1843
1844 pte = (mm == &init_mm) ? 1844 pte = (mm == &init_mm) ?
1845 pte_alloc_kernel(pmd, addr) : 1845 pte_alloc_kernel(pmd, addr) :
1846 pte_alloc_map_lock(mm, pmd, addr, &ptl); 1846 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1847 if (!pte) 1847 if (!pte)
1848 return -ENOMEM; 1848 return -ENOMEM;
1849 1849
1850 BUG_ON(pmd_huge(*pmd)); 1850 BUG_ON(pmd_huge(*pmd));
1851 1851
1852 arch_enter_lazy_mmu_mode(); 1852 arch_enter_lazy_mmu_mode();
1853 1853
1854 token = pmd_pgtable(*pmd); 1854 token = pmd_pgtable(*pmd);
1855 1855
1856 do { 1856 do {
1857 err = fn(pte++, token, addr, data); 1857 err = fn(pte++, token, addr, data);
1858 if (err) 1858 if (err)
1859 break; 1859 break;
1860 } while (addr += PAGE_SIZE, addr != end); 1860 } while (addr += PAGE_SIZE, addr != end);
1861 1861
1862 arch_leave_lazy_mmu_mode(); 1862 arch_leave_lazy_mmu_mode();
1863 1863
1864 if (mm != &init_mm) 1864 if (mm != &init_mm)
1865 pte_unmap_unlock(pte-1, ptl); 1865 pte_unmap_unlock(pte-1, ptl);
1866 return err; 1866 return err;
1867 } 1867 }
1868 1868
1869 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 1869 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1870 unsigned long addr, unsigned long end, 1870 unsigned long addr, unsigned long end,
1871 pte_fn_t fn, void *data) 1871 pte_fn_t fn, void *data)
1872 { 1872 {
1873 pmd_t *pmd; 1873 pmd_t *pmd;
1874 unsigned long next; 1874 unsigned long next;
1875 int err; 1875 int err;
1876 1876
1877 BUG_ON(pud_huge(*pud)); 1877 BUG_ON(pud_huge(*pud));
1878 1878
1879 pmd = pmd_alloc(mm, pud, addr); 1879 pmd = pmd_alloc(mm, pud, addr);
1880 if (!pmd) 1880 if (!pmd)
1881 return -ENOMEM; 1881 return -ENOMEM;
1882 do { 1882 do {
1883 next = pmd_addr_end(addr, end); 1883 next = pmd_addr_end(addr, end);
1884 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 1884 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1885 if (err) 1885 if (err)
1886 break; 1886 break;
1887 } while (pmd++, addr = next, addr != end); 1887 } while (pmd++, addr = next, addr != end);
1888 return err; 1888 return err;
1889 } 1889 }
1890 1890
1891 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 1891 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1892 unsigned long addr, unsigned long end, 1892 unsigned long addr, unsigned long end,
1893 pte_fn_t fn, void *data) 1893 pte_fn_t fn, void *data)
1894 { 1894 {
1895 pud_t *pud; 1895 pud_t *pud;
1896 unsigned long next; 1896 unsigned long next;
1897 int err; 1897 int err;
1898 1898
1899 pud = pud_alloc(mm, pgd, addr); 1899 pud = pud_alloc(mm, pgd, addr);
1900 if (!pud) 1900 if (!pud)
1901 return -ENOMEM; 1901 return -ENOMEM;
1902 do { 1902 do {
1903 next = pud_addr_end(addr, end); 1903 next = pud_addr_end(addr, end);
1904 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 1904 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1905 if (err) 1905 if (err)
1906 break; 1906 break;
1907 } while (pud++, addr = next, addr != end); 1907 } while (pud++, addr = next, addr != end);
1908 return err; 1908 return err;
1909 } 1909 }
1910 1910
1911 /* 1911 /*
1912 * Scan a region of virtual memory, filling in page tables as necessary 1912 * Scan a region of virtual memory, filling in page tables as necessary
1913 * and calling a provided function on each leaf page table. 1913 * and calling a provided function on each leaf page table.
1914 */ 1914 */
1915 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 1915 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1916 unsigned long size, pte_fn_t fn, void *data) 1916 unsigned long size, pte_fn_t fn, void *data)
1917 { 1917 {
1918 pgd_t *pgd; 1918 pgd_t *pgd;
1919 unsigned long next; 1919 unsigned long next;
1920 unsigned long end = addr + size; 1920 unsigned long end = addr + size;
1921 int err; 1921 int err;
1922 1922
1923 BUG_ON(addr >= end); 1923 BUG_ON(addr >= end);
1924 pgd = pgd_offset(mm, addr); 1924 pgd = pgd_offset(mm, addr);
1925 do { 1925 do {
1926 next = pgd_addr_end(addr, end); 1926 next = pgd_addr_end(addr, end);
1927 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 1927 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1928 if (err) 1928 if (err)
1929 break; 1929 break;
1930 } while (pgd++, addr = next, addr != end); 1930 } while (pgd++, addr = next, addr != end);
1931 1931
1932 return err; 1932 return err;
1933 } 1933 }
1934 EXPORT_SYMBOL_GPL(apply_to_page_range); 1934 EXPORT_SYMBOL_GPL(apply_to_page_range);
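A small sketch of the pte_fn_t callback shape (pte, page-table token, address, opaque data); counting present PTEs over a range is purely illustrative and not drawn from this commit.

	static int my_count_present(pte_t *pte, pgtable_t token,
				    unsigned long addr, void *data)
	{
		unsigned long *count = data;

		if (pte_present(*pte))
			(*count)++;
		return 0;	/* a non-zero return stops the walk with that error */
	}

	/* e.g.: unsigned long n = 0;
	 *       apply_to_page_range(mm, addr, size, my_count_present, &n);
	 * Note that missing page tables are allocated along the way. */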
1935 1935
1936 /* 1936 /*
1937 * handle_pte_fault chooses page fault handler according to an entry 1937 * handle_pte_fault chooses page fault handler according to an entry
1938 * which was read non-atomically. Before making any commitment, on 1938 * which was read non-atomically. Before making any commitment, on
1939 * those architectures or configurations (e.g. i386 with PAE) which 1939 * those architectures or configurations (e.g. i386 with PAE) which
1940 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 1940 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
1941 * must check under lock before unmapping the pte and proceeding 1941 * must check under lock before unmapping the pte and proceeding
1942 * (but do_wp_page is only called after already making such a check; 1942 * (but do_wp_page is only called after already making such a check;
1943 * and do_anonymous_page can safely check later on). 1943 * and do_anonymous_page can safely check later on).
1944 */ 1944 */
1945 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 1945 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1946 pte_t *page_table, pte_t orig_pte) 1946 pte_t *page_table, pte_t orig_pte)
1947 { 1947 {
1948 int same = 1; 1948 int same = 1;
1949 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 1949 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1950 if (sizeof(pte_t) > sizeof(unsigned long)) { 1950 if (sizeof(pte_t) > sizeof(unsigned long)) {
1951 spinlock_t *ptl = pte_lockptr(mm, pmd); 1951 spinlock_t *ptl = pte_lockptr(mm, pmd);
1952 spin_lock(ptl); 1952 spin_lock(ptl);
1953 same = pte_same(*page_table, orig_pte); 1953 same = pte_same(*page_table, orig_pte);
1954 spin_unlock(ptl); 1954 spin_unlock(ptl);
1955 } 1955 }
1956 #endif 1956 #endif
1957 pte_unmap(page_table); 1957 pte_unmap(page_table);
1958 return same; 1958 return same;
1959 } 1959 }
1960 1960
1961 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 1961 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1962 { 1962 {
1963 debug_dma_assert_idle(src); 1963 debug_dma_assert_idle(src);
1964 1964
1965 /* 1965 /*
1966 * If the source page was a PFN mapping, we don't have 1966 * If the source page was a PFN mapping, we don't have
1967 * a "struct page" for it. We do a best-effort copy by 1967 * a "struct page" for it. We do a best-effort copy by
1968 * just copying from the original user address. If that 1968 * just copying from the original user address. If that
1969 * fails, we just zero-fill it. Live with it. 1969 * fails, we just zero-fill it. Live with it.
1970 */ 1970 */
1971 if (unlikely(!src)) { 1971 if (unlikely(!src)) {
1972 void *kaddr = kmap_atomic(dst); 1972 void *kaddr = kmap_atomic(dst);
1973 void __user *uaddr = (void __user *)(va & PAGE_MASK); 1973 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1974 1974
1975 /* 1975 /*
1976 * This really shouldn't fail, because the page is there 1976 * This really shouldn't fail, because the page is there
1977 * in the page tables. But it might just be unreadable, 1977 * in the page tables. But it might just be unreadable,
1978 * in which case we just give up and fill the result with 1978 * in which case we just give up and fill the result with
1979 * zeroes. 1979 * zeroes.
1980 */ 1980 */
1981 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1981 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1982 clear_page(kaddr); 1982 clear_page(kaddr);
1983 kunmap_atomic(kaddr); 1983 kunmap_atomic(kaddr);
1984 flush_dcache_page(dst); 1984 flush_dcache_page(dst);
1985 } else 1985 } else
1986 copy_user_highpage(dst, src, va, vma); 1986 copy_user_highpage(dst, src, va, vma);
1987 } 1987 }
1988 1988
1989 /* 1989 /*
1990 * Notify the address space that the page is about to become writable so that 1990 * Notify the address space that the page is about to become writable so that
1991 * it can prohibit this or wait for the page to get into an appropriate state. 1991 * it can prohibit this or wait for the page to get into an appropriate state.
1992 * 1992 *
1993 * We do this without the lock held, so that it can sleep if it needs to. 1993 * We do this without the lock held, so that it can sleep if it needs to.
1994 */ 1994 */
1995 static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, 1995 static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1996 unsigned long address) 1996 unsigned long address)
1997 { 1997 {
1998 struct vm_fault vmf; 1998 struct vm_fault vmf;
1999 int ret; 1999 int ret;
2000 2000
2001 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2001 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2002 vmf.pgoff = page->index; 2002 vmf.pgoff = page->index;
2003 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2003 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2004 vmf.page = page; 2004 vmf.page = page;
2005 2005
2006 ret = vma->vm_ops->page_mkwrite(vma, &vmf); 2006 ret = vma->vm_ops->page_mkwrite(vma, &vmf);
2007 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2007 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2008 return ret; 2008 return ret;
2009 if (unlikely(!(ret & VM_FAULT_LOCKED))) { 2009 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2010 lock_page(page); 2010 lock_page(page);
2011 if (!page->mapping) { 2011 if (!page->mapping) {
2012 unlock_page(page); 2012 unlock_page(page);
2013 return 0; /* retry */ 2013 return 0; /* retry */
2014 } 2014 }
2015 ret |= VM_FAULT_LOCKED; 2015 ret |= VM_FAULT_LOCKED;
2016 } else 2016 } else
2017 VM_BUG_ON_PAGE(!PageLocked(page), page); 2017 VM_BUG_ON_PAGE(!PageLocked(page), page);
2018 return ret; 2018 return ret;
2019 } 2019 }
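For reference, a hedged sketch (loosely modelled on the generic page-cache handler, not taken from this diff) of a ->page_mkwrite implementation that satisfies the contract above: return with the page locked and VM_FAULT_LOCKED set, or report VM_FAULT_NOPAGE when truncation raced in.

	static int my_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != inode->i_mapping) {
			/* Raced with truncate: drop this page and refault. */
			unlock_page(page);
			return VM_FAULT_NOPAGE;
		}
		set_page_dirty(page);
		return VM_FAULT_LOCKED;		/* page stays locked for the caller */
	}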
2020 2020
2021 /* 2021 /*
2022 * This routine handles present pages, when users try to write 2022 * This routine handles present pages, when users try to write
2023 * to a shared page. It is done by copying the page to a new address 2023 * to a shared page. It is done by copying the page to a new address
2024 * and decrementing the shared-page counter for the old page. 2024 * and decrementing the shared-page counter for the old page.
2025 * 2025 *
2026 * Note that this routine assumes that the protection checks have been 2026 * Note that this routine assumes that the protection checks have been
2027 * done by the caller (the low-level page fault routine in most cases). 2027 * done by the caller (the low-level page fault routine in most cases).
2028 * Thus we can safely just mark it writable once we've done any necessary 2028 * Thus we can safely just mark it writable once we've done any necessary
2029 * COW. 2029 * COW.
2030 * 2030 *
2031 * We also mark the page dirty at this point even though the page will 2031 * We also mark the page dirty at this point even though the page will
2032 * change only once the write actually happens. This avoids a few races, 2032 * change only once the write actually happens. This avoids a few races,
2033 * and potentially makes it more efficient. 2033 * and potentially makes it more efficient.
2034 * 2034 *
2035 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2035 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2036 * but allow concurrent faults), with pte both mapped and locked. 2036 * but allow concurrent faults), with pte both mapped and locked.
2037 * We return with mmap_sem still held, but pte unmapped and unlocked. 2037 * We return with mmap_sem still held, but pte unmapped and unlocked.
2038 */ 2038 */
2039 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2040 unsigned long address, pte_t *page_table, pmd_t *pmd, 2040 unsigned long address, pte_t *page_table, pmd_t *pmd,
2041 spinlock_t *ptl, pte_t orig_pte) 2041 spinlock_t *ptl, pte_t orig_pte)
2042 __releases(ptl) 2042 __releases(ptl)
2043 { 2043 {
2044 struct page *old_page, *new_page = NULL; 2044 struct page *old_page, *new_page = NULL;
2045 pte_t entry; 2045 pte_t entry;
2046 int ret = 0; 2046 int ret = 0;
2047 int page_mkwrite = 0; 2047 int page_mkwrite = 0;
2048 struct page *dirty_page = NULL; 2048 struct page *dirty_page = NULL;
2049 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2049 unsigned long mmun_start = 0; /* For mmu_notifiers */
2050 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2050 unsigned long mmun_end = 0; /* For mmu_notifiers */
2051 struct mem_cgroup *memcg; 2051 struct mem_cgroup *memcg;
2052 2052
2053 old_page = vm_normal_page(vma, address, orig_pte); 2053 old_page = vm_normal_page(vma, address, orig_pte);
2054 if (!old_page) { 2054 if (!old_page) {
2055 /* 2055 /*
2056 * VM_MIXEDMAP !pfn_valid() case 2056 * VM_MIXEDMAP !pfn_valid() case
2057 * 2057 *
2058 * We should not cow pages in a shared writeable mapping. 2058 * We should not cow pages in a shared writeable mapping.
2059 * Just mark the pages writable as we can't do any dirty 2059 * Just mark the pages writable as we can't do any dirty
2060 * accounting on raw pfn maps. 2060 * accounting on raw pfn maps.
2061 */ 2061 */
2062 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2062 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2063 (VM_WRITE|VM_SHARED)) 2063 (VM_WRITE|VM_SHARED))
2064 goto reuse; 2064 goto reuse;
2065 goto gotten; 2065 goto gotten;
2066 } 2066 }
2067 2067
2068 /* 2068 /*
2069 * Take out anonymous pages first, anonymous shared vmas are 2069 * Take out anonymous pages first, anonymous shared vmas are
2070 * not dirty accountable. 2070 * not dirty accountable.
2071 */ 2071 */
2072 if (PageAnon(old_page) && !PageKsm(old_page)) { 2072 if (PageAnon(old_page) && !PageKsm(old_page)) {
2073 if (!trylock_page(old_page)) { 2073 if (!trylock_page(old_page)) {
2074 page_cache_get(old_page); 2074 page_cache_get(old_page);
2075 pte_unmap_unlock(page_table, ptl); 2075 pte_unmap_unlock(page_table, ptl);
2076 lock_page(old_page); 2076 lock_page(old_page);
2077 page_table = pte_offset_map_lock(mm, pmd, address, 2077 page_table = pte_offset_map_lock(mm, pmd, address,
2078 &ptl); 2078 &ptl);
2079 if (!pte_same(*page_table, orig_pte)) { 2079 if (!pte_same(*page_table, orig_pte)) {
2080 unlock_page(old_page); 2080 unlock_page(old_page);
2081 goto unlock; 2081 goto unlock;
2082 } 2082 }
2083 page_cache_release(old_page); 2083 page_cache_release(old_page);
2084 } 2084 }
2085 if (reuse_swap_page(old_page)) { 2085 if (reuse_swap_page(old_page)) {
2086 /* 2086 /*
2087 * The page is all ours. Move it to our anon_vma so 2087 * The page is all ours. Move it to our anon_vma so
2088 * the rmap code will not search our parent or siblings. 2088 * the rmap code will not search our parent or siblings.
2089 * Protected against the rmap code by the page lock. 2089 * Protected against the rmap code by the page lock.
2090 */ 2090 */
2091 page_move_anon_rmap(old_page, vma, address); 2091 page_move_anon_rmap(old_page, vma, address);
2092 unlock_page(old_page); 2092 unlock_page(old_page);
2093 goto reuse; 2093 goto reuse;
2094 } 2094 }
2095 unlock_page(old_page); 2095 unlock_page(old_page);
2096 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2096 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2097 (VM_WRITE|VM_SHARED))) { 2097 (VM_WRITE|VM_SHARED))) {
2098 /* 2098 /*
2099 * Only catch write-faults on shared writable pages, 2099 * Only catch write-faults on shared writable pages,
2100 * read-only shared pages can get COWed by 2100 * read-only shared pages can get COWed by
2101 * get_user_pages(.write=1, .force=1). 2101 * get_user_pages(.write=1, .force=1).
2102 */ 2102 */
2103 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2103 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2104 int tmp; 2104 int tmp;
2105 page_cache_get(old_page); 2105 page_cache_get(old_page);
2106 pte_unmap_unlock(page_table, ptl); 2106 pte_unmap_unlock(page_table, ptl);
2107 tmp = do_page_mkwrite(vma, old_page, address); 2107 tmp = do_page_mkwrite(vma, old_page, address);
2108 if (unlikely(!tmp || (tmp & 2108 if (unlikely(!tmp || (tmp &
2109 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2109 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2110 page_cache_release(old_page); 2110 page_cache_release(old_page);
2111 return tmp; 2111 return tmp;
2112 } 2112 }
2113 /* 2113 /*
2114 * Since we dropped the lock we need to revalidate 2114 * Since we dropped the lock we need to revalidate
2115 * the PTE as someone else may have changed it. If 2115 * the PTE as someone else may have changed it. If
2116 * they did, we just return, as we can count on the 2116 * they did, we just return, as we can count on the
2117 * MMU to tell us if they didn't also make it writable. 2117 * MMU to tell us if they didn't also make it writable.
2118 */ 2118 */
2119 page_table = pte_offset_map_lock(mm, pmd, address, 2119 page_table = pte_offset_map_lock(mm, pmd, address,
2120 &ptl); 2120 &ptl);
2121 if (!pte_same(*page_table, orig_pte)) { 2121 if (!pte_same(*page_table, orig_pte)) {
2122 unlock_page(old_page); 2122 unlock_page(old_page);
2123 goto unlock; 2123 goto unlock;
2124 } 2124 }
2125 2125
2126 page_mkwrite = 1; 2126 page_mkwrite = 1;
2127 } 2127 }
2128 dirty_page = old_page; 2128 dirty_page = old_page;
2129 get_page(dirty_page); 2129 get_page(dirty_page);
2130 2130
2131 reuse: 2131 reuse:
2132 /* 2132 /*
2133 * Clear the pages cpupid information as the existing 2133 * Clear the pages cpupid information as the existing
2134 * information potentially belongs to a now completely 2134 * information potentially belongs to a now completely
2135 * unrelated process. 2135 * unrelated process.
2136 */ 2136 */
2137 if (old_page) 2137 if (old_page)
2138 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); 2138 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2139 2139
2140 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2140 flush_cache_page(vma, address, pte_pfn(orig_pte));
2141 entry = pte_mkyoung(orig_pte); 2141 entry = pte_mkyoung(orig_pte);
2142 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2142 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2143 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2143 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2144 update_mmu_cache(vma, address, page_table); 2144 update_mmu_cache(vma, address, page_table);
2145 pte_unmap_unlock(page_table, ptl); 2145 pte_unmap_unlock(page_table, ptl);
2146 ret |= VM_FAULT_WRITE; 2146 ret |= VM_FAULT_WRITE;
2147 2147
2148 if (!dirty_page) 2148 if (!dirty_page)
2149 return ret; 2149 return ret;
2150 2150
2151 /* 2151 /*
2152 * Yes, Virginia, this is actually required to prevent a race 2152 * Yes, Virginia, this is actually required to prevent a race
2153 * with clear_page_dirty_for_io() from clearing the page dirty 2153 * with clear_page_dirty_for_io() from clearing the page dirty
2154 * bit after it clears all dirty ptes, but before a racing 2154 * bit after it clears all dirty ptes, but before a racing
2155 * do_wp_page installs a dirty pte. 2155 * do_wp_page installs a dirty pte.
2156 * 2156 *
2157 * do_shared_fault is protected similarly. 2157 * do_shared_fault is protected similarly.
2158 */ 2158 */
2159 if (!page_mkwrite) { 2159 if (!page_mkwrite) {
2160 wait_on_page_locked(dirty_page); 2160 wait_on_page_locked(dirty_page);
2161 set_page_dirty_balance(dirty_page); 2161 set_page_dirty_balance(dirty_page);
2162 /* file_update_time outside page_lock */ 2162 /* file_update_time outside page_lock */
2163 if (vma->vm_file) 2163 if (vma->vm_file)
2164 file_update_time(vma->vm_file); 2164 file_update_time(vma->vm_file);
2165 } 2165 }
2166 put_page(dirty_page); 2166 put_page(dirty_page);
2167 if (page_mkwrite) { 2167 if (page_mkwrite) {
2168 struct address_space *mapping = dirty_page->mapping; 2168 struct address_space *mapping = dirty_page->mapping;
2169 2169
2170 set_page_dirty(dirty_page); 2170 set_page_dirty(dirty_page);
2171 unlock_page(dirty_page); 2171 unlock_page(dirty_page);
2172 page_cache_release(dirty_page); 2172 page_cache_release(dirty_page);
2173 if (mapping) { 2173 if (mapping) {
2174 /* 2174 /*
2175 * Some device drivers do not set page.mapping 2175 * Some device drivers do not set page.mapping
2176 * but still dirty their pages 2176 * but still dirty their pages
2177 */ 2177 */
2178 balance_dirty_pages_ratelimited(mapping); 2178 balance_dirty_pages_ratelimited(mapping);
2179 } 2179 }
2180 } 2180 }
2181 2181
2182 return ret; 2182 return ret;
2183 } 2183 }
2184 2184
2185 /* 2185 /*
2186 * Ok, we need to copy. Oh, well.. 2186 * Ok, we need to copy. Oh, well..
2187 */ 2187 */
2188 page_cache_get(old_page); 2188 page_cache_get(old_page);
2189 gotten: 2189 gotten:
2190 pte_unmap_unlock(page_table, ptl); 2190 pte_unmap_unlock(page_table, ptl);
2191 2191
2192 if (unlikely(anon_vma_prepare(vma))) 2192 if (unlikely(anon_vma_prepare(vma)))
2193 goto oom; 2193 goto oom;
2194 2194
2195 if (is_zero_pfn(pte_pfn(orig_pte))) { 2195 if (is_zero_pfn(pte_pfn(orig_pte))) {
2196 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2196 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2197 if (!new_page) 2197 if (!new_page)
2198 goto oom; 2198 goto oom;
2199 } else { 2199 } else {
2200 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2200 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2201 if (!new_page) 2201 if (!new_page)
2202 goto oom; 2202 goto oom;
2203 cow_user_page(new_page, old_page, address, vma); 2203 cow_user_page(new_page, old_page, address, vma);
2204 } 2204 }
2205 __SetPageUptodate(new_page); 2205 __SetPageUptodate(new_page);
2206 2206
2207 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2207 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2208 goto oom_free_new; 2208 goto oom_free_new;
2209 2209
2210 mmun_start = address & PAGE_MASK; 2210 mmun_start = address & PAGE_MASK;
2211 mmun_end = mmun_start + PAGE_SIZE; 2211 mmun_end = mmun_start + PAGE_SIZE;
2212 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2212 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2213 2213
2214 /* 2214 /*
2215 * Re-check the pte - we dropped the lock 2215 * Re-check the pte - we dropped the lock
2216 */ 2216 */
2217 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2217 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2218 if (likely(pte_same(*page_table, orig_pte))) { 2218 if (likely(pte_same(*page_table, orig_pte))) {
2219 if (old_page) { 2219 if (old_page) {
2220 if (!PageAnon(old_page)) { 2220 if (!PageAnon(old_page)) {
2221 dec_mm_counter_fast(mm, MM_FILEPAGES); 2221 dec_mm_counter_fast(mm, MM_FILEPAGES);
2222 inc_mm_counter_fast(mm, MM_ANONPAGES); 2222 inc_mm_counter_fast(mm, MM_ANONPAGES);
2223 } 2223 }
2224 } else 2224 } else
2225 inc_mm_counter_fast(mm, MM_ANONPAGES); 2225 inc_mm_counter_fast(mm, MM_ANONPAGES);
2226 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2226 flush_cache_page(vma, address, pte_pfn(orig_pte));
2227 entry = mk_pte(new_page, vma->vm_page_prot); 2227 entry = mk_pte(new_page, vma->vm_page_prot);
2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2229 /* 2229 /*
2230 * Clear the pte entry and flush it first, before updating the 2230 * Clear the pte entry and flush it first, before updating the
2231 * pte with the new entry. This will avoid a race condition 2231 * pte with the new entry. This will avoid a race condition
2232 * seen in the presence of one thread doing SMC and another 2232 * seen in the presence of one thread doing SMC and another
2233 * thread doing COW. 2233 * thread doing COW.
2234 */ 2234 */
2235 ptep_clear_flush(vma, address, page_table); 2235 ptep_clear_flush(vma, address, page_table);
2236 page_add_new_anon_rmap(new_page, vma, address); 2236 page_add_new_anon_rmap(new_page, vma, address);
2237 mem_cgroup_commit_charge(new_page, memcg, false); 2237 mem_cgroup_commit_charge(new_page, memcg, false);
2238 lru_cache_add_active_or_unevictable(new_page, vma); 2238 lru_cache_add_active_or_unevictable(new_page, vma);
2239 /* 2239 /*
2240 * We call the notify macro here because, when using secondary 2240 * We call the notify macro here because, when using secondary
2241 * mmu page tables (such as kvm shadow page tables), we want the 2241 * mmu page tables (such as kvm shadow page tables), we want the
2242 * new page to be mapped directly into the secondary page table. 2242 * new page to be mapped directly into the secondary page table.
2243 */ 2243 */
2244 set_pte_at_notify(mm, address, page_table, entry); 2244 set_pte_at_notify(mm, address, page_table, entry);
2245 update_mmu_cache(vma, address, page_table); 2245 update_mmu_cache(vma, address, page_table);
2246 if (old_page) { 2246 if (old_page) {
2247 /* 2247 /*
2248 * Only after switching the pte to the new page may 2248 * Only after switching the pte to the new page may
2249 * we remove the mapcount here. Otherwise another 2249 * we remove the mapcount here. Otherwise another
2250 * process may come and find the rmap count decremented 2250 * process may come and find the rmap count decremented
2251 * before the pte is switched to the new page, and 2251 * before the pte is switched to the new page, and
2252 * "reuse" the old page writing into it while our pte 2252 * "reuse" the old page writing into it while our pte
2253 * here still points into it and can be read by other 2253 * here still points into it and can be read by other
2254 * threads. 2254 * threads.
2255 * 2255 *
2256 * The critical issue is to order this 2256 * The critical issue is to order this
2257 * page_remove_rmap with the ptep_clear_flush above. 2257 * page_remove_rmap with the ptep_clear_flush above.
2258 * Those stores are ordered by (if nothing else,) 2258 * Those stores are ordered by (if nothing else,)
2259 * the barrier present in the atomic_add_negative 2259 * the barrier present in the atomic_add_negative
2260 * in page_remove_rmap. 2260 * in page_remove_rmap.
2261 * 2261 *
2262 * Then the TLB flush in ptep_clear_flush ensures that 2262 * Then the TLB flush in ptep_clear_flush ensures that
2263 * no process can access the old page before the 2263 * no process can access the old page before the
2264 * decremented mapcount is visible. And the old page 2264 * decremented mapcount is visible. And the old page
2265 * cannot be reused until after the decremented 2265 * cannot be reused until after the decremented
2266 * mapcount is visible. So transitively, TLBs to 2266 * mapcount is visible. So transitively, TLBs to
2267 * old page will be flushed before it can be reused. 2267 * old page will be flushed before it can be reused.
2268 */ 2268 */
2269 page_remove_rmap(old_page); 2269 page_remove_rmap(old_page);
2270 } 2270 }
2271 2271
2272 /* Free the old page.. */ 2272 /* Free the old page.. */
2273 new_page = old_page; 2273 new_page = old_page;
2274 ret |= VM_FAULT_WRITE; 2274 ret |= VM_FAULT_WRITE;
2275 } else 2275 } else
2276 mem_cgroup_cancel_charge(new_page, memcg); 2276 mem_cgroup_cancel_charge(new_page, memcg);
2277 2277
2278 if (new_page) 2278 if (new_page)
2279 page_cache_release(new_page); 2279 page_cache_release(new_page);
2280 unlock: 2280 unlock:
2281 pte_unmap_unlock(page_table, ptl); 2281 pte_unmap_unlock(page_table, ptl);
2282 if (mmun_end > mmun_start) 2282 if (mmun_end > mmun_start)
2283 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2283 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2284 if (old_page) { 2284 if (old_page) {
2285 /* 2285 /*
2286 * Don't let another task, with possibly unlocked vma, 2286 * Don't let another task, with possibly unlocked vma,
2287 * keep the mlocked page. 2287 * keep the mlocked page.
2288 */ 2288 */
2289 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2289 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2290 lock_page(old_page); /* LRU manipulation */ 2290 lock_page(old_page); /* LRU manipulation */
2291 munlock_vma_page(old_page); 2291 munlock_vma_page(old_page);
2292 unlock_page(old_page); 2292 unlock_page(old_page);
2293 } 2293 }
2294 page_cache_release(old_page); 2294 page_cache_release(old_page);
2295 } 2295 }
2296 return ret; 2296 return ret;
2297 oom_free_new: 2297 oom_free_new:
2298 page_cache_release(new_page); 2298 page_cache_release(new_page);
2299 oom: 2299 oom:
2300 if (old_page) 2300 if (old_page)
2301 page_cache_release(old_page); 2301 page_cache_release(old_page);
2302 return VM_FAULT_OOM; 2302 return VM_FAULT_OOM;
2303 } 2303 }
2304 2304
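
The copy path above is the kernel side of ordinary copy-on-write. As a hedged illustration only (a userspace sketch, not kernel code, assuming a Linux/glibc environment), the program below provokes the same kind of write fault: after fork(), a MAP_PRIVATE anonymous page is COW-shared, so the parent's write is resolved by copying and the child keeps seeing the old contents.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* One private anonymous page, COW-shared with the child after fork(). */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        strcpy(p, "old");

        pid_t pid = fork();
        if (pid == 0) {
                sleep(1);                       /* let the parent write first */
                printf("child sees:  %s\n", p); /* still "old": the copy was private */
                _exit(0);
        }
        strcpy(p, "new");                       /* write fault; COW gives the writer a fresh page */
        printf("parent sees: %s\n", p);
        wait(NULL);
        return 0;
}
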
2305 static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2305 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2306 unsigned long start_addr, unsigned long end_addr, 2306 unsigned long start_addr, unsigned long end_addr,
2307 struct zap_details *details) 2307 struct zap_details *details)
2308 { 2308 {
2309 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2309 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2310 } 2310 }
2311 2311
2312 static inline void unmap_mapping_range_tree(struct rb_root *root, 2312 static inline void unmap_mapping_range_tree(struct rb_root *root,
2313 struct zap_details *details) 2313 struct zap_details *details)
2314 { 2314 {
2315 struct vm_area_struct *vma; 2315 struct vm_area_struct *vma;
2316 pgoff_t vba, vea, zba, zea; 2316 pgoff_t vba, vea, zba, zea;
2317 2317
2318 vma_interval_tree_foreach(vma, root, 2318 vma_interval_tree_foreach(vma, root,
2319 details->first_index, details->last_index) { 2319 details->first_index, details->last_index) {
2320 2320
2321 vba = vma->vm_pgoff; 2321 vba = vma->vm_pgoff;
2322 vea = vba + vma_pages(vma) - 1; 2322 vea = vba + vma_pages(vma) - 1;
2323 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2323 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2324 zba = details->first_index; 2324 zba = details->first_index;
2325 if (zba < vba) 2325 if (zba < vba)
2326 zba = vba; 2326 zba = vba;
2327 zea = details->last_index; 2327 zea = details->last_index;
2328 if (zea > vea) 2328 if (zea > vea)
2329 zea = vea; 2329 zea = vea;
2330 2330
2331 unmap_mapping_range_vma(vma, 2331 unmap_mapping_range_vma(vma,
2332 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2332 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2333 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2333 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2334 details); 2334 details);
2335 } 2335 }
2336 } 2336 }
2337 2337
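
To make the index clamping in unmap_mapping_range_tree() above concrete, here is a minimal standalone sketch with made-up numbers (vm_start, vm_pgoff, the zap range and the 4 KB page size are all assumptions for illustration); it clamps the requested file-page range to the pages the VMA actually maps and converts the result back to virtual addresses, mirroring the vba/vea/zba/zea arithmetic.

#include <stdio.h>

#define EX_PAGE_SHIFT 12        /* 4 KB pages assumed for the example */

int main(void)
{
        /* Hypothetical VMA: maps 50 pages of a file starting at file page 100. */
        unsigned long vm_start = 0x700000000000UL;
        unsigned long vm_pgoff = 100, vm_npages = 50;
        /* Hypothetical zap request: file pages 120..200. */
        unsigned long first_index = 120, last_index = 200;

        unsigned long vba = vm_pgoff;                   /* first file page in the VMA */
        unsigned long vea = vba + vm_npages - 1;        /* last file page in the VMA  */
        unsigned long zba = first_index < vba ? vba : first_index;
        unsigned long zea = last_index  > vea ? vea : last_index;

        /* Translate the clamped page range back into virtual addresses. */
        printf("zap vaddr [%#lx, %#lx)\n",
               ((zba - vba) << EX_PAGE_SHIFT) + vm_start,
               ((zea - vba + 1) << EX_PAGE_SHIFT) + vm_start);
        return 0;
}
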
2338 static inline void unmap_mapping_range_list(struct list_head *head, 2338 static inline void unmap_mapping_range_list(struct list_head *head,
2339 struct zap_details *details) 2339 struct zap_details *details)
2340 { 2340 {
2341 struct vm_area_struct *vma; 2341 struct vm_area_struct *vma;
2342 2342
2343 /* 2343 /*
2344 * In nonlinear VMAs there is no correspondence between virtual address 2344 * In nonlinear VMAs there is no correspondence between virtual address
2345 * offset and file offset. So we must perform an exhaustive search 2345 * offset and file offset. So we must perform an exhaustive search
2346 * across *all* the pages in each nonlinear VMA, not just the pages 2346 * across *all* the pages in each nonlinear VMA, not just the pages
2347 * whose virtual address lies outside the file truncation point. 2347 * whose virtual address lies outside the file truncation point.
2348 */ 2348 */
2349 list_for_each_entry(vma, head, shared.nonlinear) { 2349 list_for_each_entry(vma, head, shared.nonlinear) {
2350 details->nonlinear_vma = vma; 2350 details->nonlinear_vma = vma;
2351 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2351 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2352 } 2352 }
2353 } 2353 }
2354 2354
2355 /** 2355 /**
2356 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2356 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2357 * @mapping: the address space containing mmaps to be unmapped. 2357 * @mapping: the address space containing mmaps to be unmapped.
2358 * @holebegin: byte in first page to unmap, relative to the start of 2358 * @holebegin: byte in first page to unmap, relative to the start of
2359 * the underlying file. This will be rounded down to a PAGE_SIZE 2359 * the underlying file. This will be rounded down to a PAGE_SIZE
2360 * boundary. Note that this is different from truncate_pagecache(), which 2360 * boundary. Note that this is different from truncate_pagecache(), which
2361 * must keep the partial page. In contrast, we must get rid of 2361 * must keep the partial page. In contrast, we must get rid of
2362 * partial pages. 2362 * partial pages.
2363 * @holelen: size of prospective hole in bytes. This will be rounded 2363 * @holelen: size of prospective hole in bytes. This will be rounded
2364 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2364 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
2365 * end of the file. 2365 * end of the file.
2366 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2366 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2367 * but 0 when invalidating pagecache, don't throw away private data. 2367 * but 0 when invalidating pagecache, don't throw away private data.
2368 */ 2368 */
2369 void unmap_mapping_range(struct address_space *mapping, 2369 void unmap_mapping_range(struct address_space *mapping,
2370 loff_t const holebegin, loff_t const holelen, int even_cows) 2370 loff_t const holebegin, loff_t const holelen, int even_cows)
2371 { 2371 {
2372 struct zap_details details; 2372 struct zap_details details;
2373 pgoff_t hba = holebegin >> PAGE_SHIFT; 2373 pgoff_t hba = holebegin >> PAGE_SHIFT;
2374 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2374 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2375 2375
2376 /* Check for overflow. */ 2376 /* Check for overflow. */
2377 if (sizeof(holelen) > sizeof(hlen)) { 2377 if (sizeof(holelen) > sizeof(hlen)) {
2378 long long holeend = 2378 long long holeend =
2379 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2379 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2380 if (holeend & ~(long long)ULONG_MAX) 2380 if (holeend & ~(long long)ULONG_MAX)
2381 hlen = ULONG_MAX - hba + 1; 2381 hlen = ULONG_MAX - hba + 1;
2382 } 2382 }
2383 2383
2384 details.check_mapping = even_cows? NULL: mapping; 2384 details.check_mapping = even_cows? NULL: mapping;
2385 details.nonlinear_vma = NULL; 2385 details.nonlinear_vma = NULL;
2386 details.first_index = hba; 2386 details.first_index = hba;
2387 details.last_index = hba + hlen - 1; 2387 details.last_index = hba + hlen - 1;
2388 if (details.last_index < details.first_index) 2388 if (details.last_index < details.first_index)
2389 details.last_index = ULONG_MAX; 2389 details.last_index = ULONG_MAX;
2390 2390
2391 2391
2392 mutex_lock(&mapping->i_mmap_mutex); 2392 mutex_lock(&mapping->i_mmap_mutex);
2393 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2393 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2394 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2394 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2395 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2395 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2396 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2396 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2397 mutex_unlock(&mapping->i_mmap_mutex); 2397 mutex_unlock(&mapping->i_mmap_mutex);
2398 } 2398 }
2399 EXPORT_SYMBOL(unmap_mapping_range); 2399 EXPORT_SYMBOL(unmap_mapping_range);
2400 2400
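
A small worked example of the holebegin/holelen conversion performed above may help; the sketch below (userspace, with the 4 KB page size and the byte values assumed for illustration) rounds the start down and the length up to page granularity and applies the same "wrap means unmap to the end" fixup as the last_index check.

#include <stdio.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)    /* 4 KB pages assumed */

int main(void)
{
        long long holebegin = 5000, holelen = 10000;    /* hypothetical byte range */

        unsigned long hba  = holebegin >> EX_PAGE_SHIFT;                    /* round start down */
        unsigned long hlen = (holelen + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT; /* round length up  */
        unsigned long first = hba, last = hba + hlen - 1;

        if (last < first)       /* holelen == 0 or overflow: unmap to end of file */
                last = ~0UL;

        printf("file pages [%lu, %lu]\n", first, last); /* [1, 3] for these numbers */
        return 0;
}
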
2401 /* 2401 /*
2402 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2402 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2403 * but allow concurrent faults), and pte mapped but not yet locked. 2403 * but allow concurrent faults), and pte mapped but not yet locked.
2404 * We return with pte unmapped and unlocked. 2404 * We return with pte unmapped and unlocked.
2405 * 2405 *
2406 * We return with the mmap_sem locked or unlocked in the same cases 2406 * We return with the mmap_sem locked or unlocked in the same cases
2407 * as does filemap_fault(). 2407 * as does filemap_fault().
2408 */ 2408 */
2409 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2409 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2410 unsigned long address, pte_t *page_table, pmd_t *pmd, 2410 unsigned long address, pte_t *page_table, pmd_t *pmd,
2411 unsigned int flags, pte_t orig_pte) 2411 unsigned int flags, pte_t orig_pte)
2412 { 2412 {
2413 spinlock_t *ptl; 2413 spinlock_t *ptl;
2414 struct page *page, *swapcache; 2414 struct page *page, *swapcache;
2415 struct mem_cgroup *memcg; 2415 struct mem_cgroup *memcg;
2416 swp_entry_t entry; 2416 swp_entry_t entry;
2417 pte_t pte; 2417 pte_t pte;
2418 int locked; 2418 int locked;
2419 int exclusive = 0; 2419 int exclusive = 0;
2420 int ret = 0; 2420 int ret = 0;
2421 2421
2422 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2422 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2423 goto out; 2423 goto out;
2424 2424
2425 entry = pte_to_swp_entry(orig_pte); 2425 entry = pte_to_swp_entry(orig_pte);
2426 if (unlikely(non_swap_entry(entry))) { 2426 if (unlikely(non_swap_entry(entry))) {
2427 if (is_migration_entry(entry)) { 2427 if (is_migration_entry(entry)) {
2428 migration_entry_wait(mm, pmd, address); 2428 migration_entry_wait(mm, pmd, address);
2429 } else if (is_hwpoison_entry(entry)) { 2429 } else if (is_hwpoison_entry(entry)) {
2430 ret = VM_FAULT_HWPOISON; 2430 ret = VM_FAULT_HWPOISON;
2431 } else { 2431 } else {
2432 print_bad_pte(vma, address, orig_pte, NULL); 2432 print_bad_pte(vma, address, orig_pte, NULL);
2433 ret = VM_FAULT_SIGBUS; 2433 ret = VM_FAULT_SIGBUS;
2434 } 2434 }
2435 goto out; 2435 goto out;
2436 } 2436 }
2437 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2437 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2438 page = lookup_swap_cache(entry); 2438 page = lookup_swap_cache(entry);
2439 if (!page) { 2439 if (!page) {
2440 page = swapin_readahead(entry, 2440 page = swapin_readahead(entry,
2441 GFP_HIGHUSER_MOVABLE, vma, address); 2441 GFP_HIGHUSER_MOVABLE, vma, address);
2442 if (!page) { 2442 if (!page) {
2443 /* 2443 /*
2444 * Back out if somebody else faulted in this pte 2444 * Back out if somebody else faulted in this pte
2445 * while we released the pte lock. 2445 * while we released the pte lock.
2446 */ 2446 */
2447 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2447 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2448 if (likely(pte_same(*page_table, orig_pte))) 2448 if (likely(pte_same(*page_table, orig_pte)))
2449 ret = VM_FAULT_OOM; 2449 ret = VM_FAULT_OOM;
2450 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2450 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2451 goto unlock; 2451 goto unlock;
2452 } 2452 }
2453 2453
2454 /* Had to read the page from swap area: Major fault */ 2454 /* Had to read the page from swap area: Major fault */
2455 ret = VM_FAULT_MAJOR; 2455 ret = VM_FAULT_MAJOR;
2456 count_vm_event(PGMAJFAULT); 2456 count_vm_event(PGMAJFAULT);
2457 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 2457 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2458 } else if (PageHWPoison(page)) { 2458 } else if (PageHWPoison(page)) {
2459 /* 2459 /*
2460 * hwpoisoned dirty swapcache pages are kept for killing 2460 * hwpoisoned dirty swapcache pages are kept for killing
2461 * owner processes (which may be unknown at hwpoison time) 2461 * owner processes (which may be unknown at hwpoison time)
2462 */ 2462 */
2463 ret = VM_FAULT_HWPOISON; 2463 ret = VM_FAULT_HWPOISON;
2464 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2464 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2465 swapcache = page; 2465 swapcache = page;
2466 goto out_release; 2466 goto out_release;
2467 } 2467 }
2468 2468
2469 swapcache = page; 2469 swapcache = page;
2470 locked = lock_page_or_retry(page, mm, flags); 2470 locked = lock_page_or_retry(page, mm, flags);
2471 2471
2472 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2472 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2473 if (!locked) { 2473 if (!locked) {
2474 ret |= VM_FAULT_RETRY; 2474 ret |= VM_FAULT_RETRY;
2475 goto out_release; 2475 goto out_release;
2476 } 2476 }
2477 2477
2478 /* 2478 /*
2479 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2479 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
2480 * release the swapcache from under us. The page pin, and pte_same 2480 * release the swapcache from under us. The page pin, and pte_same
2481 * test below, are not enough to exclude that. Even if it is still 2481 * test below, are not enough to exclude that. Even if it is still
2482 * swapcache, we need to check that the page's swap has not changed. 2482 * swapcache, we need to check that the page's swap has not changed.
2483 */ 2483 */
2484 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2484 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2485 goto out_page; 2485 goto out_page;
2486 2486
2487 page = ksm_might_need_to_copy(page, vma, address); 2487 page = ksm_might_need_to_copy(page, vma, address);
2488 if (unlikely(!page)) { 2488 if (unlikely(!page)) {
2489 ret = VM_FAULT_OOM; 2489 ret = VM_FAULT_OOM;
2490 page = swapcache; 2490 page = swapcache;
2491 goto out_page; 2491 goto out_page;
2492 } 2492 }
2493 2493
2494 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { 2494 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
2495 ret = VM_FAULT_OOM; 2495 ret = VM_FAULT_OOM;
2496 goto out_page; 2496 goto out_page;
2497 } 2497 }
2498 2498
2499 /* 2499 /*
2500 * Back out if somebody else already faulted in this pte. 2500 * Back out if somebody else already faulted in this pte.
2501 */ 2501 */
2502 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2502 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2503 if (unlikely(!pte_same(*page_table, orig_pte))) 2503 if (unlikely(!pte_same(*page_table, orig_pte)))
2504 goto out_nomap; 2504 goto out_nomap;
2505 2505
2506 if (unlikely(!PageUptodate(page))) { 2506 if (unlikely(!PageUptodate(page))) {
2507 ret = VM_FAULT_SIGBUS; 2507 ret = VM_FAULT_SIGBUS;
2508 goto out_nomap; 2508 goto out_nomap;
2509 } 2509 }
2510 2510
2511 /* 2511 /*
2512 * The page isn't present yet, go ahead with the fault. 2512 * The page isn't present yet, go ahead with the fault.
2513 * 2513 *
2514 * Be careful about the sequence of operations here. 2514 * Be careful about the sequence of operations here.
2515 * To get its accounting right, reuse_swap_page() must be called 2515 * To get its accounting right, reuse_swap_page() must be called
2516 * while the page is counted on swap but not yet in mapcount i.e. 2516 * while the page is counted on swap but not yet in mapcount i.e.
2517 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 2517 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2518 * must be called after the swap_free(), or it will never succeed. 2518 * must be called after the swap_free(), or it will never succeed.
2519 */ 2519 */
2520 2520
2521 inc_mm_counter_fast(mm, MM_ANONPAGES); 2521 inc_mm_counter_fast(mm, MM_ANONPAGES);
2522 dec_mm_counter_fast(mm, MM_SWAPENTS); 2522 dec_mm_counter_fast(mm, MM_SWAPENTS);
2523 pte = mk_pte(page, vma->vm_page_prot); 2523 pte = mk_pte(page, vma->vm_page_prot);
2524 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2524 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2525 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2525 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2526 flags &= ~FAULT_FLAG_WRITE; 2526 flags &= ~FAULT_FLAG_WRITE;
2527 ret |= VM_FAULT_WRITE; 2527 ret |= VM_FAULT_WRITE;
2528 exclusive = 1; 2528 exclusive = 1;
2529 } 2529 }
2530 flush_icache_page(vma, page); 2530 flush_icache_page(vma, page);
2531 if (pte_swp_soft_dirty(orig_pte)) 2531 if (pte_swp_soft_dirty(orig_pte))
2532 pte = pte_mksoft_dirty(pte); 2532 pte = pte_mksoft_dirty(pte);
2533 set_pte_at(mm, address, page_table, pte); 2533 set_pte_at(mm, address, page_table, pte);
2534 if (page == swapcache) { 2534 if (page == swapcache) {
2535 do_page_add_anon_rmap(page, vma, address, exclusive); 2535 do_page_add_anon_rmap(page, vma, address, exclusive);
2536 mem_cgroup_commit_charge(page, memcg, true); 2536 mem_cgroup_commit_charge(page, memcg, true);
2537 } else { /* ksm created a completely new copy */ 2537 } else { /* ksm created a completely new copy */
2538 page_add_new_anon_rmap(page, vma, address); 2538 page_add_new_anon_rmap(page, vma, address);
2539 mem_cgroup_commit_charge(page, memcg, false); 2539 mem_cgroup_commit_charge(page, memcg, false);
2540 lru_cache_add_active_or_unevictable(page, vma); 2540 lru_cache_add_active_or_unevictable(page, vma);
2541 } 2541 }
2542 2542
2543 swap_free(entry); 2543 swap_free(entry);
2544 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2544 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2545 try_to_free_swap(page); 2545 try_to_free_swap(page);
2546 unlock_page(page); 2546 unlock_page(page);
2547 if (page != swapcache) { 2547 if (page != swapcache) {
2548 /* 2548 /*
2549 * Hold the lock to prevent the swap entry from being reused 2549 * Hold the lock to prevent the swap entry from being reused
2550 * until we take the PT lock for the pte_same() check 2550 * until we take the PT lock for the pte_same() check
2551 * (to avoid false positives from pte_same). For 2551 * (to avoid false positives from pte_same). For
2552 * further safety release the lock after the swap_free 2552 * further safety release the lock after the swap_free
2553 * so that the swap count won't change under a 2553 * so that the swap count won't change under a
2554 * parallel locked swapcache. 2554 * parallel locked swapcache.
2555 */ 2555 */
2556 unlock_page(swapcache); 2556 unlock_page(swapcache);
2557 page_cache_release(swapcache); 2557 page_cache_release(swapcache);
2558 } 2558 }
2559 2559
2560 if (flags & FAULT_FLAG_WRITE) { 2560 if (flags & FAULT_FLAG_WRITE) {
2561 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2561 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2562 if (ret & VM_FAULT_ERROR) 2562 if (ret & VM_FAULT_ERROR)
2563 ret &= VM_FAULT_ERROR; 2563 ret &= VM_FAULT_ERROR;
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 /* No need to invalidate - it was non-present before */ 2567 /* No need to invalidate - it was non-present before */
2568 update_mmu_cache(vma, address, page_table); 2568 update_mmu_cache(vma, address, page_table);
2569 unlock: 2569 unlock:
2570 pte_unmap_unlock(page_table, ptl); 2570 pte_unmap_unlock(page_table, ptl);
2571 out: 2571 out:
2572 return ret; 2572 return ret;
2573 out_nomap: 2573 out_nomap:
2574 mem_cgroup_cancel_charge(page, memcg); 2574 mem_cgroup_cancel_charge(page, memcg);
2575 pte_unmap_unlock(page_table, ptl); 2575 pte_unmap_unlock(page_table, ptl);
2576 out_page: 2576 out_page:
2577 unlock_page(page); 2577 unlock_page(page);
2578 out_release: 2578 out_release:
2579 page_cache_release(page); 2579 page_cache_release(page);
2580 if (page != swapcache) { 2580 if (page != swapcache) {
2581 unlock_page(swapcache); 2581 unlock_page(swapcache);
2582 page_cache_release(swapcache); 2582 page_cache_release(swapcache);
2583 } 2583 }
2584 return ret; 2584 return ret;
2585 } 2585 }
2586 2586
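
The VM_FAULT_MAJOR / PGMAJFAULT accounting in do_swap_page() above is what ultimately shows up in a task's major-fault counters. As a loosely related userspace illustration (not part of this commit), those counters can be read back with getrusage():

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rusage ru;

        if (getrusage(RUSAGE_SELF, &ru))
                return 1;
        /* ru_majflt counts faults that required I/O, e.g. swap-in or cold page cache. */
        printf("minor faults: %ld, major faults: %ld\n",
               ru.ru_minflt, ru.ru_majflt);
        return 0;
}
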
2587 /* 2587 /*
2588 * This is like a special single-page "expand_{down|up}wards()", 2588 * This is like a special single-page "expand_{down|up}wards()",
2589 * except we must first make sure that 'address{-|+}PAGE_SIZE' 2589 * except we must first make sure that 'address{-|+}PAGE_SIZE'
2590 * doesn't hit another vma. 2590 * doesn't hit another vma.
2591 */ 2591 */
2592 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) 2592 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2593 { 2593 {
2594 address &= PAGE_MASK; 2594 address &= PAGE_MASK;
2595 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { 2595 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2596 struct vm_area_struct *prev = vma->vm_prev; 2596 struct vm_area_struct *prev = vma->vm_prev;
2597 2597
2598 /* 2598 /*
2599 * Is there a mapping abutting this one below? 2599 * Is there a mapping abutting this one below?
2600 * 2600 *
2601 * That's only ok if it's the same stack mapping 2601 * That's only ok if it's the same stack mapping
2602 * that has gotten split.. 2602 * that has gotten split..
2603 */ 2603 */
2604 if (prev && prev->vm_end == address) 2604 if (prev && prev->vm_end == address)
2605 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 2605 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2606 2606
2607 expand_downwards(vma, address - PAGE_SIZE); 2607 expand_downwards(vma, address - PAGE_SIZE);
2608 } 2608 }
2609 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 2609 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2610 struct vm_area_struct *next = vma->vm_next; 2610 struct vm_area_struct *next = vma->vm_next;
2611 2611
2612 /* As VM_GROWSDOWN but s/below/above/ */ 2612 /* As VM_GROWSDOWN but s/below/above/ */
2613 if (next && next->vm_start == address + PAGE_SIZE) 2613 if (next && next->vm_start == address + PAGE_SIZE)
2614 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 2614 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2615 2615
2616 expand_upwards(vma, address + PAGE_SIZE); 2616 expand_upwards(vma, address + PAGE_SIZE);
2617 } 2617 }
2618 return 0; 2618 return 0;
2619 } 2619 }
2620 2620
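
For readers unfamiliar with the guard-page check above, the standalone sketch below re-states the VM_GROWSDOWN branch with a toy structure; struct toy_vma, toy_check_guard() and the 4 KB page mask are invented for illustration and are not kernel interfaces. A fault on the first page of a growsdown stack may only expand the stack if the mapping immediately below is itself a (split) stack mapping.

#include <stdio.h>

/* Toy stand-ins, invented for illustration only. */
struct toy_vma {
        unsigned long vm_start, vm_end;
        int growsdown;                          /* plays the role of VM_GROWSDOWN */
        struct toy_vma *vm_prev;
};

static int toy_check_guard(struct toy_vma *vma, unsigned long address)
{
        address &= ~0xfffUL;                    /* PAGE_MASK with 4 KB pages assumed */
        if (vma->growsdown && address == vma->vm_start) {
                struct toy_vma *prev = vma->vm_prev;

                /* A mapping abutting from below is only OK if it is stack too. */
                if (prev && prev->vm_end == address)
                        return prev->growsdown ? 0 : -1;  /* -ENOMEM in the kernel */
                /* Otherwise the kernel would expand the stack down by one page. */
        }
        return 0;
}

int main(void)
{
        struct toy_vma below = { 0x1000, 0x2000, 0, NULL };
        struct toy_vma stack = { 0x2000, 0x9000, 1, &below };

        printf("fault at stack base -> %d\n", toy_check_guard(&stack, 0x2000));
        return 0;
}
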
2621 /* 2621 /*
2622 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2622 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2623 * but allow concurrent faults), and pte mapped but not yet locked. 2623 * but allow concurrent faults), and pte mapped but not yet locked.
2624 * We return with mmap_sem still held, but pte unmapped and unlocked. 2624 * We return with mmap_sem still held, but pte unmapped and unlocked.
2625 */ 2625 */
2626 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2626 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2627 unsigned long address, pte_t *page_table, pmd_t *pmd, 2627 unsigned long address, pte_t *page_table, pmd_t *pmd,
2628 unsigned int flags) 2628 unsigned int flags)
2629 { 2629 {
2630 struct mem_cgroup *memcg; 2630 struct mem_cgroup *memcg;
2631 struct page *page; 2631 struct page *page;
2632 spinlock_t *ptl; 2632 spinlock_t *ptl;
2633 pte_t entry; 2633 pte_t entry;
2634 2634
2635 pte_unmap(page_table); 2635 pte_unmap(page_table);
2636 2636
2637 /* Check if we need to add a guard page to the stack */ 2637 /* Check if we need to add a guard page to the stack */
2638 if (check_stack_guard_page(vma, address) < 0) 2638 if (check_stack_guard_page(vma, address) < 0)
2639 return VM_FAULT_SIGBUS; 2639 return VM_FAULT_SIGBUS;
2640 2640
2641 /* Use the zero-page for reads */ 2641 /* Use the zero-page for reads */
2642 if (!(flags & FAULT_FLAG_WRITE)) { 2642 if (!(flags & FAULT_FLAG_WRITE)) {
2643 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2643 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2644 vma->vm_page_prot)); 2644 vma->vm_page_prot));
2645 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2645 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2646 if (!pte_none(*page_table)) 2646 if (!pte_none(*page_table))
2647 goto unlock; 2647 goto unlock;
2648 goto setpte; 2648 goto setpte;
2649 } 2649 }
2650 2650
2651 /* Allocate our own private page. */ 2651 /* Allocate our own private page. */
2652 if (unlikely(anon_vma_prepare(vma))) 2652 if (unlikely(anon_vma_prepare(vma)))
2653 goto oom; 2653 goto oom;
2654 page = alloc_zeroed_user_highpage_movable(vma, address); 2654 page = alloc_zeroed_user_highpage_movable(vma, address);
2655 if (!page) 2655 if (!page)
2656 goto oom; 2656 goto oom;
2657 /* 2657 /*
2658 * The memory barrier inside __SetPageUptodate makes sure that 2658 * The memory barrier inside __SetPageUptodate makes sure that
2659 * preceding stores to the page contents become visible before 2659 * preceding stores to the page contents become visible before
2660 * the set_pte_at() write. 2660 * the set_pte_at() write.
2661 */ 2661 */
2662 __SetPageUptodate(page); 2662 __SetPageUptodate(page);
2663 2663
2664 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) 2664 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2665 goto oom_free_page; 2665 goto oom_free_page;
2666 2666
2667 entry = mk_pte(page, vma->vm_page_prot); 2667 entry = mk_pte(page, vma->vm_page_prot);
2668 if (vma->vm_flags & VM_WRITE) 2668 if (vma->vm_flags & VM_WRITE)
2669 entry = pte_mkwrite(pte_mkdirty(entry)); 2669 entry = pte_mkwrite(pte_mkdirty(entry));
2670 2670
2671 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2671 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2672 if (!pte_none(*page_table)) 2672 if (!pte_none(*page_table))
2673 goto release; 2673 goto release;
2674 2674
2675 inc_mm_counter_fast(mm, MM_ANONPAGES); 2675 inc_mm_counter_fast(mm, MM_ANONPAGES);
2676 page_add_new_anon_rmap(page, vma, address); 2676 page_add_new_anon_rmap(page, vma, address);
2677 mem_cgroup_commit_charge(page, memcg, false); 2677 mem_cgroup_commit_charge(page, memcg, false);
2678 lru_cache_add_active_or_unevictable(page, vma); 2678 lru_cache_add_active_or_unevictable(page, vma);
2679 setpte: 2679 setpte:
2680 set_pte_at(mm, address, page_table, entry); 2680 set_pte_at(mm, address, page_table, entry);
2681 2681
2682 /* No need to invalidate - it was non-present before */ 2682 /* No need to invalidate - it was non-present before */
2683 update_mmu_cache(vma, address, page_table); 2683 update_mmu_cache(vma, address, page_table);
2684 unlock: 2684 unlock:
2685 pte_unmap_unlock(page_table, ptl); 2685 pte_unmap_unlock(page_table, ptl);
2686 return 0; 2686 return 0;
2687 release: 2687 release:
2688 mem_cgroup_cancel_charge(page, memcg); 2688 mem_cgroup_cancel_charge(page, memcg);
2689 page_cache_release(page); 2689 page_cache_release(page);
2690 goto unlock; 2690 goto unlock;
2691 oom_free_page: 2691 oom_free_page:
2692 page_cache_release(page); 2692 page_cache_release(page);
2693 oom: 2693 oom:
2694 return VM_FAULT_OOM; 2694 return VM_FAULT_OOM;
2695 } 2695 }
2696 2696
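
The two branches of do_anonymous_page() are visible from userspace. The hedged sketch below (behaviour, not implementation) reads an untouched anonymous page first, which is served by the shared zero page and returns 0, and then writes to it, which forces a private zeroed page to be installed:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        printf("before write: %u\n", p[0]); /* read fault: zero page mapped, prints 0 */
        p[0] = 42;                          /* write fault: a private zeroed page replaces it */
        printf("after write:  %u\n", p[0]);
        munmap(p, 4096);
        return 0;
}
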
2697 /* 2697 /*
2698 * The mmap_sem must have been held on entry, and may have been 2698 * The mmap_sem must have been held on entry, and may have been
2699 * released depending on flags and vma->vm_ops->fault() return value. 2699 * released depending on flags and vma->vm_ops->fault() return value.
2700 * See filemap_fault() and __lock_page_retry(). 2700 * See filemap_fault() and __lock_page_retry().
2701 */ 2701 */
2702 static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2702 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2703 pgoff_t pgoff, unsigned int flags, struct page **page) 2703 pgoff_t pgoff, unsigned int flags, struct page **page)
2704 { 2704 {
2705 struct vm_fault vmf; 2705 struct vm_fault vmf;
2706 int ret; 2706 int ret;
2707 2707
2708 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2708 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2709 vmf.pgoff = pgoff; 2709 vmf.pgoff = pgoff;
2710 vmf.flags = flags; 2710 vmf.flags = flags;
2711 vmf.page = NULL; 2711 vmf.page = NULL;
2712 2712
2713 ret = vma->vm_ops->fault(vma, &vmf); 2713 ret = vma->vm_ops->fault(vma, &vmf);
2714 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2714 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2715 return ret; 2715 return ret;
2716 2716
2717 if (unlikely(PageHWPoison(vmf.page))) { 2717 if (unlikely(PageHWPoison(vmf.page))) {
2718 if (ret & VM_FAULT_LOCKED) 2718 if (ret & VM_FAULT_LOCKED)
2719 unlock_page(vmf.page); 2719 unlock_page(vmf.page);
2720 page_cache_release(vmf.page); 2720 page_cache_release(vmf.page);
2721 return VM_FAULT_HWPOISON; 2721 return VM_FAULT_HWPOISON;
2722 } 2722 }
2723 2723
2724 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2724 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2725 lock_page(vmf.page); 2725 lock_page(vmf.page);
2726 else 2726 else
2727 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); 2727 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
2728 2728
2729 *page = vmf.page; 2729 *page = vmf.page;
2730 return ret; 2730 return ret;
2731 } 2731 }
2732 2732
2733 /** 2733 /**
2734 * do_set_pte - setup new PTE entry for given page and add reverse page mapping. 2734 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
2735 * 2735 *
2736 * @vma: virtual memory area 2736 * @vma: virtual memory area
2737 * @address: user virtual address 2737 * @address: user virtual address
2738 * @page: page to map 2738 * @page: page to map
2739 * @pte: pointer to target page table entry 2739 * @pte: pointer to target page table entry
2740 * @write: true, if new entry is writable 2740 * @write: true, if new entry is writable
2741 * @anon: true, if it's anonymous page 2741 * @anon: true, if it's anonymous page
2742 * 2742 *
2743 * Caller must hold page table lock relevant for @pte. 2743 * Caller must hold page table lock relevant for @pte.
2744 * 2744 *
2745 * Target users are page handler itself and implementations of 2745 * Target users are page handler itself and implementations of
2746 * vm_ops->map_pages. 2746 * vm_ops->map_pages.
2747 */ 2747 */
2748 void do_set_pte(struct vm_area_struct *vma, unsigned long address, 2748 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2749 struct page *page, pte_t *pte, bool write, bool anon) 2749 struct page *page, pte_t *pte, bool write, bool anon)
2750 { 2750 {
2751 pte_t entry; 2751 pte_t entry;
2752 2752
2753 flush_icache_page(vma, page); 2753 flush_icache_page(vma, page);
2754 entry = mk_pte(page, vma->vm_page_prot); 2754 entry = mk_pte(page, vma->vm_page_prot);
2755 if (write) 2755 if (write)
2756 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2756 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2757 else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) 2757 else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
2758 entry = pte_mksoft_dirty(entry); 2758 entry = pte_mksoft_dirty(entry);
2759 if (anon) { 2759 if (anon) {
2760 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2760 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2761 page_add_new_anon_rmap(page, vma, address); 2761 page_add_new_anon_rmap(page, vma, address);
2762 } else { 2762 } else {
2763 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); 2763 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
2764 page_add_file_rmap(page); 2764 page_add_file_rmap(page);
2765 } 2765 }
2766 set_pte_at(vma->vm_mm, address, pte, entry); 2766 set_pte_at(vma->vm_mm, address, pte, entry);
2767 2767
2768 /* no need to invalidate: a not-present page won't be cached */ 2768 /* no need to invalidate: a not-present page won't be cached */
2769 update_mmu_cache(vma, address, pte); 2769 update_mmu_cache(vma, address, pte);
2770 } 2770 }
2771 2771
2772 static unsigned long fault_around_bytes __read_mostly = 2772 static unsigned long fault_around_bytes __read_mostly =
2773 rounddown_pow_of_two(65536); 2773 rounddown_pow_of_two(65536);
2774 2774
2775 #ifdef CONFIG_DEBUG_FS 2775 #ifdef CONFIG_DEBUG_FS
2776 static int fault_around_bytes_get(void *data, u64 *val) 2776 static int fault_around_bytes_get(void *data, u64 *val)
2777 { 2777 {
2778 *val = fault_around_bytes; 2778 *val = fault_around_bytes;
2779 return 0; 2779 return 0;
2780 } 2780 }
2781 2781
2782 /* 2782 /*
2783 * fault_around_pages() and fault_around_mask() expect fault_around_bytes 2783 * fault_around_pages() and fault_around_mask() expect fault_around_bytes
2784 * rounded down to the nearest page order. It's what do_fault_around() expects to 2784 * rounded down to the nearest page order. It's what do_fault_around() expects to
2785 * see. 2785 * see.
2786 */ 2786 */
2787 static int fault_around_bytes_set(void *data, u64 val) 2787 static int fault_around_bytes_set(void *data, u64 val)
2788 { 2788 {
2789 if (val / PAGE_SIZE > PTRS_PER_PTE) 2789 if (val / PAGE_SIZE > PTRS_PER_PTE)
2790 return -EINVAL; 2790 return -EINVAL;
2791 if (val > PAGE_SIZE) 2791 if (val > PAGE_SIZE)
2792 fault_around_bytes = rounddown_pow_of_two(val); 2792 fault_around_bytes = rounddown_pow_of_two(val);
2793 else 2793 else
2794 fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ 2794 fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
2795 return 0; 2795 return 0;
2796 } 2796 }
2797 DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, 2797 DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
2798 fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); 2798 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
2799 2799
2800 static int __init fault_around_debugfs(void) 2800 static int __init fault_around_debugfs(void)
2801 { 2801 {
2802 void *ret; 2802 void *ret;
2803 2803
2804 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, 2804 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
2805 &fault_around_bytes_fops); 2805 &fault_around_bytes_fops);
2806 if (!ret) 2806 if (!ret)
2807 pr_warn("Failed to create fault_around_bytes in debugfs\n"); 2807 pr_warn("Failed to create fault_around_bytes in debugfs\n");
2808 return 0; 2808 return 0;
2809 } 2809 }
2810 late_initcall(fault_around_debugfs); 2810 late_initcall(fault_around_debugfs);
2811 #endif 2811 #endif
2812 2812
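
The clamping performed by fault_around_bytes_set() above can be sketched in isolation. In the example below, rounddown_pow2(), set_fault_around_bytes() and the 4 KB / 512-entry constants are local stand-ins for the kernel's rounddown_pow_of_two(), PAGE_SIZE and PTRS_PER_PTE, written only to show the rounding behaviour:

#include <stdio.h>

#define EX_PAGE_SIZE    4096ULL         /* stand-in for PAGE_SIZE */
#define EX_PTRS_PER_PTE 512ULL          /* stand-in for PTRS_PER_PTE */

/* Local helper for illustration; the kernel uses rounddown_pow_of_two(). */
static unsigned long long rounddown_pow2(unsigned long long v)
{
        unsigned long long r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

static long long set_fault_around_bytes(unsigned long long val)
{
        if (val / EX_PAGE_SIZE > EX_PTRS_PER_PTE)
                return -1;                      /* -EINVAL in the kernel */
        if (val > EX_PAGE_SIZE)
                return rounddown_pow2(val);
        return EX_PAGE_SIZE;                    /* rounddown_pow2(0) is undefined */
}

int main(void)
{
        printf("%lld\n", set_fault_around_bytes(65536));  /* 65536 */
        printf("%lld\n", set_fault_around_bytes(100000)); /* 65536 */
        printf("%lld\n", set_fault_around_bytes(100));    /* 4096  */
        return 0;
}
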
2813 /* 2813 /*
2814 * do_fault_around() tries to map a few pages around the fault address. The hope 2814 * do_fault_around() tries to map a few pages around the fault address. The hope
2815 * is that the pages will be needed soon and this will lower the number of 2815 * is that the pages will be needed soon and this will lower the number of
2816 * faults to handle. 2816 * faults to handle.
2817 * 2817 *
2818 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's 2818 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
2819 * not ready to be mapped: not up-to-date, locked, etc. 2819 * not ready to be mapped: not up-to-date, locked, etc.
2820 * 2820 *
2821 * This function is called with the page table lock taken. In the split ptlock 2821 * This function is called with the page table lock taken. In the split ptlock
2822 * case the page table lock protects only those entries which belong to 2822 * case the page table lock protects only those entries which belong to
2823 * the page table corresponding to the fault address. 2823 * the page table corresponding to the fault address.
2824 * 2824 *
2825 * This function doesn't cross the VMA boundaries, in order to call map_pages() 2825 * This function doesn't cross the VMA boundaries, in order to call map_pages()
2826 * only once. 2826 * only once.
2827 * 2827 *
2828 * fault_around_pages() defines how many pages we'll try to map. 2828 * fault_around_pages() defines how many pages we'll try to map.
2829 * do_fault_around() expects it to return a power of two less than or equal to 2829 * do_fault_around() expects it to return a power of two less than or equal to
2830 * PTRS_PER_PTE. 2830 * PTRS_PER_PTE.
2831 * 2831 *
2832 * The virtual address of the area that we map is naturally aligned to the 2832 * The virtual address of the area that we map is naturally aligned to the
2833 * fault_around_pages() value (and therefore to page order). This way it's 2833 * fault_around_pages() value (and therefore to page order). This way it's
2834 * easier to guarantee that we don't cross page table boundaries. 2834 * easier to guarantee that we don't cross page table boundaries.
2835 */ 2835 */
2836 static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2836 static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2837 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2837 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2838 { 2838 {
2839 unsigned long start_addr, nr_pages, mask; 2839 unsigned long start_addr, nr_pages, mask;
2840 pgoff_t max_pgoff; 2840 pgoff_t max_pgoff;
2841 struct vm_fault vmf; 2841 struct vm_fault vmf;
2842 int off; 2842 int off;
2843 2843
2844 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2844 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2845 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2845 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2846 2846
2847 start_addr = max(address & mask, vma->vm_start); 2847 start_addr = max(address & mask, vma->vm_start);
2848 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2848 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2849 pte -= off; 2849 pte -= off;
2850 pgoff -= off; 2850 pgoff -= off;
2851 2851
2852 /* 2852 /*
2853 * max_pgoff is either the end of the page table or the end of the vma 2853 * max_pgoff is either the end of the page table or the end of the vma
2854 * or fault_around_pages() from pgoff, depending on what is nearest. 2854 * or fault_around_pages() from pgoff, depending on what is nearest.
2855 */ 2855 */
2856 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2856 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2857 PTRS_PER_PTE - 1; 2857 PTRS_PER_PTE - 1;
2858 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 2858 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
2859 pgoff + nr_pages - 1); 2859 pgoff + nr_pages - 1);
2860 2860
2861 /* Check if it makes any sense to call ->map_pages */ 2861 /* Check if it makes any sense to call ->map_pages */
2862 while (!pte_none(*pte)) { 2862 while (!pte_none(*pte)) {
2863 if (++pgoff > max_pgoff) 2863 if (++pgoff > max_pgoff)
2864 return; 2864 return;
2865 start_addr += PAGE_SIZE; 2865 start_addr += PAGE_SIZE;
2866 if (start_addr >= vma->vm_end) 2866 if (start_addr >= vma->vm_end)
2867 return; 2867 return;
2868 pte++; 2868 pte++;
2869 } 2869 }
2870 2870
2871 vmf.virtual_address = (void __user *) start_addr; 2871 vmf.virtual_address = (void __user *) start_addr;
2872 vmf.pte = pte; 2872 vmf.pte = pte;
2873 vmf.pgoff = pgoff; 2873 vmf.pgoff = pgoff;
2874 vmf.max_pgoff = max_pgoff; 2874 vmf.max_pgoff = max_pgoff;
2875 vmf.flags = flags; 2875 vmf.flags = flags;
2876 vma->vm_ops->map_pages(vma, &vmf); 2876 vma->vm_ops->map_pages(vma, &vmf);
2877 } 2877 }
2878 2878
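
Because the alignment arithmetic in do_fault_around() is easy to misread, here is a worked example with made-up numbers (4 KB pages, 512 PTEs per page table, a hypothetical VMA at 0x400000); it reproduces the start_addr/off/max_pgoff computation and shows how the window is clipped to the page table, the VMA and fault_around_bytes:

#include <stdio.h>

#define EX_PAGE_SHIFT   12
#define EX_PAGE_SIZE    (1UL << EX_PAGE_SHIFT)
#define EX_PTRS_PER_PTE 512UL

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;

        return m < c ? m : c;
}

int main(void)
{
        unsigned long nr_pages = 65536 >> EX_PAGE_SHIFT;        /* default window: 16 pages */
        unsigned long mask = ~(nr_pages * EX_PAGE_SIZE - 1);
        unsigned long vm_start = 0x400000UL, vm_pgoff = 0, vm_npages = 64;
        unsigned long address = 0x40b000UL;                     /* hypothetical fault address */
        unsigned long pgoff = (address - vm_start) >> EX_PAGE_SHIFT;

        unsigned long start_addr = address & mask;              /* align down to the window... */
        if (start_addr < vm_start)
                start_addr = vm_start;                          /* ...but never below vm_start */
        unsigned long off = ((address - start_addr) >> EX_PAGE_SHIFT) &
                            (EX_PTRS_PER_PTE - 1);
        pgoff -= off;                                           /* pgoff of the window start */

        unsigned long max_pgoff = pgoff -
                ((start_addr >> EX_PAGE_SHIFT) & (EX_PTRS_PER_PTE - 1)) +
                EX_PTRS_PER_PTE - 1;                            /* end of this page table */
        max_pgoff = min3ul(max_pgoff, vm_npages + vm_pgoff - 1, /* end of the VMA */
                           pgoff + nr_pages - 1);               /* end of the window */

        printf("start %#lx, pgoff %lu, max_pgoff %lu\n",
               start_addr, pgoff, max_pgoff);
        return 0;
}
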
2879 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2879 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2880 unsigned long address, pmd_t *pmd, 2880 unsigned long address, pmd_t *pmd,
2881 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2881 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2882 { 2882 {
2883 struct page *fault_page; 2883 struct page *fault_page;
2884 spinlock_t *ptl; 2884 spinlock_t *ptl;
2885 pte_t *pte; 2885 pte_t *pte;
2886 int ret = 0; 2886 int ret = 0;
2887 2887
2888 /* 2888 /*
2889 * Let's call ->map_pages() first and use ->fault() as a fallback 2889 * Let's call ->map_pages() first and use ->fault() as a fallback
2890 * if the page at that offset is not ready to be mapped (cold cache or 2890 * if the page at that offset is not ready to be mapped (cold cache or
2891 * something). 2891 * something).
2892 */ 2892 */
2893 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && 2893 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
2894 fault_around_bytes >> PAGE_SHIFT > 1) { 2894 fault_around_bytes >> PAGE_SHIFT > 1) {
2895 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2895 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2896 do_fault_around(vma, address, pte, pgoff, flags); 2896 do_fault_around(vma, address, pte, pgoff, flags);
2897 if (!pte_same(*pte, orig_pte)) 2897 if (!pte_same(*pte, orig_pte))
2898 goto unlock_out; 2898 goto unlock_out;
2899 pte_unmap_unlock(pte, ptl); 2899 pte_unmap_unlock(pte, ptl);
2900 } 2900 }
2901 2901
2902 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2902 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2903 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2903 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2904 return ret; 2904 return ret;
2905 2905
2906 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2906 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2907 if (unlikely(!pte_same(*pte, orig_pte))) { 2907 if (unlikely(!pte_same(*pte, orig_pte))) {
2908 pte_unmap_unlock(pte, ptl); 2908 pte_unmap_unlock(pte, ptl);
2909 unlock_page(fault_page); 2909 unlock_page(fault_page);
2910 page_cache_release(fault_page); 2910 page_cache_release(fault_page);
2911 return ret; 2911 return ret;
2912 } 2912 }
2913 do_set_pte(vma, address, fault_page, pte, false, false); 2913 do_set_pte(vma, address, fault_page, pte, false, false);
2914 unlock_page(fault_page); 2914 unlock_page(fault_page);
2915 unlock_out: 2915 unlock_out:
2916 pte_unmap_unlock(pte, ptl); 2916 pte_unmap_unlock(pte, ptl);
2917 return ret; 2917 return ret;
2918 } 2918 }
2919 2919
2920 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2920 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2921 unsigned long address, pmd_t *pmd, 2921 unsigned long address, pmd_t *pmd,
2922 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2922 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2923 { 2923 {
2924 struct page *fault_page, *new_page; 2924 struct page *fault_page, *new_page;
2925 struct mem_cgroup *memcg; 2925 struct mem_cgroup *memcg;
2926 spinlock_t *ptl; 2926 spinlock_t *ptl;
2927 pte_t *pte; 2927 pte_t *pte;
2928 int ret; 2928 int ret;
2929 2929
2930 if (unlikely(anon_vma_prepare(vma))) 2930 if (unlikely(anon_vma_prepare(vma)))
2931 return VM_FAULT_OOM; 2931 return VM_FAULT_OOM;
2932 2932
2933 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2933 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2934 if (!new_page) 2934 if (!new_page)
2935 return VM_FAULT_OOM; 2935 return VM_FAULT_OOM;
2936 2936
2937 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { 2937 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
2938 page_cache_release(new_page); 2938 page_cache_release(new_page);
2939 return VM_FAULT_OOM; 2939 return VM_FAULT_OOM;
2940 } 2940 }
2941 2941
2942 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2942 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2943 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2943 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2944 goto uncharge_out; 2944 goto uncharge_out;
2945 2945
2946 copy_user_highpage(new_page, fault_page, address, vma); 2946 copy_user_highpage(new_page, fault_page, address, vma);
2947 __SetPageUptodate(new_page); 2947 __SetPageUptodate(new_page);
2948 2948
2949 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2949 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2950 if (unlikely(!pte_same(*pte, orig_pte))) { 2950 if (unlikely(!pte_same(*pte, orig_pte))) {
2951 pte_unmap_unlock(pte, ptl); 2951 pte_unmap_unlock(pte, ptl);
2952 unlock_page(fault_page); 2952 unlock_page(fault_page);
2953 page_cache_release(fault_page); 2953 page_cache_release(fault_page);
2954 goto uncharge_out; 2954 goto uncharge_out;
2955 } 2955 }
2956 do_set_pte(vma, address, new_page, pte, true, true); 2956 do_set_pte(vma, address, new_page, pte, true, true);
2957 mem_cgroup_commit_charge(new_page, memcg, false); 2957 mem_cgroup_commit_charge(new_page, memcg, false);
2958 lru_cache_add_active_or_unevictable(new_page, vma); 2958 lru_cache_add_active_or_unevictable(new_page, vma);
2959 pte_unmap_unlock(pte, ptl); 2959 pte_unmap_unlock(pte, ptl);
2960 unlock_page(fault_page); 2960 unlock_page(fault_page);
2961 page_cache_release(fault_page); 2961 page_cache_release(fault_page);
2962 return ret; 2962 return ret;
2963 uncharge_out: 2963 uncharge_out:
2964 mem_cgroup_cancel_charge(new_page, memcg); 2964 mem_cgroup_cancel_charge(new_page, memcg);
2965 page_cache_release(new_page); 2965 page_cache_release(new_page);
2966 return ret; 2966 return ret;
2967 } 2967 }
2968 2968
2969 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2969 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2970 unsigned long address, pmd_t *pmd, 2970 unsigned long address, pmd_t *pmd,
2971 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2971 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2972 { 2972 {
2973 struct page *fault_page; 2973 struct page *fault_page;
2974 struct address_space *mapping; 2974 struct address_space *mapping;
2975 spinlock_t *ptl; 2975 spinlock_t *ptl;
2976 pte_t *pte; 2976 pte_t *pte;
2977 int dirtied = 0; 2977 int dirtied = 0;
2978 int ret, tmp; 2978 int ret, tmp;
2979 2979
2980 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 2980 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
2981 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 2981 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2982 return ret; 2982 return ret;
2983 2983
2984 /* 2984 /*
2985 * Check if the backing address space wants to know that the page is 2985 * Check if the backing address space wants to know that the page is
2986 * about to become writable 2986 * about to become writable
2987 */ 2987 */
2988 if (vma->vm_ops->page_mkwrite) { 2988 if (vma->vm_ops->page_mkwrite) {
2989 unlock_page(fault_page); 2989 unlock_page(fault_page);
2990 tmp = do_page_mkwrite(vma, fault_page, address); 2990 tmp = do_page_mkwrite(vma, fault_page, address);
2991 if (unlikely(!tmp || 2991 if (unlikely(!tmp ||
2992 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2992 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2993 page_cache_release(fault_page); 2993 page_cache_release(fault_page);
2994 return tmp; 2994 return tmp;
2995 } 2995 }
2996 } 2996 }
2997 2997
2998 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2998 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2999 if (unlikely(!pte_same(*pte, orig_pte))) { 2999 if (unlikely(!pte_same(*pte, orig_pte))) {
3000 pte_unmap_unlock(pte, ptl); 3000 pte_unmap_unlock(pte, ptl);
3001 unlock_page(fault_page); 3001 unlock_page(fault_page);
3002 page_cache_release(fault_page); 3002 page_cache_release(fault_page);
3003 return ret; 3003 return ret;
3004 } 3004 }
3005 do_set_pte(vma, address, fault_page, pte, true, false); 3005 do_set_pte(vma, address, fault_page, pte, true, false);
3006 pte_unmap_unlock(pte, ptl); 3006 pte_unmap_unlock(pte, ptl);
3007 3007
3008 if (set_page_dirty(fault_page)) 3008 if (set_page_dirty(fault_page))
3009 dirtied = 1; 3009 dirtied = 1;
3010 mapping = fault_page->mapping; 3010 mapping = fault_page->mapping;
3011 unlock_page(fault_page); 3011 unlock_page(fault_page);
3012 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { 3012 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3013 /* 3013 /*
3014 * Some device drivers do not set page.mapping but still 3014 * Some device drivers do not set page.mapping but still
3015 * dirty their pages 3015 * dirty their pages
3016 */ 3016 */
3017 balance_dirty_pages_ratelimited(mapping); 3017 balance_dirty_pages_ratelimited(mapping);
3018 } 3018 }
3019 3019
3020 /* file_update_time outside page_lock */ 3020 /* file_update_time outside page_lock */
3021 if (vma->vm_file && !vma->vm_ops->page_mkwrite) 3021 if (vma->vm_file && !vma->vm_ops->page_mkwrite)
3022 file_update_time(vma->vm_file); 3022 file_update_time(vma->vm_file);
3023 3023
3024 return ret; 3024 return ret;
3025 } 3025 }
3026 3026
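
do_shared_fault() above is the path taken when a shared file mapping is written. As a rough userspace illustration (the file path is hypothetical and error handling is trimmed), the sketch below dirties a MAP_SHARED page and flushes it with msync(), exercising the page_mkwrite and dirty-balancing machinery handled here:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/shared-fault-demo", O_RDWR | O_CREAT, 0600);

        if (fd < 0 || ftruncate(fd, 4096))
                return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        strcpy(p, "written through a shared mapping\n"); /* write fault on a shared page */
        msync(p, 4096, MS_SYNC);                         /* flush the dirtied page to the file */
        munmap(p, 4096);
        close(fd);
        return 0;
}
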
3027 /* 3027 /*
3028 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3028 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3029 * but allow concurrent faults). 3029 * but allow concurrent faults).
3030 * The mmap_sem may have been released depending on flags and our 3030 * The mmap_sem may have been released depending on flags and our
3031 * return value. See filemap_fault() and __lock_page_or_retry(). 3031 * return value. See filemap_fault() and __lock_page_or_retry().
3032 */ 3032 */
3033 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3033 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3034 unsigned long address, pte_t *page_table, pmd_t *pmd, 3034 unsigned long address, pte_t *page_table, pmd_t *pmd,
3035 unsigned int flags, pte_t orig_pte) 3035 unsigned int flags, pte_t orig_pte)
3036 { 3036 {
3037 pgoff_t pgoff = (((address & PAGE_MASK) 3037 pgoff_t pgoff = (((address & PAGE_MASK)
3038 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3038 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3039 3039
3040 pte_unmap(page_table); 3040 pte_unmap(page_table);
3041 if (!(flags & FAULT_FLAG_WRITE)) 3041 if (!(flags & FAULT_FLAG_WRITE))
3042 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3042 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3043 orig_pte); 3043 orig_pte);
3044 if (!(vma->vm_flags & VM_SHARED)) 3044 if (!(vma->vm_flags & VM_SHARED))
3045 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3045 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3046 orig_pte); 3046 orig_pte);
3047 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3047 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3048 } 3048 }
3049 3049
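
The pgoff expression at the top of do_linear_fault() simply converts a faulting virtual address into a file page index. A tiny sketch with assumed numbers (4 KB pages, a hypothetical mapping start and vm_pgoff):

#include <stdio.h>

int main(void)
{
        unsigned long vm_start = 0x7f0000000000UL;  /* hypothetical mapping start     */
        unsigned long vm_pgoff = 16;                /* mapping begins at file page 16 */
        unsigned long address  = 0x7f0000003abcUL;  /* hypothetical faulting address  */

        /* 4 KB pages assumed: page offset within the VMA plus the VMA's file offset. */
        unsigned long pgoff = (((address & ~0xfffUL) - vm_start) >> 12) + vm_pgoff;

        printf("file page index: %lu\n", pgoff);    /* 3 + 16 = 19 */
        return 0;
}
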
3050 /* 3050 /*
3051 * Fault of a previously existing named mapping. Repopulate the pte 3051 * Fault of a previously existing named mapping. Repopulate the pte
3052 * from the encoded file_pte if possible. This enables swappable 3052 * from the encoded file_pte if possible. This enables swappable
3053 * nonlinear vmas. 3053 * nonlinear vmas.
3054 * 3054 *
3055 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3055 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3056 * but allow concurrent faults), and pte mapped but not yet locked. 3056 * but allow concurrent faults), and pte mapped but not yet locked.
3057 * We return with pte unmapped and unlocked. 3057 * We return with pte unmapped and unlocked.
3058 * The mmap_sem may have been released depending on flags and our 3058 * The mmap_sem may have been released depending on flags and our
3059 * return value. See filemap_fault() and __lock_page_or_retry(). 3059 * return value. See filemap_fault() and __lock_page_or_retry().
3060 */ 3060 */
3061 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3061 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3062 unsigned long address, pte_t *page_table, pmd_t *pmd, 3062 unsigned long address, pte_t *page_table, pmd_t *pmd,
3063 unsigned int flags, pte_t orig_pte) 3063 unsigned int flags, pte_t orig_pte)
3064 { 3064 {
3065 pgoff_t pgoff; 3065 pgoff_t pgoff;
3066 3066
3067 flags |= FAULT_FLAG_NONLINEAR; 3067 flags |= FAULT_FLAG_NONLINEAR;
3068 3068
3069 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3069 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3070 return 0; 3070 return 0;
3071 3071
3072 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 3072 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3073 /* 3073 /*
3074 * Page table corrupted: show pte and kill process. 3074 * Page table corrupted: show pte and kill process.
3075 */ 3075 */
3076 print_bad_pte(vma, address, orig_pte, NULL); 3076 print_bad_pte(vma, address, orig_pte, NULL);
3077 return VM_FAULT_SIGBUS; 3077 return VM_FAULT_SIGBUS;
3078 } 3078 }
3079 3079
3080 pgoff = pte_to_pgoff(orig_pte); 3080 pgoff = pte_to_pgoff(orig_pte);
3081 if (!(flags & FAULT_FLAG_WRITE)) 3081 if (!(flags & FAULT_FLAG_WRITE))
3082 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3082 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3083 orig_pte); 3083 orig_pte);
3084 if (!(vma->vm_flags & VM_SHARED)) 3084 if (!(vma->vm_flags & VM_SHARED))
3085 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3085 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3086 orig_pte); 3086 orig_pte);
3087 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3087 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3088 } 3088 }
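
The nonlinear path above services mappings rewired with remap_file_pages(). The sketch below shows, assuming a pre-existing file at a made-up path that is at least four pages long, how such a VM_NONLINEAR mapping is typically created from user space (the syscall has since been deprecated); the write at the end is the kind of access that may later reach do_nonlinear_fault().

/* User-space sketch: creating the nonlinear mapping that do_nonlinear_fault()
 * later services.  The path and sizes are invented for the example. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/example-data", O_RDWR);	/* hypothetical file, >= 4 pages long */
	if (fd < 0)
		return 1;

	/* A shared mapping of four file pages, initially linear. */
	char *base = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;

	/* Rewire the first virtual page to file page 3: the vma becomes
	 * VM_NONLINEAR and the file offset is encoded in the pte, which is
	 * what pte_to_pgoff() recovers at fault time. */
	if (remap_file_pages(base, page, 0, 3, 0) != 0)
		return 1;

	base[0] = 'x';		/* access that may fault through the nonlinear path */

	munmap(base, 4 * page);
	close(fd);
	return 0;
}
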
3089 3089
3090 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3090 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3091 unsigned long addr, int page_nid, 3091 unsigned long addr, int page_nid,
3092 int *flags) 3092 int *flags)
3093 { 3093 {
3094 get_page(page); 3094 get_page(page);
3095 3095
3096 count_vm_numa_event(NUMA_HINT_FAULTS); 3096 count_vm_numa_event(NUMA_HINT_FAULTS);
3097 if (page_nid == numa_node_id()) { 3097 if (page_nid == numa_node_id()) {
3098 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3098 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3099 *flags |= TNF_FAULT_LOCAL; 3099 *flags |= TNF_FAULT_LOCAL;
3100 } 3100 }
3101 3101
3102 return mpol_misplaced(page, vma, addr); 3102 return mpol_misplaced(page, vma, addr);
3103 } 3103 }
3104 3104
3105 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3105 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3106 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) 3106 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3107 { 3107 {
3108 struct page *page = NULL; 3108 struct page *page = NULL;
3109 spinlock_t *ptl; 3109 spinlock_t *ptl;
3110 int page_nid = -1; 3110 int page_nid = -1;
3111 int last_cpupid; 3111 int last_cpupid;
3112 int target_nid; 3112 int target_nid;
3113 bool migrated = false; 3113 bool migrated = false;
3114 int flags = 0; 3114 int flags = 0;
3115 3115
3116 /* 3116 /*
3117 * The "pte" at this point cannot be used safely without 3117 * The "pte" at this point cannot be used safely without
3118 * validation through pte_unmap_same(). It's of NUMA type but 3118 * validation through pte_unmap_same(). It's of NUMA type but
3119 * the pfn may be screwed if the read is non atomic. 3119 * the pfn may be screwed if the read is non atomic.
3120 * 3120 *
3121 * ptep_modify_prot_start is not called as this is clearing 3121 * ptep_modify_prot_start is not called as this is clearing
3122 * the _PAGE_NUMA bit and it is not really expected that there 3122 * the _PAGE_NUMA bit and it is not really expected that there
3123 * would be concurrent hardware modifications to the PTE. 3123 * would be concurrent hardware modifications to the PTE.
3124 */ 3124 */
3125 ptl = pte_lockptr(mm, pmd); 3125 ptl = pte_lockptr(mm, pmd);
3126 spin_lock(ptl); 3126 spin_lock(ptl);
3127 if (unlikely(!pte_same(*ptep, pte))) { 3127 if (unlikely(!pte_same(*ptep, pte))) {
3128 pte_unmap_unlock(ptep, ptl); 3128 pte_unmap_unlock(ptep, ptl);
3129 goto out; 3129 goto out;
3130 } 3130 }
3131 3131
3132 pte = pte_mknonnuma(pte); 3132 pte = pte_mknonnuma(pte);
3133 set_pte_at(mm, addr, ptep, pte); 3133 set_pte_at(mm, addr, ptep, pte);
3134 update_mmu_cache(vma, addr, ptep); 3134 update_mmu_cache(vma, addr, ptep);
3135 3135
3136 page = vm_normal_page(vma, addr, pte); 3136 page = vm_normal_page(vma, addr, pte);
3137 if (!page) { 3137 if (!page) {
3138 pte_unmap_unlock(ptep, ptl); 3138 pte_unmap_unlock(ptep, ptl);
3139 return 0; 3139 return 0;
3140 } 3140 }
3141 BUG_ON(is_zero_pfn(page_to_pfn(page))); 3141 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3142 3142
3143 /* 3143 /*
3144 * Avoid grouping on DSO/COW pages in specific and RO pages 3144 * Avoid grouping on DSO/COW pages in specific and RO pages
3145 * in general, RO pages shouldn't hurt as much anyway since 3145 * in general, RO pages shouldn't hurt as much anyway since
3146 * they can be in shared cache state. 3146 * they can be in shared cache state.
3147 */ 3147 */
3148 if (!pte_write(pte)) 3148 if (!pte_write(pte))
3149 flags |= TNF_NO_GROUP; 3149 flags |= TNF_NO_GROUP;
3150 3150
3151 /* 3151 /*
3152 * Flag if the page is shared between multiple address spaces. This 3152 * Flag if the page is shared between multiple address spaces. This
3153 * is later used when determining whether to group tasks together 3153 * is later used when determining whether to group tasks together
3154 */ 3154 */
3155 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) 3155 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3156 flags |= TNF_SHARED; 3156 flags |= TNF_SHARED;
3157 3157
3158 last_cpupid = page_cpupid_last(page); 3158 last_cpupid = page_cpupid_last(page);
3159 page_nid = page_to_nid(page); 3159 page_nid = page_to_nid(page);
3160 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); 3160 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3161 pte_unmap_unlock(ptep, ptl); 3161 pte_unmap_unlock(ptep, ptl);
3162 if (target_nid == -1) { 3162 if (target_nid == -1) {
3163 put_page(page); 3163 put_page(page);
3164 goto out; 3164 goto out;
3165 } 3165 }
3166 3166
3167 /* Migrate to the requested node */ 3167 /* Migrate to the requested node */
3168 migrated = migrate_misplaced_page(page, vma, target_nid); 3168 migrated = migrate_misplaced_page(page, vma, target_nid);
3169 if (migrated) { 3169 if (migrated) {
3170 page_nid = target_nid; 3170 page_nid = target_nid;
3171 flags |= TNF_MIGRATED; 3171 flags |= TNF_MIGRATED;
3172 } 3172 }
3173 3173
3174 out: 3174 out:
3175 if (page_nid != -1) 3175 if (page_nid != -1)
3176 task_numa_fault(last_cpupid, page_nid, 1, flags); 3176 task_numa_fault(last_cpupid, page_nid, 1, flags);
3177 return 0; 3177 return 0;
3178 } 3178 }
3179 3179
3180 /* 3180 /*
3181 * These routines also need to handle stuff like marking pages dirty 3181 * These routines also need to handle stuff like marking pages dirty
3182 * and/or accessed for architectures that don't do it in hardware (most 3182 * and/or accessed for architectures that don't do it in hardware (most
3183 * RISC architectures). The early dirtying is also good on the i386. 3183 * RISC architectures). The early dirtying is also good on the i386.
3184 * 3184 *
3185 * There is also a hook called "update_mmu_cache()" that architectures 3185 * There is also a hook called "update_mmu_cache()" that architectures
3186 * with external mmu caches can use to update those (ie the Sparc or 3186 * with external mmu caches can use to update those (ie the Sparc or
3187 * PowerPC hashed page tables that act as extended TLBs). 3187 * PowerPC hashed page tables that act as extended TLBs).
3188 * 3188 *
3189 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3189 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3190 * but allow concurrent faults), and pte mapped but not yet locked. 3190 * but allow concurrent faults), and pte mapped but not yet locked.
3191 * We return with pte unmapped and unlocked. 3191 * We return with pte unmapped and unlocked.
3192 * 3192 *
3193 * The mmap_sem may have been released depending on flags and our 3193 * The mmap_sem may have been released depending on flags and our
3194 * return value. See filemap_fault() and __lock_page_or_retry(). 3194 * return value. See filemap_fault() and __lock_page_or_retry().
3195 */ 3195 */
3196 static int handle_pte_fault(struct mm_struct *mm, 3196 static int handle_pte_fault(struct mm_struct *mm,
3197 struct vm_area_struct *vma, unsigned long address, 3197 struct vm_area_struct *vma, unsigned long address,
3198 pte_t *pte, pmd_t *pmd, unsigned int flags) 3198 pte_t *pte, pmd_t *pmd, unsigned int flags)
3199 { 3199 {
3200 pte_t entry; 3200 pte_t entry;
3201 spinlock_t *ptl; 3201 spinlock_t *ptl;
3202 3202
3203 entry = ACCESS_ONCE(*pte); 3203 entry = ACCESS_ONCE(*pte);
3204 if (!pte_present(entry)) { 3204 if (!pte_present(entry)) {
3205 if (pte_none(entry)) { 3205 if (pte_none(entry)) {
3206 if (vma->vm_ops) { 3206 if (vma->vm_ops) {
3207 if (likely(vma->vm_ops->fault)) 3207 if (likely(vma->vm_ops->fault))
3208 return do_linear_fault(mm, vma, address, 3208 return do_linear_fault(mm, vma, address,
3209 pte, pmd, flags, entry); 3209 pte, pmd, flags, entry);
3210 } 3210 }
3211 return do_anonymous_page(mm, vma, address, 3211 return do_anonymous_page(mm, vma, address,
3212 pte, pmd, flags); 3212 pte, pmd, flags);
3213 } 3213 }
3214 if (pte_file(entry)) 3214 if (pte_file(entry))
3215 return do_nonlinear_fault(mm, vma, address, 3215 return do_nonlinear_fault(mm, vma, address,
3216 pte, pmd, flags, entry); 3216 pte, pmd, flags, entry);
3217 return do_swap_page(mm, vma, address, 3217 return do_swap_page(mm, vma, address,
3218 pte, pmd, flags, entry); 3218 pte, pmd, flags, entry);
3219 } 3219 }
3220 3220
3221 if (pte_numa(entry)) 3221 if (pte_numa(entry))
3222 return do_numa_page(mm, vma, address, entry, pte, pmd); 3222 return do_numa_page(mm, vma, address, entry, pte, pmd);
3223 3223
3224 ptl = pte_lockptr(mm, pmd); 3224 ptl = pte_lockptr(mm, pmd);
3225 spin_lock(ptl); 3225 spin_lock(ptl);
3226 if (unlikely(!pte_same(*pte, entry))) 3226 if (unlikely(!pte_same(*pte, entry)))
3227 goto unlock; 3227 goto unlock;
3228 if (flags & FAULT_FLAG_WRITE) { 3228 if (flags & FAULT_FLAG_WRITE) {
3229 if (!pte_write(entry)) 3229 if (!pte_write(entry))
3230 return do_wp_page(mm, vma, address, 3230 return do_wp_page(mm, vma, address,
3231 pte, pmd, ptl, entry); 3231 pte, pmd, ptl, entry);
3232 entry = pte_mkdirty(entry); 3232 entry = pte_mkdirty(entry);
3233 } 3233 }
3234 entry = pte_mkyoung(entry); 3234 entry = pte_mkyoung(entry);
3235 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3235 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3236 update_mmu_cache(vma, address, pte); 3236 update_mmu_cache(vma, address, pte);
3237 } else { 3237 } else {
3238 /* 3238 /*
3239 * This is needed only for protection faults but the arch code 3239 * This is needed only for protection faults but the arch code
3240 * is not yet telling us if this is a protection fault or not. 3240 * is not yet telling us if this is a protection fault or not.
3241 * This still avoids useless tlb flushes for .text page faults 3241 * This still avoids useless tlb flushes for .text page faults
3242 * with threads. 3242 * with threads.
3243 */ 3243 */
3244 if (flags & FAULT_FLAG_WRITE) 3244 if (flags & FAULT_FLAG_WRITE)
3245 flush_tlb_fix_spurious_fault(vma, address); 3245 flush_tlb_fix_spurious_fault(vma, address);
3246 } 3246 }
3247 unlock: 3247 unlock:
3248 pte_unmap_unlock(pte, ptl); 3248 pte_unmap_unlock(pte, ptl);
3249 return 0; 3249 return 0;
3250 } 3250 }
3251 3251
3252 /* 3252 /*
3253 * By the time we get here, we already hold the mm semaphore 3253 * By the time we get here, we already hold the mm semaphore
3254 * 3254 *
3255 * The mmap_sem may have been released depending on flags and our 3255 * The mmap_sem may have been released depending on flags and our
3256 * return value. See filemap_fault() and __lock_page_or_retry(). 3256 * return value. See filemap_fault() and __lock_page_or_retry().
3257 */ 3257 */
3258 static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3258 static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3259 unsigned long address, unsigned int flags) 3259 unsigned long address, unsigned int flags)
3260 { 3260 {
3261 pgd_t *pgd; 3261 pgd_t *pgd;
3262 pud_t *pud; 3262 pud_t *pud;
3263 pmd_t *pmd; 3263 pmd_t *pmd;
3264 pte_t *pte; 3264 pte_t *pte;
3265 3265
3266 if (unlikely(is_vm_hugetlb_page(vma))) 3266 if (unlikely(is_vm_hugetlb_page(vma)))
3267 return hugetlb_fault(mm, vma, address, flags); 3267 return hugetlb_fault(mm, vma, address, flags);
3268 3268
3269 pgd = pgd_offset(mm, address); 3269 pgd = pgd_offset(mm, address);
3270 pud = pud_alloc(mm, pgd, address); 3270 pud = pud_alloc(mm, pgd, address);
3271 if (!pud) 3271 if (!pud)
3272 return VM_FAULT_OOM; 3272 return VM_FAULT_OOM;
3273 pmd = pmd_alloc(mm, pud, address); 3273 pmd = pmd_alloc(mm, pud, address);
3274 if (!pmd) 3274 if (!pmd)
3275 return VM_FAULT_OOM; 3275 return VM_FAULT_OOM;
3276 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3276 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3277 int ret = VM_FAULT_FALLBACK; 3277 int ret = VM_FAULT_FALLBACK;
3278 if (!vma->vm_ops) 3278 if (!vma->vm_ops)
3279 ret = do_huge_pmd_anonymous_page(mm, vma, address, 3279 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3280 pmd, flags); 3280 pmd, flags);
3281 if (!(ret & VM_FAULT_FALLBACK)) 3281 if (!(ret & VM_FAULT_FALLBACK))
3282 return ret; 3282 return ret;
3283 } else { 3283 } else {
3284 pmd_t orig_pmd = *pmd; 3284 pmd_t orig_pmd = *pmd;
3285 int ret; 3285 int ret;
3286 3286
3287 barrier(); 3287 barrier();
3288 if (pmd_trans_huge(orig_pmd)) { 3288 if (pmd_trans_huge(orig_pmd)) {
3289 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3289 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3290 3290
3291 /* 3291 /*
3292 * If the pmd is splitting, return and retry 3292 * If the pmd is splitting, return and retry
3293 * the fault. Alternative: wait until the split 3293 * the fault. Alternative: wait until the split
3294 * is done, and goto retry. 3294 * is done, and goto retry.
3295 */ 3295 */
3296 if (pmd_trans_splitting(orig_pmd)) 3296 if (pmd_trans_splitting(orig_pmd))
3297 return 0; 3297 return 0;
3298 3298
3299 if (pmd_numa(orig_pmd)) 3299 if (pmd_numa(orig_pmd))
3300 return do_huge_pmd_numa_page(mm, vma, address, 3300 return do_huge_pmd_numa_page(mm, vma, address,
3301 orig_pmd, pmd); 3301 orig_pmd, pmd);
3302 3302
3303 if (dirty && !pmd_write(orig_pmd)) { 3303 if (dirty && !pmd_write(orig_pmd)) {
3304 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3304 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3305 orig_pmd); 3305 orig_pmd);
3306 if (!(ret & VM_FAULT_FALLBACK)) 3306 if (!(ret & VM_FAULT_FALLBACK))
3307 return ret; 3307 return ret;
3308 } else { 3308 } else {
3309 huge_pmd_set_accessed(mm, vma, address, pmd, 3309 huge_pmd_set_accessed(mm, vma, address, pmd,
3310 orig_pmd, dirty); 3310 orig_pmd, dirty);
3311 return 0; 3311 return 0;
3312 } 3312 }
3313 } 3313 }
3314 } 3314 }
3315 3315
3316 /* 3316 /*
3317 * Use __pte_alloc instead of pte_alloc_map, because we can't 3317 * Use __pte_alloc instead of pte_alloc_map, because we can't
3318 * run pte_offset_map on the pmd, if a huge pmd could 3318 * run pte_offset_map on the pmd, if a huge pmd could
3319 * materialize from under us from a different thread. 3319 * materialize from under us from a different thread.
3320 */ 3320 */
3321 if (unlikely(pmd_none(*pmd)) && 3321 if (unlikely(pmd_none(*pmd)) &&
3322 unlikely(__pte_alloc(mm, vma, pmd, address))) 3322 unlikely(__pte_alloc(mm, vma, pmd, address)))
3323 return VM_FAULT_OOM; 3323 return VM_FAULT_OOM;
3324 /* if a huge pmd materialized from under us just retry later */ 3324 /* if a huge pmd materialized from under us just retry later */
3325 if (unlikely(pmd_trans_huge(*pmd))) 3325 if (unlikely(pmd_trans_huge(*pmd)))
3326 return 0; 3326 return 0;
3327 /* 3327 /*
3328 * A regular pmd is established and it can't morph into a huge pmd 3328 * A regular pmd is established and it can't morph into a huge pmd
3329 * from under us anymore at this point because we hold the mmap_sem 3329 * from under us anymore at this point because we hold the mmap_sem
3330 * read mode and khugepaged takes it in write mode. So now it's 3330 * read mode and khugepaged takes it in write mode. So now it's
3331 * safe to run pte_offset_map(). 3331 * safe to run pte_offset_map().
3332 */ 3332 */
3333 pte = pte_offset_map(pmd, address); 3333 pte = pte_offset_map(pmd, address);
3334 3334
3335 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3335 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3336 } 3336 }
3337 3337
3338 /* 3338 /*
3339 * By the time we get here, we already hold the mm semaphore 3339 * By the time we get here, we already hold the mm semaphore
3340 * 3340 *
3341 * The mmap_sem may have been released depending on flags and our 3341 * The mmap_sem may have been released depending on flags and our
3342 * return value. See filemap_fault() and __lock_page_or_retry(). 3342 * return value. See filemap_fault() and __lock_page_or_retry().
3343 */ 3343 */
3344 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3344 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3345 unsigned long address, unsigned int flags) 3345 unsigned long address, unsigned int flags)
3346 { 3346 {
3347 int ret; 3347 int ret;
3348 3348
3349 __set_current_state(TASK_RUNNING); 3349 __set_current_state(TASK_RUNNING);
3350 3350
3351 count_vm_event(PGFAULT); 3351 count_vm_event(PGFAULT);
3352 mem_cgroup_count_vm_event(mm, PGFAULT); 3352 mem_cgroup_count_vm_event(mm, PGFAULT);
3353 3353
3354 /* do counter updates before entering really critical section. */ 3354 /* do counter updates before entering really critical section. */
3355 check_sync_rss_stat(current); 3355 check_sync_rss_stat(current);
3356 3356
3357 /* 3357 /*
3358 * Enable the memcg OOM handling for faults triggered in user 3358 * Enable the memcg OOM handling for faults triggered in user
3359 * space. Kernel faults are handled more gracefully. 3359 * space. Kernel faults are handled more gracefully.
3360 */ 3360 */
3361 if (flags & FAULT_FLAG_USER) 3361 if (flags & FAULT_FLAG_USER)
3362 mem_cgroup_oom_enable(); 3362 mem_cgroup_oom_enable();
3363 3363
3364 ret = __handle_mm_fault(mm, vma, address, flags); 3364 ret = __handle_mm_fault(mm, vma, address, flags);
3365 3365
3366 if (flags & FAULT_FLAG_USER) { 3366 if (flags & FAULT_FLAG_USER) {
3367 mem_cgroup_oom_disable(); 3367 mem_cgroup_oom_disable();
3368 /* 3368 /*
3369 * The task may have entered a memcg OOM situation but 3369 * The task may have entered a memcg OOM situation but
3370 * if the allocation error was handled gracefully (no 3370 * if the allocation error was handled gracefully (no
3371 * VM_FAULT_OOM), there is no need to kill anything. 3371 * VM_FAULT_OOM), there is no need to kill anything.
3372 * Just clean up the OOM state peacefully. 3372 * Just clean up the OOM state peacefully.
3373 */ 3373 */
3374 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) 3374 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3375 mem_cgroup_oom_synchronize(false); 3375 mem_cgroup_oom_synchronize(false);
3376 } 3376 }
3377 3377
3378 return ret; 3378 return ret;
3379 } 3379 }
3380 3380
3381 #ifndef __PAGETABLE_PUD_FOLDED 3381 #ifndef __PAGETABLE_PUD_FOLDED
3382 /* 3382 /*
3383 * Allocate page upper directory. 3383 * Allocate page upper directory.
3384 * We've already handled the fast-path in-line. 3384 * We've already handled the fast-path in-line.
3385 */ 3385 */
3386 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3386 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3387 { 3387 {
3388 pud_t *new = pud_alloc_one(mm, address); 3388 pud_t *new = pud_alloc_one(mm, address);
3389 if (!new) 3389 if (!new)
3390 return -ENOMEM; 3390 return -ENOMEM;
3391 3391
3392 smp_wmb(); /* See comment in __pte_alloc */ 3392 smp_wmb(); /* See comment in __pte_alloc */
3393 3393
3394 spin_lock(&mm->page_table_lock); 3394 spin_lock(&mm->page_table_lock);
3395 if (pgd_present(*pgd)) /* Another has populated it */ 3395 if (pgd_present(*pgd)) /* Another has populated it */
3396 pud_free(mm, new); 3396 pud_free(mm, new);
3397 else 3397 else
3398 pgd_populate(mm, pgd, new); 3398 pgd_populate(mm, pgd, new);
3399 spin_unlock(&mm->page_table_lock); 3399 spin_unlock(&mm->page_table_lock);
3400 return 0; 3400 return 0;
3401 } 3401 }
3402 #endif /* __PAGETABLE_PUD_FOLDED */ 3402 #endif /* __PAGETABLE_PUD_FOLDED */
3403 3403
3404 #ifndef __PAGETABLE_PMD_FOLDED 3404 #ifndef __PAGETABLE_PMD_FOLDED
3405 /* 3405 /*
3406 * Allocate page middle directory. 3406 * Allocate page middle directory.
3407 * We've already handled the fast-path in-line. 3407 * We've already handled the fast-path in-line.
3408 */ 3408 */
3409 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3409 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3410 { 3410 {
3411 pmd_t *new = pmd_alloc_one(mm, address); 3411 pmd_t *new = pmd_alloc_one(mm, address);
3412 if (!new) 3412 if (!new)
3413 return -ENOMEM; 3413 return -ENOMEM;
3414 3414
3415 smp_wmb(); /* See comment in __pte_alloc */ 3415 smp_wmb(); /* See comment in __pte_alloc */
3416 3416
3417 spin_lock(&mm->page_table_lock); 3417 spin_lock(&mm->page_table_lock);
3418 #ifndef __ARCH_HAS_4LEVEL_HACK 3418 #ifndef __ARCH_HAS_4LEVEL_HACK
3419 if (pud_present(*pud)) /* Another has populated it */ 3419 if (pud_present(*pud)) /* Another has populated it */
3420 pmd_free(mm, new); 3420 pmd_free(mm, new);
3421 else 3421 else
3422 pud_populate(mm, pud, new); 3422 pud_populate(mm, pud, new);
3423 #else 3423 #else
3424 if (pgd_present(*pud)) /* Another has populated it */ 3424 if (pgd_present(*pud)) /* Another has populated it */
3425 pmd_free(mm, new); 3425 pmd_free(mm, new);
3426 else 3426 else
3427 pgd_populate(mm, pud, new); 3427 pgd_populate(mm, pud, new);
3428 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3428 #endif /* __ARCH_HAS_4LEVEL_HACK */
3429 spin_unlock(&mm->page_table_lock); 3429 spin_unlock(&mm->page_table_lock);
3430 return 0; 3430 return 0;
3431 } 3431 }
3432 #endif /* __PAGETABLE_PMD_FOLDED */ 3432 #endif /* __PAGETABLE_PMD_FOLDED */
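
__pud_alloc() and __pmd_alloc() above follow the same pattern: allocate the new table before taking page_table_lock, then either publish it or free it if another thread populated the entry first. A generic user-space sketch of that pattern, with invented names, looks like this:

/* Generic sketch of the "allocate first, publish under the lock, free on
 * losing the race" pattern used by __pud_alloc()/__pmd_alloc(). */
#include <pthread.h>
#include <stdlib.h>

struct table {
	pthread_mutex_t lock;
	void *slot;			/* lazily populated, like a pud/pmd entry */
};

static int slot_alloc(struct table *t)
{
	void *new = calloc(1, 4096);	/* allocate before taking the lock */
	if (!new)
		return -1;

	pthread_mutex_lock(&t->lock);
	if (t->slot)			/* another thread populated it first */
		free(new);
	else
		t->slot = new;		/* publish our allocation */
	pthread_mutex_unlock(&t->lock);
	return 0;
}

int main(void)
{
	static struct table t = { .lock = PTHREAD_MUTEX_INITIALIZER };

	return slot_alloc(&t) ? 1 : 0;
}
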
3433 3433
3434 static int __follow_pte(struct mm_struct *mm, unsigned long address, 3434 static int __follow_pte(struct mm_struct *mm, unsigned long address,
3435 pte_t **ptepp, spinlock_t **ptlp) 3435 pte_t **ptepp, spinlock_t **ptlp)
3436 { 3436 {
3437 pgd_t *pgd; 3437 pgd_t *pgd;
3438 pud_t *pud; 3438 pud_t *pud;
3439 pmd_t *pmd; 3439 pmd_t *pmd;
3440 pte_t *ptep; 3440 pte_t *ptep;
3441 3441
3442 pgd = pgd_offset(mm, address); 3442 pgd = pgd_offset(mm, address);
3443 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3443 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3444 goto out; 3444 goto out;
3445 3445
3446 pud = pud_offset(pgd, address); 3446 pud = pud_offset(pgd, address);
3447 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3447 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3448 goto out; 3448 goto out;
3449 3449
3450 pmd = pmd_offset(pud, address); 3450 pmd = pmd_offset(pud, address);
3451 VM_BUG_ON(pmd_trans_huge(*pmd)); 3451 VM_BUG_ON(pmd_trans_huge(*pmd));
3452 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3452 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3453 goto out; 3453 goto out;
3454 3454
3455 /* We cannot handle huge page PFN maps. Luckily they don't exist. */ 3455 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3456 if (pmd_huge(*pmd)) 3456 if (pmd_huge(*pmd))
3457 goto out; 3457 goto out;
3458 3458
3459 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3459 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3460 if (!ptep) 3460 if (!ptep)
3461 goto out; 3461 goto out;
3462 if (!pte_present(*ptep)) 3462 if (!pte_present(*ptep))
3463 goto unlock; 3463 goto unlock;
3464 *ptepp = ptep; 3464 *ptepp = ptep;
3465 return 0; 3465 return 0;
3466 unlock: 3466 unlock:
3467 pte_unmap_unlock(ptep, *ptlp); 3467 pte_unmap_unlock(ptep, *ptlp);
3468 out: 3468 out:
3469 return -EINVAL; 3469 return -EINVAL;
3470 } 3470 }
3471 3471
3472 static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3472 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3473 pte_t **ptepp, spinlock_t **ptlp) 3473 pte_t **ptepp, spinlock_t **ptlp)
3474 { 3474 {
3475 int res; 3475 int res;
3476 3476
3477 /* (void) is needed to make gcc happy */ 3477 /* (void) is needed to make gcc happy */
3478 (void) __cond_lock(*ptlp, 3478 (void) __cond_lock(*ptlp,
3479 !(res = __follow_pte(mm, address, ptepp, ptlp))); 3479 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3480 return res; 3480 return res;
3481 } 3481 }
3482 3482
3483 /** 3483 /**
3484 * follow_pfn - look up PFN at a user virtual address 3484 * follow_pfn - look up PFN at a user virtual address
3485 * @vma: memory mapping 3485 * @vma: memory mapping
3486 * @address: user virtual address 3486 * @address: user virtual address
3487 * @pfn: location to store found PFN 3487 * @pfn: location to store found PFN
3488 * 3488 *
3489 * Only IO mappings and raw PFN mappings are allowed. 3489 * Only IO mappings and raw PFN mappings are allowed.
3490 * 3490 *
3491 * Returns zero and the pfn at @pfn on success, -ve otherwise. 3491 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3492 */ 3492 */
3493 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 3493 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3494 unsigned long *pfn) 3494 unsigned long *pfn)
3495 { 3495 {
3496 int ret = -EINVAL; 3496 int ret = -EINVAL;
3497 spinlock_t *ptl; 3497 spinlock_t *ptl;
3498 pte_t *ptep; 3498 pte_t *ptep;
3499 3499
3500 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3500 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3501 return ret; 3501 return ret;
3502 3502
3503 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 3503 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3504 if (ret) 3504 if (ret)
3505 return ret; 3505 return ret;
3506 *pfn = pte_pfn(*ptep); 3506 *pfn = pte_pfn(*ptep);
3507 pte_unmap_unlock(ptep, ptl); 3507 pte_unmap_unlock(ptep, ptl);
3508 return 0; 3508 return 0;
3509 } 3509 }
3510 EXPORT_SYMBOL(follow_pfn); 3510 EXPORT_SYMBOL(follow_pfn);
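
A plausible caller of follow_pfn() looks roughly like the sketch below. The helper name is invented; it assumes the caller supplies a valid mm, and it takes mmap_sem for read around the vma lookup so the mapping stays stable while follow_pfn() walks the page tables.

/* Hedged sketch of driver-style use of follow_pfn() on a VM_IO/VM_PFNMAP vma. */
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/errno.h>

static int example_lookup_pfn(struct mm_struct *mm, unsigned long addr,
			      unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr)
		ret = follow_pfn(vma, addr, pfn);	/* only VM_IO/VM_PFNMAP vmas succeed */
	up_read(&mm->mmap_sem);

	return ret;
}
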
3511 3511
3512 #ifdef CONFIG_HAVE_IOREMAP_PROT 3512 #ifdef CONFIG_HAVE_IOREMAP_PROT
3513 int follow_phys(struct vm_area_struct *vma, 3513 int follow_phys(struct vm_area_struct *vma,
3514 unsigned long address, unsigned int flags, 3514 unsigned long address, unsigned int flags,
3515 unsigned long *prot, resource_size_t *phys) 3515 unsigned long *prot, resource_size_t *phys)
3516 { 3516 {
3517 int ret = -EINVAL; 3517 int ret = -EINVAL;
3518 pte_t *ptep, pte; 3518 pte_t *ptep, pte;
3519 spinlock_t *ptl; 3519 spinlock_t *ptl;
3520 3520
3521 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3521 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3522 goto out; 3522 goto out;
3523 3523
3524 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 3524 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3525 goto out; 3525 goto out;
3526 pte = *ptep; 3526 pte = *ptep;
3527 3527
3528 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3528 if ((flags & FOLL_WRITE) && !pte_write(pte))
3529 goto unlock; 3529 goto unlock;
3530 3530
3531 *prot = pgprot_val(pte_pgprot(pte)); 3531 *prot = pgprot_val(pte_pgprot(pte));
3532 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 3532 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3533 3533
3534 ret = 0; 3534 ret = 0;
3535 unlock: 3535 unlock:
3536 pte_unmap_unlock(ptep, ptl); 3536 pte_unmap_unlock(ptep, ptl);
3537 out: 3537 out:
3538 return ret; 3538 return ret;
3539 } 3539 }
3540 3540
3541 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 3541 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3542 void *buf, int len, int write) 3542 void *buf, int len, int write)
3543 { 3543 {
3544 resource_size_t phys_addr; 3544 resource_size_t phys_addr;
3545 unsigned long prot = 0; 3545 unsigned long prot = 0;
3546 void __iomem *maddr; 3546 void __iomem *maddr;
3547 int offset = addr & (PAGE_SIZE-1); 3547 int offset = addr & (PAGE_SIZE-1);
3548 3548
3549 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3549 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3550 return -EINVAL; 3550 return -EINVAL;
3551 3551
3552 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3552 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3553 if (write) 3553 if (write)
3554 memcpy_toio(maddr + offset, buf, len); 3554 memcpy_toio(maddr + offset, buf, len);
3555 else 3555 else
3556 memcpy_fromio(buf, maddr + offset, len); 3556 memcpy_fromio(buf, maddr + offset, len);
3557 iounmap(maddr); 3557 iounmap(maddr);
3558 3558
3559 return len; 3559 return len;
3560 } 3560 }
3561 EXPORT_SYMBOL_GPL(generic_access_phys); 3561 EXPORT_SYMBOL_GPL(generic_access_phys);
3562 #endif 3562 #endif
3563 3563
3564 /* 3564 /*
3565 * Access another process' address space as given in mm. If non-NULL, use the 3565 * Access another process' address space as given in mm. If non-NULL, use the
3566 * given task for page fault accounting. 3566 * given task for page fault accounting.
3567 */ 3567 */
3568 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 3568 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3569 unsigned long addr, void *buf, int len, int write) 3569 unsigned long addr, void *buf, int len, int write)
3570 { 3570 {
3571 struct vm_area_struct *vma; 3571 struct vm_area_struct *vma;
3572 void *old_buf = buf; 3572 void *old_buf = buf;
3573 3573
3574 down_read(&mm->mmap_sem); 3574 down_read(&mm->mmap_sem);
3575 /* ignore errors, just check how much was successfully transferred */ 3575 /* ignore errors, just check how much was successfully transferred */
3576 while (len) { 3576 while (len) {
3577 int bytes, ret, offset; 3577 int bytes, ret, offset;
3578 void *maddr; 3578 void *maddr;
3579 struct page *page = NULL; 3579 struct page *page = NULL;
3580 3580
3581 ret = get_user_pages(tsk, mm, addr, 1, 3581 ret = get_user_pages(tsk, mm, addr, 1,
3582 write, 1, &page, &vma); 3582 write, 1, &page, &vma);
3583 if (ret <= 0) { 3583 if (ret <= 0) {
3584 #ifndef CONFIG_HAVE_IOREMAP_PROT 3584 #ifndef CONFIG_HAVE_IOREMAP_PROT
3585 break; 3585 break;
3586 #else 3586 #else
3587 /* 3587 /*
3588 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3588 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3589 * we can access using slightly different code. 3589 * we can access using slightly different code.
3590 */ 3590 */
3591 vma = find_vma(mm, addr); 3591 vma = find_vma(mm, addr);
3592 if (!vma || vma->vm_start > addr) 3592 if (!vma || vma->vm_start > addr)
3593 break; 3593 break;
3594 if (vma->vm_ops && vma->vm_ops->access) 3594 if (vma->vm_ops && vma->vm_ops->access)
3595 ret = vma->vm_ops->access(vma, addr, buf, 3595 ret = vma->vm_ops->access(vma, addr, buf,
3596 len, write); 3596 len, write);
3597 if (ret <= 0) 3597 if (ret <= 0)
3598 break; 3598 break;
3599 bytes = ret; 3599 bytes = ret;
3600 #endif 3600 #endif
3601 } else { 3601 } else {
3602 bytes = len; 3602 bytes = len;
3603 offset = addr & (PAGE_SIZE-1); 3603 offset = addr & (PAGE_SIZE-1);
3604 if (bytes > PAGE_SIZE-offset) 3604 if (bytes > PAGE_SIZE-offset)
3605 bytes = PAGE_SIZE-offset; 3605 bytes = PAGE_SIZE-offset;
3606 3606
3607 maddr = kmap(page); 3607 maddr = kmap(page);
3608 if (write) { 3608 if (write) {
3609 copy_to_user_page(vma, page, addr, 3609 copy_to_user_page(vma, page, addr,
3610 maddr + offset, buf, bytes); 3610 maddr + offset, buf, bytes);
3611 set_page_dirty_lock(page); 3611 set_page_dirty_lock(page);
3612 } else { 3612 } else {
3613 copy_from_user_page(vma, page, addr, 3613 copy_from_user_page(vma, page, addr,
3614 buf, maddr + offset, bytes); 3614 buf, maddr + offset, bytes);
3615 } 3615 }
3616 kunmap(page); 3616 kunmap(page);
3617 page_cache_release(page); 3617 page_cache_release(page);
3618 } 3618 }
3619 len -= bytes; 3619 len -= bytes;
3620 buf += bytes; 3620 buf += bytes;
3621 addr += bytes; 3621 addr += bytes;
3622 } 3622 }
3623 up_read(&mm->mmap_sem); 3623 up_read(&mm->mmap_sem);
3624 3624
3625 return buf - old_buf; 3625 return buf - old_buf;
3626 } 3626 }
3627 3627
3628 /** 3628 /**
3629 * access_remote_vm - access another process' address space 3629 * access_remote_vm - access another process' address space
3630 * @mm: the mm_struct of the target address space 3630 * @mm: the mm_struct of the target address space
3631 * @addr: start address to access 3631 * @addr: start address to access
3632 * @buf: source or destination buffer 3632 * @buf: source or destination buffer
3633 * @len: number of bytes to transfer 3633 * @len: number of bytes to transfer
3634 * @write: whether the access is a write 3634 * @write: whether the access is a write
3635 * 3635 *
3636 * The caller must hold a reference on @mm. 3636 * The caller must hold a reference on @mm.
3637 */ 3637 */
3638 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 3638 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3639 void *buf, int len, int write) 3639 void *buf, int len, int write)
3640 { 3640 {
3641 return __access_remote_vm(NULL, mm, addr, buf, len, write); 3641 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3642 } 3642 }
3643 3643
3644 /* 3644 /*
3645 * Access another process' address space. 3645 * Access another process' address space.
3646 * Source/target buffer must be kernel space, 3646 * Source/target buffer must be kernel space,
3647 * Do not walk the page table directly, use get_user_pages 3647 * Do not walk the page table directly, use get_user_pages
3648 */ 3648 */
3649 int access_process_vm(struct task_struct *tsk, unsigned long addr, 3649 int access_process_vm(struct task_struct *tsk, unsigned long addr,
3650 void *buf, int len, int write) 3650 void *buf, int len, int write)
3651 { 3651 {
3652 struct mm_struct *mm; 3652 struct mm_struct *mm;
3653 int ret; 3653 int ret;
3654 3654
3655 mm = get_task_mm(tsk); 3655 mm = get_task_mm(tsk);
3656 if (!mm) 3656 if (!mm)
3657 return 0; 3657 return 0;
3658 3658
3659 ret = __access_remote_vm(tsk, mm, addr, buf, len, write); 3659 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3660 mmput(mm); 3660 mmput(mm);
3661 3661
3662 return ret; 3662 return ret;
3663 } 3663 }
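
As a usage illustration, here is a hedged sketch of how ptrace-style code reads another task's memory through access_process_vm(); the wrapper name and its error convention are invented, and "tsk" is assumed to be a valid task_struct supplied by the caller.

/* Hedged sketch: read len bytes at addr in another task's address space. */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/errno.h>

static int example_peek(struct task_struct *tsk, unsigned long addr,
			void *out, int len)
{
	int copied = access_process_vm(tsk, addr, out, len, 0 /* read */);

	return (copied == len) ? 0 : -EIO;
}
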
3664 3664
3665 /* 3665 /*
3666 * Print the name of a VMA. 3666 * Print the name of a VMA.
3667 */ 3667 */
3668 void print_vma_addr(char *prefix, unsigned long ip) 3668 void print_vma_addr(char *prefix, unsigned long ip)
3669 { 3669 {
3670 struct mm_struct *mm = current->mm; 3670 struct mm_struct *mm = current->mm;
3671 struct vm_area_struct *vma; 3671 struct vm_area_struct *vma;
3672 3672
3673 /* 3673 /*
3674 * Do not print if we are in atomic 3674 * Do not print if we are in atomic
3675 * contexts (in exception stacks, etc.): 3675 * contexts (in exception stacks, etc.):
3676 */ 3676 */
3677 if (preempt_count()) 3677 if (preempt_count())
3678 return; 3678 return;
3679 3679
3680 down_read(&mm->mmap_sem); 3680 down_read(&mm->mmap_sem);
3681 vma = find_vma(mm, ip); 3681 vma = find_vma(mm, ip);
3682 if (vma && vma->vm_file) { 3682 if (vma && vma->vm_file) {
3683 struct file *f = vma->vm_file; 3683 struct file *f = vma->vm_file;
3684 char *buf = (char *)__get_free_page(GFP_KERNEL); 3684 char *buf = (char *)__get_free_page(GFP_KERNEL);
3685 if (buf) { 3685 if (buf) {
3686 char *p; 3686 char *p;
3687 3687
3688 p = d_path(&f->f_path, buf, PAGE_SIZE); 3688 p = d_path(&f->f_path, buf, PAGE_SIZE);
3689 if (IS_ERR(p)) 3689 if (IS_ERR(p))
3690 p = "?"; 3690 p = "?";
3691 printk("%s%s[%lx+%lx]", prefix, kbasename(p), 3691 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3692 vma->vm_start, 3692 vma->vm_start,
3693 vma->vm_end - vma->vm_start); 3693 vma->vm_end - vma->vm_start);
3694 free_page((unsigned long)buf); 3694 free_page((unsigned long)buf);
3695 } 3695 }
3696 } 3696 }
3697 up_read(&mm->mmap_sem); 3697 up_read(&mm->mmap_sem);
3698 } 3698 }
3699 3699
3700 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) 3700 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3701 void might_fault(void) 3701 void might_fault(void)
3702 { 3702 {
3703 /* 3703 /*
3704 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 3704 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3705 * holding the mmap_sem, this is safe because kernel memory doesn't 3705 * holding the mmap_sem, this is safe because kernel memory doesn't
3706 * get paged out, therefore we'll never actually fault, and the 3706 * get paged out, therefore we'll never actually fault, and the
3707 * below annotations will generate false positives. 3707 * below annotations will generate false positives.
3708 */ 3708 */
3709 if (segment_eq(get_fs(), KERNEL_DS)) 3709 if (segment_eq(get_fs(), KERNEL_DS))
3710 return; 3710 return;
3711 3711
3712 /* 3712 /*
3713 * it would be nicer only to annotate paths which are not under 3713 * it would be nicer only to annotate paths which are not under
3714 * pagefault_disable, however that requires a larger audit and 3714 * pagefault_disable, however that requires a larger audit and
3715 * providing helpers like get_user_atomic. 3715 * providing helpers like get_user_atomic.
3716 */ 3716 */
3717 if (in_atomic()) 3717 if (in_atomic())
3718 return; 3718 return;
3719 3719
3720 __might_sleep(__FILE__, __LINE__, 0); 3720 __might_sleep(__FILE__, __LINE__, 0);
3721 3721
3722 if (current->mm) 3722 if (current->mm)
3723 might_lock_read(&current->mm->mmap_sem); 3723 might_lock_read(&current->mm->mmap_sem);
3724 } 3724 }
3725 EXPORT_SYMBOL(might_fault); 3725 EXPORT_SYMBOL(might_fault);
3726 #endif 3726 #endif
3727 3727
3728 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 3728 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3729 static void clear_gigantic_page(struct page *page, 3729 static void clear_gigantic_page(struct page *page,
3730 unsigned long addr, 3730 unsigned long addr,
3731 unsigned int pages_per_huge_page) 3731 unsigned int pages_per_huge_page)
3732 { 3732 {
3733 int i; 3733 int i;
3734 struct page *p = page; 3734 struct page *p = page;
3735 3735
3736 might_sleep(); 3736 might_sleep();
3737 for (i = 0; i < pages_per_huge_page; 3737 for (i = 0; i < pages_per_huge_page;
3738 i++, p = mem_map_next(p, page, i)) { 3738 i++, p = mem_map_next(p, page, i)) {
3739 cond_resched(); 3739 cond_resched();
3740 clear_user_highpage(p, addr + i * PAGE_SIZE); 3740 clear_user_highpage(p, addr + i * PAGE_SIZE);
3741 } 3741 }
3742 } 3742 }
3743 void clear_huge_page(struct page *page, 3743 void clear_huge_page(struct page *page,
3744 unsigned long addr, unsigned int pages_per_huge_page) 3744 unsigned long addr, unsigned int pages_per_huge_page)
3745 { 3745 {
3746 int i; 3746 int i;
3747 3747
3748 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3748 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3749 clear_gigantic_page(page, addr, pages_per_huge_page); 3749 clear_gigantic_page(page, addr, pages_per_huge_page);
3750 return; 3750 return;
3751 } 3751 }
3752 3752
3753 might_sleep(); 3753 might_sleep();
3754 for (i = 0; i < pages_per_huge_page; i++) { 3754 for (i = 0; i < pages_per_huge_page; i++) {
3755 cond_resched(); 3755 cond_resched();
3756 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 3756 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3757 } 3757 }
3758 } 3758 }
3759 3759
3760 static void copy_user_gigantic_page(struct page *dst, struct page *src, 3760 static void copy_user_gigantic_page(struct page *dst, struct page *src,
3761 unsigned long addr, 3761 unsigned long addr,
3762 struct vm_area_struct *vma, 3762 struct vm_area_struct *vma,
3763 unsigned int pages_per_huge_page) 3763 unsigned int pages_per_huge_page)
3764 { 3764 {
3765 int i; 3765 int i;
3766 struct page *dst_base = dst; 3766 struct page *dst_base = dst;
3767 struct page *src_base = src; 3767 struct page *src_base = src;
3768 3768
3769 for (i = 0; i < pages_per_huge_page; ) { 3769 for (i = 0; i < pages_per_huge_page; ) {
3770 cond_resched(); 3770 cond_resched();
3771 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 3771 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3772 3772
3773 i++; 3773 i++;
3774 dst = mem_map_next(dst, dst_base, i); 3774 dst = mem_map_next(dst, dst_base, i);
3775 src = mem_map_next(src, src_base, i); 3775 src = mem_map_next(src, src_base, i);
3776 } 3776 }
3777 } 3777 }
3778 3778
3779 void copy_user_huge_page(struct page *dst, struct page *src, 3779 void copy_user_huge_page(struct page *dst, struct page *src,
3780 unsigned long addr, struct vm_area_struct *vma, 3780 unsigned long addr, struct vm_area_struct *vma,
3781 unsigned int pages_per_huge_page) 3781 unsigned int pages_per_huge_page)
3782 { 3782 {
3783 int i; 3783 int i;
3784 3784
3785 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3785 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3786 copy_user_gigantic_page(dst, src, addr, vma, 3786 copy_user_gigantic_page(dst, src, addr, vma,
3787 pages_per_huge_page); 3787 pages_per_huge_page);
3788 return; 3788 return;
3789 } 3789 }
3790 3790
3791 might_sleep(); 3791 might_sleep();
3792 for (i = 0; i < pages_per_huge_page; i++) { 3792 for (i = 0; i < pages_per_huge_page; i++) {
3793 cond_resched(); 3793 cond_resched();
3794 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 3794 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3795 } 3795 }
3796 } 3796 }
3797 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 3797 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
3798 3798
3799 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 3799 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
3800 3800
3801 static struct kmem_cache *page_ptl_cachep; 3801 static struct kmem_cache *page_ptl_cachep;
3802 3802
3803 void __init ptlock_cache_init(void) 3803 void __init ptlock_cache_init(void)
3804 { 3804 {
3805 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, 3805 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
3806 SLAB_PANIC, NULL); 3806 SLAB_PANIC, NULL);
3807 } 3807 }
3808 3808
3809 bool ptlock_alloc(struct page *page) 3809 bool ptlock_alloc(struct page *page)
3810 { 3810 {
3811 spinlock_t *ptl; 3811 spinlock_t *ptl;
3812 3812
3813 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); 3813 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
3814 if (!ptl) 3814 if (!ptl)
3815 return false; 3815 return false;
3816 page->ptl = ptl; 3816 page->ptl = ptl;
3817 return true; 3817 return true;
3818 } 3818 }
3819 3819
3820 void ptlock_free(struct page *page) 3820 void ptlock_free(struct page *page)
3821 { 3821 {
3822 kmem_cache_free(page_ptl_cachep, page->ptl); 3822 kmem_cache_free(page_ptl_cachep, page->ptl);
3823 } 3823 }
3824 #endif 3824 #endif
3825 3825
1 /* 1 /*
2 * linux/mm/slab.c 2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97. 3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk) 4 * (markhe@nextd.demon.co.uk)
5 * 5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli 6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 * 7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays 8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul 9 * (c) 2000 Manfred Spraul
10 * 10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA 11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul 12 * (c) 2002 Manfred Spraul
13 * 13 *
14 * An implementation of the Slab Allocator as described in outline in; 14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia 15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2 16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in; 17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator 18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems). 19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference 20 * Presented at: USENIX Summer 1994 Technical Conference
21 * 21 *
22 * The memory is organized in caches, one cache for each object type. 22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) 23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists out of many slabs (they are small (usually one 24 * Each cache consists out of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple 25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects. 26 * initialized objects.
27 * 27 *
28 * This means, that your constructor is used only for newly allocated 28 * This means, that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same initializations to 29 * slabs and you must pass objects with the same initializations to
30 * kmem_cache_free. 30 * kmem_cache_free.
31 * 31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, 32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then you must create a new 33 * normal). If you need a special memory type, then you must create a new
34 * cache for that memory type. 34 * cache for that memory type.
35 * 35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups: 36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects 37 * full slabs with 0 free objects
38 * partial slabs 38 * partial slabs
39 * empty slabs with no allocated objects 39 * empty slabs with no allocated objects
40 * 40 *
41 * If partial slabs exist, then new allocations come from these slabs, 41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated. 42 * otherwise from empty slabs or new slabs are allocated.
43 * 43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache 44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs. 45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 * 46 *
47 * Each cache has a short per-cpu head array, most allocs 47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2 48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache. 49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in struct kmem_cache and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock. 62 * The non-constant members are protected with a per-cache irq spinlock.
63 * 63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from 65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch. 66 * his patch.
67 * 67 *
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'slab_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator. 78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>. 79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com> 82 * Christoph Lameter <christoph@lameter.com>
83 * 83 *
84 * Modified the slab allocator to be node aware on NUMA systems. 84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs. 85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists. 86 * All object allocations for a node occur from node specific slab lists.
87 */ 87 */
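
To make the cache and constructor rules described above concrete, here is a hedged sketch of typical kmem_cache usage; the object type, names, and flags are chosen purely for illustration. The constructor runs only when a new slab is populated, so the free path hands objects back in that same constructed state.

/* Hedged sketch of the slab cache API described in the header comment. */
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/errno.h>

struct example_obj {
	spinlock_t lock;
	int refs;
};

static struct kmem_cache *example_cachep;

static void example_ctor(void *p)
{
	struct example_obj *obj = p;

	spin_lock_init(&obj->lock);	/* runs once per object, when its slab is created */
	obj->refs = 0;
}

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create("example_obj",
					   sizeof(struct example_obj),
					   0, SLAB_HWCACHE_ALIGN, example_ctor);
	return example_cachep ? 0 : -ENOMEM;
}

static struct example_obj *example_obj_get(void)
{
	return kmem_cache_alloc(example_cachep, GFP_KERNEL);
}

static void example_obj_put(struct example_obj *obj)
{
	obj->refs = 0;				/* hand back in constructed state */
	kmem_cache_free(example_cachep, obj);
}
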
88 88
89 #include <linux/slab.h> 89 #include <linux/slab.h>
90 #include <linux/mm.h> 90 #include <linux/mm.h>
91 #include <linux/poison.h> 91 #include <linux/poison.h>
92 #include <linux/swap.h> 92 #include <linux/swap.h>
93 #include <linux/cache.h> 93 #include <linux/cache.h>
94 #include <linux/interrupt.h> 94 #include <linux/interrupt.h>
95 #include <linux/init.h> 95 #include <linux/init.h>
96 #include <linux/compiler.h> 96 #include <linux/compiler.h>
97 #include <linux/cpuset.h> 97 #include <linux/cpuset.h>
98 #include <linux/proc_fs.h> 98 #include <linux/proc_fs.h>
99 #include <linux/seq_file.h> 99 #include <linux/seq_file.h>
100 #include <linux/notifier.h> 100 #include <linux/notifier.h>
101 #include <linux/kallsyms.h> 101 #include <linux/kallsyms.h>
102 #include <linux/cpu.h> 102 #include <linux/cpu.h>
103 #include <linux/sysctl.h> 103 #include <linux/sysctl.h>
104 #include <linux/module.h> 104 #include <linux/module.h>
105 #include <linux/rcupdate.h> 105 #include <linux/rcupdate.h>
106 #include <linux/string.h> 106 #include <linux/string.h>
107 #include <linux/uaccess.h> 107 #include <linux/uaccess.h>
108 #include <linux/nodemask.h> 108 #include <linux/nodemask.h>
109 #include <linux/kmemleak.h> 109 #include <linux/kmemleak.h>
110 #include <linux/mempolicy.h> 110 #include <linux/mempolicy.h>
111 #include <linux/mutex.h> 111 #include <linux/mutex.h>
112 #include <linux/fault-inject.h> 112 #include <linux/fault-inject.h>
113 #include <linux/rtmutex.h> 113 #include <linux/rtmutex.h>
114 #include <linux/reciprocal_div.h> 114 #include <linux/reciprocal_div.h>
115 #include <linux/debugobjects.h> 115 #include <linux/debugobjects.h>
116 #include <linux/kmemcheck.h> 116 #include <linux/kmemcheck.h>
117 #include <linux/memory.h> 117 #include <linux/memory.h>
118 #include <linux/prefetch.h> 118 #include <linux/prefetch.h>
119 119
120 #include <net/sock.h> 120 #include <net/sock.h>
121 121
122 #include <asm/cacheflush.h> 122 #include <asm/cacheflush.h>
123 #include <asm/tlbflush.h> 123 #include <asm/tlbflush.h>
124 #include <asm/page.h> 124 #include <asm/page.h>
125 125
126 #include <trace/events/kmem.h> 126 #include <trace/events/kmem.h>
127 127
128 #include "internal.h" 128 #include "internal.h"
129 129
130 #include "slab.h" 130 #include "slab.h"
131 131
132 /* 132 /*
133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
134 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
135 * 135 *
136 * STATS - 1 to collect stats for /proc/slabinfo. 136 * STATS - 1 to collect stats for /proc/slabinfo.
137 * 0 for faster, smaller code (especially in the critical paths). 137 * 0 for faster, smaller code (especially in the critical paths).
138 * 138 *
139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 139 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
140 */ 140 */
141 141
142 #ifdef CONFIG_DEBUG_SLAB 142 #ifdef CONFIG_DEBUG_SLAB
143 #define DEBUG 1 143 #define DEBUG 1
144 #define STATS 1 144 #define STATS 1
145 #define FORCED_DEBUG 1 145 #define FORCED_DEBUG 1
146 #else 146 #else
147 #define DEBUG 0 147 #define DEBUG 0
148 #define STATS 0 148 #define STATS 0
149 #define FORCED_DEBUG 0 149 #define FORCED_DEBUG 0
150 #endif 150 #endif
151 151
152 /* Shouldn't this be in a header file somewhere? */ 152 /* Shouldn't this be in a header file somewhere? */
153 #define BYTES_PER_WORD sizeof(void *) 153 #define BYTES_PER_WORD sizeof(void *)
154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 154 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
155 155
156 #ifndef ARCH_KMALLOC_FLAGS 156 #ifndef ARCH_KMALLOC_FLAGS
157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 157 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
158 #endif 158 #endif
159 159
160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ 160 #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) 161 <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
162 162
163 #if FREELIST_BYTE_INDEX 163 #if FREELIST_BYTE_INDEX
164 typedef unsigned char freelist_idx_t; 164 typedef unsigned char freelist_idx_t;
165 #else 165 #else
166 typedef unsigned short freelist_idx_t; 166 typedef unsigned short freelist_idx_t;
167 #endif 167 #endif
168 168
169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 169 #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
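
A worked example of how FREELIST_BYTE_INDEX falls out of the definition above, assuming the common case of PAGE_SIZE == 4096 and BITS_PER_BYTE == 8 (values chosen for illustration):

/*
 *   PAGE_SIZE >> BITS_PER_BYTE == 4096 >> 8 == 16
 *
 * If SLAB_OBJ_MIN_SIZE is at least 16 bytes, one page holds at most
 * 4096 / 16 == 256 objects, so every freelist index fits in the 0..255
 * range of an unsigned char and FREELIST_BYTE_INDEX evaluates to 1;
 * otherwise an unsigned short index is used.
 */
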
170 170
171 /* 171 /*
172 * true if a page was allocated from pfmemalloc reserves for network-based 172 * true if a page was allocated from pfmemalloc reserves for network-based
173 * swap 173 * swap
174 */ 174 */
175 static bool pfmemalloc_active __read_mostly; 175 static bool pfmemalloc_active __read_mostly;
176 176
177 /* 177 /*
178 * struct array_cache 178 * struct array_cache
179 * 179 *
180 * Purpose: 180 * Purpose:
181 * - LIFO ordering, to hand out cache-warm objects from _alloc 181 * - LIFO ordering, to hand out cache-warm objects from _alloc
182 * - reduce the number of linked list operations 182 * - reduce the number of linked list operations
183 * - reduce spinlock operations 183 * - reduce spinlock operations
184 * 184 *
185 * The limit is stored in the per-cpu structure to reduce the data cache 185 * The limit is stored in the per-cpu structure to reduce the data cache
186 * footprint. 186 * footprint.
187 * 187 *
188 */ 188 */
189 struct array_cache { 189 struct array_cache {
190 unsigned int avail; 190 unsigned int avail;
191 unsigned int limit; 191 unsigned int limit;
192 unsigned int batchcount; 192 unsigned int batchcount;
193 unsigned int touched; 193 unsigned int touched;
194 void *entry[]; /* 194 void *entry[]; /*
195 * Must have this definition in here for the proper 195 * Must have this definition in here for the proper
196 * alignment of array_cache. Also simplifies accessing 196 * alignment of array_cache. Also simplifies accessing
197 * the entries. 197 * the entries.
198 * 198 *
199 * Entries should not be directly dereferenced as 199 * Entries should not be directly dereferenced as
200 * entries belonging to slabs marked pfmemalloc will 200 * entries belonging to slabs marked pfmemalloc will
201 * have the lower bits set SLAB_OBJ_PFMEMALLOC 201 * have the lower bits set SLAB_OBJ_PFMEMALLOC
202 */ 202 */
203 }; 203 };
204 204
205 struct alien_cache { 205 struct alien_cache {
206 spinlock_t lock; 206 spinlock_t lock;
207 struct array_cache ac; 207 struct array_cache ac;
208 }; 208 };
209 209
210 #define SLAB_OBJ_PFMEMALLOC 1 210 #define SLAB_OBJ_PFMEMALLOC 1
211 static inline bool is_obj_pfmemalloc(void *objp) 211 static inline bool is_obj_pfmemalloc(void *objp)
212 { 212 {
213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; 213 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
214 } 214 }
215 215
216 static inline void set_obj_pfmemalloc(void **objp) 216 static inline void set_obj_pfmemalloc(void **objp)
217 { 217 {
218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); 218 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
219 return; 219 return;
220 } 220 }
221 221
222 static inline void clear_obj_pfmemalloc(void **objp) 222 static inline void clear_obj_pfmemalloc(void **objp)
223 { 223 {
224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); 224 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
225 } 225 }
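/*
 * Illustrative use of the three helpers above (hypothetical objp, not
 * part of the original flow).  Slab objects are at least word aligned,
 * so the low pointer bit is free to carry the pfmemalloc marker without
 * widening the per-cpu entry[] array:
 *
 *	set_obj_pfmemalloc(&objp);	=> objp | SLAB_OBJ_PFMEMALLOC
 *	is_obj_pfmemalloc(objp);	=> true for the tagged pointer
 *	clear_obj_pfmemalloc(&objp);	=> original pointer restored
 */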
226 226
227 /* 227 /*
228 * bootstrap: The caches do not work without cpuarrays anymore, but the 228 * bootstrap: The caches do not work without cpuarrays anymore, but the
229 * cpuarrays are allocated from the generic caches... 229 * cpuarrays are allocated from the generic caches...
230 */ 230 */
231 #define BOOT_CPUCACHE_ENTRIES 1 231 #define BOOT_CPUCACHE_ENTRIES 1
232 struct arraycache_init { 232 struct arraycache_init {
233 struct array_cache cache; 233 struct array_cache cache;
234 void *entries[BOOT_CPUCACHE_ENTRIES]; 234 void *entries[BOOT_CPUCACHE_ENTRIES];
235 }; 235 };
236 236
237 /* 237 /*
238 * Need this for bootstrapping a per node allocator. 238 * Need this for bootstrapping a per node allocator.
239 */ 239 */
240 #define NUM_INIT_LISTS (3 * MAX_NUMNODES) 240 #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
241 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 241 static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242 #define CACHE_CACHE 0 242 #define CACHE_CACHE 0
243 #define SIZE_AC MAX_NUMNODES 243 #define SIZE_AC MAX_NUMNODES
244 #define SIZE_NODE (2 * MAX_NUMNODES) 244 #define SIZE_NODE (2 * MAX_NUMNODES)
245 245
246 static int drain_freelist(struct kmem_cache *cache, 246 static int drain_freelist(struct kmem_cache *cache,
247 struct kmem_cache_node *n, int tofree); 247 struct kmem_cache_node *n, int tofree);
248 static void free_block(struct kmem_cache *cachep, void **objpp, int len, 248 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
249 int node, struct list_head *list); 249 int node, struct list_head *list);
250 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); 250 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
251 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 251 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
252 static void cache_reap(struct work_struct *unused); 252 static void cache_reap(struct work_struct *unused);
253 253
254 static int slab_early_init = 1; 254 static int slab_early_init = 1;
255 255
256 #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) 256 #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
257 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 257 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
258 258
259 static void kmem_cache_node_init(struct kmem_cache_node *parent) 259 static void kmem_cache_node_init(struct kmem_cache_node *parent)
260 { 260 {
261 INIT_LIST_HEAD(&parent->slabs_full); 261 INIT_LIST_HEAD(&parent->slabs_full);
262 INIT_LIST_HEAD(&parent->slabs_partial); 262 INIT_LIST_HEAD(&parent->slabs_partial);
263 INIT_LIST_HEAD(&parent->slabs_free); 263 INIT_LIST_HEAD(&parent->slabs_free);
264 parent->shared = NULL; 264 parent->shared = NULL;
265 parent->alien = NULL; 265 parent->alien = NULL;
266 parent->colour_next = 0; 266 parent->colour_next = 0;
267 spin_lock_init(&parent->list_lock); 267 spin_lock_init(&parent->list_lock);
268 parent->free_objects = 0; 268 parent->free_objects = 0;
269 parent->free_touched = 0; 269 parent->free_touched = 0;
270 } 270 }
271 271
272 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 272 #define MAKE_LIST(cachep, listp, slab, nodeid) \
273 do { \ 273 do { \
274 INIT_LIST_HEAD(listp); \ 274 INIT_LIST_HEAD(listp); \
275 list_splice(&get_node(cachep, nodeid)->slab, listp); \ 275 list_splice(&get_node(cachep, nodeid)->slab, listp); \
276 } while (0) 276 } while (0)
277 277
278 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 278 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
279 do { \ 279 do { \
280 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 280 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
281 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 281 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
282 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 282 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
283 } while (0) 283 } while (0)
284 284
285 #define CFLGS_OFF_SLAB (0x80000000UL) 285 #define CFLGS_OFF_SLAB (0x80000000UL)
286 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 286 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
287 287
288 #define BATCHREFILL_LIMIT 16 288 #define BATCHREFILL_LIMIT 16
289 /* 289 /*
290 * Optimization question: fewer reaps mean a lower probability of unnecessary 290 * Optimization question: fewer reaps mean a lower probability of unnecessary
291 * cpucache drain/refill cycles. 291 * cpucache drain/refill cycles.
292 * 292 *
293 * OTOH the cpuarrays can contain lots of objects, 293 * OTOH the cpuarrays can contain lots of objects,
294 * which could lock up otherwise freeable slabs. 294 * which could lock up otherwise freeable slabs.
295 */ 295 */
296 #define REAPTIMEOUT_AC (2*HZ) 296 #define REAPTIMEOUT_AC (2*HZ)
297 #define REAPTIMEOUT_NODE (4*HZ) 297 #define REAPTIMEOUT_NODE (4*HZ)
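/*
 * Example (illustrative; assumes the common HZ == 250): cache_reap()
 * revisits a cpu's array_caches roughly every 2 seconds (500 jiffies)
 * and the per-node lists roughly every 4 seconds, with the start time
 * jittered per cpu by start_cpu_timer() below so the CPUs do not reap
 * in lockstep.
 */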
298 298
299 #if STATS 299 #if STATS
300 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 300 #define STATS_INC_ACTIVE(x) ((x)->num_active++)
301 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 301 #define STATS_DEC_ACTIVE(x) ((x)->num_active--)
302 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 302 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
303 #define STATS_INC_GROWN(x) ((x)->grown++) 303 #define STATS_INC_GROWN(x) ((x)->grown++)
304 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 304 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
305 #define STATS_SET_HIGH(x) \ 305 #define STATS_SET_HIGH(x) \
306 do { \ 306 do { \
307 if ((x)->num_active > (x)->high_mark) \ 307 if ((x)->num_active > (x)->high_mark) \
308 (x)->high_mark = (x)->num_active; \ 308 (x)->high_mark = (x)->num_active; \
309 } while (0) 309 } while (0)
310 #define STATS_INC_ERR(x) ((x)->errors++) 310 #define STATS_INC_ERR(x) ((x)->errors++)
311 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 311 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
312 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 312 #define STATS_INC_NODEFREES(x) ((x)->node_frees++)
313 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 313 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
314 #define STATS_SET_FREEABLE(x, i) \ 314 #define STATS_SET_FREEABLE(x, i) \
315 do { \ 315 do { \
316 if ((x)->max_freeable < i) \ 316 if ((x)->max_freeable < i) \
317 (x)->max_freeable = i; \ 317 (x)->max_freeable = i; \
318 } while (0) 318 } while (0)
319 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 319 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
320 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 320 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
321 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 321 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
322 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 322 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
323 #else 323 #else
324 #define STATS_INC_ACTIVE(x) do { } while (0) 324 #define STATS_INC_ACTIVE(x) do { } while (0)
325 #define STATS_DEC_ACTIVE(x) do { } while (0) 325 #define STATS_DEC_ACTIVE(x) do { } while (0)
326 #define STATS_INC_ALLOCED(x) do { } while (0) 326 #define STATS_INC_ALLOCED(x) do { } while (0)
327 #define STATS_INC_GROWN(x) do { } while (0) 327 #define STATS_INC_GROWN(x) do { } while (0)
328 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) 328 #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
329 #define STATS_SET_HIGH(x) do { } while (0) 329 #define STATS_SET_HIGH(x) do { } while (0)
330 #define STATS_INC_ERR(x) do { } while (0) 330 #define STATS_INC_ERR(x) do { } while (0)
331 #define STATS_INC_NODEALLOCS(x) do { } while (0) 331 #define STATS_INC_NODEALLOCS(x) do { } while (0)
332 #define STATS_INC_NODEFREES(x) do { } while (0) 332 #define STATS_INC_NODEFREES(x) do { } while (0)
333 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 333 #define STATS_INC_ACOVERFLOW(x) do { } while (0)
334 #define STATS_SET_FREEABLE(x, i) do { } while (0) 334 #define STATS_SET_FREEABLE(x, i) do { } while (0)
335 #define STATS_INC_ALLOCHIT(x) do { } while (0) 335 #define STATS_INC_ALLOCHIT(x) do { } while (0)
336 #define STATS_INC_ALLOCMISS(x) do { } while (0) 336 #define STATS_INC_ALLOCMISS(x) do { } while (0)
337 #define STATS_INC_FREEHIT(x) do { } while (0) 337 #define STATS_INC_FREEHIT(x) do { } while (0)
338 #define STATS_INC_FREEMISS(x) do { } while (0) 338 #define STATS_INC_FREEMISS(x) do { } while (0)
339 #endif 339 #endif
340 340
341 #if DEBUG 341 #if DEBUG
342 342
343 /* 343 /*
344 * memory layout of objects: 344 * memory layout of objects:
345 * 0 : objp 345 * 0 : objp
346 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 346 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
347 * the end of an object is aligned with the end of the real 347 * the end of an object is aligned with the end of the real
348 * allocation. Catches writes behind the end of the allocation. 348 * allocation. Catches writes behind the end of the allocation.
349 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 349 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
350 * redzone word. 350 * redzone word.
351 * cachep->obj_offset: The real object. 351 * cachep->obj_offset: The real object.
352 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 352 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
353 * cachep->size - 1* BYTES_PER_WORD: last caller address 353 * cachep->size - 1* BYTES_PER_WORD: last caller address
354 * [BYTES_PER_WORD long] 354 * [BYTES_PER_WORD long]
355 */ 355 */
356 static int obj_offset(struct kmem_cache *cachep) 356 static int obj_offset(struct kmem_cache *cachep)
357 { 357 {
358 return cachep->obj_offset; 358 return cachep->obj_offset;
359 } 359 }
360 360
361 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 361 static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
362 { 362 {
363 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 363 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
364 return (unsigned long long*) (objp + obj_offset(cachep) - 364 return (unsigned long long*) (objp + obj_offset(cachep) -
365 sizeof(unsigned long long)); 365 sizeof(unsigned long long));
366 } 366 }
367 367
368 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 368 static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
369 { 369 {
370 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 370 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
371 if (cachep->flags & SLAB_STORE_USER) 371 if (cachep->flags & SLAB_STORE_USER)
372 return (unsigned long long *)(objp + cachep->size - 372 return (unsigned long long *)(objp + cachep->size -
373 sizeof(unsigned long long) - 373 sizeof(unsigned long long) -
374 REDZONE_ALIGN); 374 REDZONE_ALIGN);
375 return (unsigned long long *) (objp + cachep->size - 375 return (unsigned long long *) (objp + cachep->size -
376 sizeof(unsigned long long)); 376 sizeof(unsigned long long));
377 } 377 }
378 378
379 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 379 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
380 { 380 {
381 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 381 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
382 return (void **)(objp + cachep->size - BYTES_PER_WORD); 382 return (void **)(objp + cachep->size - BYTES_PER_WORD);
383 } 383 }
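/*
 * Concrete reading of the layout above (illustrative, 64-bit, both
 * SLAB_RED_ZONE and SLAB_STORE_USER set): dbg_redzone1() is the
 * unsigned long long immediately before cachep->obj_offset,
 * dbg_userword() is the final BYTES_PER_WORD of the allocation, and
 * dbg_redzone2() sits sizeof(unsigned long long) + REDZONE_ALIGN bytes
 * before the end so the trailing caller slot stays aligned.  A write
 * that runs past the real object therefore lands in a known red-zone
 * word instead of a neighbouring object.
 */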
384 384
385 #else 385 #else
386 386
387 #define obj_offset(x) 0 387 #define obj_offset(x) 0
388 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 388 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
389 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 389 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
390 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 390 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
391 391
392 #endif 392 #endif
393 393
394 #define OBJECT_FREE (0) 394 #define OBJECT_FREE (0)
395 #define OBJECT_ACTIVE (1) 395 #define OBJECT_ACTIVE (1)
396 396
397 #ifdef CONFIG_DEBUG_SLAB_LEAK 397 #ifdef CONFIG_DEBUG_SLAB_LEAK
398 398
399 static void set_obj_status(struct page *page, int idx, int val) 399 static void set_obj_status(struct page *page, int idx, int val)
400 { 400 {
401 int freelist_size; 401 int freelist_size;
402 char *status; 402 char *status;
403 struct kmem_cache *cachep = page->slab_cache; 403 struct kmem_cache *cachep = page->slab_cache;
404 404
405 freelist_size = cachep->num * sizeof(freelist_idx_t); 405 freelist_size = cachep->num * sizeof(freelist_idx_t);
406 status = (char *)page->freelist + freelist_size; 406 status = (char *)page->freelist + freelist_size;
407 status[idx] = val; 407 status[idx] = val;
408 } 408 }
409 409
410 static inline unsigned int get_obj_status(struct page *page, int idx) 410 static inline unsigned int get_obj_status(struct page *page, int idx)
411 { 411 {
412 int freelist_size; 412 int freelist_size;
413 char *status; 413 char *status;
414 struct kmem_cache *cachep = page->slab_cache; 414 struct kmem_cache *cachep = page->slab_cache;
415 415
416 freelist_size = cachep->num * sizeof(freelist_idx_t); 416 freelist_size = cachep->num * sizeof(freelist_idx_t);
417 status = (char *)page->freelist + freelist_size; 417 status = (char *)page->freelist + freelist_size;
418 418
419 return status[idx]; 419 return status[idx];
420 } 420 }
421 421
422 #else 422 #else
423 static inline void set_obj_status(struct page *page, int idx, int val) {} 423 static inline void set_obj_status(struct page *page, int idx, int val) {}
424 424
425 #endif 425 #endif
426 426
427 /* 427 /*
428 * Do not go above this order unless 0 objects fit into the slab or 428 * Do not go above this order unless 0 objects fit into the slab or
429 * overridden on the command line. 429 * overridden on the command line.
430 */ 430 */
431 #define SLAB_MAX_ORDER_HI 1 431 #define SLAB_MAX_ORDER_HI 1
432 #define SLAB_MAX_ORDER_LO 0 432 #define SLAB_MAX_ORDER_LO 0
433 static int slab_max_order = SLAB_MAX_ORDER_LO; 433 static int slab_max_order = SLAB_MAX_ORDER_LO;
434 static bool slab_max_order_set __initdata; 434 static bool slab_max_order_set __initdata;
435 435
436 static inline struct kmem_cache *virt_to_cache(const void *obj) 436 static inline struct kmem_cache *virt_to_cache(const void *obj)
437 { 437 {
438 struct page *page = virt_to_head_page(obj); 438 struct page *page = virt_to_head_page(obj);
439 return page->slab_cache; 439 return page->slab_cache;
440 } 440 }
441 441
442 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, 442 static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
443 unsigned int idx) 443 unsigned int idx)
444 { 444 {
445 return page->s_mem + cache->size * idx; 445 return page->s_mem + cache->size * idx;
446 } 446 }
447 447
448 /* 448 /*
449 * We want to avoid an expensive divide : (offset / cache->size) 449 * We want to avoid an expensive divide : (offset / cache->size)
450 * Using the fact that size is a constant for a particular cache, 450 * Using the fact that size is a constant for a particular cache,
451 * we can replace (offset / cache->size) by 451 * we can replace (offset / cache->size) by
452 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 452 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
453 */ 453 */
454 static inline unsigned int obj_to_index(const struct kmem_cache *cache, 454 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
455 const struct page *page, void *obj) 455 const struct page *page, void *obj)
456 { 456 {
457 u32 offset = (obj - page->s_mem); 457 u32 offset = (obj - page->s_mem);
458 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 458 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
459 } 459 }
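/*
 * Worked example (illustrative values): for a cache whose objects are
 * 192 bytes apart, cache->reciprocal_buffer_size holds
 * reciprocal_value(192) computed once at cache setup, so the per-free
 * "offset / 192" above becomes a multiply-and-shift; e.g. an object at
 * offset 960 from page->s_mem maps to index 5 with no divide on the
 * hot path.
 */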
460 460
461 static struct arraycache_init initarray_generic = 461 static struct arraycache_init initarray_generic =
462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
463 463
464 /* internal cache of cache description objs */ 464 /* internal cache of cache description objs */
465 static struct kmem_cache kmem_cache_boot = { 465 static struct kmem_cache kmem_cache_boot = {
466 .batchcount = 1, 466 .batchcount = 1,
467 .limit = BOOT_CPUCACHE_ENTRIES, 467 .limit = BOOT_CPUCACHE_ENTRIES,
468 .shared = 1, 468 .shared = 1,
469 .size = sizeof(struct kmem_cache), 469 .size = sizeof(struct kmem_cache),
470 .name = "kmem_cache", 470 .name = "kmem_cache",
471 }; 471 };
472 472
473 #define BAD_ALIEN_MAGIC 0x01020304ul 473 #define BAD_ALIEN_MAGIC 0x01020304ul
474 474
475 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 475 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
476 476
477 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 477 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
478 { 478 {
479 return cachep->array[smp_processor_id()]; 479 return cachep->array[smp_processor_id()];
480 } 480 }
481 481
482 static size_t calculate_freelist_size(int nr_objs, size_t align) 482 static size_t calculate_freelist_size(int nr_objs, size_t align)
483 { 483 {
484 size_t freelist_size; 484 size_t freelist_size;
485 485
486 freelist_size = nr_objs * sizeof(freelist_idx_t); 486 freelist_size = nr_objs * sizeof(freelist_idx_t);
487 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 487 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
488 freelist_size += nr_objs * sizeof(char); 488 freelist_size += nr_objs * sizeof(char);
489 489
490 if (align) 490 if (align)
491 freelist_size = ALIGN(freelist_size, align); 491 freelist_size = ALIGN(freelist_size, align);
492 492
493 return freelist_size; 493 return freelist_size;
494 } 494 }
495 495
496 static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 496 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
497 size_t idx_size, size_t align) 497 size_t idx_size, size_t align)
498 { 498 {
499 int nr_objs; 499 int nr_objs;
500 size_t remained_size; 500 size_t remained_size;
501 size_t freelist_size; 501 size_t freelist_size;
502 int extra_space = 0; 502 int extra_space = 0;
503 503
504 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 504 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
505 extra_space = sizeof(char); 505 extra_space = sizeof(char);
506 /* 506 /*
507 * Ignore padding for the initial guess. The padding 507 * Ignore padding for the initial guess. The padding
508 * is at most @align-1 bytes, and @buffer_size is at 508 * is at most @align-1 bytes, and @buffer_size is at
509 * least @align. In the worst case, this result will 509 * least @align. In the worst case, this result will
510 * be one greater than the number of objects that fit 510 * be one greater than the number of objects that fit
511 * into the memory allocation when taking the padding 511 * into the memory allocation when taking the padding
512 * into account. 512 * into account.
513 */ 513 */
514 nr_objs = slab_size / (buffer_size + idx_size + extra_space); 514 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
515 515
516 /* 516 /*
517 * This calculated number will be either the right 517 * This calculated number will be either the right
518 * amount, or one greater than what we want. 518 * amount, or one greater than what we want.
519 */ 519 */
520 remained_size = slab_size - nr_objs * buffer_size; 520 remained_size = slab_size - nr_objs * buffer_size;
521 freelist_size = calculate_freelist_size(nr_objs, align); 521 freelist_size = calculate_freelist_size(nr_objs, align);
522 if (remained_size < freelist_size) 522 if (remained_size < freelist_size)
523 nr_objs--; 523 nr_objs--;
524 524
525 return nr_objs; 525 return nr_objs;
526 } 526 }
527 527
528 /* 528 /*
529 * Calculate the number of objects and left-over bytes for a given buffer size. 529 * Calculate the number of objects and left-over bytes for a given buffer size.
530 */ 530 */
531 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 531 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
532 size_t align, int flags, size_t *left_over, 532 size_t align, int flags, size_t *left_over,
533 unsigned int *num) 533 unsigned int *num)
534 { 534 {
535 int nr_objs; 535 int nr_objs;
536 size_t mgmt_size; 536 size_t mgmt_size;
537 size_t slab_size = PAGE_SIZE << gfporder; 537 size_t slab_size = PAGE_SIZE << gfporder;
538 538
539 /* 539 /*
540 * The slab management structure can be either off the slab or 540 * The slab management structure can be either off the slab or
541 * on it. For the latter case, the memory allocated for a 541 * on it. For the latter case, the memory allocated for a
542 * slab is used for: 542 * slab is used for:
543 * 543 *
544 * - One unsigned int for each object 544 * - One unsigned int for each object
545 * - Padding to respect alignment of @align 545 * - Padding to respect alignment of @align
546 * - @buffer_size bytes for each object 546 * - @buffer_size bytes for each object
547 * 547 *
548 * If the slab management structure is off the slab, then the 548 * If the slab management structure is off the slab, then the
549 * alignment will already be calculated into the size. Because 549 * alignment will already be calculated into the size. Because
550 * the slabs are all page aligned, the objects will be at the 550 * the slabs are all page aligned, the objects will be at the
551 * correct alignment when allocated. 551 * correct alignment when allocated.
552 */ 552 */
553 if (flags & CFLGS_OFF_SLAB) { 553 if (flags & CFLGS_OFF_SLAB) {
554 mgmt_size = 0; 554 mgmt_size = 0;
555 nr_objs = slab_size / buffer_size; 555 nr_objs = slab_size / buffer_size;
556 556
557 } else { 557 } else {
558 nr_objs = calculate_nr_objs(slab_size, buffer_size, 558 nr_objs = calculate_nr_objs(slab_size, buffer_size,
559 sizeof(freelist_idx_t), align); 559 sizeof(freelist_idx_t), align);
560 mgmt_size = calculate_freelist_size(nr_objs, align); 560 mgmt_size = calculate_freelist_size(nr_objs, align);
561 } 561 }
562 *num = nr_objs; 562 *num = nr_objs;
563 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 563 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
564 } 564 }
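/*
 * Worked example (illustrative; assumes PAGE_SIZE == 4096, gfporder == 0,
 * buffer_size == 256, align == 8, on-slab management and no
 * CONFIG_DEBUG_SLAB_LEAK): the first guess is 4096 / (256 + 1) == 15
 * objects; the remaining 4096 - 15 * 256 == 256 bytes easily hold the
 * ALIGN(15 * sizeof(freelist_idx_t), 8) == 16 byte freelist, so
 * *num == 15 and *left_over == 4096 - 3840 - 16 == 240.
 */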
565 565
566 #if DEBUG 566 #if DEBUG
567 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 567 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
568 568
569 static void __slab_error(const char *function, struct kmem_cache *cachep, 569 static void __slab_error(const char *function, struct kmem_cache *cachep,
570 char *msg) 570 char *msg)
571 { 571 {
572 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 572 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
573 function, cachep->name, msg); 573 function, cachep->name, msg);
574 dump_stack(); 574 dump_stack();
575 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 575 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
576 } 576 }
577 #endif 577 #endif
578 578
579 /* 579 /*
580 * By default on NUMA we use alien caches to stage the freeing of 580 * By default on NUMA we use alien caches to stage the freeing of
581 * objects allocated from other nodes. This causes massive memory 581 * objects allocated from other nodes. This causes massive memory
582 * inefficiencies when using fake NUMA setup to split memory into a 582 * inefficiencies when using fake NUMA setup to split memory into a
583 * large number of small nodes, so it can be disabled on the command 583 * large number of small nodes, so it can be disabled on the command
584 * line 584 * line
585 */ 585 */
586 586
587 static int use_alien_caches __read_mostly = 1; 587 static int use_alien_caches __read_mostly = 1;
588 static int __init noaliencache_setup(char *s) 588 static int __init noaliencache_setup(char *s)
589 { 589 {
590 use_alien_caches = 0; 590 use_alien_caches = 0;
591 return 1; 591 return 1;
592 } 592 }
593 __setup("noaliencache", noaliencache_setup); 593 __setup("noaliencache", noaliencache_setup);
594 594
595 static int __init slab_max_order_setup(char *str) 595 static int __init slab_max_order_setup(char *str)
596 { 596 {
597 get_option(&str, &slab_max_order); 597 get_option(&str, &slab_max_order);
598 slab_max_order = slab_max_order < 0 ? 0 : 598 slab_max_order = slab_max_order < 0 ? 0 :
599 min(slab_max_order, MAX_ORDER - 1); 599 min(slab_max_order, MAX_ORDER - 1);
600 slab_max_order_set = true; 600 slab_max_order_set = true;
601 601
602 return 1; 602 return 1;
603 } 603 }
604 __setup("slab_max_order=", slab_max_order_setup); 604 __setup("slab_max_order=", slab_max_order_setup);
605 605
606 #ifdef CONFIG_NUMA 606 #ifdef CONFIG_NUMA
607 /* 607 /*
608 * Special reaping functions for NUMA systems called from cache_reap(). 608 * Special reaping functions for NUMA systems called from cache_reap().
609 * These take care of doing round robin flushing of alien caches (containing 609 * These take care of doing round robin flushing of alien caches (containing
610 * objects freed on different nodes from which they were allocated) and the 610 * objects freed on different nodes from which they were allocated) and the
611 * flushing of remote pcps by calling drain_node_pages. 611 * flushing of remote pcps by calling drain_node_pages.
612 */ 612 */
613 static DEFINE_PER_CPU(unsigned long, slab_reap_node); 613 static DEFINE_PER_CPU(unsigned long, slab_reap_node);
614 614
615 static void init_reap_node(int cpu) 615 static void init_reap_node(int cpu)
616 { 616 {
617 int node; 617 int node;
618 618
619 node = next_node(cpu_to_mem(cpu), node_online_map); 619 node = next_node(cpu_to_mem(cpu), node_online_map);
620 if (node == MAX_NUMNODES) 620 if (node == MAX_NUMNODES)
621 node = first_node(node_online_map); 621 node = first_node(node_online_map);
622 622
623 per_cpu(slab_reap_node, cpu) = node; 623 per_cpu(slab_reap_node, cpu) = node;
624 } 624 }
625 625
626 static void next_reap_node(void) 626 static void next_reap_node(void)
627 { 627 {
628 int node = __this_cpu_read(slab_reap_node); 628 int node = __this_cpu_read(slab_reap_node);
629 629
630 node = next_node(node, node_online_map); 630 node = next_node(node, node_online_map);
631 if (unlikely(node >= MAX_NUMNODES)) 631 if (unlikely(node >= MAX_NUMNODES))
632 node = first_node(node_online_map); 632 node = first_node(node_online_map);
633 __this_cpu_write(slab_reap_node, node); 633 __this_cpu_write(slab_reap_node, node);
634 } 634 }
635 635
636 #else 636 #else
637 #define init_reap_node(cpu) do { } while (0) 637 #define init_reap_node(cpu) do { } while (0)
638 #define next_reap_node(void) do { } while (0) 638 #define next_reap_node(void) do { } while (0)
639 #endif 639 #endif
640 640
641 /* 641 /*
642 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 642 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
643 * via the workqueue/eventd. 643 * via the workqueue/eventd.
644 * Add the CPU number into the expiration time to minimize the possibility of 644 * Add the CPU number into the expiration time to minimize the possibility of
645 * the CPUs getting into lockstep and contending for the global cache chain 645 * the CPUs getting into lockstep and contending for the global cache chain
646 * lock. 646 * lock.
647 */ 647 */
648 static void start_cpu_timer(int cpu) 648 static void start_cpu_timer(int cpu)
649 { 649 {
650 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 650 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
651 651
652 /* 652 /*
653 * When this gets called from do_initcalls via cpucache_init(), 653 * When this gets called from do_initcalls via cpucache_init(),
654 * init_workqueues() has already run, so keventd will be set up 654 * init_workqueues() has already run, so keventd will be set up
655 * at that time. 655 * at that time.
656 */ 656 */
657 if (keventd_up() && reap_work->work.func == NULL) { 657 if (keventd_up() && reap_work->work.func == NULL) {
658 init_reap_node(cpu); 658 init_reap_node(cpu);
659 INIT_DEFERRABLE_WORK(reap_work, cache_reap); 659 INIT_DEFERRABLE_WORK(reap_work, cache_reap);
660 schedule_delayed_work_on(cpu, reap_work, 660 schedule_delayed_work_on(cpu, reap_work,
661 __round_jiffies_relative(HZ, cpu)); 661 __round_jiffies_relative(HZ, cpu));
662 } 662 }
663 } 663 }
664 664
665 static void init_arraycache(struct array_cache *ac, int limit, int batch) 665 static void init_arraycache(struct array_cache *ac, int limit, int batch)
666 { 666 {
667 /* 667 /*
668 * The array_cache structures contain pointers to free objects. 668 * The array_cache structures contain pointers to free objects.
669 * However, when such objects are allocated or transferred to another 669 * However, when such objects are allocated or transferred to another
670 * cache the pointers are not cleared and they could be counted as 670 * cache the pointers are not cleared and they could be counted as
671 * valid references during a kmemleak scan. Therefore, kmemleak must 671 * valid references during a kmemleak scan. Therefore, kmemleak must
672 * not scan such objects. 672 * not scan such objects.
673 */ 673 */
674 kmemleak_no_scan(ac); 674 kmemleak_no_scan(ac);
675 if (ac) { 675 if (ac) {
676 ac->avail = 0; 676 ac->avail = 0;
677 ac->limit = limit; 677 ac->limit = limit;
678 ac->batchcount = batch; 678 ac->batchcount = batch;
679 ac->touched = 0; 679 ac->touched = 0;
680 } 680 }
681 } 681 }
682 682
683 static struct array_cache *alloc_arraycache(int node, int entries, 683 static struct array_cache *alloc_arraycache(int node, int entries,
684 int batchcount, gfp_t gfp) 684 int batchcount, gfp_t gfp)
685 { 685 {
686 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); 686 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
687 struct array_cache *ac = NULL; 687 struct array_cache *ac = NULL;
688 688
689 ac = kmalloc_node(memsize, gfp, node); 689 ac = kmalloc_node(memsize, gfp, node);
690 init_arraycache(ac, entries, batchcount); 690 init_arraycache(ac, entries, batchcount);
691 return ac; 691 return ac;
692 } 692 }
693 693
694 static inline bool is_slab_pfmemalloc(struct page *page) 694 static inline bool is_slab_pfmemalloc(struct page *page)
695 { 695 {
696 return PageSlabPfmemalloc(page); 696 return PageSlabPfmemalloc(page);
697 } 697 }
698 698
699 /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ 699 /* Clears pfmemalloc_active if no slabs have pfmemalloc set */
700 static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 700 static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
701 struct array_cache *ac) 701 struct array_cache *ac)
702 { 702 {
703 struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); 703 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
704 struct page *page; 704 struct page *page;
705 unsigned long flags; 705 unsigned long flags;
706 706
707 if (!pfmemalloc_active) 707 if (!pfmemalloc_active)
708 return; 708 return;
709 709
710 spin_lock_irqsave(&n->list_lock, flags); 710 spin_lock_irqsave(&n->list_lock, flags);
711 list_for_each_entry(page, &n->slabs_full, lru) 711 list_for_each_entry(page, &n->slabs_full, lru)
712 if (is_slab_pfmemalloc(page)) 712 if (is_slab_pfmemalloc(page))
713 goto out; 713 goto out;
714 714
715 list_for_each_entry(page, &n->slabs_partial, lru) 715 list_for_each_entry(page, &n->slabs_partial, lru)
716 if (is_slab_pfmemalloc(page)) 716 if (is_slab_pfmemalloc(page))
717 goto out; 717 goto out;
718 718
719 list_for_each_entry(page, &n->slabs_free, lru) 719 list_for_each_entry(page, &n->slabs_free, lru)
720 if (is_slab_pfmemalloc(page)) 720 if (is_slab_pfmemalloc(page))
721 goto out; 721 goto out;
722 722
723 pfmemalloc_active = false; 723 pfmemalloc_active = false;
724 out: 724 out:
725 spin_unlock_irqrestore(&n->list_lock, flags); 725 spin_unlock_irqrestore(&n->list_lock, flags);
726 } 726 }
727 727
728 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, 728 static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
729 gfp_t flags, bool force_refill) 729 gfp_t flags, bool force_refill)
730 { 730 {
731 int i; 731 int i;
732 void *objp = ac->entry[--ac->avail]; 732 void *objp = ac->entry[--ac->avail];
733 733
734 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ 734 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
735 if (unlikely(is_obj_pfmemalloc(objp))) { 735 if (unlikely(is_obj_pfmemalloc(objp))) {
736 struct kmem_cache_node *n; 736 struct kmem_cache_node *n;
737 737
738 if (gfp_pfmemalloc_allowed(flags)) { 738 if (gfp_pfmemalloc_allowed(flags)) {
739 clear_obj_pfmemalloc(&objp); 739 clear_obj_pfmemalloc(&objp);
740 return objp; 740 return objp;
741 } 741 }
742 742
743 /* The caller cannot use PFMEMALLOC objects, find another one */ 743 /* The caller cannot use PFMEMALLOC objects, find another one */
744 for (i = 0; i < ac->avail; i++) { 744 for (i = 0; i < ac->avail; i++) {
745 /* If a !PFMEMALLOC object is found, swap them */ 745 /* If a !PFMEMALLOC object is found, swap them */
746 if (!is_obj_pfmemalloc(ac->entry[i])) { 746 if (!is_obj_pfmemalloc(ac->entry[i])) {
747 objp = ac->entry[i]; 747 objp = ac->entry[i];
748 ac->entry[i] = ac->entry[ac->avail]; 748 ac->entry[i] = ac->entry[ac->avail];
749 ac->entry[ac->avail] = objp; 749 ac->entry[ac->avail] = objp;
750 return objp; 750 return objp;
751 } 751 }
752 } 752 }
753 753
754 /* 754 /*
755 * If there are empty slabs on the slabs_free list and we are 755 * If there are empty slabs on the slabs_free list and we are
756 * being forced to refill the cache, mark this one !pfmemalloc. 756 * being forced to refill the cache, mark this one !pfmemalloc.
757 */ 757 */
758 n = get_node(cachep, numa_mem_id()); 758 n = get_node(cachep, numa_mem_id());
759 if (!list_empty(&n->slabs_free) && force_refill) { 759 if (!list_empty(&n->slabs_free) && force_refill) {
760 struct page *page = virt_to_head_page(objp); 760 struct page *page = virt_to_head_page(objp);
761 ClearPageSlabPfmemalloc(page); 761 ClearPageSlabPfmemalloc(page);
762 clear_obj_pfmemalloc(&objp); 762 clear_obj_pfmemalloc(&objp);
763 recheck_pfmemalloc_active(cachep, ac); 763 recheck_pfmemalloc_active(cachep, ac);
764 return objp; 764 return objp;
765 } 765 }
766 766
767 /* No !PFMEMALLOC objects available */ 767 /* No !PFMEMALLOC objects available */
768 ac->avail++; 768 ac->avail++;
769 objp = NULL; 769 objp = NULL;
770 } 770 }
771 771
772 return objp; 772 return objp;
773 } 773 }
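/*
 * Example walk-through (illustrative): suppose the entry popped above is
 * tagged SLAB_OBJ_PFMEMALLOC but the caller made a plain GFP_KERNEL
 * allocation.  gfp_pfmemalloc_allowed() then fails, the loop returns the
 * first untagged pointer it finds in entry[] and parks the reserved,
 * tagged pointer in that slot instead; only when every cached object is
 * a pfmemalloc one (and no empty slab can be untagged) is ac->avail
 * restored and NULL returned, forcing the caller down the slower
 * refill path.
 */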
774 774
775 static inline void *ac_get_obj(struct kmem_cache *cachep, 775 static inline void *ac_get_obj(struct kmem_cache *cachep,
776 struct array_cache *ac, gfp_t flags, bool force_refill) 776 struct array_cache *ac, gfp_t flags, bool force_refill)
777 { 777 {
778 void *objp; 778 void *objp;
779 779
780 if (unlikely(sk_memalloc_socks())) 780 if (unlikely(sk_memalloc_socks()))
781 objp = __ac_get_obj(cachep, ac, flags, force_refill); 781 objp = __ac_get_obj(cachep, ac, flags, force_refill);
782 else 782 else
783 objp = ac->entry[--ac->avail]; 783 objp = ac->entry[--ac->avail];
784 784
785 return objp; 785 return objp;
786 } 786 }
787 787
788 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 788 static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
789 void *objp) 789 void *objp)
790 { 790 {
791 if (unlikely(pfmemalloc_active)) { 791 if (unlikely(pfmemalloc_active)) {
792 /* Some pfmemalloc slabs exist, check if this is one */ 792 /* Some pfmemalloc slabs exist, check if this is one */
793 struct page *page = virt_to_head_page(objp); 793 struct page *page = virt_to_head_page(objp);
794 if (PageSlabPfmemalloc(page)) 794 if (PageSlabPfmemalloc(page))
795 set_obj_pfmemalloc(&objp); 795 set_obj_pfmemalloc(&objp);
796 } 796 }
797 797
798 return objp; 798 return objp;
799 } 799 }
800 800
801 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 801 static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
802 void *objp) 802 void *objp)
803 { 803 {
804 if (unlikely(sk_memalloc_socks())) 804 if (unlikely(sk_memalloc_socks()))
805 objp = __ac_put_obj(cachep, ac, objp); 805 objp = __ac_put_obj(cachep, ac, objp);
806 806
807 ac->entry[ac->avail++] = objp; 807 ac->entry[ac->avail++] = objp;
808 } 808 }
809 809
810 /* 810 /*
811 * Transfer objects in one arraycache to another. 811 * Transfer objects in one arraycache to another.
812 * Locking must be handled by the caller. 812 * Locking must be handled by the caller.
813 * 813 *
814 * Return the number of entries transferred. 814 * Return the number of entries transferred.
815 */ 815 */
816 static int transfer_objects(struct array_cache *to, 816 static int transfer_objects(struct array_cache *to,
817 struct array_cache *from, unsigned int max) 817 struct array_cache *from, unsigned int max)
818 { 818 {
819 /* Figure out how many entries to transfer */ 819 /* Figure out how many entries to transfer */
820 int nr = min3(from->avail, max, to->limit - to->avail); 820 int nr = min3(from->avail, max, to->limit - to->avail);
821 821
822 if (!nr) 822 if (!nr)
823 return 0; 823 return 0;
824 824
825 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 825 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
826 sizeof(void *) *nr); 826 sizeof(void *) *nr);
827 827
828 from->avail -= nr; 828 from->avail -= nr;
829 to->avail += nr; 829 to->avail += nr;
830 return nr; 830 return nr;
831 } 831 }
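/*
 * Worked example (illustrative): with from->avail == 30, max == 24 and
 * room for 10 more entries in @to, nr == min3(30, 24, 10) == 10; the 10
 * most recently freed (cache-warm) pointers at the top of @from are
 * copied over, from->avail drops to 20, to->avail grows by 10 and 10 is
 * returned.
 */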
832 832
833 #ifndef CONFIG_NUMA 833 #ifndef CONFIG_NUMA
834 834
835 #define drain_alien_cache(cachep, alien) do { } while (0) 835 #define drain_alien_cache(cachep, alien) do { } while (0)
836 #define reap_alien(cachep, n) do { } while (0) 836 #define reap_alien(cachep, n) do { } while (0)
837 837
838 static inline struct alien_cache **alloc_alien_cache(int node, 838 static inline struct alien_cache **alloc_alien_cache(int node,
839 int limit, gfp_t gfp) 839 int limit, gfp_t gfp)
840 { 840 {
841 return (struct alien_cache **)BAD_ALIEN_MAGIC; 841 return (struct alien_cache **)BAD_ALIEN_MAGIC;
842 } 842 }
843 843
844 static inline void free_alien_cache(struct alien_cache **ac_ptr) 844 static inline void free_alien_cache(struct alien_cache **ac_ptr)
845 { 845 {
846 } 846 }
847 847
848 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 848 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
849 { 849 {
850 return 0; 850 return 0;
851 } 851 }
852 852
853 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 853 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
854 gfp_t flags) 854 gfp_t flags)
855 { 855 {
856 return NULL; 856 return NULL;
857 } 857 }
858 858
859 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 859 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
860 gfp_t flags, int nodeid) 860 gfp_t flags, int nodeid)
861 { 861 {
862 return NULL; 862 return NULL;
863 } 863 }
864 864
865 #else /* CONFIG_NUMA */ 865 #else /* CONFIG_NUMA */
866 866
867 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 867 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
868 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 868 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
869 869
870 static struct alien_cache *__alloc_alien_cache(int node, int entries, 870 static struct alien_cache *__alloc_alien_cache(int node, int entries,
871 int batch, gfp_t gfp) 871 int batch, gfp_t gfp)
872 { 872 {
873 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); 873 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
874 struct alien_cache *alc = NULL; 874 struct alien_cache *alc = NULL;
875 875
876 alc = kmalloc_node(memsize, gfp, node); 876 alc = kmalloc_node(memsize, gfp, node);
877 init_arraycache(&alc->ac, entries, batch); 877 init_arraycache(&alc->ac, entries, batch);
878 spin_lock_init(&alc->lock); 878 spin_lock_init(&alc->lock);
879 return alc; 879 return alc;
880 } 880 }
881 881
882 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 882 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
883 { 883 {
884 struct alien_cache **alc_ptr; 884 struct alien_cache **alc_ptr;
885 size_t memsize = sizeof(void *) * nr_node_ids; 885 size_t memsize = sizeof(void *) * nr_node_ids;
886 int i; 886 int i;
887 887
888 if (limit > 1) 888 if (limit > 1)
889 limit = 12; 889 limit = 12;
890 alc_ptr = kzalloc_node(memsize, gfp, node); 890 alc_ptr = kzalloc_node(memsize, gfp, node);
891 if (!alc_ptr) 891 if (!alc_ptr)
892 return NULL; 892 return NULL;
893 893
894 for_each_node(i) { 894 for_each_node(i) {
895 if (i == node || !node_online(i)) 895 if (i == node || !node_online(i))
896 continue; 896 continue;
897 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); 897 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
898 if (!alc_ptr[i]) { 898 if (!alc_ptr[i]) {
899 for (i--; i >= 0; i--) 899 for (i--; i >= 0; i--)
900 kfree(alc_ptr[i]); 900 kfree(alc_ptr[i]);
901 kfree(alc_ptr); 901 kfree(alc_ptr);
902 return NULL; 902 return NULL;
903 } 903 }
904 } 904 }
905 return alc_ptr; 905 return alc_ptr;
906 } 906 }
907 907
908 static void free_alien_cache(struct alien_cache **alc_ptr) 908 static void free_alien_cache(struct alien_cache **alc_ptr)
909 { 909 {
910 int i; 910 int i;
911 911
912 if (!alc_ptr) 912 if (!alc_ptr)
913 return; 913 return;
914 for_each_node(i) 914 for_each_node(i)
915 kfree(alc_ptr[i]); 915 kfree(alc_ptr[i]);
916 kfree(alc_ptr); 916 kfree(alc_ptr);
917 } 917 }
918 918
919 static void __drain_alien_cache(struct kmem_cache *cachep, 919 static void __drain_alien_cache(struct kmem_cache *cachep,
920 struct array_cache *ac, int node, 920 struct array_cache *ac, int node,
921 struct list_head *list) 921 struct list_head *list)
922 { 922 {
923 struct kmem_cache_node *n = get_node(cachep, node); 923 struct kmem_cache_node *n = get_node(cachep, node);
924 924
925 if (ac->avail) { 925 if (ac->avail) {
926 spin_lock(&n->list_lock); 926 spin_lock(&n->list_lock);
927 /* 927 /*
928 * Stuff objects into the remote node's shared array first. 928 * Stuff objects into the remote node's shared array first.
929 * That way we could avoid the overhead of putting the objects 929 * That way we could avoid the overhead of putting the objects
930 * into the free lists and getting them back later. 930 * into the free lists and getting them back later.
931 */ 931 */
932 if (n->shared) 932 if (n->shared)
933 transfer_objects(n->shared, ac, ac->limit); 933 transfer_objects(n->shared, ac, ac->limit);
934 934
935 free_block(cachep, ac->entry, ac->avail, node, list); 935 free_block(cachep, ac->entry, ac->avail, node, list);
936 ac->avail = 0; 936 ac->avail = 0;
937 spin_unlock(&n->list_lock); 937 spin_unlock(&n->list_lock);
938 } 938 }
939 } 939 }
940 940
941 /* 941 /*
942 * Called from cache_reap() to regularly drain alien caches round robin. 942 * Called from cache_reap() to regularly drain alien caches round robin.
943 */ 943 */
944 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) 944 static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
945 { 945 {
946 int node = __this_cpu_read(slab_reap_node); 946 int node = __this_cpu_read(slab_reap_node);
947 947
948 if (n->alien) { 948 if (n->alien) {
949 struct alien_cache *alc = n->alien[node]; 949 struct alien_cache *alc = n->alien[node];
950 struct array_cache *ac; 950 struct array_cache *ac;
951 951
952 if (alc) { 952 if (alc) {
953 ac = &alc->ac; 953 ac = &alc->ac;
954 if (ac->avail && spin_trylock_irq(&alc->lock)) { 954 if (ac->avail && spin_trylock_irq(&alc->lock)) {
955 LIST_HEAD(list); 955 LIST_HEAD(list);
956 956
957 __drain_alien_cache(cachep, ac, node, &list); 957 __drain_alien_cache(cachep, ac, node, &list);
958 spin_unlock_irq(&alc->lock); 958 spin_unlock_irq(&alc->lock);
959 slabs_destroy(cachep, &list); 959 slabs_destroy(cachep, &list);
960 } 960 }
961 } 961 }
962 } 962 }
963 } 963 }
964 964
965 static void drain_alien_cache(struct kmem_cache *cachep, 965 static void drain_alien_cache(struct kmem_cache *cachep,
966 struct alien_cache **alien) 966 struct alien_cache **alien)
967 { 967 {
968 int i = 0; 968 int i = 0;
969 struct alien_cache *alc; 969 struct alien_cache *alc;
970 struct array_cache *ac; 970 struct array_cache *ac;
971 unsigned long flags; 971 unsigned long flags;
972 972
973 for_each_online_node(i) { 973 for_each_online_node(i) {
974 alc = alien[i]; 974 alc = alien[i];
975 if (alc) { 975 if (alc) {
976 LIST_HEAD(list); 976 LIST_HEAD(list);
977 977
978 ac = &alc->ac; 978 ac = &alc->ac;
979 spin_lock_irqsave(&alc->lock, flags); 979 spin_lock_irqsave(&alc->lock, flags);
980 __drain_alien_cache(cachep, ac, i, &list); 980 __drain_alien_cache(cachep, ac, i, &list);
981 spin_unlock_irqrestore(&alc->lock, flags); 981 spin_unlock_irqrestore(&alc->lock, flags);
982 slabs_destroy(cachep, &list); 982 slabs_destroy(cachep, &list);
983 } 983 }
984 } 984 }
985 } 985 }
986 986
987 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 987 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
988 { 988 {
989 int nodeid = page_to_nid(virt_to_page(objp)); 989 int nodeid = page_to_nid(virt_to_page(objp));
990 struct kmem_cache_node *n; 990 struct kmem_cache_node *n;
991 struct alien_cache *alien = NULL; 991 struct alien_cache *alien = NULL;
992 struct array_cache *ac; 992 struct array_cache *ac;
993 int node; 993 int node;
994 LIST_HEAD(list); 994 LIST_HEAD(list);
995 995
996 node = numa_mem_id(); 996 node = numa_mem_id();
997 997
998 /* 998 /*
999 * Make sure we are not freeing an object from another node to the array 999 * Make sure we are not freeing an object from another node to the array
1000 * cache on this cpu. 1000 * cache on this cpu.
1001 */ 1001 */
1002 if (likely(nodeid == node)) 1002 if (likely(nodeid == node))
1003 return 0; 1003 return 0;
1004 1004
1005 n = get_node(cachep, node); 1005 n = get_node(cachep, node);
1006 STATS_INC_NODEFREES(cachep); 1006 STATS_INC_NODEFREES(cachep);
1007 if (n->alien && n->alien[nodeid]) { 1007 if (n->alien && n->alien[nodeid]) {
1008 alien = n->alien[nodeid]; 1008 alien = n->alien[nodeid];
1009 ac = &alien->ac; 1009 ac = &alien->ac;
1010 spin_lock(&alien->lock); 1010 spin_lock(&alien->lock);
1011 if (unlikely(ac->avail == ac->limit)) { 1011 if (unlikely(ac->avail == ac->limit)) {
1012 STATS_INC_ACOVERFLOW(cachep); 1012 STATS_INC_ACOVERFLOW(cachep);
1013 __drain_alien_cache(cachep, ac, nodeid, &list); 1013 __drain_alien_cache(cachep, ac, nodeid, &list);
1014 } 1014 }
1015 ac_put_obj(cachep, ac, objp); 1015 ac_put_obj(cachep, ac, objp);
1016 spin_unlock(&alien->lock); 1016 spin_unlock(&alien->lock);
1017 slabs_destroy(cachep, &list); 1017 slabs_destroy(cachep, &list);
1018 } else { 1018 } else {
1019 n = get_node(cachep, nodeid); 1019 n = get_node(cachep, nodeid);
1020 spin_lock(&n->list_lock); 1020 spin_lock(&n->list_lock);
1021 free_block(cachep, &objp, 1, nodeid, &list); 1021 free_block(cachep, &objp, 1, nodeid, &list);
1022 spin_unlock(&n->list_lock); 1022 spin_unlock(&n->list_lock);
1023 slabs_destroy(cachep, &list); 1023 slabs_destroy(cachep, &list);
1024 } 1024 }
1025 return 1; 1025 return 1;
1026 } 1026 }
1027 #endif 1027 #endif
1028 1028
1029 /* 1029 /*
1030 * Allocates and initializes the kmem_cache_node for a node on each slab cache, used for 1030 * Allocates and initializes the kmem_cache_node for a node on each slab cache, used for
1031 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node 1031 * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
1032 * will be allocated off-node since memory is not yet online for the new node. 1032 * will be allocated off-node since memory is not yet online for the new node.
1033 * When hotplugging memory or a cpu, existing nodes are not replaced if 1033 * When hotplugging memory or a cpu, existing nodes are not replaced if
1034 * already in use. 1034 * already in use.
1035 * 1035 *
1036 * Must hold slab_mutex. 1036 * Must hold slab_mutex.
1037 */ 1037 */
1038 static int init_cache_node_node(int node) 1038 static int init_cache_node_node(int node)
1039 { 1039 {
1040 struct kmem_cache *cachep; 1040 struct kmem_cache *cachep;
1041 struct kmem_cache_node *n; 1041 struct kmem_cache_node *n;
1042 const size_t memsize = sizeof(struct kmem_cache_node); 1042 const size_t memsize = sizeof(struct kmem_cache_node);
1043 1043
1044 list_for_each_entry(cachep, &slab_caches, list) { 1044 list_for_each_entry(cachep, &slab_caches, list) {
1045 /* 1045 /*
1046 * Set up the kmem_cache_node for cpu before we can 1046 * Set up the kmem_cache_node for cpu before we can
1047 * begin anything. Make sure some other cpu on this 1047 * begin anything. Make sure some other cpu on this
1048 * node has not already allocated this structure. 1048 * node has not already allocated this structure.
1049 */ 1049 */
1050 n = get_node(cachep, node); 1050 n = get_node(cachep, node);
1051 if (!n) { 1051 if (!n) {
1052 n = kmalloc_node(memsize, GFP_KERNEL, node); 1052 n = kmalloc_node(memsize, GFP_KERNEL, node);
1053 if (!n) 1053 if (!n)
1054 return -ENOMEM; 1054 return -ENOMEM;
1055 kmem_cache_node_init(n); 1055 kmem_cache_node_init(n);
1056 n->next_reap = jiffies + REAPTIMEOUT_NODE + 1056 n->next_reap = jiffies + REAPTIMEOUT_NODE +
1057 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1057 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
1058 1058
1059 /* 1059 /*
1060 * The kmem_cache_nodes don't come and go as CPUs 1060 * The kmem_cache_nodes don't come and go as CPUs
1061 * come and go. slab_mutex is sufficient 1061 * come and go. slab_mutex is sufficient
1062 * protection here. 1062 * protection here.
1063 */ 1063 */
1064 cachep->node[node] = n; 1064 cachep->node[node] = n;
1065 } 1065 }
1066 1066
1067 spin_lock_irq(&n->list_lock); 1067 spin_lock_irq(&n->list_lock);
1068 n->free_limit = 1068 n->free_limit =
1069 (1 + nr_cpus_node(node)) * 1069 (1 + nr_cpus_node(node)) *
1070 cachep->batchcount + cachep->num; 1070 cachep->batchcount + cachep->num;
1071 spin_unlock_irq(&n->list_lock); 1071 spin_unlock_irq(&n->list_lock);
1072 } 1072 }
1073 return 0; 1073 return 0;
1074 } 1074 }
1075 1075
1076 static inline int slabs_tofree(struct kmem_cache *cachep, 1076 static inline int slabs_tofree(struct kmem_cache *cachep,
1077 struct kmem_cache_node *n) 1077 struct kmem_cache_node *n)
1078 { 1078 {
1079 return (n->free_objects + cachep->num - 1) / cachep->num; 1079 return (n->free_objects + cachep->num - 1) / cachep->num;
1080 } 1080 }
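/*
 * slabs_tofree() is a ceiling division, e.g. 17 free objects in a cache
 * holding 8 objects per slab gives (17 + 8 - 1) / 8 == 3 slabs, enough
 * to cover every currently free object when the node is drained.
 */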
1081 1081
1082 static void cpuup_canceled(long cpu) 1082 static void cpuup_canceled(long cpu)
1083 { 1083 {
1084 struct kmem_cache *cachep; 1084 struct kmem_cache *cachep;
1085 struct kmem_cache_node *n = NULL; 1085 struct kmem_cache_node *n = NULL;
1086 int node = cpu_to_mem(cpu); 1086 int node = cpu_to_mem(cpu);
1087 const struct cpumask *mask = cpumask_of_node(node); 1087 const struct cpumask *mask = cpumask_of_node(node);
1088 1088
1089 list_for_each_entry(cachep, &slab_caches, list) { 1089 list_for_each_entry(cachep, &slab_caches, list) {
1090 struct array_cache *nc; 1090 struct array_cache *nc;
1091 struct array_cache *shared; 1091 struct array_cache *shared;
1092 struct alien_cache **alien; 1092 struct alien_cache **alien;
1093 LIST_HEAD(list); 1093 LIST_HEAD(list);
1094 1094
1095 /* cpu is dead; no one can alloc from it. */ 1095 /* cpu is dead; no one can alloc from it. */
1096 nc = cachep->array[cpu]; 1096 nc = cachep->array[cpu];
1097 cachep->array[cpu] = NULL; 1097 cachep->array[cpu] = NULL;
1098 n = get_node(cachep, node); 1098 n = get_node(cachep, node);
1099 1099
1100 if (!n) 1100 if (!n)
1101 goto free_array_cache; 1101 goto free_array_cache;
1102 1102
1103 spin_lock_irq(&n->list_lock); 1103 spin_lock_irq(&n->list_lock);
1104 1104
1105 /* Free limit for this kmem_cache_node */ 1105 /* Free limit for this kmem_cache_node */
1106 n->free_limit -= cachep->batchcount; 1106 n->free_limit -= cachep->batchcount;
1107 if (nc) 1107 if (nc)
1108 free_block(cachep, nc->entry, nc->avail, node, &list); 1108 free_block(cachep, nc->entry, nc->avail, node, &list);
1109 1109
1110 if (!cpumask_empty(mask)) { 1110 if (!cpumask_empty(mask)) {
1111 spin_unlock_irq(&n->list_lock); 1111 spin_unlock_irq(&n->list_lock);
1112 goto free_array_cache; 1112 goto free_array_cache;
1113 } 1113 }
1114 1114
1115 shared = n->shared; 1115 shared = n->shared;
1116 if (shared) { 1116 if (shared) {
1117 free_block(cachep, shared->entry, 1117 free_block(cachep, shared->entry,
1118 shared->avail, node, &list); 1118 shared->avail, node, &list);
1119 n->shared = NULL; 1119 n->shared = NULL;
1120 } 1120 }
1121 1121
1122 alien = n->alien; 1122 alien = n->alien;
1123 n->alien = NULL; 1123 n->alien = NULL;
1124 1124
1125 spin_unlock_irq(&n->list_lock); 1125 spin_unlock_irq(&n->list_lock);
1126 1126
1127 kfree(shared); 1127 kfree(shared);
1128 if (alien) { 1128 if (alien) {
1129 drain_alien_cache(cachep, alien); 1129 drain_alien_cache(cachep, alien);
1130 free_alien_cache(alien); 1130 free_alien_cache(alien);
1131 } 1131 }
1132 free_array_cache: 1132 free_array_cache:
1133 slabs_destroy(cachep, &list); 1133 slabs_destroy(cachep, &list);
1134 kfree(nc); 1134 kfree(nc);
1135 } 1135 }
1136 /* 1136 /*
1137 * In the previous loop, all the objects were freed to 1137 * In the previous loop, all the objects were freed to
1138 * the respective cache's slabs; now we can go ahead and 1138 * the respective cache's slabs; now we can go ahead and
1139 * shrink each nodelist to its limit. 1139 * shrink each nodelist to its limit.
1140 */ 1140 */
1141 list_for_each_entry(cachep, &slab_caches, list) { 1141 list_for_each_entry(cachep, &slab_caches, list) {
1142 n = get_node(cachep, node); 1142 n = get_node(cachep, node);
1143 if (!n) 1143 if (!n)
1144 continue; 1144 continue;
1145 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1145 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1146 } 1146 }
1147 } 1147 }
1148 1148
1149 static int cpuup_prepare(long cpu) 1149 static int cpuup_prepare(long cpu)
1150 { 1150 {
1151 struct kmem_cache *cachep; 1151 struct kmem_cache *cachep;
1152 struct kmem_cache_node *n = NULL; 1152 struct kmem_cache_node *n = NULL;
1153 int node = cpu_to_mem(cpu); 1153 int node = cpu_to_mem(cpu);
1154 int err; 1154 int err;
1155 1155
1156 /* 1156 /*
1157 * We need to do this right in the beginning since 1157 * We need to do this right in the beginning since
1158 * alloc_arraycache's are going to use this list. 1158 * alloc_arraycache's are going to use this list.
1159 * kmalloc_node allows us to add the slab to the right 1159 * kmalloc_node allows us to add the slab to the right
1160 * kmem_cache_node and not this cpu's kmem_cache_node 1160 * kmem_cache_node and not this cpu's kmem_cache_node
1161 */ 1161 */
1162 err = init_cache_node_node(node); 1162 err = init_cache_node_node(node);
1163 if (err < 0) 1163 if (err < 0)
1164 goto bad; 1164 goto bad;
1165 1165
1166 /* 1166 /*
1167 * Now we can go ahead with allocating the shared arrays and 1167 * Now we can go ahead with allocating the shared arrays and
1168 * array caches 1168 * array caches
1169 */ 1169 */
1170 list_for_each_entry(cachep, &slab_caches, list) { 1170 list_for_each_entry(cachep, &slab_caches, list) {
1171 struct array_cache *nc; 1171 struct array_cache *nc;
1172 struct array_cache *shared = NULL; 1172 struct array_cache *shared = NULL;
1173 struct alien_cache **alien = NULL; 1173 struct alien_cache **alien = NULL;
1174 1174
1175 nc = alloc_arraycache(node, cachep->limit, 1175 nc = alloc_arraycache(node, cachep->limit,
1176 cachep->batchcount, GFP_KERNEL); 1176 cachep->batchcount, GFP_KERNEL);
1177 if (!nc) 1177 if (!nc)
1178 goto bad; 1178 goto bad;
1179 if (cachep->shared) { 1179 if (cachep->shared) {
1180 shared = alloc_arraycache(node, 1180 shared = alloc_arraycache(node,
1181 cachep->shared * cachep->batchcount, 1181 cachep->shared * cachep->batchcount,
1182 0xbaadf00d, GFP_KERNEL); 1182 0xbaadf00d, GFP_KERNEL);
1183 if (!shared) { 1183 if (!shared) {
1184 kfree(nc); 1184 kfree(nc);
1185 goto bad; 1185 goto bad;
1186 } 1186 }
1187 } 1187 }
1188 if (use_alien_caches) { 1188 if (use_alien_caches) {
1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1190 if (!alien) { 1190 if (!alien) {
1191 kfree(shared); 1191 kfree(shared);
1192 kfree(nc); 1192 kfree(nc);
1193 goto bad; 1193 goto bad;
1194 } 1194 }
1195 } 1195 }
1196 cachep->array[cpu] = nc; 1196 cachep->array[cpu] = nc;
1197 n = get_node(cachep, node); 1197 n = get_node(cachep, node);
1198 BUG_ON(!n); 1198 BUG_ON(!n);
1199 1199
1200 spin_lock_irq(&n->list_lock); 1200 spin_lock_irq(&n->list_lock);
1201 if (!n->shared) { 1201 if (!n->shared) {
1202 /* 1202 /*
1203 * We are serialised from CPU_DEAD or 1203 * We are serialised from CPU_DEAD or
1204 * CPU_UP_CANCELLED by the cpucontrol lock 1204 * CPU_UP_CANCELLED by the cpucontrol lock
1205 */ 1205 */
1206 n->shared = shared; 1206 n->shared = shared;
1207 shared = NULL; 1207 shared = NULL;
1208 } 1208 }
1209 #ifdef CONFIG_NUMA 1209 #ifdef CONFIG_NUMA
1210 if (!n->alien) { 1210 if (!n->alien) {
1211 n->alien = alien; 1211 n->alien = alien;
1212 alien = NULL; 1212 alien = NULL;
1213 } 1213 }
1214 #endif 1214 #endif
1215 spin_unlock_irq(&n->list_lock); 1215 spin_unlock_irq(&n->list_lock);
1216 kfree(shared); 1216 kfree(shared);
1217 free_alien_cache(alien); 1217 free_alien_cache(alien);
1218 } 1218 }
1219 1219
1220 return 0; 1220 return 0;
1221 bad: 1221 bad:
1222 cpuup_canceled(cpu); 1222 cpuup_canceled(cpu);
1223 return -ENOMEM; 1223 return -ENOMEM;
1224 } 1224 }
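/*
 * Illustrative sketch, not kernel code: cpuup_prepare() allocates the shared
 * and alien caches before taking n->list_lock, publishes them under the lock
 * only if the slot is still empty, and frees whatever was not installed after
 * dropping the lock.  A minimal userspace analogue of that pattern, with
 * made-up names (node_state, install_shared):
 */
#include <pthread.h>
#include <stdlib.h>

struct node_state {
	pthread_mutex_t lock;		/* stands in for n->list_lock */
	void *shared;			/* stands in for n->shared */
};

static void install_shared(struct node_state *n, size_t size)
{
	void *candidate = malloc(size);	/* allocate outside the lock */

	pthread_mutex_lock(&n->lock);
	if (!n->shared) {		/* publish only if still unset */
		n->shared = candidate;
		candidate = NULL;	/* ownership transferred */
	}
	pthread_mutex_unlock(&n->lock);

	free(candidate);		/* free(NULL) is a no-op, like kfree(NULL) */
}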
1225 1225
1226 static int cpuup_callback(struct notifier_block *nfb, 1226 static int cpuup_callback(struct notifier_block *nfb,
1227 unsigned long action, void *hcpu) 1227 unsigned long action, void *hcpu)
1228 { 1228 {
1229 long cpu = (long)hcpu; 1229 long cpu = (long)hcpu;
1230 int err = 0; 1230 int err = 0;
1231 1231
1232 switch (action) { 1232 switch (action) {
1233 case CPU_UP_PREPARE: 1233 case CPU_UP_PREPARE:
1234 case CPU_UP_PREPARE_FROZEN: 1234 case CPU_UP_PREPARE_FROZEN:
1235 mutex_lock(&slab_mutex); 1235 mutex_lock(&slab_mutex);
1236 err = cpuup_prepare(cpu); 1236 err = cpuup_prepare(cpu);
1237 mutex_unlock(&slab_mutex); 1237 mutex_unlock(&slab_mutex);
1238 break; 1238 break;
1239 case CPU_ONLINE: 1239 case CPU_ONLINE:
1240 case CPU_ONLINE_FROZEN: 1240 case CPU_ONLINE_FROZEN:
1241 start_cpu_timer(cpu); 1241 start_cpu_timer(cpu);
1242 break; 1242 break;
1243 #ifdef CONFIG_HOTPLUG_CPU 1243 #ifdef CONFIG_HOTPLUG_CPU
1244 case CPU_DOWN_PREPARE: 1244 case CPU_DOWN_PREPARE:
1245 case CPU_DOWN_PREPARE_FROZEN: 1245 case CPU_DOWN_PREPARE_FROZEN:
1246 /* 1246 /*
1247 * Shutdown cache reaper. Note that the slab_mutex is 1247 * Shutdown cache reaper. Note that the slab_mutex is
1248 * held so that if cache_reap() is invoked it cannot do 1248 * held so that if cache_reap() is invoked it cannot do
1249 * anything expensive but will only modify reap_work 1249 * anything expensive but will only modify reap_work
1250 * and reschedule the timer. 1250 * and reschedule the timer.
1251 */ 1251 */
1252 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); 1252 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1253 /* Now the cache_reaper is guaranteed not to be running. */ 1253 /* Now the cache_reaper is guaranteed not to be running. */
1254 per_cpu(slab_reap_work, cpu).work.func = NULL; 1254 per_cpu(slab_reap_work, cpu).work.func = NULL;
1255 break; 1255 break;
1256 case CPU_DOWN_FAILED: 1256 case CPU_DOWN_FAILED:
1257 case CPU_DOWN_FAILED_FROZEN: 1257 case CPU_DOWN_FAILED_FROZEN:
1258 start_cpu_timer(cpu); 1258 start_cpu_timer(cpu);
1259 break; 1259 break;
1260 case CPU_DEAD: 1260 case CPU_DEAD:
1261 case CPU_DEAD_FROZEN: 1261 case CPU_DEAD_FROZEN:
1262 /* 1262 /*
1263 * Even if all the cpus of a node are down, we don't free the 1263 * Even if all the cpus of a node are down, we don't free the
1264 * kmem_cache_node of any cache. This is to avoid a race between 1264 * kmem_cache_node of any cache. This is to avoid a race between
1265 * cpu_down and a kmalloc allocation from another cpu for 1265 * cpu_down and a kmalloc allocation from another cpu for
1266 * memory from the node of the cpu going down. The node 1266 * memory from the node of the cpu going down. The node
1267 * structure is usually allocated from kmem_cache_create() and 1267 * structure is usually allocated from kmem_cache_create() and
1268 * gets destroyed at kmem_cache_destroy(). 1268 * gets destroyed at kmem_cache_destroy().
1269 */ 1269 */
1270 /* fall through */ 1270 /* fall through */
1271 #endif 1271 #endif
1272 case CPU_UP_CANCELED: 1272 case CPU_UP_CANCELED:
1273 case CPU_UP_CANCELED_FROZEN: 1273 case CPU_UP_CANCELED_FROZEN:
1274 mutex_lock(&slab_mutex); 1274 mutex_lock(&slab_mutex);
1275 cpuup_canceled(cpu); 1275 cpuup_canceled(cpu);
1276 mutex_unlock(&slab_mutex); 1276 mutex_unlock(&slab_mutex);
1277 break; 1277 break;
1278 } 1278 }
1279 return notifier_from_errno(err); 1279 return notifier_from_errno(err);
1280 } 1280 }
1281 1281
1282 static struct notifier_block cpucache_notifier = { 1282 static struct notifier_block cpucache_notifier = {
1283 &cpuup_callback, NULL, 0 1283 &cpuup_callback, NULL, 0
1284 }; 1284 };
1285 1285
1286 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 1286 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1287 /* 1287 /*
1288 * Drains freelist for a node on each slab cache, used for memory hot-remove. 1288 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1289 * Returns -EBUSY if all objects cannot be drained so that the node is not 1289 * Returns -EBUSY if all objects cannot be drained so that the node is not
1290 * removed. 1290 * removed.
1291 * 1291 *
1292 * Must hold slab_mutex. 1292 * Must hold slab_mutex.
1293 */ 1293 */
1294 static int __meminit drain_cache_node_node(int node) 1294 static int __meminit drain_cache_node_node(int node)
1295 { 1295 {
1296 struct kmem_cache *cachep; 1296 struct kmem_cache *cachep;
1297 int ret = 0; 1297 int ret = 0;
1298 1298
1299 list_for_each_entry(cachep, &slab_caches, list) { 1299 list_for_each_entry(cachep, &slab_caches, list) {
1300 struct kmem_cache_node *n; 1300 struct kmem_cache_node *n;
1301 1301
1302 n = get_node(cachep, node); 1302 n = get_node(cachep, node);
1303 if (!n) 1303 if (!n)
1304 continue; 1304 continue;
1305 1305
1306 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1306 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1307 1307
1308 if (!list_empty(&n->slabs_full) || 1308 if (!list_empty(&n->slabs_full) ||
1309 !list_empty(&n->slabs_partial)) { 1309 !list_empty(&n->slabs_partial)) {
1310 ret = -EBUSY; 1310 ret = -EBUSY;
1311 break; 1311 break;
1312 } 1312 }
1313 } 1313 }
1314 return ret; 1314 return ret;
1315 } 1315 }
1316 1316
1317 static int __meminit slab_memory_callback(struct notifier_block *self, 1317 static int __meminit slab_memory_callback(struct notifier_block *self,
1318 unsigned long action, void *arg) 1318 unsigned long action, void *arg)
1319 { 1319 {
1320 struct memory_notify *mnb = arg; 1320 struct memory_notify *mnb = arg;
1321 int ret = 0; 1321 int ret = 0;
1322 int nid; 1322 int nid;
1323 1323
1324 nid = mnb->status_change_nid; 1324 nid = mnb->status_change_nid;
1325 if (nid < 0) 1325 if (nid < 0)
1326 goto out; 1326 goto out;
1327 1327
1328 switch (action) { 1328 switch (action) {
1329 case MEM_GOING_ONLINE: 1329 case MEM_GOING_ONLINE:
1330 mutex_lock(&slab_mutex); 1330 mutex_lock(&slab_mutex);
1331 ret = init_cache_node_node(nid); 1331 ret = init_cache_node_node(nid);
1332 mutex_unlock(&slab_mutex); 1332 mutex_unlock(&slab_mutex);
1333 break; 1333 break;
1334 case MEM_GOING_OFFLINE: 1334 case MEM_GOING_OFFLINE:
1335 mutex_lock(&slab_mutex); 1335 mutex_lock(&slab_mutex);
1336 ret = drain_cache_node_node(nid); 1336 ret = drain_cache_node_node(nid);
1337 mutex_unlock(&slab_mutex); 1337 mutex_unlock(&slab_mutex);
1338 break; 1338 break;
1339 case MEM_ONLINE: 1339 case MEM_ONLINE:
1340 case MEM_OFFLINE: 1340 case MEM_OFFLINE:
1341 case MEM_CANCEL_ONLINE: 1341 case MEM_CANCEL_ONLINE:
1342 case MEM_CANCEL_OFFLINE: 1342 case MEM_CANCEL_OFFLINE:
1343 break; 1343 break;
1344 } 1344 }
1345 out: 1345 out:
1346 return notifier_from_errno(ret); 1346 return notifier_from_errno(ret);
1347 } 1347 }
1348 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1348 #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1349 1349
1350 /* 1350 /*
1351 * swap the static kmem_cache_node with kmalloced memory 1351 * swap the static kmem_cache_node with kmalloced memory
1352 */ 1352 */
1353 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, 1353 static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
1354 int nodeid) 1354 int nodeid)
1355 { 1355 {
1356 struct kmem_cache_node *ptr; 1356 struct kmem_cache_node *ptr;
1357 1357
1358 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); 1358 ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
1359 BUG_ON(!ptr); 1359 BUG_ON(!ptr);
1360 1360
1361 memcpy(ptr, list, sizeof(struct kmem_cache_node)); 1361 memcpy(ptr, list, sizeof(struct kmem_cache_node));
1362 /* 1362 /*
1363 * Do not assume that spinlocks can be initialized via memcpy: 1363 * Do not assume that spinlocks can be initialized via memcpy:
1364 */ 1364 */
1365 spin_lock_init(&ptr->list_lock); 1365 spin_lock_init(&ptr->list_lock);
1366 1366
1367 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1367 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1368 cachep->node[nodeid] = ptr; 1368 cachep->node[nodeid] = ptr;
1369 } 1369 }
1370 1370
1371 /* 1371 /*
1372 * For setting up all the kmem_cache_node structures for a cache whose 1372 * For setting up all the kmem_cache_node structures for a cache whose
1373 * buffer_size is the same as the size of kmem_cache_node. 1373 * buffer_size is the same as the size of kmem_cache_node.
1374 */ 1374 */
1375 static void __init set_up_node(struct kmem_cache *cachep, int index) 1375 static void __init set_up_node(struct kmem_cache *cachep, int index)
1376 { 1376 {
1377 int node; 1377 int node;
1378 1378
1379 for_each_online_node(node) { 1379 for_each_online_node(node) {
1380 cachep->node[node] = &init_kmem_cache_node[index + node]; 1380 cachep->node[node] = &init_kmem_cache_node[index + node];
1381 cachep->node[node]->next_reap = jiffies + 1381 cachep->node[node]->next_reap = jiffies +
1382 REAPTIMEOUT_NODE + 1382 REAPTIMEOUT_NODE +
1383 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1383 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
1384 } 1384 }
1385 } 1385 }
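/*
 * Illustrative sketch, not kernel code: the next_reap formula above staggers
 * each cache's reap deadline by folding the cache pointer into the offset, so
 * caches set up in the same tick do not all come due together.  The timeout
 * value below is a made-up placeholder, not the kernel's REAPTIMEOUT_NODE.
 */
#include <stdio.h>

#define FAKE_REAP_TIMEOUT 400UL

static unsigned long next_reap(unsigned long now, const void *cachep)
{
	return now + FAKE_REAP_TIMEOUT +
	       ((unsigned long)cachep) % FAKE_REAP_TIMEOUT;
}

int main(void)
{
	static int a, b;	/* two distinct "cache" addresses */

	/* Same tick, different pointers -> different deadlines. */
	printf("%lu %lu\n", next_reap(1000, &a), next_reap(1000, &b));
	return 0;
}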
1386 1386
1387 /* 1387 /*
1388 * The memory after the last cpu cache pointer is used for the 1388 * The memory after the last cpu cache pointer is used for the
1389 * node pointer. 1389 * node pointer.
1390 */ 1390 */
1391 static void setup_node_pointer(struct kmem_cache *cachep) 1391 static void setup_node_pointer(struct kmem_cache *cachep)
1392 { 1392 {
1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; 1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1394 } 1394 }
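/*
 * Illustrative sketch, not kernel code: setup_node_pointer() relies on the
 * node-pointer array living directly after the last per-cpu array_cache
 * pointer, and kmem_cache_init() below sizes the boot cache the same way with
 * offsetof(struct kmem_cache, array[nr_cpu_ids]).  A userspace sketch of that
 * sizing idiom with stand-in types and made-up counts:
 */
#include <stddef.h>
#include <stdio.h>

enum { FAKE_NR_CPU_IDS = 4, FAKE_NR_NODE_IDS = 2 };

struct fake_cache {
	int limit;				/* stands in for the fixed fields */
	void *array[FAKE_NR_CPU_IDS];		/* per-cpu pointers; node pointers follow */
};

int main(void)
{
	/* Fixed part + per-cpu slots, then room for the node pointers. */
	size_t size = offsetof(struct fake_cache, array[FAKE_NR_CPU_IDS]) +
		      FAKE_NR_NODE_IDS * sizeof(void *);

	printf("allocation size: %zu bytes\n", size);
	return 0;
}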
1395 1395
1396 /* 1396 /*
1397 * Initialisation. Called after the page allocator has been initialised and 1397 * Initialisation. Called after the page allocator has been initialised and
1398 * before smp_init(). 1398 * before smp_init().
1399 */ 1399 */
1400 void __init kmem_cache_init(void) 1400 void __init kmem_cache_init(void)
1401 { 1401 {
1402 int i; 1402 int i;
1403 1403
1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1405 sizeof(struct rcu_head)); 1405 sizeof(struct rcu_head));
1406 kmem_cache = &kmem_cache_boot; 1406 kmem_cache = &kmem_cache_boot;
1407 setup_node_pointer(kmem_cache); 1407 setup_node_pointer(kmem_cache);
1408 1408
1409 if (num_possible_nodes() == 1) 1409 if (num_possible_nodes() == 1)
1410 use_alien_caches = 0; 1410 use_alien_caches = 0;
1411 1411
1412 for (i = 0; i < NUM_INIT_LISTS; i++) 1412 for (i = 0; i < NUM_INIT_LISTS; i++)
1413 kmem_cache_node_init(&init_kmem_cache_node[i]); 1413 kmem_cache_node_init(&init_kmem_cache_node[i]);
1414 1414
1415 set_up_node(kmem_cache, CACHE_CACHE); 1415 set_up_node(kmem_cache, CACHE_CACHE);
1416 1416
1417 /* 1417 /*
1418 * Fragmentation resistance on low memory - only use bigger 1418 * Fragmentation resistance on low memory - only use bigger
1419 * page orders on machines with more than 32MB of memory if 1419 * page orders on machines with more than 32MB of memory if
1420 * not overridden on the command line. 1420 * not overridden on the command line.
1421 */ 1421 */
1422 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1422 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
1423 slab_max_order = SLAB_MAX_ORDER_HI; 1423 slab_max_order = SLAB_MAX_ORDER_HI;
1424 1424
1425 /* Bootstrap is tricky, because several objects are allocated 1425 /* Bootstrap is tricky, because several objects are allocated
1426 * from caches that do not exist yet: 1426 * from caches that do not exist yet:
1427 * 1) initialize the kmem_cache cache: it contains the struct 1427 * 1) initialize the kmem_cache cache: it contains the struct
1428 * kmem_cache structures of all caches, except kmem_cache itself: 1428 * kmem_cache structures of all caches, except kmem_cache itself:
1429 * kmem_cache is statically allocated. 1429 * kmem_cache is statically allocated.
1430 * Initially an __init data area is used for the head array and the 1430 * Initially an __init data area is used for the head array and the
1431 * kmem_cache_node structures, it's replaced with a kmalloc allocated 1431 * kmem_cache_node structures, it's replaced with a kmalloc allocated
1432 * array at the end of the bootstrap. 1432 * array at the end of the bootstrap.
1433 * 2) Create the first kmalloc cache. 1433 * 2) Create the first kmalloc cache.
1434 * The struct kmem_cache for the new cache is allocated normally. 1434 * The struct kmem_cache for the new cache is allocated normally.
1435 * An __init data area is used for the head array. 1435 * An __init data area is used for the head array.
1436 * 3) Create the remaining kmalloc caches, with minimally sized 1436 * 3) Create the remaining kmalloc caches, with minimally sized
1437 * head arrays. 1437 * head arrays.
1438 * 4) Replace the __init data head arrays for kmem_cache and the first 1438 * 4) Replace the __init data head arrays for kmem_cache and the first
1439 * kmalloc cache with kmalloc allocated arrays. 1439 * kmalloc cache with kmalloc allocated arrays.
1440 * 5) Replace the __init data for kmem_cache_node for kmem_cache and 1440 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
1441 * the other caches with kmalloc allocated memory. 1441 * the other caches with kmalloc allocated memory.
1442 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1442 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1443 */ 1443 */
1444 1444
1445 /* 1) create the kmem_cache */ 1445 /* 1) create the kmem_cache */
1446 1446
1447 /* 1447 /*
1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1449 */ 1449 */
1450 create_boot_cache(kmem_cache, "kmem_cache", 1450 create_boot_cache(kmem_cache, "kmem_cache",
1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1452 nr_node_ids * sizeof(struct kmem_cache_node *), 1452 nr_node_ids * sizeof(struct kmem_cache_node *),
1453 SLAB_HWCACHE_ALIGN); 1453 SLAB_HWCACHE_ALIGN);
1454 list_add(&kmem_cache->list, &slab_caches); 1454 list_add(&kmem_cache->list, &slab_caches);
1455 1455
1456 /* 2+3) create the kmalloc caches */ 1456 /* 2+3) create the kmalloc caches */
1457 1457
1458 /* 1458 /*
1459 * Initialize the caches that provide memory for the array cache and the 1459 * Initialize the caches that provide memory for the array cache and the
1460 * kmem_cache_node structures first. Without this, further allocations will 1460 * kmem_cache_node structures first. Without this, further allocations will
1461 * BUG(). 1461 * BUG().
1462 */ 1462 */
1463 1463
1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", 1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); 1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1466 1466
1467 if (INDEX_AC != INDEX_NODE) 1467 if (INDEX_AC != INDEX_NODE)
1468 kmalloc_caches[INDEX_NODE] = 1468 kmalloc_caches[INDEX_NODE] =
1469 create_kmalloc_cache("kmalloc-node", 1469 create_kmalloc_cache("kmalloc-node",
1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1471 1471
1472 slab_early_init = 0; 1472 slab_early_init = 0;
1473 1473
1474 /* 4) Replace the bootstrap head arrays */ 1474 /* 4) Replace the bootstrap head arrays */
1475 { 1475 {
1476 struct array_cache *ptr; 1476 struct array_cache *ptr;
1477 1477
1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1479 1479
1480 memcpy(ptr, cpu_cache_get(kmem_cache), 1480 memcpy(ptr, cpu_cache_get(kmem_cache),
1481 sizeof(struct arraycache_init)); 1481 sizeof(struct arraycache_init));
1482 1482
1483 kmem_cache->array[smp_processor_id()] = ptr; 1483 kmem_cache->array[smp_processor_id()] = ptr;
1484 1484
1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1486 1486
1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) 1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1488 != &initarray_generic.cache); 1488 != &initarray_generic.cache);
1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), 1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1490 sizeof(struct arraycache_init)); 1490 sizeof(struct arraycache_init));
1491 1491
1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; 1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1493 } 1493 }
1494 /* 5) Replace the bootstrap kmem_cache_node */ 1494 /* 5) Replace the bootstrap kmem_cache_node */
1495 { 1495 {
1496 int nid; 1496 int nid;
1497 1497
1498 for_each_online_node(nid) { 1498 for_each_online_node(nid) {
1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1500 1500
1501 init_list(kmalloc_caches[INDEX_AC], 1501 init_list(kmalloc_caches[INDEX_AC],
1502 &init_kmem_cache_node[SIZE_AC + nid], nid); 1502 &init_kmem_cache_node[SIZE_AC + nid], nid);
1503 1503
1504 if (INDEX_AC != INDEX_NODE) { 1504 if (INDEX_AC != INDEX_NODE) {
1505 init_list(kmalloc_caches[INDEX_NODE], 1505 init_list(kmalloc_caches[INDEX_NODE],
1506 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1506 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1507 } 1507 }
1508 } 1508 }
1509 } 1509 }
1510 1510
1511 create_kmalloc_caches(ARCH_KMALLOC_FLAGS); 1511 create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1512 } 1512 }
1513 1513
1514 void __init kmem_cache_init_late(void) 1514 void __init kmem_cache_init_late(void)
1515 { 1515 {
1516 struct kmem_cache *cachep; 1516 struct kmem_cache *cachep;
1517 1517
1518 slab_state = UP; 1518 slab_state = UP;
1519 1519
1520 /* 6) resize the head arrays to their final sizes */ 1520 /* 6) resize the head arrays to their final sizes */
1521 mutex_lock(&slab_mutex); 1521 mutex_lock(&slab_mutex);
1522 list_for_each_entry(cachep, &slab_caches, list) 1522 list_for_each_entry(cachep, &slab_caches, list)
1523 if (enable_cpucache(cachep, GFP_NOWAIT)) 1523 if (enable_cpucache(cachep, GFP_NOWAIT))
1524 BUG(); 1524 BUG();
1525 mutex_unlock(&slab_mutex); 1525 mutex_unlock(&slab_mutex);
1526 1526
1527 /* Done! */ 1527 /* Done! */
1528 slab_state = FULL; 1528 slab_state = FULL;
1529 1529
1530 /* 1530 /*
1531 * Register a cpu startup notifier callback that initializes 1531 * Register a cpu startup notifier callback that initializes
1532 * cpu_cache_get for all new cpus 1532 * cpu_cache_get for all new cpus
1533 */ 1533 */
1534 register_cpu_notifier(&cpucache_notifier); 1534 register_cpu_notifier(&cpucache_notifier);
1535 1535
1536 #ifdef CONFIG_NUMA 1536 #ifdef CONFIG_NUMA
1537 /* 1537 /*
1538 * Register a memory hotplug callback that initializes and drains 1538 * Register a memory hotplug callback that initializes and drains
1539 * the per-node slab structures. 1539 * the per-node slab structures.
1540 */ 1540 */
1541 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 1541 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1542 #endif 1542 #endif
1543 1543
1544 /* 1544 /*
1545 * The reap timers are started later, with a module init call: That part 1545 * The reap timers are started later, with a module init call: That part
1546 * of the kernel is not yet operational. 1546 * of the kernel is not yet operational.
1547 */ 1547 */
1548 } 1548 }
1549 1549
1550 static int __init cpucache_init(void) 1550 static int __init cpucache_init(void)
1551 { 1551 {
1552 int cpu; 1552 int cpu;
1553 1553
1554 /* 1554 /*
1555 * Register the timers that return unneeded pages to the page allocator 1555 * Register the timers that return unneeded pages to the page allocator
1556 */ 1556 */
1557 for_each_online_cpu(cpu) 1557 for_each_online_cpu(cpu)
1558 start_cpu_timer(cpu); 1558 start_cpu_timer(cpu);
1559 1559
1560 /* Done! */ 1560 /* Done! */
1561 slab_state = FULL; 1561 slab_state = FULL;
1562 return 0; 1562 return 0;
1563 } 1563 }
1564 __initcall(cpucache_init); 1564 __initcall(cpucache_init);
1565 1565
1566 static noinline void 1566 static noinline void
1567 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1567 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1568 { 1568 {
1569 #if DEBUG 1569 #if DEBUG
1570 struct kmem_cache_node *n; 1570 struct kmem_cache_node *n;
1571 struct page *page; 1571 struct page *page;
1572 unsigned long flags; 1572 unsigned long flags;
1573 int node; 1573 int node;
1574 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1574 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1575 DEFAULT_RATELIMIT_BURST); 1575 DEFAULT_RATELIMIT_BURST);
1576 1576
1577 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1577 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
1578 return; 1578 return;
1579 1579
1580 printk(KERN_WARNING 1580 printk(KERN_WARNING
1581 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1581 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1582 nodeid, gfpflags); 1582 nodeid, gfpflags);
1583 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1583 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1584 cachep->name, cachep->size, cachep->gfporder); 1584 cachep->name, cachep->size, cachep->gfporder);
1585 1585
1586 for_each_kmem_cache_node(cachep, node, n) { 1586 for_each_kmem_cache_node(cachep, node, n) {
1587 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1587 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1588 unsigned long active_slabs = 0, num_slabs = 0; 1588 unsigned long active_slabs = 0, num_slabs = 0;
1589 1589
1590 spin_lock_irqsave(&n->list_lock, flags); 1590 spin_lock_irqsave(&n->list_lock, flags);
1591 list_for_each_entry(page, &n->slabs_full, lru) { 1591 list_for_each_entry(page, &n->slabs_full, lru) {
1592 active_objs += cachep->num; 1592 active_objs += cachep->num;
1593 active_slabs++; 1593 active_slabs++;
1594 } 1594 }
1595 list_for_each_entry(page, &n->slabs_partial, lru) { 1595 list_for_each_entry(page, &n->slabs_partial, lru) {
1596 active_objs += page->active; 1596 active_objs += page->active;
1597 active_slabs++; 1597 active_slabs++;
1598 } 1598 }
1599 list_for_each_entry(page, &n->slabs_free, lru) 1599 list_for_each_entry(page, &n->slabs_free, lru)
1600 num_slabs++; 1600 num_slabs++;
1601 1601
1602 free_objects += n->free_objects; 1602 free_objects += n->free_objects;
1603 spin_unlock_irqrestore(&n->list_lock, flags); 1603 spin_unlock_irqrestore(&n->list_lock, flags);
1604 1604
1605 num_slabs += active_slabs; 1605 num_slabs += active_slabs;
1606 num_objs = num_slabs * cachep->num; 1606 num_objs = num_slabs * cachep->num;
1607 printk(KERN_WARNING 1607 printk(KERN_WARNING
1608 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", 1608 " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1609 node, active_slabs, num_slabs, active_objs, num_objs, 1609 node, active_slabs, num_slabs, active_objs, num_objs,
1610 free_objects); 1610 free_objects);
1611 } 1611 }
1612 #endif 1612 #endif
1613 } 1613 }
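/*
 * Illustrative sketch, not kernel code: a worked example (made-up numbers) of
 * the per-node summary printed above.  Full and partial slabs count as active
 * slabs, free slabs only add capacity, and num_objs is the total slab count
 * times the objects per slab (cachep->num).
 */
#include <stdio.h>

int main(void)
{
	unsigned long objs_per_slab = 32;			/* cachep->num */
	unsigned long full = 10, partial = 3, free_slabs = 2;
	unsigned long partial_active[] = { 5, 17, 30 };		/* page->active */

	unsigned long active_slabs = full + partial;
	unsigned long active_objs = full * objs_per_slab +
				    partial_active[0] + partial_active[1] +
				    partial_active[2];
	unsigned long num_slabs = active_slabs + free_slabs;
	unsigned long num_objs = num_slabs * objs_per_slab;

	/* Prints "slabs: 13/15, objs: 372/480" for these made-up numbers. */
	printf("slabs: %lu/%lu, objs: %lu/%lu\n",
	       active_slabs, num_slabs, active_objs, num_objs);
	return 0;
}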
1614 1614
1615 /* 1615 /*
1616 * Interface to system's page allocator. No need to hold the 1616 * Interface to system's page allocator. No need to hold the
1617 * kmem_cache_node ->list_lock. 1617 * kmem_cache_node ->list_lock.
1618 * 1618 *
1619 * If we requested dmaable memory, we will get it. Even if we 1619 * If we requested dmaable memory, we will get it. Even if we
1620 * did not request dmaable memory, we might get it, but that 1620 * did not request dmaable memory, we might get it, but that
1621 * would be relatively rare and ignorable. 1621 * would be relatively rare and ignorable.
1622 */ 1622 */
1623 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, 1623 static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1624 int nodeid) 1624 int nodeid)
1625 { 1625 {
1626 struct page *page; 1626 struct page *page;
1627 int nr_pages; 1627 int nr_pages;
1628 1628
1629 flags |= cachep->allocflags; 1629 flags |= cachep->allocflags;
1630 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1630 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1631 flags |= __GFP_RECLAIMABLE; 1631 flags |= __GFP_RECLAIMABLE;
1632 1632
1633 if (memcg_charge_slab(cachep, flags, cachep->gfporder)) 1633 if (memcg_charge_slab(cachep, flags, cachep->gfporder))
1634 return NULL; 1634 return NULL;
1635 1635
1636 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1636 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1637 if (!page) { 1637 if (!page) {
1638 memcg_uncharge_slab(cachep, cachep->gfporder); 1638 memcg_uncharge_slab(cachep, cachep->gfporder);
1639 slab_out_of_memory(cachep, flags, nodeid); 1639 slab_out_of_memory(cachep, flags, nodeid);
1640 return NULL; 1640 return NULL;
1641 } 1641 }
1642 1642
1643 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1643 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1644 if (unlikely(page->pfmemalloc)) 1644 if (unlikely(page->pfmemalloc))
1645 pfmemalloc_active = true; 1645 pfmemalloc_active = true;
1646 1646
1647 nr_pages = (1 << cachep->gfporder); 1647 nr_pages = (1 << cachep->gfporder);
1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1649 add_zone_page_state(page_zone(page), 1649 add_zone_page_state(page_zone(page),
1650 NR_SLAB_RECLAIMABLE, nr_pages); 1650 NR_SLAB_RECLAIMABLE, nr_pages);
1651 else 1651 else
1652 add_zone_page_state(page_zone(page), 1652 add_zone_page_state(page_zone(page),
1653 NR_SLAB_UNRECLAIMABLE, nr_pages); 1653 NR_SLAB_UNRECLAIMABLE, nr_pages);
1654 __SetPageSlab(page); 1654 __SetPageSlab(page);
1655 if (page->pfmemalloc) 1655 if (page->pfmemalloc)
1656 SetPageSlabPfmemalloc(page); 1656 SetPageSlabPfmemalloc(page);
1657 1657
1658 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1658 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1659 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1659 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1660 1660
1661 if (cachep->ctor) 1661 if (cachep->ctor)
1662 kmemcheck_mark_uninitialized_pages(page, nr_pages); 1662 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1663 else 1663 else
1664 kmemcheck_mark_unallocated_pages(page, nr_pages); 1664 kmemcheck_mark_unallocated_pages(page, nr_pages);
1665 } 1665 }
1666 1666
1667 return page; 1667 return page;
1668 } 1668 }
1669 1669
1670 /* 1670 /*
1671 * Interface to system's page release. 1671 * Interface to system's page release.
1672 */ 1672 */
1673 static void kmem_freepages(struct kmem_cache *cachep, struct page *page) 1673 static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1674 { 1674 {
1675 const unsigned long nr_freed = (1 << cachep->gfporder); 1675 const unsigned long nr_freed = (1 << cachep->gfporder);
1676 1676
1677 kmemcheck_free_shadow(page, cachep->gfporder); 1677 kmemcheck_free_shadow(page, cachep->gfporder);
1678 1678
1679 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1679 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1680 sub_zone_page_state(page_zone(page), 1680 sub_zone_page_state(page_zone(page),
1681 NR_SLAB_RECLAIMABLE, nr_freed); 1681 NR_SLAB_RECLAIMABLE, nr_freed);
1682 else 1682 else
1683 sub_zone_page_state(page_zone(page), 1683 sub_zone_page_state(page_zone(page),
1684 NR_SLAB_UNRECLAIMABLE, nr_freed); 1684 NR_SLAB_UNRECLAIMABLE, nr_freed);
1685 1685
1686 BUG_ON(!PageSlab(page)); 1686 BUG_ON(!PageSlab(page));
1687 __ClearPageSlabPfmemalloc(page); 1687 __ClearPageSlabPfmemalloc(page);
1688 __ClearPageSlab(page); 1688 __ClearPageSlab(page);
1689 page_mapcount_reset(page); 1689 page_mapcount_reset(page);
1690 page->mapping = NULL; 1690 page->mapping = NULL;
1691 1691
1692 if (current->reclaim_state) 1692 if (current->reclaim_state)
1693 current->reclaim_state->reclaimed_slab += nr_freed; 1693 current->reclaim_state->reclaimed_slab += nr_freed;
1694 __free_pages(page, cachep->gfporder); 1694 __free_pages(page, cachep->gfporder);
1695 memcg_uncharge_slab(cachep, cachep->gfporder); 1695 memcg_uncharge_slab(cachep, cachep->gfporder);
1696 } 1696 }
1697 1697
1698 static void kmem_rcu_free(struct rcu_head *head) 1698 static void kmem_rcu_free(struct rcu_head *head)
1699 { 1699 {
1700 struct kmem_cache *cachep; 1700 struct kmem_cache *cachep;
1701 struct page *page; 1701 struct page *page;
1702 1702
1703 page = container_of(head, struct page, rcu_head); 1703 page = container_of(head, struct page, rcu_head);
1704 cachep = page->slab_cache; 1704 cachep = page->slab_cache;
1705 1705
1706 kmem_freepages(cachep, page); 1706 kmem_freepages(cachep, page);
1707 } 1707 }
1708 1708
1709 #if DEBUG 1709 #if DEBUG
1710 1710
1711 #ifdef CONFIG_DEBUG_PAGEALLOC 1711 #ifdef CONFIG_DEBUG_PAGEALLOC
1712 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1712 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1713 unsigned long caller) 1713 unsigned long caller)
1714 { 1714 {
1715 int size = cachep->object_size; 1715 int size = cachep->object_size;
1716 1716
1717 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1717 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1718 1718
1719 if (size < 5 * sizeof(unsigned long)) 1719 if (size < 5 * sizeof(unsigned long))
1720 return; 1720 return;
1721 1721
1722 *addr++ = 0x12345678; 1722 *addr++ = 0x12345678;
1723 *addr++ = caller; 1723 *addr++ = caller;
1724 *addr++ = smp_processor_id(); 1724 *addr++ = smp_processor_id();
1725 size -= 3 * sizeof(unsigned long); 1725 size -= 3 * sizeof(unsigned long);
1726 { 1726 {
1727 unsigned long *sptr = &caller; 1727 unsigned long *sptr = &caller;
1728 unsigned long svalue; 1728 unsigned long svalue;
1729 1729
1730 while (!kstack_end(sptr)) { 1730 while (!kstack_end(sptr)) {
1731 svalue = *sptr++; 1731 svalue = *sptr++;
1732 if (kernel_text_address(svalue)) { 1732 if (kernel_text_address(svalue)) {
1733 *addr++ = svalue; 1733 *addr++ = svalue;
1734 size -= sizeof(unsigned long); 1734 size -= sizeof(unsigned long);
1735 if (size <= sizeof(unsigned long)) 1735 if (size <= sizeof(unsigned long))
1736 break; 1736 break;
1737 } 1737 }
1738 } 1738 }
1739 1739
1740 } 1740 }
1741 *addr++ = 0x87654321; 1741 *addr++ = 0x87654321;
1742 } 1742 }
1743 #endif 1743 #endif
1744 1744
1745 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1745 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1746 { 1746 {
1747 int size = cachep->object_size; 1747 int size = cachep->object_size;
1748 addr = &((char *)addr)[obj_offset(cachep)]; 1748 addr = &((char *)addr)[obj_offset(cachep)];
1749 1749
1750 memset(addr, val, size); 1750 memset(addr, val, size);
1751 *(unsigned char *)(addr + size - 1) = POISON_END; 1751 *(unsigned char *)(addr + size - 1) = POISON_END;
1752 } 1752 }
1753 1753
1754 static void dump_line(char *data, int offset, int limit) 1754 static void dump_line(char *data, int offset, int limit)
1755 { 1755 {
1756 int i; 1756 int i;
1757 unsigned char error = 0; 1757 unsigned char error = 0;
1758 int bad_count = 0; 1758 int bad_count = 0;
1759 1759
1760 printk(KERN_ERR "%03x: ", offset); 1760 printk(KERN_ERR "%03x: ", offset);
1761 for (i = 0; i < limit; i++) { 1761 for (i = 0; i < limit; i++) {
1762 if (data[offset + i] != POISON_FREE) { 1762 if (data[offset + i] != POISON_FREE) {
1763 error = data[offset + i]; 1763 error = data[offset + i];
1764 bad_count++; 1764 bad_count++;
1765 } 1765 }
1766 } 1766 }
1767 print_hex_dump(KERN_CONT, "", 0, 16, 1, 1767 print_hex_dump(KERN_CONT, "", 0, 16, 1,
1768 &data[offset], limit, 1); 1768 &data[offset], limit, 1);
1769 1769
1770 if (bad_count == 1) { 1770 if (bad_count == 1) {
1771 error ^= POISON_FREE; 1771 error ^= POISON_FREE;
1772 if (!(error & (error - 1))) { 1772 if (!(error & (error - 1))) {
1773 printk(KERN_ERR "Single bit error detected. Probably " 1773 printk(KERN_ERR "Single bit error detected. Probably "
1774 "bad RAM.\n"); 1774 "bad RAM.\n");
1775 #ifdef CONFIG_X86 1775 #ifdef CONFIG_X86
1776 printk(KERN_ERR "Run memtest86+ or a similar memory " 1776 printk(KERN_ERR "Run memtest86+ or a similar memory "
1777 "test tool.\n"); 1777 "test tool.\n");
1778 #else 1778 #else
1779 printk(KERN_ERR "Run a memory test tool.\n"); 1779 printk(KERN_ERR "Run a memory test tool.\n");
1780 #endif 1780 #endif
1781 } 1781 }
1782 } 1782 }
1783 } 1783 }
1784 #endif 1784 #endif
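/*
 * Illustrative sketch, not kernel code: dump_line() above reports "probably
 * bad RAM" when exactly one byte differs from POISON_FREE and the difference
 * is a single flipped bit.  x & (x - 1) clears the lowest set bit, so it is
 * zero exactly when at most one bit is set.  POISON_FREE is 0x6b in
 * include/linux/poison.h.
 */
#include <stdbool.h>
#include <stdio.h>

#define POISON_FREE 0x6b

static bool single_bit_flip(unsigned char seen)
{
	unsigned char diff = seen ^ POISON_FREE;

	return diff && !(diff & (diff - 1));	/* exactly one bit differs */
}

int main(void)
{
	printf("%d %d\n",
	       single_bit_flip(POISON_FREE ^ 0x08),	/* one flipped bit: 1 */
	       single_bit_flip(0x00));			/* several bits: 0 */
	return 0;
}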
1785 1785
1786 #if DEBUG 1786 #if DEBUG
1787 1787
1788 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1788 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1789 { 1789 {
1790 int i, size; 1790 int i, size;
1791 char *realobj; 1791 char *realobj;
1792 1792
1793 if (cachep->flags & SLAB_RED_ZONE) { 1793 if (cachep->flags & SLAB_RED_ZONE) {
1794 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", 1794 printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1795 *dbg_redzone1(cachep, objp), 1795 *dbg_redzone1(cachep, objp),
1796 *dbg_redzone2(cachep, objp)); 1796 *dbg_redzone2(cachep, objp));
1797 } 1797 }
1798 1798
1799 if (cachep->flags & SLAB_STORE_USER) { 1799 if (cachep->flags & SLAB_STORE_USER) {
1800 printk(KERN_ERR "Last user: [<%p>](%pSR)\n", 1800 printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
1801 *dbg_userword(cachep, objp), 1801 *dbg_userword(cachep, objp),
1802 *dbg_userword(cachep, objp)); 1802 *dbg_userword(cachep, objp));
1803 } 1803 }
1804 realobj = (char *)objp + obj_offset(cachep); 1804 realobj = (char *)objp + obj_offset(cachep);
1805 size = cachep->object_size; 1805 size = cachep->object_size;
1806 for (i = 0; i < size && lines; i += 16, lines--) { 1806 for (i = 0; i < size && lines; i += 16, lines--) {
1807 int limit; 1807 int limit;
1808 limit = 16; 1808 limit = 16;
1809 if (i + limit > size) 1809 if (i + limit > size)
1810 limit = size - i; 1810 limit = size - i;
1811 dump_line(realobj, i, limit); 1811 dump_line(realobj, i, limit);
1812 } 1812 }
1813 } 1813 }
1814 1814
1815 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1815 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1816 { 1816 {
1817 char *realobj; 1817 char *realobj;
1818 int size, i; 1818 int size, i;
1819 int lines = 0; 1819 int lines = 0;
1820 1820
1821 realobj = (char *)objp + obj_offset(cachep); 1821 realobj = (char *)objp + obj_offset(cachep);
1822 size = cachep->object_size; 1822 size = cachep->object_size;
1823 1823
1824 for (i = 0; i < size; i++) { 1824 for (i = 0; i < size; i++) {
1825 char exp = POISON_FREE; 1825 char exp = POISON_FREE;
1826 if (i == size - 1) 1826 if (i == size - 1)
1827 exp = POISON_END; 1827 exp = POISON_END;
1828 if (realobj[i] != exp) { 1828 if (realobj[i] != exp) {
1829 int limit; 1829 int limit;
1830 /* Mismatch ! */ 1830 /* Mismatch ! */
1831 /* Print header */ 1831 /* Print header */
1832 if (lines == 0) { 1832 if (lines == 0) {
1833 printk(KERN_ERR 1833 printk(KERN_ERR
1834 "Slab corruption (%s): %s start=%p, len=%d\n", 1834 "Slab corruption (%s): %s start=%p, len=%d\n",
1835 print_tainted(), cachep->name, realobj, size); 1835 print_tainted(), cachep->name, realobj, size);
1836 print_objinfo(cachep, objp, 0); 1836 print_objinfo(cachep, objp, 0);
1837 } 1837 }
1838 /* Hexdump the affected line */ 1838 /* Hexdump the affected line */
1839 i = (i / 16) * 16; 1839 i = (i / 16) * 16;
1840 limit = 16; 1840 limit = 16;
1841 if (i + limit > size) 1841 if (i + limit > size)
1842 limit = size - i; 1842 limit = size - i;
1843 dump_line(realobj, i, limit); 1843 dump_line(realobj, i, limit);
1844 i += 16; 1844 i += 16;
1845 lines++; 1845 lines++;
1846 /* Limit to 5 lines */ 1846 /* Limit to 5 lines */
1847 if (lines > 5) 1847 if (lines > 5)
1848 break; 1848 break;
1849 } 1849 }
1850 } 1850 }
1851 if (lines != 0) { 1851 if (lines != 0) {
1852 /* Print some data about the neighboring objects, if they 1852 /* Print some data about the neighboring objects, if they
1853 * exist: 1853 * exist:
1854 */ 1854 */
1855 struct page *page = virt_to_head_page(objp); 1855 struct page *page = virt_to_head_page(objp);
1856 unsigned int objnr; 1856 unsigned int objnr;
1857 1857
1858 objnr = obj_to_index(cachep, page, objp); 1858 objnr = obj_to_index(cachep, page, objp);
1859 if (objnr) { 1859 if (objnr) {
1860 objp = index_to_obj(cachep, page, objnr - 1); 1860 objp = index_to_obj(cachep, page, objnr - 1);
1861 realobj = (char *)objp + obj_offset(cachep); 1861 realobj = (char *)objp + obj_offset(cachep);
1862 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1862 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1863 realobj, size); 1863 realobj, size);
1864 print_objinfo(cachep, objp, 2); 1864 print_objinfo(cachep, objp, 2);
1865 } 1865 }
1866 if (objnr + 1 < cachep->num) { 1866 if (objnr + 1 < cachep->num) {
1867 objp = index_to_obj(cachep, page, objnr + 1); 1867 objp = index_to_obj(cachep, page, objnr + 1);
1868 realobj = (char *)objp + obj_offset(cachep); 1868 realobj = (char *)objp + obj_offset(cachep);
1869 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1869 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1870 realobj, size); 1870 realobj, size);
1871 print_objinfo(cachep, objp, 2); 1871 print_objinfo(cachep, objp, 2);
1872 } 1872 }
1873 } 1873 }
1874 } 1874 }
1875 #endif 1875 #endif
1876 1876
1877 #if DEBUG 1877 #if DEBUG
1878 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1878 static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1879 struct page *page) 1879 struct page *page)
1880 { 1880 {
1881 int i; 1881 int i;
1882 for (i = 0; i < cachep->num; i++) { 1882 for (i = 0; i < cachep->num; i++) {
1883 void *objp = index_to_obj(cachep, page, i); 1883 void *objp = index_to_obj(cachep, page, i);
1884 1884
1885 if (cachep->flags & SLAB_POISON) { 1885 if (cachep->flags & SLAB_POISON) {
1886 #ifdef CONFIG_DEBUG_PAGEALLOC 1886 #ifdef CONFIG_DEBUG_PAGEALLOC
1887 if (cachep->size % PAGE_SIZE == 0 && 1887 if (cachep->size % PAGE_SIZE == 0 &&
1888 OFF_SLAB(cachep)) 1888 OFF_SLAB(cachep))
1889 kernel_map_pages(virt_to_page(objp), 1889 kernel_map_pages(virt_to_page(objp),
1890 cachep->size / PAGE_SIZE, 1); 1890 cachep->size / PAGE_SIZE, 1);
1891 else 1891 else
1892 check_poison_obj(cachep, objp); 1892 check_poison_obj(cachep, objp);
1893 #else 1893 #else
1894 check_poison_obj(cachep, objp); 1894 check_poison_obj(cachep, objp);
1895 #endif 1895 #endif
1896 } 1896 }
1897 if (cachep->flags & SLAB_RED_ZONE) { 1897 if (cachep->flags & SLAB_RED_ZONE) {
1898 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1898 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1899 slab_error(cachep, "start of a freed object " 1899 slab_error(cachep, "start of a freed object "
1900 "was overwritten"); 1900 "was overwritten");
1901 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1901 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1902 slab_error(cachep, "end of a freed object " 1902 slab_error(cachep, "end of a freed object "
1903 "was overwritten"); 1903 "was overwritten");
1904 } 1904 }
1905 } 1905 }
1906 } 1906 }
1907 #else 1907 #else
1908 static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1908 static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1909 struct page *page) 1909 struct page *page)
1910 { 1910 {
1911 } 1911 }
1912 #endif 1912 #endif
1913 1913
1914 /** 1914 /**
1915 * slab_destroy - destroy and release all objects in a slab 1915 * slab_destroy - destroy and release all objects in a slab
1916 * @cachep: cache pointer being destroyed 1916 * @cachep: cache pointer being destroyed
1917 * @page: page pointer being destroyed 1917 * @page: page pointer being destroyed
1918 * 1918 *
1919 * Destroy all the objs in a slab page, and release the mem back to the system. 1919 * Destroy all the objs in a slab page, and release the mem back to the system.
1920 * Before calling, the slab page must have been unlinked from the cache. The 1920 * Before calling, the slab page must have been unlinked from the cache. The
1921 * kmem_cache_node ->list_lock is not held/needed. 1921 * kmem_cache_node ->list_lock is not held/needed.
1922 */ 1922 */
1923 static void slab_destroy(struct kmem_cache *cachep, struct page *page) 1923 static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1924 { 1924 {
1925 void *freelist; 1925 void *freelist;
1926 1926
1927 freelist = page->freelist; 1927 freelist = page->freelist;
1928 slab_destroy_debugcheck(cachep, page); 1928 slab_destroy_debugcheck(cachep, page);
1929 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1929 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1930 struct rcu_head *head; 1930 struct rcu_head *head;
1931 1931
1932 /* 1932 /*
1933 * RCU free overloads the RCU head over the LRU. 1933 * RCU free overloads the RCU head over the LRU.
1934 * slab_page has been overloaded over the LRU, 1934 * slab_page has been overloaded over the LRU,
1935 * however it is not used from now on so that 1935 * however it is not used from now on so that
1936 * we can use it safely. 1936 * we can use it safely.
1937 */ 1937 */
1938 head = (void *)&page->rcu_head; 1938 head = (void *)&page->rcu_head;
1939 call_rcu(head, kmem_rcu_free); 1939 call_rcu(head, kmem_rcu_free);
1940 1940
1941 } else { 1941 } else {
1942 kmem_freepages(cachep, page); 1942 kmem_freepages(cachep, page);
1943 } 1943 }
1944 1944
1945 /* 1945 /*
1946 * From now on, we don't use the freelist, 1946 * From now on, we don't use the freelist,
1947 * although the actual page can be freed in RCU context. 1947 * although the actual page can be freed in RCU context.
1948 */ 1948 */
1949 if (OFF_SLAB(cachep)) 1949 if (OFF_SLAB(cachep))
1950 kmem_cache_free(cachep->freelist_cache, freelist); 1950 kmem_cache_free(cachep->freelist_cache, freelist);
1951 } 1951 }
1952 1952
1953 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) 1953 static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1954 { 1954 {
1955 struct page *page, *n; 1955 struct page *page, *n;
1956 1956
1957 list_for_each_entry_safe(page, n, list, lru) { 1957 list_for_each_entry_safe(page, n, list, lru) {
1958 list_del(&page->lru); 1958 list_del(&page->lru);
1959 slab_destroy(cachep, page); 1959 slab_destroy(cachep, page);
1960 } 1960 }
1961 } 1961 }
1962 1962
1963 /** 1963 /**
1964 * calculate_slab_order - calculate size (page order) of slabs 1964 * calculate_slab_order - calculate size (page order) of slabs
1965 * @cachep: pointer to the cache that is being created 1965 * @cachep: pointer to the cache that is being created
1966 * @size: size of objects to be created in this cache. 1966 * @size: size of objects to be created in this cache.
1967 * @align: required alignment for the objects. 1967 * @align: required alignment for the objects.
1968 * @flags: slab allocation flags 1968 * @flags: slab allocation flags
1969 * 1969 *
1970 * Also calculates the number of objects per slab. 1970 * Also calculates the number of objects per slab.
1971 * 1971 *
1972 * This could be made much more intelligent. For now, try to avoid using 1972 * This could be made much more intelligent. For now, try to avoid using
1973 * high order pages for slabs. When the gfp() functions are more friendly 1973 * high order pages for slabs. When the gfp() functions are more friendly
1974 * towards high-order requests, this should be changed. 1974 * towards high-order requests, this should be changed.
1975 */ 1975 */
1976 static size_t calculate_slab_order(struct kmem_cache *cachep, 1976 static size_t calculate_slab_order(struct kmem_cache *cachep,
1977 size_t size, size_t align, unsigned long flags) 1977 size_t size, size_t align, unsigned long flags)
1978 { 1978 {
1979 unsigned long offslab_limit; 1979 unsigned long offslab_limit;
1980 size_t left_over = 0; 1980 size_t left_over = 0;
1981 int gfporder; 1981 int gfporder;
1982 1982
1983 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 1983 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1984 unsigned int num; 1984 unsigned int num;
1985 size_t remainder; 1985 size_t remainder;
1986 1986
1987 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1987 cache_estimate(gfporder, size, align, flags, &remainder, &num);
1988 if (!num) 1988 if (!num)
1989 continue; 1989 continue;
1990 1990
1991 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ 1991 /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
1992 if (num > SLAB_OBJ_MAX_NUM) 1992 if (num > SLAB_OBJ_MAX_NUM)
1993 break; 1993 break;
1994 1994
1995 if (flags & CFLGS_OFF_SLAB) { 1995 if (flags & CFLGS_OFF_SLAB) {
1996 size_t freelist_size_per_obj = sizeof(freelist_idx_t); 1996 size_t freelist_size_per_obj = sizeof(freelist_idx_t);
1997 /* 1997 /*
1998 * Max number of objs-per-slab for caches which 1998 * Max number of objs-per-slab for caches which
1999 * use off-slab slabs. Needed to avoid a possible 1999 * use off-slab slabs. Needed to avoid a possible
2000 * looping condition in cache_grow(). 2000 * looping condition in cache_grow().
2001 */ 2001 */
2002 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 2002 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
2003 freelist_size_per_obj += sizeof(char); 2003 freelist_size_per_obj += sizeof(char);
2004 offslab_limit = size; 2004 offslab_limit = size;
2005 offslab_limit /= freelist_size_per_obj; 2005 offslab_limit /= freelist_size_per_obj;
2006 2006
2007 if (num > offslab_limit) 2007 if (num > offslab_limit)
2008 break; 2008 break;
2009 } 2009 }
2010 2010
2011 /* Found something acceptable - save it away */ 2011 /* Found something acceptable - save it away */
2012 cachep->num = num; 2012 cachep->num = num;
2013 cachep->gfporder = gfporder; 2013 cachep->gfporder = gfporder;
2014 left_over = remainder; 2014 left_over = remainder;
2015 2015
2016 /* 2016 /*
2017 * A VFS-reclaimable slab tends to have most allocations 2017 * A VFS-reclaimable slab tends to have most allocations
2018 * as GFP_NOFS and we really don't want to have to be allocating 2018 * as GFP_NOFS and we really don't want to have to be allocating
2019 * higher-order pages when we are unable to shrink dcache. 2019 * higher-order pages when we are unable to shrink dcache.
2020 */ 2020 */
2021 if (flags & SLAB_RECLAIM_ACCOUNT) 2021 if (flags & SLAB_RECLAIM_ACCOUNT)
2022 break; 2022 break;
2023 2023
2024 /* 2024 /*
2025 * Large number of objects is good, but very large slabs are 2025 * Large number of objects is good, but very large slabs are
2026 * currently bad for the gfp()s. 2026 * currently bad for the gfp()s.
2027 */ 2027 */
2028 if (gfporder >= slab_max_order) 2028 if (gfporder >= slab_max_order)
2029 break; 2029 break;
2030 2030
2031 /* 2031 /*
2032 * Acceptable internal fragmentation? 2032 * Acceptable internal fragmentation?
2033 */ 2033 */
2034 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2034 if (left_over * 8 <= (PAGE_SIZE << gfporder))
2035 break; 2035 break;
2036 } 2036 }
2037 return left_over; 2037 return left_over;
2038 } 2038 }
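/*
 * Illustrative sketch, not kernel code: a deliberately simplified version of
 * the order-selection loop above.  It ignores freelist and alignment overhead
 * (the real cache_estimate() does not) and only demonstrates the "smallest
 * order whose leftover is at most 1/8 of the slab" criterion.
 */
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096UL
#define FAKE_MAX_ORDER 10

int main(void)
{
	unsigned long size = 700;	/* made-up object size */
	int order;

	for (order = 0; order <= FAKE_MAX_ORDER; order++) {
		unsigned long slab = FAKE_PAGE_SIZE << order;
		unsigned long num = slab / size;
		unsigned long left_over = slab - num * size;

		if (!num)
			continue;
		/* Acceptable internal fragmentation? (same test as above) */
		if (left_over * 8 <= slab) {
			/* Prints "order 1: 11 objs, 492 bytes left over". */
			printf("order %d: %lu objs, %lu bytes left over\n",
			       order, num, left_over);
			break;
		}
	}
	return 0;
}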
2039 2039
2040 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2040 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2041 { 2041 {
2042 if (slab_state >= FULL) 2042 if (slab_state >= FULL)
2043 return enable_cpucache(cachep, gfp); 2043 return enable_cpucache(cachep, gfp);
2044 2044
2045 if (slab_state == DOWN) { 2045 if (slab_state == DOWN) {
2046 /* 2046 /*
2047 * Note: Creation of first cache (kmem_cache). 2047 * Note: Creation of first cache (kmem_cache).
2048 * The setup_node is taken care 2048 * The setup_node is taken care
2049 * of by the caller of __kmem_cache_create 2049 * of by the caller of __kmem_cache_create
2050 */ 2050 */
2051 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2051 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052 slab_state = PARTIAL; 2052 slab_state = PARTIAL;
2053 } else if (slab_state == PARTIAL) { 2053 } else if (slab_state == PARTIAL) {
2054 /* 2054 /*
2055 * Note: the second kmem_cache_create must create the cache 2055 * Note: the second kmem_cache_create must create the cache
2056 * that's used by kmalloc(24), otherwise the creation of 2056 * that's used by kmalloc(24), otherwise the creation of
2057 * further caches will BUG(). 2057 * further caches will BUG().
2058 */ 2058 */
2059 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2059 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2060 2060
2061 /* 2061 /*
2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is 2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2063 * the second cache, then we need to set up all its nodes, 2063 * the second cache, then we need to set up all its nodes,
2064 * otherwise the creation of further caches will BUG(). 2064 * otherwise the creation of further caches will BUG().
2065 */ 2065 */
2066 set_up_node(cachep, SIZE_AC); 2066 set_up_node(cachep, SIZE_AC);
2067 if (INDEX_AC == INDEX_NODE) 2067 if (INDEX_AC == INDEX_NODE)
2068 slab_state = PARTIAL_NODE; 2068 slab_state = PARTIAL_NODE;
2069 else 2069 else
2070 slab_state = PARTIAL_ARRAYCACHE; 2070 slab_state = PARTIAL_ARRAYCACHE;
2071 } else { 2071 } else {
2072 /* Remaining boot caches */ 2072 /* Remaining boot caches */
2073 cachep->array[smp_processor_id()] = 2073 cachep->array[smp_processor_id()] =
2074 kmalloc(sizeof(struct arraycache_init), gfp); 2074 kmalloc(sizeof(struct arraycache_init), gfp);
2075 2075
2076 if (slab_state == PARTIAL_ARRAYCACHE) { 2076 if (slab_state == PARTIAL_ARRAYCACHE) {
2077 set_up_node(cachep, SIZE_NODE); 2077 set_up_node(cachep, SIZE_NODE);
2078 slab_state = PARTIAL_NODE; 2078 slab_state = PARTIAL_NODE;
2079 } else { 2079 } else {
2080 int node; 2080 int node;
2081 for_each_online_node(node) { 2081 for_each_online_node(node) {
2082 cachep->node[node] = 2082 cachep->node[node] =
2083 kmalloc_node(sizeof(struct kmem_cache_node), 2083 kmalloc_node(sizeof(struct kmem_cache_node),
2084 gfp, node); 2084 gfp, node);
2085 BUG_ON(!cachep->node[node]); 2085 BUG_ON(!cachep->node[node]);
2086 kmem_cache_node_init(cachep->node[node]); 2086 kmem_cache_node_init(cachep->node[node]);
2087 } 2087 }
2088 } 2088 }
2089 } 2089 }
2090 cachep->node[numa_mem_id()]->next_reap = 2090 cachep->node[numa_mem_id()]->next_reap =
2091 jiffies + REAPTIMEOUT_NODE + 2091 jiffies + REAPTIMEOUT_NODE +
2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
2093 2093
2094 cpu_cache_get(cachep)->avail = 0; 2094 cpu_cache_get(cachep)->avail = 0;
2095 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2095 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2096 cpu_cache_get(cachep)->batchcount = 1; 2096 cpu_cache_get(cachep)->batchcount = 1;
2097 cpu_cache_get(cachep)->touched = 0; 2097 cpu_cache_get(cachep)->touched = 0;
2098 cachep->batchcount = 1; 2098 cachep->batchcount = 1;
2099 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2099 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2100 return 0; 2100 return 0;
2101 } 2101 }
2102 2102
2103 /** 2103 /**
2104 * __kmem_cache_create - Create a cache. 2104 * __kmem_cache_create - Create a cache.
2105 * @cachep: cache management descriptor 2105 * @cachep: cache management descriptor
2106 * @flags: SLAB flags 2106 * @flags: SLAB flags
2107 * 2107 *
2108 * Returns zero on success, or an error code on failure. 2108 * Returns zero on success, or an error code on failure.
2109 * Cannot be called within an interrupt, but can be interrupted. 2109 * Cannot be called within an interrupt, but can be interrupted.
2110 * The @ctor is run when new pages are allocated by the cache. 2110 * The @ctor is run when new pages are allocated by the cache.
2111 * 2111 *
2112 * The flags are 2112 * The flags are
2113 * 2113 *
2114 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2114 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2115 * to catch references to uninitialised memory. 2115 * to catch references to uninitialised memory.
2116 * 2116 *
2117 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2117 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2118 * for buffer overruns. 2118 * for buffer overruns.
2119 * 2119 *
2120 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2120 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2121 * cacheline. This can be beneficial if you're counting cycles as closely 2121 * cacheline. This can be beneficial if you're counting cycles as closely
2122 * as davem. 2122 * as davem.
2123 */ 2123 */
2124 int 2124 int
2125 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2125 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2126 { 2126 {
2127 size_t left_over, freelist_size, ralign; 2127 size_t left_over, freelist_size;
2128 size_t ralign = BYTES_PER_WORD;
2128 gfp_t gfp; 2129 gfp_t gfp;
2129 int err; 2130 int err;
2130 size_t size = cachep->size; 2131 size_t size = cachep->size;
2131 2132
2132 #if DEBUG 2133 #if DEBUG
2133 #if FORCED_DEBUG 2134 #if FORCED_DEBUG
2134 /* 2135 /*
2135 * Enable redzoning and last user accounting, except for caches with 2136 * Enable redzoning and last user accounting, except for caches with
2136 * large objects, if the increased size would increase the object size 2137 * large objects, if the increased size would increase the object size
2137 * above the next power of two: caches with object sizes just above a 2138 * above the next power of two: caches with object sizes just above a
2138 * power of two have a significant amount of internal fragmentation. 2139 * power of two have a significant amount of internal fragmentation.
2139 */ 2140 */
2140 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2141 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2141 2 * sizeof(unsigned long long))) 2142 2 * sizeof(unsigned long long)))
2142 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2143 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2143 if (!(flags & SLAB_DESTROY_BY_RCU)) 2144 if (!(flags & SLAB_DESTROY_BY_RCU))
2144 flags |= SLAB_POISON; 2145 flags |= SLAB_POISON;
2145 #endif 2146 #endif
2146 if (flags & SLAB_DESTROY_BY_RCU) 2147 if (flags & SLAB_DESTROY_BY_RCU)
2147 BUG_ON(flags & SLAB_POISON); 2148 BUG_ON(flags & SLAB_POISON);
2148 #endif 2149 #endif
2149 2150
2150 /* 2151 /*
2151 * Check that size is in terms of words. This is needed to avoid 2152 * Check that size is in terms of words. This is needed to avoid
2152 * unaligned accesses for some archs when redzoning is used, and makes 2153 * unaligned accesses for some archs when redzoning is used, and makes
2153 * sure any on-slab bufctl's are also correctly aligned. 2154 * sure any on-slab bufctl's are also correctly aligned.
2154 */ 2155 */
2155 if (size & (BYTES_PER_WORD - 1)) { 2156 if (size & (BYTES_PER_WORD - 1)) {
2156 size += (BYTES_PER_WORD - 1); 2157 size += (BYTES_PER_WORD - 1);
2157 size &= ~(BYTES_PER_WORD - 1); 2158 size &= ~(BYTES_PER_WORD - 1);
2158 } 2159 }
2159
2160 /*
2161 * Redzoning and user store require word alignment or possibly larger.
2162 * Note this will be overridden by architecture or caller mandated
2163 * alignment if either is greater than BYTES_PER_WORD.
2164 */
2165 if (flags & SLAB_STORE_USER)
2166 ralign = BYTES_PER_WORD;
2167 2160
2168 if (flags & SLAB_RED_ZONE) { 2161 if (flags & SLAB_RED_ZONE) {
2169 ralign = REDZONE_ALIGN; 2162 ralign = REDZONE_ALIGN;
2170 /* If redzoning, ensure that the second redzone is suitably 2163 /* If redzoning, ensure that the second redzone is suitably
2171 * aligned, by adjusting the object size accordingly. */ 2164 * aligned, by adjusting the object size accordingly. */
2172 size += REDZONE_ALIGN - 1; 2165 size += REDZONE_ALIGN - 1;
2173 size &= ~(REDZONE_ALIGN - 1); 2166 size &= ~(REDZONE_ALIGN - 1);
2174 } 2167 }
2175 2168
2176 /* 3) caller mandated alignment */ 2169 /* 3) caller mandated alignment */
2177 if (ralign < cachep->align) { 2170 if (ralign < cachep->align) {
2178 ralign = cachep->align; 2171 ralign = cachep->align;
2179 } 2172 }
2180 /* disable debug if necessary */ 2173 /* disable debug if necessary */
2181 if (ralign > __alignof__(unsigned long long)) 2174 if (ralign > __alignof__(unsigned long long))
2182 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2175 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2183 /* 2176 /*
2184 * 4) Store it. 2177 * 4) Store it.
2185 */ 2178 */
2186 cachep->align = ralign; 2179 cachep->align = ralign;
2187 2180
2188 if (slab_is_available()) 2181 if (slab_is_available())
2189 gfp = GFP_KERNEL; 2182 gfp = GFP_KERNEL;
2190 else 2183 else
2191 gfp = GFP_NOWAIT; 2184 gfp = GFP_NOWAIT;
2192 2185
2193 setup_node_pointer(cachep); 2186 setup_node_pointer(cachep);
2194 #if DEBUG 2187 #if DEBUG
2195 2188
2196 /* 2189 /*
2197 * Both debugging options require word-alignment which is calculated 2190 * Both debugging options require word-alignment which is calculated
2198 * into align above. 2191 * into align above.
2199 */ 2192 */
2200 if (flags & SLAB_RED_ZONE) { 2193 if (flags & SLAB_RED_ZONE) {
2201 /* add space for red zone words */ 2194 /* add space for red zone words */
2202 cachep->obj_offset += sizeof(unsigned long long); 2195 cachep->obj_offset += sizeof(unsigned long long);
2203 size += 2 * sizeof(unsigned long long); 2196 size += 2 * sizeof(unsigned long long);
2204 } 2197 }
2205 if (flags & SLAB_STORE_USER) { 2198 if (flags & SLAB_STORE_USER) {
2206 /* user store requires one word storage behind the end of 2199 /* user store requires one word storage behind the end of
2207 * the real object. But if the second red zone needs to be 2200 * the real object. But if the second red zone needs to be
2208 * aligned to 64 bits, we must allow that much space. 2201 * aligned to 64 bits, we must allow that much space.
2209 */ 2202 */
2210 if (flags & SLAB_RED_ZONE) 2203 if (flags & SLAB_RED_ZONE)
2211 size += REDZONE_ALIGN; 2204 size += REDZONE_ALIGN;
2212 else 2205 else
2213 size += BYTES_PER_WORD; 2206 size += BYTES_PER_WORD;
2214 } 2207 }
2215 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2208 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2216 if (size >= kmalloc_size(INDEX_NODE + 1) 2209 if (size >= kmalloc_size(INDEX_NODE + 1)
2217 && cachep->object_size > cache_line_size() 2210 && cachep->object_size > cache_line_size()
2218 && ALIGN(size, cachep->align) < PAGE_SIZE) { 2211 && ALIGN(size, cachep->align) < PAGE_SIZE) {
2219 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); 2212 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2220 size = PAGE_SIZE; 2213 size = PAGE_SIZE;
2221 } 2214 }
2222 #endif 2215 #endif
2223 #endif 2216 #endif
2224 2217
2225 /* 2218 /*
2226 * Determine if the slab management is 'on' or 'off' slab. 2219 * Determine if the slab management is 'on' or 'off' slab.
2227 * (bootstrapping cannot cope with offslab caches so don't do 2220 * (bootstrapping cannot cope with offslab caches so don't do
2228 * it too early on. Always use on-slab management when 2221 * it too early on. Always use on-slab management when
2229 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) 2222 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2230 */ 2223 */
2231 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && 2224 if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
2232 !(flags & SLAB_NOLEAKTRACE)) 2225 !(flags & SLAB_NOLEAKTRACE))
2233 /* 2226 /*
2234 * Size is large, assume best to place the slab management obj 2227 * Size is large, assume best to place the slab management obj
2235 * off-slab (should allow better packing of objs). 2228 * off-slab (should allow better packing of objs).
2236 */ 2229 */
2237 flags |= CFLGS_OFF_SLAB; 2230 flags |= CFLGS_OFF_SLAB;
2238 2231
2239 size = ALIGN(size, cachep->align); 2232 size = ALIGN(size, cachep->align);
2240 /* 2233 /*
2241 * We should restrict the number of objects in a slab to implement 2234 * We should restrict the number of objects in a slab to implement
2242 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. 2235 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
2243 */ 2236 */
2244 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 2237 if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
2245 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); 2238 size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
2246 2239
2247 left_over = calculate_slab_order(cachep, size, cachep->align, flags); 2240 left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2248 2241
2249 if (!cachep->num) 2242 if (!cachep->num)
2250 return -E2BIG; 2243 return -E2BIG;
2251 2244
2252 freelist_size = calculate_freelist_size(cachep->num, cachep->align); 2245 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2253 2246
2254 /* 2247 /*
2255 * If the slab has been placed off-slab, and we have enough space then 2248 * If the slab has been placed off-slab, and we have enough space then
2256 * move it on-slab. This is at the expense of any extra colouring. 2249 * move it on-slab. This is at the expense of any extra colouring.
2257 */ 2250 */
2258 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { 2251 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
2259 flags &= ~CFLGS_OFF_SLAB; 2252 flags &= ~CFLGS_OFF_SLAB;
2260 left_over -= freelist_size; 2253 left_over -= freelist_size;
2261 } 2254 }
2262 2255
2263 if (flags & CFLGS_OFF_SLAB) { 2256 if (flags & CFLGS_OFF_SLAB) {
2264 /* really off slab. No need for manual alignment */ 2257 /* really off slab. No need for manual alignment */
2265 freelist_size = calculate_freelist_size(cachep->num, 0); 2258 freelist_size = calculate_freelist_size(cachep->num, 0);
2266 2259
2267 #ifdef CONFIG_PAGE_POISONING 2260 #ifdef CONFIG_PAGE_POISONING
2268 /* If we're going to use the generic kernel_map_pages() 2261 /* If we're going to use the generic kernel_map_pages()
2269 * poisoning, then it's going to smash the contents of 2262 * poisoning, then it's going to smash the contents of
2270 * the redzone and userword anyhow, so switch them off. 2263 * the redzone and userword anyhow, so switch them off.
2271 */ 2264 */
2272 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) 2265 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2273 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2266 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2274 #endif 2267 #endif
2275 } 2268 }
2276 2269
2277 cachep->colour_off = cache_line_size(); 2270 cachep->colour_off = cache_line_size();
2278 /* Offset must be a multiple of the alignment. */ 2271 /* Offset must be a multiple of the alignment. */
2279 if (cachep->colour_off < cachep->align) 2272 if (cachep->colour_off < cachep->align)
2280 cachep->colour_off = cachep->align; 2273 cachep->colour_off = cachep->align;
2281 cachep->colour = left_over / cachep->colour_off; 2274 cachep->colour = left_over / cachep->colour_off;
2282 cachep->freelist_size = freelist_size; 2275 cachep->freelist_size = freelist_size;
2283 cachep->flags = flags; 2276 cachep->flags = flags;
2284 cachep->allocflags = __GFP_COMP; 2277 cachep->allocflags = __GFP_COMP;
2285 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2278 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2286 cachep->allocflags |= GFP_DMA; 2279 cachep->allocflags |= GFP_DMA;
2287 cachep->size = size; 2280 cachep->size = size;
2288 cachep->reciprocal_buffer_size = reciprocal_value(size); 2281 cachep->reciprocal_buffer_size = reciprocal_value(size);
2289 2282
2290 if (flags & CFLGS_OFF_SLAB) { 2283 if (flags & CFLGS_OFF_SLAB) {
2291 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); 2284 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
2292 /* 2285 /*
2293 * This is a possibility for one of the kmalloc_{dma,}_caches. 2286 * This is a possibility for one of the kmalloc_{dma,}_caches.
2294 * But since we go off slab only for object size greater than 2287 * But since we go off slab only for object size greater than
2295 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created 2288 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
2296 * in ascending order, this should not happen at all. 2289 * in ascending order, this should not happen at all.
2297 * But leave a BUG_ON for some lucky dude. 2290 * But leave a BUG_ON for some lucky dude.
2298 */ 2291 */
2299 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); 2292 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
2300 } 2293 }
2301 2294
2302 err = setup_cpu_cache(cachep, gfp); 2295 err = setup_cpu_cache(cachep, gfp);
2303 if (err) { 2296 if (err) {
2304 __kmem_cache_shutdown(cachep); 2297 __kmem_cache_shutdown(cachep);
2305 return err; 2298 return err;
2306 } 2299 }
2307 2300
2308 return 0; 2301 return 0;
2309 } 2302 }
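For illustration, a small user-space sketch of the size and alignment rounding performed in __kmem_cache_create() above; BYTES_PER_WORD and REDZONE_ALIGN are approximated here, and the requested size and caller alignment are assumed values:

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD	sizeof(void *)			/* 8 on 64-bit */
#define REDZONE_ALIGN	sizeof(unsigned long long)	/* assumed >= a word */

/* Round v up to the next multiple of a (a must be a power of two). */
static size_t round_up_pow2(size_t v, size_t a)
{
	return (v + a - 1) & ~(a - 1);
}

int main(void)
{
	size_t size = 50;		/* requested object size */
	size_t caller_align = 32;	/* caller-mandated alignment */
	int red_zone = 1;		/* SLAB_RED_ZONE requested? */
	size_t ralign = BYTES_PER_WORD;

	size = round_up_pow2(size, BYTES_PER_WORD);
	if (red_zone) {
		ralign = REDZONE_ALIGN;
		size = round_up_pow2(size, REDZONE_ALIGN);
	}
	if (ralign < caller_align)	/* caller-mandated alignment wins */
		ralign = caller_align;

	printf("object size %zu, alignment %zu\n", size, ralign);
	return 0;
}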
2310 2303
2311 #if DEBUG 2304 #if DEBUG
2312 static void check_irq_off(void) 2305 static void check_irq_off(void)
2313 { 2306 {
2314 BUG_ON(!irqs_disabled()); 2307 BUG_ON(!irqs_disabled());
2315 } 2308 }
2316 2309
2317 static void check_irq_on(void) 2310 static void check_irq_on(void)
2318 { 2311 {
2319 BUG_ON(irqs_disabled()); 2312 BUG_ON(irqs_disabled());
2320 } 2313 }
2321 2314
2322 static void check_spinlock_acquired(struct kmem_cache *cachep) 2315 static void check_spinlock_acquired(struct kmem_cache *cachep)
2323 { 2316 {
2324 #ifdef CONFIG_SMP 2317 #ifdef CONFIG_SMP
2325 check_irq_off(); 2318 check_irq_off();
2326 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); 2319 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
2327 #endif 2320 #endif
2328 } 2321 }
2329 2322
2330 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2323 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2331 { 2324 {
2332 #ifdef CONFIG_SMP 2325 #ifdef CONFIG_SMP
2333 check_irq_off(); 2326 check_irq_off();
2334 assert_spin_locked(&get_node(cachep, node)->list_lock); 2327 assert_spin_locked(&get_node(cachep, node)->list_lock);
2335 #endif 2328 #endif
2336 } 2329 }
2337 2330
2338 #else 2331 #else
2339 #define check_irq_off() do { } while(0) 2332 #define check_irq_off() do { } while(0)
2340 #define check_irq_on() do { } while(0) 2333 #define check_irq_on() do { } while(0)
2341 #define check_spinlock_acquired(x) do { } while(0) 2334 #define check_spinlock_acquired(x) do { } while(0)
2342 #define check_spinlock_acquired_node(x, y) do { } while(0) 2335 #define check_spinlock_acquired_node(x, y) do { } while(0)
2343 #endif 2336 #endif
2344 2337
2345 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 2338 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
2346 struct array_cache *ac, 2339 struct array_cache *ac,
2347 int force, int node); 2340 int force, int node);
2348 2341
2349 static void do_drain(void *arg) 2342 static void do_drain(void *arg)
2350 { 2343 {
2351 struct kmem_cache *cachep = arg; 2344 struct kmem_cache *cachep = arg;
2352 struct array_cache *ac; 2345 struct array_cache *ac;
2353 int node = numa_mem_id(); 2346 int node = numa_mem_id();
2354 struct kmem_cache_node *n; 2347 struct kmem_cache_node *n;
2355 LIST_HEAD(list); 2348 LIST_HEAD(list);
2356 2349
2357 check_irq_off(); 2350 check_irq_off();
2358 ac = cpu_cache_get(cachep); 2351 ac = cpu_cache_get(cachep);
2359 n = get_node(cachep, node); 2352 n = get_node(cachep, node);
2360 spin_lock(&n->list_lock); 2353 spin_lock(&n->list_lock);
2361 free_block(cachep, ac->entry, ac->avail, node, &list); 2354 free_block(cachep, ac->entry, ac->avail, node, &list);
2362 spin_unlock(&n->list_lock); 2355 spin_unlock(&n->list_lock);
2363 slabs_destroy(cachep, &list); 2356 slabs_destroy(cachep, &list);
2364 ac->avail = 0; 2357 ac->avail = 0;
2365 } 2358 }
2366 2359
2367 static void drain_cpu_caches(struct kmem_cache *cachep) 2360 static void drain_cpu_caches(struct kmem_cache *cachep)
2368 { 2361 {
2369 struct kmem_cache_node *n; 2362 struct kmem_cache_node *n;
2370 int node; 2363 int node;
2371 2364
2372 on_each_cpu(do_drain, cachep, 1); 2365 on_each_cpu(do_drain, cachep, 1);
2373 check_irq_on(); 2366 check_irq_on();
2374 for_each_kmem_cache_node(cachep, node, n) 2367 for_each_kmem_cache_node(cachep, node, n)
2375 if (n->alien) 2368 if (n->alien)
2376 drain_alien_cache(cachep, n->alien); 2369 drain_alien_cache(cachep, n->alien);
2377 2370
2378 for_each_kmem_cache_node(cachep, node, n) 2371 for_each_kmem_cache_node(cachep, node, n)
2379 drain_array(cachep, n, n->shared, 1, node); 2372 drain_array(cachep, n, n->shared, 1, node);
2380 } 2373 }
2381 2374
2382 /* 2375 /*
2383 * Remove slabs from the list of free slabs. 2376 * Remove slabs from the list of free slabs.
2384 * Specify the number of slabs to drain in tofree. 2377 * Specify the number of slabs to drain in tofree.
2385 * 2378 *
2386 * Returns the actual number of slabs released. 2379 * Returns the actual number of slabs released.
2387 */ 2380 */
2388 static int drain_freelist(struct kmem_cache *cache, 2381 static int drain_freelist(struct kmem_cache *cache,
2389 struct kmem_cache_node *n, int tofree) 2382 struct kmem_cache_node *n, int tofree)
2390 { 2383 {
2391 struct list_head *p; 2384 struct list_head *p;
2392 int nr_freed; 2385 int nr_freed;
2393 struct page *page; 2386 struct page *page;
2394 2387
2395 nr_freed = 0; 2388 nr_freed = 0;
2396 while (nr_freed < tofree && !list_empty(&n->slabs_free)) { 2389 while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
2397 2390
2398 spin_lock_irq(&n->list_lock); 2391 spin_lock_irq(&n->list_lock);
2399 p = n->slabs_free.prev; 2392 p = n->slabs_free.prev;
2400 if (p == &n->slabs_free) { 2393 if (p == &n->slabs_free) {
2401 spin_unlock_irq(&n->list_lock); 2394 spin_unlock_irq(&n->list_lock);
2402 goto out; 2395 goto out;
2403 } 2396 }
2404 2397
2405 page = list_entry(p, struct page, lru); 2398 page = list_entry(p, struct page, lru);
2406 #if DEBUG 2399 #if DEBUG
2407 BUG_ON(page->active); 2400 BUG_ON(page->active);
2408 #endif 2401 #endif
2409 list_del(&page->lru); 2402 list_del(&page->lru);
2410 /* 2403 /*
2411 * Safe to drop the lock. The slab is no longer linked 2404 * Safe to drop the lock. The slab is no longer linked
2412 * to the cache. 2405 * to the cache.
2413 */ 2406 */
2414 n->free_objects -= cache->num; 2407 n->free_objects -= cache->num;
2415 spin_unlock_irq(&n->list_lock); 2408 spin_unlock_irq(&n->list_lock);
2416 slab_destroy(cache, page); 2409 slab_destroy(cache, page);
2417 nr_freed++; 2410 nr_freed++;
2418 } 2411 }
2419 out: 2412 out:
2420 return nr_freed; 2413 return nr_freed;
2421 } 2414 }
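For illustration, a toy user-space model of the drain loop in drain_freelist() above; the list handling and slab type are simplified stand-ins, and the real code does the unlinking under n->list_lock:

#include <stdio.h>
#include <stdlib.h>

struct slab {				/* stand-in for a free slab page */
	struct slab *next;
};

/* Pop and "destroy" up to tofree slabs; return how many were freed. */
static int drain_freelist(struct slab **free_list, int tofree)
{
	int nr_freed = 0;

	while (nr_freed < tofree && *free_list) {
		struct slab *s = *free_list;

		*free_list = s->next;	/* unlink before destroying */
		free(s);		/* slab_destroy() in the real code */
		nr_freed++;
	}
	return nr_freed;
}

int main(void)
{
	struct slab *list = NULL;

	for (int i = 0; i < 5; i++) {
		struct slab *s = malloc(sizeof(*s));
		s->next = list;
		list = s;
	}
	printf("freed %d slabs\n", drain_freelist(&list, 3));	/* prints 3 */
	return 0;
}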
2422 2415
2423 int __kmem_cache_shrink(struct kmem_cache *cachep) 2416 int __kmem_cache_shrink(struct kmem_cache *cachep)
2424 { 2417 {
2425 int ret = 0; 2418 int ret = 0;
2426 int node; 2419 int node;
2427 struct kmem_cache_node *n; 2420 struct kmem_cache_node *n;
2428 2421
2429 drain_cpu_caches(cachep); 2422 drain_cpu_caches(cachep);
2430 2423
2431 check_irq_on(); 2424 check_irq_on();
2432 for_each_kmem_cache_node(cachep, node, n) { 2425 for_each_kmem_cache_node(cachep, node, n) {
2433 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2426 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2434 2427
2435 ret += !list_empty(&n->slabs_full) || 2428 ret += !list_empty(&n->slabs_full) ||
2436 !list_empty(&n->slabs_partial); 2429 !list_empty(&n->slabs_partial);
2437 } 2430 }
2438 return (ret ? 1 : 0); 2431 return (ret ? 1 : 0);
2439 } 2432 }
2440 2433
2441 int __kmem_cache_shutdown(struct kmem_cache *cachep) 2434 int __kmem_cache_shutdown(struct kmem_cache *cachep)
2442 { 2435 {
2443 int i; 2436 int i;
2444 struct kmem_cache_node *n; 2437 struct kmem_cache_node *n;
2445 int rc = __kmem_cache_shrink(cachep); 2438 int rc = __kmem_cache_shrink(cachep);
2446 2439
2447 if (rc) 2440 if (rc)
2448 return rc; 2441 return rc;
2449 2442
2450 for_each_online_cpu(i) 2443 for_each_online_cpu(i)
2451 kfree(cachep->array[i]); 2444 kfree(cachep->array[i]);
2452 2445
2453 /* NUMA: free the node structures */ 2446 /* NUMA: free the node structures */
2454 for_each_kmem_cache_node(cachep, i, n) { 2447 for_each_kmem_cache_node(cachep, i, n) {
2455 kfree(n->shared); 2448 kfree(n->shared);
2456 free_alien_cache(n->alien); 2449 free_alien_cache(n->alien);
2457 kfree(n); 2450 kfree(n);
2458 cachep->node[i] = NULL; 2451 cachep->node[i] = NULL;
2459 } 2452 }
2460 return 0; 2453 return 0;
2461 } 2454 }
2462 2455
2463 /* 2456 /*
2464 * Get the memory for a slab management obj. 2457 * Get the memory for a slab management obj.
2465 * 2458 *
2466 * For a slab cache when the slab descriptor is off-slab, the 2459 * For a slab cache when the slab descriptor is off-slab, the
2467 * slab descriptor can't come from the same cache which is being created, 2460 * slab descriptor can't come from the same cache which is being created,
2468 * because if that were the case, it would mean we defer the creation of 2461 * because if that were the case, it would mean we defer the creation of
2469 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. 2462 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
2470 * And we eventually call down to __kmem_cache_create(), which 2463 * And we eventually call down to __kmem_cache_create(), which
2471 * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. 2464 * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
2472 * This is a "chicken-and-egg" problem. 2465 * This is a "chicken-and-egg" problem.
2473 * 2466 *
2474 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, 2467 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
2475 * which are all initialized during kmem_cache_init(). 2468 * which are all initialized during kmem_cache_init().
2476 */ 2469 */
2477 static void *alloc_slabmgmt(struct kmem_cache *cachep, 2470 static void *alloc_slabmgmt(struct kmem_cache *cachep,
2478 struct page *page, int colour_off, 2471 struct page *page, int colour_off,
2479 gfp_t local_flags, int nodeid) 2472 gfp_t local_flags, int nodeid)
2480 { 2473 {
2481 void *freelist; 2474 void *freelist;
2482 void *addr = page_address(page); 2475 void *addr = page_address(page);
2483 2476
2484 if (OFF_SLAB(cachep)) { 2477 if (OFF_SLAB(cachep)) {
2485 /* Slab management obj is off-slab. */ 2478 /* Slab management obj is off-slab. */
2486 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2479 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2487 local_flags, nodeid); 2480 local_flags, nodeid);
2488 if (!freelist) 2481 if (!freelist)
2489 return NULL; 2482 return NULL;
2490 } else { 2483 } else {
2491 freelist = addr + colour_off; 2484 freelist = addr + colour_off;
2492 colour_off += cachep->freelist_size; 2485 colour_off += cachep->freelist_size;
2493 } 2486 }
2494 page->active = 0; 2487 page->active = 0;
2495 page->s_mem = addr + colour_off; 2488 page->s_mem = addr + colour_off;
2496 return freelist; 2489 return freelist;
2497 } 2490 }
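For illustration, a user-space sketch of the layout computed by alloc_slabmgmt() above, with assumed sizes: an on-slab cache keeps the freelist at the colour offset and starts its objects right behind it, while an off-slab cache allocates the freelist elsewhere and starts objects at the colour offset:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

int main(void)
{
	char *addr = malloc(4096);	/* pretend page_address(page) */
	size_t colour_off = 64;		/* colouring offset for this slab */
	size_t freelist_size = 32;	/* bytes of freelist indices */
	bool off_slab = false;
	char *freelist, *s_mem;

	if (off_slab) {
		freelist = malloc(freelist_size);	/* separate allocation */
		s_mem = addr + colour_off;
	} else {
		freelist = addr + colour_off;		/* on-slab management */
		s_mem = addr + colour_off + freelist_size;
	}

	printf("objects start at offset %td\n", s_mem - addr);
	if (!off_slab)
		printf("freelist at offset %td\n", freelist - addr);
	else
		free(freelist);
	free(addr);
	return 0;
}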
2498 2491
2499 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) 2492 static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
2500 { 2493 {
2501 return ((freelist_idx_t *)page->freelist)[idx]; 2494 return ((freelist_idx_t *)page->freelist)[idx];
2502 } 2495 }
2503 2496
2504 static inline void set_free_obj(struct page *page, 2497 static inline void set_free_obj(struct page *page,
2505 unsigned int idx, freelist_idx_t val) 2498 unsigned int idx, freelist_idx_t val)
2506 { 2499 {
2507 ((freelist_idx_t *)(page->freelist))[idx] = val; 2500 ((freelist_idx_t *)(page->freelist))[idx] = val;
2508 } 2501 }
2509 2502
2510 static void cache_init_objs(struct kmem_cache *cachep, 2503 static void cache_init_objs(struct kmem_cache *cachep,
2511 struct page *page) 2504 struct page *page)
2512 { 2505 {
2513 int i; 2506 int i;
2514 2507
2515 for (i = 0; i < cachep->num; i++) { 2508 for (i = 0; i < cachep->num; i++) {
2516 void *objp = index_to_obj(cachep, page, i); 2509 void *objp = index_to_obj(cachep, page, i);
2517 #if DEBUG 2510 #if DEBUG
2518 /* need to poison the objs? */ 2511 /* need to poison the objs? */
2519 if (cachep->flags & SLAB_POISON) 2512 if (cachep->flags & SLAB_POISON)
2520 poison_obj(cachep, objp, POISON_FREE); 2513 poison_obj(cachep, objp, POISON_FREE);
2521 if (cachep->flags & SLAB_STORE_USER) 2514 if (cachep->flags & SLAB_STORE_USER)
2522 *dbg_userword(cachep, objp) = NULL; 2515 *dbg_userword(cachep, objp) = NULL;
2523 2516
2524 if (cachep->flags & SLAB_RED_ZONE) { 2517 if (cachep->flags & SLAB_RED_ZONE) {
2525 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2518 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2526 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2519 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2527 } 2520 }
2528 /* 2521 /*
2529 * Constructors are not allowed to allocate memory from the same 2522 * Constructors are not allowed to allocate memory from the same
2530 * cache which they are a constructor for. Otherwise, deadlock. 2523 * cache which they are a constructor for. Otherwise, deadlock.
2531 * They must also be threaded. 2524 * They must also be threaded.
2532 */ 2525 */
2533 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2526 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2534 cachep->ctor(objp + obj_offset(cachep)); 2527 cachep->ctor(objp + obj_offset(cachep));
2535 2528
2536 if (cachep->flags & SLAB_RED_ZONE) { 2529 if (cachep->flags & SLAB_RED_ZONE) {
2537 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2530 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2538 slab_error(cachep, "constructor overwrote the" 2531 slab_error(cachep, "constructor overwrote the"
2539 " end of an object"); 2532 " end of an object");
2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2533 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2541 slab_error(cachep, "constructor overwrote the" 2534 slab_error(cachep, "constructor overwrote the"
2542 " start of an object"); 2535 " start of an object");
2543 } 2536 }
2544 if ((cachep->size % PAGE_SIZE) == 0 && 2537 if ((cachep->size % PAGE_SIZE) == 0 &&
2545 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2538 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2546 kernel_map_pages(virt_to_page(objp), 2539 kernel_map_pages(virt_to_page(objp),
2547 cachep->size / PAGE_SIZE, 0); 2540 cachep->size / PAGE_SIZE, 0);
2548 #else 2541 #else
2549 if (cachep->ctor) 2542 if (cachep->ctor)
2550 cachep->ctor(objp); 2543 cachep->ctor(objp);
2551 #endif 2544 #endif
2552 set_obj_status(page, i, OBJECT_FREE); 2545 set_obj_status(page, i, OBJECT_FREE);
2553 set_free_obj(page, i, i); 2546 set_free_obj(page, i, i);
2554 } 2547 }
2555 } 2548 }
2556 2549
2557 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2550 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2558 { 2551 {
2559 if (CONFIG_ZONE_DMA_FLAG) { 2552 if (CONFIG_ZONE_DMA_FLAG) {
2560 if (flags & GFP_DMA) 2553 if (flags & GFP_DMA)
2561 BUG_ON(!(cachep->allocflags & GFP_DMA)); 2554 BUG_ON(!(cachep->allocflags & GFP_DMA));
2562 else 2555 else
2563 BUG_ON(cachep->allocflags & GFP_DMA); 2556 BUG_ON(cachep->allocflags & GFP_DMA);
2564 } 2557 }
2565 } 2558 }
2566 2559
2567 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, 2560 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
2568 int nodeid) 2561 int nodeid)
2569 { 2562 {
2570 void *objp; 2563 void *objp;
2571 2564
2572 objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); 2565 objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
2573 page->active++; 2566 page->active++;
2574 #if DEBUG 2567 #if DEBUG
2575 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2568 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2576 #endif 2569 #endif
2577 2570
2578 return objp; 2571 return objp;
2579 } 2572 }
2580 2573
2581 static void slab_put_obj(struct kmem_cache *cachep, struct page *page, 2574 static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2582 void *objp, int nodeid) 2575 void *objp, int nodeid)
2583 { 2576 {
2584 unsigned int objnr = obj_to_index(cachep, page, objp); 2577 unsigned int objnr = obj_to_index(cachep, page, objp);
2585 #if DEBUG 2578 #if DEBUG
2586 unsigned int i; 2579 unsigned int i;
2587 2580
2588 /* Verify that the slab belongs to the intended node */ 2581 /* Verify that the slab belongs to the intended node */
2589 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); 2582 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2590 2583
2591 /* Verify double free bug */ 2584 /* Verify double free bug */
2592 for (i = page->active; i < cachep->num; i++) { 2585 for (i = page->active; i < cachep->num; i++) {
2593 if (get_free_obj(page, i) == objnr) { 2586 if (get_free_obj(page, i) == objnr) {
2594 printk(KERN_ERR "slab: double free detected in cache " 2587 printk(KERN_ERR "slab: double free detected in cache "
2595 "'%s', objp %p\n", cachep->name, objp); 2588 "'%s', objp %p\n", cachep->name, objp);
2596 BUG(); 2589 BUG();
2597 } 2590 }
2598 } 2591 }
2599 #endif 2592 #endif
2600 page->active--; 2593 page->active--;
2601 set_free_obj(page, page->active, objnr); 2594 set_free_obj(page, page->active, objnr);
2602 } 2595 }
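For illustration, a user-space toy model of the freelist bookkeeping used by cache_init_objs(), slab_get_obj() and slab_put_obj() above; the page fields and index type are simplified stand-ins. page->freelist behaves as an array of object indices and page->active as a stack pointer into it:

#include <stdio.h>

#define NUM_OBJS 4

struct toy_page {
	unsigned char freelist[NUM_OBJS];	/* freelist_idx_t array */
	unsigned int active;			/* objects currently allocated */
};

static void init_objs(struct toy_page *p)
{
	for (unsigned int i = 0; i < NUM_OBJS; i++)
		p->freelist[i] = i;		/* set_free_obj(page, i, i) */
	p->active = 0;
}

static unsigned int get_obj(struct toy_page *p)
{
	return p->freelist[p->active++];	/* slab_get_obj() */
}

static void put_obj(struct toy_page *p, unsigned int objnr)
{
	p->freelist[--p->active] = objnr;	/* slab_put_obj() */
}

int main(void)
{
	struct toy_page page;
	unsigned int a, b, c;

	init_objs(&page);
	a = get_obj(&page);		/* index 0 */
	b = get_obj(&page);		/* index 1 */
	put_obj(&page, a);		/* index 0 becomes free again */
	c = get_obj(&page);		/* LIFO reuse: index 0 again */
	printf("next alloc gets index %u (active=%u)\n", c, page.active);
	(void)b;
	return 0;
}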
2603 2596
2604 /* 2597 /*
2605 * Map pages beginning at addr to the given cache and slab. This is required 2598 * Map pages beginning at addr to the given cache and slab. This is required
2606 * for the slab allocator to be able to look up the cache and slab of a 2599 * for the slab allocator to be able to look up the cache and slab of a
2607 * virtual address for kfree, ksize, and slab debugging. 2600 * virtual address for kfree, ksize, and slab debugging.
2608 */ 2601 */
2609 static void slab_map_pages(struct kmem_cache *cache, struct page *page, 2602 static void slab_map_pages(struct kmem_cache *cache, struct page *page,
2610 void *freelist) 2603 void *freelist)
2611 { 2604 {
2612 page->slab_cache = cache; 2605 page->slab_cache = cache;
2613 page->freelist = freelist; 2606 page->freelist = freelist;
2614 } 2607 }
2615 2608
2616 /* 2609 /*
2617 * Grow (by 1) the number of slabs within a cache. This is called by 2610 * Grow (by 1) the number of slabs within a cache. This is called by
2618 * kmem_cache_alloc() when there are no active objs left in a cache. 2611 * kmem_cache_alloc() when there are no active objs left in a cache.
2619 */ 2612 */
2620 static int cache_grow(struct kmem_cache *cachep, 2613 static int cache_grow(struct kmem_cache *cachep,
2621 gfp_t flags, int nodeid, struct page *page) 2614 gfp_t flags, int nodeid, struct page *page)
2622 { 2615 {
2623 void *freelist; 2616 void *freelist;
2624 size_t offset; 2617 size_t offset;
2625 gfp_t local_flags; 2618 gfp_t local_flags;
2626 struct kmem_cache_node *n; 2619 struct kmem_cache_node *n;
2627 2620
2628 /* 2621 /*
2629 * Be lazy and only check for valid flags here, keeping it out of the 2622 * Be lazy and only check for valid flags here, keeping it out of the
2630 * critical path in kmem_cache_alloc(). 2623 * critical path in kmem_cache_alloc().
2631 */ 2624 */
2632 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2625 BUG_ON(flags & GFP_SLAB_BUG_MASK);
2633 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2626 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2634 2627
2635 /* Take the node list lock to change the colour_next on this node */ 2628 /* Take the node list lock to change the colour_next on this node */
2636 check_irq_off(); 2629 check_irq_off();
2637 n = get_node(cachep, nodeid); 2630 n = get_node(cachep, nodeid);
2638 spin_lock(&n->list_lock); 2631 spin_lock(&n->list_lock);
2639 2632
2640 /* Get colour for the slab, and calculate the next value. */ 2633 /* Get colour for the slab, and calculate the next value. */
2641 offset = n->colour_next; 2634 offset = n->colour_next;
2642 n->colour_next++; 2635 n->colour_next++;
2643 if (n->colour_next >= cachep->colour) 2636 if (n->colour_next >= cachep->colour)
2644 n->colour_next = 0; 2637 n->colour_next = 0;
2645 spin_unlock(&n->list_lock); 2638 spin_unlock(&n->list_lock);
2646 2639
2647 offset *= cachep->colour_off; 2640 offset *= cachep->colour_off;
2648 2641
2649 if (local_flags & __GFP_WAIT) 2642 if (local_flags & __GFP_WAIT)
2650 local_irq_enable(); 2643 local_irq_enable();
2651 2644
2652 /* 2645 /*
2653 * The test for missing atomic flag is performed here, rather than 2646 * The test for missing atomic flag is performed here, rather than
2654 * the more obvious place, simply to reduce the critical path length 2647 * the more obvious place, simply to reduce the critical path length
2655 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2648 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2656 * will eventually be caught here (where it matters). 2649 * will eventually be caught here (where it matters).
2657 */ 2650 */
2658 kmem_flagcheck(cachep, flags); 2651 kmem_flagcheck(cachep, flags);
2659 2652
2660 /* 2653 /*
2661 * Get mem for the objs. Attempt to allocate a physical page from 2654 * Get mem for the objs. Attempt to allocate a physical page from
2662 * 'nodeid'. 2655 * 'nodeid'.
2663 */ 2656 */
2664 if (!page) 2657 if (!page)
2665 page = kmem_getpages(cachep, local_flags, nodeid); 2658 page = kmem_getpages(cachep, local_flags, nodeid);
2666 if (!page) 2659 if (!page)
2667 goto failed; 2660 goto failed;
2668 2661
2669 /* Get slab management. */ 2662 /* Get slab management. */
2670 freelist = alloc_slabmgmt(cachep, page, offset, 2663 freelist = alloc_slabmgmt(cachep, page, offset,
2671 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2664 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2672 if (!freelist) 2665 if (!freelist)
2673 goto opps1; 2666 goto opps1;
2674 2667
2675 slab_map_pages(cachep, page, freelist); 2668 slab_map_pages(cachep, page, freelist);
2676 2669
2677 cache_init_objs(cachep, page); 2670 cache_init_objs(cachep, page);
2678 2671
2679 if (local_flags & __GFP_WAIT) 2672 if (local_flags & __GFP_WAIT)
2680 local_irq_disable(); 2673 local_irq_disable();
2681 check_irq_off(); 2674 check_irq_off();
2682 spin_lock(&n->list_lock); 2675 spin_lock(&n->list_lock);
2683 2676
2684 /* Make slab active. */ 2677 /* Make slab active. */
2685 list_add_tail(&page->lru, &(n->slabs_free)); 2678 list_add_tail(&page->lru, &(n->slabs_free));
2686 STATS_INC_GROWN(cachep); 2679 STATS_INC_GROWN(cachep);
2687 n->free_objects += cachep->num; 2680 n->free_objects += cachep->num;
2688 spin_unlock(&n->list_lock); 2681 spin_unlock(&n->list_lock);
2689 return 1; 2682 return 1;
2690 opps1: 2683 opps1:
2691 kmem_freepages(cachep, page); 2684 kmem_freepages(cachep, page);
2692 failed: 2685 failed:
2693 if (local_flags & __GFP_WAIT) 2686 if (local_flags & __GFP_WAIT)
2694 local_irq_disable(); 2687 local_irq_disable();
2695 return 0; 2688 return 0;
2696 } 2689 }
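For illustration, a small sketch of the slab colouring cycle driven by cache_grow() above; the colour count and offset are assumed values:

#include <stdio.h>

int main(void)
{
	unsigned int colour = 4;	/* cachep->colour: number of offsets */
	unsigned int colour_off = 64;	/* cachep->colour_off: cacheline size */
	unsigned int colour_next = 0;	/* n->colour_next, kept per node */

	/* Each new slab gets the next colour; the counter wraps around. */
	for (int slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		colour_next = (colour_next + 1) % colour;
		printf("slab %d starts objects at byte offset %u\n",
		       slab, offset);
	}
	return 0;
}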
2697 2690
2698 #if DEBUG 2691 #if DEBUG
2699 2692
2700 /* 2693 /*
2701 * Perform extra freeing checks: 2694 * Perform extra freeing checks:
2702 * - detect bad pointers. 2695 * - detect bad pointers.
2703 * - POISON/RED_ZONE checking 2696 * - POISON/RED_ZONE checking
2704 */ 2697 */
2705 static void kfree_debugcheck(const void *objp) 2698 static void kfree_debugcheck(const void *objp)
2706 { 2699 {
2707 if (!virt_addr_valid(objp)) { 2700 if (!virt_addr_valid(objp)) {
2708 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2701 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2709 (unsigned long)objp); 2702 (unsigned long)objp);
2710 BUG(); 2703 BUG();
2711 } 2704 }
2712 } 2705 }
2713 2706
2714 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2707 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2715 { 2708 {
2716 unsigned long long redzone1, redzone2; 2709 unsigned long long redzone1, redzone2;
2717 2710
2718 redzone1 = *dbg_redzone1(cache, obj); 2711 redzone1 = *dbg_redzone1(cache, obj);
2719 redzone2 = *dbg_redzone2(cache, obj); 2712 redzone2 = *dbg_redzone2(cache, obj);
2720 2713
2721 /* 2714 /*
2722 * Redzone is ok. 2715 * Redzone is ok.
2723 */ 2716 */
2724 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2717 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2725 return; 2718 return;
2726 2719
2727 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2720 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2728 slab_error(cache, "double free detected"); 2721 slab_error(cache, "double free detected");
2729 else 2722 else
2730 slab_error(cache, "memory outside object was overwritten"); 2723 slab_error(cache, "memory outside object was overwritten");
2731 2724
2732 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", 2725 printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2733 obj, redzone1, redzone2); 2726 obj, redzone1, redzone2);
2734 } 2727 }
2735 2728
2736 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2729 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2737 unsigned long caller) 2730 unsigned long caller)
2738 { 2731 {
2739 unsigned int objnr; 2732 unsigned int objnr;
2740 struct page *page; 2733 struct page *page;
2741 2734
2742 BUG_ON(virt_to_cache(objp) != cachep); 2735 BUG_ON(virt_to_cache(objp) != cachep);
2743 2736
2744 objp -= obj_offset(cachep); 2737 objp -= obj_offset(cachep);
2745 kfree_debugcheck(objp); 2738 kfree_debugcheck(objp);
2746 page = virt_to_head_page(objp); 2739 page = virt_to_head_page(objp);
2747 2740
2748 if (cachep->flags & SLAB_RED_ZONE) { 2741 if (cachep->flags & SLAB_RED_ZONE) {
2749 verify_redzone_free(cachep, objp); 2742 verify_redzone_free(cachep, objp);
2750 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2743 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2751 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2744 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2752 } 2745 }
2753 if (cachep->flags & SLAB_STORE_USER) 2746 if (cachep->flags & SLAB_STORE_USER)
2754 *dbg_userword(cachep, objp) = (void *)caller; 2747 *dbg_userword(cachep, objp) = (void *)caller;
2755 2748
2756 objnr = obj_to_index(cachep, page, objp); 2749 objnr = obj_to_index(cachep, page, objp);
2757 2750
2758 BUG_ON(objnr >= cachep->num); 2751 BUG_ON(objnr >= cachep->num);
2759 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2752 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2760 2753
2761 set_obj_status(page, objnr, OBJECT_FREE); 2754 set_obj_status(page, objnr, OBJECT_FREE);
2762 if (cachep->flags & SLAB_POISON) { 2755 if (cachep->flags & SLAB_POISON) {
2763 #ifdef CONFIG_DEBUG_PAGEALLOC 2756 #ifdef CONFIG_DEBUG_PAGEALLOC
2764 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2757 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2765 store_stackinfo(cachep, objp, caller); 2758 store_stackinfo(cachep, objp, caller);
2766 kernel_map_pages(virt_to_page(objp), 2759 kernel_map_pages(virt_to_page(objp),
2767 cachep->size / PAGE_SIZE, 0); 2760 cachep->size / PAGE_SIZE, 0);
2768 } else { 2761 } else {
2769 poison_obj(cachep, objp, POISON_FREE); 2762 poison_obj(cachep, objp, POISON_FREE);
2770 } 2763 }
2771 #else 2764 #else
2772 poison_obj(cachep, objp, POISON_FREE); 2765 poison_obj(cachep, objp, POISON_FREE);
2773 #endif 2766 #endif
2774 } 2767 }
2775 return objp; 2768 return objp;
2776 } 2769 }
2777 2770
2778 #else 2771 #else
2779 #define kfree_debugcheck(x) do { } while(0) 2772 #define kfree_debugcheck(x) do { } while(0)
2780 #define cache_free_debugcheck(x,objp,z) (objp) 2773 #define cache_free_debugcheck(x,objp,z) (objp)
2781 #endif 2774 #endif
2782 2775
2783 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2776 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
2784 bool force_refill) 2777 bool force_refill)
2785 { 2778 {
2786 int batchcount; 2779 int batchcount;
2787 struct kmem_cache_node *n; 2780 struct kmem_cache_node *n;
2788 struct array_cache *ac; 2781 struct array_cache *ac;
2789 int node; 2782 int node;
2790 2783
2791 check_irq_off(); 2784 check_irq_off();
2792 node = numa_mem_id(); 2785 node = numa_mem_id();
2793 if (unlikely(force_refill)) 2786 if (unlikely(force_refill))
2794 goto force_grow; 2787 goto force_grow;
2795 retry: 2788 retry:
2796 ac = cpu_cache_get(cachep); 2789 ac = cpu_cache_get(cachep);
2797 batchcount = ac->batchcount; 2790 batchcount = ac->batchcount;
2798 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2791 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2799 /* 2792 /*
2800 * If there was little recent activity on this cache, then 2793 * If there was little recent activity on this cache, then
2801 * perform only a partial refill. Otherwise we could generate 2794 * perform only a partial refill. Otherwise we could generate
2802 * refill bouncing. 2795 * refill bouncing.
2803 */ 2796 */
2804 batchcount = BATCHREFILL_LIMIT; 2797 batchcount = BATCHREFILL_LIMIT;
2805 } 2798 }
2806 n = get_node(cachep, node); 2799 n = get_node(cachep, node);
2807 2800
2808 BUG_ON(ac->avail > 0 || !n); 2801 BUG_ON(ac->avail > 0 || !n);
2809 spin_lock(&n->list_lock); 2802 spin_lock(&n->list_lock);
2810 2803
2811 /* See if we can refill from the shared array */ 2804 /* See if we can refill from the shared array */
2812 if (n->shared && transfer_objects(ac, n->shared, batchcount)) { 2805 if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
2813 n->shared->touched = 1; 2806 n->shared->touched = 1;
2814 goto alloc_done; 2807 goto alloc_done;
2815 } 2808 }
2816 2809
2817 while (batchcount > 0) { 2810 while (batchcount > 0) {
2818 struct list_head *entry; 2811 struct list_head *entry;
2819 struct page *page; 2812 struct page *page;
2820 /* Get the slab the allocation is to come from. */ 2813 /* Get the slab the allocation is to come from. */
2821 entry = n->slabs_partial.next; 2814 entry = n->slabs_partial.next;
2822 if (entry == &n->slabs_partial) { 2815 if (entry == &n->slabs_partial) {
2823 n->free_touched = 1; 2816 n->free_touched = 1;
2824 entry = n->slabs_free.next; 2817 entry = n->slabs_free.next;
2825 if (entry == &n->slabs_free) 2818 if (entry == &n->slabs_free)
2826 goto must_grow; 2819 goto must_grow;
2827 } 2820 }
2828 2821
2829 page = list_entry(entry, struct page, lru); 2822 page = list_entry(entry, struct page, lru);
2830 check_spinlock_acquired(cachep); 2823 check_spinlock_acquired(cachep);
2831 2824
2832 /* 2825 /*
2833 * The slab was either on the partial or free list, so 2826 * The slab was either on the partial or free list, so
2834 * there must be at least one object available for 2827 * there must be at least one object available for
2835 * allocation. 2828 * allocation.
2836 */ 2829 */
2837 BUG_ON(page->active >= cachep->num); 2830 BUG_ON(page->active >= cachep->num);
2838 2831
2839 while (page->active < cachep->num && batchcount--) { 2832 while (page->active < cachep->num && batchcount--) {
2840 STATS_INC_ALLOCED(cachep); 2833 STATS_INC_ALLOCED(cachep);
2841 STATS_INC_ACTIVE(cachep); 2834 STATS_INC_ACTIVE(cachep);
2842 STATS_SET_HIGH(cachep); 2835 STATS_SET_HIGH(cachep);
2843 2836
2844 ac_put_obj(cachep, ac, slab_get_obj(cachep, page, 2837 ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
2845 node)); 2838 node));
2846 } 2839 }
2847 2840
2848 /* move slabp to correct slabp list: */ 2841 /* move slabp to correct slabp list: */
2849 list_del(&page->lru); 2842 list_del(&page->lru);
2850 if (page->active == cachep->num) 2843 if (page->active == cachep->num)
2851 list_add(&page->lru, &n->slabs_full); 2844 list_add(&page->lru, &n->slabs_full);
2852 else 2845 else
2853 list_add(&page->lru, &n->slabs_partial); 2846 list_add(&page->lru, &n->slabs_partial);
2854 } 2847 }
2855 2848
2856 must_grow: 2849 must_grow:
2857 n->free_objects -= ac->avail; 2850 n->free_objects -= ac->avail;
2858 alloc_done: 2851 alloc_done:
2859 spin_unlock(&n->list_lock); 2852 spin_unlock(&n->list_lock);
2860 2853
2861 if (unlikely(!ac->avail)) { 2854 if (unlikely(!ac->avail)) {
2862 int x; 2855 int x;
2863 force_grow: 2856 force_grow:
2864 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2857 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
2865 2858
2866 /* cache_grow can reenable interrupts, then ac could change. */ 2859 /* cache_grow can reenable interrupts, then ac could change. */
2867 ac = cpu_cache_get(cachep); 2860 ac = cpu_cache_get(cachep);
2868 node = numa_mem_id(); 2861 node = numa_mem_id();
2869 2862
2870 /* no objects in sight? abort */ 2863 /* no objects in sight? abort */
2871 if (!x && (ac->avail == 0 || force_refill)) 2864 if (!x && (ac->avail == 0 || force_refill))
2872 return NULL; 2865 return NULL;
2873 2866
2874 if (!ac->avail) /* objects refilled by interrupt? */ 2867 if (!ac->avail) /* objects refilled by interrupt? */
2875 goto retry; 2868 goto retry;
2876 } 2869 }
2877 ac->touched = 1; 2870 ac->touched = 1;
2878 2871
2879 return ac_get_obj(cachep, ac, flags, force_refill); 2872 return ac_get_obj(cachep, ac, flags, force_refill);
2880 } 2873 }
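For illustration, a toy model of the refill logic in cache_alloc_refill() above; the array-cache structure, backing pool and batch size are invented stand-ins for the per-CPU array and the node slab lists:

#include <stdio.h>

#define POOL_OBJS 16

struct array_cache {
	unsigned int avail;		/* objects currently in the array */
	unsigned int batchcount;	/* how many to pull on a refill */
	void *entry[POOL_OBJS];
};

static void *backing_pool[POOL_OBJS];	/* stands in for the node lists */
static unsigned int pool_avail = POOL_OBJS;

/* Move up to batchcount objects from the backing pool into ac. */
static void refill(struct array_cache *ac)
{
	while (ac->avail < ac->batchcount && pool_avail > 0)
		ac->entry[ac->avail++] = backing_pool[--pool_avail];
}

static void *cache_alloc(struct array_cache *ac)
{
	if (ac->avail == 0) {		/* miss: cache_alloc_refill() */
		refill(ac);
		if (ac->avail == 0)
			return NULL;	/* backing pool exhausted too */
	}
	return ac->entry[--ac->avail];	/* hit: pop from the per-CPU array */
}

int main(void)
{
	static int objects[POOL_OBJS];
	struct array_cache ac = { .avail = 0, .batchcount = 4 };

	for (int i = 0; i < POOL_OBJS; i++)
		backing_pool[i] = &objects[i];

	for (int i = 0; i < 6; i++)
		printf("alloc %d -> %p\n", i, cache_alloc(&ac));
	return 0;
}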
2881 2874
2882 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 2875 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2883 gfp_t flags) 2876 gfp_t flags)
2884 { 2877 {
2885 might_sleep_if(flags & __GFP_WAIT); 2878 might_sleep_if(flags & __GFP_WAIT);
2886 #if DEBUG 2879 #if DEBUG
2887 kmem_flagcheck(cachep, flags); 2880 kmem_flagcheck(cachep, flags);
2888 #endif 2881 #endif
2889 } 2882 }
2890 2883
2891 #if DEBUG 2884 #if DEBUG
2892 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2885 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2893 gfp_t flags, void *objp, unsigned long caller) 2886 gfp_t flags, void *objp, unsigned long caller)
2894 { 2887 {
2895 struct page *page; 2888 struct page *page;
2896 2889
2897 if (!objp) 2890 if (!objp)
2898 return objp; 2891 return objp;
2899 if (cachep->flags & SLAB_POISON) { 2892 if (cachep->flags & SLAB_POISON) {
2900 #ifdef CONFIG_DEBUG_PAGEALLOC 2893 #ifdef CONFIG_DEBUG_PAGEALLOC
2901 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2894 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2902 kernel_map_pages(virt_to_page(objp), 2895 kernel_map_pages(virt_to_page(objp),
2903 cachep->size / PAGE_SIZE, 1); 2896 cachep->size / PAGE_SIZE, 1);
2904 else 2897 else
2905 check_poison_obj(cachep, objp); 2898 check_poison_obj(cachep, objp);
2906 #else 2899 #else
2907 check_poison_obj(cachep, objp); 2900 check_poison_obj(cachep, objp);
2908 #endif 2901 #endif
2909 poison_obj(cachep, objp, POISON_INUSE); 2902 poison_obj(cachep, objp, POISON_INUSE);
2910 } 2903 }
2911 if (cachep->flags & SLAB_STORE_USER) 2904 if (cachep->flags & SLAB_STORE_USER)
2912 *dbg_userword(cachep, objp) = (void *)caller; 2905 *dbg_userword(cachep, objp) = (void *)caller;
2913 2906
2914 if (cachep->flags & SLAB_RED_ZONE) { 2907 if (cachep->flags & SLAB_RED_ZONE) {
2915 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 2908 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2916 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2909 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2917 slab_error(cachep, "double free, or memory outside" 2910 slab_error(cachep, "double free, or memory outside"
2918 " object was overwritten"); 2911 " object was overwritten");
2919 printk(KERN_ERR 2912 printk(KERN_ERR
2920 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", 2913 "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
2921 objp, *dbg_redzone1(cachep, objp), 2914 objp, *dbg_redzone1(cachep, objp),
2922 *dbg_redzone2(cachep, objp)); 2915 *dbg_redzone2(cachep, objp));
2923 } 2916 }
2924 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2917 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2925 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2918 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2926 } 2919 }
2927 2920
2928 page = virt_to_head_page(objp); 2921 page = virt_to_head_page(objp);
2929 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); 2922 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2930 objp += obj_offset(cachep); 2923 objp += obj_offset(cachep);
2931 if (cachep->ctor && cachep->flags & SLAB_POISON) 2924 if (cachep->ctor && cachep->flags & SLAB_POISON)
2932 cachep->ctor(objp); 2925 cachep->ctor(objp);
2933 if (ARCH_SLAB_MINALIGN && 2926 if (ARCH_SLAB_MINALIGN &&
2934 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { 2927 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
2935 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 2928 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
2936 objp, (int)ARCH_SLAB_MINALIGN); 2929 objp, (int)ARCH_SLAB_MINALIGN);
2937 } 2930 }
2938 return objp; 2931 return objp;
2939 } 2932 }
2940 #else 2933 #else
2941 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2934 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2942 #endif 2935 #endif
2943 2936
2944 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 2937 static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
2945 { 2938 {
2946 if (unlikely(cachep == kmem_cache)) 2939 if (unlikely(cachep == kmem_cache))
2947 return false; 2940 return false;
2948 2941
2949 return should_failslab(cachep->object_size, flags, cachep->flags); 2942 return should_failslab(cachep->object_size, flags, cachep->flags);
2950 } 2943 }
2951 2944
2952 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2945 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2953 { 2946 {
2954 void *objp; 2947 void *objp;
2955 struct array_cache *ac; 2948 struct array_cache *ac;
2956 bool force_refill = false; 2949 bool force_refill = false;
2957 2950
2958 check_irq_off(); 2951 check_irq_off();
2959 2952
2960 ac = cpu_cache_get(cachep); 2953 ac = cpu_cache_get(cachep);
2961 if (likely(ac->avail)) { 2954 if (likely(ac->avail)) {
2962 ac->touched = 1; 2955 ac->touched = 1;
2963 objp = ac_get_obj(cachep, ac, flags, false); 2956 objp = ac_get_obj(cachep, ac, flags, false);
2964 2957
2965 /* 2958 /*
2966 * Allow for the possibility that all available objects are not allowed 2959 * Allow for the possibility that all available objects are not allowed
2967 * by the current flags 2960 * by the current flags
2968 */ 2961 */
2969 if (objp) { 2962 if (objp) {
2970 STATS_INC_ALLOCHIT(cachep); 2963 STATS_INC_ALLOCHIT(cachep);
2971 goto out; 2964 goto out;
2972 } 2965 }
2973 force_refill = true; 2966 force_refill = true;
2974 } 2967 }
2975 2968
2976 STATS_INC_ALLOCMISS(cachep); 2969 STATS_INC_ALLOCMISS(cachep);
2977 objp = cache_alloc_refill(cachep, flags, force_refill); 2970 objp = cache_alloc_refill(cachep, flags, force_refill);
2978 /* 2971 /*
2979 * the 'ac' may be updated by cache_alloc_refill(), 2972 * the 'ac' may be updated by cache_alloc_refill(),
2980 * and kmemleak_erase() requires its correct value. 2973 * and kmemleak_erase() requires its correct value.
2981 */ 2974 */
2982 ac = cpu_cache_get(cachep); 2975 ac = cpu_cache_get(cachep);
2983 2976
2984 out: 2977 out:
2985 /* 2978 /*
2986 * To avoid a false negative, if an object that is in one of the 2979 * To avoid a false negative, if an object that is in one of the
2987 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 2980 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
2988 * treat the array pointers as a reference to the object. 2981 * treat the array pointers as a reference to the object.
2989 */ 2982 */
2990 if (objp) 2983 if (objp)
2991 kmemleak_erase(&ac->entry[ac->avail]); 2984 kmemleak_erase(&ac->entry[ac->avail]);
2992 return objp; 2985 return objp;
2993 } 2986 }
2994 2987
2995 #ifdef CONFIG_NUMA 2988 #ifdef CONFIG_NUMA
2996 /* 2989 /*
2997 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. 2990 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
2998 * 2991 *
2999 * If we are in_interrupt, then process context, including cpusets and 2992 * If we are in_interrupt, then process context, including cpusets and
3000 * mempolicy, may not apply and should not be used for allocation policy. 2993 * mempolicy, may not apply and should not be used for allocation policy.
3001 */ 2994 */
3002 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 2995 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3003 { 2996 {
3004 int nid_alloc, nid_here; 2997 int nid_alloc, nid_here;
3005 2998
3006 if (in_interrupt() || (flags & __GFP_THISNODE)) 2999 if (in_interrupt() || (flags & __GFP_THISNODE))
3007 return NULL; 3000 return NULL;
3008 nid_alloc = nid_here = numa_mem_id(); 3001 nid_alloc = nid_here = numa_mem_id();
3009 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3002 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3010 nid_alloc = cpuset_slab_spread_node(); 3003 nid_alloc = cpuset_slab_spread_node();
3011 else if (current->mempolicy) 3004 else if (current->mempolicy)
3012 nid_alloc = mempolicy_slab_node(); 3005 nid_alloc = mempolicy_slab_node();
3013 if (nid_alloc != nid_here) 3006 if (nid_alloc != nid_here)
3014 return ____cache_alloc_node(cachep, flags, nid_alloc); 3007 return ____cache_alloc_node(cachep, flags, nid_alloc);
3015 return NULL; 3008 return NULL;
3016 } 3009 }
3017 3010
3018 /* 3011 /*
3019 * Fallback function if there was no memory available and no objects on a 3012 * Fallback function if there was no memory available and no objects on a
3020 * certain node and fallback is permitted. First we scan all the 3013 * certain node and fallback is permitted. First we scan all the
3021 * available nodes for available objects. If that fails then we 3014 * available nodes for available objects. If that fails then we
3022 * perform an allocation without specifying a node. This allows the page 3015 * perform an allocation without specifying a node. This allows the page
3023 * allocator to do its reclaim / fallback magic. We then insert the 3016 * allocator to do its reclaim / fallback magic. We then insert the
3024 * slab into the proper nodelist and then allocate from it. 3017 * slab into the proper nodelist and then allocate from it.
3025 */ 3018 */
3026 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3019 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3027 { 3020 {
3028 struct zonelist *zonelist; 3021 struct zonelist *zonelist;
3029 gfp_t local_flags; 3022 gfp_t local_flags;
3030 struct zoneref *z; 3023 struct zoneref *z;
3031 struct zone *zone; 3024 struct zone *zone;
3032 enum zone_type high_zoneidx = gfp_zone(flags); 3025 enum zone_type high_zoneidx = gfp_zone(flags);
3033 void *obj = NULL; 3026 void *obj = NULL;
3034 int nid; 3027 int nid;
3035 unsigned int cpuset_mems_cookie; 3028 unsigned int cpuset_mems_cookie;
3036 3029
3037 if (flags & __GFP_THISNODE) 3030 if (flags & __GFP_THISNODE)
3038 return NULL; 3031 return NULL;
3039 3032
3040 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3033 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3041 3034
3042 retry_cpuset: 3035 retry_cpuset:
3043 cpuset_mems_cookie = read_mems_allowed_begin(); 3036 cpuset_mems_cookie = read_mems_allowed_begin();
3044 zonelist = node_zonelist(mempolicy_slab_node(), flags); 3037 zonelist = node_zonelist(mempolicy_slab_node(), flags);
3045 3038
3046 retry: 3039 retry:
3047 /* 3040 /*
3048 * Look through allowed nodes for objects available 3041 * Look through allowed nodes for objects available
3049 * from existing per node queues. 3042 * from existing per node queues.
3050 */ 3043 */
3051 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3044 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3052 nid = zone_to_nid(zone); 3045 nid = zone_to_nid(zone);
3053 3046
3054 if (cpuset_zone_allowed_hardwall(zone, flags) && 3047 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3055 get_node(cache, nid) && 3048 get_node(cache, nid) &&
3056 get_node(cache, nid)->free_objects) { 3049 get_node(cache, nid)->free_objects) {
3057 obj = ____cache_alloc_node(cache, 3050 obj = ____cache_alloc_node(cache,
3058 flags | GFP_THISNODE, nid); 3051 flags | GFP_THISNODE, nid);
3059 if (obj) 3052 if (obj)
3060 break; 3053 break;
3061 } 3054 }
3062 } 3055 }
3063 3056
3064 if (!obj) { 3057 if (!obj) {
3065 /* 3058 /*
3066 * This allocation will be performed within the constraints 3059 * This allocation will be performed within the constraints
3067 * of the current cpuset / memory policy requirements. 3060 * of the current cpuset / memory policy requirements.
3068 * We may trigger various forms of reclaim on the allowed 3061 * We may trigger various forms of reclaim on the allowed
3069 * set and go into memory reserves if necessary. 3062 * set and go into memory reserves if necessary.
3070 */ 3063 */
3071 struct page *page; 3064 struct page *page;
3072 3065
3073 if (local_flags & __GFP_WAIT) 3066 if (local_flags & __GFP_WAIT)
3074 local_irq_enable(); 3067 local_irq_enable();
3075 kmem_flagcheck(cache, flags); 3068 kmem_flagcheck(cache, flags);
3076 page = kmem_getpages(cache, local_flags, numa_mem_id()); 3069 page = kmem_getpages(cache, local_flags, numa_mem_id());
3077 if (local_flags & __GFP_WAIT) 3070 if (local_flags & __GFP_WAIT)
3078 local_irq_disable(); 3071 local_irq_disable();
3079 if (page) { 3072 if (page) {
3080 /* 3073 /*
3081 * Insert into the appropriate per node queues 3074 * Insert into the appropriate per node queues
3082 */ 3075 */
3083 nid = page_to_nid(page); 3076 nid = page_to_nid(page);
3084 if (cache_grow(cache, flags, nid, page)) { 3077 if (cache_grow(cache, flags, nid, page)) {
3085 obj = ____cache_alloc_node(cache, 3078 obj = ____cache_alloc_node(cache,
3086 flags | GFP_THISNODE, nid); 3079 flags | GFP_THISNODE, nid);
3087 if (!obj) 3080 if (!obj)
3088 /* 3081 /*
3089 * Another processor may allocate the 3082 * Another processor may allocate the
3090 * objects in the slab since we are 3083 * objects in the slab since we are
3091 * not holding any locks. 3084 * not holding any locks.
3092 */ 3085 */
3093 goto retry; 3086 goto retry;
3094 } else { 3087 } else {
3095 /* cache_grow already freed obj */ 3088 /* cache_grow already freed obj */
3096 obj = NULL; 3089 obj = NULL;
3097 } 3090 }
3098 } 3091 }
3099 } 3092 }
3100 3093
3101 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) 3094 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
3102 goto retry_cpuset; 3095 goto retry_cpuset;
3103 return obj; 3096 return obj;
3104 } 3097 }
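For illustration, a simplified sketch of the fallback order implemented by fallback_alloc() above; the node counts and allowed set are assumptions, and the real code walks a zonelist under cpuset constraints:

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

static int free_objects[NR_NODES] = { 0, 0, 3, 0 };	/* per-node counts */

static bool alloc_from_node(int nid)
{
	if (free_objects[nid] == 0)
		return false;
	free_objects[nid]--;
	return true;
}

int main(void)
{
	int allowed[] = { 0, 2, 3 };	/* nodes permitted by cpuset/policy */
	int got = -1;

	/* 1) scan allowed nodes for already-queued free objects */
	for (unsigned int i = 0; i < sizeof(allowed) / sizeof(allowed[0]); i++) {
		if (alloc_from_node(allowed[i])) {
			got = allowed[i];
			break;
		}
	}

	/* 2) otherwise fall back to growing a new slab on any node */
	if (got < 0)
		printf("no queued objects; grow a new slab instead\n");
	else
		printf("allocated from node %d\n", got);
	return 0;
}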
3105 3098
3106 /* 3099 /*
3107 * An interface to enable slab creation on nodeid 3100 * An interface to enable slab creation on nodeid
3108 */ 3101 */
3109 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3102 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3110 int nodeid) 3103 int nodeid)
3111 { 3104 {
3112 struct list_head *entry; 3105 struct list_head *entry;
3113 struct page *page; 3106 struct page *page;
3114 struct kmem_cache_node *n; 3107 struct kmem_cache_node *n;
3115 void *obj; 3108 void *obj;
3116 int x; 3109 int x;
3117 3110
3118 VM_BUG_ON(nodeid > num_online_nodes()); 3111 VM_BUG_ON(nodeid > num_online_nodes());
3119 n = get_node(cachep, nodeid); 3112 n = get_node(cachep, nodeid);
3120 BUG_ON(!n); 3113 BUG_ON(!n);
3121 3114
3122 retry: 3115 retry:
3123 check_irq_off(); 3116 check_irq_off();
3124 spin_lock(&n->list_lock); 3117 spin_lock(&n->list_lock);
3125 entry = n->slabs_partial.next; 3118 entry = n->slabs_partial.next;
3126 if (entry == &n->slabs_partial) { 3119 if (entry == &n->slabs_partial) {
3127 n->free_touched = 1; 3120 n->free_touched = 1;
3128 entry = n->slabs_free.next; 3121 entry = n->slabs_free.next;
3129 if (entry == &n->slabs_free) 3122 if (entry == &n->slabs_free)
3130 goto must_grow; 3123 goto must_grow;
3131 } 3124 }
3132 3125
3133 page = list_entry(entry, struct page, lru); 3126 page = list_entry(entry, struct page, lru);
3134 check_spinlock_acquired_node(cachep, nodeid); 3127 check_spinlock_acquired_node(cachep, nodeid);
3135 3128
3136 STATS_INC_NODEALLOCS(cachep); 3129 STATS_INC_NODEALLOCS(cachep);
3137 STATS_INC_ACTIVE(cachep); 3130 STATS_INC_ACTIVE(cachep);
3138 STATS_SET_HIGH(cachep); 3131 STATS_SET_HIGH(cachep);
3139 3132
3140 BUG_ON(page->active == cachep->num); 3133 BUG_ON(page->active == cachep->num);
3141 3134
3142 obj = slab_get_obj(cachep, page, nodeid); 3135 obj = slab_get_obj(cachep, page, nodeid);
3143 n->free_objects--; 3136 n->free_objects--;
3144 /* move slabp to correct slabp list: */ 3137 /* move slabp to correct slabp list: */
3145 list_del(&page->lru); 3138 list_del(&page->lru);
3146 3139
3147 if (page->active == cachep->num) 3140 if (page->active == cachep->num)
3148 list_add(&page->lru, &n->slabs_full); 3141 list_add(&page->lru, &n->slabs_full);
3149 else 3142 else
3150 list_add(&page->lru, &n->slabs_partial); 3143 list_add(&page->lru, &n->slabs_partial);
3151 3144
3152 spin_unlock(&n->list_lock); 3145 spin_unlock(&n->list_lock);
3153 goto done; 3146 goto done;
3154 3147
3155 must_grow: 3148 must_grow:
3156 spin_unlock(&n->list_lock); 3149 spin_unlock(&n->list_lock);
3157 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3150 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3158 if (x) 3151 if (x)
3159 goto retry; 3152 goto retry;
3160 3153
3161 return fallback_alloc(cachep, flags); 3154 return fallback_alloc(cachep, flags);
3162 3155
3163 done: 3156 done:
3164 return obj; 3157 return obj;
3165 } 3158 }
3166 3159
3167 static __always_inline void * 3160 static __always_inline void *
3168 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3161 slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3169 unsigned long caller) 3162 unsigned long caller)
3170 { 3163 {
3171 unsigned long save_flags; 3164 unsigned long save_flags;
3172 void *ptr; 3165 void *ptr;
3173 int slab_node = numa_mem_id(); 3166 int slab_node = numa_mem_id();
3174 3167
3175 flags &= gfp_allowed_mask; 3168 flags &= gfp_allowed_mask;
3176 3169
3177 lockdep_trace_alloc(flags); 3170 lockdep_trace_alloc(flags);
3178 3171
3179 if (slab_should_failslab(cachep, flags)) 3172 if (slab_should_failslab(cachep, flags))
3180 return NULL; 3173 return NULL;
3181 3174
3182 cachep = memcg_kmem_get_cache(cachep, flags); 3175 cachep = memcg_kmem_get_cache(cachep, flags);
3183 3176
3184 cache_alloc_debugcheck_before(cachep, flags); 3177 cache_alloc_debugcheck_before(cachep, flags);
3185 local_irq_save(save_flags); 3178 local_irq_save(save_flags);
3186 3179
3187 if (nodeid == NUMA_NO_NODE) 3180 if (nodeid == NUMA_NO_NODE)
3188 nodeid = slab_node; 3181 nodeid = slab_node;
3189 3182
3190 if (unlikely(!get_node(cachep, nodeid))) { 3183 if (unlikely(!get_node(cachep, nodeid))) {
3191 /* Node not bootstrapped yet */ 3184 /* Node not bootstrapped yet */
3192 ptr = fallback_alloc(cachep, flags); 3185 ptr = fallback_alloc(cachep, flags);
3193 goto out; 3186 goto out;
3194 } 3187 }
3195 3188
3196 if (nodeid == slab_node) { 3189 if (nodeid == slab_node) {
3197 /* 3190 /*
3198 * Use the locally cached objects if possible. 3191 * Use the locally cached objects if possible.
3199 * However ____cache_alloc does not allow fallback 3192 * However ____cache_alloc does not allow fallback
3200 * to other nodes. It may fail while we still have 3193 * to other nodes. It may fail while we still have
3201 * objects on other nodes available. 3194 * objects on other nodes available.
3202 */ 3195 */
3203 ptr = ____cache_alloc(cachep, flags); 3196 ptr = ____cache_alloc(cachep, flags);
3204 if (ptr) 3197 if (ptr)
3205 goto out; 3198 goto out;
3206 } 3199 }
3207 /* ___cache_alloc_node can fall back to other nodes */ 3200 /* ___cache_alloc_node can fall back to other nodes */
3208 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3201 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3209 out: 3202 out:
3210 local_irq_restore(save_flags); 3203 local_irq_restore(save_flags);
3211 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3204 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3212 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, 3205 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3213 flags); 3206 flags);
3214 3207
3215 if (likely(ptr)) { 3208 if (likely(ptr)) {
3216 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); 3209 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3217 if (unlikely(flags & __GFP_ZERO)) 3210 if (unlikely(flags & __GFP_ZERO))
3218 memset(ptr, 0, cachep->object_size); 3211 memset(ptr, 0, cachep->object_size);
3219 } 3212 }
3220 3213
3221 return ptr; 3214 return ptr;
3222 } 3215 }
3223 3216
3224 static __always_inline void * 3217 static __always_inline void *
3225 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3218 __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3226 { 3219 {
3227 void *objp; 3220 void *objp;
3228 3221
3229 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { 3222 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
3230 objp = alternate_node_alloc(cache, flags); 3223 objp = alternate_node_alloc(cache, flags);
3231 if (objp) 3224 if (objp)
3232 goto out; 3225 goto out;
3233 } 3226 }
3234 objp = ____cache_alloc(cache, flags); 3227 objp = ____cache_alloc(cache, flags);
3235 3228
3236 /* 3229 /*
3237 * We may just have run out of memory on the local node. 3230 * We may just have run out of memory on the local node.
3238 * ____cache_alloc_node() knows how to locate memory on other nodes 3231 * ____cache_alloc_node() knows how to locate memory on other nodes
3239 */ 3232 */
3240 if (!objp) 3233 if (!objp)
3241 objp = ____cache_alloc_node(cache, flags, numa_mem_id()); 3234 objp = ____cache_alloc_node(cache, flags, numa_mem_id());
3242 3235
3243 out: 3236 out:
3244 return objp; 3237 return objp;
3245 } 3238 }
3246 #else 3239 #else
3247 3240
3248 static __always_inline void * 3241 static __always_inline void *
3249 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3242 __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3250 { 3243 {
3251 return ____cache_alloc(cachep, flags); 3244 return ____cache_alloc(cachep, flags);
3252 } 3245 }
3253 3246
3254 #endif /* CONFIG_NUMA */ 3247 #endif /* CONFIG_NUMA */
3255 3248
3256 static __always_inline void * 3249 static __always_inline void *
3257 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) 3250 slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3258 { 3251 {
3259 unsigned long save_flags; 3252 unsigned long save_flags;
3260 void *objp; 3253 void *objp;
3261 3254
3262 flags &= gfp_allowed_mask; 3255 flags &= gfp_allowed_mask;
3263 3256
3264 lockdep_trace_alloc(flags); 3257 lockdep_trace_alloc(flags);
3265 3258
3266 if (slab_should_failslab(cachep, flags)) 3259 if (slab_should_failslab(cachep, flags))
3267 return NULL; 3260 return NULL;
3268 3261
3269 cachep = memcg_kmem_get_cache(cachep, flags); 3262 cachep = memcg_kmem_get_cache(cachep, flags);
3270 3263
3271 cache_alloc_debugcheck_before(cachep, flags); 3264 cache_alloc_debugcheck_before(cachep, flags);
3272 local_irq_save(save_flags); 3265 local_irq_save(save_flags);
3273 objp = __do_cache_alloc(cachep, flags); 3266 objp = __do_cache_alloc(cachep, flags);
3274 local_irq_restore(save_flags); 3267 local_irq_restore(save_flags);
3275 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3268 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3276 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, 3269 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3277 flags); 3270 flags);
3278 prefetchw(objp); 3271 prefetchw(objp);
3279 3272
3280 if (likely(objp)) { 3273 if (likely(objp)) {
3281 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); 3274 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3282 if (unlikely(flags & __GFP_ZERO)) 3275 if (unlikely(flags & __GFP_ZERO))
3283 memset(objp, 0, cachep->object_size); 3276 memset(objp, 0, cachep->object_size);
3284 } 3277 }
3285 3278
3286 return objp; 3279 return objp;
3287 } 3280 }
3288 3281
3289 /* 3282 /*
3290 * Caller needs to acquire the correct kmem_cache_node's list_lock 3283 * Caller needs to acquire the correct kmem_cache_node's list_lock
3291 * @list: List of detached free slabs should be freed by caller 3284 * @list: List of detached free slabs should be freed by caller
3292 */ 3285 */
3293 static void free_block(struct kmem_cache *cachep, void **objpp, 3286 static void free_block(struct kmem_cache *cachep, void **objpp,
3294 int nr_objects, int node, struct list_head *list) 3287 int nr_objects, int node, struct list_head *list)
3295 { 3288 {
3296 int i; 3289 int i;
3297 struct kmem_cache_node *n = get_node(cachep, node); 3290 struct kmem_cache_node *n = get_node(cachep, node);
3298 3291
3299 for (i = 0; i < nr_objects; i++) { 3292 for (i = 0; i < nr_objects; i++) {
3300 void *objp; 3293 void *objp;
3301 struct page *page; 3294 struct page *page;
3302 3295
3303 clear_obj_pfmemalloc(&objpp[i]); 3296 clear_obj_pfmemalloc(&objpp[i]);
3304 objp = objpp[i]; 3297 objp = objpp[i];
3305 3298
3306 page = virt_to_head_page(objp); 3299 page = virt_to_head_page(objp);
3307 list_del(&page->lru); 3300 list_del(&page->lru);
3308 check_spinlock_acquired_node(cachep, node); 3301 check_spinlock_acquired_node(cachep, node);
3309 slab_put_obj(cachep, page, objp, node); 3302 slab_put_obj(cachep, page, objp, node);
3310 STATS_DEC_ACTIVE(cachep); 3303 STATS_DEC_ACTIVE(cachep);
3311 n->free_objects++; 3304 n->free_objects++;
3312 3305
3313 /* fixup slab chains */ 3306 /* fixup slab chains */
3314 if (page->active == 0) { 3307 if (page->active == 0) {
3315 if (n->free_objects > n->free_limit) { 3308 if (n->free_objects > n->free_limit) {
3316 n->free_objects -= cachep->num; 3309 n->free_objects -= cachep->num;
3317 list_add_tail(&page->lru, list); 3310 list_add_tail(&page->lru, list);
3318 } else { 3311 } else {
3319 list_add(&page->lru, &n->slabs_free); 3312 list_add(&page->lru, &n->slabs_free);
3320 } 3313 }
3321 } else { 3314 } else {
3322 /* Unconditionally move a slab to the end of the 3315 /* Unconditionally move a slab to the end of the
3323 * partial list on free - maximum time for the 3316 * partial list on free - maximum time for the
3324 * other objects to be freed, too. 3317 * other objects to be freed, too.
3325 */ 3318 */
3326 list_add_tail(&page->lru, &n->slabs_partial); 3319 list_add_tail(&page->lru, &n->slabs_partial);
3327 } 3320 }
3328 } 3321 }
3329 } 3322 }
3330 3323
3331 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3324 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3332 { 3325 {
3333 int batchcount; 3326 int batchcount;
3334 struct kmem_cache_node *n; 3327 struct kmem_cache_node *n;
3335 int node = numa_mem_id(); 3328 int node = numa_mem_id();
3336 LIST_HEAD(list); 3329 LIST_HEAD(list);
3337 3330
3338 batchcount = ac->batchcount; 3331 batchcount = ac->batchcount;
3339 #if DEBUG 3332 #if DEBUG
3340 BUG_ON(!batchcount || batchcount > ac->avail); 3333 BUG_ON(!batchcount || batchcount > ac->avail);
3341 #endif 3334 #endif
3342 check_irq_off(); 3335 check_irq_off();
3343 n = get_node(cachep, node); 3336 n = get_node(cachep, node);
3344 spin_lock(&n->list_lock); 3337 spin_lock(&n->list_lock);
3345 if (n->shared) { 3338 if (n->shared) {
3346 struct array_cache *shared_array = n->shared; 3339 struct array_cache *shared_array = n->shared;
3347 int max = shared_array->limit - shared_array->avail; 3340 int max = shared_array->limit - shared_array->avail;
3348 if (max) { 3341 if (max) {
3349 if (batchcount > max) 3342 if (batchcount > max)
3350 batchcount = max; 3343 batchcount = max;
3351 memcpy(&(shared_array->entry[shared_array->avail]), 3344 memcpy(&(shared_array->entry[shared_array->avail]),
3352 ac->entry, sizeof(void *) * batchcount); 3345 ac->entry, sizeof(void *) * batchcount);
3353 shared_array->avail += batchcount; 3346 shared_array->avail += batchcount;
3354 goto free_done; 3347 goto free_done;
3355 } 3348 }
3356 } 3349 }
3357 3350
3358 free_block(cachep, ac->entry, batchcount, node, &list); 3351 free_block(cachep, ac->entry, batchcount, node, &list);
3359 free_done: 3352 free_done:
3360 #if STATS 3353 #if STATS
3361 { 3354 {
3362 int i = 0; 3355 int i = 0;
3363 struct list_head *p; 3356 struct list_head *p;
3364 3357
3365 p = n->slabs_free.next; 3358 p = n->slabs_free.next;
3366 while (p != &(n->slabs_free)) { 3359 while (p != &(n->slabs_free)) {
3367 struct page *page; 3360 struct page *page;
3368 3361
3369 page = list_entry(p, struct page, lru); 3362 page = list_entry(p, struct page, lru);
3370 BUG_ON(page->active); 3363 BUG_ON(page->active);
3371 3364
3372 i++; 3365 i++;
3373 p = p->next; 3366 p = p->next;
3374 } 3367 }
3375 STATS_SET_FREEABLE(cachep, i); 3368 STATS_SET_FREEABLE(cachep, i);
3376 } 3369 }
3377 #endif 3370 #endif
3378 spin_unlock(&n->list_lock); 3371 spin_unlock(&n->list_lock);
3379 slabs_destroy(cachep, &list); 3372 slabs_destroy(cachep, &list);
3380 ac->avail -= batchcount; 3373 ac->avail -= batchcount;
3381 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3374 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3382 } 3375 }
3383 3376
3384 /* 3377 /*
3385 * Release an obj back to its cache. If the obj has a constructed state, it must 3378 * Release an obj back to its cache. If the obj has a constructed state, it must
3386 * be in this state _before_ it is released. Called with disabled ints. 3379 * be in this state _before_ it is released. Called with disabled ints.
3387 */ 3380 */
3388 static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3381 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3389 unsigned long caller) 3382 unsigned long caller)
3390 { 3383 {
3391 struct array_cache *ac = cpu_cache_get(cachep); 3384 struct array_cache *ac = cpu_cache_get(cachep);
3392 3385
3393 check_irq_off(); 3386 check_irq_off();
3394 kmemleak_free_recursive(objp, cachep->flags); 3387 kmemleak_free_recursive(objp, cachep->flags);
3395 objp = cache_free_debugcheck(cachep, objp, caller); 3388 objp = cache_free_debugcheck(cachep, objp, caller);
3396 3389
3397 kmemcheck_slab_free(cachep, objp, cachep->object_size); 3390 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3398 3391
3399 /* 3392 /*
3400 * Skip calling cache_free_alien() when the platform is not numa. 3393 * Skip calling cache_free_alien() when the platform is not numa.
3401 * This will avoid cache misses that happen while accessing slabp (which 3394 * This will avoid cache misses that happen while accessing slabp (which
3402 * is a per-page memory reference) to get nodeid. Instead use a global 3395 * is a per-page memory reference) to get nodeid. Instead use a global
3403 * variable to skip the call, which is most likely to be present in 3396 * variable to skip the call, which is most likely to be present in
3404 * the cache. 3397 * the cache.
3405 */ 3398 */
3406 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3399 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3407 return; 3400 return;
3408 3401
3409 if (likely(ac->avail < ac->limit)) { 3402 if (likely(ac->avail < ac->limit)) {
3410 STATS_INC_FREEHIT(cachep); 3403 STATS_INC_FREEHIT(cachep);
3411 } else { 3404 } else {
3412 STATS_INC_FREEMISS(cachep); 3405 STATS_INC_FREEMISS(cachep);
3413 cache_flusharray(cachep, ac); 3406 cache_flusharray(cachep, ac);
3414 } 3407 }
3415 3408
3416 ac_put_obj(cachep, ac, objp); 3409 ac_put_obj(cachep, ac, objp);
3417 } 3410 }
3418 3411
3419 /** 3412 /**
3420 * kmem_cache_alloc - Allocate an object 3413 * kmem_cache_alloc - Allocate an object
3421 * @cachep: The cache to allocate from. 3414 * @cachep: The cache to allocate from.
3422 * @flags: See kmalloc(). 3415 * @flags: See kmalloc().
3423 * 3416 *
3424 * Allocate an object from this cache. The flags are only relevant 3417 * Allocate an object from this cache. The flags are only relevant
3425 * if the cache has no available objects. 3418 * if the cache has no available objects.
3426 */ 3419 */
3427 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3420 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3428 { 3421 {
3429 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3422 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3430 3423
3431 trace_kmem_cache_alloc(_RET_IP_, ret, 3424 trace_kmem_cache_alloc(_RET_IP_, ret,
3432 cachep->object_size, cachep->size, flags); 3425 cachep->object_size, cachep->size, flags);
3433 3426
3434 return ret; 3427 return ret;
3435 } 3428 }
3436 EXPORT_SYMBOL(kmem_cache_alloc); 3429 EXPORT_SYMBOL(kmem_cache_alloc);
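/*
 * A minimal usage sketch for the kmem_cache_alloc() API documented above,
 * assuming a hypothetical object type "struct foo" and cache "foo_cachep"
 * (names not from this file; requires <linux/slab.h>).  As the kernel-doc
 * notes, the gfp flags only matter once the per-cpu / per-node queues are
 * empty and the cache has to grow.
 */
#include <linux/init.h>
#include <linux/slab.h>

struct foo {
	unsigned long id;
	char name[16];
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_example_init(void)
{
	struct foo *f;

	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* may sleep */
	if (!f) {
		kmem_cache_destroy(foo_cachep);
		return -ENOMEM;
	}
	f->id = 1;
	/* ... use f; release it later with kmem_cache_free(foo_cachep, f) */
	return 0;
}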
3437 3430
3438 #ifdef CONFIG_TRACING 3431 #ifdef CONFIG_TRACING
3439 void * 3432 void *
3440 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3433 kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3441 { 3434 {
3442 void *ret; 3435 void *ret;
3443 3436
3444 ret = slab_alloc(cachep, flags, _RET_IP_); 3437 ret = slab_alloc(cachep, flags, _RET_IP_);
3445 3438
3446 trace_kmalloc(_RET_IP_, ret, 3439 trace_kmalloc(_RET_IP_, ret,
3447 size, cachep->size, flags); 3440 size, cachep->size, flags);
3448 return ret; 3441 return ret;
3449 } 3442 }
3450 EXPORT_SYMBOL(kmem_cache_alloc_trace); 3443 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3451 #endif 3444 #endif
3452 3445
3453 #ifdef CONFIG_NUMA 3446 #ifdef CONFIG_NUMA
3454 /** 3447 /**
3455 * kmem_cache_alloc_node - Allocate an object on the specified node 3448 * kmem_cache_alloc_node - Allocate an object on the specified node
3456 * @cachep: The cache to allocate from. 3449 * @cachep: The cache to allocate from.
3457 * @flags: See kmalloc(). 3450 * @flags: See kmalloc().
3458 * @nodeid: node number of the target node. 3451 * @nodeid: node number of the target node.
3459 * 3452 *
3460 * Identical to kmem_cache_alloc but it will allocate memory on the given 3453 * Identical to kmem_cache_alloc but it will allocate memory on the given
3461 * node, which can improve the performance for cpu bound structures. 3454 * node, which can improve the performance for cpu bound structures.
3462 * 3455 *
3463 * Fallback to other node is possible if __GFP_THISNODE is not set. 3456 * Fallback to other node is possible if __GFP_THISNODE is not set.
3464 */ 3457 */
3465 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3458 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3466 { 3459 {
3467 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3460 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3468 3461
3469 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3462 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3470 cachep->object_size, cachep->size, 3463 cachep->object_size, cachep->size,
3471 flags, nodeid); 3464 flags, nodeid);
3472 3465
3473 return ret; 3466 return ret;
3474 } 3467 }
3475 EXPORT_SYMBOL(kmem_cache_alloc_node); 3468 EXPORT_SYMBOL(kmem_cache_alloc_node);
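/*
 * An illustrative node-aware sketch for kmem_cache_alloc_node(), reusing the
 * hypothetical "struct foo"/"foo_cachep" from the earlier sketch.  With
 * __GFP_THISNODE the allocation is pinned to the requested node; without it
 * (or with NUMA_NO_NODE) the allocator may fall back to other nodes, as the
 * kernel-doc above states.
 */
static struct foo *alloc_foo_on_node(struct kmem_cache *cachep, int nid)
{
	/* first try to stay strictly on node nid ... */
	struct foo *f = kmem_cache_alloc_node(cachep,
					      GFP_KERNEL | __GFP_THISNODE, nid);

	/* ... then accept memory from any node rather than failing outright */
	if (!f)
		f = kmem_cache_alloc_node(cachep, GFP_KERNEL, NUMA_NO_NODE);
	return f;
}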
3476 3469
3477 #ifdef CONFIG_TRACING 3470 #ifdef CONFIG_TRACING
3478 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, 3471 void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3479 gfp_t flags, 3472 gfp_t flags,
3480 int nodeid, 3473 int nodeid,
3481 size_t size) 3474 size_t size)
3482 { 3475 {
3483 void *ret; 3476 void *ret;
3484 3477
3485 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3478 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3486 3479
3487 trace_kmalloc_node(_RET_IP_, ret, 3480 trace_kmalloc_node(_RET_IP_, ret,
3488 size, cachep->size, 3481 size, cachep->size,
3489 flags, nodeid); 3482 flags, nodeid);
3490 return ret; 3483 return ret;
3491 } 3484 }
3492 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 3485 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3493 #endif 3486 #endif
3494 3487
3495 static __always_inline void * 3488 static __always_inline void *
3496 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3489 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3497 { 3490 {
3498 struct kmem_cache *cachep; 3491 struct kmem_cache *cachep;
3499 3492
3500 cachep = kmalloc_slab(size, flags); 3493 cachep = kmalloc_slab(size, flags);
3501 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3494 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3502 return cachep; 3495 return cachep;
3503 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3496 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3504 } 3497 }
3505 3498
3506 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3499 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3507 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3500 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3508 { 3501 {
3509 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3502 return __do_kmalloc_node(size, flags, node, _RET_IP_);
3510 } 3503 }
3511 EXPORT_SYMBOL(__kmalloc_node); 3504 EXPORT_SYMBOL(__kmalloc_node);
3512 3505
3513 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3506 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3514 int node, unsigned long caller) 3507 int node, unsigned long caller)
3515 { 3508 {
3516 return __do_kmalloc_node(size, flags, node, caller); 3509 return __do_kmalloc_node(size, flags, node, caller);
3517 } 3510 }
3518 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3511 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3519 #else 3512 #else
3520 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3513 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3521 { 3514 {
3522 return __do_kmalloc_node(size, flags, node, 0); 3515 return __do_kmalloc_node(size, flags, node, 0);
3523 } 3516 }
3524 EXPORT_SYMBOL(__kmalloc_node); 3517 EXPORT_SYMBOL(__kmalloc_node);
3525 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3518 #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3526 #endif /* CONFIG_NUMA */ 3519 #endif /* CONFIG_NUMA */
3527 3520
3528 /** 3521 /**
3529 * __do_kmalloc - allocate memory 3522 * __do_kmalloc - allocate memory
3530 * @size: how many bytes of memory are required. 3523 * @size: how many bytes of memory are required.
3531 * @flags: the type of memory to allocate (see kmalloc). 3524 * @flags: the type of memory to allocate (see kmalloc).
3532 * @caller: function caller for debug tracking of the caller 3525 * @caller: function caller for debug tracking of the caller
3533 */ 3526 */
3534 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3527 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3535 unsigned long caller) 3528 unsigned long caller)
3536 { 3529 {
3537 struct kmem_cache *cachep; 3530 struct kmem_cache *cachep;
3538 void *ret; 3531 void *ret;
3539 3532
3540 cachep = kmalloc_slab(size, flags); 3533 cachep = kmalloc_slab(size, flags);
3541 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3534 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3542 return cachep; 3535 return cachep;
3543 ret = slab_alloc(cachep, flags, caller); 3536 ret = slab_alloc(cachep, flags, caller);
3544 3537
3545 trace_kmalloc(caller, ret, 3538 trace_kmalloc(caller, ret,
3546 size, cachep->size, flags); 3539 size, cachep->size, flags);
3547 3540
3548 return ret; 3541 return ret;
3549 } 3542 }
3550 3543
3551 3544
3552 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3545 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3553 void *__kmalloc(size_t size, gfp_t flags) 3546 void *__kmalloc(size_t size, gfp_t flags)
3554 { 3547 {
3555 return __do_kmalloc(size, flags, _RET_IP_); 3548 return __do_kmalloc(size, flags, _RET_IP_);
3556 } 3549 }
3557 EXPORT_SYMBOL(__kmalloc); 3550 EXPORT_SYMBOL(__kmalloc);
3558 3551
3559 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3552 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3560 { 3553 {
3561 return __do_kmalloc(size, flags, caller); 3554 return __do_kmalloc(size, flags, caller);
3562 } 3555 }
3563 EXPORT_SYMBOL(__kmalloc_track_caller); 3556 EXPORT_SYMBOL(__kmalloc_track_caller);
3564 3557
3565 #else 3558 #else
3566 void *__kmalloc(size_t size, gfp_t flags) 3559 void *__kmalloc(size_t size, gfp_t flags)
3567 { 3560 {
3568 return __do_kmalloc(size, flags, 0); 3561 return __do_kmalloc(size, flags, 0);
3569 } 3562 }
3570 EXPORT_SYMBOL(__kmalloc); 3563 EXPORT_SYMBOL(__kmalloc);
3571 #endif 3564 #endif
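/*
 * __do_kmalloc() above is the out-of-line backend behind kmalloc() and
 * friends: kmalloc_slab() maps the requested size onto one of the fixed
 * kmalloc-<size> caches, then slab_alloc() does the actual allocation.  A
 * caller-side sketch (hypothetical helper, assumes <linux/slab.h> and
 * <linux/string.h>):
 */
static char *copy_string_example(const char *src, size_t len)
{
	char *buf = kmalloc(len + 1, GFP_KERNEL); /* rounded up to a size class */

	if (!buf)
		return NULL;
	memcpy(buf, src, len);
	buf[len] = '\0';
	return buf;		/* caller releases it with kfree(buf) */
}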
3572 3565
3573 /** 3566 /**
3574 * kmem_cache_free - Deallocate an object 3567 * kmem_cache_free - Deallocate an object
3575 * @cachep: The cache the allocation was from. 3568 * @cachep: The cache the allocation was from.
3576 * @objp: The previously allocated object. 3569 * @objp: The previously allocated object.
3577 * 3570 *
3578 * Free an object which was previously allocated from this 3571 * Free an object which was previously allocated from this
3579 * cache. 3572 * cache.
3580 */ 3573 */
3581 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3574 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3582 { 3575 {
3583 unsigned long flags; 3576 unsigned long flags;
3584 cachep = cache_from_obj(cachep, objp); 3577 cachep = cache_from_obj(cachep, objp);
3585 if (!cachep) 3578 if (!cachep)
3586 return; 3579 return;
3587 3580
3588 local_irq_save(flags); 3581 local_irq_save(flags);
3589 debug_check_no_locks_freed(objp, cachep->object_size); 3582 debug_check_no_locks_freed(objp, cachep->object_size);
3590 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3583 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3591 debug_check_no_obj_freed(objp, cachep->object_size); 3584 debug_check_no_obj_freed(objp, cachep->object_size);
3592 __cache_free(cachep, objp, _RET_IP_); 3585 __cache_free(cachep, objp, _RET_IP_);
3593 local_irq_restore(flags); 3586 local_irq_restore(flags);
3594 3587
3595 trace_kmem_cache_free(_RET_IP_, objp); 3588 trace_kmem_cache_free(_RET_IP_, objp);
3596 } 3589 }
3597 EXPORT_SYMBOL(kmem_cache_free); 3590 EXPORT_SYMBOL(kmem_cache_free);
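/*
 * Objects must be returned to the cache they were allocated from.  A teardown
 * sketch matching the earlier hypothetical kmem_cache_alloc() example:
 */
static void foo_cache_example_exit(struct kmem_cache *cachep, struct foo *f)
{
	kmem_cache_free(cachep, f);	/* give the object back first */
	kmem_cache_destroy(cachep);	/* then drop the now-unused cache */
}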
3598 3591
3599 /** 3592 /**
3600 * kfree - free previously allocated memory 3593 * kfree - free previously allocated memory
3601 * @objp: pointer returned by kmalloc. 3594 * @objp: pointer returned by kmalloc.
3602 * 3595 *
3603 * If @objp is NULL, no operation is performed. 3596 * If @objp is NULL, no operation is performed.
3604 * 3597 *
3605 * Don't free memory not originally allocated by kmalloc() 3598 * Don't free memory not originally allocated by kmalloc()
3606 * or you will run into trouble. 3599 * or you will run into trouble.
3607 */ 3600 */
3608 void kfree(const void *objp) 3601 void kfree(const void *objp)
3609 { 3602 {
3610 struct kmem_cache *c; 3603 struct kmem_cache *c;
3611 unsigned long flags; 3604 unsigned long flags;
3612 3605
3613 trace_kfree(_RET_IP_, objp); 3606 trace_kfree(_RET_IP_, objp);
3614 3607
3615 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3608 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3616 return; 3609 return;
3617 local_irq_save(flags); 3610 local_irq_save(flags);
3618 kfree_debugcheck(objp); 3611 kfree_debugcheck(objp);
3619 c = virt_to_cache(objp); 3612 c = virt_to_cache(objp);
3620 debug_check_no_locks_freed(objp, c->object_size); 3613 debug_check_no_locks_freed(objp, c->object_size);
3621 3614
3622 debug_check_no_obj_freed(objp, c->object_size); 3615 debug_check_no_obj_freed(objp, c->object_size);
3623 __cache_free(c, (void *)objp, _RET_IP_); 3616 __cache_free(c, (void *)objp, _RET_IP_);
3624 local_irq_restore(flags); 3617 local_irq_restore(flags);
3625 } 3618 }
3626 EXPORT_SYMBOL(kfree); 3619 EXPORT_SYMBOL(kfree);
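/*
 * Because of the ZERO_OR_NULL_PTR() check above, kfree(NULL) and kfree() of
 * the ZERO_SIZE_PTR returned by kmalloc(0, ...) are both no-ops, so error
 * paths can free unconditionally.  Illustrative sketch (hypothetical helper):
 */
static int alloc_pair_example(char **a, char **b)
{
	*a = kmalloc(64, GFP_KERNEL);
	*b = kmalloc(64, GFP_KERNEL);
	if (!*a || !*b) {
		kfree(*a);	/* safe even if this allocation failed */
		kfree(*b);
		return -ENOMEM;
	}
	return 0;
}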
3627 3620
3628 /* 3621 /*
3629 * This initializes kmem_cache_node or resizes various caches for all nodes. 3622 * This initializes kmem_cache_node or resizes various caches for all nodes.
3630 */ 3623 */
3631 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) 3624 static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3632 { 3625 {
3633 int node; 3626 int node;
3634 struct kmem_cache_node *n; 3627 struct kmem_cache_node *n;
3635 struct array_cache *new_shared; 3628 struct array_cache *new_shared;
3636 struct alien_cache **new_alien = NULL; 3629 struct alien_cache **new_alien = NULL;
3637 3630
3638 for_each_online_node(node) { 3631 for_each_online_node(node) {
3639 3632
3640 if (use_alien_caches) { 3633 if (use_alien_caches) {
3641 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3634 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3642 if (!new_alien) 3635 if (!new_alien)
3643 goto fail; 3636 goto fail;
3644 } 3637 }
3645 3638
3646 new_shared = NULL; 3639 new_shared = NULL;
3647 if (cachep->shared) { 3640 if (cachep->shared) {
3648 new_shared = alloc_arraycache(node, 3641 new_shared = alloc_arraycache(node,
3649 cachep->shared*cachep->batchcount, 3642 cachep->shared*cachep->batchcount,
3650 0xbaadf00d, gfp); 3643 0xbaadf00d, gfp);
3651 if (!new_shared) { 3644 if (!new_shared) {
3652 free_alien_cache(new_alien); 3645 free_alien_cache(new_alien);
3653 goto fail; 3646 goto fail;
3654 } 3647 }
3655 } 3648 }
3656 3649
3657 n = get_node(cachep, node); 3650 n = get_node(cachep, node);
3658 if (n) { 3651 if (n) {
3659 struct array_cache *shared = n->shared; 3652 struct array_cache *shared = n->shared;
3660 LIST_HEAD(list); 3653 LIST_HEAD(list);
3661 3654
3662 spin_lock_irq(&n->list_lock); 3655 spin_lock_irq(&n->list_lock);
3663 3656
3664 if (shared) 3657 if (shared)
3665 free_block(cachep, shared->entry, 3658 free_block(cachep, shared->entry,
3666 shared->avail, node, &list); 3659 shared->avail, node, &list);
3667 3660
3668 n->shared = new_shared; 3661 n->shared = new_shared;
3669 if (!n->alien) { 3662 if (!n->alien) {
3670 n->alien = new_alien; 3663 n->alien = new_alien;
3671 new_alien = NULL; 3664 new_alien = NULL;
3672 } 3665 }
3673 n->free_limit = (1 + nr_cpus_node(node)) * 3666 n->free_limit = (1 + nr_cpus_node(node)) *
3674 cachep->batchcount + cachep->num; 3667 cachep->batchcount + cachep->num;
3675 spin_unlock_irq(&n->list_lock); 3668 spin_unlock_irq(&n->list_lock);
3676 slabs_destroy(cachep, &list); 3669 slabs_destroy(cachep, &list);
3677 kfree(shared); 3670 kfree(shared);
3678 free_alien_cache(new_alien); 3671 free_alien_cache(new_alien);
3679 continue; 3672 continue;
3680 } 3673 }
3681 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); 3674 n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
3682 if (!n) { 3675 if (!n) {
3683 free_alien_cache(new_alien); 3676 free_alien_cache(new_alien);
3684 kfree(new_shared); 3677 kfree(new_shared);
3685 goto fail; 3678 goto fail;
3686 } 3679 }
3687 3680
3688 kmem_cache_node_init(n); 3681 kmem_cache_node_init(n);
3689 n->next_reap = jiffies + REAPTIMEOUT_NODE + 3682 n->next_reap = jiffies + REAPTIMEOUT_NODE +
3690 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 3683 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
3691 n->shared = new_shared; 3684 n->shared = new_shared;
3692 n->alien = new_alien; 3685 n->alien = new_alien;
3693 n->free_limit = (1 + nr_cpus_node(node)) * 3686 n->free_limit = (1 + nr_cpus_node(node)) *
3694 cachep->batchcount + cachep->num; 3687 cachep->batchcount + cachep->num;
3695 cachep->node[node] = n; 3688 cachep->node[node] = n;
3696 } 3689 }
3697 return 0; 3690 return 0;
3698 3691
3699 fail: 3692 fail:
3700 if (!cachep->list.next) { 3693 if (!cachep->list.next) {
3701 /* Cache is not active yet. Roll back what we did */ 3694 /* Cache is not active yet. Roll back what we did */
3702 node--; 3695 node--;
3703 while (node >= 0) { 3696 while (node >= 0) {
3704 n = get_node(cachep, node); 3697 n = get_node(cachep, node);
3705 if (n) { 3698 if (n) {
3706 kfree(n->shared); 3699 kfree(n->shared);
3707 free_alien_cache(n->alien); 3700 free_alien_cache(n->alien);
3708 kfree(n); 3701 kfree(n);
3709 cachep->node[node] = NULL; 3702 cachep->node[node] = NULL;
3710 } 3703 }
3711 node--; 3704 node--;
3712 } 3705 }
3713 } 3706 }
3714 return -ENOMEM; 3707 return -ENOMEM;
3715 } 3708 }
3716 3709
3717 struct ccupdate_struct { 3710 struct ccupdate_struct {
3718 struct kmem_cache *cachep; 3711 struct kmem_cache *cachep;
3719 struct array_cache *new[0]; 3712 struct array_cache *new[0];
3720 }; 3713 };
3721 3714
3722 static void do_ccupdate_local(void *info) 3715 static void do_ccupdate_local(void *info)
3723 { 3716 {
3724 struct ccupdate_struct *new = info; 3717 struct ccupdate_struct *new = info;
3725 struct array_cache *old; 3718 struct array_cache *old;
3726 3719
3727 check_irq_off(); 3720 check_irq_off();
3728 old = cpu_cache_get(new->cachep); 3721 old = cpu_cache_get(new->cachep);
3729 3722
3730 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3723 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3731 new->new[smp_processor_id()] = old; 3724 new->new[smp_processor_id()] = old;
3732 } 3725 }
3733 3726
3734 /* Always called with the slab_mutex held */ 3727 /* Always called with the slab_mutex held */
3735 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3728 static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3736 int batchcount, int shared, gfp_t gfp) 3729 int batchcount, int shared, gfp_t gfp)
3737 { 3730 {
3738 struct ccupdate_struct *new; 3731 struct ccupdate_struct *new;
3739 int i; 3732 int i;
3740 3733
3741 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3734 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
3742 gfp); 3735 gfp);
3743 if (!new) 3736 if (!new)
3744 return -ENOMEM; 3737 return -ENOMEM;
3745 3738
3746 for_each_online_cpu(i) { 3739 for_each_online_cpu(i) {
3747 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3740 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
3748 batchcount, gfp); 3741 batchcount, gfp);
3749 if (!new->new[i]) { 3742 if (!new->new[i]) {
3750 for (i--; i >= 0; i--) 3743 for (i--; i >= 0; i--)
3751 kfree(new->new[i]); 3744 kfree(new->new[i]);
3752 kfree(new); 3745 kfree(new);
3753 return -ENOMEM; 3746 return -ENOMEM;
3754 } 3747 }
3755 } 3748 }
3756 new->cachep = cachep; 3749 new->cachep = cachep;
3757 3750
3758 on_each_cpu(do_ccupdate_local, (void *)new, 1); 3751 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3759 3752
3760 check_irq_on(); 3753 check_irq_on();
3761 cachep->batchcount = batchcount; 3754 cachep->batchcount = batchcount;
3762 cachep->limit = limit; 3755 cachep->limit = limit;
3763 cachep->shared = shared; 3756 cachep->shared = shared;
3764 3757
3765 for_each_online_cpu(i) { 3758 for_each_online_cpu(i) {
3766 LIST_HEAD(list); 3759 LIST_HEAD(list);
3767 struct array_cache *ccold = new->new[i]; 3760 struct array_cache *ccold = new->new[i];
3768 int node; 3761 int node;
3769 struct kmem_cache_node *n; 3762 struct kmem_cache_node *n;
3770 3763
3771 if (!ccold) 3764 if (!ccold)
3772 continue; 3765 continue;
3773 3766
3774 node = cpu_to_mem(i); 3767 node = cpu_to_mem(i);
3775 n = get_node(cachep, node); 3768 n = get_node(cachep, node);
3776 spin_lock_irq(&n->list_lock); 3769 spin_lock_irq(&n->list_lock);
3777 free_block(cachep, ccold->entry, ccold->avail, node, &list); 3770 free_block(cachep, ccold->entry, ccold->avail, node, &list);
3778 spin_unlock_irq(&n->list_lock); 3771 spin_unlock_irq(&n->list_lock);
3779 slabs_destroy(cachep, &list); 3772 slabs_destroy(cachep, &list);
3780 kfree(ccold); 3773 kfree(ccold);
3781 } 3774 }
3782 kfree(new); 3775 kfree(new);
3783 return alloc_kmem_cache_node(cachep, gfp); 3776 return alloc_kmem_cache_node(cachep, gfp);
3784 } 3777 }
3785 3778
3786 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3779 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3787 int batchcount, int shared, gfp_t gfp) 3780 int batchcount, int shared, gfp_t gfp)
3788 { 3781 {
3789 int ret; 3782 int ret;
3790 struct kmem_cache *c = NULL; 3783 struct kmem_cache *c = NULL;
3791 int i = 0; 3784 int i = 0;
3792 3785
3793 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3786 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3794 3787
3795 if (slab_state < FULL) 3788 if (slab_state < FULL)
3796 return ret; 3789 return ret;
3797 3790
3798 if ((ret < 0) || !is_root_cache(cachep)) 3791 if ((ret < 0) || !is_root_cache(cachep))
3799 return ret; 3792 return ret;
3800 3793
3801 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3794 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3802 for_each_memcg_cache_index(i) { 3795 for_each_memcg_cache_index(i) {
3803 c = cache_from_memcg_idx(cachep, i); 3796 c = cache_from_memcg_idx(cachep, i);
3804 if (c) 3797 if (c)
3805 /* return value determined by the parent cache only */ 3798 /* return value determined by the parent cache only */
3806 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3799 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3807 } 3800 }
3808 3801
3809 return ret; 3802 return ret;
3810 } 3803 }
3811 3804
3812 /* Always called with slab_mutex held */ 3805 /* Always called with slab_mutex held */
3813 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3806 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3814 { 3807 {
3815 int err; 3808 int err;
3816 int limit = 0; 3809 int limit = 0;
3817 int shared = 0; 3810 int shared = 0;
3818 int batchcount = 0; 3811 int batchcount = 0;
3819 3812
3820 if (!is_root_cache(cachep)) { 3813 if (!is_root_cache(cachep)) {
3821 struct kmem_cache *root = memcg_root_cache(cachep); 3814 struct kmem_cache *root = memcg_root_cache(cachep);
3822 limit = root->limit; 3815 limit = root->limit;
3823 shared = root->shared; 3816 shared = root->shared;
3824 batchcount = root->batchcount; 3817 batchcount = root->batchcount;
3825 } 3818 }
3826 3819
3827 if (limit && shared && batchcount) 3820 if (limit && shared && batchcount)
3828 goto skip_setup; 3821 goto skip_setup;
3829 /* 3822 /*
3830 * The head array serves three purposes: 3823 * The head array serves three purposes:
3831 * - create a LIFO ordering, i.e. return objects that are cache-warm 3824 * - create a LIFO ordering, i.e. return objects that are cache-warm
3832 * - reduce the number of spinlock operations. 3825 * - reduce the number of spinlock operations.
3833 * - reduce the number of linked list operations on the slab and 3826 * - reduce the number of linked list operations on the slab and
3834 * bufctl chains: array operations are cheaper. 3827 * bufctl chains: array operations are cheaper.
3835 * The numbers are guessed; we should auto-tune as described by 3828 * The numbers are guessed; we should auto-tune as described by
3836 * Bonwick. 3829 * Bonwick.
3837 */ 3830 */
3838 if (cachep->size > 131072) 3831 if (cachep->size > 131072)
3839 limit = 1; 3832 limit = 1;
3840 else if (cachep->size > PAGE_SIZE) 3833 else if (cachep->size > PAGE_SIZE)
3841 limit = 8; 3834 limit = 8;
3842 else if (cachep->size > 1024) 3835 else if (cachep->size > 1024)
3843 limit = 24; 3836 limit = 24;
3844 else if (cachep->size > 256) 3837 else if (cachep->size > 256)
3845 limit = 54; 3838 limit = 54;
3846 else 3839 else
3847 limit = 120; 3840 limit = 120;
3848 3841
3849 /* 3842 /*
3850 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3843 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3851 * allocation behaviour: Most allocs on one cpu, most free operations 3844 * allocation behaviour: Most allocs on one cpu, most free operations
3852 * on another cpu. For these cases, an efficient object passing between 3845 * on another cpu. For these cases, an efficient object passing between
3853 * cpus is necessary. This is provided by a shared array. The array 3846 * cpus is necessary. This is provided by a shared array. The array
3854 * replaces Bonwick's magazine layer. 3847 * replaces Bonwick's magazine layer.
3855 * On uniprocessor, it's functionally equivalent (but less efficient) 3848 * On uniprocessor, it's functionally equivalent (but less efficient)
3856 * to a larger limit. Thus disabled by default. 3849 * to a larger limit. Thus disabled by default.
3857 */ 3850 */
3858 shared = 0; 3851 shared = 0;
3859 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) 3852 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
3860 shared = 8; 3853 shared = 8;
3861 3854
3862 #if DEBUG 3855 #if DEBUG
3863 /* 3856 /*
3864 * With debugging enabled, a large batchcount leads to excessively long 3857 * With debugging enabled, a large batchcount leads to excessively long
3865 * periods with disabled local interrupts. Limit the batchcount 3858 * periods with disabled local interrupts. Limit the batchcount
3866 */ 3859 */
3867 if (limit > 32) 3860 if (limit > 32)
3868 limit = 32; 3861 limit = 32;
3869 #endif 3862 #endif
3870 batchcount = (limit + 1) / 2; 3863 batchcount = (limit + 1) / 2;
3871 skip_setup: 3864 skip_setup:
3872 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3865 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3873 if (err) 3866 if (err)
3874 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3867 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3875 cachep->name, -err); 3868 cachep->name, -err);
3876 return err; 3869 return err;
3877 } 3870 }
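/*
 * A worked example of the sizing heuristic above, assuming PAGE_SIZE == 4096
 * and an SMP machine: a 192-byte cache falls in the "<= 256" bucket, so
 * limit = 120, batchcount = (120 + 1) / 2 = 60 and shared = 8; a 2048-byte
 * cache gets limit = 24, batchcount = 12, shared = 8.  The hypothetical
 * helper below merely restates that ladder:
 */
static int example_default_limit(size_t size)
{
	if (size > 131072)
		return 1;
	if (size > PAGE_SIZE)
		return 8;
	if (size > 1024)
		return 24;
	if (size > 256)
		return 54;
	return 120;		/* batchcount then defaults to (limit + 1) / 2 */
}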
3878 3871
3879 /* 3872 /*
3880 * Drain an array if it contains any elements, taking the node lock only if 3873 * Drain an array if it contains any elements, taking the node lock only if
3881 * necessary. Note that the node listlock also protects the array_cache 3874 * necessary. Note that the node listlock also protects the array_cache
3882 * if drain_array() is used on the shared array. 3875 * if drain_array() is used on the shared array.
3883 */ 3876 */
3884 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3877 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3885 struct array_cache *ac, int force, int node) 3878 struct array_cache *ac, int force, int node)
3886 { 3879 {
3887 LIST_HEAD(list); 3880 LIST_HEAD(list);
3888 int tofree; 3881 int tofree;
3889 3882
3890 if (!ac || !ac->avail) 3883 if (!ac || !ac->avail)
3891 return; 3884 return;
3892 if (ac->touched && !force) { 3885 if (ac->touched && !force) {
3893 ac->touched = 0; 3886 ac->touched = 0;
3894 } else { 3887 } else {
3895 spin_lock_irq(&n->list_lock); 3888 spin_lock_irq(&n->list_lock);
3896 if (ac->avail) { 3889 if (ac->avail) {
3897 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3890 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3898 if (tofree > ac->avail) 3891 if (tofree > ac->avail)
3899 tofree = (ac->avail + 1) / 2; 3892 tofree = (ac->avail + 1) / 2;
3900 free_block(cachep, ac->entry, tofree, node, &list); 3893 free_block(cachep, ac->entry, tofree, node, &list);
3901 ac->avail -= tofree; 3894 ac->avail -= tofree;
3902 memmove(ac->entry, &(ac->entry[tofree]), 3895 memmove(ac->entry, &(ac->entry[tofree]),
3903 sizeof(void *) * ac->avail); 3896 sizeof(void *) * ac->avail);
3904 } 3897 }
3905 spin_unlock_irq(&n->list_lock); 3898 spin_unlock_irq(&n->list_lock);
3906 slabs_destroy(cachep, &list); 3899 slabs_destroy(cachep, &list);
3907 } 3900 }
3908 } 3901 }
3909 3902
3910 /** 3903 /**
3911 * cache_reap - Reclaim memory from caches. 3904 * cache_reap - Reclaim memory from caches.
3912 * @w: work descriptor 3905 * @w: work descriptor
3913 * 3906 *
3914 * Called from workqueue/eventd every few seconds. 3907 * Called from workqueue/eventd every few seconds.
3915 * Purpose: 3908 * Purpose:
3916 * - clear the per-cpu caches for this CPU. 3909 * - clear the per-cpu caches for this CPU.
3917 * - return freeable pages to the main free memory pool. 3910 * - return freeable pages to the main free memory pool.
3918 * 3911 *
3919 * If we cannot acquire the cache chain mutex then just give up - we'll try 3912 * If we cannot acquire the cache chain mutex then just give up - we'll try
3920 * again on the next iteration. 3913 * again on the next iteration.
3921 */ 3914 */
3922 static void cache_reap(struct work_struct *w) 3915 static void cache_reap(struct work_struct *w)
3923 { 3916 {
3924 struct kmem_cache *searchp; 3917 struct kmem_cache *searchp;
3925 struct kmem_cache_node *n; 3918 struct kmem_cache_node *n;
3926 int node = numa_mem_id(); 3919 int node = numa_mem_id();
3927 struct delayed_work *work = to_delayed_work(w); 3920 struct delayed_work *work = to_delayed_work(w);
3928 3921
3929 if (!mutex_trylock(&slab_mutex)) 3922 if (!mutex_trylock(&slab_mutex))
3930 /* Give up. Setup the next iteration. */ 3923 /* Give up. Setup the next iteration. */
3931 goto out; 3924 goto out;
3932 3925
3933 list_for_each_entry(searchp, &slab_caches, list) { 3926 list_for_each_entry(searchp, &slab_caches, list) {
3934 check_irq_on(); 3927 check_irq_on();
3935 3928
3936 /* 3929 /*
3937 * We only take the node lock if absolutely necessary and we 3930 * We only take the node lock if absolutely necessary and we
3938 * have established with reasonable certainty that 3931 * have established with reasonable certainty that
3939 * we can do some work if the lock was obtained. 3932 * we can do some work if the lock was obtained.
3940 */ 3933 */
3941 n = get_node(searchp, node); 3934 n = get_node(searchp, node);
3942 3935
3943 reap_alien(searchp, n); 3936 reap_alien(searchp, n);
3944 3937
3945 drain_array(searchp, n, cpu_cache_get(searchp), 0, node); 3938 drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
3946 3939
3947 /* 3940 /*
3948 * These are racy checks but it does not matter 3941 * These are racy checks but it does not matter
3949 * if we skip one check or scan twice. 3942 * if we skip one check or scan twice.
3950 */ 3943 */
3951 if (time_after(n->next_reap, jiffies)) 3944 if (time_after(n->next_reap, jiffies))
3952 goto next; 3945 goto next;
3953 3946
3954 n->next_reap = jiffies + REAPTIMEOUT_NODE; 3947 n->next_reap = jiffies + REAPTIMEOUT_NODE;
3955 3948
3956 drain_array(searchp, n, n->shared, 0, node); 3949 drain_array(searchp, n, n->shared, 0, node);
3957 3950
3958 if (n->free_touched) 3951 if (n->free_touched)
3959 n->free_touched = 0; 3952 n->free_touched = 0;
3960 else { 3953 else {
3961 int freed; 3954 int freed;
3962 3955
3963 freed = drain_freelist(searchp, n, (n->free_limit + 3956 freed = drain_freelist(searchp, n, (n->free_limit +
3964 5 * searchp->num - 1) / (5 * searchp->num)); 3957 5 * searchp->num - 1) / (5 * searchp->num));
3965 STATS_ADD_REAPED(searchp, freed); 3958 STATS_ADD_REAPED(searchp, freed);
3966 } 3959 }
3967 next: 3960 next:
3968 cond_resched(); 3961 cond_resched();
3969 } 3962 }
3970 check_irq_on(); 3963 check_irq_on();
3971 mutex_unlock(&slab_mutex); 3964 mutex_unlock(&slab_mutex);
3972 next_reap_node(); 3965 next_reap_node();
3973 out: 3966 out:
3974 /* Set up the next iteration */ 3967 /* Set up the next iteration */
3975 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); 3968 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
3976 } 3969 }
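/*
 * The drain target above, (n->free_limit + 5 * searchp->num - 1) /
 * (5 * searchp->num), is ceil(free_limit / (5 * objects_per_slab)): roughly a
 * fifth of the node's free_limit, expressed in whole slabs, is trimmed per
 * reap pass.  Worked example with hypothetical numbers: free_limit = 600
 * objects and num = 20 objects per slab give (600 + 99) / 100 = 6 slabs,
 * i.e. about 120 objects, handed to drain_freelist() each pass.
 */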
3977 3970
3978 #ifdef CONFIG_SLABINFO 3971 #ifdef CONFIG_SLABINFO
3979 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 3972 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
3980 { 3973 {
3981 struct page *page; 3974 struct page *page;
3982 unsigned long active_objs; 3975 unsigned long active_objs;
3983 unsigned long num_objs; 3976 unsigned long num_objs;
3984 unsigned long active_slabs = 0; 3977 unsigned long active_slabs = 0;
3985 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3978 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3986 const char *name; 3979 const char *name;
3987 char *error = NULL; 3980 char *error = NULL;
3988 int node; 3981 int node;
3989 struct kmem_cache_node *n; 3982 struct kmem_cache_node *n;
3990 3983
3991 active_objs = 0; 3984 active_objs = 0;
3992 num_slabs = 0; 3985 num_slabs = 0;
3993 for_each_kmem_cache_node(cachep, node, n) { 3986 for_each_kmem_cache_node(cachep, node, n) {
3994 3987
3995 check_irq_on(); 3988 check_irq_on();
3996 spin_lock_irq(&n->list_lock); 3989 spin_lock_irq(&n->list_lock);
3997 3990
3998 list_for_each_entry(page, &n->slabs_full, lru) { 3991 list_for_each_entry(page, &n->slabs_full, lru) {
3999 if (page->active != cachep->num && !error) 3992 if (page->active != cachep->num && !error)
4000 error = "slabs_full accounting error"; 3993 error = "slabs_full accounting error";
4001 active_objs += cachep->num; 3994 active_objs += cachep->num;
4002 active_slabs++; 3995 active_slabs++;
4003 } 3996 }
4004 list_for_each_entry(page, &n->slabs_partial, lru) { 3997 list_for_each_entry(page, &n->slabs_partial, lru) {
4005 if (page->active == cachep->num && !error) 3998 if (page->active == cachep->num && !error)
4006 error = "slabs_partial accounting error"; 3999 error = "slabs_partial accounting error";
4007 if (!page->active && !error) 4000 if (!page->active && !error)
4008 error = "slabs_partial accounting error"; 4001 error = "slabs_partial accounting error";
4009 active_objs += page->active; 4002 active_objs += page->active;
4010 active_slabs++; 4003 active_slabs++;
4011 } 4004 }
4012 list_for_each_entry(page, &n->slabs_free, lru) { 4005 list_for_each_entry(page, &n->slabs_free, lru) {
4013 if (page->active && !error) 4006 if (page->active && !error)
4014 error = "slabs_free accounting error"; 4007 error = "slabs_free accounting error";
4015 num_slabs++; 4008 num_slabs++;
4016 } 4009 }
4017 free_objects += n->free_objects; 4010 free_objects += n->free_objects;
4018 if (n->shared) 4011 if (n->shared)
4019 shared_avail += n->shared->avail; 4012 shared_avail += n->shared->avail;
4020 4013
4021 spin_unlock_irq(&n->list_lock); 4014 spin_unlock_irq(&n->list_lock);
4022 } 4015 }
4023 num_slabs += active_slabs; 4016 num_slabs += active_slabs;
4024 num_objs = num_slabs * cachep->num; 4017 num_objs = num_slabs * cachep->num;
4025 if (num_objs - active_objs != free_objects && !error) 4018 if (num_objs - active_objs != free_objects && !error)
4026 error = "free_objects accounting error"; 4019 error = "free_objects accounting error";
4027 4020
4028 name = cachep->name; 4021 name = cachep->name;
4029 if (error) 4022 if (error)
4030 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4023 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4031 4024
4032 sinfo->active_objs = active_objs; 4025 sinfo->active_objs = active_objs;
4033 sinfo->num_objs = num_objs; 4026 sinfo->num_objs = num_objs;
4034 sinfo->active_slabs = active_slabs; 4027 sinfo->active_slabs = active_slabs;
4035 sinfo->num_slabs = num_slabs; 4028 sinfo->num_slabs = num_slabs;
4036 sinfo->shared_avail = shared_avail; 4029 sinfo->shared_avail = shared_avail;
4037 sinfo->limit = cachep->limit; 4030 sinfo->limit = cachep->limit;
4038 sinfo->batchcount = cachep->batchcount; 4031 sinfo->batchcount = cachep->batchcount;
4039 sinfo->shared = cachep->shared; 4032 sinfo->shared = cachep->shared;
4040 sinfo->objects_per_slab = cachep->num; 4033 sinfo->objects_per_slab = cachep->num;
4041 sinfo->cache_order = cachep->gfporder; 4034 sinfo->cache_order = cachep->gfporder;
4042 } 4035 }
4043 4036
4044 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) 4037 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4045 { 4038 {
4046 #if STATS 4039 #if STATS
4047 { /* node stats */ 4040 { /* node stats */
4048 unsigned long high = cachep->high_mark; 4041 unsigned long high = cachep->high_mark;
4049 unsigned long allocs = cachep->num_allocations; 4042 unsigned long allocs = cachep->num_allocations;
4050 unsigned long grown = cachep->grown; 4043 unsigned long grown = cachep->grown;
4051 unsigned long reaped = cachep->reaped; 4044 unsigned long reaped = cachep->reaped;
4052 unsigned long errors = cachep->errors; 4045 unsigned long errors = cachep->errors;
4053 unsigned long max_freeable = cachep->max_freeable; 4046 unsigned long max_freeable = cachep->max_freeable;
4054 unsigned long node_allocs = cachep->node_allocs; 4047 unsigned long node_allocs = cachep->node_allocs;
4055 unsigned long node_frees = cachep->node_frees; 4048 unsigned long node_frees = cachep->node_frees;
4056 unsigned long overflows = cachep->node_overflow; 4049 unsigned long overflows = cachep->node_overflow;
4057 4050
4058 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " 4051 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4059 "%4lu %4lu %4lu %4lu %4lu", 4052 "%4lu %4lu %4lu %4lu %4lu",
4060 allocs, high, grown, 4053 allocs, high, grown,
4061 reaped, errors, max_freeable, node_allocs, 4054 reaped, errors, max_freeable, node_allocs,
4062 node_frees, overflows); 4055 node_frees, overflows);
4063 } 4056 }
4064 /* cpu stats */ 4057 /* cpu stats */
4065 { 4058 {
4066 unsigned long allochit = atomic_read(&cachep->allochit); 4059 unsigned long allochit = atomic_read(&cachep->allochit);
4067 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4060 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4068 unsigned long freehit = atomic_read(&cachep->freehit); 4061 unsigned long freehit = atomic_read(&cachep->freehit);
4069 unsigned long freemiss = atomic_read(&cachep->freemiss); 4062 unsigned long freemiss = atomic_read(&cachep->freemiss);
4070 4063
4071 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4064 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4072 allochit, allocmiss, freehit, freemiss); 4065 allochit, allocmiss, freehit, freemiss);
4073 } 4066 }
4074 #endif 4067 #endif
4075 } 4068 }
4076 4069
4077 #define MAX_SLABINFO_WRITE 128 4070 #define MAX_SLABINFO_WRITE 128
4078 /** 4071 /**
4079 * slabinfo_write - Tuning for the slab allocator 4072 * slabinfo_write - Tuning for the slab allocator
4080 * @file: unused 4073 * @file: unused
4081 * @buffer: user buffer 4074 * @buffer: user buffer
4082 * @count: data length 4075 * @count: data length
4083 * @ppos: unused 4076 * @ppos: unused
4084 */ 4077 */
4085 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4078 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4086 size_t count, loff_t *ppos) 4079 size_t count, loff_t *ppos)
4087 { 4080 {
4088 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4081 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4089 int limit, batchcount, shared, res; 4082 int limit, batchcount, shared, res;
4090 struct kmem_cache *cachep; 4083 struct kmem_cache *cachep;
4091 4084
4092 if (count > MAX_SLABINFO_WRITE) 4085 if (count > MAX_SLABINFO_WRITE)
4093 return -EINVAL; 4086 return -EINVAL;
4094 if (copy_from_user(&kbuf, buffer, count)) 4087 if (copy_from_user(&kbuf, buffer, count))
4095 return -EFAULT; 4088 return -EFAULT;
4096 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4089 kbuf[MAX_SLABINFO_WRITE] = '\0';
4097 4090
4098 tmp = strchr(kbuf, ' '); 4091 tmp = strchr(kbuf, ' ');
4099 if (!tmp) 4092 if (!tmp)
4100 return -EINVAL; 4093 return -EINVAL;
4101 *tmp = '\0'; 4094 *tmp = '\0';
4102 tmp++; 4095 tmp++;
4103 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4096 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4104 return -EINVAL; 4097 return -EINVAL;
4105 4098
4106 /* Find the cache in the chain of caches. */ 4099 /* Find the cache in the chain of caches. */
4107 mutex_lock(&slab_mutex); 4100 mutex_lock(&slab_mutex);
4108 res = -EINVAL; 4101 res = -EINVAL;
4109 list_for_each_entry(cachep, &slab_caches, list) { 4102 list_for_each_entry(cachep, &slab_caches, list) {
4110 if (!strcmp(cachep->name, kbuf)) { 4103 if (!strcmp(cachep->name, kbuf)) {
4111 if (limit < 1 || batchcount < 1 || 4104 if (limit < 1 || batchcount < 1 ||
4112 batchcount > limit || shared < 0) { 4105 batchcount > limit || shared < 0) {
4113 res = 0; 4106 res = 0;
4114 } else { 4107 } else {
4115 res = do_tune_cpucache(cachep, limit, 4108 res = do_tune_cpucache(cachep, limit,
4116 batchcount, shared, 4109 batchcount, shared,
4117 GFP_KERNEL); 4110 GFP_KERNEL);
4118 } 4111 }
4119 break; 4112 break;
4120 } 4113 }
4121 } 4114 }
4122 mutex_unlock(&slab_mutex); 4115 mutex_unlock(&slab_mutex);
4123 if (res >= 0) 4116 if (res >= 0)
4124 res = count; 4117 res = count;
4125 return res; 4118 return res;
4126 } 4119 }
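/*
 * This is the write side of /proc/slabinfo.  The expected input line is
 * "<cache-name> <limit> <batchcount> <shared>", for example (from a root
 * shell, values purely illustrative):
 *
 *	echo "dentry 120 60 8" > /proc/slabinfo
 *
 * An unknown cache name yields -EINVAL; a known name with out-of-range values
 * (limit < 1, batchcount < 1, batchcount > limit or shared < 0) is accepted
 * but ignored, since res is set to 0 above and the write still returns count.
 */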

#ifdef CONFIG_DEBUG_SLAB_LEAK

static void *leaks_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}

static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *p;
	int l;
	if (!v)
		return 1;
	l = n[1];
	p = n + 2;
	while (l) {
		int i = l/2;
		unsigned long *q = p + 2 * i;
		if (*q == v) {
			q[1]++;
			return 1;
		}
		if (*q > v) {
			l = i;
		} else {
			p = q + 2;
			l -= i + 1;
		}
	}
	if (++n[1] == n[0])
		return 0;
	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
	p[0] = v;
	p[1] = 1;
	return 1;
}
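add_caller() and leaks_show() further down share a flat table of unsigned longs: n[0] holds the capacity in entries, n[1] the number of entries in use, and entries start at n[2] as (caller address, object count) pairs kept sorted by address so the loop above can binary-search them. A small userspace sketch of that layout, with made-up addresses:

/*
 * Hedged sketch of the caller-table layout (values are invented):
 *   n[0]         capacity in entries
 *   n[1]         entries currently in use
 *   n[2 + 2*i]   caller address of entry i, sorted ascending
 *   n[3 + 2*i]   number of objects recorded for that caller
 */
#include <stdio.h>

int main(void)
{
	unsigned long n[2 + 2 * 4] = {
		4, 2,			/* room for 4 entries, 2 used */
		0xc0100000UL, 7,	/* made-up caller, 7 objects */
		0xc0200000UL, 3,	/* made-up caller, 3 objects */
	};

	for (unsigned long i = 0; i < n[1]; i++)
		printf("caller %#lx owns %lu objects\n",
		       n[2 + 2 * i], n[3 + 2 * i]);
	return 0;
}

When an insertion would make n[1] reach n[0], add_caller() returns 0 and leaks_show() reallocates the table at twice the capacity before retrying the record.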

static void handle_slab(unsigned long *n, struct kmem_cache *c,
			struct page *page)
{
	void *p;
	int i;

	if (n[0] == n[1])
		return;
	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
		if (get_obj_status(page, i) != OBJECT_ACTIVE)
			continue;

		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
			return;
	}
}

static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset, size;
	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];

	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
		if (modname[0])
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}

static int leaks_show(struct seq_file *m, void *p)
{
	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
	struct page *page;
	struct kmem_cache_node *n;
	const char *name;
	unsigned long *x = m->private;
	int node;
	int i;

	if (!(cachep->flags & SLAB_STORE_USER))
		return 0;
	if (!(cachep->flags & SLAB_RED_ZONE))
		return 0;

	/* OK, we can do it */

	x[1] = 0;

	for_each_kmem_cache_node(cachep, node, n) {

		check_irq_on();
		spin_lock_irq(&n->list_lock);

		list_for_each_entry(page, &n->slabs_full, lru)
			handle_slab(x, cachep, page);
		list_for_each_entry(page, &n->slabs_partial, lru)
			handle_slab(x, cachep, page);
		spin_unlock_irq(&n->list_lock);
	}
	name = cachep->name;
	if (x[0] == x[1]) {
		/* Increase the buffer size */
		mutex_unlock(&slab_mutex);
		m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
		if (!m->private) {
			/* Too bad, we are really out */
			m->private = x;
			mutex_lock(&slab_mutex);
			return -ENOMEM;
		}
		*(unsigned long *)m->private = x[0] * 2;
		kfree(x);
		mutex_lock(&slab_mutex);
		/* Now make sure this entry will be retried */
		m->count = m->size;
		return 0;
	}
	for (i = 0; i < x[1]; i++) {
		seq_printf(m, "%s: %lu ", name, x[2*i+3]);
		show_symbol(m, x[2*i+2]);
		seq_putc(m, '\n');
	}

	return 0;
}

static const struct seq_operations slabstats_op = {
	.start = leaks_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = leaks_show,
};

static int slabstats_open(struct inode *inode, struct file *file)
{
	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
	int ret = -ENOMEM;
	if (n) {
		ret = seq_open(file, &slabstats_op);
		if (!ret) {
			struct seq_file *m = file->private_data;
			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
			m->private = n;
			n = NULL;
		}
		kfree(n);
	}
	return ret;
}

static const struct file_operations proc_slabstats_operations = {
	.open = slabstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
#endif

static int __init slab_proc_init(void)
{
#ifdef CONFIG_DEBUG_SLAB_LEAK
	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
	return 0;
}
module_init(slab_proc_init);
#endif
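With CONFIG_DEBUG_SLAB_LEAK enabled, slab_proc_init() above registers the per-caller counts as /proc/slab_allocators; each record emitted by leaks_show() is a cache name, an object count, and the allocating caller resolved by show_symbol(). A minimal userspace reader, included only as a hedged illustration:

/* Hedged sketch: dump /proc/slab_allocators (requires CONFIG_DEBUG_SLAB_LEAK). */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/slab_allocators", "r");

	if (!f) {
		perror("fopen /proc/slab_allocators");
		return 1;
	}
	/* each line: "<cache>: <count> <symbol>+<offset>/<size> [module]" */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}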

/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may internally round up allocations and return more memory
 * than requested. ksize() can be used to determine the actual amount of
 * memory allocated. The caller may use this additional memory, even though
 * a smaller amount of memory was initially specified with the kmalloc call.
 * The caller must guarantee that objp points to a valid object previously
 * allocated with either kmalloc() or kmem_cache_alloc(). The object
 * must not be freed during the duration of the call.
 */
size_t ksize(const void *objp)
{
	BUG_ON(!objp);
	if (unlikely(objp == ZERO_SIZE_PTR))
		return 0;

	return virt_to_cache(objp)->object_size;
}
EXPORT_SYMBOL(ksize);
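As the kernel-doc above notes, the caller may use the full rounded-up allocation. A short in-kernel sketch of that pattern; the helper name is hypothetical and not part of this file:

/* Hedged sketch: let a buffer grow into the slack kmalloc() may have added. */
#include <linux/slab.h>

static char *alloc_scratch(size_t want, size_t *usable)
{
	char *buf = kmalloc(want, GFP_KERNEL);

	if (!buf)
		return NULL;
	*usable = ksize(buf);	/* may be larger than 'want' */
	return buf;
}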