Commit 8207649c41bf5c28a987be47d66545fa9d2994d8
Exists in ti-lsk-linux-4.1.y and in 10 other branches
Merge branch 'akpm' (fixes from Andrew Morton)
Merge fixes from Andrew Morton:
 "9 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: softdirty: keep bit when zapping file pte
  fs/cachefiles: add missing \n to kerror conversions
  genalloc: fix device node resource counter
  drivers/rtc/rtc-efi.c: add missing module alias
  mm, slab: initialize object alignment on cache creation
  mm: softdirty: addresses before VMAs in PTE holes aren't softdirty
  ocfs2/dlm: do not get resource spinlock if lockres is new
  nilfs2: fix data loss with mmap()
  ocfs2: free vol_label in ocfs2_delete_osb()
Showing 14 changed files
drivers/rtc/rtc-efi.c
/*
 * rtc-efi: RTC Class Driver for EFI-based systems
 *
 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
 *
 * Author: dann frazier <dannf@hp.com>
 * Based on efirtc.c by Stephane Eranian
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/time.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
#include <linux/efi.h>

#define EFI_ISDST (EFI_TIME_ADJUST_DAYLIGHT|EFI_TIME_IN_DAYLIGHT)
/*
 * EFI Epoch is 1/1/1998
 */
#define EFI_RTC_EPOCH 1998

/*
 * returns day of the year [0-365]
 */
static inline int
compute_yday(efi_time_t *eft)
{
	/* efi_time_t.month is in the [1-12] so, we need -1 */
	return rtc_year_days(eft->day, eft->month - 1, eft->year);
}
/*
 * returns day of the week [0-6] 0=Sunday
 *
 * Don't try to provide a year that's before 1998, please !
 */
static int
compute_wday(efi_time_t *eft)
{
	int y;
	int ndays = 0;

	if (eft->year < EFI_RTC_EPOCH) {
		pr_err("EFI year < " __stringify(EFI_RTC_EPOCH) ", invalid date\n");
		return -1;
	}

	for (y = EFI_RTC_EPOCH; y < eft->year; y++)
		ndays += 365 + (is_leap_year(y) ? 1 : 0);

	ndays += compute_yday(eft);

	/*
	 * 4=1/1/1998 was a Thursday
	 */
	return (ndays + 4) % 7;
}

static void
convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft)
{
	eft->year = wtime->tm_year + 1900;
	eft->month = wtime->tm_mon + 1;
	eft->day = wtime->tm_mday;
	eft->hour = wtime->tm_hour;
	eft->minute = wtime->tm_min;
	eft->second = wtime->tm_sec;
	eft->nanosecond = 0;
	eft->daylight = wtime->tm_isdst ? EFI_ISDST : 0;
	eft->timezone = EFI_UNSPECIFIED_TIMEZONE;
}

static bool
convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime)
{
	memset(wtime, 0, sizeof(*wtime));

	if (eft->second >= 60)
		return false;
	wtime->tm_sec = eft->second;

	if (eft->minute >= 60)
		return false;
	wtime->tm_min = eft->minute;

	if (eft->hour >= 24)
		return false;
	wtime->tm_hour = eft->hour;

	if (!eft->day || eft->day > 31)
		return false;
	wtime->tm_mday = eft->day;

	if (!eft->month || eft->month > 12)
		return false;
	wtime->tm_mon = eft->month - 1;
	wtime->tm_year = eft->year - 1900;

	/* day of the week [0-6], Sunday=0 */
	wtime->tm_wday = compute_wday(eft);
	if (wtime->tm_wday < 0)
		return false;

	/* day in the year [1-365]*/
	wtime->tm_yday = compute_yday(eft);


	switch (eft->daylight & EFI_ISDST) {
	case EFI_ISDST:
		wtime->tm_isdst = 1;
		break;
	case EFI_TIME_ADJUST_DAYLIGHT:
		wtime->tm_isdst = 0;
		break;
	default:
		wtime->tm_isdst = -1;
	}

	return true;
}

static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	/*
	 * As of EFI v1.10, this call always returns an unsupported status
	 */
	status = efi.get_wakeup_time((efi_bool_t *)&wkalrm->enabled,
				     (efi_bool_t *)&wkalrm->pending, &eft);

	if (status != EFI_SUCCESS)
		return -EINVAL;

	if (!convert_from_efi_time(&eft, &wkalrm->time))
		return -EIO;

	return rtc_valid_tm(&wkalrm->time);
}

static int efi_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
	efi_time_t eft;
	efi_status_t status;

	convert_to_efi_time(&wkalrm->time, &eft);

	/*
	 * XXX Fixme:
	 * As of EFI 0.92 with the firmware I have on my
	 * machine this call does not seem to work quite
	 * right
	 *
	 * As of v1.10, this call always returns an unsupported status
	 */
	status = efi.set_wakeup_time((efi_bool_t)wkalrm->enabled, &eft);

	dev_warn(dev, "write status is %d\n", (int)status);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static int efi_read_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;
	efi_time_cap_t cap;

	status = efi.get_time(&eft, &cap);

	if (status != EFI_SUCCESS) {
		/* should never happen */
		dev_err(dev, "can't read time\n");
		return -EINVAL;
	}

	if (!convert_from_efi_time(&eft, tm))
		return -EIO;

	return rtc_valid_tm(tm);
}

static int efi_set_time(struct device *dev, struct rtc_time *tm)
{
	efi_status_t status;
	efi_time_t eft;

	convert_to_efi_time(tm, &eft);

	status = efi.set_time(&eft);

	return status == EFI_SUCCESS ? 0 : -EINVAL;
}

static const struct rtc_class_ops efi_rtc_ops = {
	.read_time = efi_read_time,
	.set_time = efi_set_time,
	.read_alarm = efi_read_alarm,
	.set_alarm = efi_set_alarm,
};

static int __init efi_rtc_probe(struct platform_device *dev)
{
	struct rtc_device *rtc;

	rtc = devm_rtc_device_register(&dev->dev, "rtc-efi", &efi_rtc_ops,
					THIS_MODULE);
	if (IS_ERR(rtc))
		return PTR_ERR(rtc);

	platform_set_drvdata(dev, rtc);

	return 0;
}

static struct platform_driver efi_rtc_driver = {
	.driver = {
		.name = "rtc-efi",
		.owner = THIS_MODULE,
	},
};

module_platform_driver_probe(efi_rtc_driver, efi_rtc_probe);

+MODULE_ALIAS("platform:rtc-efi");
MODULE_AUTHOR("dann frazier <dannf@hp.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("EFI RTC driver");
fs/cachefiles/bind.c
/* Bind and unbind a cache from the filesystem backing it
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include "internal.h"

static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);

/*
 * bind a directory as a cache
 */
int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
{
	_enter("{%u,%u,%u,%u,%u,%u},%s",
	       cache->frun_percent,
	       cache->fcull_percent,
	       cache->fstop_percent,
	       cache->brun_percent,
	       cache->bcull_percent,
	       cache->bstop_percent,
	       args);

	/* start by checking things over */
	ASSERT(cache->fstop_percent >= 0 &&
	       cache->fstop_percent < cache->fcull_percent &&
	       cache->fcull_percent < cache->frun_percent &&
	       cache->frun_percent < 100);

	ASSERT(cache->bstop_percent >= 0 &&
	       cache->bstop_percent < cache->bcull_percent &&
	       cache->bcull_percent < cache->brun_percent &&
	       cache->brun_percent < 100);

	if (*args) {
-		pr_err("'bind' command doesn't take an argument");
+		pr_err("'bind' command doesn't take an argument\n");
		return -EINVAL;
	}

	if (!cache->rootdirname) {
-		pr_err("No cache directory specified");
+		pr_err("No cache directory specified\n");
		return -EINVAL;
	}

	/* don't permit already bound caches to be re-bound */
	if (test_bit(CACHEFILES_READY, &cache->flags)) {
-		pr_err("Cache already bound");
+		pr_err("Cache already bound\n");
		return -EBUSY;
	}

	/* make sure we have copies of the tag and dirname strings */
	if (!cache->tag) {
		/* the tag string is released by the fops->release()
		 * function, so we don't release it on error here */
		cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
		if (!cache->tag)
			return -ENOMEM;
	}

	/* add the cache */
	return cachefiles_daemon_add_cache(cache);
}

/*
 * add a cache
 */
static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
{
	struct cachefiles_object *fsdef;
	struct path path;
	struct kstatfs stats;
	struct dentry *graveyard, *cachedir, *root;
	const struct cred *saved_cred;
	int ret;

	_enter("");

	/* we want to work under the module's security ID */
	ret = cachefiles_get_security_ID(cache);
	if (ret < 0)
		return ret;

	cachefiles_begin_secure(cache, &saved_cred);

	/* allocate the root index object */
	ret = -ENOMEM;

	fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
	if (!fsdef)
		goto error_root_object;

	ASSERTCMP(fsdef->backer, ==, NULL);

	atomic_set(&fsdef->usage, 1);
	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;

	_debug("- fsdef %p", fsdef);

	/* look up the directory at the root of the cache */
	ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
	if (ret < 0)
		goto error_open_root;

	cache->mnt = path.mnt;
	root = path.dentry;

	/* check parameters */
	ret = -EOPNOTSUPP;
	if (!root->d_inode ||
	    !root->d_inode->i_op->lookup ||
	    !root->d_inode->i_op->mkdir ||
	    !root->d_inode->i_op->setxattr ||
	    !root->d_inode->i_op->getxattr ||
	    !root->d_sb->s_op->statfs ||
	    !root->d_sb->s_op->sync_fs)
		goto error_unsupported;

	ret = -EROFS;
	if (root->d_sb->s_flags & MS_RDONLY)
		goto error_unsupported;

	/* determine the security of the on-disk cache as this governs
	 * security ID of files we create */
	ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
	if (ret < 0)
		goto error_unsupported;

	/* get the cache size and blocksize */
	ret = vfs_statfs(&path, &stats);
	if (ret < 0)
		goto error_unsupported;

	ret = -ERANGE;
	if (stats.f_bsize <= 0)
		goto error_unsupported;

	ret = -EOPNOTSUPP;
	if (stats.f_bsize > PAGE_SIZE)
		goto error_unsupported;

	cache->bsize = stats.f_bsize;
	cache->bshift = 0;
	if (stats.f_bsize < PAGE_SIZE)
		cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);

	_debug("blksize %u (shift %u)",
	       cache->bsize, cache->bshift);

	_debug("size %llu, avail %llu",
	       (unsigned long long) stats.f_blocks,
	       (unsigned long long) stats.f_bavail);

	/* set up caching limits */
	do_div(stats.f_files, 100);
	cache->fstop = stats.f_files * cache->fstop_percent;
	cache->fcull = stats.f_files * cache->fcull_percent;
	cache->frun = stats.f_files * cache->frun_percent;

	_debug("limits {%llu,%llu,%llu} files",
	       (unsigned long long) cache->frun,
	       (unsigned long long) cache->fcull,
	       (unsigned long long) cache->fstop);

	stats.f_blocks >>= cache->bshift;
	do_div(stats.f_blocks, 100);
	cache->bstop = stats.f_blocks * cache->bstop_percent;
	cache->bcull = stats.f_blocks * cache->bcull_percent;
	cache->brun = stats.f_blocks * cache->brun_percent;

	_debug("limits {%llu,%llu,%llu} blocks",
	       (unsigned long long) cache->brun,
	       (unsigned long long) cache->bcull,
	       (unsigned long long) cache->bstop);

	/* get the cache directory and check its type */
	cachedir = cachefiles_get_directory(cache, root, "cache");
	if (IS_ERR(cachedir)) {
		ret = PTR_ERR(cachedir);
		goto error_unsupported;
	}

	fsdef->dentry = cachedir;
	fsdef->fscache.cookie = NULL;

	ret = cachefiles_check_object_type(fsdef);
	if (ret < 0)
		goto error_unsupported;

	/* get the graveyard directory */
	graveyard = cachefiles_get_directory(cache, root, "graveyard");
	if (IS_ERR(graveyard)) {
		ret = PTR_ERR(graveyard);
		goto error_unsupported;
	}

	cache->graveyard = graveyard;

	/* publish the cache */
	fscache_init_cache(&cache->cache,
			   &cachefiles_cache_ops,
			   "%s",
			   fsdef->dentry->d_sb->s_id);

	fscache_object_init(&fsdef->fscache, NULL, &cache->cache);

	ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
	if (ret < 0)
		goto error_add_cache;

	/* done */
	set_bit(CACHEFILES_READY, &cache->flags);
	dput(root);

	pr_info("File cache on %s registered\n", cache->cache.identifier);

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);
	cachefiles_end_secure(cache, saved_cred);
	return 0;

error_add_cache:
	dput(cache->graveyard);
	cache->graveyard = NULL;
error_unsupported:
	mntput(cache->mnt);
	cache->mnt = NULL;
	dput(fsdef->dentry);
	fsdef->dentry = NULL;
	dput(root);
error_open_root:
	kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
	cachefiles_end_secure(cache, saved_cred);
-	pr_err("Failed to register: %d", ret);
+	pr_err("Failed to register: %d\n", ret);
	return ret;
}

/*
 * unbind a cache on fd release
 */
void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
{
	_enter("");

	if (test_bit(CACHEFILES_READY, &cache->flags)) {
		pr_info("File cache on %s unregistering\n",
			cache->cache.identifier);

		fscache_withdraw_cache(&cache->cache);
	}

	dput(cache->graveyard);
	mntput(cache->mnt);

	kfree(cache->rootdirname);
	kfree(cache->secctx);
	kfree(cache->tag);

	_leave("");
}
fs/cachefiles/daemon.c
/* Daemon interface
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public Licence
 * as published by the Free Software Foundation; either version
 * 2 of the Licence, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/mount.h>
#include <linux/statfs.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/fs_struct.h>
#include "internal.h"

static int cachefiles_daemon_open(struct inode *, struct file *);
static int cachefiles_daemon_release(struct inode *, struct file *);
static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
				      loff_t *);
static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
				       size_t, loff_t *);
static unsigned int cachefiles_daemon_poll(struct file *,
					   struct poll_table_struct *);
static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);

static unsigned long cachefiles_open;

const struct file_operations cachefiles_daemon_fops = {
	.owner = THIS_MODULE,
	.open = cachefiles_daemon_open,
	.release = cachefiles_daemon_release,
	.read = cachefiles_daemon_read,
	.write = cachefiles_daemon_write,
	.poll = cachefiles_daemon_poll,
	.llseek = noop_llseek,
};

struct cachefiles_daemon_cmd {
	char name[8];
	int (*handler)(struct cachefiles_cache *cache, char *args);
};

static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
	{ "bind", cachefiles_daemon_bind },
	{ "brun", cachefiles_daemon_brun },
	{ "bcull", cachefiles_daemon_bcull },
	{ "bstop", cachefiles_daemon_bstop },
	{ "cull", cachefiles_daemon_cull },
	{ "debug", cachefiles_daemon_debug },
	{ "dir", cachefiles_daemon_dir },
	{ "frun", cachefiles_daemon_frun },
	{ "fcull", cachefiles_daemon_fcull },
	{ "fstop", cachefiles_daemon_fstop },
	{ "inuse", cachefiles_daemon_inuse },
	{ "secctx", cachefiles_daemon_secctx },
	{ "tag", cachefiles_daemon_tag },
	{ "", NULL }
};


/*
 * do various checks
 */
static int cachefiles_daemon_open(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache;

	_enter("");

	/* only the superuser may do this */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* the cachefiles device may only be open once at a time */
	if (xchg(&cachefiles_open, 1) == 1)
		return -EBUSY;

	/* allocate a cache record */
	cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
	if (!cache) {
		cachefiles_open = 0;
		return -ENOMEM;
	}

	mutex_init(&cache->daemon_mutex);
	cache->active_nodes = RB_ROOT;
	rwlock_init(&cache->active_lock);
	init_waitqueue_head(&cache->daemon_pollwq);

	/* set default caching limits
	 * - limit at 1% free space and/or free files
	 * - cull below 5% free space and/or free files
	 * - cease culling above 7% free space and/or free files
	 */
	cache->frun_percent = 7;
	cache->fcull_percent = 5;
	cache->fstop_percent = 1;
	cache->brun_percent = 7;
	cache->bcull_percent = 5;
	cache->bstop_percent = 1;

	file->private_data = cache;
	cache->cachefilesd = file;
	return 0;
}

/*
 * release a cache
 */
static int cachefiles_daemon_release(struct inode *inode, struct file *file)
{
	struct cachefiles_cache *cache = file->private_data;

	_enter("");

	ASSERT(cache);

	set_bit(CACHEFILES_DEAD, &cache->flags);

	cachefiles_daemon_unbind(cache);

	ASSERT(!cache->active_nodes.rb_node);

	/* clean up the control file interface */
	cache->cachefilesd = NULL;
	file->private_data = NULL;
	cachefiles_open = 0;

	kfree(cache);

	_leave("");
	return 0;
}

/*
 * read the cache state
 */
static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
				      size_t buflen, loff_t *pos)
{
	struct cachefiles_cache *cache = file->private_data;
	char buffer[256];
	int n;

	//_enter(",,%zu,", buflen);

	if (!test_bit(CACHEFILES_READY, &cache->flags))
		return 0;

	/* check how much space the cache has */
	cachefiles_has_space(cache, 0, 0);

	/* summarise */
	clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);

	n = snprintf(buffer, sizeof(buffer),
		     "cull=%c"
		     " frun=%llx"
		     " fcull=%llx"
		     " fstop=%llx"
		     " brun=%llx"
		     " bcull=%llx"
		     " bstop=%llx",
		     test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
		     (unsigned long long) cache->frun,
		     (unsigned long long) cache->fcull,
		     (unsigned long long) cache->fstop,
		     (unsigned long long) cache->brun,
		     (unsigned long long) cache->bcull,
		     (unsigned long long) cache->bstop
		     );

	if (n > buflen)
		return -EMSGSIZE;

	if (copy_to_user(_buffer, buffer, n) != 0)
		return -EFAULT;

	return n;
}

/*
 * command the cache
 */
static ssize_t cachefiles_daemon_write(struct file *file,
				       const char __user *_data,
				       size_t datalen,
				       loff_t *pos)
{
	const struct cachefiles_daemon_cmd *cmd;
	struct cachefiles_cache *cache = file->private_data;
	ssize_t ret;
	char *data, *args, *cp;

	//_enter(",,%zu,", datalen);

	ASSERT(cache);

	if (test_bit(CACHEFILES_DEAD, &cache->flags))
		return -EIO;

	if (datalen < 0 || datalen > PAGE_SIZE - 1)
		return -EOPNOTSUPP;

	/* drag the command string into the kernel so we can parse it */
	data = kmalloc(datalen + 1, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(data, _data, datalen) != 0)
		goto error;

	data[datalen] = '\0';

	ret = -EINVAL;
	if (memchr(data, '\0', datalen))
		goto error;

	/* strip any newline */
	cp = memchr(data, '\n', datalen);
	if (cp) {
		if (cp == data)
			goto error;

		*cp = '\0';
	}

	/* parse the command */
	ret = -EOPNOTSUPP;

	for (args = data; *args; args++)
		if (isspace(*args))
			break;
	if (*args) {
		if (args == data)
			goto error;
		*args = '\0';
		args = skip_spaces(++args);
	}

	/* run the appropriate command handler */
	for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
		if (strcmp(cmd->name, data) == 0)
			goto found_command;

error:
	kfree(data);
	//_leave(" = %zd", ret);
	return ret;

found_command:
	mutex_lock(&cache->daemon_mutex);

	ret = -EIO;
	if (!test_bit(CACHEFILES_DEAD, &cache->flags))
		ret = cmd->handler(cache, args);

	mutex_unlock(&cache->daemon_mutex);

	if (ret == 0)
		ret = datalen;
	goto error;
}

/*
 * poll for culling state
 * - use POLLOUT to indicate culling state
 */
static unsigned int cachefiles_daemon_poll(struct file *file,
					   struct poll_table_struct *poll)
{
	struct cachefiles_cache *cache = file->private_data;
	unsigned int mask;

	poll_wait(file, &cache->daemon_pollwq, poll);
	mask = 0;

	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
		mask |= POLLIN;

	if (test_bit(CACHEFILES_CULLING, &cache->flags))
		mask |= POLLOUT;

	return mask;
}

/*
 * give a range error for cache space constraints
 * - can be tail-called
 */
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
					 char *args)
{
-	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+	pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");

	return -EINVAL;
}

/*
 * set the percentage of files at which to stop culling
 * - command: "frun <N>%"
 */
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
{
	unsigned long frun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	frun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (frun <= cache->fcull_percent || frun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->frun_percent = frun;
	return 0;
}

/*
 * set the percentage of files at which to start culling
 * - command: "fcull <N>%"
 */
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long fcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fcull_percent = fcull;
	return 0;
}

/*
 * set the percentage of files at which to stop allocating
 * - command: "fstop <N>%"
 */
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long fstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	fstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (fstop < 0 || fstop >= cache->fcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->fstop_percent = fstop;
	return 0;
}

/*
 * set the percentage of blocks at which to stop culling
 * - command: "brun <N>%"
 */
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
{
	unsigned long brun;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	brun = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (brun <= cache->bcull_percent || brun >= 100)
		return cachefiles_daemon_range_error(cache, args);

	cache->brun_percent = brun;
	return 0;
}

/*
 * set the percentage of blocks at which to start culling
 * - command: "bcull <N>%"
 */
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
{
	unsigned long bcull;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bcull = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bcull_percent = bcull;
	return 0;
}

/*
 * set the percentage of blocks at which to stop allocating
 * - command: "bstop <N>%"
 */
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
{
	unsigned long bstop;

	_enter(",%s", args);

	if (!*args)
		return -EINVAL;

	bstop = simple_strtoul(args, &args, 10);
	if (args[0] != '%' || args[1] != '\0')
		return -EINVAL;

	if (bstop < 0 || bstop >= cache->bcull_percent)
		return cachefiles_daemon_range_error(cache, args);

	cache->bstop_percent = bstop;
	return 0;
}

/*
 * set the cache directory
 * - command: "dir <name>"
 */
static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
{
	char *dir;

	_enter(",%s", args);

	if (!*args) {
-		pr_err("Empty directory specified");
+		pr_err("Empty directory specified\n");
		return -EINVAL;
	}

	if (cache->rootdirname) {
-		pr_err("Second cache directory specified");
+		pr_err("Second cache directory specified\n");
484 | return -EEXIST; | 484 | return -EEXIST; |
485 | } | 485 | } |
486 | 486 | ||
487 | dir = kstrdup(args, GFP_KERNEL); | 487 | dir = kstrdup(args, GFP_KERNEL); |
488 | if (!dir) | 488 | if (!dir) |
489 | return -ENOMEM; | 489 | return -ENOMEM; |
490 | 490 | ||
491 | cache->rootdirname = dir; | 491 | cache->rootdirname = dir; |
492 | return 0; | 492 | return 0; |
493 | } | 493 | } |
494 | 494 | ||
495 | /* | 495 | /* |
496 | * set the cache security context | 496 | * set the cache security context |
497 | * - command: "secctx <ctx>" | 497 | * - command: "secctx <ctx>" |
498 | */ | 498 | */ |
499 | static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) | 499 | static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) |
500 | { | 500 | { |
501 | char *secctx; | 501 | char *secctx; |
502 | 502 | ||
503 | _enter(",%s", args); | 503 | _enter(",%s", args); |
504 | 504 | ||
505 | if (!*args) { | 505 | if (!*args) { |
506 | pr_err("Empty security context specified"); | 506 | pr_err("Empty security context specified\n"); |
507 | return -EINVAL; | 507 | return -EINVAL; |
508 | } | 508 | } |
509 | 509 | ||
510 | if (cache->secctx) { | 510 | if (cache->secctx) { |
511 | pr_err("Second security context specified"); | 511 | pr_err("Second security context specified\n"); |
512 | return -EINVAL; | 512 | return -EINVAL; |
513 | } | 513 | } |
514 | 514 | ||
515 | secctx = kstrdup(args, GFP_KERNEL); | 515 | secctx = kstrdup(args, GFP_KERNEL); |
516 | if (!secctx) | 516 | if (!secctx) |
517 | return -ENOMEM; | 517 | return -ENOMEM; |
518 | 518 | ||
519 | cache->secctx = secctx; | 519 | cache->secctx = secctx; |
520 | return 0; | 520 | return 0; |
521 | } | 521 | } |
522 | 522 | ||
523 | /* | 523 | /* |
524 | * set the cache tag | 524 | * set the cache tag |
525 | * - command: "tag <name>" | 525 | * - command: "tag <name>" |
526 | */ | 526 | */ |
527 | static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) | 527 | static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) |
528 | { | 528 | { |
529 | char *tag; | 529 | char *tag; |
530 | 530 | ||
531 | _enter(",%s", args); | 531 | _enter(",%s", args); |
532 | 532 | ||
533 | if (!*args) { | 533 | if (!*args) { |
534 | pr_err("Empty tag specified"); | 534 | pr_err("Empty tag specified\n"); |
535 | return -EINVAL; | 535 | return -EINVAL; |
536 | } | 536 | } |
537 | 537 | ||
538 | if (cache->tag) | 538 | if (cache->tag) |
539 | return -EEXIST; | 539 | return -EEXIST; |
540 | 540 | ||
541 | tag = kstrdup(args, GFP_KERNEL); | 541 | tag = kstrdup(args, GFP_KERNEL); |
542 | if (!tag) | 542 | if (!tag) |
543 | return -ENOMEM; | 543 | return -ENOMEM; |
544 | 544 | ||
545 | cache->tag = tag; | 545 | cache->tag = tag; |
546 | return 0; | 546 | return 0; |
547 | } | 547 | } |
548 | 548 | ||
549 | /* | 549 | /* |
550 | * request a node in the cache be culled from the current working directory | 550 | * request a node in the cache be culled from the current working directory |
551 | * - command: "cull <name>" | 551 | * - command: "cull <name>" |
552 | */ | 552 | */ |
553 | static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) | 553 | static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) |
554 | { | 554 | { |
555 | struct path path; | 555 | struct path path; |
556 | const struct cred *saved_cred; | 556 | const struct cred *saved_cred; |
557 | int ret; | 557 | int ret; |
558 | 558 | ||
559 | _enter(",%s", args); | 559 | _enter(",%s", args); |
560 | 560 | ||
561 | if (strchr(args, '/')) | 561 | if (strchr(args, '/')) |
562 | goto inval; | 562 | goto inval; |
563 | 563 | ||
564 | if (!test_bit(CACHEFILES_READY, &cache->flags)) { | 564 | if (!test_bit(CACHEFILES_READY, &cache->flags)) { |
565 | pr_err("cull applied to unready cache"); | 565 | pr_err("cull applied to unready cache\n"); |
566 | return -EIO; | 566 | return -EIO; |
567 | } | 567 | } |
568 | 568 | ||
569 | if (test_bit(CACHEFILES_DEAD, &cache->flags)) { | 569 | if (test_bit(CACHEFILES_DEAD, &cache->flags)) { |
570 | pr_err("cull applied to dead cache"); | 570 | pr_err("cull applied to dead cache\n"); |
571 | return -EIO; | 571 | return -EIO; |
572 | } | 572 | } |
573 | 573 | ||
574 | /* extract the directory dentry from the cwd */ | 574 | /* extract the directory dentry from the cwd */ |
575 | get_fs_pwd(current->fs, &path); | 575 | get_fs_pwd(current->fs, &path); |
576 | 576 | ||
577 | if (!S_ISDIR(path.dentry->d_inode->i_mode)) | 577 | if (!S_ISDIR(path.dentry->d_inode->i_mode)) |
578 | goto notdir; | 578 | goto notdir; |
579 | 579 | ||
580 | cachefiles_begin_secure(cache, &saved_cred); | 580 | cachefiles_begin_secure(cache, &saved_cred); |
581 | ret = cachefiles_cull(cache, path.dentry, args); | 581 | ret = cachefiles_cull(cache, path.dentry, args); |
582 | cachefiles_end_secure(cache, saved_cred); | 582 | cachefiles_end_secure(cache, saved_cred); |
583 | 583 | ||
584 | path_put(&path); | 584 | path_put(&path); |
585 | _leave(" = %d", ret); | 585 | _leave(" = %d", ret); |
586 | return ret; | 586 | return ret; |
587 | 587 | ||
588 | notdir: | 588 | notdir: |
589 | path_put(&path); | 589 | path_put(&path); |
590 | pr_err("cull command requires dirfd to be a directory"); | 590 | pr_err("cull command requires dirfd to be a directory\n"); |
591 | return -ENOTDIR; | 591 | return -ENOTDIR; |
592 | 592 | ||
593 | inval: | 593 | inval: |
594 | pr_err("cull command requires dirfd and filename"); | 594 | pr_err("cull command requires dirfd and filename\n"); |
595 | return -EINVAL; | 595 | return -EINVAL; |
596 | } | 596 | } |
597 | 597 | ||
598 | /* | 598 | /* |
599 | * set debugging mode | 599 | * set debugging mode |
600 | * - command: "debug <mask>" | 600 | * - command: "debug <mask>" |
601 | */ | 601 | */ |
602 | static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) | 602 | static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) |
603 | { | 603 | { |
604 | unsigned long mask; | 604 | unsigned long mask; |
605 | 605 | ||
606 | _enter(",%s", args); | 606 | _enter(",%s", args); |
607 | 607 | ||
608 | mask = simple_strtoul(args, &args, 0); | 608 | mask = simple_strtoul(args, &args, 0); |
609 | if (args[0] != '\0') | 609 | if (args[0] != '\0') |
610 | goto inval; | 610 | goto inval; |
611 | 611 | ||
612 | cachefiles_debug = mask; | 612 | cachefiles_debug = mask; |
613 | _leave(" = 0"); | 613 | _leave(" = 0"); |
614 | return 0; | 614 | return 0; |
615 | 615 | ||
616 | inval: | 616 | inval: |
617 | pr_err("debug command requires mask"); | 617 | pr_err("debug command requires mask\n"); |
618 | return -EINVAL; | 618 | return -EINVAL; |
619 | } | 619 | } |
620 | 620 | ||
621 | /* | 621 | /* |
622 | * find out whether an object in the current working directory is in use or not | 622 | * find out whether an object in the current working directory is in use or not |
623 | * - command: "inuse <name>" | 623 | * - command: "inuse <name>" |
624 | */ | 624 | */ |
625 | static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) | 625 | static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) |
626 | { | 626 | { |
627 | struct path path; | 627 | struct path path; |
628 | const struct cred *saved_cred; | 628 | const struct cred *saved_cred; |
629 | int ret; | 629 | int ret; |
630 | 630 | ||
631 | //_enter(",%s", args); | 631 | //_enter(",%s", args); |
632 | 632 | ||
633 | if (strchr(args, '/')) | 633 | if (strchr(args, '/')) |
634 | goto inval; | 634 | goto inval; |
635 | 635 | ||
636 | if (!test_bit(CACHEFILES_READY, &cache->flags)) { | 636 | if (!test_bit(CACHEFILES_READY, &cache->flags)) { |
637 | pr_err("inuse applied to unready cache"); | 637 | pr_err("inuse applied to unready cache\n"); |
638 | return -EIO; | 638 | return -EIO; |
639 | } | 639 | } |
640 | 640 | ||
641 | if (test_bit(CACHEFILES_DEAD, &cache->flags)) { | 641 | if (test_bit(CACHEFILES_DEAD, &cache->flags)) { |
642 | pr_err("inuse applied to dead cache"); | 642 | pr_err("inuse applied to dead cache\n"); |
643 | return -EIO; | 643 | return -EIO; |
644 | } | 644 | } |
645 | 645 | ||
646 | /* extract the directory dentry from the cwd */ | 646 | /* extract the directory dentry from the cwd */ |
647 | get_fs_pwd(current->fs, &path); | 647 | get_fs_pwd(current->fs, &path); |
648 | 648 | ||
649 | if (!S_ISDIR(path.dentry->d_inode->i_mode)) | 649 | if (!S_ISDIR(path.dentry->d_inode->i_mode)) |
650 | goto notdir; | 650 | goto notdir; |
651 | 651 | ||
652 | cachefiles_begin_secure(cache, &saved_cred); | 652 | cachefiles_begin_secure(cache, &saved_cred); |
653 | ret = cachefiles_check_in_use(cache, path.dentry, args); | 653 | ret = cachefiles_check_in_use(cache, path.dentry, args); |
654 | cachefiles_end_secure(cache, saved_cred); | 654 | cachefiles_end_secure(cache, saved_cred); |
655 | 655 | ||
656 | path_put(&path); | 656 | path_put(&path); |
657 | //_leave(" = %d", ret); | 657 | //_leave(" = %d", ret); |
658 | return ret; | 658 | return ret; |
659 | 659 | ||
660 | notdir: | 660 | notdir: |
661 | path_put(&path); | 661 | path_put(&path); |
662 | pr_err("inuse command requires dirfd to be a directory"); | 662 | pr_err("inuse command requires dirfd to be a directory\n"); |
663 | return -ENOTDIR; | 663 | return -ENOTDIR; |
664 | 664 | ||
665 | inval: | 665 | inval: |
666 | pr_err("inuse command requires dirfd and filename"); | 666 | pr_err("inuse command requires dirfd and filename\n"); |
667 | return -EINVAL; | 667 | return -EINVAL; |
668 | } | 668 | } |
669 | 669 | ||
670 | /* | 670 | /* |
671 | * see if we have space for a number of pages and/or a number of files in the | 671 | * see if we have space for a number of pages and/or a number of files in the |
672 | * cache | 672 | * cache |
673 | */ | 673 | */ |
674 | int cachefiles_has_space(struct cachefiles_cache *cache, | 674 | int cachefiles_has_space(struct cachefiles_cache *cache, |
675 | unsigned fnr, unsigned bnr) | 675 | unsigned fnr, unsigned bnr) |
676 | { | 676 | { |
677 | struct kstatfs stats; | 677 | struct kstatfs stats; |
678 | struct path path = { | 678 | struct path path = { |
679 | .mnt = cache->mnt, | 679 | .mnt = cache->mnt, |
680 | .dentry = cache->mnt->mnt_root, | 680 | .dentry = cache->mnt->mnt_root, |
681 | }; | 681 | }; |
682 | int ret; | 682 | int ret; |
683 | 683 | ||
684 | //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", | 684 | //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", |
685 | // (unsigned long long) cache->frun, | 685 | // (unsigned long long) cache->frun, |
686 | // (unsigned long long) cache->fcull, | 686 | // (unsigned long long) cache->fcull, |
687 | // (unsigned long long) cache->fstop, | 687 | // (unsigned long long) cache->fstop, |
688 | // (unsigned long long) cache->brun, | 688 | // (unsigned long long) cache->brun, |
689 | // (unsigned long long) cache->bcull, | 689 | // (unsigned long long) cache->bcull, |
690 | // (unsigned long long) cache->bstop, | 690 | // (unsigned long long) cache->bstop, |
691 | // fnr, bnr); | 691 | // fnr, bnr); |
692 | 692 | ||
693 | /* find out how many pages of blockdev are available */ | 693 | /* find out how many pages of blockdev are available */ |
694 | memset(&stats, 0, sizeof(stats)); | 694 | memset(&stats, 0, sizeof(stats)); |
695 | 695 | ||
696 | ret = vfs_statfs(&path, &stats); | 696 | ret = vfs_statfs(&path, &stats); |
697 | if (ret < 0) { | 697 | if (ret < 0) { |
698 | if (ret == -EIO) | 698 | if (ret == -EIO) |
699 | cachefiles_io_error(cache, "statfs failed"); | 699 | cachefiles_io_error(cache, "statfs failed"); |
700 | _leave(" = %d", ret); | 700 | _leave(" = %d", ret); |
701 | return ret; | 701 | return ret; |
702 | } | 702 | } |
703 | 703 | ||
704 | stats.f_bavail >>= cache->bshift; | 704 | stats.f_bavail >>= cache->bshift; |
705 | 705 | ||
706 | //_debug("avail %llu,%llu", | 706 | //_debug("avail %llu,%llu", |
707 | // (unsigned long long) stats.f_ffree, | 707 | // (unsigned long long) stats.f_ffree, |
708 | // (unsigned long long) stats.f_bavail); | 708 | // (unsigned long long) stats.f_bavail); |
709 | 709 | ||
710 | /* see if there is sufficient space */ | 710 | /* see if there is sufficient space */ |
711 | if (stats.f_ffree > fnr) | 711 | if (stats.f_ffree > fnr) |
712 | stats.f_ffree -= fnr; | 712 | stats.f_ffree -= fnr; |
713 | else | 713 | else |
714 | stats.f_ffree = 0; | 714 | stats.f_ffree = 0; |
715 | 715 | ||
716 | if (stats.f_bavail > bnr) | 716 | if (stats.f_bavail > bnr) |
717 | stats.f_bavail -= bnr; | 717 | stats.f_bavail -= bnr; |
718 | else | 718 | else |
719 | stats.f_bavail = 0; | 719 | stats.f_bavail = 0; |
720 | 720 | ||
721 | ret = -ENOBUFS; | 721 | ret = -ENOBUFS; |
722 | if (stats.f_ffree < cache->fstop || | 722 | if (stats.f_ffree < cache->fstop || |
723 | stats.f_bavail < cache->bstop) | 723 | stats.f_bavail < cache->bstop) |
724 | goto begin_cull; | 724 | goto begin_cull; |
725 | 725 | ||
726 | ret = 0; | 726 | ret = 0; |
727 | if (stats.f_ffree < cache->fcull || | 727 | if (stats.f_ffree < cache->fcull || |
728 | stats.f_bavail < cache->bcull) | 728 | stats.f_bavail < cache->bcull) |
729 | goto begin_cull; | 729 | goto begin_cull; |
730 | 730 | ||
731 | if (test_bit(CACHEFILES_CULLING, &cache->flags) && | 731 | if (test_bit(CACHEFILES_CULLING, &cache->flags) && |
732 | stats.f_ffree >= cache->frun && | 732 | stats.f_ffree >= cache->frun && |
733 | stats.f_bavail >= cache->brun && | 733 | stats.f_bavail >= cache->brun && |
734 | test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) | 734 | test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) |
735 | ) { | 735 | ) { |
736 | _debug("cease culling"); | 736 | _debug("cease culling"); |
737 | cachefiles_state_changed(cache); | 737 | cachefiles_state_changed(cache); |
738 | } | 738 | } |
739 | 739 | ||
740 | //_leave(" = 0"); | 740 | //_leave(" = 0"); |
741 | return 0; | 741 | return 0; |
742 | 742 | ||
743 | begin_cull: | 743 | begin_cull: |
744 | if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { | 744 | if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { |
745 | _debug("### CULL CACHE ###"); | 745 | _debug("### CULL CACHE ###"); |
746 | cachefiles_state_changed(cache); | 746 | cachefiles_state_changed(cache); |
747 | } | 747 | } |
748 | 748 | ||
749 | _leave(" = %d", ret); | 749 | _leave(" = %d", ret); |
750 | return ret; | 750 | return ret; |
751 | } | 751 | } |
752 | 752 |
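[Editor's note] The handlers above enforce a strict ordering on the culling thresholds: fstop < fcull < frun for file counts and bstop < bcull < brun < 100 for block counts, so cachefiles_has_space() always sees a consistent stop/cull/run ladder when it compares f_ffree and f_bavail against the computed limits. The managing daemon sets these percentages by writing the command strings named in the comments to its open cachefiles device before binding the cache. A minimal userspace sketch of that configuration sequence follows; it is not part of this commit, and the device path and the percentage values are assumptions chosen for illustration only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed device node for the "cachefiles" misc device. */
	static const char *cmds[] = {
		"brun 10%", "bcull 7%", "bstop 3%",
		"fcull 7%", "fstop 3%",
	};
	int fd = open("/dev/cachefiles", O_RDWR);
	size_t i;

	if (fd < 0) {
		perror("open /dev/cachefiles");
		return 1;
	}
	for (i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++) {
		/* One command per write(); the handlers reject values that
		 * would break the stop < cull < run ordering. */
		if (write(fd, cmds[i], strlen(cmds[i])) < 0)
			perror(cmds[i]);
	}
	close(fd);
	return 0;
}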
fs/cachefiles/internal.h
1 | /* General netfs cache on cache files internal defs | 1 | /* General netfs cache on cache files internal defs |
2 | * | 2 | * |
3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public Licence | 7 | * modify it under the terms of the GNU General Public Licence |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #ifdef pr_fmt | 12 | #ifdef pr_fmt |
13 | #undef pr_fmt | 13 | #undef pr_fmt |
14 | #endif | 14 | #endif |
15 | 15 | ||
16 | #define pr_fmt(fmt) "CacheFiles: " fmt | 16 | #define pr_fmt(fmt) "CacheFiles: " fmt |
17 | 17 | ||
18 | 18 | ||
19 | #include <linux/fscache-cache.h> | 19 | #include <linux/fscache-cache.h> |
20 | #include <linux/timer.h> | 20 | #include <linux/timer.h> |
21 | #include <linux/wait.h> | 21 | #include <linux/wait.h> |
22 | #include <linux/workqueue.h> | 22 | #include <linux/workqueue.h> |
23 | #include <linux/security.h> | 23 | #include <linux/security.h> |
24 | 24 | ||
25 | struct cachefiles_cache; | 25 | struct cachefiles_cache; |
26 | struct cachefiles_object; | 26 | struct cachefiles_object; |
27 | 27 | ||
28 | extern unsigned cachefiles_debug; | 28 | extern unsigned cachefiles_debug; |
29 | #define CACHEFILES_DEBUG_KENTER 1 | 29 | #define CACHEFILES_DEBUG_KENTER 1 |
30 | #define CACHEFILES_DEBUG_KLEAVE 2 | 30 | #define CACHEFILES_DEBUG_KLEAVE 2 |
31 | #define CACHEFILES_DEBUG_KDEBUG 4 | 31 | #define CACHEFILES_DEBUG_KDEBUG 4 |
32 | 32 | ||
33 | #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) | 33 | #define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * node records | 36 | * node records |
37 | */ | 37 | */ |
38 | struct cachefiles_object { | 38 | struct cachefiles_object { |
39 | struct fscache_object fscache; /* fscache handle */ | 39 | struct fscache_object fscache; /* fscache handle */ |
40 | struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ | 40 | struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ |
41 | struct dentry *dentry; /* the file/dir representing this object */ | 41 | struct dentry *dentry; /* the file/dir representing this object */ |
42 | struct dentry *backer; /* backing file */ | 42 | struct dentry *backer; /* backing file */ |
43 | loff_t i_size; /* object size */ | 43 | loff_t i_size; /* object size */ |
44 | unsigned long flags; | 44 | unsigned long flags; |
45 | #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ | 45 | #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ |
46 | #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ | 46 | #define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ |
47 | atomic_t usage; /* object usage count */ | 47 | atomic_t usage; /* object usage count */ |
48 | uint8_t type; /* object type */ | 48 | uint8_t type; /* object type */ |
49 | uint8_t new; /* T if object new */ | 49 | uint8_t new; /* T if object new */ |
50 | spinlock_t work_lock; | 50 | spinlock_t work_lock; |
51 | struct rb_node active_node; /* link in active tree (dentry is key) */ | 51 | struct rb_node active_node; /* link in active tree (dentry is key) */ |
52 | }; | 52 | }; |
53 | 53 | ||
54 | extern struct kmem_cache *cachefiles_object_jar; | 54 | extern struct kmem_cache *cachefiles_object_jar; |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * Cache files cache definition | 57 | * Cache files cache definition |
58 | */ | 58 | */ |
59 | struct cachefiles_cache { | 59 | struct cachefiles_cache { |
60 | struct fscache_cache cache; /* FS-Cache record */ | 60 | struct fscache_cache cache; /* FS-Cache record */ |
61 | struct vfsmount *mnt; /* mountpoint holding the cache */ | 61 | struct vfsmount *mnt; /* mountpoint holding the cache */ |
62 | struct dentry *graveyard; /* directory into which dead objects go */ | 62 | struct dentry *graveyard; /* directory into which dead objects go */ |
63 | struct file *cachefilesd; /* manager daemon handle */ | 63 | struct file *cachefilesd; /* manager daemon handle */ |
64 | const struct cred *cache_cred; /* security override for accessing cache */ | 64 | const struct cred *cache_cred; /* security override for accessing cache */ |
65 | struct mutex daemon_mutex; /* command serialisation mutex */ | 65 | struct mutex daemon_mutex; /* command serialisation mutex */ |
66 | wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ | 66 | wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ |
67 | struct rb_root active_nodes; /* active nodes (can't be culled) */ | 67 | struct rb_root active_nodes; /* active nodes (can't be culled) */ |
68 | rwlock_t active_lock; /* lock for active_nodes */ | 68 | rwlock_t active_lock; /* lock for active_nodes */ |
69 | atomic_t gravecounter; /* graveyard uniquifier */ | 69 | atomic_t gravecounter; /* graveyard uniquifier */ |
70 | unsigned frun_percent; /* when to stop culling (% files) */ | 70 | unsigned frun_percent; /* when to stop culling (% files) */ |
71 | unsigned fcull_percent; /* when to start culling (% files) */ | 71 | unsigned fcull_percent; /* when to start culling (% files) */ |
72 | unsigned fstop_percent; /* when to stop allocating (% files) */ | 72 | unsigned fstop_percent; /* when to stop allocating (% files) */ |
73 | unsigned brun_percent; /* when to stop culling (% blocks) */ | 73 | unsigned brun_percent; /* when to stop culling (% blocks) */ |
74 | unsigned bcull_percent; /* when to start culling (% blocks) */ | 74 | unsigned bcull_percent; /* when to start culling (% blocks) */ |
75 | unsigned bstop_percent; /* when to stop allocating (% blocks) */ | 75 | unsigned bstop_percent; /* when to stop allocating (% blocks) */ |
76 | unsigned bsize; /* cache's block size */ | 76 | unsigned bsize; /* cache's block size */ |
77 | unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ | 77 | unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ |
78 | uint64_t frun; /* when to stop culling */ | 78 | uint64_t frun; /* when to stop culling */ |
79 | uint64_t fcull; /* when to start culling */ | 79 | uint64_t fcull; /* when to start culling */ |
80 | uint64_t fstop; /* when to stop allocating */ | 80 | uint64_t fstop; /* when to stop allocating */ |
81 | sector_t brun; /* when to stop culling */ | 81 | sector_t brun; /* when to stop culling */ |
82 | sector_t bcull; /* when to start culling */ | 82 | sector_t bcull; /* when to start culling */ |
83 | sector_t bstop; /* when to stop allocating */ | 83 | sector_t bstop; /* when to stop allocating */ |
84 | unsigned long flags; | 84 | unsigned long flags; |
85 | #define CACHEFILES_READY 0 /* T if cache prepared */ | 85 | #define CACHEFILES_READY 0 /* T if cache prepared */ |
86 | #define CACHEFILES_DEAD 1 /* T if cache dead */ | 86 | #define CACHEFILES_DEAD 1 /* T if cache dead */ |
87 | #define CACHEFILES_CULLING 2 /* T if cull engaged */ | 87 | #define CACHEFILES_CULLING 2 /* T if cull engaged */ |
88 | #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ | 88 | #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ |
89 | char *rootdirname; /* name of cache root directory */ | 89 | char *rootdirname; /* name of cache root directory */ |
90 | char *secctx; /* LSM security context */ | 90 | char *secctx; /* LSM security context */ |
91 | char *tag; /* cache binding tag */ | 91 | char *tag; /* cache binding tag */ |
92 | }; | 92 | }; |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * backing file read tracking | 95 | * backing file read tracking |
96 | */ | 96 | */ |
97 | struct cachefiles_one_read { | 97 | struct cachefiles_one_read { |
98 | wait_queue_t monitor; /* link into monitored waitqueue */ | 98 | wait_queue_t monitor; /* link into monitored waitqueue */ |
99 | struct page *back_page; /* backing file page we're waiting for */ | 99 | struct page *back_page; /* backing file page we're waiting for */ |
100 | struct page *netfs_page; /* netfs page we're going to fill */ | 100 | struct page *netfs_page; /* netfs page we're going to fill */ |
101 | struct fscache_retrieval *op; /* retrieval op covering this */ | 101 | struct fscache_retrieval *op; /* retrieval op covering this */ |
102 | struct list_head op_link; /* link in op's todo list */ | 102 | struct list_head op_link; /* link in op's todo list */ |
103 | }; | 103 | }; |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * backing file write tracking | 106 | * backing file write tracking |
107 | */ | 107 | */ |
108 | struct cachefiles_one_write { | 108 | struct cachefiles_one_write { |
109 | struct page *netfs_page; /* netfs page to copy */ | 109 | struct page *netfs_page; /* netfs page to copy */ |
110 | struct cachefiles_object *object; | 110 | struct cachefiles_object *object; |
111 | struct list_head obj_link; /* link in object's lists */ | 111 | struct list_head obj_link; /* link in object's lists */ |
112 | fscache_rw_complete_t end_io_func; | 112 | fscache_rw_complete_t end_io_func; |
113 | void *context; | 113 | void *context; |
114 | }; | 114 | }; |
115 | 115 | ||
116 | /* | 116 | /* |
117 | * auxiliary data xattr buffer | 117 | * auxiliary data xattr buffer |
118 | */ | 118 | */ |
119 | struct cachefiles_xattr { | 119 | struct cachefiles_xattr { |
120 | uint16_t len; | 120 | uint16_t len; |
121 | uint8_t type; | 121 | uint8_t type; |
122 | uint8_t data[]; | 122 | uint8_t data[]; |
123 | }; | 123 | }; |
124 | 124 | ||
125 | /* | 125 | /* |
126 | * note change of state for daemon | 126 | * note change of state for daemon |
127 | */ | 127 | */ |
128 | static inline void cachefiles_state_changed(struct cachefiles_cache *cache) | 128 | static inline void cachefiles_state_changed(struct cachefiles_cache *cache) |
129 | { | 129 | { |
130 | set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); | 130 | set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); |
131 | wake_up_all(&cache->daemon_pollwq); | 131 | wake_up_all(&cache->daemon_pollwq); |
132 | } | 132 | } |
133 | 133 | ||
134 | /* | 134 | /* |
135 | * bind.c | 135 | * bind.c |
136 | */ | 136 | */ |
137 | extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); | 137 | extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); |
138 | extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); | 138 | extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * daemon.c | 141 | * daemon.c |
142 | */ | 142 | */ |
143 | extern const struct file_operations cachefiles_daemon_fops; | 143 | extern const struct file_operations cachefiles_daemon_fops; |
144 | 144 | ||
145 | extern int cachefiles_has_space(struct cachefiles_cache *cache, | 145 | extern int cachefiles_has_space(struct cachefiles_cache *cache, |
146 | unsigned fnr, unsigned bnr); | 146 | unsigned fnr, unsigned bnr); |
147 | 147 | ||
148 | /* | 148 | /* |
149 | * interface.c | 149 | * interface.c |
150 | */ | 150 | */ |
151 | extern const struct fscache_cache_ops cachefiles_cache_ops; | 151 | extern const struct fscache_cache_ops cachefiles_cache_ops; |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * key.c | 154 | * key.c |
155 | */ | 155 | */ |
156 | extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); | 156 | extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * namei.c | 159 | * namei.c |
160 | */ | 160 | */ |
161 | extern int cachefiles_delete_object(struct cachefiles_cache *cache, | 161 | extern int cachefiles_delete_object(struct cachefiles_cache *cache, |
162 | struct cachefiles_object *object); | 162 | struct cachefiles_object *object); |
163 | extern int cachefiles_walk_to_object(struct cachefiles_object *parent, | 163 | extern int cachefiles_walk_to_object(struct cachefiles_object *parent, |
164 | struct cachefiles_object *object, | 164 | struct cachefiles_object *object, |
165 | const char *key, | 165 | const char *key, |
166 | struct cachefiles_xattr *auxdata); | 166 | struct cachefiles_xattr *auxdata); |
167 | extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, | 167 | extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, |
168 | struct dentry *dir, | 168 | struct dentry *dir, |
169 | const char *name); | 169 | const char *name); |
170 | 170 | ||
171 | extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, | 171 | extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, |
172 | char *filename); | 172 | char *filename); |
173 | 173 | ||
174 | extern int cachefiles_check_in_use(struct cachefiles_cache *cache, | 174 | extern int cachefiles_check_in_use(struct cachefiles_cache *cache, |
175 | struct dentry *dir, char *filename); | 175 | struct dentry *dir, char *filename); |
176 | 176 | ||
177 | /* | 177 | /* |
178 | * proc.c | 178 | * proc.c |
179 | */ | 179 | */ |
180 | #ifdef CONFIG_CACHEFILES_HISTOGRAM | 180 | #ifdef CONFIG_CACHEFILES_HISTOGRAM |
181 | extern atomic_t cachefiles_lookup_histogram[HZ]; | 181 | extern atomic_t cachefiles_lookup_histogram[HZ]; |
182 | extern atomic_t cachefiles_mkdir_histogram[HZ]; | 182 | extern atomic_t cachefiles_mkdir_histogram[HZ]; |
183 | extern atomic_t cachefiles_create_histogram[HZ]; | 183 | extern atomic_t cachefiles_create_histogram[HZ]; |
184 | 184 | ||
185 | extern int __init cachefiles_proc_init(void); | 185 | extern int __init cachefiles_proc_init(void); |
186 | extern void cachefiles_proc_cleanup(void); | 186 | extern void cachefiles_proc_cleanup(void); |
187 | static inline | 187 | static inline |
188 | void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) | 188 | void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) |
189 | { | 189 | { |
190 | unsigned long jif = jiffies - start_jif; | 190 | unsigned long jif = jiffies - start_jif; |
191 | if (jif >= HZ) | 191 | if (jif >= HZ) |
192 | jif = HZ - 1; | 192 | jif = HZ - 1; |
193 | atomic_inc(&histogram[jif]); | 193 | atomic_inc(&histogram[jif]); |
194 | } | 194 | } |
195 | 195 | ||
196 | #else | 196 | #else |
197 | #define cachefiles_proc_init() (0) | 197 | #define cachefiles_proc_init() (0) |
198 | #define cachefiles_proc_cleanup() do {} while (0) | 198 | #define cachefiles_proc_cleanup() do {} while (0) |
199 | #define cachefiles_hist(hist, start_jif) do {} while (0) | 199 | #define cachefiles_hist(hist, start_jif) do {} while (0) |
200 | #endif | 200 | #endif |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * rdwr.c | 203 | * rdwr.c |
204 | */ | 204 | */ |
205 | extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, | 205 | extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, |
206 | struct page *, gfp_t); | 206 | struct page *, gfp_t); |
207 | extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, | 207 | extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, |
208 | struct list_head *, unsigned *, | 208 | struct list_head *, unsigned *, |
209 | gfp_t); | 209 | gfp_t); |
210 | extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, | 210 | extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, |
211 | gfp_t); | 211 | gfp_t); |
212 | extern int cachefiles_allocate_pages(struct fscache_retrieval *, | 212 | extern int cachefiles_allocate_pages(struct fscache_retrieval *, |
213 | struct list_head *, unsigned *, gfp_t); | 213 | struct list_head *, unsigned *, gfp_t); |
214 | extern int cachefiles_write_page(struct fscache_storage *, struct page *); | 214 | extern int cachefiles_write_page(struct fscache_storage *, struct page *); |
215 | extern void cachefiles_uncache_page(struct fscache_object *, struct page *); | 215 | extern void cachefiles_uncache_page(struct fscache_object *, struct page *); |
216 | 216 | ||
217 | /* | 217 | /* |
218 | * security.c | 218 | * security.c |
219 | */ | 219 | */ |
220 | extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); | 220 | extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); |
221 | extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, | 221 | extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, |
222 | struct dentry *root, | 222 | struct dentry *root, |
223 | const struct cred **_saved_cred); | 223 | const struct cred **_saved_cred); |
224 | 224 | ||
225 | static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, | 225 | static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, |
226 | const struct cred **_saved_cred) | 226 | const struct cred **_saved_cred) |
227 | { | 227 | { |
228 | *_saved_cred = override_creds(cache->cache_cred); | 228 | *_saved_cred = override_creds(cache->cache_cred); |
229 | } | 229 | } |
230 | 230 | ||
231 | static inline void cachefiles_end_secure(struct cachefiles_cache *cache, | 231 | static inline void cachefiles_end_secure(struct cachefiles_cache *cache, |
232 | const struct cred *saved_cred) | 232 | const struct cred *saved_cred) |
233 | { | 233 | { |
234 | revert_creds(saved_cred); | 234 | revert_creds(saved_cred); |
235 | } | 235 | } |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * xattr.c | 238 | * xattr.c |
239 | */ | 239 | */ |
240 | extern int cachefiles_check_object_type(struct cachefiles_object *object); | 240 | extern int cachefiles_check_object_type(struct cachefiles_object *object); |
241 | extern int cachefiles_set_object_xattr(struct cachefiles_object *object, | 241 | extern int cachefiles_set_object_xattr(struct cachefiles_object *object, |
242 | struct cachefiles_xattr *auxdata); | 242 | struct cachefiles_xattr *auxdata); |
243 | extern int cachefiles_update_object_xattr(struct cachefiles_object *object, | 243 | extern int cachefiles_update_object_xattr(struct cachefiles_object *object, |
244 | struct cachefiles_xattr *auxdata); | 244 | struct cachefiles_xattr *auxdata); |
245 | extern int cachefiles_check_auxdata(struct cachefiles_object *object); | 245 | extern int cachefiles_check_auxdata(struct cachefiles_object *object); |
246 | extern int cachefiles_check_object_xattr(struct cachefiles_object *object, | 246 | extern int cachefiles_check_object_xattr(struct cachefiles_object *object, |
247 | struct cachefiles_xattr *auxdata); | 247 | struct cachefiles_xattr *auxdata); |
248 | extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, | 248 | extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, |
249 | struct dentry *dentry); | 249 | struct dentry *dentry); |
250 | 250 | ||
251 | 251 | ||
252 | /* | 252 | /* |
253 | * error handling | 253 | * error handling |
254 | */ | 254 | */ |
255 | 255 | ||
256 | #define cachefiles_io_error(___cache, FMT, ...) \ | 256 | #define cachefiles_io_error(___cache, FMT, ...) \ |
257 | do { \ | 257 | do { \ |
258 | pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ | 258 | pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ |
259 | fscache_io_error(&(___cache)->cache); \ | 259 | fscache_io_error(&(___cache)->cache); \ |
260 | set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ | 260 | set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ |
261 | } while (0) | 261 | } while (0) |
262 | 262 | ||
263 | #define cachefiles_io_error_obj(object, FMT, ...) \ | 263 | #define cachefiles_io_error_obj(object, FMT, ...) \ |
264 | do { \ | 264 | do { \ |
265 | struct cachefiles_cache *___cache; \ | 265 | struct cachefiles_cache *___cache; \ |
266 | \ | 266 | \ |
267 | ___cache = container_of((object)->fscache.cache, \ | 267 | ___cache = container_of((object)->fscache.cache, \ |
268 | struct cachefiles_cache, cache); \ | 268 | struct cachefiles_cache, cache); \ |
269 | cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ | 269 | cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ |
270 | } while (0) | 270 | } while (0) |
271 | 271 | ||
272 | 272 | ||
273 | /* | 273 | /* |
274 | * debug tracing | 274 | * debug tracing |
275 | */ | 275 | */ |
276 | #define dbgprintk(FMT, ...) \ | 276 | #define dbgprintk(FMT, ...) \ |
277 | printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) | 277 | printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) |
278 | 278 | ||
279 | #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) | 279 | #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) |
280 | #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) | 280 | #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) |
281 | #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) | 281 | #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) |
282 | 282 | ||
283 | 283 | ||
284 | #if defined(__KDEBUG) | 284 | #if defined(__KDEBUG) |
285 | #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) | 285 | #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) |
286 | #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) | 286 | #define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) |
287 | #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) | 287 | #define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) |
288 | 288 | ||
289 | #elif defined(CONFIG_CACHEFILES_DEBUG) | 289 | #elif defined(CONFIG_CACHEFILES_DEBUG) |
290 | #define _enter(FMT, ...) \ | 290 | #define _enter(FMT, ...) \ |
291 | do { \ | 291 | do { \ |
292 | if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ | 292 | if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ |
293 | kenter(FMT, ##__VA_ARGS__); \ | 293 | kenter(FMT, ##__VA_ARGS__); \ |
294 | } while (0) | 294 | } while (0) |
295 | 295 | ||
296 | #define _leave(FMT, ...) \ | 296 | #define _leave(FMT, ...) \ |
297 | do { \ | 297 | do { \ |
298 | if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ | 298 | if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ |
299 | kleave(FMT, ##__VA_ARGS__); \ | 299 | kleave(FMT, ##__VA_ARGS__); \ |
300 | } while (0) | 300 | } while (0) |
301 | 301 | ||
302 | #define _debug(FMT, ...) \ | 302 | #define _debug(FMT, ...) \ |
303 | do { \ | 303 | do { \ |
304 | if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ | 304 | if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ |
305 | kdebug(FMT, ##__VA_ARGS__); \ | 305 | kdebug(FMT, ##__VA_ARGS__); \ |
306 | } while (0) | 306 | } while (0) |
307 | 307 | ||
308 | #else | 308 | #else |
309 | #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) | 309 | #define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) |
310 | #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) | 310 | #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) |
311 | #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) | 311 | #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) |
312 | #endif | 312 | #endif |
313 | 313 | ||
314 | #if 1 /* defined(__KDEBUGALL) */ | 314 | #if 1 /* defined(__KDEBUGALL) */ |
315 | 315 | ||
316 | #define ASSERT(X) \ | 316 | #define ASSERT(X) \ |
317 | do { \ | 317 | do { \ |
318 | if (unlikely(!(X))) { \ | 318 | if (unlikely(!(X))) { \ |
319 | pr_err("\n"); \ | 319 | pr_err("\n"); \ |
320 | pr_err("Assertion failed\n"); \ | 320 | pr_err("Assertion failed\n"); \ |
321 | BUG(); \ | 321 | BUG(); \ |
322 | } \ | 322 | } \ |
323 | } while (0) | 323 | } while (0) |
324 | 324 | ||
325 | #define ASSERTCMP(X, OP, Y) \ | 325 | #define ASSERTCMP(X, OP, Y) \ |
326 | do { \ | 326 | do { \ |
327 | if (unlikely(!((X) OP (Y)))) { \ | 327 | if (unlikely(!((X) OP (Y)))) { \ |
328 | pr_err("\n"); \ | 328 | pr_err("\n"); \ |
329 | pr_err("Assertion failed\n"); \ | 329 | pr_err("Assertion failed\n"); \ |
330 | pr_err("%lx " #OP " %lx is false\n", \ | 330 | pr_err("%lx " #OP " %lx is false\n", \ |
331 | (unsigned long)(X), (unsigned long)(Y)); \ | 331 | (unsigned long)(X), (unsigned long)(Y)); \ |
332 | BUG(); \ | 332 | BUG(); \ |
333 | } \ | 333 | } \ |
334 | } while (0) | 334 | } while (0) |
335 | 335 | ||
336 | #define ASSERTIF(C, X) \ | 336 | #define ASSERTIF(C, X) \ |
337 | do { \ | 337 | do { \ |
338 | if (unlikely((C) && !(X))) { \ | 338 | if (unlikely((C) && !(X))) { \ |
339 | pr_err("\n"); \ | 339 | pr_err("\n"); \ |
340 | pr_err("Assertion failed\n"); \ | 340 | pr_err("Assertion failed\n"); \ |
341 | BUG(); \ | 341 | BUG(); \ |
342 | } \ | 342 | } \ |
343 | } while (0) | 343 | } while (0) |
344 | 344 | ||
345 | #define ASSERTIFCMP(C, X, OP, Y) \ | 345 | #define ASSERTIFCMP(C, X, OP, Y) \ |
346 | do { \ | 346 | do { \ |
347 | if (unlikely((C) && !((X) OP (Y)))) { \ | 347 | if (unlikely((C) && !((X) OP (Y)))) { \ |
348 | pr_err("\n"); \ | 348 | pr_err("\n"); \ |
349 | pr_err("Assertion failed\n"); \ | 349 | pr_err("Assertion failed\n"); \ |
350 | pr_err("%lx " #OP " %lx is false\n", \ | 350 | pr_err("%lx " #OP " %lx is false\n", \ |
351 | (unsigned long)(X), (unsigned long)(Y)); \ | 351 | (unsigned long)(X), (unsigned long)(Y)); \ |
352 | BUG(); \ | 352 | BUG(); \ |
353 | } \ | 353 | } \ |
354 | } while (0) | 354 | } while (0) |
355 | 355 | ||
356 | #else | 356 | #else |
357 | 357 | ||
358 | #define ASSERT(X) do {} while (0) | 358 | #define ASSERT(X) do {} while (0) |
359 | #define ASSERTCMP(X, OP, Y) do {} while (0) | 359 | #define ASSERTCMP(X, OP, Y) do {} while (0) |
360 | #define ASSERTIF(C, X) do {} while (0) | 360 | #define ASSERTIF(C, X) do {} while (0) |
361 | #define ASSERTIFCMP(C, X, OP, Y) do {} while (0) | 361 | #define ASSERTIFCMP(C, X, OP, Y) do {} while (0) |
362 | 362 | ||
363 | #endif | 363 | #endif |
364 | 364 |
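[Editor's note] The pr_fmt() definition at the top of this header prefixes every pr_err()/pr_notice() in the module with "CacheFiles: ", but the pr_* helpers do not append a newline themselves; that is why the hunks in daemon.c and main.c above add a trailing "\n" to each message, so one report cannot run into the next in the kernel log. A minimal userspace analogue of that behaviour (an illustration only, not kernel code):

#include <stdio.h>

/* GNU-style ##__VA_ARGS__, as in the kernel's own printk helpers. */
#define pr_fmt(fmt) "CacheFiles: " fmt
#define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("Empty tag specified");                 /* no "\n": next record runs on */
	pr_err("Second cache directory specified\n");  /* terminated cleanly */
	return 0;
}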
fs/cachefiles/main.c
1 | /* Network filesystem caching backend to use cache files on a premounted | 1 | /* Network filesystem caching backend to use cache files on a premounted |
2 | * filesystem | 2 | * filesystem |
3 | * | 3 | * |
4 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. | 4 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. |
5 | * Written by David Howells (dhowells@redhat.com) | 5 | * Written by David Howells (dhowells@redhat.com) |
6 | * | 6 | * |
7 | * This program is free software; you can redistribute it and/or | 7 | * This program is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU General Public Licence | 8 | * modify it under the terms of the GNU General Public Licence |
9 | * as published by the Free Software Foundation; either version | 9 | * as published by the Free Software Foundation; either version |
10 | * 2 of the Licence, or (at your option) any later version. | 10 | * 2 of the Licence, or (at your option) any later version. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/completion.h> | 16 | #include <linux/completion.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/file.h> | 19 | #include <linux/file.h> |
20 | #include <linux/namei.h> | 20 | #include <linux/namei.h> |
21 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
22 | #include <linux/statfs.h> | 22 | #include <linux/statfs.h> |
23 | #include <linux/sysctl.h> | 23 | #include <linux/sysctl.h> |
24 | #include <linux/miscdevice.h> | 24 | #include <linux/miscdevice.h> |
25 | #include "internal.h" | 25 | #include "internal.h" |
26 | 26 | ||
27 | unsigned cachefiles_debug; | 27 | unsigned cachefiles_debug; |
28 | module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); | 28 | module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); |
29 | MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); | 29 | MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); |
30 | 30 | ||
31 | MODULE_DESCRIPTION("Mounted-filesystem based cache"); | 31 | MODULE_DESCRIPTION("Mounted-filesystem based cache"); |
32 | MODULE_AUTHOR("Red Hat, Inc."); | 32 | MODULE_AUTHOR("Red Hat, Inc."); |
33 | MODULE_LICENSE("GPL"); | 33 | MODULE_LICENSE("GPL"); |
34 | 34 | ||
35 | struct kmem_cache *cachefiles_object_jar; | 35 | struct kmem_cache *cachefiles_object_jar; |
36 | 36 | ||
37 | static struct miscdevice cachefiles_dev = { | 37 | static struct miscdevice cachefiles_dev = { |
38 | .minor = MISC_DYNAMIC_MINOR, | 38 | .minor = MISC_DYNAMIC_MINOR, |
39 | .name = "cachefiles", | 39 | .name = "cachefiles", |
40 | .fops = &cachefiles_daemon_fops, | 40 | .fops = &cachefiles_daemon_fops, |
41 | }; | 41 | }; |
42 | 42 | ||
43 | static void cachefiles_object_init_once(void *_object) | 43 | static void cachefiles_object_init_once(void *_object) |
44 | { | 44 | { |
45 | struct cachefiles_object *object = _object; | 45 | struct cachefiles_object *object = _object; |
46 | 46 | ||
47 | memset(object, 0, sizeof(*object)); | 47 | memset(object, 0, sizeof(*object)); |
48 | spin_lock_init(&object->work_lock); | 48 | spin_lock_init(&object->work_lock); |
49 | } | 49 | } |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * initialise the fs caching module | 52 | * initialise the fs caching module |
53 | */ | 53 | */ |
54 | static int __init cachefiles_init(void) | 54 | static int __init cachefiles_init(void) |
55 | { | 55 | { |
56 | int ret; | 56 | int ret; |
57 | 57 | ||
58 | ret = misc_register(&cachefiles_dev); | 58 | ret = misc_register(&cachefiles_dev); |
59 | if (ret < 0) | 59 | if (ret < 0) |
60 | goto error_dev; | 60 | goto error_dev; |
61 | 61 | ||
62 | /* create an object jar */ | 62 | /* create an object jar */ |
63 | ret = -ENOMEM; | 63 | ret = -ENOMEM; |
64 | cachefiles_object_jar = | 64 | cachefiles_object_jar = |
65 | kmem_cache_create("cachefiles_object_jar", | 65 | kmem_cache_create("cachefiles_object_jar", |
66 | sizeof(struct cachefiles_object), | 66 | sizeof(struct cachefiles_object), |
67 | 0, | 67 | 0, |
68 | SLAB_HWCACHE_ALIGN, | 68 | SLAB_HWCACHE_ALIGN, |
69 | cachefiles_object_init_once); | 69 | cachefiles_object_init_once); |
70 | if (!cachefiles_object_jar) { | 70 | if (!cachefiles_object_jar) { |
71 | pr_notice("Failed to allocate an object jar\n"); | 71 | pr_notice("Failed to allocate an object jar\n"); |
72 | goto error_object_jar; | 72 | goto error_object_jar; |
73 | } | 73 | } |
74 | 74 | ||
75 | ret = cachefiles_proc_init(); | 75 | ret = cachefiles_proc_init(); |
76 | if (ret < 0) | 76 | if (ret < 0) |
77 | goto error_proc; | 77 | goto error_proc; |
78 | 78 | ||
79 | pr_info("Loaded\n"); | 79 | pr_info("Loaded\n"); |
80 | return 0; | 80 | return 0; |
81 | 81 | ||
82 | error_proc: | 82 | error_proc: |
83 | kmem_cache_destroy(cachefiles_object_jar); | 83 | kmem_cache_destroy(cachefiles_object_jar); |
84 | error_object_jar: | 84 | error_object_jar: |
85 | misc_deregister(&cachefiles_dev); | 85 | misc_deregister(&cachefiles_dev); |
86 | error_dev: | 86 | error_dev: |
87 | pr_err("failed to register: %d", ret); | 87 | pr_err("failed to register: %d\n", ret); |
88 | return ret; | 88 | return ret; |
89 | } | 89 | } |
90 | 90 | ||
91 | fs_initcall(cachefiles_init); | 91 | fs_initcall(cachefiles_init); |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * clean up on module removal | 94 | * clean up on module removal |
95 | */ | 95 | */ |
96 | static void __exit cachefiles_exit(void) | 96 | static void __exit cachefiles_exit(void) |
97 | { | 97 | { |
98 | pr_info("Unloading\n"); | 98 | pr_info("Unloading\n"); |
99 | 99 | ||
100 | cachefiles_proc_cleanup(); | 100 | cachefiles_proc_cleanup(); |
101 | kmem_cache_destroy(cachefiles_object_jar); | 101 | kmem_cache_destroy(cachefiles_object_jar); |
102 | misc_deregister(&cachefiles_dev); | 102 | misc_deregister(&cachefiles_dev); |
103 | } | 103 | } |
104 | 104 | ||
105 | module_exit(cachefiles_exit); | 105 | module_exit(cachefiles_exit); |
106 | 106 |
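[Editor's note] cachefiles_debug is the bitmask consumed by the _enter()/_leave()/_debug() macros in internal.h (KENTER = 1, KLEAVE = 2, KDEBUG = 4 when CONFIG_CACHEFILES_DEBUG is enabled). module_param_named() exports it as the writable "debug" parameter, and the daemon's "debug <mask>" command updates it at runtime. A small sketch of setting all three bits through the sysfs parameter file; the path follows the usual /sys/module/<name>/parameters/<param> convention and is an assumption, not something this commit touches.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed sysfs path for module_param_named(debug, ...). */
	const char *path = "/sys/module/cachefiles/parameters/debug";
	const char *mask = "7";	/* KENTER | KLEAVE | KDEBUG */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, mask, strlen(mask)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return 1;
	}
	close(fd);
	return 0;
}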
fs/cachefiles/namei.c
1 | /* CacheFiles path walking and related routines | 1 | /* CacheFiles path walking and related routines |
2 | * | 2 | * |
3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public Licence | 7 | * modify it under the terms of the GNU General Public Licence |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/fsnotify.h> | 16 | #include <linux/fsnotify.h> |
17 | #include <linux/quotaops.h> | 17 | #include <linux/quotaops.h> |
18 | #include <linux/xattr.h> | 18 | #include <linux/xattr.h> |
19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
20 | #include <linux/namei.h> | 20 | #include <linux/namei.h> |
21 | #include <linux/security.h> | 21 | #include <linux/security.h> |
22 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
23 | #include "internal.h" | 23 | #include "internal.h" |
24 | 24 | ||
25 | #define CACHEFILES_KEYBUF_SIZE 512 | 25 | #define CACHEFILES_KEYBUF_SIZE 512 |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * dump debugging info about an object | 28 | * dump debugging info about an object |
29 | */ | 29 | */ |
30 | static noinline | 30 | static noinline |
31 | void __cachefiles_printk_object(struct cachefiles_object *object, | 31 | void __cachefiles_printk_object(struct cachefiles_object *object, |
32 | const char *prefix, | 32 | const char *prefix, |
33 | u8 *keybuf) | 33 | u8 *keybuf) |
34 | { | 34 | { |
35 | struct fscache_cookie *cookie; | 35 | struct fscache_cookie *cookie; |
36 | unsigned keylen, loop; | 36 | unsigned keylen, loop; |
37 | 37 | ||
38 | pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); | 38 | pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); |
39 | pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", | 39 | pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", |
40 | prefix, object->fscache.state->name, | 40 | prefix, object->fscache.state->name, |
41 | object->fscache.flags, work_busy(&object->fscache.work), | 41 | object->fscache.flags, work_busy(&object->fscache.work), |
42 | object->fscache.events, object->fscache.event_mask); | 42 | object->fscache.events, object->fscache.event_mask); |
43 | pr_err("%sops=%u inp=%u exc=%u\n", | 43 | pr_err("%sops=%u inp=%u exc=%u\n", |
44 | prefix, object->fscache.n_ops, object->fscache.n_in_progress, | 44 | prefix, object->fscache.n_ops, object->fscache.n_in_progress, |
45 | object->fscache.n_exclusive); | 45 | object->fscache.n_exclusive); |
46 | pr_err("%sparent=%p\n", | 46 | pr_err("%sparent=%p\n", |
47 | prefix, object->fscache.parent); | 47 | prefix, object->fscache.parent); |
48 | 48 | ||
49 | spin_lock(&object->fscache.lock); | 49 | spin_lock(&object->fscache.lock); |
50 | cookie = object->fscache.cookie; | 50 | cookie = object->fscache.cookie; |
51 | if (cookie) { | 51 | if (cookie) { |
52 | pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", | 52 | pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", |
53 | prefix, | 53 | prefix, |
54 | object->fscache.cookie, | 54 | object->fscache.cookie, |
55 | object->fscache.cookie->parent, | 55 | object->fscache.cookie->parent, |
56 | object->fscache.cookie->netfs_data, | 56 | object->fscache.cookie->netfs_data, |
57 | object->fscache.cookie->flags); | 57 | object->fscache.cookie->flags); |
58 | if (keybuf && cookie->def) | 58 | if (keybuf && cookie->def) |
59 | keylen = cookie->def->get_key(cookie->netfs_data, keybuf, | 59 | keylen = cookie->def->get_key(cookie->netfs_data, keybuf, |
60 | CACHEFILES_KEYBUF_SIZE); | 60 | CACHEFILES_KEYBUF_SIZE); |
61 | else | 61 | else |
62 | keylen = 0; | 62 | keylen = 0; |
63 | } else { | 63 | } else { |
64 | pr_err("%scookie=NULL\n", prefix); | 64 | pr_err("%scookie=NULL\n", prefix); |
65 | keylen = 0; | 65 | keylen = 0; |
66 | } | 66 | } |
67 | spin_unlock(&object->fscache.lock); | 67 | spin_unlock(&object->fscache.lock); |
68 | 68 | ||
69 | if (keylen) { | 69 | if (keylen) { |
70 | pr_err("%skey=[%u] '", prefix, keylen); | 70 | pr_err("%skey=[%u] '", prefix, keylen); |
71 | for (loop = 0; loop < keylen; loop++) | 71 | for (loop = 0; loop < keylen; loop++) |
72 | pr_cont("%02x", keybuf[loop]); | 72 | pr_cont("%02x", keybuf[loop]); |
73 | pr_cont("'\n"); | 73 | pr_cont("'\n"); |
74 | } | 74 | } |
75 | } | 75 | } |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * dump debugging info about a pair of objects | 78 | * dump debugging info about a pair of objects |
79 | */ | 79 | */ |
80 | static noinline void cachefiles_printk_object(struct cachefiles_object *object, | 80 | static noinline void cachefiles_printk_object(struct cachefiles_object *object, |
81 | struct cachefiles_object *xobject) | 81 | struct cachefiles_object *xobject) |
82 | { | 82 | { |
83 | u8 *keybuf; | 83 | u8 *keybuf; |
84 | 84 | ||
85 | keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); | 85 | keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); |
86 | if (object) | 86 | if (object) |
87 | __cachefiles_printk_object(object, "", keybuf); | 87 | __cachefiles_printk_object(object, "", keybuf); |
88 | if (xobject) | 88 | if (xobject) |
89 | __cachefiles_printk_object(xobject, "x", keybuf); | 89 | __cachefiles_printk_object(xobject, "x", keybuf); |
90 | kfree(keybuf); | 90 | kfree(keybuf); |
91 | } | 91 | } |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * mark the owner of a dentry, if there is one, to indicate that that dentry | 94 | * mark the owner of a dentry, if there is one, to indicate that that dentry |
95 | * has been preemptively deleted | 95 | * has been preemptively deleted |
96 | * - the caller must hold the i_mutex on the dentry's parent as required to | 96 | * - the caller must hold the i_mutex on the dentry's parent as required to |
97 | * call vfs_unlink(), vfs_rmdir() or vfs_rename() | 97 | * call vfs_unlink(), vfs_rmdir() or vfs_rename() |
98 | */ | 98 | */ |
99 | static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, | 99 | static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, |
100 | struct dentry *dentry) | 100 | struct dentry *dentry) |
101 | { | 101 | { |
102 | struct cachefiles_object *object; | 102 | struct cachefiles_object *object; |
103 | struct rb_node *p; | 103 | struct rb_node *p; |
104 | 104 | ||
105 | _enter(",'%*.*s'", | 105 | _enter(",'%*.*s'", |
106 | dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); | 106 | dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); |
107 | 107 | ||
108 | write_lock(&cache->active_lock); | 108 | write_lock(&cache->active_lock); |
109 | 109 | ||
110 | p = cache->active_nodes.rb_node; | 110 | p = cache->active_nodes.rb_node; |
111 | while (p) { | 111 | while (p) { |
112 | object = rb_entry(p, struct cachefiles_object, active_node); | 112 | object = rb_entry(p, struct cachefiles_object, active_node); |
113 | if (object->dentry > dentry) | 113 | if (object->dentry > dentry) |
114 | p = p->rb_left; | 114 | p = p->rb_left; |
115 | else if (object->dentry < dentry) | 115 | else if (object->dentry < dentry) |
116 | p = p->rb_right; | 116 | p = p->rb_right; |
117 | else | 117 | else |
118 | goto found_dentry; | 118 | goto found_dentry; |
119 | } | 119 | } |
120 | 120 | ||
121 | write_unlock(&cache->active_lock); | 121 | write_unlock(&cache->active_lock); |
122 | _leave(" [no owner]"); | 122 | _leave(" [no owner]"); |
123 | return; | 123 | return; |
124 | 124 | ||
125 | /* found the dentry for */ | 125 | /* found the dentry for */ |
126 | found_dentry: | 126 | found_dentry: |
127 | kdebug("preemptive burial: OBJ%x [%s] %p", | 127 | kdebug("preemptive burial: OBJ%x [%s] %p", |
128 | object->fscache.debug_id, | 128 | object->fscache.debug_id, |
129 | object->fscache.state->name, | 129 | object->fscache.state->name, |
130 | dentry); | 130 | dentry); |
131 | 131 | ||
132 | if (fscache_object_is_live(&object->fscache)) { | 132 | if (fscache_object_is_live(&object->fscache)) { |
133 | pr_err("\n"); | 133 | pr_err("\n"); |
134 | pr_err("Error: Can't preemptively bury live object\n"); | 134 | pr_err("Error: Can't preemptively bury live object\n"); |
135 | cachefiles_printk_object(object, NULL); | 135 | cachefiles_printk_object(object, NULL); |
136 | } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { | 136 | } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { |
137 | pr_err("Error: Object already preemptively buried\n"); | 137 | pr_err("Error: Object already preemptively buried\n"); |
138 | } | 138 | } |
139 | 139 | ||
140 | write_unlock(&cache->active_lock); | 140 | write_unlock(&cache->active_lock); |
141 | _leave(" [owner marked]"); | 141 | _leave(" [owner marked]"); |
142 | } | 142 | } |
143 | 143 | ||
144 | /* | 144 | /* |
145 | * record the fact that an object is now active | 145 | * record the fact that an object is now active |
146 | */ | 146 | */ |
147 | static int cachefiles_mark_object_active(struct cachefiles_cache *cache, | 147 | static int cachefiles_mark_object_active(struct cachefiles_cache *cache, |
148 | struct cachefiles_object *object) | 148 | struct cachefiles_object *object) |
149 | { | 149 | { |
150 | struct cachefiles_object *xobject; | 150 | struct cachefiles_object *xobject; |
151 | struct rb_node **_p, *_parent = NULL; | 151 | struct rb_node **_p, *_parent = NULL; |
152 | struct dentry *dentry; | 152 | struct dentry *dentry; |
153 | 153 | ||
154 | _enter(",%p", object); | 154 | _enter(",%p", object); |
155 | 155 | ||
156 | try_again: | 156 | try_again: |
157 | write_lock(&cache->active_lock); | 157 | write_lock(&cache->active_lock); |
158 | 158 | ||
159 | if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { | 159 | if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { |
160 | pr_err("Error: Object already active\n"); | 160 | pr_err("Error: Object already active\n"); |
161 | cachefiles_printk_object(object, NULL); | 161 | cachefiles_printk_object(object, NULL); |
162 | BUG(); | 162 | BUG(); |
163 | } | 163 | } |
164 | 164 | ||
165 | dentry = object->dentry; | 165 | dentry = object->dentry; |
166 | _p = &cache->active_nodes.rb_node; | 166 | _p = &cache->active_nodes.rb_node; |
167 | while (*_p) { | 167 | while (*_p) { |
168 | _parent = *_p; | 168 | _parent = *_p; |
169 | xobject = rb_entry(_parent, | 169 | xobject = rb_entry(_parent, |
170 | struct cachefiles_object, active_node); | 170 | struct cachefiles_object, active_node); |
171 | 171 | ||
172 | ASSERT(xobject != object); | 172 | ASSERT(xobject != object); |
173 | 173 | ||
174 | if (xobject->dentry > dentry) | 174 | if (xobject->dentry > dentry) |
175 | _p = &(*_p)->rb_left; | 175 | _p = &(*_p)->rb_left; |
176 | else if (xobject->dentry < dentry) | 176 | else if (xobject->dentry < dentry) |
177 | _p = &(*_p)->rb_right; | 177 | _p = &(*_p)->rb_right; |
178 | else | 178 | else |
179 | goto wait_for_old_object; | 179 | goto wait_for_old_object; |
180 | } | 180 | } |
181 | 181 | ||
182 | rb_link_node(&object->active_node, _parent, _p); | 182 | rb_link_node(&object->active_node, _parent, _p); |
183 | rb_insert_color(&object->active_node, &cache->active_nodes); | 183 | rb_insert_color(&object->active_node, &cache->active_nodes); |
184 | 184 | ||
185 | write_unlock(&cache->active_lock); | 185 | write_unlock(&cache->active_lock); |
186 | _leave(" = 0"); | 186 | _leave(" = 0"); |
187 | return 0; | 187 | return 0; |
188 | 188 | ||
189 | /* an old object from a previous incarnation is hogging the slot - we | 189 | /* an old object from a previous incarnation is hogging the slot - we |
190 | * need to wait for it to be destroyed */ | 190 | * need to wait for it to be destroyed */ |
191 | wait_for_old_object: | 191 | wait_for_old_object: |
192 | if (fscache_object_is_live(&object->fscache)) { | 192 | if (fscache_object_is_live(&object->fscache)) { |
193 | pr_err("\n"); | 193 | pr_err("\n"); |
194 | pr_err("Error: Unexpected object collision\n"); | 194 | pr_err("Error: Unexpected object collision\n"); |
195 | cachefiles_printk_object(object, xobject); | 195 | cachefiles_printk_object(object, xobject); |
196 | BUG(); | 196 | BUG(); |
197 | } | 197 | } |
198 | atomic_inc(&xobject->usage); | 198 | atomic_inc(&xobject->usage); |
199 | write_unlock(&cache->active_lock); | 199 | write_unlock(&cache->active_lock); |
200 | 200 | ||
201 | if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { | 201 | if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { |
202 | wait_queue_head_t *wq; | 202 | wait_queue_head_t *wq; |
203 | 203 | ||
204 | signed long timeout = 60 * HZ; | 204 | signed long timeout = 60 * HZ; |
205 | wait_queue_t wait; | 205 | wait_queue_t wait; |
206 | bool requeue; | 206 | bool requeue; |
207 | 207 | ||
208 | /* if the object we're waiting for is queued for processing, | 208 | /* if the object we're waiting for is queued for processing, |
209 | * then just put ourselves on the queue behind it */ | 209 | * then just put ourselves on the queue behind it */ |
210 | if (work_pending(&xobject->fscache.work)) { | 210 | if (work_pending(&xobject->fscache.work)) { |
211 | _debug("queue OBJ%x behind OBJ%x immediately", | 211 | _debug("queue OBJ%x behind OBJ%x immediately", |
212 | object->fscache.debug_id, | 212 | object->fscache.debug_id, |
213 | xobject->fscache.debug_id); | 213 | xobject->fscache.debug_id); |
214 | goto requeue; | 214 | goto requeue; |
215 | } | 215 | } |
216 | 216 | ||
217 | /* otherwise we sleep until either the object we're waiting for | 217 | /* otherwise we sleep until either the object we're waiting for |
218 | * is done, or the fscache_object is congested */ | 218 | * is done, or the fscache_object is congested */ |
219 | wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); | 219 | wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); |
220 | init_wait(&wait); | 220 | init_wait(&wait); |
221 | requeue = false; | 221 | requeue = false; |
222 | do { | 222 | do { |
223 | prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); | 223 | prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); |
224 | if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) | 224 | if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) |
225 | break; | 225 | break; |
226 | 226 | ||
227 | requeue = fscache_object_sleep_till_congested(&timeout); | 227 | requeue = fscache_object_sleep_till_congested(&timeout); |
228 | } while (timeout > 0 && !requeue); | 228 | } while (timeout > 0 && !requeue); |
229 | finish_wait(wq, &wait); | 229 | finish_wait(wq, &wait); |
230 | 230 | ||
231 | if (requeue && | 231 | if (requeue && |
232 | test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { | 232 | test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { |
233 | _debug("queue OBJ%x behind OBJ%x after wait", | 233 | _debug("queue OBJ%x behind OBJ%x after wait", |
234 | object->fscache.debug_id, | 234 | object->fscache.debug_id, |
235 | xobject->fscache.debug_id); | 235 | xobject->fscache.debug_id); |
236 | goto requeue; | 236 | goto requeue; |
237 | } | 237 | } |
238 | 238 | ||
239 | if (timeout <= 0) { | 239 | if (timeout <= 0) { |
240 | pr_err("\n"); | 240 | pr_err("\n"); |
241 | pr_err("Error: Overlong wait for old active object to go away\n"); | 241 | pr_err("Error: Overlong wait for old active object to go away\n"); |
242 | cachefiles_printk_object(object, xobject); | 242 | cachefiles_printk_object(object, xobject); |
243 | goto requeue; | 243 | goto requeue; |
244 | } | 244 | } |
245 | } | 245 | } |
246 | 246 | ||
247 | ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); | 247 | ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); |
248 | 248 | ||
249 | cache->cache.ops->put_object(&xobject->fscache); | 249 | cache->cache.ops->put_object(&xobject->fscache); |
250 | goto try_again; | 250 | goto try_again; |
251 | 251 | ||
252 | requeue: | 252 | requeue: |
253 | clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); | 253 | clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); |
254 | cache->cache.ops->put_object(&xobject->fscache); | 254 | cache->cache.ops->put_object(&xobject->fscache); |
255 | _leave(" = -ETIMEDOUT"); | 255 | _leave(" = -ETIMEDOUT"); |
256 | return -ETIMEDOUT; | 256 | return -ETIMEDOUT; |
257 | } | 257 | } |
258 | 258 | ||
259 | /* | 259 | /* |
260 | * delete an object representation from the cache | 260 | * delete an object representation from the cache |
261 | * - file backed objects are unlinked | 261 | * - file backed objects are unlinked |
262 | * - directory backed objects are stuffed into the graveyard for userspace to | 262 | * - directory backed objects are stuffed into the graveyard for userspace to |
263 | * delete | 263 | * delete |
264 | * - unlocks the directory mutex | 264 | * - unlocks the directory mutex |
265 | */ | 265 | */ |
266 | static int cachefiles_bury_object(struct cachefiles_cache *cache, | 266 | static int cachefiles_bury_object(struct cachefiles_cache *cache, |
267 | struct dentry *dir, | 267 | struct dentry *dir, |
268 | struct dentry *rep, | 268 | struct dentry *rep, |
269 | bool preemptive) | 269 | bool preemptive) |
270 | { | 270 | { |
271 | struct dentry *grave, *trap; | 271 | struct dentry *grave, *trap; |
272 | struct path path, path_to_graveyard; | 272 | struct path path, path_to_graveyard; |
273 | char nbuffer[8 + 8 + 1]; | 273 | char nbuffer[8 + 8 + 1]; |
274 | int ret; | 274 | int ret; |
275 | 275 | ||
276 | _enter(",'%*.*s','%*.*s'", | 276 | _enter(",'%*.*s','%*.*s'", |
277 | dir->d_name.len, dir->d_name.len, dir->d_name.name, | 277 | dir->d_name.len, dir->d_name.len, dir->d_name.name, |
278 | rep->d_name.len, rep->d_name.len, rep->d_name.name); | 278 | rep->d_name.len, rep->d_name.len, rep->d_name.name); |
279 | 279 | ||
280 | _debug("remove %p from %p", rep, dir); | 280 | _debug("remove %p from %p", rep, dir); |
281 | 281 | ||
282 | /* non-directories can just be unlinked */ | 282 | /* non-directories can just be unlinked */ |
283 | if (!S_ISDIR(rep->d_inode->i_mode)) { | 283 | if (!S_ISDIR(rep->d_inode->i_mode)) { |
284 | _debug("unlink stale object"); | 284 | _debug("unlink stale object"); |
285 | 285 | ||
286 | path.mnt = cache->mnt; | 286 | path.mnt = cache->mnt; |
287 | path.dentry = dir; | 287 | path.dentry = dir; |
288 | ret = security_path_unlink(&path, rep); | 288 | ret = security_path_unlink(&path, rep); |
289 | if (ret < 0) { | 289 | if (ret < 0) { |
290 | cachefiles_io_error(cache, "Unlink security error"); | 290 | cachefiles_io_error(cache, "Unlink security error"); |
291 | } else { | 291 | } else { |
292 | ret = vfs_unlink(dir->d_inode, rep, NULL); | 292 | ret = vfs_unlink(dir->d_inode, rep, NULL); |
293 | 293 | ||
294 | if (preemptive) | 294 | if (preemptive) |
295 | cachefiles_mark_object_buried(cache, rep); | 295 | cachefiles_mark_object_buried(cache, rep); |
296 | } | 296 | } |
297 | 297 | ||
298 | mutex_unlock(&dir->d_inode->i_mutex); | 298 | mutex_unlock(&dir->d_inode->i_mutex); |
299 | 299 | ||
300 | if (ret == -EIO) | 300 | if (ret == -EIO) |
301 | cachefiles_io_error(cache, "Unlink failed"); | 301 | cachefiles_io_error(cache, "Unlink failed"); |
302 | 302 | ||
303 | _leave(" = %d", ret); | 303 | _leave(" = %d", ret); |
304 | return ret; | 304 | return ret; |
305 | } | 305 | } |
306 | 306 | ||
307 | /* directories have to be moved to the graveyard */ | 307 | /* directories have to be moved to the graveyard */ |
308 | _debug("move stale object to graveyard"); | 308 | _debug("move stale object to graveyard"); |
309 | mutex_unlock(&dir->d_inode->i_mutex); | 309 | mutex_unlock(&dir->d_inode->i_mutex); |
310 | 310 | ||
311 | try_again: | 311 | try_again: |
312 | /* first step is to make up a grave dentry in the graveyard */ | 312 | /* first step is to make up a grave dentry in the graveyard */ |
313 | sprintf(nbuffer, "%08x%08x", | 313 | sprintf(nbuffer, "%08x%08x", |
314 | (uint32_t) get_seconds(), | 314 | (uint32_t) get_seconds(), |
315 | (uint32_t) atomic_inc_return(&cache->gravecounter)); | 315 | (uint32_t) atomic_inc_return(&cache->gravecounter)); |
316 | 316 | ||
317 | /* do the multiway lock magic */ | 317 | /* do the multiway lock magic */ |
318 | trap = lock_rename(cache->graveyard, dir); | 318 | trap = lock_rename(cache->graveyard, dir); |
319 | 319 | ||
320 | /* do some checks before getting the grave dentry */ | 320 | /* do some checks before getting the grave dentry */ |
321 | if (rep->d_parent != dir) { | 321 | if (rep->d_parent != dir) { |
322 | /* the entry was probably culled when we dropped the parent dir | 322 | /* the entry was probably culled when we dropped the parent dir |
323 | * lock */ | 323 | * lock */ |
324 | unlock_rename(cache->graveyard, dir); | 324 | unlock_rename(cache->graveyard, dir); |
325 | _leave(" = 0 [culled?]"); | 325 | _leave(" = 0 [culled?]"); |
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | 328 | ||
329 | if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { | 329 | if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { |
330 | unlock_rename(cache->graveyard, dir); | 330 | unlock_rename(cache->graveyard, dir); |
331 | cachefiles_io_error(cache, "Graveyard no longer a directory"); | 331 | cachefiles_io_error(cache, "Graveyard no longer a directory"); |
332 | return -EIO; | 332 | return -EIO; |
333 | } | 333 | } |
334 | 334 | ||
335 | if (trap == rep) { | 335 | if (trap == rep) { |
336 | unlock_rename(cache->graveyard, dir); | 336 | unlock_rename(cache->graveyard, dir); |
337 | cachefiles_io_error(cache, "May not make directory loop"); | 337 | cachefiles_io_error(cache, "May not make directory loop"); |
338 | return -EIO; | 338 | return -EIO; |
339 | } | 339 | } |
340 | 340 | ||
341 | if (d_mountpoint(rep)) { | 341 | if (d_mountpoint(rep)) { |
342 | unlock_rename(cache->graveyard, dir); | 342 | unlock_rename(cache->graveyard, dir); |
343 | cachefiles_io_error(cache, "Mountpoint in cache"); | 343 | cachefiles_io_error(cache, "Mountpoint in cache"); |
344 | return -EIO; | 344 | return -EIO; |
345 | } | 345 | } |
346 | 346 | ||
347 | grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); | 347 | grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); |
348 | if (IS_ERR(grave)) { | 348 | if (IS_ERR(grave)) { |
349 | unlock_rename(cache->graveyard, dir); | 349 | unlock_rename(cache->graveyard, dir); |
350 | 350 | ||
351 | if (PTR_ERR(grave) == -ENOMEM) { | 351 | if (PTR_ERR(grave) == -ENOMEM) { |
352 | _leave(" = -ENOMEM"); | 352 | _leave(" = -ENOMEM"); |
353 | return -ENOMEM; | 353 | return -ENOMEM; |
354 | } | 354 | } |
355 | 355 | ||
356 | cachefiles_io_error(cache, "Lookup error %ld", | 356 | cachefiles_io_error(cache, "Lookup error %ld", |
357 | PTR_ERR(grave)); | 357 | PTR_ERR(grave)); |
358 | return -EIO; | 358 | return -EIO; |
359 | } | 359 | } |
360 | 360 | ||
361 | if (grave->d_inode) { | 361 | if (grave->d_inode) { |
362 | unlock_rename(cache->graveyard, dir); | 362 | unlock_rename(cache->graveyard, dir); |
363 | dput(grave); | 363 | dput(grave); |
364 | grave = NULL; | 364 | grave = NULL; |
365 | cond_resched(); | 365 | cond_resched(); |
366 | goto try_again; | 366 | goto try_again; |
367 | } | 367 | } |
368 | 368 | ||
369 | if (d_mountpoint(grave)) { | 369 | if (d_mountpoint(grave)) { |
370 | unlock_rename(cache->graveyard, dir); | 370 | unlock_rename(cache->graveyard, dir); |
371 | dput(grave); | 371 | dput(grave); |
372 | cachefiles_io_error(cache, "Mountpoint in graveyard"); | 372 | cachefiles_io_error(cache, "Mountpoint in graveyard"); |
373 | return -EIO; | 373 | return -EIO; |
374 | } | 374 | } |
375 | 375 | ||
376 | /* target should not be an ancestor of source */ | 376 | /* target should not be an ancestor of source */ |
377 | if (trap == grave) { | 377 | if (trap == grave) { |
378 | unlock_rename(cache->graveyard, dir); | 378 | unlock_rename(cache->graveyard, dir); |
379 | dput(grave); | 379 | dput(grave); |
380 | cachefiles_io_error(cache, "May not make directory loop"); | 380 | cachefiles_io_error(cache, "May not make directory loop"); |
381 | return -EIO; | 381 | return -EIO; |
382 | } | 382 | } |
383 | 383 | ||
384 | /* attempt the rename */ | 384 | /* attempt the rename */ |
385 | path.mnt = cache->mnt; | 385 | path.mnt = cache->mnt; |
386 | path.dentry = dir; | 386 | path.dentry = dir; |
387 | path_to_graveyard.mnt = cache->mnt; | 387 | path_to_graveyard.mnt = cache->mnt; |
388 | path_to_graveyard.dentry = cache->graveyard; | 388 | path_to_graveyard.dentry = cache->graveyard; |
389 | ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); | 389 | ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); |
390 | if (ret < 0) { | 390 | if (ret < 0) { |
391 | cachefiles_io_error(cache, "Rename security error %d", ret); | 391 | cachefiles_io_error(cache, "Rename security error %d", ret); |
392 | } else { | 392 | } else { |
393 | ret = vfs_rename(dir->d_inode, rep, | 393 | ret = vfs_rename(dir->d_inode, rep, |
394 | cache->graveyard->d_inode, grave, NULL, 0); | 394 | cache->graveyard->d_inode, grave, NULL, 0); |
395 | if (ret != 0 && ret != -ENOMEM) | 395 | if (ret != 0 && ret != -ENOMEM) |
396 | cachefiles_io_error(cache, | 396 | cachefiles_io_error(cache, |
397 | "Rename failed with error %d", ret); | 397 | "Rename failed with error %d", ret); |
398 | 398 | ||
399 | if (preemptive) | 399 | if (preemptive) |
400 | cachefiles_mark_object_buried(cache, rep); | 400 | cachefiles_mark_object_buried(cache, rep); |
401 | } | 401 | } |
402 | 402 | ||
403 | unlock_rename(cache->graveyard, dir); | 403 | unlock_rename(cache->graveyard, dir); |
404 | dput(grave); | 404 | dput(grave); |
405 | _leave(" = 0"); | 405 | _leave(" = 0"); |
406 | return 0; | 406 | return 0; |
407 | } | 407 | } |
408 | 408 | ||
409 | /* | 409 | /* |
410 | * delete an object representation from the cache | 410 | * delete an object representation from the cache |
411 | */ | 411 | */ |
412 | int cachefiles_delete_object(struct cachefiles_cache *cache, | 412 | int cachefiles_delete_object(struct cachefiles_cache *cache, |
413 | struct cachefiles_object *object) | 413 | struct cachefiles_object *object) |
414 | { | 414 | { |
415 | struct dentry *dir; | 415 | struct dentry *dir; |
416 | int ret; | 416 | int ret; |
417 | 417 | ||
418 | _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); | 418 | _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); |
419 | 419 | ||
420 | ASSERT(object->dentry); | 420 | ASSERT(object->dentry); |
421 | ASSERT(object->dentry->d_inode); | 421 | ASSERT(object->dentry->d_inode); |
422 | ASSERT(object->dentry->d_parent); | 422 | ASSERT(object->dentry->d_parent); |
423 | 423 | ||
424 | dir = dget_parent(object->dentry); | 424 | dir = dget_parent(object->dentry); |
425 | 425 | ||
426 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); | 426 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); |
427 | 427 | ||
428 | if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { | 428 | if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { |
429 | /* object allocation for the same key preemptively deleted this | 429 | /* object allocation for the same key preemptively deleted this |
430 | * object's file so that it could create its own file */ | 430 | * object's file so that it could create its own file */ |
431 | _debug("object preemptively buried"); | 431 | _debug("object preemptively buried"); |
432 | mutex_unlock(&dir->d_inode->i_mutex); | 432 | mutex_unlock(&dir->d_inode->i_mutex); |
433 | ret = 0; | 433 | ret = 0; |
434 | } else { | 434 | } else { |
435 | /* we need to check that our parent is _still_ our parent - it | 435 | /* we need to check that our parent is _still_ our parent - it |
436 | * may have been renamed */ | 436 | * may have been renamed */ |
437 | if (dir == object->dentry->d_parent) { | 437 | if (dir == object->dentry->d_parent) { |
438 | ret = cachefiles_bury_object(cache, dir, | 438 | ret = cachefiles_bury_object(cache, dir, |
439 | object->dentry, false); | 439 | object->dentry, false); |
440 | } else { | 440 | } else { |
441 | /* it got moved, presumably by cachefilesd culling it, | 441 | /* it got moved, presumably by cachefilesd culling it, |
442 | * so it's no longer in the key path and we can ignore | 442 | * so it's no longer in the key path and we can ignore |
443 | * it */ | 443 | * it */ |
444 | mutex_unlock(&dir->d_inode->i_mutex); | 444 | mutex_unlock(&dir->d_inode->i_mutex); |
445 | ret = 0; | 445 | ret = 0; |
446 | } | 446 | } |
447 | } | 447 | } |
448 | 448 | ||
449 | dput(dir); | 449 | dput(dir); |
450 | _leave(" = %d", ret); | 450 | _leave(" = %d", ret); |
451 | return ret; | 451 | return ret; |
452 | } | 452 | } |
453 | 453 | ||
454 | /* | 454 | /* |
455 | * walk from the parent object to the child object through the backing | 455 | * walk from the parent object to the child object through the backing |
456 | * filesystem, creating directories as we go | 456 | * filesystem, creating directories as we go |
457 | */ | 457 | */ |
458 | int cachefiles_walk_to_object(struct cachefiles_object *parent, | 458 | int cachefiles_walk_to_object(struct cachefiles_object *parent, |
459 | struct cachefiles_object *object, | 459 | struct cachefiles_object *object, |
460 | const char *key, | 460 | const char *key, |
461 | struct cachefiles_xattr *auxdata) | 461 | struct cachefiles_xattr *auxdata) |
462 | { | 462 | { |
463 | struct cachefiles_cache *cache; | 463 | struct cachefiles_cache *cache; |
464 | struct dentry *dir, *next = NULL; | 464 | struct dentry *dir, *next = NULL; |
465 | struct path path; | 465 | struct path path; |
466 | unsigned long start; | 466 | unsigned long start; |
467 | const char *name; | 467 | const char *name; |
468 | int ret, nlen; | 468 | int ret, nlen; |
469 | 469 | ||
470 | _enter("OBJ%x{%p},OBJ%x,%s,", | 470 | _enter("OBJ%x{%p},OBJ%x,%s,", |
471 | parent->fscache.debug_id, parent->dentry, | 471 | parent->fscache.debug_id, parent->dentry, |
472 | object->fscache.debug_id, key); | 472 | object->fscache.debug_id, key); |
473 | 473 | ||
474 | cache = container_of(parent->fscache.cache, | 474 | cache = container_of(parent->fscache.cache, |
475 | struct cachefiles_cache, cache); | 475 | struct cachefiles_cache, cache); |
476 | path.mnt = cache->mnt; | 476 | path.mnt = cache->mnt; |
477 | 477 | ||
478 | ASSERT(parent->dentry); | 478 | ASSERT(parent->dentry); |
479 | ASSERT(parent->dentry->d_inode); | 479 | ASSERT(parent->dentry->d_inode); |
480 | 480 | ||
481 | if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { | 481 | if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { |
482 | // TODO: convert file to dir | 482 | // TODO: convert file to dir |
483 | _leave("looking up in none directory"); | 483 | _leave("looking up in none directory"); |
484 | return -ENOBUFS; | 484 | return -ENOBUFS; |
485 | } | 485 | } |
486 | 486 | ||
487 | dir = dget(parent->dentry); | 487 | dir = dget(parent->dentry); |
488 | 488 | ||
489 | advance: | 489 | advance: |
490 | /* attempt to transit the first directory component */ | 490 | /* attempt to transit the first directory component */ |
491 | name = key; | 491 | name = key; |
492 | nlen = strlen(key); | 492 | nlen = strlen(key); |
493 | 493 | ||
494 | /* key ends in a double NUL */ | 494 | /* key ends in a double NUL */ |
495 | key = key + nlen + 1; | 495 | key = key + nlen + 1; |
496 | if (!*key) | 496 | if (!*key) |
497 | key = NULL; | 497 | key = NULL; |
498 | 498 | ||
499 | lookup_again: | 499 | lookup_again: |
500 | /* search the current directory for the element name */ | 500 | /* search the current directory for the element name */ |
501 | _debug("lookup '%s'", name); | 501 | _debug("lookup '%s'", name); |
502 | 502 | ||
503 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); | 503 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); |
504 | 504 | ||
505 | start = jiffies; | 505 | start = jiffies; |
506 | next = lookup_one_len(name, dir, nlen); | 506 | next = lookup_one_len(name, dir, nlen); |
507 | cachefiles_hist(cachefiles_lookup_histogram, start); | 507 | cachefiles_hist(cachefiles_lookup_histogram, start); |
508 | if (IS_ERR(next)) | 508 | if (IS_ERR(next)) |
509 | goto lookup_error; | 509 | goto lookup_error; |
510 | 510 | ||
511 | _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); | 511 | _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); |
512 | 512 | ||
513 | if (!key) | 513 | if (!key) |
514 | object->new = !next->d_inode; | 514 | object->new = !next->d_inode; |
515 | 515 | ||
516 | /* if this element of the path doesn't exist, then the lookup phase | 516 | /* if this element of the path doesn't exist, then the lookup phase |
517 | * failed, and we can release any readers in the certain knowledge that | 517 | * failed, and we can release any readers in the certain knowledge that |
518 | * there's nothing for them to actually read */ | 518 | * there's nothing for them to actually read */ |
519 | if (!next->d_inode) | 519 | if (!next->d_inode) |
520 | fscache_object_lookup_negative(&object->fscache); | 520 | fscache_object_lookup_negative(&object->fscache); |
521 | 521 | ||
522 | /* we need to create the object if it's negative */ | 522 | /* we need to create the object if it's negative */ |
523 | if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { | 523 | if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { |
524 | /* index objects and intervening tree levels must be subdirs */ | 524 | /* index objects and intervening tree levels must be subdirs */ |
525 | if (!next->d_inode) { | 525 | if (!next->d_inode) { |
526 | ret = cachefiles_has_space(cache, 1, 0); | 526 | ret = cachefiles_has_space(cache, 1, 0); |
527 | if (ret < 0) | 527 | if (ret < 0) |
528 | goto create_error; | 528 | goto create_error; |
529 | 529 | ||
530 | path.dentry = dir; | 530 | path.dentry = dir; |
531 | ret = security_path_mkdir(&path, next, 0); | 531 | ret = security_path_mkdir(&path, next, 0); |
532 | if (ret < 0) | 532 | if (ret < 0) |
533 | goto create_error; | 533 | goto create_error; |
534 | start = jiffies; | 534 | start = jiffies; |
535 | ret = vfs_mkdir(dir->d_inode, next, 0); | 535 | ret = vfs_mkdir(dir->d_inode, next, 0); |
536 | cachefiles_hist(cachefiles_mkdir_histogram, start); | 536 | cachefiles_hist(cachefiles_mkdir_histogram, start); |
537 | if (ret < 0) | 537 | if (ret < 0) |
538 | goto create_error; | 538 | goto create_error; |
539 | 539 | ||
540 | ASSERT(next->d_inode); | 540 | ASSERT(next->d_inode); |
541 | 541 | ||
542 | _debug("mkdir -> %p{%p{ino=%lu}}", | 542 | _debug("mkdir -> %p{%p{ino=%lu}}", |
543 | next, next->d_inode, next->d_inode->i_ino); | 543 | next, next->d_inode, next->d_inode->i_ino); |
544 | 544 | ||
545 | } else if (!S_ISDIR(next->d_inode->i_mode)) { | 545 | } else if (!S_ISDIR(next->d_inode->i_mode)) { |
546 | pr_err("inode %lu is not a directory", | 546 | pr_err("inode %lu is not a directory\n", |
547 | next->d_inode->i_ino); | 547 | next->d_inode->i_ino); |
548 | ret = -ENOBUFS; | 548 | ret = -ENOBUFS; |
549 | goto error; | 549 | goto error; |
550 | } | 550 | } |
551 | 551 | ||
552 | } else { | 552 | } else { |
553 | /* non-index objects start out life as files */ | 553 | /* non-index objects start out life as files */ |
554 | if (!next->d_inode) { | 554 | if (!next->d_inode) { |
555 | ret = cachefiles_has_space(cache, 1, 0); | 555 | ret = cachefiles_has_space(cache, 1, 0); |
556 | if (ret < 0) | 556 | if (ret < 0) |
557 | goto create_error; | 557 | goto create_error; |
558 | 558 | ||
559 | path.dentry = dir; | 559 | path.dentry = dir; |
560 | ret = security_path_mknod(&path, next, S_IFREG, 0); | 560 | ret = security_path_mknod(&path, next, S_IFREG, 0); |
561 | if (ret < 0) | 561 | if (ret < 0) |
562 | goto create_error; | 562 | goto create_error; |
563 | start = jiffies; | 563 | start = jiffies; |
564 | ret = vfs_create(dir->d_inode, next, S_IFREG, true); | 564 | ret = vfs_create(dir->d_inode, next, S_IFREG, true); |
565 | cachefiles_hist(cachefiles_create_histogram, start); | 565 | cachefiles_hist(cachefiles_create_histogram, start); |
566 | if (ret < 0) | 566 | if (ret < 0) |
567 | goto create_error; | 567 | goto create_error; |
568 | 568 | ||
569 | ASSERT(next->d_inode); | 569 | ASSERT(next->d_inode); |
570 | 570 | ||
571 | _debug("create -> %p{%p{ino=%lu}}", | 571 | _debug("create -> %p{%p{ino=%lu}}", |
572 | next, next->d_inode, next->d_inode->i_ino); | 572 | next, next->d_inode, next->d_inode->i_ino); |
573 | 573 | ||
574 | } else if (!S_ISDIR(next->d_inode->i_mode) && | 574 | } else if (!S_ISDIR(next->d_inode->i_mode) && |
575 | !S_ISREG(next->d_inode->i_mode) | 575 | !S_ISREG(next->d_inode->i_mode) |
576 | ) { | 576 | ) { |
577 | pr_err("inode %lu is not a file or directory", | 577 | pr_err("inode %lu is not a file or directory\n", |
578 | next->d_inode->i_ino); | 578 | next->d_inode->i_ino); |
579 | ret = -ENOBUFS; | 579 | ret = -ENOBUFS; |
580 | goto error; | 580 | goto error; |
581 | } | 581 | } |
582 | } | 582 | } |
583 | 583 | ||
584 | /* process the next component */ | 584 | /* process the next component */ |
585 | if (key) { | 585 | if (key) { |
586 | _debug("advance"); | 586 | _debug("advance"); |
587 | mutex_unlock(&dir->d_inode->i_mutex); | 587 | mutex_unlock(&dir->d_inode->i_mutex); |
588 | dput(dir); | 588 | dput(dir); |
589 | dir = next; | 589 | dir = next; |
590 | next = NULL; | 590 | next = NULL; |
591 | goto advance; | 591 | goto advance; |
592 | } | 592 | } |
593 | 593 | ||
594 | /* we've found the object we were looking for */ | 594 | /* we've found the object we were looking for */ |
595 | object->dentry = next; | 595 | object->dentry = next; |
596 | 596 | ||
597 | /* if we've found that the terminal object exists, then we need to | 597 | /* if we've found that the terminal object exists, then we need to |
598 | * check its attributes and delete it if it's out of date */ | 598 | * check its attributes and delete it if it's out of date */ |
599 | if (!object->new) { | 599 | if (!object->new) { |
600 | _debug("validate '%*.*s'", | 600 | _debug("validate '%*.*s'", |
601 | next->d_name.len, next->d_name.len, next->d_name.name); | 601 | next->d_name.len, next->d_name.len, next->d_name.name); |
602 | 602 | ||
603 | ret = cachefiles_check_object_xattr(object, auxdata); | 603 | ret = cachefiles_check_object_xattr(object, auxdata); |
604 | if (ret == -ESTALE) { | 604 | if (ret == -ESTALE) { |
605 | /* delete the object (the deleter drops the directory | 605 | /* delete the object (the deleter drops the directory |
606 | * mutex) */ | 606 | * mutex) */ |
607 | object->dentry = NULL; | 607 | object->dentry = NULL; |
608 | 608 | ||
609 | ret = cachefiles_bury_object(cache, dir, next, true); | 609 | ret = cachefiles_bury_object(cache, dir, next, true); |
610 | dput(next); | 610 | dput(next); |
611 | next = NULL; | 611 | next = NULL; |
612 | 612 | ||
613 | if (ret < 0) | 613 | if (ret < 0) |
614 | goto delete_error; | 614 | goto delete_error; |
615 | 615 | ||
616 | _debug("redo lookup"); | 616 | _debug("redo lookup"); |
617 | goto lookup_again; | 617 | goto lookup_again; |
618 | } | 618 | } |
619 | } | 619 | } |
620 | 620 | ||
621 | /* note that we're now using this object */ | 621 | /* note that we're now using this object */ |
622 | ret = cachefiles_mark_object_active(cache, object); | 622 | ret = cachefiles_mark_object_active(cache, object); |
623 | 623 | ||
624 | mutex_unlock(&dir->d_inode->i_mutex); | 624 | mutex_unlock(&dir->d_inode->i_mutex); |
625 | dput(dir); | 625 | dput(dir); |
626 | dir = NULL; | 626 | dir = NULL; |
627 | 627 | ||
628 | if (ret == -ETIMEDOUT) | 628 | if (ret == -ETIMEDOUT) |
629 | goto mark_active_timed_out; | 629 | goto mark_active_timed_out; |
630 | 630 | ||
631 | _debug("=== OBTAINED_OBJECT ==="); | 631 | _debug("=== OBTAINED_OBJECT ==="); |
632 | 632 | ||
633 | if (object->new) { | 633 | if (object->new) { |
634 | /* attach data to a newly constructed terminal object */ | 634 | /* attach data to a newly constructed terminal object */ |
635 | ret = cachefiles_set_object_xattr(object, auxdata); | 635 | ret = cachefiles_set_object_xattr(object, auxdata); |
636 | if (ret < 0) | 636 | if (ret < 0) |
637 | goto check_error; | 637 | goto check_error; |
638 | } else { | 638 | } else { |
639 | /* always update the atime on an object we've just looked up | 639 | /* always update the atime on an object we've just looked up |
640 | * (this is used to keep track of culling, and atimes are only | 640 | * (this is used to keep track of culling, and atimes are only |
641 | * updated by read, write and readdir but not lookup or | 641 | * updated by read, write and readdir but not lookup or |
642 | * open) */ | 642 | * open) */ |
643 | path.dentry = next; | 643 | path.dentry = next; |
644 | touch_atime(&path); | 644 | touch_atime(&path); |
645 | } | 645 | } |
646 | 646 | ||
647 | /* open a file interface onto a data file */ | 647 | /* open a file interface onto a data file */ |
648 | if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { | 648 | if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { |
649 | if (S_ISREG(object->dentry->d_inode->i_mode)) { | 649 | if (S_ISREG(object->dentry->d_inode->i_mode)) { |
650 | const struct address_space_operations *aops; | 650 | const struct address_space_operations *aops; |
651 | 651 | ||
652 | ret = -EPERM; | 652 | ret = -EPERM; |
653 | aops = object->dentry->d_inode->i_mapping->a_ops; | 653 | aops = object->dentry->d_inode->i_mapping->a_ops; |
654 | if (!aops->bmap) | 654 | if (!aops->bmap) |
655 | goto check_error; | 655 | goto check_error; |
656 | 656 | ||
657 | object->backer = object->dentry; | 657 | object->backer = object->dentry; |
658 | } else { | 658 | } else { |
659 | BUG(); // TODO: open file in data-class subdir | 659 | BUG(); // TODO: open file in data-class subdir |
660 | } | 660 | } |
661 | } | 661 | } |
662 | 662 | ||
663 | object->new = 0; | 663 | object->new = 0; |
664 | fscache_obtained_object(&object->fscache); | 664 | fscache_obtained_object(&object->fscache); |
665 | 665 | ||
666 | _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); | 666 | _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); |
667 | return 0; | 667 | return 0; |
668 | 668 | ||
669 | create_error: | 669 | create_error: |
670 | _debug("create error %d", ret); | 670 | _debug("create error %d", ret); |
671 | if (ret == -EIO) | 671 | if (ret == -EIO) |
672 | cachefiles_io_error(cache, "Create/mkdir failed"); | 672 | cachefiles_io_error(cache, "Create/mkdir failed"); |
673 | goto error; | 673 | goto error; |
674 | 674 | ||
675 | mark_active_timed_out: | 675 | mark_active_timed_out: |
676 | _debug("mark active timed out"); | 676 | _debug("mark active timed out"); |
677 | goto release_dentry; | 677 | goto release_dentry; |
678 | 678 | ||
679 | check_error: | 679 | check_error: |
680 | _debug("check error %d", ret); | 680 | _debug("check error %d", ret); |
681 | write_lock(&cache->active_lock); | 681 | write_lock(&cache->active_lock); |
682 | rb_erase(&object->active_node, &cache->active_nodes); | 682 | rb_erase(&object->active_node, &cache->active_nodes); |
683 | clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); | 683 | clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); |
684 | wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); | 684 | wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); |
685 | write_unlock(&cache->active_lock); | 685 | write_unlock(&cache->active_lock); |
686 | release_dentry: | 686 | release_dentry: |
687 | dput(object->dentry); | 687 | dput(object->dentry); |
688 | object->dentry = NULL; | 688 | object->dentry = NULL; |
689 | goto error_out; | 689 | goto error_out; |
690 | 690 | ||
691 | delete_error: | 691 | delete_error: |
692 | _debug("delete error %d", ret); | 692 | _debug("delete error %d", ret); |
693 | goto error_out2; | 693 | goto error_out2; |
694 | 694 | ||
695 | lookup_error: | 695 | lookup_error: |
696 | _debug("lookup error %ld", PTR_ERR(next)); | 696 | _debug("lookup error %ld", PTR_ERR(next)); |
697 | ret = PTR_ERR(next); | 697 | ret = PTR_ERR(next); |
698 | if (ret == -EIO) | 698 | if (ret == -EIO) |
699 | cachefiles_io_error(cache, "Lookup failed"); | 699 | cachefiles_io_error(cache, "Lookup failed"); |
700 | next = NULL; | 700 | next = NULL; |
701 | error: | 701 | error: |
702 | mutex_unlock(&dir->d_inode->i_mutex); | 702 | mutex_unlock(&dir->d_inode->i_mutex); |
703 | dput(next); | 703 | dput(next); |
704 | error_out2: | 704 | error_out2: |
705 | dput(dir); | 705 | dput(dir); |
706 | error_out: | 706 | error_out: |
707 | _leave(" = error %d", -ret); | 707 | _leave(" = error %d", -ret); |
708 | return ret; | 708 | return ret; |
709 | } | 709 | } |
710 | 710 | ||
711 | /* | 711 | /* |
712 | * get a subdirectory | 712 | * get a subdirectory |
713 | */ | 713 | */ |
714 | struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, | 714 | struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, |
715 | struct dentry *dir, | 715 | struct dentry *dir, |
716 | const char *dirname) | 716 | const char *dirname) |
717 | { | 717 | { |
718 | struct dentry *subdir; | 718 | struct dentry *subdir; |
719 | unsigned long start; | 719 | unsigned long start; |
720 | struct path path; | 720 | struct path path; |
721 | int ret; | 721 | int ret; |
722 | 722 | ||
723 | _enter(",,%s", dirname); | 723 | _enter(",,%s", dirname); |
724 | 724 | ||
725 | /* search the current directory for the element name */ | 725 | /* search the current directory for the element name */ |
726 | mutex_lock(&dir->d_inode->i_mutex); | 726 | mutex_lock(&dir->d_inode->i_mutex); |
727 | 727 | ||
728 | start = jiffies; | 728 | start = jiffies; |
729 | subdir = lookup_one_len(dirname, dir, strlen(dirname)); | 729 | subdir = lookup_one_len(dirname, dir, strlen(dirname)); |
730 | cachefiles_hist(cachefiles_lookup_histogram, start); | 730 | cachefiles_hist(cachefiles_lookup_histogram, start); |
731 | if (IS_ERR(subdir)) { | 731 | if (IS_ERR(subdir)) { |
732 | if (PTR_ERR(subdir) == -ENOMEM) | 732 | if (PTR_ERR(subdir) == -ENOMEM) |
733 | goto nomem_d_alloc; | 733 | goto nomem_d_alloc; |
734 | goto lookup_error; | 734 | goto lookup_error; |
735 | } | 735 | } |
736 | 736 | ||
737 | _debug("subdir -> %p %s", | 737 | _debug("subdir -> %p %s", |
738 | subdir, subdir->d_inode ? "positive" : "negative"); | 738 | subdir, subdir->d_inode ? "positive" : "negative"); |
739 | 739 | ||
740 | /* we need to create the subdir if it doesn't exist yet */ | 740 | /* we need to create the subdir if it doesn't exist yet */ |
741 | if (!subdir->d_inode) { | 741 | if (!subdir->d_inode) { |
742 | ret = cachefiles_has_space(cache, 1, 0); | 742 | ret = cachefiles_has_space(cache, 1, 0); |
743 | if (ret < 0) | 743 | if (ret < 0) |
744 | goto mkdir_error; | 744 | goto mkdir_error; |
745 | 745 | ||
746 | _debug("attempt mkdir"); | 746 | _debug("attempt mkdir"); |
747 | 747 | ||
748 | path.mnt = cache->mnt; | 748 | path.mnt = cache->mnt; |
749 | path.dentry = dir; | 749 | path.dentry = dir; |
750 | ret = security_path_mkdir(&path, subdir, 0700); | 750 | ret = security_path_mkdir(&path, subdir, 0700); |
751 | if (ret < 0) | 751 | if (ret < 0) |
752 | goto mkdir_error; | 752 | goto mkdir_error; |
753 | ret = vfs_mkdir(dir->d_inode, subdir, 0700); | 753 | ret = vfs_mkdir(dir->d_inode, subdir, 0700); |
754 | if (ret < 0) | 754 | if (ret < 0) |
755 | goto mkdir_error; | 755 | goto mkdir_error; |
756 | 756 | ||
757 | ASSERT(subdir->d_inode); | 757 | ASSERT(subdir->d_inode); |
758 | 758 | ||
759 | _debug("mkdir -> %p{%p{ino=%lu}}", | 759 | _debug("mkdir -> %p{%p{ino=%lu}}", |
760 | subdir, | 760 | subdir, |
761 | subdir->d_inode, | 761 | subdir->d_inode, |
762 | subdir->d_inode->i_ino); | 762 | subdir->d_inode->i_ino); |
763 | } | 763 | } |
764 | 764 | ||
765 | mutex_unlock(&dir->d_inode->i_mutex); | 765 | mutex_unlock(&dir->d_inode->i_mutex); |
766 | 766 | ||
767 | /* we need to make sure the subdir is a directory */ | 767 | /* we need to make sure the subdir is a directory */ |
768 | ASSERT(subdir->d_inode); | 768 | ASSERT(subdir->d_inode); |
769 | 769 | ||
770 | if (!S_ISDIR(subdir->d_inode->i_mode)) { | 770 | if (!S_ISDIR(subdir->d_inode->i_mode)) { |
771 | pr_err("%s is not a directory", dirname); | 771 | pr_err("%s is not a directory\n", dirname); |
772 | ret = -EIO; | 772 | ret = -EIO; |
773 | goto check_error; | 773 | goto check_error; |
774 | } | 774 | } |
775 | 775 | ||
776 | ret = -EPERM; | 776 | ret = -EPERM; |
777 | if (!subdir->d_inode->i_op->setxattr || | 777 | if (!subdir->d_inode->i_op->setxattr || |
778 | !subdir->d_inode->i_op->getxattr || | 778 | !subdir->d_inode->i_op->getxattr || |
779 | !subdir->d_inode->i_op->lookup || | 779 | !subdir->d_inode->i_op->lookup || |
780 | !subdir->d_inode->i_op->mkdir || | 780 | !subdir->d_inode->i_op->mkdir || |
781 | !subdir->d_inode->i_op->create || | 781 | !subdir->d_inode->i_op->create || |
782 | (!subdir->d_inode->i_op->rename && | 782 | (!subdir->d_inode->i_op->rename && |
783 | !subdir->d_inode->i_op->rename2) || | 783 | !subdir->d_inode->i_op->rename2) || |
784 | !subdir->d_inode->i_op->rmdir || | 784 | !subdir->d_inode->i_op->rmdir || |
785 | !subdir->d_inode->i_op->unlink) | 785 | !subdir->d_inode->i_op->unlink) |
786 | goto check_error; | 786 | goto check_error; |
787 | 787 | ||
788 | _leave(" = [%lu]", subdir->d_inode->i_ino); | 788 | _leave(" = [%lu]", subdir->d_inode->i_ino); |
789 | return subdir; | 789 | return subdir; |
790 | 790 | ||
791 | check_error: | 791 | check_error: |
792 | dput(subdir); | 792 | dput(subdir); |
793 | _leave(" = %d [check]", ret); | 793 | _leave(" = %d [check]", ret); |
794 | return ERR_PTR(ret); | 794 | return ERR_PTR(ret); |
795 | 795 | ||
796 | mkdir_error: | 796 | mkdir_error: |
797 | mutex_unlock(&dir->d_inode->i_mutex); | 797 | mutex_unlock(&dir->d_inode->i_mutex); |
798 | dput(subdir); | 798 | dput(subdir); |
799 | pr_err("mkdir %s failed with error %d", dirname, ret); | 799 | pr_err("mkdir %s failed with error %d\n", dirname, ret); |
800 | return ERR_PTR(ret); | 800 | return ERR_PTR(ret); |
801 | 801 | ||
802 | lookup_error: | 802 | lookup_error: |
803 | mutex_unlock(&dir->d_inode->i_mutex); | 803 | mutex_unlock(&dir->d_inode->i_mutex); |
804 | ret = PTR_ERR(subdir); | 804 | ret = PTR_ERR(subdir); |
805 | pr_err("Lookup %s failed with error %d", dirname, ret); | 805 | pr_err("Lookup %s failed with error %d\n", dirname, ret); |
806 | return ERR_PTR(ret); | 806 | return ERR_PTR(ret); |
807 | 807 | ||
808 | nomem_d_alloc: | 808 | nomem_d_alloc: |
809 | mutex_unlock(&dir->d_inode->i_mutex); | 809 | mutex_unlock(&dir->d_inode->i_mutex); |
810 | _leave(" = -ENOMEM"); | 810 | _leave(" = -ENOMEM"); |
811 | return ERR_PTR(-ENOMEM); | 811 | return ERR_PTR(-ENOMEM); |
812 | } | 812 | } |
813 | 813 | ||
814 | /* | 814 | /* |
815 | * find out if an object is in use or not | 815 | * find out if an object is in use or not |
816 | * - if finds object and it's not in use: | 816 | * - if finds object and it's not in use: |
817 | * - returns a pointer to the object and a reference on it | 817 | * - returns a pointer to the object and a reference on it |
818 | * - returns with the directory locked | 818 | * - returns with the directory locked |
819 | */ | 819 | */ |
820 | static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, | 820 | static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, |
821 | struct dentry *dir, | 821 | struct dentry *dir, |
822 | char *filename) | 822 | char *filename) |
823 | { | 823 | { |
824 | struct cachefiles_object *object; | 824 | struct cachefiles_object *object; |
825 | struct rb_node *_n; | 825 | struct rb_node *_n; |
826 | struct dentry *victim; | 826 | struct dentry *victim; |
827 | unsigned long start; | 827 | unsigned long start; |
828 | int ret; | 828 | int ret; |
829 | 829 | ||
830 | //_enter(",%*.*s/,%s", | 830 | //_enter(",%*.*s/,%s", |
831 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); | 831 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); |
832 | 832 | ||
833 | /* look up the victim */ | 833 | /* look up the victim */ |
834 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); | 834 | mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); |
835 | 835 | ||
836 | start = jiffies; | 836 | start = jiffies; |
837 | victim = lookup_one_len(filename, dir, strlen(filename)); | 837 | victim = lookup_one_len(filename, dir, strlen(filename)); |
838 | cachefiles_hist(cachefiles_lookup_histogram, start); | 838 | cachefiles_hist(cachefiles_lookup_histogram, start); |
839 | if (IS_ERR(victim)) | 839 | if (IS_ERR(victim)) |
840 | goto lookup_error; | 840 | goto lookup_error; |
841 | 841 | ||
842 | //_debug("victim -> %p %s", | 842 | //_debug("victim -> %p %s", |
843 | // victim, victim->d_inode ? "positive" : "negative"); | 843 | // victim, victim->d_inode ? "positive" : "negative"); |
844 | 844 | ||
845 | /* if the object is no longer there then we probably retired the object | 845 | /* if the object is no longer there then we probably retired the object |
846 | * at the netfs's request whilst the cull was in progress | 846 | * at the netfs's request whilst the cull was in progress |
847 | */ | 847 | */ |
848 | if (!victim->d_inode) { | 848 | if (!victim->d_inode) { |
849 | mutex_unlock(&dir->d_inode->i_mutex); | 849 | mutex_unlock(&dir->d_inode->i_mutex); |
850 | dput(victim); | 850 | dput(victim); |
851 | _leave(" = -ENOENT [absent]"); | 851 | _leave(" = -ENOENT [absent]"); |
852 | return ERR_PTR(-ENOENT); | 852 | return ERR_PTR(-ENOENT); |
853 | } | 853 | } |
854 | 854 | ||
855 | /* check to see if we're using this object */ | 855 | /* check to see if we're using this object */ |
856 | read_lock(&cache->active_lock); | 856 | read_lock(&cache->active_lock); |
857 | 857 | ||
858 | _n = cache->active_nodes.rb_node; | 858 | _n = cache->active_nodes.rb_node; |
859 | 859 | ||
860 | while (_n) { | 860 | while (_n) { |
861 | object = rb_entry(_n, struct cachefiles_object, active_node); | 861 | object = rb_entry(_n, struct cachefiles_object, active_node); |
862 | 862 | ||
863 | if (object->dentry > victim) | 863 | if (object->dentry > victim) |
864 | _n = _n->rb_left; | 864 | _n = _n->rb_left; |
865 | else if (object->dentry < victim) | 865 | else if (object->dentry < victim) |
866 | _n = _n->rb_right; | 866 | _n = _n->rb_right; |
867 | else | 867 | else |
868 | goto object_in_use; | 868 | goto object_in_use; |
869 | } | 869 | } |
870 | 870 | ||
871 | read_unlock(&cache->active_lock); | 871 | read_unlock(&cache->active_lock); |
872 | 872 | ||
873 | //_leave(" = %p", victim); | 873 | //_leave(" = %p", victim); |
874 | return victim; | 874 | return victim; |
875 | 875 | ||
876 | object_in_use: | 876 | object_in_use: |
877 | read_unlock(&cache->active_lock); | 877 | read_unlock(&cache->active_lock); |
878 | mutex_unlock(&dir->d_inode->i_mutex); | 878 | mutex_unlock(&dir->d_inode->i_mutex); |
879 | dput(victim); | 879 | dput(victim); |
880 | //_leave(" = -EBUSY [in use]"); | 880 | //_leave(" = -EBUSY [in use]"); |
881 | return ERR_PTR(-EBUSY); | 881 | return ERR_PTR(-EBUSY); |
882 | 882 | ||
883 | lookup_error: | 883 | lookup_error: |
884 | mutex_unlock(&dir->d_inode->i_mutex); | 884 | mutex_unlock(&dir->d_inode->i_mutex); |
885 | ret = PTR_ERR(victim); | 885 | ret = PTR_ERR(victim); |
886 | if (ret == -ENOENT) { | 886 | if (ret == -ENOENT) { |
887 | /* file or dir now absent - probably retired by netfs */ | 887 | /* file or dir now absent - probably retired by netfs */ |
888 | _leave(" = -ESTALE [absent]"); | 888 | _leave(" = -ESTALE [absent]"); |
889 | return ERR_PTR(-ESTALE); | 889 | return ERR_PTR(-ESTALE); |
890 | } | 890 | } |
891 | 891 | ||
892 | if (ret == -EIO) { | 892 | if (ret == -EIO) { |
893 | cachefiles_io_error(cache, "Lookup failed"); | 893 | cachefiles_io_error(cache, "Lookup failed"); |
894 | } else if (ret != -ENOMEM) { | 894 | } else if (ret != -ENOMEM) { |
895 | pr_err("Internal error: %d", ret); | 895 | pr_err("Internal error: %d\n", ret); |
896 | ret = -EIO; | 896 | ret = -EIO; |
897 | } | 897 | } |
898 | 898 | ||
899 | _leave(" = %d", ret); | 899 | _leave(" = %d", ret); |
900 | return ERR_PTR(ret); | 900 | return ERR_PTR(ret); |
901 | } | 901 | } |
902 | 902 | ||
903 | /* | 903 | /* |
904 | * cull an object if it's not in use | 904 | * cull an object if it's not in use |
905 | * - called only by cache manager daemon | 905 | * - called only by cache manager daemon |
906 | */ | 906 | */ |
907 | int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, | 907 | int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, |
908 | char *filename) | 908 | char *filename) |
909 | { | 909 | { |
910 | struct dentry *victim; | 910 | struct dentry *victim; |
911 | int ret; | 911 | int ret; |
912 | 912 | ||
913 | _enter(",%*.*s/,%s", | 913 | _enter(",%*.*s/,%s", |
914 | dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); | 914 | dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); |
915 | 915 | ||
916 | victim = cachefiles_check_active(cache, dir, filename); | 916 | victim = cachefiles_check_active(cache, dir, filename); |
917 | if (IS_ERR(victim)) | 917 | if (IS_ERR(victim)) |
918 | return PTR_ERR(victim); | 918 | return PTR_ERR(victim); |
919 | 919 | ||
920 | _debug("victim -> %p %s", | 920 | _debug("victim -> %p %s", |
921 | victim, victim->d_inode ? "positive" : "negative"); | 921 | victim, victim->d_inode ? "positive" : "negative"); |
922 | 922 | ||
923 | /* okay... the victim is not being used so we can cull it | 923 | /* okay... the victim is not being used so we can cull it |
924 | * - start by marking it as stale | 924 | * - start by marking it as stale |
925 | */ | 925 | */ |
926 | _debug("victim is cullable"); | 926 | _debug("victim is cullable"); |
927 | 927 | ||
928 | ret = cachefiles_remove_object_xattr(cache, victim); | 928 | ret = cachefiles_remove_object_xattr(cache, victim); |
929 | if (ret < 0) | 929 | if (ret < 0) |
930 | goto error_unlock; | 930 | goto error_unlock; |
931 | 931 | ||
932 | /* actually remove the victim (drops the dir mutex) */ | 932 | /* actually remove the victim (drops the dir mutex) */ |
933 | _debug("bury"); | 933 | _debug("bury"); |
934 | 934 | ||
935 | ret = cachefiles_bury_object(cache, dir, victim, false); | 935 | ret = cachefiles_bury_object(cache, dir, victim, false); |
936 | if (ret < 0) | 936 | if (ret < 0) |
937 | goto error; | 937 | goto error; |
938 | 938 | ||
939 | dput(victim); | 939 | dput(victim); |
940 | _leave(" = 0"); | 940 | _leave(" = 0"); |
941 | return 0; | 941 | return 0; |
942 | 942 | ||
943 | error_unlock: | 943 | error_unlock: |
944 | mutex_unlock(&dir->d_inode->i_mutex); | 944 | mutex_unlock(&dir->d_inode->i_mutex); |
945 | error: | 945 | error: |
946 | dput(victim); | 946 | dput(victim); |
947 | if (ret == -ENOENT) { | 947 | if (ret == -ENOENT) { |
948 | /* file or dir now absent - probably retired by netfs */ | 948 | /* file or dir now absent - probably retired by netfs */ |
949 | _leave(" = -ESTALE [absent]"); | 949 | _leave(" = -ESTALE [absent]"); |
950 | return -ESTALE; | 950 | return -ESTALE; |
951 | } | 951 | } |
952 | 952 | ||
953 | if (ret != -ENOMEM) { | 953 | if (ret != -ENOMEM) { |
954 | pr_err("Internal error: %d", ret); | 954 | pr_err("Internal error: %d\n", ret); |
955 | ret = -EIO; | 955 | ret = -EIO; |
956 | } | 956 | } |
957 | 957 | ||
958 | _leave(" = %d", ret); | 958 | _leave(" = %d", ret); |
959 | return ret; | 959 | return ret; |
960 | } | 960 | } |
961 | 961 | ||
962 | /* | 962 | /* |
963 | * find out if an object is in use or not | 963 | * find out if an object is in use or not |
964 | * - called only by cache manager daemon | 964 | * - called only by cache manager daemon |
965 | * - returns -EBUSY or 0 to indicate whether an object is in use or not | 965 | * - returns -EBUSY or 0 to indicate whether an object is in use or not |
966 | */ | 966 | */ |
967 | int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, | 967 | int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, |
968 | char *filename) | 968 | char *filename) |
969 | { | 969 | { |
970 | struct dentry *victim; | 970 | struct dentry *victim; |
971 | 971 | ||
972 | //_enter(",%*.*s/,%s", | 972 | //_enter(",%*.*s/,%s", |
973 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); | 973 | // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); |
974 | 974 | ||
975 | victim = cachefiles_check_active(cache, dir, filename); | 975 | victim = cachefiles_check_active(cache, dir, filename); |
976 | if (IS_ERR(victim)) | 976 | if (IS_ERR(victim)) |
977 | return PTR_ERR(victim); | 977 | return PTR_ERR(victim); |
978 | 978 | ||
979 | mutex_unlock(&dir->d_inode->i_mutex); | 979 | mutex_unlock(&dir->d_inode->i_mutex); |
980 | dput(victim); | 980 | dput(victim); |
981 | //_leave(" = 0"); | 981 | //_leave(" = 0"); |
982 | return 0; | 982 | return 0; |
983 | } | 983 | } |
984 | 984 |
fs/cachefiles/xattr.c
1 | /* CacheFiles extended attribute management | 1 | /* CacheFiles extended attribute management |
2 | * | 2 | * |
3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public Licence | 7 | * modify it under the terms of the GNU General Public Licence |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/fsnotify.h> | 16 | #include <linux/fsnotify.h> |
17 | #include <linux/quotaops.h> | 17 | #include <linux/quotaops.h> |
18 | #include <linux/xattr.h> | 18 | #include <linux/xattr.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include "internal.h" | 20 | #include "internal.h" |
21 | 21 | ||
22 | static const char cachefiles_xattr_cache[] = | 22 | static const char cachefiles_xattr_cache[] = |
23 | XATTR_USER_PREFIX "CacheFiles.cache"; | 23 | XATTR_USER_PREFIX "CacheFiles.cache"; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * check the type label on an object | 26 | * check the type label on an object |
27 | * - done using xattrs | 27 | * - done using xattrs |
28 | */ | 28 | */ |
29 | int cachefiles_check_object_type(struct cachefiles_object *object) | 29 | int cachefiles_check_object_type(struct cachefiles_object *object) |
30 | { | 30 | { |
31 | struct dentry *dentry = object->dentry; | 31 | struct dentry *dentry = object->dentry; |
32 | char type[3], xtype[3]; | 32 | char type[3], xtype[3]; |
33 | int ret; | 33 | int ret; |
34 | 34 | ||
35 | ASSERT(dentry); | 35 | ASSERT(dentry); |
36 | ASSERT(dentry->d_inode); | 36 | ASSERT(dentry->d_inode); |
37 | 37 | ||
38 | if (!object->fscache.cookie) | 38 | if (!object->fscache.cookie) |
39 | strcpy(type, "C3"); | 39 | strcpy(type, "C3"); |
40 | else | 40 | else |
41 | snprintf(type, 3, "%02x", object->fscache.cookie->def->type); | 41 | snprintf(type, 3, "%02x", object->fscache.cookie->def->type); |
42 | 42 | ||
43 | _enter("%p{%s}", object, type); | 43 | _enter("%p{%s}", object, type); |
44 | 44 | ||
45 | /* attempt to install a type label directly */ | 45 | /* attempt to install a type label directly */ |
46 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, | 46 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, |
47 | XATTR_CREATE); | 47 | XATTR_CREATE); |
48 | if (ret == 0) { | 48 | if (ret == 0) { |
49 | _debug("SET"); /* we succeeded */ | 49 | _debug("SET"); /* we succeeded */ |
50 | goto error; | 50 | goto error; |
51 | } | 51 | } |
52 | 52 | ||
53 | if (ret != -EEXIST) { | 53 | if (ret != -EEXIST) { |
54 | pr_err("Can't set xattr on %*.*s [%lu] (err %d)", | 54 | pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n", |
55 | dentry->d_name.len, dentry->d_name.len, | 55 | dentry->d_name.len, dentry->d_name.len, |
56 | dentry->d_name.name, dentry->d_inode->i_ino, | 56 | dentry->d_name.name, dentry->d_inode->i_ino, |
57 | -ret); | 57 | -ret); |
58 | goto error; | 58 | goto error; |
59 | } | 59 | } |
60 | 60 | ||
61 | /* read the current type label */ | 61 | /* read the current type label */ |
62 | ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); | 62 | ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); |
63 | if (ret < 0) { | 63 | if (ret < 0) { |
64 | if (ret == -ERANGE) | 64 | if (ret == -ERANGE) |
65 | goto bad_type_length; | 65 | goto bad_type_length; |
66 | 66 | ||
67 | pr_err("Can't read xattr on %*.*s [%lu] (err %d)", | 67 | pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n", |
68 | dentry->d_name.len, dentry->d_name.len, | 68 | dentry->d_name.len, dentry->d_name.len, |
69 | dentry->d_name.name, dentry->d_inode->i_ino, | 69 | dentry->d_name.name, dentry->d_inode->i_ino, |
70 | -ret); | 70 | -ret); |
71 | goto error; | 71 | goto error; |
72 | } | 72 | } |
73 | 73 | ||
74 | /* check the type is what we're expecting */ | 74 | /* check the type is what we're expecting */ |
75 | if (ret != 2) | 75 | if (ret != 2) |
76 | goto bad_type_length; | 76 | goto bad_type_length; |
77 | 77 | ||
78 | if (xtype[0] != type[0] || xtype[1] != type[1]) | 78 | if (xtype[0] != type[0] || xtype[1] != type[1]) |
79 | goto bad_type; | 79 | goto bad_type; |
80 | 80 | ||
81 | ret = 0; | 81 | ret = 0; |
82 | 82 | ||
83 | error: | 83 | error: |
84 | _leave(" = %d", ret); | 84 | _leave(" = %d", ret); |
85 | return ret; | 85 | return ret; |
86 | 86 | ||
87 | bad_type_length: | 87 | bad_type_length: |
88 | pr_err("Cache object %lu type xattr length incorrect", | 88 | pr_err("Cache object %lu type xattr length incorrect\n", |
89 | dentry->d_inode->i_ino); | 89 | dentry->d_inode->i_ino); |
90 | ret = -EIO; | 90 | ret = -EIO; |
91 | goto error; | 91 | goto error; |
92 | 92 | ||
93 | bad_type: | 93 | bad_type: |
94 | xtype[2] = 0; | 94 | xtype[2] = 0; |
95 | pr_err("Cache object %*.*s [%lu] type %s not %s", | 95 | pr_err("Cache object %*.*s [%lu] type %s not %s\n", |
96 | dentry->d_name.len, dentry->d_name.len, | 96 | dentry->d_name.len, dentry->d_name.len, |
97 | dentry->d_name.name, dentry->d_inode->i_ino, | 97 | dentry->d_name.name, dentry->d_inode->i_ino, |
98 | xtype, type); | 98 | xtype, type); |
99 | ret = -EIO; | 99 | ret = -EIO; |
100 | goto error; | 100 | goto error; |
101 | } | 101 | } |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * set the state xattr on a cache file | 104 | * set the state xattr on a cache file |
105 | */ | 105 | */ |
106 | int cachefiles_set_object_xattr(struct cachefiles_object *object, | 106 | int cachefiles_set_object_xattr(struct cachefiles_object *object, |
107 | struct cachefiles_xattr *auxdata) | 107 | struct cachefiles_xattr *auxdata) |
108 | { | 108 | { |
109 | struct dentry *dentry = object->dentry; | 109 | struct dentry *dentry = object->dentry; |
110 | int ret; | 110 | int ret; |
111 | 111 | ||
112 | ASSERT(dentry); | 112 | ASSERT(dentry); |
113 | 113 | ||
114 | _enter("%p,#%d", object, auxdata->len); | 114 | _enter("%p,#%d", object, auxdata->len); |
115 | 115 | ||
116 | /* attempt to install the cache metadata directly */ | 116 | /* attempt to install the cache metadata directly */ |
117 | _debug("SET #%u", auxdata->len); | 117 | _debug("SET #%u", auxdata->len); |
118 | 118 | ||
119 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, | 119 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, |
120 | &auxdata->type, auxdata->len, | 120 | &auxdata->type, auxdata->len, |
121 | XATTR_CREATE); | 121 | XATTR_CREATE); |
122 | if (ret < 0 && ret != -ENOMEM) | 122 | if (ret < 0 && ret != -ENOMEM) |
123 | cachefiles_io_error_obj( | 123 | cachefiles_io_error_obj( |
124 | object, | 124 | object, |
125 | "Failed to set xattr with error %d", ret); | 125 | "Failed to set xattr with error %d", ret); |
126 | 126 | ||
127 | _leave(" = %d", ret); | 127 | _leave(" = %d", ret); |
128 | return ret; | 128 | return ret; |
129 | } | 129 | } |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * update the state xattr on a cache file | 132 | * update the state xattr on a cache file |
133 | */ | 133 | */ |
134 | int cachefiles_update_object_xattr(struct cachefiles_object *object, | 134 | int cachefiles_update_object_xattr(struct cachefiles_object *object, |
135 | struct cachefiles_xattr *auxdata) | 135 | struct cachefiles_xattr *auxdata) |
136 | { | 136 | { |
137 | struct dentry *dentry = object->dentry; | 137 | struct dentry *dentry = object->dentry; |
138 | int ret; | 138 | int ret; |
139 | 139 | ||
140 | ASSERT(dentry); | 140 | ASSERT(dentry); |
141 | 141 | ||
142 | _enter("%p,#%d", object, auxdata->len); | 142 | _enter("%p,#%d", object, auxdata->len); |
143 | 143 | ||
144 | /* attempt to install the cache metadata directly */ | 144 | /* attempt to install the cache metadata directly */ |
145 | _debug("SET #%u", auxdata->len); | 145 | _debug("SET #%u", auxdata->len); |
146 | 146 | ||
147 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, | 147 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, |
148 | &auxdata->type, auxdata->len, | 148 | &auxdata->type, auxdata->len, |
149 | XATTR_REPLACE); | 149 | XATTR_REPLACE); |
150 | if (ret < 0 && ret != -ENOMEM) | 150 | if (ret < 0 && ret != -ENOMEM) |
151 | cachefiles_io_error_obj( | 151 | cachefiles_io_error_obj( |
152 | object, | 152 | object, |
153 | "Failed to update xattr with error %d", ret); | 153 | "Failed to update xattr with error %d", ret); |
154 | 154 | ||
155 | _leave(" = %d", ret); | 155 | _leave(" = %d", ret); |
156 | return ret; | 156 | return ret; |
157 | } | 157 | } |
158 | 158 | ||
159 | /* | 159 | /* |
160 | * check the consistency between the backing cache and the FS-Cache cookie | 160 | * check the consistency between the backing cache and the FS-Cache cookie |
161 | */ | 161 | */ |
162 | int cachefiles_check_auxdata(struct cachefiles_object *object) | 162 | int cachefiles_check_auxdata(struct cachefiles_object *object) |
163 | { | 163 | { |
164 | struct cachefiles_xattr *auxbuf; | 164 | struct cachefiles_xattr *auxbuf; |
165 | enum fscache_checkaux validity; | 165 | enum fscache_checkaux validity; |
166 | struct dentry *dentry = object->dentry; | 166 | struct dentry *dentry = object->dentry; |
167 | ssize_t xlen; | 167 | ssize_t xlen; |
168 | int ret; | 168 | int ret; |
169 | 169 | ||
170 | ASSERT(dentry); | 170 | ASSERT(dentry); |
171 | ASSERT(dentry->d_inode); | 171 | ASSERT(dentry->d_inode); |
172 | ASSERT(object->fscache.cookie->def->check_aux); | 172 | ASSERT(object->fscache.cookie->def->check_aux); |
173 | 173 | ||
174 | auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); | 174 | auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); |
175 | if (!auxbuf) | 175 | if (!auxbuf) |
176 | return -ENOMEM; | 176 | return -ENOMEM; |
177 | 177 | ||
178 | xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, | 178 | xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, |
179 | &auxbuf->type, 512 + 1); | 179 | &auxbuf->type, 512 + 1); |
180 | ret = -ESTALE; | 180 | ret = -ESTALE; |
181 | if (xlen < 1 || | 181 | if (xlen < 1 || |
182 | auxbuf->type != object->fscache.cookie->def->type) | 182 | auxbuf->type != object->fscache.cookie->def->type) |
183 | goto error; | 183 | goto error; |
184 | 184 | ||
185 | xlen--; | 185 | xlen--; |
186 | validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); | 186 | validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); |
187 | if (validity != FSCACHE_CHECKAUX_OKAY) | 187 | if (validity != FSCACHE_CHECKAUX_OKAY) |
188 | goto error; | 188 | goto error; |
189 | 189 | ||
190 | ret = 0; | 190 | ret = 0; |
191 | error: | 191 | error: |
192 | kfree(auxbuf); | 192 | kfree(auxbuf); |
193 | return ret; | 193 | return ret; |
194 | } | 194 | } |
195 | 195 | ||
196 | /* | 196 | /* |
197 | * check the state xattr on a cache file | 197 | * check the state xattr on a cache file |
198 | * - return -ESTALE if the object should be deleted | 198 | * - return -ESTALE if the object should be deleted |
199 | */ | 199 | */ |
200 | int cachefiles_check_object_xattr(struct cachefiles_object *object, | 200 | int cachefiles_check_object_xattr(struct cachefiles_object *object, |
201 | struct cachefiles_xattr *auxdata) | 201 | struct cachefiles_xattr *auxdata) |
202 | { | 202 | { |
203 | struct cachefiles_xattr *auxbuf; | 203 | struct cachefiles_xattr *auxbuf; |
204 | struct dentry *dentry = object->dentry; | 204 | struct dentry *dentry = object->dentry; |
205 | int ret; | 205 | int ret; |
206 | 206 | ||
207 | _enter("%p,#%d", object, auxdata->len); | 207 | _enter("%p,#%d", object, auxdata->len); |
208 | 208 | ||
209 | ASSERT(dentry); | 209 | ASSERT(dentry); |
210 | ASSERT(dentry->d_inode); | 210 | ASSERT(dentry->d_inode); |
211 | 211 | ||
212 | auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); | 212 | auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); |
213 | if (!auxbuf) { | 213 | if (!auxbuf) { |
214 | _leave(" = -ENOMEM"); | 214 | _leave(" = -ENOMEM"); |
215 | return -ENOMEM; | 215 | return -ENOMEM; |
216 | } | 216 | } |
217 | 217 | ||
218 | /* read the current type label */ | 218 | /* read the current type label */ |
219 | ret = vfs_getxattr(dentry, cachefiles_xattr_cache, | 219 | ret = vfs_getxattr(dentry, cachefiles_xattr_cache, |
220 | &auxbuf->type, 512 + 1); | 220 | &auxbuf->type, 512 + 1); |
221 | if (ret < 0) { | 221 | if (ret < 0) { |
222 | if (ret == -ENODATA) | 222 | if (ret == -ENODATA) |
223 | goto stale; /* no attribute - power went off | 223 | goto stale; /* no attribute - power went off |
224 | * mid-cull? */ | 224 | * mid-cull? */ |
225 | 225 | ||
226 | if (ret == -ERANGE) | 226 | if (ret == -ERANGE) |
227 | goto bad_type_length; | 227 | goto bad_type_length; |
228 | 228 | ||
229 | cachefiles_io_error_obj(object, | 229 | cachefiles_io_error_obj(object, |
230 | "Can't read xattr on %lu (err %d)", | 230 | "Can't read xattr on %lu (err %d)", |
231 | dentry->d_inode->i_ino, -ret); | 231 | dentry->d_inode->i_ino, -ret); |
232 | goto error; | 232 | goto error; |
233 | } | 233 | } |
234 | 234 | ||
235 | /* check the on-disk object */ | 235 | /* check the on-disk object */ |
236 | if (ret < 1) | 236 | if (ret < 1) |
237 | goto bad_type_length; | 237 | goto bad_type_length; |
238 | 238 | ||
239 | if (auxbuf->type != auxdata->type) | 239 | if (auxbuf->type != auxdata->type) |
240 | goto stale; | 240 | goto stale; |
241 | 241 | ||
242 | auxbuf->len = ret; | 242 | auxbuf->len = ret; |
243 | 243 | ||
244 | /* consult the netfs */ | 244 | /* consult the netfs */ |
245 | if (object->fscache.cookie->def->check_aux) { | 245 | if (object->fscache.cookie->def->check_aux) { |
246 | enum fscache_checkaux result; | 246 | enum fscache_checkaux result; |
247 | unsigned int dlen; | 247 | unsigned int dlen; |
248 | 248 | ||
249 | dlen = auxbuf->len - 1; | 249 | dlen = auxbuf->len - 1; |
250 | 250 | ||
251 | _debug("checkaux %s #%u", | 251 | _debug("checkaux %s #%u", |
252 | object->fscache.cookie->def->name, dlen); | 252 | object->fscache.cookie->def->name, dlen); |
253 | 253 | ||
254 | result = fscache_check_aux(&object->fscache, | 254 | result = fscache_check_aux(&object->fscache, |
255 | &auxbuf->data, dlen); | 255 | &auxbuf->data, dlen); |
256 | 256 | ||
257 | switch (result) { | 257 | switch (result) { |
258 | /* entry okay as is */ | 258 | /* entry okay as is */ |
259 | case FSCACHE_CHECKAUX_OKAY: | 259 | case FSCACHE_CHECKAUX_OKAY: |
260 | goto okay; | 260 | goto okay; |
261 | 261 | ||
262 | /* entry requires update */ | 262 | /* entry requires update */ |
263 | case FSCACHE_CHECKAUX_NEEDS_UPDATE: | 263 | case FSCACHE_CHECKAUX_NEEDS_UPDATE: |
264 | break; | 264 | break; |
265 | 265 | ||
266 | /* entry requires deletion */ | 266 | /* entry requires deletion */ |
267 | case FSCACHE_CHECKAUX_OBSOLETE: | 267 | case FSCACHE_CHECKAUX_OBSOLETE: |
268 | goto stale; | 268 | goto stale; |
269 | 269 | ||
270 | default: | 270 | default: |
271 | BUG(); | 271 | BUG(); |
272 | } | 272 | } |
273 | 273 | ||
274 | /* update the current label */ | 274 | /* update the current label */ |
275 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, | 275 | ret = vfs_setxattr(dentry, cachefiles_xattr_cache, |
276 | &auxdata->type, auxdata->len, | 276 | &auxdata->type, auxdata->len, |
277 | XATTR_REPLACE); | 277 | XATTR_REPLACE); |
278 | if (ret < 0) { | 278 | if (ret < 0) { |
279 | cachefiles_io_error_obj(object, | 279 | cachefiles_io_error_obj(object, |
280 | "Can't update xattr on %lu" | 280 | "Can't update xattr on %lu" |
281 | " (error %d)", | 281 | " (error %d)", |
282 | dentry->d_inode->i_ino, -ret); | 282 | dentry->d_inode->i_ino, -ret); |
283 | goto error; | 283 | goto error; |
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | okay: | 287 | okay: |
288 | ret = 0; | 288 | ret = 0; |
289 | 289 | ||
290 | error: | 290 | error: |
291 | kfree(auxbuf); | 291 | kfree(auxbuf); |
292 | _leave(" = %d", ret); | 292 | _leave(" = %d", ret); |
293 | return ret; | 293 | return ret; |
294 | 294 | ||
295 | bad_type_length: | 295 | bad_type_length: |
296 | pr_err("Cache object %lu xattr length incorrect", | 296 | pr_err("Cache object %lu xattr length incorrect\n", |
297 | dentry->d_inode->i_ino); | 297 | dentry->d_inode->i_ino); |
298 | ret = -EIO; | 298 | ret = -EIO; |
299 | goto error; | 299 | goto error; |
300 | 300 | ||
301 | stale: | 301 | stale: |
302 | ret = -ESTALE; | 302 | ret = -ESTALE; |
303 | goto error; | 303 | goto error; |
304 | } | 304 | } |
305 | 305 | ||
306 | /* | 306 | /* |
307 | * remove the object's xattr to mark it stale | 307 | * remove the object's xattr to mark it stale |
308 | */ | 308 | */ |
309 | int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, | 309 | int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, |
310 | struct dentry *dentry) | 310 | struct dentry *dentry) |
311 | { | 311 | { |
312 | int ret; | 312 | int ret; |
313 | 313 | ||
314 | ret = vfs_removexattr(dentry, cachefiles_xattr_cache); | 314 | ret = vfs_removexattr(dentry, cachefiles_xattr_cache); |
315 | if (ret < 0) { | 315 | if (ret < 0) { |
316 | if (ret == -ENOENT || ret == -ENODATA) | 316 | if (ret == -ENOENT || ret == -ENODATA) |
317 | ret = 0; | 317 | ret = 0; |
318 | else if (ret != -ENOMEM) | 318 | else if (ret != -ENOMEM) |
319 | cachefiles_io_error(cache, | 319 | cachefiles_io_error(cache, |
320 | "Can't remove xattr from %lu" | 320 | "Can't remove xattr from %lu" |
321 | " (error %d)", | 321 | " (error %d)", |
322 | dentry->d_inode->i_ino, -ret); | 322 | dentry->d_inode->i_ino, -ret); |
323 | } | 323 | } |
324 | 324 | ||
325 | _leave(" = %d", ret); | 325 | _leave(" = %d", ret); |
326 | return ret; | 326 | return ret; |
327 | } | 327 | } |
328 | 328 |
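fs/cachefiles/xattr.c keeps both the type label and the auxiliary state in a single "user.CacheFiles.cache" extended attribute: installation uses XATTR_CREATE, so a pre-existing label comes back as -EEXIST and is then read back and compared, while updates use XATTR_REPLACE, so a missing label is reported rather than silently recreated. The userspace sketch below is purely illustrative (the file name is hypothetical, and a filesystem that accepts user.* attributes, such as ext4, is assumed); it exercises the same create-versus-replace semantics through setxattr(2):

#include <sys/xattr.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
        const char path[] = "cachefile";              /* hypothetical demo file */
        const char name[] = "user.CacheFiles.cache";
        const char type[] = "C3";                     /* same label format as above */
        int fd = open(path, O_CREAT | O_RDWR, 0600);

        if (fd < 0)
                return 1;

        /* Installing the label must create the attribute... */
        if (setxattr(path, name, type, 2, XATTR_CREATE) == -1 && errno == EEXIST)
                printf("label already present - it would be read back and compared\n");

        /* ...whereas updating it insists the attribute already exists. */
        if (setxattr(path, name, type, 2, XATTR_REPLACE) == -1 && errno == ENODATA)
                printf("label missing - the object would be treated as stale\n");

        close(fd);
        return 0;
}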
fs/nilfs2/inode.c
1 | /* | 1 | /* |
2 | * inode.c - NILFS inode operations. | 2 | * inode.c - NILFS inode operations. |
3 | * | 3 | * |
4 | * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. | 4 | * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or | 8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. | 9 | * (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public License | 16 | * You should have received a copy of the GNU General Public License |
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
19 | * | 19 | * |
20 | * Written by Ryusuke Konishi <ryusuke@osrg.net> | 20 | * Written by Ryusuke Konishi <ryusuke@osrg.net> |
21 | * | 21 | * |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include <linux/buffer_head.h> | 24 | #include <linux/buffer_head.h> |
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
27 | #include <linux/pagemap.h> | ||
27 | #include <linux/writeback.h> | 28 | #include <linux/writeback.h> |
28 | #include <linux/aio.h> | 29 | #include <linux/aio.h> |
29 | #include "nilfs.h" | 30 | #include "nilfs.h" |
30 | #include "btnode.h" | 31 | #include "btnode.h" |
31 | #include "segment.h" | 32 | #include "segment.h" |
32 | #include "page.h" | 33 | #include "page.h" |
33 | #include "mdt.h" | 34 | #include "mdt.h" |
34 | #include "cpfile.h" | 35 | #include "cpfile.h" |
35 | #include "ifile.h" | 36 | #include "ifile.h" |
36 | 37 | ||
37 | /** | 38 | /** |
38 | * struct nilfs_iget_args - arguments used during comparison between inodes | 39 | * struct nilfs_iget_args - arguments used during comparison between inodes |
39 | * @ino: inode number | 40 | * @ino: inode number |
40 | * @cno: checkpoint number | 41 | * @cno: checkpoint number |
41 | * @root: pointer on NILFS root object (mounted checkpoint) | 42 | * @root: pointer on NILFS root object (mounted checkpoint) |
42 | * @for_gc: inode for GC flag | 43 | * @for_gc: inode for GC flag |
43 | */ | 44 | */ |
44 | struct nilfs_iget_args { | 45 | struct nilfs_iget_args { |
45 | u64 ino; | 46 | u64 ino; |
46 | __u64 cno; | 47 | __u64 cno; |
47 | struct nilfs_root *root; | 48 | struct nilfs_root *root; |
48 | int for_gc; | 49 | int for_gc; |
49 | }; | 50 | }; |
50 | 51 | ||
51 | void nilfs_inode_add_blocks(struct inode *inode, int n) | 52 | void nilfs_inode_add_blocks(struct inode *inode, int n) |
52 | { | 53 | { |
53 | struct nilfs_root *root = NILFS_I(inode)->i_root; | 54 | struct nilfs_root *root = NILFS_I(inode)->i_root; |
54 | 55 | ||
55 | inode_add_bytes(inode, (1 << inode->i_blkbits) * n); | 56 | inode_add_bytes(inode, (1 << inode->i_blkbits) * n); |
56 | if (root) | 57 | if (root) |
57 | atomic64_add(n, &root->blocks_count); | 58 | atomic64_add(n, &root->blocks_count); |
58 | } | 59 | } |
59 | 60 | ||
60 | void nilfs_inode_sub_blocks(struct inode *inode, int n) | 61 | void nilfs_inode_sub_blocks(struct inode *inode, int n) |
61 | { | 62 | { |
62 | struct nilfs_root *root = NILFS_I(inode)->i_root; | 63 | struct nilfs_root *root = NILFS_I(inode)->i_root; |
63 | 64 | ||
64 | inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); | 65 | inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); |
65 | if (root) | 66 | if (root) |
66 | atomic64_sub(n, &root->blocks_count); | 67 | atomic64_sub(n, &root->blocks_count); |
67 | } | 68 | } |
68 | 69 | ||
69 | /** | 70 | /** |
70 | * nilfs_get_block() - get a file block on the filesystem (callback function) | 71 | * nilfs_get_block() - get a file block on the filesystem (callback function) |
71 | * @inode - inode struct of the target file | 72 | * @inode - inode struct of the target file |
72 | * @blkoff - file block number | 73 | * @blkoff - file block number |
73 | * @bh_result - buffer head to be mapped on | 74 | * @bh_result - buffer head to be mapped on |
74 | * @create - indicate whether allocating the block or not when it has not | 75 | * @create - indicate whether allocating the block or not when it has not |
75 | * been allocated yet. | 76 | * been allocated yet. |
76 | * | 77 | * |
77 | * This function does not issue actual read request of the specified data | 78 | * This function does not issue actual read request of the specified data |
78 | * block. It is done by VFS. | 79 | * block. It is done by VFS. |
79 | */ | 80 | */ |
80 | int nilfs_get_block(struct inode *inode, sector_t blkoff, | 81 | int nilfs_get_block(struct inode *inode, sector_t blkoff, |
81 | struct buffer_head *bh_result, int create) | 82 | struct buffer_head *bh_result, int create) |
82 | { | 83 | { |
83 | struct nilfs_inode_info *ii = NILFS_I(inode); | 84 | struct nilfs_inode_info *ii = NILFS_I(inode); |
84 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 85 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
85 | __u64 blknum = 0; | 86 | __u64 blknum = 0; |
86 | int err = 0, ret; | 87 | int err = 0, ret; |
87 | unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; | 88 | unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; |
88 | 89 | ||
89 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 90 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
90 | ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); | 91 | ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); |
91 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 92 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
92 | if (ret >= 0) { /* found */ | 93 | if (ret >= 0) { /* found */ |
93 | map_bh(bh_result, inode->i_sb, blknum); | 94 | map_bh(bh_result, inode->i_sb, blknum); |
94 | if (ret > 0) | 95 | if (ret > 0) |
95 | bh_result->b_size = (ret << inode->i_blkbits); | 96 | bh_result->b_size = (ret << inode->i_blkbits); |
96 | goto out; | 97 | goto out; |
97 | } | 98 | } |
98 | /* data block was not found */ | 99 | /* data block was not found */ |
99 | if (ret == -ENOENT && create) { | 100 | if (ret == -ENOENT && create) { |
100 | struct nilfs_transaction_info ti; | 101 | struct nilfs_transaction_info ti; |
101 | 102 | ||
102 | bh_result->b_blocknr = 0; | 103 | bh_result->b_blocknr = 0; |
103 | err = nilfs_transaction_begin(inode->i_sb, &ti, 1); | 104 | err = nilfs_transaction_begin(inode->i_sb, &ti, 1); |
104 | if (unlikely(err)) | 105 | if (unlikely(err)) |
105 | goto out; | 106 | goto out; |
106 | err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, | 107 | err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, |
107 | (unsigned long)bh_result); | 108 | (unsigned long)bh_result); |
108 | if (unlikely(err != 0)) { | 109 | if (unlikely(err != 0)) { |
109 | if (err == -EEXIST) { | 110 | if (err == -EEXIST) { |
110 | /* | 111 | /* |
111 | * The get_block() function could be called | 112 | * The get_block() function could be called |
112 | * from multiple callers for an inode. | 113 | * from multiple callers for an inode. |
113 | * However, the page having this block must | 114 | * However, the page having this block must |
114 | * be locked in this case. | 115 | * be locked in this case. |
115 | */ | 116 | */ |
116 | printk(KERN_WARNING | 117 | printk(KERN_WARNING |
117 | "nilfs_get_block: a race condition " | 118 | "nilfs_get_block: a race condition " |
118 | "while inserting a data block. " | 119 | "while inserting a data block. " |
119 | "(inode number=%lu, file block " | 120 | "(inode number=%lu, file block " |
120 | "offset=%llu)\n", | 121 | "offset=%llu)\n", |
121 | inode->i_ino, | 122 | inode->i_ino, |
122 | (unsigned long long)blkoff); | 123 | (unsigned long long)blkoff); |
123 | err = 0; | 124 | err = 0; |
124 | } | 125 | } |
125 | nilfs_transaction_abort(inode->i_sb); | 126 | nilfs_transaction_abort(inode->i_sb); |
126 | goto out; | 127 | goto out; |
127 | } | 128 | } |
128 | nilfs_mark_inode_dirty(inode); | 129 | nilfs_mark_inode_dirty(inode); |
129 | nilfs_transaction_commit(inode->i_sb); /* never fails */ | 130 | nilfs_transaction_commit(inode->i_sb); /* never fails */ |
130 | /* Error handling should be detailed */ | 131 | /* Error handling should be detailed */ |
131 | set_buffer_new(bh_result); | 132 | set_buffer_new(bh_result); |
132 | set_buffer_delay(bh_result); | 133 | set_buffer_delay(bh_result); |
133 | map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed | 134 | map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed |
134 | to proper value */ | 135 | to proper value */ |
135 | } else if (ret == -ENOENT) { | 136 | } else if (ret == -ENOENT) { |
136 | /* not found is not error (e.g. hole); must return without | 137 | /* not found is not error (e.g. hole); must return without |
137 | the mapped state flag. */ | 138 | the mapped state flag. */ |
138 | ; | 139 | ; |
139 | } else { | 140 | } else { |
140 | err = ret; | 141 | err = ret; |
141 | } | 142 | } |
142 | 143 | ||
143 | out: | 144 | out: |
144 | return err; | 145 | return err; |
145 | } | 146 | } |
146 | 147 | ||
147 | /** | 148 | /** |
148 | * nilfs_readpage() - implement readpage() method of nilfs_aops {} | 149 | * nilfs_readpage() - implement readpage() method of nilfs_aops {} |
149 | * address_space_operations. | 150 | * address_space_operations. |
150 | * @file - file struct of the file to be read | 151 | * @file - file struct of the file to be read |
151 | * @page - the page to be read | 152 | * @page - the page to be read |
152 | */ | 153 | */ |
153 | static int nilfs_readpage(struct file *file, struct page *page) | 154 | static int nilfs_readpage(struct file *file, struct page *page) |
154 | { | 155 | { |
155 | return mpage_readpage(page, nilfs_get_block); | 156 | return mpage_readpage(page, nilfs_get_block); |
156 | } | 157 | } |
157 | 158 | ||
158 | /** | 159 | /** |
159 | * nilfs_readpages() - implement readpages() method of nilfs_aops {} | 160 | * nilfs_readpages() - implement readpages() method of nilfs_aops {} |
160 | * address_space_operations. | 161 | * address_space_operations. |
161 | * @file - file struct of the file to be read | 162 | * @file - file struct of the file to be read |
162 | * @mapping - address_space struct used for reading multiple pages | 163 | * @mapping - address_space struct used for reading multiple pages |
163 | * @pages - the pages to be read | 164 | * @pages - the pages to be read |
164 | * @nr_pages - number of pages to be read | 165 | * @nr_pages - number of pages to be read |
165 | */ | 166 | */ |
166 | static int nilfs_readpages(struct file *file, struct address_space *mapping, | 167 | static int nilfs_readpages(struct file *file, struct address_space *mapping, |
167 | struct list_head *pages, unsigned nr_pages) | 168 | struct list_head *pages, unsigned nr_pages) |
168 | { | 169 | { |
169 | return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); | 170 | return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); |
170 | } | 171 | } |
171 | 172 | ||
172 | static int nilfs_writepages(struct address_space *mapping, | 173 | static int nilfs_writepages(struct address_space *mapping, |
173 | struct writeback_control *wbc) | 174 | struct writeback_control *wbc) |
174 | { | 175 | { |
175 | struct inode *inode = mapping->host; | 176 | struct inode *inode = mapping->host; |
176 | int err = 0; | 177 | int err = 0; |
177 | 178 | ||
178 | if (inode->i_sb->s_flags & MS_RDONLY) { | 179 | if (inode->i_sb->s_flags & MS_RDONLY) { |
179 | nilfs_clear_dirty_pages(mapping, false); | 180 | nilfs_clear_dirty_pages(mapping, false); |
180 | return -EROFS; | 181 | return -EROFS; |
181 | } | 182 | } |
182 | 183 | ||
183 | if (wbc->sync_mode == WB_SYNC_ALL) | 184 | if (wbc->sync_mode == WB_SYNC_ALL) |
184 | err = nilfs_construct_dsync_segment(inode->i_sb, inode, | 185 | err = nilfs_construct_dsync_segment(inode->i_sb, inode, |
185 | wbc->range_start, | 186 | wbc->range_start, |
186 | wbc->range_end); | 187 | wbc->range_end); |
187 | return err; | 188 | return err; |
188 | } | 189 | } |
189 | 190 | ||
190 | static int nilfs_writepage(struct page *page, struct writeback_control *wbc) | 191 | static int nilfs_writepage(struct page *page, struct writeback_control *wbc) |
191 | { | 192 | { |
192 | struct inode *inode = page->mapping->host; | 193 | struct inode *inode = page->mapping->host; |
193 | int err; | 194 | int err; |
194 | 195 | ||
195 | if (inode->i_sb->s_flags & MS_RDONLY) { | 196 | if (inode->i_sb->s_flags & MS_RDONLY) { |
196 | /* | 197 | /* |
197 | * It means that filesystem was remounted in read-only | 198 | * It means that filesystem was remounted in read-only |
198 | * mode because of error or metadata corruption. But we | 199 | * mode because of error or metadata corruption. But we |
199 | * have dirty pages that try to be flushed in background. | 200 | * have dirty pages that try to be flushed in background. |
200 | * So, here we simply discard this dirty page. | 201 | * So, here we simply discard this dirty page. |
201 | */ | 202 | */ |
202 | nilfs_clear_dirty_page(page, false); | 203 | nilfs_clear_dirty_page(page, false); |
203 | unlock_page(page); | 204 | unlock_page(page); |
204 | return -EROFS; | 205 | return -EROFS; |
205 | } | 206 | } |
206 | 207 | ||
207 | redirty_page_for_writepage(wbc, page); | 208 | redirty_page_for_writepage(wbc, page); |
208 | unlock_page(page); | 209 | unlock_page(page); |
209 | 210 | ||
210 | if (wbc->sync_mode == WB_SYNC_ALL) { | 211 | if (wbc->sync_mode == WB_SYNC_ALL) { |
211 | err = nilfs_construct_segment(inode->i_sb); | 212 | err = nilfs_construct_segment(inode->i_sb); |
212 | if (unlikely(err)) | 213 | if (unlikely(err)) |
213 | return err; | 214 | return err; |
214 | } else if (wbc->for_reclaim) | 215 | } else if (wbc->for_reclaim) |
215 | nilfs_flush_segment(inode->i_sb, inode->i_ino); | 216 | nilfs_flush_segment(inode->i_sb, inode->i_ino); |
216 | 217 | ||
217 | return 0; | 218 | return 0; |
218 | } | 219 | } |
219 | 220 | ||
220 | static int nilfs_set_page_dirty(struct page *page) | 221 | static int nilfs_set_page_dirty(struct page *page) |
221 | { | 222 | { |
223 | struct inode *inode = page->mapping->host; | ||
222 | int ret = __set_page_dirty_nobuffers(page); | 224 | int ret = __set_page_dirty_nobuffers(page); |
223 | 225 | ||
224 | if (page_has_buffers(page)) { | 226 | if (page_has_buffers(page)) { |
225 | struct inode *inode = page->mapping->host; | ||
226 | unsigned nr_dirty = 0; | 227 | unsigned nr_dirty = 0; |
227 | struct buffer_head *bh, *head; | 228 | struct buffer_head *bh, *head; |
228 | 229 | ||
229 | /* | 230 | /* |
230 | * This page is locked by callers, and no other thread | 231 | * This page is locked by callers, and no other thread |
231 | * concurrently marks its buffers dirty since they are | 232 | * concurrently marks its buffers dirty since they are |
232 | * only dirtied through routines in fs/buffer.c in | 233 | * only dirtied through routines in fs/buffer.c in |
233 | * which call sites of mark_buffer_dirty are protected | 234 | * which call sites of mark_buffer_dirty are protected |
234 | * by page lock. | 235 | * by page lock. |
235 | */ | 236 | */ |
236 | bh = head = page_buffers(page); | 237 | bh = head = page_buffers(page); |
237 | do { | 238 | do { |
238 | /* Do not mark hole blocks dirty */ | 239 | /* Do not mark hole blocks dirty */ |
239 | if (buffer_dirty(bh) || !buffer_mapped(bh)) | 240 | if (buffer_dirty(bh) || !buffer_mapped(bh)) |
240 | continue; | 241 | continue; |
241 | 242 | ||
242 | set_buffer_dirty(bh); | 243 | set_buffer_dirty(bh); |
243 | nr_dirty++; | 244 | nr_dirty++; |
244 | } while (bh = bh->b_this_page, bh != head); | 245 | } while (bh = bh->b_this_page, bh != head); |
245 | 246 | ||
246 | if (nr_dirty) | 247 | if (nr_dirty) |
247 | nilfs_set_file_dirty(inode, nr_dirty); | 248 | nilfs_set_file_dirty(inode, nr_dirty); |
249 | } else if (ret) { | ||
250 | unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
251 | |||
252 | nilfs_set_file_dirty(inode, nr_dirty); | ||
248 | } | 253 | } |
249 | return ret; | 254 | return ret; |
250 | } | 255 | } |
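The rewritten nilfs_set_page_dirty() above adds a branch for pages that are dirtied while carrying no buffer heads: when __set_page_dirty_nobuffers() reports that the page was newly dirtied, the whole page is charged to the inode as 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits) blocks via nilfs_set_file_dirty(), presumably so the log writer still sees the inode as having dirty data to flush. As a worked example with hypothetical values (4 KiB page cache pages over a 1 KiB-block filesystem), the standalone program below prints the resulting per-page charge of 4 blocks:

#include <stdio.h>

int main(void)
{
        /* Illustrative values only: PAGE_CACHE_SHIFT == 12 (4 KiB pages)
         * and inode->i_blkbits == 10 (1 KiB filesystem blocks). */
        unsigned int page_shift = 12, blkbits = 10;

        printf("blocks charged per newly dirtied page = %u\n",
               1u << (page_shift - blkbits));
        return 0;
}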
251 | 256 | ||
252 | void nilfs_write_failed(struct address_space *mapping, loff_t to) | 257 | void nilfs_write_failed(struct address_space *mapping, loff_t to) |
253 | { | 258 | { |
254 | struct inode *inode = mapping->host; | 259 | struct inode *inode = mapping->host; |
255 | 260 | ||
256 | if (to > inode->i_size) { | 261 | if (to > inode->i_size) { |
257 | truncate_pagecache(inode, inode->i_size); | 262 | truncate_pagecache(inode, inode->i_size); |
258 | nilfs_truncate(inode); | 263 | nilfs_truncate(inode); |
259 | } | 264 | } |
260 | } | 265 | } |
261 | 266 | ||
262 | static int nilfs_write_begin(struct file *file, struct address_space *mapping, | 267 | static int nilfs_write_begin(struct file *file, struct address_space *mapping, |
263 | loff_t pos, unsigned len, unsigned flags, | 268 | loff_t pos, unsigned len, unsigned flags, |
264 | struct page **pagep, void **fsdata) | 269 | struct page **pagep, void **fsdata) |
265 | 270 | ||
266 | { | 271 | { |
267 | struct inode *inode = mapping->host; | 272 | struct inode *inode = mapping->host; |
268 | int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); | 273 | int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); |
269 | 274 | ||
270 | if (unlikely(err)) | 275 | if (unlikely(err)) |
271 | return err; | 276 | return err; |
272 | 277 | ||
273 | err = block_write_begin(mapping, pos, len, flags, pagep, | 278 | err = block_write_begin(mapping, pos, len, flags, pagep, |
274 | nilfs_get_block); | 279 | nilfs_get_block); |
275 | if (unlikely(err)) { | 280 | if (unlikely(err)) { |
276 | nilfs_write_failed(mapping, pos + len); | 281 | nilfs_write_failed(mapping, pos + len); |
277 | nilfs_transaction_abort(inode->i_sb); | 282 | nilfs_transaction_abort(inode->i_sb); |
278 | } | 283 | } |
279 | return err; | 284 | return err; |
280 | } | 285 | } |
281 | 286 | ||
282 | static int nilfs_write_end(struct file *file, struct address_space *mapping, | 287 | static int nilfs_write_end(struct file *file, struct address_space *mapping, |
283 | loff_t pos, unsigned len, unsigned copied, | 288 | loff_t pos, unsigned len, unsigned copied, |
284 | struct page *page, void *fsdata) | 289 | struct page *page, void *fsdata) |
285 | { | 290 | { |
286 | struct inode *inode = mapping->host; | 291 | struct inode *inode = mapping->host; |
287 | unsigned start = pos & (PAGE_CACHE_SIZE - 1); | 292 | unsigned start = pos & (PAGE_CACHE_SIZE - 1); |
288 | unsigned nr_dirty; | 293 | unsigned nr_dirty; |
289 | int err; | 294 | int err; |
290 | 295 | ||
291 | nr_dirty = nilfs_page_count_clean_buffers(page, start, | 296 | nr_dirty = nilfs_page_count_clean_buffers(page, start, |
292 | start + copied); | 297 | start + copied); |
293 | copied = generic_write_end(file, mapping, pos, len, copied, page, | 298 | copied = generic_write_end(file, mapping, pos, len, copied, page, |
294 | fsdata); | 299 | fsdata); |
295 | nilfs_set_file_dirty(inode, nr_dirty); | 300 | nilfs_set_file_dirty(inode, nr_dirty); |
296 | err = nilfs_transaction_commit(inode->i_sb); | 301 | err = nilfs_transaction_commit(inode->i_sb); |
297 | return err ? : copied; | 302 | return err ? : copied; |
298 | } | 303 | } |
299 | 304 | ||
300 | static ssize_t | 305 | static ssize_t |
301 | nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, | 306 | nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, |
302 | loff_t offset) | 307 | loff_t offset) |
303 | { | 308 | { |
304 | struct file *file = iocb->ki_filp; | 309 | struct file *file = iocb->ki_filp; |
305 | struct address_space *mapping = file->f_mapping; | 310 | struct address_space *mapping = file->f_mapping; |
306 | struct inode *inode = file->f_mapping->host; | 311 | struct inode *inode = file->f_mapping->host; |
307 | size_t count = iov_iter_count(iter); | 312 | size_t count = iov_iter_count(iter); |
308 | ssize_t size; | 313 | ssize_t size; |
309 | 314 | ||
310 | if (rw == WRITE) | 315 | if (rw == WRITE) |
311 | return 0; | 316 | return 0; |
312 | 317 | ||
313 | /* Needs synchronization with the cleaner */ | 318 | /* Needs synchronization with the cleaner */ |
314 | size = blockdev_direct_IO(rw, iocb, inode, iter, offset, | 319 | size = blockdev_direct_IO(rw, iocb, inode, iter, offset, |
315 | nilfs_get_block); | 320 | nilfs_get_block); |
316 | 321 | ||
317 | /* | 322 | /* |
318 | * In case of error extending write may have instantiated a few | 323 | * In case of error extending write may have instantiated a few |
319 | * blocks outside i_size. Trim these off again. | 324 | * blocks outside i_size. Trim these off again. |
320 | */ | 325 | */ |
321 | if (unlikely((rw & WRITE) && size < 0)) { | 326 | if (unlikely((rw & WRITE) && size < 0)) { |
322 | loff_t isize = i_size_read(inode); | 327 | loff_t isize = i_size_read(inode); |
323 | loff_t end = offset + count; | 328 | loff_t end = offset + count; |
324 | 329 | ||
325 | if (end > isize) | 330 | if (end > isize) |
326 | nilfs_write_failed(mapping, end); | 331 | nilfs_write_failed(mapping, end); |
327 | } | 332 | } |
328 | 333 | ||
329 | return size; | 334 | return size; |
330 | } | 335 | } |
331 | 336 | ||
332 | const struct address_space_operations nilfs_aops = { | 337 | const struct address_space_operations nilfs_aops = { |
333 | .writepage = nilfs_writepage, | 338 | .writepage = nilfs_writepage, |
334 | .readpage = nilfs_readpage, | 339 | .readpage = nilfs_readpage, |
335 | .writepages = nilfs_writepages, | 340 | .writepages = nilfs_writepages, |
336 | .set_page_dirty = nilfs_set_page_dirty, | 341 | .set_page_dirty = nilfs_set_page_dirty, |
337 | .readpages = nilfs_readpages, | 342 | .readpages = nilfs_readpages, |
338 | .write_begin = nilfs_write_begin, | 343 | .write_begin = nilfs_write_begin, |
339 | .write_end = nilfs_write_end, | 344 | .write_end = nilfs_write_end, |
340 | /* .releasepage = nilfs_releasepage, */ | 345 | /* .releasepage = nilfs_releasepage, */ |
341 | .invalidatepage = block_invalidatepage, | 346 | .invalidatepage = block_invalidatepage, |
342 | .direct_IO = nilfs_direct_IO, | 347 | .direct_IO = nilfs_direct_IO, |
343 | .is_partially_uptodate = block_is_partially_uptodate, | 348 | .is_partially_uptodate = block_is_partially_uptodate, |
344 | }; | 349 | }; |
345 | 350 | ||
346 | struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) | 351 | struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) |
347 | { | 352 | { |
348 | struct super_block *sb = dir->i_sb; | 353 | struct super_block *sb = dir->i_sb; |
349 | struct the_nilfs *nilfs = sb->s_fs_info; | 354 | struct the_nilfs *nilfs = sb->s_fs_info; |
350 | struct inode *inode; | 355 | struct inode *inode; |
351 | struct nilfs_inode_info *ii; | 356 | struct nilfs_inode_info *ii; |
352 | struct nilfs_root *root; | 357 | struct nilfs_root *root; |
353 | int err = -ENOMEM; | 358 | int err = -ENOMEM; |
354 | ino_t ino; | 359 | ino_t ino; |
355 | 360 | ||
356 | inode = new_inode(sb); | 361 | inode = new_inode(sb); |
357 | if (unlikely(!inode)) | 362 | if (unlikely(!inode)) |
358 | goto failed; | 363 | goto failed; |
359 | 364 | ||
360 | mapping_set_gfp_mask(inode->i_mapping, | 365 | mapping_set_gfp_mask(inode->i_mapping, |
361 | mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); | 366 | mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); |
362 | 367 | ||
363 | root = NILFS_I(dir)->i_root; | 368 | root = NILFS_I(dir)->i_root; |
364 | ii = NILFS_I(inode); | 369 | ii = NILFS_I(inode); |
365 | ii->i_state = 1 << NILFS_I_NEW; | 370 | ii->i_state = 1 << NILFS_I_NEW; |
366 | ii->i_root = root; | 371 | ii->i_root = root; |
367 | 372 | ||
368 | err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); | 373 | err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); |
369 | if (unlikely(err)) | 374 | if (unlikely(err)) |
370 | goto failed_ifile_create_inode; | 375 | goto failed_ifile_create_inode; |
371 | /* reference count of i_bh inherits from nilfs_mdt_read_block() */ | 376 | /* reference count of i_bh inherits from nilfs_mdt_read_block() */ |
372 | 377 | ||
373 | atomic64_inc(&root->inodes_count); | 378 | atomic64_inc(&root->inodes_count); |
374 | inode_init_owner(inode, dir, mode); | 379 | inode_init_owner(inode, dir, mode); |
375 | inode->i_ino = ino; | 380 | inode->i_ino = ino; |
376 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 381 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
377 | 382 | ||
378 | if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { | 383 | if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { |
379 | err = nilfs_bmap_read(ii->i_bmap, NULL); | 384 | err = nilfs_bmap_read(ii->i_bmap, NULL); |
380 | if (err < 0) | 385 | if (err < 0) |
381 | goto failed_bmap; | 386 | goto failed_bmap; |
382 | 387 | ||
383 | set_bit(NILFS_I_BMAP, &ii->i_state); | 388 | set_bit(NILFS_I_BMAP, &ii->i_state); |
384 | /* No lock is needed; iget() ensures it. */ | 389 | /* No lock is needed; iget() ensures it. */ |
385 | } | 390 | } |
386 | 391 | ||
387 | ii->i_flags = nilfs_mask_flags( | 392 | ii->i_flags = nilfs_mask_flags( |
388 | mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); | 393 | mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); |
389 | 394 | ||
390 | /* ii->i_file_acl = 0; */ | 395 | /* ii->i_file_acl = 0; */ |
391 | /* ii->i_dir_acl = 0; */ | 396 | /* ii->i_dir_acl = 0; */ |
392 | ii->i_dir_start_lookup = 0; | 397 | ii->i_dir_start_lookup = 0; |
393 | nilfs_set_inode_flags(inode); | 398 | nilfs_set_inode_flags(inode); |
394 | spin_lock(&nilfs->ns_next_gen_lock); | 399 | spin_lock(&nilfs->ns_next_gen_lock); |
395 | inode->i_generation = nilfs->ns_next_generation++; | 400 | inode->i_generation = nilfs->ns_next_generation++; |
396 | spin_unlock(&nilfs->ns_next_gen_lock); | 401 | spin_unlock(&nilfs->ns_next_gen_lock); |
397 | insert_inode_hash(inode); | 402 | insert_inode_hash(inode); |
398 | 403 | ||
399 | err = nilfs_init_acl(inode, dir); | 404 | err = nilfs_init_acl(inode, dir); |
400 | if (unlikely(err)) | 405 | if (unlikely(err)) |
401 | goto failed_acl; /* never occur. When supporting | 406 | goto failed_acl; /* never occur. When supporting |
402 | nilfs_init_acl(), proper cancellation of | 407 | nilfs_init_acl(), proper cancellation of |
403 | above jobs should be considered */ | 408 | above jobs should be considered */ |
404 | 409 | ||
405 | return inode; | 410 | return inode; |
406 | 411 | ||
407 | failed_acl: | 412 | failed_acl: |
408 | failed_bmap: | 413 | failed_bmap: |
409 | clear_nlink(inode); | 414 | clear_nlink(inode); |
410 | iput(inode); /* raw_inode will be deleted through | 415 | iput(inode); /* raw_inode will be deleted through |
411 | generic_delete_inode() */ | 416 | generic_delete_inode() */ |
412 | goto failed; | 417 | goto failed; |
413 | 418 | ||
414 | failed_ifile_create_inode: | 419 | failed_ifile_create_inode: |
415 | make_bad_inode(inode); | 420 | make_bad_inode(inode); |
416 | iput(inode); /* if i_nlink == 1, generic_forget_inode() will be | 421 | iput(inode); /* if i_nlink == 1, generic_forget_inode() will be |
417 | called */ | 422 | called */ |
418 | failed: | 423 | failed: |
419 | return ERR_PTR(err); | 424 | return ERR_PTR(err); |
420 | } | 425 | } |
421 | 426 | ||
422 | void nilfs_set_inode_flags(struct inode *inode) | 427 | void nilfs_set_inode_flags(struct inode *inode) |
423 | { | 428 | { |
424 | unsigned int flags = NILFS_I(inode)->i_flags; | 429 | unsigned int flags = NILFS_I(inode)->i_flags; |
425 | 430 | ||
426 | inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | | 431 | inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | |
427 | S_DIRSYNC); | 432 | S_DIRSYNC); |
428 | if (flags & FS_SYNC_FL) | 433 | if (flags & FS_SYNC_FL) |
429 | inode->i_flags |= S_SYNC; | 434 | inode->i_flags |= S_SYNC; |
430 | if (flags & FS_APPEND_FL) | 435 | if (flags & FS_APPEND_FL) |
431 | inode->i_flags |= S_APPEND; | 436 | inode->i_flags |= S_APPEND; |
432 | if (flags & FS_IMMUTABLE_FL) | 437 | if (flags & FS_IMMUTABLE_FL) |
433 | inode->i_flags |= S_IMMUTABLE; | 438 | inode->i_flags |= S_IMMUTABLE; |
434 | if (flags & FS_NOATIME_FL) | 439 | if (flags & FS_NOATIME_FL) |
435 | inode->i_flags |= S_NOATIME; | 440 | inode->i_flags |= S_NOATIME; |
436 | if (flags & FS_DIRSYNC_FL) | 441 | if (flags & FS_DIRSYNC_FL) |
437 | inode->i_flags |= S_DIRSYNC; | 442 | inode->i_flags |= S_DIRSYNC; |
438 | mapping_set_gfp_mask(inode->i_mapping, | 443 | mapping_set_gfp_mask(inode->i_mapping, |
439 | mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); | 444 | mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); |
440 | } | 445 | } |
441 | 446 | ||
442 | int nilfs_read_inode_common(struct inode *inode, | 447 | int nilfs_read_inode_common(struct inode *inode, |
443 | struct nilfs_inode *raw_inode) | 448 | struct nilfs_inode *raw_inode) |
444 | { | 449 | { |
445 | struct nilfs_inode_info *ii = NILFS_I(inode); | 450 | struct nilfs_inode_info *ii = NILFS_I(inode); |
446 | int err; | 451 | int err; |
447 | 452 | ||
448 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 453 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
449 | i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); | 454 | i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); |
450 | i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); | 455 | i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); |
451 | set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); | 456 | set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); |
452 | inode->i_size = le64_to_cpu(raw_inode->i_size); | 457 | inode->i_size = le64_to_cpu(raw_inode->i_size); |
453 | inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); | 458 | inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); |
454 | inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); | 459 | inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); |
455 | inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); | 460 | inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); |
456 | inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); | 461 | inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); |
457 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); | 462 | inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); |
458 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); | 463 | inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); |
459 | if (inode->i_nlink == 0 && inode->i_mode == 0) | 464 | if (inode->i_nlink == 0 && inode->i_mode == 0) |
460 | return -EINVAL; /* this inode is deleted */ | 465 | return -EINVAL; /* this inode is deleted */ |
461 | 466 | ||
462 | inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); | 467 | inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); |
463 | ii->i_flags = le32_to_cpu(raw_inode->i_flags); | 468 | ii->i_flags = le32_to_cpu(raw_inode->i_flags); |
464 | #if 0 | 469 | #if 0 |
465 | ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); | 470 | ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); |
466 | ii->i_dir_acl = S_ISREG(inode->i_mode) ? | 471 | ii->i_dir_acl = S_ISREG(inode->i_mode) ? |
467 | 0 : le32_to_cpu(raw_inode->i_dir_acl); | 472 | 0 : le32_to_cpu(raw_inode->i_dir_acl); |
468 | #endif | 473 | #endif |
469 | ii->i_dir_start_lookup = 0; | 474 | ii->i_dir_start_lookup = 0; |
470 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 475 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); |
471 | 476 | ||
472 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 477 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
473 | S_ISLNK(inode->i_mode)) { | 478 | S_ISLNK(inode->i_mode)) { |
474 | err = nilfs_bmap_read(ii->i_bmap, raw_inode); | 479 | err = nilfs_bmap_read(ii->i_bmap, raw_inode); |
475 | if (err < 0) | 480 | if (err < 0) |
476 | return err; | 481 | return err; |
477 | set_bit(NILFS_I_BMAP, &ii->i_state); | 482 | set_bit(NILFS_I_BMAP, &ii->i_state); |
478 | /* No lock is needed; iget() ensures it. */ | 483 | /* No lock is needed; iget() ensures it. */ |
479 | } | 484 | } |
480 | return 0; | 485 | return 0; |
481 | } | 486 | } |
482 | 487 | ||
483 | static int __nilfs_read_inode(struct super_block *sb, | 488 | static int __nilfs_read_inode(struct super_block *sb, |
484 | struct nilfs_root *root, unsigned long ino, | 489 | struct nilfs_root *root, unsigned long ino, |
485 | struct inode *inode) | 490 | struct inode *inode) |
486 | { | 491 | { |
487 | struct the_nilfs *nilfs = sb->s_fs_info; | 492 | struct the_nilfs *nilfs = sb->s_fs_info; |
488 | struct buffer_head *bh; | 493 | struct buffer_head *bh; |
489 | struct nilfs_inode *raw_inode; | 494 | struct nilfs_inode *raw_inode; |
490 | int err; | 495 | int err; |
491 | 496 | ||
492 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 497 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
493 | err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); | 498 | err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); |
494 | if (unlikely(err)) | 499 | if (unlikely(err)) |
495 | goto bad_inode; | 500 | goto bad_inode; |
496 | 501 | ||
497 | raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); | 502 | raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); |
498 | 503 | ||
499 | err = nilfs_read_inode_common(inode, raw_inode); | 504 | err = nilfs_read_inode_common(inode, raw_inode); |
500 | if (err) | 505 | if (err) |
501 | goto failed_unmap; | 506 | goto failed_unmap; |
502 | 507 | ||
503 | if (S_ISREG(inode->i_mode)) { | 508 | if (S_ISREG(inode->i_mode)) { |
504 | inode->i_op = &nilfs_file_inode_operations; | 509 | inode->i_op = &nilfs_file_inode_operations; |
505 | inode->i_fop = &nilfs_file_operations; | 510 | inode->i_fop = &nilfs_file_operations; |
506 | inode->i_mapping->a_ops = &nilfs_aops; | 511 | inode->i_mapping->a_ops = &nilfs_aops; |
507 | } else if (S_ISDIR(inode->i_mode)) { | 512 | } else if (S_ISDIR(inode->i_mode)) { |
508 | inode->i_op = &nilfs_dir_inode_operations; | 513 | inode->i_op = &nilfs_dir_inode_operations; |
509 | inode->i_fop = &nilfs_dir_operations; | 514 | inode->i_fop = &nilfs_dir_operations; |
510 | inode->i_mapping->a_ops = &nilfs_aops; | 515 | inode->i_mapping->a_ops = &nilfs_aops; |
511 | } else if (S_ISLNK(inode->i_mode)) { | 516 | } else if (S_ISLNK(inode->i_mode)) { |
512 | inode->i_op = &nilfs_symlink_inode_operations; | 517 | inode->i_op = &nilfs_symlink_inode_operations; |
513 | inode->i_mapping->a_ops = &nilfs_aops; | 518 | inode->i_mapping->a_ops = &nilfs_aops; |
514 | } else { | 519 | } else { |
515 | inode->i_op = &nilfs_special_inode_operations; | 520 | inode->i_op = &nilfs_special_inode_operations; |
516 | init_special_inode( | 521 | init_special_inode( |
517 | inode, inode->i_mode, | 522 | inode, inode->i_mode, |
518 | huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); | 523 | huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); |
519 | } | 524 | } |
520 | nilfs_ifile_unmap_inode(root->ifile, ino, bh); | 525 | nilfs_ifile_unmap_inode(root->ifile, ino, bh); |
521 | brelse(bh); | 526 | brelse(bh); |
522 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 527 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
523 | nilfs_set_inode_flags(inode); | 528 | nilfs_set_inode_flags(inode); |
524 | return 0; | 529 | return 0; |
525 | 530 | ||
526 | failed_unmap: | 531 | failed_unmap: |
527 | nilfs_ifile_unmap_inode(root->ifile, ino, bh); | 532 | nilfs_ifile_unmap_inode(root->ifile, ino, bh); |
528 | brelse(bh); | 533 | brelse(bh); |
529 | 534 | ||
530 | bad_inode: | 535 | bad_inode: |
531 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 536 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
532 | return err; | 537 | return err; |
533 | } | 538 | } |
534 | 539 | ||
535 | static int nilfs_iget_test(struct inode *inode, void *opaque) | 540 | static int nilfs_iget_test(struct inode *inode, void *opaque) |
536 | { | 541 | { |
537 | struct nilfs_iget_args *args = opaque; | 542 | struct nilfs_iget_args *args = opaque; |
538 | struct nilfs_inode_info *ii; | 543 | struct nilfs_inode_info *ii; |
539 | 544 | ||
540 | if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) | 545 | if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) |
541 | return 0; | 546 | return 0; |
542 | 547 | ||
543 | ii = NILFS_I(inode); | 548 | ii = NILFS_I(inode); |
544 | if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) | 549 | if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) |
545 | return !args->for_gc; | 550 | return !args->for_gc; |
546 | 551 | ||
547 | return args->for_gc && args->cno == ii->i_cno; | 552 | return args->for_gc && args->cno == ii->i_cno; |
548 | } | 553 | } |
549 | 554 | ||
550 | static int nilfs_iget_set(struct inode *inode, void *opaque) | 555 | static int nilfs_iget_set(struct inode *inode, void *opaque) |
551 | { | 556 | { |
552 | struct nilfs_iget_args *args = opaque; | 557 | struct nilfs_iget_args *args = opaque; |
553 | 558 | ||
554 | inode->i_ino = args->ino; | 559 | inode->i_ino = args->ino; |
555 | if (args->for_gc) { | 560 | if (args->for_gc) { |
556 | NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; | 561 | NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; |
557 | NILFS_I(inode)->i_cno = args->cno; | 562 | NILFS_I(inode)->i_cno = args->cno; |
558 | NILFS_I(inode)->i_root = NULL; | 563 | NILFS_I(inode)->i_root = NULL; |
559 | } else { | 564 | } else { |
560 | if (args->root && args->ino == NILFS_ROOT_INO) | 565 | if (args->root && args->ino == NILFS_ROOT_INO) |
561 | nilfs_get_root(args->root); | 566 | nilfs_get_root(args->root); |
562 | NILFS_I(inode)->i_root = args->root; | 567 | NILFS_I(inode)->i_root = args->root; |
563 | } | 568 | } |
564 | return 0; | 569 | return 0; |
565 | } | 570 | } |
566 | 571 | ||
567 | struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, | 572 | struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, |
568 | unsigned long ino) | 573 | unsigned long ino) |
569 | { | 574 | { |
570 | struct nilfs_iget_args args = { | 575 | struct nilfs_iget_args args = { |
571 | .ino = ino, .root = root, .cno = 0, .for_gc = 0 | 576 | .ino = ino, .root = root, .cno = 0, .for_gc = 0 |
572 | }; | 577 | }; |
573 | 578 | ||
574 | return ilookup5(sb, ino, nilfs_iget_test, &args); | 579 | return ilookup5(sb, ino, nilfs_iget_test, &args); |
575 | } | 580 | } |
576 | 581 | ||
577 | struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, | 582 | struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, |
578 | unsigned long ino) | 583 | unsigned long ino) |
579 | { | 584 | { |
580 | struct nilfs_iget_args args = { | 585 | struct nilfs_iget_args args = { |
581 | .ino = ino, .root = root, .cno = 0, .for_gc = 0 | 586 | .ino = ino, .root = root, .cno = 0, .for_gc = 0 |
582 | }; | 587 | }; |
583 | 588 | ||
584 | return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); | 589 | return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); |
585 | } | 590 | } |
586 | 591 | ||
587 | struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, | 592 | struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, |
588 | unsigned long ino) | 593 | unsigned long ino) |
589 | { | 594 | { |
590 | struct inode *inode; | 595 | struct inode *inode; |
591 | int err; | 596 | int err; |
592 | 597 | ||
593 | inode = nilfs_iget_locked(sb, root, ino); | 598 | inode = nilfs_iget_locked(sb, root, ino); |
594 | if (unlikely(!inode)) | 599 | if (unlikely(!inode)) |
595 | return ERR_PTR(-ENOMEM); | 600 | return ERR_PTR(-ENOMEM); |
596 | if (!(inode->i_state & I_NEW)) | 601 | if (!(inode->i_state & I_NEW)) |
597 | return inode; | 602 | return inode; |
598 | 603 | ||
599 | err = __nilfs_read_inode(sb, root, ino, inode); | 604 | err = __nilfs_read_inode(sb, root, ino, inode); |
600 | if (unlikely(err)) { | 605 | if (unlikely(err)) { |
601 | iget_failed(inode); | 606 | iget_failed(inode); |
602 | return ERR_PTR(err); | 607 | return ERR_PTR(err); |
603 | } | 608 | } |
604 | unlock_new_inode(inode); | 609 | unlock_new_inode(inode); |
605 | return inode; | 610 | return inode; |
606 | } | 611 | } |
607 | 612 | ||
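The lookup helpers above all funnel the same nilfs_iget_args key through one pair of callbacks: nilfs_iget_test() decides whether an inode already in the cache matches the (ino, root, checkpoint) key, and nilfs_iget_set() stamps the key onto a freshly allocated inode. nilfs_iget() then follows the standard VFS sequence: iget5_locked(), an I_NEW check to short-circuit cache hits, a read of the on-disk inode, and unlock_new_inode() on success or iget_failed() on error. A minimal sketch of that generic pattern for a hypothetical filesystem (struct my_key, my_iget_test/set and my_fill_inode are illustrative stand-ins, not nilfs2 code):

#include <linux/fs.h>
#include <linux/err.h>

struct my_key { unsigned long ino; };

static int my_iget_test(struct inode *inode, void *opaque)
{
	struct my_key *key = opaque;
	/* nonzero only if this cached inode matches the whole lookup key */
	return inode->i_ino == key->ino;
}

static int my_iget_set(struct inode *inode, void *opaque)
{
	struct my_key *key = opaque;
	inode->i_ino = key->ino;		/* stamp the key on the new inode */
	return 0;
}

extern int my_fill_inode(struct inode *inode);	/* hypothetical: read on-disk data */

static struct inode *my_iget(struct super_block *sb, unsigned long ino)
{
	struct my_key key = { .ino = ino };
	struct inode *inode;
	int err;

	inode = iget5_locked(sb, ino, my_iget_test, my_iget_set, &key);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* cache hit, already initialized */

	err = my_fill_inode(inode);
	if (err) {
		iget_failed(inode);		/* unhash and drop the half-built inode */
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);		/* clear I_NEW, wake concurrent lookups */
	return inode;
}
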
608 | struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, | 613 | struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, |
609 | __u64 cno) | 614 | __u64 cno) |
610 | { | 615 | { |
611 | struct nilfs_iget_args args = { | 616 | struct nilfs_iget_args args = { |
612 | .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 | 617 | .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 |
613 | }; | 618 | }; |
614 | struct inode *inode; | 619 | struct inode *inode; |
615 | int err; | 620 | int err; |
616 | 621 | ||
617 | inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); | 622 | inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); |
618 | if (unlikely(!inode)) | 623 | if (unlikely(!inode)) |
619 | return ERR_PTR(-ENOMEM); | 624 | return ERR_PTR(-ENOMEM); |
620 | if (!(inode->i_state & I_NEW)) | 625 | if (!(inode->i_state & I_NEW)) |
621 | return inode; | 626 | return inode; |
622 | 627 | ||
623 | err = nilfs_init_gcinode(inode); | 628 | err = nilfs_init_gcinode(inode); |
624 | if (unlikely(err)) { | 629 | if (unlikely(err)) { |
625 | iget_failed(inode); | 630 | iget_failed(inode); |
626 | return ERR_PTR(err); | 631 | return ERR_PTR(err); |
627 | } | 632 | } |
628 | unlock_new_inode(inode); | 633 | unlock_new_inode(inode); |
629 | return inode; | 634 | return inode; |
630 | } | 635 | } |
631 | 636 | ||
632 | void nilfs_write_inode_common(struct inode *inode, | 637 | void nilfs_write_inode_common(struct inode *inode, |
633 | struct nilfs_inode *raw_inode, int has_bmap) | 638 | struct nilfs_inode *raw_inode, int has_bmap) |
634 | { | 639 | { |
635 | struct nilfs_inode_info *ii = NILFS_I(inode); | 640 | struct nilfs_inode_info *ii = NILFS_I(inode); |
636 | 641 | ||
637 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); | 642 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); |
638 | raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); | 643 | raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); |
639 | raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); | 644 | raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); |
640 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); | 645 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); |
641 | raw_inode->i_size = cpu_to_le64(inode->i_size); | 646 | raw_inode->i_size = cpu_to_le64(inode->i_size); |
642 | raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | 647 | raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); |
643 | raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); | 648 | raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); |
644 | raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | 649 | raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); |
645 | raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 650 | raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
646 | raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); | 651 | raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); |
647 | 652 | ||
648 | raw_inode->i_flags = cpu_to_le32(ii->i_flags); | 653 | raw_inode->i_flags = cpu_to_le32(ii->i_flags); |
649 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 654 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); |
650 | 655 | ||
651 | if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { | 656 | if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { |
652 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 657 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
653 | 658 | ||
654 | /* zero-fill unused portion in the case of super root block */ | 659 | /* zero-fill unused portion in the case of super root block */ |
655 | raw_inode->i_xattr = 0; | 660 | raw_inode->i_xattr = 0; |
656 | raw_inode->i_pad = 0; | 661 | raw_inode->i_pad = 0; |
657 | memset((void *)raw_inode + sizeof(*raw_inode), 0, | 662 | memset((void *)raw_inode + sizeof(*raw_inode), 0, |
658 | nilfs->ns_inode_size - sizeof(*raw_inode)); | 663 | nilfs->ns_inode_size - sizeof(*raw_inode)); |
659 | } | 664 | } |
660 | 665 | ||
661 | if (has_bmap) | 666 | if (has_bmap) |
662 | nilfs_bmap_write(ii->i_bmap, raw_inode); | 667 | nilfs_bmap_write(ii->i_bmap, raw_inode); |
663 | else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | 668 | else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) |
664 | raw_inode->i_device_code = | 669 | raw_inode->i_device_code = |
665 | cpu_to_le64(huge_encode_dev(inode->i_rdev)); | 670 | cpu_to_le64(huge_encode_dev(inode->i_rdev)); |
666 | /* When extending inode, nilfs->ns_inode_size should be checked | 671 | /* When extending inode, nilfs->ns_inode_size should be checked |
667 | for substitutions of appended fields */ | 672 | for substitutions of appended fields */ |
668 | } | 673 | } |
669 | 674 | ||
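nilfs_write_inode_common() pushes every in-core field through a cpu_to_le*() conversion because the on-disk nilfs_inode is little-endian regardless of host byte order, and for the metadata files it zero-fills the unused tail of the inode slot so stale memory never reaches the media. The same two habits in isolation, for a hypothetical fixed-endian on-disk record (struct ondisk_rec and fill_ondisk_rec() are illustrative, not nilfs2 code):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

/* Hypothetical little-endian on-disk record. */
struct ondisk_rec {
	__le32 size;
	__le64 mtime;
	__u8   reserved[52];		/* unused tail, must be zeroed */
};

static void fill_ondisk_rec(struct ondisk_rec *rec, u32 size, u64 mtime)
{
	rec->size  = cpu_to_le32(size);	/* host order -> on-disk little endian */
	rec->mtime = cpu_to_le64(mtime);
	memset(rec->reserved, 0, sizeof(rec->reserved));	/* no stale bytes on disk */
}
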
670 | void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) | 675 | void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) |
671 | { | 676 | { |
672 | ino_t ino = inode->i_ino; | 677 | ino_t ino = inode->i_ino; |
673 | struct nilfs_inode_info *ii = NILFS_I(inode); | 678 | struct nilfs_inode_info *ii = NILFS_I(inode); |
674 | struct inode *ifile = ii->i_root->ifile; | 679 | struct inode *ifile = ii->i_root->ifile; |
675 | struct nilfs_inode *raw_inode; | 680 | struct nilfs_inode *raw_inode; |
676 | 681 | ||
677 | raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); | 682 | raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); |
678 | 683 | ||
679 | if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) | 684 | if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) |
680 | memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); | 685 | memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); |
681 | set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); | 686 | set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); |
682 | 687 | ||
683 | nilfs_write_inode_common(inode, raw_inode, 0); | 688 | nilfs_write_inode_common(inode, raw_inode, 0); |
684 | /* XXX: call with has_bmap = 0 is a workaround to avoid | 689 | /* XXX: call with has_bmap = 0 is a workaround to avoid |
685 | deadlock of bmap. This delays update of i_bmap to just | 690 | deadlock of bmap. This delays update of i_bmap to just |
686 | before writing */ | 691 | before writing */ |
687 | nilfs_ifile_unmap_inode(ifile, ino, ibh); | 692 | nilfs_ifile_unmap_inode(ifile, ino, ibh); |
688 | } | 693 | } |
689 | 694 | ||
690 | #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ | 695 | #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ |
691 | 696 | ||
692 | static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, | 697 | static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, |
693 | unsigned long from) | 698 | unsigned long from) |
694 | { | 699 | { |
695 | unsigned long b; | 700 | unsigned long b; |
696 | int ret; | 701 | int ret; |
697 | 702 | ||
698 | if (!test_bit(NILFS_I_BMAP, &ii->i_state)) | 703 | if (!test_bit(NILFS_I_BMAP, &ii->i_state)) |
699 | return; | 704 | return; |
700 | repeat: | 705 | repeat: |
701 | ret = nilfs_bmap_last_key(ii->i_bmap, &b); | 706 | ret = nilfs_bmap_last_key(ii->i_bmap, &b); |
702 | if (ret == -ENOENT) | 707 | if (ret == -ENOENT) |
703 | return; | 708 | return; |
704 | else if (ret < 0) | 709 | else if (ret < 0) |
705 | goto failed; | 710 | goto failed; |
706 | 711 | ||
707 | if (b < from) | 712 | if (b < from) |
708 | return; | 713 | return; |
709 | 714 | ||
710 | b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); | 715 | b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); |
711 | ret = nilfs_bmap_truncate(ii->i_bmap, b); | 716 | ret = nilfs_bmap_truncate(ii->i_bmap, b); |
712 | nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); | 717 | nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); |
713 | if (!ret || (ret == -ENOMEM && | 718 | if (!ret || (ret == -ENOMEM && |
714 | nilfs_bmap_truncate(ii->i_bmap, b) == 0)) | 719 | nilfs_bmap_truncate(ii->i_bmap, b) == 0)) |
715 | goto repeat; | 720 | goto repeat; |
716 | 721 | ||
717 | failed: | 722 | failed: |
718 | nilfs_warning(ii->vfs_inode.i_sb, __func__, | 723 | nilfs_warning(ii->vfs_inode.i_sb, __func__, |
719 | "failed to truncate bmap (ino=%lu, err=%d)", | 724 | "failed to truncate bmap (ino=%lu, err=%d)", |
720 | ii->vfs_inode.i_ino, ret); | 725 | ii->vfs_inode.i_ino, ret); |
721 | } | 726 | } |
722 | 727 | ||
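nilfs_truncate_bmap() walks the block mapping downwards in slices of at most NILFS_MAX_TRUNCATE_BLOCKS, relaxing page-cache pressure between slices, so removing a very large file never has to pin an unbounded amount of dirty metadata at once. The bounded-batch loop on its own, as a small userspace sketch (drop_from() is a hypothetical callback that removes every block whose index is >= its argument):

#define MAX_BATCH 16384UL	/* cap on work per step, cf. NILFS_MAX_TRUNCATE_BLOCKS */

/* Remove blocks [from, last] working downwards in bounded batches. */
static int truncate_in_batches(unsigned long last, unsigned long from,
			       int (*drop_from)(unsigned long key))
{
	unsigned long b = last;

	while (b >= from) {
		/* step down by at most MAX_BATCH blocks, then cut from there */
		b -= (b - from < MAX_BATCH) ? (b - from) : MAX_BATCH;
		if (drop_from(b))
			return -1;	/* propagate failure */
		if (b == from)
			break;		/* everything down to 'from' is gone */
	}
	return 0;
}
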
723 | void nilfs_truncate(struct inode *inode) | 728 | void nilfs_truncate(struct inode *inode) |
724 | { | 729 | { |
725 | unsigned long blkoff; | 730 | unsigned long blkoff; |
726 | unsigned int blocksize; | 731 | unsigned int blocksize; |
727 | struct nilfs_transaction_info ti; | 732 | struct nilfs_transaction_info ti; |
728 | struct super_block *sb = inode->i_sb; | 733 | struct super_block *sb = inode->i_sb; |
729 | struct nilfs_inode_info *ii = NILFS_I(inode); | 734 | struct nilfs_inode_info *ii = NILFS_I(inode); |
730 | 735 | ||
731 | if (!test_bit(NILFS_I_BMAP, &ii->i_state)) | 736 | if (!test_bit(NILFS_I_BMAP, &ii->i_state)) |
732 | return; | 737 | return; |
733 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | 738 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) |
734 | return; | 739 | return; |
735 | 740 | ||
736 | blocksize = sb->s_blocksize; | 741 | blocksize = sb->s_blocksize; |
737 | blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; | 742 | blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; |
738 | nilfs_transaction_begin(sb, &ti, 0); /* never fails */ | 743 | nilfs_transaction_begin(sb, &ti, 0); /* never fails */ |
739 | 744 | ||
740 | block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); | 745 | block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); |
741 | 746 | ||
742 | nilfs_truncate_bmap(ii, blkoff); | 747 | nilfs_truncate_bmap(ii, blkoff); |
743 | 748 | ||
744 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 749 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
745 | if (IS_SYNC(inode)) | 750 | if (IS_SYNC(inode)) |
746 | nilfs_set_transaction_flag(NILFS_TI_SYNC); | 751 | nilfs_set_transaction_flag(NILFS_TI_SYNC); |
747 | 752 | ||
748 | nilfs_mark_inode_dirty(inode); | 753 | nilfs_mark_inode_dirty(inode); |
749 | nilfs_set_file_dirty(inode, 0); | 754 | nilfs_set_file_dirty(inode, 0); |
750 | nilfs_transaction_commit(sb); | 755 | nilfs_transaction_commit(sb); |
751 | /* May construct a logical segment and may fail in sync mode. | 756 | /* May construct a logical segment and may fail in sync mode. |
752 | But truncate has no return value. */ | 757 | But truncate has no return value. */ |
753 | } | 758 | } |
754 | 759 | ||
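The blkoff computation in nilfs_truncate() is the usual round-up-then-shift idiom: adding blocksize - 1 before the shift yields the index of the first block that lies entirely beyond the new i_size, so a partially used last block is kept (and its tail is zeroed by block_truncate_page()) while only whole blocks past EOF are removed from the bmap. A small self-checking illustration with 4096-byte blocks:

#include <assert.h>

int main(void)
{
	unsigned long blocksize = 4096, bits = 12;

	/* first block index lying wholly beyond the given size */
	assert(((5000UL + blocksize - 1) >> bits) == 2);	/* keep block 1, drop from 2 */
	assert(((4096UL + blocksize - 1) >> bits) == 1);	/* exact multiple: drop from 1 */
	assert(((1UL    + blocksize - 1) >> bits) == 1);	/* 1 byte still needs block 0 */
	return 0;
}
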
755 | static void nilfs_clear_inode(struct inode *inode) | 760 | static void nilfs_clear_inode(struct inode *inode) |
756 | { | 761 | { |
757 | struct nilfs_inode_info *ii = NILFS_I(inode); | 762 | struct nilfs_inode_info *ii = NILFS_I(inode); |
758 | struct nilfs_mdt_info *mdi = NILFS_MDT(inode); | 763 | struct nilfs_mdt_info *mdi = NILFS_MDT(inode); |
759 | 764 | ||
760 | /* | 765 | /* |
761 | * Free resources allocated in nilfs_read_inode(), here. | 766 | * Free resources allocated in nilfs_read_inode(), here. |
762 | */ | 767 | */ |
763 | BUG_ON(!list_empty(&ii->i_dirty)); | 768 | BUG_ON(!list_empty(&ii->i_dirty)); |
764 | brelse(ii->i_bh); | 769 | brelse(ii->i_bh); |
765 | ii->i_bh = NULL; | 770 | ii->i_bh = NULL; |
766 | 771 | ||
767 | if (mdi && mdi->mi_palloc_cache) | 772 | if (mdi && mdi->mi_palloc_cache) |
768 | nilfs_palloc_destroy_cache(inode); | 773 | nilfs_palloc_destroy_cache(inode); |
769 | 774 | ||
770 | if (test_bit(NILFS_I_BMAP, &ii->i_state)) | 775 | if (test_bit(NILFS_I_BMAP, &ii->i_state)) |
771 | nilfs_bmap_clear(ii->i_bmap); | 776 | nilfs_bmap_clear(ii->i_bmap); |
772 | 777 | ||
773 | nilfs_btnode_cache_clear(&ii->i_btnode_cache); | 778 | nilfs_btnode_cache_clear(&ii->i_btnode_cache); |
774 | 779 | ||
775 | if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) | 780 | if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) |
776 | nilfs_put_root(ii->i_root); | 781 | nilfs_put_root(ii->i_root); |
777 | } | 782 | } |
778 | 783 | ||
779 | void nilfs_evict_inode(struct inode *inode) | 784 | void nilfs_evict_inode(struct inode *inode) |
780 | { | 785 | { |
781 | struct nilfs_transaction_info ti; | 786 | struct nilfs_transaction_info ti; |
782 | struct super_block *sb = inode->i_sb; | 787 | struct super_block *sb = inode->i_sb; |
783 | struct nilfs_inode_info *ii = NILFS_I(inode); | 788 | struct nilfs_inode_info *ii = NILFS_I(inode); |
784 | int ret; | 789 | int ret; |
785 | 790 | ||
786 | if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { | 791 | if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { |
787 | truncate_inode_pages_final(&inode->i_data); | 792 | truncate_inode_pages_final(&inode->i_data); |
788 | clear_inode(inode); | 793 | clear_inode(inode); |
789 | nilfs_clear_inode(inode); | 794 | nilfs_clear_inode(inode); |
790 | return; | 795 | return; |
791 | } | 796 | } |
792 | nilfs_transaction_begin(sb, &ti, 0); /* never fails */ | 797 | nilfs_transaction_begin(sb, &ti, 0); /* never fails */ |
793 | 798 | ||
794 | truncate_inode_pages_final(&inode->i_data); | 799 | truncate_inode_pages_final(&inode->i_data); |
795 | 800 | ||
796 | /* TODO: some of the following operations may fail. */ | 801 | /* TODO: some of the following operations may fail. */ |
797 | nilfs_truncate_bmap(ii, 0); | 802 | nilfs_truncate_bmap(ii, 0); |
798 | nilfs_mark_inode_dirty(inode); | 803 | nilfs_mark_inode_dirty(inode); |
799 | clear_inode(inode); | 804 | clear_inode(inode); |
800 | 805 | ||
801 | ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); | 806 | ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); |
802 | if (!ret) | 807 | if (!ret) |
803 | atomic64_dec(&ii->i_root->inodes_count); | 808 | atomic64_dec(&ii->i_root->inodes_count); |
804 | 809 | ||
805 | nilfs_clear_inode(inode); | 810 | nilfs_clear_inode(inode); |
806 | 811 | ||
807 | if (IS_SYNC(inode)) | 812 | if (IS_SYNC(inode)) |
808 | nilfs_set_transaction_flag(NILFS_TI_SYNC); | 813 | nilfs_set_transaction_flag(NILFS_TI_SYNC); |
809 | nilfs_transaction_commit(sb); | 814 | nilfs_transaction_commit(sb); |
810 | /* May construct a logical segment and may fail in sync mode. | 815 | /* May construct a logical segment and may fail in sync mode. |
811 | But delete_inode has no return value. */ | 816 | But delete_inode has no return value. */ |
812 | } | 817 | } |
813 | 818 | ||
814 | int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) | 819 | int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) |
815 | { | 820 | { |
816 | struct nilfs_transaction_info ti; | 821 | struct nilfs_transaction_info ti; |
817 | struct inode *inode = dentry->d_inode; | 822 | struct inode *inode = dentry->d_inode; |
818 | struct super_block *sb = inode->i_sb; | 823 | struct super_block *sb = inode->i_sb; |
819 | int err; | 824 | int err; |
820 | 825 | ||
821 | err = inode_change_ok(inode, iattr); | 826 | err = inode_change_ok(inode, iattr); |
822 | if (err) | 827 | if (err) |
823 | return err; | 828 | return err; |
824 | 829 | ||
825 | err = nilfs_transaction_begin(sb, &ti, 0); | 830 | err = nilfs_transaction_begin(sb, &ti, 0); |
826 | if (unlikely(err)) | 831 | if (unlikely(err)) |
827 | return err; | 832 | return err; |
828 | 833 | ||
829 | if ((iattr->ia_valid & ATTR_SIZE) && | 834 | if ((iattr->ia_valid & ATTR_SIZE) && |
830 | iattr->ia_size != i_size_read(inode)) { | 835 | iattr->ia_size != i_size_read(inode)) { |
831 | inode_dio_wait(inode); | 836 | inode_dio_wait(inode); |
832 | truncate_setsize(inode, iattr->ia_size); | 837 | truncate_setsize(inode, iattr->ia_size); |
833 | nilfs_truncate(inode); | 838 | nilfs_truncate(inode); |
834 | } | 839 | } |
835 | 840 | ||
836 | setattr_copy(inode, iattr); | 841 | setattr_copy(inode, iattr); |
837 | mark_inode_dirty(inode); | 842 | mark_inode_dirty(inode); |
838 | 843 | ||
839 | if (iattr->ia_valid & ATTR_MODE) { | 844 | if (iattr->ia_valid & ATTR_MODE) { |
840 | err = nilfs_acl_chmod(inode); | 845 | err = nilfs_acl_chmod(inode); |
841 | if (unlikely(err)) | 846 | if (unlikely(err)) |
842 | goto out_err; | 847 | goto out_err; |
843 | } | 848 | } |
844 | 849 | ||
845 | return nilfs_transaction_commit(sb); | 850 | return nilfs_transaction_commit(sb); |
846 | 851 | ||
847 | out_err: | 852 | out_err: |
848 | nilfs_transaction_abort(sb); | 853 | nilfs_transaction_abort(sb); |
849 | return err; | 854 | return err; |
850 | } | 855 | } |
851 | 856 | ||
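nilfs_setattr() shows the transaction discipline used throughout this file: the update is bracketed by nilfs_transaction_begin() and finished with nilfs_transaction_commit() on success or nilfs_transaction_abort() on the error path, so the segment constructor only ever sees complete sets of dirty blocks. The same bracket in schematic form, using the nilfs2 calls as they appear above (do_update() is a hypothetical stand-in for the actual work):

extern int do_update(struct super_block *sb);	/* hypothetical payload */

static int update_under_transaction(struct super_block *sb)
{
	struct nilfs_transaction_info ti;
	int err;

	err = nilfs_transaction_begin(sb, &ti, 0);
	if (err)
		return err;			/* nothing started, nothing to undo */

	err = do_update(sb);			/* dirty inodes, blocks, ... */
	if (err) {
		nilfs_transaction_abort(sb);	/* discard the failed update */
		return err;
	}
	return nilfs_transaction_commit(sb);	/* hand over to segment construction */
}
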
852 | int nilfs_permission(struct inode *inode, int mask) | 857 | int nilfs_permission(struct inode *inode, int mask) |
853 | { | 858 | { |
854 | struct nilfs_root *root = NILFS_I(inode)->i_root; | 859 | struct nilfs_root *root = NILFS_I(inode)->i_root; |
855 | if ((mask & MAY_WRITE) && root && | 860 | if ((mask & MAY_WRITE) && root && |
856 | root->cno != NILFS_CPTREE_CURRENT_CNO) | 861 | root->cno != NILFS_CPTREE_CURRENT_CNO) |
857 | return -EROFS; /* snapshot is not writable */ | 862 | return -EROFS; /* snapshot is not writable */ |
858 | 863 | ||
859 | return generic_permission(inode, mask); | 864 | return generic_permission(inode, mask); |
860 | } | 865 | } |
861 | 866 | ||
862 | int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) | 867 | int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) |
863 | { | 868 | { |
864 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 869 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
865 | struct nilfs_inode_info *ii = NILFS_I(inode); | 870 | struct nilfs_inode_info *ii = NILFS_I(inode); |
866 | int err; | 871 | int err; |
867 | 872 | ||
868 | spin_lock(&nilfs->ns_inode_lock); | 873 | spin_lock(&nilfs->ns_inode_lock); |
869 | if (ii->i_bh == NULL) { | 874 | if (ii->i_bh == NULL) { |
870 | spin_unlock(&nilfs->ns_inode_lock); | 875 | spin_unlock(&nilfs->ns_inode_lock); |
871 | err = nilfs_ifile_get_inode_block(ii->i_root->ifile, | 876 | err = nilfs_ifile_get_inode_block(ii->i_root->ifile, |
872 | inode->i_ino, pbh); | 877 | inode->i_ino, pbh); |
873 | if (unlikely(err)) | 878 | if (unlikely(err)) |
874 | return err; | 879 | return err; |
875 | spin_lock(&nilfs->ns_inode_lock); | 880 | spin_lock(&nilfs->ns_inode_lock); |
876 | if (ii->i_bh == NULL) | 881 | if (ii->i_bh == NULL) |
877 | ii->i_bh = *pbh; | 882 | ii->i_bh = *pbh; |
878 | else { | 883 | else { |
879 | brelse(*pbh); | 884 | brelse(*pbh); |
880 | *pbh = ii->i_bh; | 885 | *pbh = ii->i_bh; |
881 | } | 886 | } |
882 | } else | 887 | } else |
883 | *pbh = ii->i_bh; | 888 | *pbh = ii->i_bh; |
884 | 889 | ||
885 | get_bh(*pbh); | 890 | get_bh(*pbh); |
886 | spin_unlock(&nilfs->ns_inode_lock); | 891 | spin_unlock(&nilfs->ns_inode_lock); |
887 | return 0; | 892 | return 0; |
888 | } | 893 | } |
889 | 894 | ||
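nilfs_load_inode_block() is a drop-the-lock-and-recheck pattern: ns_inode_lock cannot be held across nilfs_ifile_get_inode_block() (which may sleep), so the lock is released for the lookup and re-taken afterwards, and ii->i_bh is tested again because another task may have installed the buffer in the meantime; the loser of the race simply brelse()s its duplicate. The same shape as a userspace analogue with a pthread mutex (slow_lookup() and drop_ref() are hypothetical; error handling of the lookup is omitted):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached;			/* protected by lock */

extern void *slow_lookup(void);		/* hypothetical, may block; assumed to succeed */
extern void drop_ref(void *obj);	/* hypothetical, like brelse() */

void *get_cached(void)
{
	void *obj;

	pthread_mutex_lock(&lock);
	if (!cached) {
		pthread_mutex_unlock(&lock);	/* cannot block while holding the lock */
		obj = slow_lookup();
		pthread_mutex_lock(&lock);
		if (!cached)
			cached = obj;		/* we won the race: install it */
		else
			drop_ref(obj);		/* lost the race: discard duplicate */
	}
	obj = cached;
	pthread_mutex_unlock(&lock);
	return obj;
}
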
890 | int nilfs_inode_dirty(struct inode *inode) | 895 | int nilfs_inode_dirty(struct inode *inode) |
891 | { | 896 | { |
892 | struct nilfs_inode_info *ii = NILFS_I(inode); | 897 | struct nilfs_inode_info *ii = NILFS_I(inode); |
893 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 898 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
894 | int ret = 0; | 899 | int ret = 0; |
895 | 900 | ||
896 | if (!list_empty(&ii->i_dirty)) { | 901 | if (!list_empty(&ii->i_dirty)) { |
897 | spin_lock(&nilfs->ns_inode_lock); | 902 | spin_lock(&nilfs->ns_inode_lock); |
898 | ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || | 903 | ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || |
899 | test_bit(NILFS_I_BUSY, &ii->i_state); | 904 | test_bit(NILFS_I_BUSY, &ii->i_state); |
900 | spin_unlock(&nilfs->ns_inode_lock); | 905 | spin_unlock(&nilfs->ns_inode_lock); |
901 | } | 906 | } |
902 | return ret; | 907 | return ret; |
903 | } | 908 | } |
904 | 909 | ||
905 | int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) | 910 | int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) |
906 | { | 911 | { |
907 | struct nilfs_inode_info *ii = NILFS_I(inode); | 912 | struct nilfs_inode_info *ii = NILFS_I(inode); |
908 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 913 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
909 | 914 | ||
910 | atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); | 915 | atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); |
911 | 916 | ||
912 | if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) | 917 | if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) |
913 | return 0; | 918 | return 0; |
914 | 919 | ||
915 | spin_lock(&nilfs->ns_inode_lock); | 920 | spin_lock(&nilfs->ns_inode_lock); |
916 | if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && | 921 | if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && |
917 | !test_bit(NILFS_I_BUSY, &ii->i_state)) { | 922 | !test_bit(NILFS_I_BUSY, &ii->i_state)) { |
918 | /* Because this routine may race with nilfs_dispose_list(), | 923 | /* Because this routine may race with nilfs_dispose_list(), |
919 | we have to check NILFS_I_QUEUED here, too. */ | 924 | we have to check NILFS_I_QUEUED here, too. */ |
920 | if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { | 925 | if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { |
921 | /* This will happen when somebody is freeing | 926 | /* This will happen when somebody is freeing |
922 | this inode. */ | 927 | this inode. */ |
923 | nilfs_warning(inode->i_sb, __func__, | 928 | nilfs_warning(inode->i_sb, __func__, |
924 | "cannot get inode (ino=%lu)\n", | 929 | "cannot get inode (ino=%lu)\n", |
925 | inode->i_ino); | 930 | inode->i_ino); |
926 | spin_unlock(&nilfs->ns_inode_lock); | 931 | spin_unlock(&nilfs->ns_inode_lock); |
927 | return -EINVAL; /* NILFS_I_DIRTY may remain for | 932 | return -EINVAL; /* NILFS_I_DIRTY may remain for |
928 | freeing inode */ | 933 | freeing inode */ |
929 | } | 934 | } |
930 | list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); | 935 | list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); |
931 | set_bit(NILFS_I_QUEUED, &ii->i_state); | 936 | set_bit(NILFS_I_QUEUED, &ii->i_state); |
932 | } | 937 | } |
933 | spin_unlock(&nilfs->ns_inode_lock); | 938 | spin_unlock(&nilfs->ns_inode_lock); |
934 | return 0; | 939 | return 0; |
935 | } | 940 | } |
936 | 941 | ||
937 | int nilfs_mark_inode_dirty(struct inode *inode) | 942 | int nilfs_mark_inode_dirty(struct inode *inode) |
938 | { | 943 | { |
939 | struct buffer_head *ibh; | 944 | struct buffer_head *ibh; |
940 | int err; | 945 | int err; |
941 | 946 | ||
942 | err = nilfs_load_inode_block(inode, &ibh); | 947 | err = nilfs_load_inode_block(inode, &ibh); |
943 | if (unlikely(err)) { | 948 | if (unlikely(err)) { |
944 | nilfs_warning(inode->i_sb, __func__, | 949 | nilfs_warning(inode->i_sb, __func__, |
945 | "failed to reget inode block.\n"); | 950 | "failed to reget inode block.\n"); |
946 | return err; | 951 | return err; |
947 | } | 952 | } |
948 | nilfs_update_inode(inode, ibh); | 953 | nilfs_update_inode(inode, ibh); |
949 | mark_buffer_dirty(ibh); | 954 | mark_buffer_dirty(ibh); |
950 | nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); | 955 | nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); |
951 | brelse(ibh); | 956 | brelse(ibh); |
952 | return 0; | 957 | return 0; |
953 | } | 958 | } |
954 | 959 | ||
955 | /** | 960 | /** |
956 | * nilfs_dirty_inode - reflect changes on given inode to an inode block. | 961 | * nilfs_dirty_inode - reflect changes on given inode to an inode block. |
957 | * @inode: inode of the file to be registered. | 962 | * @inode: inode of the file to be registered. |
958 | * | 963 | * |
959 | * nilfs_dirty_inode() loads a inode block containing the specified | 964 | * nilfs_dirty_inode() loads a inode block containing the specified |
960 | * @inode and copies data from a nilfs_inode to a corresponding inode | 965 | * @inode and copies data from a nilfs_inode to a corresponding inode |
961 | * entry in the inode block. This operation is excluded from the segment | 966 | * entry in the inode block. This operation is excluded from the segment |
962 | * construction. This function can be called both as a single operation | 967 | * construction. This function can be called both as a single operation |
963 | * and as a part of indivisible file operations. | 968 | * and as a part of indivisible file operations. |
964 | */ | 969 | */ |
965 | void nilfs_dirty_inode(struct inode *inode, int flags) | 970 | void nilfs_dirty_inode(struct inode *inode, int flags) |
966 | { | 971 | { |
967 | struct nilfs_transaction_info ti; | 972 | struct nilfs_transaction_info ti; |
968 | struct nilfs_mdt_info *mdi = NILFS_MDT(inode); | 973 | struct nilfs_mdt_info *mdi = NILFS_MDT(inode); |
969 | 974 | ||
970 | if (is_bad_inode(inode)) { | 975 | if (is_bad_inode(inode)) { |
971 | nilfs_warning(inode->i_sb, __func__, | 976 | nilfs_warning(inode->i_sb, __func__, |
972 | "tried to mark bad_inode dirty. ignored.\n"); | 977 | "tried to mark bad_inode dirty. ignored.\n"); |
973 | dump_stack(); | 978 | dump_stack(); |
974 | return; | 979 | return; |
975 | } | 980 | } |
976 | if (mdi) { | 981 | if (mdi) { |
977 | nilfs_mdt_mark_dirty(inode); | 982 | nilfs_mdt_mark_dirty(inode); |
978 | return; | 983 | return; |
979 | } | 984 | } |
980 | nilfs_transaction_begin(inode->i_sb, &ti, 0); | 985 | nilfs_transaction_begin(inode->i_sb, &ti, 0); |
981 | nilfs_mark_inode_dirty(inode); | 986 | nilfs_mark_inode_dirty(inode); |
982 | nilfs_transaction_commit(inode->i_sb); /* never fails */ | 987 | nilfs_transaction_commit(inode->i_sb); /* never fails */ |
983 | } | 988 | } |
984 | 989 | ||
985 | int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 990 | int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
986 | __u64 start, __u64 len) | 991 | __u64 start, __u64 len) |
987 | { | 992 | { |
988 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; | 993 | struct the_nilfs *nilfs = inode->i_sb->s_fs_info; |
989 | __u64 logical = 0, phys = 0, size = 0; | 994 | __u64 logical = 0, phys = 0, size = 0; |
990 | __u32 flags = 0; | 995 | __u32 flags = 0; |
991 | loff_t isize; | 996 | loff_t isize; |
992 | sector_t blkoff, end_blkoff; | 997 | sector_t blkoff, end_blkoff; |
993 | sector_t delalloc_blkoff; | 998 | sector_t delalloc_blkoff; |
994 | unsigned long delalloc_blklen; | 999 | unsigned long delalloc_blklen; |
995 | unsigned int blkbits = inode->i_blkbits; | 1000 | unsigned int blkbits = inode->i_blkbits; |
996 | int ret, n; | 1001 | int ret, n; |
997 | 1002 | ||
998 | ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); | 1003 | ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); |
999 | if (ret) | 1004 | if (ret) |
1000 | return ret; | 1005 | return ret; |
1001 | 1006 | ||
1002 | mutex_lock(&inode->i_mutex); | 1007 | mutex_lock(&inode->i_mutex); |
1003 | 1008 | ||
1004 | isize = i_size_read(inode); | 1009 | isize = i_size_read(inode); |
1005 | 1010 | ||
1006 | blkoff = start >> blkbits; | 1011 | blkoff = start >> blkbits; |
1007 | end_blkoff = (start + len - 1) >> blkbits; | 1012 | end_blkoff = (start + len - 1) >> blkbits; |
1008 | 1013 | ||
1009 | delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, | 1014 | delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, |
1010 | &delalloc_blkoff); | 1015 | &delalloc_blkoff); |
1011 | 1016 | ||
1012 | do { | 1017 | do { |
1013 | __u64 blkphy; | 1018 | __u64 blkphy; |
1014 | unsigned int maxblocks; | 1019 | unsigned int maxblocks; |
1015 | 1020 | ||
1016 | if (delalloc_blklen && blkoff == delalloc_blkoff) { | 1021 | if (delalloc_blklen && blkoff == delalloc_blkoff) { |
1017 | if (size) { | 1022 | if (size) { |
1018 | /* End of the current extent */ | 1023 | /* End of the current extent */ |
1019 | ret = fiemap_fill_next_extent( | 1024 | ret = fiemap_fill_next_extent( |
1020 | fieinfo, logical, phys, size, flags); | 1025 | fieinfo, logical, phys, size, flags); |
1021 | if (ret) | 1026 | if (ret) |
1022 | break; | 1027 | break; |
1023 | } | 1028 | } |
1024 | if (blkoff > end_blkoff) | 1029 | if (blkoff > end_blkoff) |
1025 | break; | 1030 | break; |
1026 | 1031 | ||
1027 | flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; | 1032 | flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; |
1028 | logical = blkoff << blkbits; | 1033 | logical = blkoff << blkbits; |
1029 | phys = 0; | 1034 | phys = 0; |
1030 | size = delalloc_blklen << blkbits; | 1035 | size = delalloc_blklen << blkbits; |
1031 | 1036 | ||
1032 | blkoff = delalloc_blkoff + delalloc_blklen; | 1037 | blkoff = delalloc_blkoff + delalloc_blklen; |
1033 | delalloc_blklen = nilfs_find_uncommitted_extent( | 1038 | delalloc_blklen = nilfs_find_uncommitted_extent( |
1034 | inode, blkoff, &delalloc_blkoff); | 1039 | inode, blkoff, &delalloc_blkoff); |
1035 | continue; | 1040 | continue; |
1036 | } | 1041 | } |
1037 | 1042 | ||
1038 | /* | 1043 | /* |
1039 | * Limit the number of blocks that we look up so as | 1044 | * Limit the number of blocks that we look up so as |
1040 | * not to get into the next delayed allocation extent. | 1045 | * not to get into the next delayed allocation extent. |
1041 | */ | 1046 | */ |
1042 | maxblocks = INT_MAX; | 1047 | maxblocks = INT_MAX; |
1043 | if (delalloc_blklen) | 1048 | if (delalloc_blklen) |
1044 | maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, | 1049 | maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, |
1045 | maxblocks); | 1050 | maxblocks); |
1046 | blkphy = 0; | 1051 | blkphy = 0; |
1047 | 1052 | ||
1048 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 1053 | down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
1049 | n = nilfs_bmap_lookup_contig( | 1054 | n = nilfs_bmap_lookup_contig( |
1050 | NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); | 1055 | NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); |
1051 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); | 1056 | up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); |
1052 | 1057 | ||
1053 | if (n < 0) { | 1058 | if (n < 0) { |
1054 | int past_eof; | 1059 | int past_eof; |
1055 | 1060 | ||
1056 | if (unlikely(n != -ENOENT)) | 1061 | if (unlikely(n != -ENOENT)) |
1057 | break; /* error */ | 1062 | break; /* error */ |
1058 | 1063 | ||
1059 | /* HOLE */ | 1064 | /* HOLE */ |
1060 | blkoff++; | 1065 | blkoff++; |
1061 | past_eof = ((blkoff << blkbits) >= isize); | 1066 | past_eof = ((blkoff << blkbits) >= isize); |
1062 | 1067 | ||
1063 | if (size) { | 1068 | if (size) { |
1064 | /* End of the current extent */ | 1069 | /* End of the current extent */ |
1065 | 1070 | ||
1066 | if (past_eof) | 1071 | if (past_eof) |
1067 | flags |= FIEMAP_EXTENT_LAST; | 1072 | flags |= FIEMAP_EXTENT_LAST; |
1068 | 1073 | ||
1069 | ret = fiemap_fill_next_extent( | 1074 | ret = fiemap_fill_next_extent( |
1070 | fieinfo, logical, phys, size, flags); | 1075 | fieinfo, logical, phys, size, flags); |
1071 | if (ret) | 1076 | if (ret) |
1072 | break; | 1077 | break; |
1073 | size = 0; | 1078 | size = 0; |
1074 | } | 1079 | } |
1075 | if (blkoff > end_blkoff || past_eof) | 1080 | if (blkoff > end_blkoff || past_eof) |
1076 | break; | 1081 | break; |
1077 | } else { | 1082 | } else { |
1078 | if (size) { | 1083 | if (size) { |
1079 | if (phys && blkphy << blkbits == phys + size) { | 1084 | if (phys && blkphy << blkbits == phys + size) { |
1080 | /* The current extent goes on */ | 1085 | /* The current extent goes on */ |
1081 | size += n << blkbits; | 1086 | size += n << blkbits; |
1082 | } else { | 1087 | } else { |
1083 | /* Terminate the current extent */ | 1088 | /* Terminate the current extent */ |
1084 | ret = fiemap_fill_next_extent( | 1089 | ret = fiemap_fill_next_extent( |
1085 | fieinfo, logical, phys, size, | 1090 | fieinfo, logical, phys, size, |
1086 | flags); | 1091 | flags); |
1087 | if (ret || blkoff > end_blkoff) | 1092 | if (ret || blkoff > end_blkoff) |
1088 | break; | 1093 | break; |
1089 | 1094 | ||
1090 | /* Start another extent */ | 1095 | /* Start another extent */ |
1091 | flags = FIEMAP_EXTENT_MERGED; | 1096 | flags = FIEMAP_EXTENT_MERGED; |
1092 | logical = blkoff << blkbits; | 1097 | logical = blkoff << blkbits; |
1093 | phys = blkphy << blkbits; | 1098 | phys = blkphy << blkbits; |
1094 | size = n << blkbits; | 1099 | size = n << blkbits; |
1095 | } | 1100 | } |
1096 | } else { | 1101 | } else { |
1097 | /* Start a new extent */ | 1102 | /* Start a new extent */ |
1098 | flags = FIEMAP_EXTENT_MERGED; | 1103 | flags = FIEMAP_EXTENT_MERGED; |
1099 | logical = blkoff << blkbits; | 1104 | logical = blkoff << blkbits; |
1100 | phys = blkphy << blkbits; | 1105 | phys = blkphy << blkbits; |
1101 | size = n << blkbits; | 1106 | size = n << blkbits; |
1102 | } | 1107 | } |
1103 | blkoff += n; | 1108 | blkoff += n; |
1104 | } | 1109 | } |
1105 | cond_resched(); | 1110 | cond_resched(); |
1106 | } while (true); | 1111 | } while (true); |
1107 | 1112 | ||
1108 | /* If ret is 1 then we just hit the end of the extent array */ | 1113 | /* If ret is 1 then we just hit the end of the extent array */ |
1109 | if (ret == 1) | 1114 | if (ret == 1) |
1110 | ret = 0; | 1115 | ret = 0; |
1111 | 1116 | ||
1112 | mutex_unlock(&inode->i_mutex); | 1117 | mutex_unlock(&inode->i_mutex); |
1113 | return ret; | 1118 | return ret; |
1114 | } | 1119 | } |
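The nilfs_fiemap() loop above accumulates one extent at a time: as long as the next run of mapped blocks is physically contiguous with the current extent it just grows size, and the moment contiguity breaks, a delalloc region starts, or a hole/EOF is hit, it flushes the extent with fiemap_fill_next_extent() (whose return value of 1 only means the caller's buffer is full and is folded back to 0 at the end). The merge-or-flush step in isolation, as a hedged schematic using the same kernel helper:

#include <linux/fs.h>
#include <linux/fiemap.h>

/* Fold one run of mapped blocks (already converted to bytes) into the
 * extent being accumulated, flushing the previous extent when the new
 * run is not physically contiguous with it. */
static int add_run(struct fiemap_extent_info *fieinfo,
		   __u64 *logical, __u64 *phys, __u64 *size, __u32 *flags,
		   __u64 run_logical, __u64 run_phys, __u64 run_len)
{
	int ret = 0;

	if (*size && *phys && run_phys == *phys + *size) {
		*size += run_len;		/* contiguous: extend current extent */
	} else {
		if (*size)			/* flush the finished extent first */
			ret = fiemap_fill_next_extent(fieinfo, *logical, *phys,
						      *size, *flags);
		*logical = run_logical;		/* then start a new one */
		*phys = run_phys;
		*size = run_len;
		*flags = FIEMAP_EXTENT_MERGED;
	}
	return ret;	/* 1 means the user's extent array is full */
}
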
fs/ocfs2/dlm/dlmmaster.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * dlmmod.c | 4 | * dlmmod.c |
5 | * | 5 | * |
6 | * standalone DLM module | 6 | * standalone DLM module |
7 | * | 7 | * |
8 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | 27 | ||
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
30 | #include <linux/types.h> | 30 | #include <linux/types.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
33 | #include <linux/init.h> | 33 | #include <linux/init.h> |
34 | #include <linux/sysctl.h> | 34 | #include <linux/sysctl.h> |
35 | #include <linux/random.h> | 35 | #include <linux/random.h> |
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/socket.h> | 37 | #include <linux/socket.h> |
38 | #include <linux/inet.h> | 38 | #include <linux/inet.h> |
39 | #include <linux/spinlock.h> | 39 | #include <linux/spinlock.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
41 | 41 | ||
42 | 42 | ||
43 | #include "cluster/heartbeat.h" | 43 | #include "cluster/heartbeat.h" |
44 | #include "cluster/nodemanager.h" | 44 | #include "cluster/nodemanager.h" |
45 | #include "cluster/tcp.h" | 45 | #include "cluster/tcp.h" |
46 | 46 | ||
47 | #include "dlmapi.h" | 47 | #include "dlmapi.h" |
48 | #include "dlmcommon.h" | 48 | #include "dlmcommon.h" |
49 | #include "dlmdomain.h" | 49 | #include "dlmdomain.h" |
50 | #include "dlmdebug.h" | 50 | #include "dlmdebug.h" |
51 | 51 | ||
52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) | 52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) |
53 | #include "cluster/masklog.h" | 53 | #include "cluster/masklog.h" |
54 | 54 | ||
55 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | 55 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, |
56 | struct dlm_master_list_entry *mle, | 56 | struct dlm_master_list_entry *mle, |
57 | struct o2nm_node *node, | 57 | struct o2nm_node *node, |
58 | int idx); | 58 | int idx); |
59 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, | 59 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, |
60 | struct dlm_master_list_entry *mle, | 60 | struct dlm_master_list_entry *mle, |
61 | struct o2nm_node *node, | 61 | struct o2nm_node *node, |
62 | int idx); | 62 | int idx); |
63 | 63 | ||
64 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); | 64 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); |
65 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, | 65 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, |
66 | struct dlm_lock_resource *res, | 66 | struct dlm_lock_resource *res, |
67 | void *nodemap, u32 flags); | 67 | void *nodemap, u32 flags); |
68 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); | 68 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); |
69 | 69 | ||
70 | static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | 70 | static inline int dlm_mle_equal(struct dlm_ctxt *dlm, |
71 | struct dlm_master_list_entry *mle, | 71 | struct dlm_master_list_entry *mle, |
72 | const char *name, | 72 | const char *name, |
73 | unsigned int namelen) | 73 | unsigned int namelen) |
74 | { | 74 | { |
75 | if (dlm != mle->dlm) | 75 | if (dlm != mle->dlm) |
76 | return 0; | 76 | return 0; |
77 | 77 | ||
78 | if (namelen != mle->mnamelen || | 78 | if (namelen != mle->mnamelen || |
79 | memcmp(name, mle->mname, namelen) != 0) | 79 | memcmp(name, mle->mname, namelen) != 0) |
80 | return 0; | 80 | return 0; |
81 | 81 | ||
82 | return 1; | 82 | return 1; |
83 | } | 83 | } |
84 | 84 | ||
85 | static struct kmem_cache *dlm_lockres_cache; | 85 | static struct kmem_cache *dlm_lockres_cache; |
86 | static struct kmem_cache *dlm_lockname_cache; | 86 | static struct kmem_cache *dlm_lockname_cache; |
87 | static struct kmem_cache *dlm_mle_cache; | 87 | static struct kmem_cache *dlm_mle_cache; |
88 | 88 | ||
89 | static void dlm_mle_release(struct kref *kref); | 89 | static void dlm_mle_release(struct kref *kref); |
90 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | 90 | static void dlm_init_mle(struct dlm_master_list_entry *mle, |
91 | enum dlm_mle_type type, | 91 | enum dlm_mle_type type, |
92 | struct dlm_ctxt *dlm, | 92 | struct dlm_ctxt *dlm, |
93 | struct dlm_lock_resource *res, | 93 | struct dlm_lock_resource *res, |
94 | const char *name, | 94 | const char *name, |
95 | unsigned int namelen); | 95 | unsigned int namelen); |
96 | static void dlm_put_mle(struct dlm_master_list_entry *mle); | 96 | static void dlm_put_mle(struct dlm_master_list_entry *mle); |
97 | static void __dlm_put_mle(struct dlm_master_list_entry *mle); | 97 | static void __dlm_put_mle(struct dlm_master_list_entry *mle); |
98 | static int dlm_find_mle(struct dlm_ctxt *dlm, | 98 | static int dlm_find_mle(struct dlm_ctxt *dlm, |
99 | struct dlm_master_list_entry **mle, | 99 | struct dlm_master_list_entry **mle, |
100 | char *name, unsigned int namelen); | 100 | char *name, unsigned int namelen); |
101 | 101 | ||
102 | static int dlm_do_master_request(struct dlm_lock_resource *res, | 102 | static int dlm_do_master_request(struct dlm_lock_resource *res, |
103 | struct dlm_master_list_entry *mle, int to); | 103 | struct dlm_master_list_entry *mle, int to); |
104 | 104 | ||
105 | 105 | ||
106 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, | 106 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, |
107 | struct dlm_lock_resource *res, | 107 | struct dlm_lock_resource *res, |
108 | struct dlm_master_list_entry *mle, | 108 | struct dlm_master_list_entry *mle, |
109 | int *blocked); | 109 | int *blocked); |
110 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | 110 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, |
111 | struct dlm_lock_resource *res, | 111 | struct dlm_lock_resource *res, |
112 | struct dlm_master_list_entry *mle, | 112 | struct dlm_master_list_entry *mle, |
113 | int blocked); | 113 | int blocked); |
114 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | 114 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, |
115 | struct dlm_lock_resource *res, | 115 | struct dlm_lock_resource *res, |
116 | struct dlm_master_list_entry *mle, | 116 | struct dlm_master_list_entry *mle, |
117 | struct dlm_master_list_entry **oldmle, | 117 | struct dlm_master_list_entry **oldmle, |
118 | const char *name, unsigned int namelen, | 118 | const char *name, unsigned int namelen, |
119 | u8 new_master, u8 master); | 119 | u8 new_master, u8 master); |
120 | 120 | ||
121 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | 121 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, |
122 | struct dlm_lock_resource *res); | 122 | struct dlm_lock_resource *res); |
123 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | 123 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, |
124 | struct dlm_lock_resource *res); | 124 | struct dlm_lock_resource *res); |
125 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | 125 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, |
126 | struct dlm_lock_resource *res, | 126 | struct dlm_lock_resource *res, |
127 | u8 target); | 127 | u8 target); |
128 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | 128 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, |
129 | struct dlm_lock_resource *res); | 129 | struct dlm_lock_resource *res); |
130 | 130 | ||
131 | 131 | ||
132 | int dlm_is_host_down(int errno) | 132 | int dlm_is_host_down(int errno) |
133 | { | 133 | { |
134 | switch (errno) { | 134 | switch (errno) { |
135 | case -EBADF: | 135 | case -EBADF: |
136 | case -ECONNREFUSED: | 136 | case -ECONNREFUSED: |
137 | case -ENOTCONN: | 137 | case -ENOTCONN: |
138 | case -ECONNRESET: | 138 | case -ECONNRESET: |
139 | case -EPIPE: | 139 | case -EPIPE: |
140 | case -EHOSTDOWN: | 140 | case -EHOSTDOWN: |
141 | case -EHOSTUNREACH: | 141 | case -EHOSTUNREACH: |
142 | case -ETIMEDOUT: | 142 | case -ETIMEDOUT: |
143 | case -ECONNABORTED: | 143 | case -ECONNABORTED: |
144 | case -ENETDOWN: | 144 | case -ENETDOWN: |
145 | case -ENETUNREACH: | 145 | case -ENETUNREACH: |
146 | case -ENETRESET: | 146 | case -ENETRESET: |
147 | case -ESHUTDOWN: | 147 | case -ESHUTDOWN: |
148 | case -ENOPROTOOPT: | 148 | case -ENOPROTOOPT: |
149 | case -EINVAL: /* if returned from our tcp code, | 149 | case -EINVAL: /* if returned from our tcp code, |
150 | this means there is no socket */ | 150 | this means there is no socket */ |
151 | return 1; | 151 | return 1; |
152 | } | 152 | } |
153 | return 0; | 153 | return 0; |
154 | } | 154 | } |
155 | 155 | ||
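dlm_is_host_down() centralizes the errno values that the o2net layer produces when the remote node is dead or unreachable, so callers can treat "the peer went away" as one condition instead of matching error codes at every call site. An illustrative caller, not taken from the ocfs2 sources:

/* Illustrative classification of a message-send status. */
static int classify_net_status(int status)
{
	if (status >= 0)
		return 0;		/* message was delivered */
	if (dlm_is_host_down(status))
		return -EAGAIN;		/* peer died: let recovery handle it, retry later */
	return status;			/* genuine local or protocol error */
}
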
156 | 156 | ||
157 | /* | 157 | /* |
158 | * MASTER LIST FUNCTIONS | 158 | * MASTER LIST FUNCTIONS |
159 | */ | 159 | */ |
160 | 160 | ||
161 | 161 | ||
162 | /* | 162 | /* |
163 | * regarding master list entries and heartbeat callbacks: | 163 | * regarding master list entries and heartbeat callbacks: |
164 | * | 164 | * |
165 | * in order to avoid sleeping and allocation that occurs in | 165 | * in order to avoid sleeping and allocation that occurs in |
166 | * heartbeat, master list entries are simply attached to the | 166 | * heartbeat, master list entries are simply attached to the |
167 | * dlm's established heartbeat callbacks. the mle is attached | 167 | * dlm's established heartbeat callbacks. the mle is attached |
168 | * when it is created, and since the dlm->spinlock is held at | 168 | * when it is created, and since the dlm->spinlock is held at |
169 | * that time, any heartbeat event will be properly discovered | 169 | * that time, any heartbeat event will be properly discovered |
170 | * by the mle. the mle needs to be detached from the | 170 | * by the mle. the mle needs to be detached from the |
171 | * dlm->mle_hb_events list as soon as heartbeat events are no | 171 | * dlm->mle_hb_events list as soon as heartbeat events are no |
172 | * longer useful to the mle, and before the mle is freed. | 172 | * longer useful to the mle, and before the mle is freed. |
173 | * | 173 | * |
174 | * as a general rule, heartbeat events are no longer needed by | 174 | * as a general rule, heartbeat events are no longer needed by |
175 | * the mle once an "answer" regarding the lock master has been | 175 | * the mle once an "answer" regarding the lock master has been |
176 | * received. | 176 | * received. |
177 | */ | 177 | */ |
178 | static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, | 178 | static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, |
179 | struct dlm_master_list_entry *mle) | 179 | struct dlm_master_list_entry *mle) |
180 | { | 180 | { |
181 | assert_spin_locked(&dlm->spinlock); | 181 | assert_spin_locked(&dlm->spinlock); |
182 | 182 | ||
183 | list_add_tail(&mle->hb_events, &dlm->mle_hb_events); | 183 | list_add_tail(&mle->hb_events, &dlm->mle_hb_events); |
184 | } | 184 | } |
185 | 185 | ||
186 | 186 | ||
187 | static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, | 187 | static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, |
188 | struct dlm_master_list_entry *mle) | 188 | struct dlm_master_list_entry *mle) |
189 | { | 189 | { |
190 | if (!list_empty(&mle->hb_events)) | 190 | if (!list_empty(&mle->hb_events)) |
191 | list_del_init(&mle->hb_events); | 191 | list_del_init(&mle->hb_events); |
192 | } | 192 | } |
193 | 193 | ||
194 | 194 | ||
195 | static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, | 195 | static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, |
196 | struct dlm_master_list_entry *mle) | 196 | struct dlm_master_list_entry *mle) |
197 | { | 197 | { |
198 | spin_lock(&dlm->spinlock); | 198 | spin_lock(&dlm->spinlock); |
199 | __dlm_mle_detach_hb_events(dlm, mle); | 199 | __dlm_mle_detach_hb_events(dlm, mle); |
200 | spin_unlock(&dlm->spinlock); | 200 | spin_unlock(&dlm->spinlock); |
201 | } | 201 | } |
202 | 202 | ||
203 | static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) | 203 | static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) |
204 | { | 204 | { |
205 | struct dlm_ctxt *dlm; | 205 | struct dlm_ctxt *dlm; |
206 | dlm = mle->dlm; | 206 | dlm = mle->dlm; |
207 | 207 | ||
208 | assert_spin_locked(&dlm->spinlock); | 208 | assert_spin_locked(&dlm->spinlock); |
209 | assert_spin_locked(&dlm->master_lock); | 209 | assert_spin_locked(&dlm->master_lock); |
210 | mle->inuse++; | 210 | mle->inuse++; |
211 | kref_get(&mle->mle_refs); | 211 | kref_get(&mle->mle_refs); |
212 | } | 212 | } |
213 | 213 | ||
214 | static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) | 214 | static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) |
215 | { | 215 | { |
216 | struct dlm_ctxt *dlm; | 216 | struct dlm_ctxt *dlm; |
217 | dlm = mle->dlm; | 217 | dlm = mle->dlm; |
218 | 218 | ||
219 | spin_lock(&dlm->spinlock); | 219 | spin_lock(&dlm->spinlock); |
220 | spin_lock(&dlm->master_lock); | 220 | spin_lock(&dlm->master_lock); |
221 | mle->inuse--; | 221 | mle->inuse--; |
222 | __dlm_put_mle(mle); | 222 | __dlm_put_mle(mle); |
223 | spin_unlock(&dlm->master_lock); | 223 | spin_unlock(&dlm->master_lock); |
224 | spin_unlock(&dlm->spinlock); | 224 | spin_unlock(&dlm->spinlock); |
225 | 225 | ||
226 | } | 226 | } |
227 | 227 | ||
228 | /* remove from list and free */ | 228 | /* remove from list and free */ |
229 | static void __dlm_put_mle(struct dlm_master_list_entry *mle) | 229 | static void __dlm_put_mle(struct dlm_master_list_entry *mle) |
230 | { | 230 | { |
231 | struct dlm_ctxt *dlm; | 231 | struct dlm_ctxt *dlm; |
232 | dlm = mle->dlm; | 232 | dlm = mle->dlm; |
233 | 233 | ||
234 | assert_spin_locked(&dlm->spinlock); | 234 | assert_spin_locked(&dlm->spinlock); |
235 | assert_spin_locked(&dlm->master_lock); | 235 | assert_spin_locked(&dlm->master_lock); |
236 | if (!atomic_read(&mle->mle_refs.refcount)) { | 236 | if (!atomic_read(&mle->mle_refs.refcount)) { |
237 | /* this may or may not crash, but who cares. | 237 | /* this may or may not crash, but who cares. |
238 | * it's a BUG. */ | 238 | * it's a BUG. */ |
239 | mlog(ML_ERROR, "bad mle: %p\n", mle); | 239 | mlog(ML_ERROR, "bad mle: %p\n", mle); |
240 | dlm_print_one_mle(mle); | 240 | dlm_print_one_mle(mle); |
241 | BUG(); | 241 | BUG(); |
242 | } else | 242 | } else |
243 | kref_put(&mle->mle_refs, dlm_mle_release); | 243 | kref_put(&mle->mle_refs, dlm_mle_release); |
244 | } | 244 | } |
245 | 245 | ||
246 | 246 | ||
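mle lifetime is reference counted: dlm_get_mle()/dlm_put_mle() adjust the kref, __dlm_put_mle() must run under both dlm->spinlock and dlm->master_lock, and the already-zero branch is a deliberate BUG() because a put without a matching get means the entry may already have been freed (the separate inuse counter additionally pins an entry across longer operations). The underlying kref idiom on its own, for a hypothetical object (my_obj and its helpers are illustrative):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_obj {
	struct kref ref;
	/* ... payload ... */
};

static void my_obj_release(struct kref *kref)
{
	/* runs exactly once, when the last reference is dropped */
	kfree(container_of(kref, struct my_obj, ref));
}

static struct my_obj *my_obj_alloc(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_NOFS);

	if (obj)
		kref_init(&obj->ref);		/* refcount starts at 1 */
	return obj;
}

static void my_obj_get(struct my_obj *obj)
{
	kref_get(&obj->ref);			/* caller must already hold a reference */
}

static void my_obj_put(struct my_obj *obj)
{
	kref_put(&obj->ref, my_obj_release);	/* frees the object on the final put */
}
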
247 | /* must not have any spinlocks coming in */ | 247 | /* must not have any spinlocks coming in */ |
248 | static void dlm_put_mle(struct dlm_master_list_entry *mle) | 248 | static void dlm_put_mle(struct dlm_master_list_entry *mle) |
249 | { | 249 | { |
250 | struct dlm_ctxt *dlm; | 250 | struct dlm_ctxt *dlm; |
251 | dlm = mle->dlm; | 251 | dlm = mle->dlm; |
252 | 252 | ||
253 | spin_lock(&dlm->spinlock); | 253 | spin_lock(&dlm->spinlock); |
254 | spin_lock(&dlm->master_lock); | 254 | spin_lock(&dlm->master_lock); |
255 | __dlm_put_mle(mle); | 255 | __dlm_put_mle(mle); |
256 | spin_unlock(&dlm->master_lock); | 256 | spin_unlock(&dlm->master_lock); |
257 | spin_unlock(&dlm->spinlock); | 257 | spin_unlock(&dlm->spinlock); |
258 | } | 258 | } |
259 | 259 | ||
260 | static inline void dlm_get_mle(struct dlm_master_list_entry *mle) | 260 | static inline void dlm_get_mle(struct dlm_master_list_entry *mle) |
261 | { | 261 | { |
262 | kref_get(&mle->mle_refs); | 262 | kref_get(&mle->mle_refs); |
263 | } | 263 | } |
264 | 264 | ||
265 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | 265 | static void dlm_init_mle(struct dlm_master_list_entry *mle, |
266 | enum dlm_mle_type type, | 266 | enum dlm_mle_type type, |
267 | struct dlm_ctxt *dlm, | 267 | struct dlm_ctxt *dlm, |
268 | struct dlm_lock_resource *res, | 268 | struct dlm_lock_resource *res, |
269 | const char *name, | 269 | const char *name, |
270 | unsigned int namelen) | 270 | unsigned int namelen) |
271 | { | 271 | { |
272 | assert_spin_locked(&dlm->spinlock); | 272 | assert_spin_locked(&dlm->spinlock); |
273 | 273 | ||
274 | mle->dlm = dlm; | 274 | mle->dlm = dlm; |
275 | mle->type = type; | 275 | mle->type = type; |
276 | INIT_HLIST_NODE(&mle->master_hash_node); | 276 | INIT_HLIST_NODE(&mle->master_hash_node); |
277 | INIT_LIST_HEAD(&mle->hb_events); | 277 | INIT_LIST_HEAD(&mle->hb_events); |
278 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); | 278 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); |
279 | spin_lock_init(&mle->spinlock); | 279 | spin_lock_init(&mle->spinlock); |
280 | init_waitqueue_head(&mle->wq); | 280 | init_waitqueue_head(&mle->wq); |
281 | atomic_set(&mle->woken, 0); | 281 | atomic_set(&mle->woken, 0); |
282 | kref_init(&mle->mle_refs); | 282 | kref_init(&mle->mle_refs); |
283 | memset(mle->response_map, 0, sizeof(mle->response_map)); | 283 | memset(mle->response_map, 0, sizeof(mle->response_map)); |
284 | mle->master = O2NM_MAX_NODES; | 284 | mle->master = O2NM_MAX_NODES; |
285 | mle->new_master = O2NM_MAX_NODES; | 285 | mle->new_master = O2NM_MAX_NODES; |
286 | mle->inuse = 0; | 286 | mle->inuse = 0; |
287 | 287 | ||
288 | BUG_ON(mle->type != DLM_MLE_BLOCK && | 288 | BUG_ON(mle->type != DLM_MLE_BLOCK && |
289 | mle->type != DLM_MLE_MASTER && | 289 | mle->type != DLM_MLE_MASTER && |
290 | mle->type != DLM_MLE_MIGRATION); | 290 | mle->type != DLM_MLE_MIGRATION); |
291 | 291 | ||
292 | if (mle->type == DLM_MLE_MASTER) { | 292 | if (mle->type == DLM_MLE_MASTER) { |
293 | BUG_ON(!res); | 293 | BUG_ON(!res); |
294 | mle->mleres = res; | 294 | mle->mleres = res; |
295 | memcpy(mle->mname, res->lockname.name, res->lockname.len); | 295 | memcpy(mle->mname, res->lockname.name, res->lockname.len); |
296 | mle->mnamelen = res->lockname.len; | 296 | mle->mnamelen = res->lockname.len; |
297 | mle->mnamehash = res->lockname.hash; | 297 | mle->mnamehash = res->lockname.hash; |
298 | } else { | 298 | } else { |
299 | BUG_ON(!name); | 299 | BUG_ON(!name); |
300 | mle->mleres = NULL; | 300 | mle->mleres = NULL; |
301 | memcpy(mle->mname, name, namelen); | 301 | memcpy(mle->mname, name, namelen); |
302 | mle->mnamelen = namelen; | 302 | mle->mnamelen = namelen; |
303 | mle->mnamehash = dlm_lockid_hash(name, namelen); | 303 | mle->mnamehash = dlm_lockid_hash(name, namelen); |
304 | } | 304 | } |
305 | 305 | ||
306 | atomic_inc(&dlm->mle_tot_count[mle->type]); | 306 | atomic_inc(&dlm->mle_tot_count[mle->type]); |
307 | atomic_inc(&dlm->mle_cur_count[mle->type]); | 307 | atomic_inc(&dlm->mle_cur_count[mle->type]); |
308 | 308 | ||
309 | /* copy off the node_map and register hb callbacks on our copy */ | 309 | /* copy off the node_map and register hb callbacks on our copy */ |
310 | memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); | 310 | memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); |
311 | memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); | 311 | memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); |
312 | clear_bit(dlm->node_num, mle->vote_map); | 312 | clear_bit(dlm->node_num, mle->vote_map); |
313 | clear_bit(dlm->node_num, mle->node_map); | 313 | clear_bit(dlm->node_num, mle->node_map); |
314 | 314 | ||
315 | /* attach the mle to the domain node up/down events */ | 315 | /* attach the mle to the domain node up/down events */ |
316 | __dlm_mle_attach_hb_events(dlm, mle); | 316 | __dlm_mle_attach_hb_events(dlm, mle); |
317 | } | 317 | } |
318 | 318 | ||
319 | void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) | 319 | void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) |
320 | { | 320 | { |
321 | assert_spin_locked(&dlm->spinlock); | 321 | assert_spin_locked(&dlm->spinlock); |
322 | assert_spin_locked(&dlm->master_lock); | 322 | assert_spin_locked(&dlm->master_lock); |
323 | 323 | ||
324 | if (!hlist_unhashed(&mle->master_hash_node)) | 324 | if (!hlist_unhashed(&mle->master_hash_node)) |
325 | hlist_del_init(&mle->master_hash_node); | 325 | hlist_del_init(&mle->master_hash_node); |
326 | } | 326 | } |
327 | 327 | ||
328 | void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) | 328 | void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) |
329 | { | 329 | { |
330 | struct hlist_head *bucket; | 330 | struct hlist_head *bucket; |
331 | 331 | ||
332 | assert_spin_locked(&dlm->master_lock); | 332 | assert_spin_locked(&dlm->master_lock); |
333 | 333 | ||
334 | bucket = dlm_master_hash(dlm, mle->mnamehash); | 334 | bucket = dlm_master_hash(dlm, mle->mnamehash); |
335 | hlist_add_head(&mle->master_hash_node, bucket); | 335 | hlist_add_head(&mle->master_hash_node, bucket); |
336 | } | 336 | } |
337 | 337 | ||
338 | /* returns 1 if found, 0 if not */ | 338 | /* returns 1 if found, 0 if not */ |
339 | static int dlm_find_mle(struct dlm_ctxt *dlm, | 339 | static int dlm_find_mle(struct dlm_ctxt *dlm, |
340 | struct dlm_master_list_entry **mle, | 340 | struct dlm_master_list_entry **mle, |
341 | char *name, unsigned int namelen) | 341 | char *name, unsigned int namelen) |
342 | { | 342 | { |
343 | struct dlm_master_list_entry *tmpmle; | 343 | struct dlm_master_list_entry *tmpmle; |
344 | struct hlist_head *bucket; | 344 | struct hlist_head *bucket; |
345 | unsigned int hash; | 345 | unsigned int hash; |
346 | 346 | ||
347 | assert_spin_locked(&dlm->master_lock); | 347 | assert_spin_locked(&dlm->master_lock); |
348 | 348 | ||
349 | hash = dlm_lockid_hash(name, namelen); | 349 | hash = dlm_lockid_hash(name, namelen); |
350 | bucket = dlm_master_hash(dlm, hash); | 350 | bucket = dlm_master_hash(dlm, hash); |
351 | hlist_for_each_entry(tmpmle, bucket, master_hash_node) { | 351 | hlist_for_each_entry(tmpmle, bucket, master_hash_node) { |
352 | if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) | 352 | if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) |
353 | continue; | 353 | continue; |
354 | dlm_get_mle(tmpmle); | 354 | dlm_get_mle(tmpmle); |
355 | *mle = tmpmle; | 355 | *mle = tmpmle; |
356 | return 1; | 356 | return 1; |
357 | } | 357 | } |
358 | return 0; | 358 | return 0; |
359 | } | 359 | } |
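dlm_find_mle() hashes the lock name, scans only the matching master-hash bucket, and returns with a reference already taken (via dlm_get_mle()) on a hit. A minimal caller sketch, assuming the dlm_ctxt and mle declarations from dlmcommon.h; example_probe_mle is a hypothetical helper, not part of this file:

/* Hypothetical caller: probe for an existing mle by lock name.
 * dlm_find_mle() asserts dlm->master_lock, so hold it around the call;
 * drop the master_lock before putting the reference back. */
static int example_probe_mle(struct dlm_ctxt *dlm,
                             const char *name, unsigned int namelen)
{
        struct dlm_master_list_entry *mle = NULL;
        int found;

        spin_lock(&dlm->master_lock);
        found = dlm_find_mle(dlm, &mle, (char *)name, namelen);
        spin_unlock(&dlm->master_lock);

        if (found)
                dlm_put_mle(mle);       /* drop the lookup reference */

        return found;
}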
360 | 360 | ||
361 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) | 361 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) |
362 | { | 362 | { |
363 | struct dlm_master_list_entry *mle; | 363 | struct dlm_master_list_entry *mle; |
364 | 364 | ||
365 | assert_spin_locked(&dlm->spinlock); | 365 | assert_spin_locked(&dlm->spinlock); |
366 | 366 | ||
367 | list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { | 367 | list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { |
368 | if (node_up) | 368 | if (node_up) |
369 | dlm_mle_node_up(dlm, mle, NULL, idx); | 369 | dlm_mle_node_up(dlm, mle, NULL, idx); |
370 | else | 370 | else |
371 | dlm_mle_node_down(dlm, mle, NULL, idx); | 371 | dlm_mle_node_down(dlm, mle, NULL, idx); |
372 | } | 372 | } |
373 | } | 373 | } |
374 | 374 | ||
375 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | 375 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, |
376 | struct dlm_master_list_entry *mle, | 376 | struct dlm_master_list_entry *mle, |
377 | struct o2nm_node *node, int idx) | 377 | struct o2nm_node *node, int idx) |
378 | { | 378 | { |
379 | spin_lock(&mle->spinlock); | 379 | spin_lock(&mle->spinlock); |
380 | 380 | ||
381 | if (!test_bit(idx, mle->node_map)) | 381 | if (!test_bit(idx, mle->node_map)) |
382 | mlog(0, "node %u already removed from nodemap!\n", idx); | 382 | mlog(0, "node %u already removed from nodemap!\n", idx); |
383 | else | 383 | else |
384 | clear_bit(idx, mle->node_map); | 384 | clear_bit(idx, mle->node_map); |
385 | 385 | ||
386 | spin_unlock(&mle->spinlock); | 386 | spin_unlock(&mle->spinlock); |
387 | } | 387 | } |
388 | 388 | ||
389 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, | 389 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, |
390 | struct dlm_master_list_entry *mle, | 390 | struct dlm_master_list_entry *mle, |
391 | struct o2nm_node *node, int idx) | 391 | struct o2nm_node *node, int idx) |
392 | { | 392 | { |
393 | spin_lock(&mle->spinlock); | 393 | spin_lock(&mle->spinlock); |
394 | 394 | ||
395 | if (test_bit(idx, mle->node_map)) | 395 | if (test_bit(idx, mle->node_map)) |
396 | mlog(0, "node %u already in node map!\n", idx); | 396 | mlog(0, "node %u already in node map!\n", idx); |
397 | else | 397 | else |
398 | set_bit(idx, mle->node_map); | 398 | set_bit(idx, mle->node_map); |
399 | 399 | ||
400 | spin_unlock(&mle->spinlock); | 400 | spin_unlock(&mle->spinlock); |
401 | } | 401 | } |
402 | 402 | ||
403 | 403 | ||
404 | int dlm_init_mle_cache(void) | 404 | int dlm_init_mle_cache(void) |
405 | { | 405 | { |
406 | dlm_mle_cache = kmem_cache_create("o2dlm_mle", | 406 | dlm_mle_cache = kmem_cache_create("o2dlm_mle", |
407 | sizeof(struct dlm_master_list_entry), | 407 | sizeof(struct dlm_master_list_entry), |
408 | 0, SLAB_HWCACHE_ALIGN, | 408 | 0, SLAB_HWCACHE_ALIGN, |
409 | NULL); | 409 | NULL); |
410 | if (dlm_mle_cache == NULL) | 410 | if (dlm_mle_cache == NULL) |
411 | return -ENOMEM; | 411 | return -ENOMEM; |
412 | return 0; | 412 | return 0; |
413 | } | 413 | } |
414 | 414 | ||
415 | void dlm_destroy_mle_cache(void) | 415 | void dlm_destroy_mle_cache(void) |
416 | { | 416 | { |
417 | if (dlm_mle_cache) | 417 | if (dlm_mle_cache) |
418 | kmem_cache_destroy(dlm_mle_cache); | 418 | kmem_cache_destroy(dlm_mle_cache); |
419 | } | 419 | } |
420 | 420 | ||
421 | static void dlm_mle_release(struct kref *kref) | 421 | static void dlm_mle_release(struct kref *kref) |
422 | { | 422 | { |
423 | struct dlm_master_list_entry *mle; | 423 | struct dlm_master_list_entry *mle; |
424 | struct dlm_ctxt *dlm; | 424 | struct dlm_ctxt *dlm; |
425 | 425 | ||
426 | mle = container_of(kref, struct dlm_master_list_entry, mle_refs); | 426 | mle = container_of(kref, struct dlm_master_list_entry, mle_refs); |
427 | dlm = mle->dlm; | 427 | dlm = mle->dlm; |
428 | 428 | ||
429 | assert_spin_locked(&dlm->spinlock); | 429 | assert_spin_locked(&dlm->spinlock); |
430 | assert_spin_locked(&dlm->master_lock); | 430 | assert_spin_locked(&dlm->master_lock); |
431 | 431 | ||
432 | mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, | 432 | mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, |
433 | mle->type); | 433 | mle->type); |
434 | 434 | ||
435 | /* remove from list if not already */ | 435 | /* remove from list if not already */ |
436 | __dlm_unlink_mle(dlm, mle); | 436 | __dlm_unlink_mle(dlm, mle); |
437 | 437 | ||
438 | /* detach the mle from the domain node up/down events */ | 438 | /* detach the mle from the domain node up/down events */ |
439 | __dlm_mle_detach_hb_events(dlm, mle); | 439 | __dlm_mle_detach_hb_events(dlm, mle); |
440 | 440 | ||
441 | atomic_dec(&dlm->mle_cur_count[mle->type]); | 441 | atomic_dec(&dlm->mle_cur_count[mle->type]); |
442 | 442 | ||
443 | /* NOTE: kfree under spinlock here. | 443 | /* NOTE: kfree under spinlock here. |
444 | * if this is bad, we can move this to a freelist. */ | 444 | * if this is bad, we can move this to a freelist. */ |
445 | kmem_cache_free(dlm_mle_cache, mle); | 445 | kmem_cache_free(dlm_mle_cache, mle); |
446 | } | 446 | } |
447 | 447 | ||
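dlm_mle_release() only ever runs as the kref release callback, and it asserts that both dlm->spinlock and dlm->master_lock are held, so the final reference drop has to happen under both locks. A rough sketch of that locking shape; the in-tree put helpers live elsewhere in dlmmaster.c, so this is only an illustration of the assertions above:

/* Illustrative only: drop an mle reference with the locks that
 * dlm_mle_release() expects. If this is the last reference, the
 * release callback unlinks the mle, detaches heartbeat events and
 * frees it back to dlm_mle_cache. */
static void example_drop_mle_ref(struct dlm_ctxt *dlm,
                                 struct dlm_master_list_entry *mle)
{
        spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
        kref_put(&mle->mle_refs, dlm_mle_release);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
}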
448 | 448 | ||
449 | /* | 449 | /* |
450 | * LOCK RESOURCE FUNCTIONS | 450 | * LOCK RESOURCE FUNCTIONS |
451 | */ | 451 | */ |
452 | 452 | ||
453 | int dlm_init_master_caches(void) | 453 | int dlm_init_master_caches(void) |
454 | { | 454 | { |
455 | dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", | 455 | dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", |
456 | sizeof(struct dlm_lock_resource), | 456 | sizeof(struct dlm_lock_resource), |
457 | 0, SLAB_HWCACHE_ALIGN, NULL); | 457 | 0, SLAB_HWCACHE_ALIGN, NULL); |
458 | if (!dlm_lockres_cache) | 458 | if (!dlm_lockres_cache) |
459 | goto bail; | 459 | goto bail; |
460 | 460 | ||
461 | dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", | 461 | dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", |
462 | DLM_LOCKID_NAME_MAX, 0, | 462 | DLM_LOCKID_NAME_MAX, 0, |
463 | SLAB_HWCACHE_ALIGN, NULL); | 463 | SLAB_HWCACHE_ALIGN, NULL); |
464 | if (!dlm_lockname_cache) | 464 | if (!dlm_lockname_cache) |
465 | goto bail; | 465 | goto bail; |
466 | 466 | ||
467 | return 0; | 467 | return 0; |
468 | bail: | 468 | bail: |
469 | dlm_destroy_master_caches(); | 469 | dlm_destroy_master_caches(); |
470 | return -ENOMEM; | 470 | return -ENOMEM; |
471 | } | 471 | } |
472 | 472 | ||
473 | void dlm_destroy_master_caches(void) | 473 | void dlm_destroy_master_caches(void) |
474 | { | 474 | { |
475 | if (dlm_lockname_cache) { | 475 | if (dlm_lockname_cache) { |
476 | kmem_cache_destroy(dlm_lockname_cache); | 476 | kmem_cache_destroy(dlm_lockname_cache); |
477 | dlm_lockname_cache = NULL; | 477 | dlm_lockname_cache = NULL; |
478 | } | 478 | } |
479 | 479 | ||
480 | if (dlm_lockres_cache) { | 480 | if (dlm_lockres_cache) { |
481 | kmem_cache_destroy(dlm_lockres_cache); | 481 | kmem_cache_destroy(dlm_lockres_cache); |
482 | dlm_lockres_cache = NULL; | 482 | dlm_lockres_cache = NULL; |
483 | } | 483 | } |
484 | } | 484 | } |
485 | 485 | ||
486 | static void dlm_lockres_release(struct kref *kref) | 486 | static void dlm_lockres_release(struct kref *kref) |
487 | { | 487 | { |
488 | struct dlm_lock_resource *res; | 488 | struct dlm_lock_resource *res; |
489 | struct dlm_ctxt *dlm; | 489 | struct dlm_ctxt *dlm; |
490 | 490 | ||
491 | res = container_of(kref, struct dlm_lock_resource, refs); | 491 | res = container_of(kref, struct dlm_lock_resource, refs); |
492 | dlm = res->dlm; | 492 | dlm = res->dlm; |
493 | 493 | ||
494 | /* This should not happen -- all lockres' have a name | 494 | /* This should not happen -- all lockres' have a name |
495 | * associated with them at init time. */ | 495 | * associated with them at init time. */ |
496 | BUG_ON(!res->lockname.name); | 496 | BUG_ON(!res->lockname.name); |
497 | 497 | ||
498 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 498 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
499 | res->lockname.name); | 499 | res->lockname.name); |
500 | 500 | ||
501 | spin_lock(&dlm->track_lock); | 501 | spin_lock(&dlm->track_lock); |
502 | if (!list_empty(&res->tracking)) | 502 | if (!list_empty(&res->tracking)) |
503 | list_del_init(&res->tracking); | 503 | list_del_init(&res->tracking); |
504 | else { | 504 | else { |
505 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", | 505 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", |
506 | res->lockname.len, res->lockname.name); | 506 | res->lockname.len, res->lockname.name); |
507 | dlm_print_one_lock_resource(res); | 507 | dlm_print_one_lock_resource(res); |
508 | } | 508 | } |
509 | spin_unlock(&dlm->track_lock); | 509 | spin_unlock(&dlm->track_lock); |
510 | 510 | ||
511 | atomic_dec(&dlm->res_cur_count); | 511 | atomic_dec(&dlm->res_cur_count); |
512 | 512 | ||
513 | if (!hlist_unhashed(&res->hash_node) || | 513 | if (!hlist_unhashed(&res->hash_node) || |
514 | !list_empty(&res->granted) || | 514 | !list_empty(&res->granted) || |
515 | !list_empty(&res->converting) || | 515 | !list_empty(&res->converting) || |
516 | !list_empty(&res->blocked) || | 516 | !list_empty(&res->blocked) || |
517 | !list_empty(&res->dirty) || | 517 | !list_empty(&res->dirty) || |
518 | !list_empty(&res->recovering) || | 518 | !list_empty(&res->recovering) || |
519 | !list_empty(&res->purge)) { | 519 | !list_empty(&res->purge)) { |
520 | mlog(ML_ERROR, | 520 | mlog(ML_ERROR, |
521 | "Going to BUG for resource %.*s." | 521 | "Going to BUG for resource %.*s." |
522 | " We're on a list! [%c%c%c%c%c%c%c]\n", | 522 | " We're on a list! [%c%c%c%c%c%c%c]\n", |
523 | res->lockname.len, res->lockname.name, | 523 | res->lockname.len, res->lockname.name, |
524 | !hlist_unhashed(&res->hash_node) ? 'H' : ' ', | 524 | !hlist_unhashed(&res->hash_node) ? 'H' : ' ', |
525 | !list_empty(&res->granted) ? 'G' : ' ', | 525 | !list_empty(&res->granted) ? 'G' : ' ', |
526 | !list_empty(&res->converting) ? 'C' : ' ', | 526 | !list_empty(&res->converting) ? 'C' : ' ', |
527 | !list_empty(&res->blocked) ? 'B' : ' ', | 527 | !list_empty(&res->blocked) ? 'B' : ' ', |
528 | !list_empty(&res->dirty) ? 'D' : ' ', | 528 | !list_empty(&res->dirty) ? 'D' : ' ', |
529 | !list_empty(&res->recovering) ? 'R' : ' ', | 529 | !list_empty(&res->recovering) ? 'R' : ' ', |
530 | !list_empty(&res->purge) ? 'P' : ' '); | 530 | !list_empty(&res->purge) ? 'P' : ' '); |
531 | 531 | ||
532 | dlm_print_one_lock_resource(res); | 532 | dlm_print_one_lock_resource(res); |
533 | } | 533 | } |
534 | 534 | ||
535 | /* By the time we're ready to blow this guy away, we shouldn't | 535 | /* By the time we're ready to blow this guy away, we shouldn't |
536 | * be on any lists. */ | 536 | * be on any lists. */ |
537 | BUG_ON(!hlist_unhashed(&res->hash_node)); | 537 | BUG_ON(!hlist_unhashed(&res->hash_node)); |
538 | BUG_ON(!list_empty(&res->granted)); | 538 | BUG_ON(!list_empty(&res->granted)); |
539 | BUG_ON(!list_empty(&res->converting)); | 539 | BUG_ON(!list_empty(&res->converting)); |
540 | BUG_ON(!list_empty(&res->blocked)); | 540 | BUG_ON(!list_empty(&res->blocked)); |
541 | BUG_ON(!list_empty(&res->dirty)); | 541 | BUG_ON(!list_empty(&res->dirty)); |
542 | BUG_ON(!list_empty(&res->recovering)); | 542 | BUG_ON(!list_empty(&res->recovering)); |
543 | BUG_ON(!list_empty(&res->purge)); | 543 | BUG_ON(!list_empty(&res->purge)); |
544 | 544 | ||
545 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); | 545 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); |
546 | 546 | ||
547 | kmem_cache_free(dlm_lockres_cache, res); | 547 | kmem_cache_free(dlm_lockres_cache, res); |
548 | } | 548 | } |
549 | 549 | ||
550 | void dlm_lockres_put(struct dlm_lock_resource *res) | 550 | void dlm_lockres_put(struct dlm_lock_resource *res) |
551 | { | 551 | { |
552 | kref_put(&res->refs, dlm_lockres_release); | 552 | kref_put(&res->refs, dlm_lockres_release); |
553 | } | 553 | } |
554 | 554 | ||
555 | static void dlm_init_lockres(struct dlm_ctxt *dlm, | 555 | static void dlm_init_lockres(struct dlm_ctxt *dlm, |
556 | struct dlm_lock_resource *res, | 556 | struct dlm_lock_resource *res, |
557 | const char *name, unsigned int namelen) | 557 | const char *name, unsigned int namelen) |
558 | { | 558 | { |
559 | char *qname; | 559 | char *qname; |
560 | 560 | ||
561 | /* If we memset here, we lose our reference to the kmalloc'd | 561 | /* If we memset here, we lose our reference to the kmalloc'd |
562 | * res->lockname.name, so be sure to init every field | 562 | * res->lockname.name, so be sure to init every field |
563 | * correctly! */ | 563 | * correctly! */ |
564 | 564 | ||
565 | qname = (char *) res->lockname.name; | 565 | qname = (char *) res->lockname.name; |
566 | memcpy(qname, name, namelen); | 566 | memcpy(qname, name, namelen); |
567 | 567 | ||
568 | res->lockname.len = namelen; | 568 | res->lockname.len = namelen; |
569 | res->lockname.hash = dlm_lockid_hash(name, namelen); | 569 | res->lockname.hash = dlm_lockid_hash(name, namelen); |
570 | 570 | ||
571 | init_waitqueue_head(&res->wq); | 571 | init_waitqueue_head(&res->wq); |
572 | spin_lock_init(&res->spinlock); | 572 | spin_lock_init(&res->spinlock); |
573 | INIT_HLIST_NODE(&res->hash_node); | 573 | INIT_HLIST_NODE(&res->hash_node); |
574 | INIT_LIST_HEAD(&res->granted); | 574 | INIT_LIST_HEAD(&res->granted); |
575 | INIT_LIST_HEAD(&res->converting); | 575 | INIT_LIST_HEAD(&res->converting); |
576 | INIT_LIST_HEAD(&res->blocked); | 576 | INIT_LIST_HEAD(&res->blocked); |
577 | INIT_LIST_HEAD(&res->dirty); | 577 | INIT_LIST_HEAD(&res->dirty); |
578 | INIT_LIST_HEAD(&res->recovering); | 578 | INIT_LIST_HEAD(&res->recovering); |
579 | INIT_LIST_HEAD(&res->purge); | 579 | INIT_LIST_HEAD(&res->purge); |
580 | INIT_LIST_HEAD(&res->tracking); | 580 | INIT_LIST_HEAD(&res->tracking); |
581 | atomic_set(&res->asts_reserved, 0); | 581 | atomic_set(&res->asts_reserved, 0); |
582 | res->migration_pending = 0; | 582 | res->migration_pending = 0; |
583 | res->inflight_locks = 0; | 583 | res->inflight_locks = 0; |
584 | res->inflight_assert_workers = 0; | 584 | res->inflight_assert_workers = 0; |
585 | 585 | ||
586 | res->dlm = dlm; | 586 | res->dlm = dlm; |
587 | 587 | ||
588 | kref_init(&res->refs); | 588 | kref_init(&res->refs); |
589 | 589 | ||
590 | atomic_inc(&dlm->res_tot_count); | 590 | atomic_inc(&dlm->res_tot_count); |
591 | atomic_inc(&dlm->res_cur_count); | 591 | atomic_inc(&dlm->res_cur_count); |
592 | 592 | ||
593 | /* just for consistency */ | 593 | /* just for consistency */ |
594 | spin_lock(&res->spinlock); | 594 | spin_lock(&res->spinlock); |
595 | dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); | 595 | dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); |
596 | spin_unlock(&res->spinlock); | 596 | spin_unlock(&res->spinlock); |
597 | 597 | ||
598 | res->state = DLM_LOCK_RES_IN_PROGRESS; | 598 | res->state = DLM_LOCK_RES_IN_PROGRESS; |
599 | 599 | ||
600 | res->last_used = 0; | 600 | res->last_used = 0; |
601 | 601 | ||
602 | spin_lock(&dlm->spinlock); | 602 | spin_lock(&dlm->spinlock); |
603 | list_add_tail(&res->tracking, &dlm->tracking_list); | 603 | list_add_tail(&res->tracking, &dlm->tracking_list); |
604 | spin_unlock(&dlm->spinlock); | 604 | spin_unlock(&dlm->spinlock); |
605 | 605 | ||
606 | memset(res->lvb, 0, DLM_LVB_LEN); | 606 | memset(res->lvb, 0, DLM_LVB_LEN); |
607 | memset(res->refmap, 0, sizeof(res->refmap)); | 607 | memset(res->refmap, 0, sizeof(res->refmap)); |
608 | } | 608 | } |
609 | 609 | ||
610 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | 610 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, |
611 | const char *name, | 611 | const char *name, |
612 | unsigned int namelen) | 612 | unsigned int namelen) |
613 | { | 613 | { |
614 | struct dlm_lock_resource *res = NULL; | 614 | struct dlm_lock_resource *res = NULL; |
615 | 615 | ||
616 | res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); | 616 | res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); |
617 | if (!res) | 617 | if (!res) |
618 | goto error; | 618 | goto error; |
619 | 619 | ||
620 | res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); | 620 | res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); |
621 | if (!res->lockname.name) | 621 | if (!res->lockname.name) |
622 | goto error; | 622 | goto error; |
623 | 623 | ||
624 | dlm_init_lockres(dlm, res, name, namelen); | 624 | dlm_init_lockres(dlm, res, name, namelen); |
625 | return res; | 625 | return res; |
626 | 626 | ||
627 | error: | 627 | error: |
628 | if (res && res->lockname.name) | 628 | if (res && res->lockname.name) |
629 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); | 629 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); |
630 | 630 | ||
631 | if (res) | 631 | if (res) |
632 | kmem_cache_free(dlm_lockres_cache, res); | 632 | kmem_cache_free(dlm_lockres_cache, res); |
633 | return NULL; | 633 | return NULL; |
634 | } | 634 | } |
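dlm_new_lockres() allocates the resource and its name buffer from the two caches set up in dlm_init_master_caches(), initializes everything through dlm_init_lockres(), and returns with one kref held. A hedged usage sketch; example_make_lockres is hypothetical and simply balances that initial reference:

/* Hypothetical usage: create a lockres and drop it again.
 * dlm_lockres_put() releases the initial reference, which frees the
 * name buffer and the resource via dlm_lockres_release(). */
static int example_make_lockres(struct dlm_ctxt *dlm,
                                const char *lockid, unsigned int namelen)
{
        struct dlm_lock_resource *res;

        res = dlm_new_lockres(dlm, lockid, namelen);
        if (!res)
                return -ENOMEM;

        /* a real caller would insert it into the hash and master it;
         * here we only drop the reference taken by kref_init() */
        dlm_lockres_put(res);
        return 0;
}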
635 | 635 | ||
636 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, | 636 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, |
637 | struct dlm_lock_resource *res, int bit) | 637 | struct dlm_lock_resource *res, int bit) |
638 | { | 638 | { |
639 | assert_spin_locked(&res->spinlock); | 639 | assert_spin_locked(&res->spinlock); |
640 | 640 | ||
641 | mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, | 641 | mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, |
642 | res->lockname.name, bit, __builtin_return_address(0)); | 642 | res->lockname.name, bit, __builtin_return_address(0)); |
643 | 643 | ||
644 | set_bit(bit, res->refmap); | 644 | set_bit(bit, res->refmap); |
645 | } | 645 | } |
646 | 646 | ||
647 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, | 647 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, |
648 | struct dlm_lock_resource *res, int bit) | 648 | struct dlm_lock_resource *res, int bit) |
649 | { | 649 | { |
650 | assert_spin_locked(&res->spinlock); | 650 | assert_spin_locked(&res->spinlock); |
651 | 651 | ||
652 | mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, | 652 | mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, |
653 | res->lockname.name, bit, __builtin_return_address(0)); | 653 | res->lockname.name, bit, __builtin_return_address(0)); |
654 | 654 | ||
655 | clear_bit(bit, res->refmap); | 655 | clear_bit(bit, res->refmap); |
656 | } | 656 | } |
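Both refmap helpers assert res->spinlock, since the refmap bitmap is shared with the purge and migration paths. A small, purely illustrative sketch of recording and clearing a node's reference under that lock:

/* Illustrative: set and then clear node's bit in the lockres refmap,
 * under res->spinlock as both helpers require. */
static void example_toggle_refmap(struct dlm_ctxt *dlm,
                                  struct dlm_lock_resource *res, int node)
{
        spin_lock(&res->spinlock);
        dlm_lockres_set_refmap_bit(dlm, res, node);
        dlm_lockres_clear_refmap_bit(dlm, res, node);
        spin_unlock(&res->spinlock);
}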
657 | 657 | ||
658 | 658 | static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | |
659 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
660 | struct dlm_lock_resource *res) | 659 | struct dlm_lock_resource *res) |
661 | { | 660 | { |
662 | assert_spin_locked(&res->spinlock); | ||
663 | |||
664 | res->inflight_locks++; | 661 | res->inflight_locks++; |
665 | 662 | ||
666 | mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, | 663 | mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, |
667 | res->lockname.len, res->lockname.name, res->inflight_locks, | 664 | res->lockname.len, res->lockname.name, res->inflight_locks, |
668 | __builtin_return_address(0)); | 665 | __builtin_return_address(0)); |
669 | } | 666 | } |
670 | 667 | ||
668 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
669 | struct dlm_lock_resource *res) | ||
670 | { | ||
671 | assert_spin_locked(&res->spinlock); | ||
672 | __dlm_lockres_grab_inflight_ref(dlm, res); | ||
673 | } | ||
674 | |||
671 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | 675 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, |
672 | struct dlm_lock_resource *res) | 676 | struct dlm_lock_resource *res) |
673 | { | 677 | { |
674 | assert_spin_locked(&res->spinlock); | 678 | assert_spin_locked(&res->spinlock); |
675 | 679 | ||
676 | BUG_ON(res->inflight_locks == 0); | 680 | BUG_ON(res->inflight_locks == 0); |
677 | 681 | ||
678 | res->inflight_locks--; | 682 | res->inflight_locks--; |
679 | 683 | ||
680 | mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, | 684 | mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, |
681 | res->lockname.len, res->lockname.name, res->inflight_locks, | 685 | res->lockname.len, res->lockname.name, res->inflight_locks, |
682 | __builtin_return_address(0)); | 686 | __builtin_return_address(0)); |
683 | 687 | ||
684 | wake_up(&res->wq); | 688 | wake_up(&res->wq); |
685 | } | 689 | } |
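The inflight counter pins a lockres against purging: every grab must be balanced by a drop, and with this patch only the locked wrapper asserts res->spinlock while brand-new, not-yet-visible resources use the lockless __ variant. A sketch of the balanced pattern on an already-hashed resource (hypothetical helper):

/* Hypothetical: pin a lockres across some work, then unpin it.
 * Existing resources must hold res->spinlock around both calls;
 * dlm_lockres_drop_inflight_ref() also wakes res->wq so waiters
 * such as the purge path can re-check inflight_locks. */
static void example_pin_lockres(struct dlm_ctxt *dlm,
                                struct dlm_lock_resource *res)
{
        spin_lock(&res->spinlock);
        dlm_lockres_grab_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);

        /* ... use the resource ... */

        spin_lock(&res->spinlock);
        dlm_lockres_drop_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);
}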
686 | 690 | ||
687 | void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, | 691 | void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, |
688 | struct dlm_lock_resource *res) | 692 | struct dlm_lock_resource *res) |
689 | { | 693 | { |
690 | assert_spin_locked(&res->spinlock); | 694 | assert_spin_locked(&res->spinlock); |
691 | res->inflight_assert_workers++; | 695 | res->inflight_assert_workers++; |
692 | mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", | 696 | mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", |
693 | dlm->name, res->lockname.len, res->lockname.name, | 697 | dlm->name, res->lockname.len, res->lockname.name, |
694 | res->inflight_assert_workers); | 698 | res->inflight_assert_workers); |
695 | } | 699 | } |
696 | 700 | ||
697 | static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, | 701 | static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, |
698 | struct dlm_lock_resource *res) | 702 | struct dlm_lock_resource *res) |
699 | { | 703 | { |
700 | spin_lock(&res->spinlock); | 704 | spin_lock(&res->spinlock); |
701 | __dlm_lockres_grab_inflight_worker(dlm, res); | 705 | __dlm_lockres_grab_inflight_worker(dlm, res); |
702 | spin_unlock(&res->spinlock); | 706 | spin_unlock(&res->spinlock); |
703 | } | 707 | } |
704 | 708 | ||
705 | static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, | 709 | static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, |
706 | struct dlm_lock_resource *res) | 710 | struct dlm_lock_resource *res) |
707 | { | 711 | { |
708 | assert_spin_locked(&res->spinlock); | 712 | assert_spin_locked(&res->spinlock); |
709 | BUG_ON(res->inflight_assert_workers == 0); | 713 | BUG_ON(res->inflight_assert_workers == 0); |
710 | res->inflight_assert_workers--; | 714 | res->inflight_assert_workers--; |
711 | mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", | 715 | mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", |
712 | dlm->name, res->lockname.len, res->lockname.name, | 716 | dlm->name, res->lockname.len, res->lockname.name, |
713 | res->inflight_assert_workers); | 717 | res->inflight_assert_workers); |
714 | } | 718 | } |
715 | 719 | ||
716 | static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, | 720 | static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, |
717 | struct dlm_lock_resource *res) | 721 | struct dlm_lock_resource *res) |
718 | { | 722 | { |
719 | spin_lock(&res->spinlock); | 723 | spin_lock(&res->spinlock); |
720 | __dlm_lockres_drop_inflight_worker(dlm, res); | 724 | __dlm_lockres_drop_inflight_worker(dlm, res); |
721 | spin_unlock(&res->spinlock); | 725 | spin_unlock(&res->spinlock); |
722 | } | 726 | } |
723 | 727 | ||
724 | /* | 728 | /* |
725 | * lookup a lock resource by name. | 729 | * lookup a lock resource by name. |
726 | * may already exist in the hashtable. | 730 | * may already exist in the hashtable. |
727 | * lockid is null terminated | 731 | * lockid is null terminated |
728 | * | 732 | * |
729 | * if not, allocate enough for the lockres and for | 733 | * if not, allocate enough for the lockres and for |
730 | * the temporary structure used in doing the mastering. | 734 | * the temporary structure used in doing the mastering. |
731 | * | 735 | * |
732 | * also, do a lookup in the dlm->master_list to see | 736 | * also, do a lookup in the dlm->master_list to see |
733 | * if another node has begun mastering the same lock. | 737 | * if another node has begun mastering the same lock. |
734 | * if so, there should be a block entry in there | 738 | * if so, there should be a block entry in there |
735 | * for this name, and we should *not* attempt to master | 739 | * for this name, and we should *not* attempt to master |
736 | * the lock here. need to wait around for that node | 740 | * the lock here. need to wait around for that node |
737 | * to assert_master (or die). | 741 | * to assert_master (or die). |
738 | * | 742 | * |
739 | */ | 743 | */ |
740 | struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | 744 | struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, |
741 | const char *lockid, | 745 | const char *lockid, |
742 | int namelen, | 746 | int namelen, |
743 | int flags) | 747 | int flags) |
744 | { | 748 | { |
745 | struct dlm_lock_resource *tmpres=NULL, *res=NULL; | 749 | struct dlm_lock_resource *tmpres=NULL, *res=NULL; |
746 | struct dlm_master_list_entry *mle = NULL; | 750 | struct dlm_master_list_entry *mle = NULL; |
747 | struct dlm_master_list_entry *alloc_mle = NULL; | 751 | struct dlm_master_list_entry *alloc_mle = NULL; |
748 | int blocked = 0; | 752 | int blocked = 0; |
749 | int ret, nodenum; | 753 | int ret, nodenum; |
750 | struct dlm_node_iter iter; | 754 | struct dlm_node_iter iter; |
751 | unsigned int hash; | 755 | unsigned int hash; |
752 | int tries = 0; | 756 | int tries = 0; |
753 | int bit, wait_on_recovery = 0; | 757 | int bit, wait_on_recovery = 0; |
754 | 758 | ||
755 | BUG_ON(!lockid); | 759 | BUG_ON(!lockid); |
756 | 760 | ||
757 | hash = dlm_lockid_hash(lockid, namelen); | 761 | hash = dlm_lockid_hash(lockid, namelen); |
758 | 762 | ||
759 | mlog(0, "get lockres %s (len %d)\n", lockid, namelen); | 763 | mlog(0, "get lockres %s (len %d)\n", lockid, namelen); |
760 | 764 | ||
761 | lookup: | 765 | lookup: |
762 | spin_lock(&dlm->spinlock); | 766 | spin_lock(&dlm->spinlock); |
763 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); | 767 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); |
764 | if (tmpres) { | 768 | if (tmpres) { |
765 | spin_unlock(&dlm->spinlock); | 769 | spin_unlock(&dlm->spinlock); |
766 | spin_lock(&tmpres->spinlock); | 770 | spin_lock(&tmpres->spinlock); |
767 | /* Wait on the thread that is mastering the resource */ | 771 | /* Wait on the thread that is mastering the resource */ |
768 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | 772 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { |
769 | __dlm_wait_on_lockres(tmpres); | 773 | __dlm_wait_on_lockres(tmpres); |
770 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); | 774 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); |
771 | spin_unlock(&tmpres->spinlock); | 775 | spin_unlock(&tmpres->spinlock); |
772 | dlm_lockres_put(tmpres); | 776 | dlm_lockres_put(tmpres); |
773 | tmpres = NULL; | 777 | tmpres = NULL; |
774 | goto lookup; | 778 | goto lookup; |
775 | } | 779 | } |
776 | 780 | ||
777 | /* Wait on the resource purge to complete before continuing */ | 781 | /* Wait on the resource purge to complete before continuing */ |
778 | if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { | 782 | if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { |
779 | BUG_ON(tmpres->owner == dlm->node_num); | 783 | BUG_ON(tmpres->owner == dlm->node_num); |
780 | __dlm_wait_on_lockres_flags(tmpres, | 784 | __dlm_wait_on_lockres_flags(tmpres, |
781 | DLM_LOCK_RES_DROPPING_REF); | 785 | DLM_LOCK_RES_DROPPING_REF); |
782 | spin_unlock(&tmpres->spinlock); | 786 | spin_unlock(&tmpres->spinlock); |
783 | dlm_lockres_put(tmpres); | 787 | dlm_lockres_put(tmpres); |
784 | tmpres = NULL; | 788 | tmpres = NULL; |
785 | goto lookup; | 789 | goto lookup; |
786 | } | 790 | } |
787 | 791 | ||
788 | /* Grab inflight ref to pin the resource */ | 792 | /* Grab inflight ref to pin the resource */ |
789 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | 793 | dlm_lockres_grab_inflight_ref(dlm, tmpres); |
790 | 794 | ||
791 | spin_unlock(&tmpres->spinlock); | 795 | spin_unlock(&tmpres->spinlock); |
792 | if (res) | 796 | if (res) |
793 | dlm_lockres_put(res); | 797 | dlm_lockres_put(res); |
794 | res = tmpres; | 798 | res = tmpres; |
795 | goto leave; | 799 | goto leave; |
796 | } | 800 | } |
797 | 801 | ||
798 | if (!res) { | 802 | if (!res) { |
799 | spin_unlock(&dlm->spinlock); | 803 | spin_unlock(&dlm->spinlock); |
800 | mlog(0, "allocating a new resource\n"); | 804 | mlog(0, "allocating a new resource\n"); |
801 | /* nothing found and we need to allocate one. */ | 805 | /* nothing found and we need to allocate one. */ |
802 | alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); | 806 | alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
803 | if (!alloc_mle) | 807 | if (!alloc_mle) |
804 | goto leave; | 808 | goto leave; |
805 | res = dlm_new_lockres(dlm, lockid, namelen); | 809 | res = dlm_new_lockres(dlm, lockid, namelen); |
806 | if (!res) | 810 | if (!res) |
807 | goto leave; | 811 | goto leave; |
808 | goto lookup; | 812 | goto lookup; |
809 | } | 813 | } |
810 | 814 | ||
811 | mlog(0, "no lockres found, allocated our own: %p\n", res); | 815 | mlog(0, "no lockres found, allocated our own: %p\n", res); |
812 | 816 | ||
813 | if (flags & LKM_LOCAL) { | 817 | if (flags & LKM_LOCAL) { |
814 | /* caller knows it's safe to assume it's not mastered elsewhere | 818 | /* caller knows it's safe to assume it's not mastered elsewhere |
815 | * DONE! return right away */ | 819 | * DONE! return right away */ |
816 | spin_lock(&res->spinlock); | 820 | spin_lock(&res->spinlock); |
817 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | 821 | dlm_change_lockres_owner(dlm, res, dlm->node_num); |
818 | __dlm_insert_lockres(dlm, res); | 822 | __dlm_insert_lockres(dlm, res); |
819 | dlm_lockres_grab_inflight_ref(dlm, res); | 823 | dlm_lockres_grab_inflight_ref(dlm, res); |
820 | spin_unlock(&res->spinlock); | 824 | spin_unlock(&res->spinlock); |
821 | spin_unlock(&dlm->spinlock); | 825 | spin_unlock(&dlm->spinlock); |
822 | /* lockres still marked IN_PROGRESS */ | 826 | /* lockres still marked IN_PROGRESS */ |
823 | goto wake_waiters; | 827 | goto wake_waiters; |
824 | } | 828 | } |
825 | 829 | ||
826 | /* check master list to see if another node has started mastering it */ | 830 | /* check master list to see if another node has started mastering it */ |
827 | spin_lock(&dlm->master_lock); | 831 | spin_lock(&dlm->master_lock); |
828 | 832 | ||
829 | /* if we found a block, wait for lock to be mastered by another node */ | 833 | /* if we found a block, wait for lock to be mastered by another node */ |
830 | blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); | 834 | blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); |
831 | if (blocked) { | 835 | if (blocked) { |
832 | int mig; | 836 | int mig; |
833 | if (mle->type == DLM_MLE_MASTER) { | 837 | if (mle->type == DLM_MLE_MASTER) { |
834 | mlog(ML_ERROR, "master entry for nonexistent lock!\n"); | 838 | mlog(ML_ERROR, "master entry for nonexistent lock!\n"); |
835 | BUG(); | 839 | BUG(); |
836 | } | 840 | } |
837 | mig = (mle->type == DLM_MLE_MIGRATION); | 841 | mig = (mle->type == DLM_MLE_MIGRATION); |
838 | /* if there is a migration in progress, let the migration | 842 | /* if there is a migration in progress, let the migration |
839 | * finish before continuing. we can wait for the absence | 843 | * finish before continuing. we can wait for the absence |
840 | * of the MIGRATION mle: either the migrate finished or | 844 | * of the MIGRATION mle: either the migrate finished or |
841 | * one of the nodes died and the mle was cleaned up. | 845 | * one of the nodes died and the mle was cleaned up. |
842 | * if there is a BLOCK here, but it already has a master | 846 | * if there is a BLOCK here, but it already has a master |
843 | * set, we are too late. the master does not have a ref | 847 | * set, we are too late. the master does not have a ref |
844 | * for us in the refmap. detach the mle and drop it. | 848 | * for us in the refmap. detach the mle and drop it. |
845 | * either way, go back to the top and start over. */ | 849 | * either way, go back to the top and start over. */ |
846 | if (mig || mle->master != O2NM_MAX_NODES) { | 850 | if (mig || mle->master != O2NM_MAX_NODES) { |
847 | BUG_ON(mig && mle->master == dlm->node_num); | 851 | BUG_ON(mig && mle->master == dlm->node_num); |
848 | /* we arrived too late. the master does not | 852 | /* we arrived too late. the master does not |
849 | * have a ref for us. retry. */ | 853 | * have a ref for us. retry. */ |
850 | mlog(0, "%s:%.*s: late on %s\n", | 854 | mlog(0, "%s:%.*s: late on %s\n", |
851 | dlm->name, namelen, lockid, | 855 | dlm->name, namelen, lockid, |
852 | mig ? "MIGRATION" : "BLOCK"); | 856 | mig ? "MIGRATION" : "BLOCK"); |
853 | spin_unlock(&dlm->master_lock); | 857 | spin_unlock(&dlm->master_lock); |
854 | spin_unlock(&dlm->spinlock); | 858 | spin_unlock(&dlm->spinlock); |
855 | 859 | ||
856 | /* master is known, detach */ | 860 | /* master is known, detach */ |
857 | if (!mig) | 861 | if (!mig) |
858 | dlm_mle_detach_hb_events(dlm, mle); | 862 | dlm_mle_detach_hb_events(dlm, mle); |
859 | dlm_put_mle(mle); | 863 | dlm_put_mle(mle); |
860 | mle = NULL; | 864 | mle = NULL; |
861 | /* this is lame, but we can't wait on either | 865 | /* this is lame, but we can't wait on either |
862 | * the mle or lockres waitqueue here */ | 866 | * the mle or lockres waitqueue here */ |
863 | if (mig) | 867 | if (mig) |
864 | msleep(100); | 868 | msleep(100); |
865 | goto lookup; | 869 | goto lookup; |
866 | } | 870 | } |
867 | } else { | 871 | } else { |
868 | /* go ahead and try to master lock on this node */ | 872 | /* go ahead and try to master lock on this node */ |
869 | mle = alloc_mle; | 873 | mle = alloc_mle; |
870 | /* make sure this does not get freed below */ | 874 | /* make sure this does not get freed below */ |
871 | alloc_mle = NULL; | 875 | alloc_mle = NULL; |
872 | dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); | 876 | dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); |
873 | set_bit(dlm->node_num, mle->maybe_map); | 877 | set_bit(dlm->node_num, mle->maybe_map); |
874 | __dlm_insert_mle(dlm, mle); | 878 | __dlm_insert_mle(dlm, mle); |
875 | 879 | ||
876 | /* still holding the dlm spinlock, check the recovery map | 880 | /* still holding the dlm spinlock, check the recovery map |
877 | * to see if there are any nodes that still need to be | 881 | * to see if there are any nodes that still need to be |
878 | * considered. these will not appear in the mle nodemap | 882 | * considered. these will not appear in the mle nodemap |
879 | * but they might own this lockres. wait on them. */ | 883 | * but they might own this lockres. wait on them. */ |
880 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 884 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
881 | if (bit < O2NM_MAX_NODES) { | 885 | if (bit < O2NM_MAX_NODES) { |
882 | mlog(0, "%s: res %.*s, At least one node (%d) " | 886 | mlog(0, "%s: res %.*s, At least one node (%d) " |
883 | "to recover before lock mastery can begin\n", | 887 | "to recover before lock mastery can begin\n", |
884 | dlm->name, namelen, (char *)lockid, bit); | 888 | dlm->name, namelen, (char *)lockid, bit); |
885 | wait_on_recovery = 1; | 889 | wait_on_recovery = 1; |
886 | } | 890 | } |
887 | } | 891 | } |
888 | 892 | ||
889 | /* at this point there is either a DLM_MLE_BLOCK or a | 893 | /* at this point there is either a DLM_MLE_BLOCK or a |
890 | * DLM_MLE_MASTER on the master list, so it's safe to add the | 894 | * DLM_MLE_MASTER on the master list, so it's safe to add the |
891 | * lockres to the hashtable. anyone who finds the lock will | 895 | * lockres to the hashtable. anyone who finds the lock will |
892 | * still have to wait on the IN_PROGRESS. */ | 896 | * still have to wait on the IN_PROGRESS. */ |
893 | 897 | ||
894 | /* finally add the lockres to its hash bucket */ | 898 | /* finally add the lockres to its hash bucket */ |
895 | __dlm_insert_lockres(dlm, res); | 899 | __dlm_insert_lockres(dlm, res); |
896 | 900 | ||
897 | /* Grab inflight ref to pin the resource */ | 901 | /* since this lockres is new it does not require the spinlock */ |
898 | spin_lock(&res->spinlock); | 902 | __dlm_lockres_grab_inflight_ref(dlm, res); |
899 | dlm_lockres_grab_inflight_ref(dlm, res); | ||
900 | spin_unlock(&res->spinlock); | ||
901 | 903 | ||
902 | /* get an extra ref on the mle in case this is a BLOCK | 904 | /* get an extra ref on the mle in case this is a BLOCK |
903 | * if so, the creator of the BLOCK may try to put the last | 905 | * if so, the creator of the BLOCK may try to put the last |
904 | * ref at this time in the assert master handler, so we | 906 | * ref at this time in the assert master handler, so we |
905 | * need an extra one to keep from a bad ptr deref. */ | 907 | * need an extra one to keep from a bad ptr deref. */ |
906 | dlm_get_mle_inuse(mle); | 908 | dlm_get_mle_inuse(mle); |
907 | spin_unlock(&dlm->master_lock); | 909 | spin_unlock(&dlm->master_lock); |
908 | spin_unlock(&dlm->spinlock); | 910 | spin_unlock(&dlm->spinlock); |
909 | 911 | ||
910 | redo_request: | 912 | redo_request: |
911 | while (wait_on_recovery) { | 913 | while (wait_on_recovery) { |
912 | /* any cluster changes that occurred after dropping the | 914 | /* any cluster changes that occurred after dropping the |
913 | * dlm spinlock would be detectable by a change on the mle, | 915 | * dlm spinlock would be detectable by a change on the mle, |
914 | * so we only need to clear out the recovery map once. */ | 916 | * so we only need to clear out the recovery map once. */ |
915 | if (dlm_is_recovery_lock(lockid, namelen)) { | 917 | if (dlm_is_recovery_lock(lockid, namelen)) { |
916 | mlog(0, "%s: Recovery map is not empty, but must " | 918 | mlog(0, "%s: Recovery map is not empty, but must " |
917 | "master $RECOVERY lock now\n", dlm->name); | 919 | "master $RECOVERY lock now\n", dlm->name); |
918 | if (!dlm_pre_master_reco_lockres(dlm, res)) | 920 | if (!dlm_pre_master_reco_lockres(dlm, res)) |
919 | wait_on_recovery = 0; | 921 | wait_on_recovery = 0; |
920 | else { | 922 | else { |
921 | mlog(0, "%s: waiting 500ms for heartbeat state " | 923 | mlog(0, "%s: waiting 500ms for heartbeat state " |
922 | "change\n", dlm->name); | 924 | "change\n", dlm->name); |
923 | msleep(500); | 925 | msleep(500); |
924 | } | 926 | } |
925 | continue; | 927 | continue; |
926 | } | 928 | } |
927 | 929 | ||
928 | dlm_kick_recovery_thread(dlm); | 930 | dlm_kick_recovery_thread(dlm); |
929 | msleep(1000); | 931 | msleep(1000); |
930 | dlm_wait_for_recovery(dlm); | 932 | dlm_wait_for_recovery(dlm); |
931 | 933 | ||
932 | spin_lock(&dlm->spinlock); | 934 | spin_lock(&dlm->spinlock); |
933 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 935 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
934 | if (bit < O2NM_MAX_NODES) { | 936 | if (bit < O2NM_MAX_NODES) { |
935 | mlog(0, "%s: res %.*s, At least one node (%d) " | 937 | mlog(0, "%s: res %.*s, At least one node (%d) " |
936 | "to recover before lock mastery can begin\n", | 938 | "to recover before lock mastery can begin\n", |
937 | dlm->name, namelen, (char *)lockid, bit); | 939 | dlm->name, namelen, (char *)lockid, bit); |
938 | wait_on_recovery = 1; | 940 | wait_on_recovery = 1; |
939 | } else | 941 | } else |
940 | wait_on_recovery = 0; | 942 | wait_on_recovery = 0; |
941 | spin_unlock(&dlm->spinlock); | 943 | spin_unlock(&dlm->spinlock); |
942 | 944 | ||
943 | if (wait_on_recovery) | 945 | if (wait_on_recovery) |
944 | dlm_wait_for_node_recovery(dlm, bit, 10000); | 946 | dlm_wait_for_node_recovery(dlm, bit, 10000); |
945 | } | 947 | } |
946 | 948 | ||
947 | /* must wait for lock to be mastered elsewhere */ | 949 | /* must wait for lock to be mastered elsewhere */ |
948 | if (blocked) | 950 | if (blocked) |
949 | goto wait; | 951 | goto wait; |
950 | 952 | ||
951 | ret = -EINVAL; | 953 | ret = -EINVAL; |
952 | dlm_node_iter_init(mle->vote_map, &iter); | 954 | dlm_node_iter_init(mle->vote_map, &iter); |
953 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 955 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
954 | ret = dlm_do_master_request(res, mle, nodenum); | 956 | ret = dlm_do_master_request(res, mle, nodenum); |
955 | if (ret < 0) | 957 | if (ret < 0) |
956 | mlog_errno(ret); | 958 | mlog_errno(ret); |
957 | if (mle->master != O2NM_MAX_NODES) { | 959 | if (mle->master != O2NM_MAX_NODES) { |
958 | /* found a master ! */ | 960 | /* found a master ! */ |
959 | if (mle->master <= nodenum) | 961 | if (mle->master <= nodenum) |
960 | break; | 962 | break; |
961 | /* if our master request has not reached the master | 963 | /* if our master request has not reached the master |
962 | * yet, keep going until it does. this is how the | 964 | * yet, keep going until it does. this is how the |
963 | * master will know that asserts are needed back to | 965 | * master will know that asserts are needed back to |
964 | * the lower nodes. */ | 966 | * the lower nodes. */ |
965 | mlog(0, "%s: res %.*s, Requests only up to %u but " | 967 | mlog(0, "%s: res %.*s, Requests only up to %u but " |
966 | "master is %u, keep going\n", dlm->name, namelen, | 968 | "master is %u, keep going\n", dlm->name, namelen, |
967 | lockid, nodenum, mle->master); | 969 | lockid, nodenum, mle->master); |
968 | } | 970 | } |
969 | } | 971 | } |
970 | 972 | ||
971 | wait: | 973 | wait: |
972 | /* keep going until the response map includes all nodes */ | 974 | /* keep going until the response map includes all nodes */ |
973 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); | 975 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); |
974 | if (ret < 0) { | 976 | if (ret < 0) { |
975 | wait_on_recovery = 1; | 977 | wait_on_recovery = 1; |
976 | mlog(0, "%s: res %.*s, Node map changed, redo the master " | 978 | mlog(0, "%s: res %.*s, Node map changed, redo the master " |
977 | "request now, blocked=%d\n", dlm->name, res->lockname.len, | 979 | "request now, blocked=%d\n", dlm->name, res->lockname.len, |
978 | res->lockname.name, blocked); | 980 | res->lockname.name, blocked); |
979 | if (++tries > 20) { | 981 | if (++tries > 20) { |
980 | mlog(ML_ERROR, "%s: res %.*s, Spinning on " | 982 | mlog(ML_ERROR, "%s: res %.*s, Spinning on " |
981 | "dlm_wait_for_lock_mastery, blocked = %d\n", | 983 | "dlm_wait_for_lock_mastery, blocked = %d\n", |
982 | dlm->name, res->lockname.len, | 984 | dlm->name, res->lockname.len, |
983 | res->lockname.name, blocked); | 985 | res->lockname.name, blocked); |
984 | dlm_print_one_lock_resource(res); | 986 | dlm_print_one_lock_resource(res); |
985 | dlm_print_one_mle(mle); | 987 | dlm_print_one_mle(mle); |
986 | tries = 0; | 988 | tries = 0; |
987 | } | 989 | } |
988 | goto redo_request; | 990 | goto redo_request; |
989 | } | 991 | } |
990 | 992 | ||
991 | mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, | 993 | mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, |
992 | res->lockname.name, res->owner); | 994 | res->lockname.name, res->owner); |
993 | /* make sure we never continue without this */ | 995 | /* make sure we never continue without this */ |
994 | BUG_ON(res->owner == O2NM_MAX_NODES); | 996 | BUG_ON(res->owner == O2NM_MAX_NODES); |
995 | 997 | ||
996 | /* master is known, detach if not already detached */ | 998 | /* master is known, detach if not already detached */ |
997 | dlm_mle_detach_hb_events(dlm, mle); | 999 | dlm_mle_detach_hb_events(dlm, mle); |
998 | dlm_put_mle(mle); | 1000 | dlm_put_mle(mle); |
999 | /* put the extra ref */ | 1001 | /* put the extra ref */ |
1000 | dlm_put_mle_inuse(mle); | 1002 | dlm_put_mle_inuse(mle); |
1001 | 1003 | ||
1002 | wake_waiters: | 1004 | wake_waiters: |
1003 | spin_lock(&res->spinlock); | 1005 | spin_lock(&res->spinlock); |
1004 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 1006 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
1005 | spin_unlock(&res->spinlock); | 1007 | spin_unlock(&res->spinlock); |
1006 | wake_up(&res->wq); | 1008 | wake_up(&res->wq); |
1007 | 1009 | ||
1008 | leave: | 1010 | leave: |
1009 | /* need to free the unused mle */ | 1011 | /* need to free the unused mle */ |
1010 | if (alloc_mle) | 1012 | if (alloc_mle) |
1011 | kmem_cache_free(dlm_mle_cache, alloc_mle); | 1013 | kmem_cache_free(dlm_mle_cache, alloc_mle); |
1012 | 1014 | ||
1013 | return res; | 1015 | return res; |
1014 | } | 1016 | } |
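dlm_get_lock_resource() is the entry point for all of the above: it either finds and pins an existing lockres or allocates a new one, queues an mle, and drives mastery until an owner is known. A hedged caller sketch; example_lookup is hypothetical, and the namelen convention (counting the terminating NUL, per the "lockid is null terminated" comment) is an assumption about the real callers:

/* Hypothetical caller: resolve a NUL-terminated lock name to a
 * mastered lockres. Returns NULL only on allocation failure. */
static struct dlm_lock_resource *example_lookup(struct dlm_ctxt *dlm,
                                                const char *lockid)
{
        struct dlm_lock_resource *res;

        res = dlm_get_lock_resource(dlm, lockid, strlen(lockid) + 1, 0);
        if (!res)
                return NULL;

        /* res->owner is now known; the inflight reference taken on our
         * behalf is later dropped via dlm_lockres_drop_inflight_ref()
         * under res->spinlock. */
        return res;
}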
1015 | 1017 | ||
1016 | 1018 | ||
1017 | #define DLM_MASTERY_TIMEOUT_MS 5000 | 1019 | #define DLM_MASTERY_TIMEOUT_MS 5000 |
1018 | 1020 | ||
1019 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, | 1021 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, |
1020 | struct dlm_lock_resource *res, | 1022 | struct dlm_lock_resource *res, |
1021 | struct dlm_master_list_entry *mle, | 1023 | struct dlm_master_list_entry *mle, |
1022 | int *blocked) | 1024 | int *blocked) |
1023 | { | 1025 | { |
1024 | u8 m; | 1026 | u8 m; |
1025 | int ret, bit; | 1027 | int ret, bit; |
1026 | int map_changed, voting_done; | 1028 | int map_changed, voting_done; |
1027 | int assert, sleep; | 1029 | int assert, sleep; |
1028 | 1030 | ||
1029 | recheck: | 1031 | recheck: |
1030 | ret = 0; | 1032 | ret = 0; |
1031 | assert = 0; | 1033 | assert = 0; |
1032 | 1034 | ||
1033 | /* check if another node has already become the owner */ | 1035 | /* check if another node has already become the owner */ |
1034 | spin_lock(&res->spinlock); | 1036 | spin_lock(&res->spinlock); |
1035 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | 1037 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
1036 | mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, | 1038 | mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, |
1037 | res->lockname.len, res->lockname.name, res->owner); | 1039 | res->lockname.len, res->lockname.name, res->owner); |
1038 | spin_unlock(&res->spinlock); | 1040 | spin_unlock(&res->spinlock); |
1039 | /* this will cause the master to re-assert across | 1041 | /* this will cause the master to re-assert across |
1040 | * the whole cluster, freeing up mles */ | 1042 | * the whole cluster, freeing up mles */ |
1041 | if (res->owner != dlm->node_num) { | 1043 | if (res->owner != dlm->node_num) { |
1042 | ret = dlm_do_master_request(res, mle, res->owner); | 1044 | ret = dlm_do_master_request(res, mle, res->owner); |
1043 | if (ret < 0) { | 1045 | if (ret < 0) { |
1044 | /* give recovery a chance to run */ | 1046 | /* give recovery a chance to run */ |
1045 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); | 1047 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); |
1046 | msleep(500); | 1048 | msleep(500); |
1047 | goto recheck; | 1049 | goto recheck; |
1048 | } | 1050 | } |
1049 | } | 1051 | } |
1050 | ret = 0; | 1052 | ret = 0; |
1051 | goto leave; | 1053 | goto leave; |
1052 | } | 1054 | } |
1053 | spin_unlock(&res->spinlock); | 1055 | spin_unlock(&res->spinlock); |
1054 | 1056 | ||
1055 | spin_lock(&mle->spinlock); | 1057 | spin_lock(&mle->spinlock); |
1056 | m = mle->master; | 1058 | m = mle->master; |
1057 | map_changed = (memcmp(mle->vote_map, mle->node_map, | 1059 | map_changed = (memcmp(mle->vote_map, mle->node_map, |
1058 | sizeof(mle->vote_map)) != 0); | 1060 | sizeof(mle->vote_map)) != 0); |
1059 | voting_done = (memcmp(mle->vote_map, mle->response_map, | 1061 | voting_done = (memcmp(mle->vote_map, mle->response_map, |
1060 | sizeof(mle->vote_map)) == 0); | 1062 | sizeof(mle->vote_map)) == 0); |
1061 | 1063 | ||
1062 | /* restart if we hit any errors */ | 1064 | /* restart if we hit any errors */ |
1063 | if (map_changed) { | 1065 | if (map_changed) { |
1064 | int b; | 1066 | int b; |
1065 | mlog(0, "%s: %.*s: node map changed, restarting\n", | 1067 | mlog(0, "%s: %.*s: node map changed, restarting\n", |
1066 | dlm->name, res->lockname.len, res->lockname.name); | 1068 | dlm->name, res->lockname.len, res->lockname.name); |
1067 | ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); | 1069 | ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); |
1068 | b = (mle->type == DLM_MLE_BLOCK); | 1070 | b = (mle->type == DLM_MLE_BLOCK); |
1069 | if ((*blocked && !b) || (!*blocked && b)) { | 1071 | if ((*blocked && !b) || (!*blocked && b)) { |
1070 | mlog(0, "%s:%.*s: status change: old=%d new=%d\n", | 1072 | mlog(0, "%s:%.*s: status change: old=%d new=%d\n", |
1071 | dlm->name, res->lockname.len, res->lockname.name, | 1073 | dlm->name, res->lockname.len, res->lockname.name, |
1072 | *blocked, b); | 1074 | *blocked, b); |
1073 | *blocked = b; | 1075 | *blocked = b; |
1074 | } | 1076 | } |
1075 | spin_unlock(&mle->spinlock); | 1077 | spin_unlock(&mle->spinlock); |
1076 | if (ret < 0) { | 1078 | if (ret < 0) { |
1077 | mlog_errno(ret); | 1079 | mlog_errno(ret); |
1078 | goto leave; | 1080 | goto leave; |
1079 | } | 1081 | } |
1080 | mlog(0, "%s:%.*s: restart lock mastery succeeded, " | 1082 | mlog(0, "%s:%.*s: restart lock mastery succeeded, " |
1081 | "rechecking now\n", dlm->name, res->lockname.len, | 1083 | "rechecking now\n", dlm->name, res->lockname.len, |
1082 | res->lockname.name); | 1084 | res->lockname.name); |
1083 | goto recheck; | 1085 | goto recheck; |
1084 | } else { | 1086 | } else { |
1085 | if (!voting_done) { | 1087 | if (!voting_done) { |
1086 | mlog(0, "map not changed and voting not done " | 1088 | mlog(0, "map not changed and voting not done " |
1087 | "for %s:%.*s\n", dlm->name, res->lockname.len, | 1089 | "for %s:%.*s\n", dlm->name, res->lockname.len, |
1088 | res->lockname.name); | 1090 | res->lockname.name); |
1089 | } | 1091 | } |
1090 | } | 1092 | } |
1091 | 1093 | ||
1092 | if (m != O2NM_MAX_NODES) { | 1094 | if (m != O2NM_MAX_NODES) { |
1093 | /* another node has done an assert! | 1095 | /* another node has done an assert! |
1094 | * all done! */ | 1096 | * all done! */ |
1095 | sleep = 0; | 1097 | sleep = 0; |
1096 | } else { | 1098 | } else { |
1097 | sleep = 1; | 1099 | sleep = 1; |
1098 | /* have all nodes responded? */ | 1100 | /* have all nodes responded? */ |
1099 | if (voting_done && !*blocked) { | 1101 | if (voting_done && !*blocked) { |
1100 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); | 1102 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); |
1101 | if (dlm->node_num <= bit) { | 1103 | if (dlm->node_num <= bit) { |
1102 | /* my node number is lowest. | 1104 | /* my node number is lowest. |
1103 | * now tell other nodes that I am | 1105 | * now tell other nodes that I am |
1104 | * mastering this. */ | 1106 | * mastering this. */ |
1105 | mle->master = dlm->node_num; | 1107 | mle->master = dlm->node_num; |
1106 | /* ref was grabbed in get_lock_resource | 1108 | /* ref was grabbed in get_lock_resource |
1107 | * will be dropped in dlmlock_master */ | 1109 | * will be dropped in dlmlock_master */ |
1108 | assert = 1; | 1110 | assert = 1; |
1109 | sleep = 0; | 1111 | sleep = 0; |
1110 | } | 1112 | } |
1111 | /* if voting is done, but we have not received | 1113 | /* if voting is done, but we have not received |
1112 | * an assert master yet, we must sleep */ | 1114 | * an assert master yet, we must sleep */ |
1113 | } | 1115 | } |
1114 | } | 1116 | } |
1115 | 1117 | ||
1116 | spin_unlock(&mle->spinlock); | 1118 | spin_unlock(&mle->spinlock); |
1117 | 1119 | ||
1118 | /* sleep if we haven't finished voting yet */ | 1120 | /* sleep if we haven't finished voting yet */ |
1119 | if (sleep) { | 1121 | if (sleep) { |
1120 | unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); | 1122 | unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); |
1121 | 1123 | ||
1122 | /* | 1124 | /* |
1123 | if (atomic_read(&mle->mle_refs.refcount) < 2) | 1125 | if (atomic_read(&mle->mle_refs.refcount) < 2) |
1124 | mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, | 1126 | mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, |
1125 | atomic_read(&mle->mle_refs.refcount), | 1127 | atomic_read(&mle->mle_refs.refcount), |
1126 | res->lockname.len, res->lockname.name); | 1128 | res->lockname.len, res->lockname.name); |
1127 | */ | 1129 | */ |
1128 | atomic_set(&mle->woken, 0); | 1130 | atomic_set(&mle->woken, 0); |
1129 | (void)wait_event_timeout(mle->wq, | 1131 | (void)wait_event_timeout(mle->wq, |
1130 | (atomic_read(&mle->woken) == 1), | 1132 | (atomic_read(&mle->woken) == 1), |
1131 | timeo); | 1133 | timeo); |
1132 | if (res->owner == O2NM_MAX_NODES) { | 1134 | if (res->owner == O2NM_MAX_NODES) { |
1133 | mlog(0, "%s:%.*s: waiting again\n", dlm->name, | 1135 | mlog(0, "%s:%.*s: waiting again\n", dlm->name, |
1134 | res->lockname.len, res->lockname.name); | 1136 | res->lockname.len, res->lockname.name); |
1135 | goto recheck; | 1137 | goto recheck; |
1136 | } | 1138 | } |
1137 | mlog(0, "done waiting, master is %u\n", res->owner); | 1139 | mlog(0, "done waiting, master is %u\n", res->owner); |
1138 | ret = 0; | 1140 | ret = 0; |
1139 | goto leave; | 1141 | goto leave; |
1140 | } | 1142 | } |
1141 | 1143 | ||
1142 | ret = 0; /* done */ | 1144 | ret = 0; /* done */ |
1143 | if (assert) { | 1145 | if (assert) { |
1144 | m = dlm->node_num; | 1146 | m = dlm->node_num; |
1145 | mlog(0, "about to master %.*s here, this=%u\n", | 1147 | mlog(0, "about to master %.*s here, this=%u\n", |
1146 | res->lockname.len, res->lockname.name, m); | 1148 | res->lockname.len, res->lockname.name, m); |
1147 | ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); | 1149 | ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); |
1148 | if (ret) { | 1150 | if (ret) { |
1149 | /* This is a failure in the network path, | 1151 | /* This is a failure in the network path, |
1150 | * not in the response to the assert_master | 1152 | * not in the response to the assert_master |
1151 | * (any nonzero response is a BUG on this node). | 1153 | * (any nonzero response is a BUG on this node). |
1152 | * Most likely a socket just got disconnected | 1154 | * Most likely a socket just got disconnected |
1153 | * due to node death. */ | 1155 | * due to node death. */ |
1154 | mlog_errno(ret); | 1156 | mlog_errno(ret); |
1155 | } | 1157 | } |
1156 | /* no longer need to restart lock mastery. | 1158 | /* no longer need to restart lock mastery. |
1157 | * all living nodes have been contacted. */ | 1159 | * all living nodes have been contacted. */ |
1158 | ret = 0; | 1160 | ret = 0; |
1159 | } | 1161 | } |
1160 | 1162 | ||
1161 | /* set the lockres owner */ | 1163 | /* set the lockres owner */ |
1162 | spin_lock(&res->spinlock); | 1164 | spin_lock(&res->spinlock); |
1163 | /* mastery reference obtained either during | 1165 | /* mastery reference obtained either during |
1164 | * assert_master_handler or in get_lock_resource */ | 1166 | * assert_master_handler or in get_lock_resource */ |
1165 | dlm_change_lockres_owner(dlm, res, m); | 1167 | dlm_change_lockres_owner(dlm, res, m); |
1166 | spin_unlock(&res->spinlock); | 1168 | spin_unlock(&res->spinlock); |
1167 | 1169 | ||
1168 | leave: | 1170 | leave: |
1169 | return ret; | 1171 | return ret; |
1170 | } | 1172 | } |
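The mastery wait loop compares vote_map against node_map (did the membership change?) and against response_map (has everyone voted?). Once voting is done and nothing is blocked, the node with the lowest set bit in maybe_map wins and must assert mastery. A tiny illustrative sketch of that lowest-bit decision, using the same bitmap helpers as above:

/* Illustrative: the "lowest interested node wins" check applied once
 * all votes are in; maybe_map marks nodes that may master the lock. */
static int example_i_should_assert(struct dlm_ctxt *dlm,
                                   struct dlm_master_list_entry *mle)
{
        int bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);

        /* no node with a lower number is interested, so this node
         * becomes the master and must send assert_master messages */
        return dlm->node_num <= bit;
}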
1171 | 1173 | ||
1172 | struct dlm_bitmap_diff_iter | 1174 | struct dlm_bitmap_diff_iter |
1173 | { | 1175 | { |
1174 | int curnode; | 1176 | int curnode; |
1175 | unsigned long *orig_bm; | 1177 | unsigned long *orig_bm; |
1176 | unsigned long *cur_bm; | 1178 | unsigned long *cur_bm; |
1177 | unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 1179 | unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1178 | }; | 1180 | }; |
1179 | 1181 | ||
1180 | enum dlm_node_state_change | 1182 | enum dlm_node_state_change |
1181 | { | 1183 | { |
1182 | NODE_DOWN = -1, | 1184 | NODE_DOWN = -1, |
1183 | NODE_NO_CHANGE = 0, | 1185 | NODE_NO_CHANGE = 0, |
1184 | NODE_UP | 1186 | NODE_UP |
1185 | }; | 1187 | }; |
1186 | 1188 | ||
1187 | static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, | 1189 | static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, |
1188 | unsigned long *orig_bm, | 1190 | unsigned long *orig_bm, |
1189 | unsigned long *cur_bm) | 1191 | unsigned long *cur_bm) |
1190 | { | 1192 | { |
1191 | unsigned long p1, p2; | 1193 | unsigned long p1, p2; |
1192 | int i; | 1194 | int i; |
1193 | 1195 | ||
1194 | iter->curnode = -1; | 1196 | iter->curnode = -1; |
1195 | iter->orig_bm = orig_bm; | 1197 | iter->orig_bm = orig_bm; |
1196 | iter->cur_bm = cur_bm; | 1198 | iter->cur_bm = cur_bm; |
1197 | 1199 | ||
1198 | for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { | 1200 | for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { |
1199 | p1 = *(iter->orig_bm + i); | 1201 | p1 = *(iter->orig_bm + i); |
1200 | p2 = *(iter->cur_bm + i); | 1202 | p2 = *(iter->cur_bm + i); |
1201 | iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); | 1203 | iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); |
1202 | } | 1204 | } |
1203 | } | 1205 | } |
1204 | 1206 | ||
1205 | static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, | 1207 | static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, |
1206 | enum dlm_node_state_change *state) | 1208 | enum dlm_node_state_change *state) |
1207 | { | 1209 | { |
1208 | int bit; | 1210 | int bit; |
1209 | 1211 | ||
1210 | if (iter->curnode >= O2NM_MAX_NODES) | 1212 | if (iter->curnode >= O2NM_MAX_NODES) |
1211 | return -ENOENT; | 1213 | return -ENOENT; |
1212 | 1214 | ||
1213 | bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, | 1215 | bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, |
1214 | iter->curnode+1); | 1216 | iter->curnode+1); |
1215 | if (bit >= O2NM_MAX_NODES) { | 1217 | if (bit >= O2NM_MAX_NODES) { |
1216 | iter->curnode = O2NM_MAX_NODES; | 1218 | iter->curnode = O2NM_MAX_NODES; |
1217 | return -ENOENT; | 1219 | return -ENOENT; |
1218 | } | 1220 | } |
1219 | 1221 | ||
1220 | /* if it was there in the original then this node died */ | 1222 | /* if it was there in the original then this node died */ |
1221 | if (test_bit(bit, iter->orig_bm)) | 1223 | if (test_bit(bit, iter->orig_bm)) |
1222 | *state = NODE_DOWN; | 1224 | *state = NODE_DOWN; |
1223 | else | 1225 | else |
1224 | *state = NODE_UP; | 1226 | *state = NODE_UP; |
1225 | 1227 | ||
1226 | iter->curnode = bit; | 1228 | iter->curnode = bit; |
1227 | return bit; | 1229 | return bit; |
1228 | } | 1230 | } |
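The iterator above stores the XOR of the original and current node bitmaps and hands back each changed bit together with whether that node appeared or vanished. A self-contained userspace sketch of the same diff-and-classify technique, using a single word in place of the kernel bitmap helpers (illustrative names only):

#include <stdio.h>

#define MAX_NODES 64    /* one word keeps the sketch simple */

static void walk_membership_diff(unsigned long orig, unsigned long cur)
{
	unsigned long diff = orig ^ cur;   /* same as (p1 & ~p2) | (p2 & ~p1) */
	int bit;

	for (bit = 0; bit < MAX_NODES; bit++) {
		if (!(diff & (1UL << bit)))
			continue;
		if (orig & (1UL << bit))
			printf("node %d went down\n", bit);   /* NODE_DOWN */
		else
			printf("node %d came up\n", bit);     /* NODE_UP   */
	}
}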
1229 | 1231 | ||
1230 | 1232 | ||
1231 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | 1233 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, |
1232 | struct dlm_lock_resource *res, | 1234 | struct dlm_lock_resource *res, |
1233 | struct dlm_master_list_entry *mle, | 1235 | struct dlm_master_list_entry *mle, |
1234 | int blocked) | 1236 | int blocked) |
1235 | { | 1237 | { |
1236 | struct dlm_bitmap_diff_iter bdi; | 1238 | struct dlm_bitmap_diff_iter bdi; |
1237 | enum dlm_node_state_change sc; | 1239 | enum dlm_node_state_change sc; |
1238 | int node; | 1240 | int node; |
1239 | int ret = 0; | 1241 | int ret = 0; |
1240 | 1242 | ||
1241 | mlog(0, "something happened such that the " | 1243 | mlog(0, "something happened such that the " |
1242 | "master process may need to be restarted!\n"); | 1244 | "master process may need to be restarted!\n"); |
1243 | 1245 | ||
1244 | assert_spin_locked(&mle->spinlock); | 1246 | assert_spin_locked(&mle->spinlock); |
1245 | 1247 | ||
1246 | dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); | 1248 | dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); |
1247 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); | 1249 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); |
1248 | while (node >= 0) { | 1250 | while (node >= 0) { |
1249 | if (sc == NODE_UP) { | 1251 | if (sc == NODE_UP) { |
1250 | /* a node came up. clear any old vote from | 1252 | /* a node came up. clear any old vote from |
1251 | * the response map and set it in the vote map | 1253 | * the response map and set it in the vote map |
1252 | * then restart the mastery. */ | 1254 | * then restart the mastery. */ |
1253 | mlog(ML_NOTICE, "node %d up while restarting\n", node); | 1255 | mlog(ML_NOTICE, "node %d up while restarting\n", node); |
1254 | 1256 | ||
1255 | /* redo the master request, but only for the new node */ | 1257 | /* redo the master request, but only for the new node */ |
1256 | mlog(0, "sending request to new node\n"); | 1258 | mlog(0, "sending request to new node\n"); |
1257 | clear_bit(node, mle->response_map); | 1259 | clear_bit(node, mle->response_map); |
1258 | set_bit(node, mle->vote_map); | 1260 | set_bit(node, mle->vote_map); |
1259 | } else { | 1261 | } else { |
1260 | mlog(ML_ERROR, "node down! %d\n", node); | 1262 | mlog(ML_ERROR, "node down! %d\n", node); |
1261 | if (blocked) { | 1263 | if (blocked) { |
1262 | int lowest = find_next_bit(mle->maybe_map, | 1264 | int lowest = find_next_bit(mle->maybe_map, |
1263 | O2NM_MAX_NODES, 0); | 1265 | O2NM_MAX_NODES, 0); |
1264 | 1266 | ||
1265 | /* act like it was never there */ | 1267 | /* act like it was never there */ |
1266 | clear_bit(node, mle->maybe_map); | 1268 | clear_bit(node, mle->maybe_map); |
1267 | 1269 | ||
1268 | if (node == lowest) { | 1270 | if (node == lowest) { |
1269 | mlog(0, "expected master %u died" | 1271 | mlog(0, "expected master %u died" |
1270 | " while this node was blocked " | 1272 | " while this node was blocked " |
1271 | "waiting on it!\n", node); | 1273 | "waiting on it!\n", node); |
1272 | lowest = find_next_bit(mle->maybe_map, | 1274 | lowest = find_next_bit(mle->maybe_map, |
1273 | O2NM_MAX_NODES, | 1275 | O2NM_MAX_NODES, |
1274 | lowest+1); | 1276 | lowest+1); |
1275 | if (lowest < O2NM_MAX_NODES) { | 1277 | if (lowest < O2NM_MAX_NODES) { |
1276 | mlog(0, "%s:%.*s:still " | 1278 | mlog(0, "%s:%.*s:still " |
1277 | "blocked. waiting on %u " | 1279 | "blocked. waiting on %u " |
1278 | "now\n", dlm->name, | 1280 | "now\n", dlm->name, |
1279 | res->lockname.len, | 1281 | res->lockname.len, |
1280 | res->lockname.name, | 1282 | res->lockname.name, |
1281 | lowest); | 1283 | lowest); |
1282 | } else { | 1284 | } else { |
1283 | /* mle is an MLE_BLOCK, but | 1285 | /* mle is an MLE_BLOCK, but |
1284 | * there is now nothing left to | 1286 | * there is now nothing left to |
1285 | * block on. we need to return | 1287 | * block on. we need to return |
1286 | * all the way back out and try | 1288 | * all the way back out and try |
1287 | * again with an MLE_MASTER. | 1289 | * again with an MLE_MASTER. |
1288 | * dlm_do_local_recovery_cleanup | 1290 | * dlm_do_local_recovery_cleanup |
1289 | * has already run, so the mle | 1291 | * has already run, so the mle |
1290 | * refcount is ok */ | 1292 | * refcount is ok */ |
1291 | mlog(0, "%s:%.*s: no " | 1293 | mlog(0, "%s:%.*s: no " |
1292 | "longer blocking. try to " | 1294 | "longer blocking. try to " |
1293 | "master this here\n", | 1295 | "master this here\n", |
1294 | dlm->name, | 1296 | dlm->name, |
1295 | res->lockname.len, | 1297 | res->lockname.len, |
1296 | res->lockname.name); | 1298 | res->lockname.name); |
1297 | mle->type = DLM_MLE_MASTER; | 1299 | mle->type = DLM_MLE_MASTER; |
1298 | mle->mleres = res; | 1300 | mle->mleres = res; |
1299 | } | 1301 | } |
1300 | } | 1302 | } |
1301 | } | 1303 | } |
1302 | 1304 | ||
1303 | /* now blank out everything, as if we had never | 1305 | /* now blank out everything, as if we had never |
1304 | * contacted anyone */ | 1306 | * contacted anyone */ |
1305 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); | 1307 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); |
1306 | memset(mle->response_map, 0, sizeof(mle->response_map)); | 1308 | memset(mle->response_map, 0, sizeof(mle->response_map)); |
1307 | /* reset the vote_map to the current node_map */ | 1309 | /* reset the vote_map to the current node_map */ |
1308 | memcpy(mle->vote_map, mle->node_map, | 1310 | memcpy(mle->vote_map, mle->node_map, |
1309 | sizeof(mle->node_map)); | 1311 | sizeof(mle->node_map)); |
1310 | /* put myself into the maybe map */ | 1312 | /* put myself into the maybe map */ |
1311 | if (mle->type != DLM_MLE_BLOCK) | 1313 | if (mle->type != DLM_MLE_BLOCK) |
1312 | set_bit(dlm->node_num, mle->maybe_map); | 1314 | set_bit(dlm->node_num, mle->maybe_map); |
1313 | } | 1315 | } |
1314 | ret = -EAGAIN; | 1316 | ret = -EAGAIN; |
1315 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); | 1317 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); |
1316 | } | 1318 | } |
1317 | return ret; | 1319 | return ret; |
1318 | } | 1320 | } |
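In the blocked case above, the lowest bit still set in maybe_map identifies the node this one expects to become master; when that node dies the code clears it and either keeps waiting on the next-lowest candidate or converts the mle to DLM_MLE_MASTER. A small userspace sketch of that re-pick step, assuming a single-word bitmap and hypothetical names:

#include <stdio.h>

#define MAX_NODES 64
#define NO_CANDIDATE (-1)

static int lowest_set_bit(unsigned long map)
{
	int bit;

	for (bit = 0; bit < MAX_NODES; bit++)
		if (map & (1UL << bit))
			return bit;
	return NO_CANDIDATE;
}

/* returns the node to keep blocking on, or NO_CANDIDATE to master locally */
static int repick_expected_master(unsigned long *maybe_map, int dead_node)
{
	int lowest = lowest_set_bit(*maybe_map);

	*maybe_map &= ~(1UL << dead_node);        /* act like it was never there */

	if (dead_node != lowest)
		return lowest;                    /* still blocked on the same node */

	lowest = lowest_set_bit(*maybe_map);      /* the expected master died */
	if (lowest != NO_CANDIDATE) {
		printf("still blocked, now waiting on %d\n", lowest);
		return lowest;
	}
	printf("no longer blocking, master this here\n");
	return NO_CANDIDATE;
}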
1319 | 1321 | ||
1320 | 1322 | ||
1321 | /* | 1323 | /* |
1322 | * DLM_MASTER_REQUEST_MSG | 1324 | * DLM_MASTER_REQUEST_MSG |
1323 | * | 1325 | * |
1324 | * returns: 0 on success, | 1326 | * returns: 0 on success, |
1325 | * -errno on a network error | 1327 | * -errno on a network error |
1326 | * | 1328 | * |
1327 | * on error, the caller should assume the target node is "dead" | 1329 | * on error, the caller should assume the target node is "dead" |
1328 | * | 1330 | * |
1329 | */ | 1331 | */ |
1330 | 1332 | ||
1331 | static int dlm_do_master_request(struct dlm_lock_resource *res, | 1333 | static int dlm_do_master_request(struct dlm_lock_resource *res, |
1332 | struct dlm_master_list_entry *mle, int to) | 1334 | struct dlm_master_list_entry *mle, int to) |
1333 | { | 1335 | { |
1334 | struct dlm_ctxt *dlm = mle->dlm; | 1336 | struct dlm_ctxt *dlm = mle->dlm; |
1335 | struct dlm_master_request request; | 1337 | struct dlm_master_request request; |
1336 | int ret, response=0, resend; | 1338 | int ret, response=0, resend; |
1337 | 1339 | ||
1338 | memset(&request, 0, sizeof(request)); | 1340 | memset(&request, 0, sizeof(request)); |
1339 | request.node_idx = dlm->node_num; | 1341 | request.node_idx = dlm->node_num; |
1340 | 1342 | ||
1341 | BUG_ON(mle->type == DLM_MLE_MIGRATION); | 1343 | BUG_ON(mle->type == DLM_MLE_MIGRATION); |
1342 | 1344 | ||
1343 | request.namelen = (u8)mle->mnamelen; | 1345 | request.namelen = (u8)mle->mnamelen; |
1344 | memcpy(request.name, mle->mname, request.namelen); | 1346 | memcpy(request.name, mle->mname, request.namelen); |
1345 | 1347 | ||
1346 | again: | 1348 | again: |
1347 | ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, | 1349 | ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, |
1348 | sizeof(request), to, &response); | 1350 | sizeof(request), to, &response); |
1349 | if (ret < 0) { | 1351 | if (ret < 0) { |
1350 | if (ret == -ESRCH) { | 1352 | if (ret == -ESRCH) { |
1351 | /* should never happen */ | 1353 | /* should never happen */ |
1352 | mlog(ML_ERROR, "TCP stack not ready!\n"); | 1354 | mlog(ML_ERROR, "TCP stack not ready!\n"); |
1353 | BUG(); | 1355 | BUG(); |
1354 | } else if (ret == -EINVAL) { | 1356 | } else if (ret == -EINVAL) { |
1355 | mlog(ML_ERROR, "bad args passed to o2net!\n"); | 1357 | mlog(ML_ERROR, "bad args passed to o2net!\n"); |
1356 | BUG(); | 1358 | BUG(); |
1357 | } else if (ret == -ENOMEM) { | 1359 | } else if (ret == -ENOMEM) { |
1358 | mlog(ML_ERROR, "out of memory while trying to send " | 1360 | mlog(ML_ERROR, "out of memory while trying to send " |
1359 | "network message! retrying\n"); | 1361 | "network message! retrying\n"); |
1360 | /* this is totally crude */ | 1362 | /* this is totally crude */ |
1361 | msleep(50); | 1363 | msleep(50); |
1362 | goto again; | 1364 | goto again; |
1363 | } else if (!dlm_is_host_down(ret)) { | 1365 | } else if (!dlm_is_host_down(ret)) { |
1364 | /* not a network error. bad. */ | 1366 | /* not a network error. bad. */ |
1365 | mlog_errno(ret); | 1367 | mlog_errno(ret); |
1366 | mlog(ML_ERROR, "unhandled error!"); | 1368 | mlog(ML_ERROR, "unhandled error!"); |
1367 | BUG(); | 1369 | BUG(); |
1368 | } | 1370 | } |
1369 | /* all other errors should be network errors, | 1371 | /* all other errors should be network errors, |
1370 | * and likely indicate node death */ | 1372 | * and likely indicate node death */ |
1371 | mlog(ML_ERROR, "link to %d went down!\n", to); | 1373 | mlog(ML_ERROR, "link to %d went down!\n", to); |
1372 | goto out; | 1374 | goto out; |
1373 | } | 1375 | } |
1374 | 1376 | ||
1375 | ret = 0; | 1377 | ret = 0; |
1376 | resend = 0; | 1378 | resend = 0; |
1377 | spin_lock(&mle->spinlock); | 1379 | spin_lock(&mle->spinlock); |
1378 | switch (response) { | 1380 | switch (response) { |
1379 | case DLM_MASTER_RESP_YES: | 1381 | case DLM_MASTER_RESP_YES: |
1380 | set_bit(to, mle->response_map); | 1382 | set_bit(to, mle->response_map); |
1381 | mlog(0, "node %u is the master, response=YES\n", to); | 1383 | mlog(0, "node %u is the master, response=YES\n", to); |
1382 | mlog(0, "%s:%.*s: master node %u now knows I have a " | 1384 | mlog(0, "%s:%.*s: master node %u now knows I have a " |
1383 | "reference\n", dlm->name, res->lockname.len, | 1385 | "reference\n", dlm->name, res->lockname.len, |
1384 | res->lockname.name, to); | 1386 | res->lockname.name, to); |
1385 | mle->master = to; | 1387 | mle->master = to; |
1386 | break; | 1388 | break; |
1387 | case DLM_MASTER_RESP_NO: | 1389 | case DLM_MASTER_RESP_NO: |
1388 | mlog(0, "node %u not master, response=NO\n", to); | 1390 | mlog(0, "node %u not master, response=NO\n", to); |
1389 | set_bit(to, mle->response_map); | 1391 | set_bit(to, mle->response_map); |
1390 | break; | 1392 | break; |
1391 | case DLM_MASTER_RESP_MAYBE: | 1393 | case DLM_MASTER_RESP_MAYBE: |
1392 | mlog(0, "node %u not master, response=MAYBE\n", to); | 1394 | mlog(0, "node %u not master, response=MAYBE\n", to); |
1393 | set_bit(to, mle->response_map); | 1395 | set_bit(to, mle->response_map); |
1394 | set_bit(to, mle->maybe_map); | 1396 | set_bit(to, mle->maybe_map); |
1395 | break; | 1397 | break; |
1396 | case DLM_MASTER_RESP_ERROR: | 1398 | case DLM_MASTER_RESP_ERROR: |
1397 | mlog(0, "node %u hit an error, resending\n", to); | 1399 | mlog(0, "node %u hit an error, resending\n", to); |
1398 | resend = 1; | 1400 | resend = 1; |
1399 | response = 0; | 1401 | response = 0; |
1400 | break; | 1402 | break; |
1401 | default: | 1403 | default: |
1402 | mlog(ML_ERROR, "bad response! %u\n", response); | 1404 | mlog(ML_ERROR, "bad response! %u\n", response); |
1403 | BUG(); | 1405 | BUG(); |
1404 | } | 1406 | } |
1405 | spin_unlock(&mle->spinlock); | 1407 | spin_unlock(&mle->spinlock); |
1406 | if (resend) { | 1408 | if (resend) { |
1407 | /* this is also totally crude */ | 1409 | /* this is also totally crude */ |
1408 | msleep(50); | 1410 | msleep(50); |
1409 | goto again; | 1411 | goto again; |
1410 | } | 1412 | } |
1411 | 1413 | ||
1412 | out: | 1414 | out: |
1413 | return ret; | 1415 | return ret; |
1414 | } | 1416 | } |
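A compressed userspace sketch of the send loop above: transient failures (-ENOMEM locally, or an "error, resend" answer from the peer) are retried after a short sleep, while the three substantive answers only update the response/maybe bookkeeping. The send helper here is a stand-in, not the o2net API:

#include <errno.h>
#include <unistd.h>

enum resp { RESP_YES, RESP_NO, RESP_MAYBE, RESP_ERROR };

/* stand-in for the real network send; here every peer answers "not master" */
static int send_master_request(int to, enum resp *answer)
{
	(void)to;
	*answer = RESP_NO;
	return 0;
}

static int do_master_request(int to, unsigned long *response_map,
			     unsigned long *maybe_map, int *master)
{
	enum resp answer;
	int ret;

again:
	ret = send_master_request(to, &answer);
	if (ret == -ENOMEM) {                /* crude retry, like the msleep(50) */
		usleep(50 * 1000);
		goto again;
	}
	if (ret < 0)
		return ret;                  /* anything else: assume the node died */

	switch (answer) {
	case RESP_YES:
		*master = to;                /* fall through: record the response */
	case RESP_NO:
		*response_map |= 1UL << to;
		break;
	case RESP_MAYBE:
		*response_map |= 1UL << to;
		*maybe_map |= 1UL << to;
		break;
	case RESP_ERROR:
		usleep(50 * 1000);           /* peer was busy, resend */
		goto again;
	}
	return 0;
}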
1415 | 1417 | ||
1416 | /* | 1418 | /* |
1417 | * locks that can be taken here: | 1419 | * locks that can be taken here: |
1418 | * dlm->spinlock | 1420 | * dlm->spinlock |
1419 | * res->spinlock | 1421 | * res->spinlock |
1420 | * mle->spinlock | 1422 | * mle->spinlock |
1421 | * dlm->master_list | 1423 | * dlm->master_list |
1422 | * | 1424 | * |
1423 | * if possible, TRIM THIS DOWN!!! | 1425 | * if possible, TRIM THIS DOWN!!! |
1424 | */ | 1426 | */ |
1425 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, | 1427 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, |
1426 | void **ret_data) | 1428 | void **ret_data) |
1427 | { | 1429 | { |
1428 | u8 response = DLM_MASTER_RESP_MAYBE; | 1430 | u8 response = DLM_MASTER_RESP_MAYBE; |
1429 | struct dlm_ctxt *dlm = data; | 1431 | struct dlm_ctxt *dlm = data; |
1430 | struct dlm_lock_resource *res = NULL; | 1432 | struct dlm_lock_resource *res = NULL; |
1431 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; | 1433 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; |
1432 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; | 1434 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; |
1433 | char *name; | 1435 | char *name; |
1434 | unsigned int namelen, hash; | 1436 | unsigned int namelen, hash; |
1435 | int found, ret; | 1437 | int found, ret; |
1436 | int set_maybe; | 1438 | int set_maybe; |
1437 | int dispatch_assert = 0; | 1439 | int dispatch_assert = 0; |
1438 | 1440 | ||
1439 | if (!dlm_grab(dlm)) | 1441 | if (!dlm_grab(dlm)) |
1440 | return DLM_MASTER_RESP_NO; | 1442 | return DLM_MASTER_RESP_NO; |
1441 | 1443 | ||
1442 | if (!dlm_domain_fully_joined(dlm)) { | 1444 | if (!dlm_domain_fully_joined(dlm)) { |
1443 | response = DLM_MASTER_RESP_NO; | 1445 | response = DLM_MASTER_RESP_NO; |
1444 | goto send_response; | 1446 | goto send_response; |
1445 | } | 1447 | } |
1446 | 1448 | ||
1447 | name = request->name; | 1449 | name = request->name; |
1448 | namelen = request->namelen; | 1450 | namelen = request->namelen; |
1449 | hash = dlm_lockid_hash(name, namelen); | 1451 | hash = dlm_lockid_hash(name, namelen); |
1450 | 1452 | ||
1451 | if (namelen > DLM_LOCKID_NAME_MAX) { | 1453 | if (namelen > DLM_LOCKID_NAME_MAX) { |
1452 | response = DLM_IVBUFLEN; | 1454 | response = DLM_IVBUFLEN; |
1453 | goto send_response; | 1455 | goto send_response; |
1454 | } | 1456 | } |
1455 | 1457 | ||
1456 | way_up_top: | 1458 | way_up_top: |
1457 | spin_lock(&dlm->spinlock); | 1459 | spin_lock(&dlm->spinlock); |
1458 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); | 1460 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
1459 | if (res) { | 1461 | if (res) { |
1460 | spin_unlock(&dlm->spinlock); | 1462 | spin_unlock(&dlm->spinlock); |
1461 | 1463 | ||
1462 | /* take care of the easy cases up front */ | 1464 | /* take care of the easy cases up front */ |
1463 | spin_lock(&res->spinlock); | 1465 | spin_lock(&res->spinlock); |
1464 | if (res->state & (DLM_LOCK_RES_RECOVERING| | 1466 | if (res->state & (DLM_LOCK_RES_RECOVERING| |
1465 | DLM_LOCK_RES_MIGRATING)) { | 1467 | DLM_LOCK_RES_MIGRATING)) { |
1466 | spin_unlock(&res->spinlock); | 1468 | spin_unlock(&res->spinlock); |
1467 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " | 1469 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " |
1468 | "being recovered/migrated\n"); | 1470 | "being recovered/migrated\n"); |
1469 | response = DLM_MASTER_RESP_ERROR; | 1471 | response = DLM_MASTER_RESP_ERROR; |
1470 | if (mle) | 1472 | if (mle) |
1471 | kmem_cache_free(dlm_mle_cache, mle); | 1473 | kmem_cache_free(dlm_mle_cache, mle); |
1472 | goto send_response; | 1474 | goto send_response; |
1473 | } | 1475 | } |
1474 | 1476 | ||
1475 | if (res->owner == dlm->node_num) { | 1477 | if (res->owner == dlm->node_num) { |
1476 | dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); | 1478 | dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); |
1477 | spin_unlock(&res->spinlock); | 1479 | spin_unlock(&res->spinlock); |
1478 | response = DLM_MASTER_RESP_YES; | 1480 | response = DLM_MASTER_RESP_YES; |
1479 | if (mle) | 1481 | if (mle) |
1480 | kmem_cache_free(dlm_mle_cache, mle); | 1482 | kmem_cache_free(dlm_mle_cache, mle); |
1481 | 1483 | ||
1482 | /* this node is the owner. | 1484 | /* this node is the owner. |
1483 | * there is some extra work that needs to | 1485 | * there is some extra work that needs to |
1484 | * happen now. the requesting node has | 1486 | * happen now. the requesting node has |
1485 | * caused all nodes up to this one to | 1487 | * caused all nodes up to this one to |
1486 | * create mles. this node now needs to | 1488 | * create mles. this node now needs to |
1487 | * go back and clean those up. */ | 1489 | * go back and clean those up. */ |
1488 | dispatch_assert = 1; | 1490 | dispatch_assert = 1; |
1489 | goto send_response; | 1491 | goto send_response; |
1490 | } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | 1492 | } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
1491 | spin_unlock(&res->spinlock); | 1493 | spin_unlock(&res->spinlock); |
1492 | // mlog(0, "node %u is the master\n", res->owner); | 1494 | // mlog(0, "node %u is the master\n", res->owner); |
1493 | response = DLM_MASTER_RESP_NO; | 1495 | response = DLM_MASTER_RESP_NO; |
1494 | if (mle) | 1496 | if (mle) |
1495 | kmem_cache_free(dlm_mle_cache, mle); | 1497 | kmem_cache_free(dlm_mle_cache, mle); |
1496 | goto send_response; | 1498 | goto send_response; |
1497 | } | 1499 | } |
1498 | 1500 | ||
1499 | /* ok, there is no owner. either this node is | 1501 | /* ok, there is no owner. either this node is |
1500 | * being blocked, or it is actively trying to | 1502 | * being blocked, or it is actively trying to |
1501 | * master this lock. */ | 1503 | * master this lock. */ |
1502 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { | 1504 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { |
1503 | mlog(ML_ERROR, "lock with no owner should be " | 1505 | mlog(ML_ERROR, "lock with no owner should be " |
1504 | "in-progress!\n"); | 1506 | "in-progress!\n"); |
1505 | BUG(); | 1507 | BUG(); |
1506 | } | 1508 | } |
1507 | 1509 | ||
1508 | // mlog(0, "lockres is in progress...\n"); | 1510 | // mlog(0, "lockres is in progress...\n"); |
1509 | spin_lock(&dlm->master_lock); | 1511 | spin_lock(&dlm->master_lock); |
1510 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); | 1512 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); |
1511 | if (!found) { | 1513 | if (!found) { |
1512 | mlog(ML_ERROR, "no mle found for this lock!\n"); | 1514 | mlog(ML_ERROR, "no mle found for this lock!\n"); |
1513 | BUG(); | 1515 | BUG(); |
1514 | } | 1516 | } |
1515 | set_maybe = 1; | 1517 | set_maybe = 1; |
1516 | spin_lock(&tmpmle->spinlock); | 1518 | spin_lock(&tmpmle->spinlock); |
1517 | if (tmpmle->type == DLM_MLE_BLOCK) { | 1519 | if (tmpmle->type == DLM_MLE_BLOCK) { |
1518 | // mlog(0, "this node is waiting for " | 1520 | // mlog(0, "this node is waiting for " |
1519 | // "lockres to be mastered\n"); | 1521 | // "lockres to be mastered\n"); |
1520 | response = DLM_MASTER_RESP_NO; | 1522 | response = DLM_MASTER_RESP_NO; |
1521 | } else if (tmpmle->type == DLM_MLE_MIGRATION) { | 1523 | } else if (tmpmle->type == DLM_MLE_MIGRATION) { |
1522 | mlog(0, "node %u is master, but trying to migrate to " | 1524 | mlog(0, "node %u is master, but trying to migrate to " |
1523 | "node %u.\n", tmpmle->master, tmpmle->new_master); | 1525 | "node %u.\n", tmpmle->master, tmpmle->new_master); |
1524 | if (tmpmle->master == dlm->node_num) { | 1526 | if (tmpmle->master == dlm->node_num) { |
1525 | mlog(ML_ERROR, "no owner on lockres, but this " | 1527 | mlog(ML_ERROR, "no owner on lockres, but this " |
1526 | "node is trying to migrate it to %u?!\n", | 1528 | "node is trying to migrate it to %u?!\n", |
1527 | tmpmle->new_master); | 1529 | tmpmle->new_master); |
1528 | BUG(); | 1530 | BUG(); |
1529 | } else { | 1531 | } else { |
1530 | /* the real master can respond on its own */ | 1532 | /* the real master can respond on its own */ |
1531 | response = DLM_MASTER_RESP_NO; | 1533 | response = DLM_MASTER_RESP_NO; |
1532 | } | 1534 | } |
1533 | } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { | 1535 | } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { |
1534 | set_maybe = 0; | 1536 | set_maybe = 0; |
1535 | if (tmpmle->master == dlm->node_num) { | 1537 | if (tmpmle->master == dlm->node_num) { |
1536 | response = DLM_MASTER_RESP_YES; | 1538 | response = DLM_MASTER_RESP_YES; |
1537 | /* this node will be the owner. | 1539 | /* this node will be the owner. |
1538 | * go back and clean the mles on any | 1540 | * go back and clean the mles on any |
1539 | * other nodes */ | 1541 | * other nodes */ |
1540 | dispatch_assert = 1; | 1542 | dispatch_assert = 1; |
1541 | dlm_lockres_set_refmap_bit(dlm, res, | 1543 | dlm_lockres_set_refmap_bit(dlm, res, |
1542 | request->node_idx); | 1544 | request->node_idx); |
1543 | } else | 1545 | } else |
1544 | response = DLM_MASTER_RESP_NO; | 1546 | response = DLM_MASTER_RESP_NO; |
1545 | } else { | 1547 | } else { |
1546 | // mlog(0, "this node is attempting to " | 1548 | // mlog(0, "this node is attempting to " |
1547 | // "master lockres\n"); | 1549 | // "master lockres\n"); |
1548 | response = DLM_MASTER_RESP_MAYBE; | 1550 | response = DLM_MASTER_RESP_MAYBE; |
1549 | } | 1551 | } |
1550 | if (set_maybe) | 1552 | if (set_maybe) |
1551 | set_bit(request->node_idx, tmpmle->maybe_map); | 1553 | set_bit(request->node_idx, tmpmle->maybe_map); |
1552 | spin_unlock(&tmpmle->spinlock); | 1554 | spin_unlock(&tmpmle->spinlock); |
1553 | 1555 | ||
1554 | spin_unlock(&dlm->master_lock); | 1556 | spin_unlock(&dlm->master_lock); |
1555 | spin_unlock(&res->spinlock); | 1557 | spin_unlock(&res->spinlock); |
1556 | 1558 | ||
1557 | /* keep the mle attached to heartbeat events */ | 1559 | /* keep the mle attached to heartbeat events */ |
1558 | dlm_put_mle(tmpmle); | 1560 | dlm_put_mle(tmpmle); |
1559 | if (mle) | 1561 | if (mle) |
1560 | kmem_cache_free(dlm_mle_cache, mle); | 1562 | kmem_cache_free(dlm_mle_cache, mle); |
1561 | goto send_response; | 1563 | goto send_response; |
1562 | } | 1564 | } |
1563 | 1565 | ||
1564 | /* | 1566 | /* |
1565 | * lockres doesn't exist on this node | 1567 | * lockres doesn't exist on this node |
1566 | * if there is an MLE_BLOCK, return NO | 1568 | * if there is an MLE_BLOCK, return NO |
1567 | * if there is an MLE_MASTER, return MAYBE | 1569 | * if there is an MLE_MASTER, return MAYBE |
1568 | * otherwise, add an MLE_BLOCK, return NO | 1570 | * otherwise, add an MLE_BLOCK, return NO |
1569 | */ | 1571 | */ |
1570 | spin_lock(&dlm->master_lock); | 1572 | spin_lock(&dlm->master_lock); |
1571 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); | 1573 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); |
1572 | if (!found) { | 1574 | if (!found) { |
1573 | /* this lockid has never been seen on this node yet */ | 1575 | /* this lockid has never been seen on this node yet */ |
1574 | // mlog(0, "no mle found\n"); | 1576 | // mlog(0, "no mle found\n"); |
1575 | if (!mle) { | 1577 | if (!mle) { |
1576 | spin_unlock(&dlm->master_lock); | 1578 | spin_unlock(&dlm->master_lock); |
1577 | spin_unlock(&dlm->spinlock); | 1579 | spin_unlock(&dlm->spinlock); |
1578 | 1580 | ||
1579 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); | 1581 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
1580 | if (!mle) { | 1582 | if (!mle) { |
1581 | response = DLM_MASTER_RESP_ERROR; | 1583 | response = DLM_MASTER_RESP_ERROR; |
1582 | mlog_errno(-ENOMEM); | 1584 | mlog_errno(-ENOMEM); |
1583 | goto send_response; | 1585 | goto send_response; |
1584 | } | 1586 | } |
1585 | goto way_up_top; | 1587 | goto way_up_top; |
1586 | } | 1588 | } |
1587 | 1589 | ||
1588 | // mlog(0, "this is second time thru, already allocated, " | 1590 | // mlog(0, "this is second time thru, already allocated, " |
1589 | // "add the block.\n"); | 1591 | // "add the block.\n"); |
1590 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); | 1592 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); |
1591 | set_bit(request->node_idx, mle->maybe_map); | 1593 | set_bit(request->node_idx, mle->maybe_map); |
1592 | __dlm_insert_mle(dlm, mle); | 1594 | __dlm_insert_mle(dlm, mle); |
1593 | response = DLM_MASTER_RESP_NO; | 1595 | response = DLM_MASTER_RESP_NO; |
1594 | } else { | 1596 | } else { |
1595 | // mlog(0, "mle was found\n"); | 1597 | // mlog(0, "mle was found\n"); |
1596 | set_maybe = 1; | 1598 | set_maybe = 1; |
1597 | spin_lock(&tmpmle->spinlock); | 1599 | spin_lock(&tmpmle->spinlock); |
1598 | if (tmpmle->master == dlm->node_num) { | 1600 | if (tmpmle->master == dlm->node_num) { |
1599 | mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); | 1601 | mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); |
1600 | BUG(); | 1602 | BUG(); |
1601 | } | 1603 | } |
1602 | if (tmpmle->type == DLM_MLE_BLOCK) | 1604 | if (tmpmle->type == DLM_MLE_BLOCK) |
1603 | response = DLM_MASTER_RESP_NO; | 1605 | response = DLM_MASTER_RESP_NO; |
1604 | else if (tmpmle->type == DLM_MLE_MIGRATION) { | 1606 | else if (tmpmle->type == DLM_MLE_MIGRATION) { |
1605 | mlog(0, "migration mle was found (%u->%u)\n", | 1607 | mlog(0, "migration mle was found (%u->%u)\n", |
1606 | tmpmle->master, tmpmle->new_master); | 1608 | tmpmle->master, tmpmle->new_master); |
1607 | /* real master can respond on its own */ | 1609 | /* real master can respond on its own */ |
1608 | response = DLM_MASTER_RESP_NO; | 1610 | response = DLM_MASTER_RESP_NO; |
1609 | } else | 1611 | } else |
1610 | response = DLM_MASTER_RESP_MAYBE; | 1612 | response = DLM_MASTER_RESP_MAYBE; |
1611 | if (set_maybe) | 1613 | if (set_maybe) |
1612 | set_bit(request->node_idx, tmpmle->maybe_map); | 1614 | set_bit(request->node_idx, tmpmle->maybe_map); |
1613 | spin_unlock(&tmpmle->spinlock); | 1615 | spin_unlock(&tmpmle->spinlock); |
1614 | } | 1616 | } |
1615 | spin_unlock(&dlm->master_lock); | 1617 | spin_unlock(&dlm->master_lock); |
1616 | spin_unlock(&dlm->spinlock); | 1618 | spin_unlock(&dlm->spinlock); |
1617 | 1619 | ||
1618 | if (found) { | 1620 | if (found) { |
1619 | /* keep the mle attached to heartbeat events */ | 1621 | /* keep the mle attached to heartbeat events */ |
1620 | dlm_put_mle(tmpmle); | 1622 | dlm_put_mle(tmpmle); |
1621 | } | 1623 | } |
1622 | send_response: | 1624 | send_response: |
1623 | /* | 1625 | /* |
1624 | * __dlm_lookup_lockres() grabbed a reference to this lockres. | 1626 | * __dlm_lookup_lockres() grabbed a reference to this lockres. |
1625 | * The reference is released by dlm_assert_master_worker() under | 1627 | * The reference is released by dlm_assert_master_worker() under |
1626 | * the call to dlm_dispatch_assert_master(). If | 1628 | * the call to dlm_dispatch_assert_master(). If |
1627 | * dlm_assert_master_worker() isn't called, we drop it here. | 1629 | * dlm_assert_master_worker() isn't called, we drop it here. |
1628 | */ | 1630 | */ |
1629 | if (dispatch_assert) { | 1631 | if (dispatch_assert) { |
1630 | if (response != DLM_MASTER_RESP_YES) | 1632 | if (response != DLM_MASTER_RESP_YES) |
1631 | mlog(ML_ERROR, "invalid response %d\n", response); | 1633 | mlog(ML_ERROR, "invalid response %d\n", response); |
1632 | if (!res) { | 1634 | if (!res) { |
1633 | mlog(ML_ERROR, "bad lockres while trying to assert!\n"); | 1635 | mlog(ML_ERROR, "bad lockres while trying to assert!\n"); |
1634 | BUG(); | 1636 | BUG(); |
1635 | } | 1637 | } |
1636 | mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", | 1638 | mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", |
1637 | dlm->node_num, res->lockname.len, res->lockname.name); | 1639 | dlm->node_num, res->lockname.len, res->lockname.name); |
1638 | ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, | 1640 | ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, |
1639 | DLM_ASSERT_MASTER_MLE_CLEANUP); | 1641 | DLM_ASSERT_MASTER_MLE_CLEANUP); |
1640 | if (ret < 0) { | 1642 | if (ret < 0) { |
1641 | mlog(ML_ERROR, "failed to dispatch assert master work\n"); | 1643 | mlog(ML_ERROR, "failed to dispatch assert master work\n"); |
1642 | response = DLM_MASTER_RESP_ERROR; | 1644 | response = DLM_MASTER_RESP_ERROR; |
1643 | dlm_lockres_put(res); | 1645 | dlm_lockres_put(res); |
1644 | } else | 1646 | } else |
1645 | dlm_lockres_grab_inflight_worker(dlm, res); | 1647 | dlm_lockres_grab_inflight_worker(dlm, res); |
1646 | } else { | 1648 | } else { |
1647 | if (res) | 1649 | if (res) |
1648 | dlm_lockres_put(res); | 1650 | dlm_lockres_put(res); |
1649 | } | 1651 | } |
1650 | 1652 | ||
1651 | dlm_put(dlm); | 1653 | dlm_put(dlm); |
1652 | return response; | 1654 | return response; |
1653 | } | 1655 | } |
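Stripped of the locking, refmap updates and assert dispatch, the handler above effectively computes its reply from three inputs: whether the lockres exists locally, who (if anyone) owns it, and what kind of mle this node holds. A deliberately simplified userspace sketch of that decision, with hypothetical types:

#include <stdbool.h>

enum resp { RESP_YES, RESP_NO, RESP_MAYBE, RESP_ERROR };
enum mle_kind { MLE_NONE, MLE_BLOCK, MLE_MASTER, MLE_MIGRATION };

struct lockres_view {
	bool exists;        /* lockres found in the local hash           */
	bool in_recovery;   /* RECOVERING or MIGRATING state bit set     */
	bool owner_known;
	bool owner_is_me;
	enum mle_kind mle;  /* what kind of mle, if any, this node holds */
};

static enum resp master_request_answer(const struct lockres_view *v)
{
	if (v->exists) {
		if (v->in_recovery)
			return RESP_ERROR;   /* requester retries later           */
		if (v->owner_is_me)
			return RESP_YES;     /* then assert to everyone else      */
		if (v->owner_known)
			return RESP_NO;      /* the real owner answers for itself */
	}
	switch (v->mle) {                    /* no usable owner: consult the mle  */
	case MLE_MASTER:
		return RESP_MAYBE;           /* this node is itself competing     */
	case MLE_BLOCK:
	case MLE_MIGRATION:
		return RESP_NO;
	default:
		return RESP_NO;              /* a blocking mle would be created   */
	}
}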
1654 | 1656 | ||
1655 | /* | 1657 | /* |
1656 | * DLM_ASSERT_MASTER_MSG | 1658 | * DLM_ASSERT_MASTER_MSG |
1657 | */ | 1659 | */ |
1658 | 1660 | ||
1659 | 1661 | ||
1660 | /* | 1662 | /* |
1661 | * NOTE: this can be used for debugging | 1663 | * NOTE: this can be used for debugging |
1662 | * can periodically run all locks owned by this node | 1664 | * can periodically run all locks owned by this node |
1663 | * and re-assert across the cluster... | 1665 | * and re-assert across the cluster... |
1664 | */ | 1666 | */ |
1665 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, | 1667 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, |
1666 | struct dlm_lock_resource *res, | 1668 | struct dlm_lock_resource *res, |
1667 | void *nodemap, u32 flags) | 1669 | void *nodemap, u32 flags) |
1668 | { | 1670 | { |
1669 | struct dlm_assert_master assert; | 1671 | struct dlm_assert_master assert; |
1670 | int to, tmpret; | 1672 | int to, tmpret; |
1671 | struct dlm_node_iter iter; | 1673 | struct dlm_node_iter iter; |
1672 | int ret = 0; | 1674 | int ret = 0; |
1673 | int reassert; | 1675 | int reassert; |
1674 | const char *lockname = res->lockname.name; | 1676 | const char *lockname = res->lockname.name; |
1675 | unsigned int namelen = res->lockname.len; | 1677 | unsigned int namelen = res->lockname.len; |
1676 | 1678 | ||
1677 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | 1679 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
1678 | 1680 | ||
1679 | spin_lock(&res->spinlock); | 1681 | spin_lock(&res->spinlock); |
1680 | res->state |= DLM_LOCK_RES_SETREF_INPROG; | 1682 | res->state |= DLM_LOCK_RES_SETREF_INPROG; |
1681 | spin_unlock(&res->spinlock); | 1683 | spin_unlock(&res->spinlock); |
1682 | 1684 | ||
1683 | again: | 1685 | again: |
1684 | reassert = 0; | 1686 | reassert = 0; |
1685 | 1687 | ||
1686 | /* note that if this nodemap is empty, it returns 0 */ | 1688 | /* note that if this nodemap is empty, it returns 0 */ |
1687 | dlm_node_iter_init(nodemap, &iter); | 1689 | dlm_node_iter_init(nodemap, &iter); |
1688 | while ((to = dlm_node_iter_next(&iter)) >= 0) { | 1690 | while ((to = dlm_node_iter_next(&iter)) >= 0) { |
1689 | int r = 0; | 1691 | int r = 0; |
1690 | struct dlm_master_list_entry *mle = NULL; | 1692 | struct dlm_master_list_entry *mle = NULL; |
1691 | 1693 | ||
1692 | mlog(0, "sending assert master to %d (%.*s)\n", to, | 1694 | mlog(0, "sending assert master to %d (%.*s)\n", to, |
1693 | namelen, lockname); | 1695 | namelen, lockname); |
1694 | memset(&assert, 0, sizeof(assert)); | 1696 | memset(&assert, 0, sizeof(assert)); |
1695 | assert.node_idx = dlm->node_num; | 1697 | assert.node_idx = dlm->node_num; |
1696 | assert.namelen = namelen; | 1698 | assert.namelen = namelen; |
1697 | memcpy(assert.name, lockname, namelen); | 1699 | memcpy(assert.name, lockname, namelen); |
1698 | assert.flags = cpu_to_be32(flags); | 1700 | assert.flags = cpu_to_be32(flags); |
1699 | 1701 | ||
1700 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, | 1702 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, |
1701 | &assert, sizeof(assert), to, &r); | 1703 | &assert, sizeof(assert), to, &r); |
1702 | if (tmpret < 0) { | 1704 | if (tmpret < 0) { |
1703 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 1705 | mlog(ML_ERROR, "Error %d when sending message %u (key " |
1704 | "0x%x) to node %u\n", tmpret, | 1706 | "0x%x) to node %u\n", tmpret, |
1705 | DLM_ASSERT_MASTER_MSG, dlm->key, to); | 1707 | DLM_ASSERT_MASTER_MSG, dlm->key, to); |
1706 | if (!dlm_is_host_down(tmpret)) { | 1708 | if (!dlm_is_host_down(tmpret)) { |
1707 | mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); | 1709 | mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); |
1708 | BUG(); | 1710 | BUG(); |
1709 | } | 1711 | } |
1710 | /* a node died. finish out the rest of the nodes. */ | 1712 | /* a node died. finish out the rest of the nodes. */ |
1711 | mlog(0, "link to %d went down!\n", to); | 1713 | mlog(0, "link to %d went down!\n", to); |
1712 | /* any nonzero status return will do */ | 1714 | /* any nonzero status return will do */ |
1713 | ret = tmpret; | 1715 | ret = tmpret; |
1714 | r = 0; | 1716 | r = 0; |
1715 | } else if (r < 0) { | 1717 | } else if (r < 0) { |
1716 | /* ok, something horribly messed. kill thyself. */ | 1718 | /* ok, something horribly messed. kill thyself. */ |
1717 | mlog(ML_ERROR,"during assert master of %.*s to %u, " | 1719 | mlog(ML_ERROR,"during assert master of %.*s to %u, " |
1718 | "got %d.\n", namelen, lockname, to, r); | 1720 | "got %d.\n", namelen, lockname, to, r); |
1719 | spin_lock(&dlm->spinlock); | 1721 | spin_lock(&dlm->spinlock); |
1720 | spin_lock(&dlm->master_lock); | 1722 | spin_lock(&dlm->master_lock); |
1721 | if (dlm_find_mle(dlm, &mle, (char *)lockname, | 1723 | if (dlm_find_mle(dlm, &mle, (char *)lockname, |
1722 | namelen)) { | 1724 | namelen)) { |
1723 | dlm_print_one_mle(mle); | 1725 | dlm_print_one_mle(mle); |
1724 | __dlm_put_mle(mle); | 1726 | __dlm_put_mle(mle); |
1725 | } | 1727 | } |
1726 | spin_unlock(&dlm->master_lock); | 1728 | spin_unlock(&dlm->master_lock); |
1727 | spin_unlock(&dlm->spinlock); | 1729 | spin_unlock(&dlm->spinlock); |
1728 | BUG(); | 1730 | BUG(); |
1729 | } | 1731 | } |
1730 | 1732 | ||
1731 | if (r & DLM_ASSERT_RESPONSE_REASSERT && | 1733 | if (r & DLM_ASSERT_RESPONSE_REASSERT && |
1732 | !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { | 1734 | !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { |
1733 | mlog(ML_ERROR, "%.*s: very strange, " | 1735 | mlog(ML_ERROR, "%.*s: very strange, " |
1734 | "master MLE but no lockres on %u\n", | 1736 | "master MLE but no lockres on %u\n", |
1735 | namelen, lockname, to); | 1737 | namelen, lockname, to); |
1736 | } | 1738 | } |
1737 | 1739 | ||
1738 | if (r & DLM_ASSERT_RESPONSE_REASSERT) { | 1740 | if (r & DLM_ASSERT_RESPONSE_REASSERT) { |
1739 | mlog(0, "%.*s: node %u create mles on other " | 1741 | mlog(0, "%.*s: node %u create mles on other " |
1740 | "nodes and requests a re-assert\n", | 1742 | "nodes and requests a re-assert\n", |
1741 | namelen, lockname, to); | 1743 | namelen, lockname, to); |
1742 | reassert = 1; | 1744 | reassert = 1; |
1743 | } | 1745 | } |
1744 | if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { | 1746 | if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { |
1745 | mlog(0, "%.*s: node %u has a reference to this " | 1747 | mlog(0, "%.*s: node %u has a reference to this " |
1746 | "lockres, set the bit in the refmap\n", | 1748 | "lockres, set the bit in the refmap\n", |
1747 | namelen, lockname, to); | 1749 | namelen, lockname, to); |
1748 | spin_lock(&res->spinlock); | 1750 | spin_lock(&res->spinlock); |
1749 | dlm_lockres_set_refmap_bit(dlm, res, to); | 1751 | dlm_lockres_set_refmap_bit(dlm, res, to); |
1750 | spin_unlock(&res->spinlock); | 1752 | spin_unlock(&res->spinlock); |
1751 | } | 1753 | } |
1752 | } | 1754 | } |
1753 | 1755 | ||
1754 | if (reassert) | 1756 | if (reassert) |
1755 | goto again; | 1757 | goto again; |
1756 | 1758 | ||
1757 | spin_lock(&res->spinlock); | 1759 | spin_lock(&res->spinlock); |
1758 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; | 1760 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; |
1759 | spin_unlock(&res->spinlock); | 1761 | spin_unlock(&res->spinlock); |
1760 | wake_up(&res->wq); | 1762 | wake_up(&res->wq); |
1761 | 1763 | ||
1762 | return ret; | 1764 | return ret; |
1763 | } | 1765 | } |
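A userspace sketch of the broadcast-and-reassert loop above: every node in the map gets the assert, a send failure is treated as node death and skipped, and the whole pass repeats if any peer requested a re-assert; peers reporting a mastery reference get their bit recorded in a refmap. All helpers here are stand-ins, not the o2net API:

#define MAX_NODES 64
#define RESP_REASSERT    0x1
#define RESP_MASTERY_REF 0x2

/* stand-in for the real message send; here every peer simply acks */
static int send_assert(int to)
{
	(void)to;
	return 0;                 /* response flag bits, or a negative errno */
}

static int assert_master_to_all(unsigned long nodemap, unsigned long *refmap)
{
	int to, r, ret = 0, reassert;

again:
	reassert = 0;
	for (to = 0; to < MAX_NODES; to++) {
		if (!(nodemap & (1UL << to)))
			continue;
		r = send_assert(to);
		if (r < 0) {
			ret = r;              /* node likely died; keep going   */
			continue;
		}
		if (r & RESP_REASSERT)
			reassert = 1;         /* stale mles remain somewhere    */
		if (r & RESP_MASTERY_REF)
			*refmap |= 1UL << to; /* peer holds a lockres reference */
	}
	if (reassert)
		goto again;
	return ret;
}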
1764 | 1766 | ||
1765 | /* | 1767 | /* |
1766 | * locks that can be taken here: | 1768 | * locks that can be taken here: |
1767 | * dlm->spinlock | 1769 | * dlm->spinlock |
1768 | * res->spinlock | 1770 | * res->spinlock |
1769 | * mle->spinlock | 1771 | * mle->spinlock |
1770 | * dlm->master_list | 1772 | * dlm->master_list |
1771 | * | 1773 | * |
1772 | * if possible, TRIM THIS DOWN!!! | 1774 | * if possible, TRIM THIS DOWN!!! |
1773 | */ | 1775 | */ |
1774 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, | 1776 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, |
1775 | void **ret_data) | 1777 | void **ret_data) |
1776 | { | 1778 | { |
1777 | struct dlm_ctxt *dlm = data; | 1779 | struct dlm_ctxt *dlm = data; |
1778 | struct dlm_master_list_entry *mle = NULL; | 1780 | struct dlm_master_list_entry *mle = NULL; |
1779 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; | 1781 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; |
1780 | struct dlm_lock_resource *res = NULL; | 1782 | struct dlm_lock_resource *res = NULL; |
1781 | char *name; | 1783 | char *name; |
1782 | unsigned int namelen, hash; | 1784 | unsigned int namelen, hash; |
1783 | u32 flags; | 1785 | u32 flags; |
1784 | int master_request = 0, have_lockres_ref = 0; | 1786 | int master_request = 0, have_lockres_ref = 0; |
1785 | int ret = 0; | 1787 | int ret = 0; |
1786 | 1788 | ||
1787 | if (!dlm_grab(dlm)) | 1789 | if (!dlm_grab(dlm)) |
1788 | return 0; | 1790 | return 0; |
1789 | 1791 | ||
1790 | name = assert->name; | 1792 | name = assert->name; |
1791 | namelen = assert->namelen; | 1793 | namelen = assert->namelen; |
1792 | hash = dlm_lockid_hash(name, namelen); | 1794 | hash = dlm_lockid_hash(name, namelen); |
1793 | flags = be32_to_cpu(assert->flags); | 1795 | flags = be32_to_cpu(assert->flags); |
1794 | 1796 | ||
1795 | if (namelen > DLM_LOCKID_NAME_MAX) { | 1797 | if (namelen > DLM_LOCKID_NAME_MAX) { |
1796 | mlog(ML_ERROR, "Invalid name length!"); | 1798 | mlog(ML_ERROR, "Invalid name length!"); |
1797 | goto done; | 1799 | goto done; |
1798 | } | 1800 | } |
1799 | 1801 | ||
1800 | spin_lock(&dlm->spinlock); | 1802 | spin_lock(&dlm->spinlock); |
1801 | 1803 | ||
1802 | if (flags) | 1804 | if (flags) |
1803 | mlog(0, "assert_master with flags: %u\n", flags); | 1805 | mlog(0, "assert_master with flags: %u\n", flags); |
1804 | 1806 | ||
1805 | /* find the MLE */ | 1807 | /* find the MLE */ |
1806 | spin_lock(&dlm->master_lock); | 1808 | spin_lock(&dlm->master_lock); |
1807 | if (!dlm_find_mle(dlm, &mle, name, namelen)) { | 1809 | if (!dlm_find_mle(dlm, &mle, name, namelen)) { |
1808 | /* not an error, could be master just re-asserting */ | 1810 | /* not an error, could be master just re-asserting */ |
1809 | mlog(0, "just got an assert_master from %u, but no " | 1811 | mlog(0, "just got an assert_master from %u, but no " |
1810 | "MLE for it! (%.*s)\n", assert->node_idx, | 1812 | "MLE for it! (%.*s)\n", assert->node_idx, |
1811 | namelen, name); | 1813 | namelen, name); |
1812 | } else { | 1814 | } else { |
1813 | int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); | 1815 | int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); |
1814 | if (bit >= O2NM_MAX_NODES) { | 1816 | if (bit >= O2NM_MAX_NODES) { |
1815 | /* not necessarily an error, though less likely. | 1817 | /* not necessarily an error, though less likely. |
1816 | * could be master just re-asserting. */ | 1818 | * could be master just re-asserting. */ |
1817 | mlog(0, "no bits set in the maybe_map, but %u " | 1819 | mlog(0, "no bits set in the maybe_map, but %u " |
1818 | "is asserting! (%.*s)\n", assert->node_idx, | 1820 | "is asserting! (%.*s)\n", assert->node_idx, |
1819 | namelen, name); | 1821 | namelen, name); |
1820 | } else if (bit != assert->node_idx) { | 1822 | } else if (bit != assert->node_idx) { |
1821 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { | 1823 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { |
1822 | mlog(0, "master %u was found, %u should " | 1824 | mlog(0, "master %u was found, %u should " |
1823 | "back off\n", assert->node_idx, bit); | 1825 | "back off\n", assert->node_idx, bit); |
1824 | } else { | 1826 | } else { |
1825 | /* with the fix for bug 569, a higher node | 1827 | /* with the fix for bug 569, a higher node |
1826 | * number winning the mastery will respond | 1828 | * number winning the mastery will respond |
1827 | * YES to mastery requests, but this node | 1829 | * YES to mastery requests, but this node |
1828 | * had no way of knowing. let it pass. */ | 1830 | * had no way of knowing. let it pass. */ |
1829 | mlog(0, "%u is the lowest node, " | 1831 | mlog(0, "%u is the lowest node, " |
1830 | "%u is asserting. (%.*s) %u must " | 1832 | "%u is asserting. (%.*s) %u must " |
1831 | "have begun after %u won.\n", bit, | 1833 | "have begun after %u won.\n", bit, |
1832 | assert->node_idx, namelen, name, bit, | 1834 | assert->node_idx, namelen, name, bit, |
1833 | assert->node_idx); | 1835 | assert->node_idx); |
1834 | } | 1836 | } |
1835 | } | 1837 | } |
1836 | if (mle->type == DLM_MLE_MIGRATION) { | 1838 | if (mle->type == DLM_MLE_MIGRATION) { |
1837 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { | 1839 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { |
1838 | mlog(0, "%s:%.*s: got cleanup assert" | 1840 | mlog(0, "%s:%.*s: got cleanup assert" |
1839 | " from %u for migration\n", | 1841 | " from %u for migration\n", |
1840 | dlm->name, namelen, name, | 1842 | dlm->name, namelen, name, |
1841 | assert->node_idx); | 1843 | assert->node_idx); |
1842 | } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { | 1844 | } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { |
1843 | mlog(0, "%s:%.*s: got unrelated assert" | 1845 | mlog(0, "%s:%.*s: got unrelated assert" |
1844 | " from %u for migration, ignoring\n", | 1846 | " from %u for migration, ignoring\n", |
1845 | dlm->name, namelen, name, | 1847 | dlm->name, namelen, name, |
1846 | assert->node_idx); | 1848 | assert->node_idx); |
1847 | __dlm_put_mle(mle); | 1849 | __dlm_put_mle(mle); |
1848 | spin_unlock(&dlm->master_lock); | 1850 | spin_unlock(&dlm->master_lock); |
1849 | spin_unlock(&dlm->spinlock); | 1851 | spin_unlock(&dlm->spinlock); |
1850 | goto done; | 1852 | goto done; |
1851 | } | 1853 | } |
1852 | } | 1854 | } |
1853 | } | 1855 | } |
1854 | spin_unlock(&dlm->master_lock); | 1856 | spin_unlock(&dlm->master_lock); |
1855 | 1857 | ||
1856 | /* ok everything checks out with the MLE | 1858 | /* ok everything checks out with the MLE |
1857 | * now check to see if there is a lockres */ | 1859 | * now check to see if there is a lockres */ |
1858 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); | 1860 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
1859 | if (res) { | 1861 | if (res) { |
1860 | spin_lock(&res->spinlock); | 1862 | spin_lock(&res->spinlock); |
1861 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 1863 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
1862 | mlog(ML_ERROR, "%u asserting but %.*s is " | 1864 | mlog(ML_ERROR, "%u asserting but %.*s is " |
1863 | "RECOVERING!\n", assert->node_idx, namelen, name); | 1865 | "RECOVERING!\n", assert->node_idx, namelen, name); |
1864 | goto kill; | 1866 | goto kill; |
1865 | } | 1867 | } |
1866 | if (!mle) { | 1868 | if (!mle) { |
1867 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && | 1869 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && |
1868 | res->owner != assert->node_idx) { | 1870 | res->owner != assert->node_idx) { |
1869 | mlog(ML_ERROR, "DIE! Mastery assert from %u, " | 1871 | mlog(ML_ERROR, "DIE! Mastery assert from %u, " |
1870 | "but current owner is %u! (%.*s)\n", | 1872 | "but current owner is %u! (%.*s)\n", |
1871 | assert->node_idx, res->owner, namelen, | 1873 | assert->node_idx, res->owner, namelen, |
1872 | name); | 1874 | name); |
1873 | __dlm_print_one_lock_resource(res); | 1875 | __dlm_print_one_lock_resource(res); |
1874 | BUG(); | 1876 | BUG(); |
1875 | } | 1877 | } |
1876 | } else if (mle->type != DLM_MLE_MIGRATION) { | 1878 | } else if (mle->type != DLM_MLE_MIGRATION) { |
1877 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | 1879 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
1878 | /* owner is just re-asserting */ | 1880 | /* owner is just re-asserting */ |
1879 | if (res->owner == assert->node_idx) { | 1881 | if (res->owner == assert->node_idx) { |
1880 | mlog(0, "owner %u re-asserting on " | 1882 | mlog(0, "owner %u re-asserting on " |
1881 | "lock %.*s\n", assert->node_idx, | 1883 | "lock %.*s\n", assert->node_idx, |
1882 | namelen, name); | 1884 | namelen, name); |
1883 | goto ok; | 1885 | goto ok; |
1884 | } | 1886 | } |
1885 | mlog(ML_ERROR, "got assert_master from " | 1887 | mlog(ML_ERROR, "got assert_master from " |
1886 | "node %u, but %u is the owner! " | 1888 | "node %u, but %u is the owner! " |
1887 | "(%.*s)\n", assert->node_idx, | 1889 | "(%.*s)\n", assert->node_idx, |
1888 | res->owner, namelen, name); | 1890 | res->owner, namelen, name); |
1889 | goto kill; | 1891 | goto kill; |
1890 | } | 1892 | } |
1891 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { | 1893 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { |
1892 | mlog(ML_ERROR, "got assert from %u, but lock " | 1894 | mlog(ML_ERROR, "got assert from %u, but lock " |
1893 | "with no owner should be " | 1895 | "with no owner should be " |
1894 | "in-progress! (%.*s)\n", | 1896 | "in-progress! (%.*s)\n", |
1895 | assert->node_idx, | 1897 | assert->node_idx, |
1896 | namelen, name); | 1898 | namelen, name); |
1897 | goto kill; | 1899 | goto kill; |
1898 | } | 1900 | } |
1899 | } else /* mle->type == DLM_MLE_MIGRATION */ { | 1901 | } else /* mle->type == DLM_MLE_MIGRATION */ { |
1900 | /* should only be getting an assert from new master */ | 1902 | /* should only be getting an assert from new master */ |
1901 | if (assert->node_idx != mle->new_master) { | 1903 | if (assert->node_idx != mle->new_master) { |
1902 | mlog(ML_ERROR, "got assert from %u, but " | 1904 | mlog(ML_ERROR, "got assert from %u, but " |
1903 | "new master is %u, and old master " | 1905 | "new master is %u, and old master " |
1904 | "was %u (%.*s)\n", | 1906 | "was %u (%.*s)\n", |
1905 | assert->node_idx, mle->new_master, | 1907 | assert->node_idx, mle->new_master, |
1906 | mle->master, namelen, name); | 1908 | mle->master, namelen, name); |
1907 | goto kill; | 1909 | goto kill; |
1908 | } | 1910 | } |
1909 | 1911 | ||
1910 | } | 1912 | } |
1911 | ok: | 1913 | ok: |
1912 | spin_unlock(&res->spinlock); | 1914 | spin_unlock(&res->spinlock); |
1913 | } | 1915 | } |
1914 | 1916 | ||
1915 | // mlog(0, "woo! got an assert_master from node %u!\n", | 1917 | // mlog(0, "woo! got an assert_master from node %u!\n", |
1916 | // assert->node_idx); | 1918 | // assert->node_idx); |
1917 | if (mle) { | 1919 | if (mle) { |
1918 | int extra_ref = 0; | 1920 | int extra_ref = 0; |
1919 | int nn = -1; | 1921 | int nn = -1; |
1920 | int rr, err = 0; | 1922 | int rr, err = 0; |
1921 | 1923 | ||
1922 | spin_lock(&mle->spinlock); | 1924 | spin_lock(&mle->spinlock); |
1923 | if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) | 1925 | if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) |
1924 | extra_ref = 1; | 1926 | extra_ref = 1; |
1925 | else { | 1927 | else { |
1926 | /* MASTER mle: if any bits set in the response map | 1928 | /* MASTER mle: if any bits set in the response map |
1927 | * then the calling node needs to re-assert to clear | 1929 | * then the calling node needs to re-assert to clear |
1928 | * up nodes that this node contacted */ | 1930 | * up nodes that this node contacted */ |
1929 | while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, | 1931 | while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, |
1930 | nn+1)) < O2NM_MAX_NODES) { | 1932 | nn+1)) < O2NM_MAX_NODES) { |
1931 | if (nn != dlm->node_num && nn != assert->node_idx) { | 1933 | if (nn != dlm->node_num && nn != assert->node_idx) { |
1932 | master_request = 1; | 1934 | master_request = 1; |
1933 | break; | 1935 | break; |
1934 | } | 1936 | } |
1935 | } | 1937 | } |
1936 | } | 1938 | } |
1937 | mle->master = assert->node_idx; | 1939 | mle->master = assert->node_idx; |
1938 | atomic_set(&mle->woken, 1); | 1940 | atomic_set(&mle->woken, 1); |
1939 | wake_up(&mle->wq); | 1941 | wake_up(&mle->wq); |
1940 | spin_unlock(&mle->spinlock); | 1942 | spin_unlock(&mle->spinlock); |
1941 | 1943 | ||
1942 | if (res) { | 1944 | if (res) { |
1943 | int wake = 0; | 1945 | int wake = 0; |
1944 | spin_lock(&res->spinlock); | 1946 | spin_lock(&res->spinlock); |
1945 | if (mle->type == DLM_MLE_MIGRATION) { | 1947 | if (mle->type == DLM_MLE_MIGRATION) { |
1946 | mlog(0, "finishing off migration of lockres %.*s, " | 1948 | mlog(0, "finishing off migration of lockres %.*s, " |
1947 | "from %u to %u\n", | 1949 | "from %u to %u\n", |
1948 | res->lockname.len, res->lockname.name, | 1950 | res->lockname.len, res->lockname.name, |
1949 | dlm->node_num, mle->new_master); | 1951 | dlm->node_num, mle->new_master); |
1950 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 1952 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
1951 | wake = 1; | 1953 | wake = 1; |
1952 | dlm_change_lockres_owner(dlm, res, mle->new_master); | 1954 | dlm_change_lockres_owner(dlm, res, mle->new_master); |
1953 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); | 1955 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); |
1954 | } else { | 1956 | } else { |
1955 | dlm_change_lockres_owner(dlm, res, mle->master); | 1957 | dlm_change_lockres_owner(dlm, res, mle->master); |
1956 | } | 1958 | } |
1957 | spin_unlock(&res->spinlock); | 1959 | spin_unlock(&res->spinlock); |
1958 | have_lockres_ref = 1; | 1960 | have_lockres_ref = 1; |
1959 | if (wake) | 1961 | if (wake) |
1960 | wake_up(&res->wq); | 1962 | wake_up(&res->wq); |
1961 | } | 1963 | } |
1962 | 1964 | ||
1963 | /* master is known, detach if not already detached. | 1965 | /* master is known, detach if not already detached. |
1964 | * ensures that only one assert_master call will happen | 1966 | * ensures that only one assert_master call will happen |
1965 | * on this mle. */ | 1967 | * on this mle. */ |
1966 | spin_lock(&dlm->master_lock); | 1968 | spin_lock(&dlm->master_lock); |
1967 | 1969 | ||
1968 | rr = atomic_read(&mle->mle_refs.refcount); | 1970 | rr = atomic_read(&mle->mle_refs.refcount); |
1969 | if (mle->inuse > 0) { | 1971 | if (mle->inuse > 0) { |
1970 | if (extra_ref && rr < 3) | 1972 | if (extra_ref && rr < 3) |
1971 | err = 1; | 1973 | err = 1; |
1972 | else if (!extra_ref && rr < 2) | 1974 | else if (!extra_ref && rr < 2) |
1973 | err = 1; | 1975 | err = 1; |
1974 | } else { | 1976 | } else { |
1975 | if (extra_ref && rr < 2) | 1977 | if (extra_ref && rr < 2) |
1976 | err = 1; | 1978 | err = 1; |
1977 | else if (!extra_ref && rr < 1) | 1979 | else if (!extra_ref && rr < 1) |
1978 | err = 1; | 1980 | err = 1; |
1979 | } | 1981 | } |
1980 | if (err) { | 1982 | if (err) { |
1981 | mlog(ML_ERROR, "%s:%.*s: got assert master from %u " | 1983 | mlog(ML_ERROR, "%s:%.*s: got assert master from %u " |
1982 | "that will mess up this node, refs=%d, extra=%d, " | 1984 | "that will mess up this node, refs=%d, extra=%d, " |
1983 | "inuse=%d\n", dlm->name, namelen, name, | 1985 | "inuse=%d\n", dlm->name, namelen, name, |
1984 | assert->node_idx, rr, extra_ref, mle->inuse); | 1986 | assert->node_idx, rr, extra_ref, mle->inuse); |
1985 | dlm_print_one_mle(mle); | 1987 | dlm_print_one_mle(mle); |
1986 | } | 1988 | } |
1987 | __dlm_unlink_mle(dlm, mle); | 1989 | __dlm_unlink_mle(dlm, mle); |
1988 | __dlm_mle_detach_hb_events(dlm, mle); | 1990 | __dlm_mle_detach_hb_events(dlm, mle); |
1989 | __dlm_put_mle(mle); | 1991 | __dlm_put_mle(mle); |
1990 | if (extra_ref) { | 1992 | if (extra_ref) { |
1991 | /* the assert master message now balances the extra | 1993 | /* the assert master message now balances the extra |
1992 | * ref given by the master / migration request message. | 1994 | * ref given by the master / migration request message. |
1993 | * if this is the last put, it will be removed | 1995 | * if this is the last put, it will be removed |
1994 | * from the list. */ | 1996 | * from the list. */ |
1995 | __dlm_put_mle(mle); | 1997 | __dlm_put_mle(mle); |
1996 | } | 1998 | } |
1997 | spin_unlock(&dlm->master_lock); | 1999 | spin_unlock(&dlm->master_lock); |
1998 | } else if (res) { | 2000 | } else if (res) { |
1999 | if (res->owner != assert->node_idx) { | 2001 | if (res->owner != assert->node_idx) { |
2000 | mlog(0, "assert_master from %u, but current " | 2002 | mlog(0, "assert_master from %u, but current " |
2001 | "owner is %u (%.*s), no mle\n", assert->node_idx, | 2003 | "owner is %u (%.*s), no mle\n", assert->node_idx, |
2002 | res->owner, namelen, name); | 2004 | res->owner, namelen, name); |
2003 | } | 2005 | } |
2004 | } | 2006 | } |
2005 | spin_unlock(&dlm->spinlock); | 2007 | spin_unlock(&dlm->spinlock); |
2006 | 2008 | ||
2007 | done: | 2009 | done: |
2008 | ret = 0; | 2010 | ret = 0; |
2009 | if (res) { | 2011 | if (res) { |
2010 | spin_lock(&res->spinlock); | 2012 | spin_lock(&res->spinlock); |
2011 | res->state |= DLM_LOCK_RES_SETREF_INPROG; | 2013 | res->state |= DLM_LOCK_RES_SETREF_INPROG; |
2012 | spin_unlock(&res->spinlock); | 2014 | spin_unlock(&res->spinlock); |
2013 | *ret_data = (void *)res; | 2015 | *ret_data = (void *)res; |
2014 | } | 2016 | } |
2015 | dlm_put(dlm); | 2017 | dlm_put(dlm); |
2016 | if (master_request) { | 2018 | if (master_request) { |
2017 | mlog(0, "need to tell master to reassert\n"); | 2019 | mlog(0, "need to tell master to reassert\n"); |
2018 | /* positive. negative would shoot down the node. */ | 2020 | /* positive. negative would shoot down the node. */ |
2019 | ret |= DLM_ASSERT_RESPONSE_REASSERT; | 2021 | ret |= DLM_ASSERT_RESPONSE_REASSERT; |
2020 | if (!have_lockres_ref) { | 2022 | if (!have_lockres_ref) { |
2021 | mlog(ML_ERROR, "strange, got assert from %u, MASTER " | 2023 | mlog(ML_ERROR, "strange, got assert from %u, MASTER " |
2022 | "mle present here for %s:%.*s, but no lockres!\n", | 2024 | "mle present here for %s:%.*s, but no lockres!\n", |
2023 | assert->node_idx, dlm->name, namelen, name); | 2025 | assert->node_idx, dlm->name, namelen, name); |
2024 | } | 2026 | } |
2025 | } | 2027 | } |
2026 | if (have_lockres_ref) { | 2028 | if (have_lockres_ref) { |
2027 | /* let the master know we have a reference to the lockres */ | 2029 | /* let the master know we have a reference to the lockres */ |
2028 | ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; | 2030 | ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; |
2029 | mlog(0, "%s:%.*s: got assert from %u, need a ref\n", | 2031 | mlog(0, "%s:%.*s: got assert from %u, need a ref\n", |
2030 | dlm->name, namelen, name, assert->node_idx); | 2032 | dlm->name, namelen, name, assert->node_idx); |
2031 | } | 2033 | } |
2032 | return ret; | 2034 | return ret; |
2033 | 2035 | ||
2034 | kill: | 2036 | kill: |
2035 | /* kill the caller! */ | 2037 | /* kill the caller! */ |
2036 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " | 2038 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " |
2037 | "and killing the other node now! This node is OK and can continue.\n"); | 2039 | "and killing the other node now! This node is OK and can continue.\n"); |
2038 | __dlm_print_one_lock_resource(res); | 2040 | __dlm_print_one_lock_resource(res); |
2039 | spin_unlock(&res->spinlock); | 2041 | spin_unlock(&res->spinlock); |
2040 | spin_unlock(&dlm->spinlock); | 2042 | spin_unlock(&dlm->spinlock); |
2041 | *ret_data = (void *)res; | 2043 | *ret_data = (void *)res; |
2042 | dlm_put(dlm); | 2044 | dlm_put(dlm); |
2043 | return -EINVAL; | 2045 | return -EINVAL; |
2044 | } | 2046 | } |
2045 | 2047 | ||
2046 | void dlm_assert_master_post_handler(int status, void *data, void *ret_data) | 2048 | void dlm_assert_master_post_handler(int status, void *data, void *ret_data) |
2047 | { | 2049 | { |
2048 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; | 2050 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; |
2049 | 2051 | ||
2050 | if (ret_data) { | 2052 | if (ret_data) { |
2051 | spin_lock(&res->spinlock); | 2053 | spin_lock(&res->spinlock); |
2052 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; | 2054 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; |
2053 | spin_unlock(&res->spinlock); | 2055 | spin_unlock(&res->spinlock); |
2054 | wake_up(&res->wq); | 2056 | wake_up(&res->wq); |
2055 | dlm_lockres_put(res); | 2057 | dlm_lockres_put(res); |
2056 | } | 2058 | } |
2057 | return; | 2059 | return; |
2058 | } | 2060 | } |
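The post handler above is the second half of a handshake: the assert_master handler parks the lockres in *ret_data with DLM_LOCK_RES_SETREF_INPROG set, and only after the reply has gone out does this post handler clear the flag, wake waiters on res->wq, and drop the reference. A rough sketch of how such a handler/post-handler pair is wired up, assuming the o2net registration style used elsewhere in the dlm (the exact arguments here are illustrative, not part of this diff):

    /* Sketch only: register the handler and post handler as a pair so the
     * ret_data set by the handler is passed on to the post handler. */
    status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
                                    sizeof(struct dlm_assert_master),
                                    dlm_assert_master_handler, dlm,
                                    dlm_assert_master_post_handler,
                                    &dlm->dlm_domain_handlers);
    if (status)
            mlog_errno(status);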
2059 | 2061 | ||
2060 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | 2062 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, |
2061 | struct dlm_lock_resource *res, | 2063 | struct dlm_lock_resource *res, |
2062 | int ignore_higher, u8 request_from, u32 flags) | 2064 | int ignore_higher, u8 request_from, u32 flags) |
2063 | { | 2065 | { |
2064 | struct dlm_work_item *item; | 2066 | struct dlm_work_item *item; |
2065 | item = kzalloc(sizeof(*item), GFP_ATOMIC); | 2067 | item = kzalloc(sizeof(*item), GFP_ATOMIC); |
2066 | if (!item) | 2068 | if (!item) |
2067 | return -ENOMEM; | 2069 | return -ENOMEM; |
2068 | 2070 | ||
2069 | 2071 | ||
2070 | /* queue up work for dlm_assert_master_worker */ | 2072 | /* queue up work for dlm_assert_master_worker */ |
2071 | dlm_grab(dlm); /* get an extra ref for the work item */ | 2073 | dlm_grab(dlm); /* get an extra ref for the work item */ |
2072 | dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); | 2074 | dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); |
2073 | item->u.am.lockres = res; /* already have a ref */ | 2075 | item->u.am.lockres = res; /* already have a ref */ |
2074 | /* can optionally ignore node numbers higher than this node */ | 2076 | /* can optionally ignore node numbers higher than this node */ |
2075 | item->u.am.ignore_higher = ignore_higher; | 2077 | item->u.am.ignore_higher = ignore_higher; |
2076 | item->u.am.request_from = request_from; | 2078 | item->u.am.request_from = request_from; |
2077 | item->u.am.flags = flags; | 2079 | item->u.am.flags = flags; |
2078 | 2080 | ||
2079 | if (ignore_higher) | 2081 | if (ignore_higher) |
2080 | mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, | 2082 | mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, |
2081 | res->lockname.name); | 2083 | res->lockname.name); |
2082 | 2084 | ||
2083 | spin_lock(&dlm->work_lock); | 2085 | spin_lock(&dlm->work_lock); |
2084 | list_add_tail(&item->list, &dlm->work_list); | 2086 | list_add_tail(&item->list, &dlm->work_list); |
2085 | spin_unlock(&dlm->work_lock); | 2087 | spin_unlock(&dlm->work_lock); |
2086 | 2088 | ||
2087 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); | 2089 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
2088 | return 0; | 2090 | return 0; |
2089 | } | 2091 | } |
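dlm_dispatch_assert_master() is the non-blocking way to send asserts: the caller hands in a referenced lockres, the work item carries that reference, and dlm_assert_master_worker() below consumes it with dlm_lockres_put(). A minimal caller sketch (hypothetical call site, not part of this diff):

    /* Illustrative: a message handler that must not block takes its own
     * ref on the lockres and hands the assert off to the worker thread. */
    dlm_lockres_get(res);                 /* ref handed to the work item */
    if (dlm_dispatch_assert_master(dlm, res, 0, request_from, flags) < 0) {
            mlog(ML_ERROR, "failed to dispatch assert master work\n");
            dlm_lockres_put(res);         /* dispatch failed, drop our ref */
    }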
2090 | 2092 | ||
2091 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | 2093 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) |
2092 | { | 2094 | { |
2093 | struct dlm_ctxt *dlm = data; | 2095 | struct dlm_ctxt *dlm = data; |
2094 | int ret = 0; | 2096 | int ret = 0; |
2095 | struct dlm_lock_resource *res; | 2097 | struct dlm_lock_resource *res; |
2096 | unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 2098 | unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
2097 | int ignore_higher; | 2099 | int ignore_higher; |
2098 | int bit; | 2100 | int bit; |
2099 | u8 request_from; | 2101 | u8 request_from; |
2100 | u32 flags; | 2102 | u32 flags; |
2101 | 2103 | ||
2102 | dlm = item->dlm; | 2104 | dlm = item->dlm; |
2103 | res = item->u.am.lockres; | 2105 | res = item->u.am.lockres; |
2104 | ignore_higher = item->u.am.ignore_higher; | 2106 | ignore_higher = item->u.am.ignore_higher; |
2105 | request_from = item->u.am.request_from; | 2107 | request_from = item->u.am.request_from; |
2106 | flags = item->u.am.flags; | 2108 | flags = item->u.am.flags; |
2107 | 2109 | ||
2108 | spin_lock(&dlm->spinlock); | 2110 | spin_lock(&dlm->spinlock); |
2109 | memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); | 2111 | memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); |
2110 | spin_unlock(&dlm->spinlock); | 2112 | spin_unlock(&dlm->spinlock); |
2111 | 2113 | ||
2112 | clear_bit(dlm->node_num, nodemap); | 2114 | clear_bit(dlm->node_num, nodemap); |
2113 | if (ignore_higher) { | 2115 | if (ignore_higher) { |
2114 | /* if this is just to clear up mles for nodes below | 2116 | /* if this is just to clear up mles for nodes below |
2115 | * this node, do not send the message to the original | 2117 | * this node, do not send the message to the original |
2116 | * caller or any node number higher than this */ | 2118 | * caller or any node number higher than this */ |
2117 | clear_bit(request_from, nodemap); | 2119 | clear_bit(request_from, nodemap); |
2118 | bit = dlm->node_num; | 2120 | bit = dlm->node_num; |
2119 | while (1) { | 2121 | while (1) { |
2120 | bit = find_next_bit(nodemap, O2NM_MAX_NODES, | 2122 | bit = find_next_bit(nodemap, O2NM_MAX_NODES, |
2121 | bit+1); | 2123 | bit+1); |
2122 | if (bit >= O2NM_MAX_NODES) | 2124 | if (bit >= O2NM_MAX_NODES) |
2123 | break; | 2125 | break; |
2124 | clear_bit(bit, nodemap); | 2126 | clear_bit(bit, nodemap); |
2125 | } | 2127 | } |
2126 | } | 2128 | } |
2127 | 2129 | ||
2128 | /* | 2130 | /* |
2129 | * If we're migrating this lock to someone else, we are no | 2131 | * If we're migrating this lock to someone else, we are no |
2130 | * longer allowed to assert our own mastery. OTOH, we need to | 2132 | * longer allowed to assert our own mastery. OTOH, we need to |
2131 | * prevent migration from starting while we're still asserting | 2133 | * prevent migration from starting while we're still asserting |
2132 | * our dominance. The reserved ast delays migration. | 2134 | * our dominance. The reserved ast delays migration. |
2133 | */ | 2135 | */ |
2134 | spin_lock(&res->spinlock); | 2136 | spin_lock(&res->spinlock); |
2135 | if (res->state & DLM_LOCK_RES_MIGRATING) { | 2137 | if (res->state & DLM_LOCK_RES_MIGRATING) { |
2136 | mlog(0, "Someone asked us to assert mastery, but we're " | 2138 | mlog(0, "Someone asked us to assert mastery, but we're " |
2137 | "in the middle of migration. Skipping assert, " | 2139 | "in the middle of migration. Skipping assert, " |
2138 | "the new master will handle that.\n"); | 2140 | "the new master will handle that.\n"); |
2139 | spin_unlock(&res->spinlock); | 2141 | spin_unlock(&res->spinlock); |
2140 | goto put; | 2142 | goto put; |
2141 | } else | 2143 | } else |
2142 | __dlm_lockres_reserve_ast(res); | 2144 | __dlm_lockres_reserve_ast(res); |
2143 | spin_unlock(&res->spinlock); | 2145 | spin_unlock(&res->spinlock); |
2144 | 2146 | ||
2145 | /* this call now finishes out the nodemap | 2147 | /* this call now finishes out the nodemap |
2146 | * even if one or more nodes die */ | 2148 | * even if one or more nodes die */ |
2147 | mlog(0, "worker about to master %.*s here, this=%u\n", | 2149 | mlog(0, "worker about to master %.*s here, this=%u\n", |
2148 | res->lockname.len, res->lockname.name, dlm->node_num); | 2150 | res->lockname.len, res->lockname.name, dlm->node_num); |
2149 | ret = dlm_do_assert_master(dlm, res, nodemap, flags); | 2151 | ret = dlm_do_assert_master(dlm, res, nodemap, flags); |
2150 | if (ret < 0) { | 2152 | if (ret < 0) { |
2151 | /* no need to restart, we are done */ | 2153 | /* no need to restart, we are done */ |
2152 | if (!dlm_is_host_down(ret)) | 2154 | if (!dlm_is_host_down(ret)) |
2153 | mlog_errno(ret); | 2155 | mlog_errno(ret); |
2154 | } | 2156 | } |
2155 | 2157 | ||
2156 | /* Ok, we've asserted ourselves. Let's let migration start. */ | 2158 | /* Ok, we've asserted ourselves. Let's let migration start. */ |
2157 | dlm_lockres_release_ast(dlm, res); | 2159 | dlm_lockres_release_ast(dlm, res); |
2158 | 2160 | ||
2159 | put: | 2161 | put: |
2160 | dlm_lockres_drop_inflight_worker(dlm, res); | 2162 | dlm_lockres_drop_inflight_worker(dlm, res); |
2161 | 2163 | ||
2162 | dlm_lockres_put(res); | 2164 | dlm_lockres_put(res); |
2163 | 2165 | ||
2164 | mlog(0, "finished with dlm_assert_master_worker\n"); | 2166 | mlog(0, "finished with dlm_assert_master_worker\n"); |
2165 | } | 2167 | } |
2166 | 2168 | ||
2167 | /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. | 2169 | /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. |
2168 | * We cannot wait for node recovery to complete to begin mastering this | 2170 | * We cannot wait for node recovery to complete to begin mastering this |
2169 | * lockres because this lockres is used to kick off recovery! ;-) | 2171 | * lockres because this lockres is used to kick off recovery! ;-) |
2170 | * So, do a pre-check on all living nodes to see if any of those nodes | 2172 | * So, do a pre-check on all living nodes to see if any of those nodes |
2171 | * think that $RECOVERY is currently mastered by a dead node. If so, | 2173 | * think that $RECOVERY is currently mastered by a dead node. If so, |
2172 | * we wait a short time to allow that node to get notified by its own | 2174 | * we wait a short time to allow that node to get notified by its own |
2173 | * heartbeat stack, then check again. All $RECOVERY lock resources | 2175 | * heartbeat stack, then check again. All $RECOVERY lock resources |
2174 | * mastered by dead nodes are purged when the heartbeat callback is | 2176 | * mastered by dead nodes are purged when the heartbeat callback is |
2175 | * fired, so we can know for sure that it is safe to continue once | 2177 | * fired, so we can know for sure that it is safe to continue once |
2176 | * the node returns a live node or no node. */ | 2178 | * the node returns a live node or no node. */ |
2177 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | 2179 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, |
2178 | struct dlm_lock_resource *res) | 2180 | struct dlm_lock_resource *res) |
2179 | { | 2181 | { |
2180 | struct dlm_node_iter iter; | 2182 | struct dlm_node_iter iter; |
2181 | int nodenum; | 2183 | int nodenum; |
2182 | int ret = 0; | 2184 | int ret = 0; |
2183 | u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; | 2185 | u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; |
2184 | 2186 | ||
2185 | spin_lock(&dlm->spinlock); | 2187 | spin_lock(&dlm->spinlock); |
2186 | dlm_node_iter_init(dlm->domain_map, &iter); | 2188 | dlm_node_iter_init(dlm->domain_map, &iter); |
2187 | spin_unlock(&dlm->spinlock); | 2189 | spin_unlock(&dlm->spinlock); |
2188 | 2190 | ||
2189 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 2191 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
2190 | /* do not send to self */ | 2192 | /* do not send to self */ |
2191 | if (nodenum == dlm->node_num) | 2193 | if (nodenum == dlm->node_num) |
2192 | continue; | 2194 | continue; |
2193 | ret = dlm_do_master_requery(dlm, res, nodenum, &master); | 2195 | ret = dlm_do_master_requery(dlm, res, nodenum, &master); |
2194 | if (ret < 0) { | 2196 | if (ret < 0) { |
2195 | mlog_errno(ret); | 2197 | mlog_errno(ret); |
2196 | if (!dlm_is_host_down(ret)) | 2198 | if (!dlm_is_host_down(ret)) |
2197 | BUG(); | 2199 | BUG(); |
2198 | /* host is down, so answer for that node would be | 2200 | /* host is down, so answer for that node would be |
2199 | * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ | 2201 | * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ |
2200 | ret = 0; | 2202 | ret = 0; |
2201 | } | 2203 | } |
2202 | 2204 | ||
2203 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { | 2205 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { |
2204 | /* check to see if this master is in the recovery map */ | 2206 | /* check to see if this master is in the recovery map */ |
2205 | spin_lock(&dlm->spinlock); | 2207 | spin_lock(&dlm->spinlock); |
2206 | if (test_bit(master, dlm->recovery_map)) { | 2208 | if (test_bit(master, dlm->recovery_map)) { |
2207 | mlog(ML_NOTICE, "%s: node %u has not seen " | 2209 | mlog(ML_NOTICE, "%s: node %u has not seen " |
2208 | "node %u go down yet, and thinks the " | 2210 | "node %u go down yet, and thinks the " |
2209 | "dead node is mastering the recovery " | 2211 | "dead node is mastering the recovery " |
2210 | "lock. must wait.\n", dlm->name, | 2212 | "lock. must wait.\n", dlm->name, |
2211 | nodenum, master); | 2213 | nodenum, master); |
2212 | ret = -EAGAIN; | 2214 | ret = -EAGAIN; |
2213 | } | 2215 | } |
2214 | spin_unlock(&dlm->spinlock); | 2216 | spin_unlock(&dlm->spinlock); |
2215 | mlog(0, "%s: reco lock master is %u\n", dlm->name, | 2217 | mlog(0, "%s: reco lock master is %u\n", dlm->name, |
2216 | master); | 2218 | master); |
2217 | break; | 2219 | break; |
2218 | } | 2220 | } |
2219 | } | 2221 | } |
2220 | return ret; | 2222 | return ret; |
2221 | } | 2223 | } |
2222 | 2224 | ||
2223 | /* | 2225 | /* |
2224 | * DLM_DEREF_LOCKRES_MSG | 2226 | * DLM_DEREF_LOCKRES_MSG |
2225 | */ | 2227 | */ |
2226 | 2228 | ||
2227 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | 2229 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
2228 | { | 2230 | { |
2229 | struct dlm_deref_lockres deref; | 2231 | struct dlm_deref_lockres deref; |
2230 | int ret = 0, r; | 2232 | int ret = 0, r; |
2231 | const char *lockname; | 2233 | const char *lockname; |
2232 | unsigned int namelen; | 2234 | unsigned int namelen; |
2233 | 2235 | ||
2234 | lockname = res->lockname.name; | 2236 | lockname = res->lockname.name; |
2235 | namelen = res->lockname.len; | 2237 | namelen = res->lockname.len; |
2236 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | 2238 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
2237 | 2239 | ||
2238 | memset(&deref, 0, sizeof(deref)); | 2240 | memset(&deref, 0, sizeof(deref)); |
2239 | deref.node_idx = dlm->node_num; | 2241 | deref.node_idx = dlm->node_num; |
2240 | deref.namelen = namelen; | 2242 | deref.namelen = namelen; |
2241 | memcpy(deref.name, lockname, namelen); | 2243 | memcpy(deref.name, lockname, namelen); |
2242 | 2244 | ||
2243 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, | 2245 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, |
2244 | &deref, sizeof(deref), res->owner, &r); | 2246 | &deref, sizeof(deref), res->owner, &r); |
2245 | if (ret < 0) | 2247 | if (ret < 0) |
2246 | mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", | 2248 | mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", |
2247 | dlm->name, namelen, lockname, ret, res->owner); | 2249 | dlm->name, namelen, lockname, ret, res->owner); |
2248 | else if (r < 0) { | 2250 | else if (r < 0) { |
2249 | /* BAD. other node says I did not have a ref. */ | 2251 | /* BAD. other node says I did not have a ref. */ |
2250 | mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", | 2252 | mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", |
2251 | dlm->name, namelen, lockname, res->owner, r); | 2253 | dlm->name, namelen, lockname, res->owner, r); |
2252 | dlm_print_one_lock_resource(res); | 2254 | dlm_print_one_lock_resource(res); |
2253 | BUG(); | 2255 | BUG(); |
2254 | } | 2256 | } |
2255 | return ret; | 2257 | return ret; |
2256 | } | 2258 | } |
2257 | 2259 | ||
2258 | int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, | 2260 | int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, |
2259 | void **ret_data) | 2261 | void **ret_data) |
2260 | { | 2262 | { |
2261 | struct dlm_ctxt *dlm = data; | 2263 | struct dlm_ctxt *dlm = data; |
2262 | struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; | 2264 | struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; |
2263 | struct dlm_lock_resource *res = NULL; | 2265 | struct dlm_lock_resource *res = NULL; |
2264 | char *name; | 2266 | char *name; |
2265 | unsigned int namelen; | 2267 | unsigned int namelen; |
2266 | int ret = -EINVAL; | 2268 | int ret = -EINVAL; |
2267 | u8 node; | 2269 | u8 node; |
2268 | unsigned int hash; | 2270 | unsigned int hash; |
2269 | struct dlm_work_item *item; | 2271 | struct dlm_work_item *item; |
2270 | int cleared = 0; | 2272 | int cleared = 0; |
2271 | int dispatch = 0; | 2273 | int dispatch = 0; |
2272 | 2274 | ||
2273 | if (!dlm_grab(dlm)) | 2275 | if (!dlm_grab(dlm)) |
2274 | return 0; | 2276 | return 0; |
2275 | 2277 | ||
2276 | name = deref->name; | 2278 | name = deref->name; |
2277 | namelen = deref->namelen; | 2279 | namelen = deref->namelen; |
2278 | node = deref->node_idx; | 2280 | node = deref->node_idx; |
2279 | 2281 | ||
2280 | if (namelen > DLM_LOCKID_NAME_MAX) { | 2282 | if (namelen > DLM_LOCKID_NAME_MAX) { |
2281 | mlog(ML_ERROR, "Invalid name length!"); | 2283 | mlog(ML_ERROR, "Invalid name length!"); |
2282 | goto done; | 2284 | goto done; |
2283 | } | 2285 | } |
2284 | if (deref->node_idx >= O2NM_MAX_NODES) { | 2286 | if (deref->node_idx >= O2NM_MAX_NODES) { |
2285 | mlog(ML_ERROR, "Invalid node number: %u\n", node); | 2287 | mlog(ML_ERROR, "Invalid node number: %u\n", node); |
2286 | goto done; | 2288 | goto done; |
2287 | } | 2289 | } |
2288 | 2290 | ||
2289 | hash = dlm_lockid_hash(name, namelen); | 2291 | hash = dlm_lockid_hash(name, namelen); |
2290 | 2292 | ||
2291 | spin_lock(&dlm->spinlock); | 2293 | spin_lock(&dlm->spinlock); |
2292 | res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); | 2294 | res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); |
2293 | if (!res) { | 2295 | if (!res) { |
2294 | spin_unlock(&dlm->spinlock); | 2296 | spin_unlock(&dlm->spinlock); |
2295 | mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", | 2297 | mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", |
2296 | dlm->name, namelen, name); | 2298 | dlm->name, namelen, name); |
2297 | goto done; | 2299 | goto done; |
2298 | } | 2300 | } |
2299 | spin_unlock(&dlm->spinlock); | 2301 | spin_unlock(&dlm->spinlock); |
2300 | 2302 | ||
2301 | spin_lock(&res->spinlock); | 2303 | spin_lock(&res->spinlock); |
2302 | if (res->state & DLM_LOCK_RES_SETREF_INPROG) | 2304 | if (res->state & DLM_LOCK_RES_SETREF_INPROG) |
2303 | dispatch = 1; | 2305 | dispatch = 1; |
2304 | else { | 2306 | else { |
2305 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2307 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2306 | if (test_bit(node, res->refmap)) { | 2308 | if (test_bit(node, res->refmap)) { |
2307 | dlm_lockres_clear_refmap_bit(dlm, res, node); | 2309 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2308 | cleared = 1; | 2310 | cleared = 1; |
2309 | } | 2311 | } |
2310 | } | 2312 | } |
2311 | spin_unlock(&res->spinlock); | 2313 | spin_unlock(&res->spinlock); |
2312 | 2314 | ||
2313 | if (!dispatch) { | 2315 | if (!dispatch) { |
2314 | if (cleared) | 2316 | if (cleared) |
2315 | dlm_lockres_calc_usage(dlm, res); | 2317 | dlm_lockres_calc_usage(dlm, res); |
2316 | else { | 2318 | else { |
2317 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " | 2319 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " |
2318 | "but it is already dropped!\n", dlm->name, | 2320 | "but it is already dropped!\n", dlm->name, |
2319 | res->lockname.len, res->lockname.name, node); | 2321 | res->lockname.len, res->lockname.name, node); |
2320 | dlm_print_one_lock_resource(res); | 2322 | dlm_print_one_lock_resource(res); |
2321 | } | 2323 | } |
2322 | ret = 0; | 2324 | ret = 0; |
2323 | goto done; | 2325 | goto done; |
2324 | } | 2326 | } |
2325 | 2327 | ||
2326 | item = kzalloc(sizeof(*item), GFP_NOFS); | 2328 | item = kzalloc(sizeof(*item), GFP_NOFS); |
2327 | if (!item) { | 2329 | if (!item) { |
2328 | ret = -ENOMEM; | 2330 | ret = -ENOMEM; |
2329 | mlog_errno(ret); | 2331 | mlog_errno(ret); |
2330 | goto done; | 2332 | goto done; |
2331 | } | 2333 | } |
2332 | 2334 | ||
2333 | dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); | 2335 | dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); |
2334 | item->u.dl.deref_res = res; | 2336 | item->u.dl.deref_res = res; |
2335 | item->u.dl.deref_node = node; | 2337 | item->u.dl.deref_node = node; |
2336 | 2338 | ||
2337 | spin_lock(&dlm->work_lock); | 2339 | spin_lock(&dlm->work_lock); |
2338 | list_add_tail(&item->list, &dlm->work_list); | 2340 | list_add_tail(&item->list, &dlm->work_list); |
2339 | spin_unlock(&dlm->work_lock); | 2341 | spin_unlock(&dlm->work_lock); |
2340 | 2342 | ||
2341 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); | 2343 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
2342 | return 0; | 2344 | return 0; |
2343 | 2345 | ||
2344 | done: | 2346 | done: |
2345 | if (res) | 2347 | if (res) |
2346 | dlm_lockres_put(res); | 2348 | dlm_lockres_put(res); |
2347 | dlm_put(dlm); | 2349 | dlm_put(dlm); |
2348 | 2350 | ||
2349 | return ret; | 2351 | return ret; |
2350 | } | 2352 | } |
2351 | 2353 | ||
2352 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) | 2354 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) |
2353 | { | 2355 | { |
2354 | struct dlm_ctxt *dlm; | 2356 | struct dlm_ctxt *dlm; |
2355 | struct dlm_lock_resource *res; | 2357 | struct dlm_lock_resource *res; |
2356 | u8 node; | 2358 | u8 node; |
2357 | u8 cleared = 0; | 2359 | u8 cleared = 0; |
2358 | 2360 | ||
2359 | dlm = item->dlm; | 2361 | dlm = item->dlm; |
2360 | res = item->u.dl.deref_res; | 2362 | res = item->u.dl.deref_res; |
2361 | node = item->u.dl.deref_node; | 2363 | node = item->u.dl.deref_node; |
2362 | 2364 | ||
2363 | spin_lock(&res->spinlock); | 2365 | spin_lock(&res->spinlock); |
2364 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2366 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2365 | if (test_bit(node, res->refmap)) { | 2367 | if (test_bit(node, res->refmap)) { |
2366 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); | 2368 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); |
2367 | dlm_lockres_clear_refmap_bit(dlm, res, node); | 2369 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2368 | cleared = 1; | 2370 | cleared = 1; |
2369 | } | 2371 | } |
2370 | spin_unlock(&res->spinlock); | 2372 | spin_unlock(&res->spinlock); |
2371 | 2373 | ||
2372 | if (cleared) { | 2374 | if (cleared) { |
2373 | mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", | 2375 | mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", |
2374 | dlm->name, res->lockname.len, res->lockname.name, node); | 2376 | dlm->name, res->lockname.len, res->lockname.name, node); |
2375 | dlm_lockres_calc_usage(dlm, res); | 2377 | dlm_lockres_calc_usage(dlm, res); |
2376 | } else { | 2378 | } else { |
2377 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " | 2379 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " |
2378 | "but it is already dropped!\n", dlm->name, | 2380 | "but it is already dropped!\n", dlm->name, |
2379 | res->lockname.len, res->lockname.name, node); | 2381 | res->lockname.len, res->lockname.name, node); |
2380 | dlm_print_one_lock_resource(res); | 2382 | dlm_print_one_lock_resource(res); |
2381 | } | 2383 | } |
2382 | 2384 | ||
2383 | dlm_lockres_put(res); | 2385 | dlm_lockres_put(res); |
2384 | } | 2386 | } |
2385 | 2387 | ||
2386 | /* | 2388 | /* |
2387 | * A migrateable resource is one that is: | 2389 | * A migrateable resource is one that is: |
2388 | * 1. locally mastered, and, | 2390 | * 1. locally mastered, and, |
2389 | * 2. zero local locks, and, | 2391 | * 2. zero local locks, and, |
2390 | * 3. one or more non-local locks, or, one or more references | 2392 | * 3. one or more non-local locks, or, one or more references |
2391 | * Returns 1 if yes, 0 if not. | 2393 | * Returns 1 if yes, 0 if not. |
2392 | */ | 2394 | */ |
2393 | static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, | 2395 | static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, |
2394 | struct dlm_lock_resource *res) | 2396 | struct dlm_lock_resource *res) |
2395 | { | 2397 | { |
2396 | enum dlm_lockres_list idx; | 2398 | enum dlm_lockres_list idx; |
2397 | int nonlocal = 0, node_ref; | 2399 | int nonlocal = 0, node_ref; |
2398 | struct list_head *queue; | 2400 | struct list_head *queue; |
2399 | struct dlm_lock *lock; | 2401 | struct dlm_lock *lock; |
2400 | u64 cookie; | 2402 | u64 cookie; |
2401 | 2403 | ||
2402 | assert_spin_locked(&res->spinlock); | 2404 | assert_spin_locked(&res->spinlock); |
2403 | 2405 | ||
2404 | /* delay migration when the lockres is in MIGRATING state */ | 2406 | /* delay migration when the lockres is in MIGRATING state */ |
2405 | if (res->state & DLM_LOCK_RES_MIGRATING) | 2407 | if (res->state & DLM_LOCK_RES_MIGRATING) |
2406 | return 0; | 2408 | return 0; |
2407 | 2409 | ||
2408 | /* delay migration when the lockres is in RECOVERING state */ | 2410 | /* delay migration when the lockres is in RECOVERING state */ |
2409 | if (res->state & DLM_LOCK_RES_RECOVERING) | 2411 | if (res->state & DLM_LOCK_RES_RECOVERING) |
2410 | return 0; | 2412 | return 0; |
2411 | 2413 | ||
2412 | if (res->owner != dlm->node_num) | 2414 | if (res->owner != dlm->node_num) |
2413 | return 0; | 2415 | return 0; |
2414 | 2416 | ||
2415 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { | 2417 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { |
2416 | queue = dlm_list_idx_to_ptr(res, idx); | 2418 | queue = dlm_list_idx_to_ptr(res, idx); |
2417 | list_for_each_entry(lock, queue, list) { | 2419 | list_for_each_entry(lock, queue, list) { |
2418 | if (lock->ml.node != dlm->node_num) { | 2420 | if (lock->ml.node != dlm->node_num) { |
2419 | nonlocal++; | 2421 | nonlocal++; |
2420 | continue; | 2422 | continue; |
2421 | } | 2423 | } |
2422 | cookie = be64_to_cpu(lock->ml.cookie); | 2424 | cookie = be64_to_cpu(lock->ml.cookie); |
2423 | mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " | 2425 | mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " |
2424 | "%s list\n", dlm->name, res->lockname.len, | 2426 | "%s list\n", dlm->name, res->lockname.len, |
2425 | res->lockname.name, | 2427 | res->lockname.name, |
2426 | dlm_get_lock_cookie_node(cookie), | 2428 | dlm_get_lock_cookie_node(cookie), |
2427 | dlm_get_lock_cookie_seq(cookie), | 2429 | dlm_get_lock_cookie_seq(cookie), |
2428 | dlm_list_in_text(idx)); | 2430 | dlm_list_in_text(idx)); |
2429 | return 0; | 2431 | return 0; |
2430 | } | 2432 | } |
2431 | } | 2433 | } |
2432 | 2434 | ||
2433 | if (!nonlocal) { | 2435 | if (!nonlocal) { |
2434 | node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); | 2436 | node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); |
2435 | if (node_ref >= O2NM_MAX_NODES) | 2437 | if (node_ref >= O2NM_MAX_NODES) |
2436 | return 0; | 2438 | return 0; |
2437 | } | 2439 | } |
2438 | 2440 | ||
2439 | mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, | 2441 | mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, |
2440 | res->lockname.name); | 2442 | res->lockname.name); |
2441 | 2443 | ||
2442 | return 1; | 2444 | return 1; |
2443 | } | 2445 | } |
2444 | 2446 | ||
2445 | /* | 2447 | /* |
2446 | * DLM_MIGRATE_LOCKRES | 2448 | * DLM_MIGRATE_LOCKRES |
2447 | */ | 2449 | */ |
2448 | 2450 | ||
2449 | 2451 | ||
2450 | static int dlm_migrate_lockres(struct dlm_ctxt *dlm, | 2452 | static int dlm_migrate_lockres(struct dlm_ctxt *dlm, |
2451 | struct dlm_lock_resource *res, u8 target) | 2453 | struct dlm_lock_resource *res, u8 target) |
2452 | { | 2454 | { |
2453 | struct dlm_master_list_entry *mle = NULL; | 2455 | struct dlm_master_list_entry *mle = NULL; |
2454 | struct dlm_master_list_entry *oldmle = NULL; | 2456 | struct dlm_master_list_entry *oldmle = NULL; |
2455 | struct dlm_migratable_lockres *mres = NULL; | 2457 | struct dlm_migratable_lockres *mres = NULL; |
2456 | int ret = 0; | 2458 | int ret = 0; |
2457 | const char *name; | 2459 | const char *name; |
2458 | unsigned int namelen; | 2460 | unsigned int namelen; |
2459 | int mle_added = 0; | 2461 | int mle_added = 0; |
2460 | int wake = 0; | 2462 | int wake = 0; |
2461 | 2463 | ||
2462 | if (!dlm_grab(dlm)) | 2464 | if (!dlm_grab(dlm)) |
2463 | return -EINVAL; | 2465 | return -EINVAL; |
2464 | 2466 | ||
2465 | BUG_ON(target == O2NM_MAX_NODES); | 2467 | BUG_ON(target == O2NM_MAX_NODES); |
2466 | 2468 | ||
2467 | name = res->lockname.name; | 2469 | name = res->lockname.name; |
2468 | namelen = res->lockname.len; | 2470 | namelen = res->lockname.len; |
2469 | 2471 | ||
2470 | mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, | 2472 | mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, |
2471 | target); | 2473 | target); |
2472 | 2474 | ||
2473 | /* preallocate up front. if this fails, abort */ | 2475 | /* preallocate up front. if this fails, abort */ |
2474 | ret = -ENOMEM; | 2476 | ret = -ENOMEM; |
2475 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); | 2477 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); |
2476 | if (!mres) { | 2478 | if (!mres) { |
2477 | mlog_errno(ret); | 2479 | mlog_errno(ret); |
2478 | goto leave; | 2480 | goto leave; |
2479 | } | 2481 | } |
2480 | 2482 | ||
2481 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); | 2483 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
2482 | if (!mle) { | 2484 | if (!mle) { |
2483 | mlog_errno(ret); | 2485 | mlog_errno(ret); |
2484 | goto leave; | 2486 | goto leave; |
2485 | } | 2487 | } |
2486 | ret = 0; | 2488 | ret = 0; |
2487 | 2489 | ||
2488 | /* | 2490 | /* |
2489 | * clear any existing master requests and | 2491 | * clear any existing master requests and |
2490 | * add the migration mle to the list | 2492 | * add the migration mle to the list |
2491 | */ | 2493 | */ |
2492 | spin_lock(&dlm->spinlock); | 2494 | spin_lock(&dlm->spinlock); |
2493 | spin_lock(&dlm->master_lock); | 2495 | spin_lock(&dlm->master_lock); |
2494 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, | 2496 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, |
2495 | namelen, target, dlm->node_num); | 2497 | namelen, target, dlm->node_num); |
2496 | spin_unlock(&dlm->master_lock); | 2498 | spin_unlock(&dlm->master_lock); |
2497 | spin_unlock(&dlm->spinlock); | 2499 | spin_unlock(&dlm->spinlock); |
2498 | 2500 | ||
2499 | if (ret == -EEXIST) { | 2501 | if (ret == -EEXIST) { |
2500 | mlog(0, "another process is already migrating it\n"); | 2502 | mlog(0, "another process is already migrating it\n"); |
2501 | goto fail; | 2503 | goto fail; |
2502 | } | 2504 | } |
2503 | mle_added = 1; | 2505 | mle_added = 1; |
2504 | 2506 | ||
2505 | /* | 2507 | /* |
2506 | * set the MIGRATING flag and flush asts | 2508 | * set the MIGRATING flag and flush asts |
2507 | * if we fail after this we need to re-dirty the lockres | 2509 | * if we fail after this we need to re-dirty the lockres |
2508 | */ | 2510 | */ |
2509 | if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { | 2511 | if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { |
2510 | mlog(ML_ERROR, "tried to migrate %.*s to %u, but " | 2512 | mlog(ML_ERROR, "tried to migrate %.*s to %u, but " |
2511 | "the target went down.\n", res->lockname.len, | 2513 | "the target went down.\n", res->lockname.len, |
2512 | res->lockname.name, target); | 2514 | res->lockname.name, target); |
2513 | spin_lock(&res->spinlock); | 2515 | spin_lock(&res->spinlock); |
2514 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2516 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2515 | wake = 1; | 2517 | wake = 1; |
2516 | spin_unlock(&res->spinlock); | 2518 | spin_unlock(&res->spinlock); |
2517 | ret = -EINVAL; | 2519 | ret = -EINVAL; |
2518 | } | 2520 | } |
2519 | 2521 | ||
2520 | fail: | 2522 | fail: |
2521 | if (oldmle) { | 2523 | if (oldmle) { |
2522 | /* master is known, detach if not already detached */ | 2524 | /* master is known, detach if not already detached */ |
2523 | dlm_mle_detach_hb_events(dlm, oldmle); | 2525 | dlm_mle_detach_hb_events(dlm, oldmle); |
2524 | dlm_put_mle(oldmle); | 2526 | dlm_put_mle(oldmle); |
2525 | } | 2527 | } |
2526 | 2528 | ||
2527 | if (ret < 0) { | 2529 | if (ret < 0) { |
2528 | if (mle_added) { | 2530 | if (mle_added) { |
2529 | dlm_mle_detach_hb_events(dlm, mle); | 2531 | dlm_mle_detach_hb_events(dlm, mle); |
2530 | dlm_put_mle(mle); | 2532 | dlm_put_mle(mle); |
2531 | } else if (mle) { | 2533 | } else if (mle) { |
2532 | kmem_cache_free(dlm_mle_cache, mle); | 2534 | kmem_cache_free(dlm_mle_cache, mle); |
2533 | mle = NULL; | 2535 | mle = NULL; |
2534 | } | 2536 | } |
2535 | goto leave; | 2537 | goto leave; |
2536 | } | 2538 | } |
2537 | 2539 | ||
2538 | /* | 2540 | /* |
2539 | * at this point, we have a migration target, an mle | 2541 | * at this point, we have a migration target, an mle |
2540 | * in the master list, and the MIGRATING flag set on | 2542 | * in the master list, and the MIGRATING flag set on |
2541 | * the lockres | 2543 | * the lockres |
2542 | */ | 2544 | */ |
2543 | 2545 | ||
2544 | /* now that remote nodes are spinning on the MIGRATING flag, | 2546 | /* now that remote nodes are spinning on the MIGRATING flag, |
2545 | * ensure that all assert_master work is flushed. */ | 2547 | * ensure that all assert_master work is flushed. */ |
2546 | flush_workqueue(dlm->dlm_worker); | 2548 | flush_workqueue(dlm->dlm_worker); |
2547 | 2549 | ||
2548 | /* get an extra reference on the mle. | 2550 | /* get an extra reference on the mle. |
2549 | * otherwise the assert_master from the new | 2551 | * otherwise the assert_master from the new |
2550 | * master will destroy this. | 2552 | * master will destroy this. |
2551 | * also, make sure that all callers of dlm_get_mle | 2553 | * also, make sure that all callers of dlm_get_mle |
2552 | * take both dlm->spinlock and dlm->master_lock */ | 2554 | * take both dlm->spinlock and dlm->master_lock */ |
2553 | spin_lock(&dlm->spinlock); | 2555 | spin_lock(&dlm->spinlock); |
2554 | spin_lock(&dlm->master_lock); | 2556 | spin_lock(&dlm->master_lock); |
2555 | dlm_get_mle_inuse(mle); | 2557 | dlm_get_mle_inuse(mle); |
2556 | spin_unlock(&dlm->master_lock); | 2558 | spin_unlock(&dlm->master_lock); |
2557 | spin_unlock(&dlm->spinlock); | 2559 | spin_unlock(&dlm->spinlock); |
2558 | 2560 | ||
2559 | /* notify new node and send all lock state */ | 2561 | /* notify new node and send all lock state */ |
2560 | /* call send_one_lockres with migration flag. | 2562 | /* call send_one_lockres with migration flag. |
2561 | * this serves as notice to the target node that a | 2563 | * this serves as notice to the target node that a |
2562 | * migration is starting. */ | 2564 | * migration is starting. */ |
2563 | ret = dlm_send_one_lockres(dlm, res, mres, target, | 2565 | ret = dlm_send_one_lockres(dlm, res, mres, target, |
2564 | DLM_MRES_MIGRATION); | 2566 | DLM_MRES_MIGRATION); |
2565 | 2567 | ||
2566 | if (ret < 0) { | 2568 | if (ret < 0) { |
2567 | mlog(0, "migration to node %u failed with %d\n", | 2569 | mlog(0, "migration to node %u failed with %d\n", |
2568 | target, ret); | 2570 | target, ret); |
2569 | /* migration failed, detach and clean up mle */ | 2571 | /* migration failed, detach and clean up mle */ |
2570 | dlm_mle_detach_hb_events(dlm, mle); | 2572 | dlm_mle_detach_hb_events(dlm, mle); |
2571 | dlm_put_mle(mle); | 2573 | dlm_put_mle(mle); |
2572 | dlm_put_mle_inuse(mle); | 2574 | dlm_put_mle_inuse(mle); |
2573 | spin_lock(&res->spinlock); | 2575 | spin_lock(&res->spinlock); |
2574 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2576 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2575 | wake = 1; | 2577 | wake = 1; |
2576 | spin_unlock(&res->spinlock); | 2578 | spin_unlock(&res->spinlock); |
2577 | if (dlm_is_host_down(ret)) | 2579 | if (dlm_is_host_down(ret)) |
2578 | dlm_wait_for_node_death(dlm, target, | 2580 | dlm_wait_for_node_death(dlm, target, |
2579 | DLM_NODE_DEATH_WAIT_MAX); | 2581 | DLM_NODE_DEATH_WAIT_MAX); |
2580 | goto leave; | 2582 | goto leave; |
2581 | } | 2583 | } |
2582 | 2584 | ||
2583 | /* at this point, the target sends a message to all nodes, | 2585 | /* at this point, the target sends a message to all nodes, |
2584 | * (using dlm_do_migrate_request). this node is skipped since | 2586 | * (using dlm_do_migrate_request). this node is skipped since |
2585 | * we had to put an mle in the list to begin the process. this | 2587 | * we had to put an mle in the list to begin the process. this |
2586 | * node now waits for target to do an assert master. this node | 2588 | * node now waits for target to do an assert master. this node |
2587 | * will be the last one notified, ensuring that the migration | 2589 | * will be the last one notified, ensuring that the migration |
2588 | * is complete everywhere. if the target dies while this is | 2590 | * is complete everywhere. if the target dies while this is |
2589 | * going on, some nodes could potentially see the target as the | 2591 | * going on, some nodes could potentially see the target as the |
2590 | * master, so it is important that my recovery finds the migration | 2592 | * master, so it is important that my recovery finds the migration |
2591 | * mle and sets the master to UNKNOWN. */ | 2593 | * mle and sets the master to UNKNOWN. */ |
2592 | 2594 | ||
2593 | 2595 | ||
2594 | /* wait for new node to assert master */ | 2596 | /* wait for new node to assert master */ |
2595 | while (1) { | 2597 | while (1) { |
2596 | ret = wait_event_interruptible_timeout(mle->wq, | 2598 | ret = wait_event_interruptible_timeout(mle->wq, |
2597 | (atomic_read(&mle->woken) == 1), | 2599 | (atomic_read(&mle->woken) == 1), |
2598 | msecs_to_jiffies(5000)); | 2600 | msecs_to_jiffies(5000)); |
2599 | 2601 | ||
2600 | if (ret >= 0) { | 2602 | if (ret >= 0) { |
2601 | if (atomic_read(&mle->woken) == 1 || | 2603 | if (atomic_read(&mle->woken) == 1 || |
2602 | res->owner == target) | 2604 | res->owner == target) |
2603 | break; | 2605 | break; |
2604 | 2606 | ||
2605 | mlog(0, "%s:%.*s: timed out during migration\n", | 2607 | mlog(0, "%s:%.*s: timed out during migration\n", |
2606 | dlm->name, res->lockname.len, res->lockname.name); | 2608 | dlm->name, res->lockname.len, res->lockname.name); |
2607 | /* avoid hang during shutdown when migrating lockres | 2609 | /* avoid hang during shutdown when migrating lockres |
2608 | * to a node which also goes down */ | 2610 | * to a node which also goes down */ |
2609 | if (dlm_is_node_dead(dlm, target)) { | 2611 | if (dlm_is_node_dead(dlm, target)) { |
2610 | mlog(0, "%s:%.*s: expected migration " | 2612 | mlog(0, "%s:%.*s: expected migration " |
2611 | "target %u is no longer up, restarting\n", | 2613 | "target %u is no longer up, restarting\n", |
2612 | dlm->name, res->lockname.len, | 2614 | dlm->name, res->lockname.len, |
2613 | res->lockname.name, target); | 2615 | res->lockname.name, target); |
2614 | ret = -EINVAL; | 2616 | ret = -EINVAL; |
2615 | /* migration failed, detach and clean up mle */ | 2617 | /* migration failed, detach and clean up mle */ |
2616 | dlm_mle_detach_hb_events(dlm, mle); | 2618 | dlm_mle_detach_hb_events(dlm, mle); |
2617 | dlm_put_mle(mle); | 2619 | dlm_put_mle(mle); |
2618 | dlm_put_mle_inuse(mle); | 2620 | dlm_put_mle_inuse(mle); |
2619 | spin_lock(&res->spinlock); | 2621 | spin_lock(&res->spinlock); |
2620 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2622 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2621 | wake = 1; | 2623 | wake = 1; |
2622 | spin_unlock(&res->spinlock); | 2624 | spin_unlock(&res->spinlock); |
2623 | goto leave; | 2625 | goto leave; |
2624 | } | 2626 | } |
2625 | } else | 2627 | } else |
2626 | mlog(0, "%s:%.*s: caught signal during migration\n", | 2628 | mlog(0, "%s:%.*s: caught signal during migration\n", |
2627 | dlm->name, res->lockname.len, res->lockname.name); | 2629 | dlm->name, res->lockname.len, res->lockname.name); |
2628 | } | 2630 | } |
2629 | 2631 | ||
2630 | /* all done, set the owner, clear the flag */ | 2632 | /* all done, set the owner, clear the flag */ |
2631 | spin_lock(&res->spinlock); | 2633 | spin_lock(&res->spinlock); |
2632 | dlm_set_lockres_owner(dlm, res, target); | 2634 | dlm_set_lockres_owner(dlm, res, target); |
2633 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2635 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2634 | dlm_remove_nonlocal_locks(dlm, res); | 2636 | dlm_remove_nonlocal_locks(dlm, res); |
2635 | spin_unlock(&res->spinlock); | 2637 | spin_unlock(&res->spinlock); |
2636 | wake_up(&res->wq); | 2638 | wake_up(&res->wq); |
2637 | 2639 | ||
2638 | /* master is known, detach if not already detached */ | 2640 | /* master is known, detach if not already detached */ |
2639 | dlm_mle_detach_hb_events(dlm, mle); | 2641 | dlm_mle_detach_hb_events(dlm, mle); |
2640 | dlm_put_mle_inuse(mle); | 2642 | dlm_put_mle_inuse(mle); |
2641 | ret = 0; | 2643 | ret = 0; |
2642 | 2644 | ||
2643 | dlm_lockres_calc_usage(dlm, res); | 2645 | dlm_lockres_calc_usage(dlm, res); |
2644 | 2646 | ||
2645 | leave: | 2647 | leave: |
2646 | /* re-dirty the lockres if we failed */ | 2648 | /* re-dirty the lockres if we failed */ |
2647 | if (ret < 0) | 2649 | if (ret < 0) |
2648 | dlm_kick_thread(dlm, res); | 2650 | dlm_kick_thread(dlm, res); |
2649 | 2651 | ||
2650 | /* wake up waiters if the MIGRATING flag got set | 2652 | /* wake up waiters if the MIGRATING flag got set |
2651 | * but migration failed */ | 2653 | * but migration failed */ |
2652 | if (wake) | 2654 | if (wake) |
2653 | wake_up(&res->wq); | 2655 | wake_up(&res->wq); |
2654 | 2656 | ||
2655 | if (mres) | 2657 | if (mres) |
2656 | free_page((unsigned long)mres); | 2658 | free_page((unsigned long)mres); |
2657 | 2659 | ||
2658 | dlm_put(dlm); | 2660 | dlm_put(dlm); |
2659 | 2661 | ||
2660 | mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, | 2662 | mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, |
2661 | name, target, ret); | 2663 | name, target, ret); |
2662 | return ret; | 2664 | return ret; |
2663 | } | 2665 | } |
2664 | 2666 | ||
2665 | #define DLM_MIGRATION_RETRY_MS 100 | 2667 | #define DLM_MIGRATION_RETRY_MS 100 |
2666 | 2668 | ||
2667 | /* | 2669 | /* |
2668 | * Should be called only after beginning the domain leave process. | 2670 | * Should be called only after beginning the domain leave process. |
2669 | * There should not be any remaining locks on nonlocal lock resources, | 2671 | * There should not be any remaining locks on nonlocal lock resources, |
2670 | * and there should be no local locks left on locally mastered resources. | 2672 | * and there should be no local locks left on locally mastered resources. |
2671 | * | 2673 | * |
2672 | * Called with the dlm spinlock held, may drop it to do migration, but | 2674 | * Called with the dlm spinlock held, may drop it to do migration, but |
2673 | * will re-acquire before exit. | 2675 | * will re-acquire before exit. |
2674 | * | 2676 | * |
2675 | * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped | 2677 | * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped |
2676 | */ | 2678 | */ |
2677 | int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | 2679 | int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
2678 | { | 2680 | { |
2679 | int ret; | 2681 | int ret; |
2680 | int lock_dropped = 0; | 2682 | int lock_dropped = 0; |
2681 | u8 target = O2NM_MAX_NODES; | 2683 | u8 target = O2NM_MAX_NODES; |
2682 | 2684 | ||
2683 | assert_spin_locked(&dlm->spinlock); | 2685 | assert_spin_locked(&dlm->spinlock); |
2684 | 2686 | ||
2685 | spin_lock(&res->spinlock); | 2687 | spin_lock(&res->spinlock); |
2686 | if (dlm_is_lockres_migrateable(dlm, res)) | 2688 | if (dlm_is_lockres_migrateable(dlm, res)) |
2687 | target = dlm_pick_migration_target(dlm, res); | 2689 | target = dlm_pick_migration_target(dlm, res); |
2688 | spin_unlock(&res->spinlock); | 2690 | spin_unlock(&res->spinlock); |
2689 | 2691 | ||
2690 | if (target == O2NM_MAX_NODES) | 2692 | if (target == O2NM_MAX_NODES) |
2691 | goto leave; | 2693 | goto leave; |
2692 | 2694 | ||
2693 | /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ | 2695 | /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ |
2694 | spin_unlock(&dlm->spinlock); | 2696 | spin_unlock(&dlm->spinlock); |
2695 | lock_dropped = 1; | 2697 | lock_dropped = 1; |
2696 | ret = dlm_migrate_lockres(dlm, res, target); | 2698 | ret = dlm_migrate_lockres(dlm, res, target); |
2697 | if (ret) | 2699 | if (ret) |
2698 | mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", | 2700 | mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", |
2699 | dlm->name, res->lockname.len, res->lockname.name, | 2701 | dlm->name, res->lockname.len, res->lockname.name, |
2700 | target, ret); | 2702 | target, ret); |
2701 | spin_lock(&dlm->spinlock); | 2703 | spin_lock(&dlm->spinlock); |
2702 | leave: | 2704 | leave: |
2703 | return lock_dropped; | 2705 | return lock_dropped; |
2704 | } | 2706 | } |
2705 | 2707 | ||
2706 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) | 2708 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
2707 | { | 2709 | { |
2708 | int ret; | 2710 | int ret; |
2709 | spin_lock(&dlm->ast_lock); | 2711 | spin_lock(&dlm->ast_lock); |
2710 | spin_lock(&lock->spinlock); | 2712 | spin_lock(&lock->spinlock); |
2711 | ret = (list_empty(&lock->bast_list) && !lock->bast_pending); | 2713 | ret = (list_empty(&lock->bast_list) && !lock->bast_pending); |
2712 | spin_unlock(&lock->spinlock); | 2714 | spin_unlock(&lock->spinlock); |
2713 | spin_unlock(&dlm->ast_lock); | 2715 | spin_unlock(&dlm->ast_lock); |
2714 | return ret; | 2716 | return ret; |
2715 | } | 2717 | } |
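Because dlm_lock_basts_flushed() samples the bast list and bast_pending under both dlm->ast_lock and the per-lock spinlock, it gives a stable answer and can serve as a wait_event() condition, in the same way dlm_lockres_is_dirty() is used further down. A sketch of the typical consumption pattern (illustrative):

    /* Illustrative wait: block until no basts remain queued or pending
     * for this lock before letting migration proceed. */
    wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock));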
2716 | 2718 | ||
2717 | static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, | 2719 | static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, |
2718 | struct dlm_lock_resource *res, | 2720 | struct dlm_lock_resource *res, |
2719 | u8 mig_target) | 2721 | u8 mig_target) |
2720 | { | 2722 | { |
2721 | int can_proceed; | 2723 | int can_proceed; |
2722 | spin_lock(&res->spinlock); | 2724 | spin_lock(&res->spinlock); |
2723 | can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); | 2725 | can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); |
2724 | spin_unlock(&res->spinlock); | 2726 | spin_unlock(&res->spinlock); |
2725 | 2727 | ||
2726 | /* target has died, so make the caller break out of the | 2728 | /* target has died, so make the caller break out of the |
2727 | * wait_event, but caller must recheck the domain_map */ | 2729 | * wait_event, but caller must recheck the domain_map */ |
2728 | spin_lock(&dlm->spinlock); | 2730 | spin_lock(&dlm->spinlock); |
2729 | if (!test_bit(mig_target, dlm->domain_map)) | 2731 | if (!test_bit(mig_target, dlm->domain_map)) |
2730 | can_proceed = 1; | 2732 | can_proceed = 1; |
2731 | spin_unlock(&dlm->spinlock); | 2733 | spin_unlock(&dlm->spinlock); |
2732 | return can_proceed; | 2734 | return can_proceed; |
2733 | } | 2735 | } |
2734 | 2736 | ||
2735 | static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, | 2737 | static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, |
2736 | struct dlm_lock_resource *res) | 2738 | struct dlm_lock_resource *res) |
2737 | { | 2739 | { |
2738 | int ret; | 2740 | int ret; |
2739 | spin_lock(&res->spinlock); | 2741 | spin_lock(&res->spinlock); |
2740 | ret = !!(res->state & DLM_LOCK_RES_DIRTY); | 2742 | ret = !!(res->state & DLM_LOCK_RES_DIRTY); |
2741 | spin_unlock(&res->spinlock); | 2743 | spin_unlock(&res->spinlock); |
2742 | return ret; | 2744 | return ret; |
2743 | } | 2745 | } |
2744 | 2746 | ||
2745 | 2747 | ||
2746 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | 2748 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, |
2747 | struct dlm_lock_resource *res, | 2749 | struct dlm_lock_resource *res, |
2748 | u8 target) | 2750 | u8 target) |
2749 | { | 2751 | { |
2750 | int ret = 0; | 2752 | int ret = 0; |
2751 | 2753 | ||
2752 | mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", | 2754 | mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", |
2753 | res->lockname.len, res->lockname.name, dlm->node_num, | 2755 | res->lockname.len, res->lockname.name, dlm->node_num, |
2754 | target); | 2756 | target); |
2755 | /* need to set MIGRATING flag on lockres. this is done by | 2757 | /* need to set MIGRATING flag on lockres. this is done by |
2756 | * ensuring that all asts have been flushed for this lockres. */ | 2758 | * ensuring that all asts have been flushed for this lockres. */ |
2757 | spin_lock(&res->spinlock); | 2759 | spin_lock(&res->spinlock); |
2758 | BUG_ON(res->migration_pending); | 2760 | BUG_ON(res->migration_pending); |
2759 | res->migration_pending = 1; | 2761 | res->migration_pending = 1; |
2760 | /* strategy is to reserve an extra ast then release | 2762 | /* strategy is to reserve an extra ast then release |
2761 | * it below, letting the release do all of the work */ | 2763 | * it below, letting the release do all of the work */ |
2762 | __dlm_lockres_reserve_ast(res); | 2764 | __dlm_lockres_reserve_ast(res); |
2763 | spin_unlock(&res->spinlock); | 2765 | spin_unlock(&res->spinlock); |
2764 | 2766 | ||
2765 | /* now flush all the pending asts */ | 2767 | /* now flush all the pending asts */ |
2766 | dlm_kick_thread(dlm, res); | 2768 | dlm_kick_thread(dlm, res); |
2767 | /* before waiting on DIRTY, block processes which may | 2769 | /* before waiting on DIRTY, block processes which may |
2768 | * try to dirty the lockres before MIGRATING is set */ | 2770 | * try to dirty the lockres before MIGRATING is set */ |
2769 | spin_lock(&res->spinlock); | 2771 | spin_lock(&res->spinlock); |
2770 | BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); | 2772 | BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); |
2771 | res->state |= DLM_LOCK_RES_BLOCK_DIRTY; | 2773 | res->state |= DLM_LOCK_RES_BLOCK_DIRTY; |
2772 | spin_unlock(&res->spinlock); | 2774 | spin_unlock(&res->spinlock); |
2773 | /* now wait on any pending asts and the DIRTY state */ | 2775 | /* now wait on any pending asts and the DIRTY state */ |
2774 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | 2776 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); |
2775 | dlm_lockres_release_ast(dlm, res); | 2777 | dlm_lockres_release_ast(dlm, res); |
2776 | 2778 | ||
2777 | mlog(0, "about to wait on migration_wq, dirty=%s\n", | 2779 | mlog(0, "about to wait on migration_wq, dirty=%s\n", |
2778 | res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); | 2780 | res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); |
2779 | /* if the extra ref we just put was the final one, this | 2781 | /* if the extra ref we just put was the final one, this |
2780 | * will pass thru immediately. otherwise, we need to wait | 2782 | * will pass thru immediately. otherwise, we need to wait |
2781 | * for the last ast to finish. */ | 2783 | * for the last ast to finish. */ |
2782 | again: | 2784 | again: |
2783 | ret = wait_event_interruptible_timeout(dlm->migration_wq, | 2785 | ret = wait_event_interruptible_timeout(dlm->migration_wq, |
2784 | dlm_migration_can_proceed(dlm, res, target), | 2786 | dlm_migration_can_proceed(dlm, res, target), |
2785 | msecs_to_jiffies(1000)); | 2787 | msecs_to_jiffies(1000)); |
2786 | if (ret < 0) { | 2788 | if (ret < 0) { |
2787 | mlog(0, "woken again: migrating? %s, dead? %s\n", | 2789 | mlog(0, "woken again: migrating? %s, dead? %s\n", |
2788 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", | 2790 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", |
2789 | test_bit(target, dlm->domain_map) ? "no":"yes"); | 2791 | test_bit(target, dlm->domain_map) ? "no":"yes"); |
2790 | } else { | 2792 | } else { |
2791 | mlog(0, "all is well: migrating? %s, dead? %s\n", | 2793 | mlog(0, "all is well: migrating? %s, dead? %s\n", |
2792 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", | 2794 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", |
2793 | test_bit(target, dlm->domain_map) ? "no":"yes"); | 2795 | test_bit(target, dlm->domain_map) ? "no":"yes"); |
2794 | } | 2796 | } |
2795 | if (!dlm_migration_can_proceed(dlm, res, target)) { | 2797 | if (!dlm_migration_can_proceed(dlm, res, target)) { |
2796 | mlog(0, "trying again...\n"); | 2798 | mlog(0, "trying again...\n"); |
2797 | goto again; | 2799 | goto again; |
2798 | } | 2800 | } |
2799 | 2801 | ||
2800 | ret = 0; | 2802 | ret = 0; |
2801 | /* did the target go down or die? */ | 2803 | /* did the target go down or die? */ |
2802 | spin_lock(&dlm->spinlock); | 2804 | spin_lock(&dlm->spinlock); |
2803 | if (!test_bit(target, dlm->domain_map)) { | 2805 | if (!test_bit(target, dlm->domain_map)) { |
2804 | mlog(ML_ERROR, "aha. migration target %u just went down\n", | 2806 | mlog(ML_ERROR, "aha. migration target %u just went down\n", |
2805 | target); | 2807 | target); |
2806 | ret = -EHOSTDOWN; | 2808 | ret = -EHOSTDOWN; |
2807 | } | 2809 | } |
2808 | spin_unlock(&dlm->spinlock); | 2810 | spin_unlock(&dlm->spinlock); |
2809 | 2811 | ||
2810 | /* | 2812 | /* |
2811 | * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for | 2813 | * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for |
2812 | * another try; otherwise, we are sure the MIGRATING state is there, | 2814 | * another try; otherwise, we are sure the MIGRATING state is there, |
2813 | * drop the unneeded state which blocked threads trying to DIRTY | 2815 | * drop the unneeded state which blocked threads trying to DIRTY |
2814 | */ | 2816 | */ |
2815 | spin_lock(&res->spinlock); | 2817 | spin_lock(&res->spinlock); |
2816 | BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); | 2818 | BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); |
2817 | res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; | 2819 | res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; |
2818 | if (!ret) | 2820 | if (!ret) |
2819 | BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); | 2821 | BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); |
2820 | spin_unlock(&res->spinlock); | 2822 | spin_unlock(&res->spinlock); |
2821 | 2823 | ||
2822 | /* | 2824 | /* |
2823 | * at this point: | 2825 | * at this point: |
2824 | * | 2826 | * |
2825 | * o the DLM_LOCK_RES_MIGRATING flag is set if target not down | 2827 | * o the DLM_LOCK_RES_MIGRATING flag is set if target not down |
2826 | * o there are no pending asts on this lockres | 2828 | * o there are no pending asts on this lockres |
2827 | * o all processes trying to reserve an ast on this | 2829 | * o all processes trying to reserve an ast on this |
2828 | * lockres must wait for the MIGRATING flag to clear | 2830 | * lockres must wait for the MIGRATING flag to clear |
2829 | */ | 2831 | */ |
2830 | return ret; | 2832 | return ret; |
2831 | } | 2833 | } |
2832 | 2834 | ||
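The wait loop above sleeps up to a second on dlm->migration_wq, re-checks dlm_migration_can_proceed(), and jumps back to "again" until either the lockres can migrate or the target node dies. A minimal userspace sketch of that wait-and-retry shape, using a pthread condition variable in place of the kernel wait queue (struct state and migration_can_proceed() here are simplified stand-ins, not the dlm structures):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct state {
        pthread_mutex_t lock;
        pthread_cond_t wq;
        bool migrating;         /* models DLM_LOCK_RES_MIGRATING */
        bool target_up;         /* models test_bit(target, domain_map) */
};

static bool migration_can_proceed(struct state *s)
{
        /* mirrors dlm_migration_can_proceed(): go ahead once the
         * MIGRATING flag is set or the target node has gone down */
        return s->migrating || !s->target_up;
}

void wait_for_migration(struct state *s)
{
        struct timespec ts;

        pthread_mutex_lock(&s->lock);
        while (!migration_can_proceed(s)) {
                clock_gettime(CLOCK_REALTIME, &ts);
                ts.tv_sec += 1;                 /* roughly msecs_to_jiffies(1000) */
                /* may time out; the loop re-checks, like the "goto again" above */
                pthread_cond_timedwait(&s->wq, &s->lock, &ts);
        }
        pthread_mutex_unlock(&s->lock);
}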
2833 | /* last step in the migration process. | 2835 | /* last step in the migration process. |
2834 | * original master calls this to free all of the dlm_lock | 2836 | * original master calls this to free all of the dlm_lock |
2835 | * structures that used to be for other nodes. */ | 2837 | * structures that used to be for other nodes. */ |
2836 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | 2838 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, |
2837 | struct dlm_lock_resource *res) | 2839 | struct dlm_lock_resource *res) |
2838 | { | 2840 | { |
2839 | struct list_head *queue = &res->granted; | 2841 | struct list_head *queue = &res->granted; |
2840 | int i, bit; | 2842 | int i, bit; |
2841 | struct dlm_lock *lock, *next; | 2843 | struct dlm_lock *lock, *next; |
2842 | 2844 | ||
2843 | assert_spin_locked(&res->spinlock); | 2845 | assert_spin_locked(&res->spinlock); |
2844 | 2846 | ||
2845 | BUG_ON(res->owner == dlm->node_num); | 2847 | BUG_ON(res->owner == dlm->node_num); |
2846 | 2848 | ||
2847 | for (i=0; i<3; i++) { | 2849 | for (i=0; i<3; i++) { |
2848 | list_for_each_entry_safe(lock, next, queue, list) { | 2850 | list_for_each_entry_safe(lock, next, queue, list) { |
2849 | if (lock->ml.node != dlm->node_num) { | 2851 | if (lock->ml.node != dlm->node_num) { |
2850 | mlog(0, "putting lock for node %u\n", | 2852 | mlog(0, "putting lock for node %u\n", |
2851 | lock->ml.node); | 2853 | lock->ml.node); |
2852 | /* be extra careful */ | 2854 | /* be extra careful */ |
2853 | BUG_ON(!list_empty(&lock->ast_list)); | 2855 | BUG_ON(!list_empty(&lock->ast_list)); |
2854 | BUG_ON(!list_empty(&lock->bast_list)); | 2856 | BUG_ON(!list_empty(&lock->bast_list)); |
2855 | BUG_ON(lock->ast_pending); | 2857 | BUG_ON(lock->ast_pending); |
2856 | BUG_ON(lock->bast_pending); | 2858 | BUG_ON(lock->bast_pending); |
2857 | dlm_lockres_clear_refmap_bit(dlm, res, | 2859 | dlm_lockres_clear_refmap_bit(dlm, res, |
2858 | lock->ml.node); | 2860 | lock->ml.node); |
2859 | list_del_init(&lock->list); | 2861 | list_del_init(&lock->list); |
2860 | dlm_lock_put(lock); | 2862 | dlm_lock_put(lock); |
2861 | /* In a normal unlock, we would have added a | 2863 | /* In a normal unlock, we would have added a |
2862 | * DLM_UNLOCK_FREE_LOCK action. Force it. */ | 2864 | * DLM_UNLOCK_FREE_LOCK action. Force it. */ |
2863 | dlm_lock_put(lock); | 2865 | dlm_lock_put(lock); |
2864 | } | 2866 | } |
2865 | } | 2867 | } |
2866 | queue++; | 2868 | queue++; |
2867 | } | 2869 | } |
2868 | bit = 0; | 2870 | bit = 0; |
2869 | while (1) { | 2871 | while (1) { |
2870 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); | 2872 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); |
2871 | if (bit >= O2NM_MAX_NODES) | 2873 | if (bit >= O2NM_MAX_NODES) |
2872 | break; | 2874 | break; |
2873 | /* do not clear the local node reference, if there is a | 2875 | /* do not clear the local node reference, if there is a |
2874 | * process holding this, let it drop the ref itself */ | 2876 | * process holding this, let it drop the ref itself */ |
2875 | if (bit != dlm->node_num) { | 2877 | if (bit != dlm->node_num) { |
2876 | mlog(0, "%s:%.*s: node %u had a ref to this " | 2878 | mlog(0, "%s:%.*s: node %u had a ref to this " |
2877 | "migrating lockres, clearing\n", dlm->name, | 2879 | "migrating lockres, clearing\n", dlm->name, |
2878 | res->lockname.len, res->lockname.name, bit); | 2880 | res->lockname.len, res->lockname.name, bit); |
2879 | dlm_lockres_clear_refmap_bit(dlm, res, bit); | 2881 | dlm_lockres_clear_refmap_bit(dlm, res, bit); |
2880 | } | 2882 | } |
2881 | bit++; | 2883 | bit++; |
2882 | } | 2884 | } |
2883 | } | 2885 | } |
2884 | 2886 | ||
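dlm_remove_nonlocal_locks() visits the granted, converting and blocked queues by incrementing a single struct list_head pointer three times, which works only because those members sit back to back in struct dlm_lock_resource. A tiny standalone illustration of the idiom (struct resource here is a made-up stand-in, not the real lockres):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* hypothetical resource mirroring the three consecutive queues in
 * struct dlm_lock_resource */
struct resource {
        struct list_head granted;
        struct list_head converting;
        struct list_head blocked;
};

int main(void)
{
        struct resource res;
        struct list_head *queue = &res.granted;
        int i;

        /* relies on granted/converting/blocked being adjacent members */
        for (i = 0; i < 3; i++) {
                printf("queue %d lives at %p\n", i, (void *)queue);
                queue++;
        }
        return 0;
}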
2885 | /* | 2887 | /* |
2886 | * Pick a node to migrate the lock resource to. This function selects a | 2888 | * Pick a node to migrate the lock resource to. This function selects a |
2887 | * potential target based first on the locks and then on refmap. It skips | 2889 | * potential target based first on the locks and then on refmap. It skips |
2888 | * nodes that are in the process of exiting the domain. | 2890 | * nodes that are in the process of exiting the domain. |
2889 | */ | 2891 | */ |
2890 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | 2892 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, |
2891 | struct dlm_lock_resource *res) | 2893 | struct dlm_lock_resource *res) |
2892 | { | 2894 | { |
2893 | enum dlm_lockres_list idx; | 2895 | enum dlm_lockres_list idx; |
2894 | struct list_head *queue = &res->granted; | 2896 | struct list_head *queue = &res->granted; |
2895 | struct dlm_lock *lock; | 2897 | struct dlm_lock *lock; |
2896 | int noderef; | 2898 | int noderef; |
2897 | u8 nodenum = O2NM_MAX_NODES; | 2899 | u8 nodenum = O2NM_MAX_NODES; |
2898 | 2900 | ||
2899 | assert_spin_locked(&dlm->spinlock); | 2901 | assert_spin_locked(&dlm->spinlock); |
2900 | assert_spin_locked(&res->spinlock); | 2902 | assert_spin_locked(&res->spinlock); |
2901 | 2903 | ||
2902 | /* Go through all the locks */ | 2904 | /* Go through all the locks */ |
2903 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { | 2905 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { |
2904 | queue = dlm_list_idx_to_ptr(res, idx); | 2906 | queue = dlm_list_idx_to_ptr(res, idx); |
2905 | list_for_each_entry(lock, queue, list) { | 2907 | list_for_each_entry(lock, queue, list) { |
2906 | if (lock->ml.node == dlm->node_num) | 2908 | if (lock->ml.node == dlm->node_num) |
2907 | continue; | 2909 | continue; |
2908 | if (test_bit(lock->ml.node, dlm->exit_domain_map)) | 2910 | if (test_bit(lock->ml.node, dlm->exit_domain_map)) |
2909 | continue; | 2911 | continue; |
2910 | nodenum = lock->ml.node; | 2912 | nodenum = lock->ml.node; |
2911 | goto bail; | 2913 | goto bail; |
2912 | } | 2914 | } |
2913 | } | 2915 | } |
2914 | 2916 | ||
2915 | /* Go thru the refmap */ | 2917 | /* Go thru the refmap */ |
2916 | noderef = -1; | 2918 | noderef = -1; |
2917 | while (1) { | 2919 | while (1) { |
2918 | noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, | 2920 | noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, |
2919 | noderef + 1); | 2921 | noderef + 1); |
2920 | if (noderef >= O2NM_MAX_NODES) | 2922 | if (noderef >= O2NM_MAX_NODES) |
2921 | break; | 2923 | break; |
2922 | if (noderef == dlm->node_num) | 2924 | if (noderef == dlm->node_num) |
2923 | continue; | 2925 | continue; |
2924 | if (test_bit(noderef, dlm->exit_domain_map)) | 2926 | if (test_bit(noderef, dlm->exit_domain_map)) |
2925 | continue; | 2927 | continue; |
2926 | nodenum = noderef; | 2928 | nodenum = noderef; |
2927 | goto bail; | 2929 | goto bail; |
2928 | } | 2930 | } |
2929 | 2931 | ||
2930 | bail: | 2932 | bail: |
2931 | return nodenum; | 2933 | return nodenum; |
2932 | } | 2934 | } |
2933 | 2935 | ||
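dlm_pick_migration_target() prefers a node that already holds a lock on the resource and falls back to any node with a refmap reference, skipping itself and nodes leaving the domain in both passes. A compact standalone model of that policy over plain boolean arrays (MAX_NODES, the array names and the demo values are illustrative, not the o2nm types):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8                     /* stand-in for O2NM_MAX_NODES */

static int pick_target(const bool holds_lock[], const bool has_ref[],
                       const bool exiting[], int self)
{
        int n;

        /* first pass: nodes that already hold a lock on the resource */
        for (n = 0; n < MAX_NODES; n++)
                if (holds_lock[n] && n != self && !exiting[n])
                        return n;

        /* second pass: nodes that merely hold a reference */
        for (n = 0; n < MAX_NODES; n++)
                if (has_ref[n] && n != self && !exiting[n])
                        return n;

        return MAX_NODES;               /* no usable target */
}

int main(void)
{
        bool locks[MAX_NODES] = { false }, refs[MAX_NODES] = { false };
        bool exiting[MAX_NODES] = { false };

        refs[3] = true;                 /* only node 3 holds a reference */
        printf("target = %d\n", pick_target(locks, refs, exiting, 0));
        return 0;
}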
2934 | /* this is called by the new master once all lockres | 2936 | /* this is called by the new master once all lockres |
2935 | * data has been received */ | 2937 | * data has been received */ |
2936 | static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | 2938 | static int dlm_do_migrate_request(struct dlm_ctxt *dlm, |
2937 | struct dlm_lock_resource *res, | 2939 | struct dlm_lock_resource *res, |
2938 | u8 master, u8 new_master, | 2940 | u8 master, u8 new_master, |
2939 | struct dlm_node_iter *iter) | 2941 | struct dlm_node_iter *iter) |
2940 | { | 2942 | { |
2941 | struct dlm_migrate_request migrate; | 2943 | struct dlm_migrate_request migrate; |
2942 | int ret, skip, status = 0; | 2944 | int ret, skip, status = 0; |
2943 | int nodenum; | 2945 | int nodenum; |
2944 | 2946 | ||
2945 | memset(&migrate, 0, sizeof(migrate)); | 2947 | memset(&migrate, 0, sizeof(migrate)); |
2946 | migrate.namelen = res->lockname.len; | 2948 | migrate.namelen = res->lockname.len; |
2947 | memcpy(migrate.name, res->lockname.name, migrate.namelen); | 2949 | memcpy(migrate.name, res->lockname.name, migrate.namelen); |
2948 | migrate.new_master = new_master; | 2950 | migrate.new_master = new_master; |
2949 | migrate.master = master; | 2951 | migrate.master = master; |
2950 | 2952 | ||
2951 | ret = 0; | 2953 | ret = 0; |
2952 | 2954 | ||
2953 | /* send message to all nodes, except the master and myself */ | 2955 | /* send message to all nodes, except the master and myself */ |
2954 | while ((nodenum = dlm_node_iter_next(iter)) >= 0) { | 2956 | while ((nodenum = dlm_node_iter_next(iter)) >= 0) { |
2955 | if (nodenum == master || | 2957 | if (nodenum == master || |
2956 | nodenum == new_master) | 2958 | nodenum == new_master) |
2957 | continue; | 2959 | continue; |
2958 | 2960 | ||
2959 | /* We could race exit domain. If exited, skip. */ | 2961 | /* We could race exit domain. If exited, skip. */ |
2960 | spin_lock(&dlm->spinlock); | 2962 | spin_lock(&dlm->spinlock); |
2961 | skip = (!test_bit(nodenum, dlm->domain_map)); | 2963 | skip = (!test_bit(nodenum, dlm->domain_map)); |
2962 | spin_unlock(&dlm->spinlock); | 2964 | spin_unlock(&dlm->spinlock); |
2963 | if (skip) { | 2965 | if (skip) { |
2964 | clear_bit(nodenum, iter->node_map); | 2966 | clear_bit(nodenum, iter->node_map); |
2965 | continue; | 2967 | continue; |
2966 | } | 2968 | } |
2967 | 2969 | ||
2968 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, | 2970 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, |
2969 | &migrate, sizeof(migrate), nodenum, | 2971 | &migrate, sizeof(migrate), nodenum, |
2970 | &status); | 2972 | &status); |
2971 | if (ret < 0) { | 2973 | if (ret < 0) { |
2972 | mlog(ML_ERROR, "%s: res %.*s, Error %d send " | 2974 | mlog(ML_ERROR, "%s: res %.*s, Error %d send " |
2973 | "MIGRATE_REQUEST to node %u\n", dlm->name, | 2975 | "MIGRATE_REQUEST to node %u\n", dlm->name, |
2974 | migrate.namelen, migrate.name, ret, nodenum); | 2976 | migrate.namelen, migrate.name, ret, nodenum); |
2975 | if (!dlm_is_host_down(ret)) { | 2977 | if (!dlm_is_host_down(ret)) { |
2976 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); | 2978 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); |
2977 | BUG(); | 2979 | BUG(); |
2978 | } | 2980 | } |
2979 | clear_bit(nodenum, iter->node_map); | 2981 | clear_bit(nodenum, iter->node_map); |
2980 | ret = 0; | 2982 | ret = 0; |
2981 | } else if (status < 0) { | 2983 | } else if (status < 0) { |
2982 | mlog(0, "migrate request (node %u) returned %d!\n", | 2984 | mlog(0, "migrate request (node %u) returned %d!\n", |
2983 | nodenum, status); | 2985 | nodenum, status); |
2984 | ret = status; | 2986 | ret = status; |
2985 | } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { | 2987 | } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { |
2986 | /* during the migration request we short-circuited | 2988 | /* during the migration request we short-circuited |
2987 | * the mastery of the lockres. make sure we have | 2989 | * the mastery of the lockres. make sure we have |
2988 | * a mastery ref for nodenum */ | 2990 | * a mastery ref for nodenum */ |
2989 | mlog(0, "%s:%.*s: need ref for node %u\n", | 2991 | mlog(0, "%s:%.*s: need ref for node %u\n", |
2990 | dlm->name, res->lockname.len, res->lockname.name, | 2992 | dlm->name, res->lockname.len, res->lockname.name, |
2991 | nodenum); | 2993 | nodenum); |
2992 | spin_lock(&res->spinlock); | 2994 | spin_lock(&res->spinlock); |
2993 | dlm_lockres_set_refmap_bit(dlm, res, nodenum); | 2995 | dlm_lockres_set_refmap_bit(dlm, res, nodenum); |
2994 | spin_unlock(&res->spinlock); | 2996 | spin_unlock(&res->spinlock); |
2995 | } | 2997 | } |
2996 | } | 2998 | } |
2997 | 2999 | ||
2998 | if (ret < 0) | 3000 | if (ret < 0) |
2999 | mlog_errno(ret); | 3001 | mlog_errno(ret); |
3000 | 3002 | ||
3001 | mlog(0, "returning ret=%d\n", ret); | 3003 | mlog(0, "returning ret=%d\n", ret); |
3002 | return ret; | 3004 | return ret; |
3003 | } | 3005 | } |
3004 | 3006 | ||
3005 | 3007 | ||
3006 | /* if there is an existing mle for this lockres, we now know who the master is. | 3008 | /* if there is an existing mle for this lockres, we now know who the master is. |
3007 | * (the one who sent us *this* message) we can clear it up right away. | 3009 | * (the one who sent us *this* message) we can clear it up right away. |
3008 | * since the process that put the mle on the list still has a reference to it, | 3010 | * since the process that put the mle on the list still has a reference to it, |
3009 | * we can unhash it now, set the master and wake the process. as a result, | 3011 | * we can unhash it now, set the master and wake the process. as a result, |
3010 | * we will have no mle in the list to start with. now we can add an mle for | 3012 | * we will have no mle in the list to start with. now we can add an mle for |
3011 | * the migration and this should be the only one found for those scanning the | 3013 | * the migration and this should be the only one found for those scanning the |
3012 | * list. */ | 3014 | * list. */ |
3013 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, | 3015 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, |
3014 | void **ret_data) | 3016 | void **ret_data) |
3015 | { | 3017 | { |
3016 | struct dlm_ctxt *dlm = data; | 3018 | struct dlm_ctxt *dlm = data; |
3017 | struct dlm_lock_resource *res = NULL; | 3019 | struct dlm_lock_resource *res = NULL; |
3018 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; | 3020 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; |
3019 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; | 3021 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; |
3020 | const char *name; | 3022 | const char *name; |
3021 | unsigned int namelen, hash; | 3023 | unsigned int namelen, hash; |
3022 | int ret = 0; | 3024 | int ret = 0; |
3023 | 3025 | ||
3024 | if (!dlm_grab(dlm)) | 3026 | if (!dlm_grab(dlm)) |
3025 | return -EINVAL; | 3027 | return -EINVAL; |
3026 | 3028 | ||
3027 | name = migrate->name; | 3029 | name = migrate->name; |
3028 | namelen = migrate->namelen; | 3030 | namelen = migrate->namelen; |
3029 | hash = dlm_lockid_hash(name, namelen); | 3031 | hash = dlm_lockid_hash(name, namelen); |
3030 | 3032 | ||
3031 | /* preallocate.. if this fails, abort */ | 3033 | /* preallocate.. if this fails, abort */ |
3032 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); | 3034 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
3033 | 3035 | ||
3034 | if (!mle) { | 3036 | if (!mle) { |
3035 | ret = -ENOMEM; | 3037 | ret = -ENOMEM; |
3036 | goto leave; | 3038 | goto leave; |
3037 | } | 3039 | } |
3038 | 3040 | ||
3039 | /* check for pre-existing lock */ | 3041 | /* check for pre-existing lock */ |
3040 | spin_lock(&dlm->spinlock); | 3042 | spin_lock(&dlm->spinlock); |
3041 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); | 3043 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
3042 | if (res) { | 3044 | if (res) { |
3043 | spin_lock(&res->spinlock); | 3045 | spin_lock(&res->spinlock); |
3044 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 3046 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
3045 | /* if all is working ok, this can only mean that we got | 3047 | /* if all is working ok, this can only mean that we got |
3046 | * a migrate request from a node that we now see as | 3048 | * a migrate request from a node that we now see as |
3047 | * dead. what can we do here? drop it to the floor? */ | 3049 | * dead. what can we do here? drop it to the floor? */ |
3048 | spin_unlock(&res->spinlock); | 3050 | spin_unlock(&res->spinlock); |
3049 | mlog(ML_ERROR, "Got a migrate request, but the " | 3051 | mlog(ML_ERROR, "Got a migrate request, but the " |
3050 | "lockres is marked as recovering!"); | 3052 | "lockres is marked as recovering!"); |
3051 | kmem_cache_free(dlm_mle_cache, mle); | 3053 | kmem_cache_free(dlm_mle_cache, mle); |
3052 | ret = -EINVAL; /* need a better solution */ | 3054 | ret = -EINVAL; /* need a better solution */ |
3053 | goto unlock; | 3055 | goto unlock; |
3054 | } | 3056 | } |
3055 | res->state |= DLM_LOCK_RES_MIGRATING; | 3057 | res->state |= DLM_LOCK_RES_MIGRATING; |
3056 | spin_unlock(&res->spinlock); | 3058 | spin_unlock(&res->spinlock); |
3057 | } | 3059 | } |
3058 | 3060 | ||
3059 | spin_lock(&dlm->master_lock); | 3061 | spin_lock(&dlm->master_lock); |
3060 | /* ignore status. only nonzero status would BUG. */ | 3062 | /* ignore status. only nonzero status would BUG. */ |
3061 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, | 3063 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, |
3062 | name, namelen, | 3064 | name, namelen, |
3063 | migrate->new_master, | 3065 | migrate->new_master, |
3064 | migrate->master); | 3066 | migrate->master); |
3065 | 3067 | ||
3066 | spin_unlock(&dlm->master_lock); | 3068 | spin_unlock(&dlm->master_lock); |
3067 | unlock: | 3069 | unlock: |
3068 | spin_unlock(&dlm->spinlock); | 3070 | spin_unlock(&dlm->spinlock); |
3069 | 3071 | ||
3070 | if (oldmle) { | 3072 | if (oldmle) { |
3071 | /* master is known, detach if not already detached */ | 3073 | /* master is known, detach if not already detached */ |
3072 | dlm_mle_detach_hb_events(dlm, oldmle); | 3074 | dlm_mle_detach_hb_events(dlm, oldmle); |
3073 | dlm_put_mle(oldmle); | 3075 | dlm_put_mle(oldmle); |
3074 | } | 3076 | } |
3075 | 3077 | ||
3076 | if (res) | 3078 | if (res) |
3077 | dlm_lockres_put(res); | 3079 | dlm_lockres_put(res); |
3078 | leave: | 3080 | leave: |
3079 | dlm_put(dlm); | 3081 | dlm_put(dlm); |
3080 | return ret; | 3082 | return ret; |
3081 | } | 3083 | } |
3082 | 3084 | ||
3083 | /* must be holding dlm->spinlock and dlm->master_lock | 3085 | /* must be holding dlm->spinlock and dlm->master_lock |
3084 | * when adding a migration mle, we can clear any other mles | 3086 | * when adding a migration mle, we can clear any other mles |
3085 | * in the master list because we know with certainty that | 3087 | * in the master list because we know with certainty that |
3086 | * the master is "master". so we remove any old mle from | 3088 | * the master is "master". so we remove any old mle from |
3087 | * the list after setting its master field, and then add | 3089 | * the list after setting its master field, and then add |
3088 | * the new migration mle. this way we can hold with the rule | 3090 | * the new migration mle. this way we can hold with the rule |
3089 | * of having only one mle for a given lock name at all times. */ | 3091 | * of having only one mle for a given lock name at all times. */ |
3090 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | 3092 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, |
3091 | struct dlm_lock_resource *res, | 3093 | struct dlm_lock_resource *res, |
3092 | struct dlm_master_list_entry *mle, | 3094 | struct dlm_master_list_entry *mle, |
3093 | struct dlm_master_list_entry **oldmle, | 3095 | struct dlm_master_list_entry **oldmle, |
3094 | const char *name, unsigned int namelen, | 3096 | const char *name, unsigned int namelen, |
3095 | u8 new_master, u8 master) | 3097 | u8 new_master, u8 master) |
3096 | { | 3098 | { |
3097 | int found; | 3099 | int found; |
3098 | int ret = 0; | 3100 | int ret = 0; |
3099 | 3101 | ||
3100 | *oldmle = NULL; | 3102 | *oldmle = NULL; |
3101 | 3103 | ||
3102 | assert_spin_locked(&dlm->spinlock); | 3104 | assert_spin_locked(&dlm->spinlock); |
3103 | assert_spin_locked(&dlm->master_lock); | 3105 | assert_spin_locked(&dlm->master_lock); |
3104 | 3106 | ||
3105 | /* caller is responsible for any ref taken here on oldmle */ | 3107 | /* caller is responsible for any ref taken here on oldmle */ |
3106 | found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); | 3108 | found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); |
3107 | if (found) { | 3109 | if (found) { |
3108 | struct dlm_master_list_entry *tmp = *oldmle; | 3110 | struct dlm_master_list_entry *tmp = *oldmle; |
3109 | spin_lock(&tmp->spinlock); | 3111 | spin_lock(&tmp->spinlock); |
3110 | if (tmp->type == DLM_MLE_MIGRATION) { | 3112 | if (tmp->type == DLM_MLE_MIGRATION) { |
3111 | if (master == dlm->node_num) { | 3113 | if (master == dlm->node_num) { |
3112 | /* ah another process raced me to it */ | 3114 | /* ah another process raced me to it */ |
3113 | mlog(0, "tried to migrate %.*s, but some " | 3115 | mlog(0, "tried to migrate %.*s, but some " |
3114 | "process beat me to it\n", | 3116 | "process beat me to it\n", |
3115 | namelen, name); | 3117 | namelen, name); |
3116 | ret = -EEXIST; | 3118 | ret = -EEXIST; |
3117 | } else { | 3119 | } else { |
3118 | /* bad. 2 NODES are trying to migrate! */ | 3120 | /* bad. 2 NODES are trying to migrate! */ |
3119 | mlog(ML_ERROR, "migration error mle: " | 3121 | mlog(ML_ERROR, "migration error mle: " |
3120 | "master=%u new_master=%u // request: " | 3122 | "master=%u new_master=%u // request: " |
3121 | "master=%u new_master=%u // " | 3123 | "master=%u new_master=%u // " |
3122 | "lockres=%.*s\n", | 3124 | "lockres=%.*s\n", |
3123 | tmp->master, tmp->new_master, | 3125 | tmp->master, tmp->new_master, |
3124 | master, new_master, | 3126 | master, new_master, |
3125 | namelen, name); | 3127 | namelen, name); |
3126 | BUG(); | 3128 | BUG(); |
3127 | } | 3129 | } |
3128 | } else { | 3130 | } else { |
3129 | /* this is essentially what assert_master does */ | 3131 | /* this is essentially what assert_master does */ |
3130 | tmp->master = master; | 3132 | tmp->master = master; |
3131 | atomic_set(&tmp->woken, 1); | 3133 | atomic_set(&tmp->woken, 1); |
3132 | wake_up(&tmp->wq); | 3134 | wake_up(&tmp->wq); |
3133 | /* remove it so that only one mle will be found */ | 3135 | /* remove it so that only one mle will be found */ |
3134 | __dlm_unlink_mle(dlm, tmp); | 3136 | __dlm_unlink_mle(dlm, tmp); |
3135 | __dlm_mle_detach_hb_events(dlm, tmp); | 3137 | __dlm_mle_detach_hb_events(dlm, tmp); |
3136 | if (tmp->type == DLM_MLE_MASTER) { | 3138 | if (tmp->type == DLM_MLE_MASTER) { |
3137 | ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; | 3139 | ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; |
3138 | mlog(0, "%s:%.*s: master=%u, newmaster=%u, " | 3140 | mlog(0, "%s:%.*s: master=%u, newmaster=%u, " |
3139 | "telling master to get ref " | 3141 | "telling master to get ref " |
3140 | "for cleared out mle during " | 3142 | "for cleared out mle during " |
3141 | "migration\n", dlm->name, | 3143 | "migration\n", dlm->name, |
3142 | namelen, name, master, | 3144 | namelen, name, master, |
3143 | new_master); | 3145 | new_master); |
3144 | } | 3146 | } |
3145 | } | 3147 | } |
3146 | spin_unlock(&tmp->spinlock); | 3148 | spin_unlock(&tmp->spinlock); |
3147 | } | 3149 | } |
3148 | 3150 | ||
3149 | /* now add a migration mle to the tail of the list */ | 3151 | /* now add a migration mle to the tail of the list */ |
3150 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); | 3152 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); |
3151 | mle->new_master = new_master; | 3153 | mle->new_master = new_master; |
3152 | /* the new master will be sending an assert master for this. | 3154 | /* the new master will be sending an assert master for this. |
3153 | * at that point we will get the refmap reference */ | 3155 | * at that point we will get the refmap reference */ |
3154 | mle->master = master; | 3156 | mle->master = master; |
3155 | /* do this for consistency with other mle types */ | 3157 | /* do this for consistency with other mle types */ |
3156 | set_bit(new_master, mle->maybe_map); | 3158 | set_bit(new_master, mle->maybe_map); |
3157 | __dlm_insert_mle(dlm, mle); | 3159 | __dlm_insert_mle(dlm, mle); |
3158 | 3160 | ||
3159 | return ret; | 3161 | return ret; |
3160 | } | 3162 | } |
3161 | 3163 | ||
3162 | /* | 3164 | /* |
3163 | * Sets the owner of the lockres, associated to the mle, to UNKNOWN | 3165 | * Sets the owner of the lockres, associated to the mle, to UNKNOWN |
3164 | */ | 3166 | */ |
3165 | static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, | 3167 | static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, |
3166 | struct dlm_master_list_entry *mle) | 3168 | struct dlm_master_list_entry *mle) |
3167 | { | 3169 | { |
3168 | struct dlm_lock_resource *res; | 3170 | struct dlm_lock_resource *res; |
3169 | 3171 | ||
3170 | /* Find the lockres associated to the mle and set its owner to UNK */ | 3172 | /* Find the lockres associated to the mle and set its owner to UNK */ |
3171 | res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, | 3173 | res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, |
3172 | mle->mnamehash); | 3174 | mle->mnamehash); |
3173 | if (res) { | 3175 | if (res) { |
3174 | spin_unlock(&dlm->master_lock); | 3176 | spin_unlock(&dlm->master_lock); |
3175 | 3177 | ||
3176 | /* move lockres onto recovery list */ | 3178 | /* move lockres onto recovery list */ |
3177 | spin_lock(&res->spinlock); | 3179 | spin_lock(&res->spinlock); |
3178 | dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); | 3180 | dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); |
3179 | dlm_move_lockres_to_recovery_list(dlm, res); | 3181 | dlm_move_lockres_to_recovery_list(dlm, res); |
3180 | spin_unlock(&res->spinlock); | 3182 | spin_unlock(&res->spinlock); |
3181 | dlm_lockres_put(res); | 3183 | dlm_lockres_put(res); |
3182 | 3184 | ||
3183 | /* about to get rid of mle, detach from heartbeat */ | 3185 | /* about to get rid of mle, detach from heartbeat */ |
3184 | __dlm_mle_detach_hb_events(dlm, mle); | 3186 | __dlm_mle_detach_hb_events(dlm, mle); |
3185 | 3187 | ||
3186 | /* dump the mle */ | 3188 | /* dump the mle */ |
3187 | spin_lock(&dlm->master_lock); | 3189 | spin_lock(&dlm->master_lock); |
3188 | __dlm_put_mle(mle); | 3190 | __dlm_put_mle(mle); |
3189 | spin_unlock(&dlm->master_lock); | 3191 | spin_unlock(&dlm->master_lock); |
3190 | } | 3192 | } |
3191 | 3193 | ||
3192 | return res; | 3194 | return res; |
3193 | } | 3195 | } |
3194 | 3196 | ||
3195 | static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, | 3197 | static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, |
3196 | struct dlm_master_list_entry *mle) | 3198 | struct dlm_master_list_entry *mle) |
3197 | { | 3199 | { |
3198 | __dlm_mle_detach_hb_events(dlm, mle); | 3200 | __dlm_mle_detach_hb_events(dlm, mle); |
3199 | 3201 | ||
3200 | spin_lock(&mle->spinlock); | 3202 | spin_lock(&mle->spinlock); |
3201 | __dlm_unlink_mle(dlm, mle); | 3203 | __dlm_unlink_mle(dlm, mle); |
3202 | atomic_set(&mle->woken, 1); | 3204 | atomic_set(&mle->woken, 1); |
3203 | spin_unlock(&mle->spinlock); | 3205 | spin_unlock(&mle->spinlock); |
3204 | 3206 | ||
3205 | wake_up(&mle->wq); | 3207 | wake_up(&mle->wq); |
3206 | } | 3208 | } |
3207 | 3209 | ||
3208 | static void dlm_clean_block_mle(struct dlm_ctxt *dlm, | 3210 | static void dlm_clean_block_mle(struct dlm_ctxt *dlm, |
3209 | struct dlm_master_list_entry *mle, u8 dead_node) | 3211 | struct dlm_master_list_entry *mle, u8 dead_node) |
3210 | { | 3212 | { |
3211 | int bit; | 3213 | int bit; |
3212 | 3214 | ||
3213 | BUG_ON(mle->type != DLM_MLE_BLOCK); | 3215 | BUG_ON(mle->type != DLM_MLE_BLOCK); |
3214 | 3216 | ||
3215 | spin_lock(&mle->spinlock); | 3217 | spin_lock(&mle->spinlock); |
3216 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); | 3218 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); |
3217 | if (bit != dead_node) { | 3219 | if (bit != dead_node) { |
3218 | mlog(0, "mle found, but dead node %u would not have been " | 3220 | mlog(0, "mle found, but dead node %u would not have been " |
3219 | "master\n", dead_node); | 3221 | "master\n", dead_node); |
3220 | spin_unlock(&mle->spinlock); | 3222 | spin_unlock(&mle->spinlock); |
3221 | } else { | 3223 | } else { |
3222 | /* Must drop the refcount by one since the assert_master will | 3224 | /* Must drop the refcount by one since the assert_master will |
3223 | * never arrive. This may result in the mle being unlinked and | 3225 | * never arrive. This may result in the mle being unlinked and |
3224 | * freed, but there may still be a process waiting in the | 3226 | * freed, but there may still be a process waiting in the |
3225 | * dlmlock path which is fine. */ | 3227 | * dlmlock path which is fine. */ |
3226 | mlog(0, "node %u was expected master\n", dead_node); | 3228 | mlog(0, "node %u was expected master\n", dead_node); |
3227 | atomic_set(&mle->woken, 1); | 3229 | atomic_set(&mle->woken, 1); |
3228 | spin_unlock(&mle->spinlock); | 3230 | spin_unlock(&mle->spinlock); |
3229 | wake_up(&mle->wq); | 3231 | wake_up(&mle->wq); |
3230 | 3232 | ||
3231 | /* Do not need events any longer, so detach from heartbeat */ | 3233 | /* Do not need events any longer, so detach from heartbeat */ |
3232 | __dlm_mle_detach_hb_events(dlm, mle); | 3234 | __dlm_mle_detach_hb_events(dlm, mle); |
3233 | __dlm_put_mle(mle); | 3235 | __dlm_put_mle(mle); |
3234 | } | 3236 | } |
3235 | } | 3237 | } |
3236 | 3238 | ||
3237 | void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) | 3239 | void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) |
3238 | { | 3240 | { |
3239 | struct dlm_master_list_entry *mle; | 3241 | struct dlm_master_list_entry *mle; |
3240 | struct dlm_lock_resource *res; | 3242 | struct dlm_lock_resource *res; |
3241 | struct hlist_head *bucket; | 3243 | struct hlist_head *bucket; |
3242 | struct hlist_node *tmp; | 3244 | struct hlist_node *tmp; |
3243 | unsigned int i; | 3245 | unsigned int i; |
3244 | 3246 | ||
3245 | mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); | 3247 | mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); |
3246 | top: | 3248 | top: |
3247 | assert_spin_locked(&dlm->spinlock); | 3249 | assert_spin_locked(&dlm->spinlock); |
3248 | 3250 | ||
3249 | /* clean the master list */ | 3251 | /* clean the master list */ |
3250 | spin_lock(&dlm->master_lock); | 3252 | spin_lock(&dlm->master_lock); |
3251 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 3253 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
3252 | bucket = dlm_master_hash(dlm, i); | 3254 | bucket = dlm_master_hash(dlm, i); |
3253 | hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { | 3255 | hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { |
3254 | BUG_ON(mle->type != DLM_MLE_BLOCK && | 3256 | BUG_ON(mle->type != DLM_MLE_BLOCK && |
3255 | mle->type != DLM_MLE_MASTER && | 3257 | mle->type != DLM_MLE_MASTER && |
3256 | mle->type != DLM_MLE_MIGRATION); | 3258 | mle->type != DLM_MLE_MIGRATION); |
3257 | 3259 | ||
3258 | /* MASTER mles are initiated locally. The waiting | 3260 | /* MASTER mles are initiated locally. The waiting |
3259 | * process will notice the node map change shortly. | 3261 | * process will notice the node map change shortly. |
3260 | * Let that happen as normal. */ | 3262 | * Let that happen as normal. */ |
3261 | if (mle->type == DLM_MLE_MASTER) | 3263 | if (mle->type == DLM_MLE_MASTER) |
3262 | continue; | 3264 | continue; |
3263 | 3265 | ||
3264 | /* BLOCK mles are initiated by other nodes. Need to | 3266 | /* BLOCK mles are initiated by other nodes. Need to |
3265 | * clean up if the dead node would have been the | 3267 | * clean up if the dead node would have been the |
3266 | * master. */ | 3268 | * master. */ |
3267 | if (mle->type == DLM_MLE_BLOCK) { | 3269 | if (mle->type == DLM_MLE_BLOCK) { |
3268 | dlm_clean_block_mle(dlm, mle, dead_node); | 3270 | dlm_clean_block_mle(dlm, mle, dead_node); |
3269 | continue; | 3271 | continue; |
3270 | } | 3272 | } |
3271 | 3273 | ||
3272 | /* Everything else is a MIGRATION mle */ | 3274 | /* Everything else is a MIGRATION mle */ |
3273 | 3275 | ||
3274 | /* The rule for MIGRATION mles is that the master | 3276 | /* The rule for MIGRATION mles is that the master |
3275 | * becomes UNKNOWN if *either* the original or the new | 3277 | * becomes UNKNOWN if *either* the original or the new |
3276 | * master dies. All UNKNOWN lockres' are sent to | 3278 | * master dies. All UNKNOWN lockres' are sent to |
3277 | * whichever node becomes the recovery master. The new | 3279 | * whichever node becomes the recovery master. The new |
3278 | * master is responsible for determining if there is | 3280 | * master is responsible for determining if there is |
3279 | * still a master for this lockres, or if he needs to | 3281 | * still a master for this lockres, or if he needs to |
3280 | * take over mastery. Either way, this node should | 3282 | * take over mastery. Either way, this node should |
3281 | * expect another message to resolve this. */ | 3283 | * expect another message to resolve this. */ |
3282 | 3284 | ||
3283 | if (mle->master != dead_node && | 3285 | if (mle->master != dead_node && |
3284 | mle->new_master != dead_node) | 3286 | mle->new_master != dead_node) |
3285 | continue; | 3287 | continue; |
3286 | 3288 | ||
3287 | /* If we have reached this point, this mle needs to be | 3289 | /* If we have reached this point, this mle needs to be |
3288 | * removed from the list and freed. */ | 3290 | * removed from the list and freed. */ |
3289 | dlm_clean_migration_mle(dlm, mle); | 3291 | dlm_clean_migration_mle(dlm, mle); |
3290 | 3292 | ||
3291 | mlog(0, "%s: node %u died during migration from " | 3293 | mlog(0, "%s: node %u died during migration from " |
3292 | "%u to %u!\n", dlm->name, dead_node, mle->master, | 3294 | "%u to %u!\n", dlm->name, dead_node, mle->master, |
3293 | mle->new_master); | 3295 | mle->new_master); |
3294 | 3296 | ||
3295 | /* If we find a lockres associated with the mle, we've | 3297 | /* If we find a lockres associated with the mle, we've |
3296 | * hit this rare case that messes up our lock ordering. | 3298 | * hit this rare case that messes up our lock ordering. |
3297 | * If so, we need to drop the master lock so that we can | 3299 | * If so, we need to drop the master lock so that we can |
3298 | * take the lockres lock, meaning that we will have to | 3300 | * take the lockres lock, meaning that we will have to |
3299 | * restart from the head of list. */ | 3301 | * restart from the head of list. */ |
3300 | res = dlm_reset_mleres_owner(dlm, mle); | 3302 | res = dlm_reset_mleres_owner(dlm, mle); |
3301 | if (res) | 3303 | if (res) |
3302 | /* restart */ | 3304 | /* restart */ |
3303 | goto top; | 3305 | goto top; |
3304 | 3306 | ||
3305 | /* This may be the last reference */ | 3307 | /* This may be the last reference */ |
3306 | __dlm_put_mle(mle); | 3308 | __dlm_put_mle(mle); |
3307 | } | 3309 | } |
3308 | } | 3310 | } |
3309 | spin_unlock(&dlm->master_lock); | 3311 | spin_unlock(&dlm->master_lock); |
3310 | } | 3312 | } |
3311 | 3313 | ||
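When dlm_reset_mleres_owner() finds a lockres for a migration mle, the lock ordering forces dlm_clean_master_list() to drop dlm->master_lock and rescan the hash from the top. A minimal userspace sketch of that drop-and-restart scan pattern (the item list, the needs_slow_path flag and slow_path() are illustrative stand-ins, and the single mutex models dlm->master_lock only; the real code also holds dlm->spinlock throughout):

#include <pthread.h>
#include <stdbool.h>

struct item { struct item *next; bool needs_slow_path; bool done; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void slow_path(struct item *it)
{
        /* work that lock ordering forbids under list_lock */
        it->needs_slow_path = false;
        it->done = true;
}

void clean_list(struct item *head)
{
        struct item *it;
top:
        pthread_mutex_lock(&list_lock);
        for (it = head; it; it = it->next) {
                if (it->done)
                        continue;
                if (it->needs_slow_path) {
                        /* drop the lock, handle the item, then rescan from
                         * the head, exactly like the "goto top" above */
                        pthread_mutex_unlock(&list_lock);
                        slow_path(it);
                        goto top;
                }
                it->done = true;
        }
        pthread_mutex_unlock(&list_lock);
}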
3312 | int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | 3314 | int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
3313 | u8 old_master) | 3315 | u8 old_master) |
3314 | { | 3316 | { |
3315 | struct dlm_node_iter iter; | 3317 | struct dlm_node_iter iter; |
3316 | int ret = 0; | 3318 | int ret = 0; |
3317 | 3319 | ||
3318 | spin_lock(&dlm->spinlock); | 3320 | spin_lock(&dlm->spinlock); |
3319 | dlm_node_iter_init(dlm->domain_map, &iter); | 3321 | dlm_node_iter_init(dlm->domain_map, &iter); |
3320 | clear_bit(old_master, iter.node_map); | 3322 | clear_bit(old_master, iter.node_map); |
3321 | clear_bit(dlm->node_num, iter.node_map); | 3323 | clear_bit(dlm->node_num, iter.node_map); |
3322 | spin_unlock(&dlm->spinlock); | 3324 | spin_unlock(&dlm->spinlock); |
3323 | 3325 | ||
3324 | /* ownership of the lockres is changing. account for the | 3326 | /* ownership of the lockres is changing. account for the |
3325 | * mastery reference here since old_master will briefly have | 3327 | * mastery reference here since old_master will briefly have |
3326 | * a reference after the migration completes */ | 3328 | * a reference after the migration completes */ |
3327 | spin_lock(&res->spinlock); | 3329 | spin_lock(&res->spinlock); |
3328 | dlm_lockres_set_refmap_bit(dlm, res, old_master); | 3330 | dlm_lockres_set_refmap_bit(dlm, res, old_master); |
3329 | spin_unlock(&res->spinlock); | 3331 | spin_unlock(&res->spinlock); |
3330 | 3332 | ||
3331 | mlog(0, "now time to do a migrate request to other nodes\n"); | 3333 | mlog(0, "now time to do a migrate request to other nodes\n"); |
3332 | ret = dlm_do_migrate_request(dlm, res, old_master, | 3334 | ret = dlm_do_migrate_request(dlm, res, old_master, |
3333 | dlm->node_num, &iter); | 3335 | dlm->node_num, &iter); |
3334 | if (ret < 0) { | 3336 | if (ret < 0) { |
3335 | mlog_errno(ret); | 3337 | mlog_errno(ret); |
3336 | goto leave; | 3338 | goto leave; |
3337 | } | 3339 | } |
3338 | 3340 | ||
3339 | mlog(0, "doing assert master of %.*s to all except the original node\n", | 3341 | mlog(0, "doing assert master of %.*s to all except the original node\n", |
3340 | res->lockname.len, res->lockname.name); | 3342 | res->lockname.len, res->lockname.name); |
3341 | /* this call now finishes out the nodemap | 3343 | /* this call now finishes out the nodemap |
3342 | * even if one or more nodes die */ | 3344 | * even if one or more nodes die */ |
3343 | ret = dlm_do_assert_master(dlm, res, iter.node_map, | 3345 | ret = dlm_do_assert_master(dlm, res, iter.node_map, |
3344 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | 3346 | DLM_ASSERT_MASTER_FINISH_MIGRATION); |
3345 | if (ret < 0) { | 3347 | if (ret < 0) { |
3346 | /* no longer need to retry. all living nodes contacted. */ | 3348 | /* no longer need to retry. all living nodes contacted. */ |
3347 | mlog_errno(ret); | 3349 | mlog_errno(ret); |
3348 | ret = 0; | 3350 | ret = 0; |
3349 | } | 3351 | } |
3350 | 3352 | ||
3351 | memset(iter.node_map, 0, sizeof(iter.node_map)); | 3353 | memset(iter.node_map, 0, sizeof(iter.node_map)); |
3352 | set_bit(old_master, iter.node_map); | 3354 | set_bit(old_master, iter.node_map); |
3353 | mlog(0, "doing assert master of %.*s back to %u\n", | 3355 | mlog(0, "doing assert master of %.*s back to %u\n", |
3354 | res->lockname.len, res->lockname.name, old_master); | 3356 | res->lockname.len, res->lockname.name, old_master); |
3355 | ret = dlm_do_assert_master(dlm, res, iter.node_map, | 3357 | ret = dlm_do_assert_master(dlm, res, iter.node_map, |
3356 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | 3358 | DLM_ASSERT_MASTER_FINISH_MIGRATION); |
3357 | if (ret < 0) { | 3359 | if (ret < 0) { |
3358 | mlog(0, "assert master to original master failed " | 3360 | mlog(0, "assert master to original master failed " |
3359 | "with %d.\n", ret); | 3361 | "with %d.\n", ret); |
3360 | /* the only nonzero status here would be because of | 3362 | /* the only nonzero status here would be because of |
3361 | * a dead original node. we're done. */ | 3363 | * a dead original node. we're done. */ |
3362 | ret = 0; | 3364 | ret = 0; |
3363 | } | 3365 | } |
3364 | 3366 | ||
3365 | /* all done, set the owner, clear the flag */ | 3367 | /* all done, set the owner, clear the flag */ |
3366 | spin_lock(&res->spinlock); | 3368 | spin_lock(&res->spinlock); |
3367 | dlm_set_lockres_owner(dlm, res, dlm->node_num); | 3369 | dlm_set_lockres_owner(dlm, res, dlm->node_num); |
3368 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 3370 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
3369 | spin_unlock(&res->spinlock); | 3371 | spin_unlock(&res->spinlock); |
3370 | /* re-dirty it on the new master */ | 3372 | /* re-dirty it on the new master */ |
3371 | dlm_kick_thread(dlm, res); | 3373 | dlm_kick_thread(dlm, res); |
3372 | wake_up(&res->wq); | 3374 | wake_up(&res->wq); |
3373 | leave: | 3375 | leave: |
3374 | return ret; | 3376 | return ret; |
3375 | } | 3377 | } |
3376 | 3378 | ||
3377 | /* | 3379 | /* |
3378 | * LOCKRES AST REFCOUNT | 3380 | * LOCKRES AST REFCOUNT |
3379 | * this is integral to migration | 3381 | * this is integral to migration |
3380 | */ | 3382 | */ |
3381 | 3383 | ||
3382 | /* for future intent to call an ast, reserve one ahead of time. | 3384 | /* for future intent to call an ast, reserve one ahead of time. |
3383 | * this should be called only after waiting on the lockres | 3385 | * this should be called only after waiting on the lockres |
3384 | * with dlm_wait_on_lockres, and while still holding the | 3386 | * with dlm_wait_on_lockres, and while still holding the |
3385 | * spinlock after the call. */ | 3387 | * spinlock after the call. */ |
3386 | void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) | 3388 | void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) |
3387 | { | 3389 | { |
3388 | assert_spin_locked(&res->spinlock); | 3390 | assert_spin_locked(&res->spinlock); |
3389 | if (res->state & DLM_LOCK_RES_MIGRATING) { | 3391 | if (res->state & DLM_LOCK_RES_MIGRATING) { |
3390 | __dlm_print_one_lock_resource(res); | 3392 | __dlm_print_one_lock_resource(res); |
3391 | } | 3393 | } |
3392 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); | 3394 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); |
3393 | 3395 | ||
3394 | atomic_inc(&res->asts_reserved); | 3396 | atomic_inc(&res->asts_reserved); |
3395 | } | 3397 | } |
3396 | 3398 | ||
3397 | /* | 3399 | /* |
3398 | * used to drop the reserved ast, either because it went unused, | 3400 | * used to drop the reserved ast, either because it went unused, |
3399 | * or because the ast/bast was actually called. | 3401 | * or because the ast/bast was actually called. |
3400 | * | 3402 | * |
3401 | * also, if there is a pending migration on this lockres, | 3403 | * also, if there is a pending migration on this lockres, |
3402 | * and this was the last pending ast on the lockres, | 3404 | * and this was the last pending ast on the lockres, |
3403 | * atomically set the MIGRATING flag before we drop the lock. | 3405 | * atomically set the MIGRATING flag before we drop the lock. |
3404 | * this is how we ensure that migration can proceed with no | 3406 | * this is how we ensure that migration can proceed with no |
3405 | * asts in progress. note that it is ok if the state of the | 3407 | * asts in progress. note that it is ok if the state of the |
3406 | * queues is such that a lock should be granted in the future | 3408 | * queues is such that a lock should be granted in the future |
3407 | * or that a bast should be fired, because the new master will | 3409 | * or that a bast should be fired, because the new master will |
3408 | * shuffle the lists on this lockres as soon as it is migrated. | 3410 | * shuffle the lists on this lockres as soon as it is migrated. |
3409 | */ | 3411 | */ |
3410 | void dlm_lockres_release_ast(struct dlm_ctxt *dlm, | 3412 | void dlm_lockres_release_ast(struct dlm_ctxt *dlm, |
3411 | struct dlm_lock_resource *res) | 3413 | struct dlm_lock_resource *res) |
3412 | { | 3414 | { |
3413 | if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) | 3415 | if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) |
3414 | return; | 3416 | return; |
3415 | 3417 | ||
3416 | if (!res->migration_pending) { | 3418 | if (!res->migration_pending) { |
3417 | spin_unlock(&res->spinlock); | 3419 | spin_unlock(&res->spinlock); |
3418 | return; | 3420 | return; |
3419 | } | 3421 | } |
3420 | 3422 | ||
3421 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); | 3423 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); |
3422 | res->migration_pending = 0; | 3424 | res->migration_pending = 0; |
3423 | res->state |= DLM_LOCK_RES_MIGRATING; | 3425 | res->state |= DLM_LOCK_RES_MIGRATING; |
3424 | spin_unlock(&res->spinlock); | 3426 | spin_unlock(&res->spinlock); |
3425 | wake_up(&res->wq); | 3427 | wake_up(&res->wq); |
3426 | wake_up(&dlm->migration_wq); | 3428 | wake_up(&dlm->migration_wq); |
3427 | } | 3429 | } |
3428 | 3430 | ||
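dlm_lockres_release_ast() relies on atomic_dec_and_lock(): only the caller that drops asts_reserved to zero acquires the spinlock, so the migration_pending to MIGRATING transition happens exactly once and only when no asts remain reserved. A small userspace model of that pattern, with stdatomic and a pthread mutex standing in for the kernel atomic_t and spinlock (struct res and its field names are simplified stand-ins):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct res {
        atomic_int asts_reserved;
        pthread_mutex_t lock;
        bool migration_pending;
        bool migrating;
};

static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);

        /* fast path: while the count stays above zero, never take the lock */
        while (old > 1)
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;

        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;            /* we dropped it to zero; keep the lock */
        pthread_mutex_unlock(lock);
        return false;
}

void release_ast(struct res *r)
{
        if (!dec_and_lock(&r->asts_reserved, &r->lock))
                return;

        if (r->migration_pending) {
                /* last reserved ast is gone: flip the resource to MIGRATING */
                r->migration_pending = false;
                r->migrating = true;
        }
        pthread_mutex_unlock(&r->lock);
}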
3429 | void dlm_force_free_mles(struct dlm_ctxt *dlm) | 3431 | void dlm_force_free_mles(struct dlm_ctxt *dlm) |
3430 | { | 3432 | { |
3431 | int i; | 3433 | int i; |
3432 | struct hlist_head *bucket; | 3434 | struct hlist_head *bucket; |
3433 | struct dlm_master_list_entry *mle; | 3435 | struct dlm_master_list_entry *mle; |
3434 | struct hlist_node *tmp; | 3436 | struct hlist_node *tmp; |
3435 | 3437 | ||
3436 | /* | 3438 | /* |
3437 | * We notified all other nodes that we are exiting the domain and | 3439 | * We notified all other nodes that we are exiting the domain and |
3438 | * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still | 3440 | * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still |
3439 | * around we force free them and wake any processes that are waiting | 3441 | * around we force free them and wake any processes that are waiting |
3440 | * on the mles | 3442 | * on the mles |
3441 | */ | 3443 | */ |
3442 | spin_lock(&dlm->spinlock); | 3444 | spin_lock(&dlm->spinlock); |
3443 | spin_lock(&dlm->master_lock); | 3445 | spin_lock(&dlm->master_lock); |
3444 | 3446 | ||
3445 | BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); | 3447 | BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); |
3446 | BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); | 3448 | BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); |
3447 | 3449 | ||
3448 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 3450 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
3449 | bucket = dlm_master_hash(dlm, i); | 3451 | bucket = dlm_master_hash(dlm, i); |
3450 | hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { | 3452 | hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { |
3451 | if (mle->type != DLM_MLE_BLOCK) { | 3453 | if (mle->type != DLM_MLE_BLOCK) { |
3452 | mlog(ML_ERROR, "bad mle: %p\n", mle); | 3454 | mlog(ML_ERROR, "bad mle: %p\n", mle); |
3453 | dlm_print_one_mle(mle); | 3455 | dlm_print_one_mle(mle); |
3454 | } | 3456 | } |
3455 | atomic_set(&mle->woken, 1); | 3457 | atomic_set(&mle->woken, 1); |
3456 | wake_up(&mle->wq); | 3458 | wake_up(&mle->wq); |
3457 | 3459 | ||
3458 | __dlm_unlink_mle(dlm, mle); | 3460 | __dlm_unlink_mle(dlm, mle); |
3459 | __dlm_mle_detach_hb_events(dlm, mle); | 3461 | __dlm_mle_detach_hb_events(dlm, mle); |
3460 | __dlm_put_mle(mle); | 3462 | __dlm_put_mle(mle); |
3461 | } | 3463 | } |
fs/ocfs2/super.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * super.c | 4 | * super.c |
5 | * | 5 | * |
6 | * load/unload driver, mount/dismount volumes | 6 | * load/unload driver, mount/dismount volumes |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/init.h> | 31 | #include <linux/init.h> |
32 | #include <linux/random.h> | 32 | #include <linux/random.h> |
33 | #include <linux/statfs.h> | 33 | #include <linux/statfs.h> |
34 | #include <linux/moduleparam.h> | 34 | #include <linux/moduleparam.h> |
35 | #include <linux/blkdev.h> | 35 | #include <linux/blkdev.h> |
36 | #include <linux/socket.h> | 36 | #include <linux/socket.h> |
37 | #include <linux/inet.h> | 37 | #include <linux/inet.h> |
38 | #include <linux/parser.h> | 38 | #include <linux/parser.h> |
39 | #include <linux/crc32.h> | 39 | #include <linux/crc32.h> |
40 | #include <linux/debugfs.h> | 40 | #include <linux/debugfs.h> |
41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
42 | #include <linux/seq_file.h> | 42 | #include <linux/seq_file.h> |
43 | #include <linux/quotaops.h> | 43 | #include <linux/quotaops.h> |
44 | #include <linux/cleancache.h> | 44 | #include <linux/cleancache.h> |
45 | 45 | ||
46 | #define CREATE_TRACE_POINTS | 46 | #define CREATE_TRACE_POINTS |
47 | #include "ocfs2_trace.h" | 47 | #include "ocfs2_trace.h" |
48 | 48 | ||
49 | #include <cluster/masklog.h> | 49 | #include <cluster/masklog.h> |
50 | 50 | ||
51 | #include "ocfs2.h" | 51 | #include "ocfs2.h" |
52 | 52 | ||
53 | /* this should be the only file to include a version 1 header */ | 53 | /* this should be the only file to include a version 1 header */ |
54 | #include "ocfs1_fs_compat.h" | 54 | #include "ocfs1_fs_compat.h" |
55 | 55 | ||
56 | #include "alloc.h" | 56 | #include "alloc.h" |
57 | #include "aops.h" | 57 | #include "aops.h" |
58 | #include "blockcheck.h" | 58 | #include "blockcheck.h" |
59 | #include "dlmglue.h" | 59 | #include "dlmglue.h" |
60 | #include "export.h" | 60 | #include "export.h" |
61 | #include "extent_map.h" | 61 | #include "extent_map.h" |
62 | #include "heartbeat.h" | 62 | #include "heartbeat.h" |
63 | #include "inode.h" | 63 | #include "inode.h" |
64 | #include "journal.h" | 64 | #include "journal.h" |
65 | #include "localalloc.h" | 65 | #include "localalloc.h" |
66 | #include "namei.h" | 66 | #include "namei.h" |
67 | #include "slot_map.h" | 67 | #include "slot_map.h" |
68 | #include "super.h" | 68 | #include "super.h" |
69 | #include "sysfile.h" | 69 | #include "sysfile.h" |
70 | #include "uptodate.h" | 70 | #include "uptodate.h" |
71 | #include "xattr.h" | 71 | #include "xattr.h" |
72 | #include "quota.h" | 72 | #include "quota.h" |
73 | #include "refcounttree.h" | 73 | #include "refcounttree.h" |
74 | #include "suballoc.h" | 74 | #include "suballoc.h" |
75 | 75 | ||
76 | #include "buffer_head_io.h" | 76 | #include "buffer_head_io.h" |
77 | 77 | ||
78 | static struct kmem_cache *ocfs2_inode_cachep; | 78 | static struct kmem_cache *ocfs2_inode_cachep; |
79 | struct kmem_cache *ocfs2_dquot_cachep; | 79 | struct kmem_cache *ocfs2_dquot_cachep; |
80 | struct kmem_cache *ocfs2_qf_chunk_cachep; | 80 | struct kmem_cache *ocfs2_qf_chunk_cachep; |
81 | 81 | ||
82 | /* OCFS2 needs to schedule several different types of work which | 82 | /* OCFS2 needs to schedule several different types of work which |
83 | * require cluster locking, disk I/O, recovery waits, etc. Since these | 83 | * require cluster locking, disk I/O, recovery waits, etc. Since these |
84 | * types of work tend to be heavy we avoid using the kernel events | 84 | * types of work tend to be heavy we avoid using the kernel events |
85 | * workqueue and schedule on our own. */ | 85 | * workqueue and schedule on our own. */ |
86 | struct workqueue_struct *ocfs2_wq = NULL; | 86 | struct workqueue_struct *ocfs2_wq = NULL; |
87 | 87 | ||
88 | static struct dentry *ocfs2_debugfs_root; | 88 | static struct dentry *ocfs2_debugfs_root; |
89 | 89 | ||
90 | MODULE_AUTHOR("Oracle"); | 90 | MODULE_AUTHOR("Oracle"); |
91 | MODULE_LICENSE("GPL"); | 91 | MODULE_LICENSE("GPL"); |
92 | MODULE_DESCRIPTION("OCFS2 cluster file system"); | 92 | MODULE_DESCRIPTION("OCFS2 cluster file system"); |
93 | 93 | ||
94 | struct mount_options | 94 | struct mount_options |
95 | { | 95 | { |
96 | unsigned long commit_interval; | 96 | unsigned long commit_interval; |
97 | unsigned long mount_opt; | 97 | unsigned long mount_opt; |
98 | unsigned int atime_quantum; | 98 | unsigned int atime_quantum; |
99 | signed short slot; | 99 | signed short slot; |
100 | int localalloc_opt; | 100 | int localalloc_opt; |
101 | unsigned int resv_level; | 101 | unsigned int resv_level; |
102 | int dir_resv_level; | 102 | int dir_resv_level; |
103 | char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; | 103 | char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; |
104 | }; | 104 | }; |
105 | 105 | ||
106 | static int ocfs2_parse_options(struct super_block *sb, char *options, | 106 | static int ocfs2_parse_options(struct super_block *sb, char *options, |
107 | struct mount_options *mopt, | 107 | struct mount_options *mopt, |
108 | int is_remount); | 108 | int is_remount); |
109 | static int ocfs2_check_set_options(struct super_block *sb, | 109 | static int ocfs2_check_set_options(struct super_block *sb, |
110 | struct mount_options *options); | 110 | struct mount_options *options); |
111 | static int ocfs2_show_options(struct seq_file *s, struct dentry *root); | 111 | static int ocfs2_show_options(struct seq_file *s, struct dentry *root); |
112 | static void ocfs2_put_super(struct super_block *sb); | 112 | static void ocfs2_put_super(struct super_block *sb); |
113 | static int ocfs2_mount_volume(struct super_block *sb); | 113 | static int ocfs2_mount_volume(struct super_block *sb); |
114 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); | 114 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); |
115 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); | 115 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); |
116 | static int ocfs2_initialize_mem_caches(void); | 116 | static int ocfs2_initialize_mem_caches(void); |
117 | static void ocfs2_free_mem_caches(void); | 117 | static void ocfs2_free_mem_caches(void); |
118 | static void ocfs2_delete_osb(struct ocfs2_super *osb); | 118 | static void ocfs2_delete_osb(struct ocfs2_super *osb); |
119 | 119 | ||
120 | static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); | 120 | static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); |
121 | 121 | ||
122 | static int ocfs2_sync_fs(struct super_block *sb, int wait); | 122 | static int ocfs2_sync_fs(struct super_block *sb, int wait); |
123 | 123 | ||
124 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); | 124 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); |
125 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); | 125 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); |
126 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); | 126 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); |
127 | static int ocfs2_check_volume(struct ocfs2_super *osb); | 127 | static int ocfs2_check_volume(struct ocfs2_super *osb); |
128 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | 128 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, |
129 | struct buffer_head *bh, | 129 | struct buffer_head *bh, |
130 | u32 sectsize, | 130 | u32 sectsize, |
131 | struct ocfs2_blockcheck_stats *stats); | 131 | struct ocfs2_blockcheck_stats *stats); |
132 | static int ocfs2_initialize_super(struct super_block *sb, | 132 | static int ocfs2_initialize_super(struct super_block *sb, |
133 | struct buffer_head *bh, | 133 | struct buffer_head *bh, |
134 | int sector_size, | 134 | int sector_size, |
135 | struct ocfs2_blockcheck_stats *stats); | 135 | struct ocfs2_blockcheck_stats *stats); |
136 | static int ocfs2_get_sector(struct super_block *sb, | 136 | static int ocfs2_get_sector(struct super_block *sb, |
137 | struct buffer_head **bh, | 137 | struct buffer_head **bh, |
138 | int block, | 138 | int block, |
139 | int sect_size); | 139 | int sect_size); |
140 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | 140 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); |
141 | static void ocfs2_destroy_inode(struct inode *inode); | 141 | static void ocfs2_destroy_inode(struct inode *inode); |
142 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); | 142 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); |
143 | static int ocfs2_enable_quotas(struct ocfs2_super *osb); | 143 | static int ocfs2_enable_quotas(struct ocfs2_super *osb); |
144 | static void ocfs2_disable_quotas(struct ocfs2_super *osb); | 144 | static void ocfs2_disable_quotas(struct ocfs2_super *osb); |
145 | 145 | ||
146 | static const struct super_operations ocfs2_sops = { | 146 | static const struct super_operations ocfs2_sops = { |
147 | .statfs = ocfs2_statfs, | 147 | .statfs = ocfs2_statfs, |
148 | .alloc_inode = ocfs2_alloc_inode, | 148 | .alloc_inode = ocfs2_alloc_inode, |
149 | .destroy_inode = ocfs2_destroy_inode, | 149 | .destroy_inode = ocfs2_destroy_inode, |
150 | .drop_inode = ocfs2_drop_inode, | 150 | .drop_inode = ocfs2_drop_inode, |
151 | .evict_inode = ocfs2_evict_inode, | 151 | .evict_inode = ocfs2_evict_inode, |
152 | .sync_fs = ocfs2_sync_fs, | 152 | .sync_fs = ocfs2_sync_fs, |
153 | .put_super = ocfs2_put_super, | 153 | .put_super = ocfs2_put_super, |
154 | .remount_fs = ocfs2_remount, | 154 | .remount_fs = ocfs2_remount, |
155 | .show_options = ocfs2_show_options, | 155 | .show_options = ocfs2_show_options, |
156 | .quota_read = ocfs2_quota_read, | 156 | .quota_read = ocfs2_quota_read, |
157 | .quota_write = ocfs2_quota_write, | 157 | .quota_write = ocfs2_quota_write, |
158 | }; | 158 | }; |
159 | 159 | ||
160 | enum { | 160 | enum { |
161 | Opt_barrier, | 161 | Opt_barrier, |
162 | Opt_err_panic, | 162 | Opt_err_panic, |
163 | Opt_err_ro, | 163 | Opt_err_ro, |
164 | Opt_intr, | 164 | Opt_intr, |
165 | Opt_nointr, | 165 | Opt_nointr, |
166 | Opt_hb_none, | 166 | Opt_hb_none, |
167 | Opt_hb_local, | 167 | Opt_hb_local, |
168 | Opt_hb_global, | 168 | Opt_hb_global, |
169 | Opt_data_ordered, | 169 | Opt_data_ordered, |
170 | Opt_data_writeback, | 170 | Opt_data_writeback, |
171 | Opt_atime_quantum, | 171 | Opt_atime_quantum, |
172 | Opt_slot, | 172 | Opt_slot, |
173 | Opt_commit, | 173 | Opt_commit, |
174 | Opt_localalloc, | 174 | Opt_localalloc, |
175 | Opt_localflocks, | 175 | Opt_localflocks, |
176 | Opt_stack, | 176 | Opt_stack, |
177 | Opt_user_xattr, | 177 | Opt_user_xattr, |
178 | Opt_nouser_xattr, | 178 | Opt_nouser_xattr, |
179 | Opt_inode64, | 179 | Opt_inode64, |
180 | Opt_acl, | 180 | Opt_acl, |
181 | Opt_noacl, | 181 | Opt_noacl, |
182 | Opt_usrquota, | 182 | Opt_usrquota, |
183 | Opt_grpquota, | 183 | Opt_grpquota, |
184 | Opt_coherency_buffered, | 184 | Opt_coherency_buffered, |
185 | Opt_coherency_full, | 185 | Opt_coherency_full, |
186 | Opt_resv_level, | 186 | Opt_resv_level, |
187 | Opt_dir_resv_level, | 187 | Opt_dir_resv_level, |
188 | Opt_err, | 188 | Opt_err, |
189 | }; | 189 | }; |
190 | 190 | ||
191 | static const match_table_t tokens = { | 191 | static const match_table_t tokens = { |
192 | {Opt_barrier, "barrier=%u"}, | 192 | {Opt_barrier, "barrier=%u"}, |
193 | {Opt_err_panic, "errors=panic"}, | 193 | {Opt_err_panic, "errors=panic"}, |
194 | {Opt_err_ro, "errors=remount-ro"}, | 194 | {Opt_err_ro, "errors=remount-ro"}, |
195 | {Opt_intr, "intr"}, | 195 | {Opt_intr, "intr"}, |
196 | {Opt_nointr, "nointr"}, | 196 | {Opt_nointr, "nointr"}, |
197 | {Opt_hb_none, OCFS2_HB_NONE}, | 197 | {Opt_hb_none, OCFS2_HB_NONE}, |
198 | {Opt_hb_local, OCFS2_HB_LOCAL}, | 198 | {Opt_hb_local, OCFS2_HB_LOCAL}, |
199 | {Opt_hb_global, OCFS2_HB_GLOBAL}, | 199 | {Opt_hb_global, OCFS2_HB_GLOBAL}, |
200 | {Opt_data_ordered, "data=ordered"}, | 200 | {Opt_data_ordered, "data=ordered"}, |
201 | {Opt_data_writeback, "data=writeback"}, | 201 | {Opt_data_writeback, "data=writeback"}, |
202 | {Opt_atime_quantum, "atime_quantum=%u"}, | 202 | {Opt_atime_quantum, "atime_quantum=%u"}, |
203 | {Opt_slot, "preferred_slot=%u"}, | 203 | {Opt_slot, "preferred_slot=%u"}, |
204 | {Opt_commit, "commit=%u"}, | 204 | {Opt_commit, "commit=%u"}, |
205 | {Opt_localalloc, "localalloc=%d"}, | 205 | {Opt_localalloc, "localalloc=%d"}, |
206 | {Opt_localflocks, "localflocks"}, | 206 | {Opt_localflocks, "localflocks"}, |
207 | {Opt_stack, "cluster_stack=%s"}, | 207 | {Opt_stack, "cluster_stack=%s"}, |
208 | {Opt_user_xattr, "user_xattr"}, | 208 | {Opt_user_xattr, "user_xattr"}, |
209 | {Opt_nouser_xattr, "nouser_xattr"}, | 209 | {Opt_nouser_xattr, "nouser_xattr"}, |
210 | {Opt_inode64, "inode64"}, | 210 | {Opt_inode64, "inode64"}, |
211 | {Opt_acl, "acl"}, | 211 | {Opt_acl, "acl"}, |
212 | {Opt_noacl, "noacl"}, | 212 | {Opt_noacl, "noacl"}, |
213 | {Opt_usrquota, "usrquota"}, | 213 | {Opt_usrquota, "usrquota"}, |
214 | {Opt_grpquota, "grpquota"}, | 214 | {Opt_grpquota, "grpquota"}, |
215 | {Opt_coherency_buffered, "coherency=buffered"}, | 215 | {Opt_coherency_buffered, "coherency=buffered"}, |
216 | {Opt_coherency_full, "coherency=full"}, | 216 | {Opt_coherency_full, "coherency=full"}, |
217 | {Opt_resv_level, "resv_level=%u"}, | 217 | {Opt_resv_level, "resv_level=%u"}, |
218 | {Opt_dir_resv_level, "dir_resv_level=%u"}, | 218 | {Opt_dir_resv_level, "dir_resv_level=%u"}, |
219 | {Opt_err, NULL} | 219 | {Opt_err, NULL} |
220 | }; | 220 | }; |
221 | 221 | ||
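The table above is consumed through the kernel's generic option parser (<linux/parser.h>): ocfs2_parse_options(), declared earlier in this file but not part of this hunk, splits the mount data on commas and matches each piece against these patterns. A minimal sketch of that style of loop, using a hypothetical options structure rather than ocfs2's real struct mount_options:

	#include <linux/parser.h>	/* match_token(), match_int(), substring_t */
	#include <linux/string.h>	/* strsep() */

	struct my_opts {		/* hypothetical, for illustration only */
		unsigned long	mount_opt;
		unsigned int	atime_quantum;
	};
	#define MY_MOUNT_INTR	0x1	/* hypothetical flag */

	static int parse_opts_sketch(char *options, struct my_opts *opts)
	{
		substring_t args[MAX_OPT_ARGS];
		char *p;

		while ((p = strsep(&options, ",")) != NULL) {
			int token, option;

			if (!*p)
				continue;
			token = match_token(p, tokens, args);
			switch (token) {
			case Opt_intr:
				opts->mount_opt |= MY_MOUNT_INTR;
				break;
			case Opt_atime_quantum:
				if (match_int(&args[0], &option))
					return 0;	/* bad numeric argument */
				opts->atime_quantum = option;
				break;
			default:
				return 0;		/* unrecognised option */
			}
		}
		return 1;	/* success, the way ocfs2_parse_options() reports it */
	}

The %u/%d placeholders in the patterns are what let match_int() recover the numeric argument from args[0] after a match.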
222 | #ifdef CONFIG_DEBUG_FS | 222 | #ifdef CONFIG_DEBUG_FS |
223 | static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) | 223 | static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) |
224 | { | 224 | { |
225 | struct ocfs2_cluster_connection *cconn = osb->cconn; | 225 | struct ocfs2_cluster_connection *cconn = osb->cconn; |
226 | struct ocfs2_recovery_map *rm = osb->recovery_map; | 226 | struct ocfs2_recovery_map *rm = osb->recovery_map; |
227 | struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; | 227 | struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; |
228 | int i, out = 0; | 228 | int i, out = 0; |
229 | 229 | ||
230 | out += snprintf(buf + out, len - out, | 230 | out += snprintf(buf + out, len - out, |
231 | "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", | 231 | "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", |
232 | "Device", osb->dev_str, osb->uuid_str, | 232 | "Device", osb->dev_str, osb->uuid_str, |
233 | osb->fs_generation, osb->vol_label); | 233 | osb->fs_generation, osb->vol_label); |
234 | 234 | ||
235 | out += snprintf(buf + out, len - out, | 235 | out += snprintf(buf + out, len - out, |
236 | "%10s => State: %d Flags: 0x%lX\n", "Volume", | 236 | "%10s => State: %d Flags: 0x%lX\n", "Volume", |
237 | atomic_read(&osb->vol_state), osb->osb_flags); | 237 | atomic_read(&osb->vol_state), osb->osb_flags); |
238 | 238 | ||
239 | out += snprintf(buf + out, len - out, | 239 | out += snprintf(buf + out, len - out, |
240 | "%10s => Block: %lu Cluster: %d\n", "Sizes", | 240 | "%10s => Block: %lu Cluster: %d\n", "Sizes", |
241 | osb->sb->s_blocksize, osb->s_clustersize); | 241 | osb->sb->s_blocksize, osb->s_clustersize); |
242 | 242 | ||
243 | out += snprintf(buf + out, len - out, | 243 | out += snprintf(buf + out, len - out, |
244 | "%10s => Compat: 0x%X Incompat: 0x%X " | 244 | "%10s => Compat: 0x%X Incompat: 0x%X " |
245 | "ROcompat: 0x%X\n", | 245 | "ROcompat: 0x%X\n", |
246 | "Features", osb->s_feature_compat, | 246 | "Features", osb->s_feature_compat, |
247 | osb->s_feature_incompat, osb->s_feature_ro_compat); | 247 | osb->s_feature_incompat, osb->s_feature_ro_compat); |
248 | 248 | ||
249 | out += snprintf(buf + out, len - out, | 249 | out += snprintf(buf + out, len - out, |
250 | "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", | 250 | "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", |
251 | osb->s_mount_opt, osb->s_atime_quantum); | 251 | osb->s_mount_opt, osb->s_atime_quantum); |
252 | 252 | ||
253 | if (cconn) { | 253 | if (cconn) { |
254 | out += snprintf(buf + out, len - out, | 254 | out += snprintf(buf + out, len - out, |
255 | "%10s => Stack: %s Name: %*s " | 255 | "%10s => Stack: %s Name: %*s " |
256 | "Version: %d.%d\n", "Cluster", | 256 | "Version: %d.%d\n", "Cluster", |
257 | (*osb->osb_cluster_stack == '\0' ? | 257 | (*osb->osb_cluster_stack == '\0' ? |
258 | "o2cb" : osb->osb_cluster_stack), | 258 | "o2cb" : osb->osb_cluster_stack), |
259 | cconn->cc_namelen, cconn->cc_name, | 259 | cconn->cc_namelen, cconn->cc_name, |
260 | cconn->cc_version.pv_major, | 260 | cconn->cc_version.pv_major, |
261 | cconn->cc_version.pv_minor); | 261 | cconn->cc_version.pv_minor); |
262 | } | 262 | } |
263 | 263 | ||
264 | spin_lock(&osb->dc_task_lock); | 264 | spin_lock(&osb->dc_task_lock); |
265 | out += snprintf(buf + out, len - out, | 265 | out += snprintf(buf + out, len - out, |
266 | "%10s => Pid: %d Count: %lu WakeSeq: %lu " | 266 | "%10s => Pid: %d Count: %lu WakeSeq: %lu " |
267 | "WorkSeq: %lu\n", "DownCnvt", | 267 | "WorkSeq: %lu\n", "DownCnvt", |
268 | (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), | 268 | (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), |
269 | osb->blocked_lock_count, osb->dc_wake_sequence, | 269 | osb->blocked_lock_count, osb->dc_wake_sequence, |
270 | osb->dc_work_sequence); | 270 | osb->dc_work_sequence); |
271 | spin_unlock(&osb->dc_task_lock); | 271 | spin_unlock(&osb->dc_task_lock); |
272 | 272 | ||
273 | spin_lock(&osb->osb_lock); | 273 | spin_lock(&osb->osb_lock); |
274 | out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", | 274 | out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", |
275 | "Recovery", | 275 | "Recovery", |
276 | (osb->recovery_thread_task ? | 276 | (osb->recovery_thread_task ? |
277 | task_pid_nr(osb->recovery_thread_task) : -1)); | 277 | task_pid_nr(osb->recovery_thread_task) : -1)); |
278 | if (rm->rm_used == 0) | 278 | if (rm->rm_used == 0) |
279 | out += snprintf(buf + out, len - out, " None\n"); | 279 | out += snprintf(buf + out, len - out, " None\n"); |
280 | else { | 280 | else { |
281 | for (i = 0; i < rm->rm_used; i++) | 281 | for (i = 0; i < rm->rm_used; i++) |
282 | out += snprintf(buf + out, len - out, " %d", | 282 | out += snprintf(buf + out, len - out, " %d", |
283 | rm->rm_entries[i]); | 283 | rm->rm_entries[i]); |
284 | out += snprintf(buf + out, len - out, "\n"); | 284 | out += snprintf(buf + out, len - out, "\n"); |
285 | } | 285 | } |
286 | spin_unlock(&osb->osb_lock); | 286 | spin_unlock(&osb->osb_lock); |
287 | 287 | ||
288 | out += snprintf(buf + out, len - out, | 288 | out += snprintf(buf + out, len - out, |
289 | "%10s => Pid: %d Interval: %lu\n", "Commit", | 289 | "%10s => Pid: %d Interval: %lu\n", "Commit", |
290 | (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), | 290 | (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), |
291 | osb->osb_commit_interval); | 291 | osb->osb_commit_interval); |
292 | 292 | ||
293 | out += snprintf(buf + out, len - out, | 293 | out += snprintf(buf + out, len - out, |
294 | "%10s => State: %d TxnId: %lu NumTxns: %d\n", | 294 | "%10s => State: %d TxnId: %lu NumTxns: %d\n", |
295 | "Journal", osb->journal->j_state, | 295 | "Journal", osb->journal->j_state, |
296 | osb->journal->j_trans_id, | 296 | osb->journal->j_trans_id, |
297 | atomic_read(&osb->journal->j_num_trans)); | 297 | atomic_read(&osb->journal->j_num_trans)); |
298 | 298 | ||
299 | out += snprintf(buf + out, len - out, | 299 | out += snprintf(buf + out, len - out, |
300 | "%10s => GlobalAllocs: %d LocalAllocs: %d " | 300 | "%10s => GlobalAllocs: %d LocalAllocs: %d " |
301 | "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", | 301 | "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", |
302 | "Stats", | 302 | "Stats", |
303 | atomic_read(&osb->alloc_stats.bitmap_data), | 303 | atomic_read(&osb->alloc_stats.bitmap_data), |
304 | atomic_read(&osb->alloc_stats.local_data), | 304 | atomic_read(&osb->alloc_stats.local_data), |
305 | atomic_read(&osb->alloc_stats.bg_allocs), | 305 | atomic_read(&osb->alloc_stats.bg_allocs), |
306 | atomic_read(&osb->alloc_stats.moves), | 306 | atomic_read(&osb->alloc_stats.moves), |
307 | atomic_read(&osb->alloc_stats.bg_extends)); | 307 | atomic_read(&osb->alloc_stats.bg_extends)); |
308 | 308 | ||
309 | out += snprintf(buf + out, len - out, | 309 | out += snprintf(buf + out, len - out, |
310 | "%10s => State: %u Descriptor: %llu Size: %u bits " | 310 | "%10s => State: %u Descriptor: %llu Size: %u bits " |
311 | "Default: %u bits\n", | 311 | "Default: %u bits\n", |
312 | "LocalAlloc", osb->local_alloc_state, | 312 | "LocalAlloc", osb->local_alloc_state, |
313 | (unsigned long long)osb->la_last_gd, | 313 | (unsigned long long)osb->la_last_gd, |
314 | osb->local_alloc_bits, osb->local_alloc_default_bits); | 314 | osb->local_alloc_bits, osb->local_alloc_default_bits); |
315 | 315 | ||
316 | spin_lock(&osb->osb_lock); | 316 | spin_lock(&osb->osb_lock); |
317 | out += snprintf(buf + out, len - out, | 317 | out += snprintf(buf + out, len - out, |
318 | "%10s => InodeSlot: %d StolenInodes: %d, " | 318 | "%10s => InodeSlot: %d StolenInodes: %d, " |
319 | "MetaSlot: %d StolenMeta: %d\n", "Steal", | 319 | "MetaSlot: %d StolenMeta: %d\n", "Steal", |
320 | osb->s_inode_steal_slot, | 320 | osb->s_inode_steal_slot, |
321 | atomic_read(&osb->s_num_inodes_stolen), | 321 | atomic_read(&osb->s_num_inodes_stolen), |
322 | osb->s_meta_steal_slot, | 322 | osb->s_meta_steal_slot, |
323 | atomic_read(&osb->s_num_meta_stolen)); | 323 | atomic_read(&osb->s_num_meta_stolen)); |
324 | spin_unlock(&osb->osb_lock); | 324 | spin_unlock(&osb->osb_lock); |
325 | 325 | ||
326 | out += snprintf(buf + out, len - out, "OrphanScan => "); | 326 | out += snprintf(buf + out, len - out, "OrphanScan => "); |
327 | out += snprintf(buf + out, len - out, "Local: %u Global: %u ", | 327 | out += snprintf(buf + out, len - out, "Local: %u Global: %u ", |
328 | os->os_count, os->os_seqno); | 328 | os->os_count, os->os_seqno); |
329 | out += snprintf(buf + out, len - out, " Last Scan: "); | 329 | out += snprintf(buf + out, len - out, " Last Scan: "); |
330 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) | 330 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) |
331 | out += snprintf(buf + out, len - out, "Disabled\n"); | 331 | out += snprintf(buf + out, len - out, "Disabled\n"); |
332 | else | 332 | else |
333 | out += snprintf(buf + out, len - out, "%lu seconds ago\n", | 333 | out += snprintf(buf + out, len - out, "%lu seconds ago\n", |
334 | (get_seconds() - os->os_scantime.tv_sec)); | 334 | (get_seconds() - os->os_scantime.tv_sec)); |
335 | 335 | ||
336 | out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", | 336 | out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", |
337 | "Slots", "Num", "RecoGen"); | 337 | "Slots", "Num", "RecoGen"); |
338 | for (i = 0; i < osb->max_slots; ++i) { | 338 | for (i = 0; i < osb->max_slots; ++i) { |
339 | out += snprintf(buf + out, len - out, | 339 | out += snprintf(buf + out, len - out, |
340 | "%10s %c %3d %10d\n", | 340 | "%10s %c %3d %10d\n", |
341 | " ", | 341 | " ", |
342 | (i == osb->slot_num ? '*' : ' '), | 342 | (i == osb->slot_num ? '*' : ' '), |
343 | i, osb->slot_recovery_generations[i]); | 343 | i, osb->slot_recovery_generations[i]); |
344 | } | 344 | } |
345 | 345 | ||
346 | return out; | 346 | return out; |
347 | } | 347 | } |
348 | 348 | ||
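ocfs2_osb_dump() builds its whole report with the out += snprintf(buf + out, len - out, ...) append idiom. As a general property of snprintf() (not something this hunk changes), it returns the length the record would have had rather than what was actually stored, so the idiom relies on the output fitting comfortably in the buffer; scnprintf() is the kernel variant that caps the return value at what was really written. A minimal illustration of the idiom, with hypothetical field names:

	/* Minimal illustration of the append idiom used above. */
	static int demo_dump(char *buf, int len)
	{
		int out = 0;

		out += snprintf(buf + out, len - out, "%10s => %d\n", "FieldA", 1);
		out += snprintf(buf + out, len - out, "%10s => %lu\n", "FieldB", 2UL);
		return out;	/* bytes of formatted output */
	}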
349 | static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) | 349 | static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) |
350 | { | 350 | { |
351 | struct ocfs2_super *osb = inode->i_private; | 351 | struct ocfs2_super *osb = inode->i_private; |
352 | char *buf = NULL; | 352 | char *buf = NULL; |
353 | 353 | ||
354 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 354 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
355 | if (!buf) | 355 | if (!buf) |
356 | goto bail; | 356 | goto bail; |
357 | 357 | ||
358 | i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); | 358 | i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); |
359 | 359 | ||
360 | file->private_data = buf; | 360 | file->private_data = buf; |
361 | 361 | ||
362 | return 0; | 362 | return 0; |
363 | bail: | 363 | bail: |
364 | return -ENOMEM; | 364 | return -ENOMEM; |
365 | } | 365 | } |
366 | 366 | ||
367 | static int ocfs2_debug_release(struct inode *inode, struct file *file) | 367 | static int ocfs2_debug_release(struct inode *inode, struct file *file) |
368 | { | 368 | { |
369 | kfree(file->private_data); | 369 | kfree(file->private_data); |
370 | return 0; | 370 | return 0; |
371 | } | 371 | } |
372 | 372 | ||
373 | static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, | 373 | static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, |
374 | size_t nbytes, loff_t *ppos) | 374 | size_t nbytes, loff_t *ppos) |
375 | { | 375 | { |
376 | return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, | 376 | return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, |
377 | i_size_read(file->f_mapping->host)); | 377 | i_size_read(file->f_mapping->host)); |
378 | } | 378 | } |
379 | #else | 379 | #else |
380 | static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) | 380 | static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) |
381 | { | 381 | { |
382 | return 0; | 382 | return 0; |
383 | } | 383 | } |
384 | static int ocfs2_debug_release(struct inode *inode, struct file *file) | 384 | static int ocfs2_debug_release(struct inode *inode, struct file *file) |
385 | { | 385 | { |
386 | return 0; | 386 | return 0; |
387 | } | 387 | } |
388 | static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, | 388 | static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, |
389 | size_t nbytes, loff_t *ppos) | 389 | size_t nbytes, loff_t *ppos) |
390 | { | 390 | { |
391 | return 0; | 391 | return 0; |
392 | } | 392 | } |
393 | #endif /* CONFIG_DEBUG_FS */ | 393 | #endif /* CONFIG_DEBUG_FS */ |
394 | 394 | ||
395 | static const struct file_operations ocfs2_osb_debug_fops = { | 395 | static const struct file_operations ocfs2_osb_debug_fops = { |
396 | .open = ocfs2_osb_debug_open, | 396 | .open = ocfs2_osb_debug_open, |
397 | .release = ocfs2_debug_release, | 397 | .release = ocfs2_debug_release, |
398 | .read = ocfs2_debug_read, | 398 | .read = ocfs2_debug_read, |
399 | .llseek = generic_file_llseek, | 399 | .llseek = generic_file_llseek, |
400 | }; | 400 | }; |
401 | 401 | ||
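These file operations are not registered in this hunk; the debugfs file that uses them is created elsewhere in super.c at mount time. A hedged sketch of what that registration typically looks like (the "fs_state" name, the helper name and the parent dentry here are assumptions for illustration, not necessarily what ocfs2 uses):

	#include <linux/debugfs.h>

	static void ocfs2_register_osb_debugfs_sketch(struct ocfs2_super *osb,
						      struct dentry *parent)
	{
		struct dentry *entry;

		entry = debugfs_create_file("fs_state", S_IFREG | S_IRUSR,
					    parent, osb,
					    &ocfs2_osb_debug_fops);
		if (!entry)
			pr_warn("ocfs2: could not create debugfs state file\n");
	}

debugfs_create_file() stores the data pointer (the osb here) in the new inode's i_private, which is exactly where ocfs2_osb_debug_open() above retrieves it.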
402 | static int ocfs2_sync_fs(struct super_block *sb, int wait) | 402 | static int ocfs2_sync_fs(struct super_block *sb, int wait) |
403 | { | 403 | { |
404 | int status; | 404 | int status; |
405 | tid_t target; | 405 | tid_t target; |
406 | struct ocfs2_super *osb = OCFS2_SB(sb); | 406 | struct ocfs2_super *osb = OCFS2_SB(sb); |
407 | 407 | ||
408 | if (ocfs2_is_hard_readonly(osb)) | 408 | if (ocfs2_is_hard_readonly(osb)) |
409 | return -EROFS; | 409 | return -EROFS; |
410 | 410 | ||
411 | if (wait) { | 411 | if (wait) { |
412 | status = ocfs2_flush_truncate_log(osb); | 412 | status = ocfs2_flush_truncate_log(osb); |
413 | if (status < 0) | 413 | if (status < 0) |
414 | mlog_errno(status); | 414 | mlog_errno(status); |
415 | } else { | 415 | } else { |
416 | ocfs2_schedule_truncate_log_flush(osb, 0); | 416 | ocfs2_schedule_truncate_log_flush(osb, 0); |
417 | } | 417 | } |
418 | 418 | ||
419 | if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, | 419 | if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, |
420 | &target)) { | 420 | &target)) { |
421 | if (wait) | 421 | if (wait) |
422 | jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, | 422 | jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, |
423 | target); | 423 | target); |
424 | } | 424 | } |
425 | return 0; | 425 | return 0; |
426 | } | 426 | } |
427 | 427 | ||
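In ocfs2_sync_fs() above, wait selects between a synchronous and a fire-and-forget sync: with wait set, the truncate log is flushed immediately and jbd2_log_wait_commit() blocks until the commit started by jbd2_journal_start_commit() reaches disk; without it, the truncate-log flush is merely queued and the journal commit is kicked off but not waited for. A small userspace trigger for the synchronous path, assuming /mnt/ocfs2 is an ocfs2 mount (syncfs(2) ends up in ->sync_fs, with wait set on the final pass):

	#define _GNU_SOURCE		/* syncfs() is a GNU extension */
	#include <fcntl.h>
	#include <unistd.h>

	int sync_one_fs(void)
	{
		int fd = open("/mnt/ocfs2", O_RDONLY | O_DIRECTORY);

		if (fd < 0)
			return -1;
		if (syncfs(fd) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}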
428 | static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) | 428 | static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) |
429 | { | 429 | { |
430 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) | 430 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) |
431 | && (ino == USER_QUOTA_SYSTEM_INODE | 431 | && (ino == USER_QUOTA_SYSTEM_INODE |
432 | || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) | 432 | || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) |
433 | return 0; | 433 | return 0; |
434 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) | 434 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) |
435 | && (ino == GROUP_QUOTA_SYSTEM_INODE | 435 | && (ino == GROUP_QUOTA_SYSTEM_INODE |
436 | || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) | 436 | || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) |
437 | return 0; | 437 | return 0; |
438 | return 1; | 438 | return 1; |
439 | } | 439 | } |
440 | 440 | ||
441 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | 441 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) |
442 | { | 442 | { |
443 | struct inode *new = NULL; | 443 | struct inode *new = NULL; |
444 | int status = 0; | 444 | int status = 0; |
445 | int i; | 445 | int i; |
446 | 446 | ||
447 | new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); | 447 | new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); |
448 | if (IS_ERR(new)) { | 448 | if (IS_ERR(new)) { |
449 | status = PTR_ERR(new); | 449 | status = PTR_ERR(new); |
450 | mlog_errno(status); | 450 | mlog_errno(status); |
451 | goto bail; | 451 | goto bail; |
452 | } | 452 | } |
453 | osb->root_inode = new; | 453 | osb->root_inode = new; |
454 | 454 | ||
455 | new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); | 455 | new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); |
456 | if (IS_ERR(new)) { | 456 | if (IS_ERR(new)) { |
457 | status = PTR_ERR(new); | 457 | status = PTR_ERR(new); |
458 | mlog_errno(status); | 458 | mlog_errno(status); |
459 | goto bail; | 459 | goto bail; |
460 | } | 460 | } |
461 | osb->sys_root_inode = new; | 461 | osb->sys_root_inode = new; |
462 | 462 | ||
463 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; | 463 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; |
464 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { | 464 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { |
465 | if (!ocfs2_need_system_inode(osb, i)) | 465 | if (!ocfs2_need_system_inode(osb, i)) |
466 | continue; | 466 | continue; |
467 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | 467 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); |
468 | if (!new) { | 468 | if (!new) { |
469 | ocfs2_release_system_inodes(osb); | 469 | ocfs2_release_system_inodes(osb); |
470 | status = -EINVAL; | 470 | status = -EINVAL; |
471 | mlog_errno(status); | 471 | mlog_errno(status); |
472 | /* FIXME: Should ERROR_RO_FS */ | 472 | /* FIXME: Should ERROR_RO_FS */ |
473 | mlog(ML_ERROR, "Unable to load system inode %d, " | 473 | mlog(ML_ERROR, "Unable to load system inode %d, " |
474 | "possibly corrupt fs?", i); | 474 | "possibly corrupt fs?", i); |
475 | goto bail; | 475 | goto bail; |
476 | } | 476 | } |
477 | // the array now has one ref, so drop this one | 477 | // the array now has one ref, so drop this one |
478 | iput(new); | 478 | iput(new); |
479 | } | 479 | } |
480 | 480 | ||
481 | bail: | 481 | bail: |
482 | if (status) | 482 | if (status) |
483 | mlog_errno(status); | 483 | mlog_errno(status); |
484 | return status; | 484 | return status; |
485 | } | 485 | } |
486 | 486 | ||
487 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) | 487 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) |
488 | { | 488 | { |
489 | struct inode *new = NULL; | 489 | struct inode *new = NULL; |
490 | int status = 0; | 490 | int status = 0; |
491 | int i; | 491 | int i; |
492 | 492 | ||
493 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; | 493 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; |
494 | i < NUM_SYSTEM_INODES; | 494 | i < NUM_SYSTEM_INODES; |
495 | i++) { | 495 | i++) { |
496 | if (!ocfs2_need_system_inode(osb, i)) | 496 | if (!ocfs2_need_system_inode(osb, i)) |
497 | continue; | 497 | continue; |
498 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | 498 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); |
499 | if (!new) { | 499 | if (!new) { |
500 | ocfs2_release_system_inodes(osb); | 500 | ocfs2_release_system_inodes(osb); |
501 | status = -EINVAL; | 501 | status = -EINVAL; |
502 | mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", | 502 | mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", |
503 | status, i, osb->slot_num); | 503 | status, i, osb->slot_num); |
504 | goto bail; | 504 | goto bail; |
505 | } | 505 | } |
506 | /* the array now has one ref, so drop this one */ | 506 | /* the array now has one ref, so drop this one */ |
507 | iput(new); | 507 | iput(new); |
508 | } | 508 | } |
509 | 509 | ||
510 | bail: | 510 | bail: |
511 | if (status) | 511 | if (status) |
512 | mlog_errno(status); | 512 | mlog_errno(status); |
513 | return status; | 513 | return status; |
514 | } | 514 | } |
515 | 515 | ||
516 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb) | 516 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb) |
517 | { | 517 | { |
518 | int i; | 518 | int i; |
519 | struct inode *inode; | 519 | struct inode *inode; |
520 | 520 | ||
521 | for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { | 521 | for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { |
522 | inode = osb->global_system_inodes[i]; | 522 | inode = osb->global_system_inodes[i]; |
523 | if (inode) { | 523 | if (inode) { |
524 | iput(inode); | 524 | iput(inode); |
525 | osb->global_system_inodes[i] = NULL; | 525 | osb->global_system_inodes[i] = NULL; |
526 | } | 526 | } |
527 | } | 527 | } |
528 | 528 | ||
529 | inode = osb->sys_root_inode; | 529 | inode = osb->sys_root_inode; |
530 | if (inode) { | 530 | if (inode) { |
531 | iput(inode); | 531 | iput(inode); |
532 | osb->sys_root_inode = NULL; | 532 | osb->sys_root_inode = NULL; |
533 | } | 533 | } |
534 | 534 | ||
535 | inode = osb->root_inode; | 535 | inode = osb->root_inode; |
536 | if (inode) { | 536 | if (inode) { |
537 | iput(inode); | 537 | iput(inode); |
538 | osb->root_inode = NULL; | 538 | osb->root_inode = NULL; |
539 | } | 539 | } |
540 | 540 | ||
541 | if (!osb->local_system_inodes) | 541 | if (!osb->local_system_inodes) |
542 | return; | 542 | return; |
543 | 543 | ||
544 | for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { | 544 | for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { |
545 | if (osb->local_system_inodes[i]) { | 545 | if (osb->local_system_inodes[i]) { |
546 | iput(osb->local_system_inodes[i]); | 546 | iput(osb->local_system_inodes[i]); |
547 | osb->local_system_inodes[i] = NULL; | 547 | osb->local_system_inodes[i] = NULL; |
548 | } | 548 | } |
549 | } | 549 | } |
550 | 550 | ||
551 | kfree(osb->local_system_inodes); | 551 | kfree(osb->local_system_inodes); |
552 | osb->local_system_inodes = NULL; | 552 | osb->local_system_inodes = NULL; |
553 | } | 553 | } |
554 | 554 | ||
555 | /* We're allocating fs objects, use GFP_NOFS */ | 555 | /* We're allocating fs objects, use GFP_NOFS */ |
556 | static struct inode *ocfs2_alloc_inode(struct super_block *sb) | 556 | static struct inode *ocfs2_alloc_inode(struct super_block *sb) |
557 | { | 557 | { |
558 | struct ocfs2_inode_info *oi; | 558 | struct ocfs2_inode_info *oi; |
559 | 559 | ||
560 | oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); | 560 | oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); |
561 | if (!oi) | 561 | if (!oi) |
562 | return NULL; | 562 | return NULL; |
563 | 563 | ||
564 | oi->i_sync_tid = 0; | 564 | oi->i_sync_tid = 0; |
565 | oi->i_datasync_tid = 0; | 565 | oi->i_datasync_tid = 0; |
566 | 566 | ||
567 | jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); | 567 | jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); |
568 | return &oi->vfs_inode; | 568 | return &oi->vfs_inode; |
569 | } | 569 | } |
570 | 570 | ||
571 | static void ocfs2_i_callback(struct rcu_head *head) | 571 | static void ocfs2_i_callback(struct rcu_head *head) |
572 | { | 572 | { |
573 | struct inode *inode = container_of(head, struct inode, i_rcu); | 573 | struct inode *inode = container_of(head, struct inode, i_rcu); |
574 | kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); | 574 | kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); |
575 | } | 575 | } |
576 | 576 | ||
577 | static void ocfs2_destroy_inode(struct inode *inode) | 577 | static void ocfs2_destroy_inode(struct inode *inode) |
578 | { | 578 | { |
579 | call_rcu(&inode->i_rcu, ocfs2_i_callback); | 579 | call_rcu(&inode->i_rcu, ocfs2_i_callback); |
580 | } | 580 | } |
581 | 581 | ||
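Two conventions in the inode hooks above are worth spelling out. The GFP_NOFS allocation keeps memory reclaim from re-entering the filesystem while it is already allocating an inode, which could otherwise deadlock. And ocfs2_destroy_inode() does not free immediately: lockless RCU-walk path lookups may still be inspecting the inode, so the kmem_cache_free() in ocfs2_i_callback() only runs after an RCU grace period via call_rcu(), the standard VFS pattern for filesystems with their own inode slab.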
582 | static unsigned long long ocfs2_max_file_offset(unsigned int bbits, | 582 | static unsigned long long ocfs2_max_file_offset(unsigned int bbits, |
583 | unsigned int cbits) | 583 | unsigned int cbits) |
584 | { | 584 | { |
585 | unsigned int bytes = 1 << cbits; | 585 | unsigned int bytes = 1 << cbits; |
586 | unsigned int trim = bytes; | 586 | unsigned int trim = bytes; |
587 | unsigned int bitshift = 32; | 587 | unsigned int bitshift = 32; |
588 | 588 | ||
589 | /* | 589 | /* |
590 | * i_size and all block offsets in ocfs2 are always 64 bits | 590 | * i_size and all block offsets in ocfs2 are always 64 bits |
591 | * wide. i_clusters is 32 bits, in cluster-sized units. So on | 591 | * wide. i_clusters is 32 bits, in cluster-sized units. So on |
592 | * 64 bit platforms, cluster size will be the limiting factor. | 592 | * 64 bit platforms, cluster size will be the limiting factor. |
593 | */ | 593 | */ |
594 | 594 | ||
595 | #if BITS_PER_LONG == 32 | 595 | #if BITS_PER_LONG == 32 |
596 | # if defined(CONFIG_LBDAF) | 596 | # if defined(CONFIG_LBDAF) |
597 | BUILD_BUG_ON(sizeof(sector_t) != 8); | 597 | BUILD_BUG_ON(sizeof(sector_t) != 8); |
598 | /* | 598 | /* |
599 | * We might be limited by page cache size. | 599 | * We might be limited by page cache size. |
600 | */ | 600 | */ |
601 | if (bytes > PAGE_CACHE_SIZE) { | 601 | if (bytes > PAGE_CACHE_SIZE) { |
602 | bytes = PAGE_CACHE_SIZE; | 602 | bytes = PAGE_CACHE_SIZE; |
603 | trim = 1; | 603 | trim = 1; |
604 | /* | 604 | /* |
605 | * Shift by 31 here so that we don't get larger than | 605 | * Shift by 31 here so that we don't get larger than |
606 | * MAX_LFS_FILESIZE | 606 | * MAX_LFS_FILESIZE |
607 | */ | 607 | */ |
608 | bitshift = 31; | 608 | bitshift = 31; |
609 | } | 609 | } |
610 | # else | 610 | # else |
611 | /* | 611 | /* |
612 | * We are limited by the size of sector_t. Use block size, as | 612 | * We are limited by the size of sector_t. Use block size, as |
613 | * that's what we expose to the VFS. | 613 | * that's what we expose to the VFS. |
614 | */ | 614 | */ |
615 | bytes = 1 << bbits; | 615 | bytes = 1 << bbits; |
616 | trim = 1; | 616 | trim = 1; |
617 | bitshift = 31; | 617 | bitshift = 31; |
618 | # endif | 618 | # endif |
619 | #endif | 619 | #endif |
620 | 620 | ||
621 | /* | 621 | /* |
622 | * Trim by a whole cluster when we can actually approach the | 622 | * Trim by a whole cluster when we can actually approach the |
623 | * on-disk limits. Otherwise we can overflow i_clusters when | 623 | * on-disk limits. Otherwise we can overflow i_clusters when |
624 | * an extent start is at the max offset. | 624 | * an extent start is at the max offset. |
625 | */ | 625 | */ |
626 | return (((unsigned long long)bytes) << bitshift) - trim; | 626 | return (((unsigned long long)bytes) << bitshift) - trim; |
627 | } | 627 | } |
628 | 628 | ||
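A worked instance of the limit above: on a 64-bit kernel with 1 MB clusters (cbits = 20), neither 32-bit branch applies, so bytes = 1 << 20, bitshift stays 32 and trim stays one cluster, giving ((u64)1048576 << 32) - 1048576 = 4503599626321920 bytes, i.e. 4 PiB minus one cluster. On 32-bit kernels the page-cache and sector_t branches drop the shift to 31 so the result never exceeds MAX_LFS_FILESIZE, as the comments note.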
629 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | 629 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data) |
630 | { | 630 | { |
631 | int incompat_features; | 631 | int incompat_features; |
632 | int ret = 0; | 632 | int ret = 0; |
633 | struct mount_options parsed_options; | 633 | struct mount_options parsed_options; |
634 | struct ocfs2_super *osb = OCFS2_SB(sb); | 634 | struct ocfs2_super *osb = OCFS2_SB(sb); |
635 | u32 tmp; | 635 | u32 tmp; |
636 | 636 | ||
637 | sync_filesystem(sb); | 637 | sync_filesystem(sb); |
638 | 638 | ||
639 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || | 639 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || |
640 | !ocfs2_check_set_options(sb, &parsed_options)) { | 640 | !ocfs2_check_set_options(sb, &parsed_options)) { |
641 | ret = -EINVAL; | 641 | ret = -EINVAL; |
642 | goto out; | 642 | goto out; |
643 | } | 643 | } |
644 | 644 | ||
645 | tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | | 645 | tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | |
646 | OCFS2_MOUNT_HB_NONE; | 646 | OCFS2_MOUNT_HB_NONE; |
647 | if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { | 647 | if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { |
648 | ret = -EINVAL; | 648 | ret = -EINVAL; |
649 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); | 649 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); |
650 | goto out; | 650 | goto out; |
651 | } | 651 | } |
652 | 652 | ||
653 | if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != | 653 | if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != |
654 | (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { | 654 | (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { |
655 | ret = -EINVAL; | 655 | ret = -EINVAL; |
656 | mlog(ML_ERROR, "Cannot change data mode on remount\n"); | 656 | mlog(ML_ERROR, "Cannot change data mode on remount\n"); |
657 | goto out; | 657 | goto out; |
658 | } | 658 | } |
659 | 659 | ||
660 | /* Probably don't want this on remount; it might | 660 | /* Probably don't want this on remount; it might |
661 | * mess with other nodes */ | 661 | * mess with other nodes */ |
662 | if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && | 662 | if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && |
663 | (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { | 663 | (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { |
664 | ret = -EINVAL; | 664 | ret = -EINVAL; |
665 | mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); | 665 | mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); |
666 | goto out; | 666 | goto out; |
667 | } | 667 | } |
668 | 668 | ||
669 | /* We're going to/from readonly mode. */ | 669 | /* We're going to/from readonly mode. */ |
670 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { | 670 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { |
671 | /* Disable quota accounting before remounting RO */ | 671 | /* Disable quota accounting before remounting RO */ |
672 | if (*flags & MS_RDONLY) { | 672 | if (*flags & MS_RDONLY) { |
673 | ret = ocfs2_susp_quotas(osb, 0); | 673 | ret = ocfs2_susp_quotas(osb, 0); |
674 | if (ret < 0) | 674 | if (ret < 0) |
675 | goto out; | 675 | goto out; |
676 | } | 676 | } |
677 | /* Lock here so the check of HARD_RO and the potential | 677 | /* Lock here so the check of HARD_RO and the potential |
678 | * setting of SOFT_RO is atomic. */ | 678 | * setting of SOFT_RO is atomic. */ |
679 | spin_lock(&osb->osb_lock); | 679 | spin_lock(&osb->osb_lock); |
680 | if (osb->osb_flags & OCFS2_OSB_HARD_RO) { | 680 | if (osb->osb_flags & OCFS2_OSB_HARD_RO) { |
681 | mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); | 681 | mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); |
682 | ret = -EROFS; | 682 | ret = -EROFS; |
683 | goto unlock_osb; | 683 | goto unlock_osb; |
684 | } | 684 | } |
685 | 685 | ||
686 | if (*flags & MS_RDONLY) { | 686 | if (*flags & MS_RDONLY) { |
687 | sb->s_flags |= MS_RDONLY; | 687 | sb->s_flags |= MS_RDONLY; |
688 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | 688 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; |
689 | } else { | 689 | } else { |
690 | if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { | 690 | if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { |
691 | mlog(ML_ERROR, "Cannot remount RDWR " | 691 | mlog(ML_ERROR, "Cannot remount RDWR " |
692 | "filesystem due to previous errors.\n"); | 692 | "filesystem due to previous errors.\n"); |
693 | ret = -EROFS; | 693 | ret = -EROFS; |
694 | goto unlock_osb; | 694 | goto unlock_osb; |
695 | } | 695 | } |
696 | incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); | 696 | incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); |
697 | if (incompat_features) { | 697 | if (incompat_features) { |
698 | mlog(ML_ERROR, "Cannot remount RDWR because " | 698 | mlog(ML_ERROR, "Cannot remount RDWR because " |
699 | "of unsupported optional features " | 699 | "of unsupported optional features " |
700 | "(%x).\n", incompat_features); | 700 | "(%x).\n", incompat_features); |
701 | ret = -EINVAL; | 701 | ret = -EINVAL; |
702 | goto unlock_osb; | 702 | goto unlock_osb; |
703 | } | 703 | } |
704 | sb->s_flags &= ~MS_RDONLY; | 704 | sb->s_flags &= ~MS_RDONLY; |
705 | osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; | 705 | osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; |
706 | } | 706 | } |
707 | trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); | 707 | trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); |
708 | unlock_osb: | 708 | unlock_osb: |
709 | spin_unlock(&osb->osb_lock); | 709 | spin_unlock(&osb->osb_lock); |
710 | /* Enable quota accounting after remounting RW */ | 710 | /* Enable quota accounting after remounting RW */ |
711 | if (!ret && !(*flags & MS_RDONLY)) { | 711 | if (!ret && !(*flags & MS_RDONLY)) { |
712 | if (sb_any_quota_suspended(sb)) | 712 | if (sb_any_quota_suspended(sb)) |
713 | ret = ocfs2_susp_quotas(osb, 1); | 713 | ret = ocfs2_susp_quotas(osb, 1); |
714 | else | 714 | else |
715 | ret = ocfs2_enable_quotas(osb); | 715 | ret = ocfs2_enable_quotas(osb); |
716 | if (ret < 0) { | 716 | if (ret < 0) { |
717 | /* Return back changes... */ | 717 | /* Return back changes... */ |
718 | spin_lock(&osb->osb_lock); | 718 | spin_lock(&osb->osb_lock); |
719 | sb->s_flags |= MS_RDONLY; | 719 | sb->s_flags |= MS_RDONLY; |
720 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | 720 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; |
721 | spin_unlock(&osb->osb_lock); | 721 | spin_unlock(&osb->osb_lock); |
722 | goto out; | 722 | goto out; |
723 | } | 723 | } |
724 | } | 724 | } |
725 | } | 725 | } |
726 | 726 | ||
727 | if (!ret) { | 727 | if (!ret) { |
728 | /* Only save off the new mount options in case of a successful | 728 | /* Only save off the new mount options in case of a successful |
729 | * remount. */ | 729 | * remount. */ |
730 | osb->s_mount_opt = parsed_options.mount_opt; | 730 | osb->s_mount_opt = parsed_options.mount_opt; |
731 | osb->s_atime_quantum = parsed_options.atime_quantum; | 731 | osb->s_atime_quantum = parsed_options.atime_quantum; |
732 | osb->preferred_slot = parsed_options.slot; | 732 | osb->preferred_slot = parsed_options.slot; |
733 | if (parsed_options.commit_interval) | 733 | if (parsed_options.commit_interval) |
734 | osb->osb_commit_interval = parsed_options.commit_interval; | 734 | osb->osb_commit_interval = parsed_options.commit_interval; |
735 | 735 | ||
736 | if (!ocfs2_is_hard_readonly(osb)) | 736 | if (!ocfs2_is_hard_readonly(osb)) |
737 | ocfs2_set_journal_params(osb); | 737 | ocfs2_set_journal_params(osb); |
738 | 738 | ||
739 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 739 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
740 | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? | 740 | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? |
741 | MS_POSIXACL : 0); | 741 | MS_POSIXACL : 0); |
742 | } | 742 | } |
743 | out: | 743 | out: |
744 | return ret; | 744 | return ret; |
745 | } | 745 | } |
746 | 746 | ||
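Note the ordering the remount path above enforces around quotas: accounting is suspended before the switch to read-only, and only resumed or re-enabled after the flags have been flipped back to read-write; if re-enabling fails, the superblock is pushed straight back to MS_RDONLY and soft-RO, so the filesystem is never left writable without quota accounting. This is the path exercised by, for example, mount -o remount,ro <mountpoint> and the corresponding remount,rw.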
747 | static int ocfs2_sb_probe(struct super_block *sb, | 747 | static int ocfs2_sb_probe(struct super_block *sb, |
748 | struct buffer_head **bh, | 748 | struct buffer_head **bh, |
749 | int *sector_size, | 749 | int *sector_size, |
750 | struct ocfs2_blockcheck_stats *stats) | 750 | struct ocfs2_blockcheck_stats *stats) |
751 | { | 751 | { |
752 | int status, tmpstat; | 752 | int status, tmpstat; |
753 | struct ocfs1_vol_disk_hdr *hdr; | 753 | struct ocfs1_vol_disk_hdr *hdr; |
754 | struct ocfs2_dinode *di; | 754 | struct ocfs2_dinode *di; |
755 | int blksize; | 755 | int blksize; |
756 | 756 | ||
757 | *bh = NULL; | 757 | *bh = NULL; |
758 | 758 | ||
759 | /* may be > 512 */ | 759 | /* may be > 512 */ |
760 | *sector_size = bdev_logical_block_size(sb->s_bdev); | 760 | *sector_size = bdev_logical_block_size(sb->s_bdev); |
761 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { | 761 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { |
762 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", | 762 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", |
763 | *sector_size, OCFS2_MAX_BLOCKSIZE); | 763 | *sector_size, OCFS2_MAX_BLOCKSIZE); |
764 | status = -EINVAL; | 764 | status = -EINVAL; |
765 | goto bail; | 765 | goto bail; |
766 | } | 766 | } |
767 | 767 | ||
768 | /* Can this really happen? */ | 768 | /* Can this really happen? */ |
769 | if (*sector_size < OCFS2_MIN_BLOCKSIZE) | 769 | if (*sector_size < OCFS2_MIN_BLOCKSIZE) |
770 | *sector_size = OCFS2_MIN_BLOCKSIZE; | 770 | *sector_size = OCFS2_MIN_BLOCKSIZE; |
771 | 771 | ||
772 | /* check block zero for old format */ | 772 | /* check block zero for old format */ |
773 | status = ocfs2_get_sector(sb, bh, 0, *sector_size); | 773 | status = ocfs2_get_sector(sb, bh, 0, *sector_size); |
774 | if (status < 0) { | 774 | if (status < 0) { |
775 | mlog_errno(status); | 775 | mlog_errno(status); |
776 | goto bail; | 776 | goto bail; |
777 | } | 777 | } |
778 | hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; | 778 | hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; |
779 | if (hdr->major_version == OCFS1_MAJOR_VERSION) { | 779 | if (hdr->major_version == OCFS1_MAJOR_VERSION) { |
780 | mlog(ML_ERROR, "incompatible version: %u.%u\n", | 780 | mlog(ML_ERROR, "incompatible version: %u.%u\n", |
781 | hdr->major_version, hdr->minor_version); | 781 | hdr->major_version, hdr->minor_version); |
782 | status = -EINVAL; | 782 | status = -EINVAL; |
783 | } | 783 | } |
784 | if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, | 784 | if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, |
785 | strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { | 785 | strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { |
786 | mlog(ML_ERROR, "incompatible volume signature: %8s\n", | 786 | mlog(ML_ERROR, "incompatible volume signature: %8s\n", |
787 | hdr->signature); | 787 | hdr->signature); |
788 | status = -EINVAL; | 788 | status = -EINVAL; |
789 | } | 789 | } |
790 | brelse(*bh); | 790 | brelse(*bh); |
791 | *bh = NULL; | 791 | *bh = NULL; |
792 | if (status < 0) { | 792 | if (status < 0) { |
793 | mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " | 793 | mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " |
794 | "upgraded before mounting with ocfs v2\n"); | 794 | "upgraded before mounting with ocfs v2\n"); |
795 | goto bail; | 795 | goto bail; |
796 | } | 796 | } |
797 | 797 | ||
798 | /* | 798 | /* |
799 | * Now check at magic offset for 512, 1024, 2048, 4096 | 799 | * Now check at magic offset for 512, 1024, 2048, 4096 |
800 | * blocksizes. 4096 is the maximum blocksize because it is | 800 | * blocksizes. 4096 is the maximum blocksize because it is |
801 | * the minimum clustersize. | 801 | * the minimum clustersize. |
802 | */ | 802 | */ |
803 | status = -EINVAL; | 803 | status = -EINVAL; |
804 | for (blksize = *sector_size; | 804 | for (blksize = *sector_size; |
805 | blksize <= OCFS2_MAX_BLOCKSIZE; | 805 | blksize <= OCFS2_MAX_BLOCKSIZE; |
806 | blksize <<= 1) { | 806 | blksize <<= 1) { |
807 | tmpstat = ocfs2_get_sector(sb, bh, | 807 | tmpstat = ocfs2_get_sector(sb, bh, |
808 | OCFS2_SUPER_BLOCK_BLKNO, | 808 | OCFS2_SUPER_BLOCK_BLKNO, |
809 | blksize); | 809 | blksize); |
810 | if (tmpstat < 0) { | 810 | if (tmpstat < 0) { |
811 | status = tmpstat; | 811 | status = tmpstat; |
812 | mlog_errno(status); | 812 | mlog_errno(status); |
813 | break; | 813 | break; |
814 | } | 814 | } |
815 | di = (struct ocfs2_dinode *) (*bh)->b_data; | 815 | di = (struct ocfs2_dinode *) (*bh)->b_data; |
816 | memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); | 816 | memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); |
817 | spin_lock_init(&stats->b_lock); | 817 | spin_lock_init(&stats->b_lock); |
818 | tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); | 818 | tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); |
819 | if (tmpstat < 0) { | 819 | if (tmpstat < 0) { |
820 | brelse(*bh); | 820 | brelse(*bh); |
821 | *bh = NULL; | 821 | *bh = NULL; |
822 | } | 822 | } |
823 | if (tmpstat != -EAGAIN) { | 823 | if (tmpstat != -EAGAIN) { |
824 | status = tmpstat; | 824 | status = tmpstat; |
825 | break; | 825 | break; |
826 | } | 826 | } |
827 | } | 827 | } |
828 | 828 | ||
829 | bail: | 829 | bail: |
830 | return status; | 830 | return status; |
831 | } | 831 | } |
832 | 832 | ||
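The probe above first rejects an ocfs v1 header at block zero, then re-reads the superblock candidate at every block size from the device's logical sector size up to OCFS2_MAX_BLOCKSIZE, doubling each time (so a 512-byte-sector disk tries 512, 1024, 2048 and 4096). ocfs2_verify_volume() returning -EAGAIN is treated as "no valid superblock at this block size, try the next one"; any other result, success or error, ends the loop.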
833 | static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | 833 | static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) |
834 | { | 834 | { |
835 | u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; | 835 | u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; |
836 | 836 | ||
837 | if (osb->s_mount_opt & hb_enabled) { | 837 | if (osb->s_mount_opt & hb_enabled) { |
838 | if (ocfs2_mount_local(osb)) { | 838 | if (ocfs2_mount_local(osb)) { |
839 | mlog(ML_ERROR, "Cannot heartbeat on a locally " | 839 | mlog(ML_ERROR, "Cannot heartbeat on a locally " |
840 | "mounted device.\n"); | 840 | "mounted device.\n"); |
841 | return -EINVAL; | 841 | return -EINVAL; |
842 | } | 842 | } |
843 | if (ocfs2_userspace_stack(osb)) { | 843 | if (ocfs2_userspace_stack(osb)) { |
844 | mlog(ML_ERROR, "Userspace stack expected, but " | 844 | mlog(ML_ERROR, "Userspace stack expected, but " |
845 | "o2cb heartbeat arguments passed to mount\n"); | 845 | "o2cb heartbeat arguments passed to mount\n"); |
846 | return -EINVAL; | 846 | return -EINVAL; |
847 | } | 847 | } |
848 | if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && | 848 | if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && |
849 | !ocfs2_cluster_o2cb_global_heartbeat(osb)) || | 849 | !ocfs2_cluster_o2cb_global_heartbeat(osb)) || |
850 | ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && | 850 | ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && |
851 | ocfs2_cluster_o2cb_global_heartbeat(osb))) { | 851 | ocfs2_cluster_o2cb_global_heartbeat(osb))) { |
852 | mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); | 852 | mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); |
853 | return -EINVAL; | 853 | return -EINVAL; |
854 | } | 854 | } |
855 | } | 855 | } |
856 | 856 | ||
857 | if (!(osb->s_mount_opt & hb_enabled)) { | 857 | if (!(osb->s_mount_opt & hb_enabled)) { |
858 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && | 858 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && |
859 | !ocfs2_userspace_stack(osb)) { | 859 | !ocfs2_userspace_stack(osb)) { |
860 | mlog(ML_ERROR, "Heartbeat has to be started to mount " | 860 | mlog(ML_ERROR, "Heartbeat has to be started to mount " |
861 | "a read-write clustered device.\n"); | 861 | "a read-write clustered device.\n"); |
862 | return -EINVAL; | 862 | return -EINVAL; |
863 | } | 863 | } |
864 | } | 864 | } |
865 | 865 | ||
866 | return 0; | 866 | return 0; |
867 | } | 867 | } |
868 | 868 | ||
869 | /* | 869 | /* |
870 | * If we're using a userspace stack, mount should have passed | 870 | * If we're using a userspace stack, mount should have passed |
871 | * a name that matches the disk. If not, mount should not | 871 | * a name that matches the disk. If not, mount should not |
872 | * have passed a stack. | 872 | * have passed a stack. |
873 | */ | 873 | */ |
874 | static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, | 874 | static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, |
875 | struct mount_options *mopt) | 875 | struct mount_options *mopt) |
876 | { | 876 | { |
877 | if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { | 877 | if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { |
878 | mlog(ML_ERROR, | 878 | mlog(ML_ERROR, |
879 | "cluster stack passed to mount, but this filesystem " | 879 | "cluster stack passed to mount, but this filesystem " |
880 | "does not support it\n"); | 880 | "does not support it\n"); |
881 | return -EINVAL; | 881 | return -EINVAL; |
882 | } | 882 | } |
883 | 883 | ||
884 | if (ocfs2_userspace_stack(osb) && | 884 | if (ocfs2_userspace_stack(osb) && |
885 | strncmp(osb->osb_cluster_stack, mopt->cluster_stack, | 885 | strncmp(osb->osb_cluster_stack, mopt->cluster_stack, |
886 | OCFS2_STACK_LABEL_LEN)) { | 886 | OCFS2_STACK_LABEL_LEN)) { |
887 | mlog(ML_ERROR, | 887 | mlog(ML_ERROR, |
888 | "cluster stack passed to mount (\"%s\") does not " | 888 | "cluster stack passed to mount (\"%s\") does not " |
889 | "match the filesystem (\"%s\")\n", | 889 | "match the filesystem (\"%s\")\n", |
890 | mopt->cluster_stack, | 890 | mopt->cluster_stack, |
891 | osb->osb_cluster_stack); | 891 | osb->osb_cluster_stack); |
892 | return -EINVAL; | 892 | return -EINVAL; |
893 | } | 893 | } |
894 | 894 | ||
895 | return 0; | 895 | return 0; |
896 | } | 896 | } |
897 | 897 | ||
898 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) | 898 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) |
899 | { | 899 | { |
900 | int type; | 900 | int type; |
901 | struct super_block *sb = osb->sb; | 901 | struct super_block *sb = osb->sb; |
902 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | 902 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, |
903 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | 903 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; |
904 | int status = 0; | 904 | int status = 0; |
905 | 905 | ||
906 | for (type = 0; type < MAXQUOTAS; type++) { | 906 | for (type = 0; type < MAXQUOTAS; type++) { |
907 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | 907 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) |
908 | continue; | 908 | continue; |
909 | if (unsuspend) | 909 | if (unsuspend) |
910 | status = dquot_resume(sb, type); | 910 | status = dquot_resume(sb, type); |
911 | else { | 911 | else { |
912 | struct ocfs2_mem_dqinfo *oinfo; | 912 | struct ocfs2_mem_dqinfo *oinfo; |
913 | 913 | ||
914 | /* Cancel periodic syncing before suspending */ | 914 | /* Cancel periodic syncing before suspending */ |
915 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | 915 | oinfo = sb_dqinfo(sb, type)->dqi_priv; |
916 | cancel_delayed_work_sync(&oinfo->dqi_sync_work); | 916 | cancel_delayed_work_sync(&oinfo->dqi_sync_work); |
917 | status = dquot_suspend(sb, type); | 917 | status = dquot_suspend(sb, type); |
918 | } | 918 | } |
919 | if (status < 0) | 919 | if (status < 0) |
920 | break; | 920 | break; |
921 | } | 921 | } |
922 | if (status < 0) | 922 | if (status < 0) |
923 | mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " | 923 | mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " |
924 | "remount (error = %d).\n", status); | 924 | "remount (error = %d).\n", status); |
925 | return status; | 925 | return status; |
926 | } | 926 | } |
927 | 927 | ||
928 | static int ocfs2_enable_quotas(struct ocfs2_super *osb) | 928 | static int ocfs2_enable_quotas(struct ocfs2_super *osb) |
929 | { | 929 | { |
930 | struct inode *inode[MAXQUOTAS] = { NULL, NULL }; | 930 | struct inode *inode[MAXQUOTAS] = { NULL, NULL }; |
931 | struct super_block *sb = osb->sb; | 931 | struct super_block *sb = osb->sb; |
932 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | 932 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, |
933 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | 933 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; |
934 | unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, | 934 | unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, |
935 | LOCAL_GROUP_QUOTA_SYSTEM_INODE }; | 935 | LOCAL_GROUP_QUOTA_SYSTEM_INODE }; |
936 | int status; | 936 | int status; |
937 | int type; | 937 | int type; |
938 | 938 | ||
939 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; | 939 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; |
940 | for (type = 0; type < MAXQUOTAS; type++) { | 940 | for (type = 0; type < MAXQUOTAS; type++) { |
941 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | 941 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) |
942 | continue; | 942 | continue; |
943 | inode[type] = ocfs2_get_system_file_inode(osb, ino[type], | 943 | inode[type] = ocfs2_get_system_file_inode(osb, ino[type], |
944 | osb->slot_num); | 944 | osb->slot_num); |
945 | if (!inode[type]) { | 945 | if (!inode[type]) { |
946 | status = -ENOENT; | 946 | status = -ENOENT; |
947 | goto out_quota_off; | 947 | goto out_quota_off; |
948 | } | 948 | } |
949 | status = dquot_enable(inode[type], type, QFMT_OCFS2, | 949 | status = dquot_enable(inode[type], type, QFMT_OCFS2, |
950 | DQUOT_USAGE_ENABLED); | 950 | DQUOT_USAGE_ENABLED); |
951 | if (status < 0) | 951 | if (status < 0) |
952 | goto out_quota_off; | 952 | goto out_quota_off; |
953 | } | 953 | } |
954 | 954 | ||
955 | for (type = 0; type < MAXQUOTAS; type++) | 955 | for (type = 0; type < MAXQUOTAS; type++) |
956 | iput(inode[type]); | 956 | iput(inode[type]); |
957 | return 0; | 957 | return 0; |
958 | out_quota_off: | 958 | out_quota_off: |
959 | ocfs2_disable_quotas(osb); | 959 | ocfs2_disable_quotas(osb); |
960 | for (type = 0; type < MAXQUOTAS; type++) | 960 | for (type = 0; type < MAXQUOTAS; type++) |
961 | iput(inode[type]); | 961 | iput(inode[type]); |
962 | mlog_errno(status); | 962 | mlog_errno(status); |
963 | return status; | 963 | return status; |
964 | } | 964 | } |
965 | 965 | ||
966 | static void ocfs2_disable_quotas(struct ocfs2_super *osb) | 966 | static void ocfs2_disable_quotas(struct ocfs2_super *osb) |
967 | { | 967 | { |
968 | int type; | 968 | int type; |
969 | struct inode *inode; | 969 | struct inode *inode; |
970 | struct super_block *sb = osb->sb; | 970 | struct super_block *sb = osb->sb; |
971 | struct ocfs2_mem_dqinfo *oinfo; | 971 | struct ocfs2_mem_dqinfo *oinfo; |
972 | 972 | ||
973 | /* We mostly ignore errors in this function because there's not much | 973 | /* We mostly ignore errors in this function because there's not much |
974 | * we can do when we see them */ | 974 | * we can do when we see them */ |
975 | for (type = 0; type < MAXQUOTAS; type++) { | 975 | for (type = 0; type < MAXQUOTAS; type++) { |
976 | if (!sb_has_quota_loaded(sb, type)) | 976 | if (!sb_has_quota_loaded(sb, type)) |
977 | continue; | 977 | continue; |
978 | /* Cancel periodic syncing before we grab dqonoff_mutex */ | 978 | /* Cancel periodic syncing before we grab dqonoff_mutex */ |
979 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | 979 | oinfo = sb_dqinfo(sb, type)->dqi_priv; |
980 | cancel_delayed_work_sync(&oinfo->dqi_sync_work); | 980 | cancel_delayed_work_sync(&oinfo->dqi_sync_work); |
981 | inode = igrab(sb->s_dquot.files[type]); | 981 | inode = igrab(sb->s_dquot.files[type]); |
982 | /* Turn off quotas. This will remove all dquot structures from | 982 | /* Turn off quotas. This will remove all dquot structures from |
983 | * memory and so they will be automatically synced to global | 983 | * memory and so they will be automatically synced to global |
984 | * quota files */ | 984 | * quota files */ |
985 | dquot_disable(sb, type, DQUOT_USAGE_ENABLED | | 985 | dquot_disable(sb, type, DQUOT_USAGE_ENABLED | |
986 | DQUOT_LIMITS_ENABLED); | 986 | DQUOT_LIMITS_ENABLED); |
987 | if (!inode) | 987 | if (!inode) |
988 | continue; | 988 | continue; |
989 | iput(inode); | 989 | iput(inode); |
990 | } | 990 | } |
991 | } | 991 | } |
992 | 992 | ||
993 | /* Handle quota on quotactl */ | 993 | /* Handle quota on quotactl */ |
994 | static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) | 994 | static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) |
995 | { | 995 | { |
996 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | 996 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, |
997 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | 997 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; |
998 | 998 | ||
999 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | 999 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) |
1000 | return -EINVAL; | 1000 | return -EINVAL; |
1001 | 1001 | ||
1002 | return dquot_enable(sb_dqopt(sb)->files[type], type, | 1002 | return dquot_enable(sb_dqopt(sb)->files[type], type, |
1003 | format_id, DQUOT_LIMITS_ENABLED); | 1003 | format_id, DQUOT_LIMITS_ENABLED); |
1004 | } | 1004 | } |
1005 | 1005 | ||
1006 | /* Handle quota off quotactl */ | 1006 | /* Handle quota off quotactl */ |
1007 | static int ocfs2_quota_off(struct super_block *sb, int type) | 1007 | static int ocfs2_quota_off(struct super_block *sb, int type) |
1008 | { | 1008 | { |
1009 | return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); | 1009 | return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); |
1010 | } | 1010 | } |
1011 | 1011 | ||
1012 | static const struct quotactl_ops ocfs2_quotactl_ops = { | 1012 | static const struct quotactl_ops ocfs2_quotactl_ops = { |
1013 | .quota_on_meta = ocfs2_quota_on, | 1013 | .quota_on_meta = ocfs2_quota_on, |
1014 | .quota_off = ocfs2_quota_off, | 1014 | .quota_off = ocfs2_quota_off, |
1015 | .quota_sync = dquot_quota_sync, | 1015 | .quota_sync = dquot_quota_sync, |
1016 | .get_info = dquot_get_dqinfo, | 1016 | .get_info = dquot_get_dqinfo, |
1017 | .set_info = dquot_set_dqinfo, | 1017 | .set_info = dquot_set_dqinfo, |
1018 | .get_dqblk = dquot_get_dqblk, | 1018 | .get_dqblk = dquot_get_dqblk, |
1019 | .set_dqblk = dquot_set_dqblk, | 1019 | .set_dqblk = dquot_set_dqblk, |
1020 | }; | 1020 | }; |
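
The quotactl_ops table above is what the VFS dispatches to when userspace issues quotactl(2) against an ocfs2 mount; Q_GETQUOTA, for instance, lands in dquot_get_dqblk() through ->get_dqblk. A minimal userspace sketch, assuming an ocfs2 volume on /dev/sdb1 with user quotas already enabled (paths and error handling are illustrative only):

#include <sys/types.h>
#include <sys/quota.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct dqblk dq;

	/* Q_GETQUOTA reaches dquot_get_dqblk() via ocfs2_quotactl_ops */
	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sdb1",
		     getuid(), (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	printf("current space: %llu bytes\n",
	       (unsigned long long)dq.dqb_curspace);
	return 0;
}
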
1021 | 1021 | ||
1022 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | 1022 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) |
1023 | { | 1023 | { |
1024 | struct dentry *root; | 1024 | struct dentry *root; |
1025 | int status, sector_size; | 1025 | int status, sector_size; |
1026 | struct mount_options parsed_options; | 1026 | struct mount_options parsed_options; |
1027 | struct inode *inode = NULL; | 1027 | struct inode *inode = NULL; |
1028 | struct ocfs2_super *osb = NULL; | 1028 | struct ocfs2_super *osb = NULL; |
1029 | struct buffer_head *bh = NULL; | 1029 | struct buffer_head *bh = NULL; |
1030 | char nodestr[12]; | 1030 | char nodestr[12]; |
1031 | struct ocfs2_blockcheck_stats stats; | 1031 | struct ocfs2_blockcheck_stats stats; |
1032 | 1032 | ||
1033 | trace_ocfs2_fill_super(sb, data, silent); | 1033 | trace_ocfs2_fill_super(sb, data, silent); |
1034 | 1034 | ||
1035 | if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { | 1035 | if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { |
1036 | status = -EINVAL; | 1036 | status = -EINVAL; |
1037 | goto read_super_error; | 1037 | goto read_super_error; |
1038 | } | 1038 | } |
1039 | 1039 | ||
1040 | /* probe for superblock */ | 1040 | /* probe for superblock */ |
1041 | 	status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); | 1041 | 	status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); |
1042 | if (status < 0) { | 1042 | if (status < 0) { |
1043 | mlog(ML_ERROR, "superblock probe failed!\n"); | 1043 | mlog(ML_ERROR, "superblock probe failed!\n"); |
1044 | goto read_super_error; | 1044 | goto read_super_error; |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | status = ocfs2_initialize_super(sb, bh, sector_size, &stats); | 1047 | status = ocfs2_initialize_super(sb, bh, sector_size, &stats); |
1048 | osb = OCFS2_SB(sb); | 1048 | osb = OCFS2_SB(sb); |
1049 | if (status < 0) { | 1049 | if (status < 0) { |
1050 | mlog_errno(status); | 1050 | mlog_errno(status); |
1051 | goto read_super_error; | 1051 | goto read_super_error; |
1052 | } | 1052 | } |
1053 | brelse(bh); | 1053 | brelse(bh); |
1054 | bh = NULL; | 1054 | bh = NULL; |
1055 | 1055 | ||
1056 | if (!ocfs2_check_set_options(sb, &parsed_options)) { | 1056 | if (!ocfs2_check_set_options(sb, &parsed_options)) { |
1057 | status = -EINVAL; | 1057 | status = -EINVAL; |
1058 | goto read_super_error; | 1058 | goto read_super_error; |
1059 | } | 1059 | } |
1060 | osb->s_mount_opt = parsed_options.mount_opt; | 1060 | osb->s_mount_opt = parsed_options.mount_opt; |
1061 | osb->s_atime_quantum = parsed_options.atime_quantum; | 1061 | osb->s_atime_quantum = parsed_options.atime_quantum; |
1062 | osb->preferred_slot = parsed_options.slot; | 1062 | osb->preferred_slot = parsed_options.slot; |
1063 | osb->osb_commit_interval = parsed_options.commit_interval; | 1063 | osb->osb_commit_interval = parsed_options.commit_interval; |
1064 | 1064 | ||
1065 | ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); | 1065 | ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); |
1066 | osb->osb_resv_level = parsed_options.resv_level; | 1066 | osb->osb_resv_level = parsed_options.resv_level; |
1067 | osb->osb_dir_resv_level = parsed_options.resv_level; | 1067 | osb->osb_dir_resv_level = parsed_options.resv_level; |
1068 | if (parsed_options.dir_resv_level == -1) | 1068 | if (parsed_options.dir_resv_level == -1) |
1069 | osb->osb_dir_resv_level = parsed_options.resv_level; | 1069 | osb->osb_dir_resv_level = parsed_options.resv_level; |
1070 | else | 1070 | else |
1071 | osb->osb_dir_resv_level = parsed_options.dir_resv_level; | 1071 | osb->osb_dir_resv_level = parsed_options.dir_resv_level; |
1072 | 1072 | ||
1073 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); | 1073 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); |
1074 | if (status) | 1074 | if (status) |
1075 | goto read_super_error; | 1075 | goto read_super_error; |
1076 | 1076 | ||
1077 | sb->s_magic = OCFS2_SUPER_MAGIC; | 1077 | sb->s_magic = OCFS2_SUPER_MAGIC; |
1078 | 1078 | ||
1079 | sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | | 1079 | sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | |
1080 | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); | 1080 | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); |
1081 | 1081 | ||
1082 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | 1082 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, |
1083 | * heartbeat=none */ | 1083 | * heartbeat=none */ |
1084 | if (bdev_read_only(sb->s_bdev)) { | 1084 | if (bdev_read_only(sb->s_bdev)) { |
1085 | if (!(sb->s_flags & MS_RDONLY)) { | 1085 | if (!(sb->s_flags & MS_RDONLY)) { |
1086 | status = -EACCES; | 1086 | status = -EACCES; |
1087 | mlog(ML_ERROR, "Readonly device detected but readonly " | 1087 | mlog(ML_ERROR, "Readonly device detected but readonly " |
1088 | "mount was not specified.\n"); | 1088 | "mount was not specified.\n"); |
1089 | goto read_super_error; | 1089 | goto read_super_error; |
1090 | } | 1090 | } |
1091 | 1091 | ||
1092 | /* You should not be able to start a local heartbeat | 1092 | /* You should not be able to start a local heartbeat |
1093 | * on a readonly device. */ | 1093 | * on a readonly device. */ |
1094 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | 1094 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { |
1095 | status = -EROFS; | 1095 | status = -EROFS; |
1096 | mlog(ML_ERROR, "Local heartbeat specified on readonly " | 1096 | mlog(ML_ERROR, "Local heartbeat specified on readonly " |
1097 | "device.\n"); | 1097 | "device.\n"); |
1098 | goto read_super_error; | 1098 | goto read_super_error; |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | status = ocfs2_check_journals_nolocks(osb); | 1101 | status = ocfs2_check_journals_nolocks(osb); |
1102 | if (status < 0) { | 1102 | if (status < 0) { |
1103 | if (status == -EROFS) | 1103 | if (status == -EROFS) |
1104 | mlog(ML_ERROR, "Recovery required on readonly " | 1104 | mlog(ML_ERROR, "Recovery required on readonly " |
1105 | "file system, but write access is " | 1105 | "file system, but write access is " |
1106 | "unavailable.\n"); | 1106 | "unavailable.\n"); |
1107 | else | 1107 | else |
1108 | mlog_errno(status); | 1108 | mlog_errno(status); |
1109 | goto read_super_error; | 1109 | goto read_super_error; |
1110 | } | 1110 | } |
1111 | 1111 | ||
1112 | ocfs2_set_ro_flag(osb, 1); | 1112 | ocfs2_set_ro_flag(osb, 1); |
1113 | 1113 | ||
1114 | printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " | 1114 | printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " |
1115 | "Cluster services will not be used for this mount. " | 1115 | "Cluster services will not be used for this mount. " |
1116 | "Recovery will be skipped.\n", osb->dev_str); | 1116 | "Recovery will be skipped.\n", osb->dev_str); |
1117 | } | 1117 | } |
1118 | 1118 | ||
1119 | if (!ocfs2_is_hard_readonly(osb)) { | 1119 | if (!ocfs2_is_hard_readonly(osb)) { |
1120 | if (sb->s_flags & MS_RDONLY) | 1120 | if (sb->s_flags & MS_RDONLY) |
1121 | ocfs2_set_ro_flag(osb, 0); | 1121 | ocfs2_set_ro_flag(osb, 0); |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | status = ocfs2_verify_heartbeat(osb); | 1124 | status = ocfs2_verify_heartbeat(osb); |
1125 | if (status < 0) { | 1125 | if (status < 0) { |
1126 | mlog_errno(status); | 1126 | mlog_errno(status); |
1127 | goto read_super_error; | 1127 | goto read_super_error; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, | 1130 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, |
1131 | ocfs2_debugfs_root); | 1131 | ocfs2_debugfs_root); |
1132 | if (!osb->osb_debug_root) { | 1132 | if (!osb->osb_debug_root) { |
1133 | status = -EINVAL; | 1133 | status = -EINVAL; |
1134 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); | 1134 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); |
1135 | goto read_super_error; | 1135 | goto read_super_error; |
1136 | } | 1136 | } |
1137 | 1137 | ||
1138 | osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, | 1138 | osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, |
1139 | osb->osb_debug_root, | 1139 | osb->osb_debug_root, |
1140 | osb, | 1140 | osb, |
1141 | &ocfs2_osb_debug_fops); | 1141 | &ocfs2_osb_debug_fops); |
1142 | if (!osb->osb_ctxt) { | 1142 | if (!osb->osb_ctxt) { |
1143 | status = -EINVAL; | 1143 | status = -EINVAL; |
1144 | mlog_errno(status); | 1144 | mlog_errno(status); |
1145 | goto read_super_error; | 1145 | goto read_super_error; |
1146 | } | 1146 | } |
1147 | 1147 | ||
1148 | if (ocfs2_meta_ecc(osb)) { | 1148 | if (ocfs2_meta_ecc(osb)) { |
1149 | status = ocfs2_blockcheck_stats_debugfs_install( | 1149 | status = ocfs2_blockcheck_stats_debugfs_install( |
1150 | &osb->osb_ecc_stats, | 1150 | &osb->osb_ecc_stats, |
1151 | osb->osb_debug_root); | 1151 | osb->osb_debug_root); |
1152 | if (status) { | 1152 | if (status) { |
1153 | mlog(ML_ERROR, | 1153 | mlog(ML_ERROR, |
1154 | "Unable to create blockcheck statistics " | 1154 | "Unable to create blockcheck statistics " |
1155 | "files\n"); | 1155 | "files\n"); |
1156 | goto read_super_error; | 1156 | goto read_super_error; |
1157 | } | 1157 | } |
1158 | } | 1158 | } |
1159 | 1159 | ||
1160 | status = ocfs2_mount_volume(sb); | 1160 | status = ocfs2_mount_volume(sb); |
1161 | if (status < 0) | 1161 | if (status < 0) |
1162 | goto read_super_error; | 1162 | goto read_super_error; |
1163 | 1163 | ||
1164 | if (osb->root_inode) | 1164 | if (osb->root_inode) |
1165 | inode = igrab(osb->root_inode); | 1165 | inode = igrab(osb->root_inode); |
1166 | 1166 | ||
1167 | if (!inode) { | 1167 | if (!inode) { |
1168 | status = -EIO; | 1168 | status = -EIO; |
1169 | mlog_errno(status); | 1169 | mlog_errno(status); |
1170 | goto read_super_error; | 1170 | goto read_super_error; |
1171 | } | 1171 | } |
1172 | 1172 | ||
1173 | root = d_make_root(inode); | 1173 | root = d_make_root(inode); |
1174 | if (!root) { | 1174 | if (!root) { |
1175 | status = -ENOMEM; | 1175 | status = -ENOMEM; |
1176 | mlog_errno(status); | 1176 | mlog_errno(status); |
1177 | goto read_super_error; | 1177 | goto read_super_error; |
1178 | } | 1178 | } |
1179 | 1179 | ||
1180 | sb->s_root = root; | 1180 | sb->s_root = root; |
1181 | 1181 | ||
1182 | ocfs2_complete_mount_recovery(osb); | 1182 | ocfs2_complete_mount_recovery(osb); |
1183 | 1183 | ||
1184 | if (ocfs2_mount_local(osb)) | 1184 | if (ocfs2_mount_local(osb)) |
1185 | snprintf(nodestr, sizeof(nodestr), "local"); | 1185 | snprintf(nodestr, sizeof(nodestr), "local"); |
1186 | else | 1186 | else |
1187 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); | 1187 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
1188 | 1188 | ||
1189 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " | 1189 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " |
1190 | "with %s data mode.\n", | 1190 | "with %s data mode.\n", |
1191 | osb->dev_str, nodestr, osb->slot_num, | 1191 | osb->dev_str, nodestr, osb->slot_num, |
1192 | osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : | 1192 | osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : |
1193 | "ordered"); | 1193 | "ordered"); |
1194 | 1194 | ||
1195 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); | 1195 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); |
1196 | wake_up(&osb->osb_mount_event); | 1196 | wake_up(&osb->osb_mount_event); |
1197 | 1197 | ||
1198 | /* Now we can initialize quotas because we can afford to wait | 1198 | /* Now we can initialize quotas because we can afford to wait |
1199 | * for cluster locks recovery now. That also means that truncation | 1199 | * for cluster locks recovery now. That also means that truncation |
1200 | * log recovery can happen but that waits for proper quota setup */ | 1200 | * log recovery can happen but that waits for proper quota setup */ |
1201 | if (!(sb->s_flags & MS_RDONLY)) { | 1201 | if (!(sb->s_flags & MS_RDONLY)) { |
1202 | status = ocfs2_enable_quotas(osb); | 1202 | status = ocfs2_enable_quotas(osb); |
1203 | if (status < 0) { | 1203 | if (status < 0) { |
1204 | /* We have to err-out specially here because | 1204 | /* We have to err-out specially here because |
1205 | * s_root is already set */ | 1205 | * s_root is already set */ |
1206 | mlog_errno(status); | 1206 | mlog_errno(status); |
1207 | atomic_set(&osb->vol_state, VOLUME_DISABLED); | 1207 | atomic_set(&osb->vol_state, VOLUME_DISABLED); |
1208 | wake_up(&osb->osb_mount_event); | 1208 | wake_up(&osb->osb_mount_event); |
1209 | return status; | 1209 | return status; |
1210 | } | 1210 | } |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | ocfs2_complete_quota_recovery(osb); | 1213 | ocfs2_complete_quota_recovery(osb); |
1214 | 1214 | ||
1215 | /* Now we wake up again for processes waiting for quotas */ | 1215 | /* Now we wake up again for processes waiting for quotas */ |
1216 | atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); | 1216 | atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); |
1217 | wake_up(&osb->osb_mount_event); | 1217 | wake_up(&osb->osb_mount_event); |
1218 | 1218 | ||
1219 | /* Start this when the mount is almost sure of being successful */ | 1219 | /* Start this when the mount is almost sure of being successful */ |
1220 | ocfs2_orphan_scan_start(osb); | 1220 | ocfs2_orphan_scan_start(osb); |
1221 | 1221 | ||
1222 | return status; | 1222 | return status; |
1223 | 1223 | ||
1224 | read_super_error: | 1224 | read_super_error: |
1225 | brelse(bh); | 1225 | brelse(bh); |
1226 | 1226 | ||
1227 | if (osb) { | 1227 | if (osb) { |
1228 | atomic_set(&osb->vol_state, VOLUME_DISABLED); | 1228 | atomic_set(&osb->vol_state, VOLUME_DISABLED); |
1229 | wake_up(&osb->osb_mount_event); | 1229 | wake_up(&osb->osb_mount_event); |
1230 | ocfs2_dismount_volume(sb, 1); | 1230 | ocfs2_dismount_volume(sb, 1); |
1231 | } | 1231 | } |
1232 | 1232 | ||
1233 | if (status) | 1233 | if (status) |
1234 | mlog_errno(status); | 1234 | mlog_errno(status); |
1235 | return status; | 1235 | return status; |
1236 | } | 1236 | } |
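
The opaque data pointer that ocfs2_fill_super() receives is the option string userspace passes as the last argument of mount(2), later consumed by ocfs2_parse_options(). A minimal userspace sketch, assuming placeholder device and mount-point paths and a few of the options handled above:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* the final string is what ocfs2_parse_options() tokenizes */
	if (mount("/dev/sdb1", "/mnt/ocfs2", "ocfs2", 0,
		  "data=ordered,atime_quantum=60,localalloc=16") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
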
1237 | 1237 | ||
1238 | static struct dentry *ocfs2_mount(struct file_system_type *fs_type, | 1238 | static struct dentry *ocfs2_mount(struct file_system_type *fs_type, |
1239 | int flags, | 1239 | int flags, |
1240 | const char *dev_name, | 1240 | const char *dev_name, |
1241 | void *data) | 1241 | void *data) |
1242 | { | 1242 | { |
1243 | return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); | 1243 | return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | static struct file_system_type ocfs2_fs_type = { | 1246 | static struct file_system_type ocfs2_fs_type = { |
1247 | .owner = THIS_MODULE, | 1247 | .owner = THIS_MODULE, |
1248 | .name = "ocfs2", | 1248 | .name = "ocfs2", |
1249 | .mount = ocfs2_mount, | 1249 | .mount = ocfs2_mount, |
1250 | .kill_sb = kill_block_super, | 1250 | .kill_sb = kill_block_super, |
1251 | .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, | 1251 | .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, |
1252 | .next = NULL | 1252 | .next = NULL |
1253 | }; | 1253 | }; |
1254 | MODULE_ALIAS_FS("ocfs2"); | 1254 | MODULE_ALIAS_FS("ocfs2"); |
1255 | 1255 | ||
1256 | static int ocfs2_check_set_options(struct super_block *sb, | 1256 | static int ocfs2_check_set_options(struct super_block *sb, |
1257 | struct mount_options *options) | 1257 | struct mount_options *options) |
1258 | { | 1258 | { |
1259 | if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && | 1259 | if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && |
1260 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | 1260 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, |
1261 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { | 1261 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { |
1262 | mlog(ML_ERROR, "User quotas were requested, but this " | 1262 | mlog(ML_ERROR, "User quotas were requested, but this " |
1263 | "filesystem does not have the feature enabled.\n"); | 1263 | "filesystem does not have the feature enabled.\n"); |
1264 | return 0; | 1264 | return 0; |
1265 | } | 1265 | } |
1266 | if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && | 1266 | if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && |
1267 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | 1267 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, |
1268 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { | 1268 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { |
1269 | mlog(ML_ERROR, "Group quotas were requested, but this " | 1269 | mlog(ML_ERROR, "Group quotas were requested, but this " |
1270 | "filesystem does not have the feature enabled.\n"); | 1270 | "filesystem does not have the feature enabled.\n"); |
1271 | return 0; | 1271 | return 0; |
1272 | } | 1272 | } |
1273 | if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && | 1273 | if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && |
1274 | !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { | 1274 | !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { |
1275 | mlog(ML_ERROR, "ACL support requested but extended attributes " | 1275 | mlog(ML_ERROR, "ACL support requested but extended attributes " |
1276 | "feature is not enabled\n"); | 1276 | "feature is not enabled\n"); |
1277 | return 0; | 1277 | return 0; |
1278 | } | 1278 | } |
1279 | /* No ACL setting specified? Use XATTR feature... */ | 1279 | /* No ACL setting specified? Use XATTR feature... */ |
1280 | if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | | 1280 | if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | |
1281 | OCFS2_MOUNT_NO_POSIX_ACL))) { | 1281 | OCFS2_MOUNT_NO_POSIX_ACL))) { |
1282 | if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) | 1282 | if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) |
1283 | options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; | 1283 | options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; |
1284 | else | 1284 | else |
1285 | options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; | 1285 | options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; |
1286 | } | 1286 | } |
1287 | return 1; | 1287 | return 1; |
1288 | } | 1288 | } |
1289 | 1289 | ||
1290 | static int ocfs2_parse_options(struct super_block *sb, | 1290 | static int ocfs2_parse_options(struct super_block *sb, |
1291 | char *options, | 1291 | char *options, |
1292 | struct mount_options *mopt, | 1292 | struct mount_options *mopt, |
1293 | int is_remount) | 1293 | int is_remount) |
1294 | { | 1294 | { |
1295 | int status, user_stack = 0; | 1295 | int status, user_stack = 0; |
1296 | char *p; | 1296 | char *p; |
1297 | u32 tmp; | 1297 | u32 tmp; |
1298 | 1298 | ||
1299 | trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); | 1299 | trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); |
1300 | 1300 | ||
1301 | mopt->commit_interval = 0; | 1301 | mopt->commit_interval = 0; |
1302 | mopt->mount_opt = OCFS2_MOUNT_NOINTR; | 1302 | mopt->mount_opt = OCFS2_MOUNT_NOINTR; |
1303 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 1303 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
1304 | mopt->slot = OCFS2_INVALID_SLOT; | 1304 | mopt->slot = OCFS2_INVALID_SLOT; |
1305 | mopt->localalloc_opt = -1; | 1305 | mopt->localalloc_opt = -1; |
1306 | mopt->cluster_stack[0] = '\0'; | 1306 | mopt->cluster_stack[0] = '\0'; |
1307 | mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; | 1307 | mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; |
1308 | mopt->dir_resv_level = -1; | 1308 | mopt->dir_resv_level = -1; |
1309 | 1309 | ||
1310 | if (!options) { | 1310 | if (!options) { |
1311 | status = 1; | 1311 | status = 1; |
1312 | goto bail; | 1312 | goto bail; |
1313 | } | 1313 | } |
1314 | 1314 | ||
1315 | while ((p = strsep(&options, ",")) != NULL) { | 1315 | while ((p = strsep(&options, ",")) != NULL) { |
1316 | int token, option; | 1316 | int token, option; |
1317 | substring_t args[MAX_OPT_ARGS]; | 1317 | substring_t args[MAX_OPT_ARGS]; |
1318 | 1318 | ||
1319 | if (!*p) | 1319 | if (!*p) |
1320 | continue; | 1320 | continue; |
1321 | 1321 | ||
1322 | token = match_token(p, tokens, args); | 1322 | token = match_token(p, tokens, args); |
1323 | switch (token) { | 1323 | switch (token) { |
1324 | case Opt_hb_local: | 1324 | case Opt_hb_local: |
1325 | mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; | 1325 | mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; |
1326 | break; | 1326 | break; |
1327 | case Opt_hb_none: | 1327 | case Opt_hb_none: |
1328 | mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; | 1328 | mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; |
1329 | break; | 1329 | break; |
1330 | case Opt_hb_global: | 1330 | case Opt_hb_global: |
1331 | mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; | 1331 | mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; |
1332 | break; | 1332 | break; |
1333 | case Opt_barrier: | 1333 | case Opt_barrier: |
1334 | if (match_int(&args[0], &option)) { | 1334 | if (match_int(&args[0], &option)) { |
1335 | status = 0; | 1335 | status = 0; |
1336 | goto bail; | 1336 | goto bail; |
1337 | } | 1337 | } |
1338 | if (option) | 1338 | if (option) |
1339 | mopt->mount_opt |= OCFS2_MOUNT_BARRIER; | 1339 | mopt->mount_opt |= OCFS2_MOUNT_BARRIER; |
1340 | else | 1340 | else |
1341 | mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; | 1341 | mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; |
1342 | break; | 1342 | break; |
1343 | case Opt_intr: | 1343 | case Opt_intr: |
1344 | mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; | 1344 | mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; |
1345 | break; | 1345 | break; |
1346 | case Opt_nointr: | 1346 | case Opt_nointr: |
1347 | mopt->mount_opt |= OCFS2_MOUNT_NOINTR; | 1347 | mopt->mount_opt |= OCFS2_MOUNT_NOINTR; |
1348 | break; | 1348 | break; |
1349 | case Opt_err_panic: | 1349 | case Opt_err_panic: |
1350 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | 1350 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; |
1351 | break; | 1351 | break; |
1352 | case Opt_err_ro: | 1352 | case Opt_err_ro: |
1353 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; | 1353 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; |
1354 | break; | 1354 | break; |
1355 | case Opt_data_ordered: | 1355 | case Opt_data_ordered: |
1356 | mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; | 1356 | mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; |
1357 | break; | 1357 | break; |
1358 | case Opt_data_writeback: | 1358 | case Opt_data_writeback: |
1359 | mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; | 1359 | mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; |
1360 | break; | 1360 | break; |
1361 | case Opt_user_xattr: | 1361 | case Opt_user_xattr: |
1362 | mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; | 1362 | mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; |
1363 | break; | 1363 | break; |
1364 | case Opt_nouser_xattr: | 1364 | case Opt_nouser_xattr: |
1365 | mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; | 1365 | mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; |
1366 | break; | 1366 | break; |
1367 | case Opt_atime_quantum: | 1367 | case Opt_atime_quantum: |
1368 | if (match_int(&args[0], &option)) { | 1368 | if (match_int(&args[0], &option)) { |
1369 | status = 0; | 1369 | status = 0; |
1370 | goto bail; | 1370 | goto bail; |
1371 | } | 1371 | } |
1372 | if (option >= 0) | 1372 | if (option >= 0) |
1373 | mopt->atime_quantum = option; | 1373 | mopt->atime_quantum = option; |
1374 | break; | 1374 | break; |
1375 | case Opt_slot: | 1375 | case Opt_slot: |
1376 | option = 0; | 1376 | option = 0; |
1377 | if (match_int(&args[0], &option)) { | 1377 | if (match_int(&args[0], &option)) { |
1378 | status = 0; | 1378 | status = 0; |
1379 | goto bail; | 1379 | goto bail; |
1380 | } | 1380 | } |
1381 | if (option) | 1381 | if (option) |
1382 | mopt->slot = (s16)option; | 1382 | mopt->slot = (s16)option; |
1383 | break; | 1383 | break; |
1384 | case Opt_commit: | 1384 | case Opt_commit: |
1385 | option = 0; | 1385 | option = 0; |
1386 | if (match_int(&args[0], &option)) { | 1386 | if (match_int(&args[0], &option)) { |
1387 | status = 0; | 1387 | status = 0; |
1388 | goto bail; | 1388 | goto bail; |
1389 | } | 1389 | } |
1390 | if (option < 0) | 1390 | if (option < 0) |
1391 | return 0; | 1391 | return 0; |
1392 | if (option == 0) | 1392 | if (option == 0) |
1393 | option = JBD2_DEFAULT_MAX_COMMIT_AGE; | 1393 | option = JBD2_DEFAULT_MAX_COMMIT_AGE; |
1394 | mopt->commit_interval = HZ * option; | 1394 | mopt->commit_interval = HZ * option; |
1395 | break; | 1395 | break; |
1396 | case Opt_localalloc: | 1396 | case Opt_localalloc: |
1397 | option = 0; | 1397 | option = 0; |
1398 | if (match_int(&args[0], &option)) { | 1398 | if (match_int(&args[0], &option)) { |
1399 | status = 0; | 1399 | status = 0; |
1400 | goto bail; | 1400 | goto bail; |
1401 | } | 1401 | } |
1402 | if (option >= 0) | 1402 | if (option >= 0) |
1403 | mopt->localalloc_opt = option; | 1403 | mopt->localalloc_opt = option; |
1404 | break; | 1404 | break; |
1405 | case Opt_localflocks: | 1405 | case Opt_localflocks: |
1406 | /* | 1406 | /* |
1407 | * Changing this during remount could race | 1407 | * Changing this during remount could race |
1408 | * flock() requests, or "unbalance" existing | 1408 | * flock() requests, or "unbalance" existing |
1409 | * ones (e.g., a lock is taken in one mode but | 1409 | * ones (e.g., a lock is taken in one mode but |
1410 | * dropped in the other). If users care enough | 1410 | * dropped in the other). If users care enough |
1411 | * to flip locking modes during remount, we | 1411 | * to flip locking modes during remount, we |
1412 | * could add a "local" flag to individual | 1412 | * could add a "local" flag to individual |
1413 | * flock structures for proper tracking of | 1413 | * flock structures for proper tracking of |
1414 | * state. | 1414 | * state. |
1415 | */ | 1415 | */ |
1416 | if (!is_remount) | 1416 | if (!is_remount) |
1417 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; | 1417 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; |
1418 | break; | 1418 | break; |
1419 | case Opt_stack: | 1419 | case Opt_stack: |
1420 | /* Check both that the option we were passed | 1420 | /* Check both that the option we were passed |
1421 | * is of the right length and that it is a proper | 1421 | * is of the right length and that it is a proper |
1422 | * string of the right length. | 1422 | * string of the right length. |
1423 | */ | 1423 | */ |
1424 | if (((args[0].to - args[0].from) != | 1424 | if (((args[0].to - args[0].from) != |
1425 | OCFS2_STACK_LABEL_LEN) || | 1425 | OCFS2_STACK_LABEL_LEN) || |
1426 | (strnlen(args[0].from, | 1426 | (strnlen(args[0].from, |
1427 | OCFS2_STACK_LABEL_LEN) != | 1427 | OCFS2_STACK_LABEL_LEN) != |
1428 | OCFS2_STACK_LABEL_LEN)) { | 1428 | OCFS2_STACK_LABEL_LEN)) { |
1429 | mlog(ML_ERROR, | 1429 | mlog(ML_ERROR, |
1430 | "Invalid cluster_stack option\n"); | 1430 | "Invalid cluster_stack option\n"); |
1431 | status = 0; | 1431 | status = 0; |
1432 | goto bail; | 1432 | goto bail; |
1433 | } | 1433 | } |
1434 | memcpy(mopt->cluster_stack, args[0].from, | 1434 | memcpy(mopt->cluster_stack, args[0].from, |
1435 | OCFS2_STACK_LABEL_LEN); | 1435 | OCFS2_STACK_LABEL_LEN); |
1436 | mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | 1436 | mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; |
1437 | /* | 1437 | /* |
1438 | * Open code the memcmp here as we don't have | 1438 | * Open code the memcmp here as we don't have |
1439 | * an osb to pass to | 1439 | * an osb to pass to |
1440 | * ocfs2_userspace_stack(). | 1440 | * ocfs2_userspace_stack(). |
1441 | */ | 1441 | */ |
1442 | if (memcmp(mopt->cluster_stack, | 1442 | if (memcmp(mopt->cluster_stack, |
1443 | OCFS2_CLASSIC_CLUSTER_STACK, | 1443 | OCFS2_CLASSIC_CLUSTER_STACK, |
1444 | OCFS2_STACK_LABEL_LEN)) | 1444 | OCFS2_STACK_LABEL_LEN)) |
1445 | user_stack = 1; | 1445 | user_stack = 1; |
1446 | break; | 1446 | break; |
1447 | case Opt_inode64: | 1447 | case Opt_inode64: |
1448 | mopt->mount_opt |= OCFS2_MOUNT_INODE64; | 1448 | mopt->mount_opt |= OCFS2_MOUNT_INODE64; |
1449 | break; | 1449 | break; |
1450 | case Opt_usrquota: | 1450 | case Opt_usrquota: |
1451 | mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; | 1451 | mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; |
1452 | break; | 1452 | break; |
1453 | case Opt_grpquota: | 1453 | case Opt_grpquota: |
1454 | mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; | 1454 | mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; |
1455 | break; | 1455 | break; |
1456 | case Opt_coherency_buffered: | 1456 | case Opt_coherency_buffered: |
1457 | mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; | 1457 | mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; |
1458 | break; | 1458 | break; |
1459 | case Opt_coherency_full: | 1459 | case Opt_coherency_full: |
1460 | mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; | 1460 | mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; |
1461 | break; | 1461 | break; |
1462 | case Opt_acl: | 1462 | case Opt_acl: |
1463 | mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; | 1463 | mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; |
1464 | mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; | 1464 | mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; |
1465 | break; | 1465 | break; |
1466 | case Opt_noacl: | 1466 | case Opt_noacl: |
1467 | mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; | 1467 | mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; |
1468 | mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; | 1468 | mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; |
1469 | break; | 1469 | break; |
1470 | case Opt_resv_level: | 1470 | case Opt_resv_level: |
1471 | if (is_remount) | 1471 | if (is_remount) |
1472 | break; | 1472 | break; |
1473 | if (match_int(&args[0], &option)) { | 1473 | if (match_int(&args[0], &option)) { |
1474 | status = 0; | 1474 | status = 0; |
1475 | goto bail; | 1475 | goto bail; |
1476 | } | 1476 | } |
1477 | if (option >= OCFS2_MIN_RESV_LEVEL && | 1477 | if (option >= OCFS2_MIN_RESV_LEVEL && |
1478 | option < OCFS2_MAX_RESV_LEVEL) | 1478 | option < OCFS2_MAX_RESV_LEVEL) |
1479 | mopt->resv_level = option; | 1479 | mopt->resv_level = option; |
1480 | break; | 1480 | break; |
1481 | case Opt_dir_resv_level: | 1481 | case Opt_dir_resv_level: |
1482 | if (is_remount) | 1482 | if (is_remount) |
1483 | break; | 1483 | break; |
1484 | if (match_int(&args[0], &option)) { | 1484 | if (match_int(&args[0], &option)) { |
1485 | status = 0; | 1485 | status = 0; |
1486 | goto bail; | 1486 | goto bail; |
1487 | } | 1487 | } |
1488 | if (option >= OCFS2_MIN_RESV_LEVEL && | 1488 | if (option >= OCFS2_MIN_RESV_LEVEL && |
1489 | option < OCFS2_MAX_RESV_LEVEL) | 1489 | option < OCFS2_MAX_RESV_LEVEL) |
1490 | mopt->dir_resv_level = option; | 1490 | mopt->dir_resv_level = option; |
1491 | break; | 1491 | break; |
1492 | default: | 1492 | default: |
1493 | mlog(ML_ERROR, | 1493 | mlog(ML_ERROR, |
1494 | "Unrecognized mount option \"%s\" " | 1494 | "Unrecognized mount option \"%s\" " |
1495 | "or missing value\n", p); | 1495 | "or missing value\n", p); |
1496 | status = 0; | 1496 | status = 0; |
1497 | goto bail; | 1497 | goto bail; |
1498 | } | 1498 | } |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | if (user_stack == 0) { | 1501 | if (user_stack == 0) { |
1502 | /* Ensure only one heartbeat mode */ | 1502 | /* Ensure only one heartbeat mode */ |
1503 | tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | | 1503 | tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | |
1504 | OCFS2_MOUNT_HB_GLOBAL | | 1504 | OCFS2_MOUNT_HB_GLOBAL | |
1505 | OCFS2_MOUNT_HB_NONE); | 1505 | OCFS2_MOUNT_HB_NONE); |
1506 | if (hweight32(tmp) != 1) { | 1506 | if (hweight32(tmp) != 1) { |
1507 | mlog(ML_ERROR, "Invalid heartbeat mount options\n"); | 1507 | mlog(ML_ERROR, "Invalid heartbeat mount options\n"); |
1508 | status = 0; | 1508 | status = 0; |
1509 | goto bail; | 1509 | goto bail; |
1510 | } | 1510 | } |
1511 | } | 1511 | } |
1512 | 1512 | ||
1513 | status = 1; | 1513 | status = 1; |
1514 | 1514 | ||
1515 | bail: | 1515 | bail: |
1516 | return status; | 1516 | return status; |
1517 | } | 1517 | } |
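
ocfs2_parse_options() is a conventional strsep()/match_token() walk over a comma-separated option string: empty tokens are skipped, each recognized token flips bits in mopt->mount_opt, and anything unrecognized fails the parse. A stripped-down userspace sketch of the same tokenizing pattern, with match_token() replaced by a plain strncmp() purely for illustration:

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char opts[] = "barrier=1,,data=writeback,noacl";
	char *rest = opts, *p;

	while ((p = strsep(&rest, ",")) != NULL) {
		if (!*p)	/* skip empty tokens, as the kernel loop does */
			continue;
		if (!strncmp(p, "data=", 5))
			printf("journal mode: %s\n", p + 5);
		else
			printf("option: %s\n", p);
	}
	return 0;
}
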
1518 | 1518 | ||
1519 | static int ocfs2_show_options(struct seq_file *s, struct dentry *root) | 1519 | static int ocfs2_show_options(struct seq_file *s, struct dentry *root) |
1520 | { | 1520 | { |
1521 | struct ocfs2_super *osb = OCFS2_SB(root->d_sb); | 1521 | struct ocfs2_super *osb = OCFS2_SB(root->d_sb); |
1522 | unsigned long opts = osb->s_mount_opt; | 1522 | unsigned long opts = osb->s_mount_opt; |
1523 | unsigned int local_alloc_megs; | 1523 | unsigned int local_alloc_megs; |
1524 | 1524 | ||
1525 | if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { | 1525 | if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { |
1526 | seq_printf(s, ",_netdev"); | 1526 | seq_printf(s, ",_netdev"); |
1527 | if (opts & OCFS2_MOUNT_HB_LOCAL) | 1527 | if (opts & OCFS2_MOUNT_HB_LOCAL) |
1528 | seq_printf(s, ",%s", OCFS2_HB_LOCAL); | 1528 | seq_printf(s, ",%s", OCFS2_HB_LOCAL); |
1529 | else | 1529 | else |
1530 | seq_printf(s, ",%s", OCFS2_HB_GLOBAL); | 1530 | seq_printf(s, ",%s", OCFS2_HB_GLOBAL); |
1531 | } else | 1531 | } else |
1532 | seq_printf(s, ",%s", OCFS2_HB_NONE); | 1532 | seq_printf(s, ",%s", OCFS2_HB_NONE); |
1533 | 1533 | ||
1534 | if (opts & OCFS2_MOUNT_NOINTR) | 1534 | if (opts & OCFS2_MOUNT_NOINTR) |
1535 | seq_printf(s, ",nointr"); | 1535 | seq_printf(s, ",nointr"); |
1536 | 1536 | ||
1537 | if (opts & OCFS2_MOUNT_DATA_WRITEBACK) | 1537 | if (opts & OCFS2_MOUNT_DATA_WRITEBACK) |
1538 | seq_printf(s, ",data=writeback"); | 1538 | seq_printf(s, ",data=writeback"); |
1539 | else | 1539 | else |
1540 | seq_printf(s, ",data=ordered"); | 1540 | seq_printf(s, ",data=ordered"); |
1541 | 1541 | ||
1542 | if (opts & OCFS2_MOUNT_BARRIER) | 1542 | if (opts & OCFS2_MOUNT_BARRIER) |
1543 | seq_printf(s, ",barrier=1"); | 1543 | seq_printf(s, ",barrier=1"); |
1544 | 1544 | ||
1545 | if (opts & OCFS2_MOUNT_ERRORS_PANIC) | 1545 | if (opts & OCFS2_MOUNT_ERRORS_PANIC) |
1546 | seq_printf(s, ",errors=panic"); | 1546 | seq_printf(s, ",errors=panic"); |
1547 | else | 1547 | else |
1548 | seq_printf(s, ",errors=remount-ro"); | 1548 | seq_printf(s, ",errors=remount-ro"); |
1549 | 1549 | ||
1550 | if (osb->preferred_slot != OCFS2_INVALID_SLOT) | 1550 | if (osb->preferred_slot != OCFS2_INVALID_SLOT) |
1551 | seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); | 1551 | seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); |
1552 | 1552 | ||
1553 | seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); | 1553 | seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); |
1554 | 1554 | ||
1555 | if (osb->osb_commit_interval) | 1555 | if (osb->osb_commit_interval) |
1556 | seq_printf(s, ",commit=%u", | 1556 | seq_printf(s, ",commit=%u", |
1557 | (unsigned) (osb->osb_commit_interval / HZ)); | 1557 | (unsigned) (osb->osb_commit_interval / HZ)); |
1558 | 1558 | ||
1559 | local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); | 1559 | local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); |
1560 | if (local_alloc_megs != ocfs2_la_default_mb(osb)) | 1560 | if (local_alloc_megs != ocfs2_la_default_mb(osb)) |
1561 | seq_printf(s, ",localalloc=%d", local_alloc_megs); | 1561 | seq_printf(s, ",localalloc=%d", local_alloc_megs); |
1562 | 1562 | ||
1563 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) | 1563 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) |
1564 | seq_printf(s, ",localflocks,"); | 1564 | seq_printf(s, ",localflocks,"); |
1565 | 1565 | ||
1566 | if (osb->osb_cluster_stack[0]) | 1566 | if (osb->osb_cluster_stack[0]) |
1567 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, | 1567 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, |
1568 | osb->osb_cluster_stack); | 1568 | osb->osb_cluster_stack); |
1569 | if (opts & OCFS2_MOUNT_USRQUOTA) | 1569 | if (opts & OCFS2_MOUNT_USRQUOTA) |
1570 | seq_printf(s, ",usrquota"); | 1570 | seq_printf(s, ",usrquota"); |
1571 | if (opts & OCFS2_MOUNT_GRPQUOTA) | 1571 | if (opts & OCFS2_MOUNT_GRPQUOTA) |
1572 | seq_printf(s, ",grpquota"); | 1572 | seq_printf(s, ",grpquota"); |
1573 | 1573 | ||
1574 | if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) | 1574 | if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) |
1575 | seq_printf(s, ",coherency=buffered"); | 1575 | seq_printf(s, ",coherency=buffered"); |
1576 | else | 1576 | else |
1577 | seq_printf(s, ",coherency=full"); | 1577 | seq_printf(s, ",coherency=full"); |
1578 | 1578 | ||
1579 | if (opts & OCFS2_MOUNT_NOUSERXATTR) | 1579 | if (opts & OCFS2_MOUNT_NOUSERXATTR) |
1580 | seq_printf(s, ",nouser_xattr"); | 1580 | seq_printf(s, ",nouser_xattr"); |
1581 | else | 1581 | else |
1582 | seq_printf(s, ",user_xattr"); | 1582 | seq_printf(s, ",user_xattr"); |
1583 | 1583 | ||
1584 | if (opts & OCFS2_MOUNT_INODE64) | 1584 | if (opts & OCFS2_MOUNT_INODE64) |
1585 | seq_printf(s, ",inode64"); | 1585 | seq_printf(s, ",inode64"); |
1586 | 1586 | ||
1587 | if (opts & OCFS2_MOUNT_POSIX_ACL) | 1587 | if (opts & OCFS2_MOUNT_POSIX_ACL) |
1588 | seq_printf(s, ",acl"); | 1588 | seq_printf(s, ",acl"); |
1589 | else | 1589 | else |
1590 | seq_printf(s, ",noacl"); | 1590 | seq_printf(s, ",noacl"); |
1591 | 1591 | ||
1592 | if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) | 1592 | if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) |
1593 | seq_printf(s, ",resv_level=%d", osb->osb_resv_level); | 1593 | seq_printf(s, ",resv_level=%d", osb->osb_resv_level); |
1594 | 1594 | ||
1595 | if (osb->osb_dir_resv_level != osb->osb_resv_level) | 1595 | if (osb->osb_dir_resv_level != osb->osb_resv_level) |
1596 | seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); | 1596 | seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); |

1597 | 1597 | ||
1598 | return 0; | 1598 | return 0; |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | static int __init ocfs2_init(void) | 1601 | static int __init ocfs2_init(void) |
1602 | { | 1602 | { |
1603 | int status; | 1603 | int status; |
1604 | 1604 | ||
1605 | status = init_ocfs2_uptodate_cache(); | 1605 | status = init_ocfs2_uptodate_cache(); |
1606 | if (status < 0) | 1606 | if (status < 0) |
1607 | goto out1; | 1607 | goto out1; |
1608 | 1608 | ||
1609 | status = ocfs2_initialize_mem_caches(); | 1609 | status = ocfs2_initialize_mem_caches(); |
1610 | if (status < 0) | 1610 | if (status < 0) |
1611 | goto out2; | 1611 | goto out2; |
1612 | 1612 | ||
1613 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | 1613 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); |
1614 | if (!ocfs2_wq) { | 1614 | if (!ocfs2_wq) { |
1615 | status = -ENOMEM; | 1615 | status = -ENOMEM; |
1616 | goto out3; | 1616 | goto out3; |
1617 | } | 1617 | } |
1618 | 1618 | ||
1619 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | 1619 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); |
1620 | if (!ocfs2_debugfs_root) { | 1620 | if (!ocfs2_debugfs_root) { |
1621 | status = -EFAULT; | 1621 | status = -EFAULT; |
1622 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1622 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
1623 | } | 1623 | } |
1624 | 1624 | ||
1625 | ocfs2_set_locking_protocol(); | 1625 | ocfs2_set_locking_protocol(); |
1626 | 1626 | ||
1627 | status = register_quota_format(&ocfs2_quota_format); | 1627 | status = register_quota_format(&ocfs2_quota_format); |
1628 | if (status < 0) | 1628 | if (status < 0) |
1629 | goto out4; | 1629 | goto out4; |
1630 | status = register_filesystem(&ocfs2_fs_type); | 1630 | status = register_filesystem(&ocfs2_fs_type); |
1631 | if (!status) | 1631 | if (!status) |
1632 | return 0; | 1632 | return 0; |
1633 | 1633 | ||
1634 | unregister_quota_format(&ocfs2_quota_format); | 1634 | unregister_quota_format(&ocfs2_quota_format); |
1635 | out4: | 1635 | out4: |
1636 | destroy_workqueue(ocfs2_wq); | 1636 | destroy_workqueue(ocfs2_wq); |
1637 | debugfs_remove(ocfs2_debugfs_root); | 1637 | debugfs_remove(ocfs2_debugfs_root); |
1638 | out3: | 1638 | out3: |
1639 | ocfs2_free_mem_caches(); | 1639 | ocfs2_free_mem_caches(); |
1640 | out2: | 1640 | out2: |
1641 | exit_ocfs2_uptodate_cache(); | 1641 | exit_ocfs2_uptodate_cache(); |
1642 | out1: | 1642 | out1: |
1643 | mlog_errno(status); | 1643 | mlog_errno(status); |
1644 | return status; | 1644 | return status; |
1645 | } | 1645 | } |
1646 | 1646 | ||
1647 | static void __exit ocfs2_exit(void) | 1647 | static void __exit ocfs2_exit(void) |
1648 | { | 1648 | { |
1649 | if (ocfs2_wq) { | 1649 | if (ocfs2_wq) { |
1650 | flush_workqueue(ocfs2_wq); | 1650 | flush_workqueue(ocfs2_wq); |
1651 | destroy_workqueue(ocfs2_wq); | 1651 | destroy_workqueue(ocfs2_wq); |
1652 | } | 1652 | } |
1653 | 1653 | ||
1654 | unregister_quota_format(&ocfs2_quota_format); | 1654 | unregister_quota_format(&ocfs2_quota_format); |
1655 | 1655 | ||
1656 | debugfs_remove(ocfs2_debugfs_root); | 1656 | debugfs_remove(ocfs2_debugfs_root); |
1657 | 1657 | ||
1658 | ocfs2_free_mem_caches(); | 1658 | ocfs2_free_mem_caches(); |
1659 | 1659 | ||
1660 | unregister_filesystem(&ocfs2_fs_type); | 1660 | unregister_filesystem(&ocfs2_fs_type); |
1661 | 1661 | ||
1662 | exit_ocfs2_uptodate_cache(); | 1662 | exit_ocfs2_uptodate_cache(); |
1663 | } | 1663 | } |
1664 | 1664 | ||
1665 | static void ocfs2_put_super(struct super_block *sb) | 1665 | static void ocfs2_put_super(struct super_block *sb) |
1666 | { | 1666 | { |
1667 | trace_ocfs2_put_super(sb); | 1667 | trace_ocfs2_put_super(sb); |
1668 | 1668 | ||
1669 | ocfs2_sync_blockdev(sb); | 1669 | ocfs2_sync_blockdev(sb); |
1670 | ocfs2_dismount_volume(sb, 0); | 1670 | ocfs2_dismount_volume(sb, 0); |
1671 | } | 1671 | } |
1672 | 1672 | ||
1673 | static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) | 1673 | static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) |
1674 | { | 1674 | { |
1675 | struct ocfs2_super *osb; | 1675 | struct ocfs2_super *osb; |
1676 | u32 numbits, freebits; | 1676 | u32 numbits, freebits; |
1677 | int status; | 1677 | int status; |
1678 | struct ocfs2_dinode *bm_lock; | 1678 | struct ocfs2_dinode *bm_lock; |
1679 | struct buffer_head *bh = NULL; | 1679 | struct buffer_head *bh = NULL; |
1680 | struct inode *inode = NULL; | 1680 | struct inode *inode = NULL; |
1681 | 1681 | ||
1682 | trace_ocfs2_statfs(dentry->d_sb, buf); | 1682 | trace_ocfs2_statfs(dentry->d_sb, buf); |
1683 | 1683 | ||
1684 | osb = OCFS2_SB(dentry->d_sb); | 1684 | osb = OCFS2_SB(dentry->d_sb); |
1685 | 1685 | ||
1686 | inode = ocfs2_get_system_file_inode(osb, | 1686 | inode = ocfs2_get_system_file_inode(osb, |
1687 | GLOBAL_BITMAP_SYSTEM_INODE, | 1687 | GLOBAL_BITMAP_SYSTEM_INODE, |
1688 | OCFS2_INVALID_SLOT); | 1688 | OCFS2_INVALID_SLOT); |
1689 | if (!inode) { | 1689 | if (!inode) { |
1690 | mlog(ML_ERROR, "failed to get bitmap inode\n"); | 1690 | mlog(ML_ERROR, "failed to get bitmap inode\n"); |
1691 | status = -EIO; | 1691 | status = -EIO; |
1692 | goto bail; | 1692 | goto bail; |
1693 | } | 1693 | } |
1694 | 1694 | ||
1695 | status = ocfs2_inode_lock(inode, &bh, 0); | 1695 | status = ocfs2_inode_lock(inode, &bh, 0); |
1696 | if (status < 0) { | 1696 | if (status < 0) { |
1697 | mlog_errno(status); | 1697 | mlog_errno(status); |
1698 | goto bail; | 1698 | goto bail; |
1699 | } | 1699 | } |
1700 | 1700 | ||
1701 | bm_lock = (struct ocfs2_dinode *) bh->b_data; | 1701 | bm_lock = (struct ocfs2_dinode *) bh->b_data; |
1702 | 1702 | ||
1703 | numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); | 1703 | numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); |
1704 | freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); | 1704 | freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); |
1705 | 1705 | ||
1706 | buf->f_type = OCFS2_SUPER_MAGIC; | 1706 | buf->f_type = OCFS2_SUPER_MAGIC; |
1707 | buf->f_bsize = dentry->d_sb->s_blocksize; | 1707 | buf->f_bsize = dentry->d_sb->s_blocksize; |
1708 | buf->f_namelen = OCFS2_MAX_FILENAME_LEN; | 1708 | buf->f_namelen = OCFS2_MAX_FILENAME_LEN; |
1709 | buf->f_blocks = ((sector_t) numbits) * | 1709 | buf->f_blocks = ((sector_t) numbits) * |
1710 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | 1710 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); |
1711 | buf->f_bfree = ((sector_t) freebits) * | 1711 | buf->f_bfree = ((sector_t) freebits) * |
1712 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | 1712 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); |
1713 | buf->f_bavail = buf->f_bfree; | 1713 | buf->f_bavail = buf->f_bfree; |
1714 | buf->f_files = numbits; | 1714 | buf->f_files = numbits; |
1715 | buf->f_ffree = freebits; | 1715 | buf->f_ffree = freebits; |
1716 | buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) | 1716 | buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) |
1717 | & 0xFFFFFFFFUL; | 1717 | & 0xFFFFFFFFUL; |
1718 | buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, | 1718 | buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, |
1719 | OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; | 1719 | OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; |
1720 | 1720 | ||
1721 | brelse(bh); | 1721 | brelse(bh); |
1722 | 1722 | ||
1723 | ocfs2_inode_unlock(inode, 0); | 1723 | ocfs2_inode_unlock(inode, 0); |
1724 | status = 0; | 1724 | status = 0; |
1725 | bail: | 1725 | bail: |
1726 | if (inode) | 1726 | if (inode) |
1727 | iput(inode); | 1727 | iput(inode); |
1728 | 1728 | ||
1729 | if (status) | 1729 | if (status) |
1730 | mlog_errno(status); | 1730 | mlog_errno(status); |
1731 | 1731 | ||
1732 | return status; | 1732 | return status; |
1733 | } | 1733 | } |
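
ocfs2_statfs() reports sizes in filesystem blocks while the global bitmap counts clusters, hence the (s_clustersize >> s_blocksize_bits) scaling applied to i_total and to the free count. A self-contained arithmetic sketch with assumed geometry (64 KiB clusters, 4 KiB blocks):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t numbits = 1000000;		/* total clusters in the global bitmap */
	uint32_t freebits = 250000;		/* clusters still free */
	uint32_t clustersize = 64 * 1024;	/* bytes per cluster (assumed) */
	unsigned blocksize_bits = 12;		/* 4 KiB filesystem blocks (assumed) */
	uint32_t blocks_per_cluster = clustersize >> blocksize_bits;	/* 16 */

	printf("f_blocks = %llu\n",
	       (unsigned long long)numbits * blocks_per_cluster);
	printf("f_bfree  = %llu\n",
	       (unsigned long long)freebits * blocks_per_cluster);
	return 0;
}
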
1734 | 1734 | ||
1735 | static void ocfs2_inode_init_once(void *data) | 1735 | static void ocfs2_inode_init_once(void *data) |
1736 | { | 1736 | { |
1737 | struct ocfs2_inode_info *oi = data; | 1737 | struct ocfs2_inode_info *oi = data; |
1738 | 1738 | ||
1739 | oi->ip_flags = 0; | 1739 | oi->ip_flags = 0; |
1740 | oi->ip_open_count = 0; | 1740 | oi->ip_open_count = 0; |
1741 | spin_lock_init(&oi->ip_lock); | 1741 | spin_lock_init(&oi->ip_lock); |
1742 | ocfs2_extent_map_init(&oi->vfs_inode); | 1742 | ocfs2_extent_map_init(&oi->vfs_inode); |
1743 | INIT_LIST_HEAD(&oi->ip_io_markers); | 1743 | INIT_LIST_HEAD(&oi->ip_io_markers); |
1744 | oi->ip_dir_start_lookup = 0; | 1744 | oi->ip_dir_start_lookup = 0; |
1745 | mutex_init(&oi->ip_unaligned_aio); | 1745 | mutex_init(&oi->ip_unaligned_aio); |
1746 | init_rwsem(&oi->ip_alloc_sem); | 1746 | init_rwsem(&oi->ip_alloc_sem); |
1747 | init_rwsem(&oi->ip_xattr_sem); | 1747 | init_rwsem(&oi->ip_xattr_sem); |
1748 | mutex_init(&oi->ip_io_mutex); | 1748 | mutex_init(&oi->ip_io_mutex); |
1749 | 1749 | ||
1750 | oi->ip_blkno = 0ULL; | 1750 | oi->ip_blkno = 0ULL; |
1751 | oi->ip_clusters = 0; | 1751 | oi->ip_clusters = 0; |
1752 | 1752 | ||
1753 | ocfs2_resv_init_once(&oi->ip_la_data_resv); | 1753 | ocfs2_resv_init_once(&oi->ip_la_data_resv); |
1754 | 1754 | ||
1755 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); | 1755 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); |
1756 | ocfs2_lock_res_init_once(&oi->ip_inode_lockres); | 1756 | ocfs2_lock_res_init_once(&oi->ip_inode_lockres); |
1757 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); | 1757 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); |
1758 | 1758 | ||
1759 | ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), | 1759 | ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), |
1760 | &ocfs2_inode_caching_ops); | 1760 | &ocfs2_inode_caching_ops); |
1761 | 1761 | ||
1762 | inode_init_once(&oi->vfs_inode); | 1762 | inode_init_once(&oi->vfs_inode); |
1763 | } | 1763 | } |
1764 | 1764 | ||
1765 | static int ocfs2_initialize_mem_caches(void) | 1765 | static int ocfs2_initialize_mem_caches(void) |
1766 | { | 1766 | { |
1767 | ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", | 1767 | ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", |
1768 | sizeof(struct ocfs2_inode_info), | 1768 | sizeof(struct ocfs2_inode_info), |
1769 | 0, | 1769 | 0, |
1770 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 1770 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
1771 | SLAB_MEM_SPREAD), | 1771 | SLAB_MEM_SPREAD), |
1772 | ocfs2_inode_init_once); | 1772 | ocfs2_inode_init_once); |
1773 | ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", | 1773 | ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", |
1774 | sizeof(struct ocfs2_dquot), | 1774 | sizeof(struct ocfs2_dquot), |
1775 | 0, | 1775 | 0, |
1776 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 1776 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
1777 | SLAB_MEM_SPREAD), | 1777 | SLAB_MEM_SPREAD), |
1778 | NULL); | 1778 | NULL); |
1779 | ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", | 1779 | ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", |
1780 | sizeof(struct ocfs2_quota_chunk), | 1780 | sizeof(struct ocfs2_quota_chunk), |
1781 | 0, | 1781 | 0, |
1782 | (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), | 1782 | (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), |
1783 | NULL); | 1783 | NULL); |
1784 | if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || | 1784 | if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || |
1785 | !ocfs2_qf_chunk_cachep) { | 1785 | !ocfs2_qf_chunk_cachep) { |
1786 | if (ocfs2_inode_cachep) | 1786 | if (ocfs2_inode_cachep) |
1787 | kmem_cache_destroy(ocfs2_inode_cachep); | 1787 | kmem_cache_destroy(ocfs2_inode_cachep); |
1788 | if (ocfs2_dquot_cachep) | 1788 | if (ocfs2_dquot_cachep) |
1789 | kmem_cache_destroy(ocfs2_dquot_cachep); | 1789 | kmem_cache_destroy(ocfs2_dquot_cachep); |
1790 | if (ocfs2_qf_chunk_cachep) | 1790 | if (ocfs2_qf_chunk_cachep) |
1791 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); | 1791 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); |
1792 | return -ENOMEM; | 1792 | return -ENOMEM; |
1793 | } | 1793 | } |
1794 | 1794 | ||
1795 | return 0; | 1795 | return 0; |
1796 | } | 1796 | } |
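
ocfs2_initialize_mem_caches() attempts all three kmem_cache_create() calls first and, only if any of them failed, destroys exactly the ones that succeeded. A userspace sketch of the same partial-failure cleanup pattern, with malloc() standing in for kmem_cache_create() and illustrative pool names:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *inode_pool = malloc(4096);
	void *dquot_pool = malloc(4096);
	void *chunk_pool = malloc(4096);

	if (!inode_pool || !dquot_pool || !chunk_pool) {
		/* free(NULL) is a no-op, mirroring the NULL checks above */
		free(inode_pool);
		free(dquot_pool);
		free(chunk_pool);
		fprintf(stderr, "out of memory\n");
		return 1;
	}

	/* ... the caches would normally live for the module's lifetime ... */
	free(inode_pool);
	free(dquot_pool);
	free(chunk_pool);
	return 0;
}
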
1797 | 1797 | ||
1798 | static void ocfs2_free_mem_caches(void) | 1798 | static void ocfs2_free_mem_caches(void) |
1799 | { | 1799 | { |
1800 | /* | 1800 | /* |
1801 | * Make sure all delayed rcu free inodes are flushed before we | 1801 | * Make sure all delayed rcu free inodes are flushed before we |
1802 | * destroy cache. | 1802 | * destroy cache. |
1803 | */ | 1803 | */ |
1804 | rcu_barrier(); | 1804 | rcu_barrier(); |
1805 | if (ocfs2_inode_cachep) | 1805 | if (ocfs2_inode_cachep) |
1806 | kmem_cache_destroy(ocfs2_inode_cachep); | 1806 | kmem_cache_destroy(ocfs2_inode_cachep); |
1807 | ocfs2_inode_cachep = NULL; | 1807 | ocfs2_inode_cachep = NULL; |
1808 | 1808 | ||
1809 | if (ocfs2_dquot_cachep) | 1809 | if (ocfs2_dquot_cachep) |
1810 | kmem_cache_destroy(ocfs2_dquot_cachep); | 1810 | kmem_cache_destroy(ocfs2_dquot_cachep); |
1811 | ocfs2_dquot_cachep = NULL; | 1811 | ocfs2_dquot_cachep = NULL; |
1812 | 1812 | ||
1813 | if (ocfs2_qf_chunk_cachep) | 1813 | if (ocfs2_qf_chunk_cachep) |
1814 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); | 1814 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); |
1815 | ocfs2_qf_chunk_cachep = NULL; | 1815 | ocfs2_qf_chunk_cachep = NULL; |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | static int ocfs2_get_sector(struct super_block *sb, | 1818 | static int ocfs2_get_sector(struct super_block *sb, |
1819 | struct buffer_head **bh, | 1819 | struct buffer_head **bh, |
1820 | int block, | 1820 | int block, |
1821 | int sect_size) | 1821 | int sect_size) |
1822 | { | 1822 | { |
1823 | if (!sb_set_blocksize(sb, sect_size)) { | 1823 | if (!sb_set_blocksize(sb, sect_size)) { |
1824 | mlog(ML_ERROR, "unable to set blocksize\n"); | 1824 | mlog(ML_ERROR, "unable to set blocksize\n"); |
1825 | return -EIO; | 1825 | return -EIO; |
1826 | } | 1826 | } |
1827 | 1827 | ||
1828 | *bh = sb_getblk(sb, block); | 1828 | *bh = sb_getblk(sb, block); |
1829 | if (!*bh) { | 1829 | if (!*bh) { |
1830 | mlog_errno(-ENOMEM); | 1830 | mlog_errno(-ENOMEM); |
1831 | return -ENOMEM; | 1831 | return -ENOMEM; |
1832 | } | 1832 | } |
1833 | lock_buffer(*bh); | 1833 | lock_buffer(*bh); |
1834 | if (!buffer_dirty(*bh)) | 1834 | if (!buffer_dirty(*bh)) |
1835 | clear_buffer_uptodate(*bh); | 1835 | clear_buffer_uptodate(*bh); |
1836 | unlock_buffer(*bh); | 1836 | unlock_buffer(*bh); |
1837 | ll_rw_block(READ, 1, bh); | 1837 | ll_rw_block(READ, 1, bh); |
1838 | wait_on_buffer(*bh); | 1838 | wait_on_buffer(*bh); |
1839 | if (!buffer_uptodate(*bh)) { | 1839 | if (!buffer_uptodate(*bh)) { |
1840 | mlog_errno(-EIO); | 1840 | mlog_errno(-EIO); |
1841 | brelse(*bh); | 1841 | brelse(*bh); |
1842 | *bh = NULL; | 1842 | *bh = NULL; |
1843 | return -EIO; | 1843 | return -EIO; |
1844 | } | 1844 | } |
1845 | 1845 | ||
1846 | return 0; | 1846 | return 0; |
1847 | } | 1847 | } |
1848 | 1848 | ||
1849 | static int ocfs2_mount_volume(struct super_block *sb) | 1849 | static int ocfs2_mount_volume(struct super_block *sb) |
1850 | { | 1850 | { |
1851 | int status = 0; | 1851 | int status = 0; |
1852 | int unlock_super = 0; | 1852 | int unlock_super = 0; |
1853 | struct ocfs2_super *osb = OCFS2_SB(sb); | 1853 | struct ocfs2_super *osb = OCFS2_SB(sb); |
1854 | 1854 | ||
1855 | if (ocfs2_is_hard_readonly(osb)) | 1855 | if (ocfs2_is_hard_readonly(osb)) |
1856 | goto leave; | 1856 | goto leave; |
1857 | 1857 | ||
1858 | status = ocfs2_dlm_init(osb); | 1858 | status = ocfs2_dlm_init(osb); |
1859 | if (status < 0) { | 1859 | if (status < 0) { |
1860 | mlog_errno(status); | 1860 | mlog_errno(status); |
1861 | goto leave; | 1861 | goto leave; |
1862 | } | 1862 | } |
1863 | 1863 | ||
1864 | status = ocfs2_super_lock(osb, 1); | 1864 | status = ocfs2_super_lock(osb, 1); |
1865 | if (status < 0) { | 1865 | if (status < 0) { |
1866 | mlog_errno(status); | 1866 | mlog_errno(status); |
1867 | goto leave; | 1867 | goto leave; |
1868 | } | 1868 | } |
1869 | unlock_super = 1; | 1869 | unlock_super = 1; |
1870 | 1870 | ||
1871 | /* This will load up the node map and add ourselves to it. */ | 1871 | /* This will load up the node map and add ourselves to it. */ |
1872 | status = ocfs2_find_slot(osb); | 1872 | status = ocfs2_find_slot(osb); |
1873 | if (status < 0) { | 1873 | if (status < 0) { |
1874 | mlog_errno(status); | 1874 | mlog_errno(status); |
1875 | goto leave; | 1875 | goto leave; |
1876 | } | 1876 | } |
1877 | 1877 | ||
1878 | /* load all node-local system inodes */ | 1878 | /* load all node-local system inodes */ |
1879 | status = ocfs2_init_local_system_inodes(osb); | 1879 | status = ocfs2_init_local_system_inodes(osb); |
1880 | if (status < 0) { | 1880 | if (status < 0) { |
1881 | mlog_errno(status); | 1881 | mlog_errno(status); |
1882 | goto leave; | 1882 | goto leave; |
1883 | } | 1883 | } |
1884 | 1884 | ||
1885 | status = ocfs2_check_volume(osb); | 1885 | status = ocfs2_check_volume(osb); |
1886 | if (status < 0) { | 1886 | if (status < 0) { |
1887 | mlog_errno(status); | 1887 | mlog_errno(status); |
1888 | goto leave; | 1888 | goto leave; |
1889 | } | 1889 | } |
1890 | 1890 | ||
1891 | status = ocfs2_truncate_log_init(osb); | 1891 | status = ocfs2_truncate_log_init(osb); |
1892 | if (status < 0) | 1892 | if (status < 0) |
1893 | mlog_errno(status); | 1893 | mlog_errno(status); |
1894 | 1894 | ||
1895 | leave: | 1895 | leave: |
1896 | if (unlock_super) | 1896 | if (unlock_super) |
1897 | ocfs2_super_unlock(osb, 1); | 1897 | ocfs2_super_unlock(osb, 1); |
1898 | 1898 | ||
1899 | return status; | 1899 | return status; |
1900 | } | 1900 | } |
1901 | 1901 | ||
1902 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | 1902 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) |
1903 | { | 1903 | { |
1904 | int tmp, hangup_needed = 0; | 1904 | int tmp, hangup_needed = 0; |
1905 | struct ocfs2_super *osb = NULL; | 1905 | struct ocfs2_super *osb = NULL; |
1906 | char nodestr[12]; | 1906 | char nodestr[12]; |
1907 | 1907 | ||
1908 | trace_ocfs2_dismount_volume(sb); | 1908 | trace_ocfs2_dismount_volume(sb); |
1909 | 1909 | ||
1910 | BUG_ON(!sb); | 1910 | BUG_ON(!sb); |
1911 | osb = OCFS2_SB(sb); | 1911 | osb = OCFS2_SB(sb); |
1912 | BUG_ON(!osb); | 1912 | BUG_ON(!osb); |
1913 | 1913 | ||
1914 | debugfs_remove(osb->osb_ctxt); | 1914 | debugfs_remove(osb->osb_ctxt); |
1915 | 1915 | ||
1916 | /* Orphan scan should be stopped as early as possible */ | 1916 | /* Orphan scan should be stopped as early as possible */ |
1917 | ocfs2_orphan_scan_stop(osb); | 1917 | ocfs2_orphan_scan_stop(osb); |
1918 | 1918 | ||
1919 | ocfs2_disable_quotas(osb); | 1919 | ocfs2_disable_quotas(osb); |
1920 | 1920 | ||
1921 | /* All dquots should be freed by now */ | 1921 | /* All dquots should be freed by now */ |
1922 | WARN_ON(!llist_empty(&osb->dquot_drop_list)); | 1922 | WARN_ON(!llist_empty(&osb->dquot_drop_list)); |
1923 | /* Wait for worker to be done with the work structure in osb */ | 1923 | /* Wait for worker to be done with the work structure in osb */ |
1924 | cancel_work_sync(&osb->dquot_drop_work); | 1924 | cancel_work_sync(&osb->dquot_drop_work); |
1925 | 1925 | ||
1926 | ocfs2_shutdown_local_alloc(osb); | 1926 | ocfs2_shutdown_local_alloc(osb); |
1927 | 1927 | ||
1928 | ocfs2_truncate_log_shutdown(osb); | 1928 | ocfs2_truncate_log_shutdown(osb); |
1929 | 1929 | ||
1930 | /* This will disable recovery and flush any recovery work. */ | 1930 | /* This will disable recovery and flush any recovery work. */ |
1931 | ocfs2_recovery_exit(osb); | 1931 | ocfs2_recovery_exit(osb); |
1932 | 1932 | ||
1933 | ocfs2_journal_shutdown(osb); | 1933 | ocfs2_journal_shutdown(osb); |
1934 | 1934 | ||
1935 | ocfs2_sync_blockdev(sb); | 1935 | ocfs2_sync_blockdev(sb); |
1936 | 1936 | ||
1937 | ocfs2_purge_refcount_trees(osb); | 1937 | ocfs2_purge_refcount_trees(osb); |
1938 | 1938 | ||
1939 | /* No cluster connection means we've failed during mount, so skip | 1939 | /* No cluster connection means we've failed during mount, so skip |
1940 | * all the steps which depended on that to complete. */ | 1940 | * all the steps which depended on that to complete. */ |
1941 | if (osb->cconn) { | 1941 | if (osb->cconn) { |
1942 | tmp = ocfs2_super_lock(osb, 1); | 1942 | tmp = ocfs2_super_lock(osb, 1); |
1943 | if (tmp < 0) { | 1943 | if (tmp < 0) { |
1944 | mlog_errno(tmp); | 1944 | mlog_errno(tmp); |
1945 | return; | 1945 | return; |
1946 | } | 1946 | } |
1947 | } | 1947 | } |
1948 | 1948 | ||
1949 | if (osb->slot_num != OCFS2_INVALID_SLOT) | 1949 | if (osb->slot_num != OCFS2_INVALID_SLOT) |
1950 | ocfs2_put_slot(osb); | 1950 | ocfs2_put_slot(osb); |
1951 | 1951 | ||
1952 | if (osb->cconn) | 1952 | if (osb->cconn) |
1953 | ocfs2_super_unlock(osb, 1); | 1953 | ocfs2_super_unlock(osb, 1); |
1954 | 1954 | ||
1955 | ocfs2_release_system_inodes(osb); | 1955 | ocfs2_release_system_inodes(osb); |
1956 | 1956 | ||
1957 | /* | 1957 | /* |
1958 | * If we're dismounting due to mount error, mount.ocfs2 will clean | 1958 | * If we're dismounting due to mount error, mount.ocfs2 will clean |
1959 | * up heartbeat. If we're a local mount, there is no heartbeat. | 1959 | * up heartbeat. If we're a local mount, there is no heartbeat. |
1960 | * If we failed before we got a uuid_str, we can't stop | 1960 | * If we failed before we got a uuid_str, we can't stop |
1961 | * heartbeat. Otherwise, do it. | 1961 | * heartbeat. Otherwise, do it. |
1962 | */ | 1962 | */ |
1963 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && | 1963 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && |
1964 | !ocfs2_is_hard_readonly(osb)) | 1964 | !ocfs2_is_hard_readonly(osb)) |
1965 | hangup_needed = 1; | 1965 | hangup_needed = 1; |
1966 | 1966 | ||
1967 | if (osb->cconn) | 1967 | if (osb->cconn) |
1968 | ocfs2_dlm_shutdown(osb, hangup_needed); | 1968 | ocfs2_dlm_shutdown(osb, hangup_needed); |
1969 | 1969 | ||
1970 | ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); | 1970 | ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); |
1971 | debugfs_remove(osb->osb_debug_root); | 1971 | debugfs_remove(osb->osb_debug_root); |
1972 | 1972 | ||
1973 | if (hangup_needed) | 1973 | if (hangup_needed) |
1974 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); | 1974 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); |
1975 | 1975 | ||
1976 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); | 1976 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); |
1977 | 1977 | ||
1978 | if (ocfs2_mount_local(osb)) | 1978 | if (ocfs2_mount_local(osb)) |
1979 | snprintf(nodestr, sizeof(nodestr), "local"); | 1979 | snprintf(nodestr, sizeof(nodestr), "local"); |
1980 | else | 1980 | else |
1981 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); | 1981 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
1982 | 1982 | ||
1983 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", | 1983 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", |
1984 | osb->dev_str, nodestr); | 1984 | osb->dev_str, nodestr); |
1985 | 1985 | ||
1986 | ocfs2_delete_osb(osb); | 1986 | ocfs2_delete_osb(osb); |
1987 | kfree(osb); | 1987 | kfree(osb); |
1988 | sb->s_dev = 0; | 1988 | sb->s_dev = 0; |
1989 | sb->s_fs_info = NULL; | 1989 | sb->s_fs_info = NULL; |
1990 | } | 1990 | } |
1991 | 1991 | ||
1992 | static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, | 1992 | static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, |
1993 | unsigned uuid_bytes) | 1993 | unsigned uuid_bytes) |
1994 | { | 1994 | { |
1995 | int i, ret; | 1995 | int i, ret; |
1996 | char *ptr; | 1996 | char *ptr; |
1997 | 1997 | ||
1998 | BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); | 1998 | BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); |
1999 | 1999 | ||
2000 | osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); | 2000 | osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); |
2001 | if (osb->uuid_str == NULL) | 2001 | if (osb->uuid_str == NULL) |
2002 | return -ENOMEM; | 2002 | return -ENOMEM; |
2003 | 2003 | ||
2004 | for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { | 2004 | for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { |
2005 | /* print with null */ | 2005 | /* print with null */ |
2006 | ret = snprintf(ptr, 3, "%02X", uuid[i]); | 2006 | ret = snprintf(ptr, 3, "%02X", uuid[i]); |
2007 | if (ret != 2) /* drop super cleans up */ | 2007 | if (ret != 2) /* drop super cleans up */ |
2008 | return -EINVAL; | 2008 | return -EINVAL; |
2009 | /* then only advance past the last char */ | 2009 | /* then only advance past the last char */ |
2010 | ptr += 2; | 2010 | ptr += 2; |
2011 | } | 2011 | } |
2012 | 2012 | ||
2013 | return 0; | 2013 | return 0; |
2014 | } | 2014 | } |
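The loop above turns the 16-byte on-disk UUID into a 32-character uppercase hex string: each snprintf() of size 3 emits two hex digits plus a terminating NUL, and advancing the pointer by only two lets the next pair overwrite that NUL. A user-space sketch of the same encoding; uuid_to_hex() is illustrative, not a kernel helper.

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the hex encoding above: two chars per byte plus a final NUL. */
static char *uuid_to_hex(const unsigned char *uuid, size_t len)
{
	char *str = calloc(len * 2 + 1, 1);
	char *ptr = str;
	size_t i;

	if (!str)
		return NULL;

	for (i = 0; i < len; i++) {
		/* snprintf writes two digits and a NUL; advancing by 2 lets
		 * the next iteration overwrite that NUL. */
		snprintf(ptr, 3, "%02X", (unsigned)uuid[i]);
		ptr += 2;
	}
	return str;
}

int main(void)
{
	const unsigned char uuid[16] = { 0xde, 0xad, 0xbe, 0xef };
	char *s = uuid_to_hex(uuid, sizeof(uuid));

	printf("%s\n", s ? s : "(alloc failed)");
	free(s);
	return 0;
}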
2015 | 2015 | ||
2016 | /* Make sure entire volume is addressable by our journal. Requires | 2016 | /* Make sure entire volume is addressable by our journal. Requires |
2017 | osb_clusters_at_boot to be valid and for the journal to have been | 2017 | osb_clusters_at_boot to be valid and for the journal to have been |
2018 | initialized by ocfs2_journal_init(). */ | 2018 | initialized by ocfs2_journal_init(). */ |
2019 | static int ocfs2_journal_addressable(struct ocfs2_super *osb) | 2019 | static int ocfs2_journal_addressable(struct ocfs2_super *osb) |
2020 | { | 2020 | { |
2021 | int status = 0; | 2021 | int status = 0; |
2022 | u64 max_block = | 2022 | u64 max_block = |
2023 | ocfs2_clusters_to_blocks(osb->sb, | 2023 | ocfs2_clusters_to_blocks(osb->sb, |
2024 | osb->osb_clusters_at_boot) - 1; | 2024 | osb->osb_clusters_at_boot) - 1; |
2025 | 2025 | ||
2026 | /* 32-bit block number is always OK. */ | 2026 | /* 32-bit block number is always OK. */ |
2027 | if (max_block <= (u32)~0ULL) | 2027 | if (max_block <= (u32)~0ULL) |
2028 | goto out; | 2028 | goto out; |
2029 | 2029 | ||
2030 | /* Volume is "huge", so see if our journal is new enough to | 2030 | /* Volume is "huge", so see if our journal is new enough to |
2031 | support it. */ | 2031 | support it. */ |
2032 | if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, | 2032 | if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, |
2033 | OCFS2_FEATURE_COMPAT_JBD2_SB) && | 2033 | OCFS2_FEATURE_COMPAT_JBD2_SB) && |
2034 | jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, | 2034 | jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, |
2035 | JBD2_FEATURE_INCOMPAT_64BIT))) { | 2035 | JBD2_FEATURE_INCOMPAT_64BIT))) { |
2036 | mlog(ML_ERROR, "The journal cannot address the entire volume. " | 2036 | mlog(ML_ERROR, "The journal cannot address the entire volume. " |
2037 | "Enable the 'block64' journal option with tunefs.ocfs2"); | 2037 | "Enable the 'block64' journal option with tunefs.ocfs2"); |
2038 | status = -EFBIG; | 2038 | status = -EFBIG; |
2039 | goto out; | 2039 | goto out; |
2040 | } | 2040 | } |
2041 | 2041 | ||
2042 | out: | 2042 | out: |
2043 | return status; | 2043 | return status; |
2044 | } | 2044 | } |
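The fast path above accepts any volume whose last block number fits in 32 bits; only "huge" volumes additionally need the JBD2_SB compat flag and the 64-bit journal feature. As a rough worked example, assuming a 4 KiB block size (an assumption for illustration, not something this hunk states), 32-bit block numbers cover 16 TiB:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed 4 KiB block size; the real value comes from the superblock. */
	uint64_t block_size = 4096;
	/* Largest byte count reachable with 32-bit block numbers. */
	uint64_t max_32bit_bytes = ((uint64_t)UINT32_MAX + 1) * block_size;

	printf("32-bit addressable bytes at 4 KiB blocks: %llu (%.0f TiB)\n",
	       (unsigned long long)max_32bit_bytes,
	       max_32bit_bytes / (1024.0 * 1024 * 1024 * 1024));
	return 0;
}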
2045 | 2045 | ||
2046 | static int ocfs2_initialize_super(struct super_block *sb, | 2046 | static int ocfs2_initialize_super(struct super_block *sb, |
2047 | struct buffer_head *bh, | 2047 | struct buffer_head *bh, |
2048 | int sector_size, | 2048 | int sector_size, |
2049 | struct ocfs2_blockcheck_stats *stats) | 2049 | struct ocfs2_blockcheck_stats *stats) |
2050 | { | 2050 | { |
2051 | int status; | 2051 | int status; |
2052 | int i, cbits, bbits; | 2052 | int i, cbits, bbits; |
2053 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; | 2053 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; |
2054 | struct inode *inode = NULL; | 2054 | struct inode *inode = NULL; |
2055 | struct ocfs2_journal *journal; | 2055 | struct ocfs2_journal *journal; |
2056 | struct ocfs2_super *osb; | 2056 | struct ocfs2_super *osb; |
2057 | u64 total_blocks; | 2057 | u64 total_blocks; |
2058 | 2058 | ||
2059 | osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); | 2059 | osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); |
2060 | if (!osb) { | 2060 | if (!osb) { |
2061 | status = -ENOMEM; | 2061 | status = -ENOMEM; |
2062 | mlog_errno(status); | 2062 | mlog_errno(status); |
2063 | goto bail; | 2063 | goto bail; |
2064 | } | 2064 | } |
2065 | 2065 | ||
2066 | sb->s_fs_info = osb; | 2066 | sb->s_fs_info = osb; |
2067 | sb->s_op = &ocfs2_sops; | 2067 | sb->s_op = &ocfs2_sops; |
2068 | sb->s_d_op = &ocfs2_dentry_ops; | 2068 | sb->s_d_op = &ocfs2_dentry_ops; |
2069 | sb->s_export_op = &ocfs2_export_ops; | 2069 | sb->s_export_op = &ocfs2_export_ops; |
2070 | sb->s_qcop = &ocfs2_quotactl_ops; | 2070 | sb->s_qcop = &ocfs2_quotactl_ops; |
2071 | sb->dq_op = &ocfs2_quota_operations; | 2071 | sb->dq_op = &ocfs2_quota_operations; |
2072 | sb->s_xattr = ocfs2_xattr_handlers; | 2072 | sb->s_xattr = ocfs2_xattr_handlers; |
2073 | sb->s_time_gran = 1; | 2073 | sb->s_time_gran = 1; |
2074 | sb->s_flags |= MS_NOATIME; | 2074 | sb->s_flags |= MS_NOATIME; |
2075 | /* this is needed to support O_LARGEFILE */ | 2075 | /* this is needed to support O_LARGEFILE */ |
2076 | cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); | 2076 | cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); |
2077 | bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); | 2077 | bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); |
2078 | sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); | 2078 | sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); |
2079 | 2079 | ||
2080 | osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; | 2080 | osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; |
2081 | 2081 | ||
2082 | for (i = 0; i < 3; i++) | 2082 | for (i = 0; i < 3; i++) |
2083 | osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); | 2083 | osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); |
2084 | osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); | 2084 | osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); |
2085 | 2085 | ||
2086 | osb->sb = sb; | 2086 | osb->sb = sb; |
2087 | /* Save off for ocfs2_rw_direct */ | 2087 | /* Save off for ocfs2_rw_direct */ |
2088 | osb->s_sectsize_bits = blksize_bits(sector_size); | 2088 | osb->s_sectsize_bits = blksize_bits(sector_size); |
2089 | BUG_ON(!osb->s_sectsize_bits); | 2089 | BUG_ON(!osb->s_sectsize_bits); |
2090 | 2090 | ||
2091 | spin_lock_init(&osb->dc_task_lock); | 2091 | spin_lock_init(&osb->dc_task_lock); |
2092 | init_waitqueue_head(&osb->dc_event); | 2092 | init_waitqueue_head(&osb->dc_event); |
2093 | osb->dc_work_sequence = 0; | 2093 | osb->dc_work_sequence = 0; |
2094 | osb->dc_wake_sequence = 0; | 2094 | osb->dc_wake_sequence = 0; |
2095 | INIT_LIST_HEAD(&osb->blocked_lock_list); | 2095 | INIT_LIST_HEAD(&osb->blocked_lock_list); |
2096 | osb->blocked_lock_count = 0; | 2096 | osb->blocked_lock_count = 0; |
2097 | spin_lock_init(&osb->osb_lock); | 2097 | spin_lock_init(&osb->osb_lock); |
2098 | spin_lock_init(&osb->osb_xattr_lock); | 2098 | spin_lock_init(&osb->osb_xattr_lock); |
2099 | ocfs2_init_steal_slots(osb); | 2099 | ocfs2_init_steal_slots(osb); |
2100 | 2100 | ||
2101 | mutex_init(&osb->system_file_mutex); | 2101 | mutex_init(&osb->system_file_mutex); |
2102 | 2102 | ||
2103 | atomic_set(&osb->alloc_stats.moves, 0); | 2103 | atomic_set(&osb->alloc_stats.moves, 0); |
2104 | atomic_set(&osb->alloc_stats.local_data, 0); | 2104 | atomic_set(&osb->alloc_stats.local_data, 0); |
2105 | atomic_set(&osb->alloc_stats.bitmap_data, 0); | 2105 | atomic_set(&osb->alloc_stats.bitmap_data, 0); |
2106 | atomic_set(&osb->alloc_stats.bg_allocs, 0); | 2106 | atomic_set(&osb->alloc_stats.bg_allocs, 0); |
2107 | atomic_set(&osb->alloc_stats.bg_extends, 0); | 2107 | atomic_set(&osb->alloc_stats.bg_extends, 0); |
2108 | 2108 | ||
2109 | /* Copy the blockcheck stats from the superblock probe */ | 2109 | /* Copy the blockcheck stats from the superblock probe */ |
2110 | osb->osb_ecc_stats = *stats; | 2110 | osb->osb_ecc_stats = *stats; |
2111 | 2111 | ||
2112 | ocfs2_init_node_maps(osb); | 2112 | ocfs2_init_node_maps(osb); |
2113 | 2113 | ||
2114 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | 2114 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", |
2115 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 2115 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
2116 | 2116 | ||
2117 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); | 2117 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); |
2118 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { | 2118 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { |
2119 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", | 2119 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", |
2120 | osb->max_slots); | 2120 | osb->max_slots); |
2121 | status = -EINVAL; | 2121 | status = -EINVAL; |
2122 | goto bail; | 2122 | goto bail; |
2123 | } | 2123 | } |
2124 | 2124 | ||
2125 | ocfs2_orphan_scan_init(osb); | 2125 | ocfs2_orphan_scan_init(osb); |
2126 | 2126 | ||
2127 | status = ocfs2_recovery_init(osb); | 2127 | status = ocfs2_recovery_init(osb); |
2128 | if (status) { | 2128 | if (status) { |
2129 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); | 2129 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); |
2130 | mlog_errno(status); | 2130 | mlog_errno(status); |
2131 | goto bail; | 2131 | goto bail; |
2132 | } | 2132 | } |
2133 | 2133 | ||
2134 | init_waitqueue_head(&osb->checkpoint_event); | 2134 | init_waitqueue_head(&osb->checkpoint_event); |
2135 | 2135 | ||
2136 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 2136 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
2137 | 2137 | ||
2138 | osb->slot_num = OCFS2_INVALID_SLOT; | 2138 | osb->slot_num = OCFS2_INVALID_SLOT; |
2139 | 2139 | ||
2140 | osb->s_xattr_inline_size = le16_to_cpu( | 2140 | osb->s_xattr_inline_size = le16_to_cpu( |
2141 | di->id2.i_super.s_xattr_inline_size); | 2141 | di->id2.i_super.s_xattr_inline_size); |
2142 | 2142 | ||
2143 | osb->local_alloc_state = OCFS2_LA_UNUSED; | 2143 | osb->local_alloc_state = OCFS2_LA_UNUSED; |
2144 | osb->local_alloc_bh = NULL; | 2144 | osb->local_alloc_bh = NULL; |
2145 | INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); | 2145 | INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); |
2146 | 2146 | ||
2147 | init_waitqueue_head(&osb->osb_mount_event); | 2147 | init_waitqueue_head(&osb->osb_mount_event); |
2148 | 2148 | ||
2149 | status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); | 2149 | status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); |
2150 | if (status) { | 2150 | if (status) { |
2151 | mlog_errno(status); | 2151 | mlog_errno(status); |
2152 | goto bail; | 2152 | goto bail; |
2153 | } | 2153 | } |
2154 | 2154 | ||
2155 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); | 2155 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); |
2156 | if (!osb->vol_label) { | 2156 | if (!osb->vol_label) { |
2157 | mlog(ML_ERROR, "unable to alloc vol label\n"); | 2157 | mlog(ML_ERROR, "unable to alloc vol label\n"); |
2158 | status = -ENOMEM; | 2158 | status = -ENOMEM; |
2159 | goto bail; | 2159 | goto bail; |
2160 | } | 2160 | } |
2161 | 2161 | ||
2162 | osb->slot_recovery_generations = | 2162 | osb->slot_recovery_generations = |
2163 | kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), | 2163 | kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), |
2164 | GFP_KERNEL); | 2164 | GFP_KERNEL); |
2165 | if (!osb->slot_recovery_generations) { | 2165 | if (!osb->slot_recovery_generations) { |
2166 | status = -ENOMEM; | 2166 | status = -ENOMEM; |
2167 | mlog_errno(status); | 2167 | mlog_errno(status); |
2168 | goto bail; | 2168 | goto bail; |
2169 | } | 2169 | } |
2170 | 2170 | ||
2171 | init_waitqueue_head(&osb->osb_wipe_event); | 2171 | init_waitqueue_head(&osb->osb_wipe_event); |
2172 | osb->osb_orphan_wipes = kcalloc(osb->max_slots, | 2172 | osb->osb_orphan_wipes = kcalloc(osb->max_slots, |
2173 | sizeof(*osb->osb_orphan_wipes), | 2173 | sizeof(*osb->osb_orphan_wipes), |
2174 | GFP_KERNEL); | 2174 | GFP_KERNEL); |
2175 | if (!osb->osb_orphan_wipes) { | 2175 | if (!osb->osb_orphan_wipes) { |
2176 | status = -ENOMEM; | 2176 | status = -ENOMEM; |
2177 | mlog_errno(status); | 2177 | mlog_errno(status); |
2178 | goto bail; | 2178 | goto bail; |
2179 | } | 2179 | } |
2180 | 2180 | ||
2181 | osb->osb_rf_lock_tree = RB_ROOT; | 2181 | osb->osb_rf_lock_tree = RB_ROOT; |
2182 | 2182 | ||
2183 | osb->s_feature_compat = | 2183 | osb->s_feature_compat = |
2184 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); | 2184 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); |
2185 | osb->s_feature_ro_compat = | 2185 | osb->s_feature_ro_compat = |
2186 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); | 2186 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); |
2187 | osb->s_feature_incompat = | 2187 | osb->s_feature_incompat = |
2188 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); | 2188 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); |
2189 | 2189 | ||
2190 | if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { | 2190 | if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { |
2191 | mlog(ML_ERROR, "couldn't mount because of unsupported " | 2191 | mlog(ML_ERROR, "couldn't mount because of unsupported " |
2192 | "optional features (%x).\n", i); | 2192 | "optional features (%x).\n", i); |
2193 | status = -EINVAL; | 2193 | status = -EINVAL; |
2194 | goto bail; | 2194 | goto bail; |
2195 | } | 2195 | } |
2196 | if (!(osb->sb->s_flags & MS_RDONLY) && | 2196 | if (!(osb->sb->s_flags & MS_RDONLY) && |
2197 | (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { | 2197 | (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { |
2198 | mlog(ML_ERROR, "couldn't mount RDWR because of " | 2198 | mlog(ML_ERROR, "couldn't mount RDWR because of " |
2199 | "unsupported optional features (%x).\n", i); | 2199 | "unsupported optional features (%x).\n", i); |
2200 | status = -EINVAL; | 2200 | status = -EINVAL; |
2201 | goto bail; | 2201 | goto bail; |
2202 | } | 2202 | } |
2203 | 2203 | ||
2204 | if (ocfs2_clusterinfo_valid(osb)) { | 2204 | if (ocfs2_clusterinfo_valid(osb)) { |
2205 | osb->osb_stackflags = | 2205 | osb->osb_stackflags = |
2206 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; | 2206 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; |
2207 | strlcpy(osb->osb_cluster_stack, | 2207 | strlcpy(osb->osb_cluster_stack, |
2208 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, | 2208 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, |
2209 | OCFS2_STACK_LABEL_LEN + 1); | 2209 | OCFS2_STACK_LABEL_LEN + 1); |
2210 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { | 2210 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { |
2211 | mlog(ML_ERROR, | 2211 | mlog(ML_ERROR, |
2212 | "couldn't mount because of an invalid " | 2212 | "couldn't mount because of an invalid " |
2213 | "cluster stack label (%s) \n", | 2213 | "cluster stack label (%s) \n", |
2214 | osb->osb_cluster_stack); | 2214 | osb->osb_cluster_stack); |
2215 | status = -EINVAL; | 2215 | status = -EINVAL; |
2216 | goto bail; | 2216 | goto bail; |
2217 | } | 2217 | } |
2218 | strlcpy(osb->osb_cluster_name, | 2218 | strlcpy(osb->osb_cluster_name, |
2219 | OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, | 2219 | OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, |
2220 | OCFS2_CLUSTER_NAME_LEN + 1); | 2220 | OCFS2_CLUSTER_NAME_LEN + 1); |
2221 | } else { | 2221 | } else { |
2222 | /* The empty string is identical with classic tools that | 2222 | /* The empty string is identical with classic tools that |
2223 | * don't know about s_cluster_info. */ | 2223 | * don't know about s_cluster_info. */ |
2224 | osb->osb_cluster_stack[0] = '\0'; | 2224 | osb->osb_cluster_stack[0] = '\0'; |
2225 | } | 2225 | } |
2226 | 2226 | ||
2227 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); | 2227 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); |
2228 | 2228 | ||
2229 | /* FIXME | 2229 | /* FIXME |
2230 | * This should be done in ocfs2_journal_init(), but unknown | 2230 | * This should be done in ocfs2_journal_init(), but unknown |
2231 | * ordering issues will cause the filesystem to crash. | 2231 | * ordering issues will cause the filesystem to crash. |
2232 | * If anyone wants to figure out what part of the code | 2232 | * If anyone wants to figure out what part of the code |
2233 | * refers to osb->journal before ocfs2_journal_init() is run, | 2233 | * refers to osb->journal before ocfs2_journal_init() is run, |
2234 | * be my guest. | 2234 | * be my guest. |
2235 | */ | 2235 | */ |
2236 | /* initialize our journal structure */ | 2236 | /* initialize our journal structure */ |
2237 | 2237 | ||
2238 | journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); | 2238 | journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); |
2239 | if (!journal) { | 2239 | if (!journal) { |
2240 | mlog(ML_ERROR, "unable to alloc journal\n"); | 2240 | mlog(ML_ERROR, "unable to alloc journal\n"); |
2241 | status = -ENOMEM; | 2241 | status = -ENOMEM; |
2242 | goto bail; | 2242 | goto bail; |
2243 | } | 2243 | } |
2244 | osb->journal = journal; | 2244 | osb->journal = journal; |
2245 | journal->j_osb = osb; | 2245 | journal->j_osb = osb; |
2246 | 2246 | ||
2247 | atomic_set(&journal->j_num_trans, 0); | 2247 | atomic_set(&journal->j_num_trans, 0); |
2248 | init_rwsem(&journal->j_trans_barrier); | 2248 | init_rwsem(&journal->j_trans_barrier); |
2249 | init_waitqueue_head(&journal->j_checkpointed); | 2249 | init_waitqueue_head(&journal->j_checkpointed); |
2250 | spin_lock_init(&journal->j_lock); | 2250 | spin_lock_init(&journal->j_lock); |
2251 | journal->j_trans_id = (unsigned long) 1; | 2251 | journal->j_trans_id = (unsigned long) 1; |
2252 | INIT_LIST_HEAD(&journal->j_la_cleanups); | 2252 | INIT_LIST_HEAD(&journal->j_la_cleanups); |
2253 | INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); | 2253 | INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); |
2254 | journal->j_state = OCFS2_JOURNAL_FREE; | 2254 | journal->j_state = OCFS2_JOURNAL_FREE; |
2255 | 2255 | ||
2256 | INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); | 2256 | INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); |
2257 | init_llist_head(&osb->dquot_drop_list); | 2257 | init_llist_head(&osb->dquot_drop_list); |
2258 | 2258 | ||
2259 | /* get some pseudo constants for clustersize bits */ | 2259 | /* get some pseudo constants for clustersize bits */ |
2260 | osb->s_clustersize_bits = | 2260 | osb->s_clustersize_bits = |
2261 | le32_to_cpu(di->id2.i_super.s_clustersize_bits); | 2261 | le32_to_cpu(di->id2.i_super.s_clustersize_bits); |
2262 | osb->s_clustersize = 1 << osb->s_clustersize_bits; | 2262 | osb->s_clustersize = 1 << osb->s_clustersize_bits; |
2263 | 2263 | ||
2264 | if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || | 2264 | if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || |
2265 | osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { | 2265 | osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { |
2266 | mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", | 2266 | mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", |
2267 | osb->s_clustersize); | 2267 | osb->s_clustersize); |
2268 | status = -EINVAL; | 2268 | status = -EINVAL; |
2269 | goto bail; | 2269 | goto bail; |
2270 | } | 2270 | } |
2271 | 2271 | ||
2272 | total_blocks = ocfs2_clusters_to_blocks(osb->sb, | 2272 | total_blocks = ocfs2_clusters_to_blocks(osb->sb, |
2273 | le32_to_cpu(di->i_clusters)); | 2273 | le32_to_cpu(di->i_clusters)); |
2274 | 2274 | ||
2275 | status = generic_check_addressable(osb->sb->s_blocksize_bits, | 2275 | status = generic_check_addressable(osb->sb->s_blocksize_bits, |
2276 | total_blocks); | 2276 | total_blocks); |
2277 | if (status) { | 2277 | if (status) { |
2278 | mlog(ML_ERROR, "Volume too large " | 2278 | mlog(ML_ERROR, "Volume too large " |
2279 | "to mount safely on this system"); | 2279 | "to mount safely on this system"); |
2280 | status = -EFBIG; | 2280 | status = -EFBIG; |
2281 | goto bail; | 2281 | goto bail; |
2282 | } | 2282 | } |
2283 | 2283 | ||
2284 | if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, | 2284 | if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, |
2285 | sizeof(di->id2.i_super.s_uuid))) { | 2285 | sizeof(di->id2.i_super.s_uuid))) { |
2286 | mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); | 2286 | mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); |
2287 | status = -ENOMEM; | 2287 | status = -ENOMEM; |
2288 | goto bail; | 2288 | goto bail; |
2289 | } | 2289 | } |
2290 | 2290 | ||
2291 | strlcpy(osb->vol_label, di->id2.i_super.s_label, | 2291 | strlcpy(osb->vol_label, di->id2.i_super.s_label, |
2292 | OCFS2_MAX_VOL_LABEL_LEN); | 2292 | OCFS2_MAX_VOL_LABEL_LEN); |
2293 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); | 2293 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); |
2294 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); | 2294 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); |
2295 | osb->first_cluster_group_blkno = | 2295 | osb->first_cluster_group_blkno = |
2296 | le64_to_cpu(di->id2.i_super.s_first_cluster_group); | 2296 | le64_to_cpu(di->id2.i_super.s_first_cluster_group); |
2297 | osb->fs_generation = le32_to_cpu(di->i_fs_generation); | 2297 | osb->fs_generation = le32_to_cpu(di->i_fs_generation); |
2298 | osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); | 2298 | osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); |
2299 | trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, | 2299 | trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, |
2300 | (unsigned long long)osb->root_blkno, | 2300 | (unsigned long long)osb->root_blkno, |
2301 | (unsigned long long)osb->system_dir_blkno, | 2301 | (unsigned long long)osb->system_dir_blkno, |
2302 | osb->s_clustersize_bits); | 2302 | osb->s_clustersize_bits); |
2303 | 2303 | ||
2304 | osb->osb_dlm_debug = ocfs2_new_dlm_debug(); | 2304 | osb->osb_dlm_debug = ocfs2_new_dlm_debug(); |
2305 | if (!osb->osb_dlm_debug) { | 2305 | if (!osb->osb_dlm_debug) { |
2306 | status = -ENOMEM; | 2306 | status = -ENOMEM; |
2307 | mlog_errno(status); | 2307 | mlog_errno(status); |
2308 | goto bail; | 2308 | goto bail; |
2309 | } | 2309 | } |
2310 | 2310 | ||
2311 | atomic_set(&osb->vol_state, VOLUME_INIT); | 2311 | atomic_set(&osb->vol_state, VOLUME_INIT); |
2312 | 2312 | ||
2313 | /* load root, system_dir, and all global system inodes */ | 2313 | /* load root, system_dir, and all global system inodes */ |
2314 | status = ocfs2_init_global_system_inodes(osb); | 2314 | status = ocfs2_init_global_system_inodes(osb); |
2315 | if (status < 0) { | 2315 | if (status < 0) { |
2316 | mlog_errno(status); | 2316 | mlog_errno(status); |
2317 | goto bail; | 2317 | goto bail; |
2318 | } | 2318 | } |
2319 | 2319 | ||
2320 | /* | 2320 | /* |
2321 | * global bitmap | 2321 | * global bitmap |
2322 | */ | 2322 | */ |
2323 | inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | 2323 | inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, |
2324 | OCFS2_INVALID_SLOT); | 2324 | OCFS2_INVALID_SLOT); |
2325 | if (!inode) { | 2325 | if (!inode) { |
2326 | status = -EINVAL; | 2326 | status = -EINVAL; |
2327 | mlog_errno(status); | 2327 | mlog_errno(status); |
2328 | goto bail; | 2328 | goto bail; |
2329 | } | 2329 | } |
2330 | 2330 | ||
2331 | osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; | 2331 | osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; |
2332 | osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; | 2332 | osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; |
2333 | iput(inode); | 2333 | iput(inode); |
2334 | 2334 | ||
2335 | osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, | 2335 | osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, |
2336 | osb->s_feature_incompat) * 8; | 2336 | osb->s_feature_incompat) * 8; |
2337 | 2337 | ||
2338 | status = ocfs2_init_slot_info(osb); | 2338 | status = ocfs2_init_slot_info(osb); |
2339 | if (status < 0) { | 2339 | if (status < 0) { |
2340 | mlog_errno(status); | 2340 | mlog_errno(status); |
2341 | goto bail; | 2341 | goto bail; |
2342 | } | 2342 | } |
2343 | cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); | 2343 | cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); |
2344 | 2344 | ||
2345 | bail: | 2345 | bail: |
2346 | return status; | 2346 | return status; |
2347 | } | 2347 | } |
2348 | 2348 | ||
2349 | /* | 2349 | /* |
2350 | * will return: -EAGAIN if it is ok to keep searching for superblocks | 2350 | * will return: -EAGAIN if it is ok to keep searching for superblocks |
2351 | * -EINVAL if there is a bad superblock | 2351 | * -EINVAL if there is a bad superblock |
2352 | * 0 on success | 2352 | * 0 on success |
2353 | */ | 2353 | */ |
2354 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | 2354 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, |
2355 | struct buffer_head *bh, | 2355 | struct buffer_head *bh, |
2356 | u32 blksz, | 2356 | u32 blksz, |
2357 | struct ocfs2_blockcheck_stats *stats) | 2357 | struct ocfs2_blockcheck_stats *stats) |
2358 | { | 2358 | { |
2359 | int status = -EAGAIN; | 2359 | int status = -EAGAIN; |
2360 | 2360 | ||
2361 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, | 2361 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, |
2362 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { | 2362 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { |
2363 | /* We have to do a raw check of the feature here */ | 2363 | /* We have to do a raw check of the feature here */ |
2364 | if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & | 2364 | if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & |
2365 | OCFS2_FEATURE_INCOMPAT_META_ECC) { | 2365 | OCFS2_FEATURE_INCOMPAT_META_ECC) { |
2366 | status = ocfs2_block_check_validate(bh->b_data, | 2366 | status = ocfs2_block_check_validate(bh->b_data, |
2367 | bh->b_size, | 2367 | bh->b_size, |
2368 | &di->i_check, | 2368 | &di->i_check, |
2369 | stats); | 2369 | stats); |
2370 | if (status) | 2370 | if (status) |
2371 | goto out; | 2371 | goto out; |
2372 | } | 2372 | } |
2373 | status = -EINVAL; | 2373 | status = -EINVAL; |
2374 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { | 2374 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { |
2375 | mlog(ML_ERROR, "found superblock with incorrect block " | 2375 | mlog(ML_ERROR, "found superblock with incorrect block " |
2376 | "size: found %u, should be %u\n", | 2376 | "size: found %u, should be %u\n", |
2377 | 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), | 2377 | 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), |
2378 | blksz); | 2378 | blksz); |
2379 | } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != | 2379 | } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != |
2380 | OCFS2_MAJOR_REV_LEVEL || | 2380 | OCFS2_MAJOR_REV_LEVEL || |
2381 | le16_to_cpu(di->id2.i_super.s_minor_rev_level) != | 2381 | le16_to_cpu(di->id2.i_super.s_minor_rev_level) != |
2382 | OCFS2_MINOR_REV_LEVEL) { | 2382 | OCFS2_MINOR_REV_LEVEL) { |
2383 | mlog(ML_ERROR, "found superblock with bad version: " | 2383 | mlog(ML_ERROR, "found superblock with bad version: " |
2384 | "found %u.%u, should be %u.%u\n", | 2384 | "found %u.%u, should be %u.%u\n", |
2385 | le16_to_cpu(di->id2.i_super.s_major_rev_level), | 2385 | le16_to_cpu(di->id2.i_super.s_major_rev_level), |
2386 | le16_to_cpu(di->id2.i_super.s_minor_rev_level), | 2386 | le16_to_cpu(di->id2.i_super.s_minor_rev_level), |
2387 | OCFS2_MAJOR_REV_LEVEL, | 2387 | OCFS2_MAJOR_REV_LEVEL, |
2388 | OCFS2_MINOR_REV_LEVEL); | 2388 | OCFS2_MINOR_REV_LEVEL); |
2389 | } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { | 2389 | } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { |
2390 | mlog(ML_ERROR, "bad block number on superblock: " | 2390 | mlog(ML_ERROR, "bad block number on superblock: " |
2391 | "found %llu, should be %llu\n", | 2391 | "found %llu, should be %llu\n", |
2392 | (unsigned long long)le64_to_cpu(di->i_blkno), | 2392 | (unsigned long long)le64_to_cpu(di->i_blkno), |
2393 | (unsigned long long)bh->b_blocknr); | 2393 | (unsigned long long)bh->b_blocknr); |
2394 | } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || | 2394 | } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || |
2395 | le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { | 2395 | le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { |
2396 | mlog(ML_ERROR, "bad cluster size found: %u\n", | 2396 | mlog(ML_ERROR, "bad cluster size found: %u\n", |
2397 | 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); | 2397 | 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); |
2398 | } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { | 2398 | } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { |
2399 | mlog(ML_ERROR, "bad root_blkno: 0\n"); | 2399 | mlog(ML_ERROR, "bad root_blkno: 0\n"); |
2400 | } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { | 2400 | } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { |
2401 | mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); | 2401 | mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); |
2402 | } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { | 2402 | } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { |
2403 | mlog(ML_ERROR, | 2403 | mlog(ML_ERROR, |
2404 | "Superblock slots found greater than file system " | 2404 | "Superblock slots found greater than file system " |
2405 | "maximum: found %u, max %u\n", | 2405 | "maximum: found %u, max %u\n", |
2406 | le16_to_cpu(di->id2.i_super.s_max_slots), | 2406 | le16_to_cpu(di->id2.i_super.s_max_slots), |
2407 | OCFS2_MAX_SLOTS); | 2407 | OCFS2_MAX_SLOTS); |
2408 | } else { | 2408 | } else { |
2409 | /* found it! */ | 2409 | /* found it! */ |
2410 | status = 0; | 2410 | status = 0; |
2411 | } | 2411 | } |
2412 | } | 2412 | } |
2413 | 2413 | ||
2414 | out: | 2414 | out: |
2415 | if (status && status != -EAGAIN) | 2415 | if (status && status != -EAGAIN) |
2416 | mlog_errno(status); | 2416 | mlog_errno(status); |
2417 | return status; | 2417 | return status; |
2418 | } | 2418 | } |
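The return contract spelled out above (-EAGAIN: keep looking, -EINVAL: bad superblock, 0: found) is what lets a caller probe candidate block sizes in turn. The sketch below only illustrates that contract; verify() is a stand-in, not the real ocfs2 probe code.

#include <errno.h>
#include <stdio.h>

/* Stand-in verifier: pretends the superblock only matches 2048-byte blocks. */
static int verify(unsigned blksz)
{
	return (blksz == 2048) ? 0 : -EAGAIN;
}

int main(void)
{
	static const unsigned sizes[] = { 512, 1024, 2048, 4096 };
	int status = -EAGAIN;
	unsigned i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		status = verify(sizes[i]);
		if (status != -EAGAIN)	/* 0 = found it, -EINVAL = give up */
			break;
	}
	printf("probe result: %d\n", status);
	return status ? 1 : 0;
}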
2419 | 2419 | ||
2420 | static int ocfs2_check_volume(struct ocfs2_super *osb) | 2420 | static int ocfs2_check_volume(struct ocfs2_super *osb) |
2421 | { | 2421 | { |
2422 | int status; | 2422 | int status; |
2423 | int dirty; | 2423 | int dirty; |
2424 | int local; | 2424 | int local; |
2425 | struct ocfs2_dinode *local_alloc = NULL; /* only used if we | 2425 | struct ocfs2_dinode *local_alloc = NULL; /* only used if we |
2426 | * recover | 2426 | * recover |
2427 | * ourselves. */ | 2427 | * ourselves. */ |
2428 | 2428 | ||
2429 | /* Init our journal object. */ | 2429 | /* Init our journal object. */ |
2430 | status = ocfs2_journal_init(osb->journal, &dirty); | 2430 | status = ocfs2_journal_init(osb->journal, &dirty); |
2431 | if (status < 0) { | 2431 | if (status < 0) { |
2432 | mlog(ML_ERROR, "Could not initialize journal!\n"); | 2432 | mlog(ML_ERROR, "Could not initialize journal!\n"); |
2433 | goto finally; | 2433 | goto finally; |
2434 | } | 2434 | } |
2435 | 2435 | ||
2436 | /* Now that journal has been initialized, check to make sure | 2436 | /* Now that journal has been initialized, check to make sure |
2437 | entire volume is addressable. */ | 2437 | entire volume is addressable. */ |
2438 | status = ocfs2_journal_addressable(osb); | 2438 | status = ocfs2_journal_addressable(osb); |
2439 | if (status) | 2439 | if (status) |
2440 | goto finally; | 2440 | goto finally; |
2441 | 2441 | ||
2442 | /* If the journal was unmounted cleanly then we don't want to | 2442 | /* If the journal was unmounted cleanly then we don't want to |
2443 | * recover anything. Otherwise, journal_load will do that | 2443 | * recover anything. Otherwise, journal_load will do that |
2444 | * dirty work for us :) */ | 2444 | * dirty work for us :) */ |
2445 | if (!dirty) { | 2445 | if (!dirty) { |
2446 | status = ocfs2_journal_wipe(osb->journal, 0); | 2446 | status = ocfs2_journal_wipe(osb->journal, 0); |
2447 | if (status < 0) { | 2447 | if (status < 0) { |
2448 | mlog_errno(status); | 2448 | mlog_errno(status); |
2449 | goto finally; | 2449 | goto finally; |
2450 | } | 2450 | } |
2451 | } else { | 2451 | } else { |
2452 | printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " | 2452 | printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " |
2453 | "unmounted cleanly, recovering it.\n", osb->dev_str); | 2453 | "unmounted cleanly, recovering it.\n", osb->dev_str); |
2454 | } | 2454 | } |
2455 | 2455 | ||
2456 | local = ocfs2_mount_local(osb); | 2456 | local = ocfs2_mount_local(osb); |
2457 | 2457 | ||
2458 | /* will play back anything left in the journal. */ | 2458 | /* will play back anything left in the journal. */ |
2459 | status = ocfs2_journal_load(osb->journal, local, dirty); | 2459 | status = ocfs2_journal_load(osb->journal, local, dirty); |
2460 | if (status < 0) { | 2460 | if (status < 0) { |
2461 | mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); | 2461 | mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); |
2462 | goto finally; | 2462 | goto finally; |
2463 | } | 2463 | } |
2464 | 2464 | ||
2465 | if (dirty) { | 2465 | if (dirty) { |
2466 | /* recover my local alloc if we didn't unmount cleanly. */ | 2466 | /* recover my local alloc if we didn't unmount cleanly. */ |
2467 | status = ocfs2_begin_local_alloc_recovery(osb, | 2467 | status = ocfs2_begin_local_alloc_recovery(osb, |
2468 | osb->slot_num, | 2468 | osb->slot_num, |
2469 | &local_alloc); | 2469 | &local_alloc); |
2470 | if (status < 0) { | 2470 | if (status < 0) { |
2471 | mlog_errno(status); | 2471 | mlog_errno(status); |
2472 | goto finally; | 2472 | goto finally; |
2473 | } | 2473 | } |
2474 | /* we complete the recovery process after we've marked | 2474 | /* we complete the recovery process after we've marked |
2475 | * ourselves as mounted. */ | 2475 | * ourselves as mounted. */ |
2476 | } | 2476 | } |
2477 | 2477 | ||
2478 | status = ocfs2_load_local_alloc(osb); | 2478 | status = ocfs2_load_local_alloc(osb); |
2479 | if (status < 0) { | 2479 | if (status < 0) { |
2480 | mlog_errno(status); | 2480 | mlog_errno(status); |
2481 | goto finally; | 2481 | goto finally; |
2482 | } | 2482 | } |
2483 | 2483 | ||
2484 | if (dirty) { | 2484 | if (dirty) { |
2485 | /* Recovery will be completed after we've mounted the | 2485 | /* Recovery will be completed after we've mounted the |
2486 | * rest of the volume. */ | 2486 | * rest of the volume. */ |
2487 | osb->dirty = 1; | 2487 | osb->dirty = 1; |
2488 | osb->local_alloc_copy = local_alloc; | 2488 | osb->local_alloc_copy = local_alloc; |
2489 | local_alloc = NULL; | 2489 | local_alloc = NULL; |
2490 | } | 2490 | } |
2491 | 2491 | ||
2492 | /* go through each journal, trylock it and if you get the | 2492 | /* go through each journal, trylock it and if you get the |
2493 | * lock, and it's marked as dirty, set the bit in the recover | 2493 | * lock, and it's marked as dirty, set the bit in the recover |
2494 | * map and launch a recovery thread for it. */ | 2494 | * map and launch a recovery thread for it. */ |
2495 | status = ocfs2_mark_dead_nodes(osb); | 2495 | status = ocfs2_mark_dead_nodes(osb); |
2496 | if (status < 0) { | 2496 | if (status < 0) { |
2497 | mlog_errno(status); | 2497 | mlog_errno(status); |
2498 | goto finally; | 2498 | goto finally; |
2499 | } | 2499 | } |
2500 | 2500 | ||
2501 | status = ocfs2_compute_replay_slots(osb); | 2501 | status = ocfs2_compute_replay_slots(osb); |
2502 | if (status < 0) | 2502 | if (status < 0) |
2503 | mlog_errno(status); | 2503 | mlog_errno(status); |
2504 | 2504 | ||
2505 | finally: | 2505 | finally: |
2506 | kfree(local_alloc); | 2506 | kfree(local_alloc); |
2507 | 2507 | ||
2508 | if (status) | 2508 | if (status) |
2509 | mlog_errno(status); | 2509 | mlog_errno(status); |
2510 | return status; | 2510 | return status; |
2511 | } | 2511 | } |
2512 | 2512 | ||
2513 | /* | 2513 | /* |
2514 | * The routine gets called from dismount or close whenever a dismount of | 2514 | * The routine gets called from dismount or close whenever a dismount of |
2515 | * the volume is requested and the osb open count becomes 1. | 2515 | * the volume is requested and the osb open count becomes 1. |
2516 | * It will remove the osb from the global list and also free up all the | 2516 | * It will remove the osb from the global list and also free up all the |
2517 | * initialized resources and file objects. | 2517 | * initialized resources and file objects. |
2518 | */ | 2518 | */ |
2519 | static void ocfs2_delete_osb(struct ocfs2_super *osb) | 2519 | static void ocfs2_delete_osb(struct ocfs2_super *osb) |
2520 | { | 2520 | { |
2521 | /* This function assumes that the caller has the main osb resource */ | 2521 | /* This function assumes that the caller has the main osb resource */ |
2522 | 2522 | ||
2523 | ocfs2_free_slot_info(osb); | 2523 | ocfs2_free_slot_info(osb); |
2524 | 2524 | ||
2525 | kfree(osb->osb_orphan_wipes); | 2525 | kfree(osb->osb_orphan_wipes); |
2526 | kfree(osb->slot_recovery_generations); | 2526 | kfree(osb->slot_recovery_generations); |
2527 | /* FIXME | 2527 | /* FIXME |
2528 | * This belongs in journal shutdown, but because we have to | 2528 | * This belongs in journal shutdown, but because we have to |
2529 | * allocate osb->journal at the start of ocfs2_initialize_osb(), | 2529 | * allocate osb->journal at the start of ocfs2_initialize_osb(), |
2530 | * we free it here. | 2530 | * we free it here. |
2531 | */ | 2531 | */ |
2532 | kfree(osb->journal); | 2532 | kfree(osb->journal); |
2533 | kfree(osb->local_alloc_copy); | 2533 | kfree(osb->local_alloc_copy); |
2534 | kfree(osb->uuid_str); | 2534 | kfree(osb->uuid_str); |
| | 2535 | kfree(osb->vol_label); |
2535 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); | 2536 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); |
2536 | memset(osb, 0, sizeof(struct ocfs2_super)); | 2537 | memset(osb, 0, sizeof(struct ocfs2_super)); |
2537 | } | 2538 | } |
2538 | 2539 | ||
2539 | /* Put OCFS2 into a readonly state, or (if the user specifies it), | 2540 | /* Put OCFS2 into a readonly state, or (if the user specifies it), |
2540 | * panic(). We do not support continue-on-error operation. */ | 2541 | * panic(). We do not support continue-on-error operation. */ |
2541 | static void ocfs2_handle_error(struct super_block *sb) | 2542 | static void ocfs2_handle_error(struct super_block *sb) |
2542 | { | 2543 | { |
2543 | struct ocfs2_super *osb = OCFS2_SB(sb); | 2544 | struct ocfs2_super *osb = OCFS2_SB(sb); |
2544 | 2545 | ||
2545 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) | 2546 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) |
2546 | panic("OCFS2: (device %s): panic forced after error\n", | 2547 | panic("OCFS2: (device %s): panic forced after error\n", |
2547 | sb->s_id); | 2548 | sb->s_id); |
2548 | 2549 | ||
2549 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); | 2550 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); |
2550 | 2551 | ||
2551 | if (sb->s_flags & MS_RDONLY && | 2552 | if (sb->s_flags & MS_RDONLY && |
2552 | (ocfs2_is_soft_readonly(osb) || | 2553 | (ocfs2_is_soft_readonly(osb) || |
2553 | ocfs2_is_hard_readonly(osb))) | 2554 | ocfs2_is_hard_readonly(osb))) |
2554 | return; | 2555 | return; |
2555 | 2556 | ||
2556 | printk(KERN_CRIT "File system is now read-only due to the potential " | 2557 | printk(KERN_CRIT "File system is now read-only due to the potential " |
2557 | "of on-disk corruption. Please run fsck.ocfs2 once the file " | 2558 | "of on-disk corruption. Please run fsck.ocfs2 once the file " |
2558 | "system is unmounted.\n"); | 2559 | "system is unmounted.\n"); |
2559 | sb->s_flags |= MS_RDONLY; | 2560 | sb->s_flags |= MS_RDONLY; |
2560 | ocfs2_set_ro_flag(osb, 0); | 2561 | ocfs2_set_ro_flag(osb, 0); |
2561 | } | 2562 | } |
2562 | 2563 | ||
2563 | static char error_buf[1024]; | 2564 | static char error_buf[1024]; |
2564 | 2565 | ||
2565 | void __ocfs2_error(struct super_block *sb, | 2566 | void __ocfs2_error(struct super_block *sb, |
2566 | const char *function, | 2567 | const char *function, |
2567 | const char *fmt, ...) | 2568 | const char *fmt, ...) |
2568 | { | 2569 | { |
2569 | va_list args; | 2570 | va_list args; |
2570 | 2571 | ||
2571 | va_start(args, fmt); | 2572 | va_start(args, fmt); |
2572 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | 2573 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); |
2573 | va_end(args); | 2574 | va_end(args); |
2574 | 2575 | ||
2575 | /* Not using mlog here because we want to show the actual | 2576 | /* Not using mlog here because we want to show the actual |
2576 | * function the error came from. */ | 2577 | * function the error came from. */ |
2577 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", | 2578 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", |
2578 | sb->s_id, function, error_buf); | 2579 | sb->s_id, function, error_buf); |
2579 | 2580 | ||
2580 | ocfs2_handle_error(sb); | 2581 | ocfs2_handle_error(sb); |
2581 | } | 2582 | } |
2582 | 2583 | ||
2583 | /* Handle critical errors. This is intentionally more drastic than | 2584 | /* Handle critical errors. This is intentionally more drastic than |
2584 | * ocfs2_handle_error, so we only use for things like journal errors, | 2585 | * ocfs2_handle_error, so we only use for things like journal errors, |
2585 | * etc. */ | 2586 | * etc. */ |
2586 | void __ocfs2_abort(struct super_block* sb, | 2587 | void __ocfs2_abort(struct super_block* sb, |
2587 | const char *function, | 2588 | const char *function, |
2588 | const char *fmt, ...) | 2589 | const char *fmt, ...) |
2589 | { | 2590 | { |
2590 | va_list args; | 2591 | va_list args; |
2591 | 2592 | ||
2592 | va_start(args, fmt); | 2593 | va_start(args, fmt); |
2593 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | 2594 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); |
2594 | va_end(args); | 2595 | va_end(args); |
2595 | 2596 | ||
2596 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", | 2597 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", |
2597 | sb->s_id, function, error_buf); | 2598 | sb->s_id, function, error_buf); |
2598 | 2599 | ||
2599 | /* We don't have the cluster support yet to go straight to | 2600 | /* We don't have the cluster support yet to go straight to |
2600 | * hard readonly in here. Until then, we want to keep | 2601 | * hard readonly in here. Until then, we want to keep |
2601 | * ocfs2_abort() so that we can at least mark critical | 2602 | * ocfs2_abort() so that we can at least mark critical |
2602 | * errors. | 2603 | * errors. |
2603 | * | 2604 | * |
2604 | * TODO: This should abort the journal and alert other nodes | 2605 | * TODO: This should abort the journal and alert other nodes |
2605 | * that our slot needs recovery. */ | 2606 | * that our slot needs recovery. */ |
2606 | 2607 | ||
2607 | /* Force a panic(). This stinks, but it's better than letting | 2608 | /* Force a panic(). This stinks, but it's better than letting |
2608 | * things continue without having a proper hard readonly | 2609 | * things continue without having a proper hard readonly |
2609 | * here. */ | 2610 | * here. */ |
2610 | if (!ocfs2_mount_local(OCFS2_SB(sb))) | 2611 | if (!ocfs2_mount_local(OCFS2_SB(sb))) |
2611 | OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | 2612 | OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; |
2612 | ocfs2_handle_error(sb); | 2613 | ocfs2_handle_error(sb); |
2613 | } | 2614 | } |
2614 | 2615 | ||
2615 | /* | 2616 | /* |
2616 | * Void signal blockers, because in-kernel sigprocmask() only fails | 2617 | * Void signal blockers, because in-kernel sigprocmask() only fails |
2617 | * when SIG_* is wrong. | 2618 | * when SIG_* is wrong. |
2618 | */ | 2619 | */ |
2619 | void ocfs2_block_signals(sigset_t *oldset) | 2620 | void ocfs2_block_signals(sigset_t *oldset) |
2620 | { | 2621 | { |
2621 | int rc; | 2622 | int rc; |
2622 | sigset_t blocked; | 2623 | sigset_t blocked; |
2623 | 2624 | ||
2624 | sigfillset(&blocked); | 2625 | sigfillset(&blocked); |
2625 | rc = sigprocmask(SIG_BLOCK, &blocked, oldset); | 2626 | rc = sigprocmask(SIG_BLOCK, &blocked, oldset); |
2626 | BUG_ON(rc); | 2627 | BUG_ON(rc); |
2627 | } | 2628 | } |
2628 | 2629 | ||
2629 | void ocfs2_unblock_signals(sigset_t *oldset) | 2630 | void ocfs2_unblock_signals(sigset_t *oldset) |
2630 | { | 2631 | { |
2631 | int rc = sigprocmask(SIG_SETMASK, oldset, NULL); | 2632 | int rc = sigprocmask(SIG_SETMASK, oldset, NULL); |
2632 | BUG_ON(rc); | 2633 | BUG_ON(rc); |
2633 | } | 2634 | } |
2634 | 2635 | ||
2635 | module_init(ocfs2_init); | 2636 | module_init(ocfs2_init); |
2636 | module_exit(ocfs2_exit); | 2637 | module_exit(ocfs2_exit); |
2637 | 2638 |
fs/proc/task_mmu.c
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/vmacache.h> | 2 | #include <linux/vmacache.h> |
3 | #include <linux/hugetlb.h> | 3 | #include <linux/hugetlb.h> |
4 | #include <linux/huge_mm.h> | 4 | #include <linux/huge_mm.h> |
5 | #include <linux/mount.h> | 5 | #include <linux/mount.h> |
6 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
7 | #include <linux/highmem.h> | 7 | #include <linux/highmem.h> |
8 | #include <linux/ptrace.h> | 8 | #include <linux/ptrace.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/pagemap.h> | 10 | #include <linux/pagemap.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/rmap.h> | 12 | #include <linux/rmap.h> |
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
15 | #include <linux/mmu_notifier.h> | 15 | #include <linux/mmu_notifier.h> |
16 | 16 | ||
17 | #include <asm/elf.h> | 17 | #include <asm/elf.h> |
18 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
19 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
20 | #include "internal.h" | 20 | #include "internal.h" |
21 | 21 | ||
22 | void task_mem(struct seq_file *m, struct mm_struct *mm) | 22 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
23 | { | 23 | { |
24 | unsigned long data, text, lib, swap; | 24 | unsigned long data, text, lib, swap; |
25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | 25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * Note: to minimize their overhead, mm maintains hiwater_vm and | 28 | * Note: to minimize their overhead, mm maintains hiwater_vm and |
29 | * hiwater_rss only when about to *lower* total_vm or rss. Any | 29 | * hiwater_rss only when about to *lower* total_vm or rss. Any |
30 | * collector of these hiwater stats must therefore get total_vm | 30 | * collector of these hiwater stats must therefore get total_vm |
31 | * and rss too, which will usually be the higher. Barriers? not | 31 | * and rss too, which will usually be the higher. Barriers? not |
32 | * worth the effort, such snapshots can always be inconsistent. | 32 | * worth the effort, such snapshots can always be inconsistent. |
33 | */ | 33 | */ |
34 | hiwater_vm = total_vm = mm->total_vm; | 34 | hiwater_vm = total_vm = mm->total_vm; |
35 | if (hiwater_vm < mm->hiwater_vm) | 35 | if (hiwater_vm < mm->hiwater_vm) |
36 | hiwater_vm = mm->hiwater_vm; | 36 | hiwater_vm = mm->hiwater_vm; |
37 | hiwater_rss = total_rss = get_mm_rss(mm); | 37 | hiwater_rss = total_rss = get_mm_rss(mm); |
38 | if (hiwater_rss < mm->hiwater_rss) | 38 | if (hiwater_rss < mm->hiwater_rss) |
39 | hiwater_rss = mm->hiwater_rss; | 39 | hiwater_rss = mm->hiwater_rss; |
40 | 40 | ||
41 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; | 41 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; |
42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; | 42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; | 43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
44 | swap = get_mm_counter(mm, MM_SWAPENTS); | 44 | swap = get_mm_counter(mm, MM_SWAPENTS); |
45 | seq_printf(m, | 45 | seq_printf(m, |
46 | "VmPeak:\t%8lu kB\n" | 46 | "VmPeak:\t%8lu kB\n" |
47 | "VmSize:\t%8lu kB\n" | 47 | "VmSize:\t%8lu kB\n" |
48 | "VmLck:\t%8lu kB\n" | 48 | "VmLck:\t%8lu kB\n" |
49 | "VmPin:\t%8lu kB\n" | 49 | "VmPin:\t%8lu kB\n" |
50 | "VmHWM:\t%8lu kB\n" | 50 | "VmHWM:\t%8lu kB\n" |
51 | "VmRSS:\t%8lu kB\n" | 51 | "VmRSS:\t%8lu kB\n" |
52 | "VmData:\t%8lu kB\n" | 52 | "VmData:\t%8lu kB\n" |
53 | "VmStk:\t%8lu kB\n" | 53 | "VmStk:\t%8lu kB\n" |
54 | "VmExe:\t%8lu kB\n" | 54 | "VmExe:\t%8lu kB\n" |
55 | "VmLib:\t%8lu kB\n" | 55 | "VmLib:\t%8lu kB\n" |
56 | "VmPTE:\t%8lu kB\n" | 56 | "VmPTE:\t%8lu kB\n" |
57 | "VmSwap:\t%8lu kB\n", | 57 | "VmSwap:\t%8lu kB\n", |
58 | hiwater_vm << (PAGE_SHIFT-10), | 58 | hiwater_vm << (PAGE_SHIFT-10), |
59 | total_vm << (PAGE_SHIFT-10), | 59 | total_vm << (PAGE_SHIFT-10), |
60 | mm->locked_vm << (PAGE_SHIFT-10), | 60 | mm->locked_vm << (PAGE_SHIFT-10), |
61 | mm->pinned_vm << (PAGE_SHIFT-10), | 61 | mm->pinned_vm << (PAGE_SHIFT-10), |
62 | hiwater_rss << (PAGE_SHIFT-10), | 62 | hiwater_rss << (PAGE_SHIFT-10), |
63 | total_rss << (PAGE_SHIFT-10), | 63 | total_rss << (PAGE_SHIFT-10), |
64 | data << (PAGE_SHIFT-10), | 64 | data << (PAGE_SHIFT-10), |
65 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, | 65 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
66 | (PTRS_PER_PTE * sizeof(pte_t) * | 66 | (PTRS_PER_PTE * sizeof(pte_t) * |
67 | atomic_long_read(&mm->nr_ptes)) >> 10, | 67 | atomic_long_read(&mm->nr_ptes)) >> 10, |
68 | swap << (PAGE_SHIFT-10)); | 68 | swap << (PAGE_SHIFT-10)); |
69 | } | 69 | } |
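Everything task_mem() accumulates is either a page count or a byte count, and the kB conversion differs accordingly: page counts are shifted left by PAGE_SHIFT-10 (one page is 2^(PAGE_SHIFT-10) KiB), while the already-byte-sized text span is simply shifted right by 10. A small sketch of both conversions, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages; the real value is per-arch */

int main(void)
{
	unsigned long pages = 300;	/* e.g. a total_vm style page counter */
	unsigned long bytes = 1234567;	/* e.g. an end_code - start_code span */

	/* pages -> kB: one page is 1 << (PAGE_SHIFT - 10) KiB */
	unsigned long pages_kb = pages << (PAGE_SHIFT - 10);
	/* bytes -> kB: plain division by 1024 */
	unsigned long bytes_kb = bytes >> 10;

	printf("%lu pages = %lu kB, %lu bytes = %lu kB\n",
	       pages, pages_kb, bytes, bytes_kb);
	return 0;
}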
70 | 70 | ||
71 | unsigned long task_vsize(struct mm_struct *mm) | 71 | unsigned long task_vsize(struct mm_struct *mm) |
72 | { | 72 | { |
73 | return PAGE_SIZE * mm->total_vm; | 73 | return PAGE_SIZE * mm->total_vm; |
74 | } | 74 | } |
75 | 75 | ||
76 | unsigned long task_statm(struct mm_struct *mm, | 76 | unsigned long task_statm(struct mm_struct *mm, |
77 | unsigned long *shared, unsigned long *text, | 77 | unsigned long *shared, unsigned long *text, |
78 | unsigned long *data, unsigned long *resident) | 78 | unsigned long *data, unsigned long *resident) |
79 | { | 79 | { |
80 | *shared = get_mm_counter(mm, MM_FILEPAGES); | 80 | *shared = get_mm_counter(mm, MM_FILEPAGES); |
81 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) | 81 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) |
82 | >> PAGE_SHIFT; | 82 | >> PAGE_SHIFT; |
83 | *data = mm->total_vm - mm->shared_vm; | 83 | *data = mm->total_vm - mm->shared_vm; |
84 | *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); | 84 | *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); |
85 | return mm->total_vm; | 85 | return mm->total_vm; |
86 | } | 86 | } |
87 | 87 | ||
88 | #ifdef CONFIG_NUMA | 88 | #ifdef CONFIG_NUMA |
89 | /* | 89 | /* |
90 | * These functions are for numa_maps but called in generic **maps seq_file | 90 | * These functions are for numa_maps but called in generic **maps seq_file |
91 | * ->start(), ->stop() ops. | 91 | * ->start(), ->stop() ops. |
92 | * | 92 | * |
93 | * numa_maps scans all vmas under mmap_sem and checks their mempolicy. | 93 | * numa_maps scans all vmas under mmap_sem and checks their mempolicy. |
94 | * Each mempolicy object is controlled by reference counting. The problem here | 94 | * Each mempolicy object is controlled by reference counting. The problem here |
95 | * is how to avoid accessing a dead mempolicy object. | 95 | * is how to avoid accessing a dead mempolicy object. |
96 | * | 96 | * |
97 | * Because we're holding mmap_sem while reading seq_file, it's safe to access | 97 | * Because we're holding mmap_sem while reading seq_file, it's safe to access |
98 | * each vma's mempolicy; no vma object will drop its ref to a mempolicy. | 98 | * each vma's mempolicy; no vma object will drop its ref to a mempolicy. |
99 | * | 99 | * |
100 | * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy | 100 | * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy |
101 | * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). | 101 | * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). |
102 | * So, without task_lock(), we cannot trust get_vma_policy() because we cannot | 102 | * So, without task_lock(), we cannot trust get_vma_policy() because we cannot |
103 | * guarantee the task never exits under us. But taking task_lock() around | 103 | * guarantee the task never exits under us. But taking task_lock() around |
104 | * get_vma_policy() causes a lock order problem. | 104 | * get_vma_policy() causes a lock order problem. |
105 | * | 105 | * |
106 | * To access task->mempolicy without the lock, we take a reference on the | 106 | * To access task->mempolicy without the lock, we take a reference on the |
107 | * object pointed to by task->mempolicy and remember it. This guarantees | 107 | * object pointed to by task->mempolicy and remember it. This guarantees |
108 | * that task->mempolicy points to a live object (or NULL) in numa_maps accesses. | 108 | * that task->mempolicy points to a live object (or NULL) in numa_maps accesses. |
109 | */ | 109 | */ |
110 | static void hold_task_mempolicy(struct proc_maps_private *priv) | 110 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
111 | { | 111 | { |
112 | struct task_struct *task = priv->task; | 112 | struct task_struct *task = priv->task; |
113 | 113 | ||
114 | task_lock(task); | 114 | task_lock(task); |
115 | priv->task_mempolicy = task->mempolicy; | 115 | priv->task_mempolicy = task->mempolicy; |
116 | mpol_get(priv->task_mempolicy); | 116 | mpol_get(priv->task_mempolicy); |
117 | task_unlock(task); | 117 | task_unlock(task); |
118 | } | 118 | } |
119 | static void release_task_mempolicy(struct proc_maps_private *priv) | 119 | static void release_task_mempolicy(struct proc_maps_private *priv) |
120 | { | 120 | { |
121 | mpol_put(priv->task_mempolicy); | 121 | mpol_put(priv->task_mempolicy); |
122 | } | 122 | } |
123 | #else | 123 | #else |
124 | static void hold_task_mempolicy(struct proc_maps_private *priv) | 124 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
125 | { | 125 | { |
126 | } | 126 | } |
127 | static void release_task_mempolicy(struct proc_maps_private *priv) | 127 | static void release_task_mempolicy(struct proc_maps_private *priv) |
128 | { | 128 | { |
129 | } | 129 | } |
130 | #endif | 130 | #endif |
131 | 131 | ||
132 | static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) | 132 | static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) |
133 | { | 133 | { |
134 | if (vma && vma != priv->tail_vma) { | 134 | if (vma && vma != priv->tail_vma) { |
135 | struct mm_struct *mm = vma->vm_mm; | 135 | struct mm_struct *mm = vma->vm_mm; |
136 | release_task_mempolicy(priv); | 136 | release_task_mempolicy(priv); |
137 | up_read(&mm->mmap_sem); | 137 | up_read(&mm->mmap_sem); |
138 | mmput(mm); | 138 | mmput(mm); |
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | static void *m_start(struct seq_file *m, loff_t *pos) | 142 | static void *m_start(struct seq_file *m, loff_t *pos) |
143 | { | 143 | { |
144 | struct proc_maps_private *priv = m->private; | 144 | struct proc_maps_private *priv = m->private; |
145 | unsigned long last_addr = m->version; | 145 | unsigned long last_addr = m->version; |
146 | struct mm_struct *mm; | 146 | struct mm_struct *mm; |
147 | struct vm_area_struct *vma, *tail_vma = NULL; | 147 | struct vm_area_struct *vma, *tail_vma = NULL; |
148 | loff_t l = *pos; | 148 | loff_t l = *pos; |
149 | 149 | ||
150 | /* Clear the per syscall fields in priv */ | 150 | /* Clear the per syscall fields in priv */ |
151 | priv->task = NULL; | 151 | priv->task = NULL; |
152 | priv->tail_vma = NULL; | 152 | priv->tail_vma = NULL; |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * We remember last_addr rather than next_addr so that we hit | 155 | * We remember last_addr rather than next_addr so that we hit |
156 | * the vmacache most of the time. We have zero last_addr at | 156 | * the vmacache most of the time. We have zero last_addr at |
157 | * the beginning and also after lseek. We will have -1 last_addr | 157 | * the beginning and also after lseek. We will have -1 last_addr |
158 | * after the end of the vmas. | 158 | * after the end of the vmas. |
159 | */ | 159 | */ |
160 | 160 | ||
161 | if (last_addr == -1UL) | 161 | if (last_addr == -1UL) |
162 | return NULL; | 162 | return NULL; |
163 | 163 | ||
164 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); | 164 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); |
165 | if (!priv->task) | 165 | if (!priv->task) |
166 | return ERR_PTR(-ESRCH); | 166 | return ERR_PTR(-ESRCH); |
167 | 167 | ||
168 | mm = mm_access(priv->task, PTRACE_MODE_READ); | 168 | mm = mm_access(priv->task, PTRACE_MODE_READ); |
169 | if (!mm || IS_ERR(mm)) | 169 | if (!mm || IS_ERR(mm)) |
170 | return mm; | 170 | return mm; |
171 | down_read(&mm->mmap_sem); | 171 | down_read(&mm->mmap_sem); |
172 | 172 | ||
173 | tail_vma = get_gate_vma(priv->task->mm); | 173 | tail_vma = get_gate_vma(priv->task->mm); |
174 | priv->tail_vma = tail_vma; | 174 | priv->tail_vma = tail_vma; |
175 | hold_task_mempolicy(priv); | 175 | hold_task_mempolicy(priv); |
176 | /* Start with last addr hint */ | 176 | /* Start with last addr hint */ |
177 | vma = find_vma(mm, last_addr); | 177 | vma = find_vma(mm, last_addr); |
178 | if (last_addr && vma) { | 178 | if (last_addr && vma) { |
179 | vma = vma->vm_next; | 179 | vma = vma->vm_next; |
180 | goto out; | 180 | goto out; |
181 | } | 181 | } |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Check that the vma index is within the range and do | 184 | * Check that the vma index is within the range and do |
185 | * a sequential scan until m_index. | 185 | * a sequential scan until m_index. |
186 | */ | 186 | */ |
187 | vma = NULL; | 187 | vma = NULL; |
188 | if ((unsigned long)l < mm->map_count) { | 188 | if ((unsigned long)l < mm->map_count) { |
189 | vma = mm->mmap; | 189 | vma = mm->mmap; |
190 | while (l-- && vma) | 190 | while (l-- && vma) |
191 | vma = vma->vm_next; | 191 | vma = vma->vm_next; |
192 | goto out; | 192 | goto out; |
193 | } | 193 | } |
194 | 194 | ||
195 | if (l != mm->map_count) | 195 | if (l != mm->map_count) |
196 | tail_vma = NULL; /* After gate vma */ | 196 | tail_vma = NULL; /* After gate vma */ |
197 | 197 | ||
198 | out: | 198 | out: |
199 | if (vma) | 199 | if (vma) |
200 | return vma; | 200 | return vma; |
201 | 201 | ||
202 | release_task_mempolicy(priv); | 202 | release_task_mempolicy(priv); |
203 | /* End of vmas has been reached */ | 203 | /* End of vmas has been reached */ |
204 | m->version = (tail_vma != NULL)? 0: -1UL; | 204 | m->version = (tail_vma != NULL)? 0: -1UL; |
205 | up_read(&mm->mmap_sem); | 205 | up_read(&mm->mmap_sem); |
206 | mmput(mm); | 206 | mmput(mm); |
207 | return tail_vma; | 207 | return tail_vma; |
208 | } | 208 | } |
209 | 209 | ||
210 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) | 210 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) |
211 | { | 211 | { |
212 | struct proc_maps_private *priv = m->private; | 212 | struct proc_maps_private *priv = m->private; |
213 | struct vm_area_struct *vma = v; | 213 | struct vm_area_struct *vma = v; |
214 | struct vm_area_struct *tail_vma = priv->tail_vma; | 214 | struct vm_area_struct *tail_vma = priv->tail_vma; |
215 | 215 | ||
216 | (*pos)++; | 216 | (*pos)++; |
217 | if (vma && (vma != tail_vma) && vma->vm_next) | 217 | if (vma && (vma != tail_vma) && vma->vm_next) |
218 | return vma->vm_next; | 218 | return vma->vm_next; |
219 | vma_stop(priv, vma); | 219 | vma_stop(priv, vma); |
220 | return (vma != tail_vma)? tail_vma: NULL; | 220 | return (vma != tail_vma)? tail_vma: NULL; |
221 | } | 221 | } |
222 | 222 | ||
223 | static void m_stop(struct seq_file *m, void *v) | 223 | static void m_stop(struct seq_file *m, void *v) |
224 | { | 224 | { |
225 | struct proc_maps_private *priv = m->private; | 225 | struct proc_maps_private *priv = m->private; |
226 | struct vm_area_struct *vma = v; | 226 | struct vm_area_struct *vma = v; |
227 | 227 | ||
228 | if (!IS_ERR(vma)) | 228 | if (!IS_ERR(vma)) |
229 | vma_stop(priv, vma); | 229 | vma_stop(priv, vma); |
230 | if (priv->task) | 230 | if (priv->task) |
231 | put_task_struct(priv->task); | 231 | put_task_struct(priv->task); |
232 | } | 232 | } |
233 | 233 | ||
234 | static int do_maps_open(struct inode *inode, struct file *file, | 234 | static int do_maps_open(struct inode *inode, struct file *file, |
235 | const struct seq_operations *ops) | 235 | const struct seq_operations *ops) |
236 | { | 236 | { |
237 | struct proc_maps_private *priv; | 237 | struct proc_maps_private *priv; |
238 | int ret = -ENOMEM; | 238 | int ret = -ENOMEM; |
239 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 239 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
240 | if (priv) { | 240 | if (priv) { |
241 | priv->pid = proc_pid(inode); | 241 | priv->pid = proc_pid(inode); |
242 | ret = seq_open(file, ops); | 242 | ret = seq_open(file, ops); |
243 | if (!ret) { | 243 | if (!ret) { |
244 | struct seq_file *m = file->private_data; | 244 | struct seq_file *m = file->private_data; |
245 | m->private = priv; | 245 | m->private = priv; |
246 | } else { | 246 | } else { |
247 | kfree(priv); | 247 | kfree(priv); |
248 | } | 248 | } |
249 | } | 249 | } |
250 | return ret; | 250 | return ret; |
251 | } | 251 | } |
252 | 252 | ||
253 | static void | 253 | static void |
254 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | 254 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) |
255 | { | 255 | { |
256 | struct mm_struct *mm = vma->vm_mm; | 256 | struct mm_struct *mm = vma->vm_mm; |
257 | struct file *file = vma->vm_file; | 257 | struct file *file = vma->vm_file; |
258 | struct proc_maps_private *priv = m->private; | 258 | struct proc_maps_private *priv = m->private; |
259 | struct task_struct *task = priv->task; | 259 | struct task_struct *task = priv->task; |
260 | vm_flags_t flags = vma->vm_flags; | 260 | vm_flags_t flags = vma->vm_flags; |
261 | unsigned long ino = 0; | 261 | unsigned long ino = 0; |
262 | unsigned long long pgoff = 0; | 262 | unsigned long long pgoff = 0; |
263 | unsigned long start, end; | 263 | unsigned long start, end; |
264 | dev_t dev = 0; | 264 | dev_t dev = 0; |
265 | const char *name = NULL; | 265 | const char *name = NULL; |
266 | 266 | ||
267 | if (file) { | 267 | if (file) { |
268 | struct inode *inode = file_inode(vma->vm_file); | 268 | struct inode *inode = file_inode(vma->vm_file); |
269 | dev = inode->i_sb->s_dev; | 269 | dev = inode->i_sb->s_dev; |
270 | ino = inode->i_ino; | 270 | ino = inode->i_ino; |
271 | pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; | 271 | pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; |
272 | } | 272 | } |
273 | 273 | ||
274 | /* We don't show the stack guard page in /proc/maps */ | 274 | /* We don't show the stack guard page in /proc/maps */ |
275 | start = vma->vm_start; | 275 | start = vma->vm_start; |
276 | if (stack_guard_page_start(vma, start)) | 276 | if (stack_guard_page_start(vma, start)) |
277 | start += PAGE_SIZE; | 277 | start += PAGE_SIZE; |
278 | end = vma->vm_end; | 278 | end = vma->vm_end; |
279 | if (stack_guard_page_end(vma, end)) | 279 | if (stack_guard_page_end(vma, end)) |
280 | end -= PAGE_SIZE; | 280 | end -= PAGE_SIZE; |
281 | 281 | ||
282 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); | 282 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); |
283 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", | 283 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", |
284 | start, | 284 | start, |
285 | end, | 285 | end, |
286 | flags & VM_READ ? 'r' : '-', | 286 | flags & VM_READ ? 'r' : '-', |
287 | flags & VM_WRITE ? 'w' : '-', | 287 | flags & VM_WRITE ? 'w' : '-', |
288 | flags & VM_EXEC ? 'x' : '-', | 288 | flags & VM_EXEC ? 'x' : '-', |
289 | flags & VM_MAYSHARE ? 's' : 'p', | 289 | flags & VM_MAYSHARE ? 's' : 'p', |
290 | pgoff, | 290 | pgoff, |
291 | MAJOR(dev), MINOR(dev), ino); | 291 | MAJOR(dev), MINOR(dev), ino); |
292 | 292 | ||
293 | /* | 293 | /* |
294 | * Print the dentry name for named mappings, and a | 294 | * Print the dentry name for named mappings, and a |
295 | * special [heap] marker for the heap: | 295 | * special [heap] marker for the heap: |
296 | */ | 296 | */ |
297 | if (file) { | 297 | if (file) { |
298 | seq_pad(m, ' '); | 298 | seq_pad(m, ' '); |
299 | seq_path(m, &file->f_path, "\n"); | 299 | seq_path(m, &file->f_path, "\n"); |
300 | goto done; | 300 | goto done; |
301 | } | 301 | } |
302 | 302 | ||
303 | if (vma->vm_ops && vma->vm_ops->name) { | 303 | if (vma->vm_ops && vma->vm_ops->name) { |
304 | name = vma->vm_ops->name(vma); | 304 | name = vma->vm_ops->name(vma); |
305 | if (name) | 305 | if (name) |
306 | goto done; | 306 | goto done; |
307 | } | 307 | } |
308 | 308 | ||
309 | name = arch_vma_name(vma); | 309 | name = arch_vma_name(vma); |
310 | if (!name) { | 310 | if (!name) { |
311 | pid_t tid; | 311 | pid_t tid; |
312 | 312 | ||
313 | if (!mm) { | 313 | if (!mm) { |
314 | name = "[vdso]"; | 314 | name = "[vdso]"; |
315 | goto done; | 315 | goto done; |
316 | } | 316 | } |
317 | 317 | ||
318 | if (vma->vm_start <= mm->brk && | 318 | if (vma->vm_start <= mm->brk && |
319 | vma->vm_end >= mm->start_brk) { | 319 | vma->vm_end >= mm->start_brk) { |
320 | name = "[heap]"; | 320 | name = "[heap]"; |
321 | goto done; | 321 | goto done; |
322 | } | 322 | } |
323 | 323 | ||
324 | tid = vm_is_stack(task, vma, is_pid); | 324 | tid = vm_is_stack(task, vma, is_pid); |
325 | 325 | ||
326 | if (tid != 0) { | 326 | if (tid != 0) { |
327 | /* | 327 | /* |
328 | * Thread stack in /proc/PID/task/TID/maps or | 328 | * Thread stack in /proc/PID/task/TID/maps or |
329 | * the main process stack. | 329 | * the main process stack. |
330 | */ | 330 | */ |
331 | if (!is_pid || (vma->vm_start <= mm->start_stack && | 331 | if (!is_pid || (vma->vm_start <= mm->start_stack && |
332 | vma->vm_end >= mm->start_stack)) { | 332 | vma->vm_end >= mm->start_stack)) { |
333 | name = "[stack]"; | 333 | name = "[stack]"; |
334 | } else { | 334 | } else { |
335 | /* Thread stack in /proc/PID/maps */ | 335 | /* Thread stack in /proc/PID/maps */ |
336 | seq_pad(m, ' '); | 336 | seq_pad(m, ' '); |
337 | seq_printf(m, "[stack:%d]", tid); | 337 | seq_printf(m, "[stack:%d]", tid); |
338 | } | 338 | } |
339 | } | 339 | } |
340 | } | 340 | } |
341 | 341 | ||
342 | done: | 342 | done: |
343 | if (name) { | 343 | if (name) { |
344 | seq_pad(m, ' '); | 344 | seq_pad(m, ' '); |
345 | seq_puts(m, name); | 345 | seq_puts(m, name); |
346 | } | 346 | } |
347 | seq_putc(m, '\n'); | 347 | seq_putc(m, '\n'); |
348 | } | 348 | } |
349 | 349 | ||
350 | static int show_map(struct seq_file *m, void *v, int is_pid) | 350 | static int show_map(struct seq_file *m, void *v, int is_pid) |
351 | { | 351 | { |
352 | struct vm_area_struct *vma = v; | 352 | struct vm_area_struct *vma = v; |
353 | struct proc_maps_private *priv = m->private; | 353 | struct proc_maps_private *priv = m->private; |
354 | struct task_struct *task = priv->task; | 354 | struct task_struct *task = priv->task; |
355 | 355 | ||
356 | show_map_vma(m, vma, is_pid); | 356 | show_map_vma(m, vma, is_pid); |
357 | 357 | ||
358 | if (m->count < m->size) /* vma is copied successfully */ | 358 | if (m->count < m->size) /* vma is copied successfully */ |
359 | m->version = (vma != get_gate_vma(task->mm)) | 359 | m->version = (vma != get_gate_vma(task->mm)) |
360 | ? vma->vm_start : 0; | 360 | ? vma->vm_start : 0; |
361 | return 0; | 361 | return 0; |
362 | } | 362 | } |
363 | 363 | ||
364 | static int show_pid_map(struct seq_file *m, void *v) | 364 | static int show_pid_map(struct seq_file *m, void *v) |
365 | { | 365 | { |
366 | return show_map(m, v, 1); | 366 | return show_map(m, v, 1); |
367 | } | 367 | } |
368 | 368 | ||
369 | static int show_tid_map(struct seq_file *m, void *v) | 369 | static int show_tid_map(struct seq_file *m, void *v) |
370 | { | 370 | { |
371 | return show_map(m, v, 0); | 371 | return show_map(m, v, 0); |
372 | } | 372 | } |
373 | 373 | ||
374 | static const struct seq_operations proc_pid_maps_op = { | 374 | static const struct seq_operations proc_pid_maps_op = { |
375 | .start = m_start, | 375 | .start = m_start, |
376 | .next = m_next, | 376 | .next = m_next, |
377 | .stop = m_stop, | 377 | .stop = m_stop, |
378 | .show = show_pid_map | 378 | .show = show_pid_map |
379 | }; | 379 | }; |
380 | 380 | ||
381 | static const struct seq_operations proc_tid_maps_op = { | 381 | static const struct seq_operations proc_tid_maps_op = { |
382 | .start = m_start, | 382 | .start = m_start, |
383 | .next = m_next, | 383 | .next = m_next, |
384 | .stop = m_stop, | 384 | .stop = m_stop, |
385 | .show = show_tid_map | 385 | .show = show_tid_map |
386 | }; | 386 | }; |
387 | 387 | ||
388 | static int pid_maps_open(struct inode *inode, struct file *file) | 388 | static int pid_maps_open(struct inode *inode, struct file *file) |
389 | { | 389 | { |
390 | return do_maps_open(inode, file, &proc_pid_maps_op); | 390 | return do_maps_open(inode, file, &proc_pid_maps_op); |
391 | } | 391 | } |
392 | 392 | ||
393 | static int tid_maps_open(struct inode *inode, struct file *file) | 393 | static int tid_maps_open(struct inode *inode, struct file *file) |
394 | { | 394 | { |
395 | return do_maps_open(inode, file, &proc_tid_maps_op); | 395 | return do_maps_open(inode, file, &proc_tid_maps_op); |
396 | } | 396 | } |
397 | 397 | ||
398 | const struct file_operations proc_pid_maps_operations = { | 398 | const struct file_operations proc_pid_maps_operations = { |
399 | .open = pid_maps_open, | 399 | .open = pid_maps_open, |
400 | .read = seq_read, | 400 | .read = seq_read, |
401 | .llseek = seq_lseek, | 401 | .llseek = seq_lseek, |
402 | .release = seq_release_private, | 402 | .release = seq_release_private, |
403 | }; | 403 | }; |
404 | 404 | ||
405 | const struct file_operations proc_tid_maps_operations = { | 405 | const struct file_operations proc_tid_maps_operations = { |
406 | .open = tid_maps_open, | 406 | .open = tid_maps_open, |
407 | .read = seq_read, | 407 | .read = seq_read, |
408 | .llseek = seq_lseek, | 408 | .llseek = seq_lseek, |
409 | .release = seq_release_private, | 409 | .release = seq_release_private, |
410 | }; | 410 | }; |
411 | 411 | ||
412 | /* | 412 | /* |
413 | * Proportional Set Size (PSS): my share of RSS. | 413 | * Proportional Set Size (PSS): my share of RSS. |
414 | * | 414 | * |
415 | * PSS of a process is the count of pages it has in memory, where each | 415 | * PSS of a process is the count of pages it has in memory, where each |
416 | * page is divided by the number of processes sharing it. So if a | 416 | * page is divided by the number of processes sharing it. So if a |
417 | * process has 1000 pages all to itself, and 1000 shared with one other | 417 | * process has 1000 pages all to itself, and 1000 shared with one other |
418 | * process, its PSS will be 1500. | 418 | * process, its PSS will be 1500. |
419 | * | 419 | * |
420 | * To keep (accumulated) division errors low, we adopt a 64-bit | 420 | * To keep (accumulated) division errors low, we adopt a 64-bit |
421 | * fixed-point pss counter. So (pss >> | 421 | * fixed-point pss counter. So (pss >> |
422 | * PSS_SHIFT) would be the real byte count. | 422 | * PSS_SHIFT) would be the real byte count. |
423 | * | 423 | * |
424 | * A shift of 12 before division means (assuming 4K page size): | 424 | * A shift of 12 before division means (assuming 4K page size): |
425 | * - 1M 3-user-pages add up to 8KB errors; | 425 | * - 1M 3-user-pages add up to 8KB errors; |
426 | * - supports mapcount up to 2^24, or 16M; | 426 | * - supports mapcount up to 2^24, or 16M; |
427 | * - supports PSS up to 2^52 bytes, or 4PB. | 427 | * - supports PSS up to 2^52 bytes, or 4PB. |
428 | */ | 428 | */ |
429 | #define PSS_SHIFT 12 | 429 | #define PSS_SHIFT 12 |
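To make the fixed point concrete, here is a small self-contained illustration (editorial sketch, not part of this change) of what smaps_pte_entry() below adds for one sharer of a 4 KiB page mapped by three processes, and how show_smap() converts the accumulator back to kB:

#include <stdio.h>

#define PSS_SHIFT 12

int main(void)
{
        unsigned long long pss = 0;
        unsigned long page_size = 4096;
        int mapcount = 3;

        /* what smaps_pte_entry() adds for one sharer of this page */
        pss += ((unsigned long long)page_size << PSS_SHIFT) / mapcount;

        /* what show_smap() prints as "Pss:" (value is in kB) */
        printf("pss fixed point: %llu, reported: %llu kB\n",
               pss, pss >> (10 + PSS_SHIFT));
        return 0;
}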
430 | 430 | ||
431 | #ifdef CONFIG_PROC_PAGE_MONITOR | 431 | #ifdef CONFIG_PROC_PAGE_MONITOR |
432 | struct mem_size_stats { | 432 | struct mem_size_stats { |
433 | struct vm_area_struct *vma; | 433 | struct vm_area_struct *vma; |
434 | unsigned long resident; | 434 | unsigned long resident; |
435 | unsigned long shared_clean; | 435 | unsigned long shared_clean; |
436 | unsigned long shared_dirty; | 436 | unsigned long shared_dirty; |
437 | unsigned long private_clean; | 437 | unsigned long private_clean; |
438 | unsigned long private_dirty; | 438 | unsigned long private_dirty; |
439 | unsigned long referenced; | 439 | unsigned long referenced; |
440 | unsigned long anonymous; | 440 | unsigned long anonymous; |
441 | unsigned long anonymous_thp; | 441 | unsigned long anonymous_thp; |
442 | unsigned long swap; | 442 | unsigned long swap; |
443 | unsigned long nonlinear; | 443 | unsigned long nonlinear; |
444 | u64 pss; | 444 | u64 pss; |
445 | }; | 445 | }; |
446 | 446 | ||
447 | 447 | ||
448 | static void smaps_pte_entry(pte_t ptent, unsigned long addr, | 448 | static void smaps_pte_entry(pte_t ptent, unsigned long addr, |
449 | unsigned long ptent_size, struct mm_walk *walk) | 449 | unsigned long ptent_size, struct mm_walk *walk) |
450 | { | 450 | { |
451 | struct mem_size_stats *mss = walk->private; | 451 | struct mem_size_stats *mss = walk->private; |
452 | struct vm_area_struct *vma = mss->vma; | 452 | struct vm_area_struct *vma = mss->vma; |
453 | pgoff_t pgoff = linear_page_index(vma, addr); | 453 | pgoff_t pgoff = linear_page_index(vma, addr); |
454 | struct page *page = NULL; | 454 | struct page *page = NULL; |
455 | int mapcount; | 455 | int mapcount; |
456 | 456 | ||
457 | if (pte_present(ptent)) { | 457 | if (pte_present(ptent)) { |
458 | page = vm_normal_page(vma, addr, ptent); | 458 | page = vm_normal_page(vma, addr, ptent); |
459 | } else if (is_swap_pte(ptent)) { | 459 | } else if (is_swap_pte(ptent)) { |
460 | swp_entry_t swpent = pte_to_swp_entry(ptent); | 460 | swp_entry_t swpent = pte_to_swp_entry(ptent); |
461 | 461 | ||
462 | if (!non_swap_entry(swpent)) | 462 | if (!non_swap_entry(swpent)) |
463 | mss->swap += ptent_size; | 463 | mss->swap += ptent_size; |
464 | else if (is_migration_entry(swpent)) | 464 | else if (is_migration_entry(swpent)) |
465 | page = migration_entry_to_page(swpent); | 465 | page = migration_entry_to_page(swpent); |
466 | } else if (pte_file(ptent)) { | 466 | } else if (pte_file(ptent)) { |
467 | if (pte_to_pgoff(ptent) != pgoff) | 467 | if (pte_to_pgoff(ptent) != pgoff) |
468 | mss->nonlinear += ptent_size; | 468 | mss->nonlinear += ptent_size; |
469 | } | 469 | } |
470 | 470 | ||
471 | if (!page) | 471 | if (!page) |
472 | return; | 472 | return; |
473 | 473 | ||
474 | if (PageAnon(page)) | 474 | if (PageAnon(page)) |
475 | mss->anonymous += ptent_size; | 475 | mss->anonymous += ptent_size; |
476 | 476 | ||
477 | if (page->index != pgoff) | 477 | if (page->index != pgoff) |
478 | mss->nonlinear += ptent_size; | 478 | mss->nonlinear += ptent_size; |
479 | 479 | ||
480 | mss->resident += ptent_size; | 480 | mss->resident += ptent_size; |
481 | /* Accumulate the size in pages that have been accessed. */ | 481 | /* Accumulate the size in pages that have been accessed. */ |
482 | if (pte_young(ptent) || PageReferenced(page)) | 482 | if (pte_young(ptent) || PageReferenced(page)) |
483 | mss->referenced += ptent_size; | 483 | mss->referenced += ptent_size; |
484 | mapcount = page_mapcount(page); | 484 | mapcount = page_mapcount(page); |
485 | if (mapcount >= 2) { | 485 | if (mapcount >= 2) { |
486 | if (pte_dirty(ptent) || PageDirty(page)) | 486 | if (pte_dirty(ptent) || PageDirty(page)) |
487 | mss->shared_dirty += ptent_size; | 487 | mss->shared_dirty += ptent_size; |
488 | else | 488 | else |
489 | mss->shared_clean += ptent_size; | 489 | mss->shared_clean += ptent_size; |
490 | mss->pss += (ptent_size << PSS_SHIFT) / mapcount; | 490 | mss->pss += (ptent_size << PSS_SHIFT) / mapcount; |
491 | } else { | 491 | } else { |
492 | if (pte_dirty(ptent) || PageDirty(page)) | 492 | if (pte_dirty(ptent) || PageDirty(page)) |
493 | mss->private_dirty += ptent_size; | 493 | mss->private_dirty += ptent_size; |
494 | else | 494 | else |
495 | mss->private_clean += ptent_size; | 495 | mss->private_clean += ptent_size; |
496 | mss->pss += (ptent_size << PSS_SHIFT); | 496 | mss->pss += (ptent_size << PSS_SHIFT); |
497 | } | 497 | } |
498 | } | 498 | } |
499 | 499 | ||
500 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 500 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
501 | struct mm_walk *walk) | 501 | struct mm_walk *walk) |
502 | { | 502 | { |
503 | struct mem_size_stats *mss = walk->private; | 503 | struct mem_size_stats *mss = walk->private; |
504 | struct vm_area_struct *vma = mss->vma; | 504 | struct vm_area_struct *vma = mss->vma; |
505 | pte_t *pte; | 505 | pte_t *pte; |
506 | spinlock_t *ptl; | 506 | spinlock_t *ptl; |
507 | 507 | ||
508 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 508 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
509 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); | 509 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); |
510 | spin_unlock(ptl); | 510 | spin_unlock(ptl); |
511 | mss->anonymous_thp += HPAGE_PMD_SIZE; | 511 | mss->anonymous_thp += HPAGE_PMD_SIZE; |
512 | return 0; | 512 | return 0; |
513 | } | 513 | } |
514 | 514 | ||
515 | if (pmd_trans_unstable(pmd)) | 515 | if (pmd_trans_unstable(pmd)) |
516 | return 0; | 516 | return 0; |
517 | /* | 517 | /* |
518 | * The mmap_sem held all the way back in m_start() is what | 518 | * The mmap_sem held all the way back in m_start() is what |
519 | * keeps khugepaged out of here and from collapsing things | 519 | * keeps khugepaged out of here and from collapsing things |
520 | * in here. | 520 | * in here. |
521 | */ | 521 | */ |
522 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 522 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
523 | for (; addr != end; pte++, addr += PAGE_SIZE) | 523 | for (; addr != end; pte++, addr += PAGE_SIZE) |
524 | smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); | 524 | smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); |
525 | pte_unmap_unlock(pte - 1, ptl); | 525 | pte_unmap_unlock(pte - 1, ptl); |
526 | cond_resched(); | 526 | cond_resched(); |
527 | return 0; | 527 | return 0; |
528 | } | 528 | } |
529 | 529 | ||
530 | static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | 530 | static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) |
531 | { | 531 | { |
532 | /* | 532 | /* |
533 | * Don't forget to update Documentation/ on changes. | 533 | * Don't forget to update Documentation/ on changes. |
534 | */ | 534 | */ |
535 | static const char mnemonics[BITS_PER_LONG][2] = { | 535 | static const char mnemonics[BITS_PER_LONG][2] = { |
536 | /* | 536 | /* |
537 | * In case we meet a flag we don't know about. | 537 | * In case we meet a flag we don't know about. |
538 | */ | 538 | */ |
539 | [0 ... (BITS_PER_LONG-1)] = "??", | 539 | [0 ... (BITS_PER_LONG-1)] = "??", |
540 | 540 | ||
541 | [ilog2(VM_READ)] = "rd", | 541 | [ilog2(VM_READ)] = "rd", |
542 | [ilog2(VM_WRITE)] = "wr", | 542 | [ilog2(VM_WRITE)] = "wr", |
543 | [ilog2(VM_EXEC)] = "ex", | 543 | [ilog2(VM_EXEC)] = "ex", |
544 | [ilog2(VM_SHARED)] = "sh", | 544 | [ilog2(VM_SHARED)] = "sh", |
545 | [ilog2(VM_MAYREAD)] = "mr", | 545 | [ilog2(VM_MAYREAD)] = "mr", |
546 | [ilog2(VM_MAYWRITE)] = "mw", | 546 | [ilog2(VM_MAYWRITE)] = "mw", |
547 | [ilog2(VM_MAYEXEC)] = "me", | 547 | [ilog2(VM_MAYEXEC)] = "me", |
548 | [ilog2(VM_MAYSHARE)] = "ms", | 548 | [ilog2(VM_MAYSHARE)] = "ms", |
549 | [ilog2(VM_GROWSDOWN)] = "gd", | 549 | [ilog2(VM_GROWSDOWN)] = "gd", |
550 | [ilog2(VM_PFNMAP)] = "pf", | 550 | [ilog2(VM_PFNMAP)] = "pf", |
551 | [ilog2(VM_DENYWRITE)] = "dw", | 551 | [ilog2(VM_DENYWRITE)] = "dw", |
552 | [ilog2(VM_LOCKED)] = "lo", | 552 | [ilog2(VM_LOCKED)] = "lo", |
553 | [ilog2(VM_IO)] = "io", | 553 | [ilog2(VM_IO)] = "io", |
554 | [ilog2(VM_SEQ_READ)] = "sr", | 554 | [ilog2(VM_SEQ_READ)] = "sr", |
555 | [ilog2(VM_RAND_READ)] = "rr", | 555 | [ilog2(VM_RAND_READ)] = "rr", |
556 | [ilog2(VM_DONTCOPY)] = "dc", | 556 | [ilog2(VM_DONTCOPY)] = "dc", |
557 | [ilog2(VM_DONTEXPAND)] = "de", | 557 | [ilog2(VM_DONTEXPAND)] = "de", |
558 | [ilog2(VM_ACCOUNT)] = "ac", | 558 | [ilog2(VM_ACCOUNT)] = "ac", |
559 | [ilog2(VM_NORESERVE)] = "nr", | 559 | [ilog2(VM_NORESERVE)] = "nr", |
560 | [ilog2(VM_HUGETLB)] = "ht", | 560 | [ilog2(VM_HUGETLB)] = "ht", |
561 | [ilog2(VM_NONLINEAR)] = "nl", | 561 | [ilog2(VM_NONLINEAR)] = "nl", |
562 | [ilog2(VM_ARCH_1)] = "ar", | 562 | [ilog2(VM_ARCH_1)] = "ar", |
563 | [ilog2(VM_DONTDUMP)] = "dd", | 563 | [ilog2(VM_DONTDUMP)] = "dd", |
564 | #ifdef CONFIG_MEM_SOFT_DIRTY | 564 | #ifdef CONFIG_MEM_SOFT_DIRTY |
565 | [ilog2(VM_SOFTDIRTY)] = "sd", | 565 | [ilog2(VM_SOFTDIRTY)] = "sd", |
566 | #endif | 566 | #endif |
567 | [ilog2(VM_MIXEDMAP)] = "mm", | 567 | [ilog2(VM_MIXEDMAP)] = "mm", |
568 | [ilog2(VM_HUGEPAGE)] = "hg", | 568 | [ilog2(VM_HUGEPAGE)] = "hg", |
569 | [ilog2(VM_NOHUGEPAGE)] = "nh", | 569 | [ilog2(VM_NOHUGEPAGE)] = "nh", |
570 | [ilog2(VM_MERGEABLE)] = "mg", | 570 | [ilog2(VM_MERGEABLE)] = "mg", |
571 | }; | 571 | }; |
572 | size_t i; | 572 | size_t i; |
573 | 573 | ||
574 | seq_puts(m, "VmFlags: "); | 574 | seq_puts(m, "VmFlags: "); |
575 | for (i = 0; i < BITS_PER_LONG; i++) { | 575 | for (i = 0; i < BITS_PER_LONG; i++) { |
576 | if (vma->vm_flags & (1UL << i)) { | 576 | if (vma->vm_flags & (1UL << i)) { |
577 | seq_printf(m, "%c%c ", | 577 | seq_printf(m, "%c%c ", |
578 | mnemonics[i][0], mnemonics[i][1]); | 578 | mnemonics[i][0], mnemonics[i][1]); |
579 | } | 579 | } |
580 | } | 580 | } |
581 | seq_putc(m, '\n'); | 581 | seq_putc(m, '\n'); |
582 | } | 582 | } |
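For reference, the line this emits for a typical private, readable and executable file mapping in /proc/<pid>/smaps looks roughly like the following (illustrative only; the exact set depends on the VMA's flags):

        VmFlags: rd ex mr mw me dw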
583 | 583 | ||
584 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 584 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
585 | { | 585 | { |
586 | struct proc_maps_private *priv = m->private; | 586 | struct proc_maps_private *priv = m->private; |
587 | struct task_struct *task = priv->task; | 587 | struct task_struct *task = priv->task; |
588 | struct vm_area_struct *vma = v; | 588 | struct vm_area_struct *vma = v; |
589 | struct mem_size_stats mss; | 589 | struct mem_size_stats mss; |
590 | struct mm_walk smaps_walk = { | 590 | struct mm_walk smaps_walk = { |
591 | .pmd_entry = smaps_pte_range, | 591 | .pmd_entry = smaps_pte_range, |
592 | .mm = vma->vm_mm, | 592 | .mm = vma->vm_mm, |
593 | .private = &mss, | 593 | .private = &mss, |
594 | }; | 594 | }; |
595 | 595 | ||
596 | memset(&mss, 0, sizeof mss); | 596 | memset(&mss, 0, sizeof mss); |
597 | mss.vma = vma; | 597 | mss.vma = vma; |
598 | /* mmap_sem is held in m_start */ | 598 | /* mmap_sem is held in m_start */ |
599 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) | 599 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) |
600 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); | 600 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); |
601 | 601 | ||
602 | show_map_vma(m, vma, is_pid); | 602 | show_map_vma(m, vma, is_pid); |
603 | 603 | ||
604 | seq_printf(m, | 604 | seq_printf(m, |
605 | "Size: %8lu kB\n" | 605 | "Size: %8lu kB\n" |
606 | "Rss: %8lu kB\n" | 606 | "Rss: %8lu kB\n" |
607 | "Pss: %8lu kB\n" | 607 | "Pss: %8lu kB\n" |
608 | "Shared_Clean: %8lu kB\n" | 608 | "Shared_Clean: %8lu kB\n" |
609 | "Shared_Dirty: %8lu kB\n" | 609 | "Shared_Dirty: %8lu kB\n" |
610 | "Private_Clean: %8lu kB\n" | 610 | "Private_Clean: %8lu kB\n" |
611 | "Private_Dirty: %8lu kB\n" | 611 | "Private_Dirty: %8lu kB\n" |
612 | "Referenced: %8lu kB\n" | 612 | "Referenced: %8lu kB\n" |
613 | "Anonymous: %8lu kB\n" | 613 | "Anonymous: %8lu kB\n" |
614 | "AnonHugePages: %8lu kB\n" | 614 | "AnonHugePages: %8lu kB\n" |
615 | "Swap: %8lu kB\n" | 615 | "Swap: %8lu kB\n" |
616 | "KernelPageSize: %8lu kB\n" | 616 | "KernelPageSize: %8lu kB\n" |
617 | "MMUPageSize: %8lu kB\n" | 617 | "MMUPageSize: %8lu kB\n" |
618 | "Locked: %8lu kB\n", | 618 | "Locked: %8lu kB\n", |
619 | (vma->vm_end - vma->vm_start) >> 10, | 619 | (vma->vm_end - vma->vm_start) >> 10, |
620 | mss.resident >> 10, | 620 | mss.resident >> 10, |
621 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), | 621 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), |
622 | mss.shared_clean >> 10, | 622 | mss.shared_clean >> 10, |
623 | mss.shared_dirty >> 10, | 623 | mss.shared_dirty >> 10, |
624 | mss.private_clean >> 10, | 624 | mss.private_clean >> 10, |
625 | mss.private_dirty >> 10, | 625 | mss.private_dirty >> 10, |
626 | mss.referenced >> 10, | 626 | mss.referenced >> 10, |
627 | mss.anonymous >> 10, | 627 | mss.anonymous >> 10, |
628 | mss.anonymous_thp >> 10, | 628 | mss.anonymous_thp >> 10, |
629 | mss.swap >> 10, | 629 | mss.swap >> 10, |
630 | vma_kernel_pagesize(vma) >> 10, | 630 | vma_kernel_pagesize(vma) >> 10, |
631 | vma_mmu_pagesize(vma) >> 10, | 631 | vma_mmu_pagesize(vma) >> 10, |
632 | (vma->vm_flags & VM_LOCKED) ? | 632 | (vma->vm_flags & VM_LOCKED) ? |
633 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); | 633 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); |
634 | 634 | ||
635 | if (vma->vm_flags & VM_NONLINEAR) | 635 | if (vma->vm_flags & VM_NONLINEAR) |
636 | seq_printf(m, "Nonlinear: %8lu kB\n", | 636 | seq_printf(m, "Nonlinear: %8lu kB\n", |
637 | mss.nonlinear >> 10); | 637 | mss.nonlinear >> 10); |
638 | 638 | ||
639 | show_smap_vma_flags(m, vma); | 639 | show_smap_vma_flags(m, vma); |
640 | 640 | ||
641 | if (m->count < m->size) /* vma is copied successfully */ | 641 | if (m->count < m->size) /* vma is copied successfully */ |
642 | m->version = (vma != get_gate_vma(task->mm)) | 642 | m->version = (vma != get_gate_vma(task->mm)) |
643 | ? vma->vm_start : 0; | 643 | ? vma->vm_start : 0; |
644 | return 0; | 644 | return 0; |
645 | } | 645 | } |
646 | 646 | ||
647 | static int show_pid_smap(struct seq_file *m, void *v) | 647 | static int show_pid_smap(struct seq_file *m, void *v) |
648 | { | 648 | { |
649 | return show_smap(m, v, 1); | 649 | return show_smap(m, v, 1); |
650 | } | 650 | } |
651 | 651 | ||
652 | static int show_tid_smap(struct seq_file *m, void *v) | 652 | static int show_tid_smap(struct seq_file *m, void *v) |
653 | { | 653 | { |
654 | return show_smap(m, v, 0); | 654 | return show_smap(m, v, 0); |
655 | } | 655 | } |
656 | 656 | ||
657 | static const struct seq_operations proc_pid_smaps_op = { | 657 | static const struct seq_operations proc_pid_smaps_op = { |
658 | .start = m_start, | 658 | .start = m_start, |
659 | .next = m_next, | 659 | .next = m_next, |
660 | .stop = m_stop, | 660 | .stop = m_stop, |
661 | .show = show_pid_smap | 661 | .show = show_pid_smap |
662 | }; | 662 | }; |
663 | 663 | ||
664 | static const struct seq_operations proc_tid_smaps_op = { | 664 | static const struct seq_operations proc_tid_smaps_op = { |
665 | .start = m_start, | 665 | .start = m_start, |
666 | .next = m_next, | 666 | .next = m_next, |
667 | .stop = m_stop, | 667 | .stop = m_stop, |
668 | .show = show_tid_smap | 668 | .show = show_tid_smap |
669 | }; | 669 | }; |
670 | 670 | ||
671 | static int pid_smaps_open(struct inode *inode, struct file *file) | 671 | static int pid_smaps_open(struct inode *inode, struct file *file) |
672 | { | 672 | { |
673 | return do_maps_open(inode, file, &proc_pid_smaps_op); | 673 | return do_maps_open(inode, file, &proc_pid_smaps_op); |
674 | } | 674 | } |
675 | 675 | ||
676 | static int tid_smaps_open(struct inode *inode, struct file *file) | 676 | static int tid_smaps_open(struct inode *inode, struct file *file) |
677 | { | 677 | { |
678 | return do_maps_open(inode, file, &proc_tid_smaps_op); | 678 | return do_maps_open(inode, file, &proc_tid_smaps_op); |
679 | } | 679 | } |
680 | 680 | ||
681 | const struct file_operations proc_pid_smaps_operations = { | 681 | const struct file_operations proc_pid_smaps_operations = { |
682 | .open = pid_smaps_open, | 682 | .open = pid_smaps_open, |
683 | .read = seq_read, | 683 | .read = seq_read, |
684 | .llseek = seq_lseek, | 684 | .llseek = seq_lseek, |
685 | .release = seq_release_private, | 685 | .release = seq_release_private, |
686 | }; | 686 | }; |
687 | 687 | ||
688 | const struct file_operations proc_tid_smaps_operations = { | 688 | const struct file_operations proc_tid_smaps_operations = { |
689 | .open = tid_smaps_open, | 689 | .open = tid_smaps_open, |
690 | .read = seq_read, | 690 | .read = seq_read, |
691 | .llseek = seq_lseek, | 691 | .llseek = seq_lseek, |
692 | .release = seq_release_private, | 692 | .release = seq_release_private, |
693 | }; | 693 | }; |
694 | 694 | ||
695 | /* | 695 | /* |
696 | * We do not want to have constant page-shift bits sitting in | 696 | * We do not want to have constant page-shift bits sitting in |
697 | * pagemap entries and are about to reuse them some time soon. | 697 | * pagemap entries and are about to reuse them some time soon. |
698 | * | 698 | * |
699 | * Here's the "migration strategy": | 699 | * Here's the "migration strategy": |
700 | * 1. when the system boots these bits remain what they are, | 700 | * 1. when the system boots these bits remain what they are, |
701 | * but a warning about the future change is printed in the log; | 701 | * but a warning about the future change is printed in the log; |
702 | * 2. once anyone clears soft-dirty bits via the clear_refs file, | 702 | * 2. once anyone clears soft-dirty bits via the clear_refs file, |
703 | * this flag is set to denote that the user is aware of the | 703 | * this flag is set to denote that the user is aware of the |
704 | * new API and that those page-shift bits change their meaning. | 704 | * new API and that those page-shift bits change their meaning. |
705 | * The respective warning is printed in dmesg; | 705 | * The respective warning is printed in dmesg; |
706 | * 3. In a couple of releases we will remove all the mentions | 706 | * 3. In a couple of releases we will remove all the mentions |
707 | * of page-shift in pagemap entries. | 707 | * of page-shift in pagemap entries. |
708 | */ | 708 | */ |
709 | 709 | ||
710 | static bool soft_dirty_cleared __read_mostly; | 710 | static bool soft_dirty_cleared __read_mostly; |
711 | 711 | ||
712 | enum clear_refs_types { | 712 | enum clear_refs_types { |
713 | CLEAR_REFS_ALL = 1, | 713 | CLEAR_REFS_ALL = 1, |
714 | CLEAR_REFS_ANON, | 714 | CLEAR_REFS_ANON, |
715 | CLEAR_REFS_MAPPED, | 715 | CLEAR_REFS_MAPPED, |
716 | CLEAR_REFS_SOFT_DIRTY, | 716 | CLEAR_REFS_SOFT_DIRTY, |
717 | CLEAR_REFS_LAST, | 717 | CLEAR_REFS_LAST, |
718 | }; | 718 | }; |
719 | 719 | ||
720 | struct clear_refs_private { | 720 | struct clear_refs_private { |
721 | struct vm_area_struct *vma; | 721 | struct vm_area_struct *vma; |
722 | enum clear_refs_types type; | 722 | enum clear_refs_types type; |
723 | }; | 723 | }; |
724 | 724 | ||
725 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | 725 | static inline void clear_soft_dirty(struct vm_area_struct *vma, |
726 | unsigned long addr, pte_t *pte) | 726 | unsigned long addr, pte_t *pte) |
727 | { | 727 | { |
728 | #ifdef CONFIG_MEM_SOFT_DIRTY | 728 | #ifdef CONFIG_MEM_SOFT_DIRTY |
729 | /* | 729 | /* |
730 | * The soft-dirty tracker uses #PF-s to catch writes | 730 | * The soft-dirty tracker uses #PF-s to catch writes |
731 | * to pages, so write-protect the pte as well. See the | 731 | * to pages, so write-protect the pte as well. See the |
732 | * Documentation/vm/soft-dirty.txt for full description | 732 | * Documentation/vm/soft-dirty.txt for full description |
733 | * of how soft-dirty works. | 733 | * of how soft-dirty works. |
734 | */ | 734 | */ |
735 | pte_t ptent = *pte; | 735 | pte_t ptent = *pte; |
736 | 736 | ||
737 | if (pte_present(ptent)) { | 737 | if (pte_present(ptent)) { |
738 | ptent = pte_wrprotect(ptent); | 738 | ptent = pte_wrprotect(ptent); |
739 | ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); | 739 | ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); |
740 | } else if (is_swap_pte(ptent)) { | 740 | } else if (is_swap_pte(ptent)) { |
741 | ptent = pte_swp_clear_soft_dirty(ptent); | 741 | ptent = pte_swp_clear_soft_dirty(ptent); |
742 | } else if (pte_file(ptent)) { | 742 | } else if (pte_file(ptent)) { |
743 | ptent = pte_file_clear_soft_dirty(ptent); | 743 | ptent = pte_file_clear_soft_dirty(ptent); |
744 | } | 744 | } |
745 | 745 | ||
746 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 746 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
747 | #endif | 747 | #endif |
748 | } | 748 | } |
749 | 749 | ||
750 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | 750 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, |
751 | unsigned long end, struct mm_walk *walk) | 751 | unsigned long end, struct mm_walk *walk) |
752 | { | 752 | { |
753 | struct clear_refs_private *cp = walk->private; | 753 | struct clear_refs_private *cp = walk->private; |
754 | struct vm_area_struct *vma = cp->vma; | 754 | struct vm_area_struct *vma = cp->vma; |
755 | pte_t *pte, ptent; | 755 | pte_t *pte, ptent; |
756 | spinlock_t *ptl; | 756 | spinlock_t *ptl; |
757 | struct page *page; | 757 | struct page *page; |
758 | 758 | ||
759 | split_huge_page_pmd(vma, addr, pmd); | 759 | split_huge_page_pmd(vma, addr, pmd); |
760 | if (pmd_trans_unstable(pmd)) | 760 | if (pmd_trans_unstable(pmd)) |
761 | return 0; | 761 | return 0; |
762 | 762 | ||
763 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 763 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
764 | for (; addr != end; pte++, addr += PAGE_SIZE) { | 764 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
765 | ptent = *pte; | 765 | ptent = *pte; |
766 | 766 | ||
767 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { | 767 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { |
768 | clear_soft_dirty(vma, addr, pte); | 768 | clear_soft_dirty(vma, addr, pte); |
769 | continue; | 769 | continue; |
770 | } | 770 | } |
771 | 771 | ||
772 | if (!pte_present(ptent)) | 772 | if (!pte_present(ptent)) |
773 | continue; | 773 | continue; |
774 | 774 | ||
775 | page = vm_normal_page(vma, addr, ptent); | 775 | page = vm_normal_page(vma, addr, ptent); |
776 | if (!page) | 776 | if (!page) |
777 | continue; | 777 | continue; |
778 | 778 | ||
779 | /* Clear accessed and referenced bits. */ | 779 | /* Clear accessed and referenced bits. */ |
780 | ptep_test_and_clear_young(vma, addr, pte); | 780 | ptep_test_and_clear_young(vma, addr, pte); |
781 | ClearPageReferenced(page); | 781 | ClearPageReferenced(page); |
782 | } | 782 | } |
783 | pte_unmap_unlock(pte - 1, ptl); | 783 | pte_unmap_unlock(pte - 1, ptl); |
784 | cond_resched(); | 784 | cond_resched(); |
785 | return 0; | 785 | return 0; |
786 | } | 786 | } |
787 | 787 | ||
788 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, | 788 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
789 | size_t count, loff_t *ppos) | 789 | size_t count, loff_t *ppos) |
790 | { | 790 | { |
791 | struct task_struct *task; | 791 | struct task_struct *task; |
792 | char buffer[PROC_NUMBUF]; | 792 | char buffer[PROC_NUMBUF]; |
793 | struct mm_struct *mm; | 793 | struct mm_struct *mm; |
794 | struct vm_area_struct *vma; | 794 | struct vm_area_struct *vma; |
795 | enum clear_refs_types type; | 795 | enum clear_refs_types type; |
796 | int itype; | 796 | int itype; |
797 | int rv; | 797 | int rv; |
798 | 798 | ||
799 | memset(buffer, 0, sizeof(buffer)); | 799 | memset(buffer, 0, sizeof(buffer)); |
800 | if (count > sizeof(buffer) - 1) | 800 | if (count > sizeof(buffer) - 1) |
801 | count = sizeof(buffer) - 1; | 801 | count = sizeof(buffer) - 1; |
802 | if (copy_from_user(buffer, buf, count)) | 802 | if (copy_from_user(buffer, buf, count)) |
803 | return -EFAULT; | 803 | return -EFAULT; |
804 | rv = kstrtoint(strstrip(buffer), 10, &itype); | 804 | rv = kstrtoint(strstrip(buffer), 10, &itype); |
805 | if (rv < 0) | 805 | if (rv < 0) |
806 | return rv; | 806 | return rv; |
807 | type = (enum clear_refs_types)itype; | 807 | type = (enum clear_refs_types)itype; |
808 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) | 808 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) |
809 | return -EINVAL; | 809 | return -EINVAL; |
810 | 810 | ||
811 | if (type == CLEAR_REFS_SOFT_DIRTY) { | 811 | if (type == CLEAR_REFS_SOFT_DIRTY) { |
812 | soft_dirty_cleared = true; | 812 | soft_dirty_cleared = true; |
813 | pr_warn_once("The pagemap bits 55-60 have changed their meaning!" | 813 | pr_warn_once("The pagemap bits 55-60 have changed their meaning!" |
814 | " See the linux/Documentation/vm/pagemap.txt for " | 814 | " See the linux/Documentation/vm/pagemap.txt for " |
815 | "details.\n"); | 815 | "details.\n"); |
816 | } | 816 | } |
817 | 817 | ||
818 | task = get_proc_task(file_inode(file)); | 818 | task = get_proc_task(file_inode(file)); |
819 | if (!task) | 819 | if (!task) |
820 | return -ESRCH; | 820 | return -ESRCH; |
821 | mm = get_task_mm(task); | 821 | mm = get_task_mm(task); |
822 | if (mm) { | 822 | if (mm) { |
823 | struct clear_refs_private cp = { | 823 | struct clear_refs_private cp = { |
824 | .type = type, | 824 | .type = type, |
825 | }; | 825 | }; |
826 | struct mm_walk clear_refs_walk = { | 826 | struct mm_walk clear_refs_walk = { |
827 | .pmd_entry = clear_refs_pte_range, | 827 | .pmd_entry = clear_refs_pte_range, |
828 | .mm = mm, | 828 | .mm = mm, |
829 | .private = &cp, | 829 | .private = &cp, |
830 | }; | 830 | }; |
831 | down_read(&mm->mmap_sem); | 831 | down_read(&mm->mmap_sem); |
832 | if (type == CLEAR_REFS_SOFT_DIRTY) | 832 | if (type == CLEAR_REFS_SOFT_DIRTY) |
833 | mmu_notifier_invalidate_range_start(mm, 0, -1); | 833 | mmu_notifier_invalidate_range_start(mm, 0, -1); |
834 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 834 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
835 | cp.vma = vma; | 835 | cp.vma = vma; |
836 | if (is_vm_hugetlb_page(vma)) | 836 | if (is_vm_hugetlb_page(vma)) |
837 | continue; | 837 | continue; |
838 | /* | 838 | /* |
839 | * Writing 1 to /proc/pid/clear_refs affects all pages. | 839 | * Writing 1 to /proc/pid/clear_refs affects all pages. |
840 | * | 840 | * |
841 | * Writing 2 to /proc/pid/clear_refs only affects | 841 | * Writing 2 to /proc/pid/clear_refs only affects |
842 | * Anonymous pages. | 842 | * Anonymous pages. |
843 | * | 843 | * |
844 | * Writing 3 to /proc/pid/clear_refs only affects file | 844 | * Writing 3 to /proc/pid/clear_refs only affects file |
845 | * mapped pages. | 845 | * mapped pages. |
846 | * | 846 | * |
847 | * Writing 4 to /proc/pid/clear_refs affects all pages. | 847 | * Writing 4 to /proc/pid/clear_refs affects all pages. |
848 | */ | 848 | */ |
849 | if (type == CLEAR_REFS_ANON && vma->vm_file) | 849 | if (type == CLEAR_REFS_ANON && vma->vm_file) |
850 | continue; | 850 | continue; |
851 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) | 851 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) |
852 | continue; | 852 | continue; |
853 | if (type == CLEAR_REFS_SOFT_DIRTY) { | 853 | if (type == CLEAR_REFS_SOFT_DIRTY) { |
854 | if (vma->vm_flags & VM_SOFTDIRTY) | 854 | if (vma->vm_flags & VM_SOFTDIRTY) |
855 | vma->vm_flags &= ~VM_SOFTDIRTY; | 855 | vma->vm_flags &= ~VM_SOFTDIRTY; |
856 | } | 856 | } |
857 | walk_page_range(vma->vm_start, vma->vm_end, | 857 | walk_page_range(vma->vm_start, vma->vm_end, |
858 | &clear_refs_walk); | 858 | &clear_refs_walk); |
859 | } | 859 | } |
860 | if (type == CLEAR_REFS_SOFT_DIRTY) | 860 | if (type == CLEAR_REFS_SOFT_DIRTY) |
861 | mmu_notifier_invalidate_range_end(mm, 0, -1); | 861 | mmu_notifier_invalidate_range_end(mm, 0, -1); |
862 | flush_tlb_mm(mm); | 862 | flush_tlb_mm(mm); |
863 | up_read(&mm->mmap_sem); | 863 | up_read(&mm->mmap_sem); |
864 | mmput(mm); | 864 | mmput(mm); |
865 | } | 865 | } |
866 | put_task_struct(task); | 866 | put_task_struct(task); |
867 | 867 | ||
868 | return count; | 868 | return count; |
869 | } | 869 | } |
870 | 870 | ||
871 | const struct file_operations proc_clear_refs_operations = { | 871 | const struct file_operations proc_clear_refs_operations = { |
872 | .write = clear_refs_write, | 872 | .write = clear_refs_write, |
873 | .llseek = noop_llseek, | 873 | .llseek = noop_llseek, |
874 | }; | 874 | }; |
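A minimal userspace sketch of driving this interface (editorial illustration; the value meanings follow the comment inside clear_refs_write() above, and "4" only has an effect with CONFIG_MEM_SOFT_DIRTY enabled):

#include <stdio.h>
#include <unistd.h>

/* Write one of the documented values (1-4) to /proc/<pid>/clear_refs. */
static int clear_refs(pid_t pid, int type)
{
        char path[64];
        FILE *f;
        int ret;

        snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
        f = fopen(path, "w");
        if (!f)
                return -1;
        ret = fprintf(f, "%d\n", type) < 0 ? -1 : 0;
        fclose(f);
        return ret;
}

int main(void)
{
        /* 4 == CLEAR_REFS_SOFT_DIRTY: mark all of our pages as "clean" */
        return clear_refs(getpid(), 4) ? 1 : 0;
}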
875 | 875 | ||
876 | typedef struct { | 876 | typedef struct { |
877 | u64 pme; | 877 | u64 pme; |
878 | } pagemap_entry_t; | 878 | } pagemap_entry_t; |
879 | 879 | ||
880 | struct pagemapread { | 880 | struct pagemapread { |
881 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ | 881 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ |
882 | pagemap_entry_t *buffer; | 882 | pagemap_entry_t *buffer; |
883 | bool v2; | 883 | bool v2; |
884 | }; | 884 | }; |
885 | 885 | ||
886 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) | 886 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) |
887 | #define PAGEMAP_WALK_MASK (PMD_MASK) | 887 | #define PAGEMAP_WALK_MASK (PMD_MASK) |
888 | 888 | ||
889 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) | 889 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) |
890 | #define PM_STATUS_BITS 3 | 890 | #define PM_STATUS_BITS 3 |
891 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | 891 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) |
892 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) | 892 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) |
893 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) | 893 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) |
894 | #define PM_PSHIFT_BITS 6 | 894 | #define PM_PSHIFT_BITS 6 |
895 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) | 895 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) |
896 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) | 896 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) |
897 | #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) | 897 | #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) |
898 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) | 898 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) |
899 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) | 899 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) |
900 | /* in "new" pagemap pshift bits are occupied with more status bits */ | 900 | /* in "new" pagemap pshift bits are occupied with more status bits */ |
901 | #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) | 901 | #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) |
902 | 902 | ||
903 | #define __PM_SOFT_DIRTY (1LL) | 903 | #define __PM_SOFT_DIRTY (1LL) |
904 | #define PM_PRESENT PM_STATUS(4LL) | 904 | #define PM_PRESENT PM_STATUS(4LL) |
905 | #define PM_SWAP PM_STATUS(2LL) | 905 | #define PM_SWAP PM_STATUS(2LL) |
906 | #define PM_FILE PM_STATUS(1LL) | 906 | #define PM_FILE PM_STATUS(1LL) |
907 | #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) | 907 | #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) |
908 | #define PM_END_OF_BUFFER 1 | 908 | #define PM_END_OF_BUFFER 1 |
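Put together, these macros define the 64-bit entry layout a userspace reader sees in /proc/<pid>/pagemap. A small illustrative decoder (editorial sketch, assuming the "new" layout in which bit 55 carries the soft-dirty flag, as described in Documentation/vm/pagemap.txt):

#include <stdio.h>
#include <stdint.h>

/* bit 63 = present, 62 = swapped, 61 = file/shared-anon, 55 = soft-dirty,
 * bits 0-54 = page frame number for present pages */
static void decode_pme(uint64_t pme)
{
        printf("present=%d swapped=%d file=%d soft-dirty=%d pfn=%llu\n",
               (int)(pme >> 63 & 1), (int)(pme >> 62 & 1),
               (int)(pme >> 61 & 1), (int)(pme >> 55 & 1),
               (unsigned long long)(pme & ((1ULL << 55) - 1)));
}

int main(void)
{
        /* present, soft-dirty, pfn 0x1234 (made-up example value) */
        decode_pme((1ULL << 63) | (1ULL << 55) | 0x1234);
        return 0;
}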
909 | 909 | ||
910 | static inline pagemap_entry_t make_pme(u64 val) | 910 | static inline pagemap_entry_t make_pme(u64 val) |
911 | { | 911 | { |
912 | return (pagemap_entry_t) { .pme = val }; | 912 | return (pagemap_entry_t) { .pme = val }; |
913 | } | 913 | } |
914 | 914 | ||
915 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, | 915 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, |
916 | struct pagemapread *pm) | 916 | struct pagemapread *pm) |
917 | { | 917 | { |
918 | pm->buffer[pm->pos++] = *pme; | 918 | pm->buffer[pm->pos++] = *pme; |
919 | if (pm->pos >= pm->len) | 919 | if (pm->pos >= pm->len) |
920 | return PM_END_OF_BUFFER; | 920 | return PM_END_OF_BUFFER; |
921 | return 0; | 921 | return 0; |
922 | } | 922 | } |
923 | 923 | ||
924 | static int pagemap_pte_hole(unsigned long start, unsigned long end, | 924 | static int pagemap_pte_hole(unsigned long start, unsigned long end, |
925 | struct mm_walk *walk) | 925 | struct mm_walk *walk) |
926 | { | 926 | { |
927 | struct pagemapread *pm = walk->private; | 927 | struct pagemapread *pm = walk->private; |
928 | unsigned long addr = start; | 928 | unsigned long addr = start; |
929 | int err = 0; | 929 | int err = 0; |
930 | 930 | ||
931 | while (addr < end) { | 931 | while (addr < end) { |
932 | struct vm_area_struct *vma = find_vma(walk->mm, addr); | 932 | struct vm_area_struct *vma = find_vma(walk->mm, addr); |
933 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 933 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); |
934 | unsigned long vm_end; | 934 | /* End of address space hole, which we mark as non-present. */ |
935 | unsigned long hole_end; | ||
935 | 936 | ||
936 | if (!vma) { | 937 | if (vma) |
937 | vm_end = end; | 938 | hole_end = min(end, vma->vm_start); |
938 | } else { | 939 | else |
939 | vm_end = min(end, vma->vm_end); | 940 | hole_end = end; |
940 | if (vma->vm_flags & VM_SOFTDIRTY) | 941 | |
941 | pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); | 942 | for (; addr < hole_end; addr += PAGE_SIZE) { |
943 | err = add_to_pagemap(addr, &pme, pm); | ||
944 | if (err) | ||
945 | goto out; | ||
942 | } | 946 | } |
943 | 947 | ||
944 | for (; addr < vm_end; addr += PAGE_SIZE) { | 948 | if (!vma) |
949 | break; | ||
950 | |||
951 | /* Addresses in the VMA. */ | ||
952 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
953 | pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); | ||
954 | for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { | ||
945 | err = add_to_pagemap(addr, &pme, pm); | 955 | err = add_to_pagemap(addr, &pme, pm); |
946 | if (err) | 956 | if (err) |
947 | goto out; | 957 | goto out; |
948 | } | 958 | } |
949 | } | 959 | } |
950 | |||
951 | out: | 960 | out: |
952 | return err; | 961 | return err; |
953 | } | 962 | } |
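The behavioural effect of the rewritten loop above, stated for illustration: for a hole that sits below a VMA with VM_SOFTDIRTY set, the hole addresses are now reported as plain non-present entries (bit 55 clear), and only addresses from the VMA's start onward inherit the soft-dirty status, whereas the old loop applied the VMA's flag to the entire range up to vma->vm_end, hole included.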
954 | 963 | ||
955 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 964 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
956 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 965 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
957 | { | 966 | { |
958 | u64 frame, flags; | 967 | u64 frame, flags; |
959 | struct page *page = NULL; | 968 | struct page *page = NULL; |
960 | int flags2 = 0; | 969 | int flags2 = 0; |
961 | 970 | ||
962 | if (pte_present(pte)) { | 971 | if (pte_present(pte)) { |
963 | frame = pte_pfn(pte); | 972 | frame = pte_pfn(pte); |
964 | flags = PM_PRESENT; | 973 | flags = PM_PRESENT; |
965 | page = vm_normal_page(vma, addr, pte); | 974 | page = vm_normal_page(vma, addr, pte); |
966 | if (pte_soft_dirty(pte)) | 975 | if (pte_soft_dirty(pte)) |
967 | flags2 |= __PM_SOFT_DIRTY; | 976 | flags2 |= __PM_SOFT_DIRTY; |
968 | } else if (is_swap_pte(pte)) { | 977 | } else if (is_swap_pte(pte)) { |
969 | swp_entry_t entry; | 978 | swp_entry_t entry; |
970 | if (pte_swp_soft_dirty(pte)) | 979 | if (pte_swp_soft_dirty(pte)) |
971 | flags2 |= __PM_SOFT_DIRTY; | 980 | flags2 |= __PM_SOFT_DIRTY; |
972 | entry = pte_to_swp_entry(pte); | 981 | entry = pte_to_swp_entry(pte); |
973 | frame = swp_type(entry) | | 982 | frame = swp_type(entry) | |
974 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); | 983 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); |
975 | flags = PM_SWAP; | 984 | flags = PM_SWAP; |
976 | if (is_migration_entry(entry)) | 985 | if (is_migration_entry(entry)) |
977 | page = migration_entry_to_page(entry); | 986 | page = migration_entry_to_page(entry); |
978 | } else { | 987 | } else { |
979 | if (vma->vm_flags & VM_SOFTDIRTY) | 988 | if (vma->vm_flags & VM_SOFTDIRTY) |
980 | flags2 |= __PM_SOFT_DIRTY; | 989 | flags2 |= __PM_SOFT_DIRTY; |
981 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | 990 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); |
982 | return; | 991 | return; |
983 | } | 992 | } |
984 | 993 | ||
985 | if (page && !PageAnon(page)) | 994 | if (page && !PageAnon(page)) |
986 | flags |= PM_FILE; | 995 | flags |= PM_FILE; |
987 | if ((vma->vm_flags & VM_SOFTDIRTY)) | 996 | if ((vma->vm_flags & VM_SOFTDIRTY)) |
988 | flags2 |= __PM_SOFT_DIRTY; | 997 | flags2 |= __PM_SOFT_DIRTY; |
989 | 998 | ||
990 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); | 999 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); |
991 | } | 1000 | } |
992 | 1001 | ||
993 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 1002 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
994 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1003 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
995 | pmd_t pmd, int offset, int pmd_flags2) | 1004 | pmd_t pmd, int offset, int pmd_flags2) |
996 | { | 1005 | { |
997 | /* | 1006 | /* |
998 | * Currently pmd for thp is always present because thp can not be | 1007 | * Currently pmd for thp is always present because thp can not be |
999 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) | 1008 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) |
1000 | * This if-check is just to prepare for future implementation. | 1009 | * This if-check is just to prepare for future implementation. |
1001 | */ | 1010 | */ |
1002 | if (pmd_present(pmd)) | 1011 | if (pmd_present(pmd)) |
1003 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | 1012 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) |
1004 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); | 1013 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); |
1005 | else | 1014 | else |
1006 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); | 1015 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); |
1007 | } | 1016 | } |
1008 | #else | 1017 | #else |
1009 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1018 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
1010 | pmd_t pmd, int offset, int pmd_flags2) | 1019 | pmd_t pmd, int offset, int pmd_flags2) |
1011 | { | 1020 | { |
1012 | } | 1021 | } |
1013 | #endif | 1022 | #endif |
1014 | 1023 | ||
1015 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 1024 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
1016 | struct mm_walk *walk) | 1025 | struct mm_walk *walk) |
1017 | { | 1026 | { |
1018 | struct vm_area_struct *vma; | 1027 | struct vm_area_struct *vma; |
1019 | struct pagemapread *pm = walk->private; | 1028 | struct pagemapread *pm = walk->private; |
1020 | spinlock_t *ptl; | 1029 | spinlock_t *ptl; |
1021 | pte_t *pte; | 1030 | pte_t *pte; |
1022 | int err = 0; | 1031 | int err = 0; |
1023 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 1032 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); |
1024 | 1033 | ||
1025 | /* find the first VMA at or above 'addr' */ | 1034 | /* find the first VMA at or above 'addr' */ |
1026 | vma = find_vma(walk->mm, addr); | 1035 | vma = find_vma(walk->mm, addr); |
1027 | if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1036 | if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1028 | int pmd_flags2; | 1037 | int pmd_flags2; |
1029 | 1038 | ||
1030 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) | 1039 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) |
1031 | pmd_flags2 = __PM_SOFT_DIRTY; | 1040 | pmd_flags2 = __PM_SOFT_DIRTY; |
1032 | else | 1041 | else |
1033 | pmd_flags2 = 0; | 1042 | pmd_flags2 = 0; |
1034 | 1043 | ||
1035 | for (; addr != end; addr += PAGE_SIZE) { | 1044 | for (; addr != end; addr += PAGE_SIZE) { |
1036 | unsigned long offset; | 1045 | unsigned long offset; |
1037 | 1046 | ||
1038 | offset = (addr & ~PAGEMAP_WALK_MASK) >> | 1047 | offset = (addr & ~PAGEMAP_WALK_MASK) >> |
1039 | PAGE_SHIFT; | 1048 | PAGE_SHIFT; |
1040 | thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); | 1049 | thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); |
1041 | err = add_to_pagemap(addr, &pme, pm); | 1050 | err = add_to_pagemap(addr, &pme, pm); |
1042 | if (err) | 1051 | if (err) |
1043 | break; | 1052 | break; |
1044 | } | 1053 | } |
1045 | spin_unlock(ptl); | 1054 | spin_unlock(ptl); |
1046 | return err; | 1055 | return err; |
1047 | } | 1056 | } |
1048 | 1057 | ||
1049 | if (pmd_trans_unstable(pmd)) | 1058 | if (pmd_trans_unstable(pmd)) |
1050 | return 0; | 1059 | return 0; |
1051 | for (; addr != end; addr += PAGE_SIZE) { | 1060 | for (; addr != end; addr += PAGE_SIZE) { |
1052 | int flags2; | 1061 | int flags2; |
1053 | 1062 | ||
1054 | /* check to see if we've left 'vma' behind | 1063 | /* check to see if we've left 'vma' behind |
1055 | * and need a new, higher one */ | 1064 | * and need a new, higher one */ |
1056 | if (vma && (addr >= vma->vm_end)) { | 1065 | if (vma && (addr >= vma->vm_end)) { |
1057 | vma = find_vma(walk->mm, addr); | 1066 | vma = find_vma(walk->mm, addr); |
1058 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | 1067 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) |
1059 | flags2 = __PM_SOFT_DIRTY; | 1068 | flags2 = __PM_SOFT_DIRTY; |
1060 | else | 1069 | else |
1061 | flags2 = 0; | 1070 | flags2 = 0; |
1062 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | 1071 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); |
1063 | } | 1072 | } |
1064 | 1073 | ||
1065 | /* check that 'vma' actually covers this address, | 1074 | /* check that 'vma' actually covers this address, |
1066 | * and that it isn't a huge page vma */ | 1075 | * and that it isn't a huge page vma */ |
1067 | if (vma && (vma->vm_start <= addr) && | 1076 | if (vma && (vma->vm_start <= addr) && |
1068 | !is_vm_hugetlb_page(vma)) { | 1077 | !is_vm_hugetlb_page(vma)) { |
1069 | pte = pte_offset_map(pmd, addr); | 1078 | pte = pte_offset_map(pmd, addr); |
1070 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); | 1079 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); |
1071 | /* unmap before userspace copy */ | 1080 | /* unmap before userspace copy */ |
1072 | pte_unmap(pte); | 1081 | pte_unmap(pte); |
1073 | } | 1082 | } |
1074 | err = add_to_pagemap(addr, &pme, pm); | 1083 | err = add_to_pagemap(addr, &pme, pm); |
1075 | if (err) | 1084 | if (err) |
1076 | return err; | 1085 | return err; |
1077 | } | 1086 | } |
1078 | 1087 | ||
1079 | cond_resched(); | 1088 | cond_resched(); |
1080 | 1089 | ||
1081 | return err; | 1090 | return err; |
1082 | } | 1091 | } |
1083 | 1092 | ||
1084 | #ifdef CONFIG_HUGETLB_PAGE | 1093 | #ifdef CONFIG_HUGETLB_PAGE |
1085 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1094 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
1086 | pte_t pte, int offset, int flags2) | 1095 | pte_t pte, int offset, int flags2) |
1087 | { | 1096 | { |
1088 | if (pte_present(pte)) | 1097 | if (pte_present(pte)) |
1089 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | | 1098 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | |
1090 | PM_STATUS2(pm->v2, flags2) | | 1099 | PM_STATUS2(pm->v2, flags2) | |
1091 | PM_PRESENT); | 1100 | PM_PRESENT); |
1092 | else | 1101 | else |
1093 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | | 1102 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | |
1094 | PM_STATUS2(pm->v2, flags2)); | 1103 | PM_STATUS2(pm->v2, flags2)); |
1095 | } | 1104 | } |
1096 | 1105 | ||
1097 | /* This function walks within one hugetlb entry in the single call */ | 1106 | /* This function walks within one hugetlb entry in the single call */ |
1098 | static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | 1107 | static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, |
1099 | unsigned long addr, unsigned long end, | 1108 | unsigned long addr, unsigned long end, |
1100 | struct mm_walk *walk) | 1109 | struct mm_walk *walk) |
1101 | { | 1110 | { |
1102 | struct pagemapread *pm = walk->private; | 1111 | struct pagemapread *pm = walk->private; |
1103 | struct vm_area_struct *vma; | 1112 | struct vm_area_struct *vma; |
1104 | int err = 0; | 1113 | int err = 0; |
1105 | int flags2; | 1114 | int flags2; |
1106 | pagemap_entry_t pme; | 1115 | pagemap_entry_t pme; |
1107 | 1116 | ||
1108 | vma = find_vma(walk->mm, addr); | 1117 | vma = find_vma(walk->mm, addr); |
1109 | WARN_ON_ONCE(!vma); | 1118 | WARN_ON_ONCE(!vma); |
1110 | 1119 | ||
1111 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | 1120 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) |
1112 | flags2 = __PM_SOFT_DIRTY; | 1121 | flags2 = __PM_SOFT_DIRTY; |
1113 | else | 1122 | else |
1114 | flags2 = 0; | 1123 | flags2 = 0; |
1115 | 1124 | ||
1116 | for (; addr != end; addr += PAGE_SIZE) { | 1125 | for (; addr != end; addr += PAGE_SIZE) { |
1117 | int offset = (addr & ~hmask) >> PAGE_SHIFT; | 1126 | int offset = (addr & ~hmask) >> PAGE_SHIFT; |
1118 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); | 1127 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); |
1119 | err = add_to_pagemap(addr, &pme, pm); | 1128 | err = add_to_pagemap(addr, &pme, pm); |
1120 | if (err) | 1129 | if (err) |
1121 | return err; | 1130 | return err; |
1122 | } | 1131 | } |
1123 | 1132 | ||
1124 | cond_resched(); | 1133 | cond_resched(); |
1125 | 1134 | ||
1126 | return err; | 1135 | return err; |
1127 | } | 1136 | } |
1128 | #endif /* HUGETLB_PAGE */ | 1137 | #endif /* HUGETLB_PAGE */ |
1129 | 1138 | ||
1130 | /* | 1139 | /* |
1131 | * /proc/pid/pagemap - an array mapping virtual pages to pfns | 1140 | * /proc/pid/pagemap - an array mapping virtual pages to pfns |
1132 | * | 1141 | * |
1133 | * For each page in the address space, this file contains one 64-bit entry | 1142 | * For each page in the address space, this file contains one 64-bit entry |
1134 | * consisting of the following: | 1143 | * consisting of the following: |
1135 | * | 1144 | * |
1136 | * Bits 0-54 page frame number (PFN) if present | 1145 | * Bits 0-54 page frame number (PFN) if present |
1137 | * Bits 0-4 swap type if swapped | 1146 | * Bits 0-4 swap type if swapped |
1138 | * Bits 5-54 swap offset if swapped | 1147 | * Bits 5-54 swap offset if swapped |
1139 | * Bits 55-60 page shift (page size = 1<<page shift) | 1148 | * Bits 55-60 page shift (page size = 1<<page shift) |
1140 | * Bit 61 page is file-page or shared-anon | 1149 | * Bit 61 page is file-page or shared-anon |
1141 | * Bit 62 page swapped | 1150 | * Bit 62 page swapped |
1142 | * Bit 63 page present | 1151 | * Bit 63 page present |
1143 | * | 1152 | * |
1144 | * If the page is not present but in swap, then the PFN contains an | 1153 | * If the page is not present but in swap, then the PFN contains an |
1145 | * encoding of the swap file number and the page's offset into the | 1154 | * encoding of the swap file number and the page's offset into the |
1146 | * swap. Unmapped pages return a null PFN. This allows determining | 1155 | * swap. Unmapped pages return a null PFN. This allows determining |
1147 | * precisely which pages are mapped (or in swap) and comparing mapped | 1156 | * precisely which pages are mapped (or in swap) and comparing mapped |
1148 | * pages between processes. | 1157 | * pages between processes. |
1149 | * | 1158 | * |
1150 | * Efficient users of this interface will use /proc/pid/maps to | 1159 | * Efficient users of this interface will use /proc/pid/maps to |
1151 | * determine which areas of memory are actually mapped and llseek to | 1160 | * determine which areas of memory are actually mapped and llseek to |
1152 | * skip over unmapped regions. | 1161 | * skip over unmapped regions. |
1153 | */ | 1162 | */ |
1154 | static ssize_t pagemap_read(struct file *file, char __user *buf, | 1163 | static ssize_t pagemap_read(struct file *file, char __user *buf, |
1155 | size_t count, loff_t *ppos) | 1164 | size_t count, loff_t *ppos) |
1156 | { | 1165 | { |
1157 | struct task_struct *task = get_proc_task(file_inode(file)); | 1166 | struct task_struct *task = get_proc_task(file_inode(file)); |
1158 | struct mm_struct *mm; | 1167 | struct mm_struct *mm; |
1159 | struct pagemapread pm; | 1168 | struct pagemapread pm; |
1160 | int ret = -ESRCH; | 1169 | int ret = -ESRCH; |
1161 | struct mm_walk pagemap_walk = {}; | 1170 | struct mm_walk pagemap_walk = {}; |
1162 | unsigned long src; | 1171 | unsigned long src; |
1163 | unsigned long svpfn; | 1172 | unsigned long svpfn; |
1164 | unsigned long start_vaddr; | 1173 | unsigned long start_vaddr; |
1165 | unsigned long end_vaddr; | 1174 | unsigned long end_vaddr; |
1166 | int copied = 0; | 1175 | int copied = 0; |
1167 | 1176 | ||
1168 | if (!task) | 1177 | if (!task) |
1169 | goto out; | 1178 | goto out; |
1170 | 1179 | ||
1171 | ret = -EINVAL; | 1180 | ret = -EINVAL; |
1172 | /* file position must be aligned */ | 1181 | /* file position must be aligned */ |
1173 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) | 1182 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) |
1174 | goto out_task; | 1183 | goto out_task; |
1175 | 1184 | ||
1176 | ret = 0; | 1185 | ret = 0; |
1177 | if (!count) | 1186 | if (!count) |
1178 | goto out_task; | 1187 | goto out_task; |
1179 | 1188 | ||
1180 | pm.v2 = soft_dirty_cleared; | 1189 | pm.v2 = soft_dirty_cleared; |
1181 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); | 1190 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); |
1182 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); | 1191 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); |
1183 | ret = -ENOMEM; | 1192 | ret = -ENOMEM; |
1184 | if (!pm.buffer) | 1193 | if (!pm.buffer) |
1185 | goto out_task; | 1194 | goto out_task; |
1186 | 1195 | ||
1187 | mm = mm_access(task, PTRACE_MODE_READ); | 1196 | mm = mm_access(task, PTRACE_MODE_READ); |
1188 | ret = PTR_ERR(mm); | 1197 | ret = PTR_ERR(mm); |
1189 | if (!mm || IS_ERR(mm)) | 1198 | if (!mm || IS_ERR(mm)) |
1190 | goto out_free; | 1199 | goto out_free; |
1191 | 1200 | ||
1192 | pagemap_walk.pmd_entry = pagemap_pte_range; | 1201 | pagemap_walk.pmd_entry = pagemap_pte_range; |
1193 | pagemap_walk.pte_hole = pagemap_pte_hole; | 1202 | pagemap_walk.pte_hole = pagemap_pte_hole; |
1194 | #ifdef CONFIG_HUGETLB_PAGE | 1203 | #ifdef CONFIG_HUGETLB_PAGE |
1195 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; | 1204 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; |
1196 | #endif | 1205 | #endif |
1197 | pagemap_walk.mm = mm; | 1206 | pagemap_walk.mm = mm; |
1198 | pagemap_walk.private = &pm; | 1207 | pagemap_walk.private = &pm; |
1199 | 1208 | ||
1200 | src = *ppos; | 1209 | src = *ppos; |
1201 | svpfn = src / PM_ENTRY_BYTES; | 1210 | svpfn = src / PM_ENTRY_BYTES; |
1202 | start_vaddr = svpfn << PAGE_SHIFT; | 1211 | start_vaddr = svpfn << PAGE_SHIFT; |
1203 | end_vaddr = TASK_SIZE_OF(task); | 1212 | end_vaddr = TASK_SIZE_OF(task); |
1204 | 1213 | ||
1205 | /* watch out for wraparound */ | 1214 | /* watch out for wraparound */ |
1206 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) | 1215 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) |
1207 | start_vaddr = end_vaddr; | 1216 | start_vaddr = end_vaddr; |
1208 | 1217 | ||
1209 | /* | 1218 | /* |
1210 | * The odds are that this will stop walking way | 1219 | * The odds are that this will stop walking way |
1211 | * before end_vaddr, because the length of the | 1220 | * before end_vaddr, because the length of the |
1212 | * user buffer is tracked in "pm", and the walk | 1221 | * user buffer is tracked in "pm", and the walk |
1213 | * will stop when we hit the end of the buffer. | 1222 | * will stop when we hit the end of the buffer. |
1214 | */ | 1223 | */ |
1215 | ret = 0; | 1224 | ret = 0; |
1216 | while (count && (start_vaddr < end_vaddr)) { | 1225 | while (count && (start_vaddr < end_vaddr)) { |
1217 | int len; | 1226 | int len; |
1218 | unsigned long end; | 1227 | unsigned long end; |
1219 | 1228 | ||
1220 | pm.pos = 0; | 1229 | pm.pos = 0; |
1221 | end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; | 1230 | end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; |
1222 | /* overflow ? */ | 1231 | /* overflow ? */ |
1223 | if (end < start_vaddr || end > end_vaddr) | 1232 | if (end < start_vaddr || end > end_vaddr) |
1224 | end = end_vaddr; | 1233 | end = end_vaddr; |
1225 | down_read(&mm->mmap_sem); | 1234 | down_read(&mm->mmap_sem); |
1226 | ret = walk_page_range(start_vaddr, end, &pagemap_walk); | 1235 | ret = walk_page_range(start_vaddr, end, &pagemap_walk); |
1227 | up_read(&mm->mmap_sem); | 1236 | up_read(&mm->mmap_sem); |
1228 | start_vaddr = end; | 1237 | start_vaddr = end; |
1229 | 1238 | ||
1230 | len = min(count, PM_ENTRY_BYTES * pm.pos); | 1239 | len = min(count, PM_ENTRY_BYTES * pm.pos); |
1231 | if (copy_to_user(buf, pm.buffer, len)) { | 1240 | if (copy_to_user(buf, pm.buffer, len)) { |
1232 | ret = -EFAULT; | 1241 | ret = -EFAULT; |
1233 | goto out_mm; | 1242 | goto out_mm; |
1234 | } | 1243 | } |
1235 | copied += len; | 1244 | copied += len; |
1236 | buf += len; | 1245 | buf += len; |
1237 | count -= len; | 1246 | count -= len; |
1238 | } | 1247 | } |
1239 | *ppos += copied; | 1248 | *ppos += copied; |
1240 | if (!ret || ret == PM_END_OF_BUFFER) | 1249 | if (!ret || ret == PM_END_OF_BUFFER) |
1241 | ret = copied; | 1250 | ret = copied; |
1242 | 1251 | ||
1243 | out_mm: | 1252 | out_mm: |
1244 | mmput(mm); | 1253 | mmput(mm); |
1245 | out_free: | 1254 | out_free: |
1246 | kfree(pm.buffer); | 1255 | kfree(pm.buffer); |
1247 | out_task: | 1256 | out_task: |
1248 | put_task_struct(task); | 1257 | put_task_struct(task); |
1249 | out: | 1258 | out: |
1250 | return ret; | 1259 | return ret; |
1251 | } | 1260 | } |
1252 | 1261 | ||
1253 | static int pagemap_open(struct inode *inode, struct file *file) | 1262 | static int pagemap_open(struct inode *inode, struct file *file) |
1254 | { | 1263 | { |
1255 | pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " | 1264 | pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " |
1256 | "to stop being page-shift some time soon. See the " | 1265 | "to stop being page-shift some time soon. See the " |
1257 | "linux/Documentation/vm/pagemap.txt for details.\n"); | 1266 | "linux/Documentation/vm/pagemap.txt for details.\n"); |
1258 | return 0; | 1267 | return 0; |
1259 | } | 1268 | } |
1260 | 1269 | ||
1261 | const struct file_operations proc_pagemap_operations = { | 1270 | const struct file_operations proc_pagemap_operations = { |
1262 | .llseek = mem_lseek, /* borrow this */ | 1271 | .llseek = mem_lseek, /* borrow this */ |
1263 | .read = pagemap_read, | 1272 | .read = pagemap_read, |
1264 | .open = pagemap_open, | 1273 | .open = pagemap_open, |
1265 | }; | 1274 | }; |
1266 | #endif /* CONFIG_PROC_PAGE_MONITOR */ | 1275 | #endif /* CONFIG_PROC_PAGE_MONITOR */ |
1267 | 1276 | ||
1268 | #ifdef CONFIG_NUMA | 1277 | #ifdef CONFIG_NUMA |
1269 | 1278 | ||
1270 | struct numa_maps { | 1279 | struct numa_maps { |
1271 | struct vm_area_struct *vma; | 1280 | struct vm_area_struct *vma; |
1272 | unsigned long pages; | 1281 | unsigned long pages; |
1273 | unsigned long anon; | 1282 | unsigned long anon; |
1274 | unsigned long active; | 1283 | unsigned long active; |
1275 | unsigned long writeback; | 1284 | unsigned long writeback; |
1276 | unsigned long mapcount_max; | 1285 | unsigned long mapcount_max; |
1277 | unsigned long dirty; | 1286 | unsigned long dirty; |
1278 | unsigned long swapcache; | 1287 | unsigned long swapcache; |
1279 | unsigned long node[MAX_NUMNODES]; | 1288 | unsigned long node[MAX_NUMNODES]; |
1280 | }; | 1289 | }; |
1281 | 1290 | ||
1282 | struct numa_maps_private { | 1291 | struct numa_maps_private { |
1283 | struct proc_maps_private proc_maps; | 1292 | struct proc_maps_private proc_maps; |
1284 | struct numa_maps md; | 1293 | struct numa_maps md; |
1285 | }; | 1294 | }; |
1286 | 1295 | ||
1287 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, | 1296 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, |
1288 | unsigned long nr_pages) | 1297 | unsigned long nr_pages) |
1289 | { | 1298 | { |
1290 | int count = page_mapcount(page); | 1299 | int count = page_mapcount(page); |
1291 | 1300 | ||
1292 | md->pages += nr_pages; | 1301 | md->pages += nr_pages; |
1293 | if (pte_dirty || PageDirty(page)) | 1302 | if (pte_dirty || PageDirty(page)) |
1294 | md->dirty += nr_pages; | 1303 | md->dirty += nr_pages; |
1295 | 1304 | ||
1296 | if (PageSwapCache(page)) | 1305 | if (PageSwapCache(page)) |
1297 | md->swapcache += nr_pages; | 1306 | md->swapcache += nr_pages; |
1298 | 1307 | ||
1299 | if (PageActive(page) || PageUnevictable(page)) | 1308 | if (PageActive(page) || PageUnevictable(page)) |
1300 | md->active += nr_pages; | 1309 | md->active += nr_pages; |
1301 | 1310 | ||
1302 | if (PageWriteback(page)) | 1311 | if (PageWriteback(page)) |
1303 | md->writeback += nr_pages; | 1312 | md->writeback += nr_pages; |
1304 | 1313 | ||
1305 | if (PageAnon(page)) | 1314 | if (PageAnon(page)) |
1306 | md->anon += nr_pages; | 1315 | md->anon += nr_pages; |
1307 | 1316 | ||
1308 | if (count > md->mapcount_max) | 1317 | if (count > md->mapcount_max) |
1309 | md->mapcount_max = count; | 1318 | md->mapcount_max = count; |
1310 | 1319 | ||
1311 | md->node[page_to_nid(page)] += nr_pages; | 1320 | md->node[page_to_nid(page)] += nr_pages; |
1312 | } | 1321 | } |
1313 | 1322 | ||
1314 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | 1323 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, |
1315 | unsigned long addr) | 1324 | unsigned long addr) |
1316 | { | 1325 | { |
1317 | struct page *page; | 1326 | struct page *page; |
1318 | int nid; | 1327 | int nid; |
1319 | 1328 | ||
1320 | if (!pte_present(pte)) | 1329 | if (!pte_present(pte)) |
1321 | return NULL; | 1330 | return NULL; |
1322 | 1331 | ||
1323 | page = vm_normal_page(vma, addr, pte); | 1332 | page = vm_normal_page(vma, addr, pte); |
1324 | if (!page) | 1333 | if (!page) |
1325 | return NULL; | 1334 | return NULL; |
1326 | 1335 | ||
1327 | if (PageReserved(page)) | 1336 | if (PageReserved(page)) |
1328 | return NULL; | 1337 | return NULL; |
1329 | 1338 | ||
1330 | nid = page_to_nid(page); | 1339 | nid = page_to_nid(page); |
1331 | if (!node_isset(nid, node_states[N_MEMORY])) | 1340 | if (!node_isset(nid, node_states[N_MEMORY])) |
1332 | return NULL; | 1341 | return NULL; |
1333 | 1342 | ||
1334 | return page; | 1343 | return page; |
1335 | } | 1344 | } |
1336 | 1345 | ||
1337 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | 1346 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, |
1338 | unsigned long end, struct mm_walk *walk) | 1347 | unsigned long end, struct mm_walk *walk) |
1339 | { | 1348 | { |
1340 | struct numa_maps *md; | 1349 | struct numa_maps *md; |
1341 | spinlock_t *ptl; | 1350 | spinlock_t *ptl; |
1342 | pte_t *orig_pte; | 1351 | pte_t *orig_pte; |
1343 | pte_t *pte; | 1352 | pte_t *pte; |
1344 | 1353 | ||
1345 | md = walk->private; | 1354 | md = walk->private; |
1346 | 1355 | ||
1347 | if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { | 1356 | if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { |
1348 | pte_t huge_pte = *(pte_t *)pmd; | 1357 | pte_t huge_pte = *(pte_t *)pmd; |
1349 | struct page *page; | 1358 | struct page *page; |
1350 | 1359 | ||
1351 | page = can_gather_numa_stats(huge_pte, md->vma, addr); | 1360 | page = can_gather_numa_stats(huge_pte, md->vma, addr); |
1352 | if (page) | 1361 | if (page) |
1353 | gather_stats(page, md, pte_dirty(huge_pte), | 1362 | gather_stats(page, md, pte_dirty(huge_pte), |
1354 | HPAGE_PMD_SIZE/PAGE_SIZE); | 1363 | HPAGE_PMD_SIZE/PAGE_SIZE); |
1355 | spin_unlock(ptl); | 1364 | spin_unlock(ptl); |
1356 | return 0; | 1365 | return 0; |
1357 | } | 1366 | } |
1358 | 1367 | ||
1359 | if (pmd_trans_unstable(pmd)) | 1368 | if (pmd_trans_unstable(pmd)) |
1360 | return 0; | 1369 | return 0; |
1361 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 1370 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
1362 | do { | 1371 | do { |
1363 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | 1372 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); |
1364 | if (!page) | 1373 | if (!page) |
1365 | continue; | 1374 | continue; |
1366 | gather_stats(page, md, pte_dirty(*pte), 1); | 1375 | gather_stats(page, md, pte_dirty(*pte), 1); |
1367 | 1376 | ||
1368 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1377 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1369 | pte_unmap_unlock(orig_pte, ptl); | 1378 | pte_unmap_unlock(orig_pte, ptl); |
1370 | return 0; | 1379 | return 0; |
1371 | } | 1380 | } |
1372 | #ifdef CONFIG_HUGETLB_PAGE | 1381 | #ifdef CONFIG_HUGETLB_PAGE |
1373 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1382 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, |
1374 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1383 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1375 | { | 1384 | { |
1376 | struct numa_maps *md; | 1385 | struct numa_maps *md; |
1377 | struct page *page; | 1386 | struct page *page; |
1378 | 1387 | ||
1379 | if (!pte_present(*pte)) | 1388 | if (!pte_present(*pte)) |
1380 | return 0; | 1389 | return 0; |
1381 | 1390 | ||
1382 | page = pte_page(*pte); | 1391 | page = pte_page(*pte); |
1383 | if (!page) | 1392 | if (!page) |
1384 | return 0; | 1393 | return 0; |
1385 | 1394 | ||
1386 | md = walk->private; | 1395 | md = walk->private; |
1387 | gather_stats(page, md, pte_dirty(*pte), 1); | 1396 | gather_stats(page, md, pte_dirty(*pte), 1); |
1388 | return 0; | 1397 | return 0; |
1389 | } | 1398 | } |
1390 | 1399 | ||
1391 | #else | 1400 | #else |
1392 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1401 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, |
1393 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1402 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1394 | { | 1403 | { |
1395 | return 0; | 1404 | return 0; |
1396 | } | 1405 | } |
1397 | #endif | 1406 | #endif |
1398 | 1407 | ||
1399 | /* | 1408 | /* |
1400 | * Display pages allocated per node and memory policy via /proc. | 1409 | * Display pages allocated per node and memory policy via /proc. |
1401 | */ | 1410 | */ |
1402 | static int show_numa_map(struct seq_file *m, void *v, int is_pid) | 1411 | static int show_numa_map(struct seq_file *m, void *v, int is_pid) |
1403 | { | 1412 | { |
1404 | struct numa_maps_private *numa_priv = m->private; | 1413 | struct numa_maps_private *numa_priv = m->private; |
1405 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; | 1414 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; |
1406 | struct vm_area_struct *vma = v; | 1415 | struct vm_area_struct *vma = v; |
1407 | struct numa_maps *md = &numa_priv->md; | 1416 | struct numa_maps *md = &numa_priv->md; |
1408 | struct file *file = vma->vm_file; | 1417 | struct file *file = vma->vm_file; |
1409 | struct task_struct *task = proc_priv->task; | 1418 | struct task_struct *task = proc_priv->task; |
1410 | struct mm_struct *mm = vma->vm_mm; | 1419 | struct mm_struct *mm = vma->vm_mm; |
1411 | struct mm_walk walk = {}; | 1420 | struct mm_walk walk = {}; |
1412 | struct mempolicy *pol; | 1421 | struct mempolicy *pol; |
1413 | char buffer[64]; | 1422 | char buffer[64]; |
1414 | int nid; | 1423 | int nid; |
1415 | 1424 | ||
1416 | if (!mm) | 1425 | if (!mm) |
1417 | return 0; | 1426 | return 0; |
1418 | 1427 | ||
1419 | /* Ensure we start with an empty set of numa_maps statistics. */ | 1428 | /* Ensure we start with an empty set of numa_maps statistics. */ |
1420 | memset(md, 0, sizeof(*md)); | 1429 | memset(md, 0, sizeof(*md)); |
1421 | 1430 | ||
1422 | md->vma = vma; | 1431 | md->vma = vma; |
1423 | 1432 | ||
1424 | walk.hugetlb_entry = gather_hugetbl_stats; | 1433 | walk.hugetlb_entry = gather_hugetbl_stats; |
1425 | walk.pmd_entry = gather_pte_stats; | 1434 | walk.pmd_entry = gather_pte_stats; |
1426 | walk.private = md; | 1435 | walk.private = md; |
1427 | walk.mm = mm; | 1436 | walk.mm = mm; |
1428 | 1437 | ||
1429 | pol = get_vma_policy(task, vma, vma->vm_start); | 1438 | pol = get_vma_policy(task, vma, vma->vm_start); |
1430 | mpol_to_str(buffer, sizeof(buffer), pol); | 1439 | mpol_to_str(buffer, sizeof(buffer), pol); |
1431 | mpol_cond_put(pol); | 1440 | mpol_cond_put(pol); |
1432 | 1441 | ||
1433 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1442 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1434 | 1443 | ||
1435 | if (file) { | 1444 | if (file) { |
1436 | seq_puts(m, " file="); | 1445 | seq_puts(m, " file="); |
1437 | seq_path(m, &file->f_path, "\n\t= "); | 1446 | seq_path(m, &file->f_path, "\n\t= "); |
1438 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1447 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1439 | seq_puts(m, " heap"); | 1448 | seq_puts(m, " heap"); |
1440 | } else { | 1449 | } else { |
1441 | pid_t tid = vm_is_stack(task, vma, is_pid); | 1450 | pid_t tid = vm_is_stack(task, vma, is_pid); |
1442 | if (tid != 0) { | 1451 | if (tid != 0) { |
1443 | /* | 1452 | /* |
1444 | * Thread stack in /proc/PID/task/TID/maps or | 1453 | * Thread stack in /proc/PID/task/TID/maps or |
1445 | * the main process stack. | 1454 | * the main process stack. |
1446 | */ | 1455 | */ |
1447 | if (!is_pid || (vma->vm_start <= mm->start_stack && | 1456 | if (!is_pid || (vma->vm_start <= mm->start_stack && |
1448 | vma->vm_end >= mm->start_stack)) | 1457 | vma->vm_end >= mm->start_stack)) |
1449 | seq_puts(m, " stack"); | 1458 | seq_puts(m, " stack"); |
1450 | else | 1459 | else |
1451 | seq_printf(m, " stack:%d", tid); | 1460 | seq_printf(m, " stack:%d", tid); |
1452 | } | 1461 | } |
1453 | } | 1462 | } |
1454 | 1463 | ||
1455 | if (is_vm_hugetlb_page(vma)) | 1464 | if (is_vm_hugetlb_page(vma)) |
1456 | seq_puts(m, " huge"); | 1465 | seq_puts(m, " huge"); |
1457 | 1466 | ||
1458 | walk_page_range(vma->vm_start, vma->vm_end, &walk); | 1467 | walk_page_range(vma->vm_start, vma->vm_end, &walk); |
1459 | 1468 | ||
1460 | if (!md->pages) | 1469 | if (!md->pages) |
1461 | goto out; | 1470 | goto out; |
1462 | 1471 | ||
1463 | if (md->anon) | 1472 | if (md->anon) |
1464 | seq_printf(m, " anon=%lu", md->anon); | 1473 | seq_printf(m, " anon=%lu", md->anon); |
1465 | 1474 | ||
1466 | if (md->dirty) | 1475 | if (md->dirty) |
1467 | seq_printf(m, " dirty=%lu", md->dirty); | 1476 | seq_printf(m, " dirty=%lu", md->dirty); |
1468 | 1477 | ||
1469 | if (md->pages != md->anon && md->pages != md->dirty) | 1478 | if (md->pages != md->anon && md->pages != md->dirty) |
1470 | seq_printf(m, " mapped=%lu", md->pages); | 1479 | seq_printf(m, " mapped=%lu", md->pages); |
1471 | 1480 | ||
1472 | if (md->mapcount_max > 1) | 1481 | if (md->mapcount_max > 1) |
1473 | seq_printf(m, " mapmax=%lu", md->mapcount_max); | 1482 | seq_printf(m, " mapmax=%lu", md->mapcount_max); |
1474 | 1483 | ||
1475 | if (md->swapcache) | 1484 | if (md->swapcache) |
1476 | seq_printf(m, " swapcache=%lu", md->swapcache); | 1485 | seq_printf(m, " swapcache=%lu", md->swapcache); |
1477 | 1486 | ||
1478 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) | 1487 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) |
1479 | seq_printf(m, " active=%lu", md->active); | 1488 | seq_printf(m, " active=%lu", md->active); |
1480 | 1489 | ||
1481 | if (md->writeback) | 1490 | if (md->writeback) |
1482 | seq_printf(m, " writeback=%lu", md->writeback); | 1491 | seq_printf(m, " writeback=%lu", md->writeback); |
1483 | 1492 | ||
1484 | for_each_node_state(nid, N_MEMORY) | 1493 | for_each_node_state(nid, N_MEMORY) |
1485 | if (md->node[nid]) | 1494 | if (md->node[nid]) |
1486 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); | 1495 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); |
1487 | out: | 1496 | out: |
1488 | seq_putc(m, '\n'); | 1497 | seq_putc(m, '\n'); |
1489 | 1498 | ||
1490 | if (m->count < m->size) | 1499 | if (m->count < m->size) |
1491 | m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; | 1500 | m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; |
1492 | return 0; | 1501 | return 0; |
1493 | } | 1502 | } |
1494 | 1503 | ||
1495 | static int show_pid_numa_map(struct seq_file *m, void *v) | 1504 | static int show_pid_numa_map(struct seq_file *m, void *v) |
1496 | { | 1505 | { |
1497 | return show_numa_map(m, v, 1); | 1506 | return show_numa_map(m, v, 1); |
1498 | } | 1507 | } |
1499 | 1508 | ||
1500 | static int show_tid_numa_map(struct seq_file *m, void *v) | 1509 | static int show_tid_numa_map(struct seq_file *m, void *v) |
1501 | { | 1510 | { |
1502 | return show_numa_map(m, v, 0); | 1511 | return show_numa_map(m, v, 0); |
1503 | } | 1512 | } |
1504 | 1513 | ||
1505 | static const struct seq_operations proc_pid_numa_maps_op = { | 1514 | static const struct seq_operations proc_pid_numa_maps_op = { |
1506 | .start = m_start, | 1515 | .start = m_start, |
1507 | .next = m_next, | 1516 | .next = m_next, |
1508 | .stop = m_stop, | 1517 | .stop = m_stop, |
1509 | .show = show_pid_numa_map, | 1518 | .show = show_pid_numa_map, |
1510 | }; | 1519 | }; |
1511 | 1520 | ||
1512 | static const struct seq_operations proc_tid_numa_maps_op = { | 1521 | static const struct seq_operations proc_tid_numa_maps_op = { |
1513 | .start = m_start, | 1522 | .start = m_start, |
1514 | .next = m_next, | 1523 | .next = m_next, |
1515 | .stop = m_stop, | 1524 | .stop = m_stop, |
1516 | .show = show_tid_numa_map, | 1525 | .show = show_tid_numa_map, |
1517 | }; | 1526 | }; |
1518 | 1527 | ||
1519 | static int numa_maps_open(struct inode *inode, struct file *file, | 1528 | static int numa_maps_open(struct inode *inode, struct file *file, |
1520 | const struct seq_operations *ops) | 1529 | const struct seq_operations *ops) |
1521 | { | 1530 | { |
1522 | struct numa_maps_private *priv; | 1531 | struct numa_maps_private *priv; |
1523 | int ret = -ENOMEM; | 1532 | int ret = -ENOMEM; |
1524 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 1533 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
1525 | if (priv) { | 1534 | if (priv) { |
1526 | priv->proc_maps.pid = proc_pid(inode); | 1535 | priv->proc_maps.pid = proc_pid(inode); |
1527 | ret = seq_open(file, ops); | 1536 | ret = seq_open(file, ops); |
1528 | if (!ret) { | 1537 | if (!ret) { |
1529 | struct seq_file *m = file->private_data; | 1538 | struct seq_file *m = file->private_data; |
1530 | m->private = priv; | 1539 | m->private = priv; |
1531 | } else { | 1540 | } else { |
1532 | kfree(priv); | 1541 | kfree(priv); |
1533 | } | 1542 | } |
1534 | } | 1543 | } |
1535 | return ret; | 1544 | return ret; |
1536 | } | 1545 | } |
1537 | 1546 | ||
1538 | static int pid_numa_maps_open(struct inode *inode, struct file *file) | 1547 | static int pid_numa_maps_open(struct inode *inode, struct file *file) |
1539 | { | 1548 | { |
1540 | return numa_maps_open(inode, file, &proc_pid_numa_maps_op); | 1549 | return numa_maps_open(inode, file, &proc_pid_numa_maps_op); |
1541 | } | 1550 | } |
1542 | 1551 | ||
1543 | static int tid_numa_maps_open(struct inode *inode, struct file *file) | 1552 | static int tid_numa_maps_open(struct inode *inode, struct file *file) |
1544 | { | 1553 | { |
1545 | return numa_maps_open(inode, file, &proc_tid_numa_maps_op); | 1554 | return numa_maps_open(inode, file, &proc_tid_numa_maps_op); |
1546 | } | 1555 | } |
1547 | 1556 | ||
1548 | const struct file_operations proc_pid_numa_maps_operations = { | 1557 | const struct file_operations proc_pid_numa_maps_operations = { |
1549 | .open = pid_numa_maps_open, | 1558 | .open = pid_numa_maps_open, |
1550 | .read = seq_read, | 1559 | .read = seq_read, |
1551 | .llseek = seq_lseek, | 1560 | .llseek = seq_lseek, |
1552 | .release = seq_release_private, | 1561 | .release = seq_release_private, |
1553 | }; | 1562 | }; |
1554 | 1563 | ||
1555 | const struct file_operations proc_tid_numa_maps_operations = { | 1564 | const struct file_operations proc_tid_numa_maps_operations = { |
1556 | .open = tid_numa_maps_open, | 1565 | .open = tid_numa_maps_open, |
1557 | .read = seq_read, | 1566 | .read = seq_read, |
1558 | .llseek = seq_lseek, | 1567 | .llseek = seq_lseek, |
1559 | .release = seq_release_private, | 1568 | .release = seq_release_private, |
1560 | }; | 1569 | }; |
1561 | #endif /* CONFIG_NUMA */ | 1570 | #endif /* CONFIG_NUMA */ |
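
[Editor's note — not part of the commit.] The pagemap entry layout documented in the task_mmu.c comment above (bit 63 present, bit 62 swapped, bit 61 file-page or shared-anon, bits 0-54 PFN when present) can be exercised from userspace. The following is a minimal sketch that looks up and decodes the entry for one address in /proc/self/pagemap; error handling is abbreviated and the layout assumed is exactly the one documented above.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
    	long page_size = sysconf(_SC_PAGESIZE);
    	char *buf = malloc(page_size);
    	int fd = open("/proc/self/pagemap", O_RDONLY);
    	uint64_t entry;
    	off_t offset;

    	if (!buf || fd < 0)
    		return 1;

    	buf[0] = 1;	/* touch the page so it is actually mapped */

    	/* One 64-bit entry per virtual page: seek to (vaddr / page_size) * 8. */
    	offset = (off_t)((uintptr_t)buf / page_size) * sizeof(entry);
    	if (pread(fd, &entry, sizeof(entry), offset) != sizeof(entry))
    		return 1;

    	printf("present:          %d\n", (int)((entry >> 63) & 1));
    	printf("swapped:          %d\n", (int)((entry >> 62) & 1));
    	printf("file/shared-anon: %d\n", (int)((entry >> 61) & 1));
    	if ((entry >> 63) & 1)
    		printf("pfn: 0x%llx\n",
    		       (unsigned long long)(entry & ((1ULL << 55) - 1)));

    	close(fd);
    	free(buf);
    	return 0;
    }
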
lib/genalloc.c
1 | /* | 1 | /* |
2 | * Basic general purpose allocator for managing special purpose | 2 | * Basic general purpose allocator for managing special purpose |
3 | * memory, for example, memory that is not managed by the regular | 3 | * memory, for example, memory that is not managed by the regular |
4 | * kmalloc/kfree interface. Uses for this includes on-device special | 4 | * kmalloc/kfree interface. Uses for this includes on-device special |
5 | * memory, uncached memory etc. | 5 | * memory, uncached memory etc. |
6 | * | 6 | * |
7 | * It is safe to use the allocator in NMI handlers and other special | 7 | * It is safe to use the allocator in NMI handlers and other special |
8 | * unblockable contexts that could otherwise deadlock on locks. This | 8 | * unblockable contexts that could otherwise deadlock on locks. This |
9 | * is implemented by using atomic operations and retries on any | 9 | * is implemented by using atomic operations and retries on any |
10 | * conflicts. The disadvantage is that there may be livelocks in | 10 | * conflicts. The disadvantage is that there may be livelocks in |
11 | * extreme cases. For better scalability, one allocator can be used | 11 | * extreme cases. For better scalability, one allocator can be used |
12 | * for each CPU. | 12 | * for each CPU. |
13 | * | 13 | * |
14 | * The lockless operation only works if there is enough memory | 14 | * The lockless operation only works if there is enough memory |
15 | * available. If new memory is added to the pool a lock has to be | 15 | * available. If new memory is added to the pool a lock has to be |
16 | * still taken. So any user relying on locklessness has to ensure | 16 | * still taken. So any user relying on locklessness has to ensure |
17 | * that sufficient memory is preallocated. | 17 | * that sufficient memory is preallocated. |
18 | * | 18 | * |
19 | * The basic atomic operation of this allocator is cmpxchg on long. | 19 | * The basic atomic operation of this allocator is cmpxchg on long. |
20 | * On architectures that don't have NMI-safe cmpxchg implementation, | 20 | * On architectures that don't have NMI-safe cmpxchg implementation, |
21 | * the allocator can NOT be used in NMI handler. So code uses the | 21 | * the allocator can NOT be used in NMI handler. So code uses the |
22 | * allocator in NMI handler should depend on | 22 | * allocator in NMI handler should depend on |
23 | * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. | 23 | * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. |
24 | * | 24 | * |
25 | * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> | 25 | * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> |
26 | * | 26 | * |
27 | * This source code is licensed under the GNU General Public License, | 27 | * This source code is licensed under the GNU General Public License, |
28 | * Version 2. See the file COPYING for more details. | 28 | * Version 2. See the file COPYING for more details. |
29 | */ | 29 | */ |
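
[Editor's note — not part of the commit.] As a rough illustration of the allocator described in the header comment, here is a minimal sketch of how a driver might use the interfaces defined below. The names example_sram_init/example_sram_alloc and the sram_virt/sram_phys/sram_size parameters are hypothetical placeholders for a device-local memory region already mapped by the caller.

    #include <linux/errno.h>
    #include <linux/genalloc.h>

    static struct gen_pool *sram_pool;

    static int example_sram_init(unsigned long sram_virt, phys_addr_t sram_phys,
    			     size_t sram_size)
    {
    	int ret;

    	/* Each bitmap bit covers 1 << 5 = 32 bytes; -1 means any NUMA node. */
    	sram_pool = gen_pool_create(5, -1);
    	if (!sram_pool)
    		return -ENOMEM;

    	ret = gen_pool_add_virt(sram_pool, sram_virt, sram_phys, sram_size, -1);
    	if (ret) {
    		gen_pool_destroy(sram_pool);
    		sram_pool = NULL;
    	}
    	return ret;
    }

    static void *example_sram_alloc(size_t len, phys_addr_t *phys)
    {
    	unsigned long vaddr = gen_pool_alloc(sram_pool, len);

    	if (!vaddr)
    		return NULL;
    	if (phys)
    		*phys = gen_pool_virt_to_phys(sram_pool, vaddr);
    	return (void *)vaddr;
    }
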
30 | 30 | ||
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/export.h> | 32 | #include <linux/export.h> |
33 | #include <linux/bitmap.h> | 33 | #include <linux/bitmap.h> |
34 | #include <linux/rculist.h> | 34 | #include <linux/rculist.h> |
35 | #include <linux/interrupt.h> | 35 | #include <linux/interrupt.h> |
36 | #include <linux/genalloc.h> | 36 | #include <linux/genalloc.h> |
37 | #include <linux/of_address.h> | 37 | #include <linux/of_address.h> |
38 | #include <linux/of_device.h> | 38 | #include <linux/of_device.h> |
39 | 39 | ||
40 | static inline size_t chunk_size(const struct gen_pool_chunk *chunk) | 40 | static inline size_t chunk_size(const struct gen_pool_chunk *chunk) |
41 | { | 41 | { |
42 | return chunk->end_addr - chunk->start_addr + 1; | 42 | return chunk->end_addr - chunk->start_addr + 1; |
43 | } | 43 | } |
44 | 44 | ||
45 | static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) | 45 | static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) |
46 | { | 46 | { |
47 | unsigned long val, nval; | 47 | unsigned long val, nval; |
48 | 48 | ||
49 | nval = *addr; | 49 | nval = *addr; |
50 | do { | 50 | do { |
51 | val = nval; | 51 | val = nval; |
52 | if (val & mask_to_set) | 52 | if (val & mask_to_set) |
53 | return -EBUSY; | 53 | return -EBUSY; |
54 | cpu_relax(); | 54 | cpu_relax(); |
55 | } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); | 55 | } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); |
56 | 56 | ||
57 | return 0; | 57 | return 0; |
58 | } | 58 | } |
59 | 59 | ||
60 | static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) | 60 | static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) |
61 | { | 61 | { |
62 | unsigned long val, nval; | 62 | unsigned long val, nval; |
63 | 63 | ||
64 | nval = *addr; | 64 | nval = *addr; |
65 | do { | 65 | do { |
66 | val = nval; | 66 | val = nval; |
67 | if ((val & mask_to_clear) != mask_to_clear) | 67 | if ((val & mask_to_clear) != mask_to_clear) |
68 | return -EBUSY; | 68 | return -EBUSY; |
69 | cpu_relax(); | 69 | cpu_relax(); |
70 | } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); | 70 | } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); |
71 | 71 | ||
72 | return 0; | 72 | return 0; |
73 | } | 73 | } |
74 | 74 | ||
75 | /* | 75 | /* |
76 | * bitmap_set_ll - set the specified number of bits at the specified position | 76 | * bitmap_set_ll - set the specified number of bits at the specified position |
77 | * @map: pointer to a bitmap | 77 | * @map: pointer to a bitmap |
78 | * @start: a bit position in @map | 78 | * @start: a bit position in @map |
79 | * @nr: number of bits to set | 79 | * @nr: number of bits to set |
80 | * | 80 | * |
81 | * Set @nr bits start from @start in @map lock-lessly. Several users | 81 | * Set @nr bits start from @start in @map lock-lessly. Several users |
82 | * can set/clear the same bitmap simultaneously without lock. If two | 82 | * can set/clear the same bitmap simultaneously without lock. If two |
83 | * users set the same bit, one user will return remain bits, otherwise | 83 | * users set the same bit, one user will return remain bits, otherwise |
84 | * return 0. | 84 | * return 0. |
85 | */ | 85 | */ |
86 | static int bitmap_set_ll(unsigned long *map, int start, int nr) | 86 | static int bitmap_set_ll(unsigned long *map, int start, int nr) |
87 | { | 87 | { |
88 | unsigned long *p = map + BIT_WORD(start); | 88 | unsigned long *p = map + BIT_WORD(start); |
89 | const int size = start + nr; | 89 | const int size = start + nr; |
90 | int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); | 90 | int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); |
91 | unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); | 91 | unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); |
92 | 92 | ||
93 | while (nr - bits_to_set >= 0) { | 93 | while (nr - bits_to_set >= 0) { |
94 | if (set_bits_ll(p, mask_to_set)) | 94 | if (set_bits_ll(p, mask_to_set)) |
95 | return nr; | 95 | return nr; |
96 | nr -= bits_to_set; | 96 | nr -= bits_to_set; |
97 | bits_to_set = BITS_PER_LONG; | 97 | bits_to_set = BITS_PER_LONG; |
98 | mask_to_set = ~0UL; | 98 | mask_to_set = ~0UL; |
99 | p++; | 99 | p++; |
100 | } | 100 | } |
101 | if (nr) { | 101 | if (nr) { |
102 | mask_to_set &= BITMAP_LAST_WORD_MASK(size); | 102 | mask_to_set &= BITMAP_LAST_WORD_MASK(size); |
103 | if (set_bits_ll(p, mask_to_set)) | 103 | if (set_bits_ll(p, mask_to_set)) |
104 | return nr; | 104 | return nr; |
105 | } | 105 | } |
106 | 106 | ||
107 | return 0; | 107 | return 0; |
108 | } | 108 | } |
109 | 109 | ||
110 | /* | 110 | /* |
111 | * bitmap_clear_ll - clear the specified number of bits at the specified position | 111 | * bitmap_clear_ll - clear the specified number of bits at the specified position |
112 | * @map: pointer to a bitmap | 112 | * @map: pointer to a bitmap |
113 | * @start: a bit position in @map | 113 | * @start: a bit position in @map |
114 | * @nr: number of bits to set | 114 | * @nr: number of bits to set |
115 | * | 115 | * |
116 | * Clear @nr bits start from @start in @map lock-lessly. Several users | 116 | * Clear @nr bits start from @start in @map lock-lessly. Several users |
117 | * can set/clear the same bitmap simultaneously without lock. If two | 117 | * can set/clear the same bitmap simultaneously without lock. If two |
118 | * users clear the same bit, one user will return remain bits, | 118 | * users clear the same bit, one user will return remain bits, |
119 | * otherwise return 0. | 119 | * otherwise return 0. |
120 | */ | 120 | */ |
121 | static int bitmap_clear_ll(unsigned long *map, int start, int nr) | 121 | static int bitmap_clear_ll(unsigned long *map, int start, int nr) |
122 | { | 122 | { |
123 | unsigned long *p = map + BIT_WORD(start); | 123 | unsigned long *p = map + BIT_WORD(start); |
124 | const int size = start + nr; | 124 | const int size = start + nr; |
125 | int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); | 125 | int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); |
126 | unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); | 126 | unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); |
127 | 127 | ||
128 | while (nr - bits_to_clear >= 0) { | 128 | while (nr - bits_to_clear >= 0) { |
129 | if (clear_bits_ll(p, mask_to_clear)) | 129 | if (clear_bits_ll(p, mask_to_clear)) |
130 | return nr; | 130 | return nr; |
131 | nr -= bits_to_clear; | 131 | nr -= bits_to_clear; |
132 | bits_to_clear = BITS_PER_LONG; | 132 | bits_to_clear = BITS_PER_LONG; |
133 | mask_to_clear = ~0UL; | 133 | mask_to_clear = ~0UL; |
134 | p++; | 134 | p++; |
135 | } | 135 | } |
136 | if (nr) { | 136 | if (nr) { |
137 | mask_to_clear &= BITMAP_LAST_WORD_MASK(size); | 137 | mask_to_clear &= BITMAP_LAST_WORD_MASK(size); |
138 | if (clear_bits_ll(p, mask_to_clear)) | 138 | if (clear_bits_ll(p, mask_to_clear)) |
139 | return nr; | 139 | return nr; |
140 | } | 140 | } |
141 | 141 | ||
142 | return 0; | 142 | return 0; |
143 | } | 143 | } |
144 | 144 | ||
145 | /** | 145 | /** |
146 | * gen_pool_create - create a new special memory pool | 146 | * gen_pool_create - create a new special memory pool |
147 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents | 147 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents |
148 | * @nid: node id of the node the pool structure should be allocated on, or -1 | 148 | * @nid: node id of the node the pool structure should be allocated on, or -1 |
149 | * | 149 | * |
150 | * Create a new special memory pool that can be used to manage special purpose | 150 | * Create a new special memory pool that can be used to manage special purpose |
151 | * memory not managed by the regular kmalloc/kfree interface. | 151 | * memory not managed by the regular kmalloc/kfree interface. |
152 | */ | 152 | */ |
153 | struct gen_pool *gen_pool_create(int min_alloc_order, int nid) | 153 | struct gen_pool *gen_pool_create(int min_alloc_order, int nid) |
154 | { | 154 | { |
155 | struct gen_pool *pool; | 155 | struct gen_pool *pool; |
156 | 156 | ||
157 | pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); | 157 | pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); |
158 | if (pool != NULL) { | 158 | if (pool != NULL) { |
159 | spin_lock_init(&pool->lock); | 159 | spin_lock_init(&pool->lock); |
160 | INIT_LIST_HEAD(&pool->chunks); | 160 | INIT_LIST_HEAD(&pool->chunks); |
161 | pool->min_alloc_order = min_alloc_order; | 161 | pool->min_alloc_order = min_alloc_order; |
162 | pool->algo = gen_pool_first_fit; | 162 | pool->algo = gen_pool_first_fit; |
163 | pool->data = NULL; | 163 | pool->data = NULL; |
164 | } | 164 | } |
165 | return pool; | 165 | return pool; |
166 | } | 166 | } |
167 | EXPORT_SYMBOL(gen_pool_create); | 167 | EXPORT_SYMBOL(gen_pool_create); |
168 | 168 | ||
169 | /** | 169 | /** |
170 | * gen_pool_add_virt - add a new chunk of special memory to the pool | 170 | * gen_pool_add_virt - add a new chunk of special memory to the pool |
171 | * @pool: pool to add new memory chunk to | 171 | * @pool: pool to add new memory chunk to |
172 | * @virt: virtual starting address of memory chunk to add to pool | 172 | * @virt: virtual starting address of memory chunk to add to pool |
173 | * @phys: physical starting address of memory chunk to add to pool | 173 | * @phys: physical starting address of memory chunk to add to pool |
174 | * @size: size in bytes of the memory chunk to add to pool | 174 | * @size: size in bytes of the memory chunk to add to pool |
175 | * @nid: node id of the node the chunk structure and bitmap should be | 175 | * @nid: node id of the node the chunk structure and bitmap should be |
176 | * allocated on, or -1 | 176 | * allocated on, or -1 |
177 | * | 177 | * |
178 | * Add a new chunk of special memory to the specified pool. | 178 | * Add a new chunk of special memory to the specified pool. |
179 | * | 179 | * |
180 | * Returns 0 on success or a -ve errno on failure. | 180 | * Returns 0 on success or a -ve errno on failure. |
181 | */ | 181 | */ |
182 | int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys, | 182 | int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phys, |
183 | size_t size, int nid) | 183 | size_t size, int nid) |
184 | { | 184 | { |
185 | struct gen_pool_chunk *chunk; | 185 | struct gen_pool_chunk *chunk; |
186 | int nbits = size >> pool->min_alloc_order; | 186 | int nbits = size >> pool->min_alloc_order; |
187 | int nbytes = sizeof(struct gen_pool_chunk) + | 187 | int nbytes = sizeof(struct gen_pool_chunk) + |
188 | BITS_TO_LONGS(nbits) * sizeof(long); | 188 | BITS_TO_LONGS(nbits) * sizeof(long); |
189 | 189 | ||
190 | chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); | 190 | chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); |
191 | if (unlikely(chunk == NULL)) | 191 | if (unlikely(chunk == NULL)) |
192 | return -ENOMEM; | 192 | return -ENOMEM; |
193 | 193 | ||
194 | chunk->phys_addr = phys; | 194 | chunk->phys_addr = phys; |
195 | chunk->start_addr = virt; | 195 | chunk->start_addr = virt; |
196 | chunk->end_addr = virt + size - 1; | 196 | chunk->end_addr = virt + size - 1; |
197 | atomic_set(&chunk->avail, size); | 197 | atomic_set(&chunk->avail, size); |
198 | 198 | ||
199 | spin_lock(&pool->lock); | 199 | spin_lock(&pool->lock); |
200 | list_add_rcu(&chunk->next_chunk, &pool->chunks); | 200 | list_add_rcu(&chunk->next_chunk, &pool->chunks); |
201 | spin_unlock(&pool->lock); | 201 | spin_unlock(&pool->lock); |
202 | 202 | ||
203 | return 0; | 203 | return 0; |
204 | } | 204 | } |
205 | EXPORT_SYMBOL(gen_pool_add_virt); | 205 | EXPORT_SYMBOL(gen_pool_add_virt); |
206 | 206 | ||
207 | /** | 207 | /** |
208 | * gen_pool_virt_to_phys - return the physical address of memory | 208 | * gen_pool_virt_to_phys - return the physical address of memory |
209 | * @pool: pool to allocate from | 209 | * @pool: pool to allocate from |
210 | * @addr: starting address of memory | 210 | * @addr: starting address of memory |
211 | * | 211 | * |
212 | * Returns the physical address on success, or -1 on error. | 212 | * Returns the physical address on success, or -1 on error. |
213 | */ | 213 | */ |
214 | phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) | 214 | phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) |
215 | { | 215 | { |
216 | struct gen_pool_chunk *chunk; | 216 | struct gen_pool_chunk *chunk; |
217 | phys_addr_t paddr = -1; | 217 | phys_addr_t paddr = -1; |
218 | 218 | ||
219 | rcu_read_lock(); | 219 | rcu_read_lock(); |
220 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { | 220 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { |
221 | if (addr >= chunk->start_addr && addr <= chunk->end_addr) { | 221 | if (addr >= chunk->start_addr && addr <= chunk->end_addr) { |
222 | paddr = chunk->phys_addr + (addr - chunk->start_addr); | 222 | paddr = chunk->phys_addr + (addr - chunk->start_addr); |
223 | break; | 223 | break; |
224 | } | 224 | } |
225 | } | 225 | } |
226 | rcu_read_unlock(); | 226 | rcu_read_unlock(); |
227 | 227 | ||
228 | return paddr; | 228 | return paddr; |
229 | } | 229 | } |
230 | EXPORT_SYMBOL(gen_pool_virt_to_phys); | 230 | EXPORT_SYMBOL(gen_pool_virt_to_phys); |
231 | 231 | ||
232 | /** | 232 | /** |
233 | * gen_pool_destroy - destroy a special memory pool | 233 | * gen_pool_destroy - destroy a special memory pool |
234 | * @pool: pool to destroy | 234 | * @pool: pool to destroy |
235 | * | 235 | * |
236 | * Destroy the specified special memory pool. Verifies that there are no | 236 | * Destroy the specified special memory pool. Verifies that there are no |
237 | * outstanding allocations. | 237 | * outstanding allocations. |
238 | */ | 238 | */ |
239 | void gen_pool_destroy(struct gen_pool *pool) | 239 | void gen_pool_destroy(struct gen_pool *pool) |
240 | { | 240 | { |
241 | struct list_head *_chunk, *_next_chunk; | 241 | struct list_head *_chunk, *_next_chunk; |
242 | struct gen_pool_chunk *chunk; | 242 | struct gen_pool_chunk *chunk; |
243 | int order = pool->min_alloc_order; | 243 | int order = pool->min_alloc_order; |
244 | int bit, end_bit; | 244 | int bit, end_bit; |
245 | 245 | ||
246 | list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { | 246 | list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { |
247 | chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); | 247 | chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); |
248 | list_del(&chunk->next_chunk); | 248 | list_del(&chunk->next_chunk); |
249 | 249 | ||
250 | end_bit = chunk_size(chunk) >> order; | 250 | end_bit = chunk_size(chunk) >> order; |
251 | bit = find_next_bit(chunk->bits, end_bit, 0); | 251 | bit = find_next_bit(chunk->bits, end_bit, 0); |
252 | BUG_ON(bit < end_bit); | 252 | BUG_ON(bit < end_bit); |
253 | 253 | ||
254 | kfree(chunk); | 254 | kfree(chunk); |
255 | } | 255 | } |
256 | kfree(pool); | 256 | kfree(pool); |
257 | return; | 257 | return; |
258 | } | 258 | } |
259 | EXPORT_SYMBOL(gen_pool_destroy); | 259 | EXPORT_SYMBOL(gen_pool_destroy); |
260 | 260 | ||
261 | /** | 261 | /** |
262 | * gen_pool_alloc - allocate special memory from the pool | 262 | * gen_pool_alloc - allocate special memory from the pool |
263 | * @pool: pool to allocate from | 263 | * @pool: pool to allocate from |
264 | * @size: number of bytes to allocate from the pool | 264 | * @size: number of bytes to allocate from the pool |
265 | * | 265 | * |
266 | * Allocate the requested number of bytes from the specified pool. | 266 | * Allocate the requested number of bytes from the specified pool. |
267 | * Uses the pool allocation function (with first-fit algorithm by default). | 267 | * Uses the pool allocation function (with first-fit algorithm by default). |
268 | * Can not be used in NMI handler on architectures without | 268 | * Can not be used in NMI handler on architectures without |
269 | * NMI-safe cmpxchg implementation. | 269 | * NMI-safe cmpxchg implementation. |
270 | */ | 270 | */ |
271 | unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) | 271 | unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) |
272 | { | 272 | { |
273 | struct gen_pool_chunk *chunk; | 273 | struct gen_pool_chunk *chunk; |
274 | unsigned long addr = 0; | 274 | unsigned long addr = 0; |
275 | int order = pool->min_alloc_order; | 275 | int order = pool->min_alloc_order; |
276 | int nbits, start_bit = 0, end_bit, remain; | 276 | int nbits, start_bit = 0, end_bit, remain; |
277 | 277 | ||
278 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | 278 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG |
279 | BUG_ON(in_nmi()); | 279 | BUG_ON(in_nmi()); |
280 | #endif | 280 | #endif |
281 | 281 | ||
282 | if (size == 0) | 282 | if (size == 0) |
283 | return 0; | 283 | return 0; |
284 | 284 | ||
285 | nbits = (size + (1UL << order) - 1) >> order; | 285 | nbits = (size + (1UL << order) - 1) >> order; |
286 | rcu_read_lock(); | 286 | rcu_read_lock(); |
287 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { | 287 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { |
288 | if (size > atomic_read(&chunk->avail)) | 288 | if (size > atomic_read(&chunk->avail)) |
289 | continue; | 289 | continue; |
290 | 290 | ||
291 | end_bit = chunk_size(chunk) >> order; | 291 | end_bit = chunk_size(chunk) >> order; |
292 | retry: | 292 | retry: |
293 | start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, | 293 | start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, |
294 | pool->data); | 294 | pool->data); |
295 | if (start_bit >= end_bit) | 295 | if (start_bit >= end_bit) |
296 | continue; | 296 | continue; |
297 | remain = bitmap_set_ll(chunk->bits, start_bit, nbits); | 297 | remain = bitmap_set_ll(chunk->bits, start_bit, nbits); |
298 | if (remain) { | 298 | if (remain) { |
299 | remain = bitmap_clear_ll(chunk->bits, start_bit, | 299 | remain = bitmap_clear_ll(chunk->bits, start_bit, |
300 | nbits - remain); | 300 | nbits - remain); |
301 | BUG_ON(remain); | 301 | BUG_ON(remain); |
302 | goto retry; | 302 | goto retry; |
303 | } | 303 | } |
304 | 304 | ||
305 | addr = chunk->start_addr + ((unsigned long)start_bit << order); | 305 | addr = chunk->start_addr + ((unsigned long)start_bit << order); |
306 | size = nbits << order; | 306 | size = nbits << order; |
307 | atomic_sub(size, &chunk->avail); | 307 | atomic_sub(size, &chunk->avail); |
308 | break; | 308 | break; |
309 | } | 309 | } |
310 | rcu_read_unlock(); | 310 | rcu_read_unlock(); |
311 | return addr; | 311 | return addr; |
312 | } | 312 | } |
313 | EXPORT_SYMBOL(gen_pool_alloc); | 313 | EXPORT_SYMBOL(gen_pool_alloc); |
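Continuing the hypothetical example_pool from the sketch above: the nbits computation rounds the request up to the pool's 1 << min_alloc_order granularity, so a 100-byte request from an order-5 pool consumes four bits (128 bytes).

	unsigned long buf = gen_pool_alloc(example_pool, 100);
	if (!buf)
		return -ENOMEM;	/* no chunk had a long enough free run */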
314 | 314 | ||
315 | /** | 315 | /** |
316 | * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage | 316 | * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage |
317 | * @pool: pool to allocate from | 317 | * @pool: pool to allocate from |
318 | * @size: number of bytes to allocate from the pool | 318 | * @size: number of bytes to allocate from the pool |
319 | * @dma: dma-view physical address return value. Use NULL if unneeded. | 319 | * @dma: dma-view physical address return value. Use NULL if unneeded. |
320 | * | 320 | * |
321 | * Allocate the requested number of bytes from the specified pool. | 321 | * Allocate the requested number of bytes from the specified pool. |
322 | * Uses the pool allocation function (with first-fit algorithm by default). | 322 | * Uses the pool allocation function (with first-fit algorithm by default). |
323 | * Can not be used in NMI handler on architectures without | 323 | * Can not be used in NMI handler on architectures without |
324 | * NMI-safe cmpxchg implementation. | 324 | * NMI-safe cmpxchg implementation. |
325 | */ | 325 | */ |
326 | void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) | 326 | void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) |
327 | { | 327 | { |
328 | unsigned long vaddr; | 328 | unsigned long vaddr; |
329 | 329 | ||
330 | if (!pool) | 330 | if (!pool) |
331 | return NULL; | 331 | return NULL; |
332 | 332 | ||
333 | vaddr = gen_pool_alloc(pool, size); | 333 | vaddr = gen_pool_alloc(pool, size); |
334 | if (!vaddr) | 334 | if (!vaddr) |
335 | return NULL; | 335 | return NULL; |
336 | 336 | ||
337 | if (dma) | 337 | if (dma) |
338 | *dma = gen_pool_virt_to_phys(pool, vaddr); | 338 | *dma = gen_pool_virt_to_phys(pool, vaddr); |
339 | 339 | ||
340 | return (void *)vaddr; | 340 | return (void *)vaddr; |
341 | } | 341 | } |
342 | EXPORT_SYMBOL(gen_pool_dma_alloc); | 342 | EXPORT_SYMBOL(gen_pool_dma_alloc); |
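A hedged usage sketch, again against the hypothetical example_pool: the helper returns the CPU-visible address and, via gen_pool_virt_to_phys(), the bus address the hardware should be given.

	dma_addr_t desc_dma;
	void *desc = gen_pool_dma_alloc(example_pool, 64, &desc_dma);
	if (!desc)
		return -ENOMEM;
	/* program desc_dma into the device; access *desc from the CPU side */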
343 | 343 | ||
344 | /** | 344 | /** |
345 | * gen_pool_free - free allocated special memory back to the pool | 345 | * gen_pool_free - free allocated special memory back to the pool |
346 | * @pool: pool to free to | 346 | * @pool: pool to free to |
347 | * @addr: starting address of memory to free back to pool | 347 | * @addr: starting address of memory to free back to pool |
348 | * @size: size in bytes of memory to free | 348 | * @size: size in bytes of memory to free |
349 | * | 349 | * |
350 | * Free previously allocated special memory back to the specified | 350 | * Free previously allocated special memory back to the specified |
351 | * pool. Can not be used in NMI handler on architectures without | 351 | * pool. Can not be used in NMI handler on architectures without |
352 | * NMI-safe cmpxchg implementation. | 352 | * NMI-safe cmpxchg implementation. |
353 | */ | 353 | */ |
354 | void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) | 354 | void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) |
355 | { | 355 | { |
356 | struct gen_pool_chunk *chunk; | 356 | struct gen_pool_chunk *chunk; |
357 | int order = pool->min_alloc_order; | 357 | int order = pool->min_alloc_order; |
358 | int start_bit, nbits, remain; | 358 | int start_bit, nbits, remain; |
359 | 359 | ||
360 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | 360 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG |
361 | BUG_ON(in_nmi()); | 361 | BUG_ON(in_nmi()); |
362 | #endif | 362 | #endif |
363 | 363 | ||
364 | nbits = (size + (1UL << order) - 1) >> order; | 364 | nbits = (size + (1UL << order) - 1) >> order; |
365 | rcu_read_lock(); | 365 | rcu_read_lock(); |
366 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { | 366 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { |
367 | if (addr >= chunk->start_addr && addr <= chunk->end_addr) { | 367 | if (addr >= chunk->start_addr && addr <= chunk->end_addr) { |
368 | BUG_ON(addr + size - 1 > chunk->end_addr); | 368 | BUG_ON(addr + size - 1 > chunk->end_addr); |
369 | start_bit = (addr - chunk->start_addr) >> order; | 369 | start_bit = (addr - chunk->start_addr) >> order; |
370 | remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); | 370 | remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); |
371 | BUG_ON(remain); | 371 | BUG_ON(remain); |
372 | size = nbits << order; | 372 | size = nbits << order; |
373 | atomic_add(size, &chunk->avail); | 373 | atomic_add(size, &chunk->avail); |
374 | rcu_read_unlock(); | 374 | rcu_read_unlock(); |
375 | return; | 375 | return; |
376 | } | 376 | } |
377 | } | 377 | } |
378 | rcu_read_unlock(); | 378 | rcu_read_unlock(); |
379 | BUG(); | 379 | BUG(); |
380 | } | 380 | } |
381 | EXPORT_SYMBOL(gen_pool_free); | 381 | EXPORT_SYMBOL(gen_pool_free); |
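Matching sketch for the allocation above: @addr must lie within a single chunk (otherwise the function reaches the final BUG()), and @size is rounded with the same min_alloc_order so the bits cleared mirror the bits set at allocation time.

	gen_pool_free(example_pool, buf, 100);	/* returns the full 128-byte region */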
382 | 382 | ||
383 | /** | 383 | /** |
384 | * gen_pool_for_each_chunk - call func for every chunk of generic memory pool | 384 | * gen_pool_for_each_chunk - call func for every chunk of generic memory pool |
385 | * @pool: the generic memory pool | 385 | * @pool: the generic memory pool |
386 | * @func: func to call | 386 | * @func: func to call |
387 | * @data: additional data used by @func | 387 | * @data: additional data used by @func |
388 | * | 388 | * |
389 | * Call @func for every chunk of generic memory pool. The @func is | 389 | * Call @func for every chunk of generic memory pool. The @func is |
390 | * called with rcu_read_lock held. | 390 | * called with rcu_read_lock held. |
391 | */ | 391 | */ |
392 | void gen_pool_for_each_chunk(struct gen_pool *pool, | 392 | void gen_pool_for_each_chunk(struct gen_pool *pool, |
393 | void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), | 393 | void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), |
394 | void *data) | 394 | void *data) |
395 | { | 395 | { |
396 | struct gen_pool_chunk *chunk; | 396 | struct gen_pool_chunk *chunk; |
397 | 397 | ||
398 | rcu_read_lock(); | 398 | rcu_read_lock(); |
399 | list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) | 399 | list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) |
400 | func(pool, chunk, data); | 400 | func(pool, chunk, data); |
401 | rcu_read_unlock(); | 401 | rcu_read_unlock(); |
402 | } | 402 | } |
403 | EXPORT_SYMBOL(gen_pool_for_each_chunk); | 403 | EXPORT_SYMBOL(gen_pool_for_each_chunk); |
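Illustrative callback (names hypothetical): since @func runs under rcu_read_lock(), it must not sleep or re-enter the pool's allocation paths.

static void example_show_chunk(struct gen_pool *pool,
			       struct gen_pool_chunk *chunk, void *data)
{
	unsigned int *count = data;

	pr_info("chunk %u: 0x%lx-0x%lx\n", (*count)++,
		chunk->start_addr, chunk->end_addr);
}

	/* caller: */
	unsigned int count = 0;
	gen_pool_for_each_chunk(example_pool, example_show_chunk, &count);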
404 | 404 | ||
405 | /** | 405 | /** |
406 | * gen_pool_avail - get available free space of the pool | 406 | * gen_pool_avail - get available free space of the pool |
407 | * @pool: pool to get available free space | 407 | * @pool: pool to get available free space |
408 | * | 408 | * |
409 | * Return available free space of the specified pool. | 409 | * Return available free space of the specified pool. |
410 | */ | 410 | */ |
411 | size_t gen_pool_avail(struct gen_pool *pool) | 411 | size_t gen_pool_avail(struct gen_pool *pool) |
412 | { | 412 | { |
413 | struct gen_pool_chunk *chunk; | 413 | struct gen_pool_chunk *chunk; |
414 | size_t avail = 0; | 414 | size_t avail = 0; |
415 | 415 | ||
416 | rcu_read_lock(); | 416 | rcu_read_lock(); |
417 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) | 417 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) |
418 | avail += atomic_read(&chunk->avail); | 418 | avail += atomic_read(&chunk->avail); |
419 | rcu_read_unlock(); | 419 | rcu_read_unlock(); |
420 | return avail; | 420 | return avail; |
421 | } | 421 | } |
422 | EXPORT_SYMBOL_GPL(gen_pool_avail); | 422 | EXPORT_SYMBOL_GPL(gen_pool_avail); |
423 | 423 | ||
424 | /** | 424 | /** |
425 | * gen_pool_size - get size in bytes of memory managed by the pool | 425 | * gen_pool_size - get size in bytes of memory managed by the pool |
426 | * @pool: pool to get size | 426 | * @pool: pool to get size |
427 | * | 427 | * |
428 | * Return size in bytes of memory managed by the pool. | 428 | * Return size in bytes of memory managed by the pool. |
429 | */ | 429 | */ |
430 | size_t gen_pool_size(struct gen_pool *pool) | 430 | size_t gen_pool_size(struct gen_pool *pool) |
431 | { | 431 | { |
432 | struct gen_pool_chunk *chunk; | 432 | struct gen_pool_chunk *chunk; |
433 | size_t size = 0; | 433 | size_t size = 0; |
434 | 434 | ||
435 | rcu_read_lock(); | 435 | rcu_read_lock(); |
436 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) | 436 | list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) |
437 | size += chunk_size(chunk); | 437 | size += chunk_size(chunk); |
438 | rcu_read_unlock(); | 438 | rcu_read_unlock(); |
439 | return size; | 439 | return size; |
440 | } | 440 | } |
441 | EXPORT_SYMBOL_GPL(gen_pool_size); | 441 | EXPORT_SYMBOL_GPL(gen_pool_size); |
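Quick sketch combining the two accessors above; because both walk the chunk list under RCU, the numbers are only a snapshot if allocations are running concurrently.

	size_t total = gen_pool_size(example_pool);
	size_t avail = gen_pool_avail(example_pool);

	pr_info("pool: %zu of %zu bytes free\n", avail, total);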
442 | 442 | ||
443 | /** | 443 | /** |
444 | * gen_pool_set_algo - set the allocation algorithm | 444 | * gen_pool_set_algo - set the allocation algorithm |
445 | * @pool: pool to change allocation algorithm | 445 | * @pool: pool to change allocation algorithm |
446 | * @algo: custom algorithm function | 446 | * @algo: custom algorithm function |
447 | * @data: additional data used by @algo | 447 | * @data: additional data used by @algo |
448 | * | 448 | * |
449 | * Call @algo for each memory allocation in the pool. | 449 | * Call @algo for each memory allocation in the pool. |
450 | * If @algo is NULL use gen_pool_first_fit as default | 450 | * If @algo is NULL use gen_pool_first_fit as default |
451 | * memory allocation function. | 451 | * memory allocation function. |
452 | */ | 452 | */ |
453 | void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data) | 453 | void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, void *data) |
454 | { | 454 | { |
455 | rcu_read_lock(); | 455 | rcu_read_lock(); |
456 | 456 | ||
457 | pool->algo = algo; | 457 | pool->algo = algo; |
458 | if (!pool->algo) | 458 | if (!pool->algo) |
459 | pool->algo = gen_pool_first_fit; | 459 | pool->algo = gen_pool_first_fit; |
460 | 460 | ||
461 | pool->data = data; | 461 | pool->data = data; |
462 | 462 | ||
463 | rcu_read_unlock(); | 463 | rcu_read_unlock(); |
464 | } | 464 | } |
465 | EXPORT_SYMBOL(gen_pool_set_algo); | 465 | EXPORT_SYMBOL(gen_pool_set_algo); |
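Usage sketch for the hypothetical pool: select best-fit before the pool sees traffic; passing NULL reverts to the default first-fit.

	gen_pool_set_algo(example_pool, gen_pool_best_fit, NULL);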
466 | 466 | ||
467 | /** | 467 | /** |
468 | * gen_pool_first_fit - find the first available region | 468 | * gen_pool_first_fit - find the first available region |
469 | * of memory matching the size requirement (no alignment constraint) | 469 | * of memory matching the size requirement (no alignment constraint) |
470 | * @map: The address to base the search on | 470 | * @map: The address to base the search on |
471 | * @size: The bitmap size in bits | 471 | * @size: The bitmap size in bits |
472 | * @start: The bitnumber to start searching at | 472 | * @start: The bitnumber to start searching at |
473 | * @nr: The number of zeroed bits we're looking for | 473 | * @nr: The number of zeroed bits we're looking for |
474 | * @data: additional data - unused | 474 | * @data: additional data - unused |
475 | */ | 475 | */ |
476 | unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, | 476 | unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, |
477 | unsigned long start, unsigned int nr, void *data) | 477 | unsigned long start, unsigned int nr, void *data) |
478 | { | 478 | { |
479 | return bitmap_find_next_zero_area(map, size, start, nr, 0); | 479 | return bitmap_find_next_zero_area(map, size, start, nr, 0); |
480 | } | 480 | } |
481 | EXPORT_SYMBOL(gen_pool_first_fit); | 481 | EXPORT_SYMBOL(gen_pool_first_fit); |
482 | 482 | ||
483 | /** | 483 | /** |
484 | * gen_pool_best_fit - find the best fitting region of memory | 484 | * gen_pool_best_fit - find the best fitting region of memory |
485 | * matching the size requirement (no alignment constraint) | 485 | * matching the size requirement (no alignment constraint) |
486 | * @map: The address to base the search on | 486 | * @map: The address to base the search on |
487 | * @size: The bitmap size in bits | 487 | * @size: The bitmap size in bits |
488 | * @start: The bitnumber to start searching at | 488 | * @start: The bitnumber to start searching at |
489 | * @nr: The number of zeroed bits we're looking for | 489 | * @nr: The number of zeroed bits we're looking for |
490 | * @data: additional data - unused | 490 | * @data: additional data - unused |
491 | * | 491 | * |
492 | * Iterate over the bitmap to find the smallest free region | 492 | * Iterate over the bitmap to find the smallest free region |
493 | * which we can allocate the memory. | 493 | * which we can allocate the memory. |
494 | */ | 494 | */ |
495 | unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, | 495 | unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, |
496 | unsigned long start, unsigned int nr, void *data) | 496 | unsigned long start, unsigned int nr, void *data) |
497 | { | 497 | { |
498 | unsigned long start_bit = size; | 498 | unsigned long start_bit = size; |
499 | unsigned long len = size + 1; | 499 | unsigned long len = size + 1; |
500 | unsigned long index; | 500 | unsigned long index; |
501 | 501 | ||
502 | index = bitmap_find_next_zero_area(map, size, start, nr, 0); | 502 | index = bitmap_find_next_zero_area(map, size, start, nr, 0); |
503 | 503 | ||
504 | while (index < size) { | 504 | while (index < size) { |
505 | int next_bit = find_next_bit(map, size, index + nr); | 505 | int next_bit = find_next_bit(map, size, index + nr); |
506 | if ((next_bit - index) < len) { | 506 | if ((next_bit - index) < len) { |
507 | len = next_bit - index; | 507 | len = next_bit - index; |
508 | start_bit = index; | 508 | start_bit = index; |
509 | if (len == nr) | 509 | if (len == nr) |
510 | return start_bit; | 510 | return start_bit; |
511 | } | 511 | } |
512 | index = bitmap_find_next_zero_area(map, size, | 512 | index = bitmap_find_next_zero_area(map, size, |
513 | next_bit + 1, nr, 0); | 513 | next_bit + 1, nr, 0); |
514 | } | 514 | } |
515 | 515 | ||
516 | return start_bit; | 516 | return start_bit; |
517 | } | 517 | } |
518 | EXPORT_SYMBOL(gen_pool_best_fit); | 518 | EXPORT_SYMBOL(gen_pool_best_fit); |
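A worked illustration of how the two algorithms above differ on the same hypothetical bitmap:

/*
 * map:  1 0 0 0 0 1 1 0 0 0 1	(1 = allocated, 0 = free), nr = 3
 *
 * gen_pool_first_fit returns bit 1, the first hole long enough for 3 bits.
 * gen_pool_best_fit keeps scanning: the hole at bit 1 is 4 bits long, the
 * hole at bit 7 is exactly 3 bits long, so it returns bit 7 and preserves
 * the larger hole for later requests.
 */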
519 | 519 | ||
520 | static void devm_gen_pool_release(struct device *dev, void *res) | 520 | static void devm_gen_pool_release(struct device *dev, void *res) |
521 | { | 521 | { |
522 | gen_pool_destroy(*(struct gen_pool **)res); | 522 | gen_pool_destroy(*(struct gen_pool **)res); |
523 | } | 523 | } |
524 | 524 | ||
525 | /** | 525 | /** |
526 | * devm_gen_pool_create - managed gen_pool_create | 526 | * devm_gen_pool_create - managed gen_pool_create |
527 | * @dev: device that provides the gen_pool | 527 | * @dev: device that provides the gen_pool |
528 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents | 528 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents |
529 | * @nid: node id of the node the pool structure should be allocated on, or -1 | 529 | * @nid: node id of the node the pool structure should be allocated on, or -1 |
530 | * | 530 | * |
531 | * Create a new special memory pool that can be used to manage special purpose | 531 | * Create a new special memory pool that can be used to manage special purpose |
532 | * memory not managed by the regular kmalloc/kfree interface. The pool will be | 532 | * memory not managed by the regular kmalloc/kfree interface. The pool will be |
533 | * automatically destroyed by the device management code. | 533 | * automatically destroyed by the device management code. |
534 | */ | 534 | */ |
535 | struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, | 535 | struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, |
536 | int nid) | 536 | int nid) |
537 | { | 537 | { |
538 | struct gen_pool **ptr, *pool; | 538 | struct gen_pool **ptr, *pool; |
539 | 539 | ||
540 | ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); | 540 | ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); |
541 | 541 | ||
542 | pool = gen_pool_create(min_alloc_order, nid); | 542 | pool = gen_pool_create(min_alloc_order, nid); |
543 | if (pool) { | 543 | if (pool) { |
544 | *ptr = pool; | 544 | *ptr = pool; |
545 | devres_add(dev, ptr); | 545 | devres_add(dev, ptr); |
546 | } else { | 546 | } else { |
547 | devres_free(ptr); | 547 | devres_free(ptr); |
548 | } | 548 | } |
549 | 549 | ||
550 | return pool; | 550 | return pool; |
551 | } | 551 | } |
552 | 552 | ||
553 | /** | 553 | /** |
554 | * dev_get_gen_pool - Obtain the gen_pool (if any) for a device | 554 | * dev_get_gen_pool - Obtain the gen_pool (if any) for a device |
555 | * @dev: device to retrieve the gen_pool from | 555 | * @dev: device to retrieve the gen_pool from |
556 | * | 556 | * |
557 | * Returns the gen_pool for the device if one is present, or NULL. | 557 | * Returns the gen_pool for the device if one is present, or NULL. |
558 | */ | 558 | */ |
559 | struct gen_pool *dev_get_gen_pool(struct device *dev) | 559 | struct gen_pool *dev_get_gen_pool(struct device *dev) |
560 | { | 560 | { |
561 | struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, | 561 | struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, |
562 | NULL); | 562 | NULL); |
563 | 563 | ||
564 | if (!p) | 564 | if (!p) |
565 | return NULL; | 565 | return NULL; |
566 | return *p; | 566 | return *p; |
567 | } | 567 | } |
568 | EXPORT_SYMBOL_GPL(dev_get_gen_pool); | 568 | EXPORT_SYMBOL_GPL(dev_get_gen_pool); |
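Sketch of the managed pattern these two helpers enable (driver and names hypothetical): the pool created in probe() is destroyed by devres when the device is unbound, and any code holding the same struct device can look the pool up again.

static int example_probe(struct platform_device *pdev)
{
	struct gen_pool *pool;

	pool = devm_gen_pool_create(&pdev->dev, ilog2(64),
				    dev_to_node(&pdev->dev));
	if (!pool)
		return -ENOMEM;
	/* ... gen_pool_add() the device's memory region into the pool ... */
	return 0;
}

	/* elsewhere, given the same struct device *dev: */
	struct gen_pool *pool = dev_get_gen_pool(dev);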
569 | 569 | ||
570 | #ifdef CONFIG_OF | 570 | #ifdef CONFIG_OF |
571 | /** | 571 | /** |
572 | * of_get_named_gen_pool - find a pool by phandle property | 572 | * of_get_named_gen_pool - find a pool by phandle property |
573 | * @np: device node | 573 | * @np: device node |
574 | * @propname: property name containing phandle(s) | 574 | * @propname: property name containing phandle(s) |
575 | * @index: index into the phandle array | 575 | * @index: index into the phandle array |
576 | * | 576 | * |
577 | * Returns the pool that contains the chunk starting at the physical | 577 | * Returns the pool that contains the chunk starting at the physical |
578 | * address of the device tree node pointed at by the phandle property, | 578 | * address of the device tree node pointed at by the phandle property, |
579 | * or NULL if not found. | 579 | * or NULL if not found. |
580 | */ | 580 | */ |
581 | struct gen_pool *of_get_named_gen_pool(struct device_node *np, | 581 | struct gen_pool *of_get_named_gen_pool(struct device_node *np, |
582 | const char *propname, int index) | 582 | const char *propname, int index) |
583 | { | 583 | { |
584 | struct platform_device *pdev; | 584 | struct platform_device *pdev; |
585 | struct device_node *np_pool; | 585 | struct device_node *np_pool; |
586 | 586 | ||
587 | np_pool = of_parse_phandle(np, propname, index); | 587 | np_pool = of_parse_phandle(np, propname, index); |
588 | if (!np_pool) | 588 | if (!np_pool) |
589 | return NULL; | 589 | return NULL; |
590 | pdev = of_find_device_by_node(np_pool); | 590 | pdev = of_find_device_by_node(np_pool); |
591 | of_node_put(np_pool); | ||
591 | if (!pdev) | 592 | if (!pdev) |
592 | return NULL; | 593 | return NULL; |
593 | return dev_get_gen_pool(&pdev->dev); | 594 | return dev_get_gen_pool(&pdev->dev); |
594 | } | 595 | } |
595 | EXPORT_SYMBOL_GPL(of_get_named_gen_pool); | 596 | EXPORT_SYMBOL_GPL(of_get_named_gen_pool); |
596 | #endif /* CONFIG_OF */ | 597 | #endif /* CONFIG_OF */ |
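A hedged device-tree sketch for of_get_named_gen_pool() (binding, node and property names are hypothetical): the consumer points at the pool provider's node with a phandle, and the lookup resolves it to the provider's gen_pool.

/*
 *	sram: sram@40000000 {
 *		compatible = "mmio-sram";
 *		reg = <0x40000000 0x10000>;
 *	};
 *
 *	consumer@0 {
 *		sram = <&sram>;
 *	};
 */
	struct gen_pool *pool;

	pool = of_get_named_gen_pool(dev->of_node, "sram", 0);
	if (!pool)
		return -EPROBE_DEFER;	/* provider not probed yet, perhaps */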
597 | 598 |
mm/memory.c
1 | /* | 1 | /* |
2 | * linux/mm/memory.c | 2 | * linux/mm/memory.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * demand-loading started 01.12.91 - seems it is high on the list of | 8 | * demand-loading started 01.12.91 - seems it is high on the list of |
9 | * things wanted, and it should be easy to implement. - Linus | 9 | * things wanted, and it should be easy to implement. - Linus |
10 | */ | 10 | */ |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * Ok, demand-loading was easy, shared pages a little bit trickier. Shared | 13 | * Ok, demand-loading was easy, shared pages a little bit trickier. Shared |
14 | * pages started 02.12.91, seems to work. - Linus. | 14 | * pages started 02.12.91, seems to work. - Linus. |
15 | * | 15 | * |
16 | * Tested sharing by executing about 30 /bin/sh: under the old kernel it | 16 | * Tested sharing by executing about 30 /bin/sh: under the old kernel it |
17 | * would have taken more than the 6M I have free, but it worked well as | 17 | * would have taken more than the 6M I have free, but it worked well as |
18 | * far as I could see. | 18 | * far as I could see. |
19 | * | 19 | * |
20 | * Also corrected some "invalidate()"s - I wasn't doing enough of them. | 20 | * Also corrected some "invalidate()"s - I wasn't doing enough of them. |
21 | */ | 21 | */ |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * Real VM (paging to/from disk) started 18.12.91. Much more work and | 24 | * Real VM (paging to/from disk) started 18.12.91. Much more work and |
25 | * thought has to go into this. Oh, well.. | 25 | * thought has to go into this. Oh, well.. |
26 | * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. | 26 | * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. |
27 | * Found it. Everything seems to work now. | 27 | * Found it. Everything seems to work now. |
28 | * 20.12.91 - Ok, making the swap-device changeable like the root. | 28 | * 20.12.91 - Ok, making the swap-device changeable like the root. |
29 | */ | 29 | */ |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * 05.04.94 - Multi-page memory management added for v1.1. | 32 | * 05.04.94 - Multi-page memory management added for v1.1. |
33 | * Idea by Alex Bligh (alex@cconcepts.co.uk) | 33 | * Idea by Alex Bligh (alex@cconcepts.co.uk) |
34 | * | 34 | * |
35 | * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG | 35 | * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG |
36 | * (Gerhard.Wichert@pdb.siemens.de) | 36 | * (Gerhard.Wichert@pdb.siemens.de) |
37 | * | 37 | * |
38 | * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) | 38 | * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) |
39 | */ | 39 | */ |
40 | 40 | ||
41 | #include <linux/kernel_stat.h> | 41 | #include <linux/kernel_stat.h> |
42 | #include <linux/mm.h> | 42 | #include <linux/mm.h> |
43 | #include <linux/hugetlb.h> | 43 | #include <linux/hugetlb.h> |
44 | #include <linux/mman.h> | 44 | #include <linux/mman.h> |
45 | #include <linux/swap.h> | 45 | #include <linux/swap.h> |
46 | #include <linux/highmem.h> | 46 | #include <linux/highmem.h> |
47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> |
48 | #include <linux/ksm.h> | 48 | #include <linux/ksm.h> |
49 | #include <linux/rmap.h> | 49 | #include <linux/rmap.h> |
50 | #include <linux/export.h> | 50 | #include <linux/export.h> |
51 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/writeback.h> | 53 | #include <linux/writeback.h> |
54 | #include <linux/memcontrol.h> | 54 | #include <linux/memcontrol.h> |
55 | #include <linux/mmu_notifier.h> | 55 | #include <linux/mmu_notifier.h> |
56 | #include <linux/kallsyms.h> | 56 | #include <linux/kallsyms.h> |
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | 62 | #include <linux/dma-debug.h> |
63 | #include <linux/debugfs.h> | 63 | #include <linux/debugfs.h> |
64 | 64 | ||
65 | #include <asm/io.h> | 65 | #include <asm/io.h> |
66 | #include <asm/pgalloc.h> | 66 | #include <asm/pgalloc.h> |
67 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
68 | #include <asm/tlb.h> | 68 | #include <asm/tlb.h> |
69 | #include <asm/tlbflush.h> | 69 | #include <asm/tlbflush.h> |
70 | #include <asm/pgtable.h> | 70 | #include <asm/pgtable.h> |
71 | 71 | ||
72 | #include "internal.h" | 72 | #include "internal.h" |
73 | 73 | ||
74 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS | 74 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
75 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. | 75 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. |
76 | #endif | 76 | #endif |
77 | 77 | ||
78 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 78 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
79 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 79 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
80 | unsigned long max_mapnr; | 80 | unsigned long max_mapnr; |
81 | struct page *mem_map; | 81 | struct page *mem_map; |
82 | 82 | ||
83 | EXPORT_SYMBOL(max_mapnr); | 83 | EXPORT_SYMBOL(max_mapnr); |
84 | EXPORT_SYMBOL(mem_map); | 84 | EXPORT_SYMBOL(mem_map); |
85 | #endif | 85 | #endif |
86 | 86 | ||
87 | /* | 87 | /* |
88 | * A number of key systems in x86 including ioremap() rely on the assumption | 88 | * A number of key systems in x86 including ioremap() rely on the assumption |
89 | * that high_memory defines the upper bound on direct map memory, the end | 89 | * that high_memory defines the upper bound on direct map memory, the end |
90 | * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and | 90 | * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and |
91 | * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL | 91 | * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL |
92 | * and ZONE_HIGHMEM. | 92 | * and ZONE_HIGHMEM. |
93 | */ | 93 | */ |
94 | void * high_memory; | 94 | void * high_memory; |
95 | 95 | ||
96 | EXPORT_SYMBOL(high_memory); | 96 | EXPORT_SYMBOL(high_memory); |
97 | 97 | ||
98 | /* | 98 | /* |
99 | * Randomize the address space (stacks, mmaps, brk, etc.). | 99 | * Randomize the address space (stacks, mmaps, brk, etc.). |
100 | * | 100 | * |
101 | * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, | 101 | * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, |
102 | * as ancient (libc5 based) binaries can segfault. ) | 102 | * as ancient (libc5 based) binaries can segfault. ) |
103 | */ | 103 | */ |
104 | int randomize_va_space __read_mostly = | 104 | int randomize_va_space __read_mostly = |
105 | #ifdef CONFIG_COMPAT_BRK | 105 | #ifdef CONFIG_COMPAT_BRK |
106 | 1; | 106 | 1; |
107 | #else | 107 | #else |
108 | 2; | 108 | 2; |
109 | #endif | 109 | #endif |
110 | 110 | ||
111 | static int __init disable_randmaps(char *s) | 111 | static int __init disable_randmaps(char *s) |
112 | { | 112 | { |
113 | randomize_va_space = 0; | 113 | randomize_va_space = 0; |
114 | return 1; | 114 | return 1; |
115 | } | 115 | } |
116 | __setup("norandmaps", disable_randmaps); | 116 | __setup("norandmaps", disable_randmaps); |
117 | 117 | ||
118 | unsigned long zero_pfn __read_mostly; | 118 | unsigned long zero_pfn __read_mostly; |
119 | unsigned long highest_memmap_pfn __read_mostly; | 119 | unsigned long highest_memmap_pfn __read_mostly; |
120 | 120 | ||
121 | EXPORT_SYMBOL(zero_pfn); | 121 | EXPORT_SYMBOL(zero_pfn); |
122 | 122 | ||
123 | /* | 123 | /* |
124 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() | 124 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() |
125 | */ | 125 | */ |
126 | static int __init init_zero_pfn(void) | 126 | static int __init init_zero_pfn(void) |
127 | { | 127 | { |
128 | zero_pfn = page_to_pfn(ZERO_PAGE(0)); | 128 | zero_pfn = page_to_pfn(ZERO_PAGE(0)); |
129 | return 0; | 129 | return 0; |
130 | } | 130 | } |
131 | core_initcall(init_zero_pfn); | 131 | core_initcall(init_zero_pfn); |
132 | 132 | ||
133 | 133 | ||
134 | #if defined(SPLIT_RSS_COUNTING) | 134 | #if defined(SPLIT_RSS_COUNTING) |
135 | 135 | ||
136 | void sync_mm_rss(struct mm_struct *mm) | 136 | void sync_mm_rss(struct mm_struct *mm) |
137 | { | 137 | { |
138 | int i; | 138 | int i; |
139 | 139 | ||
140 | for (i = 0; i < NR_MM_COUNTERS; i++) { | 140 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
141 | if (current->rss_stat.count[i]) { | 141 | if (current->rss_stat.count[i]) { |
142 | add_mm_counter(mm, i, current->rss_stat.count[i]); | 142 | add_mm_counter(mm, i, current->rss_stat.count[i]); |
143 | current->rss_stat.count[i] = 0; | 143 | current->rss_stat.count[i] = 0; |
144 | } | 144 | } |
145 | } | 145 | } |
146 | current->rss_stat.events = 0; | 146 | current->rss_stat.events = 0; |
147 | } | 147 | } |
148 | 148 | ||
149 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | 149 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) |
150 | { | 150 | { |
151 | struct task_struct *task = current; | 151 | struct task_struct *task = current; |
152 | 152 | ||
153 | if (likely(task->mm == mm)) | 153 | if (likely(task->mm == mm)) |
154 | task->rss_stat.count[member] += val; | 154 | task->rss_stat.count[member] += val; |
155 | else | 155 | else |
156 | add_mm_counter(mm, member, val); | 156 | add_mm_counter(mm, member, val); |
157 | } | 157 | } |
158 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | 158 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) |
159 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | 159 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) |
160 | 160 | ||
161 | /* sync counter once per 64 page faults */ | 161 | /* sync counter once per 64 page faults */ |
162 | #define TASK_RSS_EVENTS_THRESH (64) | 162 | #define TASK_RSS_EVENTS_THRESH (64) |
163 | static void check_sync_rss_stat(struct task_struct *task) | 163 | static void check_sync_rss_stat(struct task_struct *task) |
164 | { | 164 | { |
165 | if (unlikely(task != current)) | 165 | if (unlikely(task != current)) |
166 | return; | 166 | return; |
167 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | 167 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) |
168 | sync_mm_rss(task->mm); | 168 | sync_mm_rss(task->mm); |
169 | } | 169 | } |
170 | #else /* SPLIT_RSS_COUNTING */ | 170 | #else /* SPLIT_RSS_COUNTING */ |
171 | 171 | ||
172 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | 172 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) |
173 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | 173 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) |
174 | 174 | ||
175 | static void check_sync_rss_stat(struct task_struct *task) | 175 | static void check_sync_rss_stat(struct task_struct *task) |
176 | { | 176 | { |
177 | } | 177 | } |
178 | 178 | ||
179 | #endif /* SPLIT_RSS_COUNTING */ | 179 | #endif /* SPLIT_RSS_COUNTING */ |
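For readers unfamiliar with the split counters above, a sketch of how a fault path inside this file uses them (the surrounding fault handler is elided, names as in this file): the per-task delta is bumped cheaply, and check_sync_rss_stat() folds it into the shared mm counters once TASK_RSS_EVENTS_THRESH events have accumulated.

	/* inside a page-fault path in this file, where mm == current->mm */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	/* ... install the pte ... */
	check_sync_rss_stat(current);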
180 | 180 | ||
181 | #ifdef HAVE_GENERIC_MMU_GATHER | 181 | #ifdef HAVE_GENERIC_MMU_GATHER |
182 | 182 | ||
183 | static int tlb_next_batch(struct mmu_gather *tlb) | 183 | static int tlb_next_batch(struct mmu_gather *tlb) |
184 | { | 184 | { |
185 | struct mmu_gather_batch *batch; | 185 | struct mmu_gather_batch *batch; |
186 | 186 | ||
187 | batch = tlb->active; | 187 | batch = tlb->active; |
188 | if (batch->next) { | 188 | if (batch->next) { |
189 | tlb->active = batch->next; | 189 | tlb->active = batch->next; |
190 | return 1; | 190 | return 1; |
191 | } | 191 | } |
192 | 192 | ||
193 | if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) | 193 | if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) |
194 | return 0; | 194 | return 0; |
195 | 195 | ||
196 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); | 196 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); |
197 | if (!batch) | 197 | if (!batch) |
198 | return 0; | 198 | return 0; |
199 | 199 | ||
200 | tlb->batch_count++; | 200 | tlb->batch_count++; |
201 | batch->next = NULL; | 201 | batch->next = NULL; |
202 | batch->nr = 0; | 202 | batch->nr = 0; |
203 | batch->max = MAX_GATHER_BATCH; | 203 | batch->max = MAX_GATHER_BATCH; |
204 | 204 | ||
205 | tlb->active->next = batch; | 205 | tlb->active->next = batch; |
206 | tlb->active = batch; | 206 | tlb->active = batch; |
207 | 207 | ||
208 | return 1; | 208 | return 1; |
209 | } | 209 | } |
210 | 210 | ||
211 | /* tlb_gather_mmu | 211 | /* tlb_gather_mmu |
212 | * Called to initialize an (on-stack) mmu_gather structure for page-table | 212 | * Called to initialize an (on-stack) mmu_gather structure for page-table |
213 | * tear-down from @mm. The @fullmm argument is used when @mm is without | 213 | * tear-down from @mm. The @fullmm argument is used when @mm is without |
214 | * users and we're going to destroy the full address space (exit/execve). | 214 | * users and we're going to destroy the full address space (exit/execve). |
215 | */ | 215 | */ |
216 | void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) | 216 | void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) |
217 | { | 217 | { |
218 | tlb->mm = mm; | 218 | tlb->mm = mm; |
219 | 219 | ||
220 | /* Is it from 0 to ~0? */ | 220 | /* Is it from 0 to ~0? */ |
221 | tlb->fullmm = !(start | (end+1)); | 221 | tlb->fullmm = !(start | (end+1)); |
222 | tlb->need_flush_all = 0; | 222 | tlb->need_flush_all = 0; |
223 | tlb->start = start; | 223 | tlb->start = start; |
224 | tlb->end = end; | 224 | tlb->end = end; |
225 | tlb->need_flush = 0; | 225 | tlb->need_flush = 0; |
226 | tlb->local.next = NULL; | 226 | tlb->local.next = NULL; |
227 | tlb->local.nr = 0; | 227 | tlb->local.nr = 0; |
228 | tlb->local.max = ARRAY_SIZE(tlb->__pages); | 228 | tlb->local.max = ARRAY_SIZE(tlb->__pages); |
229 | tlb->active = &tlb->local; | 229 | tlb->active = &tlb->local; |
230 | tlb->batch_count = 0; | 230 | tlb->batch_count = 0; |
231 | 231 | ||
232 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 232 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
233 | tlb->batch = NULL; | 233 | tlb->batch = NULL; |
234 | #endif | 234 | #endif |
235 | } | 235 | } |
236 | 236 | ||
237 | static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) | 237 | static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) |
238 | { | 238 | { |
239 | tlb->need_flush = 0; | 239 | tlb->need_flush = 0; |
240 | tlb_flush(tlb); | 240 | tlb_flush(tlb); |
241 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 241 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
242 | tlb_table_flush(tlb); | 242 | tlb_table_flush(tlb); |
243 | #endif | 243 | #endif |
244 | } | 244 | } |
245 | 245 | ||
246 | static void tlb_flush_mmu_free(struct mmu_gather *tlb) | 246 | static void tlb_flush_mmu_free(struct mmu_gather *tlb) |
247 | { | 247 | { |
248 | struct mmu_gather_batch *batch; | 248 | struct mmu_gather_batch *batch; |
249 | 249 | ||
250 | for (batch = &tlb->local; batch; batch = batch->next) { | 250 | for (batch = &tlb->local; batch; batch = batch->next) { |
251 | free_pages_and_swap_cache(batch->pages, batch->nr); | 251 | free_pages_and_swap_cache(batch->pages, batch->nr); |
252 | batch->nr = 0; | 252 | batch->nr = 0; |
253 | } | 253 | } |
254 | tlb->active = &tlb->local; | 254 | tlb->active = &tlb->local; |
255 | } | 255 | } |
256 | 256 | ||
257 | void tlb_flush_mmu(struct mmu_gather *tlb) | 257 | void tlb_flush_mmu(struct mmu_gather *tlb) |
258 | { | 258 | { |
259 | if (!tlb->need_flush) | 259 | if (!tlb->need_flush) |
260 | return; | 260 | return; |
261 | tlb_flush_mmu_tlbonly(tlb); | 261 | tlb_flush_mmu_tlbonly(tlb); |
262 | tlb_flush_mmu_free(tlb); | 262 | tlb_flush_mmu_free(tlb); |
263 | } | 263 | } |
264 | 264 | ||
265 | /* tlb_finish_mmu | 265 | /* tlb_finish_mmu |
266 | * Called at the end of the shootdown operation to free up any resources | 266 | * Called at the end of the shootdown operation to free up any resources |
267 | * that were required. | 267 | * that were required. |
268 | */ | 268 | */ |
269 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | 269 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) |
270 | { | 270 | { |
271 | struct mmu_gather_batch *batch, *next; | 271 | struct mmu_gather_batch *batch, *next; |
272 | 272 | ||
273 | tlb_flush_mmu(tlb); | 273 | tlb_flush_mmu(tlb); |
274 | 274 | ||
275 | /* keep the page table cache within bounds */ | 275 | /* keep the page table cache within bounds */ |
276 | check_pgt_cache(); | 276 | check_pgt_cache(); |
277 | 277 | ||
278 | for (batch = tlb->local.next; batch; batch = next) { | 278 | for (batch = tlb->local.next; batch; batch = next) { |
279 | next = batch->next; | 279 | next = batch->next; |
280 | free_pages((unsigned long)batch, 0); | 280 | free_pages((unsigned long)batch, 0); |
281 | } | 281 | } |
282 | tlb->local.next = NULL; | 282 | tlb->local.next = NULL; |
283 | } | 283 | } |
284 | 284 | ||
285 | /* __tlb_remove_page | 285 | /* __tlb_remove_page |
286 | * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while | 286 | * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while |
287 | * handling the additional races in SMP caused by other CPUs caching valid | 287 | * handling the additional races in SMP caused by other CPUs caching valid |
288 | * mappings in their TLBs. Returns the number of free page slots left. | 288 | * mappings in their TLBs. Returns the number of free page slots left. |
289 | * When out of page slots we must call tlb_flush_mmu(). | 289 | * When out of page slots we must call tlb_flush_mmu(). |
290 | */ | 290 | */ |
291 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 291 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
292 | { | 292 | { |
293 | struct mmu_gather_batch *batch; | 293 | struct mmu_gather_batch *batch; |
294 | 294 | ||
295 | VM_BUG_ON(!tlb->need_flush); | 295 | VM_BUG_ON(!tlb->need_flush); |
296 | 296 | ||
297 | batch = tlb->active; | 297 | batch = tlb->active; |
298 | batch->pages[batch->nr++] = page; | 298 | batch->pages[batch->nr++] = page; |
299 | if (batch->nr == batch->max) { | 299 | if (batch->nr == batch->max) { |
300 | if (!tlb_next_batch(tlb)) | 300 | if (!tlb_next_batch(tlb)) |
301 | return 0; | 301 | return 0; |
302 | batch = tlb->active; | 302 | batch = tlb->active; |
303 | } | 303 | } |
304 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); | 304 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); |
305 | 305 | ||
306 | return batch->max - batch->nr; | 306 | return batch->max - batch->nr; |
307 | } | 307 | } |
308 | 308 | ||
309 | #endif /* HAVE_GENERIC_MMU_GATHER */ | 309 | #endif /* HAVE_GENERIC_MMU_GATHER */ |
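A sketch of the calling convention around the gather machinery above (this mirrors what the unmap paths later in this file do; start/end are illustrative):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, start, end);
	/* walk the page tables; hand each page to __tlb_remove_page() and
	 * call tlb_flush_mmu() whenever it reports no free slots left */
	tlb_finish_mmu(&tlb, start, end);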
310 | 310 | ||
311 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 311 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
312 | 312 | ||
313 | /* | 313 | /* |
314 | * See the comment near struct mmu_table_batch. | 314 | * See the comment near struct mmu_table_batch. |
315 | */ | 315 | */ |
316 | 316 | ||
317 | static void tlb_remove_table_smp_sync(void *arg) | 317 | static void tlb_remove_table_smp_sync(void *arg) |
318 | { | 318 | { |
319 | /* Simply deliver the interrupt */ | 319 | /* Simply deliver the interrupt */ |
320 | } | 320 | } |
321 | 321 | ||
322 | static void tlb_remove_table_one(void *table) | 322 | static void tlb_remove_table_one(void *table) |
323 | { | 323 | { |
324 | /* | 324 | /* |
325 | * This isn't an RCU grace period and hence the page-tables cannot be | 325 | * This isn't an RCU grace period and hence the page-tables cannot be |
326 | * assumed to be actually RCU-freed. | 326 | * assumed to be actually RCU-freed. |
327 | * | 327 | * |
328 | * It is however sufficient for software page-table walkers that rely on | 328 | * It is however sufficient for software page-table walkers that rely on |
329 | * IRQ disabling. See the comment near struct mmu_table_batch. | 329 | * IRQ disabling. See the comment near struct mmu_table_batch. |
330 | */ | 330 | */ |
331 | smp_call_function(tlb_remove_table_smp_sync, NULL, 1); | 331 | smp_call_function(tlb_remove_table_smp_sync, NULL, 1); |
332 | __tlb_remove_table(table); | 332 | __tlb_remove_table(table); |
333 | } | 333 | } |
334 | 334 | ||
335 | static void tlb_remove_table_rcu(struct rcu_head *head) | 335 | static void tlb_remove_table_rcu(struct rcu_head *head) |
336 | { | 336 | { |
337 | struct mmu_table_batch *batch; | 337 | struct mmu_table_batch *batch; |
338 | int i; | 338 | int i; |
339 | 339 | ||
340 | batch = container_of(head, struct mmu_table_batch, rcu); | 340 | batch = container_of(head, struct mmu_table_batch, rcu); |
341 | 341 | ||
342 | for (i = 0; i < batch->nr; i++) | 342 | for (i = 0; i < batch->nr; i++) |
343 | __tlb_remove_table(batch->tables[i]); | 343 | __tlb_remove_table(batch->tables[i]); |
344 | 344 | ||
345 | free_page((unsigned long)batch); | 345 | free_page((unsigned long)batch); |
346 | } | 346 | } |
347 | 347 | ||
348 | void tlb_table_flush(struct mmu_gather *tlb) | 348 | void tlb_table_flush(struct mmu_gather *tlb) |
349 | { | 349 | { |
350 | struct mmu_table_batch **batch = &tlb->batch; | 350 | struct mmu_table_batch **batch = &tlb->batch; |
351 | 351 | ||
352 | if (*batch) { | 352 | if (*batch) { |
353 | call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); | 353 | call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); |
354 | *batch = NULL; | 354 | *batch = NULL; |
355 | } | 355 | } |
356 | } | 356 | } |
357 | 357 | ||
358 | void tlb_remove_table(struct mmu_gather *tlb, void *table) | 358 | void tlb_remove_table(struct mmu_gather *tlb, void *table) |
359 | { | 359 | { |
360 | struct mmu_table_batch **batch = &tlb->batch; | 360 | struct mmu_table_batch **batch = &tlb->batch; |
361 | 361 | ||
362 | tlb->need_flush = 1; | 362 | tlb->need_flush = 1; |
363 | 363 | ||
364 | /* | 364 | /* |
365 | * When there are fewer than two users of this mm there cannot be a | 365 | * When there are fewer than two users of this mm there cannot be a |
366 | * concurrent page-table walk. | 366 | * concurrent page-table walk. |
367 | */ | 367 | */ |
368 | if (atomic_read(&tlb->mm->mm_users) < 2) { | 368 | if (atomic_read(&tlb->mm->mm_users) < 2) { |
369 | __tlb_remove_table(table); | 369 | __tlb_remove_table(table); |
370 | return; | 370 | return; |
371 | } | 371 | } |
372 | 372 | ||
373 | if (*batch == NULL) { | 373 | if (*batch == NULL) { |
374 | *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); | 374 | *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); |
375 | if (*batch == NULL) { | 375 | if (*batch == NULL) { |
376 | tlb_remove_table_one(table); | 376 | tlb_remove_table_one(table); |
377 | return; | 377 | return; |
378 | } | 378 | } |
379 | (*batch)->nr = 0; | 379 | (*batch)->nr = 0; |
380 | } | 380 | } |
381 | (*batch)->tables[(*batch)->nr++] = table; | 381 | (*batch)->tables[(*batch)->nr++] = table; |
382 | if ((*batch)->nr == MAX_TABLE_BATCH) | 382 | if ((*batch)->nr == MAX_TABLE_BATCH) |
383 | tlb_table_flush(tlb); | 383 | tlb_table_flush(tlb); |
384 | } | 384 | } |
385 | 385 | ||
386 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ | 386 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ |
387 | 387 | ||
388 | /* | 388 | /* |
389 | * Note: this doesn't free the actual pages themselves. That | 389 | * Note: this doesn't free the actual pages themselves. That |
390 | * has been handled earlier when unmapping all the memory regions. | 390 | * has been handled earlier when unmapping all the memory regions. |
391 | */ | 391 | */ |
392 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | 392 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, |
393 | unsigned long addr) | 393 | unsigned long addr) |
394 | { | 394 | { |
395 | pgtable_t token = pmd_pgtable(*pmd); | 395 | pgtable_t token = pmd_pgtable(*pmd); |
396 | pmd_clear(pmd); | 396 | pmd_clear(pmd); |
397 | pte_free_tlb(tlb, token, addr); | 397 | pte_free_tlb(tlb, token, addr); |
398 | atomic_long_dec(&tlb->mm->nr_ptes); | 398 | atomic_long_dec(&tlb->mm->nr_ptes); |
399 | } | 399 | } |
400 | 400 | ||
401 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 401 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
402 | unsigned long addr, unsigned long end, | 402 | unsigned long addr, unsigned long end, |
403 | unsigned long floor, unsigned long ceiling) | 403 | unsigned long floor, unsigned long ceiling) |
404 | { | 404 | { |
405 | pmd_t *pmd; | 405 | pmd_t *pmd; |
406 | unsigned long next; | 406 | unsigned long next; |
407 | unsigned long start; | 407 | unsigned long start; |
408 | 408 | ||
409 | start = addr; | 409 | start = addr; |
410 | pmd = pmd_offset(pud, addr); | 410 | pmd = pmd_offset(pud, addr); |
411 | do { | 411 | do { |
412 | next = pmd_addr_end(addr, end); | 412 | next = pmd_addr_end(addr, end); |
413 | if (pmd_none_or_clear_bad(pmd)) | 413 | if (pmd_none_or_clear_bad(pmd)) |
414 | continue; | 414 | continue; |
415 | free_pte_range(tlb, pmd, addr); | 415 | free_pte_range(tlb, pmd, addr); |
416 | } while (pmd++, addr = next, addr != end); | 416 | } while (pmd++, addr = next, addr != end); |
417 | 417 | ||
418 | start &= PUD_MASK; | 418 | start &= PUD_MASK; |
419 | if (start < floor) | 419 | if (start < floor) |
420 | return; | 420 | return; |
421 | if (ceiling) { | 421 | if (ceiling) { |
422 | ceiling &= PUD_MASK; | 422 | ceiling &= PUD_MASK; |
423 | if (!ceiling) | 423 | if (!ceiling) |
424 | return; | 424 | return; |
425 | } | 425 | } |
426 | if (end - 1 > ceiling - 1) | 426 | if (end - 1 > ceiling - 1) |
427 | return; | 427 | return; |
428 | 428 | ||
429 | pmd = pmd_offset(pud, start); | 429 | pmd = pmd_offset(pud, start); |
430 | pud_clear(pud); | 430 | pud_clear(pud); |
431 | pmd_free_tlb(tlb, pmd, start); | 431 | pmd_free_tlb(tlb, pmd, start); |
432 | } | 432 | } |
433 | 433 | ||
434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
435 | unsigned long addr, unsigned long end, | 435 | unsigned long addr, unsigned long end, |
436 | unsigned long floor, unsigned long ceiling) | 436 | unsigned long floor, unsigned long ceiling) |
437 | { | 437 | { |
438 | pud_t *pud; | 438 | pud_t *pud; |
439 | unsigned long next; | 439 | unsigned long next; |
440 | unsigned long start; | 440 | unsigned long start; |
441 | 441 | ||
442 | start = addr; | 442 | start = addr; |
443 | pud = pud_offset(pgd, addr); | 443 | pud = pud_offset(pgd, addr); |
444 | do { | 444 | do { |
445 | next = pud_addr_end(addr, end); | 445 | next = pud_addr_end(addr, end); |
446 | if (pud_none_or_clear_bad(pud)) | 446 | if (pud_none_or_clear_bad(pud)) |
447 | continue; | 447 | continue; |
448 | free_pmd_range(tlb, pud, addr, next, floor, ceiling); | 448 | free_pmd_range(tlb, pud, addr, next, floor, ceiling); |
449 | } while (pud++, addr = next, addr != end); | 449 | } while (pud++, addr = next, addr != end); |
450 | 450 | ||
451 | start &= PGDIR_MASK; | 451 | start &= PGDIR_MASK; |
452 | if (start < floor) | 452 | if (start < floor) |
453 | return; | 453 | return; |
454 | if (ceiling) { | 454 | if (ceiling) { |
455 | ceiling &= PGDIR_MASK; | 455 | ceiling &= PGDIR_MASK; |
456 | if (!ceiling) | 456 | if (!ceiling) |
457 | return; | 457 | return; |
458 | } | 458 | } |
459 | if (end - 1 > ceiling - 1) | 459 | if (end - 1 > ceiling - 1) |
460 | return; | 460 | return; |
461 | 461 | ||
462 | pud = pud_offset(pgd, start); | 462 | pud = pud_offset(pgd, start); |
463 | pgd_clear(pgd); | 463 | pgd_clear(pgd); |
464 | pud_free_tlb(tlb, pud, start); | 464 | pud_free_tlb(tlb, pud, start); |
465 | } | 465 | } |
466 | 466 | ||
467 | /* | 467 | /* |
468 | * This function frees user-level page tables of a process. | 468 | * This function frees user-level page tables of a process. |
469 | */ | 469 | */ |
470 | void free_pgd_range(struct mmu_gather *tlb, | 470 | void free_pgd_range(struct mmu_gather *tlb, |
471 | unsigned long addr, unsigned long end, | 471 | unsigned long addr, unsigned long end, |
472 | unsigned long floor, unsigned long ceiling) | 472 | unsigned long floor, unsigned long ceiling) |
473 | { | 473 | { |
474 | pgd_t *pgd; | 474 | pgd_t *pgd; |
475 | unsigned long next; | 475 | unsigned long next; |
476 | 476 | ||
477 | /* | 477 | /* |
478 | * The next few lines have given us lots of grief... | 478 | * The next few lines have given us lots of grief... |
479 | * | 479 | * |
480 | * Why are we testing PMD* at this top level? Because often | 480 | * Why are we testing PMD* at this top level? Because often |
481 | * there will be no work to do at all, and we'd prefer not to | 481 | * there will be no work to do at all, and we'd prefer not to |
482 | * go all the way down to the bottom just to discover that. | 482 | * go all the way down to the bottom just to discover that. |
483 | * | 483 | * |
484 | * Why all these "- 1"s? Because 0 represents both the bottom | 484 | * Why all these "- 1"s? Because 0 represents both the bottom |
485 | * of the address space and the top of it (using -1 for the | 485 | * of the address space and the top of it (using -1 for the |
486 | * top wouldn't help much: the masks would do the wrong thing). | 486 | * top wouldn't help much: the masks would do the wrong thing). |
487 | * The rule is that addr 0 and floor 0 refer to the bottom of | 487 | * The rule is that addr 0 and floor 0 refer to the bottom of |
488 | * the address space, but end 0 and ceiling 0 refer to the top | 488 | * the address space, but end 0 and ceiling 0 refer to the top |
489 | * Comparisons need to use "end - 1" and "ceiling - 1" (though | 489 | * Comparisons need to use "end - 1" and "ceiling - 1" (though |
490 | * that end 0 case should be mythical). | 490 | * that end 0 case should be mythical). |
491 | * | 491 | * |
492 | * Wherever addr is brought up or ceiling brought down, we must | 492 | * Wherever addr is brought up or ceiling brought down, we must |
493 | * be careful to reject "the opposite 0" before it confuses the | 493 | * be careful to reject "the opposite 0" before it confuses the |
494 | * subsequent tests. But what about where end is brought down | 494 | * subsequent tests. But what about where end is brought down |
495 | * by PMD_SIZE below? no, end can't go down to 0 there. | 495 | * by PMD_SIZE below? no, end can't go down to 0 there. |
496 | * | 496 | * |
497 | * Whereas we round start (addr) and ceiling down, by different | 497 | * Whereas we round start (addr) and ceiling down, by different |
498 | * masks at different levels, in order to test whether a table | 498 | * masks at different levels, in order to test whether a table |
499 | * now has no other vmas using it, so can be freed, we don't | 499 | * now has no other vmas using it, so can be freed, we don't |
500 | * bother to round floor or end up - the tests don't need that. | 500 | * bother to round floor or end up - the tests don't need that. |
501 | */ | 501 | */ |
502 | 502 | ||
503 | addr &= PMD_MASK; | 503 | addr &= PMD_MASK; |
504 | if (addr < floor) { | 504 | if (addr < floor) { |
505 | addr += PMD_SIZE; | 505 | addr += PMD_SIZE; |
506 | if (!addr) | 506 | if (!addr) |
507 | return; | 507 | return; |
508 | } | 508 | } |
509 | if (ceiling) { | 509 | if (ceiling) { |
510 | ceiling &= PMD_MASK; | 510 | ceiling &= PMD_MASK; |
511 | if (!ceiling) | 511 | if (!ceiling) |
512 | return; | 512 | return; |
513 | } | 513 | } |
514 | if (end - 1 > ceiling - 1) | 514 | if (end - 1 > ceiling - 1) |
515 | end -= PMD_SIZE; | 515 | end -= PMD_SIZE; |
516 | if (addr > end - 1) | 516 | if (addr > end - 1) |
517 | return; | 517 | return; |
518 | 518 | ||
519 | pgd = pgd_offset(tlb->mm, addr); | 519 | pgd = pgd_offset(tlb->mm, addr); |
520 | do { | 520 | do { |
521 | next = pgd_addr_end(addr, end); | 521 | next = pgd_addr_end(addr, end); |
522 | if (pgd_none_or_clear_bad(pgd)) | 522 | if (pgd_none_or_clear_bad(pgd)) |
523 | continue; | 523 | continue; |
524 | free_pud_range(tlb, pgd, addr, next, floor, ceiling); | 524 | free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
525 | } while (pgd++, addr = next, addr != end); | 525 | } while (pgd++, addr = next, addr != end); |
526 | } | 526 | } |
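A small worked example of the floor rounding discussed in the comment above (addresses hypothetical, 2 MiB PMDs assumed):

/*
 * addr = 0x1ff000, floor = 0x100000:
 *   addr &= PMD_MASK     -> 0x000000, which is below floor,
 *   so addr += PMD_SIZE  -> 0x200000 before any table is considered for freeing.
 * With floor = 0 the rounded-down address would have been kept.
 */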
527 | 527 | ||
528 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | 528 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, |
529 | unsigned long floor, unsigned long ceiling) | 529 | unsigned long floor, unsigned long ceiling) |
530 | { | 530 | { |
531 | while (vma) { | 531 | while (vma) { |
532 | struct vm_area_struct *next = vma->vm_next; | 532 | struct vm_area_struct *next = vma->vm_next; |
533 | unsigned long addr = vma->vm_start; | 533 | unsigned long addr = vma->vm_start; |
534 | 534 | ||
535 | /* | 535 | /* |
536 | * Hide vma from rmap and truncate_pagecache before freeing | 536 | * Hide vma from rmap and truncate_pagecache before freeing |
537 | * pgtables | 537 | * pgtables |
538 | */ | 538 | */ |
539 | unlink_anon_vmas(vma); | 539 | unlink_anon_vmas(vma); |
540 | unlink_file_vma(vma); | 540 | unlink_file_vma(vma); |
541 | 541 | ||
542 | if (is_vm_hugetlb_page(vma)) { | 542 | if (is_vm_hugetlb_page(vma)) { |
543 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 543 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
544 | floor, next? next->vm_start: ceiling); | 544 | floor, next? next->vm_start: ceiling); |
545 | } else { | 545 | } else { |
546 | /* | 546 | /* |
547 | * Optimization: gather nearby vmas into one call down | 547 | * Optimization: gather nearby vmas into one call down |
548 | */ | 548 | */ |
549 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE | 549 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE |
550 | && !is_vm_hugetlb_page(next)) { | 550 | && !is_vm_hugetlb_page(next)) { |
551 | vma = next; | 551 | vma = next; |
552 | next = vma->vm_next; | 552 | next = vma->vm_next; |
553 | unlink_anon_vmas(vma); | 553 | unlink_anon_vmas(vma); |
554 | unlink_file_vma(vma); | 554 | unlink_file_vma(vma); |
555 | } | 555 | } |
556 | free_pgd_range(tlb, addr, vma->vm_end, | 556 | free_pgd_range(tlb, addr, vma->vm_end, |
557 | floor, next? next->vm_start: ceiling); | 557 | floor, next? next->vm_start: ceiling); |
558 | } | 558 | } |
559 | vma = next; | 559 | vma = next; |
560 | } | 560 | } |
561 | } | 561 | } |
562 | 562 | ||
563 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | 563 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
564 | pmd_t *pmd, unsigned long address) | 564 | pmd_t *pmd, unsigned long address) |
565 | { | 565 | { |
566 | spinlock_t *ptl; | 566 | spinlock_t *ptl; |
567 | pgtable_t new = pte_alloc_one(mm, address); | 567 | pgtable_t new = pte_alloc_one(mm, address); |
568 | int wait_split_huge_page; | 568 | int wait_split_huge_page; |
569 | if (!new) | 569 | if (!new) |
570 | return -ENOMEM; | 570 | return -ENOMEM; |
571 | 571 | ||
572 | /* | 572 | /* |
573 | * Ensure all pte setup (eg. pte page lock and page clearing) are | 573 | * Ensure all pte setup (eg. pte page lock and page clearing) are |
574 | * visible before the pte is made visible to other CPUs by being | 574 | * visible before the pte is made visible to other CPUs by being |
575 | * put into page tables. | 575 | * put into page tables. |
576 | * | 576 | * |
577 | * The other side of the story is the pointer chasing in the page | 577 | * The other side of the story is the pointer chasing in the page |
578 | * table walking code (when walking the page table without locking; | 578 | * table walking code (when walking the page table without locking; |
579 | * ie. most of the time). Fortunately, these data accesses consist | 579 | * ie. most of the time). Fortunately, these data accesses consist |
580 | * of a chain of data-dependent loads, meaning most CPUs (alpha | 580 | * of a chain of data-dependent loads, meaning most CPUs (alpha |
581 | * being the notable exception) will already guarantee loads are | 581 | * being the notable exception) will already guarantee loads are |
582 | * seen in-order. See the alpha page table accessors for the | 582 | * seen in-order. See the alpha page table accessors for the |
583 | * smp_read_barrier_depends() barriers in page table walking code. | 583 | * smp_read_barrier_depends() barriers in page table walking code. |
584 | */ | 584 | */ |
585 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 585 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
586 | 586 | ||
587 | ptl = pmd_lock(mm, pmd); | 587 | ptl = pmd_lock(mm, pmd); |
588 | wait_split_huge_page = 0; | 588 | wait_split_huge_page = 0; |
589 | if (likely(pmd_none(*pmd))) { /* Has another populated it? */ | 589 | if (likely(pmd_none(*pmd))) { /* Has another populated it? */ |
590 | atomic_long_inc(&mm->nr_ptes); | 590 | atomic_long_inc(&mm->nr_ptes); |
591 | pmd_populate(mm, pmd, new); | 591 | pmd_populate(mm, pmd, new); |
592 | new = NULL; | 592 | new = NULL; |
593 | } else if (unlikely(pmd_trans_splitting(*pmd))) | 593 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
594 | wait_split_huge_page = 1; | 594 | wait_split_huge_page = 1; |
595 | spin_unlock(ptl); | 595 | spin_unlock(ptl); |
596 | if (new) | 596 | if (new) |
597 | pte_free(mm, new); | 597 | pte_free(mm, new); |
598 | if (wait_split_huge_page) | 598 | if (wait_split_huge_page) |
599 | wait_split_huge_page(vma->anon_vma, pmd); | 599 | wait_split_huge_page(vma->anon_vma, pmd); |
600 | return 0; | 600 | return 0; |
601 | } | 601 | } |
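__pte_alloc() above follows a common lock-narrowing idiom: allocate the new pte page outside the lock, recheck pmd_none() under the lock, and throw the page away if another thread populated the pmd first. A minimal user-space sketch of the same publish-or-discard pattern, assuming pthreads and a plain pointer slot in place of the pmd (all names below are illustrative, not kernel APIs):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;                     /* stands in for the pmd entry */

/* Make sure the slot is populated; returns 0 on success, -1 on OOM. */
static int slot_alloc(void)
{
        void *new = calloc(1, 4096);   /* allocate outside the lock */

        if (!new)
                return -1;

        pthread_mutex_lock(&slot_lock);
        if (!slot) {                   /* has another thread populated it? */
                slot = new;            /* publish the new page */
                new = NULL;
        }
        pthread_mutex_unlock(&slot_lock);

        free(new);                     /* lost the race: drop our copy */
        return 0;
}

The kernel version additionally needs the smp_wmb() because page-table walkers read the pmd without taking the lock; the mutex in this sketch provides that ordering implicitly.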
602 | 602 | ||
603 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | 603 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) |
604 | { | 604 | { |
605 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); | 605 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); |
606 | if (!new) | 606 | if (!new) |
607 | return -ENOMEM; | 607 | return -ENOMEM; |
608 | 608 | ||
609 | smp_wmb(); /* See comment in __pte_alloc */ | 609 | smp_wmb(); /* See comment in __pte_alloc */ |
610 | 610 | ||
611 | spin_lock(&init_mm.page_table_lock); | 611 | spin_lock(&init_mm.page_table_lock); |
612 | if (likely(pmd_none(*pmd))) { /* Has another populated it? */ | 612 | if (likely(pmd_none(*pmd))) { /* Has another populated it? */ |
613 | pmd_populate_kernel(&init_mm, pmd, new); | 613 | pmd_populate_kernel(&init_mm, pmd, new); |
614 | new = NULL; | 614 | new = NULL; |
615 | } else | 615 | } else |
616 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | 616 | VM_BUG_ON(pmd_trans_splitting(*pmd)); |
617 | spin_unlock(&init_mm.page_table_lock); | 617 | spin_unlock(&init_mm.page_table_lock); |
618 | if (new) | 618 | if (new) |
619 | pte_free_kernel(&init_mm, new); | 619 | pte_free_kernel(&init_mm, new); |
620 | return 0; | 620 | return 0; |
621 | } | 621 | } |
622 | 622 | ||
623 | static inline void init_rss_vec(int *rss) | 623 | static inline void init_rss_vec(int *rss) |
624 | { | 624 | { |
625 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); | 625 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
626 | } | 626 | } |
627 | 627 | ||
628 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) | 628 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
629 | { | 629 | { |
630 | int i; | 630 | int i; |
631 | 631 | ||
632 | if (current->mm == mm) | 632 | if (current->mm == mm) |
633 | sync_mm_rss(mm); | 633 | sync_mm_rss(mm); |
634 | for (i = 0; i < NR_MM_COUNTERS; i++) | 634 | for (i = 0; i < NR_MM_COUNTERS; i++) |
635 | if (rss[i]) | 635 | if (rss[i]) |
636 | add_mm_counter(mm, i, rss[i]); | 636 | add_mm_counter(mm, i, rss[i]); |
637 | } | 637 | } |
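init_rss_vec() and add_mm_rss_vec() batch the per-type RSS deltas in a small on-stack array so that the shared mm counters are touched at most once per counter per pte range instead of once per page. A stand-alone sketch of that batching idea, assuming C11 atomics and made-up counter names:

#include <stdatomic.h>
#include <string.h>

enum { CNT_FILE, CNT_ANON, CNT_SWAP, NR_CNT };

static atomic_long shared[NR_CNT];       /* shared, contended counters */

static void flush_deltas(const long *delta)
{
        for (int i = 0; i < NR_CNT; i++)
                if (delta[i])
                        atomic_fetch_add(&shared[i], delta[i]);
}

static void scan_range(int npages)
{
        long delta[NR_CNT];

        memset(delta, 0, sizeof(delta)); /* init_rss_vec() equivalent */
        for (int i = 0; i < npages; i++)
                delta[CNT_ANON]++;       /* cheap, local bookkeeping */
        flush_deltas(delta);             /* one atomic op per counter */
}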
638 | 638 | ||
639 | /* | 639 | /* |
640 | * This function is called to print an error when a bad pte | 640 | * This function is called to print an error when a bad pte |
641 | * is found. For example, we might have a PFN-mapped pte in | 641 | * is found. For example, we might have a PFN-mapped pte in |
642 | * a region that doesn't allow it. | 642 | * a region that doesn't allow it. |
643 | * | 643 | * |
644 | * The calling function must still handle the error. | 644 | * The calling function must still handle the error. |
645 | */ | 645 | */ |
646 | static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | 646 | static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, |
647 | pte_t pte, struct page *page) | 647 | pte_t pte, struct page *page) |
648 | { | 648 | { |
649 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); | 649 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); |
650 | pud_t *pud = pud_offset(pgd, addr); | 650 | pud_t *pud = pud_offset(pgd, addr); |
651 | pmd_t *pmd = pmd_offset(pud, addr); | 651 | pmd_t *pmd = pmd_offset(pud, addr); |
652 | struct address_space *mapping; | 652 | struct address_space *mapping; |
653 | pgoff_t index; | 653 | pgoff_t index; |
654 | static unsigned long resume; | 654 | static unsigned long resume; |
655 | static unsigned long nr_shown; | 655 | static unsigned long nr_shown; |
656 | static unsigned long nr_unshown; | 656 | static unsigned long nr_unshown; |
657 | 657 | ||
658 | /* | 658 | /* |
659 | * Allow a burst of 60 reports, then keep quiet for that minute; | 659 | * Allow a burst of 60 reports, then keep quiet for that minute; |
660 | * or allow a steady drip of one report per second. | 660 | * or allow a steady drip of one report per second. |
661 | */ | 661 | */ |
662 | if (nr_shown == 60) { | 662 | if (nr_shown == 60) { |
663 | if (time_before(jiffies, resume)) { | 663 | if (time_before(jiffies, resume)) { |
664 | nr_unshown++; | 664 | nr_unshown++; |
665 | return; | 665 | return; |
666 | } | 666 | } |
667 | if (nr_unshown) { | 667 | if (nr_unshown) { |
668 | printk(KERN_ALERT | 668 | printk(KERN_ALERT |
669 | "BUG: Bad page map: %lu messages suppressed\n", | 669 | "BUG: Bad page map: %lu messages suppressed\n", |
670 | nr_unshown); | 670 | nr_unshown); |
671 | nr_unshown = 0; | 671 | nr_unshown = 0; |
672 | } | 672 | } |
673 | nr_shown = 0; | 673 | nr_shown = 0; |
674 | } | 674 | } |
675 | if (nr_shown++ == 0) | 675 | if (nr_shown++ == 0) |
676 | resume = jiffies + 60 * HZ; | 676 | resume = jiffies + 60 * HZ; |
677 | 677 | ||
678 | mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; | 678 | mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; |
679 | index = linear_page_index(vma, addr); | 679 | index = linear_page_index(vma, addr); |
680 | 680 | ||
681 | printk(KERN_ALERT | 681 | printk(KERN_ALERT |
682 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | 682 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", |
683 | current->comm, | 683 | current->comm, |
684 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 684 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
685 | if (page) | 685 | if (page) |
686 | dump_page(page, "bad pte"); | 686 | dump_page(page, "bad pte"); |
687 | printk(KERN_ALERT | 687 | printk(KERN_ALERT |
688 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 688 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
689 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 689 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
690 | /* | 690 | /* |
691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y |
692 | */ | 692 | */ |
693 | if (vma->vm_ops) | 693 | if (vma->vm_ops) |
694 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", | 694 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", |
695 | vma->vm_ops->fault); | 695 | vma->vm_ops->fault); |
696 | if (vma->vm_file) | 696 | if (vma->vm_file) |
697 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", | 697 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", |
698 | vma->vm_file->f_op->mmap); | 698 | vma->vm_file->f_op->mmap); |
699 | dump_stack(); | 699 | dump_stack(); |
700 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 700 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
701 | } | 701 | } |
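The throttle at the top of print_bad_pte() allows a burst of 60 reports, then stays quiet for the rest of the minute while counting what it dropped. The same policy can be sketched in plain C with wall-clock seconds standing in for jiffies (a simplified, single-threaded illustration, not the kernel helper):

#include <stdio.h>
#include <time.h>

#define BURST 60

static void report(const char *msg)
{
        static time_t resume;
        static unsigned long nr_shown, nr_unshown;
        time_t now = time(NULL);

        if (nr_shown == BURST) {
                if (now < resume) {        /* still inside the quiet minute */
                        nr_unshown++;
                        return;
                }
                if (nr_unshown) {
                        printf("%lu messages suppressed\n", nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (nr_shown++ == 0)
                resume = now + 60;         /* open a new one-minute window */

        printf("%s\n", msg);
}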
702 | 702 | ||
703 | /* | 703 | /* |
704 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 704 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
705 | * | 705 | * |
706 | * "Special" mappings do not wish to be associated with a "struct page" (either | 706 | * "Special" mappings do not wish to be associated with a "struct page" (either |
707 | * it doesn't exist, or it exists but they don't want to touch it). In this | 707 | * it doesn't exist, or it exists but they don't want to touch it). In this |
708 | * case, NULL is returned here. "Normal" mappings do have a struct page. | 708 | * case, NULL is returned here. "Normal" mappings do have a struct page. |
709 | * | 709 | * |
710 | * There are 2 broad cases. Firstly, an architecture may define a pte_special() | 710 | * There are 2 broad cases. Firstly, an architecture may define a pte_special() |
711 | * pte bit, in which case this function is trivial. Secondly, an architecture | 711 | * pte bit, in which case this function is trivial. Secondly, an architecture |
712 | * may not have a spare pte bit, which requires a more complicated scheme, | 712 | * may not have a spare pte bit, which requires a more complicated scheme, |
713 | * described below. | 713 | * described below. |
714 | * | 714 | * |
715 | * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a | 715 | * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a |
716 | * special mapping (even if there are underlying and valid "struct pages"). | 716 | * special mapping (even if there are underlying and valid "struct pages"). |
717 | * COWed pages of a VM_PFNMAP are always normal. | 717 | * COWed pages of a VM_PFNMAP are always normal. |
718 | * | 718 | * |
719 | * The way we recognize COWed pages within VM_PFNMAP mappings is through the | 719 | * The way we recognize COWed pages within VM_PFNMAP mappings is through the |
720 | * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit | 720 | * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit |
721 | * set, and the vm_pgoff will point to the first PFN mapped: thus every special | 721 | * set, and the vm_pgoff will point to the first PFN mapped: thus every special |
722 | * mapping will always honor the rule | 722 | * mapping will always honor the rule |
723 | * | 723 | * |
724 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) | 724 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) |
725 | * | 725 | * |
726 | * And for normal mappings this is false. | 726 | * And for normal mappings this is false. |
727 | * | 727 | * |
728 | * This restricts such mappings to be a linear translation from virtual address | 728 | * This restricts such mappings to be a linear translation from virtual address |
729 | * to pfn. To get around this restriction, we allow arbitrary mappings so long | 729 | * to pfn. To get around this restriction, we allow arbitrary mappings so long |
730 | * as the vma is not a COW mapping; in that case, we know that all ptes are | 730 | * as the vma is not a COW mapping; in that case, we know that all ptes are |
731 | * special (because none can have been COWed). | 731 | * special (because none can have been COWed). |
732 | * | 732 | * |
733 | * | 733 | * |
734 | * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. | 734 | * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. |
735 | * | 735 | * |
736 | * VM_MIXEDMAP mappings can likewise contain memory with or without "struct | 736 | * VM_MIXEDMAP mappings can likewise contain memory with or without "struct |
737 | * page" backing, however the difference is that _all_ pages with a struct | 737 | * page" backing, however the difference is that _all_ pages with a struct |
738 | * page (that is, those where pfn_valid is true) are refcounted and considered | 738 | * page (that is, those where pfn_valid is true) are refcounted and considered |
739 | * normal pages by the VM. The disadvantage is that pages are refcounted | 739 | * normal pages by the VM. The disadvantage is that pages are refcounted |
740 | * (which can be slower and simply not an option for some PFNMAP users). The | 740 | * (which can be slower and simply not an option for some PFNMAP users). The |
741 | * advantage is that we don't have to follow the strict linearity rule of | 741 | * advantage is that we don't have to follow the strict linearity rule of |
742 | * PFNMAP mappings in order to support COWable mappings. | 742 | * PFNMAP mappings in order to support COWable mappings. |
743 | * | 743 | * |
744 | */ | 744 | */ |
745 | #ifdef __HAVE_ARCH_PTE_SPECIAL | 745 | #ifdef __HAVE_ARCH_PTE_SPECIAL |
746 | # define HAVE_PTE_SPECIAL 1 | 746 | # define HAVE_PTE_SPECIAL 1 |
747 | #else | 747 | #else |
748 | # define HAVE_PTE_SPECIAL 0 | 748 | # define HAVE_PTE_SPECIAL 0 |
749 | #endif | 749 | #endif |
750 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 750 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
751 | pte_t pte) | 751 | pte_t pte) |
752 | { | 752 | { |
753 | unsigned long pfn = pte_pfn(pte); | 753 | unsigned long pfn = pte_pfn(pte); |
754 | 754 | ||
755 | if (HAVE_PTE_SPECIAL) { | 755 | if (HAVE_PTE_SPECIAL) { |
756 | if (likely(!pte_special(pte))) | 756 | if (likely(!pte_special(pte))) |
757 | goto check_pfn; | 757 | goto check_pfn; |
758 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 758 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
759 | return NULL; | 759 | return NULL; |
760 | if (!is_zero_pfn(pfn)) | 760 | if (!is_zero_pfn(pfn)) |
761 | print_bad_pte(vma, addr, pte, NULL); | 761 | print_bad_pte(vma, addr, pte, NULL); |
762 | return NULL; | 762 | return NULL; |
763 | } | 763 | } |
764 | 764 | ||
765 | /* !HAVE_PTE_SPECIAL case follows: */ | 765 | /* !HAVE_PTE_SPECIAL case follows: */ |
766 | 766 | ||
767 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { | 767 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { |
768 | if (vma->vm_flags & VM_MIXEDMAP) { | 768 | if (vma->vm_flags & VM_MIXEDMAP) { |
769 | if (!pfn_valid(pfn)) | 769 | if (!pfn_valid(pfn)) |
770 | return NULL; | 770 | return NULL; |
771 | goto out; | 771 | goto out; |
772 | } else { | 772 | } else { |
773 | unsigned long off; | 773 | unsigned long off; |
774 | off = (addr - vma->vm_start) >> PAGE_SHIFT; | 774 | off = (addr - vma->vm_start) >> PAGE_SHIFT; |
775 | if (pfn == vma->vm_pgoff + off) | 775 | if (pfn == vma->vm_pgoff + off) |
776 | return NULL; | 776 | return NULL; |
777 | if (!is_cow_mapping(vma->vm_flags)) | 777 | if (!is_cow_mapping(vma->vm_flags)) |
778 | return NULL; | 778 | return NULL; |
779 | } | 779 | } |
780 | } | 780 | } |
781 | 781 | ||
782 | if (is_zero_pfn(pfn)) | 782 | if (is_zero_pfn(pfn)) |
783 | return NULL; | 783 | return NULL; |
784 | check_pfn: | 784 | check_pfn: |
785 | if (unlikely(pfn > highest_memmap_pfn)) { | 785 | if (unlikely(pfn > highest_memmap_pfn)) { |
786 | print_bad_pte(vma, addr, pte, NULL); | 786 | print_bad_pte(vma, addr, pte, NULL); |
787 | return NULL; | 787 | return NULL; |
788 | } | 788 | } |
789 | 789 | ||
790 | /* | 790 | /* |
791 | * NOTE! We still have PageReserved() pages in the page tables. | 791 | * NOTE! We still have PageReserved() pages in the page tables. |
792 | * eg. VDSO mappings can cause them to exist. | 792 | * eg. VDSO mappings can cause them to exist. |
793 | */ | 793 | */ |
794 | out: | 794 | out: |
795 | return pfn_to_page(pfn); | 795 | return pfn_to_page(pfn); |
796 | } | 796 | } |
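On architectures without pte_special(), the linearity rule from the block comment above does the work: a raw VM_PFNMAP pte is recognized because its pfn still matches the linear remap_pfn_range() layout, and, in a COW mapping, anything that has drifted off that line is a COWed, normal page. A hedged stand-alone illustration of the arithmetic (PAGE_SHIFT and the struct are simplified stand-ins for the real ones):

#include <stdbool.h>

#define PAGE_SHIFT 12

struct vma_like {
        unsigned long vm_start;  /* start address of the mapping */
        unsigned long vm_pgoff;  /* first pfn handed to remap_pfn_range() */
};

/* True if (addr, pfn) still lies on the original linear pfn mapping. */
static bool pfn_is_linear(const struct vma_like *vma,
                          unsigned long addr, unsigned long pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        return pfn == vma->vm_pgoff + off;
}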
797 | 797 | ||
798 | /* | 798 | /* |
799 | * copy one vm_area from one task to the other. Assumes the page tables | 799 | * copy one vm_area from one task to the other. Assumes the page tables |
800 | * already present in the new task to be cleared in the whole range | 800 | * already present in the new task to be cleared in the whole range |
801 | * covered by this vma. | 801 | * covered by this vma. |
802 | */ | 802 | */ |
803 | 803 | ||
804 | static inline unsigned long | 804 | static inline unsigned long |
805 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 805 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
806 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 806 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
807 | unsigned long addr, int *rss) | 807 | unsigned long addr, int *rss) |
808 | { | 808 | { |
809 | unsigned long vm_flags = vma->vm_flags; | 809 | unsigned long vm_flags = vma->vm_flags; |
810 | pte_t pte = *src_pte; | 810 | pte_t pte = *src_pte; |
811 | struct page *page; | 811 | struct page *page; |
812 | 812 | ||
813 | /* pte contains position in swap or file, so copy. */ | 813 | /* pte contains position in swap or file, so copy. */ |
814 | if (unlikely(!pte_present(pte))) { | 814 | if (unlikely(!pte_present(pte))) { |
815 | if (!pte_file(pte)) { | 815 | if (!pte_file(pte)) { |
816 | swp_entry_t entry = pte_to_swp_entry(pte); | 816 | swp_entry_t entry = pte_to_swp_entry(pte); |
817 | 817 | ||
818 | if (swap_duplicate(entry) < 0) | 818 | if (swap_duplicate(entry) < 0) |
819 | return entry.val; | 819 | return entry.val; |
820 | 820 | ||
821 | /* make sure dst_mm is on swapoff's mmlist. */ | 821 | /* make sure dst_mm is on swapoff's mmlist. */ |
822 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 822 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
823 | spin_lock(&mmlist_lock); | 823 | spin_lock(&mmlist_lock); |
824 | if (list_empty(&dst_mm->mmlist)) | 824 | if (list_empty(&dst_mm->mmlist)) |
825 | list_add(&dst_mm->mmlist, | 825 | list_add(&dst_mm->mmlist, |
826 | &src_mm->mmlist); | 826 | &src_mm->mmlist); |
827 | spin_unlock(&mmlist_lock); | 827 | spin_unlock(&mmlist_lock); |
828 | } | 828 | } |
829 | if (likely(!non_swap_entry(entry))) | 829 | if (likely(!non_swap_entry(entry))) |
830 | rss[MM_SWAPENTS]++; | 830 | rss[MM_SWAPENTS]++; |
831 | else if (is_migration_entry(entry)) { | 831 | else if (is_migration_entry(entry)) { |
832 | page = migration_entry_to_page(entry); | 832 | page = migration_entry_to_page(entry); |
833 | 833 | ||
834 | if (PageAnon(page)) | 834 | if (PageAnon(page)) |
835 | rss[MM_ANONPAGES]++; | 835 | rss[MM_ANONPAGES]++; |
836 | else | 836 | else |
837 | rss[MM_FILEPAGES]++; | 837 | rss[MM_FILEPAGES]++; |
838 | 838 | ||
839 | if (is_write_migration_entry(entry) && | 839 | if (is_write_migration_entry(entry) && |
840 | is_cow_mapping(vm_flags)) { | 840 | is_cow_mapping(vm_flags)) { |
841 | /* | 841 | /* |
842 | * COW mappings require pages in both | 842 | * COW mappings require pages in both |
843 | * parent and child to be made read-only. | 843 | * parent and child to be made read-only. |
844 | */ | 844 | */ |
845 | make_migration_entry_read(&entry); | 845 | make_migration_entry_read(&entry); |
846 | pte = swp_entry_to_pte(entry); | 846 | pte = swp_entry_to_pte(entry); |
847 | if (pte_swp_soft_dirty(*src_pte)) | 847 | if (pte_swp_soft_dirty(*src_pte)) |
848 | pte = pte_swp_mksoft_dirty(pte); | 848 | pte = pte_swp_mksoft_dirty(pte); |
849 | set_pte_at(src_mm, addr, src_pte, pte); | 849 | set_pte_at(src_mm, addr, src_pte, pte); |
850 | } | 850 | } |
851 | } | 851 | } |
852 | } | 852 | } |
853 | goto out_set_pte; | 853 | goto out_set_pte; |
854 | } | 854 | } |
855 | 855 | ||
856 | /* | 856 | /* |
857 | * If it's a COW mapping, write protect it both | 857 | * If it's a COW mapping, write protect it both |
858 | * in the parent and the child | 858 | * in the parent and the child |
859 | */ | 859 | */ |
860 | if (is_cow_mapping(vm_flags)) { | 860 | if (is_cow_mapping(vm_flags)) { |
861 | ptep_set_wrprotect(src_mm, addr, src_pte); | 861 | ptep_set_wrprotect(src_mm, addr, src_pte); |
862 | pte = pte_wrprotect(pte); | 862 | pte = pte_wrprotect(pte); |
863 | } | 863 | } |
864 | 864 | ||
865 | /* | 865 | /* |
866 | * If it's a shared mapping, mark it clean in | 866 | * If it's a shared mapping, mark it clean in |
867 | * the child | 867 | * the child |
868 | */ | 868 | */ |
869 | if (vm_flags & VM_SHARED) | 869 | if (vm_flags & VM_SHARED) |
870 | pte = pte_mkclean(pte); | 870 | pte = pte_mkclean(pte); |
871 | pte = pte_mkold(pte); | 871 | pte = pte_mkold(pte); |
872 | 872 | ||
873 | page = vm_normal_page(vma, addr, pte); | 873 | page = vm_normal_page(vma, addr, pte); |
874 | if (page) { | 874 | if (page) { |
875 | get_page(page); | 875 | get_page(page); |
876 | page_dup_rmap(page); | 876 | page_dup_rmap(page); |
877 | if (PageAnon(page)) | 877 | if (PageAnon(page)) |
878 | rss[MM_ANONPAGES]++; | 878 | rss[MM_ANONPAGES]++; |
879 | else | 879 | else |
880 | rss[MM_FILEPAGES]++; | 880 | rss[MM_FILEPAGES]++; |
881 | } | 881 | } |
882 | 882 | ||
883 | out_set_pte: | 883 | out_set_pte: |
884 | set_pte_at(dst_mm, addr, dst_pte, pte); | 884 | set_pte_at(dst_mm, addr, dst_pte, pte); |
885 | return 0; | 885 | return 0; |
886 | } | 886 | } |
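The write-protect step in copy_one_pte() is what gives fork() its copy-on-write behaviour: once the pte is read-only in both mms, the first write in either process faults and receives a private copy while the other keeps the original. A small user-space program that shows the visible effect (error handling kept minimal):

#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 'A';                  /* shared (read-only) after fork */

        if (fork() == 0) {
                p[0] = 'B';          /* write fault: child gets its own page */
                _exit(0);
        }
        wait(NULL);
        printf("parent still sees %c\n", p[0]);   /* prints 'A' */
        return 0;
}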
887 | 887 | ||
888 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 888 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
889 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 889 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
890 | unsigned long addr, unsigned long end) | 890 | unsigned long addr, unsigned long end) |
891 | { | 891 | { |
892 | pte_t *orig_src_pte, *orig_dst_pte; | 892 | pte_t *orig_src_pte, *orig_dst_pte; |
893 | pte_t *src_pte, *dst_pte; | 893 | pte_t *src_pte, *dst_pte; |
894 | spinlock_t *src_ptl, *dst_ptl; | 894 | spinlock_t *src_ptl, *dst_ptl; |
895 | int progress = 0; | 895 | int progress = 0; |
896 | int rss[NR_MM_COUNTERS]; | 896 | int rss[NR_MM_COUNTERS]; |
897 | swp_entry_t entry = (swp_entry_t){0}; | 897 | swp_entry_t entry = (swp_entry_t){0}; |
898 | 898 | ||
899 | again: | 899 | again: |
900 | init_rss_vec(rss); | 900 | init_rss_vec(rss); |
901 | 901 | ||
902 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 902 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
903 | if (!dst_pte) | 903 | if (!dst_pte) |
904 | return -ENOMEM; | 904 | return -ENOMEM; |
905 | src_pte = pte_offset_map(src_pmd, addr); | 905 | src_pte = pte_offset_map(src_pmd, addr); |
906 | src_ptl = pte_lockptr(src_mm, src_pmd); | 906 | src_ptl = pte_lockptr(src_mm, src_pmd); |
907 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 907 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
908 | orig_src_pte = src_pte; | 908 | orig_src_pte = src_pte; |
909 | orig_dst_pte = dst_pte; | 909 | orig_dst_pte = dst_pte; |
910 | arch_enter_lazy_mmu_mode(); | 910 | arch_enter_lazy_mmu_mode(); |
911 | 911 | ||
912 | do { | 912 | do { |
913 | /* | 913 | /* |
914 | * We are holding two locks at this point - either of them | 914 | * We are holding two locks at this point - either of them |
915 | * could generate latencies in another task on another CPU. | 915 | * could generate latencies in another task on another CPU. |
916 | */ | 916 | */ |
917 | if (progress >= 32) { | 917 | if (progress >= 32) { |
918 | progress = 0; | 918 | progress = 0; |
919 | if (need_resched() || | 919 | if (need_resched() || |
920 | spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) | 920 | spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) |
921 | break; | 921 | break; |
922 | } | 922 | } |
923 | if (pte_none(*src_pte)) { | 923 | if (pte_none(*src_pte)) { |
924 | progress++; | 924 | progress++; |
925 | continue; | 925 | continue; |
926 | } | 926 | } |
927 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, | 927 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, |
928 | vma, addr, rss); | 928 | vma, addr, rss); |
929 | if (entry.val) | 929 | if (entry.val) |
930 | break; | 930 | break; |
931 | progress += 8; | 931 | progress += 8; |
932 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 932 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
933 | 933 | ||
934 | arch_leave_lazy_mmu_mode(); | 934 | arch_leave_lazy_mmu_mode(); |
935 | spin_unlock(src_ptl); | 935 | spin_unlock(src_ptl); |
936 | pte_unmap(orig_src_pte); | 936 | pte_unmap(orig_src_pte); |
937 | add_mm_rss_vec(dst_mm, rss); | 937 | add_mm_rss_vec(dst_mm, rss); |
938 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 938 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
939 | cond_resched(); | 939 | cond_resched(); |
940 | 940 | ||
941 | if (entry.val) { | 941 | if (entry.val) { |
942 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | 942 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) |
943 | return -ENOMEM; | 943 | return -ENOMEM; |
944 | progress = 0; | 944 | progress = 0; |
945 | } | 945 | } |
946 | if (addr != end) | 946 | if (addr != end) |
947 | goto again; | 947 | goto again; |
948 | return 0; | 948 | return 0; |
949 | } | 949 | } |
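copy_pte_range() deliberately bounds how long it holds the two pte locks: it keeps a rough progress count and, once enough work has accumulated, breaks out of the loop if a reschedule or lock break is needed, then re-takes the locks and continues where it stopped (the "goto again"). A user-space sketch of the same chunking pattern, assuming pthreads and placeholder work:

#include <pthread.h>
#include <sched.h>

#define CHUNK 32

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void process_range(int *items, int nr)
{
        int done = 0;

        while (done < nr) {
                pthread_mutex_lock(&lock);
                /* handle at most CHUNK items per lock hold */
                for (int n = 0; n < CHUNK && done < nr; n++, done++)
                        items[done] *= 2;       /* placeholder work */
                pthread_mutex_unlock(&lock);

                sched_yield();                  /* give others a chance to run */
        }
}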
950 | 950 | ||
951 | static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 951 | static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
952 | pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, | 952 | pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, |
953 | unsigned long addr, unsigned long end) | 953 | unsigned long addr, unsigned long end) |
954 | { | 954 | { |
955 | pmd_t *src_pmd, *dst_pmd; | 955 | pmd_t *src_pmd, *dst_pmd; |
956 | unsigned long next; | 956 | unsigned long next; |
957 | 957 | ||
958 | dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); | 958 | dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); |
959 | if (!dst_pmd) | 959 | if (!dst_pmd) |
960 | return -ENOMEM; | 960 | return -ENOMEM; |
961 | src_pmd = pmd_offset(src_pud, addr); | 961 | src_pmd = pmd_offset(src_pud, addr); |
962 | do { | 962 | do { |
963 | next = pmd_addr_end(addr, end); | 963 | next = pmd_addr_end(addr, end); |
964 | if (pmd_trans_huge(*src_pmd)) { | 964 | if (pmd_trans_huge(*src_pmd)) { |
965 | int err; | 965 | int err; |
966 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | 966 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); |
967 | err = copy_huge_pmd(dst_mm, src_mm, | 967 | err = copy_huge_pmd(dst_mm, src_mm, |
968 | dst_pmd, src_pmd, addr, vma); | 968 | dst_pmd, src_pmd, addr, vma); |
969 | if (err == -ENOMEM) | 969 | if (err == -ENOMEM) |
970 | return -ENOMEM; | 970 | return -ENOMEM; |
971 | if (!err) | 971 | if (!err) |
972 | continue; | 972 | continue; |
973 | /* fall through */ | 973 | /* fall through */ |
974 | } | 974 | } |
975 | if (pmd_none_or_clear_bad(src_pmd)) | 975 | if (pmd_none_or_clear_bad(src_pmd)) |
976 | continue; | 976 | continue; |
977 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 977 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
978 | vma, addr, next)) | 978 | vma, addr, next)) |
979 | return -ENOMEM; | 979 | return -ENOMEM; |
980 | } while (dst_pmd++, src_pmd++, addr = next, addr != end); | 980 | } while (dst_pmd++, src_pmd++, addr = next, addr != end); |
981 | return 0; | 981 | return 0; |
982 | } | 982 | } |
983 | 983 | ||
984 | static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 984 | static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
985 | pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, | 985 | pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, |
986 | unsigned long addr, unsigned long end) | 986 | unsigned long addr, unsigned long end) |
987 | { | 987 | { |
988 | pud_t *src_pud, *dst_pud; | 988 | pud_t *src_pud, *dst_pud; |
989 | unsigned long next; | 989 | unsigned long next; |
990 | 990 | ||
991 | dst_pud = pud_alloc(dst_mm, dst_pgd, addr); | 991 | dst_pud = pud_alloc(dst_mm, dst_pgd, addr); |
992 | if (!dst_pud) | 992 | if (!dst_pud) |
993 | return -ENOMEM; | 993 | return -ENOMEM; |
994 | src_pud = pud_offset(src_pgd, addr); | 994 | src_pud = pud_offset(src_pgd, addr); |
995 | do { | 995 | do { |
996 | next = pud_addr_end(addr, end); | 996 | next = pud_addr_end(addr, end); |
997 | if (pud_none_or_clear_bad(src_pud)) | 997 | if (pud_none_or_clear_bad(src_pud)) |
998 | continue; | 998 | continue; |
999 | if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, | 999 | if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, |
1000 | vma, addr, next)) | 1000 | vma, addr, next)) |
1001 | return -ENOMEM; | 1001 | return -ENOMEM; |
1002 | } while (dst_pud++, src_pud++, addr = next, addr != end); | 1002 | } while (dst_pud++, src_pud++, addr = next, addr != end); |
1003 | return 0; | 1003 | return 0; |
1004 | } | 1004 | } |
1005 | 1005 | ||
1006 | int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 1006 | int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
1007 | struct vm_area_struct *vma) | 1007 | struct vm_area_struct *vma) |
1008 | { | 1008 | { |
1009 | pgd_t *src_pgd, *dst_pgd; | 1009 | pgd_t *src_pgd, *dst_pgd; |
1010 | unsigned long next; | 1010 | unsigned long next; |
1011 | unsigned long addr = vma->vm_start; | 1011 | unsigned long addr = vma->vm_start; |
1012 | unsigned long end = vma->vm_end; | 1012 | unsigned long end = vma->vm_end; |
1013 | unsigned long mmun_start; /* For mmu_notifiers */ | 1013 | unsigned long mmun_start; /* For mmu_notifiers */ |
1014 | unsigned long mmun_end; /* For mmu_notifiers */ | 1014 | unsigned long mmun_end; /* For mmu_notifiers */ |
1015 | bool is_cow; | 1015 | bool is_cow; |
1016 | int ret; | 1016 | int ret; |
1017 | 1017 | ||
1018 | /* | 1018 | /* |
1019 | * Don't copy ptes where a page fault will fill them correctly. | 1019 | * Don't copy ptes where a page fault will fill them correctly. |
1020 | * Fork becomes much lighter when there are big shared or private | 1020 | * Fork becomes much lighter when there are big shared or private |
1021 | * readonly mappings. The tradeoff is that copy_page_range is more | 1021 | * readonly mappings. The tradeoff is that copy_page_range is more |
1022 | * efficient than faulting. | 1022 | * efficient than faulting. |
1023 | */ | 1023 | */ |
1024 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | | 1024 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | |
1025 | VM_PFNMAP | VM_MIXEDMAP))) { | 1025 | VM_PFNMAP | VM_MIXEDMAP))) { |
1026 | if (!vma->anon_vma) | 1026 | if (!vma->anon_vma) |
1027 | return 0; | 1027 | return 0; |
1028 | } | 1028 | } |
1029 | 1029 | ||
1030 | if (is_vm_hugetlb_page(vma)) | 1030 | if (is_vm_hugetlb_page(vma)) |
1031 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1031 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
1032 | 1032 | ||
1033 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { | 1033 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
1034 | /* | 1034 | /* |
1035 | * We do not free on error cases below as remove_vma | 1035 | * We do not free on error cases below as remove_vma |
1036 | * gets called on error from higher level routine | 1036 | * gets called on error from higher level routine |
1037 | */ | 1037 | */ |
1038 | ret = track_pfn_copy(vma); | 1038 | ret = track_pfn_copy(vma); |
1039 | if (ret) | 1039 | if (ret) |
1040 | return ret; | 1040 | return ret; |
1041 | } | 1041 | } |
1042 | 1042 | ||
1043 | /* | 1043 | /* |
1044 | * We need to invalidate the secondary MMU mappings only when | 1044 | * We need to invalidate the secondary MMU mappings only when |
1045 | * there could be a permission downgrade on the ptes of the | 1045 | * there could be a permission downgrade on the ptes of the |
1046 | * parent mm. And a permission downgrade will only happen if | 1046 | * parent mm. And a permission downgrade will only happen if |
1047 | * is_cow_mapping() returns true. | 1047 | * is_cow_mapping() returns true. |
1048 | */ | 1048 | */ |
1049 | is_cow = is_cow_mapping(vma->vm_flags); | 1049 | is_cow = is_cow_mapping(vma->vm_flags); |
1050 | mmun_start = addr; | 1050 | mmun_start = addr; |
1051 | mmun_end = end; | 1051 | mmun_end = end; |
1052 | if (is_cow) | 1052 | if (is_cow) |
1053 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | 1053 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, |
1054 | mmun_end); | 1054 | mmun_end); |
1055 | 1055 | ||
1056 | ret = 0; | 1056 | ret = 0; |
1057 | dst_pgd = pgd_offset(dst_mm, addr); | 1057 | dst_pgd = pgd_offset(dst_mm, addr); |
1058 | src_pgd = pgd_offset(src_mm, addr); | 1058 | src_pgd = pgd_offset(src_mm, addr); |
1059 | do { | 1059 | do { |
1060 | next = pgd_addr_end(addr, end); | 1060 | next = pgd_addr_end(addr, end); |
1061 | if (pgd_none_or_clear_bad(src_pgd)) | 1061 | if (pgd_none_or_clear_bad(src_pgd)) |
1062 | continue; | 1062 | continue; |
1063 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 1063 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
1064 | vma, addr, next))) { | 1064 | vma, addr, next))) { |
1065 | ret = -ENOMEM; | 1065 | ret = -ENOMEM; |
1066 | break; | 1066 | break; |
1067 | } | 1067 | } |
1068 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1068 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1069 | 1069 | ||
1070 | if (is_cow) | 1070 | if (is_cow) |
1071 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); | 1071 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); |
1072 | return ret; | 1072 | return ret; |
1073 | } | 1073 | } |
1074 | 1074 | ||
1075 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 1075 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
1076 | struct vm_area_struct *vma, pmd_t *pmd, | 1076 | struct vm_area_struct *vma, pmd_t *pmd, |
1077 | unsigned long addr, unsigned long end, | 1077 | unsigned long addr, unsigned long end, |
1078 | struct zap_details *details) | 1078 | struct zap_details *details) |
1079 | { | 1079 | { |
1080 | struct mm_struct *mm = tlb->mm; | 1080 | struct mm_struct *mm = tlb->mm; |
1081 | int force_flush = 0; | 1081 | int force_flush = 0; |
1082 | int rss[NR_MM_COUNTERS]; | 1082 | int rss[NR_MM_COUNTERS]; |
1083 | spinlock_t *ptl; | 1083 | spinlock_t *ptl; |
1084 | pte_t *start_pte; | 1084 | pte_t *start_pte; |
1085 | pte_t *pte; | 1085 | pte_t *pte; |
1086 | 1086 | ||
1087 | again: | 1087 | again: |
1088 | init_rss_vec(rss); | 1088 | init_rss_vec(rss); |
1089 | start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1089 | start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
1090 | pte = start_pte; | 1090 | pte = start_pte; |
1091 | arch_enter_lazy_mmu_mode(); | 1091 | arch_enter_lazy_mmu_mode(); |
1092 | do { | 1092 | do { |
1093 | pte_t ptent = *pte; | 1093 | pte_t ptent = *pte; |
1094 | if (pte_none(ptent)) { | 1094 | if (pte_none(ptent)) { |
1095 | continue; | 1095 | continue; |
1096 | } | 1096 | } |
1097 | 1097 | ||
1098 | if (pte_present(ptent)) { | 1098 | if (pte_present(ptent)) { |
1099 | struct page *page; | 1099 | struct page *page; |
1100 | 1100 | ||
1101 | page = vm_normal_page(vma, addr, ptent); | 1101 | page = vm_normal_page(vma, addr, ptent); |
1102 | if (unlikely(details) && page) { | 1102 | if (unlikely(details) && page) { |
1103 | /* | 1103 | /* |
1104 | * unmap_shared_mapping_pages() wants to | 1104 | * unmap_shared_mapping_pages() wants to |
1105 | * invalidate cache without truncating: | 1105 | * invalidate cache without truncating: |
1106 | * unmap shared but keep private pages. | 1106 | * unmap shared but keep private pages. |
1107 | */ | 1107 | */ |
1108 | if (details->check_mapping && | 1108 | if (details->check_mapping && |
1109 | details->check_mapping != page->mapping) | 1109 | details->check_mapping != page->mapping) |
1110 | continue; | 1110 | continue; |
1111 | /* | 1111 | /* |
1112 | * Each page->index must be checked when | 1112 | * Each page->index must be checked when |
1113 | * invalidating or truncating nonlinear. | 1113 | * invalidating or truncating nonlinear. |
1114 | */ | 1114 | */ |
1115 | if (details->nonlinear_vma && | 1115 | if (details->nonlinear_vma && |
1116 | (page->index < details->first_index || | 1116 | (page->index < details->first_index || |
1117 | page->index > details->last_index)) | 1117 | page->index > details->last_index)) |
1118 | continue; | 1118 | continue; |
1119 | } | 1119 | } |
1120 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1120 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
1121 | tlb->fullmm); | 1121 | tlb->fullmm); |
1122 | tlb_remove_tlb_entry(tlb, pte, addr); | 1122 | tlb_remove_tlb_entry(tlb, pte, addr); |
1123 | if (unlikely(!page)) | 1123 | if (unlikely(!page)) |
1124 | continue; | 1124 | continue; |
1125 | if (unlikely(details) && details->nonlinear_vma | 1125 | if (unlikely(details) && details->nonlinear_vma |
1126 | && linear_page_index(details->nonlinear_vma, | 1126 | && linear_page_index(details->nonlinear_vma, |
1127 | addr) != page->index) { | 1127 | addr) != page->index) { |
1128 | pte_t ptfile = pgoff_to_pte(page->index); | 1128 | pte_t ptfile = pgoff_to_pte(page->index); |
1129 | if (pte_soft_dirty(ptent)) | 1129 | if (pte_soft_dirty(ptent)) |
1130 | pte_file_mksoft_dirty(ptfile); | 1130 | ptfile = pte_file_mksoft_dirty(ptfile); |
1131 | set_pte_at(mm, addr, pte, ptfile); | 1131 | set_pte_at(mm, addr, pte, ptfile); |
1132 | } | 1132 | } |
1133 | if (PageAnon(page)) | 1133 | if (PageAnon(page)) |
1134 | rss[MM_ANONPAGES]--; | 1134 | rss[MM_ANONPAGES]--; |
1135 | else { | 1135 | else { |
1136 | if (pte_dirty(ptent)) { | 1136 | if (pte_dirty(ptent)) { |
1137 | force_flush = 1; | 1137 | force_flush = 1; |
1138 | set_page_dirty(page); | 1138 | set_page_dirty(page); |
1139 | } | 1139 | } |
1140 | if (pte_young(ptent) && | 1140 | if (pte_young(ptent) && |
1141 | likely(!(vma->vm_flags & VM_SEQ_READ))) | 1141 | likely(!(vma->vm_flags & VM_SEQ_READ))) |
1142 | mark_page_accessed(page); | 1142 | mark_page_accessed(page); |
1143 | rss[MM_FILEPAGES]--; | 1143 | rss[MM_FILEPAGES]--; |
1144 | } | 1144 | } |
1145 | page_remove_rmap(page); | 1145 | page_remove_rmap(page); |
1146 | if (unlikely(page_mapcount(page) < 0)) | 1146 | if (unlikely(page_mapcount(page) < 0)) |
1147 | print_bad_pte(vma, addr, ptent, page); | 1147 | print_bad_pte(vma, addr, ptent, page); |
1148 | if (unlikely(!__tlb_remove_page(tlb, page))) { | 1148 | if (unlikely(!__tlb_remove_page(tlb, page))) { |
1149 | force_flush = 1; | 1149 | force_flush = 1; |
1150 | break; | 1150 | break; |
1151 | } | 1151 | } |
1152 | continue; | 1152 | continue; |
1153 | } | 1153 | } |
1154 | /* | 1154 | /* |
1155 | * If details->check_mapping, we leave swap entries; | 1155 | * If details->check_mapping, we leave swap entries; |
1156 | * if details->nonlinear_vma, we leave file entries. | 1156 | * if details->nonlinear_vma, we leave file entries. |
1157 | */ | 1157 | */ |
1158 | if (unlikely(details)) | 1158 | if (unlikely(details)) |
1159 | continue; | 1159 | continue; |
1160 | if (pte_file(ptent)) { | 1160 | if (pte_file(ptent)) { |
1161 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 1161 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
1162 | print_bad_pte(vma, addr, ptent, NULL); | 1162 | print_bad_pte(vma, addr, ptent, NULL); |
1163 | } else { | 1163 | } else { |
1164 | swp_entry_t entry = pte_to_swp_entry(ptent); | 1164 | swp_entry_t entry = pte_to_swp_entry(ptent); |
1165 | 1165 | ||
1166 | if (!non_swap_entry(entry)) | 1166 | if (!non_swap_entry(entry)) |
1167 | rss[MM_SWAPENTS]--; | 1167 | rss[MM_SWAPENTS]--; |
1168 | else if (is_migration_entry(entry)) { | 1168 | else if (is_migration_entry(entry)) { |
1169 | struct page *page; | 1169 | struct page *page; |
1170 | 1170 | ||
1171 | page = migration_entry_to_page(entry); | 1171 | page = migration_entry_to_page(entry); |
1172 | 1172 | ||
1173 | if (PageAnon(page)) | 1173 | if (PageAnon(page)) |
1174 | rss[MM_ANONPAGES]--; | 1174 | rss[MM_ANONPAGES]--; |
1175 | else | 1175 | else |
1176 | rss[MM_FILEPAGES]--; | 1176 | rss[MM_FILEPAGES]--; |
1177 | } | 1177 | } |
1178 | if (unlikely(!free_swap_and_cache(entry))) | 1178 | if (unlikely(!free_swap_and_cache(entry))) |
1179 | print_bad_pte(vma, addr, ptent, NULL); | 1179 | print_bad_pte(vma, addr, ptent, NULL); |
1180 | } | 1180 | } |
1181 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1181 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
1182 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1182 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1183 | 1183 | ||
1184 | add_mm_rss_vec(mm, rss); | 1184 | add_mm_rss_vec(mm, rss); |
1185 | arch_leave_lazy_mmu_mode(); | 1185 | arch_leave_lazy_mmu_mode(); |
1186 | 1186 | ||
1187 | /* Do the actual TLB flush before dropping ptl */ | 1187 | /* Do the actual TLB flush before dropping ptl */ |
1188 | if (force_flush) { | 1188 | if (force_flush) { |
1189 | unsigned long old_end; | 1189 | unsigned long old_end; |
1190 | 1190 | ||
1191 | /* | 1191 | /* |
1192 | * Flush the TLB just for the previous segment, | 1192 | * Flush the TLB just for the previous segment, |
1193 | * then update the range to be the remaining | 1193 | * then update the range to be the remaining |
1194 | * TLB range. | 1194 | * TLB range. |
1195 | */ | 1195 | */ |
1196 | old_end = tlb->end; | 1196 | old_end = tlb->end; |
1197 | tlb->end = addr; | 1197 | tlb->end = addr; |
1198 | tlb_flush_mmu_tlbonly(tlb); | 1198 | tlb_flush_mmu_tlbonly(tlb); |
1199 | tlb->start = addr; | 1199 | tlb->start = addr; |
1200 | tlb->end = old_end; | 1200 | tlb->end = old_end; |
1201 | } | 1201 | } |
1202 | pte_unmap_unlock(start_pte, ptl); | 1202 | pte_unmap_unlock(start_pte, ptl); |
1203 | 1203 | ||
1204 | /* | 1204 | /* |
1205 | * If we forced a TLB flush (either due to running out of | 1205 | * If we forced a TLB flush (either due to running out of |
1206 | * batch buffers or because we needed to flush dirty TLB | 1206 | * batch buffers or because we needed to flush dirty TLB |
1207 | * entries before releasing the ptl), free the batched | 1207 | * entries before releasing the ptl), free the batched |
1208 | * memory too. Restart if we didn't do everything. | 1208 | * memory too. Restart if we didn't do everything. |
1209 | */ | 1209 | */ |
1210 | if (force_flush) { | 1210 | if (force_flush) { |
1211 | force_flush = 0; | 1211 | force_flush = 0; |
1212 | tlb_flush_mmu_free(tlb); | 1212 | tlb_flush_mmu_free(tlb); |
1213 | 1213 | ||
1214 | if (addr != end) | 1214 | if (addr != end) |
1215 | goto again; | 1215 | goto again; |
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | return addr; | 1218 | return addr; |
1219 | } | 1219 | } |
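When __tlb_remove_page() reports a full batch, or a dirty shared page forces an early flush, zap_pte_range() flushes only the range it has covered so far, frees what it batched, and restarts from the address where it stopped. A simplified sketch of that gather/flush/resume loop (the batch size and flush_batch() are placeholders, not mmu_gather APIs):

#include <stddef.h>

#define BATCH 8

static void *batch[BATCH];
static size_t batched;

static void flush_batch(void)
{
        /* stand-in for the TLB flush plus freeing of gathered pages */
        batched = 0;
}

static void zap_range(void **pages, size_t nr)
{
        size_t i = 0;

        while (i < nr) {
                /* gather until the batch fills up or the range is done */
                while (i < nr && batched < BATCH)
                        batch[batched++] = pages[i++];

                flush_batch();          /* flush what we have, then resume */
        }
}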
1220 | 1220 | ||
1221 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | 1221 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, |
1222 | struct vm_area_struct *vma, pud_t *pud, | 1222 | struct vm_area_struct *vma, pud_t *pud, |
1223 | unsigned long addr, unsigned long end, | 1223 | unsigned long addr, unsigned long end, |
1224 | struct zap_details *details) | 1224 | struct zap_details *details) |
1225 | { | 1225 | { |
1226 | pmd_t *pmd; | 1226 | pmd_t *pmd; |
1227 | unsigned long next; | 1227 | unsigned long next; |
1228 | 1228 | ||
1229 | pmd = pmd_offset(pud, addr); | 1229 | pmd = pmd_offset(pud, addr); |
1230 | do { | 1230 | do { |
1231 | next = pmd_addr_end(addr, end); | 1231 | next = pmd_addr_end(addr, end); |
1232 | if (pmd_trans_huge(*pmd)) { | 1232 | if (pmd_trans_huge(*pmd)) { |
1233 | if (next - addr != HPAGE_PMD_SIZE) { | 1233 | if (next - addr != HPAGE_PMD_SIZE) { |
1234 | #ifdef CONFIG_DEBUG_VM | 1234 | #ifdef CONFIG_DEBUG_VM |
1235 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | 1235 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { |
1236 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | 1236 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", |
1237 | __func__, addr, end, | 1237 | __func__, addr, end, |
1238 | vma->vm_start, | 1238 | vma->vm_start, |
1239 | vma->vm_end); | 1239 | vma->vm_end); |
1240 | BUG(); | 1240 | BUG(); |
1241 | } | 1241 | } |
1242 | #endif | 1242 | #endif |
1243 | split_huge_page_pmd(vma, addr, pmd); | 1243 | split_huge_page_pmd(vma, addr, pmd); |
1244 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1244 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1245 | goto next; | 1245 | goto next; |
1246 | /* fall through */ | 1246 | /* fall through */ |
1247 | } | 1247 | } |
1248 | /* | 1248 | /* |
1249 | * Here there can be other concurrent MADV_DONTNEED or | 1249 | * Here there can be other concurrent MADV_DONTNEED or |
1250 | * trans huge page faults running, and if the pmd is | 1250 | * trans huge page faults running, and if the pmd is |
1251 | * none or trans huge it can change under us. This is | 1251 | * none or trans huge it can change under us. This is |
1252 | * because MADV_DONTNEED holds the mmap_sem in read | 1252 | * because MADV_DONTNEED holds the mmap_sem in read |
1253 | * mode. | 1253 | * mode. |
1254 | */ | 1254 | */ |
1255 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 1255 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
1256 | goto next; | 1256 | goto next; |
1257 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); | 1257 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1258 | next: | 1258 | next: |
1259 | cond_resched(); | 1259 | cond_resched(); |
1260 | } while (pmd++, addr = next, addr != end); | 1260 | } while (pmd++, addr = next, addr != end); |
1261 | 1261 | ||
1262 | return addr; | 1262 | return addr; |
1263 | } | 1263 | } |
1264 | 1264 | ||
1265 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | 1265 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, |
1266 | struct vm_area_struct *vma, pgd_t *pgd, | 1266 | struct vm_area_struct *vma, pgd_t *pgd, |
1267 | unsigned long addr, unsigned long end, | 1267 | unsigned long addr, unsigned long end, |
1268 | struct zap_details *details) | 1268 | struct zap_details *details) |
1269 | { | 1269 | { |
1270 | pud_t *pud; | 1270 | pud_t *pud; |
1271 | unsigned long next; | 1271 | unsigned long next; |
1272 | 1272 | ||
1273 | pud = pud_offset(pgd, addr); | 1273 | pud = pud_offset(pgd, addr); |
1274 | do { | 1274 | do { |
1275 | next = pud_addr_end(addr, end); | 1275 | next = pud_addr_end(addr, end); |
1276 | if (pud_none_or_clear_bad(pud)) | 1276 | if (pud_none_or_clear_bad(pud)) |
1277 | continue; | 1277 | continue; |
1278 | next = zap_pmd_range(tlb, vma, pud, addr, next, details); | 1278 | next = zap_pmd_range(tlb, vma, pud, addr, next, details); |
1279 | } while (pud++, addr = next, addr != end); | 1279 | } while (pud++, addr = next, addr != end); |
1280 | 1280 | ||
1281 | return addr; | 1281 | return addr; |
1282 | } | 1282 | } |
1283 | 1283 | ||
1284 | static void unmap_page_range(struct mmu_gather *tlb, | 1284 | static void unmap_page_range(struct mmu_gather *tlb, |
1285 | struct vm_area_struct *vma, | 1285 | struct vm_area_struct *vma, |
1286 | unsigned long addr, unsigned long end, | 1286 | unsigned long addr, unsigned long end, |
1287 | struct zap_details *details) | 1287 | struct zap_details *details) |
1288 | { | 1288 | { |
1289 | pgd_t *pgd; | 1289 | pgd_t *pgd; |
1290 | unsigned long next; | 1290 | unsigned long next; |
1291 | 1291 | ||
1292 | if (details && !details->check_mapping && !details->nonlinear_vma) | 1292 | if (details && !details->check_mapping && !details->nonlinear_vma) |
1293 | details = NULL; | 1293 | details = NULL; |
1294 | 1294 | ||
1295 | BUG_ON(addr >= end); | 1295 | BUG_ON(addr >= end); |
1296 | tlb_start_vma(tlb, vma); | 1296 | tlb_start_vma(tlb, vma); |
1297 | pgd = pgd_offset(vma->vm_mm, addr); | 1297 | pgd = pgd_offset(vma->vm_mm, addr); |
1298 | do { | 1298 | do { |
1299 | next = pgd_addr_end(addr, end); | 1299 | next = pgd_addr_end(addr, end); |
1300 | if (pgd_none_or_clear_bad(pgd)) | 1300 | if (pgd_none_or_clear_bad(pgd)) |
1301 | continue; | 1301 | continue; |
1302 | next = zap_pud_range(tlb, vma, pgd, addr, next, details); | 1302 | next = zap_pud_range(tlb, vma, pgd, addr, next, details); |
1303 | } while (pgd++, addr = next, addr != end); | 1303 | } while (pgd++, addr = next, addr != end); |
1304 | tlb_end_vma(tlb, vma); | 1304 | tlb_end_vma(tlb, vma); |
1305 | } | 1305 | } |
1306 | 1306 | ||
1307 | 1307 | ||
1308 | static void unmap_single_vma(struct mmu_gather *tlb, | 1308 | static void unmap_single_vma(struct mmu_gather *tlb, |
1309 | struct vm_area_struct *vma, unsigned long start_addr, | 1309 | struct vm_area_struct *vma, unsigned long start_addr, |
1310 | unsigned long end_addr, | 1310 | unsigned long end_addr, |
1311 | struct zap_details *details) | 1311 | struct zap_details *details) |
1312 | { | 1312 | { |
1313 | unsigned long start = max(vma->vm_start, start_addr); | 1313 | unsigned long start = max(vma->vm_start, start_addr); |
1314 | unsigned long end; | 1314 | unsigned long end; |
1315 | 1315 | ||
1316 | if (start >= vma->vm_end) | 1316 | if (start >= vma->vm_end) |
1317 | return; | 1317 | return; |
1318 | end = min(vma->vm_end, end_addr); | 1318 | end = min(vma->vm_end, end_addr); |
1319 | if (end <= vma->vm_start) | 1319 | if (end <= vma->vm_start) |
1320 | return; | 1320 | return; |
1321 | 1321 | ||
1322 | if (vma->vm_file) | 1322 | if (vma->vm_file) |
1323 | uprobe_munmap(vma, start, end); | 1323 | uprobe_munmap(vma, start, end); |
1324 | 1324 | ||
1325 | if (unlikely(vma->vm_flags & VM_PFNMAP)) | 1325 | if (unlikely(vma->vm_flags & VM_PFNMAP)) |
1326 | untrack_pfn(vma, 0, 0); | 1326 | untrack_pfn(vma, 0, 0); |
1327 | 1327 | ||
1328 | if (start != end) { | 1328 | if (start != end) { |
1329 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1329 | if (unlikely(is_vm_hugetlb_page(vma))) { |
1330 | /* | 1330 | /* |
1331 | * It is undesirable to test vma->vm_file as it | 1331 | * It is undesirable to test vma->vm_file as it |
1332 | * should be non-null for a valid hugetlb area. | 1332 | * should be non-null for a valid hugetlb area. |
1333 | * However, vm_file will be NULL in the error | 1333 | * However, vm_file will be NULL in the error |
1334 | * cleanup path of mmap_region. When | 1334 | * cleanup path of mmap_region. When |
1335 | * hugetlbfs ->mmap method fails, | 1335 | * hugetlbfs ->mmap method fails, |
1336 | * mmap_region() nullifies vma->vm_file | 1336 | * mmap_region() nullifies vma->vm_file |
1337 | * before calling this function to clean up. | 1337 | * before calling this function to clean up. |
1338 | * Since no pte has actually been set up, it is | 1338 | * Since no pte has actually been set up, it is |
1339 | * safe to do nothing in this case. | 1339 | * safe to do nothing in this case. |
1340 | */ | 1340 | */ |
1341 | if (vma->vm_file) { | 1341 | if (vma->vm_file) { |
1342 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1342 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1343 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | 1343 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); |
1344 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 1344 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1345 | } | 1345 | } |
1346 | } else | 1346 | } else |
1347 | unmap_page_range(tlb, vma, start, end, details); | 1347 | unmap_page_range(tlb, vma, start, end, details); |
1348 | } | 1348 | } |
1349 | } | 1349 | } |
1350 | 1350 | ||
1351 | /** | 1351 | /** |
1352 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1352 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1353 | * @tlb: address of the caller's struct mmu_gather | 1353 | * @tlb: address of the caller's struct mmu_gather |
1354 | * @vma: the starting vma | 1354 | * @vma: the starting vma |
1355 | * @start_addr: virtual address at which to start unmapping | 1355 | * @start_addr: virtual address at which to start unmapping |
1356 | * @end_addr: virtual address at which to end unmapping | 1356 | * @end_addr: virtual address at which to end unmapping |
1357 | * | 1357 | * |
1358 | * Unmap all pages in the vma list. | 1358 | * Unmap all pages in the vma list. |
1359 | * | 1359 | * |
1360 | * Only addresses between `start' and `end' will be unmapped. | 1360 | * Only addresses between `start' and `end' will be unmapped. |
1361 | * | 1361 | * |
1362 | * The VMA list must be sorted in ascending virtual address order. | 1362 | * The VMA list must be sorted in ascending virtual address order. |
1363 | * | 1363 | * |
1364 | * unmap_vmas() assumes that the caller will flush the whole unmapped address | 1364 | * unmap_vmas() assumes that the caller will flush the whole unmapped address |
1365 | * range after unmap_vmas() returns. So the only responsibility here is to | 1365 | * range after unmap_vmas() returns. So the only responsibility here is to |
1366 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 1366 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
1367 | * drops the lock and schedules. | 1367 | * drops the lock and schedules. |
1368 | */ | 1368 | */ |
1369 | void unmap_vmas(struct mmu_gather *tlb, | 1369 | void unmap_vmas(struct mmu_gather *tlb, |
1370 | struct vm_area_struct *vma, unsigned long start_addr, | 1370 | struct vm_area_struct *vma, unsigned long start_addr, |
1371 | unsigned long end_addr) | 1371 | unsigned long end_addr) |
1372 | { | 1372 | { |
1373 | struct mm_struct *mm = vma->vm_mm; | 1373 | struct mm_struct *mm = vma->vm_mm; |
1374 | 1374 | ||
1375 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1375 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
1376 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) | 1376 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) |
1377 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); | 1377 | unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); |
1378 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1378 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1379 | } | 1379 | } |
1380 | 1380 | ||
1381 | /** | 1381 | /** |
1382 | * zap_page_range - remove user pages in a given range | 1382 | * zap_page_range - remove user pages in a given range |
1383 | * @vma: vm_area_struct holding the applicable pages | 1383 | * @vma: vm_area_struct holding the applicable pages |
1384 | * @start: starting address of pages to zap | 1384 | * @start: starting address of pages to zap |
1385 | * @size: number of bytes to zap | 1385 | * @size: number of bytes to zap |
1386 | * @details: details of nonlinear truncation or shared cache invalidation | 1386 | * @details: details of nonlinear truncation or shared cache invalidation |
1387 | * | 1387 | * |
1388 | * Caller must protect the VMA list | 1388 | * Caller must protect the VMA list |
1389 | */ | 1389 | */ |
1390 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, | 1390 | void zap_page_range(struct vm_area_struct *vma, unsigned long start, |
1391 | unsigned long size, struct zap_details *details) | 1391 | unsigned long size, struct zap_details *details) |
1392 | { | 1392 | { |
1393 | struct mm_struct *mm = vma->vm_mm; | 1393 | struct mm_struct *mm = vma->vm_mm; |
1394 | struct mmu_gather tlb; | 1394 | struct mmu_gather tlb; |
1395 | unsigned long end = start + size; | 1395 | unsigned long end = start + size; |
1396 | 1396 | ||
1397 | lru_add_drain(); | 1397 | lru_add_drain(); |
1398 | tlb_gather_mmu(&tlb, mm, start, end); | 1398 | tlb_gather_mmu(&tlb, mm, start, end); |
1399 | update_hiwater_rss(mm); | 1399 | update_hiwater_rss(mm); |
1400 | mmu_notifier_invalidate_range_start(mm, start, end); | 1400 | mmu_notifier_invalidate_range_start(mm, start, end); |
1401 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) | 1401 | for ( ; vma && vma->vm_start < end; vma = vma->vm_next) |
1402 | unmap_single_vma(&tlb, vma, start, end, details); | 1402 | unmap_single_vma(&tlb, vma, start, end, details); |
1403 | mmu_notifier_invalidate_range_end(mm, start, end); | 1403 | mmu_notifier_invalidate_range_end(mm, start, end); |
1404 | tlb_finish_mmu(&tlb, start, end); | 1404 | tlb_finish_mmu(&tlb, start, end); |
1405 | } | 1405 | } |
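
As a usage sketch, this is roughly how madvise(MADV_DONTNEED) discards a range: the caller holds mmap_sem and passes NULL details because no truncation bookkeeping is needed (simplified from madvise_dontneed(), which also updates *prev and handles the vma flag checks shown here):

static long madvise_dontneed_sketch(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	zap_page_range(vma, start, end - start, NULL);
	return 0;
}
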
1406 | 1406 | ||
1407 | /** | 1407 | /** |
1408 | * zap_page_range_single - remove user pages in a given range | 1408 | * zap_page_range_single - remove user pages in a given range |
1409 | * @vma: vm_area_struct holding the applicable pages | 1409 | * @vma: vm_area_struct holding the applicable pages |
1410 | * @address: starting address of pages to zap | 1410 | * @address: starting address of pages to zap |
1411 | * @size: number of bytes to zap | 1411 | * @size: number of bytes to zap |
1412 | * @details: details of nonlinear truncation or shared cache invalidation | 1412 | * @details: details of nonlinear truncation or shared cache invalidation |
1413 | * | 1413 | * |
1414 | * The range must fit into one VMA. | 1414 | * The range must fit into one VMA. |
1415 | */ | 1415 | */ |
1416 | static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, | 1416 | static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, |
1417 | unsigned long size, struct zap_details *details) | 1417 | unsigned long size, struct zap_details *details) |
1418 | { | 1418 | { |
1419 | struct mm_struct *mm = vma->vm_mm; | 1419 | struct mm_struct *mm = vma->vm_mm; |
1420 | struct mmu_gather tlb; | 1420 | struct mmu_gather tlb; |
1421 | unsigned long end = address + size; | 1421 | unsigned long end = address + size; |
1422 | 1422 | ||
1423 | lru_add_drain(); | 1423 | lru_add_drain(); |
1424 | tlb_gather_mmu(&tlb, mm, address, end); | 1424 | tlb_gather_mmu(&tlb, mm, address, end); |
1425 | update_hiwater_rss(mm); | 1425 | update_hiwater_rss(mm); |
1426 | mmu_notifier_invalidate_range_start(mm, address, end); | 1426 | mmu_notifier_invalidate_range_start(mm, address, end); |
1427 | unmap_single_vma(&tlb, vma, address, end, details); | 1427 | unmap_single_vma(&tlb, vma, address, end, details); |
1428 | mmu_notifier_invalidate_range_end(mm, address, end); | 1428 | mmu_notifier_invalidate_range_end(mm, address, end); |
1429 | tlb_finish_mmu(&tlb, address, end); | 1429 | tlb_finish_mmu(&tlb, address, end); |
1430 | } | 1430 | } |
1431 | 1431 | ||
1432 | /** | 1432 | /** |
1433 | * zap_vma_ptes - remove ptes mapping the vma | 1433 | * zap_vma_ptes - remove ptes mapping the vma |
1434 | * @vma: vm_area_struct holding ptes to be zapped | 1434 | * @vma: vm_area_struct holding ptes to be zapped |
1435 | * @address: starting address of pages to zap | 1435 | * @address: starting address of pages to zap |
1436 | * @size: number of bytes to zap | 1436 | * @size: number of bytes to zap |
1437 | * | 1437 | * |
1438 | * This function only unmaps ptes assigned to VM_PFNMAP vmas. | 1438 | * This function only unmaps ptes assigned to VM_PFNMAP vmas. |
1439 | * | 1439 | * |
1440 | * The entire address range must be fully contained within the vma. | 1440 | * The entire address range must be fully contained within the vma. |
1441 | * | 1441 | * |
1442 | * Returns 0 if successful. | 1442 | * Returns 0 if successful. |
1443 | */ | 1443 | */ |
1444 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | 1444 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, |
1445 | unsigned long size) | 1445 | unsigned long size) |
1446 | { | 1446 | { |
1447 | if (address < vma->vm_start || address + size > vma->vm_end || | 1447 | if (address < vma->vm_start || address + size > vma->vm_end || |
1448 | !(vma->vm_flags & VM_PFNMAP)) | 1448 | !(vma->vm_flags & VM_PFNMAP)) |
1449 | return -1; | 1449 | return -1; |
1450 | zap_page_range_single(vma, address, size, NULL); | 1450 | zap_page_range_single(vma, address, size, NULL); |
1451 | return 0; | 1451 | return 0; |
1452 | } | 1452 | } |
1453 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1453 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
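
A hedged example of the intended use: a driver that earlier populated a VM_PFNMAP vma tears the user mappings down again before recycling the underlying resource (the mydev_* naming is hypothetical):

static void mydev_zap_user_mapping(struct vm_area_struct *vma)
{
	unsigned long len = vma->vm_end - vma->vm_start;

	/* Returns -1 if the range is not fully inside a VM_PFNMAP vma. */
	if (zap_vma_ptes(vma, vma->vm_start, len))
		pr_warn("mydev: could not zap user ptes\n");
}
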
1454 | 1454 | ||
1455 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1455 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
1456 | spinlock_t **ptl) | 1456 | spinlock_t **ptl) |
1457 | { | 1457 | { |
1458 | pgd_t * pgd = pgd_offset(mm, addr); | 1458 | pgd_t * pgd = pgd_offset(mm, addr); |
1459 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1459 | pud_t * pud = pud_alloc(mm, pgd, addr); |
1460 | if (pud) { | 1460 | if (pud) { |
1461 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1461 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
1462 | if (pmd) { | 1462 | if (pmd) { |
1463 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 1463 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
1464 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1464 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
1465 | } | 1465 | } |
1466 | } | 1466 | } |
1467 | return NULL; | 1467 | return NULL; |
1468 | } | 1468 | } |
1469 | 1469 | ||
1470 | /* | 1470 | /* |
1471 | * This is the old fallback for page remapping. | 1471 | * This is the old fallback for page remapping. |
1472 | * | 1472 | * |
1473 | * For historical reasons, it only allows reserved pages. Only | 1473 | * For historical reasons, it only allows reserved pages. Only |
1474 | * old drivers should use this, and they needed to mark their | 1474 | * old drivers should use this, and they needed to mark their |
1475 | * pages reserved for the old functions anyway. | 1475 | * pages reserved for the old functions anyway. |
1476 | */ | 1476 | */ |
1477 | static int insert_page(struct vm_area_struct *vma, unsigned long addr, | 1477 | static int insert_page(struct vm_area_struct *vma, unsigned long addr, |
1478 | struct page *page, pgprot_t prot) | 1478 | struct page *page, pgprot_t prot) |
1479 | { | 1479 | { |
1480 | struct mm_struct *mm = vma->vm_mm; | 1480 | struct mm_struct *mm = vma->vm_mm; |
1481 | int retval; | 1481 | int retval; |
1482 | pte_t *pte; | 1482 | pte_t *pte; |
1483 | spinlock_t *ptl; | 1483 | spinlock_t *ptl; |
1484 | 1484 | ||
1485 | retval = -EINVAL; | 1485 | retval = -EINVAL; |
1486 | if (PageAnon(page)) | 1486 | if (PageAnon(page)) |
1487 | goto out; | 1487 | goto out; |
1488 | retval = -ENOMEM; | 1488 | retval = -ENOMEM; |
1489 | flush_dcache_page(page); | 1489 | flush_dcache_page(page); |
1490 | pte = get_locked_pte(mm, addr, &ptl); | 1490 | pte = get_locked_pte(mm, addr, &ptl); |
1491 | if (!pte) | 1491 | if (!pte) |
1492 | goto out; | 1492 | goto out; |
1493 | retval = -EBUSY; | 1493 | retval = -EBUSY; |
1494 | if (!pte_none(*pte)) | 1494 | if (!pte_none(*pte)) |
1495 | goto out_unlock; | 1495 | goto out_unlock; |
1496 | 1496 | ||
1497 | /* Ok, finally just insert the thing.. */ | 1497 | /* Ok, finally just insert the thing.. */ |
1498 | get_page(page); | 1498 | get_page(page); |
1499 | inc_mm_counter_fast(mm, MM_FILEPAGES); | 1499 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
1500 | page_add_file_rmap(page); | 1500 | page_add_file_rmap(page); |
1501 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1501 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1502 | 1502 | ||
1503 | retval = 0; | 1503 | retval = 0; |
1504 | pte_unmap_unlock(pte, ptl); | 1504 | pte_unmap_unlock(pte, ptl); |
1505 | return retval; | 1505 | return retval; |
1506 | out_unlock: | 1506 | out_unlock: |
1507 | pte_unmap_unlock(pte, ptl); | 1507 | pte_unmap_unlock(pte, ptl); |
1508 | out: | 1508 | out: |
1509 | return retval; | 1509 | return retval; |
1510 | } | 1510 | } |
1511 | 1511 | ||
1512 | /** | 1512 | /** |
1513 | * vm_insert_page - insert single page into user vma | 1513 | * vm_insert_page - insert single page into user vma |
1514 | * @vma: user vma to map to | 1514 | * @vma: user vma to map to |
1515 | * @addr: target user address of this page | 1515 | * @addr: target user address of this page |
1516 | * @page: source kernel page | 1516 | * @page: source kernel page |
1517 | * | 1517 | * |
1518 | * This allows drivers to insert individual pages they've allocated | 1518 | * This allows drivers to insert individual pages they've allocated |
1519 | * into a user vma. | 1519 | * into a user vma. |
1520 | * | 1520 | * |
1521 | * The page has to be a nice clean _individual_ kernel allocation. | 1521 | * The page has to be a nice clean _individual_ kernel allocation. |
1522 | * If you allocate a compound page, you need to have marked it as | 1522 | * If you allocate a compound page, you need to have marked it as |
1523 | * such (__GFP_COMP), or manually just split the page up yourself | 1523 | * such (__GFP_COMP), or manually just split the page up yourself |
1524 | * (see split_page()). | 1524 | * (see split_page()). |
1525 | * | 1525 | * |
1526 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | 1526 | * NOTE! Traditionally this was done with "remap_pfn_range()" which |
1527 | * took an arbitrary page protection parameter. This doesn't allow | 1527 | * took an arbitrary page protection parameter. This doesn't allow |
1528 | * that. Your vma protection will have to be set up correctly, which | 1528 | * that. Your vma protection will have to be set up correctly, which |
1529 | * means that if you want a shared writable mapping, you'd better | 1529 | * means that if you want a shared writable mapping, you'd better |
1530 | * ask for a shared writable mapping! | 1530 | * ask for a shared writable mapping! |
1531 | * | 1531 | * |
1532 | * The page does not need to be reserved. | 1532 | * The page does not need to be reserved. |
1533 | * | 1533 | * |
1534 | * Usually this function is called from f_op->mmap() handler | 1534 | * Usually this function is called from f_op->mmap() handler |
1535 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | 1535 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. |
1536 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | 1536 | * Caller must set VM_MIXEDMAP on vma if it wants to call this |
1537 | * function from other places, for example from page-fault handler. | 1537 | * function from other places, for example from page-fault handler. |
1538 | */ | 1538 | */ |
1539 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 1539 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
1540 | struct page *page) | 1540 | struct page *page) |
1541 | { | 1541 | { |
1542 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1542 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1543 | return -EFAULT; | 1543 | return -EFAULT; |
1544 | if (!page_count(page)) | 1544 | if (!page_count(page)) |
1545 | return -EINVAL; | 1545 | return -EINVAL; |
1546 | if (!(vma->vm_flags & VM_MIXEDMAP)) { | 1546 | if (!(vma->vm_flags & VM_MIXEDMAP)) { |
1547 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); | 1547 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); |
1548 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 1548 | BUG_ON(vma->vm_flags & VM_PFNMAP); |
1549 | vma->vm_flags |= VM_MIXEDMAP; | 1549 | vma->vm_flags |= VM_MIXEDMAP; |
1550 | } | 1550 | } |
1551 | return insert_page(vma, addr, page, vma->vm_page_prot); | 1551 | return insert_page(vma, addr, page, vma->vm_page_prot); |
1552 | } | 1552 | } |
1553 | EXPORT_SYMBOL(vm_insert_page); | 1553 | EXPORT_SYMBOL(vm_insert_page); |
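
A minimal sketch of the common use from an f_op->mmap handler, inserting the pages of a driver-allocated buffer one by one; the mybuf structure and its fields are assumptions for the example, and error handling beyond the return code is omitted:

static int mybuf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mybuf *buf = file->private_data;		/* hypothetical buffer */
	unsigned long npages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	unsigned long uaddr = vma->vm_start;
	unsigned long i;
	int ret;

	if (npages > buf->npages)
		return -EINVAL;

	for (i = 0; i < npages; i++) {
		ret = vm_insert_page(vma, uaddr, buf->pages[i]);
		if (ret)
			return ret;
		uaddr += PAGE_SIZE;
	}
	return 0;
}
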
1554 | 1554 | ||
1555 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1555 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1556 | unsigned long pfn, pgprot_t prot) | 1556 | unsigned long pfn, pgprot_t prot) |
1557 | { | 1557 | { |
1558 | struct mm_struct *mm = vma->vm_mm; | 1558 | struct mm_struct *mm = vma->vm_mm; |
1559 | int retval; | 1559 | int retval; |
1560 | pte_t *pte, entry; | 1560 | pte_t *pte, entry; |
1561 | spinlock_t *ptl; | 1561 | spinlock_t *ptl; |
1562 | 1562 | ||
1563 | retval = -ENOMEM; | 1563 | retval = -ENOMEM; |
1564 | pte = get_locked_pte(mm, addr, &ptl); | 1564 | pte = get_locked_pte(mm, addr, &ptl); |
1565 | if (!pte) | 1565 | if (!pte) |
1566 | goto out; | 1566 | goto out; |
1567 | retval = -EBUSY; | 1567 | retval = -EBUSY; |
1568 | if (!pte_none(*pte)) | 1568 | if (!pte_none(*pte)) |
1569 | goto out_unlock; | 1569 | goto out_unlock; |
1570 | 1570 | ||
1571 | /* Ok, finally just insert the thing.. */ | 1571 | /* Ok, finally just insert the thing.. */ |
1572 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1572 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
1573 | set_pte_at(mm, addr, pte, entry); | 1573 | set_pte_at(mm, addr, pte, entry); |
1574 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ | 1574 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1575 | 1575 | ||
1576 | retval = 0; | 1576 | retval = 0; |
1577 | out_unlock: | 1577 | out_unlock: |
1578 | pte_unmap_unlock(pte, ptl); | 1578 | pte_unmap_unlock(pte, ptl); |
1579 | out: | 1579 | out: |
1580 | return retval; | 1580 | return retval; |
1581 | } | 1581 | } |
1582 | 1582 | ||
1583 | /** | 1583 | /** |
1584 | * vm_insert_pfn - insert single pfn into user vma | 1584 | * vm_insert_pfn - insert single pfn into user vma |
1585 | * @vma: user vma to map to | 1585 | * @vma: user vma to map to |
1586 | * @addr: target user address of this page | 1586 | * @addr: target user address of this page |
1587 | * @pfn: source kernel pfn | 1587 | * @pfn: source kernel pfn |
1588 | * | 1588 | * |
1589 | * Similar to vm_insert_page, this allows drivers to insert individual pages | 1589 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
1590 | * they've allocated into a user vma. Same comments apply. | 1590 | * they've allocated into a user vma. Same comments apply. |
1591 | * | 1591 | * |
1592 | * This function should only be called from a vm_ops->fault handler, and | 1592 | * This function should only be called from a vm_ops->fault handler, and |
1593 | * in that case the handler should return NULL. | 1593 | * in that case the handler should return NULL. |
1594 | * | 1594 | * |
1595 | * vma cannot be a COW mapping. | 1595 | * vma cannot be a COW mapping. |
1596 | * | 1596 | * |
1597 | * As this is called only for pages that do not currently exist, we | 1597 | * As this is called only for pages that do not currently exist, we |
1598 | * do not need to flush old virtual caches or the TLB. | 1598 | * do not need to flush old virtual caches or the TLB. |
1599 | */ | 1599 | */ |
1600 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1600 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1601 | unsigned long pfn) | 1601 | unsigned long pfn) |
1602 | { | 1602 | { |
1603 | int ret; | 1603 | int ret; |
1604 | pgprot_t pgprot = vma->vm_page_prot; | 1604 | pgprot_t pgprot = vma->vm_page_prot; |
1605 | /* | 1605 | /* |
1606 | * Technically, architectures with pte_special can avoid all these | 1606 | * Technically, architectures with pte_special can avoid all these |
1607 | * restrictions (same for remap_pfn_range). However we would like | 1607 | * restrictions (same for remap_pfn_range). However we would like |
1608 | * consistency in testing and feature parity among all, so we should | 1608 | * consistency in testing and feature parity among all, so we should |
1609 | * try to keep these invariants in place for everybody. | 1609 | * try to keep these invariants in place for everybody. |
1610 | */ | 1610 | */ |
1611 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); | 1611 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); |
1612 | BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == | 1612 | BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == |
1613 | (VM_PFNMAP|VM_MIXEDMAP)); | 1613 | (VM_PFNMAP|VM_MIXEDMAP)); |
1614 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); | 1614 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); |
1615 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | 1615 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); |
1616 | 1616 | ||
1617 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1617 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1618 | return -EFAULT; | 1618 | return -EFAULT; |
1619 | if (track_pfn_insert(vma, &pgprot, pfn)) | 1619 | if (track_pfn_insert(vma, &pgprot, pfn)) |
1620 | return -EINVAL; | 1620 | return -EINVAL; |
1621 | 1621 | ||
1622 | ret = insert_pfn(vma, addr, pfn, pgprot); | 1622 | ret = insert_pfn(vma, addr, pfn, pgprot); |
1623 | 1623 | ||
1624 | return ret; | 1624 | return ret; |
1625 | } | 1625 | } |
1626 | EXPORT_SYMBOL(vm_insert_pfn); | 1626 | EXPORT_SYMBOL(vm_insert_pfn); |
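
A sketch of the fault-handler use described above, written against this kernel's vm_ops->fault prototype; mydev_pfn_for() is a hypothetical lookup, and the mapping of error codes to VM_FAULT_* values follows the usual driver convention rather than anything mandated here:

static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->virtual_address;
	unsigned long pfn = mydev_pfn_for(vma, vmf->pgoff);	/* hypothetical */
	int err;

	err = vm_insert_pfn(vma, addr, pfn);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)	/* -EBUSY: raced with another fault */
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;		/* pte installed, no struct page to return */
}
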
1627 | 1627 | ||
1628 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1628 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1629 | unsigned long pfn) | 1629 | unsigned long pfn) |
1630 | { | 1630 | { |
1631 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); | 1631 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); |
1632 | 1632 | ||
1633 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1633 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1634 | return -EFAULT; | 1634 | return -EFAULT; |
1635 | 1635 | ||
1636 | /* | 1636 | /* |
1637 | * If we don't have pte special, then we have to use the pfn_valid() | 1637 | * If we don't have pte special, then we have to use the pfn_valid() |
1638 | * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* | 1638 | * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* |
1639 | * refcount the page if pfn_valid is true (hence insert_page rather | 1639 | * refcount the page if pfn_valid is true (hence insert_page rather |
1640 | * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP | 1640 | * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP |
1641 | * without pte special, it would there be refcounted as a normal page. | 1641 | * without pte special, it would there be refcounted as a normal page. |
1642 | */ | 1642 | */ |
1643 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { | 1643 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { |
1644 | struct page *page; | 1644 | struct page *page; |
1645 | 1645 | ||
1646 | page = pfn_to_page(pfn); | 1646 | page = pfn_to_page(pfn); |
1647 | return insert_page(vma, addr, page, vma->vm_page_prot); | 1647 | return insert_page(vma, addr, page, vma->vm_page_prot); |
1648 | } | 1648 | } |
1649 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); | 1649 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); |
1650 | } | 1650 | } |
1651 | EXPORT_SYMBOL(vm_insert_mixed); | 1651 | EXPORT_SYMBOL(vm_insert_mixed); |
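
The difference from vm_insert_pfn() is only in how a pfn that has a struct page is treated: on a VM_MIXEDMAP vma it is inserted as a normal, refcounted page. A caller therefore looks like the fault-handler sketch above, just with VM_MIXEDMAP set and a lookup that may return either RAM or device pfns (mydev_lookup_pfn() is hypothetical):

	/* inside a fault handler on a VM_MIXEDMAP vma (sketch) */
	unsigned long pfn = mydev_lookup_pfn(vma, vmf->pgoff);

	err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
	/* pfn_valid() pfns are refcounted via insert_page(), others are not */
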
1652 | 1652 | ||
1653 | /* | 1653 | /* |
1654 | * maps a range of physical memory into the requested pages. the old | 1654 | * maps a range of physical memory into the requested pages. the old |
1655 | * mappings are removed. any reference to a nonexistent page results | 1655 | * mappings are removed. any reference to a nonexistent page results |
1656 | * in a null mapping (currently treated as "copy-on-access") | 1656 | * in a null mapping (currently treated as "copy-on-access") |
1657 | */ | 1657 | */ |
1658 | static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | 1658 | static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, |
1659 | unsigned long addr, unsigned long end, | 1659 | unsigned long addr, unsigned long end, |
1660 | unsigned long pfn, pgprot_t prot) | 1660 | unsigned long pfn, pgprot_t prot) |
1661 | { | 1661 | { |
1662 | pte_t *pte; | 1662 | pte_t *pte; |
1663 | spinlock_t *ptl; | 1663 | spinlock_t *ptl; |
1664 | 1664 | ||
1665 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1665 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1666 | if (!pte) | 1666 | if (!pte) |
1667 | return -ENOMEM; | 1667 | return -ENOMEM; |
1668 | arch_enter_lazy_mmu_mode(); | 1668 | arch_enter_lazy_mmu_mode(); |
1669 | do { | 1669 | do { |
1670 | BUG_ON(!pte_none(*pte)); | 1670 | BUG_ON(!pte_none(*pte)); |
1671 | set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); | 1671 | set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); |
1672 | pfn++; | 1672 | pfn++; |
1673 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1673 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1674 | arch_leave_lazy_mmu_mode(); | 1674 | arch_leave_lazy_mmu_mode(); |
1675 | pte_unmap_unlock(pte - 1, ptl); | 1675 | pte_unmap_unlock(pte - 1, ptl); |
1676 | return 0; | 1676 | return 0; |
1677 | } | 1677 | } |
1678 | 1678 | ||
1679 | static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | 1679 | static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, |
1680 | unsigned long addr, unsigned long end, | 1680 | unsigned long addr, unsigned long end, |
1681 | unsigned long pfn, pgprot_t prot) | 1681 | unsigned long pfn, pgprot_t prot) |
1682 | { | 1682 | { |
1683 | pmd_t *pmd; | 1683 | pmd_t *pmd; |
1684 | unsigned long next; | 1684 | unsigned long next; |
1685 | 1685 | ||
1686 | pfn -= addr >> PAGE_SHIFT; | 1686 | pfn -= addr >> PAGE_SHIFT; |
1687 | pmd = pmd_alloc(mm, pud, addr); | 1687 | pmd = pmd_alloc(mm, pud, addr); |
1688 | if (!pmd) | 1688 | if (!pmd) |
1689 | return -ENOMEM; | 1689 | return -ENOMEM; |
1690 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 1690 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
1691 | do { | 1691 | do { |
1692 | next = pmd_addr_end(addr, end); | 1692 | next = pmd_addr_end(addr, end); |
1693 | if (remap_pte_range(mm, pmd, addr, next, | 1693 | if (remap_pte_range(mm, pmd, addr, next, |
1694 | pfn + (addr >> PAGE_SHIFT), prot)) | 1694 | pfn + (addr >> PAGE_SHIFT), prot)) |
1695 | return -ENOMEM; | 1695 | return -ENOMEM; |
1696 | } while (pmd++, addr = next, addr != end); | 1696 | } while (pmd++, addr = next, addr != end); |
1697 | return 0; | 1697 | return 0; |
1698 | } | 1698 | } |
1699 | 1699 | ||
1700 | static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1700 | static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, |
1701 | unsigned long addr, unsigned long end, | 1701 | unsigned long addr, unsigned long end, |
1702 | unsigned long pfn, pgprot_t prot) | 1702 | unsigned long pfn, pgprot_t prot) |
1703 | { | 1703 | { |
1704 | pud_t *pud; | 1704 | pud_t *pud; |
1705 | unsigned long next; | 1705 | unsigned long next; |
1706 | 1706 | ||
1707 | pfn -= addr >> PAGE_SHIFT; | 1707 | pfn -= addr >> PAGE_SHIFT; |
1708 | pud = pud_alloc(mm, pgd, addr); | 1708 | pud = pud_alloc(mm, pgd, addr); |
1709 | if (!pud) | 1709 | if (!pud) |
1710 | return -ENOMEM; | 1710 | return -ENOMEM; |
1711 | do { | 1711 | do { |
1712 | next = pud_addr_end(addr, end); | 1712 | next = pud_addr_end(addr, end); |
1713 | if (remap_pmd_range(mm, pud, addr, next, | 1713 | if (remap_pmd_range(mm, pud, addr, next, |
1714 | pfn + (addr >> PAGE_SHIFT), prot)) | 1714 | pfn + (addr >> PAGE_SHIFT), prot)) |
1715 | return -ENOMEM; | 1715 | return -ENOMEM; |
1716 | } while (pud++, addr = next, addr != end); | 1716 | } while (pud++, addr = next, addr != end); |
1717 | return 0; | 1717 | return 0; |
1718 | } | 1718 | } |
1719 | 1719 | ||
1720 | /** | 1720 | /** |
1721 | * remap_pfn_range - remap kernel memory to userspace | 1721 | * remap_pfn_range - remap kernel memory to userspace |
1722 | * @vma: user vma to map to | 1722 | * @vma: user vma to map to |
1723 | * @addr: target user address to start at | 1723 | * @addr: target user address to start at |
1724 | * @pfn: physical address of kernel memory | 1724 | * @pfn: physical address of kernel memory |
1725 | * @size: size of map area | 1725 | * @size: size of map area |
1726 | * @prot: page protection flags for this mapping | 1726 | * @prot: page protection flags for this mapping |
1727 | * | 1727 | * |
1728 | * Note: this is only safe if the mm semaphore is held when called. | 1728 | * Note: this is only safe if the mm semaphore is held when called. |
1729 | */ | 1729 | */ |
1730 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 1730 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1731 | unsigned long pfn, unsigned long size, pgprot_t prot) | 1731 | unsigned long pfn, unsigned long size, pgprot_t prot) |
1732 | { | 1732 | { |
1733 | pgd_t *pgd; | 1733 | pgd_t *pgd; |
1734 | unsigned long next; | 1734 | unsigned long next; |
1735 | unsigned long end = addr + PAGE_ALIGN(size); | 1735 | unsigned long end = addr + PAGE_ALIGN(size); |
1736 | struct mm_struct *mm = vma->vm_mm; | 1736 | struct mm_struct *mm = vma->vm_mm; |
1737 | int err; | 1737 | int err; |
1738 | 1738 | ||
1739 | /* | 1739 | /* |
1740 | * Physically remapped pages are special. Tell the | 1740 | * Physically remapped pages are special. Tell the |
1741 | * rest of the world about it: | 1741 | * rest of the world about it: |
1742 | * VM_IO tells people not to look at these pages | 1742 | * VM_IO tells people not to look at these pages |
1743 | * (accesses can have side effects). | 1743 | * (accesses can have side effects). |
1744 | * VM_PFNMAP tells the core MM that the base pages are just | 1744 | * VM_PFNMAP tells the core MM that the base pages are just |
1745 | * raw PFN mappings, and do not have a "struct page" associated | 1745 | * raw PFN mappings, and do not have a "struct page" associated |
1746 | * with them. | 1746 | * with them. |
1747 | * VM_DONTEXPAND | 1747 | * VM_DONTEXPAND |
1748 | * Disable vma merging and expanding with mremap(). | 1748 | * Disable vma merging and expanding with mremap(). |
1749 | * VM_DONTDUMP | 1749 | * VM_DONTDUMP |
1750 | * Omit vma from core dump, even when VM_IO turned off. | 1750 | * Omit vma from core dump, even when VM_IO turned off. |
1751 | * | 1751 | * |
1752 | * There's a horrible special case to handle copy-on-write | 1752 | * There's a horrible special case to handle copy-on-write |
1753 | * behaviour that some programs depend on. We mark the "original" | 1753 | * behaviour that some programs depend on. We mark the "original" |
1754 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 1754 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
1755 | * See vm_normal_page() for details. | 1755 | * See vm_normal_page() for details. |
1756 | */ | 1756 | */ |
1757 | if (is_cow_mapping(vma->vm_flags)) { | 1757 | if (is_cow_mapping(vma->vm_flags)) { |
1758 | if (addr != vma->vm_start || end != vma->vm_end) | 1758 | if (addr != vma->vm_start || end != vma->vm_end) |
1759 | return -EINVAL; | 1759 | return -EINVAL; |
1760 | vma->vm_pgoff = pfn; | 1760 | vma->vm_pgoff = pfn; |
1761 | } | 1761 | } |
1762 | 1762 | ||
1763 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); | 1763 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); |
1764 | if (err) | 1764 | if (err) |
1765 | return -EINVAL; | 1765 | return -EINVAL; |
1766 | 1766 | ||
1767 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; | 1767 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
1768 | 1768 | ||
1769 | BUG_ON(addr >= end); | 1769 | BUG_ON(addr >= end); |
1770 | pfn -= addr >> PAGE_SHIFT; | 1770 | pfn -= addr >> PAGE_SHIFT; |
1771 | pgd = pgd_offset(mm, addr); | 1771 | pgd = pgd_offset(mm, addr); |
1772 | flush_cache_range(vma, addr, end); | 1772 | flush_cache_range(vma, addr, end); |
1773 | do { | 1773 | do { |
1774 | next = pgd_addr_end(addr, end); | 1774 | next = pgd_addr_end(addr, end); |
1775 | err = remap_pud_range(mm, pgd, addr, next, | 1775 | err = remap_pud_range(mm, pgd, addr, next, |
1776 | pfn + (addr >> PAGE_SHIFT), prot); | 1776 | pfn + (addr >> PAGE_SHIFT), prot); |
1777 | if (err) | 1777 | if (err) |
1778 | break; | 1778 | break; |
1779 | } while (pgd++, addr = next, addr != end); | 1779 | } while (pgd++, addr = next, addr != end); |
1780 | 1780 | ||
1781 | if (err) | 1781 | if (err) |
1782 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); | 1782 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); |
1783 | 1783 | ||
1784 | return err; | 1784 | return err; |
1785 | } | 1785 | } |
1786 | EXPORT_SYMBOL(remap_pfn_range); | 1786 | EXPORT_SYMBOL(remap_pfn_range); |
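
The classic whole-range use is from a driver's mmap handler, honouring the user-supplied vm_pgoff offset into the region; mydev_phys is a hypothetical physical base address and bounds checking against the real region size is left to the caller:

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (mydev_phys >> PAGE_SHIFT) + vma->vm_pgoff;

	if (remap_pfn_range(vma, vma->vm_start, pfn, size,
			    vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}
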
1787 | 1787 | ||
1788 | /** | 1788 | /** |
1789 | * vm_iomap_memory - remap memory to userspace | 1789 | * vm_iomap_memory - remap memory to userspace |
1790 | * @vma: user vma to map to | 1790 | * @vma: user vma to map to |
1791 | * @start: start of area | 1791 | * @start: start of area |
1792 | * @len: size of area | 1792 | * @len: size of area |
1793 | * | 1793 | * |
1794 | * This is a simplified io_remap_pfn_range() for common driver use. The | 1794 | * This is a simplified io_remap_pfn_range() for common driver use. The |
1795 | * driver just needs to give us the physical memory range to be mapped, | 1795 | * driver just needs to give us the physical memory range to be mapped, |
1796 | * we'll figure out the rest from the vma information. | 1796 | * we'll figure out the rest from the vma information. |
1797 | * | 1797 | * |
1798 | * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get | 1798 | * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get |
1799 | * whatever write-combining details or similar. | 1799 | * whatever write-combining details or similar. |
1800 | */ | 1800 | */ |
1801 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) | 1801 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) |
1802 | { | 1802 | { |
1803 | unsigned long vm_len, pfn, pages; | 1803 | unsigned long vm_len, pfn, pages; |
1804 | 1804 | ||
1805 | /* Check that the physical memory area passed in looks valid */ | 1805 | /* Check that the physical memory area passed in looks valid */ |
1806 | if (start + len < start) | 1806 | if (start + len < start) |
1807 | return -EINVAL; | 1807 | return -EINVAL; |
1808 | /* | 1808 | /* |
1809 | * You *really* shouldn't map things that aren't page-aligned, | 1809 | * You *really* shouldn't map things that aren't page-aligned, |
1810 | * but we've historically allowed it because IO memory might | 1810 | * but we've historically allowed it because IO memory might |
1811 | * just have smaller alignment. | 1811 | * just have smaller alignment. |
1812 | */ | 1812 | */ |
1813 | len += start & ~PAGE_MASK; | 1813 | len += start & ~PAGE_MASK; |
1814 | pfn = start >> PAGE_SHIFT; | 1814 | pfn = start >> PAGE_SHIFT; |
1815 | pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; | 1815 | pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; |
1816 | if (pfn + pages < pfn) | 1816 | if (pfn + pages < pfn) |
1817 | return -EINVAL; | 1817 | return -EINVAL; |
1818 | 1818 | ||
1819 | /* We start the mapping 'vm_pgoff' pages into the area */ | 1819 | /* We start the mapping 'vm_pgoff' pages into the area */ |
1820 | if (vma->vm_pgoff > pages) | 1820 | if (vma->vm_pgoff > pages) |
1821 | return -EINVAL; | 1821 | return -EINVAL; |
1822 | pfn += vma->vm_pgoff; | 1822 | pfn += vma->vm_pgoff; |
1823 | pages -= vma->vm_pgoff; | 1823 | pages -= vma->vm_pgoff; |
1824 | 1824 | ||
1825 | /* Can we fit all of the mapping? */ | 1825 | /* Can we fit all of the mapping? */ |
1826 | vm_len = vma->vm_end - vma->vm_start; | 1826 | vm_len = vma->vm_end - vma->vm_start; |
1827 | if (vm_len >> PAGE_SHIFT > pages) | 1827 | if (vm_len >> PAGE_SHIFT > pages) |
1828 | return -EINVAL; | 1828 | return -EINVAL; |
1829 | 1829 | ||
1830 | /* Ok, let it rip */ | 1830 | /* Ok, let it rip */ |
1831 | return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); | 1831 | return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); |
1832 | } | 1832 | } |
1833 | EXPORT_SYMBOL(vm_iomap_memory); | 1833 | EXPORT_SYMBOL(vm_iomap_memory); |
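
With this helper the same driver mmap handler shrinks to a single call, since the vma supplies the user address, the offset and the length checks; bar_start and bar_len are hypothetical fields describing the device aperture:

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydev *dev = file->private_data;		/* hypothetical */

	return vm_iomap_memory(vma, dev->bar_start, dev->bar_len);
}
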
1834 | 1834 | ||
1835 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | 1835 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, |
1836 | unsigned long addr, unsigned long end, | 1836 | unsigned long addr, unsigned long end, |
1837 | pte_fn_t fn, void *data) | 1837 | pte_fn_t fn, void *data) |
1838 | { | 1838 | { |
1839 | pte_t *pte; | 1839 | pte_t *pte; |
1840 | int err; | 1840 | int err; |
1841 | pgtable_t token; | 1841 | pgtable_t token; |
1842 | spinlock_t *uninitialized_var(ptl); | 1842 | spinlock_t *uninitialized_var(ptl); |
1843 | 1843 | ||
1844 | pte = (mm == &init_mm) ? | 1844 | pte = (mm == &init_mm) ? |
1845 | pte_alloc_kernel(pmd, addr) : | 1845 | pte_alloc_kernel(pmd, addr) : |
1846 | pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1846 | pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1847 | if (!pte) | 1847 | if (!pte) |
1848 | return -ENOMEM; | 1848 | return -ENOMEM; |
1849 | 1849 | ||
1850 | BUG_ON(pmd_huge(*pmd)); | 1850 | BUG_ON(pmd_huge(*pmd)); |
1851 | 1851 | ||
1852 | arch_enter_lazy_mmu_mode(); | 1852 | arch_enter_lazy_mmu_mode(); |
1853 | 1853 | ||
1854 | token = pmd_pgtable(*pmd); | 1854 | token = pmd_pgtable(*pmd); |
1855 | 1855 | ||
1856 | do { | 1856 | do { |
1857 | err = fn(pte++, token, addr, data); | 1857 | err = fn(pte++, token, addr, data); |
1858 | if (err) | 1858 | if (err) |
1859 | break; | 1859 | break; |
1860 | } while (addr += PAGE_SIZE, addr != end); | 1860 | } while (addr += PAGE_SIZE, addr != end); |
1861 | 1861 | ||
1862 | arch_leave_lazy_mmu_mode(); | 1862 | arch_leave_lazy_mmu_mode(); |
1863 | 1863 | ||
1864 | if (mm != &init_mm) | 1864 | if (mm != &init_mm) |
1865 | pte_unmap_unlock(pte-1, ptl); | 1865 | pte_unmap_unlock(pte-1, ptl); |
1866 | return err; | 1866 | return err; |
1867 | } | 1867 | } |
1868 | 1868 | ||
1869 | static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, | 1869 | static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, |
1870 | unsigned long addr, unsigned long end, | 1870 | unsigned long addr, unsigned long end, |
1871 | pte_fn_t fn, void *data) | 1871 | pte_fn_t fn, void *data) |
1872 | { | 1872 | { |
1873 | pmd_t *pmd; | 1873 | pmd_t *pmd; |
1874 | unsigned long next; | 1874 | unsigned long next; |
1875 | int err; | 1875 | int err; |
1876 | 1876 | ||
1877 | BUG_ON(pud_huge(*pud)); | 1877 | BUG_ON(pud_huge(*pud)); |
1878 | 1878 | ||
1879 | pmd = pmd_alloc(mm, pud, addr); | 1879 | pmd = pmd_alloc(mm, pud, addr); |
1880 | if (!pmd) | 1880 | if (!pmd) |
1881 | return -ENOMEM; | 1881 | return -ENOMEM; |
1882 | do { | 1882 | do { |
1883 | next = pmd_addr_end(addr, end); | 1883 | next = pmd_addr_end(addr, end); |
1884 | err = apply_to_pte_range(mm, pmd, addr, next, fn, data); | 1884 | err = apply_to_pte_range(mm, pmd, addr, next, fn, data); |
1885 | if (err) | 1885 | if (err) |
1886 | break; | 1886 | break; |
1887 | } while (pmd++, addr = next, addr != end); | 1887 | } while (pmd++, addr = next, addr != end); |
1888 | return err; | 1888 | return err; |
1889 | } | 1889 | } |
1890 | 1890 | ||
1891 | static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1891 | static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, |
1892 | unsigned long addr, unsigned long end, | 1892 | unsigned long addr, unsigned long end, |
1893 | pte_fn_t fn, void *data) | 1893 | pte_fn_t fn, void *data) |
1894 | { | 1894 | { |
1895 | pud_t *pud; | 1895 | pud_t *pud; |
1896 | unsigned long next; | 1896 | unsigned long next; |
1897 | int err; | 1897 | int err; |
1898 | 1898 | ||
1899 | pud = pud_alloc(mm, pgd, addr); | 1899 | pud = pud_alloc(mm, pgd, addr); |
1900 | if (!pud) | 1900 | if (!pud) |
1901 | return -ENOMEM; | 1901 | return -ENOMEM; |
1902 | do { | 1902 | do { |
1903 | next = pud_addr_end(addr, end); | 1903 | next = pud_addr_end(addr, end); |
1904 | err = apply_to_pmd_range(mm, pud, addr, next, fn, data); | 1904 | err = apply_to_pmd_range(mm, pud, addr, next, fn, data); |
1905 | if (err) | 1905 | if (err) |
1906 | break; | 1906 | break; |
1907 | } while (pud++, addr = next, addr != end); | 1907 | } while (pud++, addr = next, addr != end); |
1908 | return err; | 1908 | return err; |
1909 | } | 1909 | } |
1910 | 1910 | ||
1911 | /* | 1911 | /* |
1912 | * Scan a region of virtual memory, filling in page tables as necessary | 1912 | * Scan a region of virtual memory, filling in page tables as necessary |
1913 | * and calling a provided function on each leaf page table. | 1913 | * and calling a provided function on each leaf page table. |
1914 | */ | 1914 | */ |
1915 | int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | 1915 | int apply_to_page_range(struct mm_struct *mm, unsigned long addr, |
1916 | unsigned long size, pte_fn_t fn, void *data) | 1916 | unsigned long size, pte_fn_t fn, void *data) |
1917 | { | 1917 | { |
1918 | pgd_t *pgd; | 1918 | pgd_t *pgd; |
1919 | unsigned long next; | 1919 | unsigned long next; |
1920 | unsigned long end = addr + size; | 1920 | unsigned long end = addr + size; |
1921 | int err; | 1921 | int err; |
1922 | 1922 | ||
1923 | BUG_ON(addr >= end); | 1923 | BUG_ON(addr >= end); |
1924 | pgd = pgd_offset(mm, addr); | 1924 | pgd = pgd_offset(mm, addr); |
1925 | do { | 1925 | do { |
1926 | next = pgd_addr_end(addr, end); | 1926 | next = pgd_addr_end(addr, end); |
1927 | err = apply_to_pud_range(mm, pgd, addr, next, fn, data); | 1927 | err = apply_to_pud_range(mm, pgd, addr, next, fn, data); |
1928 | if (err) | 1928 | if (err) |
1929 | break; | 1929 | break; |
1930 | } while (pgd++, addr = next, addr != end); | 1930 | } while (pgd++, addr = next, addr != end); |
1931 | 1931 | ||
1932 | return err; | 1932 | return err; |
1933 | } | 1933 | } |
1934 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1934 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
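
A small sketch of the callback protocol: the pte_fn_t is invoked once per leaf pte with the pte already mapped (and locked for user mms), so it only inspects or rewrites the single entry it is handed. Here it merely counts populated entries; the function names are invented for the example:

static int count_present_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (!pte_none(*pte))
		(*count)++;
	return 0;			/* non-zero would abort the walk */
}

	/* caller side (sketch) */
	unsigned long present = 0;

	apply_to_page_range(mm, start, size, count_present_pte, &present);
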
1935 | 1935 | ||
1936 | /* | 1936 | /* |
1937 | * handle_pte_fault chooses page fault handler according to an entry | 1937 | * handle_pte_fault chooses page fault handler according to an entry |
1938 | * which was read non-atomically. Before making any commitment, on | 1938 | * which was read non-atomically. Before making any commitment, on |
1939 | * those architectures or configurations (e.g. i386 with PAE) which | 1939 | * those architectures or configurations (e.g. i386 with PAE) which |
1940 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault | 1940 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault |
1941 | * must check under lock before unmapping the pte and proceeding | 1941 | * must check under lock before unmapping the pte and proceeding |
1942 | * (but do_wp_page is only called after already making such a check; | 1942 | * (but do_wp_page is only called after already making such a check; |
1943 | * and do_anonymous_page can safely check later on). | 1943 | * and do_anonymous_page can safely check later on). |
1944 | */ | 1944 | */ |
1945 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1945 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
1946 | pte_t *page_table, pte_t orig_pte) | 1946 | pte_t *page_table, pte_t orig_pte) |
1947 | { | 1947 | { |
1948 | int same = 1; | 1948 | int same = 1; |
1949 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 1949 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
1950 | if (sizeof(pte_t) > sizeof(unsigned long)) { | 1950 | if (sizeof(pte_t) > sizeof(unsigned long)) { |
1951 | spinlock_t *ptl = pte_lockptr(mm, pmd); | 1951 | spinlock_t *ptl = pte_lockptr(mm, pmd); |
1952 | spin_lock(ptl); | 1952 | spin_lock(ptl); |
1953 | same = pte_same(*page_table, orig_pte); | 1953 | same = pte_same(*page_table, orig_pte); |
1954 | spin_unlock(ptl); | 1954 | spin_unlock(ptl); |
1955 | } | 1955 | } |
1956 | #endif | 1956 | #endif |
1957 | pte_unmap(page_table); | 1957 | pte_unmap(page_table); |
1958 | return same; | 1958 | return same; |
1959 | } | 1959 | } |
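
Callers such as do_swap_page() use this as an early bail-out before any expensive work, then re-take the pte lock and re-check before committing; a trimmed fragment of that pattern:

	/* fragment of a fault handler (sketch) */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;	/* pte changed under us; the fault will be retried */

	/* ... possibly sleeping work: swap-in, page allocation, etc. ... */

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;	/* lost a race while we slept */
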
1960 | 1960 | ||
1961 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 1961 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
1962 | { | 1962 | { |
1963 | debug_dma_assert_idle(src); | 1963 | debug_dma_assert_idle(src); |
1964 | 1964 | ||
1965 | /* | 1965 | /* |
1966 | * If the source page was a PFN mapping, we don't have | 1966 | * If the source page was a PFN mapping, we don't have |
1967 | * a "struct page" for it. We do a best-effort copy by | 1967 | * a "struct page" for it. We do a best-effort copy by |
1968 | * just copying from the original user address. If that | 1968 | * just copying from the original user address. If that |
1969 | * fails, we just zero-fill it. Live with it. | 1969 | * fails, we just zero-fill it. Live with it. |
1970 | */ | 1970 | */ |
1971 | if (unlikely(!src)) { | 1971 | if (unlikely(!src)) { |
1972 | void *kaddr = kmap_atomic(dst); | 1972 | void *kaddr = kmap_atomic(dst); |
1973 | void __user *uaddr = (void __user *)(va & PAGE_MASK); | 1973 | void __user *uaddr = (void __user *)(va & PAGE_MASK); |
1974 | 1974 | ||
1975 | /* | 1975 | /* |
1976 | * This really shouldn't fail, because the page is there | 1976 | * This really shouldn't fail, because the page is there |
1977 | * in the page tables. But it might just be unreadable, | 1977 | * in the page tables. But it might just be unreadable, |
1978 | * in which case we just give up and fill the result with | 1978 | * in which case we just give up and fill the result with |
1979 | * zeroes. | 1979 | * zeroes. |
1980 | */ | 1980 | */ |
1981 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | 1981 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) |
1982 | clear_page(kaddr); | 1982 | clear_page(kaddr); |
1983 | kunmap_atomic(kaddr); | 1983 | kunmap_atomic(kaddr); |
1984 | flush_dcache_page(dst); | 1984 | flush_dcache_page(dst); |
1985 | } else | 1985 | } else |
1986 | copy_user_highpage(dst, src, va, vma); | 1986 | copy_user_highpage(dst, src, va, vma); |
1987 | } | 1987 | } |
1988 | 1988 | ||
1989 | /* | 1989 | /* |
1990 | * Notify the address space that the page is about to become writable so that | 1990 | * Notify the address space that the page is about to become writable so that |
1991 | * it can prohibit this or wait for the page to get into an appropriate state. | 1991 | * it can prohibit this or wait for the page to get into an appropriate state. |
1992 | * | 1992 | * |
1993 | * We do this without the lock held, so that it can sleep if it needs to. | 1993 | * We do this without the lock held, so that it can sleep if it needs to. |
1994 | */ | 1994 | */ |
1995 | static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 1995 | static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, |
1996 | unsigned long address) | 1996 | unsigned long address) |
1997 | { | 1997 | { |
1998 | struct vm_fault vmf; | 1998 | struct vm_fault vmf; |
1999 | int ret; | 1999 | int ret; |
2000 | 2000 | ||
2001 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2001 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
2002 | vmf.pgoff = page->index; | 2002 | vmf.pgoff = page->index; |
2003 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | 2003 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
2004 | vmf.page = page; | 2004 | vmf.page = page; |
2005 | 2005 | ||
2006 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 2006 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); |
2007 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2007 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2008 | return ret; | 2008 | return ret; |
2009 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { | 2009 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { |
2010 | lock_page(page); | 2010 | lock_page(page); |
2011 | if (!page->mapping) { | 2011 | if (!page->mapping) { |
2012 | unlock_page(page); | 2012 | unlock_page(page); |
2013 | return 0; /* retry */ | 2013 | return 0; /* retry */ |
2014 | } | 2014 | } |
2015 | ret |= VM_FAULT_LOCKED; | 2015 | ret |= VM_FAULT_LOCKED; |
2016 | } else | 2016 | } else |
2017 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2017 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2018 | return ret; | 2018 | return ret; |
2019 | } | 2019 | } |
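
For reference, a minimal sketch of the vm_ops->page_mkwrite side of this handshake: the handler may sleep, and either returns with the page locked (VM_FAULT_LOCKED) or lets the code above lock it and re-check page->mapping. The myfs_* naming and the storage work are placeholders:

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (!page->mapping) {			/* truncated while we waited */
		unlock_page(page);
		return VM_FAULT_NOPAGE;		/* fault will be retried */
	}

	/* ... allocate/dirty backing storage for the page here ... */

	return VM_FAULT_LOCKED;			/* page stays locked for the caller */
}
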
2020 | 2020 | ||
2021 | /* | 2021 | /* |
2022 | * This routine handles present pages, when users try to write | 2022 | * This routine handles present pages, when users try to write |
2023 | * to a shared page. It is done by copying the page to a new address | 2023 | * to a shared page. It is done by copying the page to a new address |
2024 | * and decrementing the shared-page counter for the old page. | 2024 | * and decrementing the shared-page counter for the old page. |
2025 | * | 2025 | * |
2026 | * Note that this routine assumes that the protection checks have been | 2026 | * Note that this routine assumes that the protection checks have been |
2027 | * done by the caller (the low-level page fault routine in most cases). | 2027 | * done by the caller (the low-level page fault routine in most cases). |
2028 | * Thus we can safely just mark it writable once we've done any necessary | 2028 | * Thus we can safely just mark it writable once we've done any necessary |
2029 | * COW. | 2029 | * COW. |
2030 | * | 2030 | * |
2031 | * We also mark the page dirty at this point even though the page will | 2031 | * We also mark the page dirty at this point even though the page will |
2032 | * change only once the write actually happens. This avoids a few races, | 2032 | * change only once the write actually happens. This avoids a few races, |
2033 | * and potentially makes it more efficient. | 2033 | * and potentially makes it more efficient. |
2034 | * | 2034 | * |
2035 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2035 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2036 | * but allow concurrent faults), with pte both mapped and locked. | 2036 | * but allow concurrent faults), with pte both mapped and locked. |
2037 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2037 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2038 | */ | 2038 | */ |
2039 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2039 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2040 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2040 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2041 | spinlock_t *ptl, pte_t orig_pte) | 2041 | spinlock_t *ptl, pte_t orig_pte) |
2042 | __releases(ptl) | 2042 | __releases(ptl) |
2043 | { | 2043 | { |
2044 | struct page *old_page, *new_page = NULL; | 2044 | struct page *old_page, *new_page = NULL; |
2045 | pte_t entry; | 2045 | pte_t entry; |
2046 | int ret = 0; | 2046 | int ret = 0; |
2047 | int page_mkwrite = 0; | 2047 | int page_mkwrite = 0; |
2048 | struct page *dirty_page = NULL; | 2048 | struct page *dirty_page = NULL; |
2049 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | 2049 | unsigned long mmun_start = 0; /* For mmu_notifiers */ |
2050 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | 2050 | unsigned long mmun_end = 0; /* For mmu_notifiers */ |
2051 | struct mem_cgroup *memcg; | 2051 | struct mem_cgroup *memcg; |
2052 | 2052 | ||
2053 | old_page = vm_normal_page(vma, address, orig_pte); | 2053 | old_page = vm_normal_page(vma, address, orig_pte); |
2054 | if (!old_page) { | 2054 | if (!old_page) { |
2055 | /* | 2055 | /* |
2056 | * VM_MIXEDMAP !pfn_valid() case | 2056 | * VM_MIXEDMAP !pfn_valid() case |
2057 | * | 2057 | * |
2058 | * We should not cow pages in a shared writeable mapping. | 2058 | * We should not cow pages in a shared writeable mapping. |
2059 | * Just mark the pages writable as we can't do any dirty | 2059 | * Just mark the pages writable as we can't do any dirty |
2060 | * accounting on raw pfn maps. | 2060 | * accounting on raw pfn maps. |
2061 | */ | 2061 | */ |
2062 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2062 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2063 | (VM_WRITE|VM_SHARED)) | 2063 | (VM_WRITE|VM_SHARED)) |
2064 | goto reuse; | 2064 | goto reuse; |
2065 | goto gotten; | 2065 | goto gotten; |
2066 | } | 2066 | } |
2067 | 2067 | ||
2068 | /* | 2068 | /* |
2069 | * Take out anonymous pages first, anonymous shared vmas are | 2069 | * Take out anonymous pages first, anonymous shared vmas are |
2070 | * not dirty accountable. | 2070 | * not dirty accountable. |
2071 | */ | 2071 | */ |
2072 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2072 | if (PageAnon(old_page) && !PageKsm(old_page)) { |
2073 | if (!trylock_page(old_page)) { | 2073 | if (!trylock_page(old_page)) { |
2074 | page_cache_get(old_page); | 2074 | page_cache_get(old_page); |
2075 | pte_unmap_unlock(page_table, ptl); | 2075 | pte_unmap_unlock(page_table, ptl); |
2076 | lock_page(old_page); | 2076 | lock_page(old_page); |
2077 | page_table = pte_offset_map_lock(mm, pmd, address, | 2077 | page_table = pte_offset_map_lock(mm, pmd, address, |
2078 | &ptl); | 2078 | &ptl); |
2079 | if (!pte_same(*page_table, orig_pte)) { | 2079 | if (!pte_same(*page_table, orig_pte)) { |
2080 | unlock_page(old_page); | 2080 | unlock_page(old_page); |
2081 | goto unlock; | 2081 | goto unlock; |
2082 | } | 2082 | } |
2083 | page_cache_release(old_page); | 2083 | page_cache_release(old_page); |
2084 | } | 2084 | } |
2085 | if (reuse_swap_page(old_page)) { | 2085 | if (reuse_swap_page(old_page)) { |
2086 | /* | 2086 | /* |
2087 | * The page is all ours. Move it to our anon_vma so | 2087 | * The page is all ours. Move it to our anon_vma so |
2088 | * the rmap code will not search our parent or siblings. | 2088 | * the rmap code will not search our parent or siblings. |
2089 | * Protected against the rmap code by the page lock. | 2089 | * Protected against the rmap code by the page lock. |
2090 | */ | 2090 | */ |
2091 | page_move_anon_rmap(old_page, vma, address); | 2091 | page_move_anon_rmap(old_page, vma, address); |
2092 | unlock_page(old_page); | 2092 | unlock_page(old_page); |
2093 | goto reuse; | 2093 | goto reuse; |
2094 | } | 2094 | } |
2095 | unlock_page(old_page); | 2095 | unlock_page(old_page); |
2096 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2096 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2097 | (VM_WRITE|VM_SHARED))) { | 2097 | (VM_WRITE|VM_SHARED))) { |
2098 | /* | 2098 | /* |
2099 | * Only catch write-faults on shared writable pages, | 2099 | * Only catch write-faults on shared writable pages, |
2100 | * read-only shared pages can get COWed by | 2100 | * read-only shared pages can get COWed by |
2101 | * get_user_pages(.write=1, .force=1). | 2101 | * get_user_pages(.write=1, .force=1). |
2102 | */ | 2102 | */ |
2103 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2103 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2104 | int tmp; | 2104 | int tmp; |
2105 | page_cache_get(old_page); | 2105 | page_cache_get(old_page); |
2106 | pte_unmap_unlock(page_table, ptl); | 2106 | pte_unmap_unlock(page_table, ptl); |
2107 | tmp = do_page_mkwrite(vma, old_page, address); | 2107 | tmp = do_page_mkwrite(vma, old_page, address); |
2108 | if (unlikely(!tmp || (tmp & | 2108 | if (unlikely(!tmp || (tmp & |
2109 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2109 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2110 | page_cache_release(old_page); | 2110 | page_cache_release(old_page); |
2111 | return tmp; | 2111 | return tmp; |
2112 | } | 2112 | } |
2113 | /* | 2113 | /* |
2114 | * Since we dropped the lock we need to revalidate | 2114 | * Since we dropped the lock we need to revalidate |
2115 | * the PTE as someone else may have changed it. If | 2115 | * the PTE as someone else may have changed it. If |
2116 | * they did, we just return, as we can count on the | 2116 | * they did, we just return, as we can count on the |
2117 | * MMU to tell us if they didn't also make it writable. | 2117 | * MMU to tell us if they didn't also make it writable. |
2118 | */ | 2118 | */ |
2119 | page_table = pte_offset_map_lock(mm, pmd, address, | 2119 | page_table = pte_offset_map_lock(mm, pmd, address, |
2120 | &ptl); | 2120 | &ptl); |
2121 | if (!pte_same(*page_table, orig_pte)) { | 2121 | if (!pte_same(*page_table, orig_pte)) { |
2122 | unlock_page(old_page); | 2122 | unlock_page(old_page); |
2123 | goto unlock; | 2123 | goto unlock; |
2124 | } | 2124 | } |
2125 | 2125 | ||
2126 | page_mkwrite = 1; | 2126 | page_mkwrite = 1; |
2127 | } | 2127 | } |
2128 | dirty_page = old_page; | 2128 | dirty_page = old_page; |
2129 | get_page(dirty_page); | 2129 | get_page(dirty_page); |
2130 | 2130 | ||
2131 | reuse: | 2131 | reuse: |
2132 | /* | 2132 | /* |
2133 | * Clear the page's cpupid information as the existing | 2133 | * Clear the page's cpupid information as the existing |
2134 | * information potentially belongs to a now completely | 2134 | * information potentially belongs to a now completely |
2135 | * unrelated process. | 2135 | * unrelated process. |
2136 | */ | 2136 | */ |
2137 | if (old_page) | 2137 | if (old_page) |
2138 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); | 2138 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); |
2139 | 2139 | ||
2140 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2140 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2141 | entry = pte_mkyoung(orig_pte); | 2141 | entry = pte_mkyoung(orig_pte); |
2142 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2142 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2143 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2143 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2144 | update_mmu_cache(vma, address, page_table); | 2144 | update_mmu_cache(vma, address, page_table); |
2145 | pte_unmap_unlock(page_table, ptl); | 2145 | pte_unmap_unlock(page_table, ptl); |
2146 | ret |= VM_FAULT_WRITE; | 2146 | ret |= VM_FAULT_WRITE; |
2147 | 2147 | ||
2148 | if (!dirty_page) | 2148 | if (!dirty_page) |
2149 | return ret; | 2149 | return ret; |
2150 | 2150 | ||
2151 | /* | 2151 | /* |
2152 | * Yes, Virginia, this is actually required to prevent a race | 2152 | * Yes, Virginia, this is actually required to prevent a race |
2153 | * with clear_page_dirty_for_io() from clearing the page dirty | 2153 | * with clear_page_dirty_for_io() from clearing the page dirty |
2154 | * bit after it has cleared all dirty ptes, but before a racing | 2154 | * bit after it has cleared all dirty ptes, but before a racing |
2155 | * do_wp_page installs a dirty pte. | 2155 | * do_wp_page installs a dirty pte. |
2156 | * | 2156 | * |
2157 | * do_shared_fault is protected similarly. | 2157 | * do_shared_fault is protected similarly. |
2158 | */ | 2158 | */ |
2159 | if (!page_mkwrite) { | 2159 | if (!page_mkwrite) { |
2160 | wait_on_page_locked(dirty_page); | 2160 | wait_on_page_locked(dirty_page); |
2161 | set_page_dirty_balance(dirty_page); | 2161 | set_page_dirty_balance(dirty_page); |
2162 | /* file_update_time outside page_lock */ | 2162 | /* file_update_time outside page_lock */ |
2163 | if (vma->vm_file) | 2163 | if (vma->vm_file) |
2164 | file_update_time(vma->vm_file); | 2164 | file_update_time(vma->vm_file); |
2165 | } | 2165 | } |
2166 | put_page(dirty_page); | 2166 | put_page(dirty_page); |
2167 | if (page_mkwrite) { | 2167 | if (page_mkwrite) { |
2168 | struct address_space *mapping = dirty_page->mapping; | 2168 | struct address_space *mapping = dirty_page->mapping; |
2169 | 2169 | ||
2170 | set_page_dirty(dirty_page); | 2170 | set_page_dirty(dirty_page); |
2171 | unlock_page(dirty_page); | 2171 | unlock_page(dirty_page); |
2172 | page_cache_release(dirty_page); | 2172 | page_cache_release(dirty_page); |
2173 | if (mapping) { | 2173 | if (mapping) { |
2174 | /* | 2174 | /* |
2175 | * Some device drivers do not set page.mapping | 2175 | * Some device drivers do not set page.mapping |
2176 | * but still dirty their pages | 2176 | * but still dirty their pages |
2177 | */ | 2177 | */ |
2178 | balance_dirty_pages_ratelimited(mapping); | 2178 | balance_dirty_pages_ratelimited(mapping); |
2179 | } | 2179 | } |
2180 | } | 2180 | } |
2181 | 2181 | ||
2182 | return ret; | 2182 | return ret; |
2183 | } | 2183 | } |
2184 | 2184 | ||
2185 | /* | 2185 | /* |
2186 | * Ok, we need to copy. Oh, well.. | 2186 | * Ok, we need to copy. Oh, well.. |
2187 | */ | 2187 | */ |
2188 | page_cache_get(old_page); | 2188 | page_cache_get(old_page); |
2189 | gotten: | 2189 | gotten: |
2190 | pte_unmap_unlock(page_table, ptl); | 2190 | pte_unmap_unlock(page_table, ptl); |
2191 | 2191 | ||
2192 | if (unlikely(anon_vma_prepare(vma))) | 2192 | if (unlikely(anon_vma_prepare(vma))) |
2193 | goto oom; | 2193 | goto oom; |
2194 | 2194 | ||
2195 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2195 | if (is_zero_pfn(pte_pfn(orig_pte))) { |
2196 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 2196 | new_page = alloc_zeroed_user_highpage_movable(vma, address); |
2197 | if (!new_page) | 2197 | if (!new_page) |
2198 | goto oom; | 2198 | goto oom; |
2199 | } else { | 2199 | } else { |
2200 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2200 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
2201 | if (!new_page) | 2201 | if (!new_page) |
2202 | goto oom; | 2202 | goto oom; |
2203 | cow_user_page(new_page, old_page, address, vma); | 2203 | cow_user_page(new_page, old_page, address, vma); |
2204 | } | 2204 | } |
2205 | __SetPageUptodate(new_page); | 2205 | __SetPageUptodate(new_page); |
2206 | 2206 | ||
2207 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) | 2207 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) |
2208 | goto oom_free_new; | 2208 | goto oom_free_new; |
2209 | 2209 | ||
2210 | mmun_start = address & PAGE_MASK; | 2210 | mmun_start = address & PAGE_MASK; |
2211 | mmun_end = mmun_start + PAGE_SIZE; | 2211 | mmun_end = mmun_start + PAGE_SIZE; |
2212 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2212 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2213 | 2213 | ||
2214 | /* | 2214 | /* |
2215 | * Re-check the pte - we dropped the lock | 2215 | * Re-check the pte - we dropped the lock |
2216 | */ | 2216 | */ |
2217 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2217 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2218 | if (likely(pte_same(*page_table, orig_pte))) { | 2218 | if (likely(pte_same(*page_table, orig_pte))) { |
2219 | if (old_page) { | 2219 | if (old_page) { |
2220 | if (!PageAnon(old_page)) { | 2220 | if (!PageAnon(old_page)) { |
2221 | dec_mm_counter_fast(mm, MM_FILEPAGES); | 2221 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
2222 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2222 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2223 | } | 2223 | } |
2224 | } else | 2224 | } else |
2225 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2225 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2226 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2226 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2227 | entry = mk_pte(new_page, vma->vm_page_prot); | 2227 | entry = mk_pte(new_page, vma->vm_page_prot); |
2228 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2228 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2229 | /* | 2229 | /* |
2230 | * Clear the pte entry and flush it first, before updating the | 2230 | * Clear the pte entry and flush it first, before updating the |
2231 | * pte with the new entry. This will avoid a race condition | 2231 | * pte with the new entry. This will avoid a race condition |
2232 | * seen in the presence of one thread doing SMC and another | 2232 | * seen in the presence of one thread doing SMC and another |
2233 | * thread doing COW. | 2233 | * thread doing COW. |
2234 | */ | 2234 | */ |
2235 | ptep_clear_flush(vma, address, page_table); | 2235 | ptep_clear_flush(vma, address, page_table); |
2236 | page_add_new_anon_rmap(new_page, vma, address); | 2236 | page_add_new_anon_rmap(new_page, vma, address); |
2237 | mem_cgroup_commit_charge(new_page, memcg, false); | 2237 | mem_cgroup_commit_charge(new_page, memcg, false); |
2238 | lru_cache_add_active_or_unevictable(new_page, vma); | 2238 | lru_cache_add_active_or_unevictable(new_page, vma); |
2239 | /* | 2239 | /* |
2240 | * We call the notify macro here because, when using secondary | 2240 | * We call the notify macro here because, when using secondary |
2241 | * mmu page tables (such as kvm shadow page tables), we want the | 2241 | * mmu page tables (such as kvm shadow page tables), we want the |
2242 | * new page to be mapped directly into the secondary page table. | 2242 | * new page to be mapped directly into the secondary page table. |
2243 | */ | 2243 | */ |
2244 | set_pte_at_notify(mm, address, page_table, entry); | 2244 | set_pte_at_notify(mm, address, page_table, entry); |
2245 | update_mmu_cache(vma, address, page_table); | 2245 | update_mmu_cache(vma, address, page_table); |
2246 | if (old_page) { | 2246 | if (old_page) { |
2247 | /* | 2247 | /* |
2248 | * Only after switching the pte to the new page may | 2248 | * Only after switching the pte to the new page may |
2249 | * we remove the mapcount here. Otherwise another | 2249 | * we remove the mapcount here. Otherwise another |
2250 | * process may come and find the rmap count decremented | 2250 | * process may come and find the rmap count decremented |
2251 | * before the pte is switched to the new page, and | 2251 | * before the pte is switched to the new page, and |
2252 | * "reuse" the old page writing into it while our pte | 2252 | * "reuse" the old page writing into it while our pte |
2253 | * here still points into it and can be read by other | 2253 | * here still points into it and can be read by other |
2254 | * threads. | 2254 | * threads. |
2255 | * | 2255 | * |
2256 | * The critical issue is to order this | 2256 | * The critical issue is to order this |
2257 | * page_remove_rmap with the ptep_clear_flush above. | 2257 | * page_remove_rmap with the ptep_clear_flush above. |
2258 | * Those stores are ordered by (if nothing else,) | 2258 | * Those stores are ordered by (if nothing else,) |
2259 | * the barrier present in the atomic_add_negative | 2259 | * the barrier present in the atomic_add_negative |
2260 | * in page_remove_rmap. | 2260 | * in page_remove_rmap. |
2261 | * | 2261 | * |
2262 | * Then the TLB flush in ptep_clear_flush ensures that | 2262 | * Then the TLB flush in ptep_clear_flush ensures that |
2263 | * no process can access the old page before the | 2263 | * no process can access the old page before the |
2264 | * decremented mapcount is visible. And the old page | 2264 | * decremented mapcount is visible. And the old page |
2265 | * cannot be reused until after the decremented | 2265 | * cannot be reused until after the decremented |
2266 | * mapcount is visible. So transitively, TLBs to | 2266 | * mapcount is visible. So transitively, TLBs to |
2267 | * old page will be flushed before it can be reused. | 2267 | * old page will be flushed before it can be reused. |
2268 | */ | 2268 | */ |
2269 | page_remove_rmap(old_page); | 2269 | page_remove_rmap(old_page); |
2270 | } | 2270 | } |
2271 | 2271 | ||
2272 | /* Free the old page.. */ | 2272 | /* Free the old page.. */ |
2273 | new_page = old_page; | 2273 | new_page = old_page; |
2274 | ret |= VM_FAULT_WRITE; | 2274 | ret |= VM_FAULT_WRITE; |
2275 | } else | 2275 | } else |
2276 | mem_cgroup_cancel_charge(new_page, memcg); | 2276 | mem_cgroup_cancel_charge(new_page, memcg); |
2277 | 2277 | ||
2278 | if (new_page) | 2278 | if (new_page) |
2279 | page_cache_release(new_page); | 2279 | page_cache_release(new_page); |
2280 | unlock: | 2280 | unlock: |
2281 | pte_unmap_unlock(page_table, ptl); | 2281 | pte_unmap_unlock(page_table, ptl); |
2282 | if (mmun_end > mmun_start) | 2282 | if (mmun_end > mmun_start) |
2283 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2283 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2284 | if (old_page) { | 2284 | if (old_page) { |
2285 | /* | 2285 | /* |
2286 | * Don't let another task, with possibly unlocked vma, | 2286 | * Don't let another task, with possibly unlocked vma, |
2287 | * keep the mlocked page. | 2287 | * keep the mlocked page. |
2288 | */ | 2288 | */ |
2289 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { | 2289 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { |
2290 | lock_page(old_page); /* LRU manipulation */ | 2290 | lock_page(old_page); /* LRU manipulation */ |
2291 | munlock_vma_page(old_page); | 2291 | munlock_vma_page(old_page); |
2292 | unlock_page(old_page); | 2292 | unlock_page(old_page); |
2293 | } | 2293 | } |
2294 | page_cache_release(old_page); | 2294 | page_cache_release(old_page); |
2295 | } | 2295 | } |
2296 | return ret; | 2296 | return ret; |
2297 | oom_free_new: | 2297 | oom_free_new: |
2298 | page_cache_release(new_page); | 2298 | page_cache_release(new_page); |
2299 | oom: | 2299 | oom: |
2300 | if (old_page) | 2300 | if (old_page) |
2301 | page_cache_release(old_page); | 2301 | page_cache_release(old_page); |
2302 | return VM_FAULT_OOM; | 2302 | return VM_FAULT_OOM; |
2303 | } | 2303 | } |
2304 | 2304 | ||
2305 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2305 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
2306 | unsigned long start_addr, unsigned long end_addr, | 2306 | unsigned long start_addr, unsigned long end_addr, |
2307 | struct zap_details *details) | 2307 | struct zap_details *details) |
2308 | { | 2308 | { |
2309 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); | 2309 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2310 | } | 2310 | } |
2311 | 2311 | ||
2312 | static inline void unmap_mapping_range_tree(struct rb_root *root, | 2312 | static inline void unmap_mapping_range_tree(struct rb_root *root, |
2313 | struct zap_details *details) | 2313 | struct zap_details *details) |
2314 | { | 2314 | { |
2315 | struct vm_area_struct *vma; | 2315 | struct vm_area_struct *vma; |
2316 | pgoff_t vba, vea, zba, zea; | 2316 | pgoff_t vba, vea, zba, zea; |
2317 | 2317 | ||
2318 | vma_interval_tree_foreach(vma, root, | 2318 | vma_interval_tree_foreach(vma, root, |
2319 | details->first_index, details->last_index) { | 2319 | details->first_index, details->last_index) { |
2320 | 2320 | ||
2321 | vba = vma->vm_pgoff; | 2321 | vba = vma->vm_pgoff; |
2322 | vea = vba + vma_pages(vma) - 1; | 2322 | vea = vba + vma_pages(vma) - 1; |
2323 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ | 2323 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ |
2324 | zba = details->first_index; | 2324 | zba = details->first_index; |
2325 | if (zba < vba) | 2325 | if (zba < vba) |
2326 | zba = vba; | 2326 | zba = vba; |
2327 | zea = details->last_index; | 2327 | zea = details->last_index; |
2328 | if (zea > vea) | 2328 | if (zea > vea) |
2329 | zea = vea; | 2329 | zea = vea; |
2330 | 2330 | ||
2331 | unmap_mapping_range_vma(vma, | 2331 | unmap_mapping_range_vma(vma, |
2332 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, | 2332 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, |
2333 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, | 2333 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, |
2334 | details); | 2334 | details); |
2335 | } | 2335 | } |
2336 | } | 2336 | } |
2337 | 2337 | ||
2338 | static inline void unmap_mapping_range_list(struct list_head *head, | 2338 | static inline void unmap_mapping_range_list(struct list_head *head, |
2339 | struct zap_details *details) | 2339 | struct zap_details *details) |
2340 | { | 2340 | { |
2341 | struct vm_area_struct *vma; | 2341 | struct vm_area_struct *vma; |
2342 | 2342 | ||
2343 | /* | 2343 | /* |
2344 | * In nonlinear VMAs there is no correspondence between virtual address | 2344 | * In nonlinear VMAs there is no correspondence between virtual address |
2345 | * offset and file offset. So we must perform an exhaustive search | 2345 | * offset and file offset. So we must perform an exhaustive search |
2346 | * across *all* the pages in each nonlinear VMA, not just the pages | 2346 | * across *all* the pages in each nonlinear VMA, not just the pages |
2347 | * whose virtual address lies outside the file truncation point. | 2347 | * whose virtual address lies outside the file truncation point. |
2348 | */ | 2348 | */ |
2349 | list_for_each_entry(vma, head, shared.nonlinear) { | 2349 | list_for_each_entry(vma, head, shared.nonlinear) { |
2350 | details->nonlinear_vma = vma; | 2350 | details->nonlinear_vma = vma; |
2351 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | 2351 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2352 | } | 2352 | } |
2353 | } | 2353 | } |
2354 | 2354 | ||
2355 | /** | 2355 | /** |
2356 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 2356 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. |
2357 | * @mapping: the address space containing mmaps to be unmapped. | 2357 | * @mapping: the address space containing mmaps to be unmapped. |
2358 | * @holebegin: byte in first page to unmap, relative to the start of | 2358 | * @holebegin: byte in first page to unmap, relative to the start of |
2359 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2359 | * the underlying file. This will be rounded down to a PAGE_SIZE |
2360 | * boundary. Note that this is different from truncate_pagecache(), which | 2360 | * boundary. Note that this is different from truncate_pagecache(), which |
2361 | * must keep the partial page. In contrast, we must get rid of | 2361 | * must keep the partial page. In contrast, we must get rid of |
2362 | * partial pages. | 2362 | * partial pages. |
2363 | * @holelen: size of prospective hole in bytes. This will be rounded | 2363 | * @holelen: size of prospective hole in bytes. This will be rounded |
2364 | * up to a PAGE_SIZE boundary. A holelen of zero truncates to the | 2364 | * up to a PAGE_SIZE boundary. A holelen of zero truncates to the |
2365 | * end of the file. | 2365 | * end of the file. |
2366 | * @even_cows: 1 when truncating a file, unmap even private COWed pages; | 2366 | * @even_cows: 1 when truncating a file, unmap even private COWed pages; |
2367 | * but 0 when invalidating pagecache, don't throw away private data. | 2367 | * but 0 when invalidating pagecache, don't throw away private data. |
2368 | */ | 2368 | */ |
2369 | void unmap_mapping_range(struct address_space *mapping, | 2369 | void unmap_mapping_range(struct address_space *mapping, |
2370 | loff_t const holebegin, loff_t const holelen, int even_cows) | 2370 | loff_t const holebegin, loff_t const holelen, int even_cows) |
2371 | { | 2371 | { |
2372 | struct zap_details details; | 2372 | struct zap_details details; |
2373 | pgoff_t hba = holebegin >> PAGE_SHIFT; | 2373 | pgoff_t hba = holebegin >> PAGE_SHIFT; |
2374 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2374 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2375 | 2375 | ||
2376 | /* Check for overflow. */ | 2376 | /* Check for overflow. */ |
2377 | if (sizeof(holelen) > sizeof(hlen)) { | 2377 | if (sizeof(holelen) > sizeof(hlen)) { |
2378 | long long holeend = | 2378 | long long holeend = |
2379 | (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2379 | (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2380 | if (holeend & ~(long long)ULONG_MAX) | 2380 | if (holeend & ~(long long)ULONG_MAX) |
2381 | hlen = ULONG_MAX - hba + 1; | 2381 | hlen = ULONG_MAX - hba + 1; |
2382 | } | 2382 | } |
2383 | 2383 | ||
2384 | details.check_mapping = even_cows? NULL: mapping; | 2384 | details.check_mapping = even_cows? NULL: mapping; |
2385 | details.nonlinear_vma = NULL; | 2385 | details.nonlinear_vma = NULL; |
2386 | details.first_index = hba; | 2386 | details.first_index = hba; |
2387 | details.last_index = hba + hlen - 1; | 2387 | details.last_index = hba + hlen - 1; |
2388 | if (details.last_index < details.first_index) | 2388 | if (details.last_index < details.first_index) |
2389 | details.last_index = ULONG_MAX; | 2389 | details.last_index = ULONG_MAX; |
2390 | 2390 | ||
2391 | 2391 | ||
2392 | mutex_lock(&mapping->i_mmap_mutex); | 2392 | mutex_lock(&mapping->i_mmap_mutex); |
2393 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2393 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2394 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2394 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2395 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2395 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2396 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2396 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
2397 | mutex_unlock(&mapping->i_mmap_mutex); | 2397 | mutex_unlock(&mapping->i_mmap_mutex); |
2398 | } | 2398 | } |
2399 | EXPORT_SYMBOL(unmap_mapping_range); | 2399 | EXPORT_SYMBOL(unmap_mapping_range); |
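The classic user of this interface is the truncate path: it tears down every mapping of the affected range, shrinks the pagecache, and then unmaps once more to close the window in which a concurrent fault could have re-instantiated a pte. A condensed sketch, modelled on truncate_pagecache() in mm/truncate.c (simplified for illustration, not the verbatim kernel code):

static void truncate_pagecache_sketch(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/* unmap first so racing faults cannot keep soon-stale ptes */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	/* drop the pagecache pages beyond the new size */
	truncate_inode_pages(mapping, newsize);
	/* unmap again to catch faults that raced with the truncate */
	unmap_mapping_range(mapping, holebegin, 0, 1);
}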
2400 | 2400 | ||
2401 | /* | 2401 | /* |
2402 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2402 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2403 | * but allow concurrent faults), and pte mapped but not yet locked. | 2403 | * but allow concurrent faults), and pte mapped but not yet locked. |
2404 | * We return with pte unmapped and unlocked. | 2404 | * We return with pte unmapped and unlocked. |
2405 | * | 2405 | * |
2406 | * We return with the mmap_sem locked or unlocked in the same cases | 2406 | * We return with the mmap_sem locked or unlocked in the same cases |
2407 | * as does filemap_fault(). | 2407 | * as does filemap_fault(). |
2408 | */ | 2408 | */ |
2409 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2409 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2410 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2410 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2411 | unsigned int flags, pte_t orig_pte) | 2411 | unsigned int flags, pte_t orig_pte) |
2412 | { | 2412 | { |
2413 | spinlock_t *ptl; | 2413 | spinlock_t *ptl; |
2414 | struct page *page, *swapcache; | 2414 | struct page *page, *swapcache; |
2415 | struct mem_cgroup *memcg; | 2415 | struct mem_cgroup *memcg; |
2416 | swp_entry_t entry; | 2416 | swp_entry_t entry; |
2417 | pte_t pte; | 2417 | pte_t pte; |
2418 | int locked; | 2418 | int locked; |
2419 | int exclusive = 0; | 2419 | int exclusive = 0; |
2420 | int ret = 0; | 2420 | int ret = 0; |
2421 | 2421 | ||
2422 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2422 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2423 | goto out; | 2423 | goto out; |
2424 | 2424 | ||
2425 | entry = pte_to_swp_entry(orig_pte); | 2425 | entry = pte_to_swp_entry(orig_pte); |
2426 | if (unlikely(non_swap_entry(entry))) { | 2426 | if (unlikely(non_swap_entry(entry))) { |
2427 | if (is_migration_entry(entry)) { | 2427 | if (is_migration_entry(entry)) { |
2428 | migration_entry_wait(mm, pmd, address); | 2428 | migration_entry_wait(mm, pmd, address); |
2429 | } else if (is_hwpoison_entry(entry)) { | 2429 | } else if (is_hwpoison_entry(entry)) { |
2430 | ret = VM_FAULT_HWPOISON; | 2430 | ret = VM_FAULT_HWPOISON; |
2431 | } else { | 2431 | } else { |
2432 | print_bad_pte(vma, address, orig_pte, NULL); | 2432 | print_bad_pte(vma, address, orig_pte, NULL); |
2433 | ret = VM_FAULT_SIGBUS; | 2433 | ret = VM_FAULT_SIGBUS; |
2434 | } | 2434 | } |
2435 | goto out; | 2435 | goto out; |
2436 | } | 2436 | } |
2437 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2437 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2438 | page = lookup_swap_cache(entry); | 2438 | page = lookup_swap_cache(entry); |
2439 | if (!page) { | 2439 | if (!page) { |
2440 | page = swapin_readahead(entry, | 2440 | page = swapin_readahead(entry, |
2441 | GFP_HIGHUSER_MOVABLE, vma, address); | 2441 | GFP_HIGHUSER_MOVABLE, vma, address); |
2442 | if (!page) { | 2442 | if (!page) { |
2443 | /* | 2443 | /* |
2444 | * Back out if somebody else faulted in this pte | 2444 | * Back out if somebody else faulted in this pte |
2445 | * while we released the pte lock. | 2445 | * while we released the pte lock. |
2446 | */ | 2446 | */ |
2447 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2447 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2448 | if (likely(pte_same(*page_table, orig_pte))) | 2448 | if (likely(pte_same(*page_table, orig_pte))) |
2449 | ret = VM_FAULT_OOM; | 2449 | ret = VM_FAULT_OOM; |
2450 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2450 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2451 | goto unlock; | 2451 | goto unlock; |
2452 | } | 2452 | } |
2453 | 2453 | ||
2454 | /* Had to read the page from swap area: Major fault */ | 2454 | /* Had to read the page from swap area: Major fault */ |
2455 | ret = VM_FAULT_MAJOR; | 2455 | ret = VM_FAULT_MAJOR; |
2456 | count_vm_event(PGMAJFAULT); | 2456 | count_vm_event(PGMAJFAULT); |
2457 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | 2457 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); |
2458 | } else if (PageHWPoison(page)) { | 2458 | } else if (PageHWPoison(page)) { |
2459 | /* | 2459 | /* |
2460 | * hwpoisoned dirty swapcache pages are kept for killing | 2460 | * hwpoisoned dirty swapcache pages are kept for killing |
2461 | * owner processes (which may be unknown at hwpoison time) | 2461 | * owner processes (which may be unknown at hwpoison time) |
2462 | */ | 2462 | */ |
2463 | ret = VM_FAULT_HWPOISON; | 2463 | ret = VM_FAULT_HWPOISON; |
2464 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2464 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2465 | swapcache = page; | 2465 | swapcache = page; |
2466 | goto out_release; | 2466 | goto out_release; |
2467 | } | 2467 | } |
2468 | 2468 | ||
2469 | swapcache = page; | 2469 | swapcache = page; |
2470 | locked = lock_page_or_retry(page, mm, flags); | 2470 | locked = lock_page_or_retry(page, mm, flags); |
2471 | 2471 | ||
2472 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2472 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2473 | if (!locked) { | 2473 | if (!locked) { |
2474 | ret |= VM_FAULT_RETRY; | 2474 | ret |= VM_FAULT_RETRY; |
2475 | goto out_release; | 2475 | goto out_release; |
2476 | } | 2476 | } |
2477 | 2477 | ||
2478 | /* | 2478 | /* |
2479 | * Make sure try_to_free_swap or reuse_swap_page or swapoff did not | 2479 | * Make sure try_to_free_swap or reuse_swap_page or swapoff did not |
2480 | * release the swapcache from under us. The page pin, and pte_same | 2480 | * release the swapcache from under us. The page pin, and pte_same |
2481 | * test below, are not enough to exclude that. Even if it is still | 2481 | * test below, are not enough to exclude that. Even if it is still |
2482 | * swapcache, we need to check that the page's swap has not changed. | 2482 | * swapcache, we need to check that the page's swap has not changed. |
2483 | */ | 2483 | */ |
2484 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2484 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2485 | goto out_page; | 2485 | goto out_page; |
2486 | 2486 | ||
2487 | page = ksm_might_need_to_copy(page, vma, address); | 2487 | page = ksm_might_need_to_copy(page, vma, address); |
2488 | if (unlikely(!page)) { | 2488 | if (unlikely(!page)) { |
2489 | ret = VM_FAULT_OOM; | 2489 | ret = VM_FAULT_OOM; |
2490 | page = swapcache; | 2490 | page = swapcache; |
2491 | goto out_page; | 2491 | goto out_page; |
2492 | } | 2492 | } |
2493 | 2493 | ||
2494 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { | 2494 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { |
2495 | ret = VM_FAULT_OOM; | 2495 | ret = VM_FAULT_OOM; |
2496 | goto out_page; | 2496 | goto out_page; |
2497 | } | 2497 | } |
2498 | 2498 | ||
2499 | /* | 2499 | /* |
2500 | * Back out if somebody else already faulted in this pte. | 2500 | * Back out if somebody else already faulted in this pte. |
2501 | */ | 2501 | */ |
2502 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2502 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2503 | if (unlikely(!pte_same(*page_table, orig_pte))) | 2503 | if (unlikely(!pte_same(*page_table, orig_pte))) |
2504 | goto out_nomap; | 2504 | goto out_nomap; |
2505 | 2505 | ||
2506 | if (unlikely(!PageUptodate(page))) { | 2506 | if (unlikely(!PageUptodate(page))) { |
2507 | ret = VM_FAULT_SIGBUS; | 2507 | ret = VM_FAULT_SIGBUS; |
2508 | goto out_nomap; | 2508 | goto out_nomap; |
2509 | } | 2509 | } |
2510 | 2510 | ||
2511 | /* | 2511 | /* |
2512 | * The page isn't present yet, go ahead with the fault. | 2512 | * The page isn't present yet, go ahead with the fault. |
2513 | * | 2513 | * |
2514 | * Be careful about the sequence of operations here. | 2514 | * Be careful about the sequence of operations here. |
2515 | * To get its accounting right, reuse_swap_page() must be called | 2515 | * To get its accounting right, reuse_swap_page() must be called |
2516 | * while the page is counted on swap but not yet in mapcount i.e. | 2516 | * while the page is counted on swap but not yet in mapcount i.e. |
2517 | * before page_add_anon_rmap() and swap_free(); try_to_free_swap() | 2517 | * before page_add_anon_rmap() and swap_free(); try_to_free_swap() |
2518 | * must be called after the swap_free(), or it will never succeed. | 2518 | * must be called after the swap_free(), or it will never succeed. |
2519 | */ | 2519 | */ |
2520 | 2520 | ||
2521 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2521 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2522 | dec_mm_counter_fast(mm, MM_SWAPENTS); | 2522 | dec_mm_counter_fast(mm, MM_SWAPENTS); |
2523 | pte = mk_pte(page, vma->vm_page_prot); | 2523 | pte = mk_pte(page, vma->vm_page_prot); |
2524 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2524 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2525 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2525 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2526 | flags &= ~FAULT_FLAG_WRITE; | 2526 | flags &= ~FAULT_FLAG_WRITE; |
2527 | ret |= VM_FAULT_WRITE; | 2527 | ret |= VM_FAULT_WRITE; |
2528 | exclusive = 1; | 2528 | exclusive = 1; |
2529 | } | 2529 | } |
2530 | flush_icache_page(vma, page); | 2530 | flush_icache_page(vma, page); |
2531 | if (pte_swp_soft_dirty(orig_pte)) | 2531 | if (pte_swp_soft_dirty(orig_pte)) |
2532 | pte = pte_mksoft_dirty(pte); | 2532 | pte = pte_mksoft_dirty(pte); |
2533 | set_pte_at(mm, address, page_table, pte); | 2533 | set_pte_at(mm, address, page_table, pte); |
2534 | if (page == swapcache) { | 2534 | if (page == swapcache) { |
2535 | do_page_add_anon_rmap(page, vma, address, exclusive); | 2535 | do_page_add_anon_rmap(page, vma, address, exclusive); |
2536 | mem_cgroup_commit_charge(page, memcg, true); | 2536 | mem_cgroup_commit_charge(page, memcg, true); |
2537 | } else { /* ksm created a completely new copy */ | 2537 | } else { /* ksm created a completely new copy */ |
2538 | page_add_new_anon_rmap(page, vma, address); | 2538 | page_add_new_anon_rmap(page, vma, address); |
2539 | mem_cgroup_commit_charge(page, memcg, false); | 2539 | mem_cgroup_commit_charge(page, memcg, false); |
2540 | lru_cache_add_active_or_unevictable(page, vma); | 2540 | lru_cache_add_active_or_unevictable(page, vma); |
2541 | } | 2541 | } |
2542 | 2542 | ||
2543 | swap_free(entry); | 2543 | swap_free(entry); |
2544 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 2544 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
2545 | try_to_free_swap(page); | 2545 | try_to_free_swap(page); |
2546 | unlock_page(page); | 2546 | unlock_page(page); |
2547 | if (page != swapcache) { | 2547 | if (page != swapcache) { |
2548 | /* | 2548 | /* |
2549 | * Hold the lock to prevent the swap entry from being reused | 2549 | * Hold the lock to prevent the swap entry from being reused |
2550 | * until we take the PT lock for the pte_same() check | 2550 | * until we take the PT lock for the pte_same() check |
2551 | * (to avoid false positives from pte_same). For | 2551 | * (to avoid false positives from pte_same). For |
2552 | * further safety release the lock after the swap_free | 2552 | * further safety release the lock after the swap_free |
2553 | * so that the swap count won't change under a | 2553 | * so that the swap count won't change under a |
2554 | * parallel locked swapcache. | 2554 | * parallel locked swapcache. |
2555 | */ | 2555 | */ |
2556 | unlock_page(swapcache); | 2556 | unlock_page(swapcache); |
2557 | page_cache_release(swapcache); | 2557 | page_cache_release(swapcache); |
2558 | } | 2558 | } |
2559 | 2559 | ||
2560 | if (flags & FAULT_FLAG_WRITE) { | 2560 | if (flags & FAULT_FLAG_WRITE) { |
2561 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2561 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); |
2562 | if (ret & VM_FAULT_ERROR) | 2562 | if (ret & VM_FAULT_ERROR) |
2563 | ret &= VM_FAULT_ERROR; | 2563 | ret &= VM_FAULT_ERROR; |
2564 | goto out; | 2564 | goto out; |
2565 | } | 2565 | } |
2566 | 2566 | ||
2567 | /* No need to invalidate - it was non-present before */ | 2567 | /* No need to invalidate - it was non-present before */ |
2568 | update_mmu_cache(vma, address, page_table); | 2568 | update_mmu_cache(vma, address, page_table); |
2569 | unlock: | 2569 | unlock: |
2570 | pte_unmap_unlock(page_table, ptl); | 2570 | pte_unmap_unlock(page_table, ptl); |
2571 | out: | 2571 | out: |
2572 | return ret; | 2572 | return ret; |
2573 | out_nomap: | 2573 | out_nomap: |
2574 | mem_cgroup_cancel_charge(page, memcg); | 2574 | mem_cgroup_cancel_charge(page, memcg); |
2575 | pte_unmap_unlock(page_table, ptl); | 2575 | pte_unmap_unlock(page_table, ptl); |
2576 | out_page: | 2576 | out_page: |
2577 | unlock_page(page); | 2577 | unlock_page(page); |
2578 | out_release: | 2578 | out_release: |
2579 | page_cache_release(page); | 2579 | page_cache_release(page); |
2580 | if (page != swapcache) { | 2580 | if (page != swapcache) { |
2581 | unlock_page(swapcache); | 2581 | unlock_page(swapcache); |
2582 | page_cache_release(swapcache); | 2582 | page_cache_release(swapcache); |
2583 | } | 2583 | } |
2584 | return ret; | 2584 | return ret; |
2585 | } | 2585 | } |
2586 | 2586 | ||
2587 | /* | 2587 | /* |
2588 | * This is like a special single-page "expand_{down|up}wards()", | 2588 | * This is like a special single-page "expand_{down|up}wards()", |
2589 | * except we must first make sure that 'address{-|+}PAGE_SIZE' | 2589 | * except we must first make sure that 'address{-|+}PAGE_SIZE' |
2590 | * doesn't hit another vma. | 2590 | * doesn't hit another vma. |
2591 | */ | 2591 | */ |
2592 | static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) | 2592 | static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) |
2593 | { | 2593 | { |
2594 | address &= PAGE_MASK; | 2594 | address &= PAGE_MASK; |
2595 | if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { | 2595 | if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { |
2596 | struct vm_area_struct *prev = vma->vm_prev; | 2596 | struct vm_area_struct *prev = vma->vm_prev; |
2597 | 2597 | ||
2598 | /* | 2598 | /* |
2599 | * Is there a mapping abutting this one below? | 2599 | * Is there a mapping abutting this one below? |
2600 | * | 2600 | * |
2601 | * That's only ok if it's the same stack mapping | 2601 | * That's only ok if it's the same stack mapping |
2602 | * that has gotten split.. | 2602 | * that has gotten split.. |
2603 | */ | 2603 | */ |
2604 | if (prev && prev->vm_end == address) | 2604 | if (prev && prev->vm_end == address) |
2605 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; | 2605 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; |
2606 | 2606 | ||
2607 | expand_downwards(vma, address - PAGE_SIZE); | 2607 | expand_downwards(vma, address - PAGE_SIZE); |
2608 | } | 2608 | } |
2609 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { | 2609 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { |
2610 | struct vm_area_struct *next = vma->vm_next; | 2610 | struct vm_area_struct *next = vma->vm_next; |
2611 | 2611 | ||
2612 | /* As VM_GROWSDOWN but s/below/above/ */ | 2612 | /* As VM_GROWSDOWN but s/below/above/ */ |
2613 | if (next && next->vm_start == address + PAGE_SIZE) | 2613 | if (next && next->vm_start == address + PAGE_SIZE) |
2614 | return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; | 2614 | return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; |
2615 | 2615 | ||
2616 | expand_upwards(vma, address + PAGE_SIZE); | 2616 | expand_upwards(vma, address + PAGE_SIZE); |
2617 | } | 2617 | } |
2618 | return 0; | 2618 | return 0; |
2619 | } | 2619 | } |
2620 | 2620 | ||
2621 | /* | 2621 | /* |
2622 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2622 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2623 | * but allow concurrent faults), and pte mapped but not yet locked. | 2623 | * but allow concurrent faults), and pte mapped but not yet locked. |
2624 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2624 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2625 | */ | 2625 | */ |
2626 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2626 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2627 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2627 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2628 | unsigned int flags) | 2628 | unsigned int flags) |
2629 | { | 2629 | { |
2630 | struct mem_cgroup *memcg; | 2630 | struct mem_cgroup *memcg; |
2631 | struct page *page; | 2631 | struct page *page; |
2632 | spinlock_t *ptl; | 2632 | spinlock_t *ptl; |
2633 | pte_t entry; | 2633 | pte_t entry; |
2634 | 2634 | ||
2635 | pte_unmap(page_table); | 2635 | pte_unmap(page_table); |
2636 | 2636 | ||
2637 | /* Check if we need to add a guard page to the stack */ | 2637 | /* Check if we need to add a guard page to the stack */ |
2638 | if (check_stack_guard_page(vma, address) < 0) | 2638 | if (check_stack_guard_page(vma, address) < 0) |
2639 | return VM_FAULT_SIGBUS; | 2639 | return VM_FAULT_SIGBUS; |
2640 | 2640 | ||
2641 | /* Use the zero-page for reads */ | 2641 | /* Use the zero-page for reads */ |
2642 | if (!(flags & FAULT_FLAG_WRITE)) { | 2642 | if (!(flags & FAULT_FLAG_WRITE)) { |
2643 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | 2643 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), |
2644 | vma->vm_page_prot)); | 2644 | vma->vm_page_prot)); |
2645 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2645 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2646 | if (!pte_none(*page_table)) | 2646 | if (!pte_none(*page_table)) |
2647 | goto unlock; | 2647 | goto unlock; |
2648 | goto setpte; | 2648 | goto setpte; |
2649 | } | 2649 | } |
2650 | 2650 | ||
2651 | /* Allocate our own private page. */ | 2651 | /* Allocate our own private page. */ |
2652 | if (unlikely(anon_vma_prepare(vma))) | 2652 | if (unlikely(anon_vma_prepare(vma))) |
2653 | goto oom; | 2653 | goto oom; |
2654 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2654 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2655 | if (!page) | 2655 | if (!page) |
2656 | goto oom; | 2656 | goto oom; |
2657 | /* | 2657 | /* |
2658 | * The memory barrier inside __SetPageUptodate makes sure that | 2658 | * The memory barrier inside __SetPageUptodate makes sure that |
2659 | * preceding stores to the page contents become visible before | 2659 | * preceding stores to the page contents become visible before |
2660 | * the set_pte_at() write. | 2660 | * the set_pte_at() write. |
2661 | */ | 2661 | */ |
2662 | __SetPageUptodate(page); | 2662 | __SetPageUptodate(page); |
2663 | 2663 | ||
2664 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) | 2664 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) |
2665 | goto oom_free_page; | 2665 | goto oom_free_page; |
2666 | 2666 | ||
2667 | entry = mk_pte(page, vma->vm_page_prot); | 2667 | entry = mk_pte(page, vma->vm_page_prot); |
2668 | if (vma->vm_flags & VM_WRITE) | 2668 | if (vma->vm_flags & VM_WRITE) |
2669 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2669 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2670 | 2670 | ||
2671 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2671 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2672 | if (!pte_none(*page_table)) | 2672 | if (!pte_none(*page_table)) |
2673 | goto release; | 2673 | goto release; |
2674 | 2674 | ||
2675 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2675 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2676 | page_add_new_anon_rmap(page, vma, address); | 2676 | page_add_new_anon_rmap(page, vma, address); |
2677 | mem_cgroup_commit_charge(page, memcg, false); | 2677 | mem_cgroup_commit_charge(page, memcg, false); |
2678 | lru_cache_add_active_or_unevictable(page, vma); | 2678 | lru_cache_add_active_or_unevictable(page, vma); |
2679 | setpte: | 2679 | setpte: |
2680 | set_pte_at(mm, address, page_table, entry); | 2680 | set_pte_at(mm, address, page_table, entry); |
2681 | 2681 | ||
2682 | /* No need to invalidate - it was non-present before */ | 2682 | /* No need to invalidate - it was non-present before */ |
2683 | update_mmu_cache(vma, address, page_table); | 2683 | update_mmu_cache(vma, address, page_table); |
2684 | unlock: | 2684 | unlock: |
2685 | pte_unmap_unlock(page_table, ptl); | 2685 | pte_unmap_unlock(page_table, ptl); |
2686 | return 0; | 2686 | return 0; |
2687 | release: | 2687 | release: |
2688 | mem_cgroup_cancel_charge(page, memcg); | 2688 | mem_cgroup_cancel_charge(page, memcg); |
2689 | page_cache_release(page); | 2689 | page_cache_release(page); |
2690 | goto unlock; | 2690 | goto unlock; |
2691 | oom_free_page: | 2691 | oom_free_page: |
2692 | page_cache_release(page); | 2692 | page_cache_release(page); |
2693 | oom: | 2693 | oom: |
2694 | return VM_FAULT_OOM; | 2694 | return VM_FAULT_OOM; |
2695 | } | 2695 | } |
2696 | 2696 | ||
2697 | /* | 2697 | /* |
2698 | * The mmap_sem must have been held on entry, and may have been | 2698 | * The mmap_sem must have been held on entry, and may have been |
2699 | * released depending on flags and vma->vm_ops->fault() return value. | 2699 | * released depending on flags and vma->vm_ops->fault() return value. |
2700 | * See filemap_fault() and __lock_page_retry(). | 2700 | * See filemap_fault() and __lock_page_retry(). |
2701 | */ | 2701 | */ |
2702 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2702 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, |
2703 | pgoff_t pgoff, unsigned int flags, struct page **page) | 2703 | pgoff_t pgoff, unsigned int flags, struct page **page) |
2704 | { | 2704 | { |
2705 | struct vm_fault vmf; | 2705 | struct vm_fault vmf; |
2706 | int ret; | 2706 | int ret; |
2707 | 2707 | ||
2708 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2708 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
2709 | vmf.pgoff = pgoff; | 2709 | vmf.pgoff = pgoff; |
2710 | vmf.flags = flags; | 2710 | vmf.flags = flags; |
2711 | vmf.page = NULL; | 2711 | vmf.page = NULL; |
2712 | 2712 | ||
2713 | ret = vma->vm_ops->fault(vma, &vmf); | 2713 | ret = vma->vm_ops->fault(vma, &vmf); |
2714 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2714 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
2715 | return ret; | 2715 | return ret; |
2716 | 2716 | ||
2717 | if (unlikely(PageHWPoison(vmf.page))) { | 2717 | if (unlikely(PageHWPoison(vmf.page))) { |
2718 | if (ret & VM_FAULT_LOCKED) | 2718 | if (ret & VM_FAULT_LOCKED) |
2719 | unlock_page(vmf.page); | 2719 | unlock_page(vmf.page); |
2720 | page_cache_release(vmf.page); | 2720 | page_cache_release(vmf.page); |
2721 | return VM_FAULT_HWPOISON; | 2721 | return VM_FAULT_HWPOISON; |
2722 | } | 2722 | } |
2723 | 2723 | ||
2724 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 2724 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
2725 | lock_page(vmf.page); | 2725 | lock_page(vmf.page); |
2726 | else | 2726 | else |
2727 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2727 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); |
2728 | 2728 | ||
2729 | *page = vmf.page; | 2729 | *page = vmf.page; |
2730 | return ret; | 2730 | return ret; |
2731 | } | 2731 | } |
2732 | 2732 | ||
2733 | /** | 2733 | /** |
2734 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | 2734 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. |
2735 | * | 2735 | * |
2736 | * @vma: virtual memory area | 2736 | * @vma: virtual memory area |
2737 | * @address: user virtual address | 2737 | * @address: user virtual address |
2738 | * @page: page to map | 2738 | * @page: page to map |
2739 | * @pte: pointer to target page table entry | 2739 | * @pte: pointer to target page table entry |
2740 | * @write: true, if new entry is writable | 2740 | * @write: true, if new entry is writable |
2741 | * @anon: true, if it's anonymous page | 2741 | * @anon: true, if it's anonymous page |
2742 | * | 2742 | * |
2743 | * Caller must hold page table lock relevant for @pte. | 2743 | * Caller must hold page table lock relevant for @pte. |
2744 | * | 2744 | * |
2745 | * Target users are page handler itself and implementations of | 2745 | * Target users are page handler itself and implementations of |
2746 | * vm_ops->map_pages. | 2746 | * vm_ops->map_pages. |
2747 | */ | 2747 | */ |
2748 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 2748 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, |
2749 | struct page *page, pte_t *pte, bool write, bool anon) | 2749 | struct page *page, pte_t *pte, bool write, bool anon) |
2750 | { | 2750 | { |
2751 | pte_t entry; | 2751 | pte_t entry; |
2752 | 2752 | ||
2753 | flush_icache_page(vma, page); | 2753 | flush_icache_page(vma, page); |
2754 | entry = mk_pte(page, vma->vm_page_prot); | 2754 | entry = mk_pte(page, vma->vm_page_prot); |
2755 | if (write) | 2755 | if (write) |
2756 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2756 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2757 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | 2757 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) |
2758 | entry = pte_mksoft_dirty(entry); | 2758 | entry = pte_mksoft_dirty(entry); |
2759 | if (anon) { | 2759 | if (anon) { |
2760 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2760 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2761 | page_add_new_anon_rmap(page, vma, address); | 2761 | page_add_new_anon_rmap(page, vma, address); |
2762 | } else { | 2762 | } else { |
2763 | inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); | 2763 | inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); |
2764 | page_add_file_rmap(page); | 2764 | page_add_file_rmap(page); |
2765 | } | 2765 | } |
2766 | set_pte_at(vma->vm_mm, address, pte, entry); | 2766 | set_pte_at(vma->vm_mm, address, pte, entry); |
2767 | 2767 | ||
2768 | /* no need to invalidate: a not-present page won't be cached */ | 2768 | /* no need to invalidate: a not-present page won't be cached */ |
2769 | update_mmu_cache(vma, address, pte); | 2769 | update_mmu_cache(vma, address, pte); |
2770 | } | 2770 | } |
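For orientation, a much-simplified sketch of the per-page step that a ->map_pages() implementation such as filemap_map_pages() performs; the page-cache walk, reference counting and range checks are left out, and the helper name is invented for illustration:

static void map_one_prefaulted_page(struct vm_area_struct *vma,
		unsigned long addr, struct page *page, pte_t *pte)
{
	if (!trylock_page(page))	/* never sleep during fault-around */
		return;
	if (!page->mapping || !PageUptodate(page))
		goto unlock;
	if (!pte_none(*pte))		/* already mapped by someone else */
		goto unlock;
	/* write=false, anon=false: a read-only, file-backed mapping */
	do_set_pte(vma, addr, page, pte, false, false);
unlock:
	unlock_page(page);
}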
2771 | 2771 | ||
2772 | static unsigned long fault_around_bytes __read_mostly = | 2772 | static unsigned long fault_around_bytes __read_mostly = |
2773 | rounddown_pow_of_two(65536); | 2773 | rounddown_pow_of_two(65536); |
2774 | 2774 | ||
2775 | #ifdef CONFIG_DEBUG_FS | 2775 | #ifdef CONFIG_DEBUG_FS |
2776 | static int fault_around_bytes_get(void *data, u64 *val) | 2776 | static int fault_around_bytes_get(void *data, u64 *val) |
2777 | { | 2777 | { |
2778 | *val = fault_around_bytes; | 2778 | *val = fault_around_bytes; |
2779 | return 0; | 2779 | return 0; |
2780 | } | 2780 | } |
2781 | 2781 | ||
2782 | /* | 2782 | /* |
2783 | * fault_around_pages() and fault_around_mask() expect fault_around_bytes | 2783 | * fault_around_pages() and fault_around_mask() expect fault_around_bytes |
2784 | * rounded down to the nearest page order. It's what do_fault_around() expects to | 2784 | * rounded down to the nearest page order. It's what do_fault_around() expects to |
2785 | * see. | 2785 | * see. |
2786 | */ | 2786 | */ |
2787 | static int fault_around_bytes_set(void *data, u64 val) | 2787 | static int fault_around_bytes_set(void *data, u64 val) |
2788 | { | 2788 | { |
2789 | if (val / PAGE_SIZE > PTRS_PER_PTE) | 2789 | if (val / PAGE_SIZE > PTRS_PER_PTE) |
2790 | return -EINVAL; | 2790 | return -EINVAL; |
2791 | if (val > PAGE_SIZE) | 2791 | if (val > PAGE_SIZE) |
2792 | fault_around_bytes = rounddown_pow_of_two(val); | 2792 | fault_around_bytes = rounddown_pow_of_two(val); |
2793 | else | 2793 | else |
2794 | fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ | 2794 | fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ |
2795 | return 0; | 2795 | return 0; |
2796 | } | 2796 | } |
2797 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, | 2797 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, |
2798 | fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); | 2798 | fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); |
2799 | 2799 | ||
2800 | static int __init fault_around_debugfs(void) | 2800 | static int __init fault_around_debugfs(void) |
2801 | { | 2801 | { |
2802 | void *ret; | 2802 | void *ret; |
2803 | 2803 | ||
2804 | ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, | 2804 | ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, |
2805 | &fault_around_bytes_fops); | 2805 | &fault_around_bytes_fops); |
2806 | if (!ret) | 2806 | if (!ret) |
2807 | pr_warn("Failed to create fault_around_bytes in debugfs"); | 2807 | pr_warn("Failed to create fault_around_bytes in debugfs"); |
2808 | return 0; | 2808 | return 0; |
2809 | } | 2809 | } |
2810 | late_initcall(fault_around_debugfs); | 2810 | late_initcall(fault_around_debugfs); |
2811 | #endif | 2811 | #endif |
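Because the control file is created in the debugfs root, it normally shows up as /sys/kernel/debug/fault_around_bytes (assuming debugfs is mounted at its conventional location). A small userspace sketch that reads the current value:

#include <stdio.h>

int main(void)
{
	unsigned long bytes;
	FILE *f = fopen("/sys/kernel/debug/fault_around_bytes", "r");

	if (!f) {
		perror("fault_around_bytes");
		return 1;
	}
	if (fscanf(f, "%lu", &bytes) == 1)
		printf("fault-around window: %lu bytes\n", bytes);
	fclose(f);
	return 0;
}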
2812 | 2812 | ||
2813 | /* | 2813 | /* |
2814 | * do_fault_around() tries to map a few pages around the fault address. The hope | 2814 | * do_fault_around() tries to map a few pages around the fault address. The hope |
2815 | * is that the pages will be needed soon and this will lower the number of | 2815 | * is that the pages will be needed soon and this will lower the number of |
2816 | * faults to handle. | 2816 | * faults to handle. |
2817 | * | 2817 | * |
2818 | * It uses vm_ops->map_pages() to map the pages, which skips the page if it's | 2818 | * It uses vm_ops->map_pages() to map the pages, which skips the page if it's |
2819 | * not ready to be mapped: not up-to-date, locked, etc. | 2819 | * not ready to be mapped: not up-to-date, locked, etc. |
2820 | * | 2820 | * |
2821 | * This function is called with the page table lock taken. In the split ptlock | 2821 | * This function is called with the page table lock taken. In the split ptlock |
2822 | * case the page table lock protects only those entries which belong to | 2822 | * case the page table lock protects only those entries which belong to |
2823 | * the page table corresponding to the fault address. | 2823 | * the page table corresponding to the fault address. |
2824 | * | 2824 | * |
2825 | * This function doesn't cross the VMA boundaries, in order to call map_pages() | 2825 | * This function doesn't cross the VMA boundaries, in order to call map_pages() |
2826 | * only once. | 2826 | * only once. |
2827 | * | 2827 | * |
2828 | * fault_around_pages() defines how many pages we'll try to map. | 2828 | * fault_around_pages() defines how many pages we'll try to map. |
2829 | * do_fault_around() expects it to return a power of two less than or equal to | 2829 | * do_fault_around() expects it to return a power of two less than or equal to |
2830 | * PTRS_PER_PTE. | 2830 | * PTRS_PER_PTE. |
2831 | * | 2831 | * |
2832 | * The virtual address of the area that we map is naturally aligned to the | 2832 | * The virtual address of the area that we map is naturally aligned to the |
2833 | * fault_around_pages() value (and therefore to page order). This way it's | 2833 | * fault_around_pages() value (and therefore to page order). This way it's |
2834 | * easier to guarantee that we don't cross page table boundaries. | 2834 | * easier to guarantee that we don't cross page table boundaries. |
2835 | */ | 2835 | */ |
2836 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2836 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, |
2837 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | 2837 | pte_t *pte, pgoff_t pgoff, unsigned int flags) |
2838 | { | 2838 | { |
2839 | unsigned long start_addr, nr_pages, mask; | 2839 | unsigned long start_addr, nr_pages, mask; |
2840 | pgoff_t max_pgoff; | 2840 | pgoff_t max_pgoff; |
2841 | struct vm_fault vmf; | 2841 | struct vm_fault vmf; |
2842 | int off; | 2842 | int off; |
2843 | 2843 | ||
2844 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2844 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2845 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2845 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
2846 | 2846 | ||
2847 | start_addr = max(address & mask, vma->vm_start); | 2847 | start_addr = max(address & mask, vma->vm_start); |
2848 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 2848 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
2849 | pte -= off; | 2849 | pte -= off; |
2850 | pgoff -= off; | 2850 | pgoff -= off; |
2851 | 2851 | ||
2852 | /* | 2852 | /* |
2853 | * max_pgoff is either end of page table or end of vma | 2853 | * max_pgoff is either end of page table or end of vma |
2854 | * or fault_around_pages() from pgoff, depending on what is nearest. | 2854 | * or fault_around_pages() from pgoff, depending on what is nearest. |
2855 | */ | 2855 | */ |
2856 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2856 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
2857 | PTRS_PER_PTE - 1; | 2857 | PTRS_PER_PTE - 1; |
2858 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | 2858 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, |
2859 | pgoff + nr_pages - 1); | 2859 | pgoff + nr_pages - 1); |
2860 | 2860 | ||
2861 | /* Check if it makes any sense to call ->map_pages */ | 2861 | /* Check if it makes any sense to call ->map_pages */ |
2862 | while (!pte_none(*pte)) { | 2862 | while (!pte_none(*pte)) { |
2863 | if (++pgoff > max_pgoff) | 2863 | if (++pgoff > max_pgoff) |
2864 | return; | 2864 | return; |
2865 | start_addr += PAGE_SIZE; | 2865 | start_addr += PAGE_SIZE; |
2866 | if (start_addr >= vma->vm_end) | 2866 | if (start_addr >= vma->vm_end) |
2867 | return; | 2867 | return; |
2868 | pte++; | 2868 | pte++; |
2869 | } | 2869 | } |
2870 | 2870 | ||
2871 | vmf.virtual_address = (void __user *) start_addr; | 2871 | vmf.virtual_address = (void __user *) start_addr; |
2872 | vmf.pte = pte; | 2872 | vmf.pte = pte; |
2873 | vmf.pgoff = pgoff; | 2873 | vmf.pgoff = pgoff; |
2874 | vmf.max_pgoff = max_pgoff; | 2874 | vmf.max_pgoff = max_pgoff; |
2875 | vmf.flags = flags; | 2875 | vmf.flags = flags; |
2876 | vma->vm_ops->map_pages(vma, &vmf); | 2876 | vma->vm_ops->map_pages(vma, &vmf); |
2877 | } | 2877 | } |
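To make the window arithmetic concrete, a self-contained userspace program can replay the computation for the default fault_around_bytes of 65536 and a hypothetical faulting address (it assumes 4 KiB pages and 512 ptes per page table, and ignores the clamping to vma->vm_start that the real code performs):

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumption: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PTRS_PER_PTE	512			/* assumption: x86-64 layout */

int main(void)
{
	unsigned long fault_around_bytes = 65536;	/* the default */
	unsigned long address = 0x7f1234567890UL;	/* hypothetical fault */
	unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long start_addr = address & mask;
	int off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	/* prints: 16 pages, window at 0x7f1234560000, faulting pte is entry 7 */
	printf("%lu pages, window at %#lx, faulting pte is entry %d\n",
	       nr_pages, start_addr, off);
	return 0;
}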
2878 | 2878 | ||
2879 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2879 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2880 | unsigned long address, pmd_t *pmd, | 2880 | unsigned long address, pmd_t *pmd, |
2881 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 2881 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
2882 | { | 2882 | { |
2883 | struct page *fault_page; | 2883 | struct page *fault_page; |
2884 | spinlock_t *ptl; | 2884 | spinlock_t *ptl; |
2885 | pte_t *pte; | 2885 | pte_t *pte; |
2886 | int ret = 0; | 2886 | int ret = 0; |
2887 | 2887 | ||
2888 | /* | 2888 | /* |
2889 | * Let's call ->map_pages() first and use ->fault() as fallback | 2889 | * Let's call ->map_pages() first and use ->fault() as fallback |
2890 | * if the page at that offset is not ready to be mapped (cold cache or | 2890 | * if the page at that offset is not ready to be mapped (cold cache or |
2891 | * something). | 2891 | * something). |
2892 | */ | 2892 | */ |
2893 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2893 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && |
2894 | fault_around_bytes >> PAGE_SHIFT > 1) { | 2894 | fault_around_bytes >> PAGE_SHIFT > 1) { |
2895 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2895 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2896 | do_fault_around(vma, address, pte, pgoff, flags); | 2896 | do_fault_around(vma, address, pte, pgoff, flags); |
2897 | if (!pte_same(*pte, orig_pte)) | 2897 | if (!pte_same(*pte, orig_pte)) |
2898 | goto unlock_out; | 2898 | goto unlock_out; |
2899 | pte_unmap_unlock(pte, ptl); | 2899 | pte_unmap_unlock(pte, ptl); |
2900 | } | 2900 | } |
2901 | 2901 | ||
2902 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2902 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); |
2903 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2903 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
2904 | return ret; | 2904 | return ret; |
2905 | 2905 | ||
2906 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2906 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2907 | if (unlikely(!pte_same(*pte, orig_pte))) { | 2907 | if (unlikely(!pte_same(*pte, orig_pte))) { |
2908 | pte_unmap_unlock(pte, ptl); | 2908 | pte_unmap_unlock(pte, ptl); |
2909 | unlock_page(fault_page); | 2909 | unlock_page(fault_page); |
2910 | page_cache_release(fault_page); | 2910 | page_cache_release(fault_page); |
2911 | return ret; | 2911 | return ret; |
2912 | } | 2912 | } |
2913 | do_set_pte(vma, address, fault_page, pte, false, false); | 2913 | do_set_pte(vma, address, fault_page, pte, false, false); |
2914 | unlock_page(fault_page); | 2914 | unlock_page(fault_page); |
2915 | unlock_out: | 2915 | unlock_out: |
2916 | pte_unmap_unlock(pte, ptl); | 2916 | pte_unmap_unlock(pte, ptl); |
2917 | return ret; | 2917 | return ret; |
2918 | } | 2918 | } |
2919 | 2919 | ||
2920 | static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2920 | static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2921 | unsigned long address, pmd_t *pmd, | 2921 | unsigned long address, pmd_t *pmd, |
2922 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 2922 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
2923 | { | 2923 | { |
2924 | struct page *fault_page, *new_page; | 2924 | struct page *fault_page, *new_page; |
2925 | struct mem_cgroup *memcg; | 2925 | struct mem_cgroup *memcg; |
2926 | spinlock_t *ptl; | 2926 | spinlock_t *ptl; |
2927 | pte_t *pte; | 2927 | pte_t *pte; |
2928 | int ret; | 2928 | int ret; |
2929 | 2929 | ||
2930 | if (unlikely(anon_vma_prepare(vma))) | 2930 | if (unlikely(anon_vma_prepare(vma))) |
2931 | return VM_FAULT_OOM; | 2931 | return VM_FAULT_OOM; |
2932 | 2932 | ||
2933 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2933 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
2934 | if (!new_page) | 2934 | if (!new_page) |
2935 | return VM_FAULT_OOM; | 2935 | return VM_FAULT_OOM; |
2936 | 2936 | ||
2937 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { | 2937 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { |
2938 | page_cache_release(new_page); | 2938 | page_cache_release(new_page); |
2939 | return VM_FAULT_OOM; | 2939 | return VM_FAULT_OOM; |
2940 | } | 2940 | } |
2941 | 2941 | ||
2942 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2942 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); |
2943 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2943 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
2944 | goto uncharge_out; | 2944 | goto uncharge_out; |
2945 | 2945 | ||
2946 | copy_user_highpage(new_page, fault_page, address, vma); | 2946 | copy_user_highpage(new_page, fault_page, address, vma); |
2947 | __SetPageUptodate(new_page); | 2947 | __SetPageUptodate(new_page); |
2948 | 2948 | ||
2949 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2949 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2950 | if (unlikely(!pte_same(*pte, orig_pte))) { | 2950 | if (unlikely(!pte_same(*pte, orig_pte))) { |
2951 | pte_unmap_unlock(pte, ptl); | 2951 | pte_unmap_unlock(pte, ptl); |
2952 | unlock_page(fault_page); | 2952 | unlock_page(fault_page); |
2953 | page_cache_release(fault_page); | 2953 | page_cache_release(fault_page); |
2954 | goto uncharge_out; | 2954 | goto uncharge_out; |
2955 | } | 2955 | } |
2956 | do_set_pte(vma, address, new_page, pte, true, true); | 2956 | do_set_pte(vma, address, new_page, pte, true, true); |
2957 | mem_cgroup_commit_charge(new_page, memcg, false); | 2957 | mem_cgroup_commit_charge(new_page, memcg, false); |
2958 | lru_cache_add_active_or_unevictable(new_page, vma); | 2958 | lru_cache_add_active_or_unevictable(new_page, vma); |
2959 | pte_unmap_unlock(pte, ptl); | 2959 | pte_unmap_unlock(pte, ptl); |
2960 | unlock_page(fault_page); | 2960 | unlock_page(fault_page); |
2961 | page_cache_release(fault_page); | 2961 | page_cache_release(fault_page); |
2962 | return ret; | 2962 | return ret; |
2963 | uncharge_out: | 2963 | uncharge_out: |
2964 | mem_cgroup_cancel_charge(new_page, memcg); | 2964 | mem_cgroup_cancel_charge(new_page, memcg); |
2965 | page_cache_release(new_page); | 2965 | page_cache_release(new_page); |
2966 | return ret; | 2966 | return ret; |
2967 | } | 2967 | } |
2968 | 2968 | ||
2969 | static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2969 | static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2970 | unsigned long address, pmd_t *pmd, | 2970 | unsigned long address, pmd_t *pmd, |
2971 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 2971 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
2972 | { | 2972 | { |
2973 | struct page *fault_page; | 2973 | struct page *fault_page; |
2974 | struct address_space *mapping; | 2974 | struct address_space *mapping; |
2975 | spinlock_t *ptl; | 2975 | spinlock_t *ptl; |
2976 | pte_t *pte; | 2976 | pte_t *pte; |
2977 | int dirtied = 0; | 2977 | int dirtied = 0; |
2978 | int ret, tmp; | 2978 | int ret, tmp; |
2979 | 2979 | ||
2980 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2980 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); |
2981 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2981 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
2982 | return ret; | 2982 | return ret; |
2983 | 2983 | ||
2984 | /* | 2984 | /* |
2985 | * Check if the backing address space wants to know that the page is | 2985 | * Check if the backing address space wants to know that the page is |
2986 | * about to become writable | 2986 | * about to become writable |
2987 | */ | 2987 | */ |
2988 | if (vma->vm_ops->page_mkwrite) { | 2988 | if (vma->vm_ops->page_mkwrite) { |
2989 | unlock_page(fault_page); | 2989 | unlock_page(fault_page); |
2990 | tmp = do_page_mkwrite(vma, fault_page, address); | 2990 | tmp = do_page_mkwrite(vma, fault_page, address); |
2991 | if (unlikely(!tmp || | 2991 | if (unlikely(!tmp || |
2992 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2992 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2993 | page_cache_release(fault_page); | 2993 | page_cache_release(fault_page); |
2994 | return tmp; | 2994 | return tmp; |
2995 | } | 2995 | } |
2996 | } | 2996 | } |
2997 | 2997 | ||
2998 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2998 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2999 | if (unlikely(!pte_same(*pte, orig_pte))) { | 2999 | if (unlikely(!pte_same(*pte, orig_pte))) { |
3000 | pte_unmap_unlock(pte, ptl); | 3000 | pte_unmap_unlock(pte, ptl); |
3001 | unlock_page(fault_page); | 3001 | unlock_page(fault_page); |
3002 | page_cache_release(fault_page); | 3002 | page_cache_release(fault_page); |
3003 | return ret; | 3003 | return ret; |
3004 | } | 3004 | } |
3005 | do_set_pte(vma, address, fault_page, pte, true, false); | 3005 | do_set_pte(vma, address, fault_page, pte, true, false); |
3006 | pte_unmap_unlock(pte, ptl); | 3006 | pte_unmap_unlock(pte, ptl); |
3007 | 3007 | ||
3008 | if (set_page_dirty(fault_page)) | 3008 | if (set_page_dirty(fault_page)) |
3009 | dirtied = 1; | 3009 | dirtied = 1; |
3010 | mapping = fault_page->mapping; | 3010 | mapping = fault_page->mapping; |
3011 | unlock_page(fault_page); | 3011 | unlock_page(fault_page); |
3012 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { | 3012 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { |
3013 | /* | 3013 | /* |
3014 | * Some device drivers do not set page.mapping but still | 3014 | * Some device drivers do not set page.mapping but still |
3015 | * dirty their pages | 3015 | * dirty their pages |
3016 | */ | 3016 | */ |
3017 | balance_dirty_pages_ratelimited(mapping); | 3017 | balance_dirty_pages_ratelimited(mapping); |
3018 | } | 3018 | } |
3019 | 3019 | ||
3020 | /* file_update_time outside page_lock */ | 3020 | /* file_update_time outside page_lock */ |
3021 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) | 3021 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) |
3022 | file_update_time(vma->vm_file); | 3022 | file_update_time(vma->vm_file); |
3023 | 3023 | ||
3024 | return ret; | 3024 | return ret; |
3025 | } | 3025 | } |
3026 | 3026 | ||
3027 | /* | 3027 | /* |
3028 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3028 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
3029 | * but allow concurrent faults). | 3029 | * but allow concurrent faults). |
3030 | * The mmap_sem may have been released depending on flags and our | 3030 | * The mmap_sem may have been released depending on flags and our |
3031 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3031 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3032 | */ | 3032 | */ |
3033 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3033 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3034 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 3034 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3035 | unsigned int flags, pte_t orig_pte) | 3035 | unsigned int flags, pte_t orig_pte) |
3036 | { | 3036 | { |
3037 | pgoff_t pgoff = (((address & PAGE_MASK) | 3037 | pgoff_t pgoff = (((address & PAGE_MASK) |
3038 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 3038 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
3039 | 3039 | ||
3040 | pte_unmap(page_table); | 3040 | pte_unmap(page_table); |
3041 | if (!(flags & FAULT_FLAG_WRITE)) | 3041 | if (!(flags & FAULT_FLAG_WRITE)) |
3042 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | 3042 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, |
3043 | orig_pte); | 3043 | orig_pte); |
3044 | if (!(vma->vm_flags & VM_SHARED)) | 3044 | if (!(vma->vm_flags & VM_SHARED)) |
3045 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | 3045 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, |
3046 | orig_pte); | 3046 | orig_pte); |
3047 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3047 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3048 | } | 3048 | } |
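do_linear_fault() picks one of three handlers from two bits of state: no FAULT_FLAG_WRITE means do_read_fault(), a write into a !VM_SHARED vma means do_cow_fault(), and a write into a shared mapping means do_shared_fault(). The userspace sketch below is not part of this commit; it is a hypothetical demo (the scratch file name and 4 KiB size are assumptions, error handling is minimal) that triggers each of those three paths in turn and shows that only the MAP_SHARED write reaches the file.

/*
 * Hypothetical userspace demo (not part of this file): exercise the three
 * paths do_linear_fault() dispatches to.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char init[4096];
	int fd = open("./fault-demo.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

	memset(init, 'a', sizeof(init));
	if (write(fd, init, sizeof(init)) != (ssize_t)sizeof(init))
		return 1;

	/* do_read_fault(): first touch of a read-only private mapping */
	char *ro = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	printf("read fault sees '%c'\n", ro[0]);

	/* do_cow_fault(): a MAP_PRIVATE write copies the page, file untouched */
	char *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	priv[0] = 'p';

	/* do_shared_fault(): a MAP_SHARED write goes to the page cache/file */
	char *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	shared[0] = 's';

	char back;
	pread(fd, &back, 1, 0);
	printf("file byte 0 is now '%c' (the private write never reached it)\n", back);

	munmap(ro, 4096);
	munmap(priv, 4096);
	munmap(shared, 4096);
	close(fd);
	unlink("./fault-demo.tmp");
	return 0;
}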
3049 | 3049 | ||
3050 | /* | 3050 | /* |
3051 | * Fault of a previously existing named mapping. Repopulate the pte | 3051 | * Fault of a previously existing named mapping. Repopulate the pte |
3052 | * from the encoded file_pte if possible. This enables swappable | 3052 | * from the encoded file_pte if possible. This enables swappable |
3053 | * nonlinear vmas. | 3053 | * nonlinear vmas. |
3054 | * | 3054 | * |
3055 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3055 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
3056 | * but allow concurrent faults), and pte mapped but not yet locked. | 3056 | * but allow concurrent faults), and pte mapped but not yet locked. |
3057 | * We return with pte unmapped and unlocked. | 3057 | * We return with pte unmapped and unlocked. |
3058 | * The mmap_sem may have been released depending on flags and our | 3058 | * The mmap_sem may have been released depending on flags and our |
3059 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3059 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3060 | */ | 3060 | */ |
3061 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3061 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3062 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 3062 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3063 | unsigned int flags, pte_t orig_pte) | 3063 | unsigned int flags, pte_t orig_pte) |
3064 | { | 3064 | { |
3065 | pgoff_t pgoff; | 3065 | pgoff_t pgoff; |
3066 | 3066 | ||
3067 | flags |= FAULT_FLAG_NONLINEAR; | 3067 | flags |= FAULT_FLAG_NONLINEAR; |
3068 | 3068 | ||
3069 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 3069 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
3070 | return 0; | 3070 | return 0; |
3071 | 3071 | ||
3072 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { | 3072 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
3073 | /* | 3073 | /* |
3074 | * Page table corrupted: show pte and kill process. | 3074 | * Page table corrupted: show pte and kill process. |
3075 | */ | 3075 | */ |
3076 | print_bad_pte(vma, address, orig_pte, NULL); | 3076 | print_bad_pte(vma, address, orig_pte, NULL); |
3077 | return VM_FAULT_SIGBUS; | 3077 | return VM_FAULT_SIGBUS; |
3078 | } | 3078 | } |
3079 | 3079 | ||
3080 | pgoff = pte_to_pgoff(orig_pte); | 3080 | pgoff = pte_to_pgoff(orig_pte); |
3081 | if (!(flags & FAULT_FLAG_WRITE)) | 3081 | if (!(flags & FAULT_FLAG_WRITE)) |
3082 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | 3082 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, |
3083 | orig_pte); | 3083 | orig_pte); |
3084 | if (!(vma->vm_flags & VM_SHARED)) | 3084 | if (!(vma->vm_flags & VM_SHARED)) |
3085 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | 3085 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, |
3086 | orig_pte); | 3086 | orig_pte); |
3087 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3087 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3088 | } | 3088 | } |
3089 | 3089 | ||
3090 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3090 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
3091 | unsigned long addr, int page_nid, | 3091 | unsigned long addr, int page_nid, |
3092 | int *flags) | 3092 | int *flags) |
3093 | { | 3093 | { |
3094 | get_page(page); | 3094 | get_page(page); |
3095 | 3095 | ||
3096 | count_vm_numa_event(NUMA_HINT_FAULTS); | 3096 | count_vm_numa_event(NUMA_HINT_FAULTS); |
3097 | if (page_nid == numa_node_id()) { | 3097 | if (page_nid == numa_node_id()) { |
3098 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 3098 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
3099 | *flags |= TNF_FAULT_LOCAL; | 3099 | *flags |= TNF_FAULT_LOCAL; |
3100 | } | 3100 | } |
3101 | 3101 | ||
3102 | return mpol_misplaced(page, vma, addr); | 3102 | return mpol_misplaced(page, vma, addr); |
3103 | } | 3103 | } |
3104 | 3104 | ||
3105 | static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 3105 | static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
3106 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | 3106 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) |
3107 | { | 3107 | { |
3108 | struct page *page = NULL; | 3108 | struct page *page = NULL; |
3109 | spinlock_t *ptl; | 3109 | spinlock_t *ptl; |
3110 | int page_nid = -1; | 3110 | int page_nid = -1; |
3111 | int last_cpupid; | 3111 | int last_cpupid; |
3112 | int target_nid; | 3112 | int target_nid; |
3113 | bool migrated = false; | 3113 | bool migrated = false; |
3114 | int flags = 0; | 3114 | int flags = 0; |
3115 | 3115 | ||
3116 | /* | 3116 | /* |
3117 | * The "pte" at this point cannot be used safely without | 3117 | * The "pte" at this point cannot be used safely without |
3118 | * validation through pte_unmap_same(). It's of NUMA type but | 3118 | * validation through pte_unmap_same(). It's of NUMA type but |
3119 | * the pfn may be screwed if the read is non atomic. | 3119 | * the pfn may be screwed if the read is non atomic. |
3120 | * | 3120 | * |
3121 | * ptep_modify_prot_start is not called as this is clearing | 3121 | * ptep_modify_prot_start is not called as this is clearing |
3122 | * the _PAGE_NUMA bit and it is not really expected that there | 3122 | * the _PAGE_NUMA bit and it is not really expected that there |
3123 | * would be concurrent hardware modifications to the PTE. | 3123 | * would be concurrent hardware modifications to the PTE. |
3124 | */ | 3124 | */ |
3125 | ptl = pte_lockptr(mm, pmd); | 3125 | ptl = pte_lockptr(mm, pmd); |
3126 | spin_lock(ptl); | 3126 | spin_lock(ptl); |
3127 | if (unlikely(!pte_same(*ptep, pte))) { | 3127 | if (unlikely(!pte_same(*ptep, pte))) { |
3128 | pte_unmap_unlock(ptep, ptl); | 3128 | pte_unmap_unlock(ptep, ptl); |
3129 | goto out; | 3129 | goto out; |
3130 | } | 3130 | } |
3131 | 3131 | ||
3132 | pte = pte_mknonnuma(pte); | 3132 | pte = pte_mknonnuma(pte); |
3133 | set_pte_at(mm, addr, ptep, pte); | 3133 | set_pte_at(mm, addr, ptep, pte); |
3134 | update_mmu_cache(vma, addr, ptep); | 3134 | update_mmu_cache(vma, addr, ptep); |
3135 | 3135 | ||
3136 | page = vm_normal_page(vma, addr, pte); | 3136 | page = vm_normal_page(vma, addr, pte); |
3137 | if (!page) { | 3137 | if (!page) { |
3138 | pte_unmap_unlock(ptep, ptl); | 3138 | pte_unmap_unlock(ptep, ptl); |
3139 | return 0; | 3139 | return 0; |
3140 | } | 3140 | } |
3141 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | 3141 | BUG_ON(is_zero_pfn(page_to_pfn(page))); |
3142 | 3142 | ||
3143 | /* | 3143 | /* |
3144 | * Avoid grouping on DSO/COW pages in specific and RO pages | 3144 | * Avoid grouping on DSO/COW pages in specific and RO pages |
3145 | * in general, RO pages shouldn't hurt as much anyway since | 3145 | * in general, RO pages shouldn't hurt as much anyway since |
3146 | * they can be in shared cache state. | 3146 | * they can be in shared cache state. |
3147 | */ | 3147 | */ |
3148 | if (!pte_write(pte)) | 3148 | if (!pte_write(pte)) |
3149 | flags |= TNF_NO_GROUP; | 3149 | flags |= TNF_NO_GROUP; |
3150 | 3150 | ||
3151 | /* | 3151 | /* |
3152 | * Flag if the page is shared between multiple address spaces. This | 3152 | * Flag if the page is shared between multiple address spaces. This |
3153 | * is later used when determining whether to group tasks together | 3153 | * is later used when determining whether to group tasks together |
3154 | */ | 3154 | */ |
3155 | if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) | 3155 | if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) |
3156 | flags |= TNF_SHARED; | 3156 | flags |= TNF_SHARED; |
3157 | 3157 | ||
3158 | last_cpupid = page_cpupid_last(page); | 3158 | last_cpupid = page_cpupid_last(page); |
3159 | page_nid = page_to_nid(page); | 3159 | page_nid = page_to_nid(page); |
3160 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | 3160 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); |
3161 | pte_unmap_unlock(ptep, ptl); | 3161 | pte_unmap_unlock(ptep, ptl); |
3162 | if (target_nid == -1) { | 3162 | if (target_nid == -1) { |
3163 | put_page(page); | 3163 | put_page(page); |
3164 | goto out; | 3164 | goto out; |
3165 | } | 3165 | } |
3166 | 3166 | ||
3167 | /* Migrate to the requested node */ | 3167 | /* Migrate to the requested node */ |
3168 | migrated = migrate_misplaced_page(page, vma, target_nid); | 3168 | migrated = migrate_misplaced_page(page, vma, target_nid); |
3169 | if (migrated) { | 3169 | if (migrated) { |
3170 | page_nid = target_nid; | 3170 | page_nid = target_nid; |
3171 | flags |= TNF_MIGRATED; | 3171 | flags |= TNF_MIGRATED; |
3172 | } | 3172 | } |
3173 | 3173 | ||
3174 | out: | 3174 | out: |
3175 | if (page_nid != -1) | 3175 | if (page_nid != -1) |
3176 | task_numa_fault(last_cpupid, page_nid, 1, flags); | 3176 | task_numa_fault(last_cpupid, page_nid, 1, flags); |
3177 | return 0; | 3177 | return 0; |
3178 | } | 3178 | } |
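do_numa_page() is the handler for NUMA hinting faults: it clears the NUMA bit from the pte, asks mpol_misplaced() whether the page should live on another node, possibly migrates it with migrate_misplaced_page(), and accounts the outcome via task_numa_fault(). As a hedged userspace aside (an illustration, not this kernel code), the node currently backing a page can be observed with the move_pages() syscall in query mode, i.e. with a NULL nodes array; the sketch assumes libnuma's header is available and the program is linked with -lnuma.

/*
 * Hypothetical sketch: ask which NUMA node backs one of our pages.
 * Automatic NUMA balancing (do_numa_page/migrate_misplaced_page) may move
 * the page later and change the answer. Build with -lnuma.
 */
#define _GNU_SOURCE
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(p, 0, psz);		/* fault the page in so it has a node */

	void *pages[1] = { p };
	int status[1];

	if (move_pages(0 /* current process */, 1, pages, NULL, status, 0) == 0)
		printf("page %p currently resides on node %d\n", p, status[0]);
	else
		perror("move_pages");

	munmap(p, psz);
	return 0;
}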
3179 | 3179 | ||
3180 | /* | 3180 | /* |
3181 | * These routines also need to handle stuff like marking pages dirty | 3181 | * These routines also need to handle stuff like marking pages dirty |
3182 | * and/or accessed for architectures that don't do it in hardware (most | 3182 | * and/or accessed for architectures that don't do it in hardware (most |
3183 | * RISC architectures). The early dirtying is also good on the i386. | 3183 | * RISC architectures). The early dirtying is also good on the i386. |
3184 | * | 3184 | * |
3185 | * There is also a hook called "update_mmu_cache()" that architectures | 3185 | * There is also a hook called "update_mmu_cache()" that architectures |
3186 | * with external mmu caches can use to update those (ie the Sparc or | 3186 | * with external mmu caches can use to update those (ie the Sparc or |
3187 | * PowerPC hashed page tables that act as extended TLBs). | 3187 | * PowerPC hashed page tables that act as extended TLBs). |
3188 | * | 3188 | * |
3189 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3189 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
3190 | * but allow concurrent faults), and pte mapped but not yet locked. | 3190 | * but allow concurrent faults), and pte mapped but not yet locked. |
3191 | * We return with pte unmapped and unlocked. | 3191 | * We return with pte unmapped and unlocked. |
3192 | * | 3192 | * |
3193 | * The mmap_sem may have been released depending on flags and our | 3193 | * The mmap_sem may have been released depending on flags and our |
3194 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3194 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3195 | */ | 3195 | */ |
3196 | static int handle_pte_fault(struct mm_struct *mm, | 3196 | static int handle_pte_fault(struct mm_struct *mm, |
3197 | struct vm_area_struct *vma, unsigned long address, | 3197 | struct vm_area_struct *vma, unsigned long address, |
3198 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3198 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3199 | { | 3199 | { |
3200 | pte_t entry; | 3200 | pte_t entry; |
3201 | spinlock_t *ptl; | 3201 | spinlock_t *ptl; |
3202 | 3202 | ||
3203 | entry = ACCESS_ONCE(*pte); | 3203 | entry = ACCESS_ONCE(*pte); |
3204 | if (!pte_present(entry)) { | 3204 | if (!pte_present(entry)) { |
3205 | if (pte_none(entry)) { | 3205 | if (pte_none(entry)) { |
3206 | if (vma->vm_ops) { | 3206 | if (vma->vm_ops) { |
3207 | if (likely(vma->vm_ops->fault)) | 3207 | if (likely(vma->vm_ops->fault)) |
3208 | return do_linear_fault(mm, vma, address, | 3208 | return do_linear_fault(mm, vma, address, |
3209 | pte, pmd, flags, entry); | 3209 | pte, pmd, flags, entry); |
3210 | } | 3210 | } |
3211 | return do_anonymous_page(mm, vma, address, | 3211 | return do_anonymous_page(mm, vma, address, |
3212 | pte, pmd, flags); | 3212 | pte, pmd, flags); |
3213 | } | 3213 | } |
3214 | if (pte_file(entry)) | 3214 | if (pte_file(entry)) |
3215 | return do_nonlinear_fault(mm, vma, address, | 3215 | return do_nonlinear_fault(mm, vma, address, |
3216 | pte, pmd, flags, entry); | 3216 | pte, pmd, flags, entry); |
3217 | return do_swap_page(mm, vma, address, | 3217 | return do_swap_page(mm, vma, address, |
3218 | pte, pmd, flags, entry); | 3218 | pte, pmd, flags, entry); |
3219 | } | 3219 | } |
3220 | 3220 | ||
3221 | if (pte_numa(entry)) | 3221 | if (pte_numa(entry)) |
3222 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3222 | return do_numa_page(mm, vma, address, entry, pte, pmd); |
3223 | 3223 | ||
3224 | ptl = pte_lockptr(mm, pmd); | 3224 | ptl = pte_lockptr(mm, pmd); |
3225 | spin_lock(ptl); | 3225 | spin_lock(ptl); |
3226 | if (unlikely(!pte_same(*pte, entry))) | 3226 | if (unlikely(!pte_same(*pte, entry))) |
3227 | goto unlock; | 3227 | goto unlock; |
3228 | if (flags & FAULT_FLAG_WRITE) { | 3228 | if (flags & FAULT_FLAG_WRITE) { |
3229 | if (!pte_write(entry)) | 3229 | if (!pte_write(entry)) |
3230 | return do_wp_page(mm, vma, address, | 3230 | return do_wp_page(mm, vma, address, |
3231 | pte, pmd, ptl, entry); | 3231 | pte, pmd, ptl, entry); |
3232 | entry = pte_mkdirty(entry); | 3232 | entry = pte_mkdirty(entry); |
3233 | } | 3233 | } |
3234 | entry = pte_mkyoung(entry); | 3234 | entry = pte_mkyoung(entry); |
3235 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3235 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
3236 | update_mmu_cache(vma, address, pte); | 3236 | update_mmu_cache(vma, address, pte); |
3237 | } else { | 3237 | } else { |
3238 | /* | 3238 | /* |
3239 | * This is needed only for protection faults but the arch code | 3239 | * This is needed only for protection faults but the arch code |
3240 | * is not yet telling us if this is a protection fault or not. | 3240 | * is not yet telling us if this is a protection fault or not. |
3241 | * This still avoids useless tlb flushes for .text page faults | 3241 | * This still avoids useless tlb flushes for .text page faults |
3242 | * with threads. | 3242 | * with threads. |
3243 | */ | 3243 | */ |
3244 | if (flags & FAULT_FLAG_WRITE) | 3244 | if (flags & FAULT_FLAG_WRITE) |
3245 | flush_tlb_fix_spurious_fault(vma, address); | 3245 | flush_tlb_fix_spurious_fault(vma, address); |
3246 | } | 3246 | } |
3247 | unlock: | 3247 | unlock: |
3248 | pte_unmap_unlock(pte, ptl); | 3248 | pte_unmap_unlock(pte, ptl); |
3249 | return 0; | 3249 | return 0; |
3250 | } | 3250 | } |
3251 | 3251 | ||
3252 | /* | 3252 | /* |
3253 | * By the time we get here, we already hold the mm semaphore | 3253 | * By the time we get here, we already hold the mm semaphore |
3254 | * | 3254 | * |
3255 | * The mmap_sem may have been released depending on flags and our | 3255 | * The mmap_sem may have been released depending on flags and our |
3256 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3256 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3257 | */ | 3257 | */ |
3258 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3258 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3259 | unsigned long address, unsigned int flags) | 3259 | unsigned long address, unsigned int flags) |
3260 | { | 3260 | { |
3261 | pgd_t *pgd; | 3261 | pgd_t *pgd; |
3262 | pud_t *pud; | 3262 | pud_t *pud; |
3263 | pmd_t *pmd; | 3263 | pmd_t *pmd; |
3264 | pte_t *pte; | 3264 | pte_t *pte; |
3265 | 3265 | ||
3266 | if (unlikely(is_vm_hugetlb_page(vma))) | 3266 | if (unlikely(is_vm_hugetlb_page(vma))) |
3267 | return hugetlb_fault(mm, vma, address, flags); | 3267 | return hugetlb_fault(mm, vma, address, flags); |
3268 | 3268 | ||
3269 | pgd = pgd_offset(mm, address); | 3269 | pgd = pgd_offset(mm, address); |
3270 | pud = pud_alloc(mm, pgd, address); | 3270 | pud = pud_alloc(mm, pgd, address); |
3271 | if (!pud) | 3271 | if (!pud) |
3272 | return VM_FAULT_OOM; | 3272 | return VM_FAULT_OOM; |
3273 | pmd = pmd_alloc(mm, pud, address); | 3273 | pmd = pmd_alloc(mm, pud, address); |
3274 | if (!pmd) | 3274 | if (!pmd) |
3275 | return VM_FAULT_OOM; | 3275 | return VM_FAULT_OOM; |
3276 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3276 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3277 | int ret = VM_FAULT_FALLBACK; | 3277 | int ret = VM_FAULT_FALLBACK; |
3278 | if (!vma->vm_ops) | 3278 | if (!vma->vm_ops) |
3279 | ret = do_huge_pmd_anonymous_page(mm, vma, address, | 3279 | ret = do_huge_pmd_anonymous_page(mm, vma, address, |
3280 | pmd, flags); | 3280 | pmd, flags); |
3281 | if (!(ret & VM_FAULT_FALLBACK)) | 3281 | if (!(ret & VM_FAULT_FALLBACK)) |
3282 | return ret; | 3282 | return ret; |
3283 | } else { | 3283 | } else { |
3284 | pmd_t orig_pmd = *pmd; | 3284 | pmd_t orig_pmd = *pmd; |
3285 | int ret; | 3285 | int ret; |
3286 | 3286 | ||
3287 | barrier(); | 3287 | barrier(); |
3288 | if (pmd_trans_huge(orig_pmd)) { | 3288 | if (pmd_trans_huge(orig_pmd)) { |
3289 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3289 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3290 | 3290 | ||
3291 | /* | 3291 | /* |
3292 | * If the pmd is splitting, return and retry the | 3292 | * If the pmd is splitting, return and retry the |
3293 | * fault. Alternative: wait until the split | 3293 | * fault. Alternative: wait until the split |
3294 | * is done, and goto retry. | 3294 | * is done, and goto retry. |
3295 | */ | 3295 | */ |
3296 | if (pmd_trans_splitting(orig_pmd)) | 3296 | if (pmd_trans_splitting(orig_pmd)) |
3297 | return 0; | 3297 | return 0; |
3298 | 3298 | ||
3299 | if (pmd_numa(orig_pmd)) | 3299 | if (pmd_numa(orig_pmd)) |
3300 | return do_huge_pmd_numa_page(mm, vma, address, | 3300 | return do_huge_pmd_numa_page(mm, vma, address, |
3301 | orig_pmd, pmd); | 3301 | orig_pmd, pmd); |
3302 | 3302 | ||
3303 | if (dirty && !pmd_write(orig_pmd)) { | 3303 | if (dirty && !pmd_write(orig_pmd)) { |
3304 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3304 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3305 | orig_pmd); | 3305 | orig_pmd); |
3306 | if (!(ret & VM_FAULT_FALLBACK)) | 3306 | if (!(ret & VM_FAULT_FALLBACK)) |
3307 | return ret; | 3307 | return ret; |
3308 | } else { | 3308 | } else { |
3309 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3309 | huge_pmd_set_accessed(mm, vma, address, pmd, |
3310 | orig_pmd, dirty); | 3310 | orig_pmd, dirty); |
3311 | return 0; | 3311 | return 0; |
3312 | } | 3312 | } |
3313 | } | 3313 | } |
3314 | } | 3314 | } |
3315 | 3315 | ||
3316 | /* | 3316 | /* |
3317 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3317 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3318 | * run pte_offset_map on the pmd, if a huge pmd could | 3318 | * run pte_offset_map on the pmd, if a huge pmd could |
3319 | * materialize from under us from a different thread. | 3319 | * materialize from under us from a different thread. |
3320 | */ | 3320 | */ |
3321 | if (unlikely(pmd_none(*pmd)) && | 3321 | if (unlikely(pmd_none(*pmd)) && |
3322 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 3322 | unlikely(__pte_alloc(mm, vma, pmd, address))) |
3323 | return VM_FAULT_OOM; | 3323 | return VM_FAULT_OOM; |
3324 | /* if a huge pmd materialized from under us just retry later */ | 3324 | /* if a huge pmd materialized from under us just retry later */ |
3325 | if (unlikely(pmd_trans_huge(*pmd))) | 3325 | if (unlikely(pmd_trans_huge(*pmd))) |
3326 | return 0; | 3326 | return 0; |
3327 | /* | 3327 | /* |
3328 | * A regular pmd is established and it can't morph into a huge pmd | 3328 | * A regular pmd is established and it can't morph into a huge pmd |
3329 | * from under us anymore at this point because we hold the mmap_sem | 3329 | * from under us anymore at this point because we hold the mmap_sem |
3330 | * read mode and khugepaged takes it in write mode. So now it's | 3330 | * read mode and khugepaged takes it in write mode. So now it's |
3331 | * safe to run pte_offset_map(). | 3331 | * safe to run pte_offset_map(). |
3332 | */ | 3332 | */ |
3333 | pte = pte_offset_map(pmd, address); | 3333 | pte = pte_offset_map(pmd, address); |
3334 | 3334 | ||
3335 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3335 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3336 | } | 3336 | } |
3337 | 3337 | ||
3338 | /* | 3338 | /* |
3339 | * By the time we get here, we already hold the mm semaphore | 3339 | * By the time we get here, we already hold the mm semaphore |
3340 | * | 3340 | * |
3341 | * The mmap_sem may have been released depending on flags and our | 3341 | * The mmap_sem may have been released depending on flags and our |
3342 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3342 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3343 | */ | 3343 | */ |
3344 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3344 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3345 | unsigned long address, unsigned int flags) | 3345 | unsigned long address, unsigned int flags) |
3346 | { | 3346 | { |
3347 | int ret; | 3347 | int ret; |
3348 | 3348 | ||
3349 | __set_current_state(TASK_RUNNING); | 3349 | __set_current_state(TASK_RUNNING); |
3350 | 3350 | ||
3351 | count_vm_event(PGFAULT); | 3351 | count_vm_event(PGFAULT); |
3352 | mem_cgroup_count_vm_event(mm, PGFAULT); | 3352 | mem_cgroup_count_vm_event(mm, PGFAULT); |
3353 | 3353 | ||
3354 | /* do counter updates before entering really critical section. */ | 3354 | /* do counter updates before entering really critical section. */ |
3355 | check_sync_rss_stat(current); | 3355 | check_sync_rss_stat(current); |
3356 | 3356 | ||
3357 | /* | 3357 | /* |
3358 | * Enable the memcg OOM handling for faults triggered in user | 3358 | * Enable the memcg OOM handling for faults triggered in user |
3359 | * space. Kernel faults are handled more gracefully. | 3359 | * space. Kernel faults are handled more gracefully. |
3360 | */ | 3360 | */ |
3361 | if (flags & FAULT_FLAG_USER) | 3361 | if (flags & FAULT_FLAG_USER) |
3362 | mem_cgroup_oom_enable(); | 3362 | mem_cgroup_oom_enable(); |
3363 | 3363 | ||
3364 | ret = __handle_mm_fault(mm, vma, address, flags); | 3364 | ret = __handle_mm_fault(mm, vma, address, flags); |
3365 | 3365 | ||
3366 | if (flags & FAULT_FLAG_USER) { | 3366 | if (flags & FAULT_FLAG_USER) { |
3367 | mem_cgroup_oom_disable(); | 3367 | mem_cgroup_oom_disable(); |
3368 | /* | 3368 | /* |
3369 | * The task may have entered a memcg OOM situation but | 3369 | * The task may have entered a memcg OOM situation but |
3370 | * if the allocation error was handled gracefully (no | 3370 | * if the allocation error was handled gracefully (no |
3371 | * VM_FAULT_OOM), there is no need to kill anything. | 3371 | * VM_FAULT_OOM), there is no need to kill anything. |
3372 | * Just clean up the OOM state peacefully. | 3372 | * Just clean up the OOM state peacefully. |
3373 | */ | 3373 | */ |
3374 | if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) | 3374 | if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) |
3375 | mem_cgroup_oom_synchronize(false); | 3375 | mem_cgroup_oom_synchronize(false); |
3376 | } | 3376 | } |
3377 | 3377 | ||
3378 | return ret; | 3378 | return ret; |
3379 | } | 3379 | } |
3380 | 3380 | ||
3381 | #ifndef __PAGETABLE_PUD_FOLDED | 3381 | #ifndef __PAGETABLE_PUD_FOLDED |
3382 | /* | 3382 | /* |
3383 | * Allocate page upper directory. | 3383 | * Allocate page upper directory. |
3384 | * We've already handled the fast-path in-line. | 3384 | * We've already handled the fast-path in-line. |
3385 | */ | 3385 | */ |
3386 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 3386 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
3387 | { | 3387 | { |
3388 | pud_t *new = pud_alloc_one(mm, address); | 3388 | pud_t *new = pud_alloc_one(mm, address); |
3389 | if (!new) | 3389 | if (!new) |
3390 | return -ENOMEM; | 3390 | return -ENOMEM; |
3391 | 3391 | ||
3392 | smp_wmb(); /* See comment in __pte_alloc */ | 3392 | smp_wmb(); /* See comment in __pte_alloc */ |
3393 | 3393 | ||
3394 | spin_lock(&mm->page_table_lock); | 3394 | spin_lock(&mm->page_table_lock); |
3395 | if (pgd_present(*pgd)) /* Another has populated it */ | 3395 | if (pgd_present(*pgd)) /* Another has populated it */ |
3396 | pud_free(mm, new); | 3396 | pud_free(mm, new); |
3397 | else | 3397 | else |
3398 | pgd_populate(mm, pgd, new); | 3398 | pgd_populate(mm, pgd, new); |
3399 | spin_unlock(&mm->page_table_lock); | 3399 | spin_unlock(&mm->page_table_lock); |
3400 | return 0; | 3400 | return 0; |
3401 | } | 3401 | } |
3402 | #endif /* __PAGETABLE_PUD_FOLDED */ | 3402 | #endif /* __PAGETABLE_PUD_FOLDED */ |
3403 | 3403 | ||
3404 | #ifndef __PAGETABLE_PMD_FOLDED | 3404 | #ifndef __PAGETABLE_PMD_FOLDED |
3405 | /* | 3405 | /* |
3406 | * Allocate page middle directory. | 3406 | * Allocate page middle directory. |
3407 | * We've already handled the fast-path in-line. | 3407 | * We've already handled the fast-path in-line. |
3408 | */ | 3408 | */ |
3409 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 3409 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
3410 | { | 3410 | { |
3411 | pmd_t *new = pmd_alloc_one(mm, address); | 3411 | pmd_t *new = pmd_alloc_one(mm, address); |
3412 | if (!new) | 3412 | if (!new) |
3413 | return -ENOMEM; | 3413 | return -ENOMEM; |
3414 | 3414 | ||
3415 | smp_wmb(); /* See comment in __pte_alloc */ | 3415 | smp_wmb(); /* See comment in __pte_alloc */ |
3416 | 3416 | ||
3417 | spin_lock(&mm->page_table_lock); | 3417 | spin_lock(&mm->page_table_lock); |
3418 | #ifndef __ARCH_HAS_4LEVEL_HACK | 3418 | #ifndef __ARCH_HAS_4LEVEL_HACK |
3419 | if (pud_present(*pud)) /* Another has populated it */ | 3419 | if (pud_present(*pud)) /* Another has populated it */ |
3420 | pmd_free(mm, new); | 3420 | pmd_free(mm, new); |
3421 | else | 3421 | else |
3422 | pud_populate(mm, pud, new); | 3422 | pud_populate(mm, pud, new); |
3423 | #else | 3423 | #else |
3424 | if (pgd_present(*pud)) /* Another has populated it */ | 3424 | if (pgd_present(*pud)) /* Another has populated it */ |
3425 | pmd_free(mm, new); | 3425 | pmd_free(mm, new); |
3426 | else | 3426 | else |
3427 | pgd_populate(mm, pud, new); | 3427 | pgd_populate(mm, pud, new); |
3428 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 3428 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
3429 | spin_unlock(&mm->page_table_lock); | 3429 | spin_unlock(&mm->page_table_lock); |
3430 | return 0; | 3430 | return 0; |
3431 | } | 3431 | } |
3432 | #endif /* __PAGETABLE_PMD_FOLDED */ | 3432 | #endif /* __PAGETABLE_PMD_FOLDED */ |
3433 | 3433 | ||
3434 | static int __follow_pte(struct mm_struct *mm, unsigned long address, | 3434 | static int __follow_pte(struct mm_struct *mm, unsigned long address, |
3435 | pte_t **ptepp, spinlock_t **ptlp) | 3435 | pte_t **ptepp, spinlock_t **ptlp) |
3436 | { | 3436 | { |
3437 | pgd_t *pgd; | 3437 | pgd_t *pgd; |
3438 | pud_t *pud; | 3438 | pud_t *pud; |
3439 | pmd_t *pmd; | 3439 | pmd_t *pmd; |
3440 | pte_t *ptep; | 3440 | pte_t *ptep; |
3441 | 3441 | ||
3442 | pgd = pgd_offset(mm, address); | 3442 | pgd = pgd_offset(mm, address); |
3443 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 3443 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
3444 | goto out; | 3444 | goto out; |
3445 | 3445 | ||
3446 | pud = pud_offset(pgd, address); | 3446 | pud = pud_offset(pgd, address); |
3447 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 3447 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) |
3448 | goto out; | 3448 | goto out; |
3449 | 3449 | ||
3450 | pmd = pmd_offset(pud, address); | 3450 | pmd = pmd_offset(pud, address); |
3451 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 3451 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
3452 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 3452 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
3453 | goto out; | 3453 | goto out; |
3454 | 3454 | ||
3455 | /* We cannot handle huge page PFN maps. Luckily they don't exist. */ | 3455 | /* We cannot handle huge page PFN maps. Luckily they don't exist. */ |
3456 | if (pmd_huge(*pmd)) | 3456 | if (pmd_huge(*pmd)) |
3457 | goto out; | 3457 | goto out; |
3458 | 3458 | ||
3459 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); | 3459 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
3460 | if (!ptep) | 3460 | if (!ptep) |
3461 | goto out; | 3461 | goto out; |
3462 | if (!pte_present(*ptep)) | 3462 | if (!pte_present(*ptep)) |
3463 | goto unlock; | 3463 | goto unlock; |
3464 | *ptepp = ptep; | 3464 | *ptepp = ptep; |
3465 | return 0; | 3465 | return 0; |
3466 | unlock: | 3466 | unlock: |
3467 | pte_unmap_unlock(ptep, *ptlp); | 3467 | pte_unmap_unlock(ptep, *ptlp); |
3468 | out: | 3468 | out: |
3469 | return -EINVAL; | 3469 | return -EINVAL; |
3470 | } | 3470 | } |
3471 | 3471 | ||
3472 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, | 3472 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, |
3473 | pte_t **ptepp, spinlock_t **ptlp) | 3473 | pte_t **ptepp, spinlock_t **ptlp) |
3474 | { | 3474 | { |
3475 | int res; | 3475 | int res; |
3476 | 3476 | ||
3477 | /* (void) is needed to make gcc happy */ | 3477 | /* (void) is needed to make gcc happy */ |
3478 | (void) __cond_lock(*ptlp, | 3478 | (void) __cond_lock(*ptlp, |
3479 | !(res = __follow_pte(mm, address, ptepp, ptlp))); | 3479 | !(res = __follow_pte(mm, address, ptepp, ptlp))); |
3480 | return res; | 3480 | return res; |
3481 | } | 3481 | } |
3482 | 3482 | ||
3483 | /** | 3483 | /** |
3484 | * follow_pfn - look up PFN at a user virtual address | 3484 | * follow_pfn - look up PFN at a user virtual address |
3485 | * @vma: memory mapping | 3485 | * @vma: memory mapping |
3486 | * @address: user virtual address | 3486 | * @address: user virtual address |
3487 | * @pfn: location to store found PFN | 3487 | * @pfn: location to store found PFN |
3488 | * | 3488 | * |
3489 | * Only IO mappings and raw PFN mappings are allowed. | 3489 | * Only IO mappings and raw PFN mappings are allowed. |
3490 | * | 3490 | * |
3491 | * Returns zero and the pfn at @pfn on success, -ve otherwise. | 3491 | * Returns zero and the pfn at @pfn on success, -ve otherwise. |
3492 | */ | 3492 | */ |
3493 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | 3493 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, |
3494 | unsigned long *pfn) | 3494 | unsigned long *pfn) |
3495 | { | 3495 | { |
3496 | int ret = -EINVAL; | 3496 | int ret = -EINVAL; |
3497 | spinlock_t *ptl; | 3497 | spinlock_t *ptl; |
3498 | pte_t *ptep; | 3498 | pte_t *ptep; |
3499 | 3499 | ||
3500 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | 3500 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) |
3501 | return ret; | 3501 | return ret; |
3502 | 3502 | ||
3503 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); | 3503 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); |
3504 | if (ret) | 3504 | if (ret) |
3505 | return ret; | 3505 | return ret; |
3506 | *pfn = pte_pfn(*ptep); | 3506 | *pfn = pte_pfn(*ptep); |
3507 | pte_unmap_unlock(ptep, ptl); | 3507 | pte_unmap_unlock(ptep, ptl); |
3508 | return 0; | 3508 | return 0; |
3509 | } | 3509 | } |
3510 | EXPORT_SYMBOL(follow_pfn); | 3510 | EXPORT_SYMBOL(follow_pfn); |
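follow_pfn() translates a user virtual address into a raw PFN, but only for VM_IO/VM_PFNMAP mappings. For ordinary mappings there is no equivalent export; userspace can instead read /proc/<pid>/pagemap, where each page has a 64-bit entry whose bits 0-54 carry the PFN when bit 63 (present) is set. The sketch below is a hedged illustration of that pagemap interface, not of follow_pfn() itself; on recent kernels unprivileged readers see the PFN field zeroed.

/*
 * Hypothetical sketch: look up the PFN behind one of our own pages via
 * /proc/self/pagemap (a different mechanism from follow_pfn(), which only
 * serves VM_IO/VM_PFNMAP vmas). Run as root to see a non-zero PFN.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	unsigned char *buf = malloc(psz);

	buf[0] = 1;				/* fault the page in */

	int fd = open("/proc/self/pagemap", O_RDONLY);
	uint64_t entry;
	off_t off = ((uintptr_t)buf / psz) * sizeof(entry);

	if (pread(fd, &entry, sizeof(entry), off) == sizeof(entry) &&
	    (entry & (1ULL << 63)))		/* bit 63: page present */
		printf("vaddr %p -> pfn 0x%llx\n", (void *)buf,
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
	else
		printf("page not present or pagemap unreadable\n");

	close(fd);
	free(buf);
	return 0;
}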
3511 | 3511 | ||
3512 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3512 | #ifdef CONFIG_HAVE_IOREMAP_PROT |
3513 | int follow_phys(struct vm_area_struct *vma, | 3513 | int follow_phys(struct vm_area_struct *vma, |
3514 | unsigned long address, unsigned int flags, | 3514 | unsigned long address, unsigned int flags, |
3515 | unsigned long *prot, resource_size_t *phys) | 3515 | unsigned long *prot, resource_size_t *phys) |
3516 | { | 3516 | { |
3517 | int ret = -EINVAL; | 3517 | int ret = -EINVAL; |
3518 | pte_t *ptep, pte; | 3518 | pte_t *ptep, pte; |
3519 | spinlock_t *ptl; | 3519 | spinlock_t *ptl; |
3520 | 3520 | ||
3521 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | 3521 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) |
3522 | goto out; | 3522 | goto out; |
3523 | 3523 | ||
3524 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | 3524 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) |
3525 | goto out; | 3525 | goto out; |
3526 | pte = *ptep; | 3526 | pte = *ptep; |
3527 | 3527 | ||
3528 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 3528 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
3529 | goto unlock; | 3529 | goto unlock; |
3530 | 3530 | ||
3531 | *prot = pgprot_val(pte_pgprot(pte)); | 3531 | *prot = pgprot_val(pte_pgprot(pte)); |
3532 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; | 3532 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; |
3533 | 3533 | ||
3534 | ret = 0; | 3534 | ret = 0; |
3535 | unlock: | 3535 | unlock: |
3536 | pte_unmap_unlock(ptep, ptl); | 3536 | pte_unmap_unlock(ptep, ptl); |
3537 | out: | 3537 | out: |
3538 | return ret; | 3538 | return ret; |
3539 | } | 3539 | } |
3540 | 3540 | ||
3541 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | 3541 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, |
3542 | void *buf, int len, int write) | 3542 | void *buf, int len, int write) |
3543 | { | 3543 | { |
3544 | resource_size_t phys_addr; | 3544 | resource_size_t phys_addr; |
3545 | unsigned long prot = 0; | 3545 | unsigned long prot = 0; |
3546 | void __iomem *maddr; | 3546 | void __iomem *maddr; |
3547 | int offset = addr & (PAGE_SIZE-1); | 3547 | int offset = addr & (PAGE_SIZE-1); |
3548 | 3548 | ||
3549 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3549 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
3550 | return -EINVAL; | 3550 | return -EINVAL; |
3551 | 3551 | ||
3552 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | 3552 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); |
3553 | if (write) | 3553 | if (write) |
3554 | memcpy_toio(maddr + offset, buf, len); | 3554 | memcpy_toio(maddr + offset, buf, len); |
3555 | else | 3555 | else |
3556 | memcpy_fromio(buf, maddr + offset, len); | 3556 | memcpy_fromio(buf, maddr + offset, len); |
3557 | iounmap(maddr); | 3557 | iounmap(maddr); |
3558 | 3558 | ||
3559 | return len; | 3559 | return len; |
3560 | } | 3560 | } |
3561 | EXPORT_SYMBOL_GPL(generic_access_phys); | 3561 | EXPORT_SYMBOL_GPL(generic_access_phys); |
3562 | #endif | 3562 | #endif |
3563 | 3563 | ||
3564 | /* | 3564 | /* |
3565 | * Access another process' address space as given in mm. If non-NULL, use the | 3565 | * Access another process' address space as given in mm. If non-NULL, use the |
3566 | * given task for page fault accounting. | 3566 | * given task for page fault accounting. |
3567 | */ | 3567 | */ |
3568 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 3568 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
3569 | unsigned long addr, void *buf, int len, int write) | 3569 | unsigned long addr, void *buf, int len, int write) |
3570 | { | 3570 | { |
3571 | struct vm_area_struct *vma; | 3571 | struct vm_area_struct *vma; |
3572 | void *old_buf = buf; | 3572 | void *old_buf = buf; |
3573 | 3573 | ||
3574 | down_read(&mm->mmap_sem); | 3574 | down_read(&mm->mmap_sem); |
3575 | /* ignore errors, just check how much was successfully transferred */ | 3575 | /* ignore errors, just check how much was successfully transferred */ |
3576 | while (len) { | 3576 | while (len) { |
3577 | int bytes, ret, offset; | 3577 | int bytes, ret, offset; |
3578 | void *maddr; | 3578 | void *maddr; |
3579 | struct page *page = NULL; | 3579 | struct page *page = NULL; |
3580 | 3580 | ||
3581 | ret = get_user_pages(tsk, mm, addr, 1, | 3581 | ret = get_user_pages(tsk, mm, addr, 1, |
3582 | write, 1, &page, &vma); | 3582 | write, 1, &page, &vma); |
3583 | if (ret <= 0) { | 3583 | if (ret <= 0) { |
3584 | #ifndef CONFIG_HAVE_IOREMAP_PROT | 3584 | #ifndef CONFIG_HAVE_IOREMAP_PROT |
3585 | break; | 3585 | break; |
3586 | #else | 3586 | #else |
3587 | /* | 3587 | /* |
3588 | * Check if this is a VM_IO | VM_PFNMAP VMA, which | 3588 | * Check if this is a VM_IO | VM_PFNMAP VMA, which |
3589 | * we can access using slightly different code. | 3589 | * we can access using slightly different code. |
3590 | */ | 3590 | */ |
3591 | vma = find_vma(mm, addr); | 3591 | vma = find_vma(mm, addr); |
3592 | if (!vma || vma->vm_start > addr) | 3592 | if (!vma || vma->vm_start > addr) |
3593 | break; | 3593 | break; |
3594 | if (vma->vm_ops && vma->vm_ops->access) | 3594 | if (vma->vm_ops && vma->vm_ops->access) |
3595 | ret = vma->vm_ops->access(vma, addr, buf, | 3595 | ret = vma->vm_ops->access(vma, addr, buf, |
3596 | len, write); | 3596 | len, write); |
3597 | if (ret <= 0) | 3597 | if (ret <= 0) |
3598 | break; | 3598 | break; |
3599 | bytes = ret; | 3599 | bytes = ret; |
3600 | #endif | 3600 | #endif |
3601 | } else { | 3601 | } else { |
3602 | bytes = len; | 3602 | bytes = len; |
3603 | offset = addr & (PAGE_SIZE-1); | 3603 | offset = addr & (PAGE_SIZE-1); |
3604 | if (bytes > PAGE_SIZE-offset) | 3604 | if (bytes > PAGE_SIZE-offset) |
3605 | bytes = PAGE_SIZE-offset; | 3605 | bytes = PAGE_SIZE-offset; |
3606 | 3606 | ||
3607 | maddr = kmap(page); | 3607 | maddr = kmap(page); |
3608 | if (write) { | 3608 | if (write) { |
3609 | copy_to_user_page(vma, page, addr, | 3609 | copy_to_user_page(vma, page, addr, |
3610 | maddr + offset, buf, bytes); | 3610 | maddr + offset, buf, bytes); |
3611 | set_page_dirty_lock(page); | 3611 | set_page_dirty_lock(page); |
3612 | } else { | 3612 | } else { |
3613 | copy_from_user_page(vma, page, addr, | 3613 | copy_from_user_page(vma, page, addr, |
3614 | buf, maddr + offset, bytes); | 3614 | buf, maddr + offset, bytes); |
3615 | } | 3615 | } |
3616 | kunmap(page); | 3616 | kunmap(page); |
3617 | page_cache_release(page); | 3617 | page_cache_release(page); |
3618 | } | 3618 | } |
3619 | len -= bytes; | 3619 | len -= bytes; |
3620 | buf += bytes; | 3620 | buf += bytes; |
3621 | addr += bytes; | 3621 | addr += bytes; |
3622 | } | 3622 | } |
3623 | up_read(&mm->mmap_sem); | 3623 | up_read(&mm->mmap_sem); |
3624 | 3624 | ||
3625 | return buf - old_buf; | 3625 | return buf - old_buf; |
3626 | } | 3626 | } |
3627 | 3627 | ||
3628 | /** | 3628 | /** |
3629 | * access_remote_vm - access another process' address space | 3629 | * access_remote_vm - access another process' address space |
3630 | * @mm: the mm_struct of the target address space | 3630 | * @mm: the mm_struct of the target address space |
3631 | * @addr: start address to access | 3631 | * @addr: start address to access |
3632 | * @buf: source or destination buffer | 3632 | * @buf: source or destination buffer |
3633 | * @len: number of bytes to transfer | 3633 | * @len: number of bytes to transfer |
3634 | * @write: whether the access is a write | 3634 | * @write: whether the access is a write |
3635 | * | 3635 | * |
3636 | * The caller must hold a reference on @mm. | 3636 | * The caller must hold a reference on @mm. |
3637 | */ | 3637 | */ |
3638 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | 3638 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, |
3639 | void *buf, int len, int write) | 3639 | void *buf, int len, int write) |
3640 | { | 3640 | { |
3641 | return __access_remote_vm(NULL, mm, addr, buf, len, write); | 3641 | return __access_remote_vm(NULL, mm, addr, buf, len, write); |
3642 | } | 3642 | } |
3643 | 3643 | ||
3644 | /* | 3644 | /* |
3645 | * Access another process' address space. | 3645 | * Access another process' address space. |
3646 | * Source/target buffer must be kernel space, | 3646 | * Source/target buffer must be kernel space, |
3647 | * Do not walk the page table directly, use get_user_pages | 3647 | * Do not walk the page table directly, use get_user_pages |
3648 | */ | 3648 | */ |
3649 | int access_process_vm(struct task_struct *tsk, unsigned long addr, | 3649 | int access_process_vm(struct task_struct *tsk, unsigned long addr, |
3650 | void *buf, int len, int write) | 3650 | void *buf, int len, int write) |
3651 | { | 3651 | { |
3652 | struct mm_struct *mm; | 3652 | struct mm_struct *mm; |
3653 | int ret; | 3653 | int ret; |
3654 | 3654 | ||
3655 | mm = get_task_mm(tsk); | 3655 | mm = get_task_mm(tsk); |
3656 | if (!mm) | 3656 | if (!mm) |
3657 | return 0; | 3657 | return 0; |
3658 | 3658 | ||
3659 | ret = __access_remote_vm(tsk, mm, addr, buf, len, write); | 3659 | ret = __access_remote_vm(tsk, mm, addr, buf, len, write); |
3660 | mmput(mm); | 3660 | mmput(mm); |
3661 | 3661 | ||
3662 | return ret; | 3662 | return ret; |
3663 | } | 3663 | } |
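access_process_vm() is the helper behind ptrace-style peeks and pokes and /proc/<pid>/mem: pin the target mm with get_task_mm(), copy through __access_remote_vm(), then drop the reference. Its closest userspace-facing relative is the process_vm_readv()/process_vm_writev() syscall pair. A hedged sketch follows; the pid and remote address would normally come from elsewhere, so the demo simply reads its own memory, and like the kernel helper the call may return a short count.

/*
 * Hypothetical sketch: copy memory across address spaces with
 * process_vm_readv(), the userspace relative of access_process_vm().
 * Real cross-process use needs ptrace-level permission (same uid or
 * CAP_SYS_PTRACE); reading our own process keeps the demo self-contained.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t read_remote(pid_t pid, void *remote_addr, void *out, size_t len)
{
	struct iovec local  = { .iov_base = out,         .iov_len = len };
	struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

	return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}

int main(void)
{
	char src[] = "hello across address spaces";
	char dst[sizeof(src)] = "";

	if (read_remote(getpid(), src, dst, sizeof(src)) > 0)
		printf("read back: %s\n", dst);
	else
		perror("process_vm_readv");
	return 0;
}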
3664 | 3664 | ||
3665 | /* | 3665 | /* |
3666 | * Print the name of a VMA. | 3666 | * Print the name of a VMA. |
3667 | */ | 3667 | */ |
3668 | void print_vma_addr(char *prefix, unsigned long ip) | 3668 | void print_vma_addr(char *prefix, unsigned long ip) |
3669 | { | 3669 | { |
3670 | struct mm_struct *mm = current->mm; | 3670 | struct mm_struct *mm = current->mm; |
3671 | struct vm_area_struct *vma; | 3671 | struct vm_area_struct *vma; |
3672 | 3672 | ||
3673 | /* | 3673 | /* |
3674 | * Do not print if we are in atomic | 3674 | * Do not print if we are in atomic |
3675 | * contexts (in exception stacks, etc.): | 3675 | * contexts (in exception stacks, etc.): |
3676 | */ | 3676 | */ |
3677 | if (preempt_count()) | 3677 | if (preempt_count()) |
3678 | return; | 3678 | return; |
3679 | 3679 | ||
3680 | down_read(&mm->mmap_sem); | 3680 | down_read(&mm->mmap_sem); |
3681 | vma = find_vma(mm, ip); | 3681 | vma = find_vma(mm, ip); |
3682 | if (vma && vma->vm_file) { | 3682 | if (vma && vma->vm_file) { |
3683 | struct file *f = vma->vm_file; | 3683 | struct file *f = vma->vm_file; |
3684 | char *buf = (char *)__get_free_page(GFP_KERNEL); | 3684 | char *buf = (char *)__get_free_page(GFP_KERNEL); |
3685 | if (buf) { | 3685 | if (buf) { |
3686 | char *p; | 3686 | char *p; |
3687 | 3687 | ||
3688 | p = d_path(&f->f_path, buf, PAGE_SIZE); | 3688 | p = d_path(&f->f_path, buf, PAGE_SIZE); |
3689 | if (IS_ERR(p)) | 3689 | if (IS_ERR(p)) |
3690 | p = "?"; | 3690 | p = "?"; |
3691 | printk("%s%s[%lx+%lx]", prefix, kbasename(p), | 3691 | printk("%s%s[%lx+%lx]", prefix, kbasename(p), |
3692 | vma->vm_start, | 3692 | vma->vm_start, |
3693 | vma->vm_end - vma->vm_start); | 3693 | vma->vm_end - vma->vm_start); |
3694 | free_page((unsigned long)buf); | 3694 | free_page((unsigned long)buf); |
3695 | } | 3695 | } |
3696 | } | 3696 | } |
3697 | up_read(&mm->mmap_sem); | 3697 | up_read(&mm->mmap_sem); |
3698 | } | 3698 | } |
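print_vma_addr() maps an instruction pointer back to "file[start+len]" via find_vma() and d_path(), which is how fault and signal reports name the offending binary. A hedged userspace analogue (an illustration, not this kernel routine) is to scan /proc/self/maps for the region containing the address:

/*
 * Hypothetical sketch: userspace analogue of print_vma_addr() -- find the
 * file-backed mapping in /proc/self/maps that contains a given address.
 */
#include <stdint.h>
#include <stdio.h>

static void print_vma_of(uintptr_t ip)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[512];

	while (f && fgets(line, sizeof(line), f)) {
		unsigned long start, end;
		char path[256] = "";

		/* each line: start-end perms offset dev inode path */
		if (sscanf(line, "%lx-%lx %*s %*s %*s %*s %255s",
			   &start, &end, path) < 2)
			continue;
		if (ip >= start && ip < end) {
			printf("%s[%lx+%lx]\n", path[0] ? path : "?",
			       start, end - start);
			break;
		}
	}
	if (f)
		fclose(f);
}

int main(void)
{
	print_vma_of((uintptr_t)&print_vma_of);	/* resolves our own .text */
	return 0;
}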
3699 | 3699 | ||
3700 | #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) | 3700 | #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) |
3701 | void might_fault(void) | 3701 | void might_fault(void) |
3702 | { | 3702 | { |
3703 | /* | 3703 | /* |
3704 | * Some code (nfs/sunrpc) uses socket ops on kernel memory while | 3704 | * Some code (nfs/sunrpc) uses socket ops on kernel memory while |
3705 | * holding the mmap_sem, this is safe because kernel memory doesn't | 3705 | * holding the mmap_sem, this is safe because kernel memory doesn't |
3706 | * get paged out, therefore we'll never actually fault, and the | 3706 | * get paged out, therefore we'll never actually fault, and the |
3707 | * below annotations will generate false positives. | 3707 | * below annotations will generate false positives. |
3708 | */ | 3708 | */ |
3709 | if (segment_eq(get_fs(), KERNEL_DS)) | 3709 | if (segment_eq(get_fs(), KERNEL_DS)) |
3710 | return; | 3710 | return; |
3711 | 3711 | ||
3712 | /* | 3712 | /* |
3713 | * it would be nicer only to annotate paths which are not under | 3713 | * it would be nicer only to annotate paths which are not under |
3714 | * pagefault_disable, however that requires a larger audit and | 3714 | * pagefault_disable, however that requires a larger audit and |
3715 | * providing helpers like get_user_atomic. | 3715 | * providing helpers like get_user_atomic. |
3716 | */ | 3716 | */ |
3717 | if (in_atomic()) | 3717 | if (in_atomic()) |
3718 | return; | 3718 | return; |
3719 | 3719 | ||
3720 | __might_sleep(__FILE__, __LINE__, 0); | 3720 | __might_sleep(__FILE__, __LINE__, 0); |
3721 | 3721 | ||
3722 | if (current->mm) | 3722 | if (current->mm) |
3723 | might_lock_read(¤t->mm->mmap_sem); | 3723 | might_lock_read(¤t->mm->mmap_sem); |
3724 | } | 3724 | } |
3725 | EXPORT_SYMBOL(might_fault); | 3725 | EXPORT_SYMBOL(might_fault); |
3726 | #endif | 3726 | #endif |
3727 | 3727 | ||
3728 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | 3728 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) |
3729 | static void clear_gigantic_page(struct page *page, | 3729 | static void clear_gigantic_page(struct page *page, |
3730 | unsigned long addr, | 3730 | unsigned long addr, |
3731 | unsigned int pages_per_huge_page) | 3731 | unsigned int pages_per_huge_page) |
3732 | { | 3732 | { |
3733 | int i; | 3733 | int i; |
3734 | struct page *p = page; | 3734 | struct page *p = page; |
3735 | 3735 | ||
3736 | might_sleep(); | 3736 | might_sleep(); |
3737 | for (i = 0; i < pages_per_huge_page; | 3737 | for (i = 0; i < pages_per_huge_page; |
3738 | i++, p = mem_map_next(p, page, i)) { | 3738 | i++, p = mem_map_next(p, page, i)) { |
3739 | cond_resched(); | 3739 | cond_resched(); |
3740 | clear_user_highpage(p, addr + i * PAGE_SIZE); | 3740 | clear_user_highpage(p, addr + i * PAGE_SIZE); |
3741 | } | 3741 | } |
3742 | } | 3742 | } |
3743 | void clear_huge_page(struct page *page, | 3743 | void clear_huge_page(struct page *page, |
3744 | unsigned long addr, unsigned int pages_per_huge_page) | 3744 | unsigned long addr, unsigned int pages_per_huge_page) |
3745 | { | 3745 | { |
3746 | int i; | 3746 | int i; |
3747 | 3747 | ||
3748 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | 3748 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { |
3749 | clear_gigantic_page(page, addr, pages_per_huge_page); | 3749 | clear_gigantic_page(page, addr, pages_per_huge_page); |
3750 | return; | 3750 | return; |
3751 | } | 3751 | } |
3752 | 3752 | ||
3753 | might_sleep(); | 3753 | might_sleep(); |
3754 | for (i = 0; i < pages_per_huge_page; i++) { | 3754 | for (i = 0; i < pages_per_huge_page; i++) { |
3755 | cond_resched(); | 3755 | cond_resched(); |
3756 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 3756 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); |
3757 | } | 3757 | } |
3758 | } | 3758 | } |
3759 | 3759 | ||
3760 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | 3760 | static void copy_user_gigantic_page(struct page *dst, struct page *src, |
3761 | unsigned long addr, | 3761 | unsigned long addr, |
3762 | struct vm_area_struct *vma, | 3762 | struct vm_area_struct *vma, |
3763 | unsigned int pages_per_huge_page) | 3763 | unsigned int pages_per_huge_page) |
3764 | { | 3764 | { |
3765 | int i; | 3765 | int i; |
3766 | struct page *dst_base = dst; | 3766 | struct page *dst_base = dst; |
3767 | struct page *src_base = src; | 3767 | struct page *src_base = src; |
3768 | 3768 | ||
3769 | for (i = 0; i < pages_per_huge_page; ) { | 3769 | for (i = 0; i < pages_per_huge_page; ) { |
3770 | cond_resched(); | 3770 | cond_resched(); |
3771 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | 3771 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); |
3772 | 3772 | ||
3773 | i++; | 3773 | i++; |
3774 | dst = mem_map_next(dst, dst_base, i); | 3774 | dst = mem_map_next(dst, dst_base, i); |
3775 | src = mem_map_next(src, src_base, i); | 3775 | src = mem_map_next(src, src_base, i); |
3776 | } | 3776 | } |
3777 | } | 3777 | } |
3778 | 3778 | ||
3779 | void copy_user_huge_page(struct page *dst, struct page *src, | 3779 | void copy_user_huge_page(struct page *dst, struct page *src, |
3780 | unsigned long addr, struct vm_area_struct *vma, | 3780 | unsigned long addr, struct vm_area_struct *vma, |
3781 | unsigned int pages_per_huge_page) | 3781 | unsigned int pages_per_huge_page) |
3782 | { | 3782 | { |
3783 | int i; | 3783 | int i; |
3784 | 3784 | ||
3785 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | 3785 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { |
3786 | copy_user_gigantic_page(dst, src, addr, vma, | 3786 | copy_user_gigantic_page(dst, src, addr, vma, |
3787 | pages_per_huge_page); | 3787 | pages_per_huge_page); |
3788 | return; | 3788 | return; |
3789 | } | 3789 | } |
3790 | 3790 | ||
3791 | might_sleep(); | 3791 | might_sleep(); |
3792 | for (i = 0; i < pages_per_huge_page; i++) { | 3792 | for (i = 0; i < pages_per_huge_page; i++) { |
3793 | cond_resched(); | 3793 | cond_resched(); |
3794 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | 3794 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
3795 | } | 3795 | } |
3796 | } | 3796 | } |
3797 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 3797 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
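clear_huge_page() and copy_user_huge_page() work one base page at a time with cond_resched() in between, so zeroing or copying a 2 MiB or larger page does not monopolise the CPU; the gigantic variants additionally step with mem_map_next() because such pages may span discontiguous sections of the mem_map. A hedged userspace sketch that drives the hugetlb fault path (it assumes the default 2 MiB huge page size and at least one page reserved, e.g. via /proc/sys/vm/nr_hugepages):

/*
 * Hypothetical sketch: map and touch one anonymous hugetlb page; the first
 * store faults it in and the hugetlb fault path zeroes it base page by
 * base page. Fails with ENOMEM unless huge pages were reserved beforehand.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0x5a, HPAGE_SIZE);	/* touch every byte of the huge page */
	printf("huge page mapped and touched at %p\n", p);
	munmap(p, HPAGE_SIZE);
	return 0;
}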
3798 | 3798 | ||
3799 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 3799 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
3800 | 3800 | ||
3801 | static struct kmem_cache *page_ptl_cachep; | 3801 | static struct kmem_cache *page_ptl_cachep; |
3802 | 3802 | ||
3803 | void __init ptlock_cache_init(void) | 3803 | void __init ptlock_cache_init(void) |
3804 | { | 3804 | { |
3805 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | 3805 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, |
3806 | SLAB_PANIC, NULL); | 3806 | SLAB_PANIC, NULL); |
3807 | } | 3807 | } |
3808 | 3808 | ||
3809 | bool ptlock_alloc(struct page *page) | 3809 | bool ptlock_alloc(struct page *page) |
3810 | { | 3810 | { |
3811 | spinlock_t *ptl; | 3811 | spinlock_t *ptl; |
3812 | 3812 | ||
3813 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); | 3813 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
3814 | if (!ptl) | 3814 | if (!ptl) |
3815 | return false; | 3815 | return false; |
3816 | page->ptl = ptl; | 3816 | page->ptl = ptl; |
3817 | return true; | 3817 | return true; |
3818 | } | 3818 | } |
3819 | 3819 | ||
3820 | void ptlock_free(struct page *page) | 3820 | void ptlock_free(struct page *page) |
3821 | { | 3821 | { |
3822 | kmem_cache_free(page_ptl_cachep, page->ptl); | 3822 | kmem_cache_free(page_ptl_cachep, page->ptl); |
3823 | } | 3823 | } |
3824 | #endif | 3824 | #endif |
3825 | 3825 |
mm/slab.c
1 | /* | 1 | /* |
2 | * linux/mm/slab.c | 2 | * linux/mm/slab.c |
3 | * Written by Mark Hemment, 1996/97. | 3 | * Written by Mark Hemment, 1996/97. |
4 | * (markhe@nextd.demon.co.uk) | 4 | * (markhe@nextd.demon.co.uk) |
5 | * | 5 | * |
6 | * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli | 6 | * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli |
7 | * | 7 | * |
8 | * Major cleanup, different bufctl logic, per-cpu arrays | 8 | * Major cleanup, different bufctl logic, per-cpu arrays |
9 | * (c) 2000 Manfred Spraul | 9 | * (c) 2000 Manfred Spraul |
10 | * | 10 | * |
11 | * Cleanup, make the head arrays unconditional, preparation for NUMA | 11 | * Cleanup, make the head arrays unconditional, preparation for NUMA |
12 | * (c) 2002 Manfred Spraul | 12 | * (c) 2002 Manfred Spraul |
13 | * | 13 | * |
14 | * An implementation of the Slab Allocator as described in outline in: | 14 | * An implementation of the Slab Allocator as described in outline in: |
15 | * UNIX Internals: The New Frontiers by Uresh Vahalia | 15 | * UNIX Internals: The New Frontiers by Uresh Vahalia |
16 | * Pub: Prentice Hall ISBN 0-13-101908-2 | 16 | * Pub: Prentice Hall ISBN 0-13-101908-2 |
17 | * or with a little more detail in: | 17 | * or with a little more detail in: |
18 | * The Slab Allocator: An Object-Caching Kernel Memory Allocator | 18 | * The Slab Allocator: An Object-Caching Kernel Memory Allocator |
19 | * Jeff Bonwick (Sun Microsystems). | 19 | * Jeff Bonwick (Sun Microsystems). |
20 | * Presented at: USENIX Summer 1994 Technical Conference | 20 | * Presented at: USENIX Summer 1994 Technical Conference |
21 | * | 21 | * |
22 | * The memory is organized in caches, one cache for each object type. | 22 | * The memory is organized in caches, one cache for each object type. |
23 | * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) | 23 | * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) |
24 | * Each cache consists of many slabs (they are small (usually one | 24 | * Each cache consists of many slabs (they are small (usually one |
25 | * page long) and always contiguous), and each slab contains multiple | 25 | * page long) and always contiguous), and each slab contains multiple |
26 | * initialized objects. | 26 | * initialized objects. |
27 | * | 27 | * |
28 | * This means, that your constructor is used only for newly allocated | 28 | * This means, that your constructor is used only for newly allocated |
29 | * slabs and you must pass objects with the same initializations to | 29 | * slabs and you must pass objects with the same initializations to |
30 | * kmem_cache_free. | 30 | * kmem_cache_free. |
31 | * | 31 | * |
32 | * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, | 32 | * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, |
33 | * normal). If you need a special memory type, then you must create a new | 33 | * normal). If you need a special memory type, then you must create a new |
34 | * cache for that memory type. | 34 | * cache for that memory type. |
35 | * | 35 | * |
36 | * In order to reduce fragmentation, the slabs are sorted in 3 groups: | 36 | * In order to reduce fragmentation, the slabs are sorted in 3 groups: |
37 | * full slabs with 0 free objects | 37 | * full slabs with 0 free objects |
38 | * partial slabs | 38 | * partial slabs |
39 | * empty slabs with no allocated objects | 39 | * empty slabs with no allocated objects |
40 | * | 40 | * |
41 | * If partial slabs exist, then new allocations come from these slabs, | 41 | * If partial slabs exist, then new allocations come from these slabs, |
42 | * otherwise they come from empty slabs, or new slabs are allocated. | 42 | * otherwise they come from empty slabs, or new slabs are allocated. |
43 | * | 43 | * |
44 | * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache | 44 | * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache |
45 | * during kmem_cache_destroy(). The caller must prevent concurrent allocs. | 45 | * during kmem_cache_destroy(). The caller must prevent concurrent allocs. |
46 | * | 46 | * |
47 | * Each cache has a short per-cpu head array, most allocs | 47 | * Each cache has a short per-cpu head array, most allocs |
48 | * and frees go into that array, and if that array overflows, then 1/2 | 48 | * and frees go into that array, and if that array overflows, then 1/2 |
49 | * of the entries in the array are given back into the global cache. | 49 | * of the entries in the array are given back into the global cache. |
50 | * The head array is strictly LIFO and should improve the cache hit rates. | 50 | * The head array is strictly LIFO and should improve the cache hit rates. |
51 | * On SMP, it additionally reduces the spinlock operations. | 51 | * On SMP, it additionally reduces the spinlock operations. |
52 | * | 52 | * |
53 | * The c_cpuarray may not be read with enabled local interrupts - | 53 | * The c_cpuarray may not be read with enabled local interrupts - |
54 | * it's changed with a smp_call_function(). | 54 | * it's changed with a smp_call_function(). |
55 | * | 55 | * |
56 | * SMP synchronization: | 56 | * SMP synchronization: |
57 | * constructors and destructors are called without any locking. | 57 | * constructors and destructors are called without any locking. |
58 | * Several members in struct kmem_cache and struct slab never change, they | 58 | * Several members in struct kmem_cache and struct slab never change, they |
59 | * are accessed without any locking. | 59 | * are accessed without any locking. |
60 | * The per-cpu arrays are never accessed from the wrong cpu, so no locking | 60 | * The per-cpu arrays are never accessed from the wrong cpu, so no locking |
61 | * is needed, and local interrupts are disabled, so slab code is preempt-safe. | 61 | * is needed, and local interrupts are disabled, so slab code is preempt-safe. |
62 | * The non-constant members are protected with a per-cache irq spinlock. | 62 | * The non-constant members are protected with a per-cache irq spinlock. |
63 | * | 63 | * |
64 | * Many thanks to Mark Hemment, who wrote another per-cpu slab patch | 64 | * Many thanks to Mark Hemment, who wrote another per-cpu slab patch |
65 | * in 2000 - many ideas in the current implementation are derived from | 65 | * in 2000 - many ideas in the current implementation are derived from |
66 | * his patch. | 66 | * his patch. |
67 | * | 67 | * |
68 | * Further notes from the original documentation: | 68 | * Further notes from the original documentation: |
69 | * | 69 | * |
70 | * 11 April '97. Started multi-threading - markhe | 70 | * 11 April '97. Started multi-threading - markhe |
71 | * The global cache-chain is protected by the mutex 'slab_mutex'. | 71 | * The global cache-chain is protected by the mutex 'slab_mutex'. |
72 | * The mutex is only needed when accessing/extending the cache-chain, which | 72 | * The mutex is only needed when accessing/extending the cache-chain, which |
73 | * can never happen inside an interrupt (kmem_cache_create(), | 73 | * can never happen inside an interrupt (kmem_cache_create(), |
74 | * kmem_cache_shrink() and kmem_cache_reap()). | 74 | * kmem_cache_shrink() and kmem_cache_reap()). |
75 | * | 75 | * |
76 | * At present, each engine can be growing a cache. This should be blocked. | 76 | * At present, each engine can be growing a cache. This should be blocked. |
77 | * | 77 | * |
78 | * 15 March 2005. NUMA slab allocator. | 78 | * 15 March 2005. NUMA slab allocator. |
79 | * Shai Fultheim <shai@scalex86.org>. | 79 | * Shai Fultheim <shai@scalex86.org>. |
80 | * Shobhit Dayal <shobhit@calsoftinc.com> | 80 | * Shobhit Dayal <shobhit@calsoftinc.com> |
81 | * Alok N Kataria <alokk@calsoftinc.com> | 81 | * Alok N Kataria <alokk@calsoftinc.com> |
82 | * Christoph Lameter <christoph@lameter.com> | 82 | * Christoph Lameter <christoph@lameter.com> |
83 | * | 83 | * |
84 | * Modified the slab allocator to be node aware on NUMA systems. | 84 | * Modified the slab allocator to be node aware on NUMA systems. |
85 | * Each node has its own list of partial, free and full slabs. | 85 | * Each node has its own list of partial, free and full slabs. |
86 | * All object allocations for a node occur from node specific slab lists. | 86 | * All object allocations for a node occur from node specific slab lists. |
87 | */ | 87 | */ |
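As a concrete illustration of the one-cache-per-object-type model sketched in the comment above, a typical client creates a cache once and then allocates and frees objects through it. A minimal sketch follows; struct foo, foo_ctor and the cache name are invented for the example:

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/errno.h>

struct foo {
	int refcount;
	struct list_head link;
};

static struct kmem_cache *foo_cachep;

/* Runs only when a fresh slab is populated; freed objects must be
 * returned in this same initialized state (see the comment above). */
static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	f->refcount = 0;
	INIT_LIST_HEAD(&f->link);
}

static int foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, foo_ctor);
	return foo_cachep ? 0 : -ENOMEM;
}

Allocation and release then go through kmem_cache_alloc(foo_cachep, GFP_KERNEL) and kmem_cache_free(foo_cachep, f).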
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include <linux/mm.h> | 90 | #include <linux/mm.h> |
91 | #include <linux/poison.h> | 91 | #include <linux/poison.h> |
92 | #include <linux/swap.h> | 92 | #include <linux/swap.h> |
93 | #include <linux/cache.h> | 93 | #include <linux/cache.h> |
94 | #include <linux/interrupt.h> | 94 | #include <linux/interrupt.h> |
95 | #include <linux/init.h> | 95 | #include <linux/init.h> |
96 | #include <linux/compiler.h> | 96 | #include <linux/compiler.h> |
97 | #include <linux/cpuset.h> | 97 | #include <linux/cpuset.h> |
98 | #include <linux/proc_fs.h> | 98 | #include <linux/proc_fs.h> |
99 | #include <linux/seq_file.h> | 99 | #include <linux/seq_file.h> |
100 | #include <linux/notifier.h> | 100 | #include <linux/notifier.h> |
101 | #include <linux/kallsyms.h> | 101 | #include <linux/kallsyms.h> |
102 | #include <linux/cpu.h> | 102 | #include <linux/cpu.h> |
103 | #include <linux/sysctl.h> | 103 | #include <linux/sysctl.h> |
104 | #include <linux/module.h> | 104 | #include <linux/module.h> |
105 | #include <linux/rcupdate.h> | 105 | #include <linux/rcupdate.h> |
106 | #include <linux/string.h> | 106 | #include <linux/string.h> |
107 | #include <linux/uaccess.h> | 107 | #include <linux/uaccess.h> |
108 | #include <linux/nodemask.h> | 108 | #include <linux/nodemask.h> |
109 | #include <linux/kmemleak.h> | 109 | #include <linux/kmemleak.h> |
110 | #include <linux/mempolicy.h> | 110 | #include <linux/mempolicy.h> |
111 | #include <linux/mutex.h> | 111 | #include <linux/mutex.h> |
112 | #include <linux/fault-inject.h> | 112 | #include <linux/fault-inject.h> |
113 | #include <linux/rtmutex.h> | 113 | #include <linux/rtmutex.h> |
114 | #include <linux/reciprocal_div.h> | 114 | #include <linux/reciprocal_div.h> |
115 | #include <linux/debugobjects.h> | 115 | #include <linux/debugobjects.h> |
116 | #include <linux/kmemcheck.h> | 116 | #include <linux/kmemcheck.h> |
117 | #include <linux/memory.h> | 117 | #include <linux/memory.h> |
118 | #include <linux/prefetch.h> | 118 | #include <linux/prefetch.h> |
119 | 119 | ||
120 | #include <net/sock.h> | 120 | #include <net/sock.h> |
121 | 121 | ||
122 | #include <asm/cacheflush.h> | 122 | #include <asm/cacheflush.h> |
123 | #include <asm/tlbflush.h> | 123 | #include <asm/tlbflush.h> |
124 | #include <asm/page.h> | 124 | #include <asm/page.h> |
125 | 125 | ||
126 | #include <trace/events/kmem.h> | 126 | #include <trace/events/kmem.h> |
127 | 127 | ||
128 | #include "internal.h" | 128 | #include "internal.h" |
129 | 129 | ||
130 | #include "slab.h" | 130 | #include "slab.h" |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 133 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
134 | * 0 for faster, smaller code (especially in the critical paths). | 134 | * 0 for faster, smaller code (especially in the critical paths). |
135 | * | 135 | * |
136 | * STATS - 1 to collect stats for /proc/slabinfo. | 136 | * STATS - 1 to collect stats for /proc/slabinfo. |
137 | * 0 for faster, smaller code (especially in the critical paths). | 137 | * 0 for faster, smaller code (especially in the critical paths). |
138 | * | 138 | * |
139 | * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) | 139 | * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) |
140 | */ | 140 | */ |
141 | 141 | ||
142 | #ifdef CONFIG_DEBUG_SLAB | 142 | #ifdef CONFIG_DEBUG_SLAB |
143 | #define DEBUG 1 | 143 | #define DEBUG 1 |
144 | #define STATS 1 | 144 | #define STATS 1 |
145 | #define FORCED_DEBUG 1 | 145 | #define FORCED_DEBUG 1 |
146 | #else | 146 | #else |
147 | #define DEBUG 0 | 147 | #define DEBUG 0 |
148 | #define STATS 0 | 148 | #define STATS 0 |
149 | #define FORCED_DEBUG 0 | 149 | #define FORCED_DEBUG 0 |
150 | #endif | 150 | #endif |
151 | 151 | ||
152 | /* Shouldn't this be in a header file somewhere? */ | 152 | /* Shouldn't this be in a header file somewhere? */ |
153 | #define BYTES_PER_WORD sizeof(void *) | 153 | #define BYTES_PER_WORD sizeof(void *) |
154 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) | 154 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) |
155 | 155 | ||
156 | #ifndef ARCH_KMALLOC_FLAGS | 156 | #ifndef ARCH_KMALLOC_FLAGS |
157 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 157 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
158 | #endif | 158 | #endif |
159 | 159 | ||
160 | #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ | 160 | #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ |
161 | <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) | 161 | <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) |
162 | 162 | ||
163 | #if FREELIST_BYTE_INDEX | 163 | #if FREELIST_BYTE_INDEX |
164 | typedef unsigned char freelist_idx_t; | 164 | typedef unsigned char freelist_idx_t; |
165 | #else | 165 | #else |
166 | typedef unsigned short freelist_idx_t; | 166 | typedef unsigned short freelist_idx_t; |
167 | #endif | 167 | #endif |
168 | 168 | ||
169 | #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) | 169 | #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) |
170 | 170 | ||
171 | /* | 171 | /* |
172 | * true if a page was allocated from pfmemalloc reserves for network-based | 172 | * true if a page was allocated from pfmemalloc reserves for network-based |
173 | * swap | 173 | * swap |
174 | */ | 174 | */ |
175 | static bool pfmemalloc_active __read_mostly; | 175 | static bool pfmemalloc_active __read_mostly; |
176 | 176 | ||
177 | /* | 177 | /* |
178 | * struct array_cache | 178 | * struct array_cache |
179 | * | 179 | * |
180 | * Purpose: | 180 | * Purpose: |
181 | * - LIFO ordering, to hand out cache-warm objects from _alloc | 181 | * - LIFO ordering, to hand out cache-warm objects from _alloc |
182 | * - reduce the number of linked list operations | 182 | * - reduce the number of linked list operations |
183 | * - reduce spinlock operations | 183 | * - reduce spinlock operations |
184 | * | 184 | * |
185 | * The limit is stored in the per-cpu structure to reduce the data cache | 185 | * The limit is stored in the per-cpu structure to reduce the data cache |
186 | * footprint. | 186 | * footprint. |
187 | * | 187 | * |
188 | */ | 188 | */ |
189 | struct array_cache { | 189 | struct array_cache { |
190 | unsigned int avail; | 190 | unsigned int avail; |
191 | unsigned int limit; | 191 | unsigned int limit; |
192 | unsigned int batchcount; | 192 | unsigned int batchcount; |
193 | unsigned int touched; | 193 | unsigned int touched; |
194 | void *entry[]; /* | 194 | void *entry[]; /* |
195 | * Must have this definition in here for the proper | 195 | * Must have this definition in here for the proper |
196 | * alignment of array_cache. Also simplifies accessing | 196 | * alignment of array_cache. Also simplifies accessing |
197 | * the entries. | 197 | * the entries. |
198 | * | 198 | * |
199 | * Entries should not be directly dereferenced as | 199 | * Entries should not be directly dereferenced as |
200 | * entries belonging to slabs marked pfmemalloc will | 200 | * entries belonging to slabs marked pfmemalloc will |
201 | * have the low bit set to SLAB_OBJ_PFMEMALLOC | 201 | * have the low bit set to SLAB_OBJ_PFMEMALLOC |
202 | */ | 202 | */ |
203 | }; | 203 | }; |
204 | 204 | ||
205 | struct alien_cache { | 205 | struct alien_cache { |
206 | spinlock_t lock; | 206 | spinlock_t lock; |
207 | struct array_cache ac; | 207 | struct array_cache ac; |
208 | }; | 208 | }; |
209 | 209 | ||
210 | #define SLAB_OBJ_PFMEMALLOC 1 | 210 | #define SLAB_OBJ_PFMEMALLOC 1 |
211 | static inline bool is_obj_pfmemalloc(void *objp) | 211 | static inline bool is_obj_pfmemalloc(void *objp) |
212 | { | 212 | { |
213 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | 213 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; |
214 | } | 214 | } |
215 | 215 | ||
216 | static inline void set_obj_pfmemalloc(void **objp) | 216 | static inline void set_obj_pfmemalloc(void **objp) |
217 | { | 217 | { |
218 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | 218 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); |
219 | return; | 219 | return; |
220 | } | 220 | } |
221 | 221 | ||
222 | static inline void clear_obj_pfmemalloc(void **objp) | 222 | static inline void clear_obj_pfmemalloc(void **objp) |
223 | { | 223 | { |
224 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | 224 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); |
225 | } | 225 | } |
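These three helpers rely on slab objects being at least word aligned, so the low pointer bit is normally zero and can be borrowed as a "came from a pfmemalloc slab" flag. A standalone sketch of the same low-bit tagging trick (plain C; the names are invented for the example):

#include <stdbool.h>
#include <stdint.h>

#define PTR_TAG	1UL	/* usable because allocations are word aligned */

static inline void *tag_ptr(void *p)
{
	return (void *)((uintptr_t)p | PTR_TAG);
}

static inline bool ptr_is_tagged(const void *p)
{
	return (uintptr_t)p & PTR_TAG;
}

static inline void *untag_ptr(void *p)
{
	return (void *)((uintptr_t)p & ~PTR_TAG);
}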
226 | 226 | ||
227 | /* | 227 | /* |
228 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 228 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
229 | * cpuarrays are allocated from the generic caches... | 229 | * cpuarrays are allocated from the generic caches... |
230 | */ | 230 | */ |
231 | #define BOOT_CPUCACHE_ENTRIES 1 | 231 | #define BOOT_CPUCACHE_ENTRIES 1 |
232 | struct arraycache_init { | 232 | struct arraycache_init { |
233 | struct array_cache cache; | 233 | struct array_cache cache; |
234 | void *entries[BOOT_CPUCACHE_ENTRIES]; | 234 | void *entries[BOOT_CPUCACHE_ENTRIES]; |
235 | }; | 235 | }; |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * Need this for bootstrapping a per node allocator. | 238 | * Need this for bootstrapping a per node allocator. |
239 | */ | 239 | */ |
240 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) | 240 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) |
241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; | 241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; |
242 | #define CACHE_CACHE 0 | 242 | #define CACHE_CACHE 0 |
243 | #define SIZE_AC MAX_NUMNODES | 243 | #define SIZE_AC MAX_NUMNODES |
244 | #define SIZE_NODE (2 * MAX_NUMNODES) | 244 | #define SIZE_NODE (2 * MAX_NUMNODES) |
245 | 245 | ||
246 | static int drain_freelist(struct kmem_cache *cache, | 246 | static int drain_freelist(struct kmem_cache *cache, |
247 | struct kmem_cache_node *n, int tofree); | 247 | struct kmem_cache_node *n, int tofree); |
248 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 248 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
249 | int node, struct list_head *list); | 249 | int node, struct list_head *list); |
250 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); | 250 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); |
251 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); | 251 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); |
252 | static void cache_reap(struct work_struct *unused); | 252 | static void cache_reap(struct work_struct *unused); |
253 | 253 | ||
254 | static int slab_early_init = 1; | 254 | static int slab_early_init = 1; |
255 | 255 | ||
256 | #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) | 256 | #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) |
257 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) | 257 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) |
258 | 258 | ||
259 | static void kmem_cache_node_init(struct kmem_cache_node *parent) | 259 | static void kmem_cache_node_init(struct kmem_cache_node *parent) |
260 | { | 260 | { |
261 | INIT_LIST_HEAD(&parent->slabs_full); | 261 | INIT_LIST_HEAD(&parent->slabs_full); |
262 | INIT_LIST_HEAD(&parent->slabs_partial); | 262 | INIT_LIST_HEAD(&parent->slabs_partial); |
263 | INIT_LIST_HEAD(&parent->slabs_free); | 263 | INIT_LIST_HEAD(&parent->slabs_free); |
264 | parent->shared = NULL; | 264 | parent->shared = NULL; |
265 | parent->alien = NULL; | 265 | parent->alien = NULL; |
266 | parent->colour_next = 0; | 266 | parent->colour_next = 0; |
267 | spin_lock_init(&parent->list_lock); | 267 | spin_lock_init(&parent->list_lock); |
268 | parent->free_objects = 0; | 268 | parent->free_objects = 0; |
269 | parent->free_touched = 0; | 269 | parent->free_touched = 0; |
270 | } | 270 | } |
271 | 271 | ||
272 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ | 272 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ |
273 | do { \ | 273 | do { \ |
274 | INIT_LIST_HEAD(listp); \ | 274 | INIT_LIST_HEAD(listp); \ |
275 | list_splice(&get_node(cachep, nodeid)->slab, listp); \ | 275 | list_splice(&get_node(cachep, nodeid)->slab, listp); \ |
276 | } while (0) | 276 | } while (0) |
277 | 277 | ||
278 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ | 278 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ |
279 | do { \ | 279 | do { \ |
280 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ | 280 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ |
281 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ | 281 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ |
282 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 282 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
283 | } while (0) | 283 | } while (0) |
284 | 284 | ||
285 | #define CFLGS_OFF_SLAB (0x80000000UL) | 285 | #define CFLGS_OFF_SLAB (0x80000000UL) |
286 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 286 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
287 | 287 | ||
288 | #define BATCHREFILL_LIMIT 16 | 288 | #define BATCHREFILL_LIMIT 16 |
289 | /* | 289 | /* |
290 | * Optimization question: fewer reaps mean a lower probability of unnecessary | 290 | * Optimization question: fewer reaps mean a lower probability of unnecessary |
291 | * cpucache drain/refill cycles. | 291 | * cpucache drain/refill cycles. |
292 | * | 292 | * |
293 | * OTOH the cpuarrays can contain lots of objects, | 293 | * OTOH the cpuarrays can contain lots of objects, |
294 | * which could lock up otherwise freeable slabs. | 294 | * which could lock up otherwise freeable slabs. |
295 | */ | 295 | */ |
296 | #define REAPTIMEOUT_AC (2*HZ) | 296 | #define REAPTIMEOUT_AC (2*HZ) |
297 | #define REAPTIMEOUT_NODE (4*HZ) | 297 | #define REAPTIMEOUT_NODE (4*HZ) |
298 | 298 | ||
299 | #if STATS | 299 | #if STATS |
300 | #define STATS_INC_ACTIVE(x) ((x)->num_active++) | 300 | #define STATS_INC_ACTIVE(x) ((x)->num_active++) |
301 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) | 301 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) |
302 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 302 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
303 | #define STATS_INC_GROWN(x) ((x)->grown++) | 303 | #define STATS_INC_GROWN(x) ((x)->grown++) |
304 | #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) | 304 | #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) |
305 | #define STATS_SET_HIGH(x) \ | 305 | #define STATS_SET_HIGH(x) \ |
306 | do { \ | 306 | do { \ |
307 | if ((x)->num_active > (x)->high_mark) \ | 307 | if ((x)->num_active > (x)->high_mark) \ |
308 | (x)->high_mark = (x)->num_active; \ | 308 | (x)->high_mark = (x)->num_active; \ |
309 | } while (0) | 309 | } while (0) |
310 | #define STATS_INC_ERR(x) ((x)->errors++) | 310 | #define STATS_INC_ERR(x) ((x)->errors++) |
311 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) | 311 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) |
312 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) | 312 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) |
313 | #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) | 313 | #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) |
314 | #define STATS_SET_FREEABLE(x, i) \ | 314 | #define STATS_SET_FREEABLE(x, i) \ |
315 | do { \ | 315 | do { \ |
316 | if ((x)->max_freeable < i) \ | 316 | if ((x)->max_freeable < i) \ |
317 | (x)->max_freeable = i; \ | 317 | (x)->max_freeable = i; \ |
318 | } while (0) | 318 | } while (0) |
319 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) | 319 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) |
320 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) | 320 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) |
321 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) | 321 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) |
322 | #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) | 322 | #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) |
323 | #else | 323 | #else |
324 | #define STATS_INC_ACTIVE(x) do { } while (0) | 324 | #define STATS_INC_ACTIVE(x) do { } while (0) |
325 | #define STATS_DEC_ACTIVE(x) do { } while (0) | 325 | #define STATS_DEC_ACTIVE(x) do { } while (0) |
326 | #define STATS_INC_ALLOCED(x) do { } while (0) | 326 | #define STATS_INC_ALLOCED(x) do { } while (0) |
327 | #define STATS_INC_GROWN(x) do { } while (0) | 327 | #define STATS_INC_GROWN(x) do { } while (0) |
328 | #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) | 328 | #define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) |
329 | #define STATS_SET_HIGH(x) do { } while (0) | 329 | #define STATS_SET_HIGH(x) do { } while (0) |
330 | #define STATS_INC_ERR(x) do { } while (0) | 330 | #define STATS_INC_ERR(x) do { } while (0) |
331 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 331 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
332 | #define STATS_INC_NODEFREES(x) do { } while (0) | 332 | #define STATS_INC_NODEFREES(x) do { } while (0) |
333 | #define STATS_INC_ACOVERFLOW(x) do { } while (0) | 333 | #define STATS_INC_ACOVERFLOW(x) do { } while (0) |
334 | #define STATS_SET_FREEABLE(x, i) do { } while (0) | 334 | #define STATS_SET_FREEABLE(x, i) do { } while (0) |
335 | #define STATS_INC_ALLOCHIT(x) do { } while (0) | 335 | #define STATS_INC_ALLOCHIT(x) do { } while (0) |
336 | #define STATS_INC_ALLOCMISS(x) do { } while (0) | 336 | #define STATS_INC_ALLOCMISS(x) do { } while (0) |
337 | #define STATS_INC_FREEHIT(x) do { } while (0) | 337 | #define STATS_INC_FREEHIT(x) do { } while (0) |
338 | #define STATS_INC_FREEMISS(x) do { } while (0) | 338 | #define STATS_INC_FREEMISS(x) do { } while (0) |
339 | #endif | 339 | #endif |
340 | 340 | ||
341 | #if DEBUG | 341 | #if DEBUG |
342 | 342 | ||
343 | /* | 343 | /* |
344 | * memory layout of objects: | 344 | * memory layout of objects: |
345 | * 0 : objp | 345 | * 0 : objp |
346 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that | 346 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that |
347 | * the end of an object is aligned with the end of the real | 347 | * the end of an object is aligned with the end of the real |
348 | * allocation. Catches writes behind the end of the allocation. | 348 | * allocation. Catches writes behind the end of the allocation. |
349 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: | 349 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: |
350 | * redzone word. | 350 | * redzone word. |
351 | * cachep->obj_offset: The real object. | 351 | * cachep->obj_offset: The real object. |
352 | * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 352 | * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
353 | * cachep->size - 1* BYTES_PER_WORD: last caller address | 353 | * cachep->size - 1* BYTES_PER_WORD: last caller address |
354 | * [BYTES_PER_WORD long] | 354 | * [BYTES_PER_WORD long] |
355 | */ | 355 | */ |
356 | static int obj_offset(struct kmem_cache *cachep) | 356 | static int obj_offset(struct kmem_cache *cachep) |
357 | { | 357 | { |
358 | return cachep->obj_offset; | 358 | return cachep->obj_offset; |
359 | } | 359 | } |
360 | 360 | ||
361 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 361 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
362 | { | 362 | { |
363 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 363 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
364 | return (unsigned long long*) (objp + obj_offset(cachep) - | 364 | return (unsigned long long*) (objp + obj_offset(cachep) - |
365 | sizeof(unsigned long long)); | 365 | sizeof(unsigned long long)); |
366 | } | 366 | } |
367 | 367 | ||
368 | static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | 368 | static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) |
369 | { | 369 | { |
370 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 370 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
371 | if (cachep->flags & SLAB_STORE_USER) | 371 | if (cachep->flags & SLAB_STORE_USER) |
372 | return (unsigned long long *)(objp + cachep->size - | 372 | return (unsigned long long *)(objp + cachep->size - |
373 | sizeof(unsigned long long) - | 373 | sizeof(unsigned long long) - |
374 | REDZONE_ALIGN); | 374 | REDZONE_ALIGN); |
375 | return (unsigned long long *) (objp + cachep->size - | 375 | return (unsigned long long *) (objp + cachep->size - |
376 | sizeof(unsigned long long)); | 376 | sizeof(unsigned long long)); |
377 | } | 377 | } |
378 | 378 | ||
379 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 379 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
380 | { | 380 | { |
381 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 381 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
382 | return (void **)(objp + cachep->size - BYTES_PER_WORD); | 382 | return (void **)(objp + cachep->size - BYTES_PER_WORD); |
383 | } | 383 | } |
384 | 384 | ||
385 | #else | 385 | #else |
386 | 386 | ||
387 | #define obj_offset(x) 0 | 387 | #define obj_offset(x) 0 |
388 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 388 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
389 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 389 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
390 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 390 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
391 | 391 | ||
392 | #endif | 392 | #endif |
393 | 393 | ||
394 | #define OBJECT_FREE (0) | 394 | #define OBJECT_FREE (0) |
395 | #define OBJECT_ACTIVE (1) | 395 | #define OBJECT_ACTIVE (1) |
396 | 396 | ||
397 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 397 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
398 | 398 | ||
399 | static void set_obj_status(struct page *page, int idx, int val) | 399 | static void set_obj_status(struct page *page, int idx, int val) |
400 | { | 400 | { |
401 | int freelist_size; | 401 | int freelist_size; |
402 | char *status; | 402 | char *status; |
403 | struct kmem_cache *cachep = page->slab_cache; | 403 | struct kmem_cache *cachep = page->slab_cache; |
404 | 404 | ||
405 | freelist_size = cachep->num * sizeof(freelist_idx_t); | 405 | freelist_size = cachep->num * sizeof(freelist_idx_t); |
406 | status = (char *)page->freelist + freelist_size; | 406 | status = (char *)page->freelist + freelist_size; |
407 | status[idx] = val; | 407 | status[idx] = val; |
408 | } | 408 | } |
409 | 409 | ||
410 | static inline unsigned int get_obj_status(struct page *page, int idx) | 410 | static inline unsigned int get_obj_status(struct page *page, int idx) |
411 | { | 411 | { |
412 | int freelist_size; | 412 | int freelist_size; |
413 | char *status; | 413 | char *status; |
414 | struct kmem_cache *cachep = page->slab_cache; | 414 | struct kmem_cache *cachep = page->slab_cache; |
415 | 415 | ||
416 | freelist_size = cachep->num * sizeof(freelist_idx_t); | 416 | freelist_size = cachep->num * sizeof(freelist_idx_t); |
417 | status = (char *)page->freelist + freelist_size; | 417 | status = (char *)page->freelist + freelist_size; |
418 | 418 | ||
419 | return status[idx]; | 419 | return status[idx]; |
420 | } | 420 | } |
421 | 421 | ||
422 | #else | 422 | #else |
423 | static inline void set_obj_status(struct page *page, int idx, int val) {} | 423 | static inline void set_obj_status(struct page *page, int idx, int val) {} |
424 | 424 | ||
425 | #endif | 425 | #endif |
426 | 426 | ||
427 | /* | 427 | /* |
428 | * Do not go above this order unless 0 objects fit into the slab or | 428 | * Do not go above this order unless 0 objects fit into the slab or |
429 | * it is overridden on the command line. | 429 | * it is overridden on the command line. |
430 | */ | 430 | */ |
431 | #define SLAB_MAX_ORDER_HI 1 | 431 | #define SLAB_MAX_ORDER_HI 1 |
432 | #define SLAB_MAX_ORDER_LO 0 | 432 | #define SLAB_MAX_ORDER_LO 0 |
433 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 433 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
434 | static bool slab_max_order_set __initdata; | 434 | static bool slab_max_order_set __initdata; |
435 | 435 | ||
436 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 436 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
437 | { | 437 | { |
438 | struct page *page = virt_to_head_page(obj); | 438 | struct page *page = virt_to_head_page(obj); |
439 | return page->slab_cache; | 439 | return page->slab_cache; |
440 | } | 440 | } |
441 | 441 | ||
442 | static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, | 442 | static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, |
443 | unsigned int idx) | 443 | unsigned int idx) |
444 | { | 444 | { |
445 | return page->s_mem + cache->size * idx; | 445 | return page->s_mem + cache->size * idx; |
446 | } | 446 | } |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * We want to avoid an expensive divide : (offset / cache->size) | 449 | * We want to avoid an expensive divide : (offset / cache->size) |
450 | * Using the fact that size is a constant for a particular cache, | 450 | * Using the fact that size is a constant for a particular cache, |
451 | * we can replace (offset / cache->size) by | 451 | * we can replace (offset / cache->size) by |
452 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | 452 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) |
453 | */ | 453 | */ |
454 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | 454 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, |
455 | const struct page *page, void *obj) | 455 | const struct page *page, void *obj) |
456 | { | 456 | { |
457 | u32 offset = (obj - page->s_mem); | 457 | u32 offset = (obj - page->s_mem); |
458 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | 458 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); |
459 | } | 459 | } |
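The reciprocal trick turns the per-lookup division into one multiply and one shift, using a value precomputed when the cache is created. A self-contained sketch of the classic fixed-point form (this shows the general idea, not necessarily the exact formulation behind reciprocal_divide(); names are invented):

#include <stdint.h>

/* Precompute ceil(2^32 / size) once; size is fixed for a given cache
 * and assumed to be greater than 1. */
static inline uint32_t recip_value(uint32_t size)
{
	return (uint32_t)(((1ULL << 32) + size - 1) / size);
}

/* offset / size computed as a multiply-and-shift; exact for the small
 * offsets seen inside a single slab. */
static inline uint32_t recip_divide(uint32_t offset, uint32_t recip)
{
	return (uint32_t)(((uint64_t)offset * recip) >> 32);
}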
460 | 460 | ||
461 | static struct arraycache_init initarray_generic = | 461 | static struct arraycache_init initarray_generic = |
462 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 462 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
463 | 463 | ||
464 | /* internal cache of cache description objs */ | 464 | /* internal cache of cache description objs */ |
465 | static struct kmem_cache kmem_cache_boot = { | 465 | static struct kmem_cache kmem_cache_boot = { |
466 | .batchcount = 1, | 466 | .batchcount = 1, |
467 | .limit = BOOT_CPUCACHE_ENTRIES, | 467 | .limit = BOOT_CPUCACHE_ENTRIES, |
468 | .shared = 1, | 468 | .shared = 1, |
469 | .size = sizeof(struct kmem_cache), | 469 | .size = sizeof(struct kmem_cache), |
470 | .name = "kmem_cache", | 470 | .name = "kmem_cache", |
471 | }; | 471 | }; |
472 | 472 | ||
473 | #define BAD_ALIEN_MAGIC 0x01020304ul | 473 | #define BAD_ALIEN_MAGIC 0x01020304ul |
474 | 474 | ||
475 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 475 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
476 | 476 | ||
477 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 477 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
478 | { | 478 | { |
479 | return cachep->array[smp_processor_id()]; | 479 | return cachep->array[smp_processor_id()]; |
480 | } | 480 | } |
481 | 481 | ||
482 | static size_t calculate_freelist_size(int nr_objs, size_t align) | 482 | static size_t calculate_freelist_size(int nr_objs, size_t align) |
483 | { | 483 | { |
484 | size_t freelist_size; | 484 | size_t freelist_size; |
485 | 485 | ||
486 | freelist_size = nr_objs * sizeof(freelist_idx_t); | 486 | freelist_size = nr_objs * sizeof(freelist_idx_t); |
487 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | 487 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) |
488 | freelist_size += nr_objs * sizeof(char); | 488 | freelist_size += nr_objs * sizeof(char); |
489 | 489 | ||
490 | if (align) | 490 | if (align) |
491 | freelist_size = ALIGN(freelist_size, align); | 491 | freelist_size = ALIGN(freelist_size, align); |
492 | 492 | ||
493 | return freelist_size; | 493 | return freelist_size; |
494 | } | 494 | } |
495 | 495 | ||
496 | static int calculate_nr_objs(size_t slab_size, size_t buffer_size, | 496 | static int calculate_nr_objs(size_t slab_size, size_t buffer_size, |
497 | size_t idx_size, size_t align) | 497 | size_t idx_size, size_t align) |
498 | { | 498 | { |
499 | int nr_objs; | 499 | int nr_objs; |
500 | size_t remained_size; | 500 | size_t remained_size; |
501 | size_t freelist_size; | 501 | size_t freelist_size; |
502 | int extra_space = 0; | 502 | int extra_space = 0; |
503 | 503 | ||
504 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | 504 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) |
505 | extra_space = sizeof(char); | 505 | extra_space = sizeof(char); |
506 | /* | 506 | /* |
507 | * Ignore padding for the initial guess. The padding | 507 | * Ignore padding for the initial guess. The padding |
508 | * is at most @align-1 bytes, and @buffer_size is at | 508 | * is at most @align-1 bytes, and @buffer_size is at |
509 | * least @align. In the worst case, this result will | 509 | * least @align. In the worst case, this result will |
510 | * be one greater than the number of objects that fit | 510 | * be one greater than the number of objects that fit |
511 | * into the memory allocation when taking the padding | 511 | * into the memory allocation when taking the padding |
512 | * into account. | 512 | * into account. |
513 | */ | 513 | */ |
514 | nr_objs = slab_size / (buffer_size + idx_size + extra_space); | 514 | nr_objs = slab_size / (buffer_size + idx_size + extra_space); |
515 | 515 | ||
516 | /* | 516 | /* |
517 | * This calculated number will be either the right | 517 | * This calculated number will be either the right |
518 | * amount, or one greater than what we want. | 518 | * amount, or one greater than what we want. |
519 | */ | 519 | */ |
520 | remained_size = slab_size - nr_objs * buffer_size; | 520 | remained_size = slab_size - nr_objs * buffer_size; |
521 | freelist_size = calculate_freelist_size(nr_objs, align); | 521 | freelist_size = calculate_freelist_size(nr_objs, align); |
522 | if (remained_size < freelist_size) | 522 | if (remained_size < freelist_size) |
523 | nr_objs--; | 523 | nr_objs--; |
524 | 524 | ||
525 | return nr_objs; | 525 | return nr_objs; |
526 | } | 526 | } |
527 | 527 | ||
528 | /* | 528 | /* |
529 | * Calculate the number of objects and left-over bytes for a given buffer size. | 529 | * Calculate the number of objects and left-over bytes for a given buffer size. |
530 | */ | 530 | */ |
531 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, | 531 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, |
532 | size_t align, int flags, size_t *left_over, | 532 | size_t align, int flags, size_t *left_over, |
533 | unsigned int *num) | 533 | unsigned int *num) |
534 | { | 534 | { |
535 | int nr_objs; | 535 | int nr_objs; |
536 | size_t mgmt_size; | 536 | size_t mgmt_size; |
537 | size_t slab_size = PAGE_SIZE << gfporder; | 537 | size_t slab_size = PAGE_SIZE << gfporder; |
538 | 538 | ||
539 | /* | 539 | /* |
540 | * The slab management structure can be either off the slab or | 540 | * The slab management structure can be either off the slab or |
541 | * on it. For the latter case, the memory allocated for a | 541 | * on it. For the latter case, the memory allocated for a |
542 | * slab is used for: | 542 | * slab is used for: |
543 | * | 543 | * |
544 | * - One freelist index (freelist_idx_t) for each object | 544 | * - One freelist index (freelist_idx_t) for each object |
545 | * - Padding to respect alignment of @align | 545 | * - Padding to respect alignment of @align |
546 | * - @buffer_size bytes for each object | 546 | * - @buffer_size bytes for each object |
547 | * | 547 | * |
548 | * If the slab management structure is off the slab, then the | 548 | * If the slab management structure is off the slab, then the |
549 | * alignment will already be calculated into the size. Because | 549 | * alignment will already be calculated into the size. Because |
550 | * the slabs are all page aligned, the objects will be at the | 550 | * the slabs are all page aligned, the objects will be at the |
551 | * correct alignment when allocated. | 551 | * correct alignment when allocated. |
552 | */ | 552 | */ |
553 | if (flags & CFLGS_OFF_SLAB) { | 553 | if (flags & CFLGS_OFF_SLAB) { |
554 | mgmt_size = 0; | 554 | mgmt_size = 0; |
555 | nr_objs = slab_size / buffer_size; | 555 | nr_objs = slab_size / buffer_size; |
556 | 556 | ||
557 | } else { | 557 | } else { |
558 | nr_objs = calculate_nr_objs(slab_size, buffer_size, | 558 | nr_objs = calculate_nr_objs(slab_size, buffer_size, |
559 | sizeof(freelist_idx_t), align); | 559 | sizeof(freelist_idx_t), align); |
560 | mgmt_size = calculate_freelist_size(nr_objs, align); | 560 | mgmt_size = calculate_freelist_size(nr_objs, align); |
561 | } | 561 | } |
562 | *num = nr_objs; | 562 | *num = nr_objs; |
563 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; | 563 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; |
564 | } | 564 | } |
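To make the sizing arithmetic above concrete, here is a worked example for an on-slab freelist, using illustrative numbers only (order-0 slab of 4096 bytes, 256-byte objects, 2-byte freelist index, 8-byte freelist alignment):

/*
 * Worked example (illustrative numbers, not a particular configuration):
 *
 *   slab_size     = 4096, buffer_size = 256, idx_size = 2, align = 8
 *   nr_objs       = 4096 / (256 + 2)      = 15   (initial guess)
 *   remained_size = 4096 - 15 * 256       = 256
 *   freelist_size = ALIGN(15 * 2, 8)      = 32   (fits in the 256 bytes,
 *                                                 so the guess stands)
 *   *num          = 15
 *   *left_over    = 4096 - 15 * 256 - 32  = 224  bytes, which the cache
 *                                                 later spreads out as
 *                                                 slab colouring
 */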
565 | 565 | ||
566 | #if DEBUG | 566 | #if DEBUG |
567 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) | 567 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) |
568 | 568 | ||
569 | static void __slab_error(const char *function, struct kmem_cache *cachep, | 569 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
570 | char *msg) | 570 | char *msg) |
571 | { | 571 | { |
572 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 572 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
573 | function, cachep->name, msg); | 573 | function, cachep->name, msg); |
574 | dump_stack(); | 574 | dump_stack(); |
575 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 575 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
576 | } | 576 | } |
577 | #endif | 577 | #endif |
578 | 578 | ||
579 | /* | 579 | /* |
580 | * By default on NUMA we use alien caches to stage the freeing of | 580 | * By default on NUMA we use alien caches to stage the freeing of |
581 | * objects allocated from other nodes. This causes massive memory | 581 | * objects allocated from other nodes. This causes massive memory |
582 | * inefficiencies when using fake NUMA setup to split memory into a | 582 | * inefficiencies when using fake NUMA setup to split memory into a |
583 | * large number of small nodes, so it can be disabled on the command | 583 | * large number of small nodes, so it can be disabled on the command |
584 | * line | 584 | * line |
585 | */ | 585 | */ |
586 | 586 | ||
587 | static int use_alien_caches __read_mostly = 1; | 587 | static int use_alien_caches __read_mostly = 1; |
588 | static int __init noaliencache_setup(char *s) | 588 | static int __init noaliencache_setup(char *s) |
589 | { | 589 | { |
590 | use_alien_caches = 0; | 590 | use_alien_caches = 0; |
591 | return 1; | 591 | return 1; |
592 | } | 592 | } |
593 | __setup("noaliencache", noaliencache_setup); | 593 | __setup("noaliencache", noaliencache_setup); |
594 | 594 | ||
595 | static int __init slab_max_order_setup(char *str) | 595 | static int __init slab_max_order_setup(char *str) |
596 | { | 596 | { |
597 | get_option(&str, &slab_max_order); | 597 | get_option(&str, &slab_max_order); |
598 | slab_max_order = slab_max_order < 0 ? 0 : | 598 | slab_max_order = slab_max_order < 0 ? 0 : |
599 | min(slab_max_order, MAX_ORDER - 1); | 599 | min(slab_max_order, MAX_ORDER - 1); |
600 | slab_max_order_set = true; | 600 | slab_max_order_set = true; |
601 | 601 | ||
602 | return 1; | 602 | return 1; |
603 | } | 603 | } |
604 | __setup("slab_max_order=", slab_max_order_setup); | 604 | __setup("slab_max_order=", slab_max_order_setup); |
605 | 605 | ||
606 | #ifdef CONFIG_NUMA | 606 | #ifdef CONFIG_NUMA |
607 | /* | 607 | /* |
608 | * Special reaping functions for NUMA systems called from cache_reap(). | 608 | * Special reaping functions for NUMA systems called from cache_reap(). |
609 | * These take care of doing round robin flushing of alien caches (containing | 609 | * These take care of doing round robin flushing of alien caches (containing |
610 | * objects freed on different nodes from which they were allocated) and the | 610 | * objects freed on different nodes from which they were allocated) and the |
611 | * flushing of remote pcps by calling drain_node_pages. | 611 | * flushing of remote pcps by calling drain_node_pages. |
612 | */ | 612 | */ |
613 | static DEFINE_PER_CPU(unsigned long, slab_reap_node); | 613 | static DEFINE_PER_CPU(unsigned long, slab_reap_node); |
614 | 614 | ||
615 | static void init_reap_node(int cpu) | 615 | static void init_reap_node(int cpu) |
616 | { | 616 | { |
617 | int node; | 617 | int node; |
618 | 618 | ||
619 | node = next_node(cpu_to_mem(cpu), node_online_map); | 619 | node = next_node(cpu_to_mem(cpu), node_online_map); |
620 | if (node == MAX_NUMNODES) | 620 | if (node == MAX_NUMNODES) |
621 | node = first_node(node_online_map); | 621 | node = first_node(node_online_map); |
622 | 622 | ||
623 | per_cpu(slab_reap_node, cpu) = node; | 623 | per_cpu(slab_reap_node, cpu) = node; |
624 | } | 624 | } |
625 | 625 | ||
626 | static void next_reap_node(void) | 626 | static void next_reap_node(void) |
627 | { | 627 | { |
628 | int node = __this_cpu_read(slab_reap_node); | 628 | int node = __this_cpu_read(slab_reap_node); |
629 | 629 | ||
630 | node = next_node(node, node_online_map); | 630 | node = next_node(node, node_online_map); |
631 | if (unlikely(node >= MAX_NUMNODES)) | 631 | if (unlikely(node >= MAX_NUMNODES)) |
632 | node = first_node(node_online_map); | 632 | node = first_node(node_online_map); |
633 | __this_cpu_write(slab_reap_node, node); | 633 | __this_cpu_write(slab_reap_node, node); |
634 | } | 634 | } |
635 | 635 | ||
636 | #else | 636 | #else |
637 | #define init_reap_node(cpu) do { } while (0) | 637 | #define init_reap_node(cpu) do { } while (0) |
638 | #define next_reap_node(void) do { } while (0) | 638 | #define next_reap_node(void) do { } while (0) |
639 | #endif | 639 | #endif |
640 | 640 | ||
641 | /* | 641 | /* |
642 | * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz | 642 | * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz |
643 | * via the workqueue/eventd. | 643 | * via the workqueue/eventd. |
644 | * Add the CPU number into the expiration time to minimize the possibility of | 644 | * Add the CPU number into the expiration time to minimize the possibility of |
645 | * the CPUs getting into lockstep and contending for the global cache chain | 645 | * the CPUs getting into lockstep and contending for the global cache chain |
646 | * lock. | 646 | * lock. |
647 | */ | 647 | */ |
648 | static void start_cpu_timer(int cpu) | 648 | static void start_cpu_timer(int cpu) |
649 | { | 649 | { |
650 | struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); | 650 | struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); |
651 | 651 | ||
652 | /* | 652 | /* |
653 | * When this gets called from do_initcalls via cpucache_init(), | 653 | * When this gets called from do_initcalls via cpucache_init(), |
654 | * init_workqueues() has already run, so keventd will be setup | 654 | * init_workqueues() has already run, so keventd will be setup |
655 | * at that time. | 655 | * at that time. |
656 | */ | 656 | */ |
657 | if (keventd_up() && reap_work->work.func == NULL) { | 657 | if (keventd_up() && reap_work->work.func == NULL) { |
658 | init_reap_node(cpu); | 658 | init_reap_node(cpu); |
659 | INIT_DEFERRABLE_WORK(reap_work, cache_reap); | 659 | INIT_DEFERRABLE_WORK(reap_work, cache_reap); |
660 | schedule_delayed_work_on(cpu, reap_work, | 660 | schedule_delayed_work_on(cpu, reap_work, |
661 | __round_jiffies_relative(HZ, cpu)); | 661 | __round_jiffies_relative(HZ, cpu)); |
662 | } | 662 | } |
663 | } | 663 | } |
664 | 664 | ||
665 | static void init_arraycache(struct array_cache *ac, int limit, int batch) | 665 | static void init_arraycache(struct array_cache *ac, int limit, int batch) |
666 | { | 666 | { |
667 | /* | 667 | /* |
668 | * The array_cache structures contain pointers to free objects. | 668 | * The array_cache structures contain pointers to free objects. |
669 | * However, when such objects are allocated or transferred to another | 669 | * However, when such objects are allocated or transferred to another |
670 | * cache the pointers are not cleared and they could be counted as | 670 | * cache the pointers are not cleared and they could be counted as |
671 | * valid references during a kmemleak scan. Therefore, kmemleak must | 671 | * valid references during a kmemleak scan. Therefore, kmemleak must |
672 | * not scan such objects. | 672 | * not scan such objects. |
673 | */ | 673 | */ |
674 | kmemleak_no_scan(ac); | 674 | kmemleak_no_scan(ac); |
675 | if (ac) { | 675 | if (ac) { |
676 | ac->avail = 0; | 676 | ac->avail = 0; |
677 | ac->limit = limit; | 677 | ac->limit = limit; |
678 | ac->batchcount = batch; | 678 | ac->batchcount = batch; |
679 | ac->touched = 0; | 679 | ac->touched = 0; |
680 | } | 680 | } |
681 | } | 681 | } |
682 | 682 | ||
683 | static struct array_cache *alloc_arraycache(int node, int entries, | 683 | static struct array_cache *alloc_arraycache(int node, int entries, |
684 | int batchcount, gfp_t gfp) | 684 | int batchcount, gfp_t gfp) |
685 | { | 685 | { |
686 | size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); | 686 | size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
687 | struct array_cache *ac = NULL; | 687 | struct array_cache *ac = NULL; |
688 | 688 | ||
689 | ac = kmalloc_node(memsize, gfp, node); | 689 | ac = kmalloc_node(memsize, gfp, node); |
690 | init_arraycache(ac, entries, batchcount); | 690 | init_arraycache(ac, entries, batchcount); |
691 | return ac; | 691 | return ac; |
692 | } | 692 | } |
693 | 693 | ||
694 | static inline bool is_slab_pfmemalloc(struct page *page) | 694 | static inline bool is_slab_pfmemalloc(struct page *page) |
695 | { | 695 | { |
696 | return PageSlabPfmemalloc(page); | 696 | return PageSlabPfmemalloc(page); |
697 | } | 697 | } |
698 | 698 | ||
699 | /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ | 699 | /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ |
700 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | 700 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, |
701 | struct array_cache *ac) | 701 | struct array_cache *ac) |
702 | { | 702 | { |
703 | struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); | 703 | struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); |
704 | struct page *page; | 704 | struct page *page; |
705 | unsigned long flags; | 705 | unsigned long flags; |
706 | 706 | ||
707 | if (!pfmemalloc_active) | 707 | if (!pfmemalloc_active) |
708 | return; | 708 | return; |
709 | 709 | ||
710 | spin_lock_irqsave(&n->list_lock, flags); | 710 | spin_lock_irqsave(&n->list_lock, flags); |
711 | list_for_each_entry(page, &n->slabs_full, lru) | 711 | list_for_each_entry(page, &n->slabs_full, lru) |
712 | if (is_slab_pfmemalloc(page)) | 712 | if (is_slab_pfmemalloc(page)) |
713 | goto out; | 713 | goto out; |
714 | 714 | ||
715 | list_for_each_entry(page, &n->slabs_partial, lru) | 715 | list_for_each_entry(page, &n->slabs_partial, lru) |
716 | if (is_slab_pfmemalloc(page)) | 716 | if (is_slab_pfmemalloc(page)) |
717 | goto out; | 717 | goto out; |
718 | 718 | ||
719 | list_for_each_entry(page, &n->slabs_free, lru) | 719 | list_for_each_entry(page, &n->slabs_free, lru) |
720 | if (is_slab_pfmemalloc(page)) | 720 | if (is_slab_pfmemalloc(page)) |
721 | goto out; | 721 | goto out; |
722 | 722 | ||
723 | pfmemalloc_active = false; | 723 | pfmemalloc_active = false; |
724 | out: | 724 | out: |
725 | spin_unlock_irqrestore(&n->list_lock, flags); | 725 | spin_unlock_irqrestore(&n->list_lock, flags); |
726 | } | 726 | } |
727 | 727 | ||
728 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | 728 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, |
729 | gfp_t flags, bool force_refill) | 729 | gfp_t flags, bool force_refill) |
730 | { | 730 | { |
731 | int i; | 731 | int i; |
732 | void *objp = ac->entry[--ac->avail]; | 732 | void *objp = ac->entry[--ac->avail]; |
733 | 733 | ||
734 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | 734 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ |
735 | if (unlikely(is_obj_pfmemalloc(objp))) { | 735 | if (unlikely(is_obj_pfmemalloc(objp))) { |
736 | struct kmem_cache_node *n; | 736 | struct kmem_cache_node *n; |
737 | 737 | ||
738 | if (gfp_pfmemalloc_allowed(flags)) { | 738 | if (gfp_pfmemalloc_allowed(flags)) { |
739 | clear_obj_pfmemalloc(&objp); | 739 | clear_obj_pfmemalloc(&objp); |
740 | return objp; | 740 | return objp; |
741 | } | 741 | } |
742 | 742 | ||
743 | /* The caller cannot use PFMEMALLOC objects, find another one */ | 743 | /* The caller cannot use PFMEMALLOC objects, find another one */ |
744 | for (i = 0; i < ac->avail; i++) { | 744 | for (i = 0; i < ac->avail; i++) { |
745 | /* If a !PFMEMALLOC object is found, swap them */ | 745 | /* If a !PFMEMALLOC object is found, swap them */ |
746 | if (!is_obj_pfmemalloc(ac->entry[i])) { | 746 | if (!is_obj_pfmemalloc(ac->entry[i])) { |
747 | objp = ac->entry[i]; | 747 | objp = ac->entry[i]; |
748 | ac->entry[i] = ac->entry[ac->avail]; | 748 | ac->entry[i] = ac->entry[ac->avail]; |
749 | ac->entry[ac->avail] = objp; | 749 | ac->entry[ac->avail] = objp; |
750 | return objp; | 750 | return objp; |
751 | } | 751 | } |
752 | } | 752 | } |
753 | 753 | ||
754 | /* | 754 | /* |
755 | * If there are empty slabs on the slabs_free list and we are | 755 | * If there are empty slabs on the slabs_free list and we are |
756 | * being forced to refill the cache, mark this one !pfmemalloc. | 756 | * being forced to refill the cache, mark this one !pfmemalloc. |
757 | */ | 757 | */ |
758 | n = get_node(cachep, numa_mem_id()); | 758 | n = get_node(cachep, numa_mem_id()); |
759 | if (!list_empty(&n->slabs_free) && force_refill) { | 759 | if (!list_empty(&n->slabs_free) && force_refill) { |
760 | struct page *page = virt_to_head_page(objp); | 760 | struct page *page = virt_to_head_page(objp); |
761 | ClearPageSlabPfmemalloc(page); | 761 | ClearPageSlabPfmemalloc(page); |
762 | clear_obj_pfmemalloc(&objp); | 762 | clear_obj_pfmemalloc(&objp); |
763 | recheck_pfmemalloc_active(cachep, ac); | 763 | recheck_pfmemalloc_active(cachep, ac); |
764 | return objp; | 764 | return objp; |
765 | } | 765 | } |
766 | 766 | ||
767 | /* No !PFMEMALLOC objects available */ | 767 | /* No !PFMEMALLOC objects available */ |
768 | ac->avail++; | 768 | ac->avail++; |
769 | objp = NULL; | 769 | objp = NULL; |
770 | } | 770 | } |
771 | 771 | ||
772 | return objp; | 772 | return objp; |
773 | } | 773 | } |
774 | 774 | ||
775 | static inline void *ac_get_obj(struct kmem_cache *cachep, | 775 | static inline void *ac_get_obj(struct kmem_cache *cachep, |
776 | struct array_cache *ac, gfp_t flags, bool force_refill) | 776 | struct array_cache *ac, gfp_t flags, bool force_refill) |
777 | { | 777 | { |
778 | void *objp; | 778 | void *objp; |
779 | 779 | ||
780 | if (unlikely(sk_memalloc_socks())) | 780 | if (unlikely(sk_memalloc_socks())) |
781 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | 781 | objp = __ac_get_obj(cachep, ac, flags, force_refill); |
782 | else | 782 | else |
783 | objp = ac->entry[--ac->avail]; | 783 | objp = ac->entry[--ac->avail]; |
784 | 784 | ||
785 | return objp; | 785 | return objp; |
786 | } | 786 | } |
787 | 787 | ||
788 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | 788 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, |
789 | void *objp) | 789 | void *objp) |
790 | { | 790 | { |
791 | if (unlikely(pfmemalloc_active)) { | 791 | if (unlikely(pfmemalloc_active)) { |
792 | /* Some pfmemalloc slabs exist, check if this is one */ | 792 | /* Some pfmemalloc slabs exist, check if this is one */ |
793 | struct page *page = virt_to_head_page(objp); | 793 | struct page *page = virt_to_head_page(objp); |
794 | if (PageSlabPfmemalloc(page)) | 794 | if (PageSlabPfmemalloc(page)) |
795 | set_obj_pfmemalloc(&objp); | 795 | set_obj_pfmemalloc(&objp); |
796 | } | 796 | } |
797 | 797 | ||
798 | return objp; | 798 | return objp; |
799 | } | 799 | } |
800 | 800 | ||
801 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | 801 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, |
802 | void *objp) | 802 | void *objp) |
803 | { | 803 | { |
804 | if (unlikely(sk_memalloc_socks())) | 804 | if (unlikely(sk_memalloc_socks())) |
805 | objp = __ac_put_obj(cachep, ac, objp); | 805 | objp = __ac_put_obj(cachep, ac, objp); |
806 | 806 | ||
807 | ac->entry[ac->avail++] = objp; | 807 | ac->entry[ac->avail++] = objp; |
808 | } | 808 | } |
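/*
 * Note on the PFMEMALLOC marking used by __ac_put_obj()/__ac_get_obj()
 * above: objects handed out by the slab are at least word aligned, so a
 * flag can ride in the low bit of the object pointer.  A minimal sketch of
 * such tagging helpers, assuming low-bit tagging (the in-tree helpers are
 * defined earlier in this file and may differ in detail):
 *
 *	static inline bool obj_is_pfmemalloc(void *objp)
 *	{
 *		return (unsigned long)objp & 1UL;	/* low bit = tag */
 *	}
 *
 *	static inline void *obj_clear_pfmemalloc(void *objp)
 *	{
 *		return (void *)((unsigned long)objp & ~1UL);
 *	}
 *
 * With such tagging, a pointer stored in ac->entry[] can carry the
 * "came from a PFMEMALLOC slab" hint without any extra storage.
 */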
809 | 809 | ||
810 | /* | 810 | /* |
811 | * Transfer objects in one arraycache to another. | 811 | * Transfer objects in one arraycache to another. |
812 | * Locking must be handled by the caller. | 812 | * Locking must be handled by the caller. |
813 | * | 813 | * |
814 | * Return the number of entries transferred. | 814 | * Return the number of entries transferred. |
815 | */ | 815 | */ |
816 | static int transfer_objects(struct array_cache *to, | 816 | static int transfer_objects(struct array_cache *to, |
817 | struct array_cache *from, unsigned int max) | 817 | struct array_cache *from, unsigned int max) |
818 | { | 818 | { |
819 | /* Figure out how many entries to transfer */ | 819 | /* Figure out how many entries to transfer */ |
820 | int nr = min3(from->avail, max, to->limit - to->avail); | 820 | int nr = min3(from->avail, max, to->limit - to->avail); |
821 | 821 | ||
822 | if (!nr) | 822 | if (!nr) |
823 | return 0; | 823 | return 0; |
824 | 824 | ||
825 | 	memcpy(to->entry + to->avail, from->entry + from->avail - nr, | 825 | 	memcpy(to->entry + to->avail, from->entry + from->avail - nr, |
826 | 			sizeof(void *) * nr); | 826 | 			sizeof(void *) * nr); |
827 | 827 | ||
828 | from->avail -= nr; | 828 | from->avail -= nr; |
829 | to->avail += nr; | 829 | to->avail += nr; |
830 | return nr; | 830 | return nr; |
831 | } | 831 | } |
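/*
 * Worked example for transfer_objects() above, with illustrative values:
 * from->avail = 30, max = 16 and to->limit - to->avail = 10 give
 * nr = min3(30, 16, 10) = 10, so the last ten pointers of 'from' are
 * copied to the tail of 'to'; from->avail drops to 20 and to->avail grows
 * by ten.  A return value of 0 simply means one of the three limits was
 * already exhausted.
 */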
832 | 832 | ||
833 | #ifndef CONFIG_NUMA | 833 | #ifndef CONFIG_NUMA |
834 | 834 | ||
835 | #define drain_alien_cache(cachep, alien) do { } while (0) | 835 | #define drain_alien_cache(cachep, alien) do { } while (0) |
836 | #define reap_alien(cachep, n) do { } while (0) | 836 | #define reap_alien(cachep, n) do { } while (0) |
837 | 837 | ||
838 | static inline struct alien_cache **alloc_alien_cache(int node, | 838 | static inline struct alien_cache **alloc_alien_cache(int node, |
839 | int limit, gfp_t gfp) | 839 | int limit, gfp_t gfp) |
840 | { | 840 | { |
841 | return (struct alien_cache **)BAD_ALIEN_MAGIC; | 841 | return (struct alien_cache **)BAD_ALIEN_MAGIC; |
842 | } | 842 | } |
843 | 843 | ||
844 | static inline void free_alien_cache(struct alien_cache **ac_ptr) | 844 | static inline void free_alien_cache(struct alien_cache **ac_ptr) |
845 | { | 845 | { |
846 | } | 846 | } |
847 | 847 | ||
848 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | 848 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) |
849 | { | 849 | { |
850 | return 0; | 850 | return 0; |
851 | } | 851 | } |
852 | 852 | ||
853 | static inline void *alternate_node_alloc(struct kmem_cache *cachep, | 853 | static inline void *alternate_node_alloc(struct kmem_cache *cachep, |
854 | gfp_t flags) | 854 | gfp_t flags) |
855 | { | 855 | { |
856 | return NULL; | 856 | return NULL; |
857 | } | 857 | } |
858 | 858 | ||
859 | static inline void *____cache_alloc_node(struct kmem_cache *cachep, | 859 | static inline void *____cache_alloc_node(struct kmem_cache *cachep, |
860 | gfp_t flags, int nodeid) | 860 | gfp_t flags, int nodeid) |
861 | { | 861 | { |
862 | return NULL; | 862 | return NULL; |
863 | } | 863 | } |
864 | 864 | ||
865 | #else /* CONFIG_NUMA */ | 865 | #else /* CONFIG_NUMA */ |
866 | 866 | ||
867 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 867 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
868 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 868 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
869 | 869 | ||
870 | static struct alien_cache *__alloc_alien_cache(int node, int entries, | 870 | static struct alien_cache *__alloc_alien_cache(int node, int entries, |
871 | int batch, gfp_t gfp) | 871 | int batch, gfp_t gfp) |
872 | { | 872 | { |
873 | size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); | 873 | size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); |
874 | struct alien_cache *alc = NULL; | 874 | struct alien_cache *alc = NULL; |
875 | 875 | ||
876 | alc = kmalloc_node(memsize, gfp, node); | 876 | alc = kmalloc_node(memsize, gfp, node); |
877 | init_arraycache(&alc->ac, entries, batch); | 877 | init_arraycache(&alc->ac, entries, batch); |
878 | spin_lock_init(&alc->lock); | 878 | spin_lock_init(&alc->lock); |
879 | return alc; | 879 | return alc; |
880 | } | 880 | } |
881 | 881 | ||
882 | static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | 882 | static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
883 | { | 883 | { |
884 | struct alien_cache **alc_ptr; | 884 | struct alien_cache **alc_ptr; |
885 | size_t memsize = sizeof(void *) * nr_node_ids; | 885 | size_t memsize = sizeof(void *) * nr_node_ids; |
886 | int i; | 886 | int i; |
887 | 887 | ||
888 | if (limit > 1) | 888 | if (limit > 1) |
889 | limit = 12; | 889 | limit = 12; |
890 | alc_ptr = kzalloc_node(memsize, gfp, node); | 890 | alc_ptr = kzalloc_node(memsize, gfp, node); |
891 | if (!alc_ptr) | 891 | if (!alc_ptr) |
892 | return NULL; | 892 | return NULL; |
893 | 893 | ||
894 | for_each_node(i) { | 894 | for_each_node(i) { |
895 | if (i == node || !node_online(i)) | 895 | if (i == node || !node_online(i)) |
896 | continue; | 896 | continue; |
897 | alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); | 897 | alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); |
898 | if (!alc_ptr[i]) { | 898 | if (!alc_ptr[i]) { |
899 | for (i--; i >= 0; i--) | 899 | for (i--; i >= 0; i--) |
900 | kfree(alc_ptr[i]); | 900 | kfree(alc_ptr[i]); |
901 | kfree(alc_ptr); | 901 | kfree(alc_ptr); |
902 | return NULL; | 902 | return NULL; |
903 | } | 903 | } |
904 | } | 904 | } |
905 | return alc_ptr; | 905 | return alc_ptr; |
906 | } | 906 | } |
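/*
 * Shape of the array built by alloc_alien_cache() above, e.g. on a
 * two-node system as seen from node 0 (illustrative): alc_ptr[0] stays
 * NULL (no alien cache is needed for the local node) while alc_ptr[1]
 * points to an alien_cache sized by the limit logic above (a dozen
 * entries whenever limit > 1), so frees destined for node 1 can be
 * batched before being flushed via __drain_alien_cache().
 */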
907 | 907 | ||
908 | static void free_alien_cache(struct alien_cache **alc_ptr) | 908 | static void free_alien_cache(struct alien_cache **alc_ptr) |
909 | { | 909 | { |
910 | int i; | 910 | int i; |
911 | 911 | ||
912 | if (!alc_ptr) | 912 | if (!alc_ptr) |
913 | return; | 913 | return; |
914 | for_each_node(i) | 914 | for_each_node(i) |
915 | kfree(alc_ptr[i]); | 915 | kfree(alc_ptr[i]); |
916 | kfree(alc_ptr); | 916 | kfree(alc_ptr); |
917 | } | 917 | } |
918 | 918 | ||
919 | static void __drain_alien_cache(struct kmem_cache *cachep, | 919 | static void __drain_alien_cache(struct kmem_cache *cachep, |
920 | struct array_cache *ac, int node, | 920 | struct array_cache *ac, int node, |
921 | struct list_head *list) | 921 | struct list_head *list) |
922 | { | 922 | { |
923 | struct kmem_cache_node *n = get_node(cachep, node); | 923 | struct kmem_cache_node *n = get_node(cachep, node); |
924 | 924 | ||
925 | if (ac->avail) { | 925 | if (ac->avail) { |
926 | spin_lock(&n->list_lock); | 926 | spin_lock(&n->list_lock); |
927 | /* | 927 | /* |
928 | 	 * Stuff objects into the remote node's shared array first. | 928 | 	 * Stuff objects into the remote node's shared array first. |
929 | * That way we could avoid the overhead of putting the objects | 929 | * That way we could avoid the overhead of putting the objects |
930 | * into the free lists and getting them back later. | 930 | * into the free lists and getting them back later. |
931 | */ | 931 | */ |
932 | if (n->shared) | 932 | if (n->shared) |
933 | transfer_objects(n->shared, ac, ac->limit); | 933 | transfer_objects(n->shared, ac, ac->limit); |
934 | 934 | ||
935 | free_block(cachep, ac->entry, ac->avail, node, list); | 935 | free_block(cachep, ac->entry, ac->avail, node, list); |
936 | ac->avail = 0; | 936 | ac->avail = 0; |
937 | spin_unlock(&n->list_lock); | 937 | spin_unlock(&n->list_lock); |
938 | } | 938 | } |
939 | } | 939 | } |
940 | 940 | ||
941 | /* | 941 | /* |
942 | * Called from cache_reap() to regularly drain alien caches round robin. | 942 | * Called from cache_reap() to regularly drain alien caches round robin. |
943 | */ | 943 | */ |
944 | static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) | 944 | static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) |
945 | { | 945 | { |
946 | int node = __this_cpu_read(slab_reap_node); | 946 | int node = __this_cpu_read(slab_reap_node); |
947 | 947 | ||
948 | if (n->alien) { | 948 | if (n->alien) { |
949 | struct alien_cache *alc = n->alien[node]; | 949 | struct alien_cache *alc = n->alien[node]; |
950 | struct array_cache *ac; | 950 | struct array_cache *ac; |
951 | 951 | ||
952 | if (alc) { | 952 | if (alc) { |
953 | ac = &alc->ac; | 953 | ac = &alc->ac; |
954 | if (ac->avail && spin_trylock_irq(&alc->lock)) { | 954 | if (ac->avail && spin_trylock_irq(&alc->lock)) { |
955 | LIST_HEAD(list); | 955 | LIST_HEAD(list); |
956 | 956 | ||
957 | __drain_alien_cache(cachep, ac, node, &list); | 957 | __drain_alien_cache(cachep, ac, node, &list); |
958 | spin_unlock_irq(&alc->lock); | 958 | spin_unlock_irq(&alc->lock); |
959 | slabs_destroy(cachep, &list); | 959 | slabs_destroy(cachep, &list); |
960 | } | 960 | } |
961 | } | 961 | } |
962 | } | 962 | } |
963 | } | 963 | } |
964 | 964 | ||
965 | static void drain_alien_cache(struct kmem_cache *cachep, | 965 | static void drain_alien_cache(struct kmem_cache *cachep, |
966 | struct alien_cache **alien) | 966 | struct alien_cache **alien) |
967 | { | 967 | { |
968 | int i = 0; | 968 | int i = 0; |
969 | struct alien_cache *alc; | 969 | struct alien_cache *alc; |
970 | struct array_cache *ac; | 970 | struct array_cache *ac; |
971 | unsigned long flags; | 971 | unsigned long flags; |
972 | 972 | ||
973 | for_each_online_node(i) { | 973 | for_each_online_node(i) { |
974 | alc = alien[i]; | 974 | alc = alien[i]; |
975 | if (alc) { | 975 | if (alc) { |
976 | LIST_HEAD(list); | 976 | LIST_HEAD(list); |
977 | 977 | ||
978 | ac = &alc->ac; | 978 | ac = &alc->ac; |
979 | spin_lock_irqsave(&alc->lock, flags); | 979 | spin_lock_irqsave(&alc->lock, flags); |
980 | __drain_alien_cache(cachep, ac, i, &list); | 980 | __drain_alien_cache(cachep, ac, i, &list); |
981 | spin_unlock_irqrestore(&alc->lock, flags); | 981 | spin_unlock_irqrestore(&alc->lock, flags); |
982 | slabs_destroy(cachep, &list); | 982 | slabs_destroy(cachep, &list); |
983 | } | 983 | } |
984 | } | 984 | } |
985 | } | 985 | } |
986 | 986 | ||
987 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | 987 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) |
988 | { | 988 | { |
989 | int nodeid = page_to_nid(virt_to_page(objp)); | 989 | int nodeid = page_to_nid(virt_to_page(objp)); |
990 | struct kmem_cache_node *n; | 990 | struct kmem_cache_node *n; |
991 | struct alien_cache *alien = NULL; | 991 | struct alien_cache *alien = NULL; |
992 | struct array_cache *ac; | 992 | struct array_cache *ac; |
993 | int node; | 993 | int node; |
994 | LIST_HEAD(list); | 994 | LIST_HEAD(list); |
995 | 995 | ||
996 | node = numa_mem_id(); | 996 | node = numa_mem_id(); |
997 | 997 | ||
998 | /* | 998 | /* |
999 | 	 * Make sure we are not freeing an object from another node to the array | 999 | 	 * Make sure we are not freeing an object from another node to the array |
1000 | * cache on this cpu. | 1000 | * cache on this cpu. |
1001 | */ | 1001 | */ |
1002 | if (likely(nodeid == node)) | 1002 | if (likely(nodeid == node)) |
1003 | return 0; | 1003 | return 0; |
1004 | 1004 | ||
1005 | n = get_node(cachep, node); | 1005 | n = get_node(cachep, node); |
1006 | STATS_INC_NODEFREES(cachep); | 1006 | STATS_INC_NODEFREES(cachep); |
1007 | if (n->alien && n->alien[nodeid]) { | 1007 | if (n->alien && n->alien[nodeid]) { |
1008 | alien = n->alien[nodeid]; | 1008 | alien = n->alien[nodeid]; |
1009 | ac = &alien->ac; | 1009 | ac = &alien->ac; |
1010 | spin_lock(&alien->lock); | 1010 | spin_lock(&alien->lock); |
1011 | if (unlikely(ac->avail == ac->limit)) { | 1011 | if (unlikely(ac->avail == ac->limit)) { |
1012 | STATS_INC_ACOVERFLOW(cachep); | 1012 | STATS_INC_ACOVERFLOW(cachep); |
1013 | __drain_alien_cache(cachep, ac, nodeid, &list); | 1013 | __drain_alien_cache(cachep, ac, nodeid, &list); |
1014 | } | 1014 | } |
1015 | ac_put_obj(cachep, ac, objp); | 1015 | ac_put_obj(cachep, ac, objp); |
1016 | spin_unlock(&alien->lock); | 1016 | spin_unlock(&alien->lock); |
1017 | slabs_destroy(cachep, &list); | 1017 | slabs_destroy(cachep, &list); |
1018 | } else { | 1018 | } else { |
1019 | n = get_node(cachep, nodeid); | 1019 | n = get_node(cachep, nodeid); |
1020 | spin_lock(&n->list_lock); | 1020 | spin_lock(&n->list_lock); |
1021 | free_block(cachep, &objp, 1, nodeid, &list); | 1021 | free_block(cachep, &objp, 1, nodeid, &list); |
1022 | spin_unlock(&n->list_lock); | 1022 | spin_unlock(&n->list_lock); |
1023 | slabs_destroy(cachep, &list); | 1023 | slabs_destroy(cachep, &list); |
1024 | } | 1024 | } |
1025 | return 1; | 1025 | return 1; |
1026 | } | 1026 | } |
1027 | #endif | 1027 | #endif |
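/*
 * Usage note for cache_free_alien() above: it returns 0 when the object's
 * page belongs to the local memory node, in which case the caller frees
 * it through its own per-cpu array as usual; it returns 1 once the object
 * has either been queued on the matching alien cache or, if none exists,
 * been freed directly to the owning node's lists.  For example
 * (illustrative), freeing an object whose page sits on node 1 while
 * running on node 0 lands it in n->alien[1] of node 0's kmem_cache_node,
 * to be flushed later by reap_alien() or drain_alien_cache().
 */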
1028 | 1028 | ||
1029 | /* | 1029 | /* |
1030 |  * Allocates and initializes the kmem_cache_node for a node on each slab | 1030 |  * Allocates and initializes the kmem_cache_node for a node on each slab |
1031 |  * cache, used for either memory or cpu hotplug. If memory is being | 1031 |  * cache, used for either memory or cpu hotplug. If memory is being |
1032 |  * hot-added, the kmem_cache_node will be allocated off-node since memory | 1032 |  * hot-added, the kmem_cache_node will be allocated off-node since memory |
1033 |  * is not yet online for the new node. When hotplugging memory or a cpu, | 1033 |  * is not yet online for the new node. When hotplugging memory or a cpu, |
1034 |  * existing nodes are not replaced if already in use. | 1034 |  * existing nodes are not replaced if already in use. |
1035 | * | 1035 | * |
1036 | * Must hold slab_mutex. | 1036 | * Must hold slab_mutex. |
1037 | */ | 1037 | */ |
1038 | static int init_cache_node_node(int node) | 1038 | static int init_cache_node_node(int node) |
1039 | { | 1039 | { |
1040 | struct kmem_cache *cachep; | 1040 | struct kmem_cache *cachep; |
1041 | struct kmem_cache_node *n; | 1041 | struct kmem_cache_node *n; |
1042 | const size_t memsize = sizeof(struct kmem_cache_node); | 1042 | const size_t memsize = sizeof(struct kmem_cache_node); |
1043 | 1043 | ||
1044 | list_for_each_entry(cachep, &slab_caches, list) { | 1044 | list_for_each_entry(cachep, &slab_caches, list) { |
1045 | /* | 1045 | /* |
1046 | * Set up the kmem_cache_node for cpu before we can | 1046 | * Set up the kmem_cache_node for cpu before we can |
1047 | * begin anything. Make sure some other cpu on this | 1047 | * begin anything. Make sure some other cpu on this |
1048 | * node has not already allocated this | 1048 | * node has not already allocated this |
1049 | */ | 1049 | */ |
1050 | n = get_node(cachep, node); | 1050 | n = get_node(cachep, node); |
1051 | if (!n) { | 1051 | if (!n) { |
1052 | n = kmalloc_node(memsize, GFP_KERNEL, node); | 1052 | n = kmalloc_node(memsize, GFP_KERNEL, node); |
1053 | if (!n) | 1053 | if (!n) |
1054 | return -ENOMEM; | 1054 | return -ENOMEM; |
1055 | kmem_cache_node_init(n); | 1055 | kmem_cache_node_init(n); |
1056 | n->next_reap = jiffies + REAPTIMEOUT_NODE + | 1056 | n->next_reap = jiffies + REAPTIMEOUT_NODE + |
1057 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 1057 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
1058 | 1058 | ||
1059 | /* | 1059 | /* |
1060 | * The kmem_cache_nodes don't come and go as CPUs | 1060 | * The kmem_cache_nodes don't come and go as CPUs |
1061 | * come and go. slab_mutex is sufficient | 1061 | * come and go. slab_mutex is sufficient |
1062 | * protection here. | 1062 | * protection here. |
1063 | */ | 1063 | */ |
1064 | cachep->node[node] = n; | 1064 | cachep->node[node] = n; |
1065 | } | 1065 | } |
1066 | 1066 | ||
1067 | spin_lock_irq(&n->list_lock); | 1067 | spin_lock_irq(&n->list_lock); |
1068 | n->free_limit = | 1068 | n->free_limit = |
1069 | (1 + nr_cpus_node(node)) * | 1069 | (1 + nr_cpus_node(node)) * |
1070 | cachep->batchcount + cachep->num; | 1070 | cachep->batchcount + cachep->num; |
1071 | spin_unlock_irq(&n->list_lock); | 1071 | spin_unlock_irq(&n->list_lock); |
1072 | } | 1072 | } |
1073 | return 0; | 1073 | return 0; |
1074 | } | 1074 | } |
1075 | 1075 | ||
1076 | static inline int slabs_tofree(struct kmem_cache *cachep, | 1076 | static inline int slabs_tofree(struct kmem_cache *cachep, |
1077 | struct kmem_cache_node *n) | 1077 | struct kmem_cache_node *n) |
1078 | { | 1078 | { |
1079 | return (n->free_objects + cachep->num - 1) / cachep->num; | 1079 | return (n->free_objects + cachep->num - 1) / cachep->num; |
1080 | } | 1080 | } |
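/*
 * slabs_tofree() above is just a ceiling division: with
 * n->free_objects = 10 and cachep->num = 4 objects per slab it returns
 * (10 + 3) / 4 = 3, i.e. at most three completely free slabs could be
 * handed back to the page allocator (illustrative numbers).
 */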
1081 | 1081 | ||
1082 | static void cpuup_canceled(long cpu) | 1082 | static void cpuup_canceled(long cpu) |
1083 | { | 1083 | { |
1084 | struct kmem_cache *cachep; | 1084 | struct kmem_cache *cachep; |
1085 | struct kmem_cache_node *n = NULL; | 1085 | struct kmem_cache_node *n = NULL; |
1086 | int node = cpu_to_mem(cpu); | 1086 | int node = cpu_to_mem(cpu); |
1087 | const struct cpumask *mask = cpumask_of_node(node); | 1087 | const struct cpumask *mask = cpumask_of_node(node); |
1088 | 1088 | ||
1089 | list_for_each_entry(cachep, &slab_caches, list) { | 1089 | list_for_each_entry(cachep, &slab_caches, list) { |
1090 | struct array_cache *nc; | 1090 | struct array_cache *nc; |
1091 | struct array_cache *shared; | 1091 | struct array_cache *shared; |
1092 | struct alien_cache **alien; | 1092 | struct alien_cache **alien; |
1093 | LIST_HEAD(list); | 1093 | LIST_HEAD(list); |
1094 | 1094 | ||
1095 | /* cpu is dead; no one can alloc from it. */ | 1095 | /* cpu is dead; no one can alloc from it. */ |
1096 | nc = cachep->array[cpu]; | 1096 | nc = cachep->array[cpu]; |
1097 | cachep->array[cpu] = NULL; | 1097 | cachep->array[cpu] = NULL; |
1098 | n = get_node(cachep, node); | 1098 | n = get_node(cachep, node); |
1099 | 1099 | ||
1100 | if (!n) | 1100 | if (!n) |
1101 | goto free_array_cache; | 1101 | goto free_array_cache; |
1102 | 1102 | ||
1103 | spin_lock_irq(&n->list_lock); | 1103 | spin_lock_irq(&n->list_lock); |
1104 | 1104 | ||
1105 | /* Free limit for this kmem_cache_node */ | 1105 | /* Free limit for this kmem_cache_node */ |
1106 | n->free_limit -= cachep->batchcount; | 1106 | n->free_limit -= cachep->batchcount; |
1107 | if (nc) | 1107 | if (nc) |
1108 | free_block(cachep, nc->entry, nc->avail, node, &list); | 1108 | free_block(cachep, nc->entry, nc->avail, node, &list); |
1109 | 1109 | ||
1110 | if (!cpumask_empty(mask)) { | 1110 | if (!cpumask_empty(mask)) { |
1111 | spin_unlock_irq(&n->list_lock); | 1111 | spin_unlock_irq(&n->list_lock); |
1112 | goto free_array_cache; | 1112 | goto free_array_cache; |
1113 | } | 1113 | } |
1114 | 1114 | ||
1115 | shared = n->shared; | 1115 | shared = n->shared; |
1116 | if (shared) { | 1116 | if (shared) { |
1117 | free_block(cachep, shared->entry, | 1117 | free_block(cachep, shared->entry, |
1118 | shared->avail, node, &list); | 1118 | shared->avail, node, &list); |
1119 | n->shared = NULL; | 1119 | n->shared = NULL; |
1120 | } | 1120 | } |
1121 | 1121 | ||
1122 | alien = n->alien; | 1122 | alien = n->alien; |
1123 | n->alien = NULL; | 1123 | n->alien = NULL; |
1124 | 1124 | ||
1125 | spin_unlock_irq(&n->list_lock); | 1125 | spin_unlock_irq(&n->list_lock); |
1126 | 1126 | ||
1127 | kfree(shared); | 1127 | kfree(shared); |
1128 | if (alien) { | 1128 | if (alien) { |
1129 | drain_alien_cache(cachep, alien); | 1129 | drain_alien_cache(cachep, alien); |
1130 | free_alien_cache(alien); | 1130 | free_alien_cache(alien); |
1131 | } | 1131 | } |
1132 | free_array_cache: | 1132 | free_array_cache: |
1133 | slabs_destroy(cachep, &list); | 1133 | slabs_destroy(cachep, &list); |
1134 | kfree(nc); | 1134 | kfree(nc); |
1135 | } | 1135 | } |
1136 | /* | 1136 | /* |
1137 | * In the previous loop, all the objects were freed to | 1137 | * In the previous loop, all the objects were freed to |
1138 | 	 * the respective cache's slabs; now we can go ahead and | 1138 | 	 * the respective cache's slabs; now we can go ahead and |
1139 | * shrink each nodelist to its limit. | 1139 | * shrink each nodelist to its limit. |
1140 | */ | 1140 | */ |
1141 | list_for_each_entry(cachep, &slab_caches, list) { | 1141 | list_for_each_entry(cachep, &slab_caches, list) { |
1142 | n = get_node(cachep, node); | 1142 | n = get_node(cachep, node); |
1143 | if (!n) | 1143 | if (!n) |
1144 | continue; | 1144 | continue; |
1145 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); | 1145 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
1146 | } | 1146 | } |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | static int cpuup_prepare(long cpu) | 1149 | static int cpuup_prepare(long cpu) |
1150 | { | 1150 | { |
1151 | struct kmem_cache *cachep; | 1151 | struct kmem_cache *cachep; |
1152 | struct kmem_cache_node *n = NULL; | 1152 | struct kmem_cache_node *n = NULL; |
1153 | int node = cpu_to_mem(cpu); | 1153 | int node = cpu_to_mem(cpu); |
1154 | int err; | 1154 | int err; |
1155 | 1155 | ||
1156 | /* | 1156 | /* |
1157 | * We need to do this right in the beginning since | 1157 | * We need to do this right in the beginning since |
1158 | 	 * the alloc_arraycache() calls are going to use this list. | 1158 | 	 * the alloc_arraycache() calls are going to use this list. |
1159 | * kmalloc_node allows us to add the slab to the right | 1159 | * kmalloc_node allows us to add the slab to the right |
1160 | * kmem_cache_node and not this cpu's kmem_cache_node | 1160 | * kmem_cache_node and not this cpu's kmem_cache_node |
1161 | */ | 1161 | */ |
1162 | err = init_cache_node_node(node); | 1162 | err = init_cache_node_node(node); |
1163 | if (err < 0) | 1163 | if (err < 0) |
1164 | goto bad; | 1164 | goto bad; |
1165 | 1165 | ||
1166 | /* | 1166 | /* |
1167 | * Now we can go ahead with allocating the shared arrays and | 1167 | * Now we can go ahead with allocating the shared arrays and |
1168 | * array caches | 1168 | * array caches |
1169 | */ | 1169 | */ |
1170 | list_for_each_entry(cachep, &slab_caches, list) { | 1170 | list_for_each_entry(cachep, &slab_caches, list) { |
1171 | struct array_cache *nc; | 1171 | struct array_cache *nc; |
1172 | struct array_cache *shared = NULL; | 1172 | struct array_cache *shared = NULL; |
1173 | struct alien_cache **alien = NULL; | 1173 | struct alien_cache **alien = NULL; |
1174 | 1174 | ||
1175 | nc = alloc_arraycache(node, cachep->limit, | 1175 | nc = alloc_arraycache(node, cachep->limit, |
1176 | cachep->batchcount, GFP_KERNEL); | 1176 | cachep->batchcount, GFP_KERNEL); |
1177 | if (!nc) | 1177 | if (!nc) |
1178 | goto bad; | 1178 | goto bad; |
1179 | if (cachep->shared) { | 1179 | if (cachep->shared) { |
1180 | shared = alloc_arraycache(node, | 1180 | shared = alloc_arraycache(node, |
1181 | cachep->shared * cachep->batchcount, | 1181 | cachep->shared * cachep->batchcount, |
1182 | 0xbaadf00d, GFP_KERNEL); | 1182 | 0xbaadf00d, GFP_KERNEL); |
1183 | if (!shared) { | 1183 | if (!shared) { |
1184 | kfree(nc); | 1184 | kfree(nc); |
1185 | goto bad; | 1185 | goto bad; |
1186 | } | 1186 | } |
1187 | } | 1187 | } |
1188 | if (use_alien_caches) { | 1188 | if (use_alien_caches) { |
1189 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); | 1189 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
1190 | if (!alien) { | 1190 | if (!alien) { |
1191 | kfree(shared); | 1191 | kfree(shared); |
1192 | kfree(nc); | 1192 | kfree(nc); |
1193 | goto bad; | 1193 | goto bad; |
1194 | } | 1194 | } |
1195 | } | 1195 | } |
1196 | cachep->array[cpu] = nc; | 1196 | cachep->array[cpu] = nc; |
1197 | n = get_node(cachep, node); | 1197 | n = get_node(cachep, node); |
1198 | BUG_ON(!n); | 1198 | BUG_ON(!n); |
1199 | 1199 | ||
1200 | spin_lock_irq(&n->list_lock); | 1200 | spin_lock_irq(&n->list_lock); |
1201 | if (!n->shared) { | 1201 | if (!n->shared) { |
1202 | /* | 1202 | /* |
1203 | * We are serialised from CPU_DEAD or | 1203 | * We are serialised from CPU_DEAD or |
1204 | * CPU_UP_CANCELLED by the cpucontrol lock | 1204 | * CPU_UP_CANCELLED by the cpucontrol lock |
1205 | */ | 1205 | */ |
1206 | n->shared = shared; | 1206 | n->shared = shared; |
1207 | shared = NULL; | 1207 | shared = NULL; |
1208 | } | 1208 | } |
1209 | #ifdef CONFIG_NUMA | 1209 | #ifdef CONFIG_NUMA |
1210 | if (!n->alien) { | 1210 | if (!n->alien) { |
1211 | n->alien = alien; | 1211 | n->alien = alien; |
1212 | alien = NULL; | 1212 | alien = NULL; |
1213 | } | 1213 | } |
1214 | #endif | 1214 | #endif |
1215 | spin_unlock_irq(&n->list_lock); | 1215 | spin_unlock_irq(&n->list_lock); |
1216 | kfree(shared); | 1216 | kfree(shared); |
1217 | free_alien_cache(alien); | 1217 | free_alien_cache(alien); |
1218 | } | 1218 | } |
1219 | 1219 | ||
1220 | return 0; | 1220 | return 0; |
1221 | bad: | 1221 | bad: |
1222 | cpuup_canceled(cpu); | 1222 | cpuup_canceled(cpu); |
1223 | return -ENOMEM; | 1223 | return -ENOMEM; |
1224 | } | 1224 | } |
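/*
 * Note on the error path of cpuup_prepare() above: a failure part-way
 * through the cache list does not leak memory, because the 'bad' label
 * falls back to cpuup_canceled(), which walks every cache again and frees
 * whatever per-cpu, shared and alien arrays were already installed for
 * this cpu's node.
 */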
1225 | 1225 | ||
1226 | static int cpuup_callback(struct notifier_block *nfb, | 1226 | static int cpuup_callback(struct notifier_block *nfb, |
1227 | unsigned long action, void *hcpu) | 1227 | unsigned long action, void *hcpu) |
1228 | { | 1228 | { |
1229 | long cpu = (long)hcpu; | 1229 | long cpu = (long)hcpu; |
1230 | int err = 0; | 1230 | int err = 0; |
1231 | 1231 | ||
1232 | switch (action) { | 1232 | switch (action) { |
1233 | case CPU_UP_PREPARE: | 1233 | case CPU_UP_PREPARE: |
1234 | case CPU_UP_PREPARE_FROZEN: | 1234 | case CPU_UP_PREPARE_FROZEN: |
1235 | mutex_lock(&slab_mutex); | 1235 | mutex_lock(&slab_mutex); |
1236 | err = cpuup_prepare(cpu); | 1236 | err = cpuup_prepare(cpu); |
1237 | mutex_unlock(&slab_mutex); | 1237 | mutex_unlock(&slab_mutex); |
1238 | break; | 1238 | break; |
1239 | case CPU_ONLINE: | 1239 | case CPU_ONLINE: |
1240 | case CPU_ONLINE_FROZEN: | 1240 | case CPU_ONLINE_FROZEN: |
1241 | start_cpu_timer(cpu); | 1241 | start_cpu_timer(cpu); |
1242 | break; | 1242 | break; |
1243 | #ifdef CONFIG_HOTPLUG_CPU | 1243 | #ifdef CONFIG_HOTPLUG_CPU |
1244 | case CPU_DOWN_PREPARE: | 1244 | case CPU_DOWN_PREPARE: |
1245 | case CPU_DOWN_PREPARE_FROZEN: | 1245 | case CPU_DOWN_PREPARE_FROZEN: |
1246 | /* | 1246 | /* |
1247 | * Shutdown cache reaper. Note that the slab_mutex is | 1247 | * Shutdown cache reaper. Note that the slab_mutex is |
1248 | * held so that if cache_reap() is invoked it cannot do | 1248 | * held so that if cache_reap() is invoked it cannot do |
1249 | * anything expensive but will only modify reap_work | 1249 | * anything expensive but will only modify reap_work |
1250 | * and reschedule the timer. | 1250 | * and reschedule the timer. |
1251 | */ | 1251 | */ |
1252 | cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); | 1252 | cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); |
1253 | /* Now the cache_reaper is guaranteed to be not running. */ | 1253 | /* Now the cache_reaper is guaranteed to be not running. */ |
1254 | per_cpu(slab_reap_work, cpu).work.func = NULL; | 1254 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
1255 | break; | 1255 | break; |
1256 | case CPU_DOWN_FAILED: | 1256 | case CPU_DOWN_FAILED: |
1257 | case CPU_DOWN_FAILED_FROZEN: | 1257 | case CPU_DOWN_FAILED_FROZEN: |
1258 | start_cpu_timer(cpu); | 1258 | start_cpu_timer(cpu); |
1259 | break; | 1259 | break; |
1260 | case CPU_DEAD: | 1260 | case CPU_DEAD: |
1261 | case CPU_DEAD_FROZEN: | 1261 | case CPU_DEAD_FROZEN: |
1262 | /* | 1262 | /* |
1263 | * Even if all the cpus of a node are down, we don't free the | 1263 | * Even if all the cpus of a node are down, we don't free the |
1264 | 	 * kmem_cache_node of any cache. This is to avoid a race between | 1264 | 	 * kmem_cache_node of any cache. This is to avoid a race between |
1265 | * cpu_down, and a kmalloc allocation from another cpu for | 1265 | * cpu_down, and a kmalloc allocation from another cpu for |
1266 | * memory from the node of the cpu going down. The node | 1266 | * memory from the node of the cpu going down. The node |
1267 | * structure is usually allocated from kmem_cache_create() and | 1267 | * structure is usually allocated from kmem_cache_create() and |
1268 | * gets destroyed at kmem_cache_destroy(). | 1268 | * gets destroyed at kmem_cache_destroy(). |
1269 | */ | 1269 | */ |
1270 | /* fall through */ | 1270 | /* fall through */ |
1271 | #endif | 1271 | #endif |
1272 | case CPU_UP_CANCELED: | 1272 | case CPU_UP_CANCELED: |
1273 | case CPU_UP_CANCELED_FROZEN: | 1273 | case CPU_UP_CANCELED_FROZEN: |
1274 | mutex_lock(&slab_mutex); | 1274 | mutex_lock(&slab_mutex); |
1275 | cpuup_canceled(cpu); | 1275 | cpuup_canceled(cpu); |
1276 | mutex_unlock(&slab_mutex); | 1276 | mutex_unlock(&slab_mutex); |
1277 | break; | 1277 | break; |
1278 | } | 1278 | } |
1279 | return notifier_from_errno(err); | 1279 | return notifier_from_errno(err); |
1280 | } | 1280 | } |
1281 | 1281 | ||
1282 | static struct notifier_block cpucache_notifier = { | 1282 | static struct notifier_block cpucache_notifier = { |
1283 | &cpuup_callback, NULL, 0 | 1283 | &cpuup_callback, NULL, 0 |
1284 | }; | 1284 | }; |
1285 | 1285 | ||
1286 | #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) | 1286 | #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) |
1287 | /* | 1287 | /* |
1288 | * Drains freelist for a node on each slab cache, used for memory hot-remove. | 1288 | * Drains freelist for a node on each slab cache, used for memory hot-remove. |
1289 | * Returns -EBUSY if all objects cannot be drained so that the node is not | 1289 | * Returns -EBUSY if all objects cannot be drained so that the node is not |
1290 | * removed. | 1290 | * removed. |
1291 | * | 1291 | * |
1292 | * Must hold slab_mutex. | 1292 | * Must hold slab_mutex. |
1293 | */ | 1293 | */ |
1294 | static int __meminit drain_cache_node_node(int node) | 1294 | static int __meminit drain_cache_node_node(int node) |
1295 | { | 1295 | { |
1296 | struct kmem_cache *cachep; | 1296 | struct kmem_cache *cachep; |
1297 | int ret = 0; | 1297 | int ret = 0; |
1298 | 1298 | ||
1299 | list_for_each_entry(cachep, &slab_caches, list) { | 1299 | list_for_each_entry(cachep, &slab_caches, list) { |
1300 | struct kmem_cache_node *n; | 1300 | struct kmem_cache_node *n; |
1301 | 1301 | ||
1302 | n = get_node(cachep, node); | 1302 | n = get_node(cachep, node); |
1303 | if (!n) | 1303 | if (!n) |
1304 | continue; | 1304 | continue; |
1305 | 1305 | ||
1306 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); | 1306 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
1307 | 1307 | ||
1308 | if (!list_empty(&n->slabs_full) || | 1308 | if (!list_empty(&n->slabs_full) || |
1309 | !list_empty(&n->slabs_partial)) { | 1309 | !list_empty(&n->slabs_partial)) { |
1310 | ret = -EBUSY; | 1310 | ret = -EBUSY; |
1311 | break; | 1311 | break; |
1312 | } | 1312 | } |
1313 | } | 1313 | } |
1314 | return ret; | 1314 | return ret; |
1315 | } | 1315 | } |
1316 | 1316 | ||
1317 | static int __meminit slab_memory_callback(struct notifier_block *self, | 1317 | static int __meminit slab_memory_callback(struct notifier_block *self, |
1318 | unsigned long action, void *arg) | 1318 | unsigned long action, void *arg) |
1319 | { | 1319 | { |
1320 | struct memory_notify *mnb = arg; | 1320 | struct memory_notify *mnb = arg; |
1321 | int ret = 0; | 1321 | int ret = 0; |
1322 | int nid; | 1322 | int nid; |
1323 | 1323 | ||
1324 | nid = mnb->status_change_nid; | 1324 | nid = mnb->status_change_nid; |
1325 | if (nid < 0) | 1325 | if (nid < 0) |
1326 | goto out; | 1326 | goto out; |
1327 | 1327 | ||
1328 | switch (action) { | 1328 | switch (action) { |
1329 | case MEM_GOING_ONLINE: | 1329 | case MEM_GOING_ONLINE: |
1330 | mutex_lock(&slab_mutex); | 1330 | mutex_lock(&slab_mutex); |
1331 | ret = init_cache_node_node(nid); | 1331 | ret = init_cache_node_node(nid); |
1332 | mutex_unlock(&slab_mutex); | 1332 | mutex_unlock(&slab_mutex); |
1333 | break; | 1333 | break; |
1334 | case MEM_GOING_OFFLINE: | 1334 | case MEM_GOING_OFFLINE: |
1335 | mutex_lock(&slab_mutex); | 1335 | mutex_lock(&slab_mutex); |
1336 | ret = drain_cache_node_node(nid); | 1336 | ret = drain_cache_node_node(nid); |
1337 | mutex_unlock(&slab_mutex); | 1337 | mutex_unlock(&slab_mutex); |
1338 | break; | 1338 | break; |
1339 | case MEM_ONLINE: | 1339 | case MEM_ONLINE: |
1340 | case MEM_OFFLINE: | 1340 | case MEM_OFFLINE: |
1341 | case MEM_CANCEL_ONLINE: | 1341 | case MEM_CANCEL_ONLINE: |
1342 | case MEM_CANCEL_OFFLINE: | 1342 | case MEM_CANCEL_OFFLINE: |
1343 | break; | 1343 | break; |
1344 | } | 1344 | } |
1345 | out: | 1345 | out: |
1346 | return notifier_from_errno(ret); | 1346 | return notifier_from_errno(ret); |
1347 | } | 1347 | } |
1348 | #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ | 1348 | #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ |
1349 | 1349 | ||
1350 | /* | 1350 | /* |
1351 | * swap the static kmem_cache_node with kmalloced memory | 1351 | * swap the static kmem_cache_node with kmalloced memory |
1352 | */ | 1352 | */ |
1353 | static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, | 1353 | static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, |
1354 | int nodeid) | 1354 | int nodeid) |
1355 | { | 1355 | { |
1356 | struct kmem_cache_node *ptr; | 1356 | struct kmem_cache_node *ptr; |
1357 | 1357 | ||
1358 | ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); | 1358 | ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); |
1359 | BUG_ON(!ptr); | 1359 | BUG_ON(!ptr); |
1360 | 1360 | ||
1361 | memcpy(ptr, list, sizeof(struct kmem_cache_node)); | 1361 | memcpy(ptr, list, sizeof(struct kmem_cache_node)); |
1362 | /* | 1362 | /* |
1363 | * Do not assume that spinlocks can be initialized via memcpy: | 1363 | * Do not assume that spinlocks can be initialized via memcpy: |
1364 | */ | 1364 | */ |
1365 | spin_lock_init(&ptr->list_lock); | 1365 | spin_lock_init(&ptr->list_lock); |
1366 | 1366 | ||
1367 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 1367 | MAKE_ALL_LISTS(cachep, ptr, nodeid); |
1368 | cachep->node[nodeid] = ptr; | 1368 | cachep->node[nodeid] = ptr; |
1369 | } | 1369 | } |
1370 | 1370 | ||
1371 | /* | 1371 | /* |
1372 |  * For setting up all the kmem_cache_node structures for caches whose | 1372 |  * For setting up all the kmem_cache_node structures for caches whose |
1373 |  * buffer_size is the same as the size of kmem_cache_node. | 1373 |  * buffer_size is the same as the size of kmem_cache_node. |
1374 | */ | 1374 | */ |
1375 | static void __init set_up_node(struct kmem_cache *cachep, int index) | 1375 | static void __init set_up_node(struct kmem_cache *cachep, int index) |
1376 | { | 1376 | { |
1377 | int node; | 1377 | int node; |
1378 | 1378 | ||
1379 | for_each_online_node(node) { | 1379 | for_each_online_node(node) { |
1380 | cachep->node[node] = &init_kmem_cache_node[index + node]; | 1380 | cachep->node[node] = &init_kmem_cache_node[index + node]; |
1381 | cachep->node[node]->next_reap = jiffies + | 1381 | cachep->node[node]->next_reap = jiffies + |
1382 | REAPTIMEOUT_NODE + | 1382 | REAPTIMEOUT_NODE + |
1383 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 1383 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
1384 | } | 1384 | } |
1385 | } | 1385 | } |
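/*
 * The '((unsigned long)cachep) % REAPTIMEOUT_NODE' term above (also used
 * in init_cache_node_node()) merely staggers next_reap per cache, so that
 * all caches on a node do not become due for reaping on the same jiffy.
 */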
1386 | 1386 | ||
1387 | /* | 1387 | /* |
1388 | * The memory after the last cpu cache pointer is used for the | 1388 | * The memory after the last cpu cache pointer is used for the |
1389 |  * node pointer. | 1389 |  * node pointer. |
1390 | */ | 1390 | */ |
1391 | static void setup_node_pointer(struct kmem_cache *cachep) | 1391 | static void setup_node_pointer(struct kmem_cache *cachep) |
1392 | { | 1392 | { |
1393 | cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; | 1393 | cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /* | 1396 | /* |
1397 |  * Initialisation. Called after the page allocator has been initialised and | 1397 |  * Initialisation. Called after the page allocator has been initialised and |
1398 | * before smp_init(). | 1398 | * before smp_init(). |
1399 | */ | 1399 | */ |
1400 | void __init kmem_cache_init(void) | 1400 | void __init kmem_cache_init(void) |
1401 | { | 1401 | { |
1402 | int i; | 1402 | int i; |
1403 | 1403 | ||
1404 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < | 1404 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < |
1405 | sizeof(struct rcu_head)); | 1405 | sizeof(struct rcu_head)); |
1406 | kmem_cache = &kmem_cache_boot; | 1406 | kmem_cache = &kmem_cache_boot; |
1407 | setup_node_pointer(kmem_cache); | 1407 | setup_node_pointer(kmem_cache); |
1408 | 1408 | ||
1409 | if (num_possible_nodes() == 1) | 1409 | if (num_possible_nodes() == 1) |
1410 | use_alien_caches = 0; | 1410 | use_alien_caches = 0; |
1411 | 1411 | ||
1412 | for (i = 0; i < NUM_INIT_LISTS; i++) | 1412 | for (i = 0; i < NUM_INIT_LISTS; i++) |
1413 | kmem_cache_node_init(&init_kmem_cache_node[i]); | 1413 | kmem_cache_node_init(&init_kmem_cache_node[i]); |
1414 | 1414 | ||
1415 | set_up_node(kmem_cache, CACHE_CACHE); | 1415 | set_up_node(kmem_cache, CACHE_CACHE); |
1416 | 1416 | ||
1417 | /* | 1417 | /* |
1418 | * Fragmentation resistance on low memory - only use bigger | 1418 | * Fragmentation resistance on low memory - only use bigger |
1419 | * page orders on machines with more than 32MB of memory if | 1419 | * page orders on machines with more than 32MB of memory if |
1420 | * not overridden on the command line. | 1420 | * not overridden on the command line. |
1421 | */ | 1421 | */ |
1422 | if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) | 1422 | if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) |
1423 | slab_max_order = SLAB_MAX_ORDER_HI; | 1423 | slab_max_order = SLAB_MAX_ORDER_HI; |
1424 | 1424 | ||
1425 | /* Bootstrap is tricky, because several objects are allocated | 1425 | /* Bootstrap is tricky, because several objects are allocated |
1426 | * from caches that do not exist yet: | 1426 | * from caches that do not exist yet: |
1427 | * 1) initialize the kmem_cache cache: it contains the struct | 1427 | * 1) initialize the kmem_cache cache: it contains the struct |
1428 | * kmem_cache structures of all caches, except kmem_cache itself: | 1428 | * kmem_cache structures of all caches, except kmem_cache itself: |
1429 | * kmem_cache is statically allocated. | 1429 | * kmem_cache is statically allocated. |
1430 | * Initially an __init data area is used for the head array and the | 1430 | * Initially an __init data area is used for the head array and the |
1431 | * kmem_cache_node structures, it's replaced with a kmalloc allocated | 1431 | * kmem_cache_node structures, it's replaced with a kmalloc allocated |
1432 | * array at the end of the bootstrap. | 1432 | * array at the end of the bootstrap. |
1433 | * 2) Create the first kmalloc cache. | 1433 | * 2) Create the first kmalloc cache. |
1434 | * The struct kmem_cache for the new cache is allocated normally. | 1434 | * The struct kmem_cache for the new cache is allocated normally. |
1435 | * An __init data area is used for the head array. | 1435 | * An __init data area is used for the head array. |
1436 | * 3) Create the remaining kmalloc caches, with minimally sized | 1436 | * 3) Create the remaining kmalloc caches, with minimally sized |
1437 | * head arrays. | 1437 | * head arrays. |
1438 | * 4) Replace the __init data head arrays for kmem_cache and the first | 1438 | * 4) Replace the __init data head arrays for kmem_cache and the first |
1439 | * kmalloc cache with kmalloc allocated arrays. | 1439 | * kmalloc cache with kmalloc allocated arrays. |
1440 | * 5) Replace the __init data for kmem_cache_node for kmem_cache and | 1440 | * 5) Replace the __init data for kmem_cache_node for kmem_cache and |
1441 | 	 * the other caches with kmalloc allocated memory. | 1441 | 	 * the other caches with kmalloc allocated memory. |
1442 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1442 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1443 | */ | 1443 | */ |
1444 | 1444 | ||
1445 | /* 1) create the kmem_cache */ | 1445 | /* 1) create the kmem_cache */ |
1446 | 1446 | ||
1447 | /* | 1447 | /* |
1448 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1448 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1449 | */ | 1449 | */ |
1450 | create_boot_cache(kmem_cache, "kmem_cache", | 1450 | create_boot_cache(kmem_cache, "kmem_cache", |
1451 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1451 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1452 | nr_node_ids * sizeof(struct kmem_cache_node *), | 1452 | nr_node_ids * sizeof(struct kmem_cache_node *), |
1453 | SLAB_HWCACHE_ALIGN); | 1453 | SLAB_HWCACHE_ALIGN); |
1454 | list_add(&kmem_cache->list, &slab_caches); | 1454 | list_add(&kmem_cache->list, &slab_caches); |
1455 | 1455 | ||
1456 | /* 2+3) create the kmalloc caches */ | 1456 | /* 2+3) create the kmalloc caches */ |
1457 | 1457 | ||
1458 | /* | 1458 | /* |
1459 | * Initialize the caches that provide memory for the array cache and the | 1459 | * Initialize the caches that provide memory for the array cache and the |
1460 | * kmem_cache_node structures first. Without this, further allocations will | 1460 | * kmem_cache_node structures first. Without this, further allocations will |
1461 | * bug. | 1461 | * bug. |
1462 | */ | 1462 | */ |
1463 | 1463 | ||
1464 | kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", | 1464 | kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", |
1465 | kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); | 1465 | kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); |
1466 | 1466 | ||
1467 | if (INDEX_AC != INDEX_NODE) | 1467 | if (INDEX_AC != INDEX_NODE) |
1468 | kmalloc_caches[INDEX_NODE] = | 1468 | kmalloc_caches[INDEX_NODE] = |
1469 | create_kmalloc_cache("kmalloc-node", | 1469 | create_kmalloc_cache("kmalloc-node", |
1470 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); | 1470 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); |
1471 | 1471 | ||
1472 | slab_early_init = 0; | 1472 | slab_early_init = 0; |
1473 | 1473 | ||
1474 | /* 4) Replace the bootstrap head arrays */ | 1474 | /* 4) Replace the bootstrap head arrays */ |
1475 | { | 1475 | { |
1476 | struct array_cache *ptr; | 1476 | struct array_cache *ptr; |
1477 | 1477 | ||
1478 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1478 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1479 | 1479 | ||
1480 | memcpy(ptr, cpu_cache_get(kmem_cache), | 1480 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1481 | sizeof(struct arraycache_init)); | 1481 | sizeof(struct arraycache_init)); |
1482 | 1482 | ||
1483 | kmem_cache->array[smp_processor_id()] = ptr; | 1483 | kmem_cache->array[smp_processor_id()] = ptr; |
1484 | 1484 | ||
1485 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1485 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1486 | 1486 | ||
1487 | BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) | 1487 | BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) |
1488 | != &initarray_generic.cache); | 1488 | != &initarray_generic.cache); |
1489 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), | 1489 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), |
1490 | sizeof(struct arraycache_init)); | 1490 | sizeof(struct arraycache_init)); |
1491 | 1491 | ||
1492 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; | 1492 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; |
1493 | } | 1493 | } |
1494 | /* 5) Replace the bootstrap kmem_cache_node */ | 1494 | /* 5) Replace the bootstrap kmem_cache_node */ |
1495 | { | 1495 | { |
1496 | int nid; | 1496 | int nid; |
1497 | 1497 | ||
1498 | for_each_online_node(nid) { | 1498 | for_each_online_node(nid) { |
1499 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); | 1499 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); |
1500 | 1500 | ||
1501 | init_list(kmalloc_caches[INDEX_AC], | 1501 | init_list(kmalloc_caches[INDEX_AC], |
1502 | &init_kmem_cache_node[SIZE_AC + nid], nid); | 1502 | &init_kmem_cache_node[SIZE_AC + nid], nid); |
1503 | 1503 | ||
1504 | if (INDEX_AC != INDEX_NODE) { | 1504 | if (INDEX_AC != INDEX_NODE) { |
1505 | init_list(kmalloc_caches[INDEX_NODE], | 1505 | init_list(kmalloc_caches[INDEX_NODE], |
1506 | &init_kmem_cache_node[SIZE_NODE + nid], nid); | 1506 | &init_kmem_cache_node[SIZE_NODE + nid], nid); |
1507 | } | 1507 | } |
1508 | } | 1508 | } |
1509 | } | 1509 | } |
1510 | 1510 | ||
1511 | create_kmalloc_caches(ARCH_KMALLOC_FLAGS); | 1511 | create_kmalloc_caches(ARCH_KMALLOC_FLAGS); |
1512 | } | 1512 | } |
1513 | 1513 | ||
1514 | void __init kmem_cache_init_late(void) | 1514 | void __init kmem_cache_init_late(void) |
1515 | { | 1515 | { |
1516 | struct kmem_cache *cachep; | 1516 | struct kmem_cache *cachep; |
1517 | 1517 | ||
1518 | slab_state = UP; | 1518 | slab_state = UP; |
1519 | 1519 | ||
1520 | /* 6) resize the head arrays to their final sizes */ | 1520 | /* 6) resize the head arrays to their final sizes */ |
1521 | mutex_lock(&slab_mutex); | 1521 | mutex_lock(&slab_mutex); |
1522 | list_for_each_entry(cachep, &slab_caches, list) | 1522 | list_for_each_entry(cachep, &slab_caches, list) |
1523 | if (enable_cpucache(cachep, GFP_NOWAIT)) | 1523 | if (enable_cpucache(cachep, GFP_NOWAIT)) |
1524 | BUG(); | 1524 | BUG(); |
1525 | mutex_unlock(&slab_mutex); | 1525 | mutex_unlock(&slab_mutex); |
1526 | 1526 | ||
1527 | /* Done! */ | 1527 | /* Done! */ |
1528 | slab_state = FULL; | 1528 | slab_state = FULL; |
1529 | 1529 | ||
1530 | /* | 1530 | /* |
1531 | * Register a cpu startup notifier callback that initializes | 1531 | * Register a cpu startup notifier callback that initializes |
1532 | * cpu_cache_get for all new cpus | 1532 | * cpu_cache_get for all new cpus |
1533 | */ | 1533 | */ |
1534 | register_cpu_notifier(&cpucache_notifier); | 1534 | register_cpu_notifier(&cpucache_notifier); |
1535 | 1535 | ||
1536 | #ifdef CONFIG_NUMA | 1536 | #ifdef CONFIG_NUMA |
1537 | /* | 1537 | /* |
1538 | 	 * Register a memory hotplug callback that sets up and drains the | 1538 | 	 * Register a memory hotplug callback that sets up and drains the |
1539 | 	 * per-node kmem_cache_node structures. | 1539 | 	 * per-node kmem_cache_node structures. |
1540 | */ | 1540 | */ |
1541 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 1541 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); |
1542 | #endif | 1542 | #endif |
1543 | 1543 | ||
1544 | /* | 1544 | /* |
1545 | * The reap timers are started later, with a module init call: That part | 1545 | * The reap timers are started later, with a module init call: That part |
1546 | * of the kernel is not yet operational. | 1546 | * of the kernel is not yet operational. |
1547 | */ | 1547 | */ |
1548 | } | 1548 | } |
1549 | 1549 | ||
1550 | static int __init cpucache_init(void) | 1550 | static int __init cpucache_init(void) |
1551 | { | 1551 | { |
1552 | int cpu; | 1552 | int cpu; |
1553 | 1553 | ||
1554 | /* | 1554 | /* |
1555 | * Register the timers that return unneeded pages to the page allocator | 1555 | * Register the timers that return unneeded pages to the page allocator |
1556 | */ | 1556 | */ |
1557 | for_each_online_cpu(cpu) | 1557 | for_each_online_cpu(cpu) |
1558 | start_cpu_timer(cpu); | 1558 | start_cpu_timer(cpu); |
1559 | 1559 | ||
1560 | /* Done! */ | 1560 | /* Done! */ |
1561 | slab_state = FULL; | 1561 | slab_state = FULL; |
1562 | return 0; | 1562 | return 0; |
1563 | } | 1563 | } |
1564 | __initcall(cpucache_init); | 1564 | __initcall(cpucache_init); |
1565 | 1565 | ||
1566 | static noinline void | 1566 | static noinline void |
1567 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | 1567 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) |
1568 | { | 1568 | { |
1569 | #if DEBUG | 1569 | #if DEBUG |
1570 | struct kmem_cache_node *n; | 1570 | struct kmem_cache_node *n; |
1571 | struct page *page; | 1571 | struct page *page; |
1572 | unsigned long flags; | 1572 | unsigned long flags; |
1573 | int node; | 1573 | int node; |
1574 | static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 1574 | static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
1575 | DEFAULT_RATELIMIT_BURST); | 1575 | DEFAULT_RATELIMIT_BURST); |
1576 | 1576 | ||
1577 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) | 1577 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) |
1578 | return; | 1578 | return; |
1579 | 1579 | ||
1580 | printk(KERN_WARNING | 1580 | printk(KERN_WARNING |
1581 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1581 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
1582 | nodeid, gfpflags); | 1582 | nodeid, gfpflags); |
1583 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | 1583 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", |
1584 | cachep->name, cachep->size, cachep->gfporder); | 1584 | cachep->name, cachep->size, cachep->gfporder); |
1585 | 1585 | ||
1586 | for_each_kmem_cache_node(cachep, node, n) { | 1586 | for_each_kmem_cache_node(cachep, node, n) { |
1587 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; | 1587 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; |
1588 | unsigned long active_slabs = 0, num_slabs = 0; | 1588 | unsigned long active_slabs = 0, num_slabs = 0; |
1589 | 1589 | ||
1590 | spin_lock_irqsave(&n->list_lock, flags); | 1590 | spin_lock_irqsave(&n->list_lock, flags); |
1591 | list_for_each_entry(page, &n->slabs_full, lru) { | 1591 | list_for_each_entry(page, &n->slabs_full, lru) { |
1592 | active_objs += cachep->num; | 1592 | active_objs += cachep->num; |
1593 | active_slabs++; | 1593 | active_slabs++; |
1594 | } | 1594 | } |
1595 | list_for_each_entry(page, &n->slabs_partial, lru) { | 1595 | list_for_each_entry(page, &n->slabs_partial, lru) { |
1596 | active_objs += page->active; | 1596 | active_objs += page->active; |
1597 | active_slabs++; | 1597 | active_slabs++; |
1598 | } | 1598 | } |
1599 | list_for_each_entry(page, &n->slabs_free, lru) | 1599 | list_for_each_entry(page, &n->slabs_free, lru) |
1600 | num_slabs++; | 1600 | num_slabs++; |
1601 | 1601 | ||
1602 | free_objects += n->free_objects; | 1602 | free_objects += n->free_objects; |
1603 | spin_unlock_irqrestore(&n->list_lock, flags); | 1603 | spin_unlock_irqrestore(&n->list_lock, flags); |
1604 | 1604 | ||
1605 | num_slabs += active_slabs; | 1605 | num_slabs += active_slabs; |
1606 | num_objs = num_slabs * cachep->num; | 1606 | num_objs = num_slabs * cachep->num; |
1607 | printk(KERN_WARNING | 1607 | printk(KERN_WARNING |
1608 | " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", | 1608 | " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", |
1609 | node, active_slabs, num_slabs, active_objs, num_objs, | 1609 | node, active_slabs, num_slabs, active_objs, num_objs, |
1610 | free_objects); | 1610 | free_objects); |
1611 | } | 1611 | } |
1612 | #endif | 1612 | #endif |
1613 | } | 1613 | } |
1614 | 1614 | ||
1615 | /* | 1615 | /* |
1616 | * Interface to system's page allocator. No need to hold the | 1616 | * Interface to system's page allocator. No need to hold the |
1617 | * kmem_cache_node ->list_lock. | 1617 | * kmem_cache_node ->list_lock. |
1618 | * | 1618 | * |
1619 | * If we requested dmaable memory, we will get it. Even if we | 1619 | * If we requested dmaable memory, we will get it. Even if we |
1620 | * did not request dmaable memory, we might get it, but that | 1620 | * did not request dmaable memory, we might get it, but that |
1621 | * would be relatively rare and ignorable. | 1621 | * would be relatively rare and ignorable. |
1622 | */ | 1622 | */ |
1623 | static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | 1623 | static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, |
1624 | int nodeid) | 1624 | int nodeid) |
1625 | { | 1625 | { |
1626 | struct page *page; | 1626 | struct page *page; |
1627 | int nr_pages; | 1627 | int nr_pages; |
1628 | 1628 | ||
1629 | flags |= cachep->allocflags; | 1629 | flags |= cachep->allocflags; |
1630 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1630 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1631 | flags |= __GFP_RECLAIMABLE; | 1631 | flags |= __GFP_RECLAIMABLE; |
1632 | 1632 | ||
1633 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | 1633 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) |
1634 | return NULL; | 1634 | return NULL; |
1635 | 1635 | ||
1636 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1636 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1637 | if (!page) { | 1637 | if (!page) { |
1638 | memcg_uncharge_slab(cachep, cachep->gfporder); | 1638 | memcg_uncharge_slab(cachep, cachep->gfporder); |
1639 | slab_out_of_memory(cachep, flags, nodeid); | 1639 | slab_out_of_memory(cachep, flags, nodeid); |
1640 | return NULL; | 1640 | return NULL; |
1641 | } | 1641 | } |
1642 | 1642 | ||
1643 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | 1643 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ |
1644 | if (unlikely(page->pfmemalloc)) | 1644 | if (unlikely(page->pfmemalloc)) |
1645 | pfmemalloc_active = true; | 1645 | pfmemalloc_active = true; |
1646 | 1646 | ||
1647 | nr_pages = (1 << cachep->gfporder); | 1647 | nr_pages = (1 << cachep->gfporder); |
1648 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1648 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1649 | add_zone_page_state(page_zone(page), | 1649 | add_zone_page_state(page_zone(page), |
1650 | NR_SLAB_RECLAIMABLE, nr_pages); | 1650 | NR_SLAB_RECLAIMABLE, nr_pages); |
1651 | else | 1651 | else |
1652 | add_zone_page_state(page_zone(page), | 1652 | add_zone_page_state(page_zone(page), |
1653 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1653 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1654 | __SetPageSlab(page); | 1654 | __SetPageSlab(page); |
1655 | if (page->pfmemalloc) | 1655 | if (page->pfmemalloc) |
1656 | SetPageSlabPfmemalloc(page); | 1656 | SetPageSlabPfmemalloc(page); |
1657 | 1657 | ||
1658 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1658 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1659 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1659 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1660 | 1660 | ||
1661 | if (cachep->ctor) | 1661 | if (cachep->ctor) |
1662 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | 1662 | kmemcheck_mark_uninitialized_pages(page, nr_pages); |
1663 | else | 1663 | else |
1664 | kmemcheck_mark_unallocated_pages(page, nr_pages); | 1664 | kmemcheck_mark_unallocated_pages(page, nr_pages); |
1665 | } | 1665 | } |
1666 | 1666 | ||
1667 | return page; | 1667 | return page; |
1668 | } | 1668 | } |
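/*
 * Example of the accounting done in kmem_getpages() above (illustrative):
 * a cache with gfporder = 2 takes 4 pages per slab, so each successful
 * allocation bumps NR_SLAB_RECLAIMABLE or NR_SLAB_UNRECLAIMABLE for the
 * page's zone by 4; kmem_freepages() below subtracts the same amount when
 * the slab is returned.
 */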
1669 | 1669 | ||
1670 | /* | 1670 | /* |
1671 | * Interface to system's page release. | 1671 | * Interface to system's page release. |
1672 | */ | 1672 | */ |
1673 | static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | 1673 | static void kmem_freepages(struct kmem_cache *cachep, struct page *page) |
1674 | { | 1674 | { |
1675 | const unsigned long nr_freed = (1 << cachep->gfporder); | 1675 | const unsigned long nr_freed = (1 << cachep->gfporder); |
1676 | 1676 | ||
1677 | kmemcheck_free_shadow(page, cachep->gfporder); | 1677 | kmemcheck_free_shadow(page, cachep->gfporder); |
1678 | 1678 | ||
1679 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1679 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1680 | sub_zone_page_state(page_zone(page), | 1680 | sub_zone_page_state(page_zone(page), |
1681 | NR_SLAB_RECLAIMABLE, nr_freed); | 1681 | NR_SLAB_RECLAIMABLE, nr_freed); |
1682 | else | 1682 | else |
1683 | sub_zone_page_state(page_zone(page), | 1683 | sub_zone_page_state(page_zone(page), |
1684 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1684 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1685 | 1685 | ||
1686 | BUG_ON(!PageSlab(page)); | 1686 | BUG_ON(!PageSlab(page)); |
1687 | __ClearPageSlabPfmemalloc(page); | 1687 | __ClearPageSlabPfmemalloc(page); |
1688 | __ClearPageSlab(page); | 1688 | __ClearPageSlab(page); |
1689 | page_mapcount_reset(page); | 1689 | page_mapcount_reset(page); |
1690 | page->mapping = NULL; | 1690 | page->mapping = NULL; |
1691 | 1691 | ||
1692 | if (current->reclaim_state) | 1692 | if (current->reclaim_state) |
1693 | current->reclaim_state->reclaimed_slab += nr_freed; | 1693 | current->reclaim_state->reclaimed_slab += nr_freed; |
1694 | __free_pages(page, cachep->gfporder); | 1694 | __free_pages(page, cachep->gfporder); |
1695 | memcg_uncharge_slab(cachep, cachep->gfporder); | 1695 | memcg_uncharge_slab(cachep, cachep->gfporder); |
1696 | } | 1696 | } |
1697 | 1697 | ||
1698 | static void kmem_rcu_free(struct rcu_head *head) | 1698 | static void kmem_rcu_free(struct rcu_head *head) |
1699 | { | 1699 | { |
1700 | struct kmem_cache *cachep; | 1700 | struct kmem_cache *cachep; |
1701 | struct page *page; | 1701 | struct page *page; |
1702 | 1702 | ||
1703 | page = container_of(head, struct page, rcu_head); | 1703 | page = container_of(head, struct page, rcu_head); |
1704 | cachep = page->slab_cache; | 1704 | cachep = page->slab_cache; |
1705 | 1705 | ||
1706 | kmem_freepages(cachep, page); | 1706 | kmem_freepages(cachep, page); |
1707 | } | 1707 | } |
1708 | 1708 | ||
1709 | #if DEBUG | 1709 | #if DEBUG |
1710 | 1710 | ||
1711 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1711 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1712 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | 1712 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, |
1713 | unsigned long caller) | 1713 | unsigned long caller) |
1714 | { | 1714 | { |
1715 | int size = cachep->object_size; | 1715 | int size = cachep->object_size; |
1716 | 1716 | ||
1717 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; | 1717 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; |
1718 | 1718 | ||
1719 | if (size < 5 * sizeof(unsigned long)) | 1719 | if (size < 5 * sizeof(unsigned long)) |
1720 | return; | 1720 | return; |
1721 | 1721 | ||
1722 | *addr++ = 0x12345678; | 1722 | *addr++ = 0x12345678; |
1723 | *addr++ = caller; | 1723 | *addr++ = caller; |
1724 | *addr++ = smp_processor_id(); | 1724 | *addr++ = smp_processor_id(); |
1725 | size -= 3 * sizeof(unsigned long); | 1725 | size -= 3 * sizeof(unsigned long); |
1726 | { | 1726 | { |
1727 | unsigned long *sptr = &caller; | 1727 | unsigned long *sptr = &caller; |
1728 | unsigned long svalue; | 1728 | unsigned long svalue; |
1729 | 1729 | ||
1730 | while (!kstack_end(sptr)) { | 1730 | while (!kstack_end(sptr)) { |
1731 | svalue = *sptr++; | 1731 | svalue = *sptr++; |
1732 | if (kernel_text_address(svalue)) { | 1732 | if (kernel_text_address(svalue)) { |
1733 | *addr++ = svalue; | 1733 | *addr++ = svalue; |
1734 | size -= sizeof(unsigned long); | 1734 | size -= sizeof(unsigned long); |
1735 | if (size <= sizeof(unsigned long)) | 1735 | if (size <= sizeof(unsigned long)) |
1736 | break; | 1736 | break; |
1737 | } | 1737 | } |
1738 | } | 1738 | } |
1739 | 1739 | ||
1740 | } | 1740 | } |
1741 | *addr++ = 0x87654321; | 1741 | *addr++ = 0x87654321; |
1742 | } | 1742 | } |
1743 | #endif | 1743 | #endif |
1744 | 1744 | ||
1745 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | 1745 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) |
1746 | { | 1746 | { |
1747 | int size = cachep->object_size; | 1747 | int size = cachep->object_size; |
1748 | addr = &((char *)addr)[obj_offset(cachep)]; | 1748 | addr = &((char *)addr)[obj_offset(cachep)]; |
1749 | 1749 | ||
1750 | memset(addr, val, size); | 1750 | memset(addr, val, size); |
1751 | *(unsigned char *)(addr + size - 1) = POISON_END; | 1751 | *(unsigned char *)(addr + size - 1) = POISON_END; |
1752 | } | 1752 | } |
1753 | 1753 | ||
1754 | static void dump_line(char *data, int offset, int limit) | 1754 | static void dump_line(char *data, int offset, int limit) |
1755 | { | 1755 | { |
1756 | int i; | 1756 | int i; |
1757 | unsigned char error = 0; | 1757 | unsigned char error = 0; |
1758 | int bad_count = 0; | 1758 | int bad_count = 0; |
1759 | 1759 | ||
1760 | printk(KERN_ERR "%03x: ", offset); | 1760 | printk(KERN_ERR "%03x: ", offset); |
1761 | for (i = 0; i < limit; i++) { | 1761 | for (i = 0; i < limit; i++) { |
1762 | if (data[offset + i] != POISON_FREE) { | 1762 | if (data[offset + i] != POISON_FREE) { |
1763 | error = data[offset + i]; | 1763 | error = data[offset + i]; |
1764 | bad_count++; | 1764 | bad_count++; |
1765 | } | 1765 | } |
1766 | } | 1766 | } |
1767 | print_hex_dump(KERN_CONT, "", 0, 16, 1, | 1767 | print_hex_dump(KERN_CONT, "", 0, 16, 1, |
1768 | &data[offset], limit, 1); | 1768 | &data[offset], limit, 1); |
1769 | 1769 | ||
1770 | if (bad_count == 1) { | 1770 | if (bad_count == 1) { |
1771 | error ^= POISON_FREE; | 1771 | error ^= POISON_FREE; |
1772 | if (!(error & (error - 1))) { | 1772 | if (!(error & (error - 1))) { |
1773 | printk(KERN_ERR "Single bit error detected. Probably " | 1773 | printk(KERN_ERR "Single bit error detected. Probably " |
1774 | "bad RAM.\n"); | 1774 | "bad RAM.\n"); |
1775 | #ifdef CONFIG_X86 | 1775 | #ifdef CONFIG_X86 |
1776 | printk(KERN_ERR "Run memtest86+ or a similar memory " | 1776 | printk(KERN_ERR "Run memtest86+ or a similar memory " |
1777 | "test tool.\n"); | 1777 | "test tool.\n"); |
1778 | #else | 1778 | #else |
1779 | printk(KERN_ERR "Run a memory test tool.\n"); | 1779 | printk(KERN_ERR "Run a memory test tool.\n"); |
1780 | #endif | 1780 | #endif |
1781 | } | 1781 | } |
1782 | } | 1782 | } |
1783 | } | 1783 | } |
1784 | #endif | 1784 | #endif |
1785 | 1785 | ||
1786 | #if DEBUG | 1786 | #if DEBUG |
1787 | 1787 | ||
1788 | static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | 1788 | static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) |
1789 | { | 1789 | { |
1790 | int i, size; | 1790 | int i, size; |
1791 | char *realobj; | 1791 | char *realobj; |
1792 | 1792 | ||
1793 | if (cachep->flags & SLAB_RED_ZONE) { | 1793 | if (cachep->flags & SLAB_RED_ZONE) { |
1794 | printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", | 1794 | printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", |
1795 | *dbg_redzone1(cachep, objp), | 1795 | *dbg_redzone1(cachep, objp), |
1796 | *dbg_redzone2(cachep, objp)); | 1796 | *dbg_redzone2(cachep, objp)); |
1797 | } | 1797 | } |
1798 | 1798 | ||
1799 | if (cachep->flags & SLAB_STORE_USER) { | 1799 | if (cachep->flags & SLAB_STORE_USER) { |
1800 | printk(KERN_ERR "Last user: [<%p>](%pSR)\n", | 1800 | printk(KERN_ERR "Last user: [<%p>](%pSR)\n", |
1801 | *dbg_userword(cachep, objp), | 1801 | *dbg_userword(cachep, objp), |
1802 | *dbg_userword(cachep, objp)); | 1802 | *dbg_userword(cachep, objp)); |
1803 | } | 1803 | } |
1804 | realobj = (char *)objp + obj_offset(cachep); | 1804 | realobj = (char *)objp + obj_offset(cachep); |
1805 | size = cachep->object_size; | 1805 | size = cachep->object_size; |
1806 | for (i = 0; i < size && lines; i += 16, lines--) { | 1806 | for (i = 0; i < size && lines; i += 16, lines--) { |
1807 | int limit; | 1807 | int limit; |
1808 | limit = 16; | 1808 | limit = 16; |
1809 | if (i + limit > size) | 1809 | if (i + limit > size) |
1810 | limit = size - i; | 1810 | limit = size - i; |
1811 | dump_line(realobj, i, limit); | 1811 | dump_line(realobj, i, limit); |
1812 | } | 1812 | } |
1813 | } | 1813 | } |
1814 | 1814 | ||
1815 | static void check_poison_obj(struct kmem_cache *cachep, void *objp) | 1815 | static void check_poison_obj(struct kmem_cache *cachep, void *objp) |
1816 | { | 1816 | { |
1817 | char *realobj; | 1817 | char *realobj; |
1818 | int size, i; | 1818 | int size, i; |
1819 | int lines = 0; | 1819 | int lines = 0; |
1820 | 1820 | ||
1821 | realobj = (char *)objp + obj_offset(cachep); | 1821 | realobj = (char *)objp + obj_offset(cachep); |
1822 | size = cachep->object_size; | 1822 | size = cachep->object_size; |
1823 | 1823 | ||
1824 | for (i = 0; i < size; i++) { | 1824 | for (i = 0; i < size; i++) { |
1825 | char exp = POISON_FREE; | 1825 | char exp = POISON_FREE; |
1826 | if (i == size - 1) | 1826 | if (i == size - 1) |
1827 | exp = POISON_END; | 1827 | exp = POISON_END; |
1828 | if (realobj[i] != exp) { | 1828 | if (realobj[i] != exp) { |
1829 | int limit; | 1829 | int limit; |
1830 | /* Mismatch ! */ | 1830 | /* Mismatch ! */ |
1831 | /* Print header */ | 1831 | /* Print header */ |
1832 | if (lines == 0) { | 1832 | if (lines == 0) { |
1833 | printk(KERN_ERR | 1833 | printk(KERN_ERR |
1834 | "Slab corruption (%s): %s start=%p, len=%d\n", | 1834 | "Slab corruption (%s): %s start=%p, len=%d\n", |
1835 | print_tainted(), cachep->name, realobj, size); | 1835 | print_tainted(), cachep->name, realobj, size); |
1836 | print_objinfo(cachep, objp, 0); | 1836 | print_objinfo(cachep, objp, 0); |
1837 | } | 1837 | } |
1838 | /* Hexdump the affected line */ | 1838 | /* Hexdump the affected line */ |
1839 | i = (i / 16) * 16; | 1839 | i = (i / 16) * 16; |
1840 | limit = 16; | 1840 | limit = 16; |
1841 | if (i + limit > size) | 1841 | if (i + limit > size) |
1842 | limit = size - i; | 1842 | limit = size - i; |
1843 | dump_line(realobj, i, limit); | 1843 | dump_line(realobj, i, limit); |
1844 | i += 16; | 1844 | i += 16; |
1845 | lines++; | 1845 | lines++; |
1846 | /* Limit to 5 lines */ | 1846 | /* Limit to 5 lines */ |
1847 | if (lines > 5) | 1847 | if (lines > 5) |
1848 | break; | 1848 | break; |
1849 | } | 1849 | } |
1850 | } | 1850 | } |
1851 | if (lines != 0) { | 1851 | if (lines != 0) { |
1852 | /* Print some data about the neighboring objects, if they | 1852 | /* Print some data about the neighboring objects, if they |
1853 | * exist: | 1853 | * exist: |
1854 | */ | 1854 | */ |
1855 | struct page *page = virt_to_head_page(objp); | 1855 | struct page *page = virt_to_head_page(objp); |
1856 | unsigned int objnr; | 1856 | unsigned int objnr; |
1857 | 1857 | ||
1858 | objnr = obj_to_index(cachep, page, objp); | 1858 | objnr = obj_to_index(cachep, page, objp); |
1859 | if (objnr) { | 1859 | if (objnr) { |
1860 | objp = index_to_obj(cachep, page, objnr - 1); | 1860 | objp = index_to_obj(cachep, page, objnr - 1); |
1861 | realobj = (char *)objp + obj_offset(cachep); | 1861 | realobj = (char *)objp + obj_offset(cachep); |
1862 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1862 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
1863 | realobj, size); | 1863 | realobj, size); |
1864 | print_objinfo(cachep, objp, 2); | 1864 | print_objinfo(cachep, objp, 2); |
1865 | } | 1865 | } |
1866 | if (objnr + 1 < cachep->num) { | 1866 | if (objnr + 1 < cachep->num) { |
1867 | objp = index_to_obj(cachep, page, objnr + 1); | 1867 | objp = index_to_obj(cachep, page, objnr + 1); |
1868 | realobj = (char *)objp + obj_offset(cachep); | 1868 | realobj = (char *)objp + obj_offset(cachep); |
1869 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1869 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
1870 | realobj, size); | 1870 | realobj, size); |
1871 | print_objinfo(cachep, objp, 2); | 1871 | print_objinfo(cachep, objp, 2); |
1872 | } | 1872 | } |
1873 | } | 1873 | } |
1874 | } | 1874 | } |
1875 | #endif | 1875 | #endif |
1876 | 1876 | ||
1877 | #if DEBUG | 1877 | #if DEBUG |
1878 | static void slab_destroy_debugcheck(struct kmem_cache *cachep, | 1878 | static void slab_destroy_debugcheck(struct kmem_cache *cachep, |
1879 | struct page *page) | 1879 | struct page *page) |
1880 | { | 1880 | { |
1881 | int i; | 1881 | int i; |
1882 | for (i = 0; i < cachep->num; i++) { | 1882 | for (i = 0; i < cachep->num; i++) { |
1883 | void *objp = index_to_obj(cachep, page, i); | 1883 | void *objp = index_to_obj(cachep, page, i); |
1884 | 1884 | ||
1885 | if (cachep->flags & SLAB_POISON) { | 1885 | if (cachep->flags & SLAB_POISON) { |
1886 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1886 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1887 | if (cachep->size % PAGE_SIZE == 0 && | 1887 | if (cachep->size % PAGE_SIZE == 0 && |
1888 | OFF_SLAB(cachep)) | 1888 | OFF_SLAB(cachep)) |
1889 | kernel_map_pages(virt_to_page(objp), | 1889 | kernel_map_pages(virt_to_page(objp), |
1890 | cachep->size / PAGE_SIZE, 1); | 1890 | cachep->size / PAGE_SIZE, 1); |
1891 | else | 1891 | else |
1892 | check_poison_obj(cachep, objp); | 1892 | check_poison_obj(cachep, objp); |
1893 | #else | 1893 | #else |
1894 | check_poison_obj(cachep, objp); | 1894 | check_poison_obj(cachep, objp); |
1895 | #endif | 1895 | #endif |
1896 | } | 1896 | } |
1897 | if (cachep->flags & SLAB_RED_ZONE) { | 1897 | if (cachep->flags & SLAB_RED_ZONE) { |
1898 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 1898 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
1899 | slab_error(cachep, "start of a freed object " | 1899 | slab_error(cachep, "start of a freed object " |
1900 | "was overwritten"); | 1900 | "was overwritten"); |
1901 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 1901 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
1902 | slab_error(cachep, "end of a freed object " | 1902 | slab_error(cachep, "end of a freed object " |
1903 | "was overwritten"); | 1903 | "was overwritten"); |
1904 | } | 1904 | } |
1905 | } | 1905 | } |
1906 | } | 1906 | } |
1907 | #else | 1907 | #else |
1908 | static void slab_destroy_debugcheck(struct kmem_cache *cachep, | 1908 | static void slab_destroy_debugcheck(struct kmem_cache *cachep, |
1909 | struct page *page) | 1909 | struct page *page) |
1910 | { | 1910 | { |
1911 | } | 1911 | } |
1912 | #endif | 1912 | #endif |
1913 | 1913 | ||
1914 | /** | 1914 | /** |
1915 | * slab_destroy - destroy and release all objects in a slab | 1915 | * slab_destroy - destroy and release all objects in a slab |
1916 | * @cachep: cache pointer being destroyed | 1916 | * @cachep: cache pointer being destroyed |
1917 | * @page: page pointer being destroyed | 1917 | * @page: page pointer being destroyed |
1918 | * | 1918 | * |
1919 | * Destroy all the objs in a slab page, and release the mem back to the system. | 1919 | * Destroy all the objs in a slab page, and release the mem back to the system. |
1920 | * Before calling, the slab page must have been unlinked from the cache. The | 1920 | * Before calling, the slab page must have been unlinked from the cache. The |
1921 | * kmem_cache_node ->list_lock is not held/needed. | 1921 | * kmem_cache_node ->list_lock is not held/needed. |
1922 | */ | 1922 | */ |
1923 | static void slab_destroy(struct kmem_cache *cachep, struct page *page) | 1923 | static void slab_destroy(struct kmem_cache *cachep, struct page *page) |
1924 | { | 1924 | { |
1925 | void *freelist; | 1925 | void *freelist; |
1926 | 1926 | ||
1927 | freelist = page->freelist; | 1927 | freelist = page->freelist; |
1928 | slab_destroy_debugcheck(cachep, page); | 1928 | slab_destroy_debugcheck(cachep, page); |
1929 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { | 1929 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { |
1930 | struct rcu_head *head; | 1930 | struct rcu_head *head; |
1931 | 1931 | ||
1932 | /* | 1932 | /* |
1933 | * RCU free overloads the RCU head over the LRU. | 1933 | * RCU free overloads the RCU head over the LRU. |
1934 | * slab_page has been overloaded over the LRU, | 1934 | * slab_page has been overloaded over the LRU, |
1935 | * however it is not used from now on so that | 1935 | * however it is not used from now on so that |
1936 | * we can use it safely. | 1936 | * we can use it safely. |
1937 | */ | 1937 | */ |
1938 | head = (void *)&page->rcu_head; | 1938 | head = (void *)&page->rcu_head; |
1939 | call_rcu(head, kmem_rcu_free); | 1939 | call_rcu(head, kmem_rcu_free); |
1940 | 1940 | ||
1941 | } else { | 1941 | } else { |
1942 | kmem_freepages(cachep, page); | 1942 | kmem_freepages(cachep, page); |
1943 | } | 1943 | } |
1944 | 1944 | ||
1945 | /* | 1945 | /* |
1946 | * From now on, we don't use freelist | 1946 | * From now on, we don't use freelist |
1947 | * although the actual page can be freed in rcu context | 1947 | * although the actual page can be freed in rcu context |
1948 | */ | 1948 | */ |
1949 | if (OFF_SLAB(cachep)) | 1949 | if (OFF_SLAB(cachep)) |
1950 | kmem_cache_free(cachep->freelist_cache, freelist); | 1950 | kmem_cache_free(cachep->freelist_cache, freelist); |
1951 | } | 1951 | } |
1952 | 1952 | ||
1953 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | 1953 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) |
1954 | { | 1954 | { |
1955 | struct page *page, *n; | 1955 | struct page *page, *n; |
1956 | 1956 | ||
1957 | list_for_each_entry_safe(page, n, list, lru) { | 1957 | list_for_each_entry_safe(page, n, list, lru) { |
1958 | list_del(&page->lru); | 1958 | list_del(&page->lru); |
1959 | slab_destroy(cachep, page); | 1959 | slab_destroy(cachep, page); |
1960 | } | 1960 | } |
1961 | } | 1961 | } |
1962 | 1962 | ||
1963 | /** | 1963 | /** |
1964 | * calculate_slab_order - calculate size (page order) of slabs | 1964 | * calculate_slab_order - calculate size (page order) of slabs |
1965 | * @cachep: pointer to the cache that is being created | 1965 | * @cachep: pointer to the cache that is being created |
1966 | * @size: size of objects to be created in this cache. | 1966 | * @size: size of objects to be created in this cache. |
1967 | * @align: required alignment for the objects. | 1967 | * @align: required alignment for the objects. |
1968 | * @flags: slab allocation flags | 1968 | * @flags: slab allocation flags |
1969 | * | 1969 | * |
1970 | * Also calculates the number of objects per slab. | 1970 | * Also calculates the number of objects per slab. |
1971 | * | 1971 | * |
1972 | * This could be made much more intelligent. For now, try to avoid using | 1972 | * This could be made much more intelligent. For now, try to avoid using |
1973 | * high order pages for slabs. When the gfp() functions are more friendly | 1973 | * high order pages for slabs. When the gfp() functions are more friendly |
1974 | * towards high-order requests, this should be changed. | 1974 | * towards high-order requests, this should be changed. |
1975 | */ | 1975 | */ |
1976 | static size_t calculate_slab_order(struct kmem_cache *cachep, | 1976 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1977 | size_t size, size_t align, unsigned long flags) | 1977 | size_t size, size_t align, unsigned long flags) |
1978 | { | 1978 | { |
1979 | unsigned long offslab_limit; | 1979 | unsigned long offslab_limit; |
1980 | size_t left_over = 0; | 1980 | size_t left_over = 0; |
1981 | int gfporder; | 1981 | int gfporder; |
1982 | 1982 | ||
1983 | for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { | 1983 | for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { |
1984 | unsigned int num; | 1984 | unsigned int num; |
1985 | size_t remainder; | 1985 | size_t remainder; |
1986 | 1986 | ||
1987 | cache_estimate(gfporder, size, align, flags, &remainder, &num); | 1987 | cache_estimate(gfporder, size, align, flags, &remainder, &num); |
1988 | if (!num) | 1988 | if (!num) |
1989 | continue; | 1989 | continue; |
1990 | 1990 | ||
1991 | /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ | 1991 | /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ |
1992 | if (num > SLAB_OBJ_MAX_NUM) | 1992 | if (num > SLAB_OBJ_MAX_NUM) |
1993 | break; | 1993 | break; |
1994 | 1994 | ||
1995 | if (flags & CFLGS_OFF_SLAB) { | 1995 | if (flags & CFLGS_OFF_SLAB) { |
1996 | size_t freelist_size_per_obj = sizeof(freelist_idx_t); | 1996 | size_t freelist_size_per_obj = sizeof(freelist_idx_t); |
1997 | /* | 1997 | /* |
1998 | * Max number of objs-per-slab for caches which | 1998 | * Max number of objs-per-slab for caches which |
1999 | * use off-slab slabs. Needed to avoid a possible | 1999 | * use off-slab slabs. Needed to avoid a possible |
2000 | * looping condition in cache_grow(). | 2000 | * looping condition in cache_grow(). |
2001 | */ | 2001 | */ |
2002 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) | 2002 | if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) |
2003 | freelist_size_per_obj += sizeof(char); | 2003 | freelist_size_per_obj += sizeof(char); |
2004 | offslab_limit = size; | 2004 | offslab_limit = size; |
2005 | offslab_limit /= freelist_size_per_obj; | 2005 | offslab_limit /= freelist_size_per_obj; |
2006 | 2006 | ||
2007 | if (num > offslab_limit) | 2007 | if (num > offslab_limit) |
2008 | break; | 2008 | break; |
2009 | } | 2009 | } |
2010 | 2010 | ||
2011 | /* Found something acceptable - save it away */ | 2011 | /* Found something acceptable - save it away */ |
2012 | cachep->num = num; | 2012 | cachep->num = num; |
2013 | cachep->gfporder = gfporder; | 2013 | cachep->gfporder = gfporder; |
2014 | left_over = remainder; | 2014 | left_over = remainder; |
2015 | 2015 | ||
2016 | /* | 2016 | /* |
2017 | * A VFS-reclaimable slab tends to have most allocations | 2017 | * A VFS-reclaimable slab tends to have most allocations |
2018 | * as GFP_NOFS and we really don't want to have to be allocating | 2018 | * as GFP_NOFS and we really don't want to have to be allocating |
2019 | * higher-order pages when we are unable to shrink dcache. | 2019 | * higher-order pages when we are unable to shrink dcache. |
2020 | */ | 2020 | */ |
2021 | if (flags & SLAB_RECLAIM_ACCOUNT) | 2021 | if (flags & SLAB_RECLAIM_ACCOUNT) |
2022 | break; | 2022 | break; |
2023 | 2023 | ||
2024 | /* | 2024 | /* |
2025 | * Large number of objects is good, but very large slabs are | 2025 | * Large number of objects is good, but very large slabs are |
2026 | * currently bad for the gfp()s. | 2026 | * currently bad for the gfp()s. |
2027 | */ | 2027 | */ |
2028 | if (gfporder >= slab_max_order) | 2028 | if (gfporder >= slab_max_order) |
2029 | break; | 2029 | break; |
2030 | 2030 | ||
2031 | /* | 2031 | /* |
2032 | * Acceptable internal fragmentation? | 2032 | * Acceptable internal fragmentation? |
2033 | */ | 2033 | */ |
2034 | if (left_over * 8 <= (PAGE_SIZE << gfporder)) | 2034 | if (left_over * 8 <= (PAGE_SIZE << gfporder)) |
2035 | break; | 2035 | break; |
2036 | } | 2036 | } |
2037 | return left_over; | 2037 | return left_over; |
2038 | } | 2038 | } |
2039 | 2039 | ||
2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2041 | { | 2041 | { |
2042 | if (slab_state >= FULL) | 2042 | if (slab_state >= FULL) |
2043 | return enable_cpucache(cachep, gfp); | 2043 | return enable_cpucache(cachep, gfp); |
2044 | 2044 | ||
2045 | if (slab_state == DOWN) { | 2045 | if (slab_state == DOWN) { |
2046 | /* | 2046 | /* |
2047 | * Note: Creation of first cache (kmem_cache). | 2047 | * Note: Creation of first cache (kmem_cache). |
2048 | * The setup_node is taken care | 2048 | * The setup_node is taken care |
2049 | * of by the caller of __kmem_cache_create | 2049 | * of by the caller of __kmem_cache_create |
2050 | */ | 2050 | */ |
2051 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | 2051 | cachep->array[smp_processor_id()] = &initarray_generic.cache; |
2052 | slab_state = PARTIAL; | 2052 | slab_state = PARTIAL; |
2053 | } else if (slab_state == PARTIAL) { | 2053 | } else if (slab_state == PARTIAL) { |
2054 | /* | 2054 | /* |
2055 | * Note: the second kmem_cache_create must create the cache | 2055 | * Note: the second kmem_cache_create must create the cache |
2056 | * that's used by kmalloc(24), otherwise the creation of | 2056 | * that's used by kmalloc(24), otherwise the creation of |
2057 | * further caches will BUG(). | 2057 | * further caches will BUG(). |
2058 | */ | 2058 | */ |
2059 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | 2059 | cachep->array[smp_processor_id()] = &initarray_generic.cache; |
2060 | 2060 | ||
2061 | /* | 2061 | /* |
2062 | * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is | 2062 | * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is |
2063 | * the second cache, then we need to set up all its kmem_cache_node structures, | 2063 | * the second cache, then we need to set up all its kmem_cache_node structures, |
2064 | * otherwise the creation of further caches will BUG(). | 2064 | * otherwise the creation of further caches will BUG(). |
2065 | */ | 2065 | */ |
2066 | set_up_node(cachep, SIZE_AC); | 2066 | set_up_node(cachep, SIZE_AC); |
2067 | if (INDEX_AC == INDEX_NODE) | 2067 | if (INDEX_AC == INDEX_NODE) |
2068 | slab_state = PARTIAL_NODE; | 2068 | slab_state = PARTIAL_NODE; |
2069 | else | 2069 | else |
2070 | slab_state = PARTIAL_ARRAYCACHE; | 2070 | slab_state = PARTIAL_ARRAYCACHE; |
2071 | } else { | 2071 | } else { |
2072 | /* Remaining boot caches */ | 2072 | /* Remaining boot caches */ |
2073 | cachep->array[smp_processor_id()] = | 2073 | cachep->array[smp_processor_id()] = |
2074 | kmalloc(sizeof(struct arraycache_init), gfp); | 2074 | kmalloc(sizeof(struct arraycache_init), gfp); |
2075 | 2075 | ||
2076 | if (slab_state == PARTIAL_ARRAYCACHE) { | 2076 | if (slab_state == PARTIAL_ARRAYCACHE) { |
2077 | set_up_node(cachep, SIZE_NODE); | 2077 | set_up_node(cachep, SIZE_NODE); |
2078 | slab_state = PARTIAL_NODE; | 2078 | slab_state = PARTIAL_NODE; |
2079 | } else { | 2079 | } else { |
2080 | int node; | 2080 | int node; |
2081 | for_each_online_node(node) { | 2081 | for_each_online_node(node) { |
2082 | cachep->node[node] = | 2082 | cachep->node[node] = |
2083 | kmalloc_node(sizeof(struct kmem_cache_node), | 2083 | kmalloc_node(sizeof(struct kmem_cache_node), |
2084 | gfp, node); | 2084 | gfp, node); |
2085 | BUG_ON(!cachep->node[node]); | 2085 | BUG_ON(!cachep->node[node]); |
2086 | kmem_cache_node_init(cachep->node[node]); | 2086 | kmem_cache_node_init(cachep->node[node]); |
2087 | } | 2087 | } |
2088 | } | 2088 | } |
2089 | } | 2089 | } |
2090 | cachep->node[numa_mem_id()]->next_reap = | 2090 | cachep->node[numa_mem_id()]->next_reap = |
2091 | jiffies + REAPTIMEOUT_NODE + | 2091 | jiffies + REAPTIMEOUT_NODE + |
2092 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 2092 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
2093 | 2093 | ||
2094 | cpu_cache_get(cachep)->avail = 0; | 2094 | cpu_cache_get(cachep)->avail = 0; |
2095 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | 2095 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; |
2096 | cpu_cache_get(cachep)->batchcount = 1; | 2096 | cpu_cache_get(cachep)->batchcount = 1; |
2097 | cpu_cache_get(cachep)->touched = 0; | 2097 | cpu_cache_get(cachep)->touched = 0; |
2098 | cachep->batchcount = 1; | 2098 | cachep->batchcount = 1; |
2099 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 2099 | cachep->limit = BOOT_CPUCACHE_ENTRIES; |
2100 | return 0; | 2100 | return 0; |
2101 | } | 2101 | } |
2102 | 2102 | ||
2103 | /** | 2103 | /** |
2104 | * __kmem_cache_create - Create a cache. | 2104 | * __kmem_cache_create - Create a cache. |
2105 | * @cachep: cache management descriptor | 2105 | * @cachep: cache management descriptor |
2106 | * @flags: SLAB flags | 2106 | * @flags: SLAB flags |
2107 | * | 2107 | * |
2108 | * Returns a ptr to the cache on success, NULL on failure. | 2108 | * Returns a ptr to the cache on success, NULL on failure. |
2109 | * Cannot be called within an interrupt, but can be interrupted. | 2109 | * Cannot be called within an interrupt, but can be interrupted. |
2110 | * The @ctor is run when new pages are allocated by the cache. | 2110 | * The @ctor is run when new pages are allocated by the cache. |
2111 | * | 2111 | * |
2112 | * The flags are | 2112 | * The flags are |
2113 | * | 2113 | * |
2114 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 2114 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
2115 | * to catch references to uninitialised memory. | 2115 | * to catch references to uninitialised memory. |
2116 | * | 2116 | * |
2117 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | 2117 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check |
2118 | * for buffer overruns. | 2118 | * for buffer overruns. |
2119 | * | 2119 | * |
2120 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | 2120 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware |
2121 | * cacheline. This can be beneficial if you're counting cycles as closely | 2121 | * cacheline. This can be beneficial if you're counting cycles as closely |
2122 | * as davem. | 2122 | * as davem. |
2123 | */ | 2123 | */ |
2124 | int | 2124 | int |
2125 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | 2125 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) |
2126 | { | 2126 | { |
2127 | size_t left_over, freelist_size, ralign; | 2127 | size_t left_over, freelist_size; |
2128 | size_t ralign = BYTES_PER_WORD; | ||
2128 | gfp_t gfp; | 2129 | gfp_t gfp; |
2129 | int err; | 2130 | int err; |
2130 | size_t size = cachep->size; | 2131 | size_t size = cachep->size; |
2131 | 2132 | ||
2132 | #if DEBUG | 2133 | #if DEBUG |
2133 | #if FORCED_DEBUG | 2134 | #if FORCED_DEBUG |
2134 | /* | 2135 | /* |
2135 | * Enable redzoning and last user accounting, except for caches with | 2136 | * Enable redzoning and last user accounting, except for caches with |
2136 | * large objects, if the increased size would increase the object size | 2137 | * large objects, if the increased size would increase the object size |
2137 | * above the next power of two: caches with object sizes just above a | 2138 | * above the next power of two: caches with object sizes just above a |
2138 | * power of two have a significant amount of internal fragmentation. | 2139 | * power of two have a significant amount of internal fragmentation. |
2139 | */ | 2140 | */ |
2140 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + | 2141 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + |
2141 | 2 * sizeof(unsigned long long))) | 2142 | 2 * sizeof(unsigned long long))) |
2142 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 2143 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
2143 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 2144 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
2144 | flags |= SLAB_POISON; | 2145 | flags |= SLAB_POISON; |
2145 | #endif | 2146 | #endif |
2146 | if (flags & SLAB_DESTROY_BY_RCU) | 2147 | if (flags & SLAB_DESTROY_BY_RCU) |
2147 | BUG_ON(flags & SLAB_POISON); | 2148 | BUG_ON(flags & SLAB_POISON); |
2148 | #endif | 2149 | #endif |
2149 | 2150 | ||
2150 | /* | 2151 | /* |
2151 | * Check that size is in terms of words. This is needed to avoid | 2152 | * Check that size is in terms of words. This is needed to avoid |
2152 | * unaligned accesses for some archs when redzoning is used, and makes | 2153 | * unaligned accesses for some archs when redzoning is used, and makes |
2153 | * sure any on-slab bufctl's are also correctly aligned. | 2154 | * sure any on-slab bufctl's are also correctly aligned. |
2154 | */ | 2155 | */ |
2155 | if (size & (BYTES_PER_WORD - 1)) { | 2156 | if (size & (BYTES_PER_WORD - 1)) { |
2156 | size += (BYTES_PER_WORD - 1); | 2157 | size += (BYTES_PER_WORD - 1); |
2157 | size &= ~(BYTES_PER_WORD - 1); | 2158 | size &= ~(BYTES_PER_WORD - 1); |
2158 | } | 2159 | } |
2159 | |||
2160 | /* | ||
2161 | * Redzoning and user store require word alignment or possibly larger. | ||
2162 | * Note this will be overridden by architecture or caller mandated | ||
2163 | * alignment if either is greater than BYTES_PER_WORD. | ||
2164 | */ | ||
2165 | if (flags & SLAB_STORE_USER) | ||
2166 | ralign = BYTES_PER_WORD; | ||
2167 | 2160 | ||
2168 | if (flags & SLAB_RED_ZONE) { | 2161 | if (flags & SLAB_RED_ZONE) { |
2169 | ralign = REDZONE_ALIGN; | 2162 | ralign = REDZONE_ALIGN; |
2170 | /* If redzoning, ensure that the second redzone is suitably | 2163 | /* If redzoning, ensure that the second redzone is suitably |
2171 | * aligned, by adjusting the object size accordingly. */ | 2164 | * aligned, by adjusting the object size accordingly. */ |
2172 | size += REDZONE_ALIGN - 1; | 2165 | size += REDZONE_ALIGN - 1; |
2173 | size &= ~(REDZONE_ALIGN - 1); | 2166 | size &= ~(REDZONE_ALIGN - 1); |
2174 | } | 2167 | } |
2175 | 2168 | ||
2176 | /* 3) caller mandated alignment */ | 2169 | /* 3) caller mandated alignment */ |
2177 | if (ralign < cachep->align) { | 2170 | if (ralign < cachep->align) { |
2178 | ralign = cachep->align; | 2171 | ralign = cachep->align; |
2179 | } | 2172 | } |
2180 | /* disable debug if necessary */ | 2173 | /* disable debug if necessary */ |
2181 | if (ralign > __alignof__(unsigned long long)) | 2174 | if (ralign > __alignof__(unsigned long long)) |
2182 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2175 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2183 | /* | 2176 | /* |
2184 | * 4) Store it. | 2177 | * 4) Store it. |
2185 | */ | 2178 | */ |
2186 | cachep->align = ralign; | 2179 | cachep->align = ralign; |
2187 | 2180 | ||
2188 | if (slab_is_available()) | 2181 | if (slab_is_available()) |
2189 | gfp = GFP_KERNEL; | 2182 | gfp = GFP_KERNEL; |
2190 | else | 2183 | else |
2191 | gfp = GFP_NOWAIT; | 2184 | gfp = GFP_NOWAIT; |
2192 | 2185 | ||
2193 | setup_node_pointer(cachep); | 2186 | setup_node_pointer(cachep); |
2194 | #if DEBUG | 2187 | #if DEBUG |
2195 | 2188 | ||
2196 | /* | 2189 | /* |
2197 | * Both debugging options require word-alignment which is calculated | 2190 | * Both debugging options require word-alignment which is calculated |
2198 | * into align above. | 2191 | * into align above. |
2199 | */ | 2192 | */ |
2200 | if (flags & SLAB_RED_ZONE) { | 2193 | if (flags & SLAB_RED_ZONE) { |
2201 | /* add space for red zone words */ | 2194 | /* add space for red zone words */ |
2202 | cachep->obj_offset += sizeof(unsigned long long); | 2195 | cachep->obj_offset += sizeof(unsigned long long); |
2203 | size += 2 * sizeof(unsigned long long); | 2196 | size += 2 * sizeof(unsigned long long); |
2204 | } | 2197 | } |
2205 | if (flags & SLAB_STORE_USER) { | 2198 | if (flags & SLAB_STORE_USER) { |
2206 | /* user store requires one word storage behind the end of | 2199 | /* user store requires one word storage behind the end of |
2207 | * the real object. But if the second red zone needs to be | 2200 | * the real object. But if the second red zone needs to be |
2208 | * aligned to 64 bits, we must allow that much space. | 2201 | * aligned to 64 bits, we must allow that much space. |
2209 | */ | 2202 | */ |
2210 | if (flags & SLAB_RED_ZONE) | 2203 | if (flags & SLAB_RED_ZONE) |
2211 | size += REDZONE_ALIGN; | 2204 | size += REDZONE_ALIGN; |
2212 | else | 2205 | else |
2213 | size += BYTES_PER_WORD; | 2206 | size += BYTES_PER_WORD; |
2214 | } | 2207 | } |
2215 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2208 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2216 | if (size >= kmalloc_size(INDEX_NODE + 1) | 2209 | if (size >= kmalloc_size(INDEX_NODE + 1) |
2217 | && cachep->object_size > cache_line_size() | 2210 | && cachep->object_size > cache_line_size() |
2218 | && ALIGN(size, cachep->align) < PAGE_SIZE) { | 2211 | && ALIGN(size, cachep->align) < PAGE_SIZE) { |
2219 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); | 2212 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); |
2220 | size = PAGE_SIZE; | 2213 | size = PAGE_SIZE; |
2221 | } | 2214 | } |
2222 | #endif | 2215 | #endif |
2223 | #endif | 2216 | #endif |
2224 | 2217 | ||
2225 | /* | 2218 | /* |
2226 | * Determine if the slab management is 'on' or 'off' slab. | 2219 | * Determine if the slab management is 'on' or 'off' slab. |
2227 | * (bootstrapping cannot cope with offslab caches so don't do | 2220 | * (bootstrapping cannot cope with offslab caches so don't do |
2228 | * it too early on. Always use on-slab management when | 2221 | * it too early on. Always use on-slab management when |
2229 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) | 2222 | * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) |
2230 | */ | 2223 | */ |
2231 | if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && | 2224 | if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && |
2232 | !(flags & SLAB_NOLEAKTRACE)) | 2225 | !(flags & SLAB_NOLEAKTRACE)) |
2233 | /* | 2226 | /* |
2234 | * Size is large, assume best to place the slab management obj | 2227 | * Size is large, assume best to place the slab management obj |
2235 | * off-slab (should allow better packing of objs). | 2228 | * off-slab (should allow better packing of objs). |
2236 | */ | 2229 | */ |
2237 | flags |= CFLGS_OFF_SLAB; | 2230 | flags |= CFLGS_OFF_SLAB; |
2238 | 2231 | ||
2239 | size = ALIGN(size, cachep->align); | 2232 | size = ALIGN(size, cachep->align); |
2240 | /* | 2233 | /* |
2241 | * We should restrict the number of objects in a slab to implement | 2234 | * We should restrict the number of objects in a slab to implement |
2242 | * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. | 2235 | * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. |
2243 | */ | 2236 | */ |
2244 | if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) | 2237 | if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) |
2245 | size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); | 2238 | size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); |
2246 | 2239 | ||
2247 | left_over = calculate_slab_order(cachep, size, cachep->align, flags); | 2240 | left_over = calculate_slab_order(cachep, size, cachep->align, flags); |
2248 | 2241 | ||
2249 | if (!cachep->num) | 2242 | if (!cachep->num) |
2250 | return -E2BIG; | 2243 | return -E2BIG; |
2251 | 2244 | ||
2252 | freelist_size = calculate_freelist_size(cachep->num, cachep->align); | 2245 | freelist_size = calculate_freelist_size(cachep->num, cachep->align); |
2253 | 2246 | ||
2254 | /* | 2247 | /* |
2255 | * If the slab has been placed off-slab, and we have enough space then | 2248 | * If the slab has been placed off-slab, and we have enough space then |
2256 | * move it on-slab. This is at the expense of any extra colouring. | 2249 | * move it on-slab. This is at the expense of any extra colouring. |
2257 | */ | 2250 | */ |
2258 | if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { | 2251 | if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { |
2259 | flags &= ~CFLGS_OFF_SLAB; | 2252 | flags &= ~CFLGS_OFF_SLAB; |
2260 | left_over -= freelist_size; | 2253 | left_over -= freelist_size; |
2261 | } | 2254 | } |
2262 | 2255 | ||
2263 | if (flags & CFLGS_OFF_SLAB) { | 2256 | if (flags & CFLGS_OFF_SLAB) { |
2264 | /* really off slab. No need for manual alignment */ | 2257 | /* really off slab. No need for manual alignment */ |
2265 | freelist_size = calculate_freelist_size(cachep->num, 0); | 2258 | freelist_size = calculate_freelist_size(cachep->num, 0); |
2266 | 2259 | ||
2267 | #ifdef CONFIG_PAGE_POISONING | 2260 | #ifdef CONFIG_PAGE_POISONING |
2268 | /* If we're going to use the generic kernel_map_pages() | 2261 | /* If we're going to use the generic kernel_map_pages() |
2269 | * poisoning, then it's going to smash the contents of | 2262 | * poisoning, then it's going to smash the contents of |
2270 | * the redzone and userword anyhow, so switch them off. | 2263 | * the redzone and userword anyhow, so switch them off. |
2271 | */ | 2264 | */ |
2272 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) | 2265 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) |
2273 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2266 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2274 | #endif | 2267 | #endif |
2275 | } | 2268 | } |
2276 | 2269 | ||
2277 | cachep->colour_off = cache_line_size(); | 2270 | cachep->colour_off = cache_line_size(); |
2278 | /* Offset must be a multiple of the alignment. */ | 2271 | /* Offset must be a multiple of the alignment. */ |
2279 | if (cachep->colour_off < cachep->align) | 2272 | if (cachep->colour_off < cachep->align) |
2280 | cachep->colour_off = cachep->align; | 2273 | cachep->colour_off = cachep->align; |
2281 | cachep->colour = left_over / cachep->colour_off; | 2274 | cachep->colour = left_over / cachep->colour_off; |
2282 | cachep->freelist_size = freelist_size; | 2275 | cachep->freelist_size = freelist_size; |
2283 | cachep->flags = flags; | 2276 | cachep->flags = flags; |
2284 | cachep->allocflags = __GFP_COMP; | 2277 | cachep->allocflags = __GFP_COMP; |
2285 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) | 2278 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) |
2286 | cachep->allocflags |= GFP_DMA; | 2279 | cachep->allocflags |= GFP_DMA; |
2287 | cachep->size = size; | 2280 | cachep->size = size; |
2288 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2281 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2289 | 2282 | ||
2290 | if (flags & CFLGS_OFF_SLAB) { | 2283 | if (flags & CFLGS_OFF_SLAB) { |
2291 | cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); | 2284 | cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); |
2292 | /* | 2285 | /* |
2293 | * This is a possibility for one of the kmalloc_{dma,}_caches. | 2286 | * This is a possibility for one of the kmalloc_{dma,}_caches. |
2294 | * But since we go off slab only for object size greater than | 2287 | * But since we go off slab only for object size greater than |
2295 | * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created | 2288 | * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created |
2296 | * in ascending order, this should not happen at all. | 2289 | * in ascending order, this should not happen at all. |
2297 | * But leave a BUG_ON for some lucky dude. | 2290 | * But leave a BUG_ON for some lucky dude. |
2298 | */ | 2291 | */ |
2299 | BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); | 2292 | BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); |
2300 | } | 2293 | } |
2301 | 2294 | ||
2302 | err = setup_cpu_cache(cachep, gfp); | 2295 | err = setup_cpu_cache(cachep, gfp); |
2303 | if (err) { | 2296 | if (err) { |
2304 | __kmem_cache_shutdown(cachep); | 2297 | __kmem_cache_shutdown(cachep); |
2305 | return err; | 2298 | return err; |
2306 | } | 2299 | } |
2307 | 2300 | ||
2308 | return 0; | 2301 | return 0; |
2309 | } | 2302 | } |
2310 | 2303 | ||
2311 | #if DEBUG | 2304 | #if DEBUG |
2312 | static void check_irq_off(void) | 2305 | static void check_irq_off(void) |
2313 | { | 2306 | { |
2314 | BUG_ON(!irqs_disabled()); | 2307 | BUG_ON(!irqs_disabled()); |
2315 | } | 2308 | } |
2316 | 2309 | ||
2317 | static void check_irq_on(void) | 2310 | static void check_irq_on(void) |
2318 | { | 2311 | { |
2319 | BUG_ON(irqs_disabled()); | 2312 | BUG_ON(irqs_disabled()); |
2320 | } | 2313 | } |
2321 | 2314 | ||
2322 | static void check_spinlock_acquired(struct kmem_cache *cachep) | 2315 | static void check_spinlock_acquired(struct kmem_cache *cachep) |
2323 | { | 2316 | { |
2324 | #ifdef CONFIG_SMP | 2317 | #ifdef CONFIG_SMP |
2325 | check_irq_off(); | 2318 | check_irq_off(); |
2326 | assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); | 2319 | assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); |
2327 | #endif | 2320 | #endif |
2328 | } | 2321 | } |
2329 | 2322 | ||
2330 | static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) | 2323 | static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) |
2331 | { | 2324 | { |
2332 | #ifdef CONFIG_SMP | 2325 | #ifdef CONFIG_SMP |
2333 | check_irq_off(); | 2326 | check_irq_off(); |
2334 | assert_spin_locked(&get_node(cachep, node)->list_lock); | 2327 | assert_spin_locked(&get_node(cachep, node)->list_lock); |
2335 | #endif | 2328 | #endif |
2336 | } | 2329 | } |
2337 | 2330 | ||
2338 | #else | 2331 | #else |
2339 | #define check_irq_off() do { } while(0) | 2332 | #define check_irq_off() do { } while(0) |
2340 | #define check_irq_on() do { } while(0) | 2333 | #define check_irq_on() do { } while(0) |
2341 | #define check_spinlock_acquired(x) do { } while(0) | 2334 | #define check_spinlock_acquired(x) do { } while(0) |
2342 | #define check_spinlock_acquired_node(x, y) do { } while(0) | 2335 | #define check_spinlock_acquired_node(x, y) do { } while(0) |
2343 | #endif | 2336 | #endif |
2344 | 2337 | ||
2345 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, | 2338 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, |
2346 | struct array_cache *ac, | 2339 | struct array_cache *ac, |
2347 | int force, int node); | 2340 | int force, int node); |
2348 | 2341 | ||
2349 | static void do_drain(void *arg) | 2342 | static void do_drain(void *arg) |
2350 | { | 2343 | { |
2351 | struct kmem_cache *cachep = arg; | 2344 | struct kmem_cache *cachep = arg; |
2352 | struct array_cache *ac; | 2345 | struct array_cache *ac; |
2353 | int node = numa_mem_id(); | 2346 | int node = numa_mem_id(); |
2354 | struct kmem_cache_node *n; | 2347 | struct kmem_cache_node *n; |
2355 | LIST_HEAD(list); | 2348 | LIST_HEAD(list); |
2356 | 2349 | ||
2357 | check_irq_off(); | 2350 | check_irq_off(); |
2358 | ac = cpu_cache_get(cachep); | 2351 | ac = cpu_cache_get(cachep); |
2359 | n = get_node(cachep, node); | 2352 | n = get_node(cachep, node); |
2360 | spin_lock(&n->list_lock); | 2353 | spin_lock(&n->list_lock); |
2361 | free_block(cachep, ac->entry, ac->avail, node, &list); | 2354 | free_block(cachep, ac->entry, ac->avail, node, &list); |
2362 | spin_unlock(&n->list_lock); | 2355 | spin_unlock(&n->list_lock); |
2363 | slabs_destroy(cachep, &list); | 2356 | slabs_destroy(cachep, &list); |
2364 | ac->avail = 0; | 2357 | ac->avail = 0; |
2365 | } | 2358 | } |
2366 | 2359 | ||
2367 | static void drain_cpu_caches(struct kmem_cache *cachep) | 2360 | static void drain_cpu_caches(struct kmem_cache *cachep) |
2368 | { | 2361 | { |
2369 | struct kmem_cache_node *n; | 2362 | struct kmem_cache_node *n; |
2370 | int node; | 2363 | int node; |
2371 | 2364 | ||
2372 | on_each_cpu(do_drain, cachep, 1); | 2365 | on_each_cpu(do_drain, cachep, 1); |
2373 | check_irq_on(); | 2366 | check_irq_on(); |
2374 | for_each_kmem_cache_node(cachep, node, n) | 2367 | for_each_kmem_cache_node(cachep, node, n) |
2375 | if (n->alien) | 2368 | if (n->alien) |
2376 | drain_alien_cache(cachep, n->alien); | 2369 | drain_alien_cache(cachep, n->alien); |
2377 | 2370 | ||
2378 | for_each_kmem_cache_node(cachep, node, n) | 2371 | for_each_kmem_cache_node(cachep, node, n) |
2379 | drain_array(cachep, n, n->shared, 1, node); | 2372 | drain_array(cachep, n, n->shared, 1, node); |
2380 | } | 2373 | } |
2381 | 2374 | ||
2382 | /* | 2375 | /* |
2383 | * Remove slabs from the list of free slabs. | 2376 | * Remove slabs from the list of free slabs. |
2384 | * Specify the number of slabs to drain in tofree. | 2377 | * Specify the number of slabs to drain in tofree. |
2385 | * | 2378 | * |
2386 | * Returns the actual number of slabs released. | 2379 | * Returns the actual number of slabs released. |
2387 | */ | 2380 | */ |
2388 | static int drain_freelist(struct kmem_cache *cache, | 2381 | static int drain_freelist(struct kmem_cache *cache, |
2389 | struct kmem_cache_node *n, int tofree) | 2382 | struct kmem_cache_node *n, int tofree) |
2390 | { | 2383 | { |
2391 | struct list_head *p; | 2384 | struct list_head *p; |
2392 | int nr_freed; | 2385 | int nr_freed; |
2393 | struct page *page; | 2386 | struct page *page; |
2394 | 2387 | ||
2395 | nr_freed = 0; | 2388 | nr_freed = 0; |
2396 | while (nr_freed < tofree && !list_empty(&n->slabs_free)) { | 2389 | while (nr_freed < tofree && !list_empty(&n->slabs_free)) { |
2397 | 2390 | ||
2398 | spin_lock_irq(&n->list_lock); | 2391 | spin_lock_irq(&n->list_lock); |
2399 | p = n->slabs_free.prev; | 2392 | p = n->slabs_free.prev; |
2400 | if (p == &n->slabs_free) { | 2393 | if (p == &n->slabs_free) { |
2401 | spin_unlock_irq(&n->list_lock); | 2394 | spin_unlock_irq(&n->list_lock); |
2402 | goto out; | 2395 | goto out; |
2403 | } | 2396 | } |
2404 | 2397 | ||
2405 | page = list_entry(p, struct page, lru); | 2398 | page = list_entry(p, struct page, lru); |
2406 | #if DEBUG | 2399 | #if DEBUG |
2407 | BUG_ON(page->active); | 2400 | BUG_ON(page->active); |
2408 | #endif | 2401 | #endif |
2409 | list_del(&page->lru); | 2402 | list_del(&page->lru); |
2410 | /* | 2403 | /* |
2411 | * Safe to drop the lock. The slab is no longer linked | 2404 | * Safe to drop the lock. The slab is no longer linked |
2412 | * to the cache. | 2405 | * to the cache. |
2413 | */ | 2406 | */ |
2414 | n->free_objects -= cache->num; | 2407 | n->free_objects -= cache->num; |
2415 | spin_unlock_irq(&n->list_lock); | 2408 | spin_unlock_irq(&n->list_lock); |
2416 | slab_destroy(cache, page); | 2409 | slab_destroy(cache, page); |
2417 | nr_freed++; | 2410 | nr_freed++; |
2418 | } | 2411 | } |
2419 | out: | 2412 | out: |
2420 | return nr_freed; | 2413 | return nr_freed; |
2421 | } | 2414 | } |
2422 | 2415 | ||
2423 | int __kmem_cache_shrink(struct kmem_cache *cachep) | 2416 | int __kmem_cache_shrink(struct kmem_cache *cachep) |
2424 | { | 2417 | { |
2425 | int ret = 0; | 2418 | int ret = 0; |
2426 | int node; | 2419 | int node; |
2427 | struct kmem_cache_node *n; | 2420 | struct kmem_cache_node *n; |
2428 | 2421 | ||
2429 | drain_cpu_caches(cachep); | 2422 | drain_cpu_caches(cachep); |
2430 | 2423 | ||
2431 | check_irq_on(); | 2424 | check_irq_on(); |
2432 | for_each_kmem_cache_node(cachep, node, n) { | 2425 | for_each_kmem_cache_node(cachep, node, n) { |
2433 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); | 2426 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
2434 | 2427 | ||
2435 | ret += !list_empty(&n->slabs_full) || | 2428 | ret += !list_empty(&n->slabs_full) || |
2436 | !list_empty(&n->slabs_partial); | 2429 | !list_empty(&n->slabs_partial); |
2437 | } | 2430 | } |
2438 | return (ret ? 1 : 0); | 2431 | return (ret ? 1 : 0); |
2439 | } | 2432 | } |
2440 | 2433 | ||
2441 | int __kmem_cache_shutdown(struct kmem_cache *cachep) | 2434 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2442 | { | 2435 | { |
2443 | int i; | 2436 | int i; |
2444 | struct kmem_cache_node *n; | 2437 | struct kmem_cache_node *n; |
2445 | int rc = __kmem_cache_shrink(cachep); | 2438 | int rc = __kmem_cache_shrink(cachep); |
2446 | 2439 | ||
2447 | if (rc) | 2440 | if (rc) |
2448 | return rc; | 2441 | return rc; |
2449 | 2442 | ||
2450 | for_each_online_cpu(i) | 2443 | for_each_online_cpu(i) |
2451 | kfree(cachep->array[i]); | 2444 | kfree(cachep->array[i]); |
2452 | 2445 | ||
2453 | /* NUMA: free the node structures */ | 2446 | /* NUMA: free the node structures */ |
2454 | for_each_kmem_cache_node(cachep, i, n) { | 2447 | for_each_kmem_cache_node(cachep, i, n) { |
2455 | kfree(n->shared); | 2448 | kfree(n->shared); |
2456 | free_alien_cache(n->alien); | 2449 | free_alien_cache(n->alien); |
2457 | kfree(n); | 2450 | kfree(n); |
2458 | cachep->node[i] = NULL; | 2451 | cachep->node[i] = NULL; |
2459 | } | 2452 | } |
2460 | return 0; | 2453 | return 0; |
2461 | } | 2454 | } |
2462 | 2455 | ||
2463 | /* | 2456 | /* |
2464 | * Get the memory for a slab management obj. | 2457 | * Get the memory for a slab management obj. |
2465 | * | 2458 | * |
2466 | * For a slab cache when the slab descriptor is off-slab, the | 2459 | * For a slab cache when the slab descriptor is off-slab, the |
2467 | * slab descriptor can't come from the same cache which is being created, | 2460 | * slab descriptor can't come from the same cache which is being created, |
2468 | * because if that is the case, it means we defer the creation of | 2461 | * because if that is the case, it means we defer the creation of |
2469 | * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. | 2462 | * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. |
2470 | * And we eventually call down to __kmem_cache_create(), which | 2463 | * And we eventually call down to __kmem_cache_create(), which |
2471 | * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. | 2464 | * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. |
2472 | * This is a "chicken-and-egg" problem. | 2465 | * This is a "chicken-and-egg" problem. |
2473 | * | 2466 | * |
2474 | * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, | 2467 | * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, |
2475 | * which are all initialized during kmem_cache_init(). | 2468 | * which are all initialized during kmem_cache_init(). |
2476 | */ | 2469 | */ |
2477 | static void *alloc_slabmgmt(struct kmem_cache *cachep, | 2470 | static void *alloc_slabmgmt(struct kmem_cache *cachep, |
2478 | struct page *page, int colour_off, | 2471 | struct page *page, int colour_off, |
2479 | gfp_t local_flags, int nodeid) | 2472 | gfp_t local_flags, int nodeid) |
2480 | { | 2473 | { |
2481 | void *freelist; | 2474 | void *freelist; |
2482 | void *addr = page_address(page); | 2475 | void *addr = page_address(page); |
2483 | 2476 | ||
2484 | if (OFF_SLAB(cachep)) { | 2477 | if (OFF_SLAB(cachep)) { |
2485 | /* Slab management obj is off-slab. */ | 2478 | /* Slab management obj is off-slab. */ |
2486 | freelist = kmem_cache_alloc_node(cachep->freelist_cache, | 2479 | freelist = kmem_cache_alloc_node(cachep->freelist_cache, |
2487 | local_flags, nodeid); | 2480 | local_flags, nodeid); |
2488 | if (!freelist) | 2481 | if (!freelist) |
2489 | return NULL; | 2482 | return NULL; |
2490 | } else { | 2483 | } else { |
2491 | freelist = addr + colour_off; | 2484 | freelist = addr + colour_off; |
2492 | colour_off += cachep->freelist_size; | 2485 | colour_off += cachep->freelist_size; |
2493 | } | 2486 | } |
2494 | page->active = 0; | 2487 | page->active = 0; |
2495 | page->s_mem = addr + colour_off; | 2488 | page->s_mem = addr + colour_off; |
2496 | return freelist; | 2489 | return freelist; |
2497 | } | 2490 | } |
2498 | 2491 | ||
2499 | static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) | 2492 | static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) |
2500 | { | 2493 | { |
2501 | return ((freelist_idx_t *)page->freelist)[idx]; | 2494 | return ((freelist_idx_t *)page->freelist)[idx]; |
2502 | } | 2495 | } |
2503 | 2496 | ||
2504 | static inline void set_free_obj(struct page *page, | 2497 | static inline void set_free_obj(struct page *page, |
2505 | unsigned int idx, freelist_idx_t val) | 2498 | unsigned int idx, freelist_idx_t val) |
2506 | { | 2499 | { |
2507 | ((freelist_idx_t *)(page->freelist))[idx] = val; | 2500 | ((freelist_idx_t *)(page->freelist))[idx] = val; |
2508 | } | 2501 | } |
2509 | 2502 | ||
2510 | static void cache_init_objs(struct kmem_cache *cachep, | 2503 | static void cache_init_objs(struct kmem_cache *cachep, |
2511 | struct page *page) | 2504 | struct page *page) |
2512 | { | 2505 | { |
2513 | int i; | 2506 | int i; |
2514 | 2507 | ||
2515 | for (i = 0; i < cachep->num; i++) { | 2508 | for (i = 0; i < cachep->num; i++) { |
2516 | void *objp = index_to_obj(cachep, page, i); | 2509 | void *objp = index_to_obj(cachep, page, i); |
2517 | #if DEBUG | 2510 | #if DEBUG |
2518 | /* need to poison the objs? */ | 2511 | /* need to poison the objs? */ |
2519 | if (cachep->flags & SLAB_POISON) | 2512 | if (cachep->flags & SLAB_POISON) |
2520 | poison_obj(cachep, objp, POISON_FREE); | 2513 | poison_obj(cachep, objp, POISON_FREE); |
2521 | if (cachep->flags & SLAB_STORE_USER) | 2514 | if (cachep->flags & SLAB_STORE_USER) |
2522 | *dbg_userword(cachep, objp) = NULL; | 2515 | *dbg_userword(cachep, objp) = NULL; |
2523 | 2516 | ||
2524 | if (cachep->flags & SLAB_RED_ZONE) { | 2517 | if (cachep->flags & SLAB_RED_ZONE) { |
2525 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2518 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2526 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2519 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2527 | } | 2520 | } |
2528 | /* | 2521 | /* |
2529 | * Constructors are not allowed to allocate memory from the same | 2522 | * Constructors are not allowed to allocate memory from the same |
2530 | * cache which they are a constructor for. Otherwise, deadlock. | 2523 | * cache which they are a constructor for. Otherwise, deadlock. |
2531 | * They must also be threaded. | 2524 | * They must also be threaded. |
2532 | */ | 2525 | */ |
2533 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2526 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2534 | cachep->ctor(objp + obj_offset(cachep)); | 2527 | cachep->ctor(objp + obj_offset(cachep)); |
2535 | 2528 | ||
2536 | if (cachep->flags & SLAB_RED_ZONE) { | 2529 | if (cachep->flags & SLAB_RED_ZONE) { |
2537 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2530 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
2538 | slab_error(cachep, "constructor overwrote the" | 2531 | slab_error(cachep, "constructor overwrote the" |
2539 | " end of an object"); | 2532 | " end of an object"); |
2540 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 2533 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
2541 | slab_error(cachep, "constructor overwrote the" | 2534 | slab_error(cachep, "constructor overwrote the" |
2542 | " start of an object"); | 2535 | " start of an object"); |
2543 | } | 2536 | } |
2544 | if ((cachep->size % PAGE_SIZE) == 0 && | 2537 | if ((cachep->size % PAGE_SIZE) == 0 && |
2545 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2538 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2546 | kernel_map_pages(virt_to_page(objp), | 2539 | kernel_map_pages(virt_to_page(objp), |
2547 | cachep->size / PAGE_SIZE, 0); | 2540 | cachep->size / PAGE_SIZE, 0); |
2548 | #else | 2541 | #else |
2549 | if (cachep->ctor) | 2542 | if (cachep->ctor) |
2550 | cachep->ctor(objp); | 2543 | cachep->ctor(objp); |
2551 | #endif | 2544 | #endif |
2552 | set_obj_status(page, i, OBJECT_FREE); | 2545 | set_obj_status(page, i, OBJECT_FREE); |
2553 | set_free_obj(page, i, i); | 2546 | set_free_obj(page, i, i); |
2554 | } | 2547 | } |
2555 | } | 2548 | } |
2556 | 2549 | ||
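The constructor handling just above is easiest to appreciate from the consumer side: with SLAB, a cache constructor runs once per object when a slab is grown (or, if SLAB_POISON debugging is active, from the allocation path further down), not on every kmem_cache_alloc(), and callers are expected to hand objects back in their constructed state. A hedged, module-style sketch of such a cache — the "foo" type and names are invented for illustration:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	int refs;
};

static struct kmem_cache *foo_cache;

/* Called once per object when cache_init_objs() populates a new slab,
 * not on every allocation, so restrict it to invariant setup. */
static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	spin_lock_init(&f->lock);
}

static int foo_cache_init(void)
{
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, 0, foo_ctor);
	return foo_cache ? 0 : -ENOMEM;
}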
2557 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2550 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2558 | { | 2551 | { |
2559 | if (CONFIG_ZONE_DMA_FLAG) { | 2552 | if (CONFIG_ZONE_DMA_FLAG) { |
2560 | if (flags & GFP_DMA) | 2553 | if (flags & GFP_DMA) |
2561 | BUG_ON(!(cachep->allocflags & GFP_DMA)); | 2554 | BUG_ON(!(cachep->allocflags & GFP_DMA)); |
2562 | else | 2555 | else |
2563 | BUG_ON(cachep->allocflags & GFP_DMA); | 2556 | BUG_ON(cachep->allocflags & GFP_DMA); |
2564 | } | 2557 | } |
2565 | } | 2558 | } |
2566 | 2559 | ||
2567 | static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, | 2560 | static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, |
2568 | int nodeid) | 2561 | int nodeid) |
2569 | { | 2562 | { |
2570 | void *objp; | 2563 | void *objp; |
2571 | 2564 | ||
2572 | objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); | 2565 | objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); |
2573 | page->active++; | 2566 | page->active++; |
2574 | #if DEBUG | 2567 | #if DEBUG |
2575 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); | 2568 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); |
2576 | #endif | 2569 | #endif |
2577 | 2570 | ||
2578 | return objp; | 2571 | return objp; |
2579 | } | 2572 | } |
2580 | 2573 | ||
2581 | static void slab_put_obj(struct kmem_cache *cachep, struct page *page, | 2574 | static void slab_put_obj(struct kmem_cache *cachep, struct page *page, |
2582 | void *objp, int nodeid) | 2575 | void *objp, int nodeid) |
2583 | { | 2576 | { |
2584 | unsigned int objnr = obj_to_index(cachep, page, objp); | 2577 | unsigned int objnr = obj_to_index(cachep, page, objp); |
2585 | #if DEBUG | 2578 | #if DEBUG |
2586 | unsigned int i; | 2579 | unsigned int i; |
2587 | 2580 | ||
2588 | /* Verify that the slab belongs to the intended node */ | 2581 | /* Verify that the slab belongs to the intended node */ |
2589 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); | 2582 | WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); |
2590 | 2583 | ||
2591 | /* Verify double free bug */ | 2584 | /* Verify double free bug */ |
2592 | for (i = page->active; i < cachep->num; i++) { | 2585 | for (i = page->active; i < cachep->num; i++) { |
2593 | if (get_free_obj(page, i) == objnr) { | 2586 | if (get_free_obj(page, i) == objnr) { |
2594 | printk(KERN_ERR "slab: double free detected in cache " | 2587 | printk(KERN_ERR "slab: double free detected in cache " |
2595 | "'%s', objp %p\n", cachep->name, objp); | 2588 | "'%s', objp %p\n", cachep->name, objp); |
2596 | BUG(); | 2589 | BUG(); |
2597 | } | 2590 | } |
2598 | } | 2591 | } |
2599 | #endif | 2592 | #endif |
2600 | page->active--; | 2593 | page->active--; |
2601 | set_free_obj(page, page->active, objnr); | 2594 | set_free_obj(page, page->active, objnr); |
2602 | } | 2595 | } |
2603 | 2596 | ||
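slab_get_obj()/slab_put_obj() above, together with get_free_obj()/set_free_obj(), treat page->freelist as a small array of object indices with page->active acting as a stack pointer. The following self-contained userspace sketch reproduces that bookkeeping; the object count, names and entry type are invented (real slabs size the entries as freelist_idx_t):

#include <stdio.h>

#define NUM_OBJS 4u                       /* hypothetical objects per slab */

static unsigned char freelist[NUM_OBJS];  /* indices of still-free objects */
static unsigned int active;               /* like page->active */

static void sim_init(void)
{
	/* cache_init_objs(): slot i initially names object i as free */
	for (unsigned int i = 0; i < NUM_OBJS; i++)
		freelist[i] = i;
	active = 0;
}

static unsigned int sim_get_obj(void)
{
	/* slab_get_obj(): take the index stored at 'active', then advance */
	return freelist[active++];
}

static void sim_put_obj(unsigned int objnr)
{
	/* slab_put_obj(): retreat 'active' and record the freed index there */
	freelist[--active] = objnr;
}

int main(void)
{
	sim_init();
	unsigned int a = sim_get_obj();   /* object 0 */
	unsigned int b = sim_get_obj();   /* object 1 */
	sim_put_obj(a);                   /* 0 becomes the next object handed out */
	printf("got %u and %u; after freeing, next is %u\n", a, b, sim_get_obj());
	return 0;
}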
2604 | /* | 2597 | /* |
2605 | * Map pages beginning at addr to the given cache and slab. This is required | 2598 | * Map pages beginning at addr to the given cache and slab. This is required |
2606 | * for the slab allocator to be able to lookup the cache and slab of a | 2599 | * for the slab allocator to be able to lookup the cache and slab of a |
2607 | * virtual address for kfree, ksize, and slab debugging. | 2600 | * virtual address for kfree, ksize, and slab debugging. |
2608 | */ | 2601 | */ |
2609 | static void slab_map_pages(struct kmem_cache *cache, struct page *page, | 2602 | static void slab_map_pages(struct kmem_cache *cache, struct page *page, |
2610 | void *freelist) | 2603 | void *freelist) |
2611 | { | 2604 | { |
2612 | page->slab_cache = cache; | 2605 | page->slab_cache = cache; |
2613 | page->freelist = freelist; | 2606 | page->freelist = freelist; |
2614 | } | 2607 | } |
2615 | 2608 | ||
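As the comment above says, parking the cache pointer in struct page is what lets kfree() and friends recover the owning cache from nothing but an object address. A minimal kernel-context sketch of that reverse lookup (it mirrors the virt_to_cache() helper used by cache_free_debugcheck() below; the wrapper name here is illustrative):

#include <linux/mm.h>

/* Given any address inside a slab object, find the kmem_cache it came from. */
static struct kmem_cache *cache_of(const void *objp)
{
	struct page *page = virt_to_head_page(objp); /* page backing objp */

	return page->slab_cache; /* stored by slab_map_pages() at cache_grow() time */
}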
2616 | /* | 2609 | /* |
2617 | * Grow (by 1) the number of slabs within a cache. This is called by | 2610 | * Grow (by 1) the number of slabs within a cache. This is called by |
2618 | * kmem_cache_alloc() when there are no active objs left in a cache. | 2611 | * kmem_cache_alloc() when there are no active objs left in a cache. |
2619 | */ | 2612 | */ |
2620 | static int cache_grow(struct kmem_cache *cachep, | 2613 | static int cache_grow(struct kmem_cache *cachep, |
2621 | gfp_t flags, int nodeid, struct page *page) | 2614 | gfp_t flags, int nodeid, struct page *page) |
2622 | { | 2615 | { |
2623 | void *freelist; | 2616 | void *freelist; |
2624 | size_t offset; | 2617 | size_t offset; |
2625 | gfp_t local_flags; | 2618 | gfp_t local_flags; |
2626 | struct kmem_cache_node *n; | 2619 | struct kmem_cache_node *n; |
2627 | 2620 | ||
2628 | /* | 2621 | /* |
2629 | * Be lazy and only check for valid flags here, keeping it out of the | 2622 | * Be lazy and only check for valid flags here, keeping it out of the |
2630 | * critical path in kmem_cache_alloc(). | 2623 | * critical path in kmem_cache_alloc(). |
2631 | */ | 2624 | */ |
2632 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 2625 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
2633 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 2626 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
2634 | 2627 | ||
2635 | /* Take the node list lock to change the colour_next on this node */ | 2628 | /* Take the node list lock to change the colour_next on this node */ |
2636 | check_irq_off(); | 2629 | check_irq_off(); |
2637 | n = get_node(cachep, nodeid); | 2630 | n = get_node(cachep, nodeid); |
2638 | spin_lock(&n->list_lock); | 2631 | spin_lock(&n->list_lock); |
2639 | 2632 | ||
2640 | /* Get colour for the slab, and calc the next value. */ | 2633 | /* Get colour for the slab, and calc the next value. */ |
2641 | offset = n->colour_next; | 2634 | offset = n->colour_next; |
2642 | n->colour_next++; | 2635 | n->colour_next++; |
2643 | if (n->colour_next >= cachep->colour) | 2636 | if (n->colour_next >= cachep->colour) |
2644 | n->colour_next = 0; | 2637 | n->colour_next = 0; |
2645 | spin_unlock(&n->list_lock); | 2638 | spin_unlock(&n->list_lock); |
2646 | 2639 | ||
2647 | offset *= cachep->colour_off; | 2640 | offset *= cachep->colour_off; |
2648 | 2641 | ||
2649 | if (local_flags & __GFP_WAIT) | 2642 | if (local_flags & __GFP_WAIT) |
2650 | local_irq_enable(); | 2643 | local_irq_enable(); |
2651 | 2644 | ||
2652 | /* | 2645 | /* |
2653 | * The test for missing atomic flag is performed here, rather than | 2646 | * The test for missing atomic flag is performed here, rather than |
2654 | * the more obvious place, simply to reduce the critical path length | 2647 | * the more obvious place, simply to reduce the critical path length |
2655 | * in kmem_cache_alloc(). If a caller is seriously mis-behaving they | 2648 | * in kmem_cache_alloc(). If a caller is seriously mis-behaving they |
2656 | * will eventually be caught here (where it matters). | 2649 | * will eventually be caught here (where it matters). |
2657 | */ | 2650 | */ |
2658 | kmem_flagcheck(cachep, flags); | 2651 | kmem_flagcheck(cachep, flags); |
2659 | 2652 | ||
2660 | /* | 2653 | /* |
2661 | * Get mem for the objs. Attempt to allocate a physical page from | 2654 | * Get mem for the objs. Attempt to allocate a physical page from |
2662 | * 'nodeid'. | 2655 | * 'nodeid'. |
2663 | */ | 2656 | */ |
2664 | if (!page) | 2657 | if (!page) |
2665 | page = kmem_getpages(cachep, local_flags, nodeid); | 2658 | page = kmem_getpages(cachep, local_flags, nodeid); |
2666 | if (!page) | 2659 | if (!page) |
2667 | goto failed; | 2660 | goto failed; |
2668 | 2661 | ||
2669 | /* Get slab management. */ | 2662 | /* Get slab management. */ |
2670 | freelist = alloc_slabmgmt(cachep, page, offset, | 2663 | freelist = alloc_slabmgmt(cachep, page, offset, |
2671 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); | 2664 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
2672 | if (!freelist) | 2665 | if (!freelist) |
2673 | goto opps1; | 2666 | goto opps1; |
2674 | 2667 | ||
2675 | slab_map_pages(cachep, page, freelist); | 2668 | slab_map_pages(cachep, page, freelist); |
2676 | 2669 | ||
2677 | cache_init_objs(cachep, page); | 2670 | cache_init_objs(cachep, page); |
2678 | 2671 | ||
2679 | if (local_flags & __GFP_WAIT) | 2672 | if (local_flags & __GFP_WAIT) |
2680 | local_irq_disable(); | 2673 | local_irq_disable(); |
2681 | check_irq_off(); | 2674 | check_irq_off(); |
2682 | spin_lock(&n->list_lock); | 2675 | spin_lock(&n->list_lock); |
2683 | 2676 | ||
2684 | /* Make slab active. */ | 2677 | /* Make slab active. */ |
2685 | list_add_tail(&page->lru, &(n->slabs_free)); | 2678 | list_add_tail(&page->lru, &(n->slabs_free)); |
2686 | STATS_INC_GROWN(cachep); | 2679 | STATS_INC_GROWN(cachep); |
2687 | n->free_objects += cachep->num; | 2680 | n->free_objects += cachep->num; |
2688 | spin_unlock(&n->list_lock); | 2681 | spin_unlock(&n->list_lock); |
2689 | return 1; | 2682 | return 1; |
2690 | opps1: | 2683 | opps1: |
2691 | kmem_freepages(cachep, page); | 2684 | kmem_freepages(cachep, page); |
2692 | failed: | 2685 | failed: |
2693 | if (local_flags & __GFP_WAIT) | 2686 | if (local_flags & __GFP_WAIT) |
2694 | local_irq_disable(); | 2687 | local_irq_disable(); |
2695 | return 0; | 2688 | return 0; |
2696 | } | 2689 | } |
2697 | 2690 | ||
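The colouring logic in cache_grow() staggers where consecutive slabs place their first object so they do not all contend for the same cache lines. A standalone userspace sketch of the offset cycling — the colour count and step below are made up; the kernel derives them from cachep->colour and cachep->colour_off:

#include <stdio.h>

int main(void)
{
	unsigned int colour = 4;        /* hypothetical number of colours */
	unsigned int colour_off = 64;   /* hypothetical step, e.g. one cache line */
	unsigned int colour_next = 0;   /* per-node cursor, like n->colour_next */

	for (int slab = 0; slab < 6; slab++) {
		size_t offset = colour_next * colour_off;

		/* advance the cursor exactly as cache_grow() does */
		colour_next++;
		if (colour_next >= colour)
			colour_next = 0;

		printf("slab %d starts its objects at offset %zu\n", slab, offset);
	}
	return 0;
}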
2698 | #if DEBUG | 2691 | #if DEBUG |
2699 | 2692 | ||
2700 | /* | 2693 | /* |
2701 | * Perform extra freeing checks: | 2694 | * Perform extra freeing checks: |
2702 | * - detect bad pointers. | 2695 | * - detect bad pointers. |
2703 | * - POISON/RED_ZONE checking | 2696 | * - POISON/RED_ZONE checking |
2704 | */ | 2697 | */ |
2705 | static void kfree_debugcheck(const void *objp) | 2698 | static void kfree_debugcheck(const void *objp) |
2706 | { | 2699 | { |
2707 | if (!virt_addr_valid(objp)) { | 2700 | if (!virt_addr_valid(objp)) { |
2708 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", | 2701 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", |
2709 | (unsigned long)objp); | 2702 | (unsigned long)objp); |
2710 | BUG(); | 2703 | BUG(); |
2711 | } | 2704 | } |
2712 | } | 2705 | } |
2713 | 2706 | ||
2714 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | 2707 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) |
2715 | { | 2708 | { |
2716 | unsigned long long redzone1, redzone2; | 2709 | unsigned long long redzone1, redzone2; |
2717 | 2710 | ||
2718 | redzone1 = *dbg_redzone1(cache, obj); | 2711 | redzone1 = *dbg_redzone1(cache, obj); |
2719 | redzone2 = *dbg_redzone2(cache, obj); | 2712 | redzone2 = *dbg_redzone2(cache, obj); |
2720 | 2713 | ||
2721 | /* | 2714 | /* |
2722 | * Redzone is ok. | 2715 | * Redzone is ok. |
2723 | */ | 2716 | */ |
2724 | if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) | 2717 | if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) |
2725 | return; | 2718 | return; |
2726 | 2719 | ||
2727 | if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) | 2720 | if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) |
2728 | slab_error(cache, "double free detected"); | 2721 | slab_error(cache, "double free detected"); |
2729 | else | 2722 | else |
2730 | slab_error(cache, "memory outside object was overwritten"); | 2723 | slab_error(cache, "memory outside object was overwritten"); |
2731 | 2724 | ||
2732 | printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", | 2725 | printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", |
2733 | obj, redzone1, redzone2); | 2726 | obj, redzone1, redzone2); |
2734 | } | 2727 | } |
2735 | 2728 | ||
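verify_redzone_free() reads the two guard words to distinguish a double free (both words still RED_INACTIVE at free time) from a buffer overrun (anything else). The userspace sketch below reproduces that decision on an invented "guarded" layout; the RED_* values mirror the kernel's include/linux/poison.h:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define RED_ACTIVE	0xd84156c5635688c0ULL	/* object is allocated */
#define RED_INACTIVE	0x09f911029d74e35bULL	/* object is free */

struct guarded {			/* debug layout: guard, payload, guard */
	unsigned long long red1;
	char payload[16];
	unsigned long long red2;
};

int main(void)
{
	struct guarded g = { RED_ACTIVE, "", RED_ACTIVE };
	unsigned char *base = (unsigned char *)&g;

	/* Simulate a 17-byte store into the 16-byte payload: the extra byte
	 * lands in red2, which is exactly what the red zone is there to catch. */
	memset(base + offsetof(struct guarded, payload), 'A',
	       sizeof(g.payload) + 1);

	if (g.red1 == RED_INACTIVE && g.red2 == RED_INACTIVE)
		printf("double free detected\n");
	else if (g.red1 != RED_ACTIVE || g.red2 != RED_ACTIVE)
		printf("memory outside object was overwritten\n");
	return 0;
}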
2736 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 2729 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
2737 | unsigned long caller) | 2730 | unsigned long caller) |
2738 | { | 2731 | { |
2739 | unsigned int objnr; | 2732 | unsigned int objnr; |
2740 | struct page *page; | 2733 | struct page *page; |
2741 | 2734 | ||
2742 | BUG_ON(virt_to_cache(objp) != cachep); | 2735 | BUG_ON(virt_to_cache(objp) != cachep); |
2743 | 2736 | ||
2744 | objp -= obj_offset(cachep); | 2737 | objp -= obj_offset(cachep); |
2745 | kfree_debugcheck(objp); | 2738 | kfree_debugcheck(objp); |
2746 | page = virt_to_head_page(objp); | 2739 | page = virt_to_head_page(objp); |
2747 | 2740 | ||
2748 | if (cachep->flags & SLAB_RED_ZONE) { | 2741 | if (cachep->flags & SLAB_RED_ZONE) { |
2749 | verify_redzone_free(cachep, objp); | 2742 | verify_redzone_free(cachep, objp); |
2750 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2743 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2751 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2744 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2752 | } | 2745 | } |
2753 | if (cachep->flags & SLAB_STORE_USER) | 2746 | if (cachep->flags & SLAB_STORE_USER) |
2754 | *dbg_userword(cachep, objp) = (void *)caller; | 2747 | *dbg_userword(cachep, objp) = (void *)caller; |
2755 | 2748 | ||
2756 | objnr = obj_to_index(cachep, page, objp); | 2749 | objnr = obj_to_index(cachep, page, objp); |
2757 | 2750 | ||
2758 | BUG_ON(objnr >= cachep->num); | 2751 | BUG_ON(objnr >= cachep->num); |
2759 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); | 2752 | BUG_ON(objp != index_to_obj(cachep, page, objnr)); |
2760 | 2753 | ||
2761 | set_obj_status(page, objnr, OBJECT_FREE); | 2754 | set_obj_status(page, objnr, OBJECT_FREE); |
2762 | if (cachep->flags & SLAB_POISON) { | 2755 | if (cachep->flags & SLAB_POISON) { |
2763 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2756 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2764 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 2757 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
2765 | store_stackinfo(cachep, objp, caller); | 2758 | store_stackinfo(cachep, objp, caller); |
2766 | kernel_map_pages(virt_to_page(objp), | 2759 | kernel_map_pages(virt_to_page(objp), |
2767 | cachep->size / PAGE_SIZE, 0); | 2760 | cachep->size / PAGE_SIZE, 0); |
2768 | } else { | 2761 | } else { |
2769 | poison_obj(cachep, objp, POISON_FREE); | 2762 | poison_obj(cachep, objp, POISON_FREE); |
2770 | } | 2763 | } |
2771 | #else | 2764 | #else |
2772 | poison_obj(cachep, objp, POISON_FREE); | 2765 | poison_obj(cachep, objp, POISON_FREE); |
2773 | #endif | 2766 | #endif |
2774 | } | 2767 | } |
2775 | return objp; | 2768 | return objp; |
2776 | } | 2769 | } |
2777 | 2770 | ||
2778 | #else | 2771 | #else |
2779 | #define kfree_debugcheck(x) do { } while(0) | 2772 | #define kfree_debugcheck(x) do { } while(0) |
2780 | #define cache_free_debugcheck(x,objp,z) (objp) | 2773 | #define cache_free_debugcheck(x,objp,z) (objp) |
2781 | #endif | 2774 | #endif |
2782 | 2775 | ||
2783 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, | 2776 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
2784 | bool force_refill) | 2777 | bool force_refill) |
2785 | { | 2778 | { |
2786 | int batchcount; | 2779 | int batchcount; |
2787 | struct kmem_cache_node *n; | 2780 | struct kmem_cache_node *n; |
2788 | struct array_cache *ac; | 2781 | struct array_cache *ac; |
2789 | int node; | 2782 | int node; |
2790 | 2783 | ||
2791 | check_irq_off(); | 2784 | check_irq_off(); |
2792 | node = numa_mem_id(); | 2785 | node = numa_mem_id(); |
2793 | if (unlikely(force_refill)) | 2786 | if (unlikely(force_refill)) |
2794 | goto force_grow; | 2787 | goto force_grow; |
2795 | retry: | 2788 | retry: |
2796 | ac = cpu_cache_get(cachep); | 2789 | ac = cpu_cache_get(cachep); |
2797 | batchcount = ac->batchcount; | 2790 | batchcount = ac->batchcount; |
2798 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2791 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
2799 | /* | 2792 | /* |
2800 | * If there was little recent activity on this cache, then | 2793 | * If there was little recent activity on this cache, then |
2801 | * perform only a partial refill. Otherwise we could generate | 2794 | * perform only a partial refill. Otherwise we could generate |
2802 | * refill bouncing. | 2795 | * refill bouncing. |
2803 | */ | 2796 | */ |
2804 | batchcount = BATCHREFILL_LIMIT; | 2797 | batchcount = BATCHREFILL_LIMIT; |
2805 | } | 2798 | } |
2806 | n = get_node(cachep, node); | 2799 | n = get_node(cachep, node); |
2807 | 2800 | ||
2808 | BUG_ON(ac->avail > 0 || !n); | 2801 | BUG_ON(ac->avail > 0 || !n); |
2809 | spin_lock(&n->list_lock); | 2802 | spin_lock(&n->list_lock); |
2810 | 2803 | ||
2811 | /* See if we can refill from the shared array */ | 2804 | /* See if we can refill from the shared array */ |
2812 | if (n->shared && transfer_objects(ac, n->shared, batchcount)) { | 2805 | if (n->shared && transfer_objects(ac, n->shared, batchcount)) { |
2813 | n->shared->touched = 1; | 2806 | n->shared->touched = 1; |
2814 | goto alloc_done; | 2807 | goto alloc_done; |
2815 | } | 2808 | } |
2816 | 2809 | ||
2817 | while (batchcount > 0) { | 2810 | while (batchcount > 0) { |
2818 | struct list_head *entry; | 2811 | struct list_head *entry; |
2819 | struct page *page; | 2812 | struct page *page; |
2820 | /* Get the slab the allocation is to come from. */ | 2813 | /* Get the slab the allocation is to come from. */ |
2821 | entry = n->slabs_partial.next; | 2814 | entry = n->slabs_partial.next; |
2822 | if (entry == &n->slabs_partial) { | 2815 | if (entry == &n->slabs_partial) { |
2823 | n->free_touched = 1; | 2816 | n->free_touched = 1; |
2824 | entry = n->slabs_free.next; | 2817 | entry = n->slabs_free.next; |
2825 | if (entry == &n->slabs_free) | 2818 | if (entry == &n->slabs_free) |
2826 | goto must_grow; | 2819 | goto must_grow; |
2827 | } | 2820 | } |
2828 | 2821 | ||
2829 | page = list_entry(entry, struct page, lru); | 2822 | page = list_entry(entry, struct page, lru); |
2830 | check_spinlock_acquired(cachep); | 2823 | check_spinlock_acquired(cachep); |
2831 | 2824 | ||
2832 | /* | 2825 | /* |
2833 | * The slab was either on partial or free list so | 2826 | * The slab was either on partial or free list so |
2834 | * there must be at least one object available for | 2827 | * there must be at least one object available for |
2835 | * allocation. | 2828 | * allocation. |
2836 | */ | 2829 | */ |
2837 | BUG_ON(page->active >= cachep->num); | 2830 | BUG_ON(page->active >= cachep->num); |
2838 | 2831 | ||
2839 | while (page->active < cachep->num && batchcount--) { | 2832 | while (page->active < cachep->num && batchcount--) { |
2840 | STATS_INC_ALLOCED(cachep); | 2833 | STATS_INC_ALLOCED(cachep); |
2841 | STATS_INC_ACTIVE(cachep); | 2834 | STATS_INC_ACTIVE(cachep); |
2842 | STATS_SET_HIGH(cachep); | 2835 | STATS_SET_HIGH(cachep); |
2843 | 2836 | ||
2844 | ac_put_obj(cachep, ac, slab_get_obj(cachep, page, | 2837 | ac_put_obj(cachep, ac, slab_get_obj(cachep, page, |
2845 | node)); | 2838 | node)); |
2846 | } | 2839 | } |
2847 | 2840 | ||
2848 | /* move slabp to correct slabp list: */ | 2841 | /* move slabp to correct slabp list: */ |
2849 | list_del(&page->lru); | 2842 | list_del(&page->lru); |
2850 | if (page->active == cachep->num) | 2843 | if (page->active == cachep->num) |
2851 | list_add(&page->lru, &n->slabs_full); | 2844 | list_add(&page->lru, &n->slabs_full); |
2852 | else | 2845 | else |
2853 | list_add(&page->lru, &n->slabs_partial); | 2846 | list_add(&page->lru, &n->slabs_partial); |
2854 | } | 2847 | } |
2855 | 2848 | ||
2856 | must_grow: | 2849 | must_grow: |
2857 | n->free_objects -= ac->avail; | 2850 | n->free_objects -= ac->avail; |
2858 | alloc_done: | 2851 | alloc_done: |
2859 | spin_unlock(&n->list_lock); | 2852 | spin_unlock(&n->list_lock); |
2860 | 2853 | ||
2861 | if (unlikely(!ac->avail)) { | 2854 | if (unlikely(!ac->avail)) { |
2862 | int x; | 2855 | int x; |
2863 | force_grow: | 2856 | force_grow: |
2864 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 2857 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
2865 | 2858 | ||
2866 | /* cache_grow can reenable interrupts, then ac could change. */ | 2859 | /* cache_grow can reenable interrupts, then ac could change. */ |
2867 | ac = cpu_cache_get(cachep); | 2860 | ac = cpu_cache_get(cachep); |
2868 | node = numa_mem_id(); | 2861 | node = numa_mem_id(); |
2869 | 2862 | ||
2870 | /* no objects in sight? abort */ | 2863 | /* no objects in sight? abort */ |
2871 | if (!x && (ac->avail == 0 || force_refill)) | 2864 | if (!x && (ac->avail == 0 || force_refill)) |
2872 | return NULL; | 2865 | return NULL; |
2873 | 2866 | ||
2874 | if (!ac->avail) /* objects refilled by interrupt? */ | 2867 | if (!ac->avail) /* objects refilled by interrupt? */ |
2875 | goto retry; | 2868 | goto retry; |
2876 | } | 2869 | } |
2877 | ac->touched = 1; | 2870 | ac->touched = 1; |
2878 | 2871 | ||
2879 | return ac_get_obj(cachep, ac, flags, force_refill); | 2872 | return ac_get_obj(cachep, ac, flags, force_refill); |
2880 | } | 2873 | } |
2881 | 2874 | ||
2882 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 2875 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
2883 | gfp_t flags) | 2876 | gfp_t flags) |
2884 | { | 2877 | { |
2885 | might_sleep_if(flags & __GFP_WAIT); | 2878 | might_sleep_if(flags & __GFP_WAIT); |
2886 | #if DEBUG | 2879 | #if DEBUG |
2887 | kmem_flagcheck(cachep, flags); | 2880 | kmem_flagcheck(cachep, flags); |
2888 | #endif | 2881 | #endif |
2889 | } | 2882 | } |
2890 | 2883 | ||
2891 | #if DEBUG | 2884 | #if DEBUG |
2892 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | 2885 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
2893 | gfp_t flags, void *objp, unsigned long caller) | 2886 | gfp_t flags, void *objp, unsigned long caller) |
2894 | { | 2887 | { |
2895 | struct page *page; | 2888 | struct page *page; |
2896 | 2889 | ||
2897 | if (!objp) | 2890 | if (!objp) |
2898 | return objp; | 2891 | return objp; |
2899 | if (cachep->flags & SLAB_POISON) { | 2892 | if (cachep->flags & SLAB_POISON) { |
2900 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2893 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2901 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 2894 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
2902 | kernel_map_pages(virt_to_page(objp), | 2895 | kernel_map_pages(virt_to_page(objp), |
2903 | cachep->size / PAGE_SIZE, 1); | 2896 | cachep->size / PAGE_SIZE, 1); |
2904 | else | 2897 | else |
2905 | check_poison_obj(cachep, objp); | 2898 | check_poison_obj(cachep, objp); |
2906 | #else | 2899 | #else |
2907 | check_poison_obj(cachep, objp); | 2900 | check_poison_obj(cachep, objp); |
2908 | #endif | 2901 | #endif |
2909 | poison_obj(cachep, objp, POISON_INUSE); | 2902 | poison_obj(cachep, objp, POISON_INUSE); |
2910 | } | 2903 | } |
2911 | if (cachep->flags & SLAB_STORE_USER) | 2904 | if (cachep->flags & SLAB_STORE_USER) |
2912 | *dbg_userword(cachep, objp) = (void *)caller; | 2905 | *dbg_userword(cachep, objp) = (void *)caller; |
2913 | 2906 | ||
2914 | if (cachep->flags & SLAB_RED_ZONE) { | 2907 | if (cachep->flags & SLAB_RED_ZONE) { |
2915 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || | 2908 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
2916 | *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2909 | *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
2917 | slab_error(cachep, "double free, or memory outside" | 2910 | slab_error(cachep, "double free, or memory outside" |
2918 | " object was overwritten"); | 2911 | " object was overwritten"); |
2919 | printk(KERN_ERR | 2912 | printk(KERN_ERR |
2920 | "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", | 2913 | "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", |
2921 | objp, *dbg_redzone1(cachep, objp), | 2914 | objp, *dbg_redzone1(cachep, objp), |
2922 | *dbg_redzone2(cachep, objp)); | 2915 | *dbg_redzone2(cachep, objp)); |
2923 | } | 2916 | } |
2924 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2917 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2925 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2918 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2926 | } | 2919 | } |
2927 | 2920 | ||
2928 | page = virt_to_head_page(objp); | 2921 | page = virt_to_head_page(objp); |
2929 | set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); | 2922 | set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); |
2930 | objp += obj_offset(cachep); | 2923 | objp += obj_offset(cachep); |
2931 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 2924 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
2932 | cachep->ctor(objp); | 2925 | cachep->ctor(objp); |
2933 | if (ARCH_SLAB_MINALIGN && | 2926 | if (ARCH_SLAB_MINALIGN && |
2934 | ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { | 2927 | ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { |
2935 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 2928 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
2936 | objp, (int)ARCH_SLAB_MINALIGN); | 2929 | objp, (int)ARCH_SLAB_MINALIGN); |
2937 | } | 2930 | } |
2938 | return objp; | 2931 | return objp; |
2939 | } | 2932 | } |
2940 | #else | 2933 | #else |
2941 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) | 2934 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) |
2942 | #endif | 2935 | #endif |
2943 | 2936 | ||
2944 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | 2937 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) |
2945 | { | 2938 | { |
2946 | if (unlikely(cachep == kmem_cache)) | 2939 | if (unlikely(cachep == kmem_cache)) |
2947 | return false; | 2940 | return false; |
2948 | 2941 | ||
2949 | return should_failslab(cachep->object_size, flags, cachep->flags); | 2942 | return should_failslab(cachep->object_size, flags, cachep->flags); |
2950 | } | 2943 | } |
2951 | 2944 | ||
2952 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 2945 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
2953 | { | 2946 | { |
2954 | void *objp; | 2947 | void *objp; |
2955 | struct array_cache *ac; | 2948 | struct array_cache *ac; |
2956 | bool force_refill = false; | 2949 | bool force_refill = false; |
2957 | 2950 | ||
2958 | check_irq_off(); | 2951 | check_irq_off(); |
2959 | 2952 | ||
2960 | ac = cpu_cache_get(cachep); | 2953 | ac = cpu_cache_get(cachep); |
2961 | if (likely(ac->avail)) { | 2954 | if (likely(ac->avail)) { |
2962 | ac->touched = 1; | 2955 | ac->touched = 1; |
2963 | objp = ac_get_obj(cachep, ac, flags, false); | 2956 | objp = ac_get_obj(cachep, ac, flags, false); |
2964 | 2957 | ||
2965 | /* | 2958 | /* |
2966 | * Allow for the possibility that all avail objects are not allowed | 2959 | * Allow for the possibility that all avail objects are not allowed |
2967 | * by the current flags | 2960 | * by the current flags |
2968 | */ | 2961 | */ |
2969 | if (objp) { | 2962 | if (objp) { |
2970 | STATS_INC_ALLOCHIT(cachep); | 2963 | STATS_INC_ALLOCHIT(cachep); |
2971 | goto out; | 2964 | goto out; |
2972 | } | 2965 | } |
2973 | force_refill = true; | 2966 | force_refill = true; |
2974 | } | 2967 | } |
2975 | 2968 | ||
2976 | STATS_INC_ALLOCMISS(cachep); | 2969 | STATS_INC_ALLOCMISS(cachep); |
2977 | objp = cache_alloc_refill(cachep, flags, force_refill); | 2970 | objp = cache_alloc_refill(cachep, flags, force_refill); |
2978 | /* | 2971 | /* |
2979 | * the 'ac' may be updated by cache_alloc_refill(), | 2972 | * the 'ac' may be updated by cache_alloc_refill(), |
2980 | * and kmemleak_erase() requires its correct value. | 2973 | * and kmemleak_erase() requires its correct value. |
2981 | */ | 2974 | */ |
2982 | ac = cpu_cache_get(cachep); | 2975 | ac = cpu_cache_get(cachep); |
2983 | 2976 | ||
2984 | out: | 2977 | out: |
2985 | /* | 2978 | /* |
2986 | * To avoid a false negative, if an object that is in one of the | 2979 | * To avoid a false negative, if an object that is in one of the |
2987 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 2980 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
2988 | * treat the array pointers as a reference to the object. | 2981 | * treat the array pointers as a reference to the object. |
2989 | */ | 2982 | */ |
2990 | if (objp) | 2983 | if (objp) |
2991 | kmemleak_erase(&ac->entry[ac->avail]); | 2984 | kmemleak_erase(&ac->entry[ac->avail]); |
2992 | return objp; | 2985 | return objp; |
2993 | } | 2986 | } |
2994 | 2987 | ||
2995 | #ifdef CONFIG_NUMA | 2988 | #ifdef CONFIG_NUMA |
2996 | /* | 2989 | /* |
2997 | * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. | 2990 | * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. |
2998 | * | 2991 | * |
2999 | * If we are in_interrupt, then process context, including cpusets and | 2992 | * If we are in_interrupt, then process context, including cpusets and |
3000 | * mempolicy, may not apply and should not be used for allocation policy. | 2993 | * mempolicy, may not apply and should not be used for allocation policy. |
3001 | */ | 2994 | */ |
3002 | static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | 2995 | static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) |
3003 | { | 2996 | { |
3004 | int nid_alloc, nid_here; | 2997 | int nid_alloc, nid_here; |
3005 | 2998 | ||
3006 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 2999 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3007 | return NULL; | 3000 | return NULL; |
3008 | nid_alloc = nid_here = numa_mem_id(); | 3001 | nid_alloc = nid_here = numa_mem_id(); |
3009 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3002 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3010 | nid_alloc = cpuset_slab_spread_node(); | 3003 | nid_alloc = cpuset_slab_spread_node(); |
3011 | else if (current->mempolicy) | 3004 | else if (current->mempolicy) |
3012 | nid_alloc = mempolicy_slab_node(); | 3005 | nid_alloc = mempolicy_slab_node(); |
3013 | if (nid_alloc != nid_here) | 3006 | if (nid_alloc != nid_here) |
3014 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3007 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3015 | return NULL; | 3008 | return NULL; |
3016 | } | 3009 | } |
3017 | 3010 | ||
3018 | /* | 3011 | /* |
3019 | * Fallback function if there was no memory available and no objects on a | 3012 | * Fallback function if there was no memory available and no objects on a |
3020 | * certain node and fall back is permitted. First we scan all the | 3013 | * certain node and fall back is permitted. First we scan all the |
3021 | * available nodes for available objects. If that fails then we | 3014 | * available nodes for available objects. If that fails then we |
3022 | * perform an allocation without specifying a node. This allows the page | 3015 | * perform an allocation without specifying a node. This allows the page |
3023 | * allocator to do its reclaim / fallback magic. We then insert the | 3016 | * allocator to do its reclaim / fallback magic. We then insert the |
3024 | * slab into the proper nodelist and then allocate from it. | 3017 | * slab into the proper nodelist and then allocate from it. |
3025 | */ | 3018 | */ |
3026 | static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | 3019 | static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) |
3027 | { | 3020 | { |
3028 | struct zonelist *zonelist; | 3021 | struct zonelist *zonelist; |
3029 | gfp_t local_flags; | 3022 | gfp_t local_flags; |
3030 | struct zoneref *z; | 3023 | struct zoneref *z; |
3031 | struct zone *zone; | 3024 | struct zone *zone; |
3032 | enum zone_type high_zoneidx = gfp_zone(flags); | 3025 | enum zone_type high_zoneidx = gfp_zone(flags); |
3033 | void *obj = NULL; | 3026 | void *obj = NULL; |
3034 | int nid; | 3027 | int nid; |
3035 | unsigned int cpuset_mems_cookie; | 3028 | unsigned int cpuset_mems_cookie; |
3036 | 3029 | ||
3037 | if (flags & __GFP_THISNODE) | 3030 | if (flags & __GFP_THISNODE) |
3038 | return NULL; | 3031 | return NULL; |
3039 | 3032 | ||
3040 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3033 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3041 | 3034 | ||
3042 | retry_cpuset: | 3035 | retry_cpuset: |
3043 | cpuset_mems_cookie = read_mems_allowed_begin(); | 3036 | cpuset_mems_cookie = read_mems_allowed_begin(); |
3044 | zonelist = node_zonelist(mempolicy_slab_node(), flags); | 3037 | zonelist = node_zonelist(mempolicy_slab_node(), flags); |
3045 | 3038 | ||
3046 | retry: | 3039 | retry: |
3047 | /* | 3040 | /* |
3048 | * Look through allowed nodes for objects available | 3041 | * Look through allowed nodes for objects available |
3049 | * from existing per node queues. | 3042 | * from existing per node queues. |
3050 | */ | 3043 | */ |
3051 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 3044 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
3052 | nid = zone_to_nid(zone); | 3045 | nid = zone_to_nid(zone); |
3053 | 3046 | ||
3054 | if (cpuset_zone_allowed_hardwall(zone, flags) && | 3047 | if (cpuset_zone_allowed_hardwall(zone, flags) && |
3055 | get_node(cache, nid) && | 3048 | get_node(cache, nid) && |
3056 | get_node(cache, nid)->free_objects) { | 3049 | get_node(cache, nid)->free_objects) { |
3057 | obj = ____cache_alloc_node(cache, | 3050 | obj = ____cache_alloc_node(cache, |
3058 | flags | GFP_THISNODE, nid); | 3051 | flags | GFP_THISNODE, nid); |
3059 | if (obj) | 3052 | if (obj) |
3060 | break; | 3053 | break; |
3061 | } | 3054 | } |
3062 | } | 3055 | } |
3063 | 3056 | ||
3064 | if (!obj) { | 3057 | if (!obj) { |
3065 | /* | 3058 | /* |
3066 | * This allocation will be performed within the constraints | 3059 | * This allocation will be performed within the constraints |
3067 | * of the current cpuset / memory policy requirements. | 3060 | * of the current cpuset / memory policy requirements. |
3068 | * We may trigger various forms of reclaim on the allowed | 3061 | * We may trigger various forms of reclaim on the allowed |
3069 | * set and go into memory reserves if necessary. | 3062 | * set and go into memory reserves if necessary. |
3070 | */ | 3063 | */ |
3071 | struct page *page; | 3064 | struct page *page; |
3072 | 3065 | ||
3073 | if (local_flags & __GFP_WAIT) | 3066 | if (local_flags & __GFP_WAIT) |
3074 | local_irq_enable(); | 3067 | local_irq_enable(); |
3075 | kmem_flagcheck(cache, flags); | 3068 | kmem_flagcheck(cache, flags); |
3076 | page = kmem_getpages(cache, local_flags, numa_mem_id()); | 3069 | page = kmem_getpages(cache, local_flags, numa_mem_id()); |
3077 | if (local_flags & __GFP_WAIT) | 3070 | if (local_flags & __GFP_WAIT) |
3078 | local_irq_disable(); | 3071 | local_irq_disable(); |
3079 | if (page) { | 3072 | if (page) { |
3080 | /* | 3073 | /* |
3081 | * Insert into the appropriate per node queues | 3074 | * Insert into the appropriate per node queues |
3082 | */ | 3075 | */ |
3083 | nid = page_to_nid(page); | 3076 | nid = page_to_nid(page); |
3084 | if (cache_grow(cache, flags, nid, page)) { | 3077 | if (cache_grow(cache, flags, nid, page)) { |
3085 | obj = ____cache_alloc_node(cache, | 3078 | obj = ____cache_alloc_node(cache, |
3086 | flags | GFP_THISNODE, nid); | 3079 | flags | GFP_THISNODE, nid); |
3087 | if (!obj) | 3080 | if (!obj) |
3088 | /* | 3081 | /* |
3089 | * Another processor may allocate the | 3082 | * Another processor may allocate the |
3090 | * objects in the slab since we are | 3083 | * objects in the slab since we are |
3091 | * not holding any locks. | 3084 | * not holding any locks. |
3092 | */ | 3085 | */ |
3093 | goto retry; | 3086 | goto retry; |
3094 | } else { | 3087 | } else { |
3095 | /* cache_grow already freed obj */ | 3088 | /* cache_grow already freed obj */ |
3096 | obj = NULL; | 3089 | obj = NULL; |
3097 | } | 3090 | } |
3098 | } | 3091 | } |
3099 | } | 3092 | } |
3100 | 3093 | ||
3101 | if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) | 3094 | if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) |
3102 | goto retry_cpuset; | 3095 | goto retry_cpuset; |
3103 | return obj; | 3096 | return obj; |
3104 | } | 3097 | } |
3105 | 3098 | ||
3106 | /* | 3099 | /* |
3107 | * An interface to enable slab creation on nodeid | 3100 | * An interface to enable slab creation on nodeid |
3108 | */ | 3101 | */ |
3109 | static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3102 | static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
3110 | int nodeid) | 3103 | int nodeid) |
3111 | { | 3104 | { |
3112 | struct list_head *entry; | 3105 | struct list_head *entry; |
3113 | struct page *page; | 3106 | struct page *page; |
3114 | struct kmem_cache_node *n; | 3107 | struct kmem_cache_node *n; |
3115 | void *obj; | 3108 | void *obj; |
3116 | int x; | 3109 | int x; |
3117 | 3110 | ||
3118 | VM_BUG_ON(nodeid > num_online_nodes()); | 3111 | VM_BUG_ON(nodeid > num_online_nodes()); |
3119 | n = get_node(cachep, nodeid); | 3112 | n = get_node(cachep, nodeid); |
3120 | BUG_ON(!n); | 3113 | BUG_ON(!n); |
3121 | 3114 | ||
3122 | retry: | 3115 | retry: |
3123 | check_irq_off(); | 3116 | check_irq_off(); |
3124 | spin_lock(&n->list_lock); | 3117 | spin_lock(&n->list_lock); |
3125 | entry = n->slabs_partial.next; | 3118 | entry = n->slabs_partial.next; |
3126 | if (entry == &n->slabs_partial) { | 3119 | if (entry == &n->slabs_partial) { |
3127 | n->free_touched = 1; | 3120 | n->free_touched = 1; |
3128 | entry = n->slabs_free.next; | 3121 | entry = n->slabs_free.next; |
3129 | if (entry == &n->slabs_free) | 3122 | if (entry == &n->slabs_free) |
3130 | goto must_grow; | 3123 | goto must_grow; |
3131 | } | 3124 | } |
3132 | 3125 | ||
3133 | page = list_entry(entry, struct page, lru); | 3126 | page = list_entry(entry, struct page, lru); |
3134 | check_spinlock_acquired_node(cachep, nodeid); | 3127 | check_spinlock_acquired_node(cachep, nodeid); |
3135 | 3128 | ||
3136 | STATS_INC_NODEALLOCS(cachep); | 3129 | STATS_INC_NODEALLOCS(cachep); |
3137 | STATS_INC_ACTIVE(cachep); | 3130 | STATS_INC_ACTIVE(cachep); |
3138 | STATS_SET_HIGH(cachep); | 3131 | STATS_SET_HIGH(cachep); |
3139 | 3132 | ||
3140 | BUG_ON(page->active == cachep->num); | 3133 | BUG_ON(page->active == cachep->num); |
3141 | 3134 | ||
3142 | obj = slab_get_obj(cachep, page, nodeid); | 3135 | obj = slab_get_obj(cachep, page, nodeid); |
3143 | n->free_objects--; | 3136 | n->free_objects--; |
3144 | /* move slabp to correct slabp list: */ | 3137 | /* move slabp to correct slabp list: */ |
3145 | list_del(&page->lru); | 3138 | list_del(&page->lru); |
3146 | 3139 | ||
3147 | if (page->active == cachep->num) | 3140 | if (page->active == cachep->num) |
3148 | list_add(&page->lru, &n->slabs_full); | 3141 | list_add(&page->lru, &n->slabs_full); |
3149 | else | 3142 | else |
3150 | list_add(&page->lru, &n->slabs_partial); | 3143 | list_add(&page->lru, &n->slabs_partial); |
3151 | 3144 | ||
3152 | spin_unlock(&n->list_lock); | 3145 | spin_unlock(&n->list_lock); |
3153 | goto done; | 3146 | goto done; |
3154 | 3147 | ||
3155 | must_grow: | 3148 | must_grow: |
3156 | spin_unlock(&n->list_lock); | 3149 | spin_unlock(&n->list_lock); |
3157 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); | 3150 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); |
3158 | if (x) | 3151 | if (x) |
3159 | goto retry; | 3152 | goto retry; |
3160 | 3153 | ||
3161 | return fallback_alloc(cachep, flags); | 3154 | return fallback_alloc(cachep, flags); |
3162 | 3155 | ||
3163 | done: | 3156 | done: |
3164 | return obj; | 3157 | return obj; |
3165 | } | 3158 | } |
3166 | 3159 | ||
3167 | static __always_inline void * | 3160 | static __always_inline void * |
3168 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | 3161 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, |
3169 | unsigned long caller) | 3162 | unsigned long caller) |
3170 | { | 3163 | { |
3171 | unsigned long save_flags; | 3164 | unsigned long save_flags; |
3172 | void *ptr; | 3165 | void *ptr; |
3173 | int slab_node = numa_mem_id(); | 3166 | int slab_node = numa_mem_id(); |
3174 | 3167 | ||
3175 | flags &= gfp_allowed_mask; | 3168 | flags &= gfp_allowed_mask; |
3176 | 3169 | ||
3177 | lockdep_trace_alloc(flags); | 3170 | lockdep_trace_alloc(flags); |
3178 | 3171 | ||
3179 | if (slab_should_failslab(cachep, flags)) | 3172 | if (slab_should_failslab(cachep, flags)) |
3180 | return NULL; | 3173 | return NULL; |
3181 | 3174 | ||
3182 | cachep = memcg_kmem_get_cache(cachep, flags); | 3175 | cachep = memcg_kmem_get_cache(cachep, flags); |
3183 | 3176 | ||
3184 | cache_alloc_debugcheck_before(cachep, flags); | 3177 | cache_alloc_debugcheck_before(cachep, flags); |
3185 | local_irq_save(save_flags); | 3178 | local_irq_save(save_flags); |
3186 | 3179 | ||
3187 | if (nodeid == NUMA_NO_NODE) | 3180 | if (nodeid == NUMA_NO_NODE) |
3188 | nodeid = slab_node; | 3181 | nodeid = slab_node; |
3189 | 3182 | ||
3190 | if (unlikely(!get_node(cachep, nodeid))) { | 3183 | if (unlikely(!get_node(cachep, nodeid))) { |
3191 | /* Node not bootstrapped yet */ | 3184 | /* Node not bootstrapped yet */ |
3192 | ptr = fallback_alloc(cachep, flags); | 3185 | ptr = fallback_alloc(cachep, flags); |
3193 | goto out; | 3186 | goto out; |
3194 | } | 3187 | } |
3195 | 3188 | ||
3196 | if (nodeid == slab_node) { | 3189 | if (nodeid == slab_node) { |
3197 | /* | 3190 | /* |
3198 | * Use the locally cached objects if possible. | 3191 | * Use the locally cached objects if possible. |
3199 | * However ____cache_alloc does not allow fallback | 3192 | * However ____cache_alloc does not allow fallback |
3200 | * to other nodes. It may fail while we still have | 3193 | * to other nodes. It may fail while we still have |
3201 | * objects on other nodes available. | 3194 | * objects on other nodes available. |
3202 | */ | 3195 | */ |
3203 | ptr = ____cache_alloc(cachep, flags); | 3196 | ptr = ____cache_alloc(cachep, flags); |
3204 | if (ptr) | 3197 | if (ptr) |
3205 | goto out; | 3198 | goto out; |
3206 | } | 3199 | } |
3207 | /* ___cache_alloc_node can fall back to other nodes */ | 3200 | /* ___cache_alloc_node can fall back to other nodes */ |
3208 | ptr = ____cache_alloc_node(cachep, flags, nodeid); | 3201 | ptr = ____cache_alloc_node(cachep, flags, nodeid); |
3209 | out: | 3202 | out: |
3210 | local_irq_restore(save_flags); | 3203 | local_irq_restore(save_flags); |
3211 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3204 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3212 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, | 3205 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, |
3213 | flags); | 3206 | flags); |
3214 | 3207 | ||
3215 | if (likely(ptr)) { | 3208 | if (likely(ptr)) { |
3216 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); | 3209 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); |
3217 | if (unlikely(flags & __GFP_ZERO)) | 3210 | if (unlikely(flags & __GFP_ZERO)) |
3218 | memset(ptr, 0, cachep->object_size); | 3211 | memset(ptr, 0, cachep->object_size); |
3219 | } | 3212 | } |
3220 | 3213 | ||
3221 | return ptr; | 3214 | return ptr; |
3222 | } | 3215 | } |
3223 | 3216 | ||
3224 | static __always_inline void * | 3217 | static __always_inline void * |
3225 | __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) | 3218 | __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) |
3226 | { | 3219 | { |
3227 | void *objp; | 3220 | void *objp; |
3228 | 3221 | ||
3229 | if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { | 3222 | if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { |
3230 | objp = alternate_node_alloc(cache, flags); | 3223 | objp = alternate_node_alloc(cache, flags); |
3231 | if (objp) | 3224 | if (objp) |
3232 | goto out; | 3225 | goto out; |
3233 | } | 3226 | } |
3234 | objp = ____cache_alloc(cache, flags); | 3227 | objp = ____cache_alloc(cache, flags); |
3235 | 3228 | ||
3236 | /* | 3229 | /* |
3237 | * We may just have run out of memory on the local node. | 3230 | * We may just have run out of memory on the local node. |
3238 | * ____cache_alloc_node() knows how to locate memory on other nodes | 3231 | * ____cache_alloc_node() knows how to locate memory on other nodes |
3239 | */ | 3232 | */ |
3240 | if (!objp) | 3233 | if (!objp) |
3241 | objp = ____cache_alloc_node(cache, flags, numa_mem_id()); | 3234 | objp = ____cache_alloc_node(cache, flags, numa_mem_id()); |
3242 | 3235 | ||
3243 | out: | 3236 | out: |
3244 | return objp; | 3237 | return objp; |
3245 | } | 3238 | } |
3246 | #else | 3239 | #else |
3247 | 3240 | ||
3248 | static __always_inline void * | 3241 | static __always_inline void * |
3249 | __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3242 | __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3250 | { | 3243 | { |
3251 | return ____cache_alloc(cachep, flags); | 3244 | return ____cache_alloc(cachep, flags); |
3252 | } | 3245 | } |
3253 | 3246 | ||
3254 | #endif /* CONFIG_NUMA */ | 3247 | #endif /* CONFIG_NUMA */ |
3255 | 3248 | ||
3256 | static __always_inline void * | 3249 | static __always_inline void * |
3257 | slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | 3250 | slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) |
3258 | { | 3251 | { |
3259 | unsigned long save_flags; | 3252 | unsigned long save_flags; |
3260 | void *objp; | 3253 | void *objp; |
3261 | 3254 | ||
3262 | flags &= gfp_allowed_mask; | 3255 | flags &= gfp_allowed_mask; |
3263 | 3256 | ||
3264 | lockdep_trace_alloc(flags); | 3257 | lockdep_trace_alloc(flags); |
3265 | 3258 | ||
3266 | if (slab_should_failslab(cachep, flags)) | 3259 | if (slab_should_failslab(cachep, flags)) |
3267 | return NULL; | 3260 | return NULL; |
3268 | 3261 | ||
3269 | cachep = memcg_kmem_get_cache(cachep, flags); | 3262 | cachep = memcg_kmem_get_cache(cachep, flags); |
3270 | 3263 | ||
3271 | cache_alloc_debugcheck_before(cachep, flags); | 3264 | cache_alloc_debugcheck_before(cachep, flags); |
3272 | local_irq_save(save_flags); | 3265 | local_irq_save(save_flags); |
3273 | objp = __do_cache_alloc(cachep, flags); | 3266 | objp = __do_cache_alloc(cachep, flags); |
3274 | local_irq_restore(save_flags); | 3267 | local_irq_restore(save_flags); |
3275 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3268 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3276 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, | 3269 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, |
3277 | flags); | 3270 | flags); |
3278 | prefetchw(objp); | 3271 | prefetchw(objp); |
3279 | 3272 | ||
3280 | if (likely(objp)) { | 3273 | if (likely(objp)) { |
3281 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); | 3274 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); |
3282 | if (unlikely(flags & __GFP_ZERO)) | 3275 | if (unlikely(flags & __GFP_ZERO)) |
3283 | memset(objp, 0, cachep->object_size); | 3276 | memset(objp, 0, cachep->object_size); |
3284 | } | 3277 | } |
3285 | 3278 | ||
3286 | return objp; | 3279 | return objp; |
3287 | } | 3280 | } |
3288 | 3281 | ||
3289 | /* | 3282 | /* |
3290 | * Caller needs to acquire correct kmem_cache_node's list_lock | 3283 | * Caller needs to acquire correct kmem_cache_node's list_lock |
3291 | * @list: List of detached free slabs should be freed by caller | 3284 | * @list: List of detached free slabs should be freed by caller |
3292 | */ | 3285 | */ |
3293 | static void free_block(struct kmem_cache *cachep, void **objpp, | 3286 | static void free_block(struct kmem_cache *cachep, void **objpp, |
3294 | int nr_objects, int node, struct list_head *list) | 3287 | int nr_objects, int node, struct list_head *list) |
3295 | { | 3288 | { |
3296 | int i; | 3289 | int i; |
3297 | struct kmem_cache_node *n = get_node(cachep, node); | 3290 | struct kmem_cache_node *n = get_node(cachep, node); |
3298 | 3291 | ||
3299 | for (i = 0; i < nr_objects; i++) { | 3292 | for (i = 0; i < nr_objects; i++) { |
3300 | void *objp; | 3293 | void *objp; |
3301 | struct page *page; | 3294 | struct page *page; |
3302 | 3295 | ||
3303 | clear_obj_pfmemalloc(&objpp[i]); | 3296 | clear_obj_pfmemalloc(&objpp[i]); |
3304 | objp = objpp[i]; | 3297 | objp = objpp[i]; |
3305 | 3298 | ||
3306 | page = virt_to_head_page(objp); | 3299 | page = virt_to_head_page(objp); |
3307 | list_del(&page->lru); | 3300 | list_del(&page->lru); |
3308 | check_spinlock_acquired_node(cachep, node); | 3301 | check_spinlock_acquired_node(cachep, node); |
3309 | slab_put_obj(cachep, page, objp, node); | 3302 | slab_put_obj(cachep, page, objp, node); |
3310 | STATS_DEC_ACTIVE(cachep); | 3303 | STATS_DEC_ACTIVE(cachep); |
3311 | n->free_objects++; | 3304 | n->free_objects++; |
3312 | 3305 | ||
3313 | /* fixup slab chains */ | 3306 | /* fixup slab chains */ |
3314 | if (page->active == 0) { | 3307 | if (page->active == 0) { |
3315 | if (n->free_objects > n->free_limit) { | 3308 | if (n->free_objects > n->free_limit) { |
3316 | n->free_objects -= cachep->num; | 3309 | n->free_objects -= cachep->num; |
3317 | list_add_tail(&page->lru, list); | 3310 | list_add_tail(&page->lru, list); |
3318 | } else { | 3311 | } else { |
3319 | list_add(&page->lru, &n->slabs_free); | 3312 | list_add(&page->lru, &n->slabs_free); |
3320 | } | 3313 | } |
3321 | } else { | 3314 | } else { |
3322 | /* Unconditionally move a slab to the end of the | 3315 | /* Unconditionally move a slab to the end of the |
3323 | * partial list on free - maximum time for the | 3316 | * partial list on free - maximum time for the |
3324 | * other objects to be freed, too. | 3317 | * other objects to be freed, too. |
3325 | */ | 3318 | */ |
3326 | list_add_tail(&page->lru, &n->slabs_partial); | 3319 | list_add_tail(&page->lru, &n->slabs_partial); |
3327 | } | 3320 | } |
3328 | } | 3321 | } |
3329 | } | 3322 | } |
3330 | 3323 | ||
3331 | static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | 3324 | static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) |
3332 | { | 3325 | { |
3333 | int batchcount; | 3326 | int batchcount; |
3334 | struct kmem_cache_node *n; | 3327 | struct kmem_cache_node *n; |
3335 | int node = numa_mem_id(); | 3328 | int node = numa_mem_id(); |
3336 | LIST_HEAD(list); | 3329 | LIST_HEAD(list); |
3337 | 3330 | ||
3338 | batchcount = ac->batchcount; | 3331 | batchcount = ac->batchcount; |
3339 | #if DEBUG | 3332 | #if DEBUG |
3340 | BUG_ON(!batchcount || batchcount > ac->avail); | 3333 | BUG_ON(!batchcount || batchcount > ac->avail); |
3341 | #endif | 3334 | #endif |
3342 | check_irq_off(); | 3335 | check_irq_off(); |
3343 | n = get_node(cachep, node); | 3336 | n = get_node(cachep, node); |
3344 | spin_lock(&n->list_lock); | 3337 | spin_lock(&n->list_lock); |
3345 | if (n->shared) { | 3338 | if (n->shared) { |
3346 | struct array_cache *shared_array = n->shared; | 3339 | struct array_cache *shared_array = n->shared; |
3347 | int max = shared_array->limit - shared_array->avail; | 3340 | int max = shared_array->limit - shared_array->avail; |
3348 | if (max) { | 3341 | if (max) { |
3349 | if (batchcount > max) | 3342 | if (batchcount > max) |
3350 | batchcount = max; | 3343 | batchcount = max; |
3351 | memcpy(&(shared_array->entry[shared_array->avail]), | 3344 | memcpy(&(shared_array->entry[shared_array->avail]), |
3352 | ac->entry, sizeof(void *) * batchcount); | 3345 | ac->entry, sizeof(void *) * batchcount); |
3353 | shared_array->avail += batchcount; | 3346 | shared_array->avail += batchcount; |
3354 | goto free_done; | 3347 | goto free_done; |
3355 | } | 3348 | } |
3356 | } | 3349 | } |
3357 | 3350 | ||
3358 | free_block(cachep, ac->entry, batchcount, node, &list); | 3351 | free_block(cachep, ac->entry, batchcount, node, &list); |
3359 | free_done: | 3352 | free_done: |
3360 | #if STATS | 3353 | #if STATS |
3361 | { | 3354 | { |
3362 | int i = 0; | 3355 | int i = 0; |
3363 | struct list_head *p; | 3356 | struct list_head *p; |
3364 | 3357 | ||
3365 | p = n->slabs_free.next; | 3358 | p = n->slabs_free.next; |
3366 | while (p != &(n->slabs_free)) { | 3359 | while (p != &(n->slabs_free)) { |
3367 | struct page *page; | 3360 | struct page *page; |
3368 | 3361 | ||
3369 | page = list_entry(p, struct page, lru); | 3362 | page = list_entry(p, struct page, lru); |
3370 | BUG_ON(page->active); | 3363 | BUG_ON(page->active); |
3371 | 3364 | ||
3372 | i++; | 3365 | i++; |
3373 | p = p->next; | 3366 | p = p->next; |
3374 | } | 3367 | } |
3375 | STATS_SET_FREEABLE(cachep, i); | 3368 | STATS_SET_FREEABLE(cachep, i); |
3376 | } | 3369 | } |
3377 | #endif | 3370 | #endif |
3378 | spin_unlock(&n->list_lock); | 3371 | spin_unlock(&n->list_lock); |
3379 | slabs_destroy(cachep, &list); | 3372 | slabs_destroy(cachep, &list); |
3380 | ac->avail -= batchcount; | 3373 | ac->avail -= batchcount; |
3381 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); | 3374 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); |
3382 | } | 3375 | } |
3383 | 3376 | ||
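The tail of cache_flusharray() is a plain array compaction: the first batchcount pointers have just been pushed to the shared array or handed to free_block(), so the survivors slide down to the front of the per-CPU array. A tiny userspace sketch with made-up pointer values:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Hypothetical per-CPU array cache: 5 cached pointers, flush 2 of them. */
	void *entry[8] = { (void *)1, (void *)2, (void *)3, (void *)4, (void *)5 };
	unsigned int avail = 5, batchcount = 2;

	/* entries [0, batchcount) have been drained; keep the rest, in order */
	avail -= batchcount;
	memmove(entry, &entry[batchcount], sizeof(void *) * avail);

	for (unsigned int i = 0; i < avail; i++)
		printf("entry[%u] = %p\n", i, entry[i]);
	return 0;
}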
3384 | /* | 3377 | /* |
3385 | * Release an obj back to its cache. If the obj has a constructed state, it must | 3378 | * Release an obj back to its cache. If the obj has a constructed state, it must |
3386 | * be in this state _before_ it is released. Called with disabled ints. | 3379 | * be in this state _before_ it is released. Called with disabled ints. |
3387 | */ | 3380 | */ |
3388 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, | 3381 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, |
3389 | unsigned long caller) | 3382 | unsigned long caller) |
3390 | { | 3383 | { |
3391 | struct array_cache *ac = cpu_cache_get(cachep); | 3384 | struct array_cache *ac = cpu_cache_get(cachep); |
3392 | 3385 | ||
3393 | check_irq_off(); | 3386 | check_irq_off(); |
3394 | kmemleak_free_recursive(objp, cachep->flags); | 3387 | kmemleak_free_recursive(objp, cachep->flags); |
3395 | objp = cache_free_debugcheck(cachep, objp, caller); | 3388 | objp = cache_free_debugcheck(cachep, objp, caller); |
3396 | 3389 | ||
3397 | kmemcheck_slab_free(cachep, objp, cachep->object_size); | 3390 | kmemcheck_slab_free(cachep, objp, cachep->object_size); |
3398 | 3391 | ||
3399 | /* | 3392 | /* |
3400 | * Skip calling cache_free_alien() when the platform is not numa. | 3393 | * Skip calling cache_free_alien() when the platform is not numa. |
3401 | * This will avoid cache misses that happen while accessing slabp (which | 3394 | * This will avoid cache misses that happen while accessing slabp (which |
3402 | * is a per-page memory reference) to get nodeid. Instead use a global | 3395 | * is a per-page memory reference) to get nodeid. Instead use a global |
3403 | * variable to skip the call, which is most likely to be present in | 3396 | * variable to skip the call, which is most likely to be present in |
3404 | * the cache. | 3397 | * the cache. |
3405 | */ | 3398 | */ |
3406 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) | 3399 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3407 | return; | 3400 | return; |
3408 | 3401 | ||
3409 | if (likely(ac->avail < ac->limit)) { | 3402 | if (likely(ac->avail < ac->limit)) { |
3410 | STATS_INC_FREEHIT(cachep); | 3403 | STATS_INC_FREEHIT(cachep); |
3411 | } else { | 3404 | } else { |
3412 | STATS_INC_FREEMISS(cachep); | 3405 | STATS_INC_FREEMISS(cachep); |
3413 | cache_flusharray(cachep, ac); | 3406 | cache_flusharray(cachep, ac); |
3414 | } | 3407 | } |
3415 | 3408 | ||
3416 | ac_put_obj(cachep, ac, objp); | 3409 | ac_put_obj(cachep, ac, objp); |
3417 | } | 3410 | } |
3418 | 3411 | ||
3419 | /** | 3412 | /** |
3420 | * kmem_cache_alloc - Allocate an object | 3413 | * kmem_cache_alloc - Allocate an object |
3421 | * @cachep: The cache to allocate from. | 3414 | * @cachep: The cache to allocate from. |
3422 | * @flags: See kmalloc(). | 3415 | * @flags: See kmalloc(). |
3423 | * | 3416 | * |
3424 | * Allocate an object from this cache. The flags are only relevant | 3417 | * Allocate an object from this cache. The flags are only relevant |
3425 | * if the cache has no available objects. | 3418 | * if the cache has no available objects. |
3426 | */ | 3419 | */ |
3427 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3420 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3428 | { | 3421 | { |
3429 | void *ret = slab_alloc(cachep, flags, _RET_IP_); | 3422 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
3430 | 3423 | ||
3431 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3424 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3432 | cachep->object_size, cachep->size, flags); | 3425 | cachep->object_size, cachep->size, flags); |
3433 | 3426 | ||
3434 | return ret; | 3427 | return ret; |
3435 | } | 3428 | } |
3436 | EXPORT_SYMBOL(kmem_cache_alloc); | 3429 | EXPORT_SYMBOL(kmem_cache_alloc); |
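A minimal usage sketch for the allocation path above, not taken from this file: the struct, cache name and function names are hypothetical, while kmem_cache_create(), kmem_cache_alloc(), SLAB_HWCACHE_ALIGN and GFP_KERNEL are the real interfaces being exercised.

#include <linux/init.h>
#include <linux/slab.h>

struct foo {                            /* hypothetical object type */
        int id;
        char name[16];
};

static struct kmem_cache *foo_cache;    /* hypothetical dedicated cache */

static int __init foo_setup(void)
{
        foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
                                      SLAB_HWCACHE_ALIGN, NULL);
        return foo_cache ? 0 : -ENOMEM;
}

static struct foo *foo_alloc(void)
{
        /* The gfp flags only matter when the per-cpu array is empty and
         * the cache must be refilled; otherwise a cached object is
         * returned directly. */
        return kmem_cache_alloc(foo_cache, GFP_KERNEL);
}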
3437 | 3430 | ||
3438 | #ifdef CONFIG_TRACING | 3431 | #ifdef CONFIG_TRACING |
3439 | void * | 3432 | void * |
3440 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) | 3433 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) |
3441 | { | 3434 | { |
3442 | void *ret; | 3435 | void *ret; |
3443 | 3436 | ||
3444 | ret = slab_alloc(cachep, flags, _RET_IP_); | 3437 | ret = slab_alloc(cachep, flags, _RET_IP_); |
3445 | 3438 | ||
3446 | trace_kmalloc(_RET_IP_, ret, | 3439 | trace_kmalloc(_RET_IP_, ret, |
3447 | size, cachep->size, flags); | 3440 | size, cachep->size, flags); |
3448 | return ret; | 3441 | return ret; |
3449 | } | 3442 | } |
3450 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 3443 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
3451 | #endif | 3444 | #endif |
3452 | 3445 | ||
3453 | #ifdef CONFIG_NUMA | 3446 | #ifdef CONFIG_NUMA |
3454 | /** | 3447 | /** |
3455 | * kmem_cache_alloc_node - Allocate an object on the specified node | 3448 | * kmem_cache_alloc_node - Allocate an object on the specified node |
3456 | * @cachep: The cache to allocate from. | 3449 | * @cachep: The cache to allocate from. |
3457 | * @flags: See kmalloc(). | 3450 | * @flags: See kmalloc(). |
3458 | * @nodeid: node number of the target node. | 3451 | * @nodeid: node number of the target node. |
3459 | * | 3452 | * |
3460 | * Identical to kmem_cache_alloc but it will allocate memory on the given | 3453 | * Identical to kmem_cache_alloc but it will allocate memory on the given |
3461 | * node, which can improve the performance for cpu bound structures. | 3454 | * node, which can improve the performance for cpu bound structures. |
3462 | * | 3455 | * |
3463 | * Fallback to other node is possible if __GFP_THISNODE is not set. | 3456 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3464 | */ | 3457 | */ |
3465 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3458 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
3466 | { | 3459 | { |
3467 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3460 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3468 | 3461 | ||
3469 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3462 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3470 | cachep->object_size, cachep->size, | 3463 | cachep->object_size, cachep->size, |
3471 | flags, nodeid); | 3464 | flags, nodeid); |
3472 | 3465 | ||
3473 | return ret; | 3466 | return ret; |
3474 | } | 3467 | } |
3475 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3468 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
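A node-affine variant of the same sketch; foo_cache is the hypothetical cache from the earlier sketch and target_nid a hypothetical node id (numa_node_id() would pick the local node).

static struct foo *foo_alloc_on(int target_nid)
{
        /* With __GFP_THISNODE the allocation fails rather than fall back
         * to another node; without it, fallback is allowed as noted above. */
        return kmem_cache_alloc_node(foo_cache,
                                     GFP_KERNEL | __GFP_THISNODE, target_nid);
}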
3476 | 3469 | ||
3477 | #ifdef CONFIG_TRACING | 3470 | #ifdef CONFIG_TRACING |
3478 | void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, | 3471 | void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, |
3479 | gfp_t flags, | 3472 | gfp_t flags, |
3480 | int nodeid, | 3473 | int nodeid, |
3481 | size_t size) | 3474 | size_t size) |
3482 | { | 3475 | { |
3483 | void *ret; | 3476 | void *ret; |
3484 | 3477 | ||
3485 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3478 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3486 | 3479 | ||
3487 | trace_kmalloc_node(_RET_IP_, ret, | 3480 | trace_kmalloc_node(_RET_IP_, ret, |
3488 | size, cachep->size, | 3481 | size, cachep->size, |
3489 | flags, nodeid); | 3482 | flags, nodeid); |
3490 | return ret; | 3483 | return ret; |
3491 | } | 3484 | } |
3492 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 3485 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
3493 | #endif | 3486 | #endif |
3494 | 3487 | ||
3495 | static __always_inline void * | 3488 | static __always_inline void * |
3496 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | 3489 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) |
3497 | { | 3490 | { |
3498 | struct kmem_cache *cachep; | 3491 | struct kmem_cache *cachep; |
3499 | 3492 | ||
3500 | cachep = kmalloc_slab(size, flags); | 3493 | cachep = kmalloc_slab(size, flags); |
3501 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3494 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3502 | return cachep; | 3495 | return cachep; |
3503 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3496 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3504 | } | 3497 | } |
3505 | 3498 | ||
3506 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3499 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3507 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3500 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3508 | { | 3501 | { |
3509 | return __do_kmalloc_node(size, flags, node, _RET_IP_); | 3502 | return __do_kmalloc_node(size, flags, node, _RET_IP_); |
3510 | } | 3503 | } |
3511 | EXPORT_SYMBOL(__kmalloc_node); | 3504 | EXPORT_SYMBOL(__kmalloc_node); |
3512 | 3505 | ||
3513 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | 3506 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, |
3514 | int node, unsigned long caller) | 3507 | int node, unsigned long caller) |
3515 | { | 3508 | { |
3516 | return __do_kmalloc_node(size, flags, node, caller); | 3509 | return __do_kmalloc_node(size, flags, node, caller); |
3517 | } | 3510 | } |
3518 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | 3511 | EXPORT_SYMBOL(__kmalloc_node_track_caller); |
3519 | #else | 3512 | #else |
3520 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3513 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3521 | { | 3514 | { |
3522 | return __do_kmalloc_node(size, flags, node, 0); | 3515 | return __do_kmalloc_node(size, flags, node, 0); |
3523 | } | 3516 | } |
3524 | EXPORT_SYMBOL(__kmalloc_node); | 3517 | EXPORT_SYMBOL(__kmalloc_node); |
3525 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ | 3518 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
3526 | #endif /* CONFIG_NUMA */ | 3519 | #endif /* CONFIG_NUMA */ |
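Roughly speaking, a kmalloc_node() call with a non-constant size ends up in __do_kmalloc_node() above, which picks the matching kmalloc size-class cache via kmalloc_slab(). A hedged sketch (nid is a hypothetical node id):

static void *grab_page_buffer(int nid)
{
        /* a 4096-byte request is served from the kmalloc-4096 cache */
        return kmalloc_node(4096, GFP_KERNEL, nid);
}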
3527 | 3520 | ||
3528 | /** | 3521 | /** |
3529 | * __do_kmalloc - allocate memory | 3522 | * __do_kmalloc - allocate memory |
3530 | * @size: how many bytes of memory are required. | 3523 | * @size: how many bytes of memory are required. |
3531 | * @flags: the type of memory to allocate (see kmalloc). | 3524 | * @flags: the type of memory to allocate (see kmalloc). |
3532 | * @caller: caller's return address, for debug tracking of the caller | 3525 | * @caller: caller's return address, for debug tracking of the caller |
3533 | */ | 3526 | */ |
3534 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3527 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3535 | unsigned long caller) | 3528 | unsigned long caller) |
3536 | { | 3529 | { |
3537 | struct kmem_cache *cachep; | 3530 | struct kmem_cache *cachep; |
3538 | void *ret; | 3531 | void *ret; |
3539 | 3532 | ||
3540 | cachep = kmalloc_slab(size, flags); | 3533 | cachep = kmalloc_slab(size, flags); |
3541 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3534 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3542 | return cachep; | 3535 | return cachep; |
3543 | ret = slab_alloc(cachep, flags, caller); | 3536 | ret = slab_alloc(cachep, flags, caller); |
3544 | 3537 | ||
3545 | trace_kmalloc(caller, ret, | 3538 | trace_kmalloc(caller, ret, |
3546 | size, cachep->size, flags); | 3539 | size, cachep->size, flags); |
3547 | 3540 | ||
3548 | return ret; | 3541 | return ret; |
3549 | } | 3542 | } |
3550 | 3543 | ||
3551 | 3544 | ||
3552 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3545 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3553 | void *__kmalloc(size_t size, gfp_t flags) | 3546 | void *__kmalloc(size_t size, gfp_t flags) |
3554 | { | 3547 | { |
3555 | return __do_kmalloc(size, flags, _RET_IP_); | 3548 | return __do_kmalloc(size, flags, _RET_IP_); |
3556 | } | 3549 | } |
3557 | EXPORT_SYMBOL(__kmalloc); | 3550 | EXPORT_SYMBOL(__kmalloc); |
3558 | 3551 | ||
3559 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | 3552 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) |
3560 | { | 3553 | { |
3561 | return __do_kmalloc(size, flags, caller); | 3554 | return __do_kmalloc(size, flags, caller); |
3562 | } | 3555 | } |
3563 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3556 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3564 | 3557 | ||
3565 | #else | 3558 | #else |
3566 | void *__kmalloc(size_t size, gfp_t flags) | 3559 | void *__kmalloc(size_t size, gfp_t flags) |
3567 | { | 3560 | { |
3568 | return __do_kmalloc(size, flags, 0); | 3561 | return __do_kmalloc(size, flags, 0); |
3569 | } | 3562 | } |
3570 | EXPORT_SYMBOL(__kmalloc); | 3563 | EXPORT_SYMBOL(__kmalloc); |
3571 | #endif | 3564 | #endif |
3572 | 3565 | ||
3573 | /** | 3566 | /** |
3574 | * kmem_cache_free - Deallocate an object | 3567 | * kmem_cache_free - Deallocate an object |
3575 | * @cachep: The cache the allocation was from. | 3568 | * @cachep: The cache the allocation was from. |
3576 | * @objp: The previously allocated object. | 3569 | * @objp: The previously allocated object. |
3577 | * | 3570 | * |
3578 | * Free an object which was previously allocated from this | 3571 | * Free an object which was previously allocated from this |
3579 | * cache. | 3572 | * cache. |
3580 | */ | 3573 | */ |
3581 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 3574 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
3582 | { | 3575 | { |
3583 | unsigned long flags; | 3576 | unsigned long flags; |
3584 | cachep = cache_from_obj(cachep, objp); | 3577 | cachep = cache_from_obj(cachep, objp); |
3585 | if (!cachep) | 3578 | if (!cachep) |
3586 | return; | 3579 | return; |
3587 | 3580 | ||
3588 | local_irq_save(flags); | 3581 | local_irq_save(flags); |
3589 | debug_check_no_locks_freed(objp, cachep->object_size); | 3582 | debug_check_no_locks_freed(objp, cachep->object_size); |
3590 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3583 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3591 | debug_check_no_obj_freed(objp, cachep->object_size); | 3584 | debug_check_no_obj_freed(objp, cachep->object_size); |
3592 | __cache_free(cachep, objp, _RET_IP_); | 3585 | __cache_free(cachep, objp, _RET_IP_); |
3593 | local_irq_restore(flags); | 3586 | local_irq_restore(flags); |
3594 | 3587 | ||
3595 | trace_kmem_cache_free(_RET_IP_, objp); | 3588 | trace_kmem_cache_free(_RET_IP_, objp); |
3596 | } | 3589 | } |
3597 | EXPORT_SYMBOL(kmem_cache_free); | 3590 | EXPORT_SYMBOL(kmem_cache_free); |
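Continuing the earlier hypothetical sketch, an object goes back to the cache it was allocated from:

static void foo_release(struct foo *f)
{
        /* cache_from_obj() above essentially verifies the cache/object
         * pairing (and redirects between a memcg child cache and its root). */
        kmem_cache_free(foo_cache, f);
}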
3598 | 3591 | ||
3599 | /** | 3592 | /** |
3600 | * kfree - free previously allocated memory | 3593 | * kfree - free previously allocated memory |
3601 | * @objp: pointer returned by kmalloc. | 3594 | * @objp: pointer returned by kmalloc. |
3602 | * | 3595 | * |
3603 | * If @objp is NULL, no operation is performed. | 3596 | * If @objp is NULL, no operation is performed. |
3604 | * | 3597 | * |
3605 | * Don't free memory not originally allocated by kmalloc() | 3598 | * Don't free memory not originally allocated by kmalloc() |
3606 | * or you will run into trouble. | 3599 | * or you will run into trouble. |
3607 | */ | 3600 | */ |
3608 | void kfree(const void *objp) | 3601 | void kfree(const void *objp) |
3609 | { | 3602 | { |
3610 | struct kmem_cache *c; | 3603 | struct kmem_cache *c; |
3611 | unsigned long flags; | 3604 | unsigned long flags; |
3612 | 3605 | ||
3613 | trace_kfree(_RET_IP_, objp); | 3606 | trace_kfree(_RET_IP_, objp); |
3614 | 3607 | ||
3615 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 3608 | if (unlikely(ZERO_OR_NULL_PTR(objp))) |
3616 | return; | 3609 | return; |
3617 | local_irq_save(flags); | 3610 | local_irq_save(flags); |
3618 | kfree_debugcheck(objp); | 3611 | kfree_debugcheck(objp); |
3619 | c = virt_to_cache(objp); | 3612 | c = virt_to_cache(objp); |
3620 | debug_check_no_locks_freed(objp, c->object_size); | 3613 | debug_check_no_locks_freed(objp, c->object_size); |
3621 | 3614 | ||
3622 | debug_check_no_obj_freed(objp, c->object_size); | 3615 | debug_check_no_obj_freed(objp, c->object_size); |
3623 | __cache_free(c, (void *)objp, _RET_IP_); | 3616 | __cache_free(c, (void *)objp, _RET_IP_); |
3624 | local_irq_restore(flags); | 3617 | local_irq_restore(flags); |
3625 | } | 3618 | } |
3626 | EXPORT_SYMBOL(kfree); | 3619 | EXPORT_SYMBOL(kfree); |
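And the kmalloc()/kfree() pairing that the comment above describes, as a sketch:

static void kmalloc_roundtrip(void)
{
        char *buf = kmalloc(128, GFP_KERNEL);

        if (buf) {
                /* ... use buf ... */
                kfree(buf);
        }
        kfree(NULL);            /* allowed: no operation is performed */
}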
3627 | 3620 | ||
3628 | /* | 3621 | /* |
3629 | * This initializes kmem_cache_node or resizes various caches for all nodes. | 3622 | * This initializes kmem_cache_node or resizes various caches for all nodes. |
3630 | */ | 3623 | */ |
3631 | static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | 3624 | static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) |
3632 | { | 3625 | { |
3633 | int node; | 3626 | int node; |
3634 | struct kmem_cache_node *n; | 3627 | struct kmem_cache_node *n; |
3635 | struct array_cache *new_shared; | 3628 | struct array_cache *new_shared; |
3636 | struct alien_cache **new_alien = NULL; | 3629 | struct alien_cache **new_alien = NULL; |
3637 | 3630 | ||
3638 | for_each_online_node(node) { | 3631 | for_each_online_node(node) { |
3639 | 3632 | ||
3640 | if (use_alien_caches) { | 3633 | if (use_alien_caches) { |
3641 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); | 3634 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
3642 | if (!new_alien) | 3635 | if (!new_alien) |
3643 | goto fail; | 3636 | goto fail; |
3644 | } | 3637 | } |
3645 | 3638 | ||
3646 | new_shared = NULL; | 3639 | new_shared = NULL; |
3647 | if (cachep->shared) { | 3640 | if (cachep->shared) { |
3648 | new_shared = alloc_arraycache(node, | 3641 | new_shared = alloc_arraycache(node, |
3649 | cachep->shared*cachep->batchcount, | 3642 | cachep->shared*cachep->batchcount, |
3650 | 0xbaadf00d, gfp); | 3643 | 0xbaadf00d, gfp); |
3651 | if (!new_shared) { | 3644 | if (!new_shared) { |
3652 | free_alien_cache(new_alien); | 3645 | free_alien_cache(new_alien); |
3653 | goto fail; | 3646 | goto fail; |
3654 | } | 3647 | } |
3655 | } | 3648 | } |
3656 | 3649 | ||
3657 | n = get_node(cachep, node); | 3650 | n = get_node(cachep, node); |
3658 | if (n) { | 3651 | if (n) { |
3659 | struct array_cache *shared = n->shared; | 3652 | struct array_cache *shared = n->shared; |
3660 | LIST_HEAD(list); | 3653 | LIST_HEAD(list); |
3661 | 3654 | ||
3662 | spin_lock_irq(&n->list_lock); | 3655 | spin_lock_irq(&n->list_lock); |
3663 | 3656 | ||
3664 | if (shared) | 3657 | if (shared) |
3665 | free_block(cachep, shared->entry, | 3658 | free_block(cachep, shared->entry, |
3666 | shared->avail, node, &list); | 3659 | shared->avail, node, &list); |
3667 | 3660 | ||
3668 | n->shared = new_shared; | 3661 | n->shared = new_shared; |
3669 | if (!n->alien) { | 3662 | if (!n->alien) { |
3670 | n->alien = new_alien; | 3663 | n->alien = new_alien; |
3671 | new_alien = NULL; | 3664 | new_alien = NULL; |
3672 | } | 3665 | } |
3673 | n->free_limit = (1 + nr_cpus_node(node)) * | 3666 | n->free_limit = (1 + nr_cpus_node(node)) * |
3674 | cachep->batchcount + cachep->num; | 3667 | cachep->batchcount + cachep->num; |
3675 | spin_unlock_irq(&n->list_lock); | 3668 | spin_unlock_irq(&n->list_lock); |
3676 | slabs_destroy(cachep, &list); | 3669 | slabs_destroy(cachep, &list); |
3677 | kfree(shared); | 3670 | kfree(shared); |
3678 | free_alien_cache(new_alien); | 3671 | free_alien_cache(new_alien); |
3679 | continue; | 3672 | continue; |
3680 | } | 3673 | } |
3681 | n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); | 3674 | n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); |
3682 | if (!n) { | 3675 | if (!n) { |
3683 | free_alien_cache(new_alien); | 3676 | free_alien_cache(new_alien); |
3684 | kfree(new_shared); | 3677 | kfree(new_shared); |
3685 | goto fail; | 3678 | goto fail; |
3686 | } | 3679 | } |
3687 | 3680 | ||
3688 | kmem_cache_node_init(n); | 3681 | kmem_cache_node_init(n); |
3689 | n->next_reap = jiffies + REAPTIMEOUT_NODE + | 3682 | n->next_reap = jiffies + REAPTIMEOUT_NODE + |
3690 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 3683 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
3691 | n->shared = new_shared; | 3684 | n->shared = new_shared; |
3692 | n->alien = new_alien; | 3685 | n->alien = new_alien; |
3693 | n->free_limit = (1 + nr_cpus_node(node)) * | 3686 | n->free_limit = (1 + nr_cpus_node(node)) * |
3694 | cachep->batchcount + cachep->num; | 3687 | cachep->batchcount + cachep->num; |
3695 | cachep->node[node] = n; | 3688 | cachep->node[node] = n; |
3696 | } | 3689 | } |
3697 | return 0; | 3690 | return 0; |
3698 | 3691 | ||
3699 | fail: | 3692 | fail: |
3700 | if (!cachep->list.next) { | 3693 | if (!cachep->list.next) { |
3701 | /* Cache is not active yet. Roll back what we did */ | 3694 | /* Cache is not active yet. Roll back what we did */ |
3702 | node--; | 3695 | node--; |
3703 | while (node >= 0) { | 3696 | while (node >= 0) { |
3704 | n = get_node(cachep, node); | 3697 | n = get_node(cachep, node); |
3705 | if (n) { | 3698 | if (n) { |
3706 | kfree(n->shared); | 3699 | kfree(n->shared); |
3707 | free_alien_cache(n->alien); | 3700 | free_alien_cache(n->alien); |
3708 | kfree(n); | 3701 | kfree(n); |
3709 | cachep->node[node] = NULL; | 3702 | cachep->node[node] = NULL; |
3710 | } | 3703 | } |
3711 | node--; | 3704 | node--; |
3712 | } | 3705 | } |
3713 | } | 3706 | } |
3714 | return -ENOMEM; | 3707 | return -ENOMEM; |
3715 | } | 3708 | } |
3716 | 3709 | ||
3717 | struct ccupdate_struct { | 3710 | struct ccupdate_struct { |
3718 | struct kmem_cache *cachep; | 3711 | struct kmem_cache *cachep; |
3719 | struct array_cache *new[0]; | 3712 | struct array_cache *new[0]; |
3720 | }; | 3713 | }; |
3721 | 3714 | ||
3722 | static void do_ccupdate_local(void *info) | 3715 | static void do_ccupdate_local(void *info) |
3723 | { | 3716 | { |
3724 | struct ccupdate_struct *new = info; | 3717 | struct ccupdate_struct *new = info; |
3725 | struct array_cache *old; | 3718 | struct array_cache *old; |
3726 | 3719 | ||
3727 | check_irq_off(); | 3720 | check_irq_off(); |
3728 | old = cpu_cache_get(new->cachep); | 3721 | old = cpu_cache_get(new->cachep); |
3729 | 3722 | ||
3730 | new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; | 3723 | new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; |
3731 | new->new[smp_processor_id()] = old; | 3724 | new->new[smp_processor_id()] = old; |
3732 | } | 3725 | } |
3733 | 3726 | ||
3734 | /* Always called with the slab_mutex held */ | 3727 | /* Always called with the slab_mutex held */ |
3735 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3728 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3736 | int batchcount, int shared, gfp_t gfp) | 3729 | int batchcount, int shared, gfp_t gfp) |
3737 | { | 3730 | { |
3738 | struct ccupdate_struct *new; | 3731 | struct ccupdate_struct *new; |
3739 | int i; | 3732 | int i; |
3740 | 3733 | ||
3741 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), | 3734 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), |
3742 | gfp); | 3735 | gfp); |
3743 | if (!new) | 3736 | if (!new) |
3744 | return -ENOMEM; | 3737 | return -ENOMEM; |
3745 | 3738 | ||
3746 | for_each_online_cpu(i) { | 3739 | for_each_online_cpu(i) { |
3747 | new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, | 3740 | new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, |
3748 | batchcount, gfp); | 3741 | batchcount, gfp); |
3749 | if (!new->new[i]) { | 3742 | if (!new->new[i]) { |
3750 | for (i--; i >= 0; i--) | 3743 | for (i--; i >= 0; i--) |
3751 | kfree(new->new[i]); | 3744 | kfree(new->new[i]); |
3752 | kfree(new); | 3745 | kfree(new); |
3753 | return -ENOMEM; | 3746 | return -ENOMEM; |
3754 | } | 3747 | } |
3755 | } | 3748 | } |
3756 | new->cachep = cachep; | 3749 | new->cachep = cachep; |
3757 | 3750 | ||
3758 | on_each_cpu(do_ccupdate_local, (void *)new, 1); | 3751 | on_each_cpu(do_ccupdate_local, (void *)new, 1); |
3759 | 3752 | ||
3760 | check_irq_on(); | 3753 | check_irq_on(); |
3761 | cachep->batchcount = batchcount; | 3754 | cachep->batchcount = batchcount; |
3762 | cachep->limit = limit; | 3755 | cachep->limit = limit; |
3763 | cachep->shared = shared; | 3756 | cachep->shared = shared; |
3764 | 3757 | ||
3765 | for_each_online_cpu(i) { | 3758 | for_each_online_cpu(i) { |
3766 | LIST_HEAD(list); | 3759 | LIST_HEAD(list); |
3767 | struct array_cache *ccold = new->new[i]; | 3760 | struct array_cache *ccold = new->new[i]; |
3768 | int node; | 3761 | int node; |
3769 | struct kmem_cache_node *n; | 3762 | struct kmem_cache_node *n; |
3770 | 3763 | ||
3771 | if (!ccold) | 3764 | if (!ccold) |
3772 | continue; | 3765 | continue; |
3773 | 3766 | ||
3774 | node = cpu_to_mem(i); | 3767 | node = cpu_to_mem(i); |
3775 | n = get_node(cachep, node); | 3768 | n = get_node(cachep, node); |
3776 | spin_lock_irq(&n->list_lock); | 3769 | spin_lock_irq(&n->list_lock); |
3777 | free_block(cachep, ccold->entry, ccold->avail, node, &list); | 3770 | free_block(cachep, ccold->entry, ccold->avail, node, &list); |
3778 | spin_unlock_irq(&n->list_lock); | 3771 | spin_unlock_irq(&n->list_lock); |
3779 | slabs_destroy(cachep, &list); | 3772 | slabs_destroy(cachep, &list); |
3780 | kfree(ccold); | 3773 | kfree(ccold); |
3781 | } | 3774 | } |
3782 | kfree(new); | 3775 | kfree(new); |
3783 | return alloc_kmem_cache_node(cachep, gfp); | 3776 | return alloc_kmem_cache_node(cachep, gfp); |
3784 | } | 3777 | } |
3785 | 3778 | ||
3786 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3779 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3787 | int batchcount, int shared, gfp_t gfp) | 3780 | int batchcount, int shared, gfp_t gfp) |
3788 | { | 3781 | { |
3789 | int ret; | 3782 | int ret; |
3790 | struct kmem_cache *c = NULL; | 3783 | struct kmem_cache *c = NULL; |
3791 | int i = 0; | 3784 | int i = 0; |
3792 | 3785 | ||
3793 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | 3786 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); |
3794 | 3787 | ||
3795 | if (slab_state < FULL) | 3788 | if (slab_state < FULL) |
3796 | return ret; | 3789 | return ret; |
3797 | 3790 | ||
3798 | if ((ret < 0) || !is_root_cache(cachep)) | 3791 | if ((ret < 0) || !is_root_cache(cachep)) |
3799 | return ret; | 3792 | return ret; |
3800 | 3793 | ||
3801 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3794 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); |
3802 | for_each_memcg_cache_index(i) { | 3795 | for_each_memcg_cache_index(i) { |
3803 | c = cache_from_memcg_idx(cachep, i); | 3796 | c = cache_from_memcg_idx(cachep, i); |
3804 | if (c) | 3797 | if (c) |
3805 | /* return value determined by the parent cache only */ | 3798 | /* return value determined by the parent cache only */ |
3806 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | 3799 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
3807 | } | 3800 | } |
3808 | 3801 | ||
3809 | return ret; | 3802 | return ret; |
3810 | } | 3803 | } |
3811 | 3804 | ||
3812 | /* Always called with the slab_mutex held */ | 3805 | /* Always called with the slab_mutex held */ |
3813 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 3806 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
3814 | { | 3807 | { |
3815 | int err; | 3808 | int err; |
3816 | int limit = 0; | 3809 | int limit = 0; |
3817 | int shared = 0; | 3810 | int shared = 0; |
3818 | int batchcount = 0; | 3811 | int batchcount = 0; |
3819 | 3812 | ||
3820 | if (!is_root_cache(cachep)) { | 3813 | if (!is_root_cache(cachep)) { |
3821 | struct kmem_cache *root = memcg_root_cache(cachep); | 3814 | struct kmem_cache *root = memcg_root_cache(cachep); |
3822 | limit = root->limit; | 3815 | limit = root->limit; |
3823 | shared = root->shared; | 3816 | shared = root->shared; |
3824 | batchcount = root->batchcount; | 3817 | batchcount = root->batchcount; |
3825 | } | 3818 | } |
3826 | 3819 | ||
3827 | if (limit && shared && batchcount) | 3820 | if (limit && shared && batchcount) |
3828 | goto skip_setup; | 3821 | goto skip_setup; |
3829 | /* | 3822 | /* |
3830 | * The head array serves three purposes: | 3823 | * The head array serves three purposes: |
3831 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 3824 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
3832 | * - reduce the number of spinlock operations. | 3825 | * - reduce the number of spinlock operations. |
3833 | * - reduce the number of linked list operations on the slab and | 3826 | * - reduce the number of linked list operations on the slab and |
3834 | * bufctl chains: array operations are cheaper. | 3827 | * bufctl chains: array operations are cheaper. |
3835 | * The numbers are guessed; we should auto-tune as described by | 3828 | * The numbers are guessed; we should auto-tune as described by |
3836 | * Bonwick. | 3829 | * Bonwick. |
3837 | */ | 3830 | */ |
3838 | if (cachep->size > 131072) | 3831 | if (cachep->size > 131072) |
3839 | limit = 1; | 3832 | limit = 1; |
3840 | else if (cachep->size > PAGE_SIZE) | 3833 | else if (cachep->size > PAGE_SIZE) |
3841 | limit = 8; | 3834 | limit = 8; |
3842 | else if (cachep->size > 1024) | 3835 | else if (cachep->size > 1024) |
3843 | limit = 24; | 3836 | limit = 24; |
3844 | else if (cachep->size > 256) | 3837 | else if (cachep->size > 256) |
3845 | limit = 54; | 3838 | limit = 54; |
3846 | else | 3839 | else |
3847 | limit = 120; | 3840 | limit = 120; |
3848 | 3841 | ||
3849 | /* | 3842 | /* |
3850 | * CPU bound tasks (e.g. network routing) can exhibit cpu bound | 3843 | * CPU bound tasks (e.g. network routing) can exhibit cpu bound |
3851 | * allocation behaviour: Most allocs on one cpu, most free operations | 3844 | * allocation behaviour: Most allocs on one cpu, most free operations |
3852 | * on another cpu. For these cases, an efficient object passing between | 3845 | * on another cpu. For these cases, an efficient object passing between |
3853 | * cpus is necessary. This is provided by a shared array. The array | 3846 | * cpus is necessary. This is provided by a shared array. The array |
3854 | * replaces Bonwick's magazine layer. | 3847 | * replaces Bonwick's magazine layer. |
3855 | * On uniprocessor, it's functionally equivalent (but less efficient) | 3848 | * On uniprocessor, it's functionally equivalent (but less efficient) |
3856 | * to a larger limit. Thus disabled by default. | 3849 | * to a larger limit. Thus disabled by default. |
3857 | */ | 3850 | */ |
3858 | shared = 0; | 3851 | shared = 0; |
3859 | if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) | 3852 | if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) |
3860 | shared = 8; | 3853 | shared = 8; |
3861 | 3854 | ||
3862 | #if DEBUG | 3855 | #if DEBUG |
3863 | /* | 3856 | /* |
3864 | * With debugging enabled, a large batchcount leads to excessively long | 3857 | * With debugging enabled, a large batchcount leads to excessively long |
3865 | * periods with disabled local interrupts. Limit the batchcount. | 3858 | * periods with disabled local interrupts. Limit the batchcount. |
3866 | */ | 3859 | */ |
3867 | if (limit > 32) | 3860 | if (limit > 32) |
3868 | limit = 32; | 3861 | limit = 32; |
3869 | #endif | 3862 | #endif |
3870 | batchcount = (limit + 1) / 2; | 3863 | batchcount = (limit + 1) / 2; |
3871 | skip_setup: | 3864 | skip_setup: |
3872 | err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | 3865 | err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); |
3873 | if (err) | 3866 | if (err) |
3874 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3867 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
3875 | cachep->name, -err); | 3868 | cachep->name, -err); |
3876 | return err; | 3869 | return err; |
3877 | } | 3870 | } |
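Worked example of the defaults chosen above: a cache with 512-byte objects falls into the "> 256" bucket, so limit = 54, batchcount = (54 + 1) / 2 = 27, and on SMP (object size <= PAGE_SIZE) shared = 8. A 192-byte cache would get limit = 120 and batchcount = 60; with DEBUG enabled the limit is capped at 32, giving batchcount = 16.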
3878 | 3871 | ||
3879 | /* | 3872 | /* |
3880 | * Drain an array if it contains any elements, taking the node lock only if | 3873 | * Drain an array if it contains any elements, taking the node lock only if |
3881 | * necessary. Note that the node listlock also protects the array_cache | 3874 | * necessary. Note that the node listlock also protects the array_cache |
3882 | * if drain_array() is used on the shared array. | 3875 | * if drain_array() is used on the shared array. |
3883 | */ | 3876 | */ |
3884 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, | 3877 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, |
3885 | struct array_cache *ac, int force, int node) | 3878 | struct array_cache *ac, int force, int node) |
3886 | { | 3879 | { |
3887 | LIST_HEAD(list); | 3880 | LIST_HEAD(list); |
3888 | int tofree; | 3881 | int tofree; |
3889 | 3882 | ||
3890 | if (!ac || !ac->avail) | 3883 | if (!ac || !ac->avail) |
3891 | return; | 3884 | return; |
3892 | if (ac->touched && !force) { | 3885 | if (ac->touched && !force) { |
3893 | ac->touched = 0; | 3886 | ac->touched = 0; |
3894 | } else { | 3887 | } else { |
3895 | spin_lock_irq(&n->list_lock); | 3888 | spin_lock_irq(&n->list_lock); |
3896 | if (ac->avail) { | 3889 | if (ac->avail) { |
3897 | tofree = force ? ac->avail : (ac->limit + 4) / 5; | 3890 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
3898 | if (tofree > ac->avail) | 3891 | if (tofree > ac->avail) |
3899 | tofree = (ac->avail + 1) / 2; | 3892 | tofree = (ac->avail + 1) / 2; |
3900 | free_block(cachep, ac->entry, tofree, node, &list); | 3893 | free_block(cachep, ac->entry, tofree, node, &list); |
3901 | ac->avail -= tofree; | 3894 | ac->avail -= tofree; |
3902 | memmove(ac->entry, &(ac->entry[tofree]), | 3895 | memmove(ac->entry, &(ac->entry[tofree]), |
3903 | sizeof(void *) * ac->avail); | 3896 | sizeof(void *) * ac->avail); |
3904 | } | 3897 | } |
3905 | spin_unlock_irq(&n->list_lock); | 3898 | spin_unlock_irq(&n->list_lock); |
3906 | slabs_destroy(cachep, &list); | 3899 | slabs_destroy(cachep, &list); |
3907 | } | 3900 | } |
3908 | } | 3901 | } |
3909 | 3902 | ||
3910 | /** | 3903 | /** |
3911 | * cache_reap - Reclaim memory from caches. | 3904 | * cache_reap - Reclaim memory from caches. |
3912 | * @w: work descriptor | 3905 | * @w: work descriptor |
3913 | * | 3906 | * |
3914 | * Called from workqueue/eventd every few seconds. | 3907 | * Called from workqueue/eventd every few seconds. |
3915 | * Purpose: | 3908 | * Purpose: |
3916 | * - clear the per-cpu caches for this CPU. | 3909 | * - clear the per-cpu caches for this CPU. |
3917 | * - return freeable pages to the main free memory pool. | 3910 | * - return freeable pages to the main free memory pool. |
3918 | * | 3911 | * |
3919 | * If we cannot acquire the cache chain mutex then just give up - we'll try | 3912 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
3920 | * again on the next iteration. | 3913 | * again on the next iteration. |
3921 | */ | 3914 | */ |
3922 | static void cache_reap(struct work_struct *w) | 3915 | static void cache_reap(struct work_struct *w) |
3923 | { | 3916 | { |
3924 | struct kmem_cache *searchp; | 3917 | struct kmem_cache *searchp; |
3925 | struct kmem_cache_node *n; | 3918 | struct kmem_cache_node *n; |
3926 | int node = numa_mem_id(); | 3919 | int node = numa_mem_id(); |
3927 | struct delayed_work *work = to_delayed_work(w); | 3920 | struct delayed_work *work = to_delayed_work(w); |
3928 | 3921 | ||
3929 | if (!mutex_trylock(&slab_mutex)) | 3922 | if (!mutex_trylock(&slab_mutex)) |
3930 | /* Give up. Set up the next iteration. */ | 3923 | /* Give up. Set up the next iteration. */ |
3931 | goto out; | 3924 | goto out; |
3932 | 3925 | ||
3933 | list_for_each_entry(searchp, &slab_caches, list) { | 3926 | list_for_each_entry(searchp, &slab_caches, list) { |
3934 | check_irq_on(); | 3927 | check_irq_on(); |
3935 | 3928 | ||
3936 | /* | 3929 | /* |
3937 | * We only take the node lock if absolutely necessary and we | 3930 | * We only take the node lock if absolutely necessary and we |
3938 | * have established with reasonable certainty that | 3931 | * have established with reasonable certainty that |
3939 | * we can do some work if the lock was obtained. | 3932 | * we can do some work if the lock was obtained. |
3940 | */ | 3933 | */ |
3941 | n = get_node(searchp, node); | 3934 | n = get_node(searchp, node); |
3942 | 3935 | ||
3943 | reap_alien(searchp, n); | 3936 | reap_alien(searchp, n); |
3944 | 3937 | ||
3945 | drain_array(searchp, n, cpu_cache_get(searchp), 0, node); | 3938 | drain_array(searchp, n, cpu_cache_get(searchp), 0, node); |
3946 | 3939 | ||
3947 | /* | 3940 | /* |
3948 | * These are racy checks but it does not matter | 3941 | * These are racy checks but it does not matter |
3949 | * if we skip one check or scan twice. | 3942 | * if we skip one check or scan twice. |
3950 | */ | 3943 | */ |
3951 | if (time_after(n->next_reap, jiffies)) | 3944 | if (time_after(n->next_reap, jiffies)) |
3952 | goto next; | 3945 | goto next; |
3953 | 3946 | ||
3954 | n->next_reap = jiffies + REAPTIMEOUT_NODE; | 3947 | n->next_reap = jiffies + REAPTIMEOUT_NODE; |
3955 | 3948 | ||
3956 | drain_array(searchp, n, n->shared, 0, node); | 3949 | drain_array(searchp, n, n->shared, 0, node); |
3957 | 3950 | ||
3958 | if (n->free_touched) | 3951 | if (n->free_touched) |
3959 | n->free_touched = 0; | 3952 | n->free_touched = 0; |
3960 | else { | 3953 | else { |
3961 | int freed; | 3954 | int freed; |
3962 | 3955 | ||
3963 | freed = drain_freelist(searchp, n, (n->free_limit + | 3956 | freed = drain_freelist(searchp, n, (n->free_limit + |
3964 | 5 * searchp->num - 1) / (5 * searchp->num)); | 3957 | 5 * searchp->num - 1) / (5 * searchp->num)); |
3965 | STATS_ADD_REAPED(searchp, freed); | 3958 | STATS_ADD_REAPED(searchp, freed); |
3966 | } | 3959 | } |
3967 | next: | 3960 | next: |
3968 | cond_resched(); | 3961 | cond_resched(); |
3969 | } | 3962 | } |
3970 | check_irq_on(); | 3963 | check_irq_on(); |
3971 | mutex_unlock(&slab_mutex); | 3964 | mutex_unlock(&slab_mutex); |
3972 | next_reap_node(); | 3965 | next_reap_node(); |
3973 | out: | 3966 | out: |
3974 | /* Set up the next iteration */ | 3967 | /* Set up the next iteration */ |
3975 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); | 3968 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); |
3976 | } | 3969 | } |
3977 | 3970 | ||
3978 | #ifdef CONFIG_SLABINFO | 3971 | #ifdef CONFIG_SLABINFO |
3979 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) | 3972 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) |
3980 | { | 3973 | { |
3981 | struct page *page; | 3974 | struct page *page; |
3982 | unsigned long active_objs; | 3975 | unsigned long active_objs; |
3983 | unsigned long num_objs; | 3976 | unsigned long num_objs; |
3984 | unsigned long active_slabs = 0; | 3977 | unsigned long active_slabs = 0; |
3985 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; | 3978 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; |
3986 | const char *name; | 3979 | const char *name; |
3987 | char *error = NULL; | 3980 | char *error = NULL; |
3988 | int node; | 3981 | int node; |
3989 | struct kmem_cache_node *n; | 3982 | struct kmem_cache_node *n; |
3990 | 3983 | ||
3991 | active_objs = 0; | 3984 | active_objs = 0; |
3992 | num_slabs = 0; | 3985 | num_slabs = 0; |
3993 | for_each_kmem_cache_node(cachep, node, n) { | 3986 | for_each_kmem_cache_node(cachep, node, n) { |
3994 | 3987 | ||
3995 | check_irq_on(); | 3988 | check_irq_on(); |
3996 | spin_lock_irq(&n->list_lock); | 3989 | spin_lock_irq(&n->list_lock); |
3997 | 3990 | ||
3998 | list_for_each_entry(page, &n->slabs_full, lru) { | 3991 | list_for_each_entry(page, &n->slabs_full, lru) { |
3999 | if (page->active != cachep->num && !error) | 3992 | if (page->active != cachep->num && !error) |
4000 | error = "slabs_full accounting error"; | 3993 | error = "slabs_full accounting error"; |
4001 | active_objs += cachep->num; | 3994 | active_objs += cachep->num; |
4002 | active_slabs++; | 3995 | active_slabs++; |
4003 | } | 3996 | } |
4004 | list_for_each_entry(page, &n->slabs_partial, lru) { | 3997 | list_for_each_entry(page, &n->slabs_partial, lru) { |
4005 | if (page->active == cachep->num && !error) | 3998 | if (page->active == cachep->num && !error) |
4006 | error = "slabs_partial accounting error"; | 3999 | error = "slabs_partial accounting error"; |
4007 | if (!page->active && !error) | 4000 | if (!page->active && !error) |
4008 | error = "slabs_partial accounting error"; | 4001 | error = "slabs_partial accounting error"; |
4009 | active_objs += page->active; | 4002 | active_objs += page->active; |
4010 | active_slabs++; | 4003 | active_slabs++; |
4011 | } | 4004 | } |
4012 | list_for_each_entry(page, &n->slabs_free, lru) { | 4005 | list_for_each_entry(page, &n->slabs_free, lru) { |
4013 | if (page->active && !error) | 4006 | if (page->active && !error) |
4014 | error = "slabs_free accounting error"; | 4007 | error = "slabs_free accounting error"; |
4015 | num_slabs++; | 4008 | num_slabs++; |
4016 | } | 4009 | } |
4017 | free_objects += n->free_objects; | 4010 | free_objects += n->free_objects; |
4018 | if (n->shared) | 4011 | if (n->shared) |
4019 | shared_avail += n->shared->avail; | 4012 | shared_avail += n->shared->avail; |
4020 | 4013 | ||
4021 | spin_unlock_irq(&n->list_lock); | 4014 | spin_unlock_irq(&n->list_lock); |
4022 | } | 4015 | } |
4023 | num_slabs += active_slabs; | 4016 | num_slabs += active_slabs; |
4024 | num_objs = num_slabs * cachep->num; | 4017 | num_objs = num_slabs * cachep->num; |
4025 | if (num_objs - active_objs != free_objects && !error) | 4018 | if (num_objs - active_objs != free_objects && !error) |
4026 | error = "free_objects accounting error"; | 4019 | error = "free_objects accounting error"; |
4027 | 4020 | ||
4028 | name = cachep->name; | 4021 | name = cachep->name; |
4029 | if (error) | 4022 | if (error) |
4030 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4023 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4031 | 4024 | ||
4032 | sinfo->active_objs = active_objs; | 4025 | sinfo->active_objs = active_objs; |
4033 | sinfo->num_objs = num_objs; | 4026 | sinfo->num_objs = num_objs; |
4034 | sinfo->active_slabs = active_slabs; | 4027 | sinfo->active_slabs = active_slabs; |
4035 | sinfo->num_slabs = num_slabs; | 4028 | sinfo->num_slabs = num_slabs; |
4036 | sinfo->shared_avail = shared_avail; | 4029 | sinfo->shared_avail = shared_avail; |
4037 | sinfo->limit = cachep->limit; | 4030 | sinfo->limit = cachep->limit; |
4038 | sinfo->batchcount = cachep->batchcount; | 4031 | sinfo->batchcount = cachep->batchcount; |
4039 | sinfo->shared = cachep->shared; | 4032 | sinfo->shared = cachep->shared; |
4040 | sinfo->objects_per_slab = cachep->num; | 4033 | sinfo->objects_per_slab = cachep->num; |
4041 | sinfo->cache_order = cachep->gfporder; | 4034 | sinfo->cache_order = cachep->gfporder; |
4042 | } | 4035 | } |
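For orientation: these fields populate the usual /proc/slabinfo line, roughly "<name> <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>"; the extra globalstat/cpustat columns printed by slabinfo_show_stats() below appear only when STATS is compiled in.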
4043 | 4036 | ||
4044 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) | 4037 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) |
4045 | { | 4038 | { |
4046 | #if STATS | 4039 | #if STATS |
4047 | { /* node stats */ | 4040 | { /* node stats */ |
4048 | unsigned long high = cachep->high_mark; | 4041 | unsigned long high = cachep->high_mark; |
4049 | unsigned long allocs = cachep->num_allocations; | 4042 | unsigned long allocs = cachep->num_allocations; |
4050 | unsigned long grown = cachep->grown; | 4043 | unsigned long grown = cachep->grown; |
4051 | unsigned long reaped = cachep->reaped; | 4044 | unsigned long reaped = cachep->reaped; |
4052 | unsigned long errors = cachep->errors; | 4045 | unsigned long errors = cachep->errors; |
4053 | unsigned long max_freeable = cachep->max_freeable; | 4046 | unsigned long max_freeable = cachep->max_freeable; |
4054 | unsigned long node_allocs = cachep->node_allocs; | 4047 | unsigned long node_allocs = cachep->node_allocs; |
4055 | unsigned long node_frees = cachep->node_frees; | 4048 | unsigned long node_frees = cachep->node_frees; |
4056 | unsigned long overflows = cachep->node_overflow; | 4049 | unsigned long overflows = cachep->node_overflow; |
4057 | 4050 | ||
4058 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " | 4051 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " |
4059 | "%4lu %4lu %4lu %4lu %4lu", | 4052 | "%4lu %4lu %4lu %4lu %4lu", |
4060 | allocs, high, grown, | 4053 | allocs, high, grown, |
4061 | reaped, errors, max_freeable, node_allocs, | 4054 | reaped, errors, max_freeable, node_allocs, |
4062 | node_frees, overflows); | 4055 | node_frees, overflows); |
4063 | } | 4056 | } |
4064 | /* cpu stats */ | 4057 | /* cpu stats */ |
4065 | { | 4058 | { |
4066 | unsigned long allochit = atomic_read(&cachep->allochit); | 4059 | unsigned long allochit = atomic_read(&cachep->allochit); |
4067 | unsigned long allocmiss = atomic_read(&cachep->allocmiss); | 4060 | unsigned long allocmiss = atomic_read(&cachep->allocmiss); |
4068 | unsigned long freehit = atomic_read(&cachep->freehit); | 4061 | unsigned long freehit = atomic_read(&cachep->freehit); |
4069 | unsigned long freemiss = atomic_read(&cachep->freemiss); | 4062 | unsigned long freemiss = atomic_read(&cachep->freemiss); |
4070 | 4063 | ||
4071 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", | 4064 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", |
4072 | allochit, allocmiss, freehit, freemiss); | 4065 | allochit, allocmiss, freehit, freemiss); |
4073 | } | 4066 | } |
4074 | #endif | 4067 | #endif |
4075 | } | 4068 | } |
4076 | 4069 | ||
4077 | #define MAX_SLABINFO_WRITE 128 | 4070 | #define MAX_SLABINFO_WRITE 128 |
4078 | /** | 4071 | /** |
4079 | * slabinfo_write - Tuning for the slab allocator | 4072 | * slabinfo_write - Tuning for the slab allocator |
4080 | * @file: unused | 4073 | * @file: unused |
4081 | * @buffer: user buffer | 4074 | * @buffer: user buffer |
4082 | * @count: data length | 4075 | * @count: data length |
4083 | * @ppos: unused | 4076 | * @ppos: unused |
4084 | */ | 4077 | */ |
4085 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 4078 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
4086 | size_t count, loff_t *ppos) | 4079 | size_t count, loff_t *ppos) |
4087 | { | 4080 | { |
4088 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 4081 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
4089 | int limit, batchcount, shared, res; | 4082 | int limit, batchcount, shared, res; |
4090 | struct kmem_cache *cachep; | 4083 | struct kmem_cache *cachep; |
4091 | 4084 | ||
4092 | if (count > MAX_SLABINFO_WRITE) | 4085 | if (count > MAX_SLABINFO_WRITE) |
4093 | return -EINVAL; | 4086 | return -EINVAL; |
4094 | if (copy_from_user(&kbuf, buffer, count)) | 4087 | if (copy_from_user(&kbuf, buffer, count)) |
4095 | return -EFAULT; | 4088 | return -EFAULT; |
4096 | kbuf[MAX_SLABINFO_WRITE] = '\0'; | 4089 | kbuf[MAX_SLABINFO_WRITE] = '\0'; |
4097 | 4090 | ||
4098 | tmp = strchr(kbuf, ' '); | 4091 | tmp = strchr(kbuf, ' '); |
4099 | if (!tmp) | 4092 | if (!tmp) |
4100 | return -EINVAL; | 4093 | return -EINVAL; |
4101 | *tmp = '\0'; | 4094 | *tmp = '\0'; |
4102 | tmp++; | 4095 | tmp++; |
4103 | if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) | 4096 | if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) |
4104 | return -EINVAL; | 4097 | return -EINVAL; |
4105 | 4098 | ||
4106 | /* Find the cache in the chain of caches. */ | 4099 | /* Find the cache in the chain of caches. */ |
4107 | mutex_lock(&slab_mutex); | 4100 | mutex_lock(&slab_mutex); |
4108 | res = -EINVAL; | 4101 | res = -EINVAL; |
4109 | list_for_each_entry(cachep, &slab_caches, list) { | 4102 | list_for_each_entry(cachep, &slab_caches, list) { |
4110 | if (!strcmp(cachep->name, kbuf)) { | 4103 | if (!strcmp(cachep->name, kbuf)) { |
4111 | if (limit < 1 || batchcount < 1 || | 4104 | if (limit < 1 || batchcount < 1 || |
4112 | batchcount > limit || shared < 0) { | 4105 | batchcount > limit || shared < 0) { |
4113 | res = 0; | 4106 | res = 0; |
4114 | } else { | 4107 | } else { |
4115 | res = do_tune_cpucache(cachep, limit, | 4108 | res = do_tune_cpucache(cachep, limit, |
4116 | batchcount, shared, | 4109 | batchcount, shared, |
4117 | GFP_KERNEL); | 4110 | GFP_KERNEL); |
4118 | } | 4111 | } |
4119 | break; | 4112 | break; |
4120 | } | 4113 | } |
4121 | } | 4114 | } |
4122 | mutex_unlock(&slab_mutex); | 4115 | mutex_unlock(&slab_mutex); |
4123 | if (res >= 0) | 4116 | if (res >= 0) |
4124 | res = count; | 4117 | res = count; |
4125 | return res; | 4118 | return res; |
4126 | } | 4119 | } |
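Worked example of the accepted input: writing a line such as "dentry 120 60 8" to /proc/slabinfo (the cache name is only illustrative) selects the cache named before the first space and calls do_tune_cpucache(cachep, 120, 60, 8, GFP_KERNEL). Values that fail the sanity check (limit < 1, batchcount < 1, batchcount > limit, or shared < 0) leave the cache untouched, yet the write still returns count.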
4127 | 4120 | ||
4128 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4121 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4129 | 4122 | ||
4130 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4123 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4131 | { | 4124 | { |
4132 | mutex_lock(&slab_mutex); | 4125 | mutex_lock(&slab_mutex); |
4133 | return seq_list_start(&slab_caches, *pos); | 4126 | return seq_list_start(&slab_caches, *pos); |
4134 | } | 4127 | } |
4135 | 4128 | ||
4136 | static inline int add_caller(unsigned long *n, unsigned long v) | 4129 | static inline int add_caller(unsigned long *n, unsigned long v) |
4137 | { | 4130 | { |
4138 | unsigned long *p; | 4131 | unsigned long *p; |
4139 | int l; | 4132 | int l; |
4140 | if (!v) | 4133 | if (!v) |
4141 | return 1; | 4134 | return 1; |
4142 | l = n[1]; | 4135 | l = n[1]; |
4143 | p = n + 2; | 4136 | p = n + 2; |
4144 | while (l) { | 4137 | while (l) { |
4145 | int i = l/2; | 4138 | int i = l/2; |
4146 | unsigned long *q = p + 2 * i; | 4139 | unsigned long *q = p + 2 * i; |
4147 | if (*q == v) { | 4140 | if (*q == v) { |
4148 | q[1]++; | 4141 | q[1]++; |
4149 | return 1; | 4142 | return 1; |
4150 | } | 4143 | } |
4151 | if (*q > v) { | 4144 | if (*q > v) { |
4152 | l = i; | 4145 | l = i; |
4153 | } else { | 4146 | } else { |
4154 | p = q + 2; | 4147 | p = q + 2; |
4155 | l -= i + 1; | 4148 | l -= i + 1; |
4156 | } | 4149 | } |
4157 | } | 4150 | } |
4158 | if (++n[1] == n[0]) | 4151 | if (++n[1] == n[0]) |
4159 | return 0; | 4152 | return 0; |
4160 | memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); | 4153 | memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); |
4161 | p[0] = v; | 4154 | p[0] = v; |
4162 | p[1] = 1; | 4155 | p[1] = 1; |
4163 | return 1; | 4156 | return 1; |
4164 | } | 4157 | } |
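For reference, the scratch table passed in as n[] keeps its capacity in n[0] and the number of distinct callers recorded so far in n[1], with (caller address, hit count) pairs stored from n[2] onward, sorted by address: the loop above is a binary search, and the memmove() shifts the tail of the table to open a slot when a new caller has to be inserted.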
4165 | 4158 | ||
4166 | static void handle_slab(unsigned long *n, struct kmem_cache *c, | 4159 | static void handle_slab(unsigned long *n, struct kmem_cache *c, |
4167 | struct page *page) | 4160 | struct page *page) |
4168 | { | 4161 | { |
4169 | void *p; | 4162 | void *p; |
4170 | int i; | 4163 | int i; |
4171 | 4164 | ||
4172 | if (n[0] == n[1]) | 4165 | if (n[0] == n[1]) |
4173 | return; | 4166 | return; |
4174 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { | 4167 | for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { |
4175 | if (get_obj_status(page, i) != OBJECT_ACTIVE) | 4168 | if (get_obj_status(page, i) != OBJECT_ACTIVE) |
4176 | continue; | 4169 | continue; |
4177 | 4170 | ||
4178 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4171 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) |
4179 | return; | 4172 | return; |
4180 | } | 4173 | } |
4181 | } | 4174 | } |
4182 | 4175 | ||
4183 | static void show_symbol(struct seq_file *m, unsigned long address) | 4176 | static void show_symbol(struct seq_file *m, unsigned long address) |
4184 | { | 4177 | { |
4185 | #ifdef CONFIG_KALLSYMS | 4178 | #ifdef CONFIG_KALLSYMS |
4186 | unsigned long offset, size; | 4179 | unsigned long offset, size; |
4187 | char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; | 4180 | char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; |
4188 | 4181 | ||
4189 | if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { | 4182 | if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { |
4190 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); | 4183 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); |
4191 | if (modname[0]) | 4184 | if (modname[0]) |
4192 | seq_printf(m, " [%s]", modname); | 4185 | seq_printf(m, " [%s]", modname); |
4193 | return; | 4186 | return; |
4194 | } | 4187 | } |
4195 | #endif | 4188 | #endif |
4196 | seq_printf(m, "%p", (void *)address); | 4189 | seq_printf(m, "%p", (void *)address); |
4197 | } | 4190 | } |
4198 | 4191 | ||
4199 | static int leaks_show(struct seq_file *m, void *p) | 4192 | static int leaks_show(struct seq_file *m, void *p) |
4200 | { | 4193 | { |
4201 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); | 4194 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4202 | struct page *page; | 4195 | struct page *page; |
4203 | struct kmem_cache_node *n; | 4196 | struct kmem_cache_node *n; |
4204 | const char *name; | 4197 | const char *name; |
4205 | unsigned long *x = m->private; | 4198 | unsigned long *x = m->private; |
4206 | int node; | 4199 | int node; |
4207 | int i; | 4200 | int i; |
4208 | 4201 | ||
4209 | if (!(cachep->flags & SLAB_STORE_USER)) | 4202 | if (!(cachep->flags & SLAB_STORE_USER)) |
4210 | return 0; | 4203 | return 0; |
4211 | if (!(cachep->flags & SLAB_RED_ZONE)) | 4204 | if (!(cachep->flags & SLAB_RED_ZONE)) |
4212 | return 0; | 4205 | return 0; |
4213 | 4206 | ||
4214 | /* OK, we can do it */ | 4207 | /* OK, we can do it */ |
4215 | 4208 | ||
4216 | x[1] = 0; | 4209 | x[1] = 0; |
4217 | 4210 | ||
4218 | for_each_kmem_cache_node(cachep, node, n) { | 4211 | for_each_kmem_cache_node(cachep, node, n) { |
4219 | 4212 | ||
4220 | check_irq_on(); | 4213 | check_irq_on(); |
4221 | spin_lock_irq(&n->list_lock); | 4214 | spin_lock_irq(&n->list_lock); |
4222 | 4215 | ||
4223 | list_for_each_entry(page, &n->slabs_full, lru) | 4216 | list_for_each_entry(page, &n->slabs_full, lru) |
4224 | handle_slab(x, cachep, page); | 4217 | handle_slab(x, cachep, page); |
4225 | list_for_each_entry(page, &n->slabs_partial, lru) | 4218 | list_for_each_entry(page, &n->slabs_partial, lru) |
4226 | handle_slab(x, cachep, page); | 4219 | handle_slab(x, cachep, page); |
4227 | spin_unlock_irq(&n->list_lock); | 4220 | spin_unlock_irq(&n->list_lock); |
4228 | } | 4221 | } |
4229 | name = cachep->name; | 4222 | name = cachep->name; |
4230 | if (x[0] == x[1]) { | 4223 | if (x[0] == x[1]) { |
4231 | /* Increase the buffer size */ | 4224 | /* Increase the buffer size */ |
4232 | mutex_unlock(&slab_mutex); | 4225 | mutex_unlock(&slab_mutex); |
4233 | m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | 4226 | m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); |
4234 | if (!m->private) { | 4227 | if (!m->private) { |
4235 | /* Too bad, we are really out */ | 4228 | /* Too bad, we are really out */ |
4236 | m->private = x; | 4229 | m->private = x; |
4237 | mutex_lock(&slab_mutex); | 4230 | mutex_lock(&slab_mutex); |
4238 | return -ENOMEM; | 4231 | return -ENOMEM; |
4239 | } | 4232 | } |
4240 | *(unsigned long *)m->private = x[0] * 2; | 4233 | *(unsigned long *)m->private = x[0] * 2; |
4241 | kfree(x); | 4234 | kfree(x); |
4242 | mutex_lock(&slab_mutex); | 4235 | mutex_lock(&slab_mutex); |
4243 | /* Now make sure this entry will be retried */ | 4236 | /* Now make sure this entry will be retried */ |
4244 | m->count = m->size; | 4237 | m->count = m->size; |
4245 | return 0; | 4238 | return 0; |
4246 | } | 4239 | } |
4247 | for (i = 0; i < x[1]; i++) { | 4240 | for (i = 0; i < x[1]; i++) { |
4248 | seq_printf(m, "%s: %lu ", name, x[2*i+3]); | 4241 | seq_printf(m, "%s: %lu ", name, x[2*i+3]); |
4249 | show_symbol(m, x[2*i+2]); | 4242 | show_symbol(m, x[2*i+2]); |
4250 | seq_putc(m, '\n'); | 4243 | seq_putc(m, '\n'); |
4251 | } | 4244 | } |
4252 | 4245 | ||
4253 | return 0; | 4246 | return 0; |
4254 | } | 4247 | } |
4255 | 4248 | ||
4256 | static const struct seq_operations slabstats_op = { | 4249 | static const struct seq_operations slabstats_op = { |
4257 | .start = leaks_start, | 4250 | .start = leaks_start, |
4258 | .next = slab_next, | 4251 | .next = slab_next, |
4259 | .stop = slab_stop, | 4252 | .stop = slab_stop, |
4260 | .show = leaks_show, | 4253 | .show = leaks_show, |
4261 | }; | 4254 | }; |
4262 | 4255 | ||
4263 | static int slabstats_open(struct inode *inode, struct file *file) | 4256 | static int slabstats_open(struct inode *inode, struct file *file) |
4264 | { | 4257 | { |
4265 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); | 4258 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); |
4266 | int ret = -ENOMEM; | 4259 | int ret = -ENOMEM; |
4267 | if (n) { | 4260 | if (n) { |
4268 | ret = seq_open(file, &slabstats_op); | 4261 | ret = seq_open(file, &slabstats_op); |
4269 | if (!ret) { | 4262 | if (!ret) { |
4270 | struct seq_file *m = file->private_data; | 4263 | struct seq_file *m = file->private_data; |
4271 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); | 4264 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); |
4272 | m->private = n; | 4265 | m->private = n; |
4273 | n = NULL; | 4266 | n = NULL; |
4274 | } | 4267 | } |
4275 | kfree(n); | 4268 | kfree(n); |
4276 | } | 4269 | } |
4277 | return ret; | 4270 | return ret; |
4278 | } | 4271 | } |
4279 | 4272 | ||
4280 | static const struct file_operations proc_slabstats_operations = { | 4273 | static const struct file_operations proc_slabstats_operations = { |
4281 | .open = slabstats_open, | 4274 | .open = slabstats_open, |
4282 | .read = seq_read, | 4275 | .read = seq_read, |
4283 | .llseek = seq_lseek, | 4276 | .llseek = seq_lseek, |
4284 | .release = seq_release_private, | 4277 | .release = seq_release_private, |
4285 | }; | 4278 | }; |
4286 | #endif | 4279 | #endif |
4287 | 4280 | ||
4288 | static int __init slab_proc_init(void) | 4281 | static int __init slab_proc_init(void) |
4289 | { | 4282 | { |
4290 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4283 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4291 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | 4284 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); |
4292 | #endif | 4285 | #endif |
4293 | return 0; | 4286 | return 0; |
4294 | } | 4287 | } |
4295 | module_init(slab_proc_init); | 4288 | module_init(slab_proc_init); |
4296 | #endif | 4289 | #endif |
4297 | 4290 | ||
4298 | /** | 4291 | /** |
4299 | * ksize - get the actual amount of memory allocated for a given object | 4292 | * ksize - get the actual amount of memory allocated for a given object |
4300 | * @objp: Pointer to the object | 4293 | * @objp: Pointer to the object |
4301 | * | 4294 | * |
4302 | * kmalloc may internally round up allocations and return more memory | 4295 | * kmalloc may internally round up allocations and return more memory |
4303 | * than requested. ksize() can be used to determine the actual amount of | 4296 | * than requested. ksize() can be used to determine the actual amount of |
4304 | * memory allocated. The caller may use this additional memory, even though | 4297 | * memory allocated. The caller may use this additional memory, even though |
4305 | * a smaller amount of memory was initially specified with the kmalloc call. | 4298 | * a smaller amount of memory was initially specified with the kmalloc call. |
4306 | * The caller must guarantee that objp points to a valid object previously | 4299 | * The caller must guarantee that objp points to a valid object previously |
4307 | * allocated with either kmalloc() or kmem_cache_alloc(). The object | 4300 | * allocated with either kmalloc() or kmem_cache_alloc(). The object |
4308 | * must not be freed for the duration of the call. | 4301 | * must not be freed for the duration of the call. |
4309 | */ | 4302 | */ |
4310 | size_t ksize(const void *objp) | 4303 | size_t ksize(const void *objp) |
4311 | { | 4304 | { |
4312 | BUG_ON(!objp); | 4305 | BUG_ON(!objp); |
4313 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4306 | if (unlikely(objp == ZERO_SIZE_PTR)) |
4314 | return 0; | 4307 | return 0; |
4315 | 4308 | ||
4316 | return virt_to_cache(objp)->object_size; | 4309 | return virt_to_cache(objp)->object_size; |
4317 | } | 4310 | } |
4318 | EXPORT_SYMBOL(ksize); | 4311 | EXPORT_SYMBOL(ksize); |
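A small sketch of the contract described in the comment; the rounded-up size shown is typical for this allocator, not guaranteed:

static void ksize_demo(void)
{
        char *p = kmalloc(600, GFP_KERNEL);

        if (p) {
                size_t usable = ksize(p);       /* typically 1024: the request is
                                                   backed by the kmalloc-1024 cache */

                memset(p, 0, usable);           /* every usable byte may be used */
                kfree(p);
        }
}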