Blame view
include/linux/iversion.h
12.1 KB
ae5e165d8 fs: new API for h... |
1 2 3 4 5 6 7 |
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* |
f02a9ad1f fs: handle inode-... |
8 9 |
* The inode->i_version field: * --------------------------- |
ae5e165d8 fs: new API for h... |
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
* The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear different to observers if there was a change to the inode's data or * metadata since it was last queried. * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use the i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider filesystems that set SB_I_VERSION to have a kernel-managed * i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * |
f02a9ad1f fs: handle inode-... |
38 39 40 41 42 43 44 45 46 47 |
* This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use |
c472c07bf iversion: Rename ... |
48 |
* the inode_eq_iversion() helper to compare values. |
f02a9ad1f fs: handle inode-... |
49 |
* |
ae5e165d8 fs: new API for h... |
50 51 52 53 |
* Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. |
f02a9ad1f fs: handle inode-... |
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
* * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. |
ae5e165d8 fs: new API for h... |
69 |
*/ |
f02a9ad1f fs: handle inode-... |
70 71 72 73 74 75 76 77 78 |
/* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) |
ae5e165d8 fs: new API for h... |
79 80 81 |
/** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set |
f02a9ad1f fs: handle inode-... |
82 |
* @val: new i_version value to set |
ae5e165d8 fs: new API for h... |
83 |
* |
f02a9ad1f fs: handle inode-... |
84 |
* Set @inode's i_version field to @val. This function is for use by |
ae5e165d8 fs: new API for h... |
85 86 87 88 89 90 |
* filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void |
f02a9ad1f fs: handle inode-... |
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) |
ae5e165d8 fs: new API for h... |
109 |
{ |
f02a9ad1f fs: handle inode-... |
110 |
return atomic64_read(&inode->i_version); |
ae5e165d8 fs: new API for h... |
111 112 113 |
} /** |
441d36764 iversion: add a r... |
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
* inode_set_max_iversion_raw - update i_version if the new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g. Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur, old; cur = inode_peek_iversion_raw(inode); for (;;) { if (cur > val) break; old = atomic64_cmpxchg(&inode->i_version, cur, val); if (likely(old == cur)) break; cur = old; } } /** |
ae5e165d8 fs: new API for h... |
138 139 |
* inode_set_iversion - set i_version to a particular value * @inode: inode to set |
f02a9ad1f fs: handle inode-... |
140 |
* @val: new i_version value to set |
ae5e165d8 fs: new API for h... |
141 |
* |
f02a9ad1f fs: handle inode-... |
142 143 144 |
* Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. |
ae5e165d8 fs: new API for h... |
145 |
* |
f02a9ad1f fs: handle inode-... |
146 147 |
* In this case, we do not set the QUERIED flag since we know that this value * has never been queried. |
ae5e165d8 fs: new API for h... |
148 149 |
*/ static inline void |
f02a9ad1f fs: handle inode-... |
150 |
inode_set_iversion(struct inode *inode, u64 val) |
ae5e165d8 fs: new API for h... |
151 |
{ |
f02a9ad1f fs: handle inode-... |
152 |
inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); |
ae5e165d8 fs: new API for h... |
153 154 155 |
} /** |
f02a9ad1f fs: handle inode-... |
156 |
* inode_set_iversion_queried - set i_version to a particular value as queried |
ae5e165d8 fs: new API for h... |
157 |
* @inode: inode to set |
f02a9ad1f fs: handle inode-... |
158 |
* @val: new i_version value to set |
ae5e165d8 fs: new API for h... |
159 |
* |
f02a9ad1f fs: handle inode-... |
160 161 |
* Set @inode's i_version field to @val, and flag it for increment on the next * change. |
ae5e165d8 fs: new API for h... |
162 |
* |
f02a9ad1f fs: handle inode-... |
163 164 |
* Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. |
ae5e165d8 fs: new API for h... |
165 |
* |
f02a9ad1f fs: handle inode-... |
166 167 168 169 |
* When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. |
ae5e165d8 fs: new API for h... |
170 171 |
*/ static inline void |
f02a9ad1f fs: handle inode-... |
172 |
inode_set_iversion_queried(struct inode *inode, u64 val) |
ae5e165d8 fs: new API for h... |
173 |
{ |
f02a9ad1f fs: handle inode-... |
174 175 |
inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); |
ae5e165d8 fs: new API for h... |
176 177 178 179 180 |
} /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated |
f02a9ad1f fs: handle inode-... |
181 |
* @force: increment the counter even if it's not necessary? |
ae5e165d8 fs: new API for h... |
182 183 184 185 |
* * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * |
f02a9ad1f fs: handle inode-... |
186 187 |
* If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. |
ae5e165d8 fs: new API for h... |
188 |
* |
f02a9ad1f fs: handle inode-... |
189 190 191 192 193 |
* In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. |
ae5e165d8 fs: new API for h... |
194 195 196 197 |
*/ static inline bool inode_maybe_inc_iversion(struct inode *inode, bool force) { |
f02a9ad1f fs: handle inode-... |
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
u64 cur, old, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * Here, we add full memory barriers to ensure that any de-facto * ordering with other info is preserved. * * This barrier pairs with the barrier in inode_query_iversion() */ smp_mb(); cur = inode_peek_iversion_raw(inode); for (;;) { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; |
7594c4611 fs: don't take th... |
216 |
|
f02a9ad1f fs: handle inode-... |
217 218 219 220 221 222 223 224 |
/* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; old = atomic64_cmpxchg(&inode->i_version, cur, new); if (likely(old == cur)) break; cur = old; } |
ae5e165d8 fs: new API for h... |
225 226 |
return true; } |
7594c4611 fs: don't take th... |
227 |
|
ae5e165d8 fs: new API for h... |
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 |
/** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcibly increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next |
f02a9ad1f fs: handle inode-... |
246 |
* change. Just fetch the value and check the QUERIED flag. |
ae5e165d8 fs: new API for h... |
247 248 249 250 |
*/ static inline bool inode_iversion_need_inc(struct inode *inode) { |
f02a9ad1f fs: handle inode-... |
251 |
return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; |
ae5e165d8 fs: new API for h... |
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 |
} /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcibly increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { |
f02a9ad1f fs: handle inode-... |
268 |
atomic64_inc(&inode->i_version); |
ae5e165d8 fs: new API for h... |
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 |
} /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { |
f02a9ad1f fs: handle inode-... |
285 |
return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; |
ae5e165d8 fs: new API for h... |
286 287 288 289 290 291 292 293 294 295 296 |
} /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. * |
f02a9ad1f fs: handle inode-... |
297 298 299 |
* In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. If * that fails, we try again with the newly fetched value from the cmpxchg. |
ae5e165d8 fs: new API for h... |
300 301 302 303 |
*/ static inline u64 inode_query_iversion(struct inode *inode) { |
f02a9ad1f fs: handle inode-... |
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
u64 cur, old, new; cur = inode_peek_iversion_raw(inode); for (;;) { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* * This barrier (and the implicit barrier in the * cmpxchg below) pairs with the barrier in * inode_maybe_inc_iversion(). */ smp_mb(); break; } new = cur | I_VERSION_QUERIED; old = atomic64_cmpxchg(&inode->i_version, cur, new); if (likely(old == cur)) break; cur = old; } return cur >> I_VERSION_QUERIED_SHIFT; |
ae5e165d8 fs: new API for h... |
326 327 328 |
} /** |
c472c07bf iversion: Rename ... |
329 |
* inode_eq_iversion_raw - check whether the raw i_version counter has changed |
ae5e165d8 fs: new API for h... |
330 331 332 |
* @inode: inode to check * @old: old value to check against its i_version * |
c472c07bf iversion: Rename ... |
333 334 |
* Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. |
ae5e165d8 fs: new API for h... |
335 |
*/ |
c0cef30e4 iversion: make in... |
336 |
static inline bool |
c472c07bf iversion: Rename ... |
337 |
inode_eq_iversion_raw(const struct inode *inode, u64 old) |
ae5e165d8 fs: new API for h... |
338 |
{ |
c472c07bf iversion: Rename ... |
339 |
return inode_peek_iversion_raw(inode) == old; |
ae5e165d8 fs: new API for h... |
340 341 342 |
} /** |
c472c07bf iversion: Rename ... |
343 |
* inode_eq_iversion - check whether the i_version counter has changed |
ae5e165d8 fs: new API for h... |
344 345 346 |
* @inode: inode to check * @old: old value to check against its i_version * |
c472c07bf iversion: Rename ... |
347 348 |
* Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. |
f02a9ad1f fs: handle inode-... |
349 350 351 |
* * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. |
ae5e165d8 fs: new API for h... |
352 |
*/ |
c0cef30e4 iversion: make in... |
353 |
static inline bool |
c472c07bf iversion: Rename ... |
354 |
inode_eq_iversion(const struct inode *inode, u64 old) |
ae5e165d8 fs: new API for h... |
355 |
{ |
c472c07bf iversion: Rename ... |
356 |
return inode_peek_iversion(inode) == old; |
ae5e165d8 fs: new API for h... |
357 358 |
} #endif |