Commit d18e9008c377dc6a6d2166a6840bf3a23a5867fd

Authored by Miklos Szeredi
Committed by Al Viro
1 parent 54ef487241

vfs: add i_op->atomic_open()

Add a new inode operation which is called on the last component of an open.
Using this the filesystem can look up, possibly create and open the file in one
atomic operation.  If it cannot perform this (e.g. the file type turned out to
be wrong) it may signal this by returning NULL instead of an open struct file
pointer.

i_op->atomic_open() is only called if the last component is negative or needs
lookup.  Handling cached positive dentries here doesn't add much value: these
can be opened using f_op->open().  If the cached file turns out to be invalid,
the open can be retried, this time using ->atomic_open() with a fresh dentry.

For now leave the old way of using open intents in lookup and revalidate in
place.  This will be removed once all the users are converted.

David Howells noticed that if ->atomic_open() opens the file but does not create
it, handle_truncate() will be called on it even if it is not a regular file.
Fix this by checking the file type in this case too.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 6 changed files with 270 additions and 2 deletions Side-by-side Diff

Documentation/filesystems/Locking
... ... @@ -62,6 +62,9 @@
62 62 int (*removexattr) (struct dentry *, const char *);
63 63 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
64 64 void (*update_time)(struct inode *, struct timespec *, int);
  65 + struct file * (*atomic_open)(struct inode *, struct dentry *,
  66 + struct opendata *, unsigned open_flag,
  67 + umode_t create_mode, bool *created);
65 68  
66 69 locking rules:
67 70 all may block
... ... @@ -89,6 +92,7 @@
89 92 removexattr: yes
90 93 fiemap: no
91 94 update_time: no
  95 +atomic_open: yes
92 96  
93 97 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
94 98 victim.
Documentation/filesystems/vfs.txt
... ... @@ -364,6 +364,9 @@
364 364 ssize_t (*listxattr) (struct dentry *, char *, size_t);
365 365 int (*removexattr) (struct dentry *, const char *);
366 366 void (*update_time)(struct inode *, struct timespec *, int);
  367 + struct file * (*atomic_open)(struct inode *, struct dentry *,
  368 + struct opendata *, unsigned open_flag,
  369 + umode_t create_mode, bool *created);
367 370 };
368 371  
369 372 Again, all methods are called without any locks being held, unless
... ... @@ -475,6 +478,14 @@
475 478 update_time: called by the VFS to update a specific time or the i_version of
476 479 an inode. If this is not defined the VFS will update the inode itself
477 480 and call mark_inode_dirty_sync.
  481 +
  482 + atomic_open: called on the last component of an open. Using this optional
  483 + method the filesystem can look up, possibly create and open the file in
  484 + one atomic operation. If it cannot perform this (e.g. the file type
  485 + turned out to be wrong) it may signal this by returning NULL instead of
  486 + an open struct file pointer. This method is only called if the last
  487 + component is negative or needs lookup. Cached positive dentries are
  488 + still handled by f_op->open().
478 489  
479 490 The Address Space Object
480 491 ========================
fs/internal.h
... ... @@ -85,6 +85,11 @@
85 85 struct nameidata;
86 86 extern struct file *nameidata_to_filp(struct nameidata *);
87 87 extern void release_open_intent(struct nameidata *);
  88 +struct opendata {
  89 + struct dentry *dentry;
  90 + struct vfsmount *mnt;
  91 + struct file **filp;
  92 +};
88 93 struct open_flags {
89 94 int open_flag;
90 95 umode_t mode;
fs/namei.c
... ... @@ -2196,6 +2196,176 @@
2196 2196 return flag;
2197 2197 }
2198 2198  
  2199 +static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
  2200 +{
  2201 + int error = security_path_mknod(dir, dentry, mode, 0);
  2202 + if (error)
  2203 + return error;
  2204 +
  2205 + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
  2206 + if (error)
  2207 + return error;
  2208 +
  2209 + return security_inode_create(dir->dentry->d_inode, dentry, mode);
  2210 +}
  2211 +
  2212 +static struct file *atomic_open(struct nameidata *nd, struct dentry *dentry,
  2213 + struct path *path, const struct open_flags *op,
  2214 + int *want_write, bool need_lookup,
  2215 + bool *created)
  2216 +{
  2217 + struct inode *dir = nd->path.dentry->d_inode;
  2218 + unsigned open_flag = open_to_namei_flags(op->open_flag);
  2219 + umode_t mode;
  2220 + int error;
  2221 + int acc_mode;
  2222 + struct opendata od;
  2223 + struct file *filp;
  2224 + int create_error = 0;
  2225 + struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
  2226 +
  2227 + BUG_ON(dentry->d_inode);
  2228 +
  2229 + /* Don't create child dentry for a dead directory. */
  2230 + if (unlikely(IS_DEADDIR(dir))) {
  2231 + filp = ERR_PTR(-ENOENT);
  2232 + goto out;
  2233 + }
  2234 +
  2235 + mode = op->mode & S_IALLUGO;
  2236 + if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
  2237 + mode &= ~current_umask();
  2238 +
  2239 + if (open_flag & O_EXCL) {
  2240 + open_flag &= ~O_TRUNC;
  2241 + *created = true;
  2242 + }
  2243 +
  2244 + /*
  2245 + * Checking write permission is tricky, because we don't know if we are
  2246 + * going to actually need it: O_CREAT opens should work as long as the
  2247 + * file exists. But checking existence breaks atomicity. The trick is
  2248 + * to check access and if not granted clear O_CREAT from the flags.
  2249 + *
  2250 + * Another problem is returning the "right" error value (e.g. for an
  2251 + * O_EXCL open we want to return EEXIST not EROFS).
  2252 + */
  2253 + if ((open_flag & (O_CREAT | O_TRUNC)) ||
  2254 + (open_flag & O_ACCMODE) != O_RDONLY) {
  2255 + error = mnt_want_write(nd->path.mnt);
  2256 + if (!error) {
  2257 + *want_write = 1;
  2258 + } else if (!(open_flag & O_CREAT)) {
  2259 + /*
  2260 + * No O_CREAT -> atomicity not a requirement -> fall
  2261 + * back to lookup + open
  2262 + */
  2263 + goto no_open;
  2264 + } else if (open_flag & (O_EXCL | O_TRUNC)) {
  2265 + /* Fall back and fail with the right error */
  2266 + create_error = error;
  2267 + goto no_open;
  2268 + } else {
  2269 + /* No side effects, safe to clear O_CREAT */
  2270 + create_error = error;
  2271 + open_flag &= ~O_CREAT;
  2272 + }
  2273 + }
  2274 +
  2275 + if (open_flag & O_CREAT) {
  2276 + error = may_o_create(&nd->path, dentry, op->mode);
  2277 + if (error) {
  2278 + create_error = error;
  2279 + if (open_flag & O_EXCL)
  2280 + goto no_open;
  2281 + open_flag &= ~O_CREAT;
  2282 + }
  2283 + }
  2284 +
  2285 + if (nd->flags & LOOKUP_DIRECTORY)
  2286 + open_flag |= O_DIRECTORY;
  2287 +
  2288 + od.dentry = DENTRY_NOT_SET;
  2289 + od.mnt = nd->path.mnt;
  2290 + od.filp = &nd->intent.open.file;
  2291 + filp = dir->i_op->atomic_open(dir, dentry, &od, open_flag, mode,
  2292 + created);
  2293 + if (IS_ERR(filp)) {
  2294 + if (WARN_ON(od.dentry != DENTRY_NOT_SET))
  2295 + dput(od.dentry);
  2296 +
  2297 + if (create_error && PTR_ERR(filp) == -ENOENT)
  2298 + filp = ERR_PTR(create_error);
  2299 + goto out;
  2300 + }
  2301 +
  2302 + acc_mode = op->acc_mode;
  2303 + if (*created) {
  2304 + fsnotify_create(dir, dentry);
  2305 + acc_mode = MAY_OPEN;
  2306 + }
  2307 +
  2308 + if (!filp) {
  2309 + if (WARN_ON(od.dentry == DENTRY_NOT_SET)) {
  2310 + filp = ERR_PTR(-EIO);
  2311 + goto out;
  2312 + }
  2313 + if (od.dentry) {
  2314 + dput(dentry);
  2315 + dentry = od.dentry;
  2316 + }
  2317 + goto looked_up;
  2318 + }
  2319 +
  2320 + /*
  2321 + * We didn't have the inode before the open, so check open permission
  2322 + * here.
  2323 + */
  2324 + error = may_open(&filp->f_path, acc_mode, open_flag);
  2325 + if (error)
  2326 + goto out_fput;
  2327 +
  2328 + error = open_check_o_direct(filp);
  2329 + if (error)
  2330 + goto out_fput;
  2331 +
  2332 +out:
  2333 + dput(dentry);
  2334 + return filp;
  2335 +
  2336 +out_fput:
  2337 + fput(filp);
  2338 + filp = ERR_PTR(error);
  2339 + goto out;
  2340 +
  2341 +no_open:
  2342 + if (need_lookup) {
  2343 + dentry = lookup_real(dir, dentry, nd);
  2344 + if (IS_ERR(dentry))
  2345 + return ERR_CAST(dentry);
  2346 +
  2347 + if (create_error) {
  2348 + int open_flag = op->open_flag;
  2349 +
  2350 + filp = ERR_PTR(create_error);
  2351 + if ((open_flag & O_EXCL)) {
  2352 + if (!dentry->d_inode)
  2353 + goto out;
  2354 + } else if (!dentry->d_inode) {
  2355 + goto out;
  2356 + } else if ((open_flag & O_TRUNC) &&
  2357 + S_ISREG(dentry->d_inode->i_mode)) {
  2358 + goto out;
  2359 + }
  2360 + /* will fail later, go on to get the right error */
  2361 + }
  2362 + }
  2363 +looked_up:
  2364 + path->dentry = dentry;
  2365 + path->mnt = nd->path.mnt;
  2366 + return NULL;
  2367 +}
  2368 +
2199 2369 /*
2200 2370 * Lookup, maybe create and open the last component
2201 2371 *
... ... @@ -2219,6 +2389,15 @@
2219 2389 if (IS_ERR(dentry))
2220 2390 return ERR_CAST(dentry);
2221 2391  
  2392 + /* Cached positive dentry: will open in f_op->open */
  2393 + if (!need_lookup && dentry->d_inode)
  2394 + goto out_no_open;
  2395 +
  2396 + if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
  2397 + return atomic_open(nd, dentry, path, op, want_write,
  2398 + need_lookup, created);
  2399 + }
  2400 +
2222 2401 if (need_lookup) {
2223 2402 BUG_ON(dentry->d_inode);
2224 2403  
... ... @@ -2251,6 +2430,7 @@
2251 2430 if (error)
2252 2431 goto out_dput;
2253 2432 }
  2433 +out_no_open:
2254 2434 path->dentry = dentry;
2255 2435 path->mnt = nd->path.mnt;
2256 2436 return NULL;
2257 2437  
... ... @@ -2344,9 +2524,17 @@
2344 2524 filp = lookup_open(nd, path, op, &want_write, &created);
2345 2525 mutex_unlock(&dir->d_inode->i_mutex);
2346 2526  
2347   - if (IS_ERR(filp))
2348   - goto out;
  2527 + if (filp) {
  2528 + if (IS_ERR(filp))
  2529 + goto out;
2349 2530  
  2531 + if (created || !S_ISREG(filp->f_path.dentry->d_inode->i_mode))
  2532 + will_truncate = 0;
  2533 +
  2534 + audit_inode(pathname, filp->f_path.dentry);
  2535 + goto opened;
  2536 + }
  2537 +
2350 2538 if (created) {
2351 2539 /* Don't check for write permission, don't truncate */
2352 2540 open_flag &= ~O_TRUNC;
... ... @@ -2361,6 +2549,16 @@
2361 2549 */
2362 2550 audit_inode(pathname, path->dentry);
2363 2551  
  2552 + /*
  2553 + * If atomic_open() acquired write access it is dropped now due to
  2554 + * possible mount and symlink following (this might be optimized away if
  2555 + * necessary...)
  2556 + */
  2557 + if (want_write) {
  2558 + mnt_drop_write(nd->path.mnt);
  2559 + want_write = 0;
  2560 + }
  2561 +
2364 2562 error = -EEXIST;
2365 2563 if (open_flag & O_EXCL)
2366 2564 goto exit_dput;
... ... @@ -2444,6 +2642,7 @@
2444 2642 retried = true;
2445 2643 goto retry_lookup;
2446 2644 }
  2645 +opened:
2447 2646 if (!IS_ERR(filp)) {
2448 2647 error = ima_file_check(filp, op->acc_mode);
2449 2648 if (error) {
fs/open.c
... ... @@ -811,6 +811,48 @@
811 811 EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
812 812  
813 813 /**
  814 + * finish_open - finish opening a file
  815 + * @od: opaque open data
  816 + * @dentry: pointer to dentry
  817 + * @open: open callback
  818 + *
  819 + * This can be used to finish opening a file passed to i_op->atomic_open().
  820 + *
  821 + * If the open callback is set to NULL, then the standard f_op->open()
  822 + * filesystem callback is substituted.
  823 + */
  824 +struct file *finish_open(struct opendata *od, struct dentry *dentry,
  825 + int (*open)(struct inode *, struct file *))
  826 +{
  827 + struct file *res;
  828 +
  829 + mntget(od->mnt);
  830 + dget(dentry);
  831 +
  832 + res = do_dentry_open(dentry, od->mnt, *od->filp, open, current_cred());
  833 + if (!IS_ERR(res))
  834 + *od->filp = NULL;
  835 +
  836 + return res;
  837 +}
  838 +EXPORT_SYMBOL(finish_open);
  839 +
  840 +/**
  841 + * finish_no_open - finish ->atomic_open() without opening the file
  842 + *
  843 + * @od: opaque open data
  844 + * @dentry: dentry or NULL (as returned from ->lookup())
  845 + *
  846 + * This can be used to set the result of a successful lookup in ->atomic_open().
  847 + * The filesystem's atomic_open() method shall return NULL after calling this.
  848 + */
  849 +void finish_no_open(struct opendata *od, struct dentry *dentry)
  850 +{
  851 + od->dentry = dentry;
  852 +}
  853 +EXPORT_SYMBOL(finish_no_open);
  854 +
  855 +/**
814 856 * nameidata_to_filp - convert a nameidata to an open filp.
815 857 * @nd: pointer to nameidata
816 858 * @flags: open flags
include/linux/fs.h
... ... @@ -427,6 +427,7 @@
427 427 struct vm_area_struct;
428 428 struct vfsmount;
429 429 struct cred;
  430 +struct opendata;
430 431  
431 432 extern void __init inode_init(void);
432 433 extern void __init inode_init_early(void);
... ... @@ -1693,6 +1694,9 @@
1693 1694 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
1694 1695 u64 len);
1695 1696 int (*update_time)(struct inode *, struct timespec *, int);
  1697 + struct file * (*atomic_open)(struct inode *, struct dentry *,
  1698 + struct opendata *, unsigned open_flag,
  1699 + umode_t create_mode, bool *created);
1696 1700 } ____cacheline_aligned;
1697 1701  
1698 1702 struct seq_file;
... ... @@ -2061,6 +2065,9 @@
2061 2065 const struct cred *);
2062 2066 extern int filp_close(struct file *, fl_owner_t id);
2063 2067 extern char * getname(const char __user *);
  2068 +extern struct file *finish_open(struct opendata *od, struct dentry *dentry,
  2069 + int (*open)(struct inode *, struct file *));
  2070 +extern void finish_no_open(struct opendata *od, struct dentry *dentry);
2064 2071  
2065 2072 /* fs/ioctl.c */
2066 2073