Commit 88b88a66797159949cec32eaab12b4968f6fae2d

Authored by Jaegeuk Kim
1 parent 120c2cba1d

f2fs: support atomic writes

This patch introduces very limited functionality for atomic write support.
In order to support atomic write, this patch adds two ioctls:
 o F2FS_IOC_START_ATOMIC_WRITE
 o F2FS_IOC_COMMIT_ATOMIC_WRITE

The database engine should be aware of the following sequence.
1. open
 -> ioctl(F2FS_IOC_START_ATOMIC_WRITE);
2. writes
  : all the written data will be treated as atomic pages.
3. commit
 -> ioctl(F2FS_IOC_COMMIT_ATOMIC_WRITE);
  : this flushes all the data blocks to the disk; the f2fs recovery procedure
  then exposes them on an all-or-nothing basis.
4. repeat from #2.

The IO pattern should be:

  ,- START_ATOMIC_WRITE                  ,- COMMIT_ATOMIC_WRITE
 CP | D D D D D D | FSYNC | D D D D | FSYNC ...
                      `- COMMIT_ATOMIC_WRITE

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

Showing 8 changed files with 139 additions and 5 deletions Side-by-side Diff

... ... @@ -1052,7 +1052,10 @@
1052 1052  
1053 1053 trace_f2fs_write_end(inode, pos, len, copied);
1054 1054  
1055   - set_page_dirty(page);
  1055 + if (f2fs_is_atomic_file(inode))
  1056 + register_inmem_page(inode, page);
  1057 + else
  1058 + set_page_dirty(page);
1056 1059  
1057 1060 if (pos + copied > i_size_read(inode)) {
1058 1061 i_size_write(inode, pos + copied);
... ... @@ -192,9 +192,13 @@
192 192 /*
193 193 * ioctl commands
194 194 */
195   -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
196   -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
  195 +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
  196 +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
197 197  
  198 +#define F2FS_IOCTL_MAGIC 0xf5
  199 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
  200 +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
  201 +
198 202 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
199 203 /*
200 204 * ioctl commands in 32 bit emulation
... ... @@ -263,6 +267,9 @@
263 267 unsigned long long xattr_ver; /* cp version of xattr modification */
264 268 struct extent_info ext; /* in-memory extent cache entry */
265 269 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
  270 +
  271 + struct list_head inmem_pages; /* inmemory pages managed by f2fs */
  272 + struct mutex inmem_lock; /* lock for inmemory pages */
266 273 };
267 274  
268 275 static inline void get_extent_info(struct extent_info *ext,
... ... @@ -1051,7 +1058,8 @@
1051 1058 FI_INLINE_DATA, /* used for inline data*/
1052 1059 FI_APPEND_WRITE, /* inode has appended data */
1053 1060 FI_UPDATE_WRITE, /* inode has in-place-update data */
1054   - FI_NEED_IPU, /* used fo ipu for fdatasync */
  1061 + FI_NEED_IPU, /* used for ipu per file */
  1062 + FI_ATOMIC_FILE, /* indicate atomic file */
1055 1063 };
1056 1064  
1057 1065 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
... ... @@ -1138,6 +1146,11 @@
1138 1146 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1139 1147 }
1140 1148  
  1149 +static inline bool f2fs_is_atomic_file(struct inode *inode)
  1150 +{
  1151 + return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
  1152 +}
  1153 +
1141 1154 static inline void *inline_data_addr(struct page *page)
1142 1155 {
1143 1156 struct f2fs_inode *ri = F2FS_INODE(page);
... ... @@ -1275,6 +1288,8 @@
1275 1288 /*
1276 1289 * segment.c
1277 1290 */
  1291 +void register_inmem_page(struct inode *, struct page *);
  1292 +void commit_inmem_pages(struct inode *, bool);
1278 1293 void f2fs_balance_fs(struct f2fs_sb_info *);
1279 1294 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1280 1295 int f2fs_issue_flush(struct f2fs_sb_info *);
... ... @@ -862,6 +862,41 @@
862 862 return ret;
863 863 }
864 864  
  865 +static int f2fs_ioc_start_atomic_write(struct file *filp)
  866 +{
  867 + struct inode *inode = file_inode(filp);
  868 + struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
  869 +
  870 + if (!inode_owner_or_capable(inode))
  871 + return -EACCES;
  872 +
  873 + f2fs_balance_fs(sbi);
  874 +
  875 + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
  876 +
  877 + return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
  878 +}
  879 +
  880 +static int f2fs_ioc_commit_atomic_write(struct file *filp)
  881 +{
  882 + struct inode *inode = file_inode(filp);
  883 + int ret;
  884 +
  885 + if (!inode_owner_or_capable(inode))
  886 + return -EACCES;
  887 +
  888 + ret = mnt_want_write_file(filp);
  889 + if (ret)
  890 + return ret;
  891 +
  892 + if (f2fs_is_atomic_file(inode))
  893 + commit_inmem_pages(inode, false);
  894 +
  895 + ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
  896 + mnt_drop_write_file(filp);
  897 + return ret;
  898 +}
  899 +
865 900 static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
866 901 {
867 902 struct inode *inode = file_inode(filp);
... ... @@ -899,6 +934,10 @@
899 934 return f2fs_ioc_getflags(filp, arg);
900 935 case F2FS_IOC_SETFLAGS:
901 936 return f2fs_ioc_setflags(filp, arg);
  937 + case F2FS_IOC_START_ATOMIC_WRITE:
  938 + return f2fs_ioc_start_atomic_write(filp);
  939 + case F2FS_IOC_COMMIT_ATOMIC_WRITE:
  940 + return f2fs_ioc_commit_atomic_write(filp);
902 941 case FITRIM:
903 942 return f2fs_ioc_fitrim(filp, arg);
904 943 default:
... ... @@ -21,6 +21,9 @@
21 21 if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
22 22 return false;
23 23  
  24 + if (f2fs_is_atomic_file(inode))
  25 + return false;
  26 +
24 27 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
25 28 if (inode->i_blocks > nr_blocks)
26 29 return false;
... ... @@ -269,6 +269,10 @@
269 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
271 271  
  272 + /* any remaining atomic pages should be discarded */
  273 + if (f2fs_is_atomic_file(inode))
  274 + commit_inmem_pages(inode, true);
  275 +
272 276 trace_f2fs_evict_inode(inode);
273 277 truncate_inode_pages_final(&inode->i_data);
274 278  
... ... @@ -26,6 +26,7 @@
26 26  
27 27 static struct kmem_cache *discard_entry_slab;
28 28 static struct kmem_cache *sit_entry_set_slab;
  29 +static struct kmem_cache *inmem_entry_slab;
29 30  
30 31 /*
31 32 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
... ... @@ -173,6 +174,60 @@
173 174 return result + __reverse_ffz(tmp);
174 175 }
175 176  
  177 +void register_inmem_page(struct inode *inode, struct page *page)
  178 +{
  179 + struct f2fs_inode_info *fi = F2FS_I(inode);
  180 + struct inmem_pages *new;
  181 +
  182 + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
  183 +
  184 + /* add atomic page indices to the list */
  185 + new->page = page;
  186 + INIT_LIST_HEAD(&new->list);
  187 +
  188 + /* increase reference count with clean state */
  189 + mutex_lock(&fi->inmem_lock);
  190 + get_page(page);
  191 + list_add_tail(&new->list, &fi->inmem_pages);
  192 + mutex_unlock(&fi->inmem_lock);
  193 +}
  194 +
  195 +void commit_inmem_pages(struct inode *inode, bool abort)
  196 +{
  197 + struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
  198 + struct f2fs_inode_info *fi = F2FS_I(inode);
  199 + struct inmem_pages *cur, *tmp;
  200 + bool submit_bio = false;
  201 + struct f2fs_io_info fio = {
  202 + .type = DATA,
  203 + .rw = WRITE_SYNC,
  204 + };
  205 +
  206 + f2fs_balance_fs(sbi);
  207 + f2fs_lock_op(sbi);
  208 +
  209 + mutex_lock(&fi->inmem_lock);
  210 + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
  211 + lock_page(cur->page);
  212 + if (!abort && cur->page->mapping == inode->i_mapping) {
  213 + f2fs_wait_on_page_writeback(cur->page, DATA);
  214 + if (clear_page_dirty_for_io(cur->page))
  215 + inode_dec_dirty_pages(inode);
  216 + do_write_data_page(cur->page, &fio);
  217 + submit_bio = true;
  218 + }
  219 + f2fs_put_page(cur->page, 1);
  220 + list_del(&cur->list);
  221 + kmem_cache_free(inmem_entry_slab, cur);
  222 + }
  223 + if (submit_bio)
  224 + f2fs_submit_merged_bio(sbi, DATA, WRITE);
  225 + mutex_unlock(&fi->inmem_lock);
  226 +
  227 + filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
  228 + f2fs_unlock_op(sbi);
  229 +}
  230 +
176 231 /*
177 232 * This function balances dirty node and dentry pages.
178 233 * In addition, it controls garbage collection.
179 234  
... ... @@ -2148,8 +2203,15 @@
2148 2203 sizeof(struct nat_entry_set));
2149 2204 if (!sit_entry_set_slab)
2150 2205 goto destory_discard_entry;
  2206 +
  2207 + inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
  2208 + sizeof(struct inmem_pages));
  2209 + if (!inmem_entry_slab)
  2210 + goto destroy_sit_entry_set;
2151 2211 return 0;
2152 2212  
  2213 +destroy_sit_entry_set:
  2214 + kmem_cache_destroy(sit_entry_set_slab);
2153 2215 destory_discard_entry:
2154 2216 kmem_cache_destroy(discard_entry_slab);
2155 2217 fail:
... ... @@ -2160,5 +2222,6 @@
2160 2222 {
2161 2223 kmem_cache_destroy(sit_entry_set_slab);
2162 2224 kmem_cache_destroy(discard_entry_slab);
  2225 + kmem_cache_destroy(inmem_entry_slab);
2163 2226 }
... ... @@ -175,6 +175,11 @@
175 175 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
176 176 };
177 177  
  178 +struct inmem_pages {
  179 + struct list_head list;
  180 + struct page *page;
  181 +};
  182 +
178 183 struct sit_info {
179 184 const struct segment_allocation *s_ops;
180 185  
... ... @@ -504,7 +509,7 @@
504 509 unsigned int policy = SM_I(sbi)->ipu_policy;
505 510  
506 511 /* IPU can be done only for the user data */
507   - if (S_ISDIR(inode->i_mode))
  512 + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
508 513 return false;
509 514  
510 515 if (policy & (0x1 << F2FS_IPU_FORCE))
... ... @@ -373,6 +373,8 @@
373 373 fi->i_advise = 0;
374 374 rwlock_init(&fi->ext.ext_lock);
375 375 init_rwsem(&fi->i_sem);
  376 + INIT_LIST_HEAD(&fi->inmem_pages);
  377 + mutex_init(&fi->inmem_lock);
376 378  
377 379 set_inode_flag(fi, FI_NEW_INODE);
378 380