Commit 044b9414c7caf9a26192c73a5b88fa1a8a32a1c1

Authored by Steven Whitehouse
1 parent 0143832cc9

GFS2: Fix inode deallocation race

This area of the code has always been a bit delicate due to the
subtleties of lock ordering. The problem is that for "normal"
alloc/dealloc, we always grab the inode locks first and the rgrp lock
later.

In order to ensure no races in looking up the unlinked, but still
allocated inodes, we need to hold the rgrp lock when we do the lookup,
which means that we can't take the inode glock.

The solution is to borrow the technique already used by NFS to solve
what is essentially the same problem (given an inode number, look up
the inode carefully, checking that it really is in the expected
state).

We cannot do that directly from the allocation code (lock ordering
again) so we give the job to the pre-existing delete workqueue and
carry on with the allocation as normal.

If we find there is no space, we do a journal flush (required anyway
if space from a deallocation is to be released) which should block
against the pending deallocations, so we should always get the space
back.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

Showing 5 changed files with 98 additions and 216 deletions Side-by-side Diff

... ... @@ -138,10 +138,8 @@
138 138 struct gfs2_inum_host *inum)
139 139 {
140 140 struct gfs2_sbd *sdp = sb->s_fs_info;
141   - struct gfs2_holder i_gh;
142 141 struct inode *inode;
143 142 struct dentry *dentry;
144   - int error;
145 143  
146 144 inode = gfs2_ilookup(sb, inum->no_addr);
147 145 if (inode) {
148 146  
149 147  
... ... @@ -152,52 +150,16 @@
152 150 goto out_inode;
153 151 }
154 152  
155   - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
156   - LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
157   - if (error)
158   - return ERR_PTR(error);
  153 + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
  154 + GFS2_BLKST_DINODE);
  155 + if (IS_ERR(inode))
  156 + return ERR_CAST(inode);
159 157  
160   - error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
161   - if (error)
162   - goto fail;
163   -
164   - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
165   - if (IS_ERR(inode)) {
166   - error = PTR_ERR(inode);
167   - goto fail;
168   - }
169   -
170   - error = gfs2_inode_refresh(GFS2_I(inode));
171   - if (error) {
172   - iput(inode);
173   - goto fail;
174   - }
175   -
176   - /* Pick up the works we bypass in gfs2_inode_lookup */
177   - if (inode->i_state & I_NEW)
178   - gfs2_set_iop(inode);
179   -
180   - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
181   - iput(inode);
182   - goto fail;
183   - }
184   -
185   - error = -EIO;
186   - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
187   - iput(inode);
188   - goto fail;
189   - }
190   -
191   - gfs2_glock_dq_uninit(&i_gh);
192   -
193 158 out_inode:
194 159 dentry = d_obtain_alias(inode);
195 160 if (!IS_ERR(dentry))
196 161 dentry->d_op = &gfs2_dops;
197 162 return dentry;
198   -fail:
199   - gfs2_glock_dq_uninit(&i_gh);
200   - return ERR_PTR(error);
201 163 }
202 164  
203 165 static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
... ... @@ -686,21 +686,20 @@
686 686 {
687 687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
688 688 struct gfs2_sbd *sdp = gl->gl_sbd;
689   - struct gfs2_inode *ip = NULL;
  689 + struct gfs2_inode *ip;
690 690 struct inode *inode;
691   - u64 no_addr = 0;
  691 + u64 no_addr = gl->gl_name.ln_number;
692 692  
693   - spin_lock(&gl->gl_spin);
694   - ip = (struct gfs2_inode *)gl->gl_object;
  693 + ip = gl->gl_object;
  694 + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
  695 +
695 696 if (ip)
696   - no_addr = ip->i_no_addr;
697   - spin_unlock(&gl->gl_spin);
698   - if (ip) {
699 697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
700   - if (inode) {
701   - d_prune_aliases(inode);
702   - iput(inode);
703   - }
  698 + else
  699 + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
  700 + if (inode && !IS_ERR(inode)) {
  701 + d_prune_aliases(inode);
  702 + iput(inode);
704 703 }
705 704 gfs2_glock_put(gl);
706 705 }
... ... @@ -73,49 +73,6 @@
73 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74 74 }
75 75  
76   -struct gfs2_skip_data {
77   - u64 no_addr;
78   - int skipped;
79   -};
80   -
81   -static int iget_skip_test(struct inode *inode, void *opaque)
82   -{
83   - struct gfs2_inode *ip = GFS2_I(inode);
84   - struct gfs2_skip_data *data = opaque;
85   -
86   - if (ip->i_no_addr == data->no_addr) {
87   - if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88   - data->skipped = 1;
89   - return 0;
90   - }
91   - return 1;
92   - }
93   - return 0;
94   -}
95   -
96   -static int iget_skip_set(struct inode *inode, void *opaque)
97   -{
98   - struct gfs2_inode *ip = GFS2_I(inode);
99   - struct gfs2_skip_data *data = opaque;
100   -
101   - if (data->skipped)
102   - return 1;
103   - inode->i_ino = (unsigned long)(data->no_addr);
104   - ip->i_no_addr = data->no_addr;
105   - return 0;
106   -}
107   -
108   -static struct inode *gfs2_iget_skip(struct super_block *sb,
109   - u64 no_addr)
110   -{
111   - struct gfs2_skip_data data;
112   - unsigned long hash = (unsigned long)no_addr;
113   -
114   - data.no_addr = no_addr;
115   - data.skipped = 0;
116   - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117   -}
118   -
119 76 /**
120 77 * GFS2 lookup code fills in vfs inode contents based on info obtained
121 78 * from directory entry inside gfs2_inode_lookup(). This has caused issues
122 79  
123 80  
124 81  
125 82  
126 83  
127 84  
128 85  
129 86  
130 87  
131 88  
132 89  
133 90  
134 91  
... ... @@ -243,93 +200,54 @@
243 200 return ERR_PTR(error);
244 201 }
245 202  
246   -/**
247   - * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
248   - * and try to reclaim it by doing iput.
249   - *
250   - * This function assumes no rgrp locks are currently held.
251   - *
252   - * @sb: The super block
253   - * no_addr: The inode number
254   - *
255   - */
256   -
257   -void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
  203 +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
  204 + u64 *no_formal_ino, unsigned int blktype)
258 205 {
259   - struct gfs2_sbd *sdp;
260   - struct gfs2_inode *ip;
261   - struct gfs2_glock *io_gl = NULL;
262   - int error;
263   - struct gfs2_holder gh;
  206 + struct super_block *sb = sdp->sd_vfs;
  207 + struct gfs2_holder i_gh;
264 208 struct inode *inode;
  209 + int error;
265 210  
266   - inode = gfs2_iget_skip(sb, no_addr);
  211 + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
  212 + LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
  213 + if (error)
  214 + return ERR_PTR(error);
267 215  
268   - if (!inode)
269   - return;
  216 + error = gfs2_check_blk_type(sdp, no_addr, blktype);
  217 + if (error)
  218 + goto fail;
270 219  
271   - /* If it's not a new inode, someone's using it, so leave it alone. */
272   - if (!(inode->i_state & I_NEW)) {
273   - iput(inode);
274   - return;
275   - }
276   -
277   - ip = GFS2_I(inode);
278   - sdp = GFS2_SB(inode);
279   - ip->i_no_formal_ino = -1;
280   -
281   - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
282   - if (unlikely(error))
  220 + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
  221 + if (IS_ERR(inode))
283 222 goto fail;
284   - ip->i_gl->gl_object = ip;
285 223  
286   - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287   - if (unlikely(error))
288   - goto fail_put;
  224 + error = gfs2_inode_refresh(GFS2_I(inode));
  225 + if (error)
  226 + goto fail_iput;
289 227  
290   - set_bit(GIF_INVALID, &ip->i_flags);
291   - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292   - &ip->i_iopen_gh);
293   - if (unlikely(error))
294   - goto fail_iopen;
  228 + /* Pick up the works we bypass in gfs2_inode_lookup */
  229 + if (inode->i_state & I_NEW)
  230 + gfs2_set_iop(inode);
295 231  
296   - ip->i_iopen_gh.gh_gl->gl_object = ip;
297   - gfs2_glock_put(io_gl);
298   - io_gl = NULL;
  232 + /* Two extra checks for NFS only */
  233 + if (no_formal_ino) {
  234 + error = -ESTALE;
  235 + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
  236 + goto fail_iput;
299 237  
300   - inode->i_mode = DT2IF(DT_UNKNOWN);
  238 + error = -EIO;
  239 + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
  240 + goto fail_iput;
301 241  
302   - /*
303   - * We must read the inode in order to work out its type in
304   - * this case. Note that this doesn't happen often as we normally
305   - * know the type beforehand. This code path only occurs during
306   - * unlinked inode recovery (where it is safe to do this glock,
307   - * which is not true in the general case).
308   - */
309   - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310   - &gh);
311   - if (unlikely(error))
312   - goto fail_glock;
  242 + error = 0;
  243 + }
313 244  
314   - /* Inode is now uptodate */
315   - gfs2_glock_dq_uninit(&gh);
316   - gfs2_set_iop(inode);
317   -
318   - /* The iput will cause it to be deleted. */
319   - iput(inode);
320   - return;
321   -
322   -fail_glock:
323   - gfs2_glock_dq(&ip->i_iopen_gh);
324   -fail_iopen:
325   - if (io_gl)
326   - gfs2_glock_put(io_gl);
327   -fail_put:
328   - ip->i_gl->gl_object = NULL;
329   - gfs2_glock_put(ip->i_gl);
330 245 fail:
331   - iget_failed(inode);
332   - return;
  246 + gfs2_glock_dq_uninit(&i_gh);
  247 + return error ? ERR_PTR(error) : inode;
  248 +fail_iput:
  249 + iput(inode);
  250 + goto fail;
333 251 }
334 252  
335 253 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
... ... @@ -99,7 +99,9 @@
99 99 extern void gfs2_set_iop(struct inode *inode);
100 100 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 101 u64 no_addr, u64 no_formal_ino);
102   -extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
  102 +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
  103 + u64 *no_formal_ino,
  104 + unsigned int blktype);
103 105 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
104 106  
105 107 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
... ... @@ -963,17 +963,18 @@
963 963 * The inode, if one has been found, in inode.
964 964 */
965 965  
966   -static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
967   - u64 skip)
  966 +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
968 967 {
969 968 u32 goal = 0, block;
970 969 u64 no_addr;
971 970 struct gfs2_sbd *sdp = rgd->rd_sbd;
972 971 unsigned int n;
  972 + struct gfs2_glock *gl;
  973 + struct gfs2_inode *ip;
  974 + int error;
  975 + int found = 0;
973 976  
974   - for(;;) {
975   - if (goal >= rgd->rd_data)
976   - break;
  977 + while (goal < rgd->rd_data) {
977 978 down_write(&sdp->sd_log_flush_lock);
978 979 n = 1;
979 980 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
980 981  
... ... @@ -990,11 +991,32 @@
990 991 if (no_addr == skip)
991 992 continue;
992 993 *last_unlinked = no_addr;
993   - return no_addr;
  994 +
  995 + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
  996 + if (error)
  997 + continue;
  998 +
  999 + /* If the inode is already in cache, we can ignore it here
  1000 + * because the existing inode disposal code will deal with
  1001 + * it when all refs have gone away. Accessing gl_object like
  1002 + * this is not safe in general. Here it is ok because we do
  1003 + * not dereference the pointer, and we only need an approx
  1004 + * answer to whether it is NULL or not.
  1005 + */
  1006 + ip = gl->gl_object;
  1007 +
  1008 + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
  1009 + gfs2_glock_put(gl);
  1010 + else
  1011 + found++;
  1012 +
  1013 + /* Limit reclaim to sensible number of tasks */
  1014 + if (found > 2*NR_CPUS)
  1015 + return;
994 1016 }
995 1017  
996 1018 rgd->rd_flags &= ~GFS2_RDF_CHECK;
997   - return 0;
  1019 + return;
998 1020 }
999 1021  
1000 1022 /**
1001 1023  
... ... @@ -1075,11 +1097,9 @@
1075 1097 * Try to acquire rgrp in way which avoids contending with others.
1076 1098 *
1077 1099 * Returns: errno
1078   - * unlinked: the block address of an unlinked block to be reclaimed
1079 1100 */
1080 1101  
1081   -static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1082   - u64 *last_unlinked)
  1102 +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1083 1103 {
1084 1104 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1085 1105 struct gfs2_rgrpd *rgd, *begin = NULL;
... ... @@ -1089,7 +1109,6 @@
1089 1109 int loops = 0;
1090 1110 int error, rg_locked;
1091 1111  
1092   - *unlinked = 0;
1093 1112 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1094 1113  
1095 1114 while (rgd) {
1096 1115  
... ... @@ -1106,17 +1125,10 @@
1106 1125 case 0:
1107 1126 if (try_rgrp_fit(rgd, al))
1108 1127 goto out;
1109   - /* If the rg came in already locked, there's no
1110   - way we can recover from a failed try_rgrp_unlink
1111   - because that would require an iput which can only
1112   - happen after the rgrp is unlocked. */
1113   - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1114   - *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1115   - ip->i_no_addr);
  1128 + if (rgd->rd_flags & GFS2_RDF_CHECK)
  1129 + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1116 1130 if (!rg_locked)
1117 1131 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1118   - if (*unlinked)
1119   - return -EAGAIN;
1120 1132 /* fall through */
1121 1133 case GLR_TRYFAILED:
1122 1134 rgd = recent_rgrp_next(rgd);
1123 1135  
... ... @@ -1145,13 +1157,10 @@
1145 1157 case 0:
1146 1158 if (try_rgrp_fit(rgd, al))
1147 1159 goto out;
1148   - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1149   - *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1150   - ip->i_no_addr);
  1160 + if (rgd->rd_flags & GFS2_RDF_CHECK)
  1161 + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1151 1162 if (!rg_locked)
1152 1163 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1153   - if (*unlinked)
1154   - return -EAGAIN;
1155 1164 break;
1156 1165  
1157 1166 case GLR_TRYFAILED:
1158 1167  
... ... @@ -1204,12 +1213,12 @@
1204 1213 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1205 1214 struct gfs2_alloc *al = ip->i_alloc;
1206 1215 int error = 0;
1207   - u64 last_unlinked = NO_BLOCK, unlinked;
  1216 + u64 last_unlinked = NO_BLOCK;
  1217 + int tries = 0;
1208 1218  
1209 1219 if (gfs2_assert_warn(sdp, al->al_requested))
1210 1220 return -EINVAL;
1211 1221  
1212   -try_again:
1213 1222 if (hold_rindex) {
1214 1223 /* We need to hold the rindex unless the inode we're using is
1215 1224 the rindex itself, in which case it's already held. */
1216 1225  
1217 1226  
1218 1227  
1219 1228  
... ... @@ -1218,31 +1227,23 @@
1218 1227 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1219 1228 in, so: */
1220 1229 error = gfs2_ri_update_special(ip);
  1230 + if (error)
  1231 + return error;
1221 1232 }
1222 1233  
1223   - if (error)
1224   - return error;
  1234 + do {
  1235 + error = get_local_rgrp(ip, &last_unlinked);
  1236 + /* If there is no space, flushing the log may release some */
  1237 + if (error)
  1238 + gfs2_log_flush(sdp, NULL);
  1239 + } while (error && tries++ < 3);
1225 1240  
1226   - /* Find an rgrp suitable for allocation. If it encounters any unlinked
1227   - dinodes along the way, error will equal -EAGAIN and unlinked will
1228   - contains it block address. We then need to look up that inode and
1229   - try to free it, and try the allocation again. */
1230   - error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1231 1241 if (error) {
1232 1242 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1233 1243 gfs2_glock_dq_uninit(&al->al_ri_gh);
1234   - if (error != -EAGAIN)
1235   - return error;
1236   -
1237   - gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1238   - /* regardless of whether or not gfs2_process_unlinked_inode
1239   - was successful, we don't want to repeat it again. */
1240   - last_unlinked = unlinked;
1241   - gfs2_log_flush(sdp, NULL);
1242   - error = 0;
1243   -
1244   - goto try_again;
  1244 + return error;
1245 1245 }
  1246 +
1246 1247 /* no error, so we have the rgrp set in the inode's allocation. */
1247 1248 al->al_file = file;
1248 1249 al->al_line = line;