Commit c4f68a130fc1795e4a75ec5bdaf9e85d86c22419

Authored by Benjamin Marzinski
Committed by Steven Whitehouse
1 parent d1e2777d4f

[GFS2] delay glock demote for a minimum hold time

When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time of a tenth of a second.  This ensures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.

A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acquire a lock on it, but it has already been locked at a higher level.
This patch makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, except that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.

Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

Showing 3 changed files with 66 additions and 16 deletions Side-by-side Diff

... ... @@ -27,6 +27,8 @@
27 27 #include <linux/debugfs.h>
28 28 #include <linux/kthread.h>
29 29 #include <linux/freezer.h>
  30 +#include <linux/workqueue.h>
  31 +#include <linux/jiffies.h>
30 32  
31 33 #include "gfs2.h"
32 34 #include "incore.h"
33 35  
... ... @@ -58,10 +60,13 @@
58 60 static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
59 61 static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
60 62 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
  63 +static void run_queue(struct gfs2_glock *gl);
  64 +
61 65 static DECLARE_RWSEM(gfs2_umount_flush_sem);
62 66 static struct dentry *gfs2_root;
63 67 static struct task_struct *scand_process;
64 68 static unsigned int scand_secs = 5;
  69 +static struct workqueue_struct *glock_workqueue;
65 70  
66 71 #define GFS2_GL_HASH_SHIFT 15
67 72 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
... ... @@ -277,6 +282,18 @@
277 282 return gl;
278 283 }
279 284  
  285 +static void glock_work_func(struct work_struct *work)
  286 +{
  287 + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
  288 +
  289 + spin_lock(&gl->gl_spin);
  290 + if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
  291 + set_bit(GLF_DEMOTE, &gl->gl_flags);
  292 + run_queue(gl);
  293 + spin_unlock(&gl->gl_spin);
  294 + gfs2_glock_put(gl);
  295 +}
  296 +
280 297 /**
281 298 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
282 299 * @sdp: The GFS2 superblock
... ... @@ -316,6 +333,7 @@
316 333 gl->gl_name = name;
317 334 atomic_set(&gl->gl_ref, 1);
318 335 gl->gl_state = LM_ST_UNLOCKED;
  336 + gl->gl_demote_state = LM_ST_EXCLUSIVE;
319 337 gl->gl_hash = hash;
320 338 gl->gl_owner_pid = 0;
321 339 gl->gl_ip = 0;
322 340  
... ... @@ -324,10 +342,12 @@
324 342 gl->gl_req_bh = NULL;
325 343 gl->gl_vn = 0;
326 344 gl->gl_stamp = jiffies;
  345 + gl->gl_tchange = jiffies;
327 346 gl->gl_object = NULL;
328 347 gl->gl_sbd = sdp;
329 348 gl->gl_aspace = NULL;
330 349 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
  350 + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
331 351  
332 352 /* If this glock protects actual on-disk data or metadata blocks,
333 353 create a VFS inode to manage the pages/buffers holding them. */
... ... @@ -441,6 +461,8 @@
441 461  
442 462 static void gfs2_demote_wake(struct gfs2_glock *gl)
443 463 {
  464 + BUG_ON(!spin_is_locked(&gl->gl_spin));
  465 + gl->gl_demote_state = LM_ST_EXCLUSIVE;
444 466 clear_bit(GLF_DEMOTE, &gl->gl_flags);
445 467 smp_mb__after_clear_bit();
446 468 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
447 469  
448 470  
... ... @@ -682,10 +704,14 @@
682 704 * practise: LM_ST_SHARED and LM_ST_UNLOCKED
683 705 */
684 706  
685   -static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote)
  707 +static void handle_callback(struct gfs2_glock *gl, unsigned int state,
  708 + int remote, unsigned long delay)
686 709 {
  710 + int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
  711 +
687 712 spin_lock(&gl->gl_spin);
688   - if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
  713 + set_bit(bit, &gl->gl_flags);
  714 + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
689 715 gl->gl_demote_state = state;
690 716 gl->gl_demote_time = jiffies;
691 717 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
... ... @@ -727,6 +753,7 @@
727 753 }
728 754  
729 755 gl->gl_state = new_state;
  756 + gl->gl_tchange = jiffies;
730 757 }
731 758  
732 759 /**
... ... @@ -813,7 +840,6 @@
813 840 gl->gl_req_gh = NULL;
814 841 gl->gl_req_bh = NULL;
815 842 clear_bit(GLF_LOCK, &gl->gl_flags);
816   - run_queue(gl);
817 843 spin_unlock(&gl->gl_spin);
818 844 }
819 845  
... ... @@ -885,7 +911,6 @@
885 911 gfs2_assert_warn(sdp, !ret);
886 912  
887 913 state_change(gl, LM_ST_UNLOCKED);
888   - gfs2_demote_wake(gl);
889 914  
890 915 if (glops->go_inval)
891 916 glops->go_inval(gl, DIO_METADATA);
892 917  
... ... @@ -898,10 +923,10 @@
898 923 }
899 924  
900 925 spin_lock(&gl->gl_spin);
  926 + gfs2_demote_wake(gl);
901 927 gl->gl_req_gh = NULL;
902 928 gl->gl_req_bh = NULL;
903 929 clear_bit(GLF_LOCK, &gl->gl_flags);
904   - run_queue(gl);
905 930 spin_unlock(&gl->gl_spin);
906 931  
907 932 gfs2_glock_put(gl);
908 933  
... ... @@ -1209,9 +1234,10 @@
1209 1234 {
1210 1235 struct gfs2_glock *gl = gh->gh_gl;
1211 1236 const struct gfs2_glock_operations *glops = gl->gl_ops;
  1237 + unsigned delay = 0;
1212 1238  
1213 1239 if (gh->gh_flags & GL_NOCACHE)
1214   - handle_callback(gl, LM_ST_UNLOCKED, 0);
  1240 + handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1215 1241  
1216 1242 gfs2_glmutex_lock(gl);
1217 1243  
1218 1244  
... ... @@ -1229,8 +1255,14 @@
1229 1255 }
1230 1256  
1231 1257 clear_bit(GLF_LOCK, &gl->gl_flags);
1232   - run_queue(gl);
1233 1258 spin_unlock(&gl->gl_spin);
  1259 +
  1260 + gfs2_glock_hold(gl);
  1261 + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
  1262 + !test_bit(GLF_DEMOTE, &gl->gl_flags))
  1263 + delay = gl->gl_ops->go_min_hold_time;
  1264 + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
  1265 + gfs2_glock_put(gl);
1234 1266 }
1235 1267  
1236 1268 void gfs2_glock_dq_wait(struct gfs2_holder *gh)
1237 1269  
1238 1270  
... ... @@ -1457,18 +1489,21 @@
1457 1489 unsigned int state)
1458 1490 {
1459 1491 struct gfs2_glock *gl;
  1492 + unsigned long delay = 0;
  1493 + unsigned long holdtime;
  1494 + unsigned long now = jiffies;
1460 1495  
1461 1496 gl = gfs2_glock_find(sdp, name);
1462 1497 if (!gl)
1463 1498 return;
1464 1499  
1465   - handle_callback(gl, state, 1);
  1500 + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
  1501 + if (time_before(now, holdtime))
  1502 + delay = holdtime - now;
1466 1503  
1467   - spin_lock(&gl->gl_spin);
1468   - run_queue(gl);
1469   - spin_unlock(&gl->gl_spin);
1470   -
1471   - gfs2_glock_put(gl);
  1504 + handle_callback(gl, state, 1, delay);
  1505 + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
  1506 + gfs2_glock_put(gl);
1472 1507 }
1473 1508  
1474 1509 /**
... ... @@ -1509,7 +1544,8 @@
1509 1544 return;
1510 1545 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1511 1546 gl->gl_req_bh(gl, async->lc_ret);
1512   - gfs2_glock_put(gl);
  1547 + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
  1548 + gfs2_glock_put(gl);
1513 1549 up_read(&gfs2_umount_flush_sem);
1514 1550 return;
1515 1551 }
... ... @@ -1602,7 +1638,7 @@
1602 1638 if (gfs2_glmutex_trylock(gl)) {
1603 1639 if (list_empty(&gl->gl_holders) &&
1604 1640 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1605   - handle_callback(gl, LM_ST_UNLOCKED, 0);
  1641 + handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1606 1642 gfs2_glmutex_unlock(gl);
1607 1643 }
1608 1644  
... ... @@ -1702,7 +1738,7 @@
1702 1738 if (gfs2_glmutex_trylock(gl)) {
1703 1739 if (list_empty(&gl->gl_holders) &&
1704 1740 gl->gl_state != LM_ST_UNLOCKED)
1705   - handle_callback(gl, LM_ST_UNLOCKED, 0);
  1741 + handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1706 1742 gfs2_glmutex_unlock(gl);
1707 1743 }
1708 1744 }
1709 1745  
... ... @@ -2009,11 +2045,18 @@
2009 2045 if (IS_ERR(scand_process))
2010 2046 return PTR_ERR(scand_process);
2011 2047  
  2048 + glock_workqueue = create_workqueue("glock_workqueue");
  2049 + if (IS_ERR(glock_workqueue)) {
  2050 + kthread_stop(scand_process);
  2051 + return PTR_ERR(glock_workqueue);
  2052 + }
  2053 +
2012 2054 return 0;
2013 2055 }
2014 2056  
2015 2057 void gfs2_glock_exit(void)
2016 2058 {
  2059 + destroy_workqueue(glock_workqueue);
2017 2060 kthread_stop(scand_process);
2018 2061 }
2019 2062  
... ... @@ -454,6 +454,7 @@
454 454 .go_lock = inode_go_lock,
455 455 .go_unlock = inode_go_unlock,
456 456 .go_type = LM_TYPE_INODE,
  457 + .go_min_hold_time = HZ / 10,
457 458 };
458 459  
459 460 const struct gfs2_glock_operations gfs2_rgrp_glops = {
... ... @@ -464,6 +465,7 @@
464 465 .go_lock = rgrp_go_lock,
465 466 .go_unlock = rgrp_go_unlock,
466 467 .go_type = LM_TYPE_RGRP,
  468 + .go_min_hold_time = HZ / 10,
467 469 };
468 470  
469 471 const struct gfs2_glock_operations gfs2_trans_glops = {
... ... @@ -11,6 +11,7 @@
11 11 #define __INCORE_DOT_H__
12 12  
13 13 #include <linux/fs.h>
  14 +#include <linux/workqueue.h>
14 15  
15 16 #define DIO_WAIT 0x00000010
16 17 #define DIO_METADATA 0x00000020
... ... @@ -130,6 +131,7 @@
130 131 int (*go_lock) (struct gfs2_holder *gh);
131 132 void (*go_unlock) (struct gfs2_holder *gh);
132 133 const int go_type;
  134 + const unsigned long go_min_hold_time;
133 135 };
134 136  
135 137 enum {
... ... @@ -161,6 +163,7 @@
161 163 GLF_LOCK = 1,
162 164 GLF_STICKY = 2,
163 165 GLF_DEMOTE = 3,
  166 + GLF_PENDING_DEMOTE = 4,
164 167 GLF_DIRTY = 5,
165 168 };
166 169  
... ... @@ -193,6 +196,7 @@
193 196  
194 197 u64 gl_vn;
195 198 unsigned long gl_stamp;
  199 + unsigned long gl_tchange;
196 200 void *gl_object;
197 201  
198 202 struct list_head gl_reclaim;
... ... @@ -203,6 +207,7 @@
203 207 struct gfs2_log_element gl_le;
204 208 struct list_head gl_ail_list;
205 209 atomic_t gl_ail_count;
  210 + struct delayed_work gl_work;
206 211 };
207 212  
208 213 struct gfs2_alloc {