Commit aec71382c68135261ef6efc3d8a96b7149939446

Authored by Chao Yu
Committed by Jaegeuk Kim
1 parent a014e037be

f2fs: refactor flush_nat_entries codes for reducing NAT writes

Although building the NAT journal in cursum reduces the read/write work for
the NAT block, the previous design gives lower performance when checkpoints
are written frequently, in the following cases (a condensed sketch of the old
policy follows the list):
1. if the journal in cursum is already full, it is wasteful to flush all nat
   entries to pages for persistence while caching none of them.
2. if the journal in cursum is not full, we fill it with nat entries until it
   is full, then flush the remaining dirty entries to disk without merging the
   journaled entries, so those journaled entries may be flushed to disk at the
   next checkpoint, having lost the chance to be flushed last time.
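
A minimal, illustrative model of the old policy in plain C (the struct and
the constant value here are stand-ins, not the kernel's types; the real logic
is the flush_nats_in_journal() code removed in the diff below):

#include <stdbool.h>

#define NAT_JOURNAL_ENTRIES 38	/* stand-in value; the real one is format-defined */

struct journal { unsigned nr_nats; };	/* models nats_in_cursum(sum) */

/*
 * Old policy, all-or-nothing: while the journal has room, entries are only
 * journaled and nothing is merged (case 2); once it is full, every journaled
 * nat is pushed back to the dirty cache and all dirty nats are written out
 * to NAT pages, caching nothing (case 1).
 */
static bool journal_must_spill(const struct journal *j)
{
	return j->nr_nats >= NAT_JOURNAL_ENTRIES;
}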

In this patch we merge dirty entries located in the same NAT block into a nat
entry set, and link all sets into a list sorted in ascending order by each
set's entry count. Later we flush the entries of the sparse sets into the
journal, as many as it can hold, and then flush the merged entries to disk.
In this way we not only gain in performance, but also extend the lifetime of
the flash device.
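
A minimal model of the new flush policy in plain C (illustrative only;
write_nat_page() and the types are stand-ins for the real kernel helpers
shown in the diff below):

#include <stddef.h>
#include <stdbool.h>

#define NAT_JOURNAL_ENTRIES 38	/* stand-in value; the real one is format-defined */

struct nat_set {
	unsigned start_nid;	/* first nid of the NAT block this set covers */
	unsigned entry_cnt;	/* dirty nat entries merged into this set */
};

static void write_nat_page(unsigned start_nid)
{
	(void)start_nid;	/* stub standing in for the real NAT page write */
}

/*
 * 'sets' is sorted by ascending entry_cnt: sparse sets are journaled first,
 * since each costs only a few journal slots; once the journal cannot take a
 * whole set we switch (and stay) on the page path, where one NAT block write
 * now covers many merged entries.
 */
static void flush_sets(struct nat_set *sets, size_t nr_sets,
			unsigned journal_used)
{
	bool to_journal = true;
	size_t i;

	for (i = 0; i < nr_sets; i++) {
		if (to_journal &&
		    journal_used + sets[i].entry_cnt > NAT_JOURNAL_ENTRIES)
			to_journal = false;
		if (to_journal)
			journal_used += sets[i].entry_cnt;
		else
			write_nat_page(sets[i].start_nid);
	}
}

Sorting in ascending order lets the journal absorb the largest possible
number of whole sets, which is what saves the NAT block writes measured
below.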

In my testing environment, this patch clearly reduces the number of NAT block
writes. In the hard disk test case, the elapsed time of fsstress is
consistently reduced by about 5%.

1. virtual machine + hard disk:
fsstress -p 20 -n 200 -l 5
		node num	cp count	nodes/cp
based		4599.6		1803.0		2.551
patched		2714.6		1829.6		1.483

2. virtual machine + 32g micro SD card:
fsstress -p 20 -n 200 -l 1 -w -f chown=0 -f creat=4 -f dwrite=0
-f fdatasync=4 -f fsync=4 -f link=0 -f mkdir=4 -f mknod=4 -f rename=5
-f rmdir=5 -f symlink=0 -f truncate=4 -f unlink=5 -f write=0 -S

		node num	cp count	nodes/cp
based		84.5		43.7		1.933
patched		49.2		40.0		1.23

The latency of the merge operation holds up well even in extreme cases, such
as merging a great number of dirty nats:
latency(ns)	dirty nat count
3089219		24922
5129423		27422
4000250		24523

change log from v1:
 o fix wrong logic in add_nat_entry when grabbing a new nat entry set.
 o switch to creating the slab cache in create_node_manager_caches.
 o use GFP_ATOMIC instead of GFP_NOFS to avoid potential long latency.

change log from v2:
 o move comments to more appropriate positions, as suggested by Jaegeuk Kim.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

Showing 3 changed files with 188 additions and 84 deletions

... ... @@ -256,6 +256,8 @@
256 256 unsigned int nat_cnt; /* the # of cached nat entries */
257 257 struct list_head nat_entries; /* cached nat entry list (clean) */
258 258 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
  259 + struct list_head nat_entry_set; /* nat entry set list */
  260 + unsigned int dirty_nat_cnt; /* total num of nat entries in set */
259 261  
260 262 /* free node ids management */
261 263 struct radix_tree_root free_nid_root;/* root of the free_nid cache */
... ... @@ -25,6 +25,7 @@
25 25  
26 26 static struct kmem_cache *nat_entry_slab;
27 27 static struct kmem_cache *free_nid_slab;
  28 +static struct kmem_cache *nat_entry_set_slab;
28 29  
29 30 bool available_free_memory(struct f2fs_sb_info *sbi, int type)
30 31 {
... ... @@ -90,12 +91,8 @@
90 91  
91 92 /* get current nat block page with lock */
92 93 src_page = get_meta_page(sbi, src_off);
93   -
94   - /* Dirty src_page means that it is already the new target NAT page. */
95   - if (PageDirty(src_page))
96   - return src_page;
97   -
98 94 dst_page = grab_meta_page(sbi, dst_off);
  95 + f2fs_bug_on(PageDirty(src_page));
99 96  
100 97 src_addr = page_address(src_page);
101 98 dst_addr = page_address(dst_page);
... ... @@ -1744,20 +1741,97 @@
1744 1741 return err;
1745 1742 }
1746 1743  
1747   -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
  1744 +static struct nat_entry_set *grab_nat_entry_set(void)
1748 1745 {
  1746 + struct nat_entry_set *nes =
  1747 + f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
  1748 +
  1749 + nes->entry_cnt = 0;
  1750 + INIT_LIST_HEAD(&nes->set_list);
  1751 + INIT_LIST_HEAD(&nes->entry_list);
  1752 + return nes;
  1753 +}
  1754 +
  1755 +static void release_nat_entry_set(struct nat_entry_set *nes,
  1756 + struct f2fs_nm_info *nm_i)
  1757 +{
  1758 + f2fs_bug_on(!list_empty(&nes->entry_list));
  1759 +
  1760 + nm_i->dirty_nat_cnt -= nes->entry_cnt;
  1761 + list_del(&nes->set_list);
  1762 + kmem_cache_free(nat_entry_set_slab, nes);
  1763 +}
  1764 +
  1765 +static void adjust_nat_entry_set(struct nat_entry_set *nes,
  1766 + struct list_head *head)
  1767 +{
  1768 + struct nat_entry_set *next = nes;
  1769 +
  1770 + if (list_is_last(&nes->set_list, head))
  1771 + return;
  1772 +
  1773 + list_for_each_entry_continue(next, head, set_list)
  1774 + if (nes->entry_cnt <= next->entry_cnt)
  1775 + break;
  1776 +
  1777 + list_move_tail(&nes->set_list, &next->set_list);
  1778 +}
  1779 +
  1780 +static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
  1781 +{
  1782 + struct nat_entry_set *nes;
  1783 + nid_t start_nid = START_NID(ne->ni.nid);
  1784 +
  1785 + list_for_each_entry(nes, head, set_list) {
  1786 + if (nes->start_nid == start_nid) {
  1787 + list_move_tail(&ne->list, &nes->entry_list);
  1788 + nes->entry_cnt++;
  1789 + adjust_nat_entry_set(nes, head);
  1790 + return;
  1791 + }
  1792 + }
  1793 +
  1794 + nes = grab_nat_entry_set();
  1795 +
  1796 + nes->start_nid = start_nid;
  1797 + list_move_tail(&ne->list, &nes->entry_list);
  1798 + nes->entry_cnt++;
  1799 + list_add(&nes->set_list, head);
  1800 +}
  1801 +
  1802 +static void merge_nats_in_set(struct f2fs_sb_info *sbi)
  1803 +{
1749 1804 struct f2fs_nm_info *nm_i = NM_I(sbi);
  1805 + struct list_head *dirty_list = &nm_i->dirty_nat_entries;
  1806 + struct list_head *set_list = &nm_i->nat_entry_set;
  1807 + struct nat_entry *ne, *tmp;
  1808 +
  1809 + write_lock(&nm_i->nat_tree_lock);
  1810 + list_for_each_entry_safe(ne, tmp, dirty_list, list) {
  1811 + if (nat_get_blkaddr(ne) == NEW_ADDR)
  1812 + continue;
  1813 + add_nat_entry(ne, set_list);
  1814 + nm_i->dirty_nat_cnt++;
  1815 + }
  1816 + write_unlock(&nm_i->nat_tree_lock);
  1817 +}
  1818 +
  1819 +static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
  1820 +{
  1821 + if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
  1822 + return true;
  1823 + else
  1824 + return false;
  1825 +}
  1826 +
  1827 +static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
  1828 +{
  1829 + struct f2fs_nm_info *nm_i = NM_I(sbi);
1750 1830 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1751 1831 struct f2fs_summary_block *sum = curseg->sum_blk;
1752 1832 int i;
1753 1833  
1754 1834 mutex_lock(&curseg->curseg_mutex);
1755   -
1756   - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1757   - mutex_unlock(&curseg->curseg_mutex);
1758   - return false;
1759   - }
1760   -
1761 1835 for (i = 0; i < nats_in_cursum(sum); i++) {
1762 1836 struct nat_entry *ne;
1763 1837 struct f2fs_nat_entry raw_ne;
... ... @@ -1767,23 +1841,21 @@
1767 1841 retry:
1768 1842 write_lock(&nm_i->nat_tree_lock);
1769 1843 ne = __lookup_nat_cache(nm_i, nid);
1770   - if (ne) {
1771   - __set_nat_cache_dirty(nm_i, ne);
1772   - write_unlock(&nm_i->nat_tree_lock);
1773   - continue;
1774   - }
  1844 + if (ne)
  1845 + goto found;
  1846 +
1775 1847 ne = grab_nat_entry(nm_i, nid);
1776 1848 if (!ne) {
1777 1849 write_unlock(&nm_i->nat_tree_lock);
1778 1850 goto retry;
1779 1851 }
1780 1852 node_info_from_raw_nat(&ne->ni, &raw_ne);
  1853 +found:
1781 1854 __set_nat_cache_dirty(nm_i, ne);
1782 1855 write_unlock(&nm_i->nat_tree_lock);
1783 1856 }
1784 1857 update_nats_in_cursum(sum, -i);
1785 1858 mutex_unlock(&curseg->curseg_mutex);
1786   - return true;
1787 1859 }
1788 1860  
1789 1861 /*
1790 1862  
1791 1863  
1792 1864  
1793 1865  
1794 1866  
1795 1867  
1796 1868  
1797 1869  
1798 1870  
1799 1871  
1800 1872  
1801 1873  
1802 1874  
1803 1875  
... ... @@ -1794,80 +1866,91 @@
1794 1866 struct f2fs_nm_info *nm_i = NM_I(sbi);
1795 1867 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1796 1868 struct f2fs_summary_block *sum = curseg->sum_blk;
1797   - struct nat_entry *ne, *cur;
1798   - struct page *page = NULL;
1799   - struct f2fs_nat_block *nat_blk = NULL;
1800   - nid_t start_nid = 0, end_nid = 0;
1801   - bool flushed;
  1869 + struct nat_entry_set *nes, *tmp;
  1870 + struct list_head *head = &nm_i->nat_entry_set;
  1871 + bool to_journal = true;
1802 1872  
1803   - flushed = flush_nats_in_journal(sbi);
  1873 + /* merge nat entries of dirty list to nat entry set temporarily */
  1874 + merge_nats_in_set(sbi);
1804 1875  
1805   - if (!flushed)
1806   - mutex_lock(&curseg->curseg_mutex);
  1876 + /*
  1877 + * if there are no enough space in journal to store dirty nat
  1878 + * entries, remove all entries from journal and merge them
  1879 + * into nat entry set.
  1880 + */
  1881 + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
  1882 + remove_nats_in_journal(sbi);
1807 1883  
1808   - /* 1) flush dirty nat caches */
1809   - list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
1810   - nid_t nid;
1811   - struct f2fs_nat_entry raw_ne;
1812   - int offset = -1;
  1884 + /*
  1885 + * merge nat entries of dirty list to nat entry set temporarily
  1886 + */
  1887 + merge_nats_in_set(sbi);
  1888 + }
1813 1889  
1814   - if (nat_get_blkaddr(ne) == NEW_ADDR)
1815   - continue;
  1890 + if (!nm_i->dirty_nat_cnt)
  1891 + return;
1816 1892  
1817   - nid = nat_get_nid(ne);
  1893 + /*
  1894 + * there are two steps to flush nat entries:
  1895 + * #1, flush nat entries to journal in current hot data summary block.
  1896 + * #2, flush nat entries to nat page.
  1897 + */
  1898 + list_for_each_entry_safe(nes, tmp, head, set_list) {
  1899 + struct f2fs_nat_block *nat_blk;
  1900 + struct nat_entry *ne, *cur;
  1901 + struct page *page;
  1902 + nid_t start_nid = nes->start_nid;
1818 1903  
1819   - if (flushed)
1820   - goto to_nat_page;
  1904 + if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
  1905 + to_journal = false;
1821 1906  
1822   - /* if there is room for nat enries in curseg->sumpage */
1823   - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
1824   - if (offset >= 0) {
1825   - raw_ne = nat_in_journal(sum, offset);
1826   - goto flush_now;
1827   - }
1828   -to_nat_page:
1829   - if (!page || (start_nid > nid || nid > end_nid)) {
1830   - if (page) {
1831   - f2fs_put_page(page, 1);
1832   - page = NULL;
1833   - }
1834   - start_nid = START_NID(nid);
1835   - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1836   -
1837   - /*
1838   - * get nat block with dirty flag, increased reference
1839   - * count, mapped and lock
1840   - */
  1907 + if (to_journal) {
  1908 + mutex_lock(&curseg->curseg_mutex);
  1909 + } else {
1841 1910 page = get_next_nat_page(sbi, start_nid);
1842 1911 nat_blk = page_address(page);
  1912 + f2fs_bug_on(!nat_blk);
1843 1913 }
1844 1914  
1845   - f2fs_bug_on(!nat_blk);
1846   - raw_ne = nat_blk->entries[nid - start_nid];
1847   -flush_now:
1848   - raw_nat_from_node_info(&raw_ne, &ne->ni);
  1915 + /* flush dirty nats in nat entry set */
  1916 + list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
  1917 + struct f2fs_nat_entry *raw_ne;
  1918 + nid_t nid = nat_get_nid(ne);
  1919 + int offset;
1849 1920  
1850   - if (offset < 0) {
1851   - nat_blk->entries[nid - start_nid] = raw_ne;
1852   - } else {
1853   - nat_in_journal(sum, offset) = raw_ne;
1854   - nid_in_journal(sum, offset) = cpu_to_le32(nid);
1855   - }
  1921 + if (to_journal) {
  1922 + offset = lookup_journal_in_cursum(sum,
  1923 + NAT_JOURNAL, nid, 1);
  1924 + f2fs_bug_on(offset < 0);
  1925 + raw_ne = &nat_in_journal(sum, offset);
  1926 + nid_in_journal(sum, offset) = cpu_to_le32(nid);
  1927 + } else {
  1928 + raw_ne = &nat_blk->entries[nid - start_nid];
  1929 + }
  1930 + raw_nat_from_node_info(raw_ne, &ne->ni);
1856 1931  
1857   - if (nat_get_blkaddr(ne) == NULL_ADDR &&
  1932 + if (nat_get_blkaddr(ne) == NULL_ADDR &&
1858 1933 add_free_nid(sbi, nid, false) <= 0) {
1859   - write_lock(&nm_i->nat_tree_lock);
1860   - __del_from_nat_cache(nm_i, ne);
1861   - write_unlock(&nm_i->nat_tree_lock);
1862   - } else {
1863   - write_lock(&nm_i->nat_tree_lock);
1864   - __clear_nat_cache_dirty(nm_i, ne);
1865   - write_unlock(&nm_i->nat_tree_lock);
  1934 + write_lock(&nm_i->nat_tree_lock);
  1935 + __del_from_nat_cache(nm_i, ne);
  1936 + write_unlock(&nm_i->nat_tree_lock);
  1937 + } else {
  1938 + write_lock(&nm_i->nat_tree_lock);
  1939 + __clear_nat_cache_dirty(nm_i, ne);
  1940 + write_unlock(&nm_i->nat_tree_lock);
  1941 + }
1866 1942 }
  1943 +
  1944 + if (to_journal)
  1945 + mutex_unlock(&curseg->curseg_mutex);
  1946 + else
  1947 + f2fs_put_page(page, 1);
  1948 +
  1949 + release_nat_entry_set(nes, nm_i);
1867 1950 }
1868   - if (!flushed)
1869   - mutex_unlock(&curseg->curseg_mutex);
1870   - f2fs_put_page(page, 1);
  1951 +
  1952 + f2fs_bug_on(!list_empty(head));
  1953 + f2fs_bug_on(nm_i->dirty_nat_cnt);
1871 1954 }
1872 1955  
1873 1956 static int init_node_manager(struct f2fs_sb_info *sbi)
... ... @@ -1896,6 +1979,7 @@
1896 1979 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1897 1980 INIT_LIST_HEAD(&nm_i->nat_entries);
1898 1981 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
  1982 + INIT_LIST_HEAD(&nm_i->nat_entry_set);
1899 1983  
1900 1984 mutex_init(&nm_i->build_lock);
1901 1985 spin_lock_init(&nm_i->free_nid_list_lock);
... ... @@ -1976,19 +2060,30 @@
1976 2060 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1977 2061 sizeof(struct nat_entry));
1978 2062 if (!nat_entry_slab)
1979   - return -ENOMEM;
  2063 + goto fail;
1980 2064  
1981 2065 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1982 2066 sizeof(struct free_nid));
1983   - if (!free_nid_slab) {
1984   - kmem_cache_destroy(nat_entry_slab);
1985   - return -ENOMEM;
1986   - }
  2067 + if (!free_nid_slab)
  2068 + goto destory_nat_entry;
  2069 +
  2070 + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
  2071 + sizeof(struct nat_entry_set));
  2072 + if (!nat_entry_set_slab)
  2073 + goto destory_free_nid;
1987 2074 return 0;
  2075 +
  2076 +destory_free_nid:
  2077 + kmem_cache_destroy(free_nid_slab);
  2078 +destory_nat_entry:
  2079 + kmem_cache_destroy(nat_entry_slab);
  2080 +fail:
  2081 + return -ENOMEM;
1988 2082 }
1989 2083  
1990 2084 void destroy_node_manager_caches(void)
1991 2085 {
  2086 + kmem_cache_destroy(nat_entry_set_slab);
1992 2087 kmem_cache_destroy(free_nid_slab);
1993 2088 kmem_cache_destroy(nat_entry_slab);
1994 2089 }
... ... @@ -89,6 +89,13 @@
89 89 DIRTY_DENTS /* indicates dirty dentry pages */
90 90 };
91 91  
  92 +struct nat_entry_set {
  93 + struct list_head set_list; /* link with all nat sets */
  94 + struct list_head entry_list; /* link with dirty nat entries */
  95 + nid_t start_nid; /* start nid of nats in set */
  96 + unsigned int entry_cnt; /* the # of nat entries in set */
  97 +};
  98 +
92 99 /*
93 100 * For free nid mangement
94 101 */