Commit 4e7ea81db53465ddd753678bc4cebf95369d0984

Authored by Jan Kara
Committed by Theodore Ts'o
1 parent fffb273997

ext4: restructure writeback path

There are two issues with the current writeback path in ext4.  First, when
blocksize < pagesize we don't necessarily map complete pages, so a single
iteration may end up doing no writeback at all.  We always map at least
some blocks, so we will eventually finish mapping the page, but if
writeback races with other operations on the file, forward progress is not
really guaranteed.  Second, the current code structure makes it hard to
associate all the bios for a range of pages with one io_end structure so
that unwritten extents can be converted once all those bios have finished.
This will become especially difficult later, when the io_end is associated
with a reserved transaction handle.

We restructure the writeback path into a relatively simple loop which first
prepares an extent of pages, then maps one or more extents so that no page
is left partially mapped, and submits each page for IO once it is fully
mapped.  All of the mapping and IO submission state is kept in the
mpage_da_data structure to somewhat reduce stack usage.  The resulting code
is somewhat shorter than the old one and hopefully also easier to read.

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
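
For orientation, the following is a condensed, illustrative sketch of the restructured ext4_da_writepages() loop described above, assembled from the inode.c hunks in this diff; error handling, the -ENOSPC journal-commit retry, the range_cyclic restart, and the inline-data case are omitted, so treat it as a reading aid rather than the literal patch.

	/*
	 * Condensed sketch of the new writeback loop: one extent of pages and
	 * one io_end per iteration, so unwritten extent conversion can run
	 * after all the bios attached to that io_end have completed.
	 */
	while (!done && mpd.first_page <= mpd.last_page) {
		/* Each extent of pages gets its own io_end. */
		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
		if (!mpd.io_submit.io_end)
			break;				/* -ENOMEM */

		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);

		/* 1) Find and lock dirty pages, collect the extent to map. */
		ret = mpage_prepare_extent_to_map(&mpd);
		if (!ret) {
			if (mpd.map.m_len)
				/* 2) Map it; no page is left partially mapped. */
				ret = mpage_map_and_submit_extent(handle, &mpd);
			else
				done = true;		/* nothing left to map */
		}
		ext4_journal_stop(handle);

		/* 3) Submit the prepared bio and unlock unused pages. */
		ext4_io_submit(&mpd.io_submit);
		mpage_release_unused_pages(&mpd, false);
		ext4_put_io_end(mpd.io_submit.io_end);
	}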

Showing 4 changed files with 527 additions and 567 deletions

fs/ext4/ext4.h
... ... @@ -177,21 +177,6 @@
177 177 };
178 178  
179 179 /*
180   - * For delayed allocation tracking
181   - */
182   -struct mpage_da_data {
183   - struct inode *inode;
184   - sector_t b_blocknr; /* start block number of extent */
185   - size_t b_size; /* size of extent */
186   - unsigned long b_state; /* state of the extent */
187   - unsigned long first_page, next_page; /* extent of pages */
188   - struct writeback_control *wbc;
189   - int io_done;
190   - int pages_written;
191   - int retval;
192   -};
193   -
194   -/*
195 180 * Flags for ext4_io_end->flags
196 181 */
197 182 #define EXT4_IO_END_UNWRITTEN 0x0001
(some changes suppressed in this view)
fs/ext4/inode.c
... ... @@ -1407,150 +1407,43 @@
1407 1407 * Delayed allocation stuff
1408 1408 */
1409 1409  
1410   -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
1411   -
1412   -/*
1413   - * mpage_da_submit_io - walks through extent of pages and try to write
1414   - * them with writepage() call back
1415   - *
1416   - * @mpd->inode: inode
1417   - * @mpd->first_page: first page of the extent
1418   - * @mpd->next_page: page after the last page of the extent
1419   - *
1420   - * By the time mpage_da_submit_io() is called we expect all blocks
1421   - * to be allocated. this may be wrong if allocation failed.
1422   - *
1423   - * As pages are already locked by write_cache_pages(), we can't use it
1424   - */
1425   -static int mpage_da_submit_io(struct mpage_da_data *mpd,
1426   - struct ext4_map_blocks *map)
1427   -{
1428   - struct pagevec pvec;
1429   - unsigned long index, end;
1430   - int ret = 0, err, nr_pages, i;
1431   - struct inode *inode = mpd->inode;
1432   - struct address_space *mapping = inode->i_mapping;
1433   - loff_t size = i_size_read(inode);
1434   - unsigned int len, block_start;
1435   - struct buffer_head *bh, *page_bufs = NULL;
1436   - sector_t pblock = 0, cur_logical = 0;
1437   - struct ext4_io_submit io_submit;
1438   -
1439   - BUG_ON(mpd->next_page <= mpd->first_page);
1440   - ext4_io_submit_init(&io_submit, mpd->wbc);
1441   - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1442   - if (!io_submit.io_end) {
1443   - ext4_da_block_invalidatepages(mpd);
1444   - return -ENOMEM;
1445   - }
  1410 +struct mpage_da_data {
  1411 + struct inode *inode;
  1412 + struct writeback_control *wbc;
  1413 + pgoff_t first_page; /* The first page to write */
  1414 + pgoff_t next_page; /* Current page to examine */
  1415 + pgoff_t last_page; /* Last page to examine */
1446 1416 /*
1447   - * We need to start from the first_page to the next_page - 1
1448   - * to make sure we also write the mapped dirty buffer_heads.
1449   - * If we look at mpd->b_blocknr we would only be looking
1450   - * at the currently mapped buffer_heads.
  1417 + * Extent to map - this can be after first_page because that can be
  1418 + * fully mapped. We somewhat abuse m_flags to store whether the extent
  1419 + * is delalloc or unwritten.
1451 1420 */
1452   - index = mpd->first_page;
1453   - end = mpd->next_page - 1;
  1421 + struct ext4_map_blocks map;
  1422 + struct ext4_io_submit io_submit; /* IO submission data */
  1423 +};
1454 1424  
1455   - pagevec_init(&pvec, 0);
1456   - while (index <= end) {
1457   - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1458   - if (nr_pages == 0)
1459   - break;
1460   - for (i = 0; i < nr_pages; i++) {
1461   - int skip_page = 0;
1462   - struct page *page = pvec.pages[i];
1463   -
1464   - index = page->index;
1465   - if (index > end)
1466   - break;
1467   -
1468   - if (index == size >> PAGE_CACHE_SHIFT)
1469   - len = size & ~PAGE_CACHE_MASK;
1470   - else
1471   - len = PAGE_CACHE_SIZE;
1472   - if (map) {
1473   - cur_logical = index << (PAGE_CACHE_SHIFT -
1474   - inode->i_blkbits);
1475   - pblock = map->m_pblk + (cur_logical -
1476   - map->m_lblk);
1477   - }
1478   - index++;
1479   -
1480   - BUG_ON(!PageLocked(page));
1481   - BUG_ON(PageWriteback(page));
1482   -
1483   - bh = page_bufs = page_buffers(page);
1484   - block_start = 0;
1485   - do {
1486   - if (map && (cur_logical >= map->m_lblk) &&
1487   - (cur_logical <= (map->m_lblk +
1488   - (map->m_len - 1)))) {
1489   - if (buffer_delay(bh)) {
1490   - clear_buffer_delay(bh);
1491   - bh->b_blocknr = pblock;
1492   - }
1493   - if (buffer_unwritten(bh) ||
1494   - buffer_mapped(bh))
1495   - BUG_ON(bh->b_blocknr != pblock);
1496   - if (map->m_flags & EXT4_MAP_UNINIT)
1497   - set_buffer_uninit(bh);
1498   - clear_buffer_unwritten(bh);
1499   - }
1500   -
1501   - /*
1502   - * skip page if block allocation undone and
1503   - * block is dirty
1504   - */
1505   - if (ext4_bh_delay_or_unwritten(NULL, bh))
1506   - skip_page = 1;
1507   - bh = bh->b_this_page;
1508   - block_start += bh->b_size;
1509   - cur_logical++;
1510   - pblock++;
1511   - } while (bh != page_bufs);
1512   -
1513   - if (skip_page) {
1514   - unlock_page(page);
1515   - continue;
1516   - }
1517   -
1518   - clear_page_dirty_for_io(page);
1519   - err = ext4_bio_write_page(&io_submit, page, len,
1520   - mpd->wbc);
1521   - if (!err)
1522   - mpd->pages_written++;
1523   - /*
1524   - * In error case, we have to continue because
1525   - * remaining pages are still locked
1526   - */
1527   - if (ret == 0)
1528   - ret = err;
1529   - }
1530   - pagevec_release(&pvec);
1531   - }
1532   - ext4_io_submit(&io_submit);
1533   - /* Drop io_end reference we got from init */
1534   - ext4_put_io_end_defer(io_submit.io_end);
1535   - return ret;
1536   -}
1537   -
1538   -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
  1425 +static void mpage_release_unused_pages(struct mpage_da_data *mpd,
  1426 + bool invalidate)
1539 1427 {
1540 1428 int nr_pages, i;
1541 1429 pgoff_t index, end;
1542 1430 struct pagevec pvec;
1543 1431 struct inode *inode = mpd->inode;
1544 1432 struct address_space *mapping = inode->i_mapping;
1545   - ext4_lblk_t start, last;
1546 1433  
  1434 + /* This is necessary when next_page == 0. */
  1435 + if (mpd->first_page >= mpd->next_page)
  1436 + return;
  1437 +
1547 1438 index = mpd->first_page;
1548 1439 end = mpd->next_page - 1;
  1440 + if (invalidate) {
  1441 + ext4_lblk_t start, last;
  1442 + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
  1443 + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
  1444 + ext4_es_remove_extent(inode, start, last - start + 1);
  1445 + }
1549 1446  
1550   - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1551   - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1552   - ext4_es_remove_extent(inode, start, last - start + 1);
1553   -
1554 1447 pagevec_init(&pvec, 0);
1555 1448 while (index <= end) {
1556 1449 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1557 1450  
... ... @@ -1562,14 +1455,15 @@
1562 1455 break;
1563 1456 BUG_ON(!PageLocked(page));
1564 1457 BUG_ON(PageWriteback(page));
1565   - block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1566   - ClearPageUptodate(page);
  1458 + if (invalidate) {
  1459 + block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
  1460 + ClearPageUptodate(page);
  1461 + }
1567 1462 unlock_page(page);
1568 1463 }
1569 1464 index = pvec.pages[nr_pages - 1]->index + 1;
1570 1465 pagevec_release(&pvec);
1571 1466 }
1572   - return;
1573 1467 }
1574 1468  
1575 1469 static void ext4_print_free_blocks(struct inode *inode)
... ... @@ -1598,215 +1492,6 @@
1598 1492 return;
1599 1493 }
1600 1494  
1601   -/*
1602   - * mpage_da_map_and_submit - go through given space, map them
1603   - * if necessary, and then submit them for I/O
1604   - *
1605   - * @mpd - bh describing space
1606   - *
1607   - * The function skips space we know is already mapped to disk blocks.
1608   - *
1609   - */
1610   -static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1611   -{
1612   - int err, blks, get_blocks_flags;
1613   - struct ext4_map_blocks map, *mapp = NULL;
1614   - sector_t next = mpd->b_blocknr;
1615   - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1616   - loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1617   - handle_t *handle = NULL;
1618   -
1619   - /*
1620   - * If the blocks are mapped already, or we couldn't accumulate
1621   - * any blocks, then proceed immediately to the submission stage.
1622   - */
1623   - if ((mpd->b_size == 0) ||
1624   - ((mpd->b_state & (1 << BH_Mapped)) &&
1625   - !(mpd->b_state & (1 << BH_Delay)) &&
1626   - !(mpd->b_state & (1 << BH_Unwritten))))
1627   - goto submit_io;
1628   -
1629   - handle = ext4_journal_current_handle();
1630   - BUG_ON(!handle);
1631   -
1632   - /*
1633   - * Call ext4_map_blocks() to allocate any delayed allocation
1634   - * blocks, or to convert an uninitialized extent to be
1635   - * initialized (in the case where we have written into
1636   - * one or more preallocated blocks).
1637   - *
1638   - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1639   - * indicate that we are on the delayed allocation path. This
1640   - * affects functions in many different parts of the allocation
1641   - * call path. This flag exists primarily because we don't
1642   - * want to change *many* call functions, so ext4_map_blocks()
1643   - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1644   - * inode's allocation semaphore is taken.
1645   - *
1646   - * If the blocks in questions were delalloc blocks, set
1647   - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1648   - * variables are updated after the blocks have been allocated.
1649   - */
1650   - map.m_lblk = next;
1651   - map.m_len = max_blocks;
1652   - /*
1653   - * We're in delalloc path and it is possible that we're going to
1654   - * need more metadata blocks than previously reserved. However
1655   - * we must not fail because we're in writeback and there is
1656   - * nothing we can do about it so it might result in data loss.
1657   - * So use reserved blocks to allocate metadata if possible.
1658   - */
1659   - get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1660   - EXT4_GET_BLOCKS_METADATA_NOFAIL;
1661   - if (ext4_should_dioread_nolock(mpd->inode))
1662   - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1663   - if (mpd->b_state & (1 << BH_Delay))
1664   - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1665   -
1666   -
1667   - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1668   - if (blks < 0) {
1669   - struct super_block *sb = mpd->inode->i_sb;
1670   -
1671   - err = blks;
1672   - /*
1673   - * If get block returns EAGAIN or ENOSPC and there
1674   - * appears to be free blocks we will just let
1675   - * mpage_da_submit_io() unlock all of the pages.
1676   - */
1677   - if (err == -EAGAIN)
1678   - goto submit_io;
1679   -
1680   - if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1681   - mpd->retval = err;
1682   - goto submit_io;
1683   - }
1684   -
1685   - /*
1686   - * get block failure will cause us to loop in
1687   - * writepages, because a_ops->writepage won't be able
1688   - * to make progress. The page will be redirtied by
1689   - * writepage and writepages will again try to write
1690   - * the same.
1691   - */
1692   - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1693   - ext4_msg(sb, KERN_CRIT,
1694   - "delayed block allocation failed for inode %lu "
1695   - "at logical offset %llu with max blocks %zd "
1696   - "with error %d", mpd->inode->i_ino,
1697   - (unsigned long long) next,
1698   - mpd->b_size >> mpd->inode->i_blkbits, err);
1699   - ext4_msg(sb, KERN_CRIT,
1700   - "This should not happen!! Data will be lost");
1701   - if (err == -ENOSPC)
1702   - ext4_print_free_blocks(mpd->inode);
1703   - }
1704   - /* invalidate all the pages */
1705   - ext4_da_block_invalidatepages(mpd);
1706   -
1707   - /* Mark this page range as having been completed */
1708   - mpd->io_done = 1;
1709   - return;
1710   - }
1711   - BUG_ON(blks == 0);
1712   -
1713   - mapp = &map;
1714   - if (map.m_flags & EXT4_MAP_NEW) {
1715   - struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1716   - int i;
1717   -
1718   - for (i = 0; i < map.m_len; i++)
1719   - unmap_underlying_metadata(bdev, map.m_pblk + i);
1720   - }
1721   -
1722   - /*
1723   - * Update on-disk size along with block allocation.
1724   - */
1725   - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1726   - if (disksize > i_size_read(mpd->inode))
1727   - disksize = i_size_read(mpd->inode);
1728   - if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1729   - ext4_update_i_disksize(mpd->inode, disksize);
1730   - err = ext4_mark_inode_dirty(handle, mpd->inode);
1731   - if (err)
1732   - ext4_error(mpd->inode->i_sb,
1733   - "Failed to mark inode %lu dirty",
1734   - mpd->inode->i_ino);
1735   - }
1736   -
1737   -submit_io:
1738   - mpage_da_submit_io(mpd, mapp);
1739   - mpd->io_done = 1;
1740   -}
1741   -
1742   -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1743   - (1 << BH_Delay) | (1 << BH_Unwritten))
1744   -
1745   -/*
1746   - * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1747   - *
1748   - * @mpd->lbh - extent of blocks
1749   - * @logical - logical number of the block in the file
1750   - * @b_state - b_state of the buffer head added
1751   - *
1752   - * the function is used to collect contig. blocks in same state
1753   - */
1754   -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
1755   - unsigned long b_state)
1756   -{
1757   - sector_t next;
1758   - int blkbits = mpd->inode->i_blkbits;
1759   - int nrblocks = mpd->b_size >> blkbits;
1760   -
1761   - /*
1762   - * XXX Don't go larger than mballoc is willing to allocate
1763   - * This is a stopgap solution. We eventually need to fold
1764   - * mpage_da_submit_io() into this function and then call
1765   - * ext4_map_blocks() multiple times in a loop
1766   - */
1767   - if (nrblocks >= (8*1024*1024 >> blkbits))
1768   - goto flush_it;
1769   -
1770   - /* check if the reserved journal credits might overflow */
1771   - if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
1772   - if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1773   - /*
1774   - * With non-extent format we are limited by the journal
1775   - * credit available. Total credit needed to insert
1776   - * nrblocks contiguous blocks is dependent on the
1777   - * nrblocks. So limit nrblocks.
1778   - */
1779   - goto flush_it;
1780   - }
1781   - }
1782   - /*
1783   - * First block in the extent
1784   - */
1785   - if (mpd->b_size == 0) {
1786   - mpd->b_blocknr = logical;
1787   - mpd->b_size = 1 << blkbits;
1788   - mpd->b_state = b_state & BH_FLAGS;
1789   - return;
1790   - }
1791   -
1792   - next = mpd->b_blocknr + nrblocks;
1793   - /*
1794   - * Can we merge the block to our big extent?
1795   - */
1796   - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1797   - mpd->b_size += 1 << blkbits;
1798   - return;
1799   - }
1800   -
1801   -flush_it:
1802   - /*
1803   - * We couldn't merge the block to our extent, so we
1804   - * need to flush current extent and start new one
1805   - */
1806   - mpage_da_map_and_submit(mpd);
1807   - return;
1808   -}
1809   -
1810 1495 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1811 1496 {
1812 1497 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
... ... @@ -2204,6 +1889,8 @@
2204 1889 return ret;
2205 1890 }
2206 1891  
  1892 +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
  1893 +
2207 1894 /*
2208 1895 * mballoc gives us at most this number of blocks...
2209 1896 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
... ... @@ -2212,6 +1899,315 @@
2212 1899 #define MAX_WRITEPAGES_EXTENT_LEN 2048
2213 1900  
2214 1901 /*
  1902 + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
  1903 + *
  1904 + * @mpd - extent of blocks
  1905 + * @lblk - logical number of the block in the file
  1906 + * @b_state - b_state of the buffer head added
  1907 + *
  1908 + * the function is used to collect contig. blocks in same state
  1909 + */
  1910 +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
  1911 + unsigned long b_state)
  1912 +{
  1913 + struct ext4_map_blocks *map = &mpd->map;
  1914 +
  1915 + /* Don't go larger than mballoc is willing to allocate */
  1916 + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
  1917 + return 0;
  1918 +
  1919 + /* First block in the extent? */
  1920 + if (map->m_len == 0) {
  1921 + map->m_lblk = lblk;
  1922 + map->m_len = 1;
  1923 + map->m_flags = b_state & BH_FLAGS;
  1924 + return 1;
  1925 + }
  1926 +
  1927 + /* Can we merge the block to our big extent? */
  1928 + if (lblk == map->m_lblk + map->m_len &&
  1929 + (b_state & BH_FLAGS) == map->m_flags) {
  1930 + map->m_len++;
  1931 + return 1;
  1932 + }
  1933 + return 0;
  1934 +}
  1935 +
  1936 +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
  1937 + struct buffer_head *head,
  1938 + struct buffer_head *bh,
  1939 + ext4_lblk_t lblk)
  1940 +{
  1941 + struct inode *inode = mpd->inode;
  1942 + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
  1943 + >> inode->i_blkbits;
  1944 +
  1945 + do {
  1946 + BUG_ON(buffer_locked(bh));
  1947 +
  1948 + if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
  1949 + (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
  1950 + lblk >= blocks) {
  1951 + /* Found extent to map? */
  1952 + if (mpd->map.m_len)
  1953 + return false;
  1954 + if (lblk >= blocks)
  1955 + return true;
  1956 + continue;
  1957 + }
  1958 + if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
  1959 + return false;
  1960 + } while (lblk++, (bh = bh->b_this_page) != head);
  1961 + return true;
  1962 +}
  1963 +
  1964 +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
  1965 +{
  1966 + int len;
  1967 + loff_t size = i_size_read(mpd->inode);
  1968 + int err;
  1969 +
  1970 + BUG_ON(page->index != mpd->first_page);
  1971 + if (page->index == size >> PAGE_CACHE_SHIFT)
  1972 + len = size & ~PAGE_CACHE_MASK;
  1973 + else
  1974 + len = PAGE_CACHE_SIZE;
  1975 + clear_page_dirty_for_io(page);
  1976 + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
  1977 + if (!err)
  1978 + mpd->wbc->nr_to_write--;
  1979 + mpd->first_page++;
  1980 +
  1981 + return err;
  1982 +}
  1983 +
  1984 +/*
  1985 + * mpage_map_buffers - update buffers corresponding to changed extent and
  1986 + * submit fully mapped pages for IO
  1987 + *
  1988 + * @mpd - description of extent to map, on return next extent to map
  1989 + *
  1990 + * Scan buffers corresponding to changed extent (we expect corresponding pages
  1991 + * to be already locked) and update buffer state according to new extent state.
  1992 + * We map delalloc buffers to their physical location, clear unwritten bits,
  1993 + * and mark buffers as uninit when we perform writes to uninitialized extents
  1994 + * and do extent conversion after IO is finished. If the last page is not fully
  1995 + * mapped, we update @map to the next extent in the last page that needs
  1996 + * mapping. Otherwise we submit the page for IO.
  1997 + */
  1998 +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
  1999 +{
  2000 + struct pagevec pvec;
  2001 + int nr_pages, i;
  2002 + struct inode *inode = mpd->inode;
  2003 + struct buffer_head *head, *bh;
  2004 + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
  2005 + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
  2006 + >> inode->i_blkbits;
  2007 + pgoff_t start, end;
  2008 + ext4_lblk_t lblk;
  2009 + sector_t pblock;
  2010 + int err;
  2011 +
  2012 + start = mpd->map.m_lblk >> bpp_bits;
  2013 + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
  2014 + lblk = start << bpp_bits;
  2015 + pblock = mpd->map.m_pblk;
  2016 +
  2017 + pagevec_init(&pvec, 0);
  2018 + while (start <= end) {
  2019 + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
  2020 + PAGEVEC_SIZE);
  2021 + if (nr_pages == 0)
  2022 + break;
  2023 + for (i = 0; i < nr_pages; i++) {
  2024 + struct page *page = pvec.pages[i];
  2025 +
  2026 + if (page->index > end)
  2027 + break;
  2028 + /* Upto 'end' pages must be contiguous */
  2029 + BUG_ON(page->index != start);
  2030 + bh = head = page_buffers(page);
  2031 + do {
  2032 + if (lblk < mpd->map.m_lblk)
  2033 + continue;
  2034 + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
  2035 + /*
  2036 + * Buffer after end of mapped extent.
  2037 + * Find next buffer in the page to map.
  2038 + */
  2039 + mpd->map.m_len = 0;
  2040 + mpd->map.m_flags = 0;
  2041 + add_page_bufs_to_extent(mpd, head, bh,
  2042 + lblk);
  2043 + pagevec_release(&pvec);
  2044 + return 0;
  2045 + }
  2046 + if (buffer_delay(bh)) {
  2047 + clear_buffer_delay(bh);
  2048 + bh->b_blocknr = pblock++;
  2049 + }
  2050 + if (mpd->map.m_flags & EXT4_MAP_UNINIT)
  2051 + set_buffer_uninit(bh);
  2052 + clear_buffer_unwritten(bh);
  2053 + } while (++lblk < blocks &&
  2054 + (bh = bh->b_this_page) != head);
  2055 +
  2056 + /*
  2057 + * FIXME: This is going to break if dioread_nolock
  2058 + * supports blocksize < pagesize as we will try to
  2059 + * convert potentially unmapped parts of inode.
  2060 + */
  2061 + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
  2062 + /* Page fully mapped - let IO run! */
  2063 + err = mpage_submit_page(mpd, page);
  2064 + if (err < 0) {
  2065 + pagevec_release(&pvec);
  2066 + return err;
  2067 + }
  2068 + start++;
  2069 + }
  2070 + pagevec_release(&pvec);
  2071 + }
  2072 + /* Extent fully mapped and matches with page boundary. We are done. */
  2073 + mpd->map.m_len = 0;
  2074 + mpd->map.m_flags = 0;
  2075 + return 0;
  2076 +}
  2077 +
  2078 +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
  2079 +{
  2080 + struct inode *inode = mpd->inode;
  2081 + struct ext4_map_blocks *map = &mpd->map;
  2082 + int get_blocks_flags;
  2083 + int err;
  2084 +
  2085 + trace_ext4_da_write_pages_extent(inode, map);
  2086 + /*
  2087 + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
  2088 + * to convert an uninitialized extent to be initialized (in the case
  2089 + * where we have written into one or more preallocated blocks). It is
  2090 + * possible that we're going to need more metadata blocks than
  2091 + * previously reserved. However we must not fail because we're in
  2092 + * writeback and there is nothing we can do about it so it might result
  2093 + * in data loss. So use reserved blocks to allocate metadata if
  2094 + * possible.
  2095 + *
  2096 + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
  2097 + * in question are delalloc blocks. This affects functions in many
  2098 + * different parts of the allocation call path. This flag exists
  2099 + * primarily because we don't want to change *many* call functions, so
  2100 + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
  2101 + * once the inode's allocation semaphore is taken.
  2102 + */
  2103 + get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
  2104 + EXT4_GET_BLOCKS_METADATA_NOFAIL;
  2105 + if (ext4_should_dioread_nolock(inode))
  2106 + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
  2107 + if (map->m_flags & (1 << BH_Delay))
  2108 + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
  2109 +
  2110 + err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
  2111 + if (err < 0)
  2112 + return err;
  2113 +
  2114 + BUG_ON(map->m_len == 0);
  2115 + if (map->m_flags & EXT4_MAP_NEW) {
  2116 + struct block_device *bdev = inode->i_sb->s_bdev;
  2117 + int i;
  2118 +
  2119 + for (i = 0; i < map->m_len; i++)
  2120 + unmap_underlying_metadata(bdev, map->m_pblk + i);
  2121 + }
  2122 + return 0;
  2123 +}
  2124 +
  2125 +/*
  2126 + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
  2127 + * mpd->len and submit pages underlying it for IO
  2128 + *
  2129 + * @handle - handle for journal operations
  2130 + * @mpd - extent to map
  2131 + *
  2132 + * The function maps extent starting at mpd->lblk of length mpd->len. If it is
  2133 + * delayed, blocks are allocated, if it is unwritten, we may need to convert
  2134 + * them to initialized or split the described range from larger unwritten
  2135 + * extent. Note that we need not map all the described range since allocation
  2136 + * can return less blocks or the range is covered by more unwritten extents. We
  2137 + * cannot map more because we are limited by reserved transaction credits. On
  2138 + * the other hand we always make sure that the last touched page is fully
  2139 + * mapped so that it can be written out (and thus forward progress is
  2140 + * guaranteed). After mapping we submit all mapped pages for IO.
  2141 + */
  2142 +static int mpage_map_and_submit_extent(handle_t *handle,
  2143 + struct mpage_da_data *mpd)
  2144 +{
  2145 + struct inode *inode = mpd->inode;
  2146 + struct ext4_map_blocks *map = &mpd->map;
  2147 + int err;
  2148 + loff_t disksize;
  2149 +
  2150 + mpd->io_submit.io_end->offset =
  2151 + ((loff_t)map->m_lblk) << inode->i_blkbits;
  2152 + while (map->m_len) {
  2153 + err = mpage_map_one_extent(handle, mpd);
  2154 + if (err < 0) {
  2155 + struct super_block *sb = inode->i_sb;
  2156 +
  2157 + /*
  2158 + * Need to commit transaction to free blocks. Let upper
  2159 + * layers sort it out.
  2160 + */
  2161 + if (err == -ENOSPC && ext4_count_free_clusters(sb))
  2162 + return -ENOSPC;
  2163 +
  2164 + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
  2165 + ext4_msg(sb, KERN_CRIT,
  2166 + "Delayed block allocation failed for "
  2167 + "inode %lu at logical offset %llu with"
  2168 + " max blocks %u with error %d",
  2169 + inode->i_ino,
  2170 + (unsigned long long)map->m_lblk,
  2171 + (unsigned)map->m_len, err);
  2172 + ext4_msg(sb, KERN_CRIT,
  2173 + "This should not happen!! Data will "
  2174 + "be lost\n");
  2175 + if (err == -ENOSPC)
  2176 + ext4_print_free_blocks(inode);
  2177 + }
  2178 + /* invalidate all the pages */
  2179 + mpage_release_unused_pages(mpd, true);
  2180 + return err;
  2181 + }
  2182 + /*
  2183 + * Update buffer state, submit mapped pages, and get us new
  2184 + * extent to map
  2185 + */
  2186 + err = mpage_map_and_submit_buffers(mpd);
  2187 + if (err < 0)
  2188 + return err;
  2189 + }
  2190 +
  2191 + /* Update on-disk size after IO is submitted */
  2192 + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
  2193 + if (disksize > i_size_read(inode))
  2194 + disksize = i_size_read(inode);
  2195 + if (disksize > EXT4_I(inode)->i_disksize) {
  2196 + int err2;
  2197 +
  2198 + ext4_update_i_disksize(inode, disksize);
  2199 + err2 = ext4_mark_inode_dirty(handle, inode);
  2200 + if (err2)
  2201 + ext4_error(inode->i_sb,
  2202 + "Failed to mark inode %lu dirty",
  2203 + inode->i_ino);
  2204 + if (!err)
  2205 + err = err2;
  2206 + }
  2207 + return err;
  2208 +}
  2209 +
  2210 +/*
2215 2211 * Calculate the total number of credits to reserve for one writepages
2216 2212 * iteration. This is called from ext4_da_writepages(). We map an extent of
2217 2213 * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
... ... @@ -2227,44 +2223,49 @@
2227 2223 }
2228 2224  
2229 2225 /*
2230   - * write_cache_pages_da - walk the list of dirty pages of the given
2231   - * address space and accumulate pages that need writing, and call
2232   - * mpage_da_map_and_submit to map a single contiguous memory region
2233   - * and then write them.
  2226 + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
  2227 + * and underlying extent to map
  2228 + *
  2229 + * @mpd - where to look for pages
  2230 + *
  2231 + * Walk dirty pages in the mapping. If they are fully mapped, submit them for
  2232 + * IO immediately. When we find a page which isn't mapped we start accumulating
  2233 + * extent of buffers underlying these pages that needs mapping (formed by
  2234 + * either delayed or unwritten buffers). We also lock the pages containing
  2235 + * these buffers. The extent found is returned in @mpd structure (starting at
  2236 + * mpd->lblk with length mpd->len blocks).
  2237 + *
  2238 + * Note that this function can attach bios to one io_end structure which are
  2239 + * neither logically nor physically contiguous. Although it may seem as an
  2240 + * unnecessary complication, it is actually inevitable in blocksize < pagesize
  2241 + * case as we need to track IO to all buffers underlying a page in one io_end.
2234 2242 */
2235   -static int write_cache_pages_da(handle_t *handle,
2236   - struct address_space *mapping,
2237   - struct writeback_control *wbc,
2238   - struct mpage_da_data *mpd,
2239   - pgoff_t *done_index)
  2243 +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2240 2244 {
2241   - struct buffer_head *bh, *head;
2242   - struct inode *inode = mapping->host;
2243   - struct pagevec pvec;
2244   - unsigned int nr_pages;
2245   - sector_t logical;
2246   - pgoff_t index, end;
2247   - long nr_to_write = wbc->nr_to_write;
2248   - int i, tag, ret = 0;
  2245 + struct address_space *mapping = mpd->inode->i_mapping;
  2246 + struct pagevec pvec;
  2247 + unsigned int nr_pages;
  2248 + pgoff_t index = mpd->first_page;
  2249 + pgoff_t end = mpd->last_page;
  2250 + int tag;
  2251 + int i, err = 0;
  2252 + int blkbits = mpd->inode->i_blkbits;
  2253 + ext4_lblk_t lblk;
  2254 + struct buffer_head *head;
2249 2255  
2250   - memset(mpd, 0, sizeof(struct mpage_da_data));
2251   - mpd->wbc = wbc;
2252   - mpd->inode = inode;
2253   - pagevec_init(&pvec, 0);
2254   - index = wbc->range_start >> PAGE_CACHE_SHIFT;
2255   - end = wbc->range_end >> PAGE_CACHE_SHIFT;
2256   -
2257   - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  2256 + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2258 2257 tag = PAGECACHE_TAG_TOWRITE;
2259 2258 else
2260 2259 tag = PAGECACHE_TAG_DIRTY;
2261 2260  
2262   - *done_index = index;
  2261 + pagevec_init(&pvec, 0);
  2262 + mpd->map.m_len = 0;
  2263 + mpd->next_page = index;
2263 2264 while (index <= end) {
2264 2265 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2265 2266 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2266 2267 if (nr_pages == 0)
2267   - return 0;
  2268 + goto out;
2268 2269  
2269 2270 for (i = 0; i < nr_pages; i++) {
2270 2271 struct page *page = pvec.pages[i];
... ... @@ -2279,31 +2280,21 @@
2279 2280 if (page->index > end)
2280 2281 goto out;
2281 2282  
2282   - *done_index = page->index + 1;
  2283 + /* If we can't merge this page, we are done. */
  2284 + if (mpd->map.m_len > 0 && mpd->next_page != page->index)
  2285 + goto out;
2283 2286  
2284   - /*
2285   - * If we can't merge this page, and we have
2286   - * accumulated an contiguous region, write it
2287   - */
2288   - if ((mpd->next_page != page->index) &&
2289   - (mpd->next_page != mpd->first_page)) {
2290   - mpage_da_map_and_submit(mpd);
2291   - goto ret_extent_tail;
2292   - }
2293   -
2294 2287 lock_page(page);
2295   -
2296 2288 /*
2297   - * If the page is no longer dirty, or its
2298   - * mapping no longer corresponds to inode we
2299   - * are writing (which means it has been
2300   - * truncated or invalidated), or the page is
2301   - * already under writeback and we are not
2302   - * doing a data integrity writeback, skip the page
  2289 + * If the page is no longer dirty, or its mapping no
  2290 + * longer corresponds to inode we are writing (which
  2291 + * means it has been truncated or invalidated), or the
  2292 + * page is already under writeback and we are not doing
  2293 + * a data integrity writeback, skip the page
2303 2294 */
2304 2295 if (!PageDirty(page) ||
2305 2296 (PageWriteback(page) &&
2306   - (wbc->sync_mode == WB_SYNC_NONE)) ||
  2297 + (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2307 2298 unlikely(page->mapping != mapping)) {
2308 2299 unlock_page(page);
2309 2300 continue;
... ... @@ -2312,101 +2303,57 @@
2312 2303 wait_on_page_writeback(page);
2313 2304 BUG_ON(PageWriteback(page));
2314 2305  
2315   - /*
2316   - * If we have inline data and arrive here, it means that
2317   - * we will soon create the block for the 1st page, so
2318   - * we'd better clear the inline data here.
2319   - */
2320   - if (ext4_has_inline_data(inode)) {
2321   - BUG_ON(ext4_test_inode_state(inode,
2322   - EXT4_STATE_MAY_INLINE_DATA));
2323   - ext4_destroy_inline_data(handle, inode);
2324   - }
2325   -
2326   - if (mpd->next_page != page->index)
  2306 + if (mpd->map.m_len == 0)
2327 2307 mpd->first_page = page->index;
2328 2308 mpd->next_page = page->index + 1;
2329   - logical = (sector_t) page->index <<
2330   - (PAGE_CACHE_SHIFT - inode->i_blkbits);
2331   -
2332 2309 /* Add all dirty buffers to mpd */
  2310 + lblk = ((ext4_lblk_t)page->index) <<
  2311 + (PAGE_CACHE_SHIFT - blkbits);
2333 2312 head = page_buffers(page);
2334   - bh = head;
2335   - do {
2336   - BUG_ON(buffer_locked(bh));
2337   - /*
2338   - * We need to try to allocate unmapped blocks
2339   - * in the same page. Otherwise we won't make
2340   - * progress with the page in ext4_writepage
2341   - */
2342   - if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2343   - mpage_add_bh_to_extent(mpd, logical,
2344   - bh->b_state);
2345   - if (mpd->io_done)
2346   - goto ret_extent_tail;
2347   - } else if (buffer_dirty(bh) &&
2348   - buffer_mapped(bh)) {
2349   - /*
2350   - * mapped dirty buffer. We need to
2351   - * update the b_state because we look
2352   - * at b_state in mpage_da_map_blocks.
2353   - * We don't update b_size because if we
2354   - * find an unmapped buffer_head later
2355   - * we need to use the b_state flag of
2356   - * that buffer_head.
2357   - */
2358   - if (mpd->b_size == 0)
2359   - mpd->b_state =
2360   - bh->b_state & BH_FLAGS;
2361   - }
2362   - logical++;
2363   - } while ((bh = bh->b_this_page) != head);
2364   -
2365   - if (nr_to_write > 0) {
2366   - nr_to_write--;
2367   - if (nr_to_write == 0 &&
2368   - wbc->sync_mode == WB_SYNC_NONE)
2369   - /*
2370   - * We stop writing back only if we are
2371   - * not doing integrity sync. In case of
2372   - * integrity sync we have to keep going
2373   - * because someone may be concurrently
2374   - * dirtying pages, and we might have
2375   - * synced a lot of newly appeared dirty
2376   - * pages, but have not synced all of the
2377   - * old dirty pages.
2378   - */
  2313 + if (!add_page_bufs_to_extent(mpd, head, head, lblk))
  2314 + goto out;
  2315 + /* So far everything mapped? Submit the page for IO. */
  2316 + if (mpd->map.m_len == 0) {
  2317 + err = mpage_submit_page(mpd, page);
  2318 + if (err < 0)
2379 2319 goto out;
2380 2320 }
  2321 +
  2322 + /*
  2323 + * Accumulated enough dirty pages? This doesn't apply
  2324 + * to WB_SYNC_ALL mode. For integrity sync we have to
  2325 + * keep going because someone may be concurrently
  2326 + * dirtying pages, and we might have synced a lot of
  2327 + * newly appeared dirty pages, but have not synced all
  2328 + * of the old dirty pages.
  2329 + */
  2330 + if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
  2331 + mpd->next_page - mpd->first_page >=
  2332 + mpd->wbc->nr_to_write)
  2333 + goto out;
2381 2334 }
2382 2335 pagevec_release(&pvec);
2383 2336 cond_resched();
2384 2337 }
2385 2338 return 0;
2386   -ret_extent_tail:
2387   - ret = MPAGE_DA_EXTENT_TAIL;
2388 2339 out:
2389 2340 pagevec_release(&pvec);
2390   - cond_resched();
2391   - return ret;
  2341 + return err;
2392 2342 }
2393 2343  
2394   -
2395 2344 static int ext4_da_writepages(struct address_space *mapping,
2396 2345 struct writeback_control *wbc)
2397 2346 {
2398   - pgoff_t index;
  2347 + pgoff_t writeback_index = 0;
  2348 + long nr_to_write = wbc->nr_to_write;
2399 2349 int range_whole = 0;
  2350 + int cycled = 1;
2400 2351 handle_t *handle = NULL;
2401 2352 struct mpage_da_data mpd;
2402 2353 struct inode *inode = mapping->host;
2403   - int pages_written = 0;
2404   - int range_cyclic, cycled = 1, io_done = 0;
2405 2354 int needed_blocks, ret = 0;
2406   - loff_t range_start = wbc->range_start;
2407 2355 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2408   - pgoff_t done_index = 0;
2409   - pgoff_t end;
  2356 + bool done;
2410 2357 struct blk_plug plug;
2411 2358  
2412 2359 trace_ext4_da_writepages(inode, wbc);
... ... @@ -2432,40 +2379,65 @@
2432 2379 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2433 2380 return -EROFS;
2434 2381  
  2382 + /*
  2383 + * If we have inline data and arrive here, it means that
  2384 + * we will soon create the block for the 1st page, so
  2385 + * we'd better clear the inline data here.
  2386 + */
  2387 + if (ext4_has_inline_data(inode)) {
  2388 + /* Just inode will be modified... */
  2389 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
  2390 + if (IS_ERR(handle)) {
  2391 + ret = PTR_ERR(handle);
  2392 + goto out_writepages;
  2393 + }
  2394 + BUG_ON(ext4_test_inode_state(inode,
  2395 + EXT4_STATE_MAY_INLINE_DATA));
  2396 + ext4_destroy_inline_data(handle, inode);
  2397 + ext4_journal_stop(handle);
  2398 + }
  2399 +
2435 2400 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2436 2401 range_whole = 1;
2437 2402  
2438   - range_cyclic = wbc->range_cyclic;
2439 2403 if (wbc->range_cyclic) {
2440   - index = mapping->writeback_index;
2441   - if (index)
  2404 + writeback_index = mapping->writeback_index;
  2405 + if (writeback_index)
2442 2406 cycled = 0;
2443   - wbc->range_start = index << PAGE_CACHE_SHIFT;
2444   - wbc->range_end = LLONG_MAX;
2445   - wbc->range_cyclic = 0;
2446   - end = -1;
  2407 + mpd.first_page = writeback_index;
  2408 + mpd.last_page = -1;
2447 2409 } else {
2448   - index = wbc->range_start >> PAGE_CACHE_SHIFT;
2449   - end = wbc->range_end >> PAGE_CACHE_SHIFT;
  2410 + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
  2411 + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2450 2412 }
2451 2413  
  2414 + mpd.inode = inode;
  2415 + mpd.wbc = wbc;
  2416 + ext4_io_submit_init(&mpd.io_submit, wbc);
2452 2417 retry:
2453 2418 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2454   - tag_pages_for_writeback(mapping, index, end);
2455   -
  2419 + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
  2420 + done = false;
2456 2421 blk_start_plug(&plug);
2457   - while (!ret && wbc->nr_to_write > 0) {
  2422 + while (!done && mpd.first_page <= mpd.last_page) {
  2423 + /* For each extent of pages we use new io_end */
  2424 + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
  2425 + if (!mpd.io_submit.io_end) {
  2426 + ret = -ENOMEM;
  2427 + break;
  2428 + }
2458 2429  
2459 2430 /*
2460   - * we insert one extent at a time. So we need
2461   - * credit needed for single extent allocation.
2462   - * journalled mode is currently not supported
2463   - * by delalloc
  2431 + * We have two constraints: We find one extent to map and we
  2432 + * must always write out whole page (makes a difference when
  2433 + * blocksize < pagesize) so that we don't block on IO when we
  2434 + * try to write out the rest of the page. Journalled mode is
  2435 + * not supported by delalloc.
2464 2436 */
2465 2437 BUG_ON(ext4_should_journal_data(inode));
2466 2438 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2467 2439  
2468   - /* start a new transaction*/
  2440 + /* start a new transaction */
2469 2441 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
2470 2442 needed_blocks);
2471 2443 if (IS_ERR(handle)) {
... ... @@ -2473,76 +2445,67 @@
2473 2445 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2474 2446 "%ld pages, ino %lu; err %d", __func__,
2475 2447 wbc->nr_to_write, inode->i_ino, ret);
2476   - blk_finish_plug(&plug);
2477   - goto out_writepages;
  2448 + /* Release allocated io_end */
  2449 + ext4_put_io_end(mpd.io_submit.io_end);
  2450 + break;
2478 2451 }
2479 2452  
2480   - /*
2481   - * Now call write_cache_pages_da() to find the next
2482   - * contiguous region of logical blocks that need
2483   - * blocks to be allocated by ext4 and submit them.
2484   - */
2485   - ret = write_cache_pages_da(handle, mapping,
2486   - wbc, &mpd, &done_index);
2487   - /*
2488   - * If we have a contiguous extent of pages and we
2489   - * haven't done the I/O yet, map the blocks and submit
2490   - * them for I/O.
2491   - */
2492   - if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2493   - mpage_da_map_and_submit(&mpd);
2494   - ret = MPAGE_DA_EXTENT_TAIL;
  2453 + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
  2454 + ret = mpage_prepare_extent_to_map(&mpd);
  2455 + if (!ret) {
  2456 + if (mpd.map.m_len)
  2457 + ret = mpage_map_and_submit_extent(handle, &mpd);
  2458 + else {
  2459 + /*
  2460 + * We scanned the whole range (or exhausted
  2461 + * nr_to_write), submitted what was mapped and
  2462 + * didn't find anything needing mapping. We are
  2463 + * done.
  2464 + */
  2465 + done = true;
  2466 + }
2495 2467 }
2496   - trace_ext4_da_write_pages(inode, &mpd);
2497   - wbc->nr_to_write -= mpd.pages_written;
2498   -
2499 2468 ext4_journal_stop(handle);
  2469 + /* Submit prepared bio */
  2470 + ext4_io_submit(&mpd.io_submit);
  2471 + /* Unlock pages we didn't use */
  2472 + mpage_release_unused_pages(&mpd, false);
  2473 + /* Drop our io_end reference we got from init */
  2474 + ext4_put_io_end(mpd.io_submit.io_end);
2500 2475  
2501   - if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2502   - /* commit the transaction which would
  2476 + if (ret == -ENOSPC && sbi->s_journal) {
  2477 + /*
  2478 + * Commit the transaction which would
2503 2479 * free blocks released in the transaction
2504 2480 * and try again
2505 2481 */
2506 2482 jbd2_journal_force_commit_nested(sbi->s_journal);
2507 2483 ret = 0;
2508   - } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2509   - /*
2510   - * Got one extent now try with rest of the pages.
2511   - * If mpd.retval is set -EIO, journal is aborted.
2512   - * So we don't need to write any more.
2513   - */
2514   - pages_written += mpd.pages_written;
2515   - ret = mpd.retval;
2516   - io_done = 1;
2517   - } else if (wbc->nr_to_write)
2518   - /*
2519   - * There is no more writeout needed
2520   - * or we requested for a noblocking writeout
2521   - * and we found the device congested
2522   - */
  2484 + continue;
  2485 + }
  2486 + /* Fatal error - ENOMEM, EIO... */
  2487 + if (ret)
2523 2488 break;
2524 2489 }
2525 2490 blk_finish_plug(&plug);
2526   - if (!io_done && !cycled) {
  2491 + if (!ret && !cycled) {
2527 2492 cycled = 1;
2528   - index = 0;
2529   - wbc->range_start = index << PAGE_CACHE_SHIFT;
2530   - wbc->range_end = mapping->writeback_index - 1;
  2493 + mpd.last_page = writeback_index - 1;
  2494 + mpd.first_page = 0;
2531 2495 goto retry;
2532 2496 }
2533 2497  
2534 2498 /* Update index */
2535   - wbc->range_cyclic = range_cyclic;
2536 2499 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2537 2500 /*
2538   - * set the writeback_index so that range_cyclic
  2501 + * Set the writeback_index so that range_cyclic
2539 2502 * mode will write it back later
2540 2503 */
2541   - mapping->writeback_index = done_index;
  2504 + mapping->writeback_index = mpd.first_page;
2542 2505  
2543 2506 out_writepages:
2544   - wbc->range_start = range_start;
2545   - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
  2507 + trace_ext4_da_writepages_result(inode, wbc, ret,
  2508 + nr_to_write - wbc->nr_to_write);
2546 2509 return ret;
2547 2510 }
2548 2511  
fs/ext4/page-io.c
... ... @@ -360,9 +360,6 @@
360 360 bio->bi_bdev = bh->b_bdev;
361 361 bio->bi_end_io = ext4_end_bio;
362 362 bio->bi_private = ext4_get_io_end(io->io_end);
363   - if (!io->io_end->size)
364   - io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
365   - + bh_offset(bh);
366 363 io->io_bio = bio;
367 364 io->io_next_block = bh->b_blocknr;
368 365 return 0;
... ... @@ -390,7 +387,6 @@
390 387 io_end = io->io_end;
391 388 if (test_clear_buffer_uninit(bh))
392 389 ext4_set_io_unwritten_flag(inode, io_end);
393   - io_end->size += bh->b_size;
394 390 io->io_next_block++;
395 391 return 0;
396 392 }
include/trace/events/ext4.h
... ... @@ -324,43 +324,59 @@
324 324 );
325 325  
326 326 TRACE_EVENT(ext4_da_write_pages,
327   - TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
  327 + TP_PROTO(struct inode *inode, pgoff_t first_page,
  328 + struct writeback_control *wbc),
328 329  
329   - TP_ARGS(inode, mpd),
  330 + TP_ARGS(inode, first_page, wbc),
330 331  
331 332 TP_STRUCT__entry(
332 333 __field( dev_t, dev )
333 334 __field( ino_t, ino )
334   - __field( __u64, b_blocknr )
335   - __field( __u32, b_size )
336   - __field( __u32, b_state )
337   - __field( unsigned long, first_page )
338   - __field( int, io_done )
339   - __field( int, pages_written )
340   - __field( int, sync_mode )
  335 + __field( pgoff_t, first_page )
  336 + __field( long, nr_to_write )
  337 + __field( int, sync_mode )
341 338 ),
342 339  
343 340 TP_fast_assign(
344 341 __entry->dev = inode->i_sb->s_dev;
345 342 __entry->ino = inode->i_ino;
346   - __entry->b_blocknr = mpd->b_blocknr;
347   - __entry->b_size = mpd->b_size;
348   - __entry->b_state = mpd->b_state;
349   - __entry->first_page = mpd->first_page;
350   - __entry->io_done = mpd->io_done;
351   - __entry->pages_written = mpd->pages_written;
352   - __entry->sync_mode = mpd->wbc->sync_mode;
  343 + __entry->first_page = first_page;
  344 + __entry->nr_to_write = wbc->nr_to_write;
  345 + __entry->sync_mode = wbc->sync_mode;
353 346 ),
354 347  
355   - TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
356   - "first_page %lu io_done %d pages_written %d sync_mode %d",
  348 + TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
  349 + "sync_mode %d",
357 350 MAJOR(__entry->dev), MINOR(__entry->dev),
358   - (unsigned long) __entry->ino,
359   - __entry->b_blocknr, __entry->b_size,
360   - __entry->b_state, __entry->first_page,
361   - __entry->io_done, __entry->pages_written,
362   - __entry->sync_mode
363   - )
  351 + (unsigned long) __entry->ino, __entry->first_page,
  352 + __entry->nr_to_write, __entry->sync_mode)
  353 +);
  354 +
  355 +TRACE_EVENT(ext4_da_write_pages_extent,
  356 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
  357 +
  358 + TP_ARGS(inode, map),
  359 +
  360 + TP_STRUCT__entry(
  361 + __field( dev_t, dev )
  362 + __field( ino_t, ino )
  363 + __field( __u64, lblk )
  364 + __field( __u32, len )
  365 + __field( __u32, flags )
  366 + ),
  367 +
  368 + TP_fast_assign(
  369 + __entry->dev = inode->i_sb->s_dev;
  370 + __entry->ino = inode->i_ino;
  371 + __entry->lblk = map->m_lblk;
  372 + __entry->len = map->m_len;
  373 + __entry->flags = map->m_flags;
  374 + ),
  375 +
  376 + TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
  377 + MAJOR(__entry->dev), MINOR(__entry->dev),
  378 + (unsigned long) __entry->ino, __entry->lblk, __entry->len,
  379 + __entry->flags)
364 380 );
365 381  
366 382 TRACE_EVENT(ext4_da_writepages_result,