Commit 6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d

Authored by Youquan Song
Committed by David Woodhouse
1 parent 7b66835781

intel-iommu: Enable super page (2MiB, 1GiB, etc.) support

There are no externally-visible changes with this. In the loop in the
internal __domain_mapping() function, we simply detect if we are mapping:
  - size >= 2MiB, and
  - virtual address aligned to 2MiB, and
  - physical address aligned to 2MiB, and
  - on hardware that supports superpages.

(and likewise for larger superpages).
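
In rough C terms, the test amounts to merging the two pfns and
checking the low stride bits. A minimal standalone sketch of the logic
that hardware_largepage_caps() implements in the diff below (the
function name and free-standing form here are illustrative only):

    /* Pick the largest page-table level usable for a mapping.
     * Level 1 == 4KiB; each further level needs 512 times the
     * alignment and at least 512 times the remaining size (the
     * VT-d stride is 9 bits per level). */
    static int largepage_level(int hw_support, unsigned long iov_pfn,
                               unsigned long phys_pfn, unsigned long nr_pages)
    {
            unsigned long merged = iov_pfn | phys_pfn; /* both must align */
            int level = 1;

            while (hw_support && !(merged & ((1UL << 9) - 1))) {
                    nr_pages >>= 9;  /* need >= 512 pages to go up a level */
                    if (!nr_pages)
                            break;
                    merged >>= 9;
                    level++;
                    hw_support--;
            }
            return level;
    }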

We automatically use a superpage for such mappings. We never have to
worry about *breaking* superpages, since we trust that we will always
*unmap* the same range that was mapped. So all we need to do is ensure
that dma_pte_clear_range() will also cope with superpages.
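
Concretely, the clear loop no longer advances one pfn per PTE: when it
lands on a superpage entry it clears that single PTE and skips the
whole range it covered. A tiny userspace sketch of the arithmetic,
mirroring the new lvl_to_nr_pages() helper (the printf harness is
illustrative only, and 1UL is used here where the patch uses a
plain 1):

    #include <stdio.h>

    #define LEVEL_STRIDE 9 /* 512 entries per VT-d page-table level */

    static unsigned long lvl_to_nr_pages(unsigned int lvl)
    {
            return 1UL << ((lvl - 1) * LEVEL_STRIDE);
    }

    int main(void)
    {
            /* level 1: 1 pfn (4KiB), level 2: 512 (2MiB),
               level 3: 262144 (1GiB) */
            for (unsigned int lvl = 1; lvl <= 3; lvl++)
                    printf("level %u clears %lu 4KiB pfns per PTE\n",
                           lvl, lvl_to_nr_pages(lvl));
            return 0;
    }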

Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so
it can return a PTE at the appropriate level rather than always
extending the page tables all the way down to level 1. Again, this is
simplified by the fact that we should never encounter existing small
pages when we're creating a mapping; any old mapping that used the same
virtual range will have been entirely removed and its obsolete page
tables freed.
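
A compressed userspace model of the adjusted walk, keeping only the
new stop conditions (the tables[] array, model_walk() name and main()
are fabrications for illustration; the real code allocates missing
levels and descends via phys_to_virt(dma_pte_addr(pte))):

    #include <stdint.h>

    #define DMA_PTE_LARGE_PAGE (1ULL << 7)
    #define LEVEL_STRIDE 9

    struct dma_pte { uint64_t val; };

    static unsigned int pfn_level_offset(unsigned long pfn, int level)
    {
            return (pfn >> ((level - 1) * LEVEL_STRIDE)) &
                   ((1UL << LEVEL_STRIDE) - 1);
    }

    /* large_level == 0 means "walk to level 1, but stop early at an
     * existing superpage PTE"; otherwise stop at large_level itself. */
    static struct dma_pte *model_walk(struct dma_pte *tables[], int level,
                                      unsigned long pfn, int large_level)
    {
            int target_level = large_level ? large_level : 1;
            struct dma_pte *pte = NULL;

            while (level > 0) {
                    pte = &tables[level][pfn_level_offset(pfn, level)];
                    if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
                            break; /* existing superpage: go no deeper */
                    if (level == target_level)
                            break; /* caller wants a PTE at this level */
                    level--;
            }
            return pte;
    }

    int main(void)
    {
            static struct dma_pte l1[512], l2[512], l3[512];
            struct dma_pte *tables[] = { 0, l1, l2, l3 };

            /* A 2MiB mapping at dma pfn 512 asks for the level-2 PTE: */
            return model_walk(tables, 3, 512, 2) == &l2[1] ? 0 : 1;
    }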

Provide an 'intel_iommu=sp_off' argument on the command line as a
chicken bit. Not that it should ever be required.
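
(That means booting with intel_iommu=sp_off on the kernel command
line; like the existing options it can be combined comma-separated,
e.g. intel_iommu=strict,sp_off, and simply forces 4KiB-only mappings
even on capable hardware.)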

==

The original commit seen in the iommu-2.6.git tree was Youquan's
implementation (and completion) of my own half-baked code which I'd
typed into an email. Followed by half a dozen subsequent 'fixes'.

I've taken the unusual step of rewriting history and collapsing the
original commits in order to keep the main history simpler, and make
life easier for the people who are going to have to backport this to
older kernels. And also so I can give it a more coherent commit comment
which (hopefully) gives a better explanation of what's going on.

The original sequence of commits leading to identical code was:

Youquan Song (3):
      intel-iommu: super page support
      intel-iommu: Fix superpage alignment calculation error
      intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte()

David Woodhouse (4):
      intel-iommu: Precalculate superpage support for dmar_domain
      intel-iommu: Fix hardware_largepage_caps()
      intel-iommu: Fix inappropriate use of superpages in __domain_mapping()
      intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

Showing 3 changed files with 147 additions and 19 deletions

Documentation/kernel-parameters.txt
... ... @@ -999,7 +999,10 @@
999 999 With this option on every unmap_single operation will
1000 1000 result in a hardware IOTLB flush operation as opposed
1001 1001 to batching them for performance.
1002   -
  1002 + sp_off [Default Off]
  1003 + By default, super page will be supported if Intel IOMMU
  1004 + has the capability. With this option, super page will
  1005 + not be supported.
1003 1006 intremap= [X86-64, Intel-IOMMU]
1004 1007 Format: { on (default) | off | nosid }
1005 1008 on enable Interrupt Remapping (default)
drivers/pci/intel-iommu.c
... ... @@ -115,6 +115,11 @@
115 115 return (pfn + level_size(level) - 1) & level_mask(level);
116 116 }
117 117  
  118 +static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
  119 +{
  120 + return 1 << ((lvl - 1) * LEVEL_STRIDE);
  121 +}
  122 +
118 123 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119 124 are never going to work. */
120 125 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
... ... @@ -343,6 +348,9 @@
343 348 int iommu_coherency;/* indicate coherency of iommu access */
344 349 int iommu_snooping; /* indicate snooping control feature*/
345 350 int iommu_count; /* reference count of iommu */
  351 + int iommu_superpage;/* Level of superpages supported:
  352 + 0 == 4KiB (no superpages), 1 == 2MiB,
  353 + 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
346 354 spinlock_t iommu_lock; /* protect iommu set in domain */
347 355 u64 max_addr; /* maximum mapped address */
348 356 };
... ... @@ -392,6 +400,7 @@
392 400 static int dmar_map_gfx = 1;
393 401 static int dmar_forcedac;
394 402 static int intel_iommu_strict;
  403 +static int intel_iommu_superpage = 1;
395 404  
396 405 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 406 static DEFINE_SPINLOCK(device_domain_lock);
... ... @@ -422,6 +431,10 @@
422 431 printk(KERN_INFO
423 432 "Intel-IOMMU: disable batched IOTLB flush\n");
424 433 intel_iommu_strict = 1;
  434 + } else if (!strncmp(str, "sp_off", 6)) {
  435 + printk(KERN_INFO
  436 + "Intel-IOMMU: disable supported super page\n");
  437 + intel_iommu_superpage = 0;
425 438 }
426 439  
427 440 str += strcspn(str, ",");
... ... @@ -560,11 +573,32 @@
560 573 }
561 574 }
562 575  
  576 +static void domain_update_iommu_superpage(struct dmar_domain *domain)
  577 +{
  578 + int i, mask = 0xf;
  579 +
  580 + if (!intel_iommu_superpage) {
  581 + domain->iommu_superpage = 0;
  582 + return;
  583 + }
  584 +
  585 + domain->iommu_superpage = 4; /* 1TiB */
  586 +
  587 + for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
  588 + mask |= cap_super_page_val(g_iommus[i]->cap);
  589 + if (!mask) {
  590 + break;
  591 + }
  592 + }
  593 + domain->iommu_superpage = fls(mask);
  594 +}
  595 +
563 596 /* Some capabilities may be different across iommus */
564 597 static void domain_update_iommu_cap(struct dmar_domain *domain)
565 598 {
566 599 domain_update_iommu_coherency(domain);
567 600 domain_update_iommu_snooping(domain);
  601 + domain_update_iommu_superpage(domain);
568 602 }
569 603  
570 604 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
... ... @@ -694,24 +728,32 @@
694 728 }
695 729  
696 730 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
697   - unsigned long pfn)
  731 + unsigned long pfn, int large_level)
698 732 {
699 733 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
700 734 struct dma_pte *parent, *pte = NULL;
701 735 int level = agaw_to_level(domain->agaw);
702   - int offset;
  736 + int offset, target_level;
703 737  
704 738 BUG_ON(!domain->pgd);
705 739 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
706 740 parent = domain->pgd;
707 741  
  742 + /* Search pte */
  743 + if (!large_level)
  744 + target_level = 1;
  745 + else
  746 + target_level = large_level;
  747 +
708 748 while (level > 0) {
709 749 void *tmp_page;
710 750  
711 751 offset = pfn_level_offset(pfn, level);
712 752 pte = &parent[offset];
713   - if (level == 1)
  753 + if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
714 754 break;
  755 + if (level == target_level)
  756 + break;
715 757  
716 758 if (!dma_pte_present(pte)) {
717 759 uint64_t pteval;
... ... @@ -738,10 +780,11 @@
738 780 return pte;
739 781 }
740 782  
  783 +
741 784 /* return address's pte at specific level */
742 785 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
743 786 unsigned long pfn,
744   - int level)
  787 + int level, int *large_page)
745 788 {
746 789 struct dma_pte *parent, *pte = NULL;
747 790 int total = agaw_to_level(domain->agaw);
... ... @@ -754,8 +797,16 @@
754 797 if (level == total)
755 798 return pte;
756 799  
757   - if (!dma_pte_present(pte))
  800 + if (!dma_pte_present(pte)) {
  801 + *large_page = total;
758 802 break;
  803 + }
  804 +
  805 + if (pte->val & DMA_PTE_LARGE_PAGE) {
  806 + *large_page = total;
  807 + return pte;
  808 + }
  809 +
759 810 parent = phys_to_virt(dma_pte_addr(pte));
760 811 total--;
761 812 }
... ... @@ -768,6 +819,7 @@
768 819 unsigned long last_pfn)
769 820 {
770 821 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
  822 + unsigned int large_page = 1;
771 823 struct dma_pte *first_pte, *pte;
772 824  
773 825 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
... ... @@ -776,14 +828,15 @@
776 828  
777 829 /* we don't need lock here; nobody else touches the iova range */
778 830 do {
779   - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
  831 + large_page = 1;
  832 + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
780 833 if (!pte) {
781   - start_pfn = align_to_level(start_pfn + 1, 2);
  834 + start_pfn = align_to_level(start_pfn + 1, large_page + 1);
782 835 continue;
783 836 }
784   - do {
  837 + do {
785 838 dma_clear_pte(pte);
786   - start_pfn++;
  839 + start_pfn += lvl_to_nr_pages(large_page);
787 840 pte++;
788 841 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
789 842  
... ... @@ -803,6 +856,7 @@
803 856 int total = agaw_to_level(domain->agaw);
804 857 int level;
805 858 unsigned long tmp;
  859 + int large_page = 2;
806 860  
807 861 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
808 862 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
... ... @@ -818,7 +872,10 @@
818 872 return;
819 873  
820 874 do {
821   - first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
  875 + large_page = level;
  876 + first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
  877 + if (large_page > level)
  878 + level = large_page + 1;
822 879 if (!pte) {
823 880 tmp = align_to_level(tmp + 1, level + 1);
824 881 continue;
... ... @@ -1402,6 +1459,7 @@
1402 1459 else
1403 1460 domain->iommu_snooping = 0;
1404 1461  
  1462 + domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1405 1463 domain->iommu_count = 1;
1406 1464 domain->nid = iommu->node;
1407 1465  
... ... @@ -1657,6 +1715,34 @@
1657 1715 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1658 1716 }
1659 1717  
  1718 +/* Return largest possible superpage level for a given mapping */
  1719 +static inline int hardware_largepage_caps(struct dmar_domain *domain,
  1720 + unsigned long iov_pfn,
  1721 + unsigned long phy_pfn,
  1722 + unsigned long pages)
  1723 +{
  1724 + int support, level = 1;
  1725 + unsigned long pfnmerge;
  1726 +
  1727 + support = domain->iommu_superpage;
  1728 +
  1729 + /* To use a large page, the virtual *and* physical addresses
  1730 + must be aligned to 2MiB/1GiB/etc. Lower bits set in either
  1731 + of them will mean we have to use smaller pages. So just
  1732 + merge them and check both at once. */
  1733 + pfnmerge = iov_pfn | phy_pfn;
  1734 +
  1735 + while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
  1736 + pages >>= VTD_STRIDE_SHIFT;
  1737 + if (!pages)
  1738 + break;
  1739 + pfnmerge >>= VTD_STRIDE_SHIFT;
  1740 + level++;
  1741 + support--;
  1742 + }
  1743 + return level;
  1744 +}
  1745 +
1660 1746 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1661 1747 struct scatterlist *sg, unsigned long phys_pfn,
1662 1748 unsigned long nr_pages, int prot)
... ... @@ -1665,6 +1751,8 @@
1665 1751 phys_addr_t uninitialized_var(pteval);
1666 1752 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1667 1753 unsigned long sg_res;
  1754 + unsigned int largepage_lvl = 0;
  1755 + unsigned long lvl_pages = 0;
1668 1756  
1669 1757 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1670 1758  
... ... @@ -1680,7 +1768,7 @@
1680 1768 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1681 1769 }
1682 1770  
1683   - while (nr_pages--) {
  1771 + while (nr_pages > 0) {
1684 1772 uint64_t tmp;
1685 1773  
1686 1774 if (!sg_res) {
... ... @@ -1688,11 +1776,21 @@
1688 1776 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1689 1777 sg->dma_length = sg->length;
1690 1778 pteval = page_to_phys(sg_page(sg)) | prot;
  1779 + phys_pfn = pteval >> VTD_PAGE_SHIFT;
1691 1780 }
  1781 +
1692 1782 if (!pte) {
1693   - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
  1783 + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
  1784 +
  1785 + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1694 1786 if (!pte)
1695 1787 return -ENOMEM;
  1788 + /* It is large page*/
  1789 + if (largepage_lvl > 1)
  1790 + pteval |= DMA_PTE_LARGE_PAGE;
  1791 + else
  1792 + pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
  1793 +
1696 1794 }
1697 1795 /* We don't need lock here, nobody else
1698 1796 * touches the iova range
... ... @@ -1708,16 +1806,38 @@
1708 1806 }
1709 1807 WARN_ON(1);
1710 1808 }
  1809 +
  1810 + lvl_pages = lvl_to_nr_pages(largepage_lvl);
  1811 +
  1812 + BUG_ON(nr_pages < lvl_pages);
  1813 + BUG_ON(sg_res < lvl_pages);
  1814 +
  1815 + nr_pages -= lvl_pages;
  1816 + iov_pfn += lvl_pages;
  1817 + phys_pfn += lvl_pages;
  1818 + pteval += lvl_pages * VTD_PAGE_SIZE;
  1819 + sg_res -= lvl_pages;
  1820 +
  1821 + /* If the next PTE would be the first in a new page, then we
  1822 + need to flush the cache on the entries we've just written.
  1823 + And then we'll need to recalculate 'pte', so clear it and
  1824 + let it get set again in the if (!pte) block above.
  1825 +
  1826 + If we're done (!nr_pages) we need to flush the cache too.
  1827 +
  1828 + Also if we've been setting superpages, we may need to
  1829 + recalculate 'pte' and switch back to smaller pages for the
  1830 + end of the mapping, if the trailing size is not enough to
  1831 + use another superpage (i.e. sg_res < lvl_pages). */
1711 1832 pte++;
1712   - if (!nr_pages || first_pte_in_page(pte)) {
  1833 + if (!nr_pages || first_pte_in_page(pte) ||
  1834 + (largepage_lvl > 1 && sg_res < lvl_pages)) {
1713 1835 domain_flush_cache(domain, first_pte,
1714 1836 (void *)pte - (void *)first_pte);
1715 1837 pte = NULL;
1716 1838 }
1717   - iov_pfn++;
1718   - pteval += VTD_PAGE_SIZE;
1719   - sg_res--;
1720   - if (!sg_res)
  1839 +
  1840 + if (!sg_res && nr_pages)
1721 1841 sg = sg_next(sg);
1722 1842 }
1723 1843 return 0;
... ... @@ -3527,6 +3647,7 @@
3527 3647 domain->iommu_count = 0;
3528 3648 domain->iommu_coherency = 0;
3529 3649 domain->iommu_snooping = 0;
  3650 + domain->iommu_superpage = 0;
3530 3651 domain->max_addr = 0;
3531 3652 domain->nid = -1;
3532 3653  
... ... @@ -3742,7 +3863,7 @@
3742 3863 struct dma_pte *pte;
3743 3864 u64 phys = 0;
3744 3865  
3745   - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
  3866 + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
3746 3867 if (pte)
3747 3868 phys = dma_pte_addr(pte);
3748 3869  
include/linux/dma_remapping.h
... ... @@ -9,8 +9,12 @@
9 9 #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT)
10 10 #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
11 11  
  12 +#define VTD_STRIDE_SHIFT (9)
  13 +#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)
  14 +
12 15 #define DMA_PTE_READ (1)
13 16 #define DMA_PTE_WRITE (2)
  17 +#define DMA_PTE_LARGE_PAGE (1 << 7)
14 18 #define DMA_PTE_SNP (1 << 11)
15 19  
16 20 #define CONTEXT_TT_MULTI_LEVEL 0