Commit 687d680985b1438360a9ba470ece8b57cd205c3b

Authored by Linus Torvalds

Merge git://git.infradead.org/~dwmw2/iommu-2.6.31

* git://git.infradead.org/~dwmw2/iommu-2.6.31:
  intel-iommu: Fix one last ia64 build problem in Pass Through Support
  VT-d: support the device IOTLB
  VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
  VT-d: add device IOTLB invalidation support
  VT-d: parse ATSR in DMA Remapping Reporting Structure
  PCI: handle Virtual Function ATS enabling
  PCI: support the ATS capability
  intel-iommu: dmar_set_interrupt return error value
  intel-iommu: Tidy up iommu->gcmd handling
  intel-iommu: Fix tiny theoretical race in write-buffer flush.
  intel-iommu: Clean up handling of "caching mode" vs. IOTLB flushing.
  intel-iommu: Clean up handling of "caching mode" vs. context flushing.
  VT-d: fix invalid domain id for KVM context flush
  Fix !CONFIG_DMAR build failure introduced by Intel IOMMU Pass Through Support
  Intel IOMMU Pass Through Support

Fix up trivial conflicts in drivers/pci/{intel-iommu.c,intr_remapping.c}

Showing 17 changed files

Documentation/kernel-parameters.txt
... ... @@ -1006,6 +1006,7 @@
1006 1006 nomerge
1007 1007 forcesac
1008 1008 soft
  1009 + pt [x86, IA64]
1009 1010  
1010 1011 io7= [HW] IO7 for Marvel based alpha systems
1011 1012 See comment before marvel_specify_io7 in
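
The new "pt" keyword is an addition to the existing iommu= option rather than a parameter of its own, so pass-through translation is requested at boot with, for example:

        iommu=pt

(it is matched with strncmp() by the iommu= option parser in arch/x86/kernel/pci-dma.c, shown further down).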
arch/ia64/include/asm/iommu.h
... ... @@ -9,6 +9,11 @@
9 9 extern void no_iommu_init(void);
10 10 extern int force_iommu, no_iommu;
11 11 extern int iommu_detected;
  12 +#ifdef CONFIG_DMAR
  13 +extern int iommu_pass_through;
  14 +#else
  15 +#define iommu_pass_through (0)
  16 +#endif
12 17 extern void iommu_dma_init(void);
13 18 extern void machvec_init(const char *name);
14 19  
arch/ia64/kernel/pci-dma.c
... ... @@ -32,6 +32,8 @@
32 32 int force_iommu __read_mostly;
33 33 #endif
34 34  
  35 +int iommu_pass_through;
  36 +
35 37 /* Dummy device used for NULL arguments (normally ISA). Better would
36 38 be probably a smaller DMA mask, but this is bug-to-bug compatible
37 39 to i386. */
arch/ia64/kernel/pci-swiotlb.c
... ... @@ -46,7 +46,7 @@
46 46  
47 47 void __init pci_swiotlb_init(void)
48 48 {
49   - if (!iommu_detected) {
  49 + if (!iommu_detected || iommu_pass_through) {
50 50 #ifdef CONFIG_IA64_GENERIC
51 51 swiotlb = 1;
52 52 printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
arch/x86/include/asm/iommu.h
... ... @@ -6,6 +6,7 @@
6 6 extern struct dma_map_ops nommu_dma_ops;
7 7 extern int force_iommu, no_iommu;
8 8 extern int iommu_detected;
  9 +extern int iommu_pass_through;
9 10  
10 11 /* 10 seconds */
11 12 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
arch/x86/kernel/pci-dma.c
... ... @@ -32,6 +32,8 @@
32 32 /* Set this to 1 if there is a HW IOMMU in the system */
33 33 int iommu_detected __read_mostly = 0;
34 34  
  35 +int iommu_pass_through;
  36 +
35 37 dma_addr_t bad_dma_address __read_mostly = 0;
36 38 EXPORT_SYMBOL(bad_dma_address);
37 39  
... ... @@ -209,6 +211,10 @@
209 211 #ifdef CONFIG_SWIOTLB
210 212 if (!strncmp(p, "soft", 4))
211 213 swiotlb = 1;
  214 + if (!strncmp(p, "pt", 2)) {
  215 + iommu_pass_through = 1;
  216 + return 1;
  217 + }
212 218 #endif
213 219  
214 220 gart_parse_options(p);
arch/x86/kernel/pci-swiotlb.c
... ... @@ -71,7 +71,8 @@
71 71 {
72 72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73 73 #ifdef CONFIG_X86_64
74   - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
  74 + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
  75 + iommu_pass_through)
75 76 swiotlb = 1;
76 77 #endif
77 78 if (swiotlb_force)
drivers/pci/dmar.c
... ... @@ -267,6 +267,84 @@
267 267 }
268 268 return ret;
269 269 }
  270 +
  271 +static LIST_HEAD(dmar_atsr_units);
  272 +
  273 +static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
  274 +{
  275 + struct acpi_dmar_atsr *atsr;
  276 + struct dmar_atsr_unit *atsru;
  277 +
  278 + atsr = container_of(hdr, struct acpi_dmar_atsr, header);
  279 + atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
  280 + if (!atsru)
  281 + return -ENOMEM;
  282 +
  283 + atsru->hdr = hdr;
  284 + atsru->include_all = atsr->flags & 0x1;
  285 +
  286 + list_add(&atsru->list, &dmar_atsr_units);
  287 +
  288 + return 0;
  289 +}
  290 +
  291 +static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
  292 +{
  293 + int rc;
  294 + struct acpi_dmar_atsr *atsr;
  295 +
  296 + if (atsru->include_all)
  297 + return 0;
  298 +
  299 + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
  300 + rc = dmar_parse_dev_scope((void *)(atsr + 1),
  301 + (void *)atsr + atsr->header.length,
  302 + &atsru->devices_cnt, &atsru->devices,
  303 + atsr->segment);
  304 + if (rc || !atsru->devices_cnt) {
  305 + list_del(&atsru->list);
  306 + kfree(atsru);
  307 + }
  308 +
  309 + return rc;
  310 +}
  311 +
  312 +int dmar_find_matched_atsr_unit(struct pci_dev *dev)
  313 +{
  314 + int i;
  315 + struct pci_bus *bus;
  316 + struct acpi_dmar_atsr *atsr;
  317 + struct dmar_atsr_unit *atsru;
  318 +
  319 + list_for_each_entry(atsru, &dmar_atsr_units, list) {
  320 + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
  321 + if (atsr->segment == pci_domain_nr(dev->bus))
  322 + goto found;
  323 + }
  324 +
  325 + return 0;
  326 +
  327 +found:
  328 + for (bus = dev->bus; bus; bus = bus->parent) {
  329 + struct pci_dev *bridge = bus->self;
  330 +
  331 + if (!bridge || !bridge->is_pcie ||
  332 + bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
  333 + return 0;
  334 +
  335 + if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
  336 + for (i = 0; i < atsru->devices_cnt; i++)
  337 + if (atsru->devices[i] == bridge)
  338 + return 1;
  339 + break;
  340 + }
  341 + }
  342 +
  343 + if (atsru->include_all)
  344 + return 1;
  345 +
  346 + return 0;
  347 +}
270 348 #endif
271 349  
272 350 static void __init
273 351  
274 352  
275 353  
276 354  
277 355  
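
For reference, the device-scope walk in atsr_parse_dev() relies on the DMAR table layout: each ATSR entry starts with the fixed acpi_dmar_atsr fields, and its device-scope records are packed immediately after them, ending header.length bytes from the start of the entry. A minimal standalone sketch of that arithmetic (struct fields per the ACPI DMAR definitions, reproduced here with stdint types):

        #include <stdint.h>

        struct acpi_dmar_header { uint16_t type; uint16_t length; };
        struct acpi_dmar_atsr {
                struct acpi_dmar_header header;
                uint8_t  flags;                 /* bit 0: ALL_PORTS / include_all */
                uint8_t  reserved;
                uint16_t segment;
        };

        /* Bounds handed to dmar_parse_dev_scope() above. */
        static void atsr_scope_bounds(struct acpi_dmar_atsr *atsr,
                                      void **start, void **end)
        {
                *start = (void *)(atsr + 1);                    /* first byte past the fixed part */
                *end   = (uint8_t *)atsr + atsr->header.length; /* end of this DMAR sub-table     */
        }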
... ... @@ -274,22 +352,28 @@
274 352 {
275 353 struct acpi_dmar_hardware_unit *drhd;
276 354 struct acpi_dmar_reserved_memory *rmrr;
  355 + struct acpi_dmar_atsr *atsr;
277 356  
278 357 switch (header->type) {
279 358 case ACPI_DMAR_TYPE_HARDWARE_UNIT:
280   - drhd = (struct acpi_dmar_hardware_unit *)header;
  359 + drhd = container_of(header, struct acpi_dmar_hardware_unit,
  360 + header);
281 361 printk (KERN_INFO PREFIX
282   - "DRHD (flags: 0x%08x)base: 0x%016Lx\n",
283   - drhd->flags, (unsigned long long)drhd->address);
  362 + "DRHD base: %#016Lx flags: %#x\n",
  363 + (unsigned long long)drhd->address, drhd->flags);
284 364 break;
285 365 case ACPI_DMAR_TYPE_RESERVED_MEMORY:
286   - rmrr = (struct acpi_dmar_reserved_memory *)header;
287   -
  366 + rmrr = container_of(header, struct acpi_dmar_reserved_memory,
  367 + header);
288 368 printk (KERN_INFO PREFIX
289   - "RMRR base: 0x%016Lx end: 0x%016Lx\n",
  369 + "RMRR base: %#016Lx end: %#016Lx\n",
290 370 (unsigned long long)rmrr->base_address,
291 371 (unsigned long long)rmrr->end_address);
292 372 break;
  373 + case ACPI_DMAR_TYPE_ATSR:
  374 + atsr = container_of(header, struct acpi_dmar_atsr, header);
  375 + printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
  376 + break;
293 377 }
294 378 }
295 379  
... ... @@ -363,6 +447,11 @@
363 447 ret = dmar_parse_one_rmrr(entry_header);
364 448 #endif
365 449 break;
  450 + case ACPI_DMAR_TYPE_ATSR:
  451 +#ifdef CONFIG_DMAR
  452 + ret = dmar_parse_one_atsr(entry_header);
  453 +#endif
  454 + break;
366 455 default:
367 456 printk(KERN_WARNING PREFIX
368 457 "Unknown DMAR structure type\n");
369 458  
... ... @@ -431,11 +520,19 @@
431 520 #ifdef CONFIG_DMAR
432 521 {
433 522 struct dmar_rmrr_unit *rmrr, *rmrr_n;
  523 + struct dmar_atsr_unit *atsr, *atsr_n;
  524 +
434 525 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
435 526 ret = rmrr_parse_dev(rmrr);
436 527 if (ret)
437 528 return ret;
438 529 }
  530 +
  531 + list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
  532 + ret = atsr_parse_dev(atsr);
  533 + if (ret)
  534 + return ret;
  535 + }
439 536 }
440 537 #endif
441 538  
... ... @@ -468,6 +565,9 @@
468 565 #ifdef CONFIG_DMAR
469 566 if (list_empty(&dmar_rmrr_units))
470 567 printk(KERN_INFO PREFIX "No RMRR found\n");
  568 +
  569 + if (list_empty(&dmar_atsr_units))
  570 + printk(KERN_INFO PREFIX "No ATSR found\n");
471 571 #endif
472 572  
473 573 #ifdef CONFIG_INTR_REMAP
... ... @@ -515,6 +615,7 @@
515 615 u32 ver;
516 616 static int iommu_allocated = 0;
517 617 int agaw = 0;
  618 + int msagaw = 0;
518 619  
519 620 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
520 621 if (!iommu)
521 622  
... ... @@ -535,12 +636,20 @@
535 636 agaw = iommu_calculate_agaw(iommu);
536 637 if (agaw < 0) {
537 638 printk(KERN_ERR
538   - "Cannot get a valid agaw for iommu (seq_id = %d)\n",
  639 + "Cannot get a valid agaw for iommu (seq_id = %d)\n",
  640 + iommu->seq_id);
  641 + goto error;
  642 + }
  643 + msagaw = iommu_calculate_max_sagaw(iommu);
  644 + if (msagaw < 0) {
  645 + printk(KERN_ERR
  646 + "Cannot get a valid max agaw for iommu (seq_id = %d)\n",
539 647 iommu->seq_id);
540 648 goto error;
541 649 }
542 650 #endif
543 651 iommu->agaw = agaw;
  652 + iommu->msagaw = msagaw;
544 653  
545 654 /* the registers might be more than one page */
546 655 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
... ... @@ -590,7 +699,8 @@
590 699 */
591 700 static inline void reclaim_free_desc(struct q_inval *qi)
592 701 {
593   - while (qi->desc_status[qi->free_tail] == QI_DONE) {
  702 + while (qi->desc_status[qi->free_tail] == QI_DONE ||
  703 + qi->desc_status[qi->free_tail] == QI_ABORT) {
594 704 qi->desc_status[qi->free_tail] = QI_FREE;
595 705 qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
596 706 qi->free_cnt++;
597 707  
... ... @@ -600,10 +710,13 @@
600 710 static int qi_check_fault(struct intel_iommu *iommu, int index)
601 711 {
602 712 u32 fault;
603   - int head;
  713 + int head, tail;
604 714 struct q_inval *qi = iommu->qi;
605 715 int wait_index = (index + 1) % QI_LENGTH;
606 716  
  717 + if (qi->desc_status[wait_index] == QI_ABORT)
  718 + return -EAGAIN;
  719 +
607 720 fault = readl(iommu->reg + DMAR_FSTS_REG);
608 721  
609 722 /*
... ... @@ -613,7 +726,11 @@
613 726 */
614 727 if (fault & DMA_FSTS_IQE) {
615 728 head = readl(iommu->reg + DMAR_IQH_REG);
616   - if ((head >> 4) == index) {
  729 + if ((head >> DMAR_IQ_SHIFT) == index) {
  730 + printk(KERN_ERR "VT-d detected invalid descriptor: "
  731 + "low=%llx, high=%llx\n",
  732 + (unsigned long long)qi->desc[index].low,
  733 + (unsigned long long)qi->desc[index].high);
617 734 memcpy(&qi->desc[index], &qi->desc[wait_index],
618 735 sizeof(struct qi_desc));
619 736 __iommu_flush_cache(iommu, &qi->desc[index],
... ... @@ -623,6 +740,32 @@
623 740 }
624 741 }
625 742  
  743 + /*
  744 + * If ITE happens, all pending wait_desc commands are aborted.
  745 + * No new descriptors are fetched until the ITE is cleared.
  746 + */
  747 + if (fault & DMA_FSTS_ITE) {
  748 + head = readl(iommu->reg + DMAR_IQH_REG);
  749 + head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
  750 + head |= 1;
  751 + tail = readl(iommu->reg + DMAR_IQT_REG);
  752 + tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
  753 +
  754 + writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
  755 +
  756 + do {
  757 + if (qi->desc_status[head] == QI_IN_USE)
  758 + qi->desc_status[head] = QI_ABORT;
  759 + head = (head - 2 + QI_LENGTH) % QI_LENGTH;
  760 + } while (head != tail);
  761 +
  762 + if (qi->desc_status[wait_index] == QI_ABORT)
  763 + return -EAGAIN;
  764 + }
  765 +
  766 + if (fault & DMA_FSTS_ICE)
  767 + writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
  768 +
626 769 return 0;
627 770 }
628 771  
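
A standalone sketch of the index arithmetic used in the ITE path above: each invalidation descriptor is 16 bytes, so the raw DMAR_IQH_REG/DMAR_IQT_REG values become queue indices after a >> DMAR_IQ_SHIFT, and descriptors are posted as work/wait pairs by qi_submit_sync(), which is why the code rounds to an odd index with head |= 1 and the abort loop walks back two slots at a time. The QI_LENGTH value is assumed here for illustration.

        #include <stdint.h>

        #define DMAR_IQ_SHIFT   4       /* 16-byte descriptors: register offset -> index */
        #define QI_LENGTH       256     /* queue depth assumed for this sketch */

        /*
         * Given a raw head register value, return the index of the wait
         * descriptor paired with the most recently fetched descriptor.
         */
        static unsigned int wait_slot_before_head(uint32_t iqh_reg)
        {
                unsigned int idx = iqh_reg >> DMAR_IQ_SHIFT;    /* current head index  */

                idx = (idx - 1 + QI_LENGTH) % QI_LENGTH;        /* step back one entry */
                return idx | 1;                                 /* odd slot = wait descriptor */
        }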
... ... @@ -632,7 +775,7 @@
632 775 */
633 776 int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
634 777 {
635   - int rc = 0;
  778 + int rc;
636 779 struct q_inval *qi = iommu->qi;
637 780 struct qi_desc *hw, wait_desc;
638 781 int wait_index, index;
... ... @@ -643,6 +786,9 @@
643 786  
644 787 hw = qi->desc;
645 788  
  789 +restart:
  790 + rc = 0;
  791 +
646 792 spin_lock_irqsave(&qi->q_lock, flags);
647 793 while (qi->free_cnt < 3) {
648 794 spin_unlock_irqrestore(&qi->q_lock, flags);
... ... @@ -673,7 +819,7 @@
673 819 * update the HW tail register indicating the presence of
674 820 * new descriptors.
675 821 */
676   - writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG);
  822 + writel(qi->free_head << DMAR_IQ_SHIFT, iommu->reg + DMAR_IQT_REG);
677 823  
678 824 while (qi->desc_status[wait_index] != QI_DONE) {
679 825 /*
680 826  
681 827  
682 828  
... ... @@ -685,18 +831,21 @@
685 831 */
686 832 rc = qi_check_fault(iommu, index);
687 833 if (rc)
688   - goto out;
  834 + break;
689 835  
690 836 spin_unlock(&qi->q_lock);
691 837 cpu_relax();
692 838 spin_lock(&qi->q_lock);
693 839 }
694   -out:
695   - qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE;
696 840  
  841 + qi->desc_status[index] = QI_DONE;
  842 +
697 843 reclaim_free_desc(qi);
698 844 spin_unlock_irqrestore(&qi->q_lock, flags);
699 845  
  846 + if (rc == -EAGAIN)
  847 + goto restart;
  848 +
700 849 return rc;
701 850 }
702 851  
703 852  
704 853  
705 854  
706 855  
... ... @@ -714,41 +863,26 @@
714 863 qi_submit_sync(&desc, iommu);
715 864 }
716 865  
717   -int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
718   - u64 type, int non_present_entry_flush)
  866 +void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
  867 + u64 type)
719 868 {
720 869 struct qi_desc desc;
721 870  
722   - if (non_present_entry_flush) {
723   - if (!cap_caching_mode(iommu->cap))
724   - return 1;
725   - else
726   - did = 0;
727   - }
728   -
729 871 desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
730 872 | QI_CC_GRAN(type) | QI_CC_TYPE;
731 873 desc.high = 0;
732 874  
733   - return qi_submit_sync(&desc, iommu);
  875 + qi_submit_sync(&desc, iommu);
734 876 }
735 877  
736   -int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
737   - unsigned int size_order, u64 type,
738   - int non_present_entry_flush)
  878 +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
  879 + unsigned int size_order, u64 type)
739 880 {
740 881 u8 dw = 0, dr = 0;
741 882  
742 883 struct qi_desc desc;
743 884 int ih = 0;
744 885  
745   - if (non_present_entry_flush) {
746   - if (!cap_caching_mode(iommu->cap))
747   - return 1;
748   - else
749   - did = 0;
750   - }
751   -
752 886 if (cap_write_drain(iommu->cap))
753 887 dw = 1;
754 888  
755 889  
... ... @@ -760,9 +894,30 @@
760 894 desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
761 895 | QI_IOTLB_AM(size_order);
762 896  
763   - return qi_submit_sync(&desc, iommu);
  897 + qi_submit_sync(&desc, iommu);
764 898 }
765 899  
  900 +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
  901 + u64 addr, unsigned mask)
  902 +{
  903 + struct qi_desc desc;
  904 +
  905 + if (mask) {
  906 + BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
  907 + addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
  908 + desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
  909 + } else
  910 + desc.high = QI_DEV_IOTLB_ADDR(addr);
  911 +
  912 + if (qdep >= QI_DEV_IOTLB_MAX_INVS)
  913 + qdep = 0;
  914 +
  915 + desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
  916 + QI_DIOTLB_TYPE;
  917 +
  918 + qi_submit_sync(&desc, iommu);
  919 +}
  920 +
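
The address/size packing in qi_flush_dev_iotlb() follows the ATS invalidation encoding: with the size bit set, the invalidated range is described by a run of 1s in the low address bits, terminated by the first 0 bit. For example, addr 0x40000 with mask 2 (a 16KB, four-page region) encodes to desc.high 0x41001 — bit 12 set, bit 13 clear, size bit set. A standalone sketch of that computation, assuming VTD_PAGE_SHIFT is 12 and the QI_DEV_IOTLB_* encodings from include/linux/intel-iommu.h further down:

        #include <assert.h>
        #include <stdint.h>

        #define VTD_PAGE_SHIFT          12
        #define VTD_PAGE_MASK           (~((1ULL << VTD_PAGE_SHIFT) - 1))
        #define QI_DEV_IOTLB_ADDR(a)    ((uint64_t)(a) & VTD_PAGE_MASK)
        #define QI_DEV_IOTLB_SIZE       1

        /* Mirror of the desc.high computation for a 2^mask-page invalidation. */
        static uint64_t dev_iotlb_desc_high(uint64_t addr, unsigned int mask)
        {
                if (!mask)
                        return QI_DEV_IOTLB_ADDR(addr);         /* single page, size bit clear */

                /* the base must be naturally aligned to the 2^mask-page region */
                assert(!(addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1)));

                /*
                 * Set the low bits so that, after the page mask is applied,
                 * bits VTD_PAGE_SHIFT .. VTD_PAGE_SHIFT + mask - 2 remain 1:
                 * the run of 1s that encodes the size.
                 */
                addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;

                return QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
        }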
766 921 /*
767 922 * Disable Queued Invalidation interface.
768 923 */
... ... @@ -790,7 +945,6 @@
790 945 cpu_relax();
791 946  
792 947 iommu->gcmd &= ~DMA_GCMD_QIE;
793   -
794 948 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
795 949  
796 950 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
... ... @@ -804,7 +958,7 @@
804 958 */
805 959 static void __dmar_enable_qi(struct intel_iommu *iommu)
806 960 {
807   - u32 cmd, sts;
  961 + u32 sts;
808 962 unsigned long flags;
809 963 struct q_inval *qi = iommu->qi;
810 964  
811 965  
... ... @@ -818,9 +972,8 @@
818 972  
819 973 dmar_writeq(iommu->reg + DMAR_IQA_REG, virt_to_phys(qi->desc));
820 974  
821   - cmd = iommu->gcmd | DMA_GCMD_QIE;
822 975 iommu->gcmd |= DMA_GCMD_QIE;
823   - writel(cmd, iommu->reg + DMAR_GCMD_REG);
  976 + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
824 977  
825 978 /* Make sure hardware complete it */
826 979 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts);
... ... @@ -1096,7 +1249,7 @@
1096 1249 set_irq_data(irq, NULL);
1097 1250 iommu->irq = 0;
1098 1251 destroy_irq(irq);
1099   - return 0;
  1252 + return ret;
1100 1253 }
1101 1254  
1102 1255 ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
drivers/pci/intel-iommu.c
... ... @@ -53,6 +53,8 @@
53 53  
54 54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 55  
  56 +#define MAX_AGAW_WIDTH 64
  57 +
56 58 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 59  
58 60 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
... ... @@ -131,8 +133,6 @@
131 133 context->lo &= (((u64)-1) << 2) | 1;
132 134 }
133 135  
134   -#define CONTEXT_TT_MULTI_LEVEL 0
135   -
136 136 static inline void context_set_translation_type(struct context_entry *context,
137 137 unsigned long value)
138 138 {
... ... @@ -256,6 +256,7 @@
256 256 u8 bus; /* PCI bus number */
257 257 u8 devfn; /* PCI devfn number */
258 258 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
  259 + struct intel_iommu *iommu; /* IOMMU used by this device */
259 260 struct dmar_domain *domain; /* pointer to domain */
260 261 };
261 262  
262 263  
... ... @@ -401,17 +402,13 @@
401 402  
402 403 static inline int width_to_agaw(int width);
403 404  
404   -/* calculate agaw for each iommu.
405   - * "SAGAW" may be different across iommus, use a default agaw, and
406   - * get a supported less agaw for iommus that don't support the default agaw.
407   - */
408   -int iommu_calculate_agaw(struct intel_iommu *iommu)
  405 +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
409 406 {
410 407 unsigned long sagaw;
411 408 int agaw = -1;
412 409  
413 410 sagaw = cap_sagaw(iommu->cap);
414   - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
  411 + for (agaw = width_to_agaw(max_gaw);
415 412 agaw >= 0; agaw--) {
416 413 if (test_bit(agaw, &sagaw))
417 414 break;
... ... @@ -420,6 +417,24 @@
420 417 return agaw;
421 418 }
422 419  
  420 +/*
  421 + * Calculate max SAGAW for each iommu.
  422 + */
  423 +int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
  424 +{
  425 + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
  426 +}
  427 +
  428 +/*
  429 + * calculate agaw for each iommu.
  430 + * "SAGAW" may be different across iommus, use a default agaw, and
  431 + * get a supported less agaw for iommus that don't support the default agaw.
  432 + */
  433 +int iommu_calculate_agaw(struct intel_iommu *iommu)
  434 +{
  435 + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
  436 +}
  437 +
423 438 /* in native case, each domain is related to only one iommu */
424 439 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
425 440 {
... ... @@ -809,7 +824,7 @@
809 824 static void iommu_set_root_entry(struct intel_iommu *iommu)
810 825 {
811 826 void *addr;
812   - u32 cmd, sts;
  827 + u32 sts;
813 828 unsigned long flag;
814 829  
815 830 addr = iommu->root_entry;
816 831  
... ... @@ -817,12 +832,11 @@
817 832 spin_lock_irqsave(&iommu->register_lock, flag);
818 833 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
819 834  
820   - cmd = iommu->gcmd | DMA_GCMD_SRTP;
821   - writel(cmd, iommu->reg + DMAR_GCMD_REG);
  835 + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
822 836  
823 837 /* Make sure hardware complete it */
824 838 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
825   - readl, (sts & DMA_GSTS_RTPS), sts);
  839 + readl, (sts & DMA_GSTS_RTPS), sts);
826 840  
827 841 spin_unlock_irqrestore(&iommu->register_lock, flag);
828 842 }
829 843  
830 844  
831 845  
832 846  
... ... @@ -834,39 +848,25 @@
834 848  
835 849 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
836 850 return;
837   - val = iommu->gcmd | DMA_GCMD_WBF;
838 851  
839 852 spin_lock_irqsave(&iommu->register_lock, flag);
840   - writel(val, iommu->reg + DMAR_GCMD_REG);
  853 + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
841 854  
842 855 /* Make sure hardware complete it */
843 856 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
844   - readl, (!(val & DMA_GSTS_WBFS)), val);
  857 + readl, (!(val & DMA_GSTS_WBFS)), val);
845 858  
846 859 spin_unlock_irqrestore(&iommu->register_lock, flag);
847 860 }
848 861  
849 862 /* return value determine if we need a write buffer flush */
850   -static int __iommu_flush_context(struct intel_iommu *iommu,
851   - u16 did, u16 source_id, u8 function_mask, u64 type,
852   - int non_present_entry_flush)
  863 +static void __iommu_flush_context(struct intel_iommu *iommu,
  864 + u16 did, u16 source_id, u8 function_mask,
  865 + u64 type)
853 866 {
854 867 u64 val = 0;
855 868 unsigned long flag;
856 869  
857   - /*
858   - * In the non-present entry flush case, if hardware doesn't cache
859   - * non-present entry we do nothing and if hardware cache non-present
860   - * entry, we flush entries of domain 0 (the domain id is used to cache
861   - * any non-present entries)
862   - */
863   - if (non_present_entry_flush) {
864   - if (!cap_caching_mode(iommu->cap))
865   - return 1;
866   - else
867   - did = 0;
868   - }
869   -
870 870 switch (type) {
871 871 case DMA_CCMD_GLOBAL_INVL:
872 872 val = DMA_CCMD_GLOBAL_INVL;
873 873  
874 874  
... ... @@ -891,33 +891,16 @@
891 891 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
892 892  
893 893 spin_unlock_irqrestore(&iommu->register_lock, flag);
894   -
895   - /* flush context entry will implicitly flush write buffer */
896   - return 0;
897 894 }
898 895  
899 896 /* return value determine if we need a write buffer flush */
900   -static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
901   - u64 addr, unsigned int size_order, u64 type,
902   - int non_present_entry_flush)
  897 +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
  898 + u64 addr, unsigned int size_order, u64 type)
903 899 {
904 900 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
905 901 u64 val = 0, val_iva = 0;
906 902 unsigned long flag;
907 903  
908   - /*
909   - * In the non-present entry flush case, if hardware doesn't cache
910   - * non-present entry we do nothing and if hardware cache non-present
911   - * entry, we flush entries of domain 0 (the domain id is used to cache
912   - * any non-present entries)
913   - */
914   - if (non_present_entry_flush) {
915   - if (!cap_caching_mode(iommu->cap))
916   - return 1;
917   - else
918   - did = 0;
919   - }
920   -
921 904 switch (type) {
922 905 case DMA_TLB_GLOBAL_FLUSH:
923 906 /* global flush doesn't need set IVA_REG */
924 907  
925 908  
926 909  
927 910  
928 911  
929 912  
... ... @@ -965,37 +948,101 @@
965 948 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
966 949 (unsigned long long)DMA_TLB_IIRG(type),
967 950 (unsigned long long)DMA_TLB_IAIG(val));
968   - /* flush iotlb entry will implicitly flush write buffer */
969   - return 0;
970 951 }
971 952  
972   -static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
973   - u64 addr, unsigned int pages, int non_present_entry_flush)
  953 +static struct device_domain_info *iommu_support_dev_iotlb(
  954 + struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
974 955 {
975   - unsigned int mask;
  956 + int found = 0;
  957 + unsigned long flags;
  958 + struct device_domain_info *info;
  959 + struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
976 960  
  961 + if (!ecap_dev_iotlb_support(iommu->ecap))
  962 + return NULL;
  963 +
  964 + if (!iommu->qi)
  965 + return NULL;
  966 +
  967 + spin_lock_irqsave(&device_domain_lock, flags);
  968 + list_for_each_entry(info, &domain->devices, link)
  969 + if (info->bus == bus && info->devfn == devfn) {
  970 + found = 1;
  971 + break;
  972 + }
  973 + spin_unlock_irqrestore(&device_domain_lock, flags);
  974 +
  975 + if (!found || !info->dev)
  976 + return NULL;
  977 +
  978 + if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
  979 + return NULL;
  980 +
  981 + if (!dmar_find_matched_atsr_unit(info->dev))
  982 + return NULL;
  983 +
  984 + info->iommu = iommu;
  985 +
  986 + return info;
  987 +}
  988 +
  989 +static void iommu_enable_dev_iotlb(struct device_domain_info *info)
  990 +{
  991 + if (!info)
  992 + return;
  993 +
  994 + pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
  995 +}
  996 +
  997 +static void iommu_disable_dev_iotlb(struct device_domain_info *info)
  998 +{
  999 + if (!info->dev || !pci_ats_enabled(info->dev))
  1000 + return;
  1001 +
  1002 + pci_disable_ats(info->dev);
  1003 +}
  1004 +
  1005 +static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
  1006 + u64 addr, unsigned mask)
  1007 +{
  1008 + u16 sid, qdep;
  1009 + unsigned long flags;
  1010 + struct device_domain_info *info;
  1011 +
  1012 + spin_lock_irqsave(&device_domain_lock, flags);
  1013 + list_for_each_entry(info, &domain->devices, link) {
  1014 + if (!info->dev || !pci_ats_enabled(info->dev))
  1015 + continue;
  1016 +
  1017 + sid = info->bus << 8 | info->devfn;
  1018 + qdep = pci_ats_queue_depth(info->dev);
  1019 + qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
  1020 + }
  1021 + spin_unlock_irqrestore(&device_domain_lock, flags);
  1022 +}
  1023 +
  1024 +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
  1025 + u64 addr, unsigned int pages)
  1026 +{
  1027 + unsigned int mask = ilog2(__roundup_pow_of_two(pages));
  1028 +
977 1029 BUG_ON(addr & (~VTD_PAGE_MASK));
978 1030 BUG_ON(pages == 0);
979 1031  
980   - /* Fallback to domain selective flush if no PSI support */
981   - if (!cap_pgsel_inv(iommu->cap))
982   - return iommu->flush.flush_iotlb(iommu, did, 0, 0,
983   - DMA_TLB_DSI_FLUSH,
984   - non_present_entry_flush);
985   -
986 1032 /*
  1033 + * Fallback to domain selective flush if no PSI support or the size is
  1034 + * too big.
987 1035 * PSI requires page size to be 2 ^ x, and the base address is naturally
988 1036 * aligned to the size
989 1037 */
990   - mask = ilog2(__roundup_pow_of_two(pages));
991   - /* Fallback to domain selective flush if size is too big */
992   - if (mask > cap_max_amask_val(iommu->cap))
993   - return iommu->flush.flush_iotlb(iommu, did, 0, 0,
994   - DMA_TLB_DSI_FLUSH, non_present_entry_flush);
995   -
996   - return iommu->flush.flush_iotlb(iommu, did, addr, mask,
997   - DMA_TLB_PSI_FLUSH,
998   - non_present_entry_flush);
  1038 + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
  1039 + iommu->flush.flush_iotlb(iommu, did, 0, 0,
  1040 + DMA_TLB_DSI_FLUSH);
  1041 + else
  1042 + iommu->flush.flush_iotlb(iommu, did, addr, mask,
  1043 + DMA_TLB_PSI_FLUSH);
  1044 + if (did)
  1045 + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
999 1046 }
1000 1047  
1001 1048 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1002 1049  
1003 1050  
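
The mask computed at the top of iommu_flush_iotlb_psi() converts an arbitrary page count into the address-mask order used by both page-selective IOTLB invalidation and the new device-IOTLB flush: the request is rounded up to the next power of two, so e.g. 5 pages become mask 3, an invalidation of 8 naturally aligned pages. A standalone sketch of just that step (helper name chosen for illustration):

        #include <stdio.h>

        /* Equivalent of the ilog2(__roundup_pow_of_two(pages)) used above. */
        static unsigned int psi_mask_order(unsigned long pages)
        {
                unsigned int mask = 0;

                while ((1UL << mask) < pages)
                        mask++;
                return mask;
        }

        int main(void)
        {
                printf("pages=1 -> mask=%u\n", psi_mask_order(1));  /* 0: a single page */
                printf("pages=5 -> mask=%u\n", psi_mask_order(5));  /* 3: 8-page region */
                return 0;
        }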
... ... @@ -1021,13 +1068,13 @@
1021 1068 unsigned long flags;
1022 1069  
1023 1070 spin_lock_irqsave(&iommu->register_lock, flags);
1024   - writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
  1071 + iommu->gcmd |= DMA_GCMD_TE;
  1072 + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1025 1073  
1026 1074 /* Make sure hardware complete it */
1027 1075 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1028   - readl, (sts & DMA_GSTS_TES), sts);
  1076 + readl, (sts & DMA_GSTS_TES), sts);
1029 1077  
1030   - iommu->gcmd |= DMA_GCMD_TE;
1031 1078 spin_unlock_irqrestore(&iommu->register_lock, flags);
1032 1079 return 0;
1033 1080 }
... ... @@ -1043,7 +1090,7 @@
1043 1090  
1044 1091 /* Make sure hardware complete it */
1045 1092 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1046   - readl, (!(sts & DMA_GSTS_TES)), sts);
  1093 + readl, (!(sts & DMA_GSTS_TES)), sts);
1047 1094  
1048 1095 spin_unlock_irqrestore(&iommu->register_lock, flag);
1049 1096 return 0;
... ... @@ -1325,8 +1372,8 @@
1325 1372 free_domain_mem(domain);
1326 1373 }
1327 1374  
1328   -static int domain_context_mapping_one(struct dmar_domain *domain,
1329   - int segment, u8 bus, u8 devfn)
  1375 +static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
  1376 + u8 bus, u8 devfn, int translation)
1330 1377 {
1331 1378 struct context_entry *context;
1332 1379 unsigned long flags;
1333 1380  
1334 1381  
... ... @@ -1336,10 +1383,14 @@
1336 1383 unsigned long ndomains;
1337 1384 int id;
1338 1385 int agaw;
  1386 + struct device_domain_info *info = NULL;
1339 1387  
1340 1388 pr_debug("Set context mapping for %02x:%02x.%d\n",
1341 1389 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
  1390 +
1342 1391 BUG_ON(!domain->pgd);
  1392 + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
  1393 + translation != CONTEXT_TT_MULTI_LEVEL);
1343 1394  
1344 1395 iommu = device_to_iommu(segment, bus, devfn);
1345 1396 if (!iommu)
1346 1397  
1347 1398  
... ... @@ -1399,21 +1450,44 @@
1399 1450 }
1400 1451  
1401 1452 context_set_domain_id(context, id);
1402   - context_set_address_width(context, iommu->agaw);
1403   - context_set_address_root(context, virt_to_phys(pgd));
1404   - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
  1453 +
  1454 + if (translation != CONTEXT_TT_PASS_THROUGH) {
  1455 + info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
  1456 + translation = info ? CONTEXT_TT_DEV_IOTLB :
  1457 + CONTEXT_TT_MULTI_LEVEL;
  1458 + }
  1459 + /*
  1460 + * In pass through mode, AW must be programmed to indicate the largest
  1461 + * AGAW value supported by hardware. And ASR is ignored by hardware.
  1462 + */
  1463 + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
  1464 + context_set_address_width(context, iommu->msagaw);
  1465 + else {
  1466 + context_set_address_root(context, virt_to_phys(pgd));
  1467 + context_set_address_width(context, iommu->agaw);
  1468 + }
  1469 +
  1470 + context_set_translation_type(context, translation);
1405 1471 context_set_fault_enable(context);
1406 1472 context_set_present(context);
1407 1473 domain_flush_cache(domain, context, sizeof(*context));
1408 1474  
1409   - /* it's a non-present to present mapping */
1410   - if (iommu->flush.flush_context(iommu, domain->id,
1411   - (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1412   - DMA_CCMD_DEVICE_INVL, 1))
  1475 + /*
  1476 + * It's a non-present to present mapping. If hardware doesn't cache
  1477 + * non-present entry we only need to flush the write-buffer. If the
  1478 + * _does_ cache non-present entries, then it does so in the special
  1479 + * domain #0, which we have to flush:
  1480 + */
  1481 + if (cap_caching_mode(iommu->cap)) {
  1482 + iommu->flush.flush_context(iommu, 0,
  1483 + (((u16)bus) << 8) | devfn,
  1484 + DMA_CCMD_MASK_NOBIT,
  1485 + DMA_CCMD_DEVICE_INVL);
  1486 + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
  1487 + } else {
1413 1488 iommu_flush_write_buffer(iommu);
1414   - else
1415   - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1416   -
  1489 + }
  1490 + iommu_enable_dev_iotlb(info);
1417 1491 spin_unlock_irqrestore(&iommu->lock, flags);
1418 1492  
1419 1493 spin_lock_irqsave(&domain->iommu_lock, flags);
1420 1494  
... ... @@ -1426,13 +1500,15 @@
1426 1500 }
1427 1501  
1428 1502 static int
1429   -domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
  1503 +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
  1504 + int translation)
1430 1505 {
1431 1506 int ret;
1432 1507 struct pci_dev *tmp, *parent;
1433 1508  
1434 1509 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1435   - pdev->bus->number, pdev->devfn);
  1510 + pdev->bus->number, pdev->devfn,
  1511 + translation);
1436 1512 if (ret)
1437 1513 return ret;
1438 1514  
... ... @@ -1446,7 +1522,7 @@
1446 1522 ret = domain_context_mapping_one(domain,
1447 1523 pci_domain_nr(parent->bus),
1448 1524 parent->bus->number,
1449   - parent->devfn);
  1525 + parent->devfn, translation);
1450 1526 if (ret)
1451 1527 return ret;
1452 1528 parent = parent->bus->self;
1453 1529  
... ... @@ -1454,12 +1530,14 @@
1454 1530 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1455 1531 return domain_context_mapping_one(domain,
1456 1532 pci_domain_nr(tmp->subordinate),
1457   - tmp->subordinate->number, 0);
  1533 + tmp->subordinate->number, 0,
  1534 + translation);
1458 1535 else /* this is a legacy PCI bridge */
1459 1536 return domain_context_mapping_one(domain,
1460 1537 pci_domain_nr(tmp->bus),
1461 1538 tmp->bus->number,
1462   - tmp->devfn);
  1539 + tmp->devfn,
  1540 + translation);
1463 1541 }
1464 1542  
1465 1543 static int domain_context_mapped(struct pci_dev *pdev)
... ... @@ -1540,9 +1618,8 @@
1540 1618  
1541 1619 clear_context_table(iommu, bus, devfn);
1542 1620 iommu->flush.flush_context(iommu, 0, 0, 0,
1543   - DMA_CCMD_GLOBAL_INVL, 0);
1544   - iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1545   - DMA_TLB_GLOBAL_FLUSH, 0);
  1621 + DMA_CCMD_GLOBAL_INVL);
  1622 + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1546 1623 }
1547 1624  
1548 1625 static void domain_remove_dev_info(struct dmar_domain *domain)
... ... @@ -1561,6 +1638,7 @@
1561 1638 info->dev->dev.archdata.iommu = NULL;
1562 1639 spin_unlock_irqrestore(&device_domain_lock, flags);
1563 1640  
  1641 + iommu_disable_dev_iotlb(info);
1564 1642 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1565 1643 iommu_detach_dev(iommu, info->bus, info->devfn);
1566 1644 free_devinfo_mem(info);
... ... @@ -1756,7 +1834,7 @@
1756 1834 goto error;
1757 1835  
1758 1836 /* context entry init */
1759   - ret = domain_context_mapping(domain, pdev);
  1837 + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1760 1838 if (!ret)
1761 1839 return 0;
1762 1840 error:
... ... @@ -1857,6 +1935,23 @@
1857 1935 }
1858 1936 #endif /* !CONFIG_DMAR_FLPY_WA */
1859 1937  
  1938 +/* Initialize each context entry as pass through.*/
  1939 +static int __init init_context_pass_through(void)
  1940 +{
  1941 + struct pci_dev *pdev = NULL;
  1942 + struct dmar_domain *domain;
  1943 + int ret;
  1944 +
  1945 + for_each_pci_dev(pdev) {
  1946 + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
  1947 + ret = domain_context_mapping(domain, pdev,
  1948 + CONTEXT_TT_PASS_THROUGH);
  1949 + if (ret)
  1950 + return ret;
  1951 + }
  1952 + return 0;
  1953 +}
  1954 +
1860 1955 static int __init init_dmars(void)
1861 1956 {
1862 1957 struct dmar_drhd_unit *drhd;
... ... @@ -1864,6 +1959,7 @@
1864 1959 struct pci_dev *pdev;
1865 1960 struct intel_iommu *iommu;
1866 1961 int i, ret;
  1962 + int pass_through = 1;
1867 1963  
1868 1964 /*
1869 1965 * for each drhd
1870 1966  
... ... @@ -1917,7 +2013,15 @@
1917 2013 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1918 2014 goto error;
1919 2015 }
  2016 + if (!ecap_pass_through(iommu->ecap))
  2017 + pass_through = 0;
1920 2018 }
  2019 + if (iommu_pass_through)
  2020 + if (!pass_through) {
  2021 + printk(KERN_INFO
  2022 + "Pass Through is not supported by hardware.\n");
  2023 + iommu_pass_through = 0;
  2024 + }
1921 2025  
1922 2026 /*
1923 2027 * Start from the sane iommu hardware state.
1924 2028  
1925 2029  
1926 2030  
1927 2031  
... ... @@ -1973,36 +2077,57 @@
1973 2077 }
1974 2078  
1975 2079 /*
1976   - * For each rmrr
1977   - * for each dev attached to rmrr
1978   - * do
1979   - * locate drhd for dev, alloc domain for dev
1980   - * allocate free domain
1981   - * allocate page table entries for rmrr
1982   - * if context not allocated for bus
1983   - * allocate and init context
1984   - * set present in root table for this bus
1985   - * init context with domain, translation etc
1986   - * endfor
1987   - * endfor
  2080 + * If pass through is set and enabled, context entries of all pci
  2081 + * devices are intialized by pass through translation type.
1988 2082 */
1989   - for_each_rmrr_units(rmrr) {
1990   - for (i = 0; i < rmrr->devices_cnt; i++) {
1991   - pdev = rmrr->devices[i];
1992   - /* some BIOS lists non-exist devices in DMAR table */
1993   - if (!pdev)
1994   - continue;
1995   - ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1996   - if (ret)
1997   - printk(KERN_ERR
1998   - "IOMMU: mapping reserved region failed\n");
  2083 + if (iommu_pass_through) {
  2084 + ret = init_context_pass_through();
  2085 + if (ret) {
  2086 + printk(KERN_ERR "IOMMU: Pass through init failed.\n");
  2087 + iommu_pass_through = 0;
1999 2088 }
2000 2089 }
2001 2090  
2002   - iommu_prepare_gfx_mapping();
  2091 + /*
  2092 + * If pass through is not set or not enabled, setup context entries for
  2093 + * identity mappings for rmrr, gfx, and isa.
  2094 + */
  2095 + if (!iommu_pass_through) {
  2096 + /*
  2097 + * For each rmrr
  2098 + * for each dev attached to rmrr
  2099 + * do
  2100 + * locate drhd for dev, alloc domain for dev
  2101 + * allocate free domain
  2102 + * allocate page table entries for rmrr
  2103 + * if context not allocated for bus
  2104 + * allocate and init context
  2105 + * set present in root table for this bus
  2106 + * init context with domain, translation etc
  2107 + * endfor
  2108 + * endfor
  2109 + */
  2110 + for_each_rmrr_units(rmrr) {
  2111 + for (i = 0; i < rmrr->devices_cnt; i++) {
  2112 + pdev = rmrr->devices[i];
  2113 + /*
  2114 + * some BIOS lists non-exist devices in DMAR
  2115 + * table.
  2116 + */
  2117 + if (!pdev)
  2118 + continue;
  2119 + ret = iommu_prepare_rmrr_dev(rmrr, pdev);
  2120 + if (ret)
  2121 + printk(KERN_ERR
  2122 + "IOMMU: mapping reserved region failed\n");
  2123 + }
  2124 + }
2003 2125  
2004   - iommu_prepare_isa();
  2126 + iommu_prepare_gfx_mapping();
2005 2127  
  2128 + iommu_prepare_isa();
  2129 + }
  2130 +
2006 2131 /*
2007 2132 * for each drhd
2008 2133 * enable fault log
... ... @@ -2023,10 +2148,8 @@
2023 2148  
2024 2149 iommu_set_root_entry(iommu);
2025 2150  
2026   - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2027   - 0);
2028   - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2029   - 0);
  2151 + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
  2152 + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2030 2153 iommu_disable_protect_mem_regions(iommu);
2031 2154  
2032 2155 ret = iommu_enable_translation(iommu);
... ... @@ -2112,7 +2235,8 @@
2112 2235  
2113 2236 /* make sure context mapping is ok */
2114 2237 if (unlikely(!domain_context_mapped(pdev))) {
2115   - ret = domain_context_mapping(domain, pdev);
  2238 + ret = domain_context_mapping(domain, pdev,
  2239 + CONTEXT_TT_MULTI_LEVEL);
2116 2240 if (ret) {
2117 2241 printk(KERN_ERR
2118 2242 "Domain context map for %s failed",
... ... @@ -2173,10 +2297,11 @@
2173 2297 if (ret)
2174 2298 goto error;
2175 2299  
2176   - /* it's a non-present to present mapping */
2177   - ret = iommu_flush_iotlb_psi(iommu, domain->id,
2178   - start_paddr, size >> VTD_PAGE_SHIFT, 1);
2179   - if (ret)
  2300 + /* it's a non-present to present mapping. Only flush if caching mode */
  2301 + if (cap_caching_mode(iommu->cap))
  2302 + iommu_flush_iotlb_psi(iommu, 0, start_paddr,
  2303 + size >> VTD_PAGE_SHIFT);
  2304 + else
2180 2305 iommu_flush_write_buffer(iommu);
2181 2306  
2182 2307 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2183 2308  
... ... @@ -2210,15 +2335,22 @@
2210 2335 if (!iommu)
2211 2336 continue;
2212 2337  
2213   - if (deferred_flush[i].next) {
2214   - iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2215   - DMA_TLB_GLOBAL_FLUSH, 0);
2216   - for (j = 0; j < deferred_flush[i].next; j++) {
2217   - __free_iova(&deferred_flush[i].domain[j]->iovad,
2218   - deferred_flush[i].iova[j]);
2219   - }
2220   - deferred_flush[i].next = 0;
  2338 + if (!deferred_flush[i].next)
  2339 + continue;
  2340 +
  2341 + iommu->flush.flush_iotlb(iommu, 0, 0, 0,
  2342 + DMA_TLB_GLOBAL_FLUSH);
  2343 + for (j = 0; j < deferred_flush[i].next; j++) {
  2344 + unsigned long mask;
  2345 + struct iova *iova = deferred_flush[i].iova[j];
  2346 +
  2347 + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT;
  2348 + mask = ilog2(mask >> VTD_PAGE_SHIFT);
  2349 + iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
  2350 + iova->pfn_lo << PAGE_SHIFT, mask);
  2351 + __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2221 2352 }
  2353 + deferred_flush[i].next = 0;
2222 2354 }
2223 2355  
2224 2356 list_size = 0;
... ... @@ -2291,9 +2423,8 @@
2291 2423 /* free page tables */
2292 2424 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2293 2425 if (intel_iommu_strict) {
2294   - if (iommu_flush_iotlb_psi(iommu,
2295   - domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2296   - iommu_flush_write_buffer(iommu);
  2426 + iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
  2427 + size >> VTD_PAGE_SHIFT);
2297 2428 /* free iova */
2298 2429 __free_iova(&domain->iovad, iova);
2299 2430 } else {
... ... @@ -2384,9 +2515,8 @@
2384 2515 /* free page tables */
2385 2516 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2386 2517  
2387   - if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2388   - size >> VTD_PAGE_SHIFT, 0))
2389   - iommu_flush_write_buffer(iommu);
  2518 + iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
  2519 + size >> VTD_PAGE_SHIFT);
2390 2520  
2391 2521 /* free iova */
2392 2522 __free_iova(&domain->iovad, iova);
2393 2523  
... ... @@ -2478,10 +2608,13 @@
2478 2608 offset += size;
2479 2609 }
2480 2610  
2481   - /* it's a non-present to present mapping */
2482   - if (iommu_flush_iotlb_psi(iommu, domain->id,
2483   - start_addr, offset >> VTD_PAGE_SHIFT, 1))
  2611 + /* it's a non-present to present mapping. Only flush if caching mode */
  2612 + if (cap_caching_mode(iommu->cap))
  2613 + iommu_flush_iotlb_psi(iommu, 0, start_addr,
  2614 + offset >> VTD_PAGE_SHIFT);
  2615 + else
2484 2616 iommu_flush_write_buffer(iommu);
  2617 +
2485 2618 return nelems;
2486 2619 }
2487 2620  
2488 2621  
... ... @@ -2640,9 +2773,9 @@
2640 2773 iommu_set_root_entry(iommu);
2641 2774  
2642 2775 iommu->flush.flush_context(iommu, 0, 0, 0,
2643   - DMA_CCMD_GLOBAL_INVL, 0);
  2776 + DMA_CCMD_GLOBAL_INVL);
2644 2777 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2645   - DMA_TLB_GLOBAL_FLUSH, 0);
  2778 + DMA_TLB_GLOBAL_FLUSH);
2646 2779 iommu_disable_protect_mem_regions(iommu);
2647 2780 iommu_enable_translation(iommu);
2648 2781 }
2649 2782  
... ... @@ -2657,9 +2790,9 @@
2657 2790  
2658 2791 for_each_active_iommu(iommu, drhd) {
2659 2792 iommu->flush.flush_context(iommu, 0, 0, 0,
2660   - DMA_CCMD_GLOBAL_INVL, 0);
  2793 + DMA_CCMD_GLOBAL_INVL);
2661 2794 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2662   - DMA_TLB_GLOBAL_FLUSH, 0);
  2795 + DMA_TLB_GLOBAL_FLUSH);
2663 2796 }
2664 2797 }
2665 2798  
... ... @@ -2782,7 +2915,7 @@
2782 2915 * Check the need for DMA-remapping initialization now.
2783 2916 * Above initialization will also be used by Interrupt-remapping.
2784 2917 */
2785   - if (no_iommu || swiotlb || dmar_disabled)
  2918 + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
2786 2919 return -ENODEV;
2787 2920  
2788 2921 iommu_init_mempool();
... ... @@ -2802,7 +2935,15 @@
2802 2935  
2803 2936 init_timer(&unmap_timer);
2804 2937 force_iommu = 1;
2805   - dma_ops = &intel_dma_ops;
  2938 +
  2939 + if (!iommu_pass_through) {
  2940 + printk(KERN_INFO
  2941 + "Multi-level page-table translation for DMAR.\n");
  2942 + dma_ops = &intel_dma_ops;
  2943 + } else
  2944 + printk(KERN_INFO
  2945 + "DMAR: Pass through translation for DMAR.\n");
  2946 +
2806 2947 init_iommu_sysfs();
2807 2948  
2808 2949 register_iommu(&intel_iommu_ops);
... ... @@ -2888,6 +3029,7 @@
2888 3029 info->dev->dev.archdata.iommu = NULL;
2889 3030 spin_unlock_irqrestore(&device_domain_lock, flags);
2890 3031  
  3032 + iommu_disable_dev_iotlb(info);
2891 3033 iommu_detach_dev(iommu, info->bus, info->devfn);
2892 3034 iommu_detach_dependent_devices(iommu, pdev);
2893 3035 free_devinfo_mem(info);
... ... @@ -2938,6 +3080,7 @@
2938 3080  
2939 3081 spin_unlock_irqrestore(&device_domain_lock, flags1);
2940 3082  
  3083 + iommu_disable_dev_iotlb(info);
2941 3084 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2942 3085 iommu_detach_dev(iommu, info->bus, info->devfn);
2943 3086 iommu_detach_dependent_devices(iommu, info->dev);
2944 3087  
... ... @@ -3142,11 +3285,11 @@
3142 3285 return -EFAULT;
3143 3286 }
3144 3287  
3145   - ret = domain_context_mapping(dmar_domain, pdev);
  3288 + ret = vm_domain_add_dev_info(dmar_domain, pdev);
3146 3289 if (ret)
3147 3290 return ret;
3148 3291  
3149   - ret = vm_domain_add_dev_info(dmar_domain, pdev);
  3292 + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3150 3293 return ret;
3151 3294 }
3152 3295  
drivers/pci/intr_remapping.c
... ... @@ -409,7 +409,7 @@
409 409 static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
410 410 {
411 411 u64 addr;
412   - u32 cmd, sts;
  412 + u32 sts;
413 413 unsigned long flags;
414 414  
415 415 addr = virt_to_phys((void *)iommu->ir_table->base);
416 416  
... ... @@ -420,9 +420,8 @@
420 420 (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
421 421  
422 422 /* Set interrupt-remapping table pointer */
423   - cmd = iommu->gcmd | DMA_GCMD_SIRTP;
424 423 iommu->gcmd |= DMA_GCMD_SIRTP;
425   - writel(cmd, iommu->reg + DMAR_GCMD_REG);
  424 + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
426 425  
427 426 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
428 427 readl, (sts & DMA_GSTS_IRTPS), sts);
429 428  
... ... @@ -437,9 +436,8 @@
437 436 spin_lock_irqsave(&iommu->register_lock, flags);
438 437  
439 438 /* Enable interrupt-remapping */
440   - cmd = iommu->gcmd | DMA_GCMD_IRE;
441 439 iommu->gcmd |= DMA_GCMD_IRE;
442   - writel(cmd, iommu->reg + DMAR_GCMD_REG);
  440 + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
443 441  
444 442 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
445 443 readl, (sts & DMA_GSTS_IRES), sts);
drivers/pci/iov.c
... ... @@ -5,6 +5,7 @@
5 5 *
6 6 * PCI Express I/O Virtualization (IOV) support.
7 7 * Single Root IOV 1.0
  8 + * Address Translation Service 1.0
8 9 */
9 10  
10 11 #include <linux/pci.h>
11 12  
12 13  
... ... @@ -492,11 +493,11 @@
492 493  
493 494 if (pdev)
494 495 iov->dev = pci_dev_get(pdev);
495   - else {
  496 + else
496 497 iov->dev = dev;
497   - mutex_init(&iov->lock);
498   - }
499 498  
  499 + mutex_init(&iov->lock);
  500 +
500 501 dev->sriov = iov;
501 502 dev->is_physfn = 1;
502 503  
503 504  
... ... @@ -515,11 +516,11 @@
515 516 {
516 517 BUG_ON(dev->sriov->nr_virtfn);
517 518  
518   - if (dev == dev->sriov->dev)
519   - mutex_destroy(&dev->sriov->lock);
520   - else
  519 + if (dev != dev->sriov->dev)
521 520 pci_dev_put(dev->sriov->dev);
522 521  
  522 + mutex_destroy(&dev->sriov->lock);
  523 +
523 524 kfree(dev->sriov);
524 525 dev->sriov = NULL;
525 526 }
... ... @@ -681,4 +682,146 @@
681 682 return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE;
682 683 }
683 684 EXPORT_SYMBOL_GPL(pci_sriov_migration);
  685 +
  686 +static int ats_alloc_one(struct pci_dev *dev, int ps)
  687 +{
  688 + int pos;
  689 + u16 cap;
  690 + struct pci_ats *ats;
  691 +
  692 + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
  693 + if (!pos)
  694 + return -ENODEV;
  695 +
  696 + ats = kzalloc(sizeof(*ats), GFP_KERNEL);
  697 + if (!ats)
  698 + return -ENOMEM;
  699 +
  700 + ats->pos = pos;
  701 + ats->stu = ps;
  702 + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
  703 + ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
  704 + PCI_ATS_MAX_QDEP;
  705 + dev->ats = ats;
  706 +
  707 + return 0;
  708 +}
  709 +
  710 +static void ats_free_one(struct pci_dev *dev)
  711 +{
  712 + kfree(dev->ats);
  713 + dev->ats = NULL;
  714 +}
  715 +
  716 +/**
  717 + * pci_enable_ats - enable the ATS capability
  718 + * @dev: the PCI device
  719 + * @ps: the IOMMU page shift
  720 + *
  721 + * Returns 0 on success, or negative on failure.
  722 + */
  723 +int pci_enable_ats(struct pci_dev *dev, int ps)
  724 +{
  725 + int rc;
  726 + u16 ctrl;
  727 +
  728 + BUG_ON(dev->ats && dev->ats->is_enabled);
  729 +
  730 + if (ps < PCI_ATS_MIN_STU)
  731 + return -EINVAL;
  732 +
  733 + if (dev->is_physfn || dev->is_virtfn) {
  734 + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
  735 +
  736 + mutex_lock(&pdev->sriov->lock);
  737 + if (pdev->ats)
  738 + rc = pdev->ats->stu == ps ? 0 : -EINVAL;
  739 + else
  740 + rc = ats_alloc_one(pdev, ps);
  741 +
  742 + if (!rc)
  743 + pdev->ats->ref_cnt++;
  744 + mutex_unlock(&pdev->sriov->lock);
  745 + if (rc)
  746 + return rc;
  747 + }
  748 +
  749 + if (!dev->is_physfn) {
  750 + rc = ats_alloc_one(dev, ps);
  751 + if (rc)
  752 + return rc;
  753 + }
  754 +
  755 + ctrl = PCI_ATS_CTRL_ENABLE;
  756 + if (!dev->is_virtfn)
  757 + ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU);
  758 + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
  759 +
  760 + dev->ats->is_enabled = 1;
  761 +
  762 + return 0;
  763 +}
  764 +
  765 +/**
  766 + * pci_disable_ats - disable the ATS capability
  767 + * @dev: the PCI device
  768 + */
  769 +void pci_disable_ats(struct pci_dev *dev)
  770 +{
  771 + u16 ctrl;
  772 +
  773 + BUG_ON(!dev->ats || !dev->ats->is_enabled);
  774 +
  775 + pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl);
  776 + ctrl &= ~PCI_ATS_CTRL_ENABLE;
  777 + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl);
  778 +
  779 + dev->ats->is_enabled = 0;
  780 +
  781 + if (dev->is_physfn || dev->is_virtfn) {
  782 + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn;
  783 +
  784 + mutex_lock(&pdev->sriov->lock);
  785 + pdev->ats->ref_cnt--;
  786 + if (!pdev->ats->ref_cnt)
  787 + ats_free_one(pdev);
  788 + mutex_unlock(&pdev->sriov->lock);
  789 + }
  790 +
  791 + if (!dev->is_physfn)
  792 + ats_free_one(dev);
  793 +}
  794 +
  795 +/**
  796 + * pci_ats_queue_depth - query the ATS Invalidate Queue Depth
  797 + * @dev: the PCI device
  798 + *
  799 + * Returns the queue depth on success, or negative on failure.
  800 + *
  801 + * The ATS spec uses 0 in the Invalidate Queue Depth field to
  802 + * indicate that the function can accept 32 Invalidate Request.
  803 + * But here we use the `real' values (i.e. 1~32) for the Queue
  804 + * Depth; and 0 indicates the function shares the Queue with
  805 + * other functions (doesn't exclusively own a Queue).
  806 + */
  807 +int pci_ats_queue_depth(struct pci_dev *dev)
  808 +{
  809 + int pos;
  810 + u16 cap;
  811 +
  812 + if (dev->is_virtfn)
  813 + return 0;
  814 +
  815 + if (dev->ats)
  816 + return dev->ats->qdep;
  817 +
  818 + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
  819 + if (!pos)
  820 + return -ENODEV;
  821 +
  822 + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
  823 +
  824 + return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) :
  825 + PCI_ATS_MAX_QDEP;
  826 +}
drivers/pci/pci.h
... ... @@ -229,6 +229,15 @@
229 229 u8 __iomem *mstate; /* VF Migration State Array */
230 230 };
231 231  
  232 +/* Address Translation Service */
  233 +struct pci_ats {
  234 + int pos; /* capability position */
  235 + int stu; /* Smallest Translation Unit */
  236 + int qdep; /* Invalidate Queue Depth */
  237 + int ref_cnt; /* Physical Function reference count */
  238 + int is_enabled:1; /* Enable bit is set */
  239 +};
  240 +
232 241 #ifdef CONFIG_PCI_IOV
233 242 extern int pci_iov_init(struct pci_dev *dev);
234 243 extern void pci_iov_release(struct pci_dev *dev);
... ... @@ -236,6 +245,20 @@
236 245 enum pci_bar_type *type);
237 246 extern void pci_restore_iov_state(struct pci_dev *dev);
238 247 extern int pci_iov_bus_range(struct pci_bus *bus);
  248 +
  249 +extern int pci_enable_ats(struct pci_dev *dev, int ps);
  250 +extern void pci_disable_ats(struct pci_dev *dev);
  251 +extern int pci_ats_queue_depth(struct pci_dev *dev);
  252 +/**
  253 + * pci_ats_enabled - query the ATS status
  254 + * @dev: the PCI device
  255 + *
  256 + * Returns 1 if ATS capability is enabled, or 0 if not.
  257 + */
  258 +static inline int pci_ats_enabled(struct pci_dev *dev)
  259 +{
  260 + return dev->ats && dev->ats->is_enabled;
  261 +}
239 262 #else
240 263 static inline int pci_iov_init(struct pci_dev *dev)
241 264 {
... ... @@ -254,6 +277,22 @@
254 277 {
255 278 }
256 279 static inline int pci_iov_bus_range(struct pci_bus *bus)
  280 +{
  281 + return 0;
  282 +}
  283 +
  284 +static inline int pci_enable_ats(struct pci_dev *dev, int ps)
  285 +{
  286 + return -ENODEV;
  287 +}
  288 +static inline void pci_disable_ats(struct pci_dev *dev)
  289 +{
  290 +}
  291 +static inline int pci_ats_queue_depth(struct pci_dev *dev)
  292 +{
  293 + return -ENODEV;
  294 +}
  295 +static inline int pci_ats_enabled(struct pci_dev *dev)
257 296 {
258 297 return 0;
259 298 }
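
Taken together with the intel-iommu changes above, the intended calling sequence for the new ATS interface is: probe the extended capability, enable ATS with the IOMMU page shift as the smallest translation unit, use the reported invalidate queue depth when queueing device-IOTLB invalidations, and disable ATS when the device is detached. A condensed, illustrative sketch of that flow (function names here are hypothetical; it mirrors iommu_enable_dev_iotlb()/iommu_disable_dev_iotlb() in drivers/pci/intel-iommu.c and assumes the caller lives under drivers/pci/ so it can include the private "pci.h"):

        #include <linux/pci.h>
        #include <linux/intel-iommu.h>  /* VTD_PAGE_SHIFT, qi_flush_dev_iotlb() */
        #include "pci.h"                /* pci_enable_ats() and friends, declared above */

        static void example_enable_dev_iotlb(struct pci_dev *dev)
        {
                if (!pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS))
                        return;                         /* device has no ATS capability */

                if (pci_enable_ats(dev, VTD_PAGE_SHIFT))
                        return;                         /* STU rejected or allocation failed */
        }

        static void example_flush_dev_iotlb(struct intel_iommu *iommu,
                                            struct pci_dev *dev, u64 addr, unsigned mask)
        {
                u16 sid, qdep;

                if (!pci_ats_enabled(dev))
                        return;

                sid  = dev->bus->number << 8 | dev->devfn;
                qdep = pci_ats_queue_depth(dev);        /* 1..32, or 0 = shared queue */
                qi_flush_dev_iotlb(iommu, sid, qdep, addr, mask);
        }

        static void example_disable_dev_iotlb(struct pci_dev *dev)
        {
                if (pci_ats_enabled(dev))
                        pci_disable_ats(dev);
        }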
include/linux/dma_remapping.h
... ... @@ -13,6 +13,10 @@
13 13 #define DMA_PTE_WRITE (2)
14 14 #define DMA_PTE_SNP (1 << 11)
15 15  
  16 +#define CONTEXT_TT_MULTI_LEVEL 0
  17 +#define CONTEXT_TT_DEV_IOTLB 1
  18 +#define CONTEXT_TT_PASS_THROUGH 2
  19 +
16 20 struct intel_iommu;
17 21 struct dmar_domain;
18 22 struct root_entry;
19 23  
... ... @@ -21,8 +25,13 @@
21 25  
22 26 #ifdef CONFIG_DMAR
23 27 extern int iommu_calculate_agaw(struct intel_iommu *iommu);
  28 +extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
24 29 #else
25 30 static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
  31 +{
  32 + return 0;
  33 +}
  34 +static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
26 35 {
27 36 return 0;
28 37 }
include/linux/dmar.h
... ... @@ -188,6 +188,15 @@
188 188  
189 189 #define for_each_rmrr_units(rmrr) \
190 190 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
  191 +
  192 +struct dmar_atsr_unit {
  193 + struct list_head list; /* list of ATSR units */
  194 + struct acpi_dmar_header *hdr; /* ACPI header */
  195 + struct pci_dev **devices; /* target devices */
  196 + int devices_cnt; /* target device count */
  197 + u8 include_all:1; /* include all ports */
  198 +};
  199 +
191 200 /* Intel DMAR initialization functions */
192 201 extern int intel_iommu_init(void);
193 202 #else
include/linux/intel-iommu.h
... ... @@ -53,6 +53,7 @@
53 53 #define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */
54 54 #define DMAR_IQH_REG 0x80 /* Invalidation queue head register */
55 55 #define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */
  56 +#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */
56 57 #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */
57 58 #define DMAR_ICS_REG 0x98 /* Invalidation complete status register */
58 59 #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */
59 60  
... ... @@ -120,8 +121,10 @@
120 121 (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
121 122 #define ecap_coherent(e) ((e) & 0x1)
122 123 #define ecap_qis(e) ((e) & 0x2)
  124 +#define ecap_pass_through(e) ((e >> 6) & 0x1)
123 125 #define ecap_eim_support(e) ((e >> 4) & 0x1)
124 126 #define ecap_ir_support(e) ((e >> 3) & 0x1)
  127 +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1)
125 128 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
126 129 #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */
127 130  
... ... @@ -197,6 +200,8 @@
197 200 #define DMA_FSTS_PPF ((u32)2)
198 201 #define DMA_FSTS_PFO ((u32)1)
199 202 #define DMA_FSTS_IQE (1 << 4)
  203 +#define DMA_FSTS_ICE (1 << 5)
  204 +#define DMA_FSTS_ITE (1 << 6)
200 205 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
201 206  
202 207 /* FRCD_REG, 32 bits access */
... ... @@ -225,7 +230,8 @@
225 230 enum {
226 231 QI_FREE,
227 232 QI_IN_USE,
228   - QI_DONE
  233 + QI_DONE,
  234 + QI_ABORT
229 235 };
230 236  
231 237 #define QI_CC_TYPE 0x1
... ... @@ -254,6 +260,12 @@
254 260 #define QI_CC_DID(did) (((u64)did) << 16)
255 261 #define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
256 262  
  263 +#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32)
  264 +#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16)
  265 +#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK)
  266 +#define QI_DEV_IOTLB_SIZE 1
  267 +#define QI_DEV_IOTLB_MAX_INVS 32
  268 +
257 269 struct qi_desc {
258 270 u64 low, high;
259 271 };
... ... @@ -280,10 +292,10 @@
280 292 #endif
281 293  
282 294 struct iommu_flush {
283   - int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
284   - u64 type, int non_present_entry_flush);
285   - int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
286   - unsigned int size_order, u64 type, int non_present_entry_flush);
  295 + void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
  296 + u8 fm, u64 type);
  297 + void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
  298 + unsigned int size_order, u64 type);
287 299 };
288 300  
289 301 enum {
... ... @@ -302,6 +314,7 @@
302 314 spinlock_t register_lock; /* protect register handling */
303 315 int seq_id; /* sequence id of the iommu */
304 316 int agaw; /* agaw of this iommu */
  317 + int msagaw; /* max sagaw of this iommu */
305 318 unsigned int irq;
306 319 unsigned char name[13]; /* Device Name */
307 320  
... ... @@ -329,6 +342,7 @@
329 342 }
330 343  
331 344 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
  345 +extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
332 346  
333 347 extern int alloc_iommu(struct dmar_drhd_unit *drhd);
334 348 extern void free_iommu(struct intel_iommu *iommu);
... ... @@ -337,11 +351,12 @@
337 351 extern int dmar_reenable_qi(struct intel_iommu *iommu);
338 352 extern void qi_global_iec(struct intel_iommu *iommu);
339 353  
340   -extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
341   - u8 fm, u64 type, int non_present_entry_flush);
342   -extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
343   - unsigned int size_order, u64 type,
344   - int non_present_entry_flush);
  354 +extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
  355 + u8 fm, u64 type);
  356 +extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
  357 + unsigned int size_order, u64 type);
  358 +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
  359 + u64 addr, unsigned mask);
345 360  
346 361 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
347 362  
include/linux/pci.h
... ... @@ -196,6 +196,7 @@
196 196 struct pcie_link_state;
197 197 struct pci_vpd;
198 198 struct pci_sriov;
  199 +struct pci_ats;
199 200  
200 201 /*
201 202 * The pci_dev structure is used to describe PCI devices.
... ... @@ -293,6 +294,7 @@
293 294 struct pci_sriov *sriov; /* SR-IOV capability related */
294 295 struct pci_dev *physfn; /* the PF this VF is associated with */
295 296 };
  297 + struct pci_ats *ats; /* Address Translation Service */
296 298 #endif
297 299 };
298 300  
include/linux/pci_regs.h
... ... @@ -502,6 +502,7 @@
502 502 #define PCI_EXT_CAP_ID_DSN 3
503 503 #define PCI_EXT_CAP_ID_PWR 4
504 504 #define PCI_EXT_CAP_ID_ARI 14
  505 +#define PCI_EXT_CAP_ID_ATS 15
505 506 #define PCI_EXT_CAP_ID_SRIOV 16
506 507  
507 508 /* Advanced Error Reporting */
... ... @@ -619,6 +620,15 @@
619 620 #define PCI_ARI_CTRL_MFVC 0x0001 /* MFVC Function Groups Enable */
620 621 #define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */
621 622 #define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */
  623 +
  624 +/* Address Translation Service */
  625 +#define PCI_ATS_CAP 0x04 /* ATS Capability Register */
  626 +#define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */
  627 +#define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */
  628 +#define PCI_ATS_CTRL 0x06 /* ATS Control Register */
  629 +#define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */
  630 +#define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */
  631 +#define PCI_ATS_MIN_STU 12 /* shift of minimum STU block */
622 632  
623 633 /* Single Root I/O Virtualization */
624 634 #define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */