Commit 07700a94b00a4fcbbfb07d1b72dc112a0e036735

Authored by Jan Kiszka
Committed by Avi Kivity
1 parent 3e515705a1

KVM: Allow host IRQ sharing for assigned PCI 2.3 devices

PCI 2.3 allows IRQ sources to be disabled generically at the device
level. This enables us to share the legacy IRQs of such devices with
other host devices when passing them to a guest.

The new IRQ sharing feature introduced here is optional; user space has
to request it explicitly. Moreover, user space can inform us about its
view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
interrupt and signaling it if the guest masked it via the virtualized
PCI config space.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

Showing 5 changed files with 230 additions and 29 deletions
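To make the new API concrete before the diff: a minimal userspace sketch, not part of the commit, that probes for the capability and opts an assigned device into shared-INTx mode. kvm_fd, vm_fd, dev_id, and the segment/bus/devfn values are assumed to come from the usual device-assignment setup.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int assign_with_intx_sharing(int kvm_fd, int vm_fd, __u32 dev_id,
				    __u32 seg, __u32 bus, __u32 devfn)
{
	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = dev_id,
		.segnr = seg,
		.busnr = bus,
		.devfn = devfn,
		/* IOMMU isolation remains mandatory; PCI 2.3 IRQ sharing
		 * is strictly opt-in. */
		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,
	};

	/* Only request the new mode if this kernel advertises it. */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PCI_2_3) > 0)
		dev.flags |= KVM_DEV_ASSIGN_PCI_2_3;

	return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
}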

Documentation/virtual/kvm/api.txt
... ... @@ -1169,7 +1169,15 @@
1169 1169  
1170 1170 /* Depends on KVM_CAP_IOMMU */
1171 1171 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
  1172 +/* The following two depend on KVM_CAP_PCI_2_3 */
  1173 +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
  1174 +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
1172 1175  
  1176 +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
  1177 +via the PCI-2.3-compliant device-level mask, thus enabling IRQ sharing with
  1178 +other assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
  1179 +guest's view of the INTx mask; see KVM_ASSIGN_SET_INTX_MASK for details.
  1180 +
1173 1181 The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
1174 1182 isolation of the device. Usages not specifying this flag are deprecated.
1175 1183  
... ... @@ -1440,6 +1448,39 @@
1440 1448 The "num_dirty" field is a performance hint for KVM to determine whether it
1441 1449 should skip processing the bitmap and just invalidate everything. It must
1442 1450 be set to the number of set bits in the bitmap.
  1451 +
  1452 +4.61 KVM_ASSIGN_SET_INTX_MASK
  1453 +
  1454 +Capability: KVM_CAP_PCI_2_3
  1455 +Architectures: x86
  1456 +Type: vm ioctl
  1457 +Parameters: struct kvm_assigned_pci_dev (in)
  1458 +Returns: 0 on success, -1 on error
  1459 +
  1460 +Allows userspace to mask PCI INTx interrupts from the assigned device. The
  1461 +kernel will not deliver INTx interrupts to the guest between setting and
  1462 +clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of
  1463 +and emulation of PCI 2.3 INTx disable command register behavior.
  1464 +
  1465 +This may be used for both PCI 2.3 devices supporting INTx disable natively and
  1466 +older devices lacking this support. Userspace is responsible for emulating the
  1467 +read value of the INTx disable bit in the guest visible PCI command register.
  1468 +When modifying the INTx disable state, userspace should precede updating the
  1469 +physical device command register by calling this ioctl to inform the kernel of
  1470 +the new intended INTx mask state.
  1471 +
  1472 +Note that the kernel uses the device INTx disable bit to internally manage the
  1473 +device interrupt state for PCI 2.3 devices. Reads of this register may
  1474 +therefore not match the expected value. Writes should always use the guest
  1475 +intended INTx disable value rather than attempting to read-copy-update the
  1476 +current physical device state. Races between user and kernel updates to the
  1477 +INTx disable bit are handled lazily in the kernel. It's possible the device
  1478 +may generate unintended interrupts, but they will not be injected into the
  1479 +guest.
  1480 +
  1481 +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
  1482 +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
  1483 +evaluated.
1443 1484  
1444 1485 4.62 KVM_CREATE_SPAPR_TCE
1445 1486  
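The ordering the new section prescribes can be illustrated with a short sketch (hypothetical helpers; write_phys_command() stands in for however the device model reaches the physical config space):

#include <linux/kvm.h>
#include <linux/pci_regs.h>
#include <sys/ioctl.h>

/* Hypothetical: write the guest-intended value to the physical device. */
void write_phys_command(__u32 dev_id, __u16 cmd);

static void guest_writes_pci_command(int vm_fd, __u32 dev_id, __u16 new_cmd)
{
	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = dev_id,
		.flags = (new_cmd & PCI_COMMAND_INTX_DISABLE) ?
			 KVM_DEV_ASSIGN_MASK_INTX : 0,
	};

	/* 1. Inform the kernel of the intended INTx mask state first... */
	ioctl(vm_fd, KVM_ASSIGN_SET_INTX_MASK, &dev);

	/* 2. ...then write the guest's value as-is - no read-copy-update,
	 * since the kernel owns the physical INTx disable bit meanwhile. */
	write_phys_command(dev_id, new_cmd);
}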
arch/x86/kvm/x86.c
... ... @@ -2143,6 +2143,7 @@
2143 2143 case KVM_CAP_XSAVE:
2144 2144 case KVM_CAP_ASYNC_PF:
2145 2145 case KVM_CAP_GET_TSC_KHZ:
  2146 + case KVM_CAP_PCI_2_3:
2146 2147 r = 1;
2147 2148 break;
2148 2149 case KVM_CAP_COALESCED_MMIO:
include/linux/kvm.h
... ... @@ -588,6 +588,7 @@
588 588 #define KVM_CAP_TSC_DEADLINE_TIMER 72
589 589 #define KVM_CAP_S390_UCONTROL 73
590 590 #define KVM_CAP_SYNC_REGS 74
  591 +#define KVM_CAP_PCI_2_3 75
591 592  
592 593 #ifdef KVM_CAP_IRQ_ROUTING
593 594  
... ... @@ -784,6 +785,9 @@
784 785 /* Available with KVM_CAP_TSC_CONTROL */
785 786 #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
786 787 #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
  788 +/* Available with KVM_CAP_PCI_2_3 */
  789 +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
  790 + struct kvm_assigned_pci_dev)
787 791  
788 792 /*
789 793 * ioctls for vcpu fds
... ... @@ -857,6 +861,8 @@
857 861 #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg)
858 862  
859 863 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
  864 +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
  865 +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
860 866  
861 867 struct kvm_assigned_pci_dev {
862 868 __u32 assigned_dev_id;
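A summary of how the two new flag bits are consumed, as annotation rather than commit text:

/*
 * KVM_DEV_ASSIGN_PCI_2_3   - passed to KVM_ASSIGN_PCI_DEVICE; silently
 *                            cleared again by the kernel if the device
 *                            has no usable INTx mask bit (see the
 *                            pci_intx_mask_supported() check in
 *                            assigned-dev.c below).
 * KVM_DEV_ASSIGN_MASK_INTX - evaluated by KVM_ASSIGN_SET_INTX_MASK;
 *                            mirrors the guest's view of
 *                            PCI_COMMAND_INTX_DISABLE.
 */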
include/linux/kvm_host.h
... ... @@ -546,6 +546,7 @@
546 546 unsigned int entries_nr;
547 547 int host_irq;
548 548 bool host_irq_disabled;
  549 + bool pci_2_3;
549 550 struct msix_entry *host_msix_entries;
550 551 int guest_irq;
551 552 struct msix_entry *guest_msix_entries;
... ... @@ -555,6 +556,7 @@
555 556 struct pci_dev *dev;
556 557 struct kvm *kvm;
557 558 spinlock_t intx_lock;
  559 + struct mutex intx_mask_lock;
558 560 char irq_name[32];
559 561 struct pci_saved_state *pci_saved_state;
560 562 };
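The roles of the synchronization members, annotated below as one reading of the diff rather than commit text:

struct kvm_assigned_dev_kernel {
	/* ... */
	bool host_irq_disabled;		/* host line/device currently masked */
	bool pci_2_3;			/* device-level INTx masking in use */
	/* ... */
	spinlock_t intx_lock;		/* hard-IRQ safe; protects the
					 * host-side mask state */
	struct mutex intx_mask_lock;	/* sleepable; serializes the guest's
					 * KVM_DEV_ASSIGN_MASK_INTX view
					 * against IRQ injection */
	/* ... */
};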
virt/kvm/assigned-dev.c
... ... @@ -55,23 +55,67 @@
55 55 return index;
56 56 }
57 57  
58   -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
  58 +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
59 59 {
60 60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
  61 + int ret;
61 62  
62   - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
63   - spin_lock(&assigned_dev->intx_lock);
  63 + spin_lock(&assigned_dev->intx_lock);
  64 + if (pci_check_and_mask_intx(assigned_dev->dev)) {
  65 + assigned_dev->host_irq_disabled = true;
  66 + ret = IRQ_WAKE_THREAD;
  67 + } else
  68 + ret = IRQ_NONE;
  69 + spin_unlock(&assigned_dev->intx_lock);
  70 +
  71 + return ret;
  72 +}
  73 +
  74 +static void
  75 +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
  76 + int vector)
  77 +{
  78 + if (unlikely(assigned_dev->irq_requested_type &
  79 + KVM_DEV_IRQ_GUEST_INTX)) {
  80 + mutex_lock(&assigned_dev->intx_mask_lock);
  81 + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
  82 + kvm_set_irq(assigned_dev->kvm,
  83 + assigned_dev->irq_source_id, vector, 1);
  84 + mutex_unlock(&assigned_dev->intx_mask_lock);
  85 + } else
  86 + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
  87 + vector, 1);
  88 +}
  89 +
  90 +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
  91 +{
  92 + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
  93 +
  94 + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
  95 + spin_lock_irq(&assigned_dev->intx_lock);
64 96 disable_irq_nosync(irq);
65 97 assigned_dev->host_irq_disabled = true;
66   - spin_unlock(&assigned_dev->intx_lock);
  98 + spin_unlock_irq(&assigned_dev->intx_lock);
67 99 }
68 100  
69   - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
70   - assigned_dev->guest_irq, 1);
  101 + kvm_assigned_dev_raise_guest_irq(assigned_dev,
  102 + assigned_dev->guest_irq);
71 103  
72 104 return IRQ_HANDLED;
73 105 }
74 106  
  107 +#ifdef __KVM_HAVE_MSI
  108 +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
  109 +{
  110 + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
  111 +
  112 + kvm_assigned_dev_raise_guest_irq(assigned_dev,
  113 + assigned_dev->guest_irq);
  114 +
  115 + return IRQ_HANDLED;
  116 +}
  117 +#endif
  118 +
75 119 #ifdef __KVM_HAVE_MSIX
76 120 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
77 121 {
... ... @@ -81,8 +125,7 @@
81 125  
82 126 if (index >= 0) {
83 127 vector = assigned_dev->guest_msix_entries[index].vector;
84   - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
85   - vector, 1);
  128 + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
86 129 }
87 130  
88 131 return IRQ_HANDLED;
89 132  
... ... @@ -98,15 +141,31 @@
98 141  
99 142 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
100 143  
101   - /* The guest irq may be shared so this ack may be
102   - * from another device.
103   - */
104   - spin_lock(&dev->intx_lock);
105   - if (dev->host_irq_disabled) {
106   - enable_irq(dev->host_irq);
107   - dev->host_irq_disabled = false;
  144 + mutex_lock(&dev->intx_mask_lock);
  145 +
  146 + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
  147 + bool reassert = false;
  148 +
  149 + spin_lock_irq(&dev->intx_lock);
  150 + /*
  151 + * The guest IRQ may be shared so this ack can come from an
  152 + * IRQ for another guest device.
  153 + */
  154 + if (dev->host_irq_disabled) {
  155 + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
  156 + enable_irq(dev->host_irq);
  157 + else if (!pci_check_and_unmask_intx(dev->dev))
  158 + reassert = true;
  159 + dev->host_irq_disabled = reassert;
  160 + }
  161 + spin_unlock_irq(&dev->intx_lock);
  162 +
  163 + if (reassert)
  164 + kvm_set_irq(dev->kvm, dev->irq_source_id,
  165 + dev->guest_irq, 1);
108 166 }
109   - spin_unlock(&dev->intx_lock);
  167 +
  168 + mutex_unlock(&dev->intx_mask_lock);
110 169 }
111 170  
112 171 static void deassign_guest_irq(struct kvm *kvm,
... ... @@ -154,7 +213,15 @@
154 213 pci_disable_msix(assigned_dev->dev);
155 214 } else {
156 215 /* Deal with MSI and INTx */
157   - disable_irq(assigned_dev->host_irq);
  216 + if ((assigned_dev->irq_requested_type &
  217 + KVM_DEV_IRQ_HOST_INTX) &&
  218 + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
  219 + spin_lock_irq(&assigned_dev->intx_lock);
  220 + pci_intx(assigned_dev->dev, false);
  221 + spin_unlock_irq(&assigned_dev->intx_lock);
  222 + synchronize_irq(assigned_dev->host_irq);
  223 + } else
  224 + disable_irq(assigned_dev->host_irq);
158 225  
159 226 free_irq(assigned_dev->host_irq, assigned_dev);
160 227  
161 228  
162 229  
163 230  
... ... @@ -235,15 +302,34 @@
235 302 static int assigned_device_enable_host_intx(struct kvm *kvm,
236 303 struct kvm_assigned_dev_kernel *dev)
237 304 {
  305 + irq_handler_t irq_handler;
  306 + unsigned long flags;
  307 +
238 308 dev->host_irq = dev->dev->irq;
239   - /* Even though this is PCI, we don't want to use shared
240   - * interrupts. Sharing host devices with guest-assigned devices
241   - * on the same interrupt line is not a happy situation: there
242   - * are going to be long delays in accepting, acking, etc.
  309 +
  310 + /*
  311 + * We can only share the IRQ line with other host devices if we are
  312 + * able to disable the IRQ source at device-level - independently of
  313 + * the guest driver. Otherwise host devices may suffer from unbounded
  314 + * IRQ latencies when the guest keeps the line asserted.
243 315 */
244   - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
245   - IRQF_ONESHOT, dev->irq_name, dev))
  316 + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
  317 + irq_handler = kvm_assigned_dev_intx;
  318 + flags = IRQF_SHARED;
  319 + } else {
  320 + irq_handler = NULL;
  321 + flags = IRQF_ONESHOT;
  322 + }
  323 + if (request_threaded_irq(dev->host_irq, irq_handler,
  324 + kvm_assigned_dev_thread_intx, flags,
  325 + dev->irq_name, dev))
246 326 return -EIO;
  327 +
  328 + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
  329 + spin_lock_irq(&dev->intx_lock);
  330 + pci_intx(dev->dev, true);
  331 + spin_unlock_irq(&dev->intx_lock);
  332 + }
247 333 return 0;
248 334 }
249 335  
... ... @@ -260,8 +346,9 @@
260 346 }
261 347  
262 348 dev->host_irq = dev->dev->irq;
263   - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
264   - 0, dev->irq_name, dev)) {
  349 + if (request_threaded_irq(dev->host_irq, NULL,
  350 + kvm_assigned_dev_thread_msi, 0,
  351 + dev->irq_name, dev)) {
265 352 pci_disable_msi(dev->dev);
266 353 return -EIO;
267 354 }
... ... @@ -319,7 +406,6 @@
319 406 {
320 407 dev->guest_irq = irq->guest_irq;
321 408 dev->ack_notifier.gsi = -1;
322   - dev->host_irq_disabled = false;
323 409 return 0;
324 410 }
325 411 #endif
... ... @@ -331,7 +417,6 @@
331 417 {
332 418 dev->guest_irq = irq->guest_irq;
333 419 dev->ack_notifier.gsi = -1;
334   - dev->host_irq_disabled = false;
335 420 return 0;
336 421 }
337 422 #endif
... ... @@ -365,6 +450,7 @@
365 450 default:
366 451 r = -EINVAL;
367 452 }
  453 + dev->host_irq_disabled = false;
368 454  
369 455 if (!r)
370 456 dev->irq_requested_type |= host_irq_type;
... ... @@ -466,6 +552,7 @@
466 552 {
467 553 int r = -ENODEV;
468 554 struct kvm_assigned_dev_kernel *match;
  555 + unsigned long irq_type;
469 556  
470 557 mutex_lock(&kvm->lock);
471 558  
... ... @@ -474,7 +561,9 @@
474 561 if (!match)
475 562 goto out;
476 563  
477   - r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
  564 + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
  565 + KVM_DEV_IRQ_GUEST_MASK);
  566 + r = kvm_deassign_irq(kvm, match, irq_type);
478 567 out:
479 568 mutex_unlock(&kvm->lock);
480 569 return r;
... ... @@ -607,6 +696,10 @@
607 696 if (!match->pci_saved_state)
608 697 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
609 698 __func__, dev_name(&dev->dev));
  699 +
  700 + if (!pci_intx_mask_supported(dev))
  701 + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
  702 +
610 703 match->assigned_dev_id = assigned_dev->assigned_dev_id;
611 704 match->host_segnr = assigned_dev->segnr;
612 705 match->host_busnr = assigned_dev->busnr;
... ... @@ -614,6 +707,7 @@
614 707 match->flags = assigned_dev->flags;
615 708 match->dev = dev;
616 709 spin_lock_init(&match->intx_lock);
  710 + mutex_init(&match->intx_mask_lock);
617 711 match->irq_source_id = -1;
618 712 match->kvm = kvm;
619 713 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
... ... @@ -759,6 +853,55 @@
759 853 }
760 854 #endif
761 855  
  856 +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
  857 + struct kvm_assigned_pci_dev *assigned_dev)
  858 +{
  859 + int r = 0;
  860 + struct kvm_assigned_dev_kernel *match;
  861 +
  862 + mutex_lock(&kvm->lock);
  863 +
  864 + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
  865 + assigned_dev->assigned_dev_id);
  866 + if (!match) {
  867 + r = -ENODEV;
  868 + goto out;
  869 + }
  870 +
  871 + mutex_lock(&match->intx_mask_lock);
  872 +
  873 + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
  874 + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
  875 +
  876 + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
  877 + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
  878 + kvm_set_irq(match->kvm, match->irq_source_id,
  879 + match->guest_irq, 0);
  880 + /*
  881 + * Masking at hardware-level is performed on demand,
  882 + * i.e. when an IRQ actually arrives at the host.
  883 + */
  884 + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
  885 + /*
  886 + * Unmask the IRQ line if required. Unmasking at
  887 + * device level will be performed by user space.
  888 + */
  889 + spin_lock_irq(&match->intx_lock);
  890 + if (match->host_irq_disabled) {
  891 + enable_irq(match->host_irq);
  892 + match->host_irq_disabled = false;
  893 + }
  894 + spin_unlock_irq(&match->intx_lock);
  895 + }
  896 + }
  897 +
  898 + mutex_unlock(&match->intx_mask_lock);
  899 +
  900 +out:
  901 + mutex_unlock(&kvm->lock);
  902 + return r;
  903 +}
  904 +
762 905 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
763 906 unsigned long arg)
764 907 {
... ... @@ -866,6 +1009,15 @@
866 1009 break;
867 1010 }
868 1011 #endif
  1012 + case KVM_ASSIGN_SET_INTX_MASK: {
  1013 + struct kvm_assigned_pci_dev assigned_dev;
  1014 +
  1015 + r = -EFAULT;
  1016 + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
  1017 + goto out;
  1018 + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
  1019 + break;
  1020 + }
869 1021 default:
870 1022 r = -ENOTTY;
871 1023 break;
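For reference, a condensed sketch of the hard-IRQ/threaded-IRQ split that kvm_assigned_dev_intx() and kvm_assigned_dev_thread_intx() implement, built on the generic PCI 2.3 mask helpers from the same patch series. This is a simplified illustration, not the commit's code; inject_into_guest() is a placeholder for the kvm_set_irq() path.

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/pci.h>

void inject_into_guest(void *opaque);	/* placeholder for kvm_set_irq() */

/* Hard IRQ context: runs while the (possibly shared) line is asserted. */
static irqreturn_t shared_intx_hardirq(int irq, void *opaque)
{
	struct pci_dev *pdev = opaque;

	/* Atomically check whether this device raised INTx and, if so,
	 * set PCI_COMMAND_INTX_DISABLE so the line is released without
	 * involving the (guest) driver. */
	if (!pci_check_and_mask_intx(pdev))
		return IRQ_NONE;	/* some other device on the line */

	return IRQ_WAKE_THREAD;
}

/* Threaded context: may sleep, so taking a mutex and injecting the
 * interrupt into the guest is safe here. */
static irqreturn_t shared_intx_thread(int irq, void *opaque)
{
	inject_into_guest(opaque);
	return IRQ_HANDLED;
}

/* Registration mirrors assigned_device_enable_host_intx(): IRQF_SHARED
 * instead of IRQF_ONESHOT, because the hard handler already silences
 * the interrupt source at device level. */
static int request_shared_intx(struct pci_dev *pdev)
{
	if (!pci_intx_mask_supported(pdev))
		return -ENODEV;

	return request_threaded_irq(pdev->irq, shared_intx_hardirq,
				    shared_intx_thread, IRQF_SHARED,
				    "shared-intx-sketch", pdev);
}

On the ack side, the commit's kvm_assigned_dev_ack_irq() uses pci_check_and_unmask_intx(), which leaves the device masked and returns false if another interrupt became pending meanwhile, so the guest IRQ can be reasserted lazily.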