Commit 89e1f7d4c66d85f42c3d52ea3866eb10cadf6153

Authored by Alex Williamson
1 parent 73fa0d10d0

vfio: Add PCI device driver

Add PCI device support for VFIO.  PCI devices expose regions
for accessing config space, I/O port space, and MMIO areas
of the device.  PCI config access is virtualized in the kernel,
allowing us to ensure the integrity of the system, by preventing
various accesses while reducing duplicate support across various
userspace drivers.  I/O port supports read/write access while
MMIO also supports mmap of sufficiently sized regions.  Support
for INTx, MSI, and MSI-X interrupts is provided using eventfds to
userspace.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Showing 9 changed files with 3259 additions and 0 deletions Side-by-side Diff

drivers/vfio/Kconfig
... ... @@ -12,4 +12,6 @@
12 12 See Documentation/vfio.txt for more details.
13 13  
14 14 If you don't know what to do here, say N.
  15 +
  16 +source "drivers/vfio/pci/Kconfig"
drivers/vfio/pci/Kconfig
  1 +config VFIO_PCI
  2 + tristate "VFIO support for PCI devices"
  3 + depends on VFIO && PCI && EVENTFD
  4 + help
  5 + Support for the PCI VFIO bus driver. This is required to make
  6 + use of PCI drivers using the VFIO framework.
  7 +
  8 + If you don't know what to do here, say N.
drivers/vfio/pci/Makefile
  1 +
  2 +vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
  3 +
  4 +obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
drivers/vfio/pci/vfio_pci.c
  1 +/*
  2 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  3 + * Author: Alex Williamson <alex.williamson@redhat.com>
  4 + *
  5 + * This program is free software; you can redistribute it and/or modify
  6 + * it under the terms of the GNU General Public License version 2 as
  7 + * published by the Free Software Foundation.
  8 + *
  9 + * Derived from original vfio:
  10 + * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  11 + * Author: Tom Lyon, pugs@cisco.com
  12 + */
  13 +
  14 +#include <linux/device.h>
  15 +#include <linux/eventfd.h>
  16 +#include <linux/interrupt.h>
  17 +#include <linux/iommu.h>
  18 +#include <linux/module.h>
  19 +#include <linux/mutex.h>
  20 +#include <linux/notifier.h>
  21 +#include <linux/pci.h>
  22 +#include <linux/pm_runtime.h>
  23 +#include <linux/slab.h>
  24 +#include <linux/types.h>
  25 +#include <linux/uaccess.h>
  26 +#include <linux/vfio.h>
  27 +
  28 +#include "vfio_pci_private.h"
  29 +
  30 +#define DRIVER_VERSION "0.2"
  31 +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
  32 +#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
  33 +
  34 +static bool nointxmask;
  35 +module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
  36 +MODULE_PARM_DESC(nointxmask,
  37 + "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
  38 +
  39 +static int vfio_pci_enable(struct vfio_pci_device *vdev)
  40 +{
  41 + struct pci_dev *pdev = vdev->pdev;
  42 + int ret;
  43 + u16 cmd;
  44 + u8 msix_pos;
  45 +
  46 + vdev->reset_works = (pci_reset_function(pdev) == 0);
  47 + pci_save_state(pdev);
  48 + vdev->pci_saved_state = pci_store_saved_state(pdev);
  49 + if (!vdev->pci_saved_state)
  50 + pr_debug("%s: Couldn't store %s saved state\n",
  51 + __func__, dev_name(&pdev->dev));
  52 +
  53 + ret = vfio_config_init(vdev);
  54 + if (ret)
  55 + goto out;
  56 +
  57 + if (likely(!nointxmask))
  58 + vdev->pci_2_3 = pci_intx_mask_supported(pdev);
  59 +
  60 + pci_read_config_word(pdev, PCI_COMMAND, &cmd);
  61 + if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
  62 + cmd &= ~PCI_COMMAND_INTX_DISABLE;
  63 + pci_write_config_word(pdev, PCI_COMMAND, cmd);
  64 + }
  65 +
  66 + msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
  67 + if (msix_pos) {
  68 + u16 flags;
  69 + u32 table;
  70 +
  71 + pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
  72 + pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
  73 +
  74 + vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
  75 + vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
  76 + vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
  77 + } else
  78 + vdev->msix_bar = 0xFF;
  79 +
  80 + ret = pci_enable_device(pdev);
  81 + if (ret)
  82 + goto out;
  83 +
  84 + return ret;
  85 +
  86 +out:
  87 + kfree(vdev->pci_saved_state);
  88 + vdev->pci_saved_state = NULL;
  89 + vfio_config_free(vdev);
  90 + return ret;
  91 +}
  92 +
  93 +static void vfio_pci_disable(struct vfio_pci_device *vdev)
  94 +{
  95 + int bar;
  96 +
  97 + pci_disable_device(vdev->pdev);
  98 +
  99 + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
  100 + VFIO_IRQ_SET_ACTION_TRIGGER,
  101 + vdev->irq_type, 0, 0, NULL);
  102 +
  103 + vdev->virq_disabled = false;
  104 +
  105 + vfio_config_free(vdev);
  106 +
  107 + pci_reset_function(vdev->pdev);
  108 +
  109 + if (pci_load_and_free_saved_state(vdev->pdev,
  110 + &vdev->pci_saved_state) == 0)
  111 + pci_restore_state(vdev->pdev);
  112 + else
  113 + pr_info("%s: Couldn't reload %s saved state\n",
  114 + __func__, dev_name(&vdev->pdev->dev));
  115 +
  116 + for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
  117 + if (!vdev->barmap[bar])
  118 + continue;
  119 + pci_iounmap(vdev->pdev, vdev->barmap[bar]);
  120 + pci_release_selected_regions(vdev->pdev, 1 << bar);
  121 + vdev->barmap[bar] = NULL;
  122 + }
  123 +}
  124 +
  125 +static void vfio_pci_release(void *device_data)
  126 +{
  127 + struct vfio_pci_device *vdev = device_data;
  128 +
  129 + if (atomic_dec_and_test(&vdev->refcnt))
  130 + vfio_pci_disable(vdev);
  131 +
  132 + module_put(THIS_MODULE);
  133 +}
  134 +
  135 +static int vfio_pci_open(void *device_data)
  136 +{
  137 + struct vfio_pci_device *vdev = device_data;
  138 +
  139 + if (!try_module_get(THIS_MODULE))
  140 + return -ENODEV;
  141 +
  142 + if (atomic_inc_return(&vdev->refcnt) == 1) {
  143 + int ret = vfio_pci_enable(vdev);
  144 + if (ret) {
  145 + module_put(THIS_MODULE);
  146 + return ret;
  147 + }
  148 + }
  149 +
  150 + return 0;
  151 +}
  152 +
  153 +static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
  154 +{
  155 + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
  156 + u8 pin;
  157 + pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
  158 + if (pin)
  159 + return 1;
  160 +
  161 + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
  162 + u8 pos;
  163 + u16 flags;
  164 +
  165 + pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
  166 + if (pos) {
  167 + pci_read_config_word(vdev->pdev,
  168 + pos + PCI_MSI_FLAGS, &flags);
  169 +
  170 + return 1 << (flags & PCI_MSI_FLAGS_QMASK);
  171 + }
  172 + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
  173 + u8 pos;
  174 + u16 flags;
  175 +
  176 + pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
  177 + if (pos) {
  178 + pci_read_config_word(vdev->pdev,
  179 + pos + PCI_MSIX_FLAGS, &flags);
  180 +
  181 + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
  182 + }
  183 + }
  184 +
  185 + return 0;
  186 +}
  187 +
  188 +static long vfio_pci_ioctl(void *device_data,
  189 + unsigned int cmd, unsigned long arg)
  190 +{
  191 + struct vfio_pci_device *vdev = device_data;
  192 + unsigned long minsz;
  193 +
  194 + if (cmd == VFIO_DEVICE_GET_INFO) {
  195 + struct vfio_device_info info;
  196 +
  197 + minsz = offsetofend(struct vfio_device_info, num_irqs);
  198 +
  199 + if (copy_from_user(&info, (void __user *)arg, minsz))
  200 + return -EFAULT;
  201 +
  202 + if (info.argsz < minsz)
  203 + return -EINVAL;
  204 +
  205 + info.flags = VFIO_DEVICE_FLAGS_PCI;
  206 +
  207 + if (vdev->reset_works)
  208 + info.flags |= VFIO_DEVICE_FLAGS_RESET;
  209 +
  210 + info.num_regions = VFIO_PCI_NUM_REGIONS;
  211 + info.num_irqs = VFIO_PCI_NUM_IRQS;
  212 +
  213 + return copy_to_user((void __user *)arg, &info, minsz);
  214 +
  215 + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
  216 + struct pci_dev *pdev = vdev->pdev;
  217 + struct vfio_region_info info;
  218 +
  219 + minsz = offsetofend(struct vfio_region_info, offset);
  220 +
  221 + if (copy_from_user(&info, (void __user *)arg, minsz))
  222 + return -EFAULT;
  223 +
  224 + if (info.argsz < minsz)
  225 + return -EINVAL;
  226 +
  227 + switch (info.index) {
  228 + case VFIO_PCI_CONFIG_REGION_INDEX:
  229 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
  230 + info.size = pdev->cfg_size;
  231 + info.flags = VFIO_REGION_INFO_FLAG_READ |
  232 + VFIO_REGION_INFO_FLAG_WRITE;
  233 + break;
  234 + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
  235 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
  236 + info.size = pci_resource_len(pdev, info.index);
  237 + if (!info.size) {
  238 + info.flags = 0;
  239 + break;
  240 + }
  241 +
  242 + info.flags = VFIO_REGION_INFO_FLAG_READ |
  243 + VFIO_REGION_INFO_FLAG_WRITE;
  244 + if (pci_resource_flags(pdev, info.index) &
  245 + IORESOURCE_MEM && info.size >= PAGE_SIZE)
  246 + info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
  247 + break;
  248 + case VFIO_PCI_ROM_REGION_INDEX:
  249 + {
  250 + void __iomem *io;
  251 + size_t size;
  252 +
  253 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
  254 + info.flags = 0;
  255 +
  256 + /* Report the BAR size, not the ROM size */
  257 + info.size = pci_resource_len(pdev, info.index);
  258 + if (!info.size)
  259 + break;
  260 +
  261 + /* Is it really there? */
  262 + io = pci_map_rom(pdev, &size);
  263 + if (!io || !size) {
  264 + info.size = 0;
  265 + break;
  266 + }
  267 + pci_unmap_rom(pdev, io);
  268 +
  269 + info.flags = VFIO_REGION_INFO_FLAG_READ;
  270 + break;
  271 + }
  272 + default:
  273 + return -EINVAL;
  274 + }
  275 +
  276 + return copy_to_user((void __user *)arg, &info, minsz);
  277 +
  278 + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
  279 + struct vfio_irq_info info;
  280 +
  281 + minsz = offsetofend(struct vfio_irq_info, count);
  282 +
  283 + if (copy_from_user(&info, (void __user *)arg, minsz))
  284 + return -EFAULT;
  285 +
  286 + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
  287 + return -EINVAL;
  288 +
  289 + info.flags = VFIO_IRQ_INFO_EVENTFD;
  290 +
  291 + info.count = vfio_pci_get_irq_count(vdev, info.index);
  292 +
  293 + if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
  294 + info.flags |= (VFIO_IRQ_INFO_MASKABLE |
  295 + VFIO_IRQ_INFO_AUTOMASKED);
  296 + else
  297 + info.flags |= VFIO_IRQ_INFO_NORESIZE;
  298 +
  299 + return copy_to_user((void __user *)arg, &info, minsz);
  300 +
  301 + } else if (cmd == VFIO_DEVICE_SET_IRQS) {
  302 + struct vfio_irq_set hdr;
  303 + u8 *data = NULL;
  304 + int ret = 0;
  305 +
  306 + minsz = offsetofend(struct vfio_irq_set, count);
  307 +
  308 + if (copy_from_user(&hdr, (void __user *)arg, minsz))
  309 + return -EFAULT;
  310 +
  311 + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
  312 + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
  313 + VFIO_IRQ_SET_ACTION_TYPE_MASK))
  314 + return -EINVAL;
  315 +
  316 + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
  317 + size_t size;
  318 +
  319 + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
  320 + size = sizeof(uint8_t);
  321 + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
  322 + size = sizeof(int32_t);
  323 + else
  324 + return -EINVAL;
  325 +
  326 + if (hdr.argsz - minsz < hdr.count * size ||
  327 + hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
  328 + return -EINVAL;
  329 +
  330 + data = kmalloc(hdr.count * size, GFP_KERNEL);
  331 + if (!data)
  332 + return -ENOMEM;
  333 +
  334 + if (copy_from_user(data, (void __user *)(arg + minsz),
  335 + hdr.count * size)) {
  336 + kfree(data);
  337 + return -EFAULT;
  338 + }
  339 + }
  340 +
  341 + mutex_lock(&vdev->igate);
  342 +
  343 + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
  344 + hdr.start, hdr.count, data);
  345 +
  346 + mutex_unlock(&vdev->igate);
  347 + kfree(data);
  348 +
  349 + return ret;
  350 +
  351 + } else if (cmd == VFIO_DEVICE_RESET)
  352 + return vdev->reset_works ?
  353 + pci_reset_function(vdev->pdev) : -EINVAL;
  354 +
  355 + return -ENOTTY;
  356 +}
  357 +
  358 +static ssize_t vfio_pci_read(void *device_data, char __user *buf,
  359 + size_t count, loff_t *ppos)
  360 +{
  361 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
  362 + struct vfio_pci_device *vdev = device_data;
  363 + struct pci_dev *pdev = vdev->pdev;
  364 +
  365 + if (index >= VFIO_PCI_NUM_REGIONS)
  366 + return -EINVAL;
  367 +
  368 + if (index == VFIO_PCI_CONFIG_REGION_INDEX)
  369 + return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
  370 + else if (index == VFIO_PCI_ROM_REGION_INDEX)
  371 + return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
  372 + else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
  373 + return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
  374 + else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
  375 + return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
  376 +
  377 + return -EINVAL;
  378 +}
  379 +
  380 +static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
  381 + size_t count, loff_t *ppos)
  382 +{
  383 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
  384 + struct vfio_pci_device *vdev = device_data;
  385 + struct pci_dev *pdev = vdev->pdev;
  386 +
  387 + if (index >= VFIO_PCI_NUM_REGIONS)
  388 + return -EINVAL;
  389 +
  390 + if (index == VFIO_PCI_CONFIG_REGION_INDEX)
  391 + return vfio_pci_config_readwrite(vdev, (char __user *)buf,
  392 + count, ppos, true);
  393 + else if (index == VFIO_PCI_ROM_REGION_INDEX)
  394 + return -EINVAL;
  395 + else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
  396 + return vfio_pci_io_readwrite(vdev, (char __user *)buf,
  397 + count, ppos, true);
  398 + else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
  399 + return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
  400 + count, ppos, true);
  401 + }
  402 +
  403 + return -EINVAL;
  404 +}
  405 +
  406 +static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
  407 +{
  408 + struct vfio_pci_device *vdev = device_data;
  409 + struct pci_dev *pdev = vdev->pdev;
  410 + unsigned int index;
  411 + u64 phys_len, req_len, pgoff, req_start, phys;
  412 + int ret;
  413 +
  414 + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
  415 +
  416 + if (vma->vm_end < vma->vm_start)
  417 + return -EINVAL;
  418 + if ((vma->vm_flags & VM_SHARED) == 0)
  419 + return -EINVAL;
  420 + if (index >= VFIO_PCI_ROM_REGION_INDEX)
  421 + return -EINVAL;
  422 + if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
  423 + return -EINVAL;
  424 +
  425 + phys_len = pci_resource_len(pdev, index);
  426 + req_len = vma->vm_end - vma->vm_start;
  427 + pgoff = vma->vm_pgoff &
  428 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
  429 + req_start = pgoff << PAGE_SHIFT;
  430 +
  431 + if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
  432 + return -EINVAL;
  433 +
  434 + if (index == vdev->msix_bar) {
  435 + /*
  436 + * Disallow mmaps overlapping the MSI-X table; users don't
  437 + * get to touch this directly. We could find somewhere
  438 + * else to map the overlap, but page granularity is only
  439 + * a recommendation, not a requirement, so the user needs
  440 + * to know which bits are real. Requiring them to mmap
  441 + * around the table makes that clear.
  442 + */
  443 +
  444 + /* If neither entirely above nor below, then it overlaps */
  445 + if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
  446 + req_start + req_len <= vdev->msix_offset))
  447 + return -EINVAL;
  448 + }
  449 +
  450 + /*
  451 + * Even though we don't make use of the barmap for the mmap,
  452 + * we need to request the region and the barmap tracks that.
  453 + */
  454 + if (!vdev->barmap[index]) {
  455 + ret = pci_request_selected_regions(pdev,
  456 + 1 << index, "vfio-pci");
  457 + if (ret)
  458 + return ret;
  459 +
  460 + vdev->barmap[index] = pci_iomap(pdev, index, 0);
  461 + }
  462 +
  463 + vma->vm_private_data = vdev;
  464 + vma->vm_flags |= (VM_IO | VM_RESERVED);
  465 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
  466 +
  467 + phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
  468 +
  469 + return remap_pfn_range(vma, vma->vm_start, phys,
  470 + req_len, vma->vm_page_prot);
  471 +}
  472 +
  473 +static const struct vfio_device_ops vfio_pci_ops = {
  474 + .name = "vfio-pci",
  475 + .open = vfio_pci_open,
  476 + .release = vfio_pci_release,
  477 + .ioctl = vfio_pci_ioctl,
  478 + .read = vfio_pci_read,
  479 + .write = vfio_pci_write,
  480 + .mmap = vfio_pci_mmap,
  481 +};
  482 +
  483 +static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  484 +{
  485 + u8 type;
  486 + struct vfio_pci_device *vdev;
  487 + struct iommu_group *group;
  488 + int ret;
  489 +
  490 + pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
  491 + if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
  492 + return -EINVAL;
  493 +
  494 + group = iommu_group_get(&pdev->dev);
  495 + if (!group)
  496 + return -EINVAL;
  497 +
  498 + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
  499 + if (!vdev) {
  500 + iommu_group_put(group);
  501 + return -ENOMEM;
  502 + }
  503 +
  504 + vdev->pdev = pdev;
  505 + vdev->irq_type = VFIO_PCI_NUM_IRQS;
  506 + mutex_init(&vdev->igate);
  507 + spin_lock_init(&vdev->irqlock);
  508 + atomic_set(&vdev->refcnt, 0);
  509 +
  510 + ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
  511 + if (ret) {
  512 + iommu_group_put(group);
  513 + kfree(vdev);
  514 + }
  515 +
  516 + return ret;
  517 +}
  518 +
  519 +static void vfio_pci_remove(struct pci_dev *pdev)
  520 +{
  521 + struct vfio_pci_device *vdev;
  522 +
  523 + vdev = vfio_del_group_dev(&pdev->dev);
  524 + if (!vdev)
  525 + return;
  526 +
  527 + iommu_group_put(pdev->dev.iommu_group);
  528 + kfree(vdev);
  529 +}
  530 +
  531 +static struct pci_driver vfio_pci_driver = {
  532 + .name = "vfio-pci",
  533 + .id_table = NULL, /* only dynamic ids */
  534 + .probe = vfio_pci_probe,
  535 + .remove = vfio_pci_remove,
  536 +};
  537 +
  538 +static void __exit vfio_pci_cleanup(void)
  539 +{
  540 + pci_unregister_driver(&vfio_pci_driver);
  541 + vfio_pci_virqfd_exit();
  542 + vfio_pci_uninit_perm_bits();
  543 +}
  544 +
  545 +static int __init vfio_pci_init(void)
  546 +{
  547 + int ret;
  548 +
  549 + /* Allocate shared config space permision data used by all devices */
  550 + ret = vfio_pci_init_perm_bits();
  551 + if (ret)
  552 + return ret;
  553 +
  554 + /* Start the virqfd cleanup handler */
  555 + ret = vfio_pci_virqfd_init();
  556 + if (ret)
  557 + goto out_virqfd;
  558 +
  559 + /* Register and scan for devices */
  560 + ret = pci_register_driver(&vfio_pci_driver);
  561 + if (ret)
  562 + goto out_driver;
  563 +
  564 + return 0;
  565 +
  566 +out_virqfd:
  567 + vfio_pci_virqfd_exit();
  568 +out_driver:
  569 + vfio_pci_uninit_perm_bits();
  570 + return ret;
  571 +}
  572 +
  573 +module_init(vfio_pci_init);
  574 +module_exit(vfio_pci_cleanup);
  575 +
  576 +MODULE_VERSION(DRIVER_VERSION);
  577 +MODULE_LICENSE("GPL v2");
  578 +MODULE_AUTHOR(DRIVER_AUTHOR);
  579 +MODULE_DESCRIPTION(DRIVER_DESC);
drivers/vfio/pci/vfio_pci_config.c
Changes suppressed. Click to show
  1 +/*
  2 + * VFIO PCI config space virtualization
  3 + *
  4 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  5 + * Author: Alex Williamson <alex.williamson@redhat.com>
  6 + *
  7 + * This program is free software; you can redistribute it and/or modify
  8 + * it under the terms of the GNU General Public License version 2 as
  9 + * published by the Free Software Foundation.
  10 + *
  11 + * Derived from original vfio:
  12 + * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  13 + * Author: Tom Lyon, pugs@cisco.com
  14 + */
  15 +
  16 +/*
  17 + * This code handles reading and writing of PCI configuration registers.
  18 + * This is hairy because we want to allow a lot of flexibility to the
  19 + * user driver, but cannot trust it with all of the config fields.
  20 + * Tables determine which fields can be read and written, as well as
  21 + * which fields are 'virtualized' - special actions and translations to
  22 + * make it appear to the user that he has control, when in fact things
  23 + * must be negotiated with the underlying OS.
  24 + */
  25 +
  26 +#include <linux/fs.h>
  27 +#include <linux/pci.h>
  28 +#include <linux/uaccess.h>
  29 +#include <linux/vfio.h>
  30 +
  31 +#include "vfio_pci_private.h"
  32 +
#define PCI_CFG_SPACE_SIZE	256

/* Useful "pseudo" capabilities */
#define PCI_CAP_ID_BASIC	0
#define PCI_CAP_ID_INVALID	0xFF

/*
 * True when config space @offset falls inside one of the six base
 * address registers or the expansion ROM address register.
 *
 * The argument is parenthesized so an expression such as "a | b" is
 * compared as a whole; it is still evaluated more than once, so it must
 * not have side effects.
 */
#define is_bar(offset)	\
	(((offset) >= PCI_BASE_ADDRESS_0 && \
	  (offset) < PCI_BASE_ADDRESS_5 + 4) || \
	 ((offset) >= PCI_ROM_ADDRESS && (offset) < PCI_ROM_ADDRESS + 4))
  42 +
  43 +/*
  44 + * Lengths of PCI Config Capabilities
  45 + * 0: Removed from the user visible capability list
  46 + * FF: Variable length
  47 + */
  48 +static u8 pci_cap_length[] = {
  49 + [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */
  50 + [PCI_CAP_ID_PM] = PCI_PM_SIZEOF,
  51 + [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF,
  52 + [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF,
  53 + [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */
  54 + [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */
  55 + [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */
  56 + [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */
  57 + [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */
  58 + [PCI_CAP_ID_VNDR] = 0xFF, /* variable */
  59 + [PCI_CAP_ID_DBG] = 0, /* debug - don't care */
  60 + [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */
  61 + [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */
  62 + [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */
  63 + [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */
  64 + [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */
  65 + [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */
  66 + [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF,
  67 + [PCI_CAP_ID_SATA] = 0xFF,
  68 + [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF,
  69 +};
  70 +
  71 +/*
  72 + * Lengths of PCIe/PCI-X Extended Config Capabilities
  73 + * 0: Removed or masked from the user visible capabilty list
  74 + * FF: Variable length
  75 + */
  76 +static u16 pci_ext_cap_length[] = {
  77 + [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND,
  78 + [PCI_EXT_CAP_ID_VC] = 0xFF,
  79 + [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF,
  80 + [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF,
  81 + [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */
  82 + [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */
  83 + [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */
  84 + [PCI_EXT_CAP_ID_MFVC] = 0xFF,
  85 + [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */
  86 + [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */
  87 + [PCI_EXT_CAP_ID_VNDR] = 0xFF,
  88 + [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */
  89 + [PCI_EXT_CAP_ID_ACS] = 0xFF,
  90 + [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF,
  91 + [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF,
  92 + [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF,
  93 + [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */
  94 + [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
  95 + [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF,
  96 + [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */
  97 + [PCI_EXT_CAP_ID_REBAR] = 0xFF,
  98 + [PCI_EXT_CAP_ID_DPA] = 0xFF,
  99 + [PCI_EXT_CAP_ID_TPH] = 0xFF,
  100 + [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF,
  101 + [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */
  102 + [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */
  103 + [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */
  104 +};
  105 +
  106 +/*
  107 + * Read/Write Permission Bits - one bit for each bit in capability
  108 + * Any field can be read if it exists, but what is read depends on
  109 + * whether the field is 'virtualized', or just pass thru to the
  110 + * hardware. Any virtualized field is also virtualized for writes.
  111 + * Writes are only permitted if they have a 1 bit here.
  112 + */
  113 +struct perm_bits {
  114 + u8 *virt; /* read/write virtual data, not hw */
  115 + u8 *write; /* writeable bits */
  116 + int (*readfn)(struct vfio_pci_device *vdev, int pos, int count,
  117 + struct perm_bits *perm, int offset, __le32 *val);
  118 + int (*writefn)(struct vfio_pci_device *vdev, int pos, int count,
  119 + struct perm_bits *perm, int offset, __le32 val);
  120 +};
  121 +
/* Dword-wide masks for the 'virt' and 'write' permission bits */
#define	NO_VIRT		0
#define	ALL_VIRT	0xFFFFFFFFU
#define	NO_WRITE	0
#define	ALL_WRITE	0xFFFFFFFFU
  126 +
  127 +static int vfio_user_config_read(struct pci_dev *pdev, int offset,
  128 + __le32 *val, int count)
  129 +{
  130 + int ret = -EINVAL;
  131 + u32 tmp_val = 0;
  132 +
  133 + switch (count) {
  134 + case 1:
  135 + {
  136 + u8 tmp;
  137 + ret = pci_user_read_config_byte(pdev, offset, &tmp);
  138 + tmp_val = tmp;
  139 + break;
  140 + }
  141 + case 2:
  142 + {
  143 + u16 tmp;
  144 + ret = pci_user_read_config_word(pdev, offset, &tmp);
  145 + tmp_val = tmp;
  146 + break;
  147 + }
  148 + case 4:
  149 + ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
  150 + break;
  151 + }
  152 +
  153 + *val = cpu_to_le32(tmp_val);
  154 +
  155 + return pcibios_err_to_errno(ret);
  156 +}
  157 +
  158 +static int vfio_user_config_write(struct pci_dev *pdev, int offset,
  159 + __le32 val, int count)
  160 +{
  161 + int ret = -EINVAL;
  162 + u32 tmp_val = le32_to_cpu(val);
  163 +
  164 + switch (count) {
  165 + case 1:
  166 + ret = pci_user_write_config_byte(pdev, offset, tmp_val);
  167 + break;
  168 + case 2:
  169 + ret = pci_user_write_config_word(pdev, offset, tmp_val);
  170 + break;
  171 + case 4:
  172 + ret = pci_user_write_config_dword(pdev, offset, tmp_val);
  173 + break;
  174 + }
  175 +
  176 + return pcibios_err_to_errno(ret);
  177 +}
  178 +
  179 +static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
  180 + int count, struct perm_bits *perm,
  181 + int offset, __le32 *val)
  182 +{
  183 + __le32 virt = 0;
  184 +
  185 + memcpy(val, vdev->vconfig + pos, count);
  186 +
  187 + memcpy(&virt, perm->virt + offset, count);
  188 +
  189 + /* Any non-virtualized bits? */
  190 + if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
  191 + struct pci_dev *pdev = vdev->pdev;
  192 + __le32 phys_val = 0;
  193 + int ret;
  194 +
  195 + ret = vfio_user_config_read(pdev, pos, &phys_val, count);
  196 + if (ret)
  197 + return ret;
  198 +
  199 + *val = (phys_val & ~virt) | (*val & virt);
  200 + }
  201 +
  202 + return count;
  203 +}
  204 +
  205 +static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
  206 + int count, struct perm_bits *perm,
  207 + int offset, __le32 val)
  208 +{
  209 + __le32 virt = 0, write = 0;
  210 +
  211 + memcpy(&write, perm->write + offset, count);
  212 +
  213 + if (!write)
  214 + return count; /* drop, no writable bits */
  215 +
  216 + memcpy(&virt, perm->virt + offset, count);
  217 +
  218 + /* Virtualized and writable bits go to vconfig */
  219 + if (write & virt) {
  220 + __le32 virt_val = 0;
  221 +
  222 + memcpy(&virt_val, vdev->vconfig + pos, count);
  223 +
  224 + virt_val &= ~(write & virt);
  225 + virt_val |= (val & (write & virt));
  226 +
  227 + memcpy(vdev->vconfig + pos, &virt_val, count);
  228 + }
  229 +
  230 + /* Non-virtualzed and writable bits go to hardware */
  231 + if (write & ~virt) {
  232 + struct pci_dev *pdev = vdev->pdev;
  233 + __le32 phys_val = 0;
  234 + int ret;
  235 +
  236 + ret = vfio_user_config_read(pdev, pos, &phys_val, count);
  237 + if (ret)
  238 + return ret;
  239 +
  240 + phys_val &= ~(write & ~virt);
  241 + phys_val |= (val & (write & ~virt));
  242 +
  243 + ret = vfio_user_config_write(pdev, pos, phys_val, count);
  244 + if (ret)
  245 + return ret;
  246 + }
  247 +
  248 + return count;
  249 +}
  250 +
  251 +/* Allow direct read from hardware, except for capability next pointer */
  252 +static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
  253 + int count, struct perm_bits *perm,
  254 + int offset, __le32 *val)
  255 +{
  256 + int ret;
  257 +
  258 + ret = vfio_user_config_read(vdev->pdev, pos, val, count);
  259 + if (ret)
  260 + return pcibios_err_to_errno(ret);
  261 +
  262 + if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
  263 + if (offset < 4)
  264 + memcpy(val, vdev->vconfig + pos, count);
  265 + } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
  266 + if (offset == PCI_CAP_LIST_ID && count > 1)
  267 + memcpy(val, vdev->vconfig + pos,
  268 + min(PCI_CAP_FLAGS, count));
  269 + else if (offset == PCI_CAP_LIST_NEXT)
  270 + memcpy(val, vdev->vconfig + pos, 1);
  271 + }
  272 +
  273 + return count;
  274 +}
  275 +
  276 +static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
  277 + int count, struct perm_bits *perm,
  278 + int offset, __le32 val)
  279 +{
  280 + int ret;
  281 +
  282 + ret = vfio_user_config_write(vdev->pdev, pos, val, count);
  283 + if (ret)
  284 + return ret;
  285 +
  286 + return count;
  287 +}
  288 +
  289 +/* Default all regions to read-only, no-virtualization */
  290 +static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
  291 + [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
  292 +};
  293 +static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
  294 + [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
  295 +};
  296 +
  297 +static void free_perm_bits(struct perm_bits *perm)
  298 +{
  299 + kfree(perm->virt);
  300 + kfree(perm->write);
  301 + perm->virt = NULL;
  302 + perm->write = NULL;
  303 +}
  304 +
  305 +static int alloc_perm_bits(struct perm_bits *perm, int size)
  306 +{
  307 + /*
  308 + * Round up all permission bits to the next dword, this lets us
  309 + * ignore whether a read/write exceeds the defined capability
  310 + * structure. We can do this because:
  311 + * - Standard config space is already dword aligned
  312 + * - Capabilities are all dword alinged (bits 0:1 of next reserved)
  313 + * - Express capabilities defined as dword aligned
  314 + */
  315 + size = round_up(size, 4);
  316 +
  317 + /*
  318 + * Zero state is
  319 + * - All Readable, None Writeable, None Virtualized
  320 + */
  321 + perm->virt = kzalloc(size, GFP_KERNEL);
  322 + perm->write = kzalloc(size, GFP_KERNEL);
  323 + if (!perm->virt || !perm->write) {
  324 + free_perm_bits(perm);
  325 + return -ENOMEM;
  326 + }
  327 +
  328 + perm->readfn = vfio_default_config_read;
  329 + perm->writefn = vfio_default_config_write;
  330 +
  331 + return 0;
  332 +}
  333 +
  334 +/*
  335 + * Helper functions for filling in permission tables
  336 + */
  337 +static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
  338 +{
  339 + p->virt[off] = virt;
  340 + p->write[off] = write;
  341 +}
  342 +
  343 +/* Handle endian-ness - pci and tables are little-endian */
  344 +static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
  345 +{
  346 + *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
  347 + *(__le16 *)(&p->write[off]) = cpu_to_le16(write);
  348 +}
  349 +
  350 +/* Handle endian-ness - pci and tables are little-endian */
  351 +static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
  352 +{
  353 + *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
  354 + *(__le32 *)(&p->write[off]) = cpu_to_le32(write);
  355 +}
  356 +
  357 +/*
  358 + * Restore the *real* BARs after we detect a FLR or backdoor reset.
  359 + * (backdoor = some device specific technique that we didn't catch)
  360 + */
  361 +static void vfio_bar_restore(struct vfio_pci_device *vdev)
  362 +{
  363 + struct pci_dev *pdev = vdev->pdev;
  364 + u32 *rbar = vdev->rbar;
  365 + int i;
  366 +
  367 + if (pdev->is_virtfn)
  368 + return;
  369 +
  370 + pr_info("%s: %s reset recovery - restoring bars\n",
  371 + __func__, dev_name(&pdev->dev));
  372 +
  373 + for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
  374 + pci_user_write_config_dword(pdev, i, *rbar);
  375 +
  376 + pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
  377 +}
  378 +
  379 +static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
  380 +{
  381 + unsigned long flags = pci_resource_flags(pdev, bar);
  382 + u32 val;
  383 +
  384 + if (flags & IORESOURCE_IO)
  385 + return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
  386 +
  387 + val = PCI_BASE_ADDRESS_SPACE_MEMORY;
  388 +
  389 + if (flags & IORESOURCE_PREFETCH)
  390 + val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
  391 +
  392 + if (flags & IORESOURCE_MEM_64)
  393 + val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
  394 +
  395 + return cpu_to_le32(val);
  396 +}
  397 +
  398 +/*
  399 + * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
  400 + * to reflect the hardware capabilities. This implements BAR sizing.
  401 + */
  402 +static void vfio_bar_fixup(struct vfio_pci_device *vdev)
  403 +{
  404 + struct pci_dev *pdev = vdev->pdev;
  405 + int i;
  406 + __le32 *bar;
  407 + u64 mask;
  408 +
  409 + bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
  410 +
  411 + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
  412 + if (!pci_resource_start(pdev, i)) {
  413 + *bar = 0; /* Unmapped by host = unimplemented to user */
  414 + continue;
  415 + }
  416 +
  417 + mask = ~(pci_resource_len(pdev, i) - 1);
  418 +
  419 + *bar &= cpu_to_le32((u32)mask);
  420 + *bar |= vfio_generate_bar_flags(pdev, i);
  421 +
  422 + if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
  423 + bar++;
  424 + *bar &= cpu_to_le32((u32)(mask >> 32));
  425 + i++;
  426 + }
  427 + }
  428 +
  429 + bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
  430 +
  431 + /*
  432 + * NB. we expose the actual BAR size here, regardless of whether
  433 + * we can read it. When we report the REGION_INFO for the ROM
  434 + * we report what PCI tells us is the actual ROM size.
  435 + */
  436 + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
  437 + mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
  438 + mask |= PCI_ROM_ADDRESS_ENABLE;
  439 + *bar &= cpu_to_le32((u32)mask);
  440 + } else
  441 + *bar = 0;
  442 +
  443 + vdev->bardirty = false;
  444 +}
  445 +
  446 +static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
  447 + int count, struct perm_bits *perm,
  448 + int offset, __le32 *val)
  449 +{
  450 + if (is_bar(offset)) /* pos == offset for basic config */
  451 + vfio_bar_fixup(vdev);
  452 +
  453 + count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
  454 +
  455 + /* Mask in virtual memory enable for SR-IOV devices */
  456 + if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
  457 + u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
  458 + u32 tmp_val = le32_to_cpu(*val);
  459 +
  460 + tmp_val |= cmd & PCI_COMMAND_MEMORY;
  461 + *val = cpu_to_le32(tmp_val);
  462 + }
  463 +
  464 + return count;
  465 +}
  466 +
/*
 * Write handler for the basic PCI header.
 *
 * For a write at PCI_COMMAND the physical command register is read first
 * to detect a device whose memory/io decode was disabled behind our back,
 * in which case the real BARs are restored before the write proceeds.
 * The write itself is performed by vfio_default_config_write().  After
 * it, the memory/io enable bits the user wrote are recorded in vconfig,
 * INTx disable is emulated through vfio_pci_intx_mask()/unmask(), and a
 * BAR write marks the virtual BARs dirty.
 *
 * Returns the default handler's count, a negative value if that handler
 * failed, or the non-zero result of pci_user_read_config_word().
 */
static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
				   int count, struct perm_bits *perm,
				   int offset, __le32 val)
{
	struct pci_dev *pdev = vdev->pdev;
	__le16 *virt_cmd;
	u16 new_cmd = 0;
	int ret;

	/* Virtual copy of the command register, little-endian in vconfig */
	virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND];

	if (offset == PCI_COMMAND) {
		bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io;
		u16 phys_cmd;

		ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd);
		if (ret)
			return ret;

		/* Only the low 16 bits of the written value are kept */
		new_cmd = le32_to_cpu(val);

		phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY);
		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);

		phys_io = !!(phys_cmd & PCI_COMMAND_IO);
		virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO);
		new_io = !!(new_cmd & PCI_COMMAND_IO);

		/*
		 * If the user is writing mem/io enable (new_mem/io) and we
		 * think it's already enabled (virt_mem/io), but the hardware
		 * shows it disabled (phys_mem/io), then the device has
		 * undergone some kind of backdoor reset and needs to be
		 * restored before we allow it to enable the bars.
		 * SR-IOV devices will trigger this, but we catch them later
		 * (vfio_bar_restore() returns early for virtual functions).
		 */
		if ((new_mem && virt_mem && !phys_mem) ||
		    (new_io && virt_io && !phys_io))
			vfio_bar_restore(vdev);
	}

	count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
	if (count < 0)
		return count;

	/*
	 * Save current memory/io enable bits in vconfig to allow for
	 * the test above next time.
	 */
	if (offset == PCI_COMMAND) {
		u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO;

		*virt_cmd &= cpu_to_le16(~mask);
		*virt_cmd |= cpu_to_le16(new_cmd & mask);
	}

	/* Emulate INTx disable; either byte of the command register counts */
	if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) {
		bool virt_intx_disable;

		virt_intx_disable = !!(le16_to_cpu(*virt_cmd) &
				       PCI_COMMAND_INTX_DISABLE);

		/* Only act on a change relative to the tracked state */
		if (virt_intx_disable && !vdev->virq_disabled) {
			vdev->virq_disabled = true;
			vfio_pci_intx_mask(vdev);
		} else if (!virt_intx_disable && vdev->virq_disabled) {
			vdev->virq_disabled = false;
			vfio_pci_intx_unmask(vdev);
		}
	}

	/* vfio_bar_fixup() clears this flag again */
	if (is_bar(offset))
		vdev->bardirty = true;

	return count;
}
  545 +
  546 +/* Permissions for the Basic PCI Header */
  547 +static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
  548 +{
  549 + if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF))
  550 + return -ENOMEM;
  551 +
  552 + perm->readfn = vfio_basic_config_read;
  553 + perm->writefn = vfio_basic_config_write;
  554 +
  555 + /* Virtualized for SR-IOV functions, which just have FFFF */
  556 + p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE);
  557 + p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE);
  558 +
  559 + /*
  560 + * Virtualize INTx disable, we use it internally for interrupt
  561 + * control and can emulate it for non-PCI 2.3 devices.
  562 + */
  563 + p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE);
  564 +
  565 + /* Virtualize capability list, we might want to skip/disable */
  566 + p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE);
  567 +
  568 + /* No harm to write */
  569 + p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE);
  570 + p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE);
  571 + p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE);
  572 +
  573 + /* Virtualize all bars, can't touch the real ones */
  574 + p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE);
  575 + p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE);
  576 + p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE);
  577 + p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE);
  578 + p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE);
  579 + p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE);
  580 + p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE);
  581 +
  582 + /* Allow us to adjust capability chain */
  583 + p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE);
  584 +
  585 + /* Sometimes used by sw, just virtualize */
  586 + p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE);
  587 + return 0;
  588 +}
  589 +
  590 +/* Permissions for the Power Management capability */
  591 +static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
  592 +{
  593 + if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM]))
  594 + return -ENOMEM;
  595 +
  596 + /*
  597 + * We always virtualize the next field so we can remove
  598 + * capabilities from the chain if we want to.
  599 + */
  600 + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
  601 +
  602 + /*
  603 + * Power management is defined *per function*,
  604 + * so we let the user write this
  605 + */
  606 + p_setd(perm, PCI_PM_CTRL, NO_VIRT, ALL_WRITE);
  607 + return 0;
  608 +}
  609 +
  610 +/* Permissions for PCI-X capability */
  611 +static int __init init_pci_cap_pcix_perm(struct perm_bits *perm)
  612 +{
  613 + /* Alloc 24, but only 8 are used in v0 */
  614 + if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2))
  615 + return -ENOMEM;
  616 +
  617 + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
  618 +
  619 + p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE);
  620 + p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE);
  621 + return 0;
  622 +}
  623 +
  624 +/* Permissions for PCI Express capability */
  625 +static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
  626 +{
  627 + /* Alloc larger of two possible sizes */
  628 + if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2))
  629 + return -ENOMEM;
  630 +
  631 + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
  632 +
  633 + /*
  634 + * Allow writes to device control fields (includes FLR!)
  635 + * but not to devctl_phantom which could confuse IOMMU
  636 + * or to the ARI bit in devctl2 which is set at probe time
  637 + */
  638 + p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM);
  639 + p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
  640 + return 0;
  641 +}
  642 +
  643 +/* Permissions for Advanced Function capability */
  644 +static int __init init_pci_cap_af_perm(struct perm_bits *perm)
  645 +{
  646 + if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF]))
  647 + return -ENOMEM;
  648 +
  649 + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
  650 + p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR);
  651 + return 0;
  652 +}
  653 +
  654 +/* Permissions for Advanced Error Reporting extended capability */
  655 +static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm)
  656 +{
  657 + u32 mask;
  658 +
  659 + if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR]))
  660 + return -ENOMEM;
  661 +
  662 + /*
  663 + * Virtualize the first dword of all express capabilities
  664 + * because it includes the next pointer. This lets us later
  665 + * remove capabilities from the chain if we need to.
  666 + */
  667 + p_setd(perm, 0, ALL_VIRT, NO_WRITE);
  668 +
  669 + /* Writable bits mask */
  670 + mask = PCI_ERR_UNC_TRAIN | /* Training */
  671 + PCI_ERR_UNC_DLP | /* Data Link Protocol */
  672 + PCI_ERR_UNC_SURPDN | /* Surprise Down */
  673 + PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */
  674 + PCI_ERR_UNC_FCP | /* Flow Control Protocol */
  675 + PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */
  676 + PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */
  677 + PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */
  678 + PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */
  679 + PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */
  680 + PCI_ERR_UNC_ECRC | /* ECRC Error Status */
  681 + PCI_ERR_UNC_UNSUP | /* Unsupported Request */
  682 + PCI_ERR_UNC_ACSV | /* ACS Violation */
  683 + PCI_ERR_UNC_INTN | /* internal error */
  684 + PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */
  685 + PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */
  686 + PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */
  687 + p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask);
  688 + p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask);
  689 + p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask);
  690 +
  691 + mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */
  692 + PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */
  693 + PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */
  694 + PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */
  695 + PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */
  696 + PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */
  697 + PCI_ERR_COR_INTERNAL | /* Corrected Internal */
  698 + PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */
  699 + p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask);
  700 + p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask);
  701 +
  702 + mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */
  703 + PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */
  704 + p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask);
  705 + return 0;
  706 +}
  707 +
  708 +/* Permissions for Power Budgeting extended capability */
  709 +static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm)
  710 +{
  711 + if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR]))
  712 + return -ENOMEM;
  713 +
  714 + p_setd(perm, 0, ALL_VIRT, NO_WRITE);
  715 +
  716 + /* Writing the data selector is OK, the info is still read-only */
  717 + p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE);
  718 + return 0;
  719 +}
  720 +
  721 +/*
  722 + * Initialize the shared permission tables
  723 + */
  724 +void vfio_pci_uninit_perm_bits(void)
  725 +{
  726 + free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]);
  727 +
  728 + free_perm_bits(&cap_perms[PCI_CAP_ID_PM]);
  729 + free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]);
  730 + free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]);
  731 + free_perm_bits(&cap_perms[PCI_CAP_ID_AF]);
  732 +
  733 + free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
  734 + free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
  735 +}
  736 +
  737 +int __init vfio_pci_init_perm_bits(void)
  738 +{
  739 + int ret;
  740 +
  741 + /* Basic config space */
  742 + ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]);
  743 +
  744 + /* Capabilities */
  745 + ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
  746 + cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
  747 + ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
  748 + cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
  749 + ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
  750 + ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
  751 +
  752 + /* Extended capabilities */
  753 + ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
  754 + ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
  755 + ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
  756 +
  757 + if (ret)
  758 + vfio_pci_uninit_perm_bits();
  759 +
  760 + return ret;
  761 +}
  762 +
  763 +static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
  764 +{
  765 + u8 cap;
  766 + int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
  767 + PCI_STD_HEADER_SIZEOF;
  768 + base /= 4;
  769 + pos /= 4;
  770 +
  771 + cap = vdev->pci_config_map[pos];
  772 +
  773 + if (cap == PCI_CAP_ID_BASIC)
  774 + return 0;
  775 +
  776 + /* XXX Can we have to abutting capabilities of the same type? */
  777 + while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
  778 + pos--;
  779 +
  780 + return pos * 4;
  781 +}
  782 +
  783 +static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
  784 + int count, struct perm_bits *perm,
  785 + int offset, __le32 *val)
  786 +{
  787 + /* Update max available queue size from msi_qmax */
  788 + if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
  789 + __le16 *flags;
  790 + int start;
  791 +
  792 + start = vfio_find_cap_start(vdev, pos);
  793 +
  794 + flags = (__le16 *)&vdev->vconfig[start];
  795 +
  796 + *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK);
  797 + *flags |= cpu_to_le16(vdev->msi_qmax << 1);
  798 + }
  799 +
  800 + return vfio_default_config_read(vdev, pos, count, perm, offset, val);
  801 +}
  802 +
  803 +static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
  804 + int count, struct perm_bits *perm,
  805 + int offset, __le32 val)
  806 +{
  807 + count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
  808 + if (count < 0)
  809 + return count;
  810 +
  811 + /* Fixup and write configured queue size and enable to hardware */
  812 + if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) {
  813 + __le16 *pflags;
  814 + u16 flags;
  815 + int start, ret;
  816 +
  817 + start = vfio_find_cap_start(vdev, pos);
  818 +
  819 + pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS];
  820 +
  821 + flags = le16_to_cpu(*pflags);
  822 +
  823 + /* MSI is enabled via ioctl */
  824 + if (!is_msi(vdev))
  825 + flags &= ~PCI_MSI_FLAGS_ENABLE;
  826 +
  827 + /* Check queue size */
  828 + if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) {
  829 + flags &= ~PCI_MSI_FLAGS_QSIZE;
  830 + flags |= vdev->msi_qmax << 4;
  831 + }
  832 +
  833 + /* Write back to virt and to hardware */
  834 + *pflags = cpu_to_le16(flags);
  835 + ret = pci_user_write_config_word(vdev->pdev,
  836 + start + PCI_MSI_FLAGS,
  837 + flags);
  838 + if (ret)
  839 + return pcibios_err_to_errno(ret);
  840 + }
  841 +
  842 + return count;
  843 +}
  844 +
  845 +/*
  846 + * MSI determination is per-device, so this routine gets used beyond
  847 + * initialization time. Don't add __init
  848 + */
  849 +static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags)
  850 +{
  851 + if (alloc_perm_bits(perm, len))
  852 + return -ENOMEM;
  853 +
  854 + perm->readfn = vfio_msi_config_read;
  855 + perm->writefn = vfio_msi_config_write;
  856 +
  857 + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
  858 +
  859 + /*
  860 + * The upper byte of the control register is reserved,
  861 + * just setup the lower byte.
  862 + */
  863 + p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE);
  864 + p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE);
  865 + if (flags & PCI_MSI_FLAGS_64BIT) {
  866 + p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE);
  867 + p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE);
  868 + if (flags & PCI_MSI_FLAGS_MASKBIT) {
  869 + p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE);
  870 + p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE);
  871 + }
  872 + } else {
  873 + p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE);
  874 + if (flags & PCI_MSI_FLAGS_MASKBIT) {
  875 + p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE);
  876 + p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE);
  877 + }
  878 + }
  879 + return 0;
  880 +}
  881 +
  882 +/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */
  883 +static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos)
  884 +{
  885 + struct pci_dev *pdev = vdev->pdev;
  886 + int len, ret;
  887 + u16 flags;
  888 +
  889 + ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
  890 + if (ret)
  891 + return pcibios_err_to_errno(ret);
  892 +
  893 + len = 10; /* Minimum size */
  894 + if (flags & PCI_MSI_FLAGS_64BIT)
  895 + len += 4;
  896 + if (flags & PCI_MSI_FLAGS_MASKBIT)
  897 + len += 10;
  898 +
  899 + if (vdev->msi_perm)
  900 + return len;
  901 +
  902 + vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL);
  903 + if (!vdev->msi_perm)
  904 + return -ENOMEM;
  905 +
  906 + ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
  907 + if (ret)
  908 + return ret;
  909 +
  910 + return len;
  911 +}
  912 +
  913 +/* Determine extended capability length for VC (2 & 9) and MFVC */
  914 +static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
  915 +{
  916 + struct pci_dev *pdev = vdev->pdev;
  917 + u32 tmp;
  918 + int ret, evcc, phases, vc_arb;
  919 + int len = PCI_CAP_VC_BASE_SIZEOF;
  920 +
  921 + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG1, &tmp);
  922 + if (ret)
  923 + return pcibios_err_to_errno(ret);
  924 +
  925 + evcc = tmp & PCI_VC_REG1_EVCC; /* extended vc count */
  926 + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_REG2, &tmp);
  927 + if (ret)
  928 + return pcibios_err_to_errno(ret);
  929 +
  930 + if (tmp & PCI_VC_REG2_128_PHASE)
  931 + phases = 128;
  932 + else if (tmp & PCI_VC_REG2_64_PHASE)
  933 + phases = 64;
  934 + else if (tmp & PCI_VC_REG2_32_PHASE)
  935 + phases = 32;
  936 + else
  937 + phases = 0;
  938 +
  939 + vc_arb = phases * 4;
  940 +
  941 + /*
  942 + * Port arbitration tables are root & switch only;
  943 + * function arbitration tables are function 0 only.
  944 + * In either case, we'll never let user write them so
  945 + * we don't care how big they are
  946 + */
  947 + len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF;
  948 + if (vc_arb) {
  949 + len = round_up(len, 16);
  950 + len += vc_arb / 8;
  951 + }
  952 + return len;
  953 +}
  954 +
  955 +static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
  956 +{
  957 + struct pci_dev *pdev = vdev->pdev;
  958 + u16 word;
  959 + u8 byte;
  960 + int ret;
  961 +
  962 + switch (cap) {
  963 + case PCI_CAP_ID_MSI:
  964 + return vfio_msi_cap_len(vdev, pos);
  965 + case PCI_CAP_ID_PCIX:
  966 + ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word);
  967 + if (ret)
  968 + return pcibios_err_to_errno(ret);
  969 +
  970 + if (PCI_X_CMD_VERSION(word)) {
  971 + vdev->extended_caps = true;
  972 + return PCI_CAP_PCIX_SIZEOF_V2;
  973 + } else
  974 + return PCI_CAP_PCIX_SIZEOF_V0;
  975 + case PCI_CAP_ID_VNDR:
  976 + /* length follows next field */
  977 + ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte);
  978 + if (ret)
  979 + return pcibios_err_to_errno(ret);
  980 +
  981 + return byte;
  982 + case PCI_CAP_ID_EXP:
  983 + /* length based on version */
  984 + ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
  985 + if (ret)
  986 + return pcibios_err_to_errno(ret);
  987 +
  988 + if ((word & PCI_EXP_FLAGS_VERS) == 1)
  989 + return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
  990 + else {
  991 + vdev->extended_caps = true;
  992 + return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
  993 + }
  994 + case PCI_CAP_ID_HT:
  995 + ret = pci_read_config_byte(pdev, pos + 3, &byte);
  996 + if (ret)
  997 + return pcibios_err_to_errno(ret);
  998 +
  999 + return (byte & HT_3BIT_CAP_MASK) ?
  1000 + HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG;
  1001 + case PCI_CAP_ID_SATA:
  1002 + ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte);
  1003 + if (ret)
  1004 + return pcibios_err_to_errno(ret);
  1005 +
  1006 + byte &= PCI_SATA_REGS_MASK;
  1007 + if (byte == PCI_SATA_REGS_INLINE)
  1008 + return PCI_SATA_SIZEOF_LONG;
  1009 + else
  1010 + return PCI_SATA_SIZEOF_SHORT;
  1011 + default:
  1012 + pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n",
  1013 + dev_name(&pdev->dev), __func__, cap, pos);
  1014 + }
  1015 +
  1016 + return 0;
  1017 +}
  1018 +
  1019 +static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos)
  1020 +{
  1021 + struct pci_dev *pdev = vdev->pdev;
  1022 + u8 byte;
  1023 + u32 dword;
  1024 + int ret;
  1025 +
  1026 + switch (ecap) {
  1027 + case PCI_EXT_CAP_ID_VNDR:
  1028 + ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
  1029 + if (ret)
  1030 + return pcibios_err_to_errno(ret);
  1031 +
  1032 + return dword >> PCI_VSEC_HDR_LEN_SHIFT;
  1033 + case PCI_EXT_CAP_ID_VC:
  1034 + case PCI_EXT_CAP_ID_VC9:
  1035 + case PCI_EXT_CAP_ID_MFVC:
  1036 + return vfio_vc_cap_len(vdev, epos);
  1037 + case PCI_EXT_CAP_ID_ACS:
  1038 + ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte);
  1039 + if (ret)
  1040 + return pcibios_err_to_errno(ret);
  1041 +
  1042 + if (byte & PCI_ACS_EC) {
  1043 + int bits;
  1044 +
  1045 + ret = pci_read_config_byte(pdev,
  1046 + epos + PCI_ACS_EGRESS_BITS,
  1047 + &byte);
  1048 + if (ret)
  1049 + return pcibios_err_to_errno(ret);
  1050 +
  1051 + bits = byte ? round_up(byte, 32) : 256;
  1052 + return 8 + (bits / 8);
  1053 + }
  1054 + return 8;
  1055 +
  1056 + case PCI_EXT_CAP_ID_REBAR:
  1057 + ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte);
  1058 + if (ret)
  1059 + return pcibios_err_to_errno(ret);
  1060 +
  1061 + byte &= PCI_REBAR_CTRL_NBAR_MASK;
  1062 + byte >>= PCI_REBAR_CTRL_NBAR_SHIFT;
  1063 +
  1064 + return 4 + (byte * 8);
  1065 + case PCI_EXT_CAP_ID_DPA:
  1066 + ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte);
  1067 + if (ret)
  1068 + return pcibios_err_to_errno(ret);
  1069 +
  1070 + byte &= PCI_DPA_CAP_SUBSTATE_MASK;
  1071 + byte = round_up(byte + 1, 4);
  1072 + return PCI_DPA_BASE_SIZEOF + byte;
  1073 + case PCI_EXT_CAP_ID_TPH:
  1074 + ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword);
  1075 + if (ret)
  1076 + return pcibios_err_to_errno(ret);
  1077 +
  1078 + if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) {
  1079 + int sts;
  1080 +
  1081 + sts = byte & PCI_TPH_CAP_ST_MASK;
  1082 + sts >>= PCI_TPH_CAP_ST_SHIFT;
  1083 + return PCI_TPH_BASE_SIZEOF + round_up(sts * 2, 4);
  1084 + }
  1085 + return PCI_TPH_BASE_SIZEOF;
  1086 + default:
  1087 + pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n",
  1088 + dev_name(&pdev->dev), __func__, ecap, epos);
  1089 + }
  1090 +
  1091 + return 0;
  1092 +}
  1093 +
  1094 +static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev,
  1095 + int offset, int size)
  1096 +{
  1097 + struct pci_dev *pdev = vdev->pdev;
  1098 + int ret = 0;
  1099 +
  1100 + /*
  1101 + * We try to read physical config space in the largest chunks
  1102 + * we can, assuming that all of the fields support dword access.
  1103 + * pci_save_state() makes this same assumption and seems to do ok.
  1104 + */
  1105 + while (size) {
  1106 + int filled;
  1107 +
  1108 + if (size >= 4 && !(offset % 4)) {
  1109 + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset];
  1110 + u32 dword;
  1111 +
  1112 + ret = pci_read_config_dword(pdev, offset, &dword);
  1113 + if (ret)
  1114 + return ret;
  1115 + *dwordp = cpu_to_le32(dword);
  1116 + filled = 4;
  1117 + } else if (size >= 2 && !(offset % 2)) {
  1118 + __le16 *wordp = (__le16 *)&vdev->vconfig[offset];
  1119 + u16 word;
  1120 +
  1121 + ret = pci_read_config_word(pdev, offset, &word);
  1122 + if (ret)
  1123 + return ret;
  1124 + *wordp = cpu_to_le16(word);
  1125 + filled = 2;
  1126 + } else {
  1127 + u8 *byte = &vdev->vconfig[offset];
  1128 + ret = pci_read_config_byte(pdev, offset, byte);
  1129 + if (ret)
  1130 + return ret;
  1131 + filled = 1;
  1132 + }
  1133 +
  1134 + offset += filled;
  1135 + size -= filled;
  1136 + }
  1137 +
  1138 + return ret;
  1139 +}
  1140 +
  1141 +static int vfio_cap_init(struct vfio_pci_device *vdev)
  1142 +{
  1143 + struct pci_dev *pdev = vdev->pdev;
  1144 + u8 *map = vdev->pci_config_map;
  1145 + u16 status;
  1146 + u8 pos, *prev, cap;
  1147 + int loops, ret, caps = 0;
  1148 +
  1149 + /* Any capabilities? */
  1150 + ret = pci_read_config_word(pdev, PCI_STATUS, &status);
  1151 + if (ret)
  1152 + return ret;
  1153 +
  1154 + if (!(status & PCI_STATUS_CAP_LIST))
  1155 + return 0; /* Done */
  1156 +
  1157 + ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
  1158 + if (ret)
  1159 + return ret;
  1160 +
  1161 + /* Mark the previous position in case we want to skip a capability */
  1162 + prev = &vdev->vconfig[PCI_CAPABILITY_LIST];
  1163 +
  1164 + /* We can bound our loop, capabilities are dword aligned */
  1165 + loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF;
  1166 + while (pos && loops--) {
  1167 + u8 next;
  1168 + int i, len = 0;
  1169 +
  1170 + ret = pci_read_config_byte(pdev, pos, &cap);
  1171 + if (ret)
  1172 + return ret;
  1173 +
  1174 + ret = pci_read_config_byte(pdev,
  1175 + pos + PCI_CAP_LIST_NEXT, &next);
  1176 + if (ret)
  1177 + return ret;
  1178 +
  1179 + if (cap <= PCI_CAP_ID_MAX) {
  1180 + len = pci_cap_length[cap];
  1181 + if (len == 0xFF) { /* Variable length */
  1182 + len = vfio_cap_len(vdev, cap, pos);
  1183 + if (len < 0)
  1184 + return len;
  1185 + }
  1186 + }
  1187 +
  1188 + if (!len) {
  1189 + pr_info("%s: %s hiding cap 0x%x\n",
  1190 + __func__, dev_name(&pdev->dev), cap);
  1191 + *prev = next;
  1192 + pos = next;
  1193 + continue;
  1194 + }
  1195 +
  1196 + /* Sanity check, do we overlap other capabilities? */
  1197 + for (i = 0; i < len; i += 4) {
  1198 + if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
  1199 + continue;
  1200 +
  1201 + pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
  1202 + __func__, dev_name(&pdev->dev),
  1203 + pos + i, map[pos + i], cap);
  1204 + }
  1205 +
  1206 + memset(map + (pos / 4), cap, len / 4);
  1207 + ret = vfio_fill_vconfig_bytes(vdev, pos, len);
  1208 + if (ret)
  1209 + return ret;
  1210 +
  1211 + prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
  1212 + pos = next;
  1213 + caps++;
  1214 + }
  1215 +
  1216 + /* If we didn't fill any capabilities, clear the status flag */
  1217 + if (!caps) {
  1218 + __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS];
  1219 + *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST);
  1220 + }
  1221 +
  1222 + return 0;
  1223 +}
  1224 +
  1225 +static int vfio_ecap_init(struct vfio_pci_device *vdev)
  1226 +{
  1227 + struct pci_dev *pdev = vdev->pdev;
  1228 + u8 *map = vdev->pci_config_map;
  1229 + u16 epos;
  1230 + __le32 *prev = NULL;
  1231 + int loops, ret, ecaps = 0;
  1232 +
  1233 + if (!vdev->extended_caps)
  1234 + return 0;
  1235 +
  1236 + epos = PCI_CFG_SPACE_SIZE;
  1237 +
  1238 + loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF;
  1239 +
  1240 + while (loops-- && epos >= PCI_CFG_SPACE_SIZE) {
  1241 + u32 header;
  1242 + u16 ecap;
  1243 + int i, len = 0;
  1244 + bool hidden = false;
  1245 +
  1246 + ret = pci_read_config_dword(pdev, epos, &header);
  1247 + if (ret)
  1248 + return ret;
  1249 +
  1250 + ecap = PCI_EXT_CAP_ID(header);
  1251 +
  1252 + if (ecap <= PCI_EXT_CAP_ID_MAX) {
  1253 + len = pci_ext_cap_length[ecap];
  1254 + if (len == 0xFF) {
  1255 + len = vfio_ext_cap_len(vdev, ecap, epos);
  1256 + if (len < 0)
  1257 + return ret;
  1258 + }
  1259 + }
  1260 +
  1261 + if (!len) {
  1262 + pr_info("%s: %s hiding ecap 0x%x@0x%x\n",
  1263 + __func__, dev_name(&pdev->dev), ecap, epos);
  1264 +
  1265 + /* If not the first in the chain, we can skip over it */
  1266 + if (prev) {
  1267 + u32 val = epos = PCI_EXT_CAP_NEXT(header);
  1268 + *prev &= cpu_to_le32(~(0xffcU << 20));
  1269 + *prev |= cpu_to_le32(val << 20);
  1270 + continue;
  1271 + }
  1272 +
  1273 + /*
  1274 + * Otherwise, fill in a placeholder, the direct
  1275 + * readfn will virtualize this automatically
  1276 + */
  1277 + len = PCI_CAP_SIZEOF;
  1278 + hidden = true;
  1279 + }
  1280 +
  1281 + for (i = 0; i < len; i += 4) {
  1282 + if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
  1283 + continue;
  1284 +
  1285 + pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
  1286 + __func__, dev_name(&pdev->dev),
  1287 + epos + i, map[epos + i], ecap);
  1288 + }
  1289 +
  1290 + /*
  1291 + * Even though ecap is 2 bytes, we're currently a long way
  1292 + * from exceeding 1 byte capabilities. If we ever make it
  1293 + * up to 0xFF we'll need to up this to a two-byte, byte map.
  1294 + */
  1295 + BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
  1296 +
  1297 + memset(map + (epos / 4), ecap, len / 4);
  1298 + ret = vfio_fill_vconfig_bytes(vdev, epos, len);
  1299 + if (ret)
  1300 + return ret;
  1301 +
  1302 + /*
  1303 + * If we're just using this capability to anchor the list,
  1304 + * hide the real ID. Only count real ecaps. XXX PCI spec
  1305 + * indicates to use cap id = 0, version = 0, next = 0 if
  1306 + * ecaps are absent, hope users check all the way to next.
  1307 + */
  1308 + if (hidden)
  1309 + *(__le32 *)&vdev->vconfig[epos] &=
  1310 + cpu_to_le32((0xffcU << 20));
  1311 + else
  1312 + ecaps++;
  1313 +
  1314 + prev = (__le32 *)&vdev->vconfig[epos];
  1315 + epos = PCI_EXT_CAP_NEXT(header);
  1316 + }
  1317 +
  1318 + if (!ecaps)
  1319 + *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0;
  1320 +
  1321 + return 0;
  1322 +}
  1323 +
  1324 +/*
  1325 + * For each device we allocate a pci_config_map that indicates the
  1326 + * capability occupying each dword and thus the struct perm_bits we
  1327 + * use for read and write. We also allocate a virtualized config
  1328 + * space which tracks reads and writes to bits that we emulate for
  1329 + * the user. Initial values filled from device.
  1330 + *
  1331 + * Using shared stuct perm_bits between all vfio-pci devices saves
  1332 + * us from allocating cfg_size buffers for virt and write for every
  1333 + * device. We could remove vconfig and allocate individual buffers
  1334 + * for each area requring emulated bits, but the array of pointers
  1335 + * would be comparable in size (at least for standard config space).
  1336 + */
  1337 +int vfio_config_init(struct vfio_pci_device *vdev)
  1338 +{
  1339 + struct pci_dev *pdev = vdev->pdev;
  1340 + u8 *map, *vconfig;
  1341 + int ret;
  1342 +
  1343 + /*
  1344 + * Config space, caps and ecaps are all dword aligned, so we can
  1345 + * use one byte per dword to record the type.
  1346 + */
  1347 + map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
  1348 + if (!map)
  1349 + return -ENOMEM;
  1350 +
  1351 + vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL);
  1352 + if (!vconfig) {
  1353 + kfree(map);
  1354 + return -ENOMEM;
  1355 + }
  1356 +
  1357 + vdev->pci_config_map = map;
  1358 + vdev->vconfig = vconfig;
  1359 +
  1360 + memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
  1361 + memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
  1362 + (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
  1363 +
  1364 + ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
  1365 + if (ret)
  1366 + goto out;
  1367 +
  1368 + vdev->bardirty = true;
  1369 +
  1370 + /*
  1371 + * XXX can we just pci_load_saved_state/pci_restore_state?
  1372 + * may need to rebuild vconfig after that
  1373 + */
  1374 +
  1375 + /* For restore after reset */
  1376 + vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]);
  1377 + vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]);
  1378 + vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]);
  1379 + vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]);
  1380 + vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]);
  1381 + vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]);
  1382 + vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]);
  1383 +
  1384 + if (pdev->is_virtfn) {
  1385 + *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
  1386 + *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
  1387 + }
  1388 +
  1389 + ret = vfio_cap_init(vdev);
  1390 + if (ret)
  1391 + goto out;
  1392 +
  1393 + ret = vfio_ecap_init(vdev);
  1394 + if (ret)
  1395 + goto out;
  1396 +
  1397 + return 0;
  1398 +
  1399 +out:
  1400 + kfree(map);
  1401 + vdev->pci_config_map = NULL;
  1402 + kfree(vconfig);
  1403 + vdev->vconfig = NULL;
  1404 + return pcibios_err_to_errno(ret);
  1405 +}
  1406 +
  1407 +void vfio_config_free(struct vfio_pci_device *vdev)
  1408 +{
  1409 + kfree(vdev->vconfig);
  1410 + vdev->vconfig = NULL;
  1411 + kfree(vdev->pci_config_map);
  1412 + vdev->pci_config_map = NULL;
  1413 + kfree(vdev->msi_perm);
  1414 + vdev->msi_perm = NULL;
  1415 +}
  1416 +
  1417 +static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
  1418 + size_t count, loff_t *ppos, bool iswrite)
  1419 +{
  1420 + struct pci_dev *pdev = vdev->pdev;
  1421 + struct perm_bits *perm;
  1422 + __le32 val = 0;
  1423 + int cap_start = 0, offset;
  1424 + u8 cap_id;
  1425 + ssize_t ret = count;
  1426 +
  1427 + if (*ppos < 0 || *ppos + count > pdev->cfg_size)
  1428 + return -EFAULT;
  1429 +
  1430 + /*
  1431 + * gcc can't seem to figure out we're a static function, only called
  1432 + * with count of 1/2/4 and hits copy_from_user_overflow without this.
  1433 + */
  1434 + if (count > sizeof(val))
  1435 + return -EINVAL;
  1436 +
  1437 + cap_id = vdev->pci_config_map[*ppos / 4];
  1438 +
  1439 + if (cap_id == PCI_CAP_ID_INVALID) {
  1440 + if (iswrite)
  1441 + return ret; /* drop */
  1442 +
  1443 + /*
  1444 + * Per PCI spec 3.0, section 6.1, reads from reserved and
  1445 + * unimplemented registers return 0
  1446 + */
  1447 + if (copy_to_user(buf, &val, count))
  1448 + return -EFAULT;
  1449 +
  1450 + return ret;
  1451 + }
  1452 +
  1453 + /*
  1454 + * All capabilities are minimum 4 bytes and aligned on dword
  1455 + * boundaries. Since we don't support unaligned accesses, we're
  1456 + * only ever accessing a single capability.
  1457 + */
  1458 + if (*ppos >= PCI_CFG_SPACE_SIZE) {
  1459 + WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
  1460 +
  1461 + perm = &ecap_perms[cap_id];
  1462 + cap_start = vfio_find_cap_start(vdev, *ppos);
  1463 +
  1464 + } else {
  1465 + WARN_ON(cap_id > PCI_CAP_ID_MAX);
  1466 +
  1467 + perm = &cap_perms[cap_id];
  1468 +
  1469 + if (cap_id == PCI_CAP_ID_MSI)
  1470 + perm = vdev->msi_perm;
  1471 +
  1472 + if (cap_id > PCI_CAP_ID_BASIC)
  1473 + cap_start = vfio_find_cap_start(vdev, *ppos);
  1474 + }
  1475 +
  1476 + WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
  1477 + WARN_ON(cap_start > *ppos);
  1478 +
  1479 + offset = *ppos - cap_start;
  1480 +
  1481 + if (iswrite) {
  1482 + if (!perm->writefn)
  1483 + return ret;
  1484 +
  1485 + if (copy_from_user(&val, buf, count))
  1486 + return -EFAULT;
  1487 +
  1488 + ret = perm->writefn(vdev, *ppos, count, perm, offset, val);
  1489 + } else {
  1490 + if (perm->readfn) {
  1491 + ret = perm->readfn(vdev, *ppos, count,
  1492 + perm, offset, &val);
  1493 + if (ret < 0)
  1494 + return ret;
  1495 + }
  1496 +
  1497 + if (copy_to_user(buf, &val, count))
  1498 + return -EFAULT;
  1499 + }
  1500 +
  1501 + return ret;
  1502 +}
  1503 +
  1504 +ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
  1505 + char __user *buf, size_t count,
  1506 + loff_t *ppos, bool iswrite)
  1507 +{
  1508 + size_t done = 0;
  1509 + int ret = 0;
  1510 + loff_t pos = *ppos;
  1511 +
  1512 + pos &= VFIO_PCI_OFFSET_MASK;
  1513 +
  1514 + /*
  1515 + * We want to both keep the access size the caller users as well as
  1516 + * support reading large chunks of config space in a single call.
  1517 + * PCI doesn't support unaligned accesses, so we can safely break
  1518 + * those apart.
  1519 + */
  1520 + while (count) {
  1521 + if (count >= 4 && !(pos % 4))
  1522 + ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
  1523 + else if (count >= 2 && !(pos % 2))
  1524 + ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
  1525 + else
  1526 + ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
  1527 +
  1528 + if (ret < 0)
  1529 + return ret;
  1530 +
  1531 + count -= ret;
  1532 + done += ret;
  1533 + buf += ret;
  1534 + pos += ret;
  1535 + }
  1536 +
  1537 + *ppos += done;
  1538 +
  1539 + return done;
  1540 +}
drivers/vfio/pci/vfio_pci_intrs.c
  1 +/*
  2 + * VFIO PCI interrupt handling
  3 + *
  4 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  5 + * Author: Alex Williamson <alex.williamson@redhat.com>
  6 + *
  7 + * This program is free software; you can redistribute it and/or modify
  8 + * it under the terms of the GNU General Public License version 2 as
  9 + * published by the Free Software Foundation.
  10 + *
  11 + * Derived from original vfio:
  12 + * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  13 + * Author: Tom Lyon, pugs@cisco.com
  14 + */
  15 +
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/eventfd.h>
#include <linux/pci.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
  25 +
  26 +#include "vfio_pci_private.h"
  27 +
  28 +/*
  29 + * IRQfd - generic
  30 + */
  31 +struct virqfd {
  32 + struct vfio_pci_device *vdev;
  33 + struct eventfd_ctx *eventfd;
  34 + int (*handler)(struct vfio_pci_device *, void *);
  35 + void (*thread)(struct vfio_pci_device *, void *);
  36 + void *data;
  37 + struct work_struct inject;
  38 + wait_queue_t wait;
  39 + poll_table pt;
  40 + struct work_struct shutdown;
  41 + struct virqfd **pvirqfd;
  42 +};
  43 +
  44 +static struct workqueue_struct *vfio_irqfd_cleanup_wq;
  45 +
  46 +int __init vfio_pci_virqfd_init(void)
  47 +{
  48 + vfio_irqfd_cleanup_wq =
  49 + create_singlethread_workqueue("vfio-irqfd-cleanup");
  50 + if (!vfio_irqfd_cleanup_wq)
  51 + return -ENOMEM;
  52 +
  53 + return 0;
  54 +}
  55 +
  56 +void vfio_pci_virqfd_exit(void)
  57 +{
  58 + destroy_workqueue(vfio_irqfd_cleanup_wq);
  59 +}
  60 +
  61 +static void virqfd_deactivate(struct virqfd *virqfd)
  62 +{
  63 + queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
  64 +}
  65 +
  66 +static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
  67 +{
  68 + struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
  69 + unsigned long flags = (unsigned long)key;
  70 +
  71 + if (flags & POLLIN) {
  72 + /* An event has been signaled, call function */
  73 + if ((!virqfd->handler ||
  74 + virqfd->handler(virqfd->vdev, virqfd->data)) &&
  75 + virqfd->thread)
  76 + schedule_work(&virqfd->inject);
  77 + }
  78 +
  79 + if (flags & POLLHUP)
  80 + /* The eventfd is closing, detach from VFIO */
  81 + virqfd_deactivate(virqfd);
  82 +
  83 + return 0;
  84 +}
  85 +
  86 +static void virqfd_ptable_queue_proc(struct file *file,
  87 + wait_queue_head_t *wqh, poll_table *pt)
  88 +{
  89 + struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
  90 + add_wait_queue(wqh, &virqfd->wait);
  91 +}
  92 +
  93 +static void virqfd_shutdown(struct work_struct *work)
  94 +{
  95 + struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
  96 + struct virqfd **pvirqfd = virqfd->pvirqfd;
  97 + u64 cnt;
  98 +
  99 + eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
  100 + flush_work(&virqfd->inject);
  101 + eventfd_ctx_put(virqfd->eventfd);
  102 +
  103 + kfree(virqfd);
  104 + *pvirqfd = NULL;
  105 +}
  106 +
  107 +static void virqfd_inject(struct work_struct *work)
  108 +{
  109 + struct virqfd *virqfd = container_of(work, struct virqfd, inject);
  110 + if (virqfd->thread)
  111 + virqfd->thread(virqfd->vdev, virqfd->data);
  112 +}
  113 +
  114 +static int virqfd_enable(struct vfio_pci_device *vdev,
  115 + int (*handler)(struct vfio_pci_device *, void *),
  116 + void (*thread)(struct vfio_pci_device *, void *),
  117 + void *data, struct virqfd **pvirqfd, int fd)
  118 +{
  119 + struct file *file = NULL;
  120 + struct eventfd_ctx *ctx = NULL;
  121 + struct virqfd *virqfd;
  122 + int ret = 0;
  123 + unsigned int events;
  124 +
  125 + if (*pvirqfd)
  126 + return -EBUSY;
  127 +
  128 + virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
  129 + if (!virqfd)
  130 + return -ENOMEM;
  131 +
  132 + virqfd->pvirqfd = pvirqfd;
  133 + *pvirqfd = virqfd;
  134 + virqfd->vdev = vdev;
  135 + virqfd->handler = handler;
  136 + virqfd->thread = thread;
  137 + virqfd->data = data;
  138 +
  139 + INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
  140 + INIT_WORK(&virqfd->inject, virqfd_inject);
  141 +
  142 + file = eventfd_fget(fd);
  143 + if (IS_ERR(file)) {
  144 + ret = PTR_ERR(file);
  145 + goto fail;
  146 + }
  147 +
  148 + ctx = eventfd_ctx_fileget(file);
  149 + if (IS_ERR(ctx)) {
  150 + ret = PTR_ERR(ctx);
  151 + goto fail;
  152 + }
  153 +
  154 + virqfd->eventfd = ctx;
  155 +
  156 + /*
  157 + * Install our own custom wake-up handling so we are notified via
  158 + * a callback whenever someone signals the underlying eventfd.
  159 + */
  160 + init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
  161 + init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
  162 +
  163 + events = file->f_op->poll(file, &virqfd->pt);
  164 +
  165 + /*
  166 + * Check if there was an event already pending on the eventfd
  167 + * before we registered and trigger it as if we didn't miss it.
  168 + */
  169 + if (events & POLLIN) {
  170 + if ((!handler || handler(vdev, data)) && thread)
  171 + schedule_work(&virqfd->inject);
  172 + }
  173 +
  174 + /*
  175 + * Do not drop the file until the irqfd is fully initialized,
  176 + * otherwise we might race against the POLLHUP.
  177 + */
  178 + fput(file);
  179 +
  180 + return 0;
  181 +
  182 +fail:
  183 + if (ctx && !IS_ERR(ctx))
  184 + eventfd_ctx_put(ctx);
  185 +
  186 + if (file && !IS_ERR(file))
  187 + fput(file);
  188 +
  189 + kfree(virqfd);
  190 + *pvirqfd = NULL;
  191 +
  192 + return ret;
  193 +}
  194 +
  195 +static void virqfd_disable(struct virqfd *virqfd)
  196 +{
  197 + if (!virqfd)
  198 + return;
  199 +
  200 + virqfd_deactivate(virqfd);
  201 +
  202 + /* Block until we know all outstanding shutdown jobs have completed. */
  203 + flush_workqueue(vfio_irqfd_cleanup_wq);
  204 +}
  205 +
  206 +/*
  207 + * INTx
  208 + */
  209 +static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
  210 +{
  211 + if (likely(is_intx(vdev) && !vdev->virq_disabled))
  212 + eventfd_signal(vdev->ctx[0].trigger, 1);
  213 +}
  214 +
  215 +void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
  216 +{
  217 + struct pci_dev *pdev = vdev->pdev;
  218 + unsigned long flags;
  219 +
  220 + spin_lock_irqsave(&vdev->irqlock, flags);
  221 +
  222 + /*
  223 + * Masking can come from interrupt, ioctl, or config space
  224 + * via INTx disable. The latter means this can get called
  225 + * even when not using intx delivery. In this case, just
  226 + * try to have the physical bit follow the virtual bit.
  227 + */
  228 + if (unlikely(!is_intx(vdev))) {
  229 + if (vdev->pci_2_3)
  230 + pci_intx(pdev, 0);
  231 + } else if (!vdev->ctx[0].masked) {
  232 + /*
  233 + * Can't use check_and_mask here because we always want to
  234 + * mask, not just when something is pending.
  235 + */
  236 + if (vdev->pci_2_3)
  237 + pci_intx(pdev, 0);
  238 + else
  239 + disable_irq_nosync(pdev->irq);
  240 +
  241 + vdev->ctx[0].masked = true;
  242 + }
  243 +
  244 + spin_unlock_irqrestore(&vdev->irqlock, flags);
  245 +}
  246 +
  247 +/*
  248 + * If this is triggered by an eventfd, we can't call eventfd_signal
  249 + * or else we'll deadlock on the eventfd wait queue. Return >0 when
  250 + * a signal is necessary, which can then be handled via a work queue
  251 + * or directly depending on the caller.
  252 + */
  253 +int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
  254 +{
  255 + struct pci_dev *pdev = vdev->pdev;
  256 + unsigned long flags;
  257 + int ret = 0;
  258 +
  259 + spin_lock_irqsave(&vdev->irqlock, flags);
  260 +
  261 + /*
  262 + * Unmasking comes from ioctl or config, so again, have the
  263 + * physical bit follow the virtual even when not using INTx.
  264 + */
  265 + if (unlikely(!is_intx(vdev))) {
  266 + if (vdev->pci_2_3)
  267 + pci_intx(pdev, 1);
  268 + } else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
  269 + /*
  270 + * A pending interrupt here would immediately trigger,
  271 + * but we can avoid that overhead by just re-sending
  272 + * the interrupt to the user.
  273 + */
  274 + if (vdev->pci_2_3) {
  275 + if (!pci_check_and_unmask_intx(pdev))
  276 + ret = 1;
  277 + } else
  278 + enable_irq(pdev->irq);
  279 +
  280 + vdev->ctx[0].masked = (ret > 0);
  281 + }
  282 +
  283 + spin_unlock_irqrestore(&vdev->irqlock, flags);
  284 +
  285 + return ret;
  286 +}
  287 +
  288 +void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
  289 +{
  290 + if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
  291 + vfio_send_intx_eventfd(vdev, NULL);
  292 +}
  293 +
  294 +static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
  295 +{
  296 + struct vfio_pci_device *vdev = dev_id;
  297 + unsigned long flags;
  298 + int ret = IRQ_NONE;
  299 +
  300 + spin_lock_irqsave(&vdev->irqlock, flags);
  301 +
  302 + if (!vdev->pci_2_3) {
  303 + disable_irq_nosync(vdev->pdev->irq);
  304 + vdev->ctx[0].masked = true;
  305 + ret = IRQ_HANDLED;
  306 + } else if (!vdev->ctx[0].masked && /* may be shared */
  307 + pci_check_and_mask_intx(vdev->pdev)) {
  308 + vdev->ctx[0].masked = true;
  309 + ret = IRQ_HANDLED;
  310 + }
  311 +
  312 + spin_unlock_irqrestore(&vdev->irqlock, flags);
  313 +
  314 + if (ret == IRQ_HANDLED)
  315 + vfio_send_intx_eventfd(vdev, NULL);
  316 +
  317 + return ret;
  318 +}
  319 +
  320 +static int vfio_intx_enable(struct vfio_pci_device *vdev)
  321 +{
  322 + if (!is_irq_none(vdev))
  323 + return -EINVAL;
  324 +
  325 + if (!vdev->pdev->irq)
  326 + return -ENODEV;
  327 +
  328 + vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
  329 + if (!vdev->ctx)
  330 + return -ENOMEM;
  331 +
  332 + vdev->num_ctx = 1;
  333 + vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
  334 +
  335 + return 0;
  336 +}
  337 +
  338 +static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
  339 +{
  340 + struct pci_dev *pdev = vdev->pdev;
  341 + unsigned long irqflags = IRQF_SHARED;
  342 + struct eventfd_ctx *trigger;
  343 + unsigned long flags;
  344 + int ret;
  345 +
  346 + if (vdev->ctx[0].trigger) {
  347 + free_irq(pdev->irq, vdev);
  348 + kfree(vdev->ctx[0].name);
  349 + eventfd_ctx_put(vdev->ctx[0].trigger);
  350 + vdev->ctx[0].trigger = NULL;
  351 + }
  352 +
  353 + if (fd < 0) /* Disable only */
  354 + return 0;
  355 +
  356 + vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
  357 + pci_name(pdev));
  358 + if (!vdev->ctx[0].name)
  359 + return -ENOMEM;
  360 +
  361 + trigger = eventfd_ctx_fdget(fd);
  362 + if (IS_ERR(trigger)) {
  363 + kfree(vdev->ctx[0].name);
  364 + return PTR_ERR(trigger);
  365 + }
  366 +
  367 + if (!vdev->pci_2_3)
  368 + irqflags = 0;
  369 +
  370 + ret = request_irq(pdev->irq, vfio_intx_handler,
  371 + irqflags, vdev->ctx[0].name, vdev);
  372 + if (ret) {
  373 + kfree(vdev->ctx[0].name);
  374 + eventfd_ctx_put(trigger);
  375 + return ret;
  376 + }
  377 +
  378 + vdev->ctx[0].trigger = trigger;
  379 +
  380 + /*
  381 + * INTx disable will stick across the new irq setup,
  382 + * disable_irq won't.
  383 + */
  384 + spin_lock_irqsave(&vdev->irqlock, flags);
  385 + if (!vdev->pci_2_3 && (vdev->ctx[0].masked || vdev->virq_disabled))
  386 + disable_irq_nosync(pdev->irq);
  387 + spin_unlock_irqrestore(&vdev->irqlock, flags);
  388 +
  389 + return 0;
  390 +}
  391 +
  392 +static void vfio_intx_disable(struct vfio_pci_device *vdev)
  393 +{
  394 + vfio_intx_set_signal(vdev, -1);
  395 + virqfd_disable(vdev->ctx[0].unmask);
  396 + virqfd_disable(vdev->ctx[0].mask);
  397 + vdev->irq_type = VFIO_PCI_NUM_IRQS;
  398 + vdev->num_ctx = 0;
  399 + kfree(vdev->ctx);
  400 +}
  401 +
  402 +/*
  403 + * MSI/MSI-X
  404 + */
  405 +static irqreturn_t vfio_msihandler(int irq, void *arg)
  406 +{
  407 + struct eventfd_ctx *trigger = arg;
  408 +
  409 + eventfd_signal(trigger, 1);
  410 + return IRQ_HANDLED;
  411 +}
  412 +
  413 +static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
  414 +{
  415 + struct pci_dev *pdev = vdev->pdev;
  416 + int ret;
  417 +
  418 + if (!is_irq_none(vdev))
  419 + return -EINVAL;
  420 +
  421 + vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
  422 + if (!vdev->ctx)
  423 + return -ENOMEM;
  424 +
  425 + if (msix) {
  426 + int i;
  427 +
  428 + vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
  429 + GFP_KERNEL);
  430 + if (!vdev->msix) {
  431 + kfree(vdev->ctx);
  432 + return -ENOMEM;
  433 + }
  434 +
  435 + for (i = 0; i < nvec; i++)
  436 + vdev->msix[i].entry = i;
  437 +
  438 + ret = pci_enable_msix(pdev, vdev->msix, nvec);
  439 + if (ret) {
  440 + kfree(vdev->msix);
  441 + kfree(vdev->ctx);
  442 + return ret;
  443 + }
  444 + } else {
  445 + ret = pci_enable_msi_block(pdev, nvec);
  446 + if (ret) {
  447 + kfree(vdev->ctx);
  448 + return ret;
  449 + }
  450 + }
  451 +
  452 + vdev->num_ctx = nvec;
  453 + vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
  454 + VFIO_PCI_MSI_IRQ_INDEX;
  455 +
  456 + if (!msix) {
  457 + /*
  458 + * Compute the virtual hardware field for max msi vectors -
  459 + * it is the log base 2 of the number of vectors.
  460 + */
  461 + vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
  462 + }
  463 +
  464 + return 0;
  465 +}
  466 +
  467 +static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
  468 + int vector, int fd, bool msix)
  469 +{
  470 + struct pci_dev *pdev = vdev->pdev;
  471 + int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
  472 + char *name = msix ? "vfio-msix" : "vfio-msi";
  473 + struct eventfd_ctx *trigger;
  474 + int ret;
  475 +
  476 + if (vector >= vdev->num_ctx)
  477 + return -EINVAL;
  478 +
  479 + if (vdev->ctx[vector].trigger) {
  480 + free_irq(irq, vdev->ctx[vector].trigger);
  481 + kfree(vdev->ctx[vector].name);
  482 + eventfd_ctx_put(vdev->ctx[vector].trigger);
  483 + vdev->ctx[vector].trigger = NULL;
  484 + }
  485 +
  486 + if (fd < 0)
  487 + return 0;
  488 +
  489 + vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
  490 + name, vector, pci_name(pdev));
  491 + if (!vdev->ctx[vector].name)
  492 + return -ENOMEM;
  493 +
  494 + trigger = eventfd_ctx_fdget(fd);
  495 + if (IS_ERR(trigger)) {
  496 + kfree(vdev->ctx[vector].name);
  497 + return PTR_ERR(trigger);
  498 + }
  499 +
  500 + ret = request_irq(irq, vfio_msihandler, 0,
  501 + vdev->ctx[vector].name, trigger);
  502 + if (ret) {
  503 + kfree(vdev->ctx[vector].name);
  504 + eventfd_ctx_put(trigger);
  505 + return ret;
  506 + }
  507 +
  508 + vdev->ctx[vector].trigger = trigger;
  509 +
  510 + return 0;
  511 +}
  512 +
  513 +static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
  514 + unsigned count, int32_t *fds, bool msix)
  515 +{
  516 + int i, j, ret = 0;
  517 +
  518 + if (start + count > vdev->num_ctx)
  519 + return -EINVAL;
  520 +
  521 + for (i = 0, j = start; i < count && !ret; i++, j++) {
  522 + int fd = fds ? fds[i] : -1;
  523 + ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
  524 + }
  525 +
  526 + if (ret) {
  527 + for (--j; j >= start; j--)
  528 + vfio_msi_set_vector_signal(vdev, j, -1, msix);
  529 + }
  530 +
  531 + return ret;
  532 +}
  533 +
  534 +static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
  535 +{
  536 + struct pci_dev *pdev = vdev->pdev;
  537 + int i;
  538 +
  539 + vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
  540 +
  541 + for (i = 0; i < vdev->num_ctx; i++) {
  542 + virqfd_disable(vdev->ctx[i].unmask);
  543 + virqfd_disable(vdev->ctx[i].mask);
  544 + }
  545 +
  546 + if (msix) {
  547 + pci_disable_msix(vdev->pdev);
  548 + kfree(vdev->msix);
  549 + } else
  550 + pci_disable_msi(pdev);
  551 +
  552 + vdev->irq_type = VFIO_PCI_NUM_IRQS;
  553 + vdev->num_ctx = 0;
  554 + kfree(vdev->ctx);
  555 +}
  556 +
  557 +/*
  558 + * IOCTL support
  559 + */
  560 +static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
  561 + unsigned index, unsigned start,
  562 + unsigned count, uint32_t flags, void *data)
  563 +{
  564 + if (!is_intx(vdev) || start != 0 || count != 1)
  565 + return -EINVAL;
  566 +
  567 + if (flags & VFIO_IRQ_SET_DATA_NONE) {
  568 + vfio_pci_intx_unmask(vdev);
  569 + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
  570 + uint8_t unmask = *(uint8_t *)data;
  571 + if (unmask)
  572 + vfio_pci_intx_unmask(vdev);
  573 + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
  574 + int32_t fd = *(int32_t *)data;
  575 + if (fd >= 0)
  576 + return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
  577 + vfio_send_intx_eventfd, NULL,
  578 + &vdev->ctx[0].unmask, fd);
  579 +
  580 + virqfd_disable(vdev->ctx[0].unmask);
  581 + }
  582 +
  583 + return 0;
  584 +}
  585 +
  586 +static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
  587 + unsigned index, unsigned start,
  588 + unsigned count, uint32_t flags, void *data)
  589 +{
  590 + if (!is_intx(vdev) || start != 0 || count != 1)
  591 + return -EINVAL;
  592 +
  593 + if (flags & VFIO_IRQ_SET_DATA_NONE) {
  594 + vfio_pci_intx_mask(vdev);
  595 + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
  596 + uint8_t mask = *(uint8_t *)data;
  597 + if (mask)
  598 + vfio_pci_intx_mask(vdev);
  599 + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
  600 + return -ENOTTY; /* XXX implement me */
  601 + }
  602 +
  603 + return 0;
  604 +}
  605 +
  606 +static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
  607 + unsigned index, unsigned start,
  608 + unsigned count, uint32_t flags, void *data)
  609 +{
  610 + if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
  611 + vfio_intx_disable(vdev);
  612 + return 0;
  613 + }
  614 +
  615 + if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
  616 + return -EINVAL;
  617 +
  618 + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
  619 + int32_t fd = *(int32_t *)data;
  620 + int ret;
  621 +
  622 + if (is_intx(vdev))
  623 + return vfio_intx_set_signal(vdev, fd);
  624 +
  625 + ret = vfio_intx_enable(vdev);
  626 + if (ret)
  627 + return ret;
  628 +
  629 + ret = vfio_intx_set_signal(vdev, fd);
  630 + if (ret)
  631 + vfio_intx_disable(vdev);
  632 +
  633 + return ret;
  634 + }
  635 +
  636 + if (!is_intx(vdev))
  637 + return -EINVAL;
  638 +
  639 + if (flags & VFIO_IRQ_SET_DATA_NONE) {
  640 + vfio_send_intx_eventfd(vdev, NULL);
  641 + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
  642 + uint8_t trigger = *(uint8_t *)data;
  643 + if (trigger)
  644 + vfio_send_intx_eventfd(vdev, NULL);
  645 + }
  646 + return 0;
  647 +}
  648 +
  649 +static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
  650 + unsigned index, unsigned start,
  651 + unsigned count, uint32_t flags, void *data)
  652 +{
  653 + int i;
  654 + bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false;
  655 +
  656 + if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
  657 + vfio_msi_disable(vdev, msix);
  658 + return 0;
  659 + }
  660 +
  661 + if (!(irq_is(vdev, index) || is_irq_none(vdev)))
  662 + return -EINVAL;
  663 +
  664 + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
  665 + int32_t *fds = data;
  666 + int ret;
  667 +
  668 + if (vdev->irq_type == index)
  669 + return vfio_msi_set_block(vdev, start, count,
  670 + fds, msix);
  671 +
  672 + ret = vfio_msi_enable(vdev, start + count, msix);
  673 + if (ret)
  674 + return ret;
  675 +
  676 + ret = vfio_msi_set_block(vdev, start, count, fds, msix);
  677 + if (ret)
  678 + vfio_msi_disable(vdev, msix);
  679 +
  680 + return ret;
  681 + }
  682 +
  683 + if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
  684 + return -EINVAL;
  685 +
  686 + for (i = start; i < start + count; i++) {
  687 + if (!vdev->ctx[i].trigger)
  688 + continue;
  689 + if (flags & VFIO_IRQ_SET_DATA_NONE) {
  690 + eventfd_signal(vdev->ctx[i].trigger, 1);
  691 + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
  692 + uint8_t *bools = data;
  693 + if (bools[i - start])
  694 + eventfd_signal(vdev->ctx[i].trigger, 1);
  695 + }
  696 + }
  697 + return 0;
  698 +}
  699 +
  700 +int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
  701 + unsigned index, unsigned start, unsigned count,
  702 + void *data)
  703 +{
  704 + int (*func)(struct vfio_pci_device *vdev, unsigned index,
  705 + unsigned start, unsigned count, uint32_t flags,
  706 + void *data) = NULL;
  707 +
  708 + switch (index) {
  709 + case VFIO_PCI_INTX_IRQ_INDEX:
  710 + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
  711 + case VFIO_IRQ_SET_ACTION_MASK:
  712 + func = vfio_pci_set_intx_mask;
  713 + break;
  714 + case VFIO_IRQ_SET_ACTION_UNMASK:
  715 + func = vfio_pci_set_intx_unmask;
  716 + break;
  717 + case VFIO_IRQ_SET_ACTION_TRIGGER:
  718 + func = vfio_pci_set_intx_trigger;
  719 + break;
  720 + }
  721 + break;
  722 + case VFIO_PCI_MSI_IRQ_INDEX:
  723 + case VFIO_PCI_MSIX_IRQ_INDEX:
  724 + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
  725 + case VFIO_IRQ_SET_ACTION_MASK:
  726 + case VFIO_IRQ_SET_ACTION_UNMASK:
  727 + /* XXX Need masking support exported */
  728 + break;
  729 + case VFIO_IRQ_SET_ACTION_TRIGGER:
  730 + func = vfio_pci_set_msi_trigger;
  731 + break;
  732 + }
  733 + break;
  734 + }
  735 +
  736 + if (!func)
  737 + return -ENOTTY;
  738 +
  739 + return func(vdev, index, start, count, flags, data);
  740 +}
drivers/vfio/pci/vfio_pci_private.h
  1 +/*
  2 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  3 + * Author: Alex Williamson <alex.williamson@redhat.com>
  4 + *
  5 + * This program is free software; you can redistribute it and/or modify
  6 + * it under the terms of the GNU General Public License version 2 as
  7 + * published by the Free Software Foundation.
  8 + *
  9 + * Derived from original vfio:
  10 + * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  11 + * Author: Tom Lyon, pugs@cisco.com
  12 + */
  13 +
  14 +#include <linux/mutex.h>
  15 +#include <linux/pci.h>
  16 +
  17 +#ifndef VFIO_PCI_PRIVATE_H
  18 +#define VFIO_PCI_PRIVATE_H
  19 +
/*
 * A file offset on the device fd encodes a region index in the bits at
 * and above VFIO_PCI_OFFSET_SHIFT and the offset within that region in
 * the bits below it.
 */
#define VFIO_PCI_OFFSET_SHIFT   40

/* Macro arguments are parenthesized so expressions can be passed safely */
#define VFIO_PCI_OFFSET_TO_INDEX(off)	((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
  25 +
/* Per-interrupt (INTx, or one MSI/MSI-X vector) state */
struct vfio_pci_irq_ctx {
	struct eventfd_ctx	*trigger;	/* eventfd signaled on interrupt */
	struct virqfd		*unmask;	/* eventfd-driven unmask, if set up */
	struct virqfd		*mask;		/* eventfd-driven mask, if set up */
	char			*name;		/* name passed to request_irq() */
	bool			masked;		/* INTx is currently masked */
};
  33 +
  34 +struct vfio_pci_device {
  35 + struct pci_dev *pdev;
  36 + void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
  37 + u8 *pci_config_map;
  38 + u8 *vconfig;
  39 + struct perm_bits *msi_perm;
  40 + spinlock_t irqlock;
  41 + struct mutex igate;
  42 + struct msix_entry *msix;
  43 + struct vfio_pci_irq_ctx *ctx;
  44 + int num_ctx;
  45 + int irq_type;
  46 + u8 msi_qmax;
  47 + u8 msix_bar;
  48 + u16 msix_size;
  49 + u32 msix_offset;
  50 + u32 rbar[7];
  51 + bool pci_2_3;
  52 + bool virq_disabled;
  53 + bool reset_works;
  54 + bool extended_caps;
  55 + bool bardirty;
  56 + struct pci_saved_state *pci_saved_state;
  57 + atomic_t refcnt;
  58 +};
  59 +
/*
 * Interrupt mode predicates on a struct vfio_pci_device pointer.
 * irq_type holds the active VFIO_PCI_*_IRQ_INDEX; any other value means
 * no interrupt mode is configured.  Arguments are parenthesized so that
 * expressions can be passed safely.
 */
#define is_intx(vdev) ((vdev)->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
#define is_msi(vdev) ((vdev)->irq_type == VFIO_PCI_MSI_IRQ_INDEX)
#define is_msix(vdev) ((vdev)->irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
#define irq_is(vdev, type) ((vdev)->irq_type == (type))
  65 +
  66 +extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev);
  67 +extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev);
  68 +
  69 +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev,
  70 + uint32_t flags, unsigned index,
  71 + unsigned start, unsigned count, void *data);
  72 +
  73 +extern ssize_t vfio_pci_config_readwrite(struct vfio_pci_device *vdev,
  74 + char __user *buf, size_t count,
  75 + loff_t *ppos, bool iswrite);
  76 +extern ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev,
  77 + char __user *buf, size_t count,
  78 + loff_t *ppos, bool iswrite);
  79 +extern ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev,
  80 + char __user *buf, size_t count,
  81 + loff_t *ppos, bool iswrite);
  82 +
  83 +extern int vfio_pci_init_perm_bits(void);
  84 +extern void vfio_pci_uninit_perm_bits(void);
  85 +
  86 +extern int vfio_pci_virqfd_init(void);
  87 +extern void vfio_pci_virqfd_exit(void);
  88 +
  89 +extern int vfio_config_init(struct vfio_pci_device *vdev);
  90 +extern void vfio_config_free(struct vfio_pci_device *vdev);
  91 +#endif /* VFIO_PCI_PRIVATE_H */
drivers/vfio/pci/vfio_pci_rdwr.c
  1 +/*
  2 + * VFIO PCI I/O Port & MMIO access
  3 + *
  4 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  5 + * Author: Alex Williamson <alex.williamson@redhat.com>
  6 + *
  7 + * This program is free software; you can redistribute it and/or modify
  8 + * it under the terms of the GNU General Public License version 2 as
  9 + * published by the Free Software Foundation.
  10 + *
  11 + * Derived from original vfio:
  12 + * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  13 + * Author: Tom Lyon, pugs@cisco.com
  14 + */
  15 +
  16 +#include <linux/fs.h>
  17 +#include <linux/pci.h>
  18 +#include <linux/uaccess.h>
  19 +#include <linux/io.h>
  20 +
  21 +#include "vfio_pci_private.h"
  22 +
  23 +/* I/O Port BAR access */
  24 +ssize_t vfio_pci_io_readwrite(struct vfio_pci_device *vdev, char __user *buf,
  25 + size_t count, loff_t *ppos, bool iswrite)
  26 +{
  27 + struct pci_dev *pdev = vdev->pdev;
  28 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
  29 + int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
  30 + void __iomem *io;
  31 + size_t done = 0;
  32 +
  33 + if (!pci_resource_start(pdev, bar))
  34 + return -EINVAL;
  35 +
  36 + if (pos + count > pci_resource_len(pdev, bar))
  37 + return -EINVAL;
  38 +
  39 + if (!vdev->barmap[bar]) {
  40 + int ret;
  41 +
  42 + ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
  43 + if (ret)
  44 + return ret;
  45 +
  46 + vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
  47 +
  48 + if (!vdev->barmap[bar]) {
  49 + pci_release_selected_regions(pdev, 1 << bar);
  50 + return -EINVAL;
  51 + }
  52 + }
  53 +
  54 + io = vdev->barmap[bar];
  55 +
  56 + while (count) {
  57 + int filled;
  58 +
  59 + if (count >= 3 && !(pos % 4)) {
  60 + __le32 val;
  61 +
  62 + if (iswrite) {
  63 + if (copy_from_user(&val, buf, 4))
  64 + return -EFAULT;
  65 +
  66 + iowrite32(le32_to_cpu(val), io + pos);
  67 + } else {
  68 + val = cpu_to_le32(ioread32(io + pos));
  69 +
  70 + if (copy_to_user(buf, &val, 4))
  71 + return -EFAULT;
  72 + }
  73 +
  74 + filled = 4;
  75 +
  76 + } else if ((pos % 2) == 0 && count >= 2) {
  77 + __le16 val;
  78 +
  79 + if (iswrite) {
  80 + if (copy_from_user(&val, buf, 2))
  81 + return -EFAULT;
  82 +
  83 + iowrite16(le16_to_cpu(val), io + pos);
  84 + } else {
  85 + val = cpu_to_le16(ioread16(io + pos));
  86 +
  87 + if (copy_to_user(buf, &val, 2))
  88 + return -EFAULT;
  89 + }
  90 +
  91 + filled = 2;
  92 + } else {
  93 + u8 val;
  94 +
  95 + if (iswrite) {
  96 + if (copy_from_user(&val, buf, 1))
  97 + return -EFAULT;
  98 +
  99 + iowrite8(val, io + pos);
  100 + } else {
  101 + val = ioread8(io + pos);
  102 +
  103 + if (copy_to_user(buf, &val, 1))
  104 + return -EFAULT;
  105 + }
  106 +
  107 + filled = 1;
  108 + }
  109 +
  110 + count -= filled;
  111 + done += filled;
  112 + buf += filled;
  113 + pos += filled;
  114 + }
  115 +
  116 + *ppos += done;
  117 +
  118 + return done;
  119 +}
  120 +
  121 +/*
  122 + * MMIO BAR access
  123 + * We handle two excluded ranges here as well, if the user tries to read
  124 + * the ROM beyond what PCI tells us is available or the MSI-X table region,
  125 + * we return 0xFF and writes are dropped.
  126 + */
  127 +ssize_t vfio_pci_mem_readwrite(struct vfio_pci_device *vdev, char __user *buf,
  128 + size_t count, loff_t *ppos, bool iswrite)
  129 +{
  130 + struct pci_dev *pdev = vdev->pdev;
  131 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
  132 + int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
  133 + void __iomem *io;
  134 + resource_size_t end;
  135 + size_t done = 0;
  136 + size_t x_start = 0, x_end = 0; /* excluded range */
  137 +
  138 + if (!pci_resource_start(pdev, bar))
  139 + return -EINVAL;
  140 +
  141 + end = pci_resource_len(pdev, bar);
  142 +
  143 + if (pos > end)
  144 + return -EINVAL;
  145 +
  146 + if (pos == end)
  147 + return 0;
  148 +
  149 + if (pos + count > end)
  150 + count = end - pos;
  151 +
  152 + if (bar == PCI_ROM_RESOURCE) {
  153 + io = pci_map_rom(pdev, &x_start);
  154 + x_end = end;
  155 + } else {
  156 + if (!vdev->barmap[bar]) {
  157 + int ret;
  158 +
  159 + ret = pci_request_selected_regions(pdev, 1 << bar,
  160 + "vfio");
  161 + if (ret)
  162 + return ret;
  163 +
  164 + vdev->barmap[bar] = pci_iomap(pdev, bar, 0);
  165 +
  166 + if (!vdev->barmap[bar]) {
  167 + pci_release_selected_regions(pdev, 1 << bar);
  168 + return -EINVAL;
  169 + }
  170 + }
  171 +
  172 + io = vdev->barmap[bar];
  173 +
  174 + if (bar == vdev->msix_bar) {
  175 + x_start = vdev->msix_offset;
  176 + x_end = vdev->msix_offset + vdev->msix_size;
  177 + }
  178 + }
  179 +
  180 + if (!io)
  181 + return -EINVAL;
  182 +
  183 + while (count) {
  184 + size_t fillable, filled;
  185 +
  186 + if (pos < x_start)
  187 + fillable = x_start - pos;
  188 + else if (pos >= x_end)
  189 + fillable = end - pos;
  190 + else
  191 + fillable = 0;
  192 +
  193 + if (fillable >= 4 && !(pos % 4) && (count >= 4)) {
  194 + __le32 val;
  195 +
  196 + if (iswrite) {
  197 + if (copy_from_user(&val, buf, 4))
  198 + goto out;
  199 +
  200 + iowrite32(le32_to_cpu(val), io + pos);
  201 + } else {
  202 + val = cpu_to_le32(ioread32(io + pos));
  203 +
  204 + if (copy_to_user(buf, &val, 4))
  205 + goto out;
  206 + }
  207 +
  208 + filled = 4;
  209 + } else if (fillable >= 2 && !(pos % 2) && (count >= 2)) {
  210 + __le16 val;
  211 +
  212 + if (iswrite) {
  213 + if (copy_from_user(&val, buf, 2))
  214 + goto out;
  215 +
  216 + iowrite16(le16_to_cpu(val), io + pos);
  217 + } else {
  218 + val = cpu_to_le16(ioread16(io + pos));
  219 +
  220 + if (copy_to_user(buf, &val, 2))
  221 + goto out;
  222 + }
  223 +
  224 + filled = 2;
  225 + } else if (fillable) {
  226 + u8 val;
  227 +
  228 + if (iswrite) {
  229 + if (copy_from_user(&val, buf, 1))
  230 + goto out;
  231 +
  232 + iowrite8(val, io + pos);
  233 + } else {
  234 + val = ioread8(io + pos);
  235 +
  236 + if (copy_to_user(buf, &val, 1))
  237 + goto out;
  238 + }
  239 +
  240 + filled = 1;
  241 + } else {
  242 + /* Drop writes, fill reads with FF */
  243 + if (!iswrite) {
  244 + char val = 0xFF;
  245 + size_t i;
  246 +
  247 + for (i = 0; i < x_end - pos; i++) {
  248 + if (put_user(val, buf + i))
  249 + goto out;
  250 + }
  251 + }
  252 +
  253 + filled = x_end - pos;
  254 + }
  255 +
  256 + count -= filled;
  257 + done += filled;
  258 + buf += filled;
  259 + pos += filled;
  260 + }
  261 +
  262 + *ppos += done;
  263 +
  264 +out:
  265 + if (bar == PCI_ROM_RESOURCE)
  266 + pci_unmap_rom(pdev, io);
  267 +
  268 + return count ? -EFAULT : done;
  269 +}
include/linux/vfio.h
... ... @@ -223,6 +223,7 @@
223 223 __u32 argsz;
224 224 __u32 flags;
225 225 #define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */
  226 +#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
226 227 __u32 num_regions; /* Max region index + 1 */
227 228 __u32 num_irqs; /* Max IRQ index + 1 */
228 229 };
... ... @@ -363,6 +364,31 @@
363 364 * Reset a device.
364 365 */
365 366 #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
  367 +
  368 +/*
  369 + * The VFIO-PCI bus driver makes use of the following fixed region and
  370 + * IRQ index mapping. Unimplemented regions return a size of zero.
  371 + * Unimplemented IRQ types return a count of zero.
  372 + */
  373 +
  374 +enum {
  375 + VFIO_PCI_BAR0_REGION_INDEX,
  376 + VFIO_PCI_BAR1_REGION_INDEX,
  377 + VFIO_PCI_BAR2_REGION_INDEX,
  378 + VFIO_PCI_BAR3_REGION_INDEX,
  379 + VFIO_PCI_BAR4_REGION_INDEX,
  380 + VFIO_PCI_BAR5_REGION_INDEX,
  381 + VFIO_PCI_ROM_REGION_INDEX,
  382 + VFIO_PCI_CONFIG_REGION_INDEX,
  383 + VFIO_PCI_NUM_REGIONS
  384 +};
  385 +
  386 +enum {
  387 + VFIO_PCI_INTX_IRQ_INDEX,
  388 + VFIO_PCI_MSI_IRQ_INDEX,
  389 + VFIO_PCI_MSIX_IRQ_INDEX,
  390 + VFIO_PCI_NUM_IRQS
  391 +};
366 392  
367 393 /* -------- API for Type1 VFIO IOMMU -------- */
368 394