Commit 21e98932dcf15fe7eabd09a35f2020e0dd86b685

Authored by Linus Torvalds

Merge git://git.infradead.org/users/willy/linux-nvme

Pull NVMe driver fixes from Matthew Wilcox:
 "Now that actual hardware has been released (don't have any yet
  myself), people are starting to want some of these fixes merged."

Willy doesn't have hardware? Guys...

* git://git.infradead.org/users/willy/linux-nvme:
  NVMe: Cancel outstanding IOs on queue deletion
  NVMe: Free admin queue memory on initialisation failure
  NVMe: Use ida for nvme device instance
  NVMe: Fix whitespace damage in nvme_init
  NVMe: handle allocation failure in nvme_map_user_pages()
  NVMe: Fix uninitialized iod compiler warning
  NVMe: Do not set IO queue depth beyond device max
  NVMe: Set block queue max sectors
  NVMe: use namespace id for nvme_get_features
  NVMe: replace nvme_ns with nvme_dev for user admin
  NVMe: Fix nvme module init when nvme_major is set
  NVMe: Set request queue logical block size

2 files changed:

drivers/block/nvme.c
@@ -79,6 +79,7 @@
 	char serial[20];
 	char model[40];
 	char firmware_rev[8];
+	u32 max_hw_sectors;
 };
 
 /*
@@ -835,15 +836,15 @@
 }
 
 static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
-				unsigned dword11, dma_addr_t dma_addr)
+				unsigned nsid, dma_addr_t dma_addr)
 {
 	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.features.opcode = nvme_admin_get_features;
+	c.features.nsid = cpu_to_le32(nsid);
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
-	c.features.dword11 = cpu_to_le32(dword11);
 
 	return nvme_submit_admin_cmd(dev, &c, NULL);
 }
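
This change switches nvme_get_features() from passing a dword11 value to passing a namespace ID: the feature the driver queries on this path (LBA Range Type) is defined per namespace, so the identifier belongs in command dword 1 (the nsid field) rather than dword 11. A self-contained sketch of the resulting command layout, not part of the patch; the struct is local to the example and byte-swapping is omitted (the kernel uses cpu_to_le32/le64):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for the kernel's struct nvme_features. */
    struct features_cmd {
        uint8_t  opcode;   /* 0x0a = Get Features (admin opcode) */
        uint32_t nsid;     /* CDW1: namespace the feature applies to */
        uint64_t prp1;     /* data buffer, if the feature returns data */
        uint32_t fid;      /* CDW10: feature identifier */
    };

    int main(void)
    {
        struct features_cmd c;
        memset(&c, 0, sizeof(c));
        c.opcode = 0x0a;
        c.nsid = 1;        /* per-namespace feature, so nsid, not dword11 */
        c.fid = 0x03;      /* LBA Range Type */
        printf("opcode=%#x nsid=%u fid=%#x\n", c.opcode, c.nsid, c.fid);
        return 0;
    }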
@@ -862,11 +863,51 @@
 	return nvme_submit_admin_cmd(dev, &c, result);
 }
 
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @nvmeq: The queue to cancel I/Os on
+ * @timeout: True to only cancel I/Os which have timed out
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	unsigned long now = jiffies;
+	int cmdid;
+
+	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+		void *ctx;
+		nvme_completion_fn fn;
+		static struct nvme_completion cqe = {
+			.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
+		};
+
+		if (timeout && !time_after(now, info[cmdid].timeout))
+			continue;
+		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
+		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+		fn(nvmeq->dev, ctx, &cqe);
+	}
+}
+
+static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
+{
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	kfree(nvmeq);
+}
+
 static void nvme_free_queue(struct nvme_dev *dev, int qid)
 {
 	struct nvme_queue *nvmeq = dev->queues[qid];
 	int vector = dev->entry[nvmeq->cq_vector].vector;
 
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_cancel_ios(nvmeq, false);
+	spin_unlock_irq(&nvmeq->q_lock);
+
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
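nvme_cancel_ios() generalises the old timeout scan: with timeout=false it sweeps every live command ID and forces each one through its registered completion callback with a synthesized Command Abort Requested status, so waiters are released exactly as if the device had answered. A userspace sketch of that pattern, not part of the patch; all names are hypothetical, with a plain uint64_t standing in for the cmdid bitmap:

    #include <stdint.h>
    #include <stdio.h>

    #define QUEUE_DEPTH 64

    typedef void (*completion_fn)(int cmdid, uint16_t status);

    static uint64_t live_cmds;               /* bit n set => cmdid n in flight */
    static completion_fn handlers[QUEUE_DEPTH];

    static void cancel_all(uint16_t abort_status)
    {
        for (int cmdid = 0; cmdid < QUEUE_DEPTH; cmdid++) {
            if (!(live_cmds & (1ULL << cmdid)))
                continue;
            live_cmds &= ~(1ULL << cmdid);   /* cancel_cmdid() analogue */
            handlers[cmdid](cmdid, abort_status);
        }
    }

    static void done(int cmdid, uint16_t status)
    {
        printf("cmdid %d completed, status %#x\n", cmdid, status);
    }

    int main(void)
    {
        live_cmds = (1ULL << 3) | (1ULL << 17);
        handlers[3] = handlers[17] = done;
        cancel_all(0x07 << 1);               /* NVME_SC_ABORT_REQ analogue;
                                                bit 0 of the status halfword
                                                is the phase tag, hence << 1 */
        return 0;
    }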
@@ -876,18 +917,15 @@
 		adapter_delete_cq(dev, qid);
 	}
 
-	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
-				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-	kfree(nvmeq);
+	nvme_free_queue_mem(nvmeq);
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 						int depth, int vector)
 {
 	struct device *dmadev = &dev->pci_dev->dev;
-	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
+	unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
+					sizeof(struct nvme_cmd_info));
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
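The DIV_ROUND_UP change fixes a subtle under-allocation: the extra space carved out here holds one bit per queue entry, and plain integer division truncates, so any depth that is not a multiple of 8 lost up to 7 bits' worth of storage. A quick self-contained demonstration, not part of the patch:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        for (unsigned depth = 62; depth <= 65; depth++)
            printf("depth=%u  truncated=%u  rounded-up=%u bytes\n",
                   depth, depth / 8, DIV_ROUND_UP(depth, 8));
        /* depth=63: truncated gives 7 bytes, but 8 are needed for 63 bits */
        return 0;
    }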
@@ -975,7 +1013,7 @@
 
 static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 {
-	int result;
+	int result = 0;
 	u32 aqa;
 	u64 cap;
 	unsigned long timeout;
@@ -1005,17 +1043,22 @@
 	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
 	dev->db_stride = NVME_CAP_STRIDE(cap);
 
-	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+	while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
 		msleep(100);
 		if (fatal_signal_pending(current))
-			return -EINTR;
+			result = -EINTR;
 		if (time_after(jiffies, timeout)) {
 			dev_err(&dev->pci_dev->dev,
 				"Device not ready; aborting initialisation\n");
-			return -ENODEV;
+			result = -ENODEV;
 		}
 	}
 
+	if (result) {
+		nvme_free_queue_mem(nvmeq);
+		return result;
+	}
+
 	result = queue_request_irq(dev, nvmeq, "nvme admin");
 	dev->queues[0] = nvmeq;
 	return result;
@@ -1037,6 +1080,8 @@
 	offset = offset_in_page(addr);
 	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
 	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
 
 	err = get_user_pages_fast(addr, count, 1, pages);
 	if (err < count) {
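The kcalloc() check returns ERR_PTR(-ENOMEM) rather than NULL because callers of nvme_map_user_pages() already distinguish failures with IS_ERR()/PTR_ERR(). For readers unfamiliar with the idiom, a minimal userspace rendition of how an errno is folded into a pointer value, not part of the patch and with constants simplified:

    #include <stdio.h>

    /* Small negative errno values are folded into the very top of the
     * address space, a range no valid allocation can occupy. */
    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
        void *p = ERR_PTR(-12 /* ENOMEM */);
        if (IS_ERR(p))
            printf("allocation path failed: errno %ld\n", -PTR_ERR(p));
        return 0;
    }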
@@ -1146,14 +1191,13 @@
 	return status;
 }
 
-static int nvme_user_admin_cmd(struct nvme_ns *ns,
+static int nvme_user_admin_cmd(struct nvme_dev *dev,
 					struct nvme_admin_cmd __user *ucmd)
 {
-	struct nvme_dev *dev = ns->dev;
 	struct nvme_admin_cmd cmd;
 	struct nvme_command c;
 	int status, length;
-	struct nvme_iod *iod;
+	struct nvme_iod *uninitialized_var(iod);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1204,7 +1248,7 @@
 	case NVME_IOCTL_ID:
 		return ns->ns_id;
 	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_admin_cmd(ns, (void __user *)arg);
+		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
 	case NVME_IOCTL_SUBMIT_IO:
 		return nvme_submit_io(ns, (void __user *)arg);
 	default:
@@ -1218,26 +1262,6 @@
 	.compat_ioctl	= nvme_ioctl,
 };
 
-static void nvme_timeout_ios(struct nvme_queue *nvmeq)
-{
-	int depth = nvmeq->q_depth - 1;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	unsigned long now = jiffies;
-	int cmdid;
-
-	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
-		void *ctx;
-		nvme_completion_fn fn;
-		static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
-
-		if (!time_after(now, info[cmdid].timeout))
-			continue;
-		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
-		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
-		fn(nvmeq->dev, ctx, &cqe);
-	}
-}
-
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
 	while (bio_list_peek(&nvmeq->sq_cong)) {
@@ -1269,7 +1293,7 @@
 		spin_lock_irq(&nvmeq->q_lock);
 		if (nvme_process_cq(nvmeq))
 			printk("process_cq did something\n");
-		nvme_timeout_ios(nvmeq);
+		nvme_cancel_ios(nvmeq, true);
 		nvme_resubmit_bios(nvmeq);
 		spin_unlock_irq(&nvmeq->q_lock);
 	}
@@ -1339,6 +1363,9 @@
 	ns->disk = disk;
 	lbaf = id->flbas & 0xf;
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (dev->max_hw_sectors)
+		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
 
 	disk->major = nvme_major;
 	disk->minors = NVME_MINORS;
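blk_queue_logical_block_size() needs the block size in bytes, and Identify Namespace reports it indirectly: flbas[3:0] selects the active LBA format, and that format's ds field is the log2 of the LBA data size. A tiny worked example with made-up Identify values, not part of the patch:

    #include <stdio.h>

    int main(void)
    {
        unsigned char flbas = 0x01;          /* sample: format index 1 active */
        unsigned char ds[16] = { 9, 12 };    /* format 0: 2^9, format 1: 2^12 */
        unsigned lbaf = flbas & 0xf;
        unsigned lba_shift = ds[lbaf];
        printf("logical block size: %u bytes\n", 1u << lba_shift); /* 4096 */
        return 0;
    }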
@@ -1383,7 +1410,7 @@
 
 static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	int result, cpu, i, nr_io_queues, db_bar_size;
+	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
 
 	nr_io_queues = num_online_cpus();
 	result = set_queue_count(dev, nr_io_queues);
@@ -1429,9 +1456,10 @@
 		cpu = cpumask_next(cpu, cpu_online_mask);
 	}
 
+	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
+								NVME_Q_DEPTH);
 	for (i = 0; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
-							NVME_Q_DEPTH, i);
+		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
 		if (IS_ERR(dev->queues[i + 1]))
 			return PTR_ERR(dev->queues[i + 1]);
 		dev->queue_count++;
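CAP.MQES is a zero-based field (the value is the maximum queue entries minus one), hence the +1, and the result is clamped to the driver's own NVME_Q_DEPTH ceiling so the driver never asks for a deeper queue than the device supports. A standalone check of the arithmetic, not part of the patch, using a sample CAP value and assuming NVME_Q_DEPTH is 1024 as in this driver:

    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CAP_MQES(cap) ((cap) & 0xffff)
    #define NVME_Q_DEPTH 1024            /* the driver's own ceiling */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        uint64_t cap = 0x00000000000000ffULL; /* sample CAP: MQES = 255 */
        int q_depth = MIN((int)NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
        printf("queue depth: %d\n", q_depth);  /* 256, not 1024 */
        return 0;
    }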
@@ -1480,6 +1508,10 @@
 	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
 	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
 	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+	if (ctrl->mdts) {
+		int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+	}
 
 	id_ns = mem;
 	for (i = 1; i <= nn; i++) {
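MDTS in the Identify Controller data is a power of two in units of the controller's minimum memory page size, which is itself 2^(12 + CAP.MPSMIN) bytes; subtracting 9 converts bytes to 512-byte sectors. Worked through with sample values (mdts = 5, MPSMIN = 0: 2^5 pages of 4 KiB = 128 KiB = 256 sectors), in a standalone sketch that is not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)

    int main(void)
    {
        uint64_t cap = 0;        /* sample: MPSMIN = 0 => 4 KiB min page */
        uint8_t mdts = 5;        /* sample Identify value: 2^5 min pages */
        int shift = NVME_CAP_MPSMIN(cap) + 12;
        uint32_t max_hw_sectors = 1u << (mdts + shift - 9);
        printf("max transfer: %u sectors (%u KiB)\n",
               max_hw_sectors, max_hw_sectors / 2); /* 256 sectors, 128 KiB */
        return 0;
    }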
@@ -1523,8 +1555,6 @@
 	list_del(&dev->node);
 	spin_unlock(&dev_list_lock);
 
-	/* TODO: wait all I/O finished or cancel them */
-
 	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 		list_del(&ns->list);
 		del_gendisk(ns->disk);
@@ -1560,15 +1590,33 @@
 	dma_pool_destroy(dev->prp_small_pool);
 }
 
-/* XXX: Use an ida or something to let remove / add work correctly */
-static void nvme_set_instance(struct nvme_dev *dev)
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct nvme_dev *dev)
 {
-	static int instance;
-	dev->instance = instance++;
+	int instance, error;
+
+	do {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		return -ENODEV;
+
+	dev->instance = instance;
+	return 0;
 }
 
 static void nvme_release_instance(struct nvme_dev *dev)
 {
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, dev->instance);
+	spin_unlock(&dev_list_lock);
 }
 
 static int __devinit nvme_probe(struct pci_dev *pdev,
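The point of switching to an ida shows up in a toy model: the old static counter only ever counted up, so probe/remove/probe cycles left gaps and the numbering drifted forever, whereas an ida hands back the lowest free ID and release makes it reusable. A hypothetical userspace analogue, not part of the patch, with a 64-bit bitmap standing in for the ida:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t instance_map;           /* bit n set => instance n in use */

    static int get_instance(void)
    {
        for (int i = 0; i < 64; i++)
            if (!(instance_map & (1ULL << i))) {
                instance_map |= 1ULL << i;
                return i;
            }
        return -1;                          /* all instances taken */
    }

    static void put_instance(int i)
    {
        instance_map &= ~(1ULL << i);
    }

    int main(void)
    {
        int a = get_instance();   /* 0 -> /dev/nvme0 */
        int b = get_instance();   /* 1 -> /dev/nvme1 */
        put_instance(a);          /* remove nvme0 */
        printf("next instance: %d\n", get_instance()); /* 0 again, reused */
        (void)b;
        return 0;
    }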
@@ -1601,7 +1649,10 @@
 	pci_set_drvdata(pdev, dev);
 	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
 	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
-	nvme_set_instance(dev);
+	result = nvme_set_instance(dev);
+	if (result)
+		goto disable;
+
 	dev->entry[0].vector = pdev->irq;
 
 	result = nvme_setup_prp_pools(dev);
@@ -1704,15 +1755,17 @@
 
 static int __init nvme_init(void)
 {
-	int result = -EBUSY;
+	int result;
 
 	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
 	if (IS_ERR(nvme_thread))
 		return PTR_ERR(nvme_thread);
 
-	nvme_major = register_blkdev(nvme_major, "nvme");
-	if (nvme_major <= 0)
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
 		goto kill_kthread;
+	else if (result > 0)
+		nvme_major = result;
 
 	result = pci_register_driver(&nvme_driver);
 	if (result)
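register_blkdev() has asymmetric return semantics: called with major == 0 it allocates a major dynamically and returns it, while with a fixed major it returns 0 on success. The old code stored the return value straight into nvme_major, so loading the module with nvme_major set made a successful registration look like failure. A condensed model of the corrected decision, not part of the patch; fake_register() is a stand-in, not a real API:

    #include <stdio.h>

    /* With major==0 returns a dynamic major, otherwise 0 on success
     * (negative would mean error), mimicking register_blkdev(). */
    static int fake_register(int major) { return major ? 0 : 259; }

    static int nvme_major = 8;            /* as if set via module parameter */

    int main(void)
    {
        int result = fake_register(nvme_major);
        if (result < 0)
            return 1;                     /* registration failed */
        else if (result > 0)
            nvme_major = result;          /* dynamic major was assigned */
        printf("using major %d\n", nvme_major);  /* 8 stays 8 */
        return 0;
    }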
include/linux/nvme.h
@@ -35,8 +35,10 @@
 	__u64			acq;	/* Admin CQ Base Address */
 };
 
+#define NVME_CAP_MQES(cap)	((cap) & 0xffff)
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
+#define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 
 enum {
 	NVME_CC_ENABLE	= 1 << 0,
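
The two new macros slot in beside the existing CAP accessors; all four pull fixed-width fields out of the 64-bit controller Capabilities register. A standalone check against a made-up CAP value, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CAP_MQES(cap)    ((cap) & 0xffff)
    #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
    #define NVME_CAP_STRIDE(cap)  (((cap) >> 32) & 0xf)
    #define NVME_CAP_MPSMIN(cap)  (((cap) >> 48) & 0xf)

    int main(void)
    {
        uint64_t cap = 0x0000000114010fffULL;   /* sample register value */
        printf("MQES   %llu (max queue entries - 1)\n",
               (unsigned long long)NVME_CAP_MQES(cap));
        printf("TO     %llu (units of 500 ms)\n",
               (unsigned long long)NVME_CAP_TIMEOUT(cap));
        printf("DSTRD  %llu (doorbell stride 2^(2+DSTRD) bytes)\n",
               (unsigned long long)NVME_CAP_STRIDE(cap));
        printf("MPSMIN %llu (min page 2^(12+MPSMIN) bytes)\n",
               (unsigned long long)NVME_CAP_MPSMIN(cap));
        return 0;
    }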