Commit b60503ba432b16fc84442a84e29a7aad2c0c363d

Authored by Matthew Wilcox
1 parent 0b934ccd70

NVMe: New driver

This driver is for devices that follow the NVM Express standard.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>

Showing 5 changed files with 1399 additions and 0 deletions

Documentation/ioctl/ioctl-number.txt
... ... @@ -149,6 +149,7 @@
149 149 'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h
150 150 'M' 00-0F drivers/video/fsl-diu-fb.h conflict!
151 151 'N' 00-1F drivers/usb/scanner.h
  152 +'N' 40-7F drivers/block/nvme.c
152 153 'O' 00-06 mtd/ubi-user.h UBI
153 154 'P' all linux/soundcard.h conflict!
154 155 'P' 60-6F sound/sscape_ioctl.h conflict!
drivers/block/Kconfig
... ... @@ -315,6 +315,17 @@
315 315  
316 316 If unsure, say N.
317 317  
  318 +config BLK_DEV_NVME
  319 + tristate "NVM Express block device"
  320 + depends on PCI
  321 + ---help---
  322 + The NVM Express driver is for solid state drives directly
  323 + connected to the PCI or PCI Express bus. If you know you
  324 + don't have one of these, it is safe to answer N.
  325 +
  326 + To compile this driver as a module, choose M here: the
  327 + module will be called nvme.
  328 +
318 329 config BLK_DEV_OSD
319 330 tristate "OSD object-as-blkdev support"
320 331 depends on SCSI_OSD_ULD
drivers/block/Makefile
... ... @@ -23,6 +23,7 @@
23 23 obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
24 24 obj-$(CONFIG_MG_DISK) += mg_disk.o
25 25 obj-$(CONFIG_SUNVDC) += sunvdc.o
  26 +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
26 27 obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
27 28  
28 29 obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
drivers/block/nvme.c
  1 +/*
  2 + * NVM Express device driver
  3 + * Copyright (c) 2011, Intel Corporation.
  4 + *
  5 + * This program is free software; you can redistribute it and/or modify it
  6 + * under the terms and conditions of the GNU General Public License,
  7 + * version 2, as published by the Free Software Foundation.
  8 + *
  9 + * This program is distributed in the hope it will be useful, but WITHOUT
  10 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12 + * more details.
  13 + *
  14 + * You should have received a copy of the GNU General Public License along with
  15 + * this program; if not, write to the Free Software Foundation, Inc.,
  16 + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  17 + */
  18 +
  19 +#include <linux/nvme.h>
  20 +#include <linux/bio.h>
  21 +#include <linux/blkdev.h>
  22 +#include <linux/errno.h>
  23 +#include <linux/fs.h>
  24 +#include <linux/genhd.h>
  25 +#include <linux/init.h>
  26 +#include <linux/interrupt.h>
  27 +#include <linux/io.h>
  28 +#include <linux/kdev_t.h>
  29 +#include <linux/kernel.h>
  30 +#include <linux/mm.h>
  31 +#include <linux/module.h>
  32 +#include <linux/moduleparam.h>
  33 +#include <linux/pci.h>
  34 +#include <linux/sched.h>
  35 +#include <linux/slab.h>
  36 +#include <linux/types.h>
  37 +#include <linux/version.h>
  38 +
  39 +#define NVME_Q_DEPTH 1024
  40 +#define SQ_SIZE(depth) ((depth) * sizeof(struct nvme_command))
  41 +#define CQ_SIZE(depth) ((depth) * sizeof(struct nvme_completion))
  42 +#define NVME_MINORS 64
  43 +
  44 +static int nvme_major;
  45 +module_param(nvme_major, int, 0);
  46 +
  47 +/*
  48 + * Represents an NVM Express device. Each nvme_dev is a PCI function.
  49 + */
  50 +struct nvme_dev {
  51 + struct list_head node;
  52 + struct nvme_queue **queues;
  53 + u32 __iomem *dbs;
  54 + struct pci_dev *pci_dev;
  55 + int instance;
  56 + int queue_count;
  57 + u32 ctrl_config;
  58 + struct msix_entry *entry;
  59 + struct nvme_bar __iomem *bar;
  60 + struct list_head namespaces;
  61 +};
  62 +
  63 +/*
  64 + * An NVM Express namespace is equivalent to a SCSI LUN
  65 + */
  66 +struct nvme_ns {
  67 + struct list_head list;
  68 +
  69 + struct nvme_dev *dev;
  70 + struct request_queue *queue;
  71 + struct gendisk *disk;
  72 +
  73 + int ns_id;
  74 + int lba_shift;
  75 +};
  76 +
  77 +/*
  78 + * An NVM Express queue. Each device has at least two (one for admin
  79 + * commands and one for I/O commands).
  80 + */
  81 +struct nvme_queue {
  82 + struct device *q_dmadev;
  83 + spinlock_t q_lock;
  84 + struct nvme_command *sq_cmds;
  85 + volatile struct nvme_completion *cqes;
  86 + dma_addr_t sq_dma_addr;
  87 + dma_addr_t cq_dma_addr;
  88 + wait_queue_head_t sq_full;
  89 + struct bio_list sq_cong;
  90 + u32 __iomem *q_db;
  91 + u16 q_depth;
  92 + u16 cq_vector;
  93 + u16 sq_head;
  94 + u16 sq_tail;
  95 + u16 cq_head;
  96 + u16 cq_cycle;
  97 + unsigned long cmdid_data[];
  98 +};
  99 +
  100 +/*
  101 + * Check we didn't inadvertently grow the command struct
  102 + */
  103 +static inline void _nvme_check_size(void)
  104 +{
  105 + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
  106 + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
  107 + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
  108 + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
  109 + BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
  110 + BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
  111 + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
  112 + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
  113 + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
  114 +}
  115 +
  116 +/**
  117 + * alloc_cmdid - Allocate a Command ID
  118 + * @nvmeq: The queue that will be used for this command
  119 + * @ctx: A pointer that will be passed to the handler
  120 + * @handler: The ID of the handler to call
  121 + *
  122 + * Allocate a Command ID for a queue. The data passed in will
  123 + * be passed to the completion handler. This is implemented by using
  124 + * the bottom two bits of the ctx pointer to store the handler ID.
  125 + * Passing in a pointer that's not 4-byte aligned will cause a BUG.
  126 + * We can change this if it becomes a problem.
  127 + */
  128 +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
  129 +{
  130 + int depth = nvmeq->q_depth;
  131 + unsigned long data = (unsigned long)ctx | handler;
  132 + int cmdid;
  133 +
  134 + BUG_ON((unsigned long)ctx & 3);
  135 +
  136 + do {
  137 + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
  138 + if (cmdid >= depth)
  139 + return -EBUSY;
  140 + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
  141 +
  142 + nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
  143 + return cmdid;
  144 +}
  145 +
  146 +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
  147 + int handler)
  148 +{
  149 + int cmdid;
  150 + wait_event_killable(nvmeq->sq_full,
  151 + (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
  152 + return (cmdid < 0) ? -EINTR : cmdid;
  153 +}
  154 +
  155 +/* If you need more than four handlers, you'll need to change how
  156 + * alloc_cmdid and nvme_process_cq work
  157 + */
  158 +enum {
  159 + sync_completion_id = 0,
  160 + bio_completion_id,
  161 +};
  162 +
  163 +static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
  164 +{
  165 + unsigned long data;
  166 +
  167 + data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
  168 + clear_bit(cmdid, nvmeq->cmdid_data);
  169 + wake_up(&nvmeq->sq_full);
  170 + return data;
  171 +}
  172 +
  173 +static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
  174 +{
  175 + return ns->dev->queues[1];
  176 +}
  177 +
  178 +static void put_nvmeq(struct nvme_queue *nvmeq)
  179 +{
  180 +}
  181 +
  182 +/**
  183 + * nvme_submit_cmd - Copy a command into a queue and ring the doorbell
  184 + * @nvmeq: The queue to use
  185 + * @cmd: The command to send
  186 + *
  187 + * Safe to use from interrupt context
  188 + */
  189 +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
  190 +{
  191 + unsigned long flags;
  192 + u16 tail;
  193 + /* XXX: Need to check tail isn't going to overrun head */
  194 + spin_lock_irqsave(&nvmeq->q_lock, flags);
  195 + tail = nvmeq->sq_tail;
  196 + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
  197 + if (++tail == nvmeq->q_depth)
  198 + tail = 0;
  199 + writel(tail, nvmeq->q_db);
  200 + nvmeq->sq_tail = tail;
  201 + spin_unlock_irqrestore(&nvmeq->q_lock, flags);
  202 +
  203 + return 0;
  204 +}
  205 +
  206 +struct nvme_req_info {
  207 + struct bio *bio;
  208 + int nents;
  209 + struct scatterlist sg[0];
  210 +};
  211 +
  212 +/* XXX: use a mempool */
  213 +static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
  214 +{
  215 + return kmalloc(sizeof(struct nvme_req_info) +
  216 + sizeof(struct scatterlist) * nseg, gfp);
  217 +}
  218 +
  219 +static void free_info(struct nvme_req_info *info)
  220 +{
  221 + kfree(info);
  222 +}
  223 +
  224 +static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
  225 + struct nvme_completion *cqe)
  226 +{
  227 + struct nvme_req_info *info = ctx;
  228 + struct bio *bio = info->bio;
  229 + u16 status = le16_to_cpup(&cqe->status) >> 1;
  230 +
  231 + dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
  232 + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
  233 + free_info(info);
  234 + bio_endio(bio, status ? -EIO : 0);
  235 +}
  236 +
  237 +static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
  238 + struct bio *bio, enum dma_data_direction dma_dir, int psegs)
  239 +{
  240 + struct bio_vec *bvec;
  241 + struct scatterlist *sg = info->sg;
  242 + int i, nsegs = 0;
  243 +
  244 + sg_init_table(sg, psegs);
  245 + bio_for_each_segment(bvec, bio, i) {
  246 + sg_set_page(&sg[nsegs], bvec->bv_page, bvec->bv_len, bvec->bv_offset);
  247 + /* XXX: handle non-mergeable here */
  248 + nsegs++;
  249 + }
  250 + info->nents = nsegs;
  251 +
  252 + return dma_map_sg(dev, info->sg, info->nents, dma_dir);
  253 +}
  254 +
  255 +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
  256 + struct bio *bio)
  257 +{
  258 + struct nvme_rw_command *cmnd;
  259 + struct nvme_req_info *info;
  260 + enum dma_data_direction dma_dir;
  261 + int cmdid;
  262 + u16 control;
  263 + u32 dsmgmt;
  264 + unsigned long flags;
  265 + int psegs = bio_phys_segments(ns->queue, bio);
  266 +
  267 + info = alloc_info(psegs, GFP_NOIO);
  268 + if (!info)
  269 + goto congestion;
  270 + info->bio = bio;
  271 +
  272 + cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
  273 + if (unlikely(cmdid < 0))
  274 + goto free_info;
  275 +
  276 + control = 0;
  277 + if (bio->bi_rw & REQ_FUA)
  278 + control |= NVME_RW_FUA;
  279 + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
  280 + control |= NVME_RW_LR;
  281 +
  282 + dsmgmt = 0;
  283 + if (bio->bi_rw & REQ_RAHEAD)
  284 + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
  285 +
  286 + spin_lock_irqsave(&nvmeq->q_lock, flags);
  287 + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw;
  288 +
  289 + if (bio_data_dir(bio)) {
  290 + cmnd->opcode = nvme_cmd_write;
  291 + dma_dir = DMA_TO_DEVICE;
  292 + } else {
  293 + cmnd->opcode = nvme_cmd_read;
  294 + dma_dir = DMA_FROM_DEVICE;
  295 + }
  296 +
  297 + nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);
  298 +
  299 + cmnd->flags = 1;
  300 + cmnd->command_id = cmdid;
  301 + cmnd->nsid = cpu_to_le32(ns->ns_id);
  302 + cmnd->prp1 = cpu_to_le64(sg_phys(info->sg));
  303 + /* XXX: Support more than one PRP */
  304 + cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
  305 + cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
  306 + cmnd->control = cpu_to_le16(control);
  307 + cmnd->dsmgmt = cpu_to_le32(dsmgmt);
  308 +
  309 + if (++nvmeq->sq_tail == nvmeq->q_depth)
  310 + nvmeq->sq_tail = 0;
  311 + writel(nvmeq->sq_tail, nvmeq->q_db);
  312 +
  313 + spin_unlock_irqrestore(&nvmeq->q_lock, flags);
  314 +
  315 + return 0;
  316 +
  317 + free_info:
  318 + free_info(info);
  319 + congestion:
  320 + return -EBUSY;
  321 +}
  322 +
  323 +/*
  324 + * NB: return value of non-zero would mean that we were a stacking driver.
  325 + * make_request must always succeed.
  326 + */
  327 +static int nvme_make_request(struct request_queue *q, struct bio *bio)
  328 +{
  329 + struct nvme_ns *ns = q->queuedata;
  330 + struct nvme_queue *nvmeq = get_nvmeq(ns);
  331 +
  332 + if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
  333 + blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
  334 + bio_list_add(&nvmeq->sq_cong, bio);
  335 + }
  336 + put_nvmeq(nvmeq);
  337 +
  338 + return 0;
  339 +}
  340 +
  341 +struct sync_cmd_info {
  342 + struct task_struct *task;
  343 + u32 result;
  344 + int status;
  345 +};
  346 +
  347 +static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
  348 + struct nvme_completion *cqe)
  349 +{
  350 + struct sync_cmd_info *cmdinfo = ctx;
  351 + cmdinfo->result = le32_to_cpup(&cqe->result);
  352 + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
  353 + wake_up_process(cmdinfo->task);
  354 +}
  355 +
  356 +typedef void (*completion_fn)(struct nvme_queue *, void *,
  357 + struct nvme_completion *);
  358 +
  359 +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
  360 +{
  361 + u16 head, cycle;
  362 +
  363 + static const completion_fn completions[4] = {
  364 + [sync_completion_id] = sync_completion,
  365 + [bio_completion_id] = bio_completion,
  366 + };
  367 +
  368 + head = nvmeq->cq_head;
  369 + cycle = nvmeq->cq_cycle;
  370 +
  371 + for (;;) {
  372 + unsigned long data;
  373 + void *ptr;
  374 + unsigned char handler;
  375 + struct nvme_completion cqe = nvmeq->cqes[head];
  376 + if ((le16_to_cpu(cqe.status) & 1) != cycle)
  377 + break;
  378 + nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
  379 + if (++head == nvmeq->q_depth) {
  380 + head = 0;
  381 + cycle = !cycle;
  382 + }
  383 +
  384 + data = free_cmdid(nvmeq, cqe.command_id);
  385 + handler = data & 3;
  386 + ptr = (void *)(data & ~3UL);
  387 + completions[handler](nvmeq, ptr, &cqe);
  388 + }
  389 +
  390 + /* If the controller ignores the cq head doorbell and continuously
  391 + * writes to the queue, it is theoretically possible to wrap around
  392 + * the queue twice and mistakenly return IRQ_NONE. Linux only
  393 + * requires that 0.1% of your interrupts are handled, so this isn't
  394 + * a big problem.
  395 + */
  396 + if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle)
  397 + return IRQ_NONE;
  398 +
  399 + writel(head, nvmeq->q_db + 1);
  400 + nvmeq->cq_head = head;
  401 + nvmeq->cq_cycle = cycle;
  402 +
  403 + return IRQ_HANDLED;
  404 +}
  405 +
  406 +static irqreturn_t nvme_irq(int irq, void *data)
  407 +{
  408 + return nvme_process_cq(data);
  409 +}
  410 +
  411 +/*
  412 + * Returns 0 on success. If the result is negative, it's a Linux error code;
  413 + * if the result is positive, it's an NVM Express status code
  414 + */
  415 +static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd,
  416 + u32 *result)
  417 +{
  418 + int cmdid;
  419 + struct sync_cmd_info cmdinfo;
  420 +
  421 + cmdinfo.task = current;
  422 + cmdinfo.status = -EINTR;
  423 +
  424 + cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id);
  425 + if (cmdid < 0)
  426 + return cmdid;
  427 + cmd->common.command_id = cmdid;
  428 +
  429 + set_current_state(TASK_UNINTERRUPTIBLE);
  430 + nvme_submit_cmd(q, cmd);
  431 + schedule();
  432 +
  433 + if (result)
  434 + *result = cmdinfo.result;
  435 +
  436 + return cmdinfo.status;
  437 +}
  438 +
  439 +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
  440 + u32 *result)
  441 +{
  442 + return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
  443 +}
  444 +
  445 +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
  446 +{
  447 + int status;
  448 + struct nvme_command c;
  449 +
  450 + memset(&c, 0, sizeof(c));
  451 + c.delete_queue.opcode = opcode;
  452 + c.delete_queue.qid = cpu_to_le16(id);
  453 +
  454 + status = nvme_submit_admin_cmd(dev, &c, NULL);
  455 + if (status)
  456 + return -EIO;
  457 + return 0;
  458 +}
  459 +
  460 +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
  461 + struct nvme_queue *nvmeq)
  462 +{
  463 + int status;
  464 + struct nvme_command c;
  465 + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
  466 +
  467 + memset(&c, 0, sizeof(c));
  468 + c.create_cq.opcode = nvme_admin_create_cq;
  469 + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
  470 + c.create_cq.cqid = cpu_to_le16(qid);
  471 + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
  472 + c.create_cq.cq_flags = cpu_to_le16(flags);
  473 + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
  474 +
  475 + status = nvme_submit_admin_cmd(dev, &c, NULL);
  476 + if (status)
  477 + return -EIO;
  478 + return 0;
  479 +}
  480 +
  481 +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
  482 + struct nvme_queue *nvmeq)
  483 +{
  484 + int status;
  485 + struct nvme_command c;
  486 + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
  487 +
  488 + memset(&c, 0, sizeof(c));
  489 + c.create_sq.opcode = nvme_admin_create_sq;
  490 + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
  491 + c.create_sq.sqid = cpu_to_le16(qid);
  492 + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
  493 + c.create_sq.sq_flags = cpu_to_le16(flags);
  494 + c.create_sq.cqid = cpu_to_le16(qid);
  495 +
  496 + status = nvme_submit_admin_cmd(dev, &c, NULL);
  497 + if (status)
  498 + return -EIO;
  499 + return 0;
  500 +}
  501 +
  502 +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
  503 +{
  504 + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
  505 +}
  506 +
  507 +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
  508 +{
  509 + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
  510 +}
  511 +
  512 +static void nvme_free_queue(struct nvme_dev *dev, int qid)
  513 +{
  514 + struct nvme_queue *nvmeq = dev->queues[qid];
  515 +
  516 + free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);
  517 +
  518 + /* Don't tell the adapter to delete the admin queue */
  519 + if (qid) {
  520 + adapter_delete_sq(dev, qid);
  521 + adapter_delete_cq(dev, qid);
  522 + }
  523 +
  524 + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
  525 + (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
  526 + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
  527 + nvmeq->sq_cmds, nvmeq->sq_dma_addr);
  528 + kfree(nvmeq);
  529 +}
  530 +
  531 +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
  532 + int depth, int vector)
  533 +{
  534 + struct device *dmadev = &dev->pci_dev->dev;
  535 + unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
  536 + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
  537 + if (!nvmeq)
  538 + return NULL;
  539 +
  540 + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
  541 + &nvmeq->cq_dma_addr, GFP_KERNEL);
  542 + if (!nvmeq->cqes)
  543 + goto free_nvmeq;
  544 + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
  545 +
  546 + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
  547 + &nvmeq->sq_dma_addr, GFP_KERNEL);
  548 + if (!nvmeq->sq_cmds)
  549 + goto free_cqdma;
  550 +
  551 + nvmeq->q_dmadev = dmadev;
  552 + spin_lock_init(&nvmeq->q_lock);
  553 + nvmeq->cq_head = 0;
  554 + nvmeq->cq_cycle = 1;
  555 + init_waitqueue_head(&nvmeq->sq_full);
  556 + bio_list_init(&nvmeq->sq_cong);
  557 + nvmeq->q_db = &dev->dbs[qid * 2];
  558 + nvmeq->q_depth = depth;
  559 + nvmeq->cq_vector = vector;
  560 +
  561 + return nvmeq;
  562 +
  563 + free_cqdma:
  564 + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
  565 + nvmeq->cq_dma_addr);
  566 + free_nvmeq:
  567 + kfree(nvmeq);
  568 + return NULL;
  569 +}
  570 +
  571 +static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
  572 + int qid, int cq_size, int vector)
  573 +{
  574 + int result;
  575 + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
  576 +
  577 + result = adapter_alloc_cq(dev, qid, nvmeq);
  578 + if (result < 0)
  579 + goto free_nvmeq;
  580 +
  581 + result = adapter_alloc_sq(dev, qid, nvmeq);
  582 + if (result < 0)
  583 + goto release_cq;
  584 +
  585 + result = request_irq(dev->entry[vector].vector, nvme_irq,
  586 + IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq);
  587 + if (result < 0)
  588 + goto release_sq;
  589 +
  590 + return nvmeq;
  591 +
  592 + release_sq:
  593 + adapter_delete_sq(dev, qid);
  594 + release_cq:
  595 + adapter_delete_cq(dev, qid);
  596 + free_nvmeq:
  597 + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
  598 + (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
  599 + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
  600 + nvmeq->sq_cmds, nvmeq->sq_dma_addr);
  601 + kfree(nvmeq);
  602 + return NULL;
  603 +}
  604 +
  605 +static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
  606 +{
  607 + int result;
  608 + u32 aqa;
  609 + struct nvme_queue *nvmeq;
  610 +
  611 + dev->dbs = ((void __iomem *)dev->bar) + 4096;
  612 +
  613 + nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
  614 +
  615 + aqa = nvmeq->q_depth - 1;
  616 + aqa |= aqa << 16;
  617 +
  618 + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
  619 + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
  620 + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
  621 +
  622 + writel(aqa, &dev->bar->aqa);
  623 + writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
  624 + writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
  625 + writel(dev->ctrl_config, &dev->bar->cc);
  626 +
  627 + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
  628 + msleep(100);
  629 + if (fatal_signal_pending(current))
  630 + return -EINTR;
  631 + }
  632 +
  633 + result = request_irq(dev->entry[0].vector, nvme_irq,
  634 + IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq);
  635 + dev->queues[0] = nvmeq;
  636 + return result;
  637 +}
  638 +
  639 +static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns)
  640 +{
  641 + struct nvme_dev *dev = ns->dev;
  642 + int status;
  643 + struct nvme_command c;
  644 + void *page;
  645 + dma_addr_t dma_addr;
  646 +
  647 + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
  648 + GFP_KERNEL);
  649 +
  650 + memset(&c, 0, sizeof(c));
  651 + c.identify.opcode = nvme_admin_identify;
  652 + c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
  653 + c.identify.prp1 = cpu_to_le64(dma_addr);
  654 + c.identify.cns = cpu_to_le32(cns);
  655 +
  656 + status = nvme_submit_admin_cmd(dev, &c, NULL);
  657 +
  658 + if (status)
  659 + status = -EIO;
  660 + else if (copy_to_user(addr, page, 4096))
  661 + status = -EFAULT;
  662 +
  663 + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);
  664 +
  665 + return status;
  666 +}
  667 +
  668 +static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr)
  669 +{
  670 + struct nvme_dev *dev = ns->dev;
  671 + int status;
  672 + struct nvme_command c;
  673 + void *page;
  674 + dma_addr_t dma_addr;
  675 +
  676 + page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
  677 + GFP_KERNEL);
  678 +
  679 + memset(&c, 0, sizeof(c));
  680 + c.features.opcode = nvme_admin_get_features;
  681 + c.features.nsid = cpu_to_le32(ns->ns_id);
  682 + c.features.prp1 = cpu_to_le64(dma_addr);
  683 + c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);
  684 +
  685 + status = nvme_submit_admin_cmd(dev, &c, NULL);
  686 +
  687 + /* XXX: Assuming first range for now */
  688 + if (status)
  689 + status = -EIO;
  690 + else if (copy_to_user(addr, page, 64))
  691 + status = -EFAULT;
  692 +
  693 + dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);
  694 +
  695 + return status;
  696 +}
  697 +
  698 +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
  699 + unsigned long arg)
  700 +{
  701 + struct nvme_ns *ns = bdev->bd_disk->private_data;
  702 +
  703 + switch (cmd) {
  704 + case NVME_IOCTL_IDENTIFY_NS:
  705 + return nvme_identify(ns, (void __user *)arg, 0);
  706 + case NVME_IOCTL_IDENTIFY_CTRL:
  707 + return nvme_identify(ns, (void __user *)arg, 1);
  708 + case NVME_IOCTL_GET_RANGE_TYPE:
  709 + return nvme_get_range_type(ns, (void __user *)arg);
  710 + default:
  711 + return -ENOTTY;
  712 + }
  713 +}
  714 +
  715 +static const struct block_device_operations nvme_fops = {
  716 + .owner = THIS_MODULE,
  717 + .ioctl = nvme_ioctl,
  718 +};
  719 +
  720 +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
  721 + struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
  722 +{
  723 + struct nvme_ns *ns;
  724 + struct gendisk *disk;
  725 + int lbaf;
  726 +
  727 + if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
  728 + return NULL;
  729 +
  730 + ns = kzalloc(sizeof(*ns), GFP_KERNEL);
  731 + if (!ns)
  732 + return NULL;
  733 + ns->queue = blk_alloc_queue(GFP_KERNEL);
  734 + if (!ns->queue)
  735 + goto out_free_ns;
  736 + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
  737 + QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
  738 + blk_queue_make_request(ns->queue, nvme_make_request);
  739 + ns->dev = dev;
  740 + ns->queue->queuedata = ns;
  741 +
  742 + disk = alloc_disk(NVME_MINORS);
  743 + if (!disk)
  744 + goto out_free_queue;
  745 + ns->ns_id = index;
  746 + ns->disk = disk;
  747 + lbaf = id->flbas & 0xf;
  748 + ns->lba_shift = id->lbaf[lbaf].ds;
  749 +
  750 + disk->major = nvme_major;
  751 + disk->minors = NVME_MINORS;
  752 + disk->first_minor = NVME_MINORS * index;
  753 + disk->fops = &nvme_fops;
  754 + disk->private_data = ns;
  755 + disk->queue = ns->queue;
  756 + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
  757 + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
  758 +
  759 + return ns;
  760 +
  761 + out_free_queue:
  762 + blk_cleanup_queue(ns->queue);
  763 + out_free_ns:
  764 + kfree(ns);
  765 + return NULL;
  766 +}
  767 +
  768 +static void nvme_ns_free(struct nvme_ns *ns)
  769 +{
  770 + put_disk(ns->disk);
  771 + blk_cleanup_queue(ns->queue);
  772 + kfree(ns);
  773 +}
  774 +
  775 +static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count)
  776 +{
  777 + int status;
  778 + u32 result;
  779 + struct nvme_command c;
  780 + u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16);
  781 +
  782 + memset(&c, 0, sizeof(c));
  783 + c.features.opcode = nvme_admin_set_features;
  784 + c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
  785 + c.features.dword11 = cpu_to_le32(q_count);
  786 +
  787 + status = nvme_submit_admin_cmd(dev, &c, &result);
  788 + if (status)
  789 + return -EIO;
  790 + return min(result & 0xffff, result >> 16) + 1;
  791 +}
  792 +
  793 +/* XXX: Create per-CPU queues */
  794 +static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
  795 +{
  796 + int this_cpu;
  797 +
  798 + set_queue_count(dev, 1, 1);
  799 +
  800 + this_cpu = get_cpu();
  801 + dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu);
  802 + put_cpu();
  803 + if (!dev->queues[1])
  804 + return -ENOMEM;
  805 + dev->queue_count++;
  806 +
  807 + return 0;
  808 +}
  809 +
  810 +static void nvme_free_queues(struct nvme_dev *dev)
  811 +{
  812 + int i;
  813 +
  814 + for (i = dev->queue_count - 1; i >= 0; i--)
  815 + nvme_free_queue(dev, i);
  816 +}
  817 +
  818 +static int __devinit nvme_dev_add(struct nvme_dev *dev)
  819 +{
  820 + int res, nn, i;
  821 + struct nvme_ns *ns, *next;
  822 + void *id;
  823 + dma_addr_t dma_addr;
  824 + struct nvme_command cid, crt;
  825 +
  826 + res = nvme_setup_io_queues(dev);
  827 + if (res)
  828 + return res;
  829 +
  830 + /* XXX: Switch to a SG list once prp2 works */
  831 + id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
  832 + GFP_KERNEL);
  833 +
  834 + memset(&cid, 0, sizeof(cid));
  835 + cid.identify.opcode = nvme_admin_identify;
  836 + cid.identify.nsid = 0;
  837 + cid.identify.prp1 = cpu_to_le64(dma_addr);
  838 + cid.identify.cns = cpu_to_le32(1);
  839 +
  840 + res = nvme_submit_admin_cmd(dev, &cid, NULL);
  841 + if (res) {
  842 + res = -EIO;
  843 + goto out_free;
  844 + }
  845 +
  846 + nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn);
  847 +
  848 + cid.identify.cns = 0;
  849 + memset(&crt, 0, sizeof(crt));
  850 + crt.features.opcode = nvme_admin_get_features;
  851 + crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
  852 + crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);
  853 +
  854 + for (i = 0; i < nn; i++) {
  855 + cid.identify.nsid = cpu_to_le32(i);
  856 + res = nvme_submit_admin_cmd(dev, &cid, NULL);
  857 + if (res)
  858 + continue;
  859 +
  860 + if (((struct nvme_id_ns *)id)->ncap == 0)
  861 + continue;
  862 +
  863 + crt.features.nsid = cpu_to_le32(i);
  864 + res = nvme_submit_admin_cmd(dev, &crt, NULL);
  865 + if (res)
  866 + continue;
  867 +
  868 + ns = nvme_alloc_ns(dev, i, id, id + 4096);
  869 + if (ns)
  870 + list_add_tail(&ns->list, &dev->namespaces);
  871 + }
  872 + list_for_each_entry(ns, &dev->namespaces, list)
  873 + add_disk(ns->disk);
  874 +
  875 + dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
  876 + return 0;
  877 +
  878 + out_free:
  879 + list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
  880 + list_del(&ns->list);
  881 + nvme_ns_free(ns);
  882 + }
  883 +
  884 + dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
  885 + return res;
  886 +}
  887 +
  888 +static int nvme_dev_remove(struct nvme_dev *dev)
  889 +{
  890 + struct nvme_ns *ns, *next;
  891 +
  892 + /* TODO: wait for all outstanding I/O to finish, or cancel it */
  893 +
  894 + list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
  895 + list_del(&ns->list);
  896 + del_gendisk(ns->disk);
  897 + nvme_ns_free(ns);
  898 + }
  899 +
  900 + nvme_free_queues(dev);
  901 +
  902 + return 0;
  903 +}
  904 +
  905 +/* XXX: Use an ida or something to let remove / add work correctly */
  906 +static void nvme_set_instance(struct nvme_dev *dev)
  907 +{
  908 + static int instance;
  909 + dev->instance = instance++;
  910 +}
  911 +
  912 +static void nvme_release_instance(struct nvme_dev *dev)
  913 +{
  914 +}
  915 +
  916 +static int __devinit nvme_probe(struct pci_dev *pdev,
  917 + const struct pci_device_id *id)
  918 +{
  919 + int result = -ENOMEM;
  920 + struct nvme_dev *dev;
  921 +
  922 + dev = kzalloc(sizeof(*dev), GFP_KERNEL);
  923 + if (!dev)
  924 + return -ENOMEM;
  925 + dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
  926 + GFP_KERNEL);
  927 + if (!dev->entry)
  928 + goto free;
  929 + dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL);
  930 + if (!dev->queues)
  931 + goto free;
  932 +
  933 + INIT_LIST_HEAD(&dev->namespaces);
  934 + dev->pci_dev = pdev;
  935 + pci_set_drvdata(pdev, dev);
  936 + dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
  937 + nvme_set_instance(dev);
  938 +
  939 + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
  940 + if (!dev->bar) {
  941 + result = -ENOMEM;
  942 + goto disable;
  943 + }
  944 +
  945 + result = nvme_configure_admin_queue(dev);
  946 + if (result)
  947 + goto unmap;
  948 + dev->queue_count++;
  949 +
  950 + result = nvme_dev_add(dev);
  951 + if (result)
  952 + goto delete;
  953 + return 0;
  954 +
  955 + delete:
  956 + nvme_free_queues(dev);
  957 + unmap:
  958 + iounmap(dev->bar);
  959 + disable:
  960 + pci_disable_msix(pdev);
  961 + nvme_release_instance(dev);
  962 + free:
  963 + kfree(dev->queues);
  964 + kfree(dev->entry);
  965 + kfree(dev);
  966 + return result;
  967 +}
  968 +
  969 +static void __devexit nvme_remove(struct pci_dev *pdev)
  970 +{
  971 + struct nvme_dev *dev = pci_get_drvdata(pdev);
  972 + nvme_dev_remove(dev);
  973 + pci_disable_msix(pdev);
  974 + iounmap(dev->bar);
  975 + nvme_release_instance(dev);
  976 + kfree(dev->queues);
  977 + kfree(dev->entry);
  978 + kfree(dev);
  979 +}
  980 +
  981 +/* These functions are yet to be implemented */
  982 +#define nvme_error_detected NULL
  983 +#define nvme_dump_registers NULL
  984 +#define nvme_link_reset NULL
  985 +#define nvme_slot_reset NULL
  986 +#define nvme_error_resume NULL
  987 +#define nvme_suspend NULL
  988 +#define nvme_resume NULL
  989 +
  990 +static struct pci_error_handlers nvme_err_handler = {
  991 + .error_detected = nvme_error_detected,
  992 + .mmio_enabled = nvme_dump_registers,
  993 + .link_reset = nvme_link_reset,
  994 + .slot_reset = nvme_slot_reset,
  995 + .resume = nvme_error_resume,
  996 +};
  997 +
  998 +/* Move to pci_ids.h later */
  999 +#define PCI_CLASS_STORAGE_EXPRESS 0x010802
  1000 +
  1001 +static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
  1002 + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
  1003 + { 0, }
  1004 +};
  1005 +MODULE_DEVICE_TABLE(pci, nvme_id_table);
  1006 +
  1007 +static struct pci_driver nvme_driver = {
  1008 + .name = "nvme",
  1009 + .id_table = nvme_id_table,
  1010 + .probe = nvme_probe,
  1011 + .remove = __devexit_p(nvme_remove),
  1012 + .suspend = nvme_suspend,
  1013 + .resume = nvme_resume,
  1014 + .err_handler = &nvme_err_handler,
  1015 +};
  1016 +
  1017 +static int __init nvme_init(void)
  1018 +{
  1019 + int result;
  1020 +
  1021 + nvme_major = register_blkdev(nvme_major, "nvme");
  1022 + if (nvme_major <= 0)
  1023 + return -EBUSY;
  1024 +
  1025 + result = pci_register_driver(&nvme_driver);
  1026 + if (!result)
  1027 + return 0;
  1028 +
  1029 + unregister_blkdev(nvme_major, "nvme");
  1030 + return result;
  1031 +}
  1032 +
  1033 +static void __exit nvme_exit(void)
  1034 +{
  1035 + pci_unregister_driver(&nvme_driver);
  1036 + unregister_blkdev(nvme_major, "nvme");
  1037 +}
  1038 +
  1039 +MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
  1040 +MODULE_LICENSE("GPL");
  1041 +MODULE_VERSION("0.1");
  1042 +module_init(nvme_init);
  1043 +module_exit(nvme_exit);
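
For reference, the command-ID bookkeeping above stores a two-bit completion-handler ID in the low bits of the per-command ctx pointer, which is why alloc_cmdid() insists on 4-byte alignment. A minimal standalone sketch of that encoding (illustrative only, not code from this commit) using hypothetical pack_ctx()/unpack_ctx() helpers:

	#include <assert.h>
	#include <stdint.h>

	/* Illustrative only: mirrors the encoding used by alloc_cmdid() and
	 * free_cmdid().  The handler ID (0-3) lives in bits 0-1 of the ctx
	 * pointer, so the pointer must be at least 4-byte aligned. */
	static unsigned long pack_ctx(void *ctx, int handler)
	{
		assert(((uintptr_t)ctx & 3) == 0);	/* alignment requirement */
		return (unsigned long)ctx | handler;	/* handler in bits 0-1 */
	}

	static void unpack_ctx(unsigned long data, void **ctx, int *handler)
	{
		*handler = data & 3;			/* recover the handler ID */
		*ctx = (void *)(data & ~3UL);		/* recover the pointer */
	}

nvme_process_cq() performs the equivalent unpacking before dispatching to one of the (at most four) completion handlers.
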
include/linux/nvme.h
  1 +/*
  2 + * Definitions for the NVM Express interface
  3 + * Copyright (c) 2011, Intel Corporation.
  4 + *
  5 + * This program is free software; you can redistribute it and/or modify it
  6 + * under the terms and conditions of the GNU General Public License,
  7 + * version 2, as published by the Free Software Foundation.
  8 + *
  9 + * This program is distributed in the hope it will be useful, but WITHOUT
  10 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12 + * more details.
  13 + *
  14 + * You should have received a copy of the GNU General Public License along with
  15 + * this program; if not, write to the Free Software Foundation, Inc.,
  16 + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  17 + */
  18 +
  19 +#ifndef _LINUX_NVME_H
  20 +#define _LINUX_NVME_H
  21 +
  22 +#include <linux/types.h>
  23 +
  24 +struct nvme_bar {
  25 + __u64 cap; /* Controller Capabilities */
  26 + __u32 vs; /* Version */
  27 + __u32 ims; /* Interrupt Mask Set */
  28 + __u32 imc; /* Interrupt Mask Clear */
  29 + __u32 cc; /* Controller Configuration */
  30 + __u32 csts; /* Controller Status */
  31 + __u32 aqa; /* Admin Queue Attributes */
  32 + __u64 asq; /* Admin SQ Base Address */
  33 + __u64 acq; /* Admin CQ Base Address */
  34 +};
  35 +
  36 +enum {
  37 + NVME_CC_ENABLE = 1 << 0,
  38 + NVME_CC_CSS_NVM = 0 << 4,
  39 + NVME_CC_MPS_SHIFT = 7,
  40 + NVME_CC_ARB_RR = 0 << 11,
  41 + NVME_CC_ARB_WRRU = 1 << 11,
  42 + NVME_CC_ARB_VS = 3 << 11,
  43 + NVME_CC_SHN_NONE = 0 << 13,
  44 + NVME_CC_SHN_NORMAL = 1 << 13,
  45 + NVME_CC_SHN_ABRUPT = 2 << 13,
  46 + NVME_CSTS_RDY = 1 << 0,
  47 + NVME_CSTS_CFS = 1 << 1,
  48 + NVME_CSTS_SHST_NORMAL = 0 << 2,
  49 + NVME_CSTS_SHST_OCCUR = 1 << 2,
  50 + NVME_CSTS_SHST_CMPLT = 2 << 2,
  51 +};
  52 +
  53 +#define NVME_VS(major, minor) (major << 16 | minor)
  54 +
  55 +struct nvme_id_ctrl {
  56 + __le16 vid;
  57 + __le16 ssvid;
  58 + char sn[20];
  59 + char mn[40];
  60 + char fr[8];
  61 + __le32 nn;
  62 + __u8 rab;
  63 + __u8 rsvd77[178];
  64 + __le16 oacs;
  65 + __u8 acl;
  66 + __u8 aerl;
  67 + __u8 frmw;
  68 + __u8 lpa;
  69 + __u8 elpe;
  70 + __u8 npss;
  71 + __u8 rsvd264[248];
  72 + __le64 psd[32];
  73 + __le16 oncs;
  74 + __le16 fuses;
  75 + __u8 fna;
  76 + __u8 vwc;
  77 + __le16 awun;
  78 + __le16 awupf;
  79 + __u8 rsvd778[246];
  80 + __u8 cmdset[2048];
  81 + __u8 vs[1024];
  82 +};
  83 +
  84 +struct nvme_lbaf {
  85 + __le16 ms;
  86 + __u8 ds;
  87 + __u8 rp;
  88 +};
  89 +
  90 +struct nvme_id_ns {
  91 + __le64 nsze;
  92 + __le64 ncap;
  93 + __le64 nuse;
  94 + __u8 nsfeat;
  95 + __u8 nlbaf;
  96 + __u8 flbas;
  97 + __u8 mc;
  98 + __u8 dpc;
  99 + __u8 dps;
  100 + __u8 rsvd30[98];
  101 + struct nvme_lbaf lbaf[16];
  102 + __u8 rsvd192[192];
  103 + __u8 vs[3712];
  104 +};
  105 +
  106 +enum {
  107 + NVME_NS_FEAT_THIN = 1 << 0,
  108 + NVME_LBAF_RP_BEST = 0,
  109 + NVME_LBAF_RP_BETTER = 1,
  110 + NVME_LBAF_RP_GOOD = 2,
  111 + NVME_LBAF_RP_DEGRADED = 3,
  112 +};
  113 +
  114 +struct nvme_lba_range_type {
  115 + __u8 type;
  116 + __u8 attributes;
  117 + __u8 rsvd2[14];
  118 + __u64 slba;
  119 + __u64 nlb;
  120 + __u8 guid[16];
  121 + __u8 rsvd48[16];
  122 +};
  123 +
  124 +enum {
  125 + NVME_LBART_TYPE_FS = 0x01,
  126 + NVME_LBART_TYPE_RAID = 0x02,
  127 + NVME_LBART_TYPE_CACHE = 0x03,
  128 + NVME_LBART_TYPE_SWAP = 0x04,
  129 +
  130 + NVME_LBART_ATTRIB_TEMP = 1 << 0,
  131 + NVME_LBART_ATTRIB_HIDE = 1 << 1,
  132 +};
  133 +
  134 +/* I/O commands */
  135 +
  136 +enum nvme_opcode {
  137 + nvme_cmd_flush = 0x00,
  138 + nvme_cmd_write = 0x01,
  139 + nvme_cmd_read = 0x02,
  140 + nvme_cmd_write_uncor = 0x04,
  141 + nvme_cmd_compare = 0x05,
  142 + nvme_cmd_dsm = 0x09,
  143 +};
  144 +
  145 +struct nvme_rw_command {
  146 + __u8 opcode;
  147 + __u8 flags;
  148 + __u16 command_id;
  149 + __le32 nsid;
  150 + __u64 rsvd2;
  151 + __le64 metadata;
  152 + __le64 prp1;
  153 + __le64 prp2;
  154 + __le64 slba;
  155 + __le16 length;
  156 + __le16 control;
  157 + __le32 dsmgmt;
  158 + __le32 reftag;
  159 + __le16 apptag;
  160 + __le16 appmask;
  161 +};
  162 +
  163 +enum {
  164 + NVME_RW_LR = 1 << 15,
  165 + NVME_RW_FUA = 1 << 14,
  166 + NVME_RW_DSM_FREQ_UNSPEC = 0,
  167 + NVME_RW_DSM_FREQ_TYPICAL = 1,
  168 + NVME_RW_DSM_FREQ_RARE = 2,
  169 + NVME_RW_DSM_FREQ_READS = 3,
  170 + NVME_RW_DSM_FREQ_WRITES = 4,
  171 + NVME_RW_DSM_FREQ_RW = 5,
  172 + NVME_RW_DSM_FREQ_ONCE = 6,
  173 + NVME_RW_DSM_FREQ_PREFETCH = 7,
  174 + NVME_RW_DSM_FREQ_TEMP = 8,
  175 + NVME_RW_DSM_LATENCY_NONE = 0 << 4,
  176 + NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
  177 + NVME_RW_DSM_LATENCY_NORM = 2 << 4,
  178 + NVME_RW_DSM_LATENCY_LOW = 3 << 4,
  179 + NVME_RW_DSM_SEQ_REQ = 1 << 6,
  180 + NVME_RW_DSM_COMPRESSED = 1 << 7,
  181 +};
  182 +
  183 +/* Admin commands */
  184 +
  185 +enum nvme_admin_opcode {
  186 + nvme_admin_delete_sq = 0x00,
  187 + nvme_admin_create_sq = 0x01,
  188 + nvme_admin_get_features = 0x02,
  189 + nvme_admin_delete_cq = 0x04,
  190 + nvme_admin_create_cq = 0x05,
  191 + nvme_admin_identify = 0x06,
  192 + nvme_admin_abort_cmd = 0x08,
  193 + nvme_admin_set_features = 0x09,
  194 + nvme_admin_get_log_page = 0x0a,
  195 + nvme_admin_async_event = 0x0c,
  196 + nvme_admin_download_fw = 0x0d,
  197 + nvme_admin_security_recv = 0x0e,
  198 + nvme_admin_format_nvm = 0x10,
  199 + nvme_admin_security_send = 0x11,
  200 + nvme_admin_activate_fw = 0x14,
  201 +};
  202 +
  203 +enum {
  204 + NVME_QUEUE_PHYS_CONTIG = (1 << 0),
  205 + NVME_CQ_IRQ_ENABLED = (1 << 1),
  206 + NVME_SQ_PRIO_URGENT = (0 << 1),
  207 + NVME_SQ_PRIO_HIGH = (1 << 1),
  208 + NVME_SQ_PRIO_MEDIUM = (2 << 1),
  209 + NVME_SQ_PRIO_LOW = (3 << 1),
  210 + NVME_FEAT_ARBITRATION = 0x01,
  211 + NVME_FEAT_POWER_MGMT = 0x02,
  212 + NVME_FEAT_LBA_RANGE = 0x03,
  213 + NVME_FEAT_TEMP_THRESH = 0x04,
  214 + NVME_FEAT_ERR_RECOVERY = 0x05,
  215 + NVME_FEAT_VOLATILE_WC = 0x06,
  216 + NVME_FEAT_NUM_QUEUES = 0x07,
  217 + NVME_FEAT_IRQ_COALESCE = 0x08,
  218 + NVME_FEAT_IRQ_CONFIG = 0x09,
  219 + NVME_FEAT_WRITE_ATOMIC = 0x0a,
  220 + NVME_FEAT_ASYNC_EVENT = 0x0b,
  221 + NVME_FEAT_SW_PROGRESS = 0x0c,
  222 +};
  223 +
  224 +struct nvme_identify {
  225 + __u8 opcode;
  226 + __u8 flags;
  227 + __u16 command_id;
  228 + __le32 nsid;
  229 + __u64 rsvd2[2];
  230 + __le64 prp1;
  231 + __le64 prp2;
  232 + __le32 cns;
  233 + __u32 rsvd11[5];
  234 +};
  235 +
  236 +struct nvme_features {
  237 + __u8 opcode;
  238 + __u8 flags;
  239 + __u16 command_id;
  240 + __le32 nsid;
  241 + __u64 rsvd2[2];
  242 + __le64 prp1;
  243 + __le64 prp2;
  244 + __le32 fid;
  245 + __le32 dword11;
  246 + __u32 rsvd12[4];
  247 +};
  248 +
  249 +struct nvme_create_cq {
  250 + __u8 opcode;
  251 + __u8 flags;
  252 + __u16 command_id;
  253 + __le32 rsvd1[5];
  254 + __le64 prp1;
  255 + __u64 rsvd8;
  256 + __le16 cqid;
  257 + __le16 qsize;
  258 + __le16 cq_flags;
  259 + __le16 irq_vector;
  260 + __u32 rsvd12[4];
  261 +};
  262 +
  263 +struct nvme_create_sq {
  264 + __u8 opcode;
  265 + __u8 flags;
  266 + __u16 command_id;
  267 + __le32 rsvd1[5];
  268 + __le64 prp1;
  269 + __u64 rsvd8;
  270 + __le16 sqid;
  271 + __le16 qsize;
  272 + __le16 sq_flags;
  273 + __le16 cqid;
  274 + __le32 rsvd12[4];
  275 +};
  276 +
  277 +struct nvme_delete_queue {
  278 + __u8 opcode;
  279 + __u8 flags;
  280 + __u16 command_id;
  281 + __u32 rsvd1[9];
  282 + __le16 qid;
  283 + __le16 rsvd10;
  284 + __le32 rsvd11[5];
  285 +};
  286 +
  287 +struct nvme_common_command {
  288 + __u8 opcode;
  289 + __u8 flags;
  290 + __u16 command_id;
  291 + __le32 nsid;
  292 + __u32 rsvd2[14];
  293 +};
  294 +
  295 +struct nvme_command {
  296 + union {
  297 + struct nvme_common_command common;
  298 + struct nvme_rw_command rw;
  299 + struct nvme_identify identify;
  300 + struct nvme_features features;
  301 + struct nvme_create_cq create_cq;
  302 + struct nvme_create_sq create_sq;
  303 + struct nvme_delete_queue delete_queue;
  304 + };
  305 +};
  306 +
  307 +/* XXX: Sync with spec */
  308 +enum {
  309 + NVME_SC_SUCCESS = 0x0,
  310 + NVME_SC_INVALID_OPCODE = 0x1,
  311 + NVME_SC_INVALID_FIELD = 0x2,
  312 + NVME_SC_CMDID_CONFLICT = 0x3,
  313 + NVME_SC_DATA_XFER_ERROR = 0x4,
  314 + NVME_SC_POWER_LOSS = 0x5,
  315 + NVME_SC_INTERNAL = 0x6,
  316 + NVME_SC_ABORT_REQ = 0x7,
  317 + NVME_SC_ABORT_QUEUE = 0x8,
  318 + NVME_SC_FUSED_FAIL = 0x9,
  319 + NVME_SC_FUSED_MISSING = 0xa,
  320 + NVME_SC_LBA_RANGE = 0x80,
  321 + NVME_SC_CAP_EXCEEDED = 0x81,
  322 + NVME_SC_NS_NOT_READY = 0x82,
  323 + NVME_SC_CQ_INVALID = 0x100,
  324 + NVME_SC_QID_INVALID = 0x101,
  325 + NVME_SC_QUEUE_SIZE = 0x102,
  326 + NVME_SC_WRITE_FAULT = 0x280,
  327 + NVME_SC_READ_ERROR = 0x281,
  328 +};
  329 +
  330 +struct nvme_completion {
  331 + __le32 result; /* Used by admin commands to return data */
  332 + __le32 rsvd;
  333 + __le16 sq_head; /* how much of this queue may be reclaimed */
  334 + __le16 sq_id; /* submission queue that generated this entry */
  335 + __u16 command_id; /* of the command which completed */
  336 + __le16 status; /* did the command fail, and if so, why? */
  337 +};
  338 +
  339 +#define NVME_IOCTL_IDENTIFY_NS _IOW('N', 0x40, struct nvme_id_ns)
  340 +#define NVME_IOCTL_IDENTIFY_CTRL _IOW('N', 0x41, struct nvme_id_ctrl)
  341 +#define NVME_IOCTL_GET_RANGE_TYPE _IOW('N', 0x42, struct nvme_lba_range_type)
  342 +
  343 +#endif /* _LINUX_NVME_H */
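
For reference, a user-space sketch of how the 'N' 0x40-0x7F ioctls registered above might be exercised; this is illustrative only and not part of the commit. The device path is an assumption (disk names follow nvme<instance>n<index>, so the first namespace of the first controller would appear as /dev/nvme0n0), and error handling is minimal. The sn/mn/fr fields of struct nvme_id_ctrl are not NUL-terminated, so they are printed with explicit field widths:

	/* Hypothetical test program built against the new <linux/nvme.h>. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/nvme.h>

	int main(void)
	{
		struct nvme_id_ctrl ctrl;
		int fd = open("/dev/nvme0n0", O_RDONLY);	/* assumed device node */

		if (fd < 0 || ioctl(fd, NVME_IOCTL_IDENTIFY_CTRL, &ctrl) < 0) {
			perror("NVME_IOCTL_IDENTIFY_CTRL");
			return 1;
		}
		printf("model %.40s serial %.20s firmware %.8s\n",
				ctrl.mn, ctrl.sn, ctrl.fr);
		close(fd);
		return 0;
	}

Note that the driver copies the identify data out to user space with copy_to_user(), so the data direction is device-to-user even though the ioctl macros are encoded with _IOW().
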