Commit 00f7ec36c9324928e4cd23f02e6d8550f30c32ca

Authored by Steve Wise
Committed by Roland Dreier
1 parent f89271da32

RDMA/core: Add memory management extensions support

This patch adds support for the IB "base memory management extensions"
(BMME) and the equivalent iWARP operations (which the iWARP verbs
specification mandates that all devices implement).  The new operations
are:

 - Allocate an ib_mr for use in fast register work requests.

 - Allocate/free a physical buffer list for use in fast register work
   requests.  This allows device drivers to allocate this memory as
   needed for use in posting send requests (e.g. via dma_alloc_coherent).

 - New send queue work requests:
   * send with remote invalidate
   * fast register memory region
   * local invalidate memory region
   * RDMA read with invalidate of a local memory region (iWARP only)

Consumer interface details:

 - A new device capability flag IB_DEVICE_MEM_MGT_EXTENSIONS is added
   to indicate device support for these features (a capability-check
   sketch follows this list).

 - New send work request opcodes IB_WR_FAST_REG_MR, IB_WR_LOCAL_INV,
   IB_WR_RDMA_READ_WITH_INV are added.

 - A new consumer API function, ib_alloc_fast_reg_mr(), is added to
   allocate fast register memory regions.

 - New consumer API functions, ib_alloc_fast_reg_page_list() and
   ib_free_fast_reg_page_list() are added to allocate and free
   device-specific memory for fast registration page lists.

 - A new consumer API function, ib_update_fast_reg_key(), is added to
   allow the key portion of the R_Key and L_Key of a fast registration
   MR to be updated.  Consumers call this if desired before posting
   an IB_WR_FAST_REG_MR work request.
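
For illustration only (not part of this patch), a consumer might probe
for the extension before allocating any fast register resources; the
error handling and fallback policy here are made up for the example,
and "device" is assumed to be the consumer's struct ib_device pointer:

    struct ib_device_attr attr;
    int ret;

    ret = ib_query_device(device, &attr);
    if (ret)
        return ret;

    if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
        return -ENOSYS;    /* e.g. fall back to FMRs or plain MRs */

    /*
     * attr.max_fast_reg_page_list_len bounds the page list length
     * that may be requested from ib_alloc_fast_reg_mr() and
     * ib_alloc_fast_reg_page_list().
     */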

Consumers can use this as follows (a combined sketch follows the list):

 - MR is allocated with ib_alloc_fast_reg_mr().

 - Page list memory is allocated with ib_alloc_fast_reg_page_list().

 - MR R_Key/L_Key "key" field is updated with ib_update_fast_reg_key().

 - MR made VALID and bound to a specific page list via
   ib_post_send(IB_WR_FAST_REG_MR).

 - MR made INVALID via ib_post_send(IB_WR_LOCAL_INV),
   ib_post_send(IB_WR_RDMA_READ_WITH_INV) or an incoming send with
   invalidate operation.

 - MR is deallocated with ib_dereg_mr().

 - Page lists are deallocated via ib_free_fast_reg_page_list().
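
Putting these steps together, here is a minimal, untested sketch of a
kernel consumer (error checking, completion handling and DMA mapping
are omitted; "qp", "pd", "npages", "dma_addr" and "io_addr" are
assumed to exist, and the local invalidate is assumed to carry the
stale rkey in the send work request's existing ex.invalidate_rkey
field):

    struct ib_mr *mr;
    struct ib_fast_reg_page_list *pl;
    struct ib_send_wr fr_wr, inv_wr, *bad_wr;
    int i;

    mr = ib_alloc_fast_reg_mr(pd, npages);
    pl = ib_alloc_fast_reg_page_list(pd->device, npages);

    /* page_list entries hold DMA addresses obtained via the ib_dma_*() API */
    for (i = 0; i < npages; i++)
        pl->page_list[i] = dma_addr[i];

    ib_update_fast_reg_key(mr, 0x17);           /* pick an 8-bit key */

    memset(&fr_wr, 0, sizeof fr_wr);
    fr_wr.opcode = IB_WR_FAST_REG_MR;
    fr_wr.wr.fast_reg.iova_start = io_addr;
    fr_wr.wr.fast_reg.page_list = pl;
    fr_wr.wr.fast_reg.page_list_len = npages;
    fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
    fr_wr.wr.fast_reg.length = npages * PAGE_SIZE;
    fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
                                     IB_ACCESS_REMOTE_READ |
                                     IB_ACCESS_REMOTE_WRITE;
    fr_wr.wr.fast_reg.rkey = mr->rkey;
    ib_post_send(qp, &fr_wr, &bad_wr);          /* MR becomes VALID */

    /* ... advertise mr->rkey to the peer and do data transfers ... */

    memset(&inv_wr, 0, sizeof inv_wr);
    inv_wr.opcode = IB_WR_LOCAL_INV;
    inv_wr.ex.invalidate_rkey = mr->rkey;
    ib_post_send(qp, &inv_wr, &bad_wr);         /* MR becomes INVALID */

    /* once all work requests have completed */
    ib_free_fast_reg_page_list(pl);
    ib_dereg_mr(mr);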

Applications can allocate a fast register MR once and then repeatedly
bind the MR to different physical buffer lists (PBLs) by posting work
requests to a send queue (SQ).  For each outstanding
MR-to-PBL binding in the SQ pipe, a fast_reg_page_list needs to be
allocated (the fast_reg_page_list is owned by the low-level driver
from the consumer posting a work request until the request completes).
Thus pipelining can be achieved while still allowing device-specific
page_list processing.
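
As a rough illustration of that ownership rule (the names here,
including MAX_OUTSTANDING, are made up for the example), a consumer
that wants several registrations in flight at once would keep one page
list per outstanding fast register work request:

    struct ib_fast_reg_page_list *pls[MAX_OUTSTANDING];
    int i;

    for (i = 0; i < MAX_OUTSTANDING; i++)
        pls[i] = ib_alloc_fast_reg_page_list(pd->device, npages);

    /*
     * Each pls[i] can be filled in and handed to one IB_WR_FAST_REG_MR
     * work request.  It is owned by the low-level driver from the
     * ib_post_send() until that request completes, after which it may
     * be reused for another binding or freed with
     * ib_free_fast_reg_page_list().
     */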

The 32-bit fast register memory key/STag is composed of a 24-bit index
and an 8-bit key.  The application can change the key each time it
fast registers, thus allowing more control over the peer's use of the
key/STag (i.e. it can effectively be changed each time the rkey is
rebound to a page list).
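
For example (an illustrative fragment, reusing "fr_wr" and "mr" from
the sketch above), a consumer rebinding the same MR could bump only the
low 8 key bits before each IB_WR_FAST_REG_MR post, so a previously
advertised rkey stops working as soon as the region is rebound:

    /* rkey/STag layout: bits 31..8 = index (fixed), bits 7..0 = key */
    u8 key = mr->rkey & 0xff;

    ib_update_fast_reg_key(mr, ++key);      /* same index, new 8-bit key */
    fr_wr.wr.fast_reg.rkey = mr->rkey;      /* advertise the new rkey */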

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

Showing 12 changed files with 154 additions and 26 deletions

drivers/infiniband/core/uverbs_cmd.c
... ... @@ -917,7 +917,7 @@
917 917 resp->wc[i].opcode = wc[i].opcode;
918 918 resp->wc[i].vendor_err = wc[i].vendor_err;
919 919 resp->wc[i].byte_len = wc[i].byte_len;
920   - resp->wc[i].imm_data = (__u32 __force) wc[i].imm_data;
  920 + resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data;
921 921 resp->wc[i].qp_num = wc[i].qp->qp_num;
922 922 resp->wc[i].src_qp = wc[i].src_qp;
923 923 resp->wc[i].wc_flags = wc[i].wc_flags;
drivers/infiniband/core/verbs.c
... ... @@ -753,6 +753,52 @@
753 753 }
754 754 EXPORT_SYMBOL(ib_dereg_mr);
755 755  
  756 +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
  757 +{
  758 + struct ib_mr *mr;
  759 +
  760 + if (!pd->device->alloc_fast_reg_mr)
  761 + return ERR_PTR(-ENOSYS);
  762 +
  763 + mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
  764 +
  765 + if (!IS_ERR(mr)) {
  766 + mr->device = pd->device;
  767 + mr->pd = pd;
  768 + mr->uobject = NULL;
  769 + atomic_inc(&pd->usecnt);
  770 + atomic_set(&mr->usecnt, 0);
  771 + }
  772 +
  773 + return mr;
  774 +}
  775 +EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
  776 +
  777 +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
  778 + int max_page_list_len)
  779 +{
  780 + struct ib_fast_reg_page_list *page_list;
  781 +
  782 + if (!device->alloc_fast_reg_page_list)
  783 + return ERR_PTR(-ENOSYS);
  784 +
  785 + page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
  786 +
  787 + if (!IS_ERR(page_list)) {
  788 + page_list->device = device;
  789 + page_list->max_page_list_len = max_page_list_len;
  790 + }
  791 +
  792 + return page_list;
  793 +}
  794 +EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
  795 +
  796 +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
  797 +{
  798 + page_list->device->free_fast_reg_page_list(page_list);
  799 +}
  800 +EXPORT_SYMBOL(ib_free_fast_reg_page_list);
  801 +
756 802 /* Memory windows */
757 803  
758 804 struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
drivers/infiniband/hw/ehca/ehca_reqs.c
... ... @@ -681,7 +681,7 @@
681 681 wc->dlid_path_bits = cqe->dlid;
682 682 wc->src_qp = cqe->remote_qp_number;
683 683 wc->wc_flags = cqe->w_completion_flags;
684   - wc->imm_data = cpu_to_be32(cqe->immediate_data);
  684 + wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
685 685 wc->sl = cqe->service_level;
686 686  
687 687 poll_cq_one_exit0:
drivers/infiniband/hw/ipath/ipath_cq.c
... ... @@ -82,7 +82,7 @@
82 82 wc->uqueue[head].opcode = entry->opcode;
83 83 wc->uqueue[head].vendor_err = entry->vendor_err;
84 84 wc->uqueue[head].byte_len = entry->byte_len;
85   - wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
  85 + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
86 86 wc->uqueue[head].qp_num = entry->qp->qp_num;
87 87 wc->uqueue[head].src_qp = entry->src_qp;
88 88 wc->uqueue[head].wc_flags = entry->wc_flags;
drivers/infiniband/hw/ipath/ipath_rc.c
... ... @@ -1703,11 +1703,11 @@
1703 1703 case OP(SEND_LAST_WITH_IMMEDIATE):
1704 1704 send_last_imm:
1705 1705 if (header_in_data) {
1706   - wc.imm_data = *(__be32 *) data;
  1706 + wc.ex.imm_data = *(__be32 *) data;
1707 1707 data += sizeof(__be32);
1708 1708 } else {
1709 1709 /* Immediate data comes after BTH */
1710   - wc.imm_data = ohdr->u.imm_data;
  1710 + wc.ex.imm_data = ohdr->u.imm_data;
1711 1711 }
1712 1712 hdrsize += 4;
1713 1713 wc.wc_flags = IB_WC_WITH_IMM;
drivers/infiniband/hw/ipath/ipath_ruc.c
... ... @@ -331,7 +331,7 @@
331 331 switch (wqe->wr.opcode) {
332 332 case IB_WR_SEND_WITH_IMM:
333 333 wc.wc_flags = IB_WC_WITH_IMM;
334   - wc.imm_data = wqe->wr.ex.imm_data;
  334 + wc.ex.imm_data = wqe->wr.ex.imm_data;
335 335 /* FALLTHROUGH */
336 336 case IB_WR_SEND:
337 337 if (!ipath_get_rwqe(qp, 0))
... ... @@ -342,7 +342,7 @@
342 342 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
343 343 goto inv_err;
344 344 wc.wc_flags = IB_WC_WITH_IMM;
345   - wc.imm_data = wqe->wr.ex.imm_data;
  345 + wc.ex.imm_data = wqe->wr.ex.imm_data;
346 346 if (!ipath_get_rwqe(qp, 1))
347 347 goto rnr_nak;
348 348 /* FALLTHROUGH */
drivers/infiniband/hw/ipath/ipath_uc.c
... ... @@ -379,11 +379,11 @@
379 379 case OP(SEND_LAST_WITH_IMMEDIATE):
380 380 send_last_imm:
381 381 if (header_in_data) {
382   - wc.imm_data = *(__be32 *) data;
  382 + wc.ex.imm_data = *(__be32 *) data;
383 383 data += sizeof(__be32);
384 384 } else {
385 385 /* Immediate data comes after BTH */
386   - wc.imm_data = ohdr->u.imm_data;
  386 + wc.ex.imm_data = ohdr->u.imm_data;
387 387 }
388 388 hdrsize += 4;
389 389 wc.wc_flags = IB_WC_WITH_IMM;
390 390  
... ... @@ -483,11 +483,11 @@
483 483 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
484 484 rdma_last_imm:
485 485 if (header_in_data) {
486   - wc.imm_data = *(__be32 *) data;
  486 + wc.ex.imm_data = *(__be32 *) data;
487 487 data += sizeof(__be32);
488 488 } else {
489 489 /* Immediate data comes after BTH */
490   - wc.imm_data = ohdr->u.imm_data;
  490 + wc.ex.imm_data = ohdr->u.imm_data;
491 491 }
492 492 hdrsize += 4;
493 493 wc.wc_flags = IB_WC_WITH_IMM;
drivers/infiniband/hw/ipath/ipath_ud.c
... ... @@ -96,7 +96,7 @@
96 96  
97 97 if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
98 98 wc.wc_flags = IB_WC_WITH_IMM;
99   - wc.imm_data = swqe->wr.ex.imm_data;
  99 + wc.ex.imm_data = swqe->wr.ex.imm_data;
100 100 }
101 101  
102 102 /*
103 103  
104 104  
... ... @@ -492,14 +492,14 @@
492 492 if (qp->ibqp.qp_num > 1 &&
493 493 opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
494 494 if (header_in_data) {
495   - wc.imm_data = *(__be32 *) data;
  495 + wc.ex.imm_data = *(__be32 *) data;
496 496 data += sizeof(__be32);
497 497 } else
498   - wc.imm_data = ohdr->u.ud.imm_data;
  498 + wc.ex.imm_data = ohdr->u.ud.imm_data;
499 499 wc.wc_flags = IB_WC_WITH_IMM;
500 500 hdrsize += sizeof(u32);
501 501 } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
502   - wc.imm_data = 0;
  502 + wc.ex.imm_data = 0;
503 503 wc.wc_flags = 0;
504 504 } else {
505 505 dev->n_pkt_drops++;
drivers/infiniband/hw/mlx4/cq.c
... ... @@ -663,18 +663,18 @@
663 663  
664 664 switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
665 665 case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
666   - wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
667   - wc->wc_flags = IB_WC_WITH_IMM;
668   - wc->imm_data = cqe->immed_rss_invalid;
  666 + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
  667 + wc->wc_flags = IB_WC_WITH_IMM;
  668 + wc->ex.imm_data = cqe->immed_rss_invalid;
669 669 break;
670 670 case MLX4_RECV_OPCODE_SEND:
671 671 wc->opcode = IB_WC_RECV;
672 672 wc->wc_flags = 0;
673 673 break;
674 674 case MLX4_RECV_OPCODE_SEND_IMM:
675   - wc->opcode = IB_WC_RECV;
676   - wc->wc_flags = IB_WC_WITH_IMM;
677   - wc->imm_data = cqe->immed_rss_invalid;
  675 + wc->opcode = IB_WC_RECV;
  676 + wc->wc_flags = IB_WC_WITH_IMM;
  677 + wc->ex.imm_data = cqe->immed_rss_invalid;
678 678 break;
679 679 }
680 680  
drivers/infiniband/hw/mthca/mthca_cq.c
... ... @@ -620,13 +620,13 @@
620 620 case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
621 621 case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
622 622 entry->wc_flags = IB_WC_WITH_IMM;
623   - entry->imm_data = cqe->imm_etype_pkey_eec;
  623 + entry->ex.imm_data = cqe->imm_etype_pkey_eec;
624 624 entry->opcode = IB_WC_RECV;
625 625 break;
626 626 case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
627 627 case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
628 628 entry->wc_flags = IB_WC_WITH_IMM;
629   - entry->imm_data = cqe->imm_etype_pkey_eec;
  629 + entry->ex.imm_data = cqe->imm_etype_pkey_eec;
630 630 entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
631 631 break;
632 632 default:
include/rdma/ib_user_verbs.h
... ... @@ -289,7 +289,10 @@
289 289 __u32 opcode;
290 290 __u32 vendor_err;
291 291 __u32 byte_len;
292   - __u32 imm_data;
  292 + union {
  293 + __u32 imm_data;
  294 + __u32 invalidate_rkey;
  295 + } ex;
293 296 __u32 qp_num;
294 297 __u32 src_qp;
295 298 __u32 wc_flags;
include/rdma/ib_verbs.h
... ... @@ -103,6 +103,7 @@
103 103 */
104 104 IB_DEVICE_UD_IP_CSUM = (1<<18),
105 105 IB_DEVICE_UD_TSO = (1<<19),
  106 + IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21),
106 107 };
107 108  
108 109 enum ib_atomic_cap {
... ... @@ -148,6 +149,7 @@
148 149 int max_srq;
149 150 int max_srq_wr;
150 151 int max_srq_sge;
  152 + unsigned int max_fast_reg_page_list_len;
151 153 u16 max_pkeys;
152 154 u8 local_ca_ack_delay;
153 155 };
... ... @@ -411,6 +413,8 @@
411 413 IB_WC_FETCH_ADD,
412 414 IB_WC_BIND_MW,
413 415 IB_WC_LSO,
  416 + IB_WC_LOCAL_INV,
  417 + IB_WC_FAST_REG_MR,
414 418 /*
415 419 * Set value of IB_WC_RECV so consumers can test if a completion is a
416 420 * receive by testing (opcode & IB_WC_RECV).
... ... @@ -421,7 +425,8 @@
421 425  
422 426 enum ib_wc_flags {
423 427 IB_WC_GRH = 1,
424   - IB_WC_WITH_IMM = (1<<1)
  428 + IB_WC_WITH_IMM = (1<<1),
  429 + IB_WC_WITH_INVALIDATE = (1<<2),
425 430 };
426 431  
427 432 struct ib_wc {
... ... @@ -431,7 +436,10 @@
431 436 u32 vendor_err;
432 437 u32 byte_len;
433 438 struct ib_qp *qp;
434   - __be32 imm_data;
  439 + union {
  440 + __be32 imm_data;
  441 + u32 invalidate_rkey;
  442 + } ex;
435 443 u32 src_qp;
436 444 int wc_flags;
437 445 u16 pkey_index;
... ... @@ -625,6 +633,9 @@
625 633 IB_WR_ATOMIC_FETCH_AND_ADD,
626 634 IB_WR_LSO,
627 635 IB_WR_SEND_WITH_INV,
  636 + IB_WR_RDMA_READ_WITH_INV,
  637 + IB_WR_LOCAL_INV,
  638 + IB_WR_FAST_REG_MR,
628 639 };
629 640  
630 641 enum ib_send_flags {
... ... @@ -641,6 +652,12 @@
641 652 u32 lkey;
642 653 };
643 654  
  655 +struct ib_fast_reg_page_list {
  656 + struct ib_device *device;
  657 + u64 *page_list;
  658 + unsigned int max_page_list_len;
  659 +};
  660 +
644 661 struct ib_send_wr {
645 662 struct ib_send_wr *next;
646 663 u64 wr_id;
... ... @@ -673,6 +690,15 @@
673 690 u16 pkey_index; /* valid for GSI only */
674 691 u8 port_num; /* valid for DR SMPs on switch only */
675 692 } ud;
  693 + struct {
  694 + u64 iova_start;
  695 + struct ib_fast_reg_page_list *page_list;
  696 + unsigned int page_shift;
  697 + unsigned int page_list_len;
  698 + u32 length;
  699 + int access_flags;
  700 + u32 rkey;
  701 + } fast_reg;
676 702 } wr;
677 703 };
678 704  
... ... @@ -1011,6 +1037,11 @@
1011 1037 int (*query_mr)(struct ib_mr *mr,
1012 1038 struct ib_mr_attr *mr_attr);
1013 1039 int (*dereg_mr)(struct ib_mr *mr);
  1040 + struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
  1041 + int max_page_list_len);
  1042 + struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
  1043 + int page_list_len);
  1044 + void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
1014 1045 int (*rereg_phys_mr)(struct ib_mr *mr,
1015 1046 int mr_rereg_mask,
1016 1047 struct ib_pd *pd,
... ... @@ -1803,6 +1834,54 @@
1803 1834 * @mr: The memory region to deregister.
1804 1835 */
1805 1836 int ib_dereg_mr(struct ib_mr *mr);
  1837 +
  1838 +/**
  1839 + * ib_alloc_fast_reg_mr - Allocates memory region usable with the
  1840 + * IB_WR_FAST_REG_MR send work request.
  1841 + * @pd: The protection domain associated with the region.
  1842 + * @max_page_list_len: requested max physical buffer list length to be
  1843 + * used with fast register work requests for this MR.
  1844 + */
  1845 +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
  1846 +
  1847 +/**
  1848 + * ib_alloc_fast_reg_page_list - Allocates a page list array
  1849 + * @device - ib device pointer.
  1850 + * @page_list_len - size of the page list array to be allocated.
  1851 + *
  1852 + * This allocates and returns a struct ib_fast_reg_page_list * and a
  1853 + * page_list array that is at least page_list_len in size. The actual
  1854 + * size is returned in max_page_list_len. The caller is responsible
  1855 + * for initializing the contents of the page_list array before posting
  1856 + * a send work request with the IB_WC_FAST_REG_MR opcode.
  1857 + *
  1858 + * The page_list array entries must be translated using one of the
  1859 + * ib_dma_*() functions just like the addresses passed to
  1860 + * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct
  1861 + * ib_fast_reg_page_list must not be modified by the caller until the
  1862 + * IB_WC_FAST_REG_MR work request completes.
  1863 + */
  1864 +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
  1865 + struct ib_device *device, int page_list_len);
  1866 +
  1867 +/**
  1868 + * ib_free_fast_reg_page_list - Deallocates a previously allocated
  1869 + * page list array.
  1870 + * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
  1871 + */
  1872 +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
  1873 +
  1874 +/**
  1875 + * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
  1876 + * R_Key and L_Key.
  1877 + * @mr - struct ib_mr pointer to be updated.
  1878 + * @newkey - new key to be used.
  1879 + */
  1880 +static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
  1881 +{
  1882 + mr->lkey = (mr->lkey & 0xffffff00) | newkey;
  1883 + mr->rkey = (mr->rkey & 0xffffff00) | newkey;
  1884 +}
1806 1885  
1807 1886 /**
1808 1887 * ib_alloc_mw - Allocates a memory window.