Blame view

net/rds/ib.c 12.2 KB
ec16227e1   Andy Grover   RDS/IB: Infiniban...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
  /*
   * Copyright (c) 2006 Oracle.  All rights reserved.
   *
   * This software is available to you under a choice of one of two
   * licenses.  You may choose to be licensed under the terms of the GNU
   * General Public License (GPL) Version 2, available from the file
   * COPYING in the main directory of this source tree, or the
   * OpenIB.org BSD license below:
   *
   *     Redistribution and use in source and binary forms, with or
   *     without modification, are permitted provided that the following
   *     conditions are met:
   *
   *      - Redistributions of source code must retain the above
   *        copyright notice, this list of conditions and the following
   *        disclaimer.
   *
   *      - Redistributions in binary form must reproduce the above
   *        copyright notice, this list of conditions and the following
   *        disclaimer in the documentation and/or other materials
   *        provided with the distribution.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   * SOFTWARE.
   *
   */
  #include <linux/kernel.h>
  #include <linux/in.h>
  #include <linux/if.h>
  #include <linux/netdevice.h>
  #include <linux/inetdevice.h>
  #include <linux/if_arp.h>
  #include <linux/delay.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
40
  #include <linux/slab.h>
3a9a231d9   Paul Gortmaker   net: Fix files ex...
41
  #include <linux/module.h>
ec16227e1   Andy Grover   RDS/IB: Infiniban...
42
43
44
  
  #include "rds.h"
  #include "ib.h"
ff51bf841   stephen hemminger   rds: make local f...
45
  static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
ec16227e1   Andy Grover   RDS/IB: Infiniban...
46
  unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
3ba23ade4   Andy Grover   RDS: Set retry_co...
47
  unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
ec16227e1   Andy Grover   RDS/IB: Infiniban...
48
49
50
51
52
  
  module_param(fmr_pool_size, int, 0444);
  MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
  module_param(fmr_message_size, int, 0444);
  MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
3ba23ade4   Andy Grover   RDS: Set retry_co...
53
54
  module_param(rds_ib_retry_count, int, 0444);
  MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
ec16227e1   Andy Grover   RDS/IB: Infiniban...
55

ea819867b   Zach Brown   RDS/IB: protect t...
56
57
58
59
60
61
  /*
   * We have a clumsy combination of RCU and a rwsem protecting the
   * rds_ib_devices list because it is used both in the get_mr fast path and
   * while blocking in the FMR flushing path.
   *
   * In this file the list is only modified with the rwsem held for write,
   * using the RCU list primitives (list_add_tail_rcu() / list_del_rcu()).
   */
  DECLARE_RWSEM(rds_ib_devices_lock);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
62
  /*
   * List of rds_ib_device entries, linked through rds_ib_device.list.
   * Initialised at runtime by rds_ib_init(); entries are added in
   * rds_ib_add_one() and removed in rds_ib_remove_one().
   */
  struct list_head rds_ib_devices;
745cbccac   Andy Grover   RDS: Rewrite conn...
63
  /* NOTE: if also grabbing ibdev lock, grab this first */
ec16227e1   Andy Grover   RDS/IB: Infiniban...
64
65
  /*
   * ib_nodev_conns is walked under ib_nodev_conns_lock by
   * rds_ib_nodev_connect(), linking connections through their ib_node member.
   * NOTE(review): presumably these are connections not currently bound to a
   * device -- confirm against the code that adds entries to this list.
   */
  DEFINE_SPINLOCK(ib_nodev_conns_lock);
  LIST_HEAD(ib_nodev_conns);
ff51bf841   stephen hemminger   rds: make local f...
66
  static void rds_ib_nodev_connect(void)
fc19de38b   Zach Brown   RDS/IB: disconnec...
67
68
69
70
71
72
73
74
  {
  	struct rds_ib_connection *ic;
  
  	spin_lock(&ib_nodev_conns_lock);
  	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
  		rds_conn_connect_if_down(ic->conn);
  	spin_unlock(&ib_nodev_conns_lock);
  }
ff51bf841   stephen hemminger   rds: make local f...
75
  static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
fc19de38b   Zach Brown   RDS/IB: disconnec...
76
77
78
79
80
81
82
83
84
  {
  	struct rds_ib_connection *ic;
  	unsigned long flags;
  
  	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
  	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
  		rds_conn_drop(ic->conn);
  	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
  }
3e0249f9c   Zach Brown   RDS/IB: add refco...
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
  /*
   * rds_ib_destroy_mr_pool() blocks on a few things and MRs drop references
   * from interrupt context, so freeing is pushed off into a work struct in
   * krdsd (queued by rds_ib_dev_put() once the last reference is gone).
   *
   * Teardown order: MR pool, DMA MR, protection domain, the per-device
   * address list, and finally the rds_ib_device itself.
   */
  static void rds_ib_dev_free(struct work_struct *work)
  {
  	struct rds_ib_device *rds_ibdev;
  	struct rds_ib_ipaddr *addr;
  	struct rds_ib_ipaddr *addr_next;
  
  	rds_ibdev = container_of(work, struct rds_ib_device, free_work);
  
  	/* Each resource is only released if it was actually set up. */
  	if (rds_ibdev->mr_pool) {
  		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
  	}
  	if (rds_ibdev->mr) {
  		ib_dereg_mr(rds_ibdev->mr);
  	}
  	if (rds_ibdev->pd) {
  		ib_dealloc_pd(rds_ibdev->pd);
  	}
  
  	/* _safe variant: entries are unlinked and freed while iterating. */
  	list_for_each_entry_safe(addr, addr_next, &rds_ibdev->ipaddr_list, list) {
  		list_del(&addr->list);
  		kfree(addr);
  	}
  
  	kfree(rds_ibdev);
  }
  
  /*
   * Release one reference on @rds_ibdev.  When the count reaches zero the
   * actual teardown is deferred to rds_ib_dev_free() via the device's
   * free_work item on rds_wq.  Dropping a reference that is not held is a bug.
   */
  void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
  {
  	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
  
  	if (!atomic_dec_and_test(&rds_ibdev->refcount))
  		return;
  
  	/* Last reference gone: hand the device to the deferred free path. */
  	queue_work(rds_wq, &rds_ibdev->free_work);
  }
ff51bf841   stephen hemminger   rds: make local f...
116
  static void rds_ib_add_one(struct ib_device *device)
ec16227e1   Andy Grover   RDS/IB: Infiniban...
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  {
  	struct rds_ib_device *rds_ibdev;
  	struct ib_device_attr *dev_attr;
  
  	/* Only handle IB (no iWARP) devices */
  	if (device->node_type != RDMA_NODE_IB_CA)
  		return;
  
  	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
  	if (!dev_attr)
  		return;
  
  	if (ib_query_device(device, dev_attr)) {
  		rdsdebug("Query device failed for %s
  ", device->name);
  		goto free_attr;
  	}
3e0249f9c   Zach Brown   RDS/IB: add refco...
134
135
  	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
  				 ibdev_to_node(device));
ec16227e1   Andy Grover   RDS/IB: Infiniban...
136
137
138
139
  	if (!rds_ibdev)
  		goto free_attr;
  
  	spin_lock_init(&rds_ibdev->spinlock);
3e0249f9c   Zach Brown   RDS/IB: add refco...
140
141
  	atomic_set(&rds_ibdev->refcount, 1);
  	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
142
143
144
  
  	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
  	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
145
146
147
148
  	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
  	rds_ibdev->max_fmrs = dev_attr->max_fmr ?
  			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
  			fmr_pool_size;
40589e74f   Andy Grover   RDS: Base init_de...
149
150
  	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
  	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
ec16227e1   Andy Grover   RDS/IB: Infiniban...
151
152
  	rds_ibdev->dev = device;
  	rds_ibdev->pd = ib_alloc_pd(device);
3e0249f9c   Zach Brown   RDS/IB: add refco...
153
154
155
156
  	if (IS_ERR(rds_ibdev->pd)) {
  		rds_ibdev->pd = NULL;
  		goto put_dev;
  	}
ec16227e1   Andy Grover   RDS/IB: Infiniban...
157

3e0249f9c   Zach Brown   RDS/IB: add refco...
158
159
160
161
162
  	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
  	if (IS_ERR(rds_ibdev->mr)) {
  		rds_ibdev->mr = NULL;
  		goto put_dev;
  	}
ec16227e1   Andy Grover   RDS/IB: Infiniban...
163
164
165
166
  
  	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
  	if (IS_ERR(rds_ibdev->mr_pool)) {
  		rds_ibdev->mr_pool = NULL;
3e0249f9c   Zach Brown   RDS/IB: add refco...
167
  		goto put_dev;
ec16227e1   Andy Grover   RDS/IB: Infiniban...
168
169
170
171
  	}
  
  	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
  	INIT_LIST_HEAD(&rds_ibdev->conn_list);
ea819867b   Zach Brown   RDS/IB: protect t...
172
173
174
175
  
  	down_write(&rds_ib_devices_lock);
  	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
  	up_write(&rds_ib_devices_lock);
3e0249f9c   Zach Brown   RDS/IB: add refco...
176
  	atomic_inc(&rds_ibdev->refcount);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
177
178
  
  	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
3e0249f9c   Zach Brown   RDS/IB: add refco...
179
  	atomic_inc(&rds_ibdev->refcount);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
180

fc19de38b   Zach Brown   RDS/IB: disconnec...
181
  	rds_ib_nodev_connect();
3e0249f9c   Zach Brown   RDS/IB: add refco...
182
183
  put_dev:
  	rds_ib_dev_put(rds_ibdev);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
184
185
186
  free_attr:
  	kfree(dev_attr);
  }
3e0249f9c   Zach Brown   RDS/IB: add refco...
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
  /*
   * New connections use this to find the device to associate with the
   * connection.  It's not in the fast path so we're not concerned about the
   * performance of the IB call.  (As of this writing, it uses an interrupt
   * blocking spinlock to serialize walking a per-device list of all registered
   * clients.)
   *
   * RCU is used to handle incoming connections racing with device teardown.
   * Rather than use a lock to serialize removal from the client_data and
   * getting a new reference, we use an RCU grace period.  The destruction
   * path removes the device from client_data and then waits for all RCU
   * readers to finish.
   *
   * A new connection can get NULL from this if its arriving on a
   * device that is in the process of being removed.
   */
  struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
  {
  	struct rds_ib_device *rds_ibdev;
  
  	rcu_read_lock();
  	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
  	if (rds_ibdev)
  		atomic_inc(&rds_ibdev->refcount);
  	rcu_read_unlock();
  	return rds_ibdev;
  }
  
  /*
   * The IB stack is letting us know that a device is going away.  This can
   * happen if the underlying HCA driver is removed or if PCI hotplug is removing
   * the pci function, for example.
   *
   * This can be called at any time and can be racing with any other RDS path.
   */
ff51bf841   stephen hemminger   rds: make local f...
222
  static void rds_ib_remove_one(struct ib_device *device)
ec16227e1   Andy Grover   RDS/IB: Infiniban...
223
224
  {
  	struct rds_ib_device *rds_ibdev;
ec16227e1   Andy Grover   RDS/IB: Infiniban...
225
226
227
228
  
  	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
  	if (!rds_ibdev)
  		return;
fc19de38b   Zach Brown   RDS/IB: disconnec...
229
  	rds_ib_dev_shutdown(rds_ibdev);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
230

ea819867b   Zach Brown   RDS/IB: protect t...
231
232
233
234
235
236
  	/* stop connection attempts from getting a reference to this device. */
  	ib_set_client_data(device, &rds_ib_client, NULL);
  
  	down_write(&rds_ib_devices_lock);
  	list_del_rcu(&rds_ibdev->list);
  	up_write(&rds_ib_devices_lock);
3e0249f9c   Zach Brown   RDS/IB: add refco...
237
  	/*
ea819867b   Zach Brown   RDS/IB: protect t...
238
239
240
  	 * This synchronize rcu is waiting for readers of both the ib
  	 * client data and the devices list to finish before we drop
  	 * both of those references.
3e0249f9c   Zach Brown   RDS/IB: add refco...
241
  	 */
3e0249f9c   Zach Brown   RDS/IB: add refco...
242
243
  	synchronize_rcu();
  	rds_ib_dev_put(rds_ibdev);
3e0249f9c   Zach Brown   RDS/IB: add refco...
244
  	rds_ib_dev_put(rds_ibdev);
ec16227e1   Andy Grover   RDS/IB: Infiniban...
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  }
  
  /*
   * IB client descriptor for RDS; registered in rds_ib_init() and used as the
   * key for the per-device client data.
   */
  struct ib_client rds_ib_client = {
  	.name	= "rds_ib",
  	.add	= rds_ib_add_one,
  	.remove	= rds_ib_remove_one,
  };
  
  /*
   * Per-connection callback used by rds_ib_ic_info(): fill the
   * rds_info_rdma_connection record at @buffer for @conn.
   *
   * Returns 0 if @conn does not belong to the IB transport (nothing is
   * written), 1 once the record has been filled in.  Addresses are always
   * reported; GIDs are zeroed and only filled in, together with the work
   * request, SGE and MR details, while the connection is up.
   */
  static int rds_ib_conn_info_visitor(struct rds_connection *conn,
  				    void *buffer)
  {
  	struct rds_info_rdma_connection *iinfo = buffer;
  	struct rds_ib_connection *ic;
  	struct rds_ib_device *rds_ibdev;
  	struct rdma_dev_addr *dev_addr;
  
  	/* We will only ever look at IB transports */
  	if (conn->c_trans != &rds_ib_transport)
  		return 0;
  
  	iinfo->src_addr = conn->c_laddr;
  	iinfo->dst_addr = conn->c_faddr;
  
  	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
  	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
  
  	/* Everything below needs an established connection. */
  	if (rds_conn_state(conn) != RDS_CONN_UP)
  		return 1;
  
  	ic = conn->c_transport_data;
  	dev_addr = &ic->i_cm_id->route.addr.dev_addr;
  
  	rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
  	rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
  
  	rds_ibdev = ic->rds_ibdev;
  	iinfo->max_send_wr = ic->i_send_ring.w_nr;
  	iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
  	iinfo->max_send_sge = rds_ibdev->max_sge;
  	rds_ib_get_mr_info(rds_ibdev, iinfo);
  
  	return 1;
  }
  
  /*
   * Info handler registered for RDS_INFO_IB_CONNECTIONS: runs
   * rds_ib_conn_info_visitor() over the connections, producing one
   * rds_info_rdma_connection record per visited connection.
   */
  static void rds_ib_ic_info(struct socket *sock, unsigned int len,
  			   struct rds_info_iterator *iter,
  			   struct rds_info_lengths *lens)
  {
  	rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor,
  			       sizeof(struct rds_info_rdma_connection));
  }
  
  
  /*
   * Early RDS/IB was built to only bind to an address if there is an IPoIB
   * device with that address set.
   *
   * If it were me, I'd advocate for something more flexible.  Sending and
   * receiving should be device-agnostic.  Transports would try and maintain
   * connections between peers who have messages queued.  Userspace would be
   * allowed to influence which paths have priority.  We could call userspace
   * asserting this policy "routing".
   *
   * Check whether @addr (IPv4, network byte order) is usable by this
   * transport by creating a temporary CM ID and binding it to the address.
   *
   * Returns 0 if the address is backed by an IB CA, the error from
   * rdma_create_id() if the CM ID cannot be created, and -EADDRNOTAVAIL in
   * every other case.
   */
  static int rds_ib_laddr_check(__be32 addr)
  {
  	int ret;
  	struct rdma_cm_id *cm_id;
  	struct sockaddr_in sin;
  
  	/* Create a CMA ID and try to bind it. This catches both
  	 * IB and iWARP capable NICs.
  	 */
  	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
  	if (IS_ERR(cm_id))
  		return PTR_ERR(cm_id);
  
  	memset(&sin, 0, sizeof(sin));
  	sin.sin_family = AF_INET;
  	sin.sin_addr.s_addr = addr;
  
  	/* rdma_bind_addr will only succeed for IB & iWARP devices */
  	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
  	/* due to this, we will claim to support iWARP devices unless we
  	   check node_type.  A successful bind does not guarantee that a
  	   device was attached to the id, so check for NULL before
  	   dereferencing it. */
  	if (ret || !cm_id->device ||
  	    cm_id->device->node_type != RDMA_NODE_IB_CA)
  		ret = -EADDRNOTAVAIL;
  
  	rdsdebug("addr %pI4 ret %d node type %d\n",
  		&addr, ret,
  		cm_id->device ? cm_id->device->node_type : -1);
  
  	rdma_destroy_id(cm_id);
  
  	return ret;
  }
24fa163a4   Zach Brown   RDS/IB: wait for ...
339
340
341
342
343
344
  /*
   * Unregister the RDS IB client and then flush rds_wq, so that any
   * rds_ib_dev_free() work queued on it has finished by the time this
   * returns.
   */
  static void rds_ib_unregister_client(void)
  {
  	ib_unregister_client(&rds_ib_client);
  
  	/* wait for rds_ib_dev_free() to complete */
  	flush_workqueue(rds_wq);
  }
ec16227e1   Andy Grover   RDS/IB: Infiniban...
345
346
347
  /*
   * Tear down the IB transport.  Also installed as the transport's .exit
   * hook.  The steps run in a fixed order; do not reorder them.
   */
  void rds_ib_exit(void)
  {
  	/* Stop serving connection info queries first. */
  	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
  
  	/* Unregister from the IB core and wait for deferred device frees. */
  	rds_ib_unregister_client();
  
  	rds_ib_destroy_nodev_conns();
  
  	rds_ib_sysctl_exit();
  	rds_ib_recv_exit();
  
  	rds_trans_unregister(&rds_ib_transport);
  }
  
  struct rds_transport rds_ib_transport = {
  	.laddr_check		= rds_ib_laddr_check,
  	.xmit_complete		= rds_ib_xmit_complete,
  	.xmit			= rds_ib_xmit,
ec16227e1   Andy Grover   RDS/IB: Infiniban...
359
  	.xmit_rdma		= rds_ib_xmit_rdma,
15133f6e6   Andy Grover   RDS: Implement at...
360
  	.xmit_atomic		= rds_ib_xmit_atomic,
ec16227e1   Andy Grover   RDS/IB: Infiniban...
361
362
363
364
365
366
  	.recv			= rds_ib_recv,
  	.conn_alloc		= rds_ib_conn_alloc,
  	.conn_free		= rds_ib_conn_free,
  	.conn_connect		= rds_ib_conn_connect,
  	.conn_shutdown		= rds_ib_conn_shutdown,
  	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
ec16227e1   Andy Grover   RDS/IB: Infiniban...
367
368
369
370
371
372
373
374
375
376
377
378
  	.inc_free		= rds_ib_inc_free,
  	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
  	.cm_handle_connect	= rds_ib_cm_handle_connect,
  	.cm_connect_complete	= rds_ib_cm_connect_complete,
  	.stats_info_copy	= rds_ib_stats_info_copy,
  	.exit			= rds_ib_exit,
  	.get_mr			= rds_ib_get_mr,
  	.sync_mr		= rds_ib_sync_mr,
  	.free_mr		= rds_ib_free_mr,
  	.flush_mrs		= rds_ib_flush_mrs,
  	.t_owner		= THIS_MODULE,
  	.t_name			= "infiniband",
335776bd6   Andy Grover   RDS: Track transp...
379
  	.t_type			= RDS_TRANS_IB
ec16227e1   Andy Grover   RDS/IB: Infiniban...
380
  };
ef87b7ea3   Zach Brown   RDS: remove __ini...
381
  int rds_ib_init(void)
ec16227e1   Andy Grover   RDS/IB: Infiniban...
382
383
384
385
  {
  	int ret;
  
  	INIT_LIST_HEAD(&rds_ib_devices);
515e079da   Zach Brown   RDS/IB: create a ...
386
387
  	ret = ib_register_client(&rds_ib_client);
  	if (ret)
c534a107e   Tejun Heo   rds/ib: use syste...
388
  		goto out;
515e079da   Zach Brown   RDS/IB: create a ...
389

ec16227e1   Andy Grover   RDS/IB: Infiniban...
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
  	ret = rds_ib_sysctl_init();
  	if (ret)
  		goto out_ibreg;
  
  	ret = rds_ib_recv_init();
  	if (ret)
  		goto out_sysctl;
  
  	ret = rds_trans_register(&rds_ib_transport);
  	if (ret)
  		goto out_recv;
  
  	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
  
  	goto out;
  
  out_recv:
  	rds_ib_recv_exit();
  out_sysctl:
  	rds_ib_sysctl_exit();
  out_ibreg:
24fa163a4   Zach Brown   RDS/IB: wait for ...
411
  	rds_ib_unregister_client();
ec16227e1   Andy Grover   RDS/IB: Infiniban...
412
413
414
415
416
  out:
  	return ret;
  }
  
  MODULE_LICENSE("GPL");