net/rds/rdma.c

  /*
   * Copyright (c) 2007 Oracle.  All rights reserved.
   *
   * This software is available to you under a choice of one of two
   * licenses.  You may choose to be licensed under the terms of the GNU
   * General Public License (GPL) Version 2, available from the file
   * COPYING in the main directory of this source tree, or the
   * OpenIB.org BSD license below:
   *
   *     Redistribution and use in source and binary forms, with or
   *     without modification, are permitted provided that the following
   *     conditions are met:
   *
   *      - Redistributions of source code must retain the above
   *        copyright notice, this list of conditions and the following
   *        disclaimer.
   *
   *      - Redistributions in binary form must reproduce the above
   *        copyright notice, this list of conditions and the following
   *        disclaimer in the documentation and/or other materials
   *        provided with the distribution.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   * SOFTWARE.
   *
   */
  #include <linux/pagemap.h>
  #include <linux/slab.h>
  #include <linux/rbtree.h>
  #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
  #include "rds.h"
  
  /*
   * XXX
   *  - build with sparse
   *  - should we limit the size of an MR region?  let transport return failure?
   *  - should we detect duplicate keys on a socket?  hmm.
   *  - an rdma is an mlock, apply rlimit?
   */
  
  /*
   * get the number of pages by looking at the page indices that the start and
   * end addresses fall in.
   *
   * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
   * causes the address to wrap or overflows an unsigned int.  This comes
   * from being stored in the 'length' member of 'struct scatterlist'.
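   *
   * For example (an illustration only, assuming 4 KiB pages): addr = 0x1100
   * and bytes = 0x2000 touch pages 1, 2 and 3, so the expression below
   * yields ((0x1100 + 0x2000 + 0xfff) >> 12) - (0x1100 >> 12) = 4 - 1 = 3.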
   */
  static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
  {
  	if ((vec->addr + vec->bytes <= vec->addr) ||
  	    (vec->bytes > (u64)UINT_MAX))
  		return 0;
  
  	return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
  		(vec->addr >> PAGE_SHIFT);
  }
  
  static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
  				       struct rds_mr *insert)
  {
  	struct rb_node **p = &root->rb_node;
  	struct rb_node *parent = NULL;
  	struct rds_mr *mr;
  
  	while (*p) {
  		parent = *p;
  		mr = rb_entry(parent, struct rds_mr, r_rb_node);
  
  		if (key < mr->r_key)
  			p = &(*p)->rb_left;
  		else if (key > mr->r_key)
  			p = &(*p)->rb_right;
  		else
  			return mr;
  	}
  
  	if (insert) {
  		rb_link_node(&insert->r_rb_node, parent, p);
  		rb_insert_color(&insert->r_rb_node, root);
  		atomic_inc(&insert->r_refcount);
  	}
  	return NULL;
  }
  
  /*
   * Destroy the transport-specific part of a MR.
   */
  static void rds_destroy_mr(struct rds_mr *mr)
  {
  	struct rds_sock *rs = mr->r_sock;
  	void *trans_private = NULL;
  	unsigned long flags;
  
  	rdsdebug("RDS: destroy mr key is %x refcnt %u
  ",
  			mr->r_key, atomic_read(&mr->r_refcount));
  
  	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
  		return;
  
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	if (!RB_EMPTY_NODE(&mr->r_rb_node))
  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
  	trans_private = mr->r_trans_private;
  	mr->r_trans_private = NULL;
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	if (trans_private)
  		mr->r_trans->free_mr(trans_private, mr->r_invalidate);
  }
  
  void __rds_put_mr_final(struct rds_mr *mr)
  {
  	rds_destroy_mr(mr);
  	kfree(mr);
  }
  
  /*
   * By the time this is called we can't have any more ioctls called on
   * the socket so we don't need to worry about racing with others.
   */
  void rds_rdma_drop_keys(struct rds_sock *rs)
  {
  	struct rds_mr *mr;
  	struct rb_node *node;
  	unsigned long flags;
  
  	/* Release any MRs associated with this socket */
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	while ((node = rb_first(&rs->rs_rdma_keys))) {
  		mr = container_of(node, struct rds_mr, r_rb_node);
  		if (mr->r_trans == rs->rs_transport)
  			mr->r_invalidate = 0;
  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
  		RB_CLEAR_NODE(&mr->r_rb_node);
  		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  		rds_destroy_mr(mr);
  		rds_mr_put(mr);
  		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	}
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	if (rs->rs_transport && rs->rs_transport->flush_mrs)
  		rs->rs_transport->flush_mrs();
  }
  
  /*
   * Helper function to pin user pages.
   */
  static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
  			struct page **pages, int write)
  {
  	int ret;
  	ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
  	if (ret >= 0 && ret < nr_pages) {
  		while (ret--)
  			put_page(pages[ret]);
  		ret = -EFAULT;
  	}
  
  	return ret;
  }
  
  static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
  				u64 *cookie_ret, struct rds_mr **mr_ret)
  {
  	struct rds_mr *mr = NULL, *found;
  	unsigned int nr_pages;
  	struct page **pages = NULL;
  	struct scatterlist *sg;
  	void *trans_private;
  	unsigned long flags;
  	rds_rdma_cookie_t cookie;
  	unsigned int nents;
  	long i;
  	int ret;
  
  	if (rs->rs_bound_addr == 0) {
  		ret = -ENOTCONN; /* XXX not a great errno */
  		goto out;
  	}
  	if (!rs->rs_transport->get_mr) {
  		ret = -EOPNOTSUPP;
  		goto out;
  	}
  
  	nr_pages = rds_pages_in_vec(&args->vec);
  	if (nr_pages == 0) {
  		ret = -EINVAL;
  		goto out;
  	}
  
  	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u
  ",
  		args->vec.addr, args->vec.bytes, nr_pages);
  
  	/* XXX clamp nr_pages to limit the size of this alloc? */
  	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
  	if (!pages) {
  		ret = -ENOMEM;
  		goto out;
  	}
  
  	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
  	if (!mr) {
  		ret = -ENOMEM;
  		goto out;
  	}
  
  	atomic_set(&mr->r_refcount, 1);
  	RB_CLEAR_NODE(&mr->r_rb_node);
  	mr->r_trans = rs->rs_transport;
  	mr->r_sock = rs;
  
  	if (args->flags & RDS_RDMA_USE_ONCE)
  		mr->r_use_once = 1;
  	if (args->flags & RDS_RDMA_INVALIDATE)
  		mr->r_invalidate = 1;
  	if (args->flags & RDS_RDMA_READWRITE)
  		mr->r_write = 1;
  
  	/*
  	 * Pin the pages that make up the user buffer and transfer the page
  	 * pointers to the mr's sg array.  We check to see if we've mapped
  	 * the whole region after transferring the partial page references
  	 * to the sg array so that we can have one page ref cleanup path.
  	 *
  	 * For now we have no flag that tells us whether the mapping is
  	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
  	 * the zero page.
  	 */
  	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
  	if (ret < 0)
  		goto out;
  
  	nents = ret;
  	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
  	if (!sg) {
  		ret = -ENOMEM;
  		goto out;
  	}
  	WARN_ON(!nents);
  	sg_init_table(sg, nents);
  
  	/* Stick all pages into the scatterlist */
  	for (i = 0 ; i < nents; i++)
  		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
  
  	rdsdebug("RDS: trans_private nents is %u
  ", nents);
  
  	/* Obtain a transport specific MR. If this succeeds, the
  	 * s/g list is now owned by the MR.
  	 * Note that dma_map() implies that pending writes are
  	 * flushed to RAM, so no dma_sync is needed here. */
  	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
  						 &mr->r_key);
  
  	if (IS_ERR(trans_private)) {
  		for (i = 0 ; i < nents; i++)
  			put_page(sg_page(&sg[i]));
  		kfree(sg);
  		ret = PTR_ERR(trans_private);
  		goto out;
  	}
  
  	mr->r_trans_private = trans_private;
  
  	rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p
  ",
  	       mr->r_key, (void *)(unsigned long) args->cookie_addr);
  
  	/* The user may pass us an unaligned address, but we can only
  	 * map page aligned regions. So we keep the offset, and build
  	 * a 64bit cookie containing <R_Key, offset> and pass that
  	 * around. */
  	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
  	if (cookie_ret)
  		*cookie_ret = cookie;
  
  	if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
  		ret = -EFAULT;
  		goto out;
  	}
  
  	/* Inserting the new MR into the rbtree bumps its
  	 * reference count. */
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	BUG_ON(found && found != mr);
  
  	rdsdebug("RDS: get_mr key is %x
  ", mr->r_key);
  	if (mr_ret) {
  		atomic_inc(&mr->r_refcount);
  		*mr_ret = mr;
  	}
  
  	ret = 0;
  out:
  	kfree(pages);
  	if (mr)
  		rds_mr_put(mr);
  	return ret;
  }
  
  int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
  {
  	struct rds_get_mr_args args;
  
  	if (optlen != sizeof(struct rds_get_mr_args))
  		return -EINVAL;
  
  	if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
  			   sizeof(struct rds_get_mr_args)))
  		return -EFAULT;
  
  	return __rds_rdma_map(rs, &args, NULL, NULL);
  }
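
  /*
   * Illustrative userspace usage (a sketch, not part of this file; fd, buf
   * and buf_len are placeholders): rds_get_mr() is typically reached via
   * the RDS_GET_MR socket option, and the rdma cookie for the pinned
   * region comes back through cookie_addr:
   *
   *	uint64_t cookie;
   *	struct rds_get_mr_args args = {
   *		.vec         = { .addr = (uint64_t) buf, .bytes = buf_len },
   *		.cookie_addr = (uint64_t) &cookie,
   *		.flags       = RDS_RDMA_USE_ONCE,
   *	};
   *	setsockopt(fd, SOL_RDS, RDS_GET_MR, &args, sizeof(args));
   */
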
  int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
  {
  	struct rds_get_mr_for_dest_args args;
  	struct rds_get_mr_args new_args;
  
  	if (optlen != sizeof(struct rds_get_mr_for_dest_args))
  		return -EINVAL;
  
  	if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
  			   sizeof(struct rds_get_mr_for_dest_args)))
  		return -EFAULT;
  
  	/*
  	 * Initially, just behave like get_mr().
  	 * TODO: Implement get_mr as wrapper around this
  	 *	 and deprecate it.
  	 */
  	new_args.vec = args.vec;
  	new_args.cookie_addr = args.cookie_addr;
  	new_args.flags = args.flags;
  
  	return __rds_rdma_map(rs, &new_args, NULL, NULL);
  }
  /*
   * Free the MR indicated by the given R_Key
   */
  int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
  {
  	struct rds_free_mr_args args;
  	struct rds_mr *mr;
  	unsigned long flags;
  
  	if (optlen != sizeof(struct rds_free_mr_args))
  		return -EINVAL;
  
  	if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
  			   sizeof(struct rds_free_mr_args)))
  		return -EFAULT;
  
  	/* Special case - a null cookie means flush all unused MRs */
  	if (args.cookie == 0) {
  		if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
  			return -EINVAL;
  		rs->rs_transport->flush_mrs();
  		return 0;
  	}
  
  	/* Look up the MR given its R_key and remove it from the rbtree
  	 * so nobody else finds it.
  	 * This should also prevent races with rds_rdma_unuse.
  	 */
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
  	if (mr) {
  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
  		RB_CLEAR_NODE(&mr->r_rb_node);
  		if (args.flags & RDS_RDMA_INVALIDATE)
  			mr->r_invalidate = 1;
  	}
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	if (!mr)
  		return -EINVAL;
  
  	/*
  	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
  	 * we return.  If we let rds_mr_put() do it it might not happen until
  	 * someone else drops their ref.
  	 */
  	rds_destroy_mr(mr);
  	rds_mr_put(mr);
  	return 0;
  }
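
  /*
   * Illustrative userspace usage (a sketch, not part of this file; fd and
   * cookie as in the rds_get_mr() example above): the registration is
   * released by handing the cookie back through the RDS_FREE_MR socket
   * option; a zero cookie instead asks the transport to flush all unused
   * MRs.
   *
   *	struct rds_free_mr_args fargs = {
   *		.cookie = cookie,
   *		.flags  = RDS_RDMA_INVALIDATE,
   *	};
   *	setsockopt(fd, SOL_RDS, RDS_FREE_MR, &fargs, sizeof(fargs));
   */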
  
  /*
   * This is called when we receive an extension header that
   * tells us this MR was used. It allows us to implement
   * use_once semantics
   */
  void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
  {
  	struct rds_mr *mr;
  	unsigned long flags;
  	int zot_me = 0;
  
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
  	if (!mr) {
  		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!
  ", r_key);
  		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  		return;
  	}
  
  	if (mr->r_use_once || force) {
  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
  		RB_CLEAR_NODE(&mr->r_rb_node);
  		zot_me = 1;
  	}
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	/* May have to issue a dma_sync on this memory region.
   * Note we could avoid this if the operation was an RDMA READ,
  	 * but at this point we can't tell. */
  	if (mr->r_trans->sync_mr)
  		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
  	/* If the MR was marked as invalidate, this will
  	 * trigger an async flush. */
  	if (zot_me)
  		rds_destroy_mr(mr);
  	rds_mr_put(mr);
  }
  void rds_rdma_free_op(struct rm_rdma_op *ro)
  {
  	unsigned int i;
  	for (i = 0; i < ro->op_nents; i++) {
  		struct page *page = sg_page(&ro->op_sg[i]);
  
  		/* Mark page dirty if it was possibly modified, which
  		 * is the case for a RDMA_READ which copies from remote
  		 * to local memory */
  		if (!ro->op_write) {
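  			/* set_page_dirty() must not run with interrupts
  			 * disabled; the assertion below documents that
  			 * requirement. */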
  			BUG_ON(irqs_disabled());
  			set_page_dirty(page);
  		}
  		put_page(page);
  	}
  	kfree(ro->op_notifier);
  	ro->op_notifier = NULL;
  	ro->op_active = 0;
  }
  void rds_atomic_free_op(struct rm_atomic_op *ao)
  {
  	struct page *page = sg_page(ao->op_sg);
  
  	/* Mark page dirty if it was possibly modified, which
  	 * is the case for a RDMA_READ which copies from remote
  	 * to local memory */
  	set_page_dirty(page);
  	put_page(page);
  
  	kfree(ao->op_notifier);
  	ao->op_notifier = NULL;
  	ao->op_active = 0;
  }
  /*
   * Count the number of pages needed to describe an incoming iovec array.
   */
  static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
  {
  	int tot_pages = 0;
  	unsigned int nr_pages;
  	unsigned int i;
  
  	/* figure out the number of pages in the vector */
  	for (i = 0; i < nr_iovecs; i++) {
  		nr_pages = rds_pages_in_vec(&iov[i]);
  		if (nr_pages == 0)
  			return -EINVAL;
  
  		tot_pages += nr_pages;
  
  		/*
  		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
  		 * so tot_pages cannot overflow without first going negative.
  		 */
  		if (tot_pages < 0)
  			return -EINVAL;
  	}
  
  	return tot_pages;
  }
  
  int rds_rdma_extra_size(struct rds_rdma_args *args)
  {
  	struct rds_iovec vec;
  	struct rds_iovec __user *local_vec;
  	int tot_pages = 0;
  	unsigned int nr_pages;
  	unsigned int i;
  
  	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
  
  	/* figure out the number of pages in the vector */
  	for (i = 0; i < args->nr_local; i++) {
  		if (copy_from_user(&vec, &local_vec[i],
  				   sizeof(struct rds_iovec)))
  			return -EFAULT;
  
  		nr_pages = rds_pages_in_vec(&vec);
  		if (nr_pages == 0)
  			return -EINVAL;
  
  		tot_pages += nr_pages;
  
  		/*
  		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
  		 * so tot_pages cannot overflow without first going negative.
  		 */
  		if (tot_pages < 0)
  			return -EINVAL;
  	}
  	return tot_pages * sizeof(struct scatterlist);
  }
  
  /*
   * The application asks for a RDMA transfer.
   * Extract all arguments and set up the rdma_op
   */
  int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
  			  struct cmsghdr *cmsg)
  {
  	struct rds_rdma_args *args;
  	struct rm_rdma_op *op = &rm->rdma;
  	int nr_pages;
  	unsigned int nr_bytes;
  	struct page **pages = NULL;
  	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
  	int iov_size;
  	unsigned int i, j;
  	int ret = 0;

  	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
  	    || rm->rdma.op_active)
  		return -EINVAL;
  
  	args = CMSG_DATA(cmsg);
  
  	if (rs->rs_bound_addr == 0) {
  		ret = -ENOTCONN; /* XXX not a great errno */
  		goto out_ret;
  	}
  	if (args->nr_local > UIO_MAXIOV) {
  		ret = -EMSGSIZE;
  		goto out_ret;
  	}
  	/* Check whether to allocate the iovec area */
  	iov_size = args->nr_local * sizeof(struct rds_iovec);
  	if (args->nr_local > UIO_FASTIOV) {
  		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
  		if (!iovs) {
  			ret = -ENOMEM;
  			goto out_ret;
  		}
  	}
  
  	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
  		ret = -EFAULT;
  		goto out;
  	}
  
  	nr_pages = rds_rdma_pages(iovs, args->nr_local);
  	if (nr_pages < 0) {
  		ret = -EINVAL;
  		goto out;
  	}
  	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
  	if (!pages) {
  		ret = -ENOMEM;
  		goto out;
  	}
  	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
  	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
  	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
  	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
  	op->op_active = 1;
  	op->op_recverr = rs->rs_recverr;
  	WARN_ON(!nr_pages);
  	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
  	if (!op->op_sg) {
  		ret = -ENOMEM;
  		goto out;
  	}
  	if (op->op_notify || op->op_recverr) {
  		/* We allocate an uninitialized notifier here, because
  		 * we don't want to do that in the completion handler. We
  		 * would have to use GFP_ATOMIC there, and don't want to deal
  		 * with failed allocations.
  		 */
  		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
  		if (!op->op_notifier) {
  			ret = -ENOMEM;
  			goto out;
  		}
  		op->op_notifier->n_user_token = args->user_token;
  		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
  	}
  
  	/* The cookie contains the R_Key of the remote memory region, and
  	 * optionally an offset into it. This is how we implement RDMA into
  	 * unaligned memory.
  	 * When setting up the RDMA, we need to add that offset to the
  	 * destination address (which is really an offset into the MR)
  	 * FIXME: We may want to move this into ib_rdma.c
  	 */
  	op->op_rkey = rds_rdma_cookie_key(args->cookie);
  	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
  
  	nr_bytes = 0;
  
  	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x
  ",
  	       (unsigned long long)args->nr_local,
  	       (unsigned long long)args->remote_vec.addr,
  	       op->op_rkey);
  
  	for (i = 0; i < args->nr_local; i++) {
  		struct rds_iovec *iov = &iovs[i];
  		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
  		unsigned int nr = rds_pages_in_vec(iov);
  		rs->rs_user_addr = iov->addr;
  		rs->rs_user_bytes = iov->bytes;
  		/* If it's a WRITE operation, we want to pin the pages for reading.
  		 * If it's a READ operation, we need to pin the pages for writing.
  		 */
  		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
  		if (ret < 0)
  			goto out;
  		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx
  ",
  			 nr_bytes, nr, iov->bytes, iov->addr);
  		nr_bytes += iov->bytes;
  
  		for (j = 0; j < nr; j++) {
  			unsigned int offset = iov->addr & ~PAGE_MASK;
  			struct scatterlist *sg;
  			sg = &op->op_sg[op->op_nents + j];
  			sg_set_page(sg, pages[j],
  					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
  					offset);
  			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu
  ",
  			       sg->offset, sg->length, iov->addr, iov->bytes);
  			iov->addr += sg->length;
  			iov->bytes -= sg->length;
  		}
  		op->op_nents += nr;
  	}
  	if (nr_bytes > args->remote_vec.bytes) {
  		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match
  ",
  				nr_bytes,
  				(unsigned int) args->remote_vec.bytes);
  		ret = -EINVAL;
  		goto out;
  	}
  	op->op_bytes = nr_bytes;
  out:
  	if (iovs != iovstack)
  		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
  	kfree(pages);
  out_ret:
  	if (ret)
  		rds_rdma_free_op(op);
  	else
  		rds_stats_inc(s_send_rdma);
  
  	return ret;
  }
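
  /*
   * Illustrative userspace setup (a sketch, not part of this file; lbuf,
   * len, remote_addr, remote_cookie and my_token are placeholders): the
   * rds_rdma_args structure is normally attached to a sendmsg() call as a
   * control message with cmsg_level SOL_RDS and cmsg_type
   * RDS_CMSG_RDMA_ARGS, where remote_cookie is the <R_Key, offset> cookie
   * previously obtained from the peer:
   *
   *	struct rds_iovec lvec = { .addr = (uint64_t) lbuf, .bytes = len };
   *	struct rds_rdma_args rargs = {
   *		.cookie         = remote_cookie,
   *		.remote_vec     = { .addr = remote_addr, .bytes = len },
   *		.local_vec_addr = (uint64_t) &lvec,
   *		.nr_local       = 1,
   *		.flags          = RDS_RDMA_READWRITE | RDS_RDMA_NOTIFY_ME,
   *		.user_token     = my_token,
   *	};
   */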
  
  /*
   * The application wants us to pass an RDMA destination (aka MR)
   * to the remote
   */
  int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
  			  struct cmsghdr *cmsg)
  {
  	unsigned long flags;
  	struct rds_mr *mr;
  	u32 r_key;
  	int err = 0;
  	if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
  	    rm->m_rdma_cookie != 0)
  		return -EINVAL;
  
  	memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
  
  	/* We are reusing a previously mapped MR here. Most likely, the
  	 * application has written to the buffer, so we need to explicitly
  	 * flush those writes to RAM. Otherwise the HCA may not see them
  	 * when doing a DMA from that buffer.
  	 */
  	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
  
  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
  	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
  	if (!mr)
  		err = -EINVAL;	/* invalid r_key */
  	else
  		atomic_inc(&mr->r_refcount);
  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
  
  	if (mr) {
  		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
  		rm->rdma.op_rdma_mr = mr;
  	}
  	return err;
  }
  
  /*
   * The application passes us an address range it wants to enable RDMA
   * to/from. We map the area, and save the <R_Key,offset> pair
   * in rm->m_rdma_cookie. This causes it to be sent along to the peer
   * in an extension header.
   */
  int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
  			  struct cmsghdr *cmsg)
  {
  	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
  	    rm->m_rdma_cookie != 0)
  		return -EINVAL;
  	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
  }
  
  /*
   * Fill in rds_message for an atomic request.
   */
  int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
  		    struct cmsghdr *cmsg)
  {
  	struct page *page = NULL;
  	struct rds_atomic_args *args;
  	int ret = 0;
  
  	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
  	 || rm->atomic.op_active)
  		return -EINVAL;
  
  	args = CMSG_DATA(cmsg);
  	/* Nonmasked & masked cmsg ops converted to masked hw ops */
  	switch (cmsg->cmsg_type) {
  	case RDS_CMSG_ATOMIC_FADD:
  		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
  		rm->atomic.op_m_fadd.add = args->fadd.add;
  		rm->atomic.op_m_fadd.nocarry_mask = 0;
  		break;
  	case RDS_CMSG_MASKED_ATOMIC_FADD:
  		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
  		rm->atomic.op_m_fadd.add = args->m_fadd.add;
  		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
  		break;
  	case RDS_CMSG_ATOMIC_CSWP:
  		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
  		rm->atomic.op_m_cswp.compare = args->cswp.compare;
  		rm->atomic.op_m_cswp.swap = args->cswp.swap;
  		rm->atomic.op_m_cswp.compare_mask = ~0;
  		rm->atomic.op_m_cswp.swap_mask = ~0;
  		break;
  	case RDS_CMSG_MASKED_ATOMIC_CSWP:
  		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
  		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
  		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
  		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
  		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
  		break;
  	default:
  		BUG(); /* should never happen */
  	}
  	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
  	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
  	rm->atomic.op_active = 1;
  	rm->atomic.op_recverr = rs->rs_recverr;
  	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
  	if (!rm->atomic.op_sg) {
  		ret = -ENOMEM;
  		goto err;
  	}
  
  	/* verify 8 byte-aligned */
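  	/* Atomic ops always target a single 8-byte-aligned word, so one
  	 * pinned page and the single sg entry allocated above suffice. */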
  	if (args->local_addr & 0x7) {
  		ret = -EFAULT;
  		goto err;
  	}
  
  	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
  	if (ret != 1)
  		goto err;
  	ret = 0;
  
  	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
  
  	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
  		/* We allocate an uninitialized notifier here, because
  		 * we don't want to do that in the completion handler. We
  		 * would have to use GFP_ATOMIC there, and don't want to deal
  		 * with failed allocations.
  		 */
  		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
  		if (!rm->atomic.op_notifier) {
  			ret = -ENOMEM;
  			goto err;
  		}
  
  		rm->atomic.op_notifier->n_user_token = args->user_token;
  		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
  	}
  	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
  	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
  	return ret;
  err:
  	if (page)
  		put_page(page);
  	kfree(rm->atomic.op_notifier);
  
  	return ret;
  }