Commit d521b63b27e3a397e0ef7ca86b6e813861083c83

Authored by Andy Grover
Committed by David S. Miller
1 parent 0514f8a9c0

RDS/IB+IW: Move recv processing to a tasklet

Move receive processing from event handler to a tasklet.
This should help prevent the hangcheck timer from going off
when RDS is under heavy load.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 6 changed files with 52 additions and 12 deletions
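
For readers skimming the diff below, the overall shape of the change is roughly the following sketch (illustrative only, simplified from the RDS IB receive path; the real hunks live in ib_recv.c and iw_recv.c, which this excerpt does not show in full). The hard-IRQ completion handler is reduced to scheduling a per-connection tasklet, and the tasklet then drains the completion queue in softirq context:

/*
 * Sketch of the new receive path split (illustrative, not the literal
 * patch text; the tasklet API is from <linux/interrupt.h>).  The
 * completion handler now runs almost no code in hard-IRQ context:
 */
static void sketch_recv_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rds_ib_stats_inc(s_ib_rx_cq_call);
	tasklet_schedule(&ic->i_recv_tasklet);	/* defer the real work */
}

/*
 * The tasklet drains the CQ in softirq context, re-arms it, and drains
 * once more so completions that arrive while re-arming are not lost.
 */
static void sketch_recv_tasklet_fn(unsigned long data)
{
	struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
	struct ib_wc wc;

	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0)
		rds_ib_stats_inc(s_ib_rx_cq_event);	/* + per-WC processing */

	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);

	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0)
		rds_ib_stats_inc(s_ib_rx_cq_event);
}

Keeping the interrupt handler this small is what prevents long CQ drains from monopolizing the CPU in hard-IRQ context and tripping the hangcheck timer under heavy load.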

1 #ifndef _RDS_IB_H 1 #ifndef _RDS_IB_H
2 #define _RDS_IB_H 2 #define _RDS_IB_H
3 3
4 #include <rdma/ib_verbs.h> 4 #include <rdma/ib_verbs.h>
5 #include <rdma/rdma_cm.h> 5 #include <rdma/rdma_cm.h>
6 #include "rds.h" 6 #include "rds.h"
7 #include "rdma_transport.h" 7 #include "rdma_transport.h"
8 8
9 #define RDS_FMR_SIZE 256 9 #define RDS_FMR_SIZE 256
10 #define RDS_FMR_POOL_SIZE 4096 10 #define RDS_FMR_POOL_SIZE 4096
11 11
12 #define RDS_IB_MAX_SGE 8 12 #define RDS_IB_MAX_SGE 8
13 #define RDS_IB_RECV_SGE 2 13 #define RDS_IB_RECV_SGE 2
14 14
15 #define RDS_IB_DEFAULT_RECV_WR 1024 15 #define RDS_IB_DEFAULT_RECV_WR 1024
16 #define RDS_IB_DEFAULT_SEND_WR 256 16 #define RDS_IB_DEFAULT_SEND_WR 256
17 17
18 #define RDS_IB_DEFAULT_RETRY_COUNT 2 18 #define RDS_IB_DEFAULT_RETRY_COUNT 2
19 19
20 #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 20 #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
21 21
22 extern struct list_head rds_ib_devices; 22 extern struct list_head rds_ib_devices;
23 23
24 /* 24 /*
25 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to 25 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
26 * try and minimize the amount of memory tied up in both the device and 26 * try and minimize the amount of memory tied up in both the device and
27 * socket receive queues. 27 * socket receive queues.
28 */ 28 */
29 /* page offset of the final full frag that fits in the page */ 29 /* page offset of the final full frag that fits in the page */
30 #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) 30 #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
31 struct rds_page_frag { 31 struct rds_page_frag {
32 struct list_head f_item; 32 struct list_head f_item;
33 struct page *f_page; 33 struct page *f_page;
34 unsigned long f_offset; 34 unsigned long f_offset;
35 dma_addr_t f_mapped; 35 dma_addr_t f_mapped;
36 }; 36 };
37 37
38 struct rds_ib_incoming { 38 struct rds_ib_incoming {
39 struct list_head ii_frags; 39 struct list_head ii_frags;
40 struct rds_incoming ii_inc; 40 struct rds_incoming ii_inc;
41 }; 41 };
42 42
43 struct rds_ib_connect_private { 43 struct rds_ib_connect_private {
44 /* Add new fields at the end, and don't permute existing fields. */ 44 /* Add new fields at the end, and don't permute existing fields. */
45 __be32 dp_saddr; 45 __be32 dp_saddr;
46 __be32 dp_daddr; 46 __be32 dp_daddr;
47 u8 dp_protocol_major; 47 u8 dp_protocol_major;
48 u8 dp_protocol_minor; 48 u8 dp_protocol_minor;
49 __be16 dp_protocol_minor_mask; /* bitmask */ 49 __be16 dp_protocol_minor_mask; /* bitmask */
50 __be32 dp_reserved1; 50 __be32 dp_reserved1;
51 __be64 dp_ack_seq; 51 __be64 dp_ack_seq;
52 __be32 dp_credit; /* non-zero enables flow ctl */ 52 __be32 dp_credit; /* non-zero enables flow ctl */
53 }; 53 };
54 54
55 struct rds_ib_send_work { 55 struct rds_ib_send_work {
56 struct rds_message *s_rm; 56 struct rds_message *s_rm;
57 struct rds_rdma_op *s_op; 57 struct rds_rdma_op *s_op;
58 struct ib_send_wr s_wr; 58 struct ib_send_wr s_wr;
59 struct ib_sge s_sge[RDS_IB_MAX_SGE]; 59 struct ib_sge s_sge[RDS_IB_MAX_SGE];
60 unsigned long s_queued; 60 unsigned long s_queued;
61 }; 61 };
62 62
63 struct rds_ib_recv_work { 63 struct rds_ib_recv_work {
64 struct rds_ib_incoming *r_ibinc; 64 struct rds_ib_incoming *r_ibinc;
65 struct rds_page_frag *r_frag; 65 struct rds_page_frag *r_frag;
66 struct ib_recv_wr r_wr; 66 struct ib_recv_wr r_wr;
67 struct ib_sge r_sge[2]; 67 struct ib_sge r_sge[2];
68 }; 68 };
69 69
70 struct rds_ib_work_ring { 70 struct rds_ib_work_ring {
71 u32 w_nr; 71 u32 w_nr;
72 u32 w_alloc_ptr; 72 u32 w_alloc_ptr;
73 u32 w_alloc_ctr; 73 u32 w_alloc_ctr;
74 u32 w_free_ptr; 74 u32 w_free_ptr;
75 atomic_t w_free_ctr; 75 atomic_t w_free_ctr;
76 }; 76 };
77 77
78 struct rds_ib_device; 78 struct rds_ib_device;
79 79
80 struct rds_ib_connection { 80 struct rds_ib_connection {
81 81
82 struct list_head ib_node; 82 struct list_head ib_node;
83 struct rds_ib_device *rds_ibdev; 83 struct rds_ib_device *rds_ibdev;
84 struct rds_connection *conn; 84 struct rds_connection *conn;
85 85
86 /* alphabet soup, IBTA style */ 86 /* alphabet soup, IBTA style */
87 struct rdma_cm_id *i_cm_id; 87 struct rdma_cm_id *i_cm_id;
88 struct ib_pd *i_pd; 88 struct ib_pd *i_pd;
89 struct ib_mr *i_mr; 89 struct ib_mr *i_mr;
90 struct ib_cq *i_send_cq; 90 struct ib_cq *i_send_cq;
91 struct ib_cq *i_recv_cq; 91 struct ib_cq *i_recv_cq;
92 92
93 /* tx */ 93 /* tx */
94 struct rds_ib_work_ring i_send_ring; 94 struct rds_ib_work_ring i_send_ring;
95 struct rds_message *i_rm; 95 struct rds_message *i_rm;
96 struct rds_header *i_send_hdrs; 96 struct rds_header *i_send_hdrs;
97 u64 i_send_hdrs_dma; 97 u64 i_send_hdrs_dma;
98 struct rds_ib_send_work *i_sends; 98 struct rds_ib_send_work *i_sends;
99 99
100 /* rx */ 100 /* rx */
101 struct tasklet_struct i_recv_tasklet;
101 struct mutex i_recv_mutex; 102 struct mutex i_recv_mutex;
102 struct rds_ib_work_ring i_recv_ring; 103 struct rds_ib_work_ring i_recv_ring;
103 struct rds_ib_incoming *i_ibinc; 104 struct rds_ib_incoming *i_ibinc;
104 u32 i_recv_data_rem; 105 u32 i_recv_data_rem;
105 struct rds_header *i_recv_hdrs; 106 struct rds_header *i_recv_hdrs;
106 u64 i_recv_hdrs_dma; 107 u64 i_recv_hdrs_dma;
107 struct rds_ib_recv_work *i_recvs; 108 struct rds_ib_recv_work *i_recvs;
108 struct rds_page_frag i_frag; 109 struct rds_page_frag i_frag;
109 u64 i_ack_recv; /* last ACK received */ 110 u64 i_ack_recv; /* last ACK received */
110 111
111 /* sending acks */ 112 /* sending acks */
112 unsigned long i_ack_flags; 113 unsigned long i_ack_flags;
113 #ifdef KERNEL_HAS_ATOMIC64 114 #ifdef KERNEL_HAS_ATOMIC64
114 atomic64_t i_ack_next; /* next ACK to send */ 115 atomic64_t i_ack_next; /* next ACK to send */
115 #else 116 #else
116 spinlock_t i_ack_lock; /* protect i_ack_next */ 117 spinlock_t i_ack_lock; /* protect i_ack_next */
117 u64 i_ack_next; /* next ACK to send */ 118 u64 i_ack_next; /* next ACK to send */
118 #endif 119 #endif
119 struct rds_header *i_ack; 120 struct rds_header *i_ack;
120 struct ib_send_wr i_ack_wr; 121 struct ib_send_wr i_ack_wr;
121 struct ib_sge i_ack_sge; 122 struct ib_sge i_ack_sge;
122 u64 i_ack_dma; 123 u64 i_ack_dma;
123 unsigned long i_ack_queued; 124 unsigned long i_ack_queued;
124 125
125 /* Flow control related information 126 /* Flow control related information
126 * 127 *
127 * Our algorithm uses a pair of variables that we need to access 128 * Our algorithm uses a pair of variables that we need to access
128 * atomically - one for the send credits, and one posted 129 * atomically - one for the send credits, and one posted
129 * recv credits we need to transfer to remote. 130 * recv credits we need to transfer to remote.
130 * Rather than protect them using a slow spinlock, we put both into 131 * Rather than protect them using a slow spinlock, we put both into
131 * a single atomic_t and update it using cmpxchg 132 * a single atomic_t and update it using cmpxchg
132 */ 133 */
133 atomic_t i_credits; 134 atomic_t i_credits;
134 135
135 /* Protocol version specific information */ 136 /* Protocol version specific information */
136 unsigned int i_flowctl:1; /* enable/disable flow ctl */ 137 unsigned int i_flowctl:1; /* enable/disable flow ctl */
137 138
138 /* Batched completions */ 139 /* Batched completions */
139 unsigned int i_unsignaled_wrs; 140 unsigned int i_unsignaled_wrs;
140 long i_unsignaled_bytes; 141 long i_unsignaled_bytes;
141 }; 142 };
142 143
143 /* This assumes that atomic_t is at least 32 bits */ 144 /* This assumes that atomic_t is at least 32 bits */
144 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) 145 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
145 #define IB_GET_POST_CREDITS(v) ((v) >> 16) 146 #define IB_GET_POST_CREDITS(v) ((v) >> 16)
146 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) 147 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
147 #define IB_SET_POST_CREDITS(v) ((v) << 16) 148 #define IB_SET_POST_CREDITS(v) ((v) << 16)
148 149
149 struct rds_ib_ipaddr { 150 struct rds_ib_ipaddr {
150 struct list_head list; 151 struct list_head list;
151 __be32 ipaddr; 152 __be32 ipaddr;
152 }; 153 };
153 154
154 struct rds_ib_device { 155 struct rds_ib_device {
155 struct list_head list; 156 struct list_head list;
156 struct list_head ipaddr_list; 157 struct list_head ipaddr_list;
157 struct list_head conn_list; 158 struct list_head conn_list;
158 struct ib_device *dev; 159 struct ib_device *dev;
159 struct ib_pd *pd; 160 struct ib_pd *pd;
160 struct ib_mr *mr; 161 struct ib_mr *mr;
161 struct rds_ib_mr_pool *mr_pool; 162 struct rds_ib_mr_pool *mr_pool;
162 unsigned int fmr_max_remaps; 163 unsigned int fmr_max_remaps;
163 unsigned int max_fmrs; 164 unsigned int max_fmrs;
164 int max_sge; 165 int max_sge;
165 unsigned int max_wrs; 166 unsigned int max_wrs;
166 spinlock_t spinlock; /* protect the above */ 167 spinlock_t spinlock; /* protect the above */
167 }; 168 };
168 169
169 /* bits for i_ack_flags */ 170 /* bits for i_ack_flags */
170 #define IB_ACK_IN_FLIGHT 0 171 #define IB_ACK_IN_FLIGHT 0
171 #define IB_ACK_REQUESTED 1 172 #define IB_ACK_REQUESTED 1
172 173
173 /* Magic WR_ID for ACKs */ 174 /* Magic WR_ID for ACKs */
174 #define RDS_IB_ACK_WR_ID (~(u64) 0) 175 #define RDS_IB_ACK_WR_ID (~(u64) 0)
175 176
176 struct rds_ib_statistics { 177 struct rds_ib_statistics {
177 uint64_t s_ib_connect_raced; 178 uint64_t s_ib_connect_raced;
178 uint64_t s_ib_listen_closed_stale; 179 uint64_t s_ib_listen_closed_stale;
179 uint64_t s_ib_tx_cq_call; 180 uint64_t s_ib_tx_cq_call;
180 uint64_t s_ib_tx_cq_event; 181 uint64_t s_ib_tx_cq_event;
181 uint64_t s_ib_tx_ring_full; 182 uint64_t s_ib_tx_ring_full;
182 uint64_t s_ib_tx_throttle; 183 uint64_t s_ib_tx_throttle;
183 uint64_t s_ib_tx_sg_mapping_failure; 184 uint64_t s_ib_tx_sg_mapping_failure;
184 uint64_t s_ib_tx_stalled; 185 uint64_t s_ib_tx_stalled;
185 uint64_t s_ib_tx_credit_updates; 186 uint64_t s_ib_tx_credit_updates;
186 uint64_t s_ib_rx_cq_call; 187 uint64_t s_ib_rx_cq_call;
187 uint64_t s_ib_rx_cq_event; 188 uint64_t s_ib_rx_cq_event;
188 uint64_t s_ib_rx_ring_empty; 189 uint64_t s_ib_rx_ring_empty;
189 uint64_t s_ib_rx_refill_from_cq; 190 uint64_t s_ib_rx_refill_from_cq;
190 uint64_t s_ib_rx_refill_from_thread; 191 uint64_t s_ib_rx_refill_from_thread;
191 uint64_t s_ib_rx_alloc_limit; 192 uint64_t s_ib_rx_alloc_limit;
192 uint64_t s_ib_rx_credit_updates; 193 uint64_t s_ib_rx_credit_updates;
193 uint64_t s_ib_ack_sent; 194 uint64_t s_ib_ack_sent;
194 uint64_t s_ib_ack_send_failure; 195 uint64_t s_ib_ack_send_failure;
195 uint64_t s_ib_ack_send_delayed; 196 uint64_t s_ib_ack_send_delayed;
196 uint64_t s_ib_ack_send_piggybacked; 197 uint64_t s_ib_ack_send_piggybacked;
197 uint64_t s_ib_ack_received; 198 uint64_t s_ib_ack_received;
198 uint64_t s_ib_rdma_mr_alloc; 199 uint64_t s_ib_rdma_mr_alloc;
199 uint64_t s_ib_rdma_mr_free; 200 uint64_t s_ib_rdma_mr_free;
200 uint64_t s_ib_rdma_mr_used; 201 uint64_t s_ib_rdma_mr_used;
201 uint64_t s_ib_rdma_mr_pool_flush; 202 uint64_t s_ib_rdma_mr_pool_flush;
202 uint64_t s_ib_rdma_mr_pool_wait; 203 uint64_t s_ib_rdma_mr_pool_wait;
203 uint64_t s_ib_rdma_mr_pool_depleted; 204 uint64_t s_ib_rdma_mr_pool_depleted;
204 }; 205 };
205 206
206 extern struct workqueue_struct *rds_ib_wq; 207 extern struct workqueue_struct *rds_ib_wq;
207 208
208 /* 209 /*
209 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h 210 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
210 * doesn't define it. 211 * doesn't define it.
211 */ 212 */
212 static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, 213 static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
213 struct scatterlist *sg, unsigned int sg_dma_len, int direction) 214 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
214 { 215 {
215 unsigned int i; 216 unsigned int i;
216 217
217 for (i = 0; i < sg_dma_len; ++i) { 218 for (i = 0; i < sg_dma_len; ++i) {
218 ib_dma_sync_single_for_cpu(dev, 219 ib_dma_sync_single_for_cpu(dev,
219 ib_sg_dma_address(dev, &sg[i]), 220 ib_sg_dma_address(dev, &sg[i]),
220 ib_sg_dma_len(dev, &sg[i]), 221 ib_sg_dma_len(dev, &sg[i]),
221 direction); 222 direction);
222 } 223 }
223 } 224 }
224 #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu 225 #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
225 226
226 static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, 227 static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
227 struct scatterlist *sg, unsigned int sg_dma_len, int direction) 228 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
228 { 229 {
229 unsigned int i; 230 unsigned int i;
230 231
231 for (i = 0; i < sg_dma_len; ++i) { 232 for (i = 0; i < sg_dma_len; ++i) {
232 ib_dma_sync_single_for_device(dev, 233 ib_dma_sync_single_for_device(dev,
233 ib_sg_dma_address(dev, &sg[i]), 234 ib_sg_dma_address(dev, &sg[i]),
234 ib_sg_dma_len(dev, &sg[i]), 235 ib_sg_dma_len(dev, &sg[i]),
235 direction); 236 direction);
236 } 237 }
237 } 238 }
238 #define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device 239 #define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
239 240
240 241
241 /* ib.c */ 242 /* ib.c */
242 extern struct rds_transport rds_ib_transport; 243 extern struct rds_transport rds_ib_transport;
243 extern void rds_ib_add_one(struct ib_device *device); 244 extern void rds_ib_add_one(struct ib_device *device);
244 extern void rds_ib_remove_one(struct ib_device *device); 245 extern void rds_ib_remove_one(struct ib_device *device);
245 extern struct ib_client rds_ib_client; 246 extern struct ib_client rds_ib_client;
246 247
247 extern unsigned int fmr_pool_size; 248 extern unsigned int fmr_pool_size;
248 extern unsigned int fmr_message_size; 249 extern unsigned int fmr_message_size;
249 extern unsigned int rds_ib_retry_count; 250 extern unsigned int rds_ib_retry_count;
250 251
251 extern spinlock_t ib_nodev_conns_lock; 252 extern spinlock_t ib_nodev_conns_lock;
252 extern struct list_head ib_nodev_conns; 253 extern struct list_head ib_nodev_conns;
253 254
254 /* ib_cm.c */ 255 /* ib_cm.c */
255 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); 256 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
256 void rds_ib_conn_free(void *arg); 257 void rds_ib_conn_free(void *arg);
257 int rds_ib_conn_connect(struct rds_connection *conn); 258 int rds_ib_conn_connect(struct rds_connection *conn);
258 void rds_ib_conn_shutdown(struct rds_connection *conn); 259 void rds_ib_conn_shutdown(struct rds_connection *conn);
259 void rds_ib_state_change(struct sock *sk); 260 void rds_ib_state_change(struct sock *sk);
260 int __init rds_ib_listen_init(void); 261 int __init rds_ib_listen_init(void);
261 void rds_ib_listen_stop(void); 262 void rds_ib_listen_stop(void);
262 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 263 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
263 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 264 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
264 struct rdma_cm_event *event); 265 struct rdma_cm_event *event);
265 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 266 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
266 void rds_ib_cm_connect_complete(struct rds_connection *conn, 267 void rds_ib_cm_connect_complete(struct rds_connection *conn,
267 struct rdma_cm_event *event); 268 struct rdma_cm_event *event);
268 269
269 270
270 #define rds_ib_conn_error(conn, fmt...) \ 271 #define rds_ib_conn_error(conn, fmt...) \
271 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) 272 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
272 273
273 /* ib_rdma.c */ 274 /* ib_rdma.c */
274 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 275 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
275 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 276 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
276 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 277 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
277 void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); 278 void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
278 static inline void rds_ib_destroy_nodev_conns(void) 279 static inline void rds_ib_destroy_nodev_conns(void)
279 { 280 {
280 __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); 281 __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
281 } 282 }
282 static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) 283 static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
283 { 284 {
284 __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); 285 __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
285 } 286 }
286 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); 287 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
287 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); 288 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
288 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); 289 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
289 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 290 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
290 struct rds_sock *rs, u32 *key_ret); 291 struct rds_sock *rs, u32 *key_ret);
291 void rds_ib_sync_mr(void *trans_private, int dir); 292 void rds_ib_sync_mr(void *trans_private, int dir);
292 void rds_ib_free_mr(void *trans_private, int invalidate); 293 void rds_ib_free_mr(void *trans_private, int invalidate);
293 void rds_ib_flush_mrs(void); 294 void rds_ib_flush_mrs(void);
294 295
295 /* ib_recv.c */ 296 /* ib_recv.c */
296 int __init rds_ib_recv_init(void); 297 int __init rds_ib_recv_init(void);
297 void rds_ib_recv_exit(void); 298 void rds_ib_recv_exit(void);
298 int rds_ib_recv(struct rds_connection *conn); 299 int rds_ib_recv(struct rds_connection *conn);
299 int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 300 int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
300 gfp_t page_gfp, int prefill); 301 gfp_t page_gfp, int prefill);
301 void rds_ib_inc_purge(struct rds_incoming *inc); 302 void rds_ib_inc_purge(struct rds_incoming *inc);
302 void rds_ib_inc_free(struct rds_incoming *inc); 303 void rds_ib_inc_free(struct rds_incoming *inc);
303 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 304 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
304 size_t size); 305 size_t size);
305 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); 306 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
307 void rds_ib_recv_tasklet_fn(unsigned long data);
306 void rds_ib_recv_init_ring(struct rds_ib_connection *ic); 308 void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
307 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); 309 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
308 void rds_ib_recv_init_ack(struct rds_ib_connection *ic); 310 void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
309 void rds_ib_attempt_ack(struct rds_ib_connection *ic); 311 void rds_ib_attempt_ack(struct rds_ib_connection *ic);
310 void rds_ib_ack_send_complete(struct rds_ib_connection *ic); 312 void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
311 u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); 313 u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
312 314
313 /* ib_ring.c */ 315 /* ib_ring.c */
314 void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); 316 void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
315 void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); 317 void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
316 u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); 318 u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
317 void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); 319 void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
318 void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); 320 void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
319 int rds_ib_ring_empty(struct rds_ib_work_ring *ring); 321 int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
320 int rds_ib_ring_low(struct rds_ib_work_ring *ring); 322 int rds_ib_ring_low(struct rds_ib_work_ring *ring);
321 u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); 323 u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
322 u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); 324 u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
323 extern wait_queue_head_t rds_ib_ring_empty_wait; 325 extern wait_queue_head_t rds_ib_ring_empty_wait;
324 326
325 /* ib_send.c */ 327 /* ib_send.c */
326 void rds_ib_xmit_complete(struct rds_connection *conn); 328 void rds_ib_xmit_complete(struct rds_connection *conn);
327 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, 329 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
328 unsigned int hdr_off, unsigned int sg, unsigned int off); 330 unsigned int hdr_off, unsigned int sg, unsigned int off);
329 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); 331 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
330 void rds_ib_send_init_ring(struct rds_ib_connection *ic); 332 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
331 void rds_ib_send_clear_ring(struct rds_ib_connection *ic); 333 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
332 int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 334 int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
333 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); 335 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
334 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); 336 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
335 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, 337 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
336 u32 *adv_credits, int need_posted, int max_posted); 338 u32 *adv_credits, int need_posted, int max_posted);
337 339
338 /* ib_stats.c */ 340 /* ib_stats.c */
339 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); 341 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
340 #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) 342 #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
341 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, 343 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
342 unsigned int avail); 344 unsigned int avail);
343 345
344 /* ib_sysctl.c */ 346 /* ib_sysctl.c */
345 int __init rds_ib_sysctl_init(void); 347 int __init rds_ib_sysctl_init(void);
346 void rds_ib_sysctl_exit(void); 348 void rds_ib_sysctl_exit(void);
347 extern unsigned long rds_ib_sysctl_max_send_wr; 349 extern unsigned long rds_ib_sysctl_max_send_wr;
348 extern unsigned long rds_ib_sysctl_max_recv_wr; 350 extern unsigned long rds_ib_sysctl_max_recv_wr;
349 extern unsigned long rds_ib_sysctl_max_unsig_wrs; 351 extern unsigned long rds_ib_sysctl_max_unsig_wrs;
350 extern unsigned long rds_ib_sysctl_max_unsig_bytes; 352 extern unsigned long rds_ib_sysctl_max_unsig_bytes;
351 extern unsigned long rds_ib_sysctl_max_recv_allocation; 353 extern unsigned long rds_ib_sysctl_max_recv_allocation;
352 extern unsigned int rds_ib_sysctl_flow_control; 354 extern unsigned int rds_ib_sysctl_flow_control;
353 extern ctl_table rds_ib_sysctl_table[]; 355 extern ctl_table rds_ib_sysctl_table[];
354 356
355 /* 357 /*
356 * Helper functions for getting/setting the header and data SGEs in 358 * Helper functions for getting/setting the header and data SGEs in
357 * RDS packets (not RDMA) 359 * RDS packets (not RDMA)
358 * 360 *
359 * From version 3.1 onwards, header is in front of data in the sge. 361 * From version 3.1 onwards, header is in front of data in the sge.
360 */ 362 */
361 static inline struct ib_sge * 363 static inline struct ib_sge *
362 rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) 364 rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
363 { 365 {
364 if (ic->conn->c_version > RDS_PROTOCOL_3_0) 366 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
365 return &sge[0]; 367 return &sge[0];
366 else 368 else
367 return &sge[1]; 369 return &sge[1];
368 } 370 }
369 371
370 static inline struct ib_sge * 372 static inline struct ib_sge *
371 rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) 373 rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
372 { 374 {
373 if (ic->conn->c_version > RDS_PROTOCOL_3_0) 375 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
374 return &sge[1]; 376 return &sge[1];
375 else 377 else
376 return &sge[0]; 378 return &sge[0];
377 } 379 }
378 380
379 #endif 381 #endif
380 382
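
The two additions above (the struct tasklet_struct i_recv_tasklet field and the rds_ib_recv_tasklet_fn() prototype) are wired together when the connection is allocated. The hookup in rds_ib_conn_alloc(), in ib_cm.c hunks not reproduced in this excerpt, is presumably along these lines (sketch):

	/* bind the per-connection tasklet to the new handler,
	 * passing the rds_ib_connection as its argument */
	tasklet_init(&ic->i_recv_tasklet,
		     rds_ib_recv_tasklet_fn,
		     (unsigned long) ic);

The iWARP transport gets the same treatment in its iw_* counterparts, which is where the rest of the six changed files come from.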
1 /* 1 /*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006 Oracle. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file 6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the 7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below: 8 * OpenIB.org BSD license below:
9 * 9 *
10 * Redistribution and use in source and binary forms, with or 10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following 11 * without modification, are permitted provided that the following
12 * conditions are met: 12 * conditions are met:
13 * 13 *
14 * - Redistributions of source code must retain the above 14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following 15 * copyright notice, this list of conditions and the following
16 * disclaimer. 16 * disclaimer.
17 * 17 *
18 * - Redistributions in binary form must reproduce the above 18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following 19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials 20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution. 21 * provided with the distribution.
22 * 22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE. 30 * SOFTWARE.
31 * 31 *
32 */ 32 */
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/in.h> 34 #include <linux/in.h>
35 #include <linux/vmalloc.h> 35 #include <linux/vmalloc.h>
36 36
37 #include "rds.h" 37 #include "rds.h"
38 #include "ib.h" 38 #include "ib.h"
39 39
40 /* 40 /*
41 * Set the selected protocol version 41 * Set the selected protocol version
42 */ 42 */
43 static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) 43 static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
44 { 44 {
45 conn->c_version = version; 45 conn->c_version = version;
46 } 46 }
47 47
48 /* 48 /*
49 * Set up flow control 49 * Set up flow control
50 */ 50 */
51 static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) 51 static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
52 { 52 {
53 struct rds_ib_connection *ic = conn->c_transport_data; 53 struct rds_ib_connection *ic = conn->c_transport_data;
54 54
55 if (rds_ib_sysctl_flow_control && credits != 0) { 55 if (rds_ib_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */ 56 /* We're doing flow control */
57 ic->i_flowctl = 1; 57 ic->i_flowctl = 1;
58 rds_ib_send_add_credits(conn, credits); 58 rds_ib_send_add_credits(conn, credits);
59 } else { 59 } else {
60 ic->i_flowctl = 0; 60 ic->i_flowctl = 0;
61 } 61 }
62 } 62 }
63 63
64 /* 64 /*
65 * Tune RNR behavior. Without flow control, we use a rather 65 * Tune RNR behavior. Without flow control, we use a rather
66 * low timeout, but not the absolute minimum - this should 66 * low timeout, but not the absolute minimum - this should
67 * be tunable. 67 * be tunable.
68 * 68 *
69 * We already set the RNR retry count to 7 (which is the 69 * We already set the RNR retry count to 7 (which is the
70 * smallest infinite number :-) above. 70 * smallest infinite number :-) above.
71 * If flow control is off, we want to change this back to 0 71 * If flow control is off, we want to change this back to 0
72 * so that we learn quickly when our credit accounting is 72 * so that we learn quickly when our credit accounting is
73 * buggy. 73 * buggy.
74 * 74 *
75 * Caller passes in a qp_attr pointer - don't waste stack space 75 * Caller passes in a qp_attr pointer - don't waste stack space
76 * by allocating this twice. 76 * by allocating this twice.
77 */ 77 */
78 static void 78 static void
79 rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) 79 rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
80 { 80 {
81 int ret; 81 int ret;
82 82
83 attr->min_rnr_timer = IB_RNR_TIMER_000_32; 83 attr->min_rnr_timer = IB_RNR_TIMER_000_32;
84 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); 84 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
85 if (ret) 85 if (ret)
86 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); 86 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
87 } 87 }
88 88
89 /* 89 /*
90 * Connection established. 90 * Connection established.
91 * We get here for both outgoing and incoming connections. 91 * We get here for both outgoing and incoming connections.
92 */ 92 */
93 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) 93 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
94 { 94 {
95 const struct rds_ib_connect_private *dp = NULL; 95 const struct rds_ib_connect_private *dp = NULL;
96 struct rds_ib_connection *ic = conn->c_transport_data; 96 struct rds_ib_connection *ic = conn->c_transport_data;
97 struct rds_ib_device *rds_ibdev; 97 struct rds_ib_device *rds_ibdev;
98 struct ib_qp_attr qp_attr; 98 struct ib_qp_attr qp_attr;
99 int err; 99 int err;
100 100
101 if (event->param.conn.private_data_len >= sizeof(*dp)) { 101 if (event->param.conn.private_data_len >= sizeof(*dp)) {
102 dp = event->param.conn.private_data; 102 dp = event->param.conn.private_data;
103 103
104 /* make sure it isn't empty data */ 104 /* make sure it isn't empty data */
105 if (dp->dp_protocol_major) { 105 if (dp->dp_protocol_major) {
106 rds_ib_set_protocol(conn, 106 rds_ib_set_protocol(conn,
107 RDS_PROTOCOL(dp->dp_protocol_major, 107 RDS_PROTOCOL(dp->dp_protocol_major,
108 dp->dp_protocol_minor)); 108 dp->dp_protocol_minor));
109 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 109 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
110 } 110 }
111 } 111 }
112 112
113 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", 113 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
114 &conn->c_faddr, 114 &conn->c_faddr,
115 RDS_PROTOCOL_MAJOR(conn->c_version), 115 RDS_PROTOCOL_MAJOR(conn->c_version),
116 RDS_PROTOCOL_MINOR(conn->c_version), 116 RDS_PROTOCOL_MINOR(conn->c_version),
117 ic->i_flowctl ? ", flow control" : ""); 117 ic->i_flowctl ? ", flow control" : "");
118 118
119 /* 119 /*
120 * Init rings and fill recv. this needs to wait until protocol negotiation 120 * Init rings and fill recv. this needs to wait until protocol negotiation
121 * is complete, since ring layout is different from 3.0 to 3.1. 121 * is complete, since ring layout is different from 3.0 to 3.1.
122 */ 122 */
123 rds_ib_send_init_ring(ic); 123 rds_ib_send_init_ring(ic);
124 rds_ib_recv_init_ring(ic); 124 rds_ib_recv_init_ring(ic);
125 /* Post receive buffers - as a side effect, this will update 125 /* Post receive buffers - as a side effect, this will update
126 * the posted credit count. */ 126 * the posted credit count. */
127 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); 127 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
128 128
129 /* Tune RNR behavior */ 129 /* Tune RNR behavior */
130 rds_ib_tune_rnr(ic, &qp_attr); 130 rds_ib_tune_rnr(ic, &qp_attr);
131 131
132 qp_attr.qp_state = IB_QPS_RTS; 132 qp_attr.qp_state = IB_QPS_RTS;
133 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); 133 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
134 if (err) 134 if (err)
135 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 135 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
136 136
137 /* update ib_device with this local ipaddr & conn */ 137 /* update ib_device with this local ipaddr & conn */
138 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 138 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
139 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); 139 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
140 if (err) 140 if (err)
141 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); 141 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
142 rds_ib_add_conn(rds_ibdev, conn); 142 rds_ib_add_conn(rds_ibdev, conn);
143 143
144 /* If the peer gave us the last packet it saw, process this as if 144 /* If the peer gave us the last packet it saw, process this as if
145 * we had received a regular ACK. */ 145 * we had received a regular ACK. */
146 if (dp && dp->dp_ack_seq) 146 if (dp && dp->dp_ack_seq)
147 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 147 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
148 148
149 rds_connect_complete(conn); 149 rds_connect_complete(conn);
150 } 150 }
151 151
152 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 152 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
153 struct rdma_conn_param *conn_param, 153 struct rdma_conn_param *conn_param,
154 struct rds_ib_connect_private *dp, 154 struct rds_ib_connect_private *dp,
155 u32 protocol_version) 155 u32 protocol_version)
156 { 156 {
157 memset(conn_param, 0, sizeof(struct rdma_conn_param)); 157 memset(conn_param, 0, sizeof(struct rdma_conn_param));
158 /* XXX tune these? */ 158 /* XXX tune these? */
159 conn_param->responder_resources = 1; 159 conn_param->responder_resources = 1;
160 conn_param->initiator_depth = 1; 160 conn_param->initiator_depth = 1;
161 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); 161 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
162 conn_param->rnr_retry_count = 7; 162 conn_param->rnr_retry_count = 7;
163 163
164 if (dp) { 164 if (dp) {
165 struct rds_ib_connection *ic = conn->c_transport_data; 165 struct rds_ib_connection *ic = conn->c_transport_data;
166 166
167 memset(dp, 0, sizeof(*dp)); 167 memset(dp, 0, sizeof(*dp));
168 dp->dp_saddr = conn->c_laddr; 168 dp->dp_saddr = conn->c_laddr;
169 dp->dp_daddr = conn->c_faddr; 169 dp->dp_daddr = conn->c_faddr;
170 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 170 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
171 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 171 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
172 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 172 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
173 dp->dp_ack_seq = rds_ib_piggyb_ack(ic); 173 dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
174 174
175 /* Advertise flow control */ 175 /* Advertise flow control */
176 if (ic->i_flowctl) { 176 if (ic->i_flowctl) {
177 unsigned int credits; 177 unsigned int credits;
178 178
179 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); 179 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
180 dp->dp_credit = cpu_to_be32(credits); 180 dp->dp_credit = cpu_to_be32(credits);
181 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); 181 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
182 } 182 }
183 183
184 conn_param->private_data = dp; 184 conn_param->private_data = dp;
185 conn_param->private_data_len = sizeof(*dp); 185 conn_param->private_data_len = sizeof(*dp);
186 } 186 }
187 } 187 }
188 188
189 static void rds_ib_cq_event_handler(struct ib_event *event, void *data) 189 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
190 { 190 {
191 rdsdebug("event %u data %p\n", event->event, data); 191 rdsdebug("event %u data %p\n", event->event, data);
192 } 192 }
193 193
194 static void rds_ib_qp_event_handler(struct ib_event *event, void *data) 194 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
195 { 195 {
196 struct rds_connection *conn = data; 196 struct rds_connection *conn = data;
197 struct rds_ib_connection *ic = conn->c_transport_data; 197 struct rds_ib_connection *ic = conn->c_transport_data;
198 198
199 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); 199 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
200 200
201 switch (event->event) { 201 switch (event->event) {
202 case IB_EVENT_COMM_EST: 202 case IB_EVENT_COMM_EST:
203 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 203 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
204 break; 204 break;
205 default: 205 default:
206 rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u " 206 rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u "
207 "- connection %pI4->%pI4, reconnecting\n", 207 "- connection %pI4->%pI4, reconnecting\n",
208 event->event, &conn->c_laddr, &conn->c_faddr); 208 event->event, &conn->c_laddr, &conn->c_faddr);
209 break; 209 break;
210 } 210 }
211 } 211 }
212 212
213 /* 213 /*
214 * This needs to be very careful to not leave IS_ERR pointers around for 214 * This needs to be very careful to not leave IS_ERR pointers around for
215 * cleanup to trip over. 215 * cleanup to trip over.
216 */ 216 */
217 static int rds_ib_setup_qp(struct rds_connection *conn) 217 static int rds_ib_setup_qp(struct rds_connection *conn)
218 { 218 {
219 struct rds_ib_connection *ic = conn->c_transport_data; 219 struct rds_ib_connection *ic = conn->c_transport_data;
220 struct ib_device *dev = ic->i_cm_id->device; 220 struct ib_device *dev = ic->i_cm_id->device;
221 struct ib_qp_init_attr attr; 221 struct ib_qp_init_attr attr;
222 struct rds_ib_device *rds_ibdev; 222 struct rds_ib_device *rds_ibdev;
223 int ret; 223 int ret;
224 224
225 /* rds_ib_add_one creates a rds_ib_device object per IB device, 225 /* rds_ib_add_one creates a rds_ib_device object per IB device,
226 * and allocates a protection domain, memory range and FMR pool 226 * and allocates a protection domain, memory range and FMR pool
227 * for each. If that fails for any reason, it will not register 227 * for each. If that fails for any reason, it will not register
228 * the rds_ibdev at all. 228 * the rds_ibdev at all.
229 */ 229 */
230 rds_ibdev = ib_get_client_data(dev, &rds_ib_client); 230 rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
231 if (rds_ibdev == NULL) { 231 if (rds_ibdev == NULL) {
232 if (printk_ratelimit()) 232 if (printk_ratelimit())
233 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", 233 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
234 dev->name); 234 dev->name);
235 return -EOPNOTSUPP; 235 return -EOPNOTSUPP;
236 } 236 }
237 237
238 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 238 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
239 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 239 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
240 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) 240 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
241 rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); 241 rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
242 242
243 /* Protection domain and memory range */ 243 /* Protection domain and memory range */
244 ic->i_pd = rds_ibdev->pd; 244 ic->i_pd = rds_ibdev->pd;
245 ic->i_mr = rds_ibdev->mr; 245 ic->i_mr = rds_ibdev->mr;
246 246
247 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, 247 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
248 rds_ib_cq_event_handler, conn, 248 rds_ib_cq_event_handler, conn,
249 ic->i_send_ring.w_nr + 1, 0); 249 ic->i_send_ring.w_nr + 1, 0);
250 if (IS_ERR(ic->i_send_cq)) { 250 if (IS_ERR(ic->i_send_cq)) {
251 ret = PTR_ERR(ic->i_send_cq); 251 ret = PTR_ERR(ic->i_send_cq);
252 ic->i_send_cq = NULL; 252 ic->i_send_cq = NULL;
253 rdsdebug("ib_create_cq send failed: %d\n", ret); 253 rdsdebug("ib_create_cq send failed: %d\n", ret);
254 goto out; 254 goto out;
255 } 255 }
256 256
257 ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, 257 ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
258 rds_ib_cq_event_handler, conn, 258 rds_ib_cq_event_handler, conn,
259 ic->i_recv_ring.w_nr, 0); 259 ic->i_recv_ring.w_nr, 0);
260 if (IS_ERR(ic->i_recv_cq)) { 260 if (IS_ERR(ic->i_recv_cq)) {
261 ret = PTR_ERR(ic->i_recv_cq); 261 ret = PTR_ERR(ic->i_recv_cq);
262 ic->i_recv_cq = NULL; 262 ic->i_recv_cq = NULL;
263 rdsdebug("ib_create_cq recv failed: %d\n", ret); 263 rdsdebug("ib_create_cq recv failed: %d\n", ret);
264 goto out; 264 goto out;
265 } 265 }
266 266
267 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 267 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
268 if (ret) { 268 if (ret) {
269 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 269 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
270 goto out; 270 goto out;
271 } 271 }
272 272
273 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 273 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
274 if (ret) { 274 if (ret) {
275 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); 275 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
276 goto out; 276 goto out;
277 } 277 }
278 278
279 /* XXX negotiate max send/recv with remote? */ 279 /* XXX negotiate max send/recv with remote? */
280 memset(&attr, 0, sizeof(attr)); 280 memset(&attr, 0, sizeof(attr));
281 attr.event_handler = rds_ib_qp_event_handler; 281 attr.event_handler = rds_ib_qp_event_handler;
282 attr.qp_context = conn; 282 attr.qp_context = conn;
283 /* + 1 to allow for the single ack message */ 283 /* + 1 to allow for the single ack message */
284 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; 284 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
285 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; 285 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
286 attr.cap.max_send_sge = rds_ibdev->max_sge; 286 attr.cap.max_send_sge = rds_ibdev->max_sge;
287 attr.cap.max_recv_sge = RDS_IB_RECV_SGE; 287 attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
288 attr.sq_sig_type = IB_SIGNAL_REQ_WR; 288 attr.sq_sig_type = IB_SIGNAL_REQ_WR;
289 attr.qp_type = IB_QPT_RC; 289 attr.qp_type = IB_QPT_RC;
290 attr.send_cq = ic->i_send_cq; 290 attr.send_cq = ic->i_send_cq;
291 attr.recv_cq = ic->i_recv_cq; 291 attr.recv_cq = ic->i_recv_cq;
292 292
293 /* 293 /*
294 * XXX this can fail if max_*_wr is too large? Are we supposed 294 * XXX this can fail if max_*_wr is too large? Are we supposed
295 * to back off until we get a value that the hardware can support? 295 * to back off until we get a value that the hardware can support?
296 */ 296 */
297 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 297 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
298 if (ret) { 298 if (ret) {
299 rdsdebug("rdma_create_qp failed: %d\n", ret); 299 rdsdebug("rdma_create_qp failed: %d\n", ret);
300 goto out; 300 goto out;
301 } 301 }
302 302
303 ic->i_send_hdrs = ib_dma_alloc_coherent(dev, 303 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
304 ic->i_send_ring.w_nr * 304 ic->i_send_ring.w_nr *
305 sizeof(struct rds_header), 305 sizeof(struct rds_header),
306 &ic->i_send_hdrs_dma, GFP_KERNEL); 306 &ic->i_send_hdrs_dma, GFP_KERNEL);
307 if (ic->i_send_hdrs == NULL) { 307 if (ic->i_send_hdrs == NULL) {
308 ret = -ENOMEM; 308 ret = -ENOMEM;
309 rdsdebug("ib_dma_alloc_coherent send failed\n"); 309 rdsdebug("ib_dma_alloc_coherent send failed\n");
310 goto out; 310 goto out;
311 } 311 }
312 312
313 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, 313 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
314 ic->i_recv_ring.w_nr * 314 ic->i_recv_ring.w_nr *
315 sizeof(struct rds_header), 315 sizeof(struct rds_header),
316 &ic->i_recv_hdrs_dma, GFP_KERNEL); 316 &ic->i_recv_hdrs_dma, GFP_KERNEL);
317 if (ic->i_recv_hdrs == NULL) { 317 if (ic->i_recv_hdrs == NULL) {
318 ret = -ENOMEM; 318 ret = -ENOMEM;
319 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 319 rdsdebug("ib_dma_alloc_coherent recv failed\n");
320 goto out; 320 goto out;
321 } 321 }
322 322
323 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 323 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
324 &ic->i_ack_dma, GFP_KERNEL); 324 &ic->i_ack_dma, GFP_KERNEL);
325 if (ic->i_ack == NULL) { 325 if (ic->i_ack == NULL) {
326 ret = -ENOMEM; 326 ret = -ENOMEM;
327 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 327 rdsdebug("ib_dma_alloc_coherent ack failed\n");
328 goto out; 328 goto out;
329 } 329 }
330 330
331 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 331 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
332 if (ic->i_sends == NULL) { 332 if (ic->i_sends == NULL) {
333 ret = -ENOMEM; 333 ret = -ENOMEM;
334 rdsdebug("send allocation failed\n"); 334 rdsdebug("send allocation failed\n");
335 goto out; 335 goto out;
336 } 336 }
337 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 337 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
338 338
339 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); 339 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
340 if (ic->i_recvs == NULL) { 340 if (ic->i_recvs == NULL) {
341 ret = -ENOMEM; 341 ret = -ENOMEM;
342 rdsdebug("recv allocation failed\n"); 342 rdsdebug("recv allocation failed\n");
343 goto out; 343 goto out;
344 } 344 }
345 memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); 345 memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
346 346
347 rds_ib_recv_init_ack(ic); 347 rds_ib_recv_init_ack(ic);
348 348
349 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, 349 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
350 ic->i_send_cq, ic->i_recv_cq); 350 ic->i_send_cq, ic->i_recv_cq);
351 351
352 out: 352 out:
353 return ret; 353 return ret;
354 } 354 }
355 355
356 static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) 356 static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
357 { 357 {
358 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 358 const struct rds_ib_connect_private *dp = event->param.conn.private_data;
359 u16 common; 359 u16 common;
360 u32 version = 0; 360 u32 version = 0;
361 361
362 /* 362 /*
363 * rdma_cm private data is odd - when there is any private data in the 363 * rdma_cm private data is odd - when there is any private data in the
364 * request, we will be given a pretty large buffer without telling us the 364 * request, we will be given a pretty large buffer without telling us the
365 * original size. The only way to tell the difference is by looking at 365 * original size. The only way to tell the difference is by looking at
366 * the contents, which are initialized to zero. 366 * the contents, which are initialized to zero.
367 * If the protocol version fields aren't set, this is a connection attempt 367 * If the protocol version fields aren't set, this is a connection attempt
368 * from an older version. This could be 3.0 or 2.0 - we can't tell. 368 * from an older version. This could be 3.0 or 2.0 - we can't tell.
369 * We really should have changed this for OFED 1.3 :-( 369 * We really should have changed this for OFED 1.3 :-(
370 */ 370 */
371 371
372 /* Be paranoid. RDS always has privdata */ 372 /* Be paranoid. RDS always has privdata */
373 if (!event->param.conn.private_data_len) { 373 if (!event->param.conn.private_data_len) {
374 printk(KERN_NOTICE "RDS incoming connection has no private data, " 374 printk(KERN_NOTICE "RDS incoming connection has no private data, "
375 "rejecting\n"); 375 "rejecting\n");
376 return 0; 376 return 0;
377 } 377 }
378 378
379 /* Even if len is crap *now* I still want to check it. -ASG */ 379 /* Even if len is crap *now* I still want to check it. -ASG */
380 if (event->param.conn.private_data_len < sizeof (*dp) 380 if (event->param.conn.private_data_len < sizeof (*dp)
381 || dp->dp_protocol_major == 0) 381 || dp->dp_protocol_major == 0)
382 return RDS_PROTOCOL_3_0; 382 return RDS_PROTOCOL_3_0;
383 383
384 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; 384 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
385 if (dp->dp_protocol_major == 3 && common) { 385 if (dp->dp_protocol_major == 3 && common) {
386 version = RDS_PROTOCOL_3_0; 386 version = RDS_PROTOCOL_3_0;
387 while ((common >>= 1) != 0) 387 while ((common >>= 1) != 0)
388 version++; 388 version++;
389 } else if (printk_ratelimit()) { 389 } else if (printk_ratelimit()) {
390 printk(KERN_NOTICE "RDS: Connection from %pI4 using " 390 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
391 "incompatible protocol version %u.%u\n", 391 "incompatible protocol version %u.%u\n",
392 &dp->dp_saddr, 392 &dp->dp_saddr,
393 dp->dp_protocol_major, 393 dp->dp_protocol_major,
394 dp->dp_protocol_minor); 394 dp->dp_protocol_minor);
395 } 395 }
396 return version; 396 return version;
397 } 397 }
398 398
399 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 399 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
400 struct rdma_cm_event *event) 400 struct rdma_cm_event *event)
401 { 401 {
402 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; 402 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
403 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; 403 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
404 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 404 const struct rds_ib_connect_private *dp = event->param.conn.private_data;
405 struct rds_ib_connect_private dp_rep; 405 struct rds_ib_connect_private dp_rep;
406 struct rds_connection *conn = NULL; 406 struct rds_connection *conn = NULL;
407 struct rds_ib_connection *ic = NULL; 407 struct rds_ib_connection *ic = NULL;
408 struct rdma_conn_param conn_param; 408 struct rdma_conn_param conn_param;
409 u32 version; 409 u32 version;
410 int err, destroy = 1; 410 int err, destroy = 1;
411 411
412 /* Check whether the remote protocol version matches ours. */ 412 /* Check whether the remote protocol version matches ours. */
413 version = rds_ib_protocol_compatible(event); 413 version = rds_ib_protocol_compatible(event);
414 if (!version) 414 if (!version)
415 goto out; 415 goto out;
416 416
417 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " 417 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
418 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, 418 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
419 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 419 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
420 (unsigned long long)be64_to_cpu(lguid), 420 (unsigned long long)be64_to_cpu(lguid),
421 (unsigned long long)be64_to_cpu(fguid)); 421 (unsigned long long)be64_to_cpu(fguid));
422 422
423 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, 423 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
424 GFP_KERNEL); 424 GFP_KERNEL);
425 if (IS_ERR(conn)) { 425 if (IS_ERR(conn)) {
426 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 426 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
427 conn = NULL; 427 conn = NULL;
428 goto out; 428 goto out;
429 } 429 }
430 430
431 /* 431 /*
432 * The connection request may occur while the 432 * The connection request may occur while the
433 * previous connection exists, e.g. in case of failover. 433 * previous connection exists, e.g. in case of failover.
434 * But as connections may be initiated simultaneously 434 * But as connections may be initiated simultaneously
435 * by both hosts, we have a random backoff mechanism - 435 * by both hosts, we have a random backoff mechanism -
436 * see the comment above rds_queue_reconnect() 436 * see the comment above rds_queue_reconnect()
437 */ 437 */
438 mutex_lock(&conn->c_cm_lock); 438 mutex_lock(&conn->c_cm_lock);
439 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { 439 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
440 if (rds_conn_state(conn) == RDS_CONN_UP) { 440 if (rds_conn_state(conn) == RDS_CONN_UP) {
441 rdsdebug("incoming connect while connecting\n"); 441 rdsdebug("incoming connect while connecting\n");
442 rds_conn_drop(conn); 442 rds_conn_drop(conn);
443 rds_ib_stats_inc(s_ib_listen_closed_stale); 443 rds_ib_stats_inc(s_ib_listen_closed_stale);
444 } else 444 } else
445 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { 445 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
446 /* Wait and see - our connect may still be succeeding */ 446 /* Wait and see - our connect may still be succeeding */
447 rds_ib_stats_inc(s_ib_connect_raced); 447 rds_ib_stats_inc(s_ib_connect_raced);
448 } 448 }
449 mutex_unlock(&conn->c_cm_lock); 449 mutex_unlock(&conn->c_cm_lock);
450 goto out; 450 goto out;
451 } 451 }
452 452
453 ic = conn->c_transport_data; 453 ic = conn->c_transport_data;
454 454
455 rds_ib_set_protocol(conn, version); 455 rds_ib_set_protocol(conn, version);
456 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 456 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
457 457
458 /* If the peer gave us the last packet it saw, process this as if 458 /* If the peer gave us the last packet it saw, process this as if
459 * we had received a regular ACK. */ 459 * we had received a regular ACK. */
460 if (dp->dp_ack_seq) 460 if (dp->dp_ack_seq)
461 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 461 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
462 462
463 BUG_ON(cm_id->context); 463 BUG_ON(cm_id->context);
464 BUG_ON(ic->i_cm_id); 464 BUG_ON(ic->i_cm_id);
465 465
466 ic->i_cm_id = cm_id; 466 ic->i_cm_id = cm_id;
467 cm_id->context = conn; 467 cm_id->context = conn;
468 468
469 /* We got halfway through setting up the ib_connection, if we 469 /* We got halfway through setting up the ib_connection, if we
470 * fail now, we have to take the long route out of this mess. */ 470 * fail now, we have to take the long route out of this mess. */
471 destroy = 0; 471 destroy = 0;
472 472
473 err = rds_ib_setup_qp(conn); 473 err = rds_ib_setup_qp(conn);
474 if (err) { 474 if (err) {
475 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); 475 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
476 goto out; 476 goto out;
477 } 477 }
478 478
479 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 479 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
480 480
481 /* rdma_accept() calls rdma_reject() internally if it fails */ 481 /* rdma_accept() calls rdma_reject() internally if it fails */
482 err = rdma_accept(cm_id, &conn_param); 482 err = rdma_accept(cm_id, &conn_param);
483 mutex_unlock(&conn->c_cm_lock); 483 mutex_unlock(&conn->c_cm_lock);
484 if (err) { 484 if (err) {
485 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); 485 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
486 goto out; 486 goto out;
487 } 487 }
488 488
489 return 0; 489 return 0;
490 490
491 out: 491 out:
492 rdma_reject(cm_id, NULL, 0); 492 rdma_reject(cm_id, NULL, 0);
493 return destroy; 493 return destroy;
494 } 494 }
495 495
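The race handling above only works because rds_conn_transition() acts as an atomic compare-and-swap on the connection state, so exactly one of the two simultaneous connect attempts can claim the DOWN -> CONNECTING transition. A minimal sketch of that guard, with illustrative names and a plain atomic_t standing in for the real connection state (the actual helper lives in the RDS core and may differ in detail):

#include <linux/atomic.h>

/* Illustrative states only; the real RDS_CONN_* constants live in rds.h. */
enum { EX_CONN_DOWN, EX_CONN_CONNECTING, EX_CONN_UP };

/* Returns non-zero only for the one caller that wins the race. */
static int example_conn_transition(atomic_t *state, int old, int new)
{
	return atomic_cmpxchg(state, old, new) == old;
}

Whichever path loses either drops a stale established connection or simply waits for its own connect to complete, exactly as the two branches above do.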
496 496
497 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 497 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
498 { 498 {
499 struct rds_connection *conn = cm_id->context; 499 struct rds_connection *conn = cm_id->context;
500 struct rds_ib_connection *ic = conn->c_transport_data; 500 struct rds_ib_connection *ic = conn->c_transport_data;
501 struct rdma_conn_param conn_param; 501 struct rdma_conn_param conn_param;
502 struct rds_ib_connect_private dp; 502 struct rds_ib_connect_private dp;
503 int ret; 503 int ret;
504 504
505 /* If the peer doesn't do protocol negotiation, we must 505 /* If the peer doesn't do protocol negotiation, we must
506 * default to RDSv3.0 */ 506 * default to RDSv3.0 */
507 rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); 507 rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
508 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ 508 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
509 509
510 ret = rds_ib_setup_qp(conn); 510 ret = rds_ib_setup_qp(conn);
511 if (ret) { 511 if (ret) {
512 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); 512 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
513 goto out; 513 goto out;
514 } 514 }
515 515
516 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); 516 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
517 517
518 ret = rdma_connect(cm_id, &conn_param); 518 ret = rdma_connect(cm_id, &conn_param);
519 if (ret) 519 if (ret)
520 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); 520 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
521 521
522 out: 522 out:
523 /* Beware - returning non-zero tells the rdma_cm to destroy 523 /* Beware - returning non-zero tells the rdma_cm to destroy
524 * the cm_id. We should certainly not do it as long as we still 524 * the cm_id. We should certainly not do it as long as we still
525 * "own" the cm_id. */ 525 * "own" the cm_id. */
526 if (ret) { 526 if (ret) {
527 if (ic->i_cm_id == cm_id) 527 if (ic->i_cm_id == cm_id)
528 ret = 0; 528 ret = 0;
529 } 529 }
530 return ret; 530 return ret;
531 } 531 }
532 532
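The comment above is about the rdma_cm ownership contract: a non-zero return from the event handler passed to rdma_create_id() (here rds_rdma_cm_event_handler, installed below in rds_ib_conn_connect()) makes the CM destroy the cm_id, so a handler may only return non-zero for an id it no longer considers its own. A hypothetical handler, not the RDS one, showing the convention:

/* Illustrative only: the return value is the whole point.  Returning
 * non-zero tells the rdma_cm to destroy cm_id, which is only safe once
 * this code no longer owns the id. */
static int example_cm_event_handler(struct rdma_cm_id *cm_id,
				    struct rdma_cm_event *event)
{
	struct rds_connection *conn = cm_id->context;

	if (event->event == RDMA_CM_EVENT_DISCONNECTED)
		rds_conn_drop(conn);	/* tear down through our own path... */

	return 0;			/* ...but never ask the CM to free the id */
}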
533 int rds_ib_conn_connect(struct rds_connection *conn) 533 int rds_ib_conn_connect(struct rds_connection *conn)
534 { 534 {
535 struct rds_ib_connection *ic = conn->c_transport_data; 535 struct rds_ib_connection *ic = conn->c_transport_data;
536 struct sockaddr_in src, dest; 536 struct sockaddr_in src, dest;
537 int ret; 537 int ret;
538 538
539 /* XXX I wonder what effect the port space has */ 539 /* XXX I wonder what effect the port space has */
540 /* delegate cm event handler to rdma_transport */ 540 /* delegate cm event handler to rdma_transport */
541 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, 541 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
542 RDMA_PS_TCP); 542 RDMA_PS_TCP);
543 if (IS_ERR(ic->i_cm_id)) { 543 if (IS_ERR(ic->i_cm_id)) {
544 ret = PTR_ERR(ic->i_cm_id); 544 ret = PTR_ERR(ic->i_cm_id);
545 ic->i_cm_id = NULL; 545 ic->i_cm_id = NULL;
546 rdsdebug("rdma_create_id() failed: %d\n", ret); 546 rdsdebug("rdma_create_id() failed: %d\n", ret);
547 goto out; 547 goto out;
548 } 548 }
549 549
550 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); 550 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
551 551
552 src.sin_family = AF_INET; 552 src.sin_family = AF_INET;
553 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 553 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
554 src.sin_port = (__force u16)htons(0); 554 src.sin_port = (__force u16)htons(0);
555 555
556 dest.sin_family = AF_INET; 556 dest.sin_family = AF_INET;
557 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 557 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
558 dest.sin_port = (__force u16)htons(RDS_PORT); 558 dest.sin_port = (__force u16)htons(RDS_PORT);
559 559
560 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 560 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
561 (struct sockaddr *)&dest, 561 (struct sockaddr *)&dest,
562 RDS_RDMA_RESOLVE_TIMEOUT_MS); 562 RDS_RDMA_RESOLVE_TIMEOUT_MS);
563 if (ret) { 563 if (ret) {
564 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, 564 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
565 ret); 565 ret);
566 rdma_destroy_id(ic->i_cm_id); 566 rdma_destroy_id(ic->i_cm_id);
567 ic->i_cm_id = NULL; 567 ic->i_cm_id = NULL;
568 } 568 }
569 569
570 out: 570 out:
571 return ret; 571 return ret;
572 } 572 }
573 573
574 /* 574 /*
575 * This is so careful about only cleaning up resources that were built up 575 * This is so careful about only cleaning up resources that were built up
576 * so that it can be called at any point during startup. In fact it 576 * so that it can be called at any point during startup. In fact it
577 * can be called multiple times for a given connection. 577 * can be called multiple times for a given connection.
578 */ 578 */
579 void rds_ib_conn_shutdown(struct rds_connection *conn) 579 void rds_ib_conn_shutdown(struct rds_connection *conn)
580 { 580 {
581 struct rds_ib_connection *ic = conn->c_transport_data; 581 struct rds_ib_connection *ic = conn->c_transport_data;
582 int err = 0; 582 int err = 0;
583 583
584 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, 584 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
585 ic->i_pd, ic->i_send_cq, ic->i_recv_cq, 585 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
586 ic->i_cm_id ? ic->i_cm_id->qp : NULL); 586 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
587 587
588 if (ic->i_cm_id) { 588 if (ic->i_cm_id) {
589 struct ib_device *dev = ic->i_cm_id->device; 589 struct ib_device *dev = ic->i_cm_id->device;
590 590
591 rdsdebug("disconnecting cm %p\n", ic->i_cm_id); 591 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
592 err = rdma_disconnect(ic->i_cm_id); 592 err = rdma_disconnect(ic->i_cm_id);
593 if (err) { 593 if (err) {
594 /* Actually this may happen quite frequently, when 594 /* Actually this may happen quite frequently, when
595 * an outgoing connect raced with an incoming connect. 595 * an outgoing connect raced with an incoming connect.
596 */ 596 */
597 rdsdebug("failed to disconnect, cm: %p err %d\n", 597 rdsdebug("failed to disconnect, cm: %p err %d\n",
598 ic->i_cm_id, err); 598 ic->i_cm_id, err);
599 } 599 }
600 600
601 wait_event(rds_ib_ring_empty_wait, 601 wait_event(rds_ib_ring_empty_wait,
602 rds_ib_ring_empty(&ic->i_send_ring) && 602 rds_ib_ring_empty(&ic->i_send_ring) &&
603 rds_ib_ring_empty(&ic->i_recv_ring)); 603 rds_ib_ring_empty(&ic->i_recv_ring));
604 604
605 if (ic->i_send_hdrs) 605 if (ic->i_send_hdrs)
606 ib_dma_free_coherent(dev, 606 ib_dma_free_coherent(dev,
607 ic->i_send_ring.w_nr * 607 ic->i_send_ring.w_nr *
608 sizeof(struct rds_header), 608 sizeof(struct rds_header),
609 ic->i_send_hdrs, 609 ic->i_send_hdrs,
610 ic->i_send_hdrs_dma); 610 ic->i_send_hdrs_dma);
611 611
612 if (ic->i_recv_hdrs) 612 if (ic->i_recv_hdrs)
613 ib_dma_free_coherent(dev, 613 ib_dma_free_coherent(dev,
614 ic->i_recv_ring.w_nr * 614 ic->i_recv_ring.w_nr *
615 sizeof(struct rds_header), 615 sizeof(struct rds_header),
616 ic->i_recv_hdrs, 616 ic->i_recv_hdrs,
617 ic->i_recv_hdrs_dma); 617 ic->i_recv_hdrs_dma);
618 618
619 if (ic->i_ack) 619 if (ic->i_ack)
620 ib_dma_free_coherent(dev, sizeof(struct rds_header), 620 ib_dma_free_coherent(dev, sizeof(struct rds_header),
621 ic->i_ack, ic->i_ack_dma); 621 ic->i_ack, ic->i_ack_dma);
622 622
623 if (ic->i_sends) 623 if (ic->i_sends)
624 rds_ib_send_clear_ring(ic); 624 rds_ib_send_clear_ring(ic);
625 if (ic->i_recvs) 625 if (ic->i_recvs)
626 rds_ib_recv_clear_ring(ic); 626 rds_ib_recv_clear_ring(ic);
627 627
628 if (ic->i_cm_id->qp) 628 if (ic->i_cm_id->qp)
629 rdma_destroy_qp(ic->i_cm_id); 629 rdma_destroy_qp(ic->i_cm_id);
630 if (ic->i_send_cq) 630 if (ic->i_send_cq)
631 ib_destroy_cq(ic->i_send_cq); 631 ib_destroy_cq(ic->i_send_cq);
632 if (ic->i_recv_cq) 632 if (ic->i_recv_cq)
633 ib_destroy_cq(ic->i_recv_cq); 633 ib_destroy_cq(ic->i_recv_cq);
634 rdma_destroy_id(ic->i_cm_id); 634 rdma_destroy_id(ic->i_cm_id);
635 635
636 /* 636 /*
637 * Move connection back to the nodev list. 637 * Move connection back to the nodev list.
638 */ 638 */
639 if (ic->rds_ibdev) 639 if (ic->rds_ibdev)
640 rds_ib_remove_conn(ic->rds_ibdev, conn); 640 rds_ib_remove_conn(ic->rds_ibdev, conn);
641 641
642 ic->i_cm_id = NULL; 642 ic->i_cm_id = NULL;
643 ic->i_pd = NULL; 643 ic->i_pd = NULL;
644 ic->i_mr = NULL; 644 ic->i_mr = NULL;
645 ic->i_send_cq = NULL; 645 ic->i_send_cq = NULL;
646 ic->i_recv_cq = NULL; 646 ic->i_recv_cq = NULL;
647 ic->i_send_hdrs = NULL; 647 ic->i_send_hdrs = NULL;
648 ic->i_recv_hdrs = NULL; 648 ic->i_recv_hdrs = NULL;
649 ic->i_ack = NULL; 649 ic->i_ack = NULL;
650 } 650 }
651 BUG_ON(ic->rds_ibdev); 651 BUG_ON(ic->rds_ibdev);
652 652
653 /* Clear pending transmit */ 653 /* Clear pending transmit */
654 if (ic->i_rm) { 654 if (ic->i_rm) {
655 rds_message_put(ic->i_rm); 655 rds_message_put(ic->i_rm);
656 ic->i_rm = NULL; 656 ic->i_rm = NULL;
657 } 657 }
658 658
659 /* Clear the ACK state */ 659 /* Clear the ACK state */
660 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 660 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
661 #ifdef KERNEL_HAS_ATOMIC64 661 #ifdef KERNEL_HAS_ATOMIC64
662 atomic64_set(&ic->i_ack_next, 0); 662 atomic64_set(&ic->i_ack_next, 0);
663 #else 663 #else
664 ic->i_ack_next = 0; 664 ic->i_ack_next = 0;
665 #endif 665 #endif
666 ic->i_ack_recv = 0; 666 ic->i_ack_recv = 0;
667 667
668 /* Clear flow control state */ 668 /* Clear flow control state */
669 ic->i_flowctl = 0; 669 ic->i_flowctl = 0;
670 atomic_set(&ic->i_credits, 0); 670 atomic_set(&ic->i_credits, 0);
671 671
672 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); 672 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
673 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); 673 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
674 674
675 if (ic->i_ibinc) { 675 if (ic->i_ibinc) {
676 rds_inc_put(&ic->i_ibinc->ii_inc); 676 rds_inc_put(&ic->i_ibinc->ii_inc);
677 ic->i_ibinc = NULL; 677 ic->i_ibinc = NULL;
678 } 678 }
679 679
680 vfree(ic->i_sends); 680 vfree(ic->i_sends);
681 ic->i_sends = NULL; 681 ic->i_sends = NULL;
682 vfree(ic->i_recvs); 682 vfree(ic->i_recvs);
683 ic->i_recvs = NULL; 683 ic->i_recvs = NULL;
684 } 684 }
685 685
686 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) 686 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
687 { 687 {
688 struct rds_ib_connection *ic; 688 struct rds_ib_connection *ic;
689 unsigned long flags; 689 unsigned long flags;
690 690
691 /* XXX too lazy? */ 691 /* XXX too lazy? */
692 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); 692 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
693 if (ic == NULL) 693 if (ic == NULL)
694 return -ENOMEM; 694 return -ENOMEM;
695 695
696 INIT_LIST_HEAD(&ic->ib_node); 696 INIT_LIST_HEAD(&ic->ib_node);
697 tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
698 (unsigned long) ic);
697 mutex_init(&ic->i_recv_mutex); 699 mutex_init(&ic->i_recv_mutex);
698 #ifndef KERNEL_HAS_ATOMIC64 700 #ifndef KERNEL_HAS_ATOMIC64
699 spin_lock_init(&ic->i_ack_lock); 701 spin_lock_init(&ic->i_ack_lock);
700 #endif 702 #endif
701 703
702 /* 704 /*
703 * rds_ib_conn_shutdown() waits for these to be emptied so they 705 * rds_ib_conn_shutdown() waits for these to be emptied so they
704 * must be initialized before it can be called. 706 * must be initialized before it can be called.
705 */ 707 */
706 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); 708 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
707 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); 709 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
708 710
709 ic->conn = conn; 711 ic->conn = conn;
710 conn->c_transport_data = ic; 712 conn->c_transport_data = ic;
711 713
712 spin_lock_irqsave(&ib_nodev_conns_lock, flags); 714 spin_lock_irqsave(&ib_nodev_conns_lock, flags);
713 list_add_tail(&ic->ib_node, &ib_nodev_conns); 715 list_add_tail(&ic->ib_node, &ib_nodev_conns);
714 spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); 716 spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
715 717
716 718
717 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); 719 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
718 return 0; 720 return 0;
719 } 721 }
720 722
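The tasklet_init() added above is the point of this commit: the receive CQ's completion callback no longer walks the CQ itself, it just schedules ic->i_recv_tasklet, and the actual polling runs later in softirq context instead of monopolizing the event handler under heavy load. A minimal sketch of that split, with illustrative function names rather than the commit's exact code; the registered tasklet body, rds_ib_recv_tasklet_fn(), ultimately feeds completions into the rds_ib_process_recv() path shown further down:

#include <linux/interrupt.h>
#include <rdma/ib_verbs.h>

/* CQ event callback: keep it short, just defer to softirq context. */
static void example_recv_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rds_ib_connection *ic = context;

	tasklet_schedule(&ic->i_recv_tasklet);
}

/* Tasklet body: runs later, in softirq context, and does the real work. */
static void example_recv_tasklet_fn(unsigned long data)
{
	struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
	struct ib_wc wc;

	/* Re-arm the CQ before draining it, so a completion that lands
	 * while we poll still raises a fresh event and a reschedule. */
	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);

	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0)
		rdsdebug("recv wc wr_id 0x%llx status %u\n",
			 (unsigned long long) wc.wr_id, wc.status);
}

Re-arming before the drain can cause the odd spurious tasklet run, but never a lost completion, which is the cheap side of that trade-off.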
721 /* 723 /*
722 * Free a connection. Connection must be shut down and not set for reconnect. 724 * Free a connection. Connection must be shut down and not set for reconnect.
723 */ 725 */
724 void rds_ib_conn_free(void *arg) 726 void rds_ib_conn_free(void *arg)
725 { 727 {
726 struct rds_ib_connection *ic = arg; 728 struct rds_ib_connection *ic = arg;
727 spinlock_t *lock_ptr; 729 spinlock_t *lock_ptr;
728 730
729 rdsdebug("ic %p\n", ic); 731 rdsdebug("ic %p\n", ic);
730 732
731 /* 733 /*
732 * Conn is either on a dev's list or on the nodev list. 734 * Conn is either on a dev's list or on the nodev list.
733 * A race with shutdown() or connect() would cause problems 735 * A race with shutdown() or connect() would cause problems
734 * (since rds_ibdev would change) but that should never happen. 736 * (since rds_ibdev would change) but that should never happen.
735 */ 737 */
736 lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; 738 lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
737 739
738 spin_lock_irq(lock_ptr); 740 spin_lock_irq(lock_ptr);
739 list_del(&ic->ib_node); 741 list_del(&ic->ib_node);
740 spin_unlock_irq(lock_ptr); 742 spin_unlock_irq(lock_ptr);
741 743
742 kfree(ic); 744 kfree(ic);
743 } 745 }
744 746
745 747
746 /* 748 /*
747 * An error occurred on the connection 749 * An error occurred on the connection
748 */ 750 */
749 void 751 void
750 __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) 752 __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
751 { 753 {
752 va_list ap; 754 va_list ap;
753 755
754 rds_conn_drop(conn); 756 rds_conn_drop(conn);
755 757
756 va_start(ap, fmt); 758 va_start(ap, fmt);
757 vprintk(fmt, ap); 759 vprintk(fmt, ap);
758 va_end(ap); 760 va_end(ap);
759 } 761 }
760 762
1 /* 1 /*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006 Oracle. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file 6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the 7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below: 8 * OpenIB.org BSD license below:
9 * 9 *
10 * Redistribution and use in source and binary forms, with or 10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following 11 * without modification, are permitted provided that the following
12 * conditions are met: 12 * conditions are met:
13 * 13 *
14 * - Redistributions of source code must retain the above 14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following 15 * copyright notice, this list of conditions and the following
16 * disclaimer. 16 * disclaimer.
17 * 17 *
18 * - Redistributions in binary form must reproduce the above 18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following 19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials 20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution. 21 * provided with the distribution.
22 * 22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE. 30 * SOFTWARE.
31 * 31 *
32 */ 32 */
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/pci.h> 34 #include <linux/pci.h>
35 #include <linux/dma-mapping.h> 35 #include <linux/dma-mapping.h>
36 #include <rdma/rdma_cm.h> 36 #include <rdma/rdma_cm.h>
37 37
38 #include "rds.h" 38 #include "rds.h"
39 #include "ib.h" 39 #include "ib.h"
40 40
41 static struct kmem_cache *rds_ib_incoming_slab; 41 static struct kmem_cache *rds_ib_incoming_slab;
42 static struct kmem_cache *rds_ib_frag_slab; 42 static struct kmem_cache *rds_ib_frag_slab;
43 static atomic_t rds_ib_allocation = ATOMIC_INIT(0); 43 static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
44 44
45 static void rds_ib_frag_drop_page(struct rds_page_frag *frag) 45 static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
46 { 46 {
47 rdsdebug("frag %p page %p\n", frag, frag->f_page); 47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page); 48 __free_page(frag->f_page);
49 frag->f_page = NULL; 49 frag->f_page = NULL;
50 } 50 }
51 51
52 static void rds_ib_frag_free(struct rds_page_frag *frag) 52 static void rds_ib_frag_free(struct rds_page_frag *frag)
53 { 53 {
54 rdsdebug("frag %p page %p\n", frag, frag->f_page); 54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL); 55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_ib_frag_slab, frag); 56 kmem_cache_free(rds_ib_frag_slab, frag);
57 } 57 }
58 58
59 /* 59 /*
60 * We map a page at a time. Its fragments are posted in order. This 60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get recv completion events. 61 * is called in fragment order as the fragments get recv completion events.
62 * Only the last frag in the page performs the unmapping. 62 * Only the last frag in the page performs the unmapping.
63 * 63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because 64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still 65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags. 66 * hold page references in their frags.
67 */ 67 */
68 static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, 68 static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
69 struct rds_ib_recv_work *recv) 69 struct rds_ib_recv_work *recv)
70 { 70 {
71 struct rds_page_frag *frag = recv->r_frag; 71 struct rds_page_frag *frag = recv->r_frag;
72 72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); 73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped) 74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device, 75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped, 76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE); 77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0; 78 frag->f_mapped = 0;
79 } 79 }
80 80
81 void rds_ib_recv_init_ring(struct rds_ib_connection *ic) 81 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
82 { 82 {
83 struct rds_ib_recv_work *recv; 83 struct rds_ib_recv_work *recv;
84 u32 i; 84 u32 i;
85 85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { 86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge; 87 struct ib_sge *sge;
88 88
89 recv->r_ibinc = NULL; 89 recv->r_ibinc = NULL;
90 recv->r_frag = NULL; 90 recv->r_frag = NULL;
91 91
92 recv->r_wr.next = NULL; 92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i; 93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge; 94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IB_RECV_SGE; 95 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
96 96
97 sge = rds_ib_data_sge(ic, recv->r_sge); 97 sge = rds_ib_data_sge(ic, recv->r_sge);
98 sge->addr = 0; 98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE; 99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = ic->i_mr->lkey; 100 sge->lkey = ic->i_mr->lkey;
101 101
102 sge = rds_ib_header_sge(ic, recv->r_sge); 102 sge = rds_ib_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); 103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header); 104 sge->length = sizeof(struct rds_header);
105 sge->lkey = ic->i_mr->lkey; 105 sge->lkey = ic->i_mr->lkey;
106 } 106 }
107 } 107 }
108 108
109 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, 109 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
110 struct rds_ib_recv_work *recv) 110 struct rds_ib_recv_work *recv)
111 { 111 {
112 if (recv->r_ibinc) { 112 if (recv->r_ibinc) {
113 rds_inc_put(&recv->r_ibinc->ii_inc); 113 rds_inc_put(&recv->r_ibinc->ii_inc);
114 recv->r_ibinc = NULL; 114 recv->r_ibinc = NULL;
115 } 115 }
116 if (recv->r_frag) { 116 if (recv->r_frag) {
117 rds_ib_recv_unmap_page(ic, recv); 117 rds_ib_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page) 118 if (recv->r_frag->f_page)
119 rds_ib_frag_drop_page(recv->r_frag); 119 rds_ib_frag_drop_page(recv->r_frag);
120 rds_ib_frag_free(recv->r_frag); 120 rds_ib_frag_free(recv->r_frag);
121 recv->r_frag = NULL; 121 recv->r_frag = NULL;
122 } 122 }
123 } 123 }
124 124
125 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) 125 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
126 { 126 {
127 u32 i; 127 u32 i;
128 128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++) 129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); 130 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
131 131
132 if (ic->i_frag.f_page) 132 if (ic->i_frag.f_page)
133 rds_ib_frag_drop_page(&ic->i_frag); 133 rds_ib_frag_drop_page(&ic->i_frag);
134 } 134 }
135 135
136 static int rds_ib_recv_refill_one(struct rds_connection *conn, 136 static int rds_ib_recv_refill_one(struct rds_connection *conn,
137 struct rds_ib_recv_work *recv, 137 struct rds_ib_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp) 138 gfp_t kptr_gfp, gfp_t page_gfp)
139 { 139 {
140 struct rds_ib_connection *ic = conn->c_transport_data; 140 struct rds_ib_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr; 141 dma_addr_t dma_addr;
142 struct ib_sge *sge; 142 struct ib_sge *sge;
143 int ret = -ENOMEM; 143 int ret = -ENOMEM;
144 144
145 if (recv->r_ibinc == NULL) { 145 if (recv->r_ibinc == NULL) {
146 if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { 146 if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
147 rds_ib_stats_inc(s_ib_rx_alloc_limit); 147 rds_ib_stats_inc(s_ib_rx_alloc_limit);
148 goto out; 148 goto out;
149 } 149 }
150 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, 150 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
151 kptr_gfp); 151 kptr_gfp);
152 if (recv->r_ibinc == NULL) { 152 if (recv->r_ibinc == NULL) {
153 atomic_dec(&rds_ib_allocation); 153 atomic_dec(&rds_ib_allocation);
154 goto out; 154 goto out;
155 } 155 }
156 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); 156 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
157 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); 157 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
158 } 158 }
159 159
160 if (recv->r_frag == NULL) { 160 if (recv->r_frag == NULL) {
161 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); 161 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
162 if (recv->r_frag == NULL) 162 if (recv->r_frag == NULL)
163 goto out; 163 goto out;
164 INIT_LIST_HEAD(&recv->r_frag->f_item); 164 INIT_LIST_HEAD(&recv->r_frag->f_item);
165 recv->r_frag->f_page = NULL; 165 recv->r_frag->f_page = NULL;
166 } 166 }
167 167
168 if (ic->i_frag.f_page == NULL) { 168 if (ic->i_frag.f_page == NULL) {
169 ic->i_frag.f_page = alloc_page(page_gfp); 169 ic->i_frag.f_page = alloc_page(page_gfp);
170 if (ic->i_frag.f_page == NULL) 170 if (ic->i_frag.f_page == NULL)
171 goto out; 171 goto out;
172 ic->i_frag.f_offset = 0; 172 ic->i_frag.f_offset = 0;
173 } 173 }
174 174
175 dma_addr = ib_dma_map_page(ic->i_cm_id->device, 175 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
176 ic->i_frag.f_page, 176 ic->i_frag.f_page,
177 ic->i_frag.f_offset, 177 ic->i_frag.f_offset,
178 RDS_FRAG_SIZE, 178 RDS_FRAG_SIZE,
179 DMA_FROM_DEVICE); 179 DMA_FROM_DEVICE);
180 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) 180 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
181 goto out; 181 goto out;
182 182
183 /* 183 /*
184 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() 184 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
185 * must be called on this recv. This happens as completions hit 185 * must be called on this recv. This happens as completions hit
186 * in order or on connection shutdown. 186 * in order or on connection shutdown.
187 */ 187 */
188 recv->r_frag->f_page = ic->i_frag.f_page; 188 recv->r_frag->f_page = ic->i_frag.f_page;
189 recv->r_frag->f_offset = ic->i_frag.f_offset; 189 recv->r_frag->f_offset = ic->i_frag.f_offset;
190 recv->r_frag->f_mapped = dma_addr; 190 recv->r_frag->f_mapped = dma_addr;
191 191
192 sge = rds_ib_data_sge(ic, recv->r_sge); 192 sge = rds_ib_data_sge(ic, recv->r_sge);
193 sge->addr = dma_addr; 193 sge->addr = dma_addr;
194 sge->length = RDS_FRAG_SIZE; 194 sge->length = RDS_FRAG_SIZE;
195 195
196 sge = rds_ib_header_sge(ic, recv->r_sge); 196 sge = rds_ib_header_sge(ic, recv->r_sge);
197 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); 197 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
198 sge->length = sizeof(struct rds_header); 198 sge->length = sizeof(struct rds_header);
199 199
200 get_page(recv->r_frag->f_page); 200 get_page(recv->r_frag->f_page);
201 201
202 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { 202 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
203 ic->i_frag.f_offset += RDS_FRAG_SIZE; 203 ic->i_frag.f_offset += RDS_FRAG_SIZE;
204 } else { 204 } else {
205 put_page(ic->i_frag.f_page); 205 put_page(ic->i_frag.f_page);
206 ic->i_frag.f_page = NULL; 206 ic->i_frag.f_page = NULL;
207 ic->i_frag.f_offset = 0; 207 ic->i_frag.f_offset = 0;
208 } 208 }
209 209
210 ret = 0; 210 ret = 0;
211 out: 211 out:
212 return ret; 212 return ret;
213 } 213 }
214 214
215 /* 215 /*
216 * This tries to allocate and post unused work requests after making sure that 216 * This tries to allocate and post unused work requests after making sure that
217 * they have all the allocations they need to queue received fragments into 217 * they have all the allocations they need to queue received fragments into
218 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc 218 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
219 * pairs don't go unmatched. 219 * pairs don't go unmatched.
220 * 220 *
221 * -1 is returned if posting fails due to temporary resource exhaustion. 221 * -1 is returned if posting fails due to temporary resource exhaustion.
222 */ 222 */
223 int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 223 int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
224 gfp_t page_gfp, int prefill) 224 gfp_t page_gfp, int prefill)
225 { 225 {
226 struct rds_ib_connection *ic = conn->c_transport_data; 226 struct rds_ib_connection *ic = conn->c_transport_data;
227 struct rds_ib_recv_work *recv; 227 struct rds_ib_recv_work *recv;
228 struct ib_recv_wr *failed_wr; 228 struct ib_recv_wr *failed_wr;
229 unsigned int posted = 0; 229 unsigned int posted = 0;
230 int ret = 0; 230 int ret = 0;
231 u32 pos; 231 u32 pos;
232 232
233 while ((prefill || rds_conn_up(conn)) 233 while ((prefill || rds_conn_up(conn))
234 && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { 234 && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
235 if (pos >= ic->i_recv_ring.w_nr) { 235 if (pos >= ic->i_recv_ring.w_nr) {
236 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", 236 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
237 pos); 237 pos);
238 ret = -EINVAL; 238 ret = -EINVAL;
239 break; 239 break;
240 } 240 }
241 241
242 recv = &ic->i_recvs[pos]; 242 recv = &ic->i_recvs[pos];
243 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); 243 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
244 if (ret) { 244 if (ret) {
245 ret = -1; 245 ret = -1;
246 break; 246 break;
247 } 247 }
248 248
249 /* XXX when can this fail? */ 249 /* XXX when can this fail? */
250 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 250 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
251 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, 251 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
252 recv->r_ibinc, recv->r_frag->f_page, 252 recv->r_ibinc, recv->r_frag->f_page,
253 (long) recv->r_frag->f_mapped, ret); 253 (long) recv->r_frag->f_mapped, ret);
254 if (ret) { 254 if (ret) {
255 rds_ib_conn_error(conn, "recv post on " 255 rds_ib_conn_error(conn, "recv post on "
256 "%pI4 returned %d, disconnecting and " 256 "%pI4 returned %d, disconnecting and "
257 "reconnecting\n", &conn->c_faddr, 257 "reconnecting\n", &conn->c_faddr,
258 ret); 258 ret);
259 ret = -1; 259 ret = -1;
260 break; 260 break;
261 } 261 }
262 262
263 posted++; 263 posted++;
264 } 264 }
265 265
266 /* We're doing flow control - update the window. */ 266 /* We're doing flow control - update the window. */
267 if (ic->i_flowctl && posted) 267 if (ic->i_flowctl && posted)
268 rds_ib_advertise_credits(conn, posted); 268 rds_ib_advertise_credits(conn, posted);
269 269
270 if (ret) 270 if (ret)
271 rds_ib_ring_unalloc(&ic->i_recv_ring, 1); 271 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
272 return ret; 272 return ret;
273 } 273 }
274 274
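The two gfp arguments above matter more once receive processing runs from a tasklet: a refill driven from softirq context must not sleep, while the prefill done as the connection comes up can. Two hypothetical callers, with flag choices that are illustrative rather than lifted from the RDS source:

/* May sleep: used while bringing the connection up. */
static int example_prefill(struct rds_connection *conn)
{
	return rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
}

/* Must not sleep: used from the recv tasklet in softirq context. */
static int example_refill_from_softirq(struct rds_connection *conn)
{
	return rds_ib_recv_refill(conn, GFP_NOWAIT, GFP_NOWAIT, 0);
}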
275 void rds_ib_inc_purge(struct rds_incoming *inc) 275 void rds_ib_inc_purge(struct rds_incoming *inc)
276 { 276 {
277 struct rds_ib_incoming *ibinc; 277 struct rds_ib_incoming *ibinc;
278 struct rds_page_frag *frag; 278 struct rds_page_frag *frag;
279 struct rds_page_frag *pos; 279 struct rds_page_frag *pos;
280 280
281 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 281 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
282 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); 282 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
283 283
284 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { 284 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
285 list_del_init(&frag->f_item); 285 list_del_init(&frag->f_item);
286 rds_ib_frag_drop_page(frag); 286 rds_ib_frag_drop_page(frag);
287 rds_ib_frag_free(frag); 287 rds_ib_frag_free(frag);
288 } 288 }
289 } 289 }
290 290
291 void rds_ib_inc_free(struct rds_incoming *inc) 291 void rds_ib_inc_free(struct rds_incoming *inc)
292 { 292 {
293 struct rds_ib_incoming *ibinc; 293 struct rds_ib_incoming *ibinc;
294 294
295 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 295 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
296 296
297 rds_ib_inc_purge(inc); 297 rds_ib_inc_purge(inc);
298 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); 298 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
299 BUG_ON(!list_empty(&ibinc->ii_frags)); 299 BUG_ON(!list_empty(&ibinc->ii_frags));
300 kmem_cache_free(rds_ib_incoming_slab, ibinc); 300 kmem_cache_free(rds_ib_incoming_slab, ibinc);
301 atomic_dec(&rds_ib_allocation); 301 atomic_dec(&rds_ib_allocation);
302 BUG_ON(atomic_read(&rds_ib_allocation) < 0); 302 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
303 } 303 }
304 304
305 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 305 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
306 size_t size) 306 size_t size)
307 { 307 {
308 struct rds_ib_incoming *ibinc; 308 struct rds_ib_incoming *ibinc;
309 struct rds_page_frag *frag; 309 struct rds_page_frag *frag;
310 struct iovec *iov = first_iov; 310 struct iovec *iov = first_iov;
311 unsigned long to_copy; 311 unsigned long to_copy;
312 unsigned long frag_off = 0; 312 unsigned long frag_off = 0;
313 unsigned long iov_off = 0; 313 unsigned long iov_off = 0;
314 int copied = 0; 314 int copied = 0;
315 int ret; 315 int ret;
316 u32 len; 316 u32 len;
317 317
318 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 318 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
319 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); 319 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
320 len = be32_to_cpu(inc->i_hdr.h_len); 320 len = be32_to_cpu(inc->i_hdr.h_len);
321 321
322 while (copied < size && copied < len) { 322 while (copied < size && copied < len) {
323 if (frag_off == RDS_FRAG_SIZE) { 323 if (frag_off == RDS_FRAG_SIZE) {
324 frag = list_entry(frag->f_item.next, 324 frag = list_entry(frag->f_item.next,
325 struct rds_page_frag, f_item); 325 struct rds_page_frag, f_item);
326 frag_off = 0; 326 frag_off = 0;
327 } 327 }
328 while (iov_off == iov->iov_len) { 328 while (iov_off == iov->iov_len) {
329 iov_off = 0; 329 iov_off = 0;
330 iov++; 330 iov++;
331 } 331 }
332 332
333 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); 333 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
334 to_copy = min_t(size_t, to_copy, size - copied); 334 to_copy = min_t(size_t, to_copy, size - copied);
335 to_copy = min_t(unsigned long, to_copy, len - copied); 335 to_copy = min_t(unsigned long, to_copy, len - copied);
336 336
337 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " 337 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
338 "[%p, %lu] + %lu\n", 338 "[%p, %lu] + %lu\n",
339 to_copy, iov->iov_base, iov->iov_len, iov_off, 339 to_copy, iov->iov_base, iov->iov_len, iov_off,
340 frag->f_page, frag->f_offset, frag_off); 340 frag->f_page, frag->f_offset, frag_off);
341 341
342 /* XXX needs + offset for multiple recvs per page */ 342 /* XXX needs + offset for multiple recvs per page */
343 ret = rds_page_copy_to_user(frag->f_page, 343 ret = rds_page_copy_to_user(frag->f_page,
344 frag->f_offset + frag_off, 344 frag->f_offset + frag_off,
345 iov->iov_base + iov_off, 345 iov->iov_base + iov_off,
346 to_copy); 346 to_copy);
347 if (ret) { 347 if (ret) {
348 copied = ret; 348 copied = ret;
349 break; 349 break;
350 } 350 }
351 351
352 iov_off += to_copy; 352 iov_off += to_copy;
353 frag_off += to_copy; 353 frag_off += to_copy;
354 copied += to_copy; 354 copied += to_copy;
355 } 355 }
356 356
357 return copied; 357 return copied;
358 } 358 }
359 359
360 /* ic starts out kzalloc()ed */ 360 /* ic starts out kzalloc()ed */
361 void rds_ib_recv_init_ack(struct rds_ib_connection *ic) 361 void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
362 { 362 {
363 struct ib_send_wr *wr = &ic->i_ack_wr; 363 struct ib_send_wr *wr = &ic->i_ack_wr;
364 struct ib_sge *sge = &ic->i_ack_sge; 364 struct ib_sge *sge = &ic->i_ack_sge;
365 365
366 sge->addr = ic->i_ack_dma; 366 sge->addr = ic->i_ack_dma;
367 sge->length = sizeof(struct rds_header); 367 sge->length = sizeof(struct rds_header);
368 sge->lkey = ic->i_mr->lkey; 368 sge->lkey = ic->i_mr->lkey;
369 369
370 wr->sg_list = sge; 370 wr->sg_list = sge;
371 wr->num_sge = 1; 371 wr->num_sge = 1;
372 wr->opcode = IB_WR_SEND; 372 wr->opcode = IB_WR_SEND;
373 wr->wr_id = RDS_IB_ACK_WR_ID; 373 wr->wr_id = RDS_IB_ACK_WR_ID;
374 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; 374 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
375 } 375 }
376 376
377 /* 377 /*
378 * You'd think that with reliable IB connections you wouldn't need to ack 378 * You'd think that with reliable IB connections you wouldn't need to ack
379 * messages that have been received. The problem is that IB hardware generates 379 * messages that have been received. The problem is that IB hardware generates
380 * an ack message before it has DMAed the message into memory. This creates a 380 * an ack message before it has DMAed the message into memory. This creates a
381 * potential message loss if the HCA is disabled for any reason between when it 381 * potential message loss if the HCA is disabled for any reason between when it
382 * sends the ack and before the message is DMAed and processed. This is only a 382 * sends the ack and before the message is DMAed and processed. This is only a
383 * potential issue if another HCA is available for fail-over. 383 * potential issue if another HCA is available for fail-over.
384 * 384 *
385 * When the remote host receives our ack they'll free the sent message from 385 * When the remote host receives our ack they'll free the sent message from
386 * their send queue. To decrease the latency of this we always send an ack 386 * their send queue. To decrease the latency of this we always send an ack
387 * immediately after we've received messages. 387 * immediately after we've received messages.
388 * 388 *
389 * For simplicity, we only have one ack in flight at a time. This puts 389 * For simplicity, we only have one ack in flight at a time. This puts
390 * pressure on senders to have deep enough send queues to absorb the latency of 390 * pressure on senders to have deep enough send queues to absorb the latency of
391 * a single ack frame being in flight. This might not be good enough. 391 * a single ack frame being in flight. This might not be good enough.
392 * 392 *
393 * This is implemented by having a long-lived send_wr and sge which point to a 393 * This is implemented by having a long-lived send_wr and sge which point to a
394 * statically allocated ack frame. This ack wr does not fall under the ring 394 * statically allocated ack frame. This ack wr does not fall under the ring
395 * accounting that the tx and rx wrs do. The QP attribute specifically makes 395 * accounting that the tx and rx wrs do. The QP attribute specifically makes
396 * room for it beyond the ring size. Send completion notices its special 396 * room for it beyond the ring size. Send completion notices its special
397 * wr_id and avoids working with the ring in that case. 397 * wr_id and avoids working with the ring in that case.
398 */ 398 */
399 #ifndef KERNEL_HAS_ATOMIC64 399 #ifndef KERNEL_HAS_ATOMIC64
400 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, 400 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
401 int ack_required) 401 int ack_required)
402 { 402 {
403 unsigned long flags; 403 unsigned long flags;
404 404
405 spin_lock_irqsave(&ic->i_ack_lock, flags); 405 spin_lock_irqsave(&ic->i_ack_lock, flags);
406 ic->i_ack_next = seq; 406 ic->i_ack_next = seq;
407 if (ack_required) 407 if (ack_required)
408 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 408 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
409 spin_unlock_irqrestore(&ic->i_ack_lock, flags); 409 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
410 } 410 }
411 411
412 static u64 rds_ib_get_ack(struct rds_ib_connection *ic) 412 static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
413 { 413 {
414 unsigned long flags; 414 unsigned long flags;
415 u64 seq; 415 u64 seq;
416 416
417 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 417 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
418 418
419 spin_lock_irqsave(&ic->i_ack_lock, flags); 419 spin_lock_irqsave(&ic->i_ack_lock, flags);
420 seq = ic->i_ack_next; 420 seq = ic->i_ack_next;
421 spin_unlock_irqrestore(&ic->i_ack_lock, flags); 421 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
422 422
423 return seq; 423 return seq;
424 } 424 }
425 #else 425 #else
426 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, 426 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
427 int ack_required) 427 int ack_required)
428 { 428 {
429 atomic64_set(&ic->i_ack_next, seq); 429 atomic64_set(&ic->i_ack_next, seq);
430 if (ack_required) { 430 if (ack_required) {
431 smp_mb__before_clear_bit(); 431 smp_mb__before_clear_bit();
432 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 432 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
433 } 433 }
434 } 434 }
435 435
436 static u64 rds_ib_get_ack(struct rds_ib_connection *ic) 436 static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
437 { 437 {
438 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 438 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439 smp_mb__after_clear_bit(); 439 smp_mb__after_clear_bit();
440 440
441 return atomic64_read(&ic->i_ack_next); 441 return atomic64_read(&ic->i_ack_next);
442 } 442 }
443 #endif 443 #endif
444 444
445 445
446 static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) 446 static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
447 { 447 {
448 struct rds_header *hdr = ic->i_ack; 448 struct rds_header *hdr = ic->i_ack;
449 struct ib_send_wr *failed_wr; 449 struct ib_send_wr *failed_wr;
450 u64 seq; 450 u64 seq;
451 int ret; 451 int ret;
452 452
453 seq = rds_ib_get_ack(ic); 453 seq = rds_ib_get_ack(ic);
454 454
455 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); 455 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
456 rds_message_populate_header(hdr, 0, 0, 0); 456 rds_message_populate_header(hdr, 0, 0, 0);
457 hdr->h_ack = cpu_to_be64(seq); 457 hdr->h_ack = cpu_to_be64(seq);
458 hdr->h_credit = adv_credits; 458 hdr->h_credit = adv_credits;
459 rds_message_make_checksum(hdr); 459 rds_message_make_checksum(hdr);
460 ic->i_ack_queued = jiffies; 460 ic->i_ack_queued = jiffies;
461 461
462 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); 462 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
463 if (unlikely(ret)) { 463 if (unlikely(ret)) {
464 /* Failed to send. Release the WR, and 464 /* Failed to send. Release the WR, and
465 * force another ACK. 465 * force another ACK.
466 */ 466 */
467 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 467 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
468 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 468 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
469 469
470 rds_ib_stats_inc(s_ib_ack_send_failure); 470 rds_ib_stats_inc(s_ib_ack_send_failure);
471 /* Need to finesse this later. */ 471 /* Need to finesse this later. */
472 BUG(); 472 BUG();
473 } else 473 } else
474 rds_ib_stats_inc(s_ib_ack_sent); 474 rds_ib_stats_inc(s_ib_ack_sent);
475 } 475 }
476 476
477 /* 477 /*
478 * There are 3 ways of getting acknowledgements to the peer: 478 * There are 3 ways of getting acknowledgements to the peer:
479 * 1. We call rds_ib_attempt_ack from the recv completion handler 479 * 1. We call rds_ib_attempt_ack from the recv completion handler
480 * to send an ACK-only frame. 480 * to send an ACK-only frame.
481 * However, there can be only one such frame in the send queue 481 * However, there can be only one such frame in the send queue
482 * at any time, so we may have to postpone it. 482 * at any time, so we may have to postpone it.
483 * 2. When another (data) packet is transmitted while there's 483 * 2. When another (data) packet is transmitted while there's
484 * an ACK in the queue, we piggyback the ACK sequence number 484 * an ACK in the queue, we piggyback the ACK sequence number
485 * on the data packet. 485 * on the data packet.
486 * 3. If the ACK WR is done sending, we get called from the 486 * 3. If the ACK WR is done sending, we get called from the
487 * send queue completion handler, and check whether there's 487 * send queue completion handler, and check whether there's
488 * another ACK pending (postponed because the WR was on the 488 * another ACK pending (postponed because the WR was on the
489 * queue). If so, we transmit it. 489 * queue). If so, we transmit it.
490 * 490 *
491 * We maintain 2 variables: 491 * We maintain 2 variables:
492 * - i_ack_flags, which keeps track of whether the ACK WR 492 * - i_ack_flags, which keeps track of whether the ACK WR
493 * is currently in the send queue or not (IB_ACK_IN_FLIGHT) 493 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
494 * - i_ack_next, which is the last sequence number we received 494 * - i_ack_next, which is the last sequence number we received
495 * 495 *
496 * Potentially, send queue and receive queue handlers can run concurrently. 496 * Potentially, send queue and receive queue handlers can run concurrently.
497 * It would be nice to not have to use a spinlock to synchronize things, 497 * It would be nice to not have to use a spinlock to synchronize things,
498 * but the one problem that rules this out is that 64bit updates are 498 * but the one problem that rules this out is that 64bit updates are
499 * not atomic on all platforms. Things would be a lot simpler if 499 * not atomic on all platforms. Things would be a lot simpler if
500 * we had atomic64 or maybe cmpxchg64 everywhere. 500 * we had atomic64 or maybe cmpxchg64 everywhere.
501 * 501 *
502 * Reconnecting complicates this picture just slightly. When we 502 * Reconnecting complicates this picture just slightly. When we
503 * reconnect, we may be seeing duplicate packets. The peer 503 * reconnect, we may be seeing duplicate packets. The peer
504 * is retransmitting them, because it hasn't seen an ACK for 504 * is retransmitting them, because it hasn't seen an ACK for
505 * them. It is important that we ACK these. 505 * them. It is important that we ACK these.
506 * 506 *
507 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with 507 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
508 * this flag set *MUST* be acknowledged immediately. 508 * this flag set *MUST* be acknowledged immediately.
509 */ 509 */
510 510
511 /* 511 /*
512 * When we get here, we're called from the recv queue handler. 512 * When we get here, we're called from the recv queue handler.
513 * Check whether we ought to transmit an ACK. 513 * Check whether we ought to transmit an ACK.
514 */ 514 */
515 void rds_ib_attempt_ack(struct rds_ib_connection *ic) 515 void rds_ib_attempt_ack(struct rds_ib_connection *ic)
516 { 516 {
517 unsigned int adv_credits; 517 unsigned int adv_credits;
518 518
519 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) 519 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
520 return; 520 return;
521 521
522 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { 522 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
523 rds_ib_stats_inc(s_ib_ack_send_delayed); 523 rds_ib_stats_inc(s_ib_ack_send_delayed);
524 return; 524 return;
525 } 525 }
526 526
527 /* Can we get a send credit? */ 527 /* Can we get a send credit? */
528 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { 528 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
529 rds_ib_stats_inc(s_ib_tx_throttle); 529 rds_ib_stats_inc(s_ib_tx_throttle);
530 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 530 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
531 return; 531 return;
532 } 532 }
533 533
534 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 534 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
535 rds_ib_send_ack(ic, adv_credits); 535 rds_ib_send_ack(ic, adv_credits);
536 } 536 }
537 537
538 /* 538 /*
539 * We get here from the send completion handler, when the 539 * We get here from the send completion handler, when the
540 * adapter tells us the ACK frame was sent. 540 * adapter tells us the ACK frame was sent.
541 */ 541 */
542 void rds_ib_ack_send_complete(struct rds_ib_connection *ic) 542 void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
543 { 543 {
544 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 544 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
545 rds_ib_attempt_ack(ic); 545 rds_ib_attempt_ack(ic);
546 } 546 }
547 547
548 /* 548 /*
549 * This is called by the regular xmit code when it wants to piggyback 549 * This is called by the regular xmit code when it wants to piggyback
550 * an ACK on an outgoing frame. 550 * an ACK on an outgoing frame.
551 */ 551 */
552 u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) 552 u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
553 { 553 {
554 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) 554 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
555 rds_ib_stats_inc(s_ib_ack_send_piggybacked); 555 rds_ib_stats_inc(s_ib_ack_send_piggybacked);
556 return rds_ib_get_ack(ic); 556 return rds_ib_get_ack(ic);
557 } 557 }
558 558
559 static struct rds_header *rds_ib_get_header(struct rds_connection *conn, 559 static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
560 struct rds_ib_recv_work *recv, 560 struct rds_ib_recv_work *recv,
561 u32 data_len) 561 u32 data_len)
562 { 562 {
563 struct rds_ib_connection *ic = conn->c_transport_data; 563 struct rds_ib_connection *ic = conn->c_transport_data;
564 void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; 564 void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
565 void *addr; 565 void *addr;
566 u32 misplaced_hdr_bytes; 566 u32 misplaced_hdr_bytes;
567 567
568 /* 568 /*
569 * Support header at the front (RDS 3.1+) as well as header-at-end. 569 * Support header at the front (RDS 3.1+) as well as header-at-end.
570 * 570 *
571 * Cases: 571 * Cases:
572 * 1) header all in header buff (great!) 572 * 1) header all in header buff (great!)
573 * 2) header all in data page (copy all to header buff) 573 * 2) header all in data page (copy all to header buff)
574 * 3) header split across hdr buf + data page 574 * 3) header split across hdr buf + data page
575 * (move bit in hdr buff to end before copying other bit from data page) 575 * (move bit in hdr buff to end before copying other bit from data page)
576 */ 576 */
577 if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) 577 if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
578 return hdr_buff; 578 return hdr_buff;
579 579
580 if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { 580 if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
581 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); 581 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
582 memcpy(hdr_buff, 582 memcpy(hdr_buff,
583 addr + recv->r_frag->f_offset + data_len, 583 addr + recv->r_frag->f_offset + data_len,
584 sizeof(struct rds_header)); 584 sizeof(struct rds_header));
585 kunmap_atomic(addr, KM_SOFTIRQ0); 585 kunmap_atomic(addr, KM_SOFTIRQ0);
586 return hdr_buff; 586 return hdr_buff;
587 } 587 }
588 588
589 misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); 589 misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
590 590
591 memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); 591 memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
592 592
593 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); 593 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
594 memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, 594 memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
595 sizeof(struct rds_header) - misplaced_hdr_bytes); 595 sizeof(struct rds_header) - misplaced_hdr_bytes);
596 kunmap_atomic(addr, KM_SOFTIRQ0); 596 kunmap_atomic(addr, KM_SOFTIRQ0);
597 return hdr_buff; 597 return hdr_buff;
598 } 598 }
599 599
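A quick worked example of the case split for an RDS 3.0 peer (3.1+ peers always take case 1), assuming a 48-byte header and a 4096-byte fragment purely for the numbers; both real sizes come from rds.h:

/* Userspace demo of the data_len arithmetic only; sizes are stand-ins. */
#include <stdio.h>

#define EX_FRAG_SIZE 4096u
#define EX_HDR_SIZE  48u

static const char *example_header_case(unsigned int data_len)
{
	if (data_len == EX_FRAG_SIZE)
		return "case 1: header landed entirely in the header buffer";
	if (data_len <= EX_FRAG_SIZE - EX_HDR_SIZE)
		return "case 2: whole header follows the data in the data page";
	return "case 3: header split across data page and header buffer";
}

int main(void)
{
	printf("data_len 4096 -> %s\n", example_header_case(4096));
	printf("data_len 2000 -> %s\n", example_header_case(2000));
	printf("data_len 4070 -> %s\n", example_header_case(4070));
	return 0;
}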
600 /* 600 /*
601 * It's kind of lame that we're copying from the posted receive pages into 601 * It's kind of lame that we're copying from the posted receive pages into
602 * long-lived bitmaps. We could have posted the bitmaps and rdma written into 602 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
603 * them. But receiving new congestion bitmaps should be a *rare* event, so 603 * them. But receiving new congestion bitmaps should be a *rare* event, so
604 * hopefully we won't need to invest that complexity in making it more 604 * hopefully we won't need to invest that complexity in making it more
605 * efficient. By copying we can share a simpler core with TCP which has to 605 * efficient. By copying we can share a simpler core with TCP which has to
606 * copy. 606 * copy.
607 */ 607 */
608 static void rds_ib_cong_recv(struct rds_connection *conn, 608 static void rds_ib_cong_recv(struct rds_connection *conn,
609 struct rds_ib_incoming *ibinc) 609 struct rds_ib_incoming *ibinc)
610 { 610 {
611 struct rds_cong_map *map; 611 struct rds_cong_map *map;
612 unsigned int map_off; 612 unsigned int map_off;
613 unsigned int map_page; 613 unsigned int map_page;
614 struct rds_page_frag *frag; 614 struct rds_page_frag *frag;
615 unsigned long frag_off; 615 unsigned long frag_off;
616 unsigned long to_copy; 616 unsigned long to_copy;
617 unsigned long copied; 617 unsigned long copied;
618 uint64_t uncongested = 0; 618 uint64_t uncongested = 0;
619 void *addr; 619 void *addr;
620 620
621 /* catch completely corrupt packets */ 621 /* catch completely corrupt packets */
622 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 622 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
623 return; 623 return;
624 624
625 map = conn->c_fcong; 625 map = conn->c_fcong;
626 map_page = 0; 626 map_page = 0;
627 map_off = 0; 627 map_off = 0;
628 628
629 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); 629 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
630 frag_off = 0; 630 frag_off = 0;
631 631
632 copied = 0; 632 copied = 0;
633 633
634 while (copied < RDS_CONG_MAP_BYTES) { 634 while (copied < RDS_CONG_MAP_BYTES) {
635 uint64_t *src, *dst; 635 uint64_t *src, *dst;
636 unsigned int k; 636 unsigned int k;
637 637
638 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); 638 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
639 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ 639 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
640 640
641 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); 641 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
642 642
643 src = addr + frag_off; 643 src = addr + frag_off;
644 dst = (void *)map->m_page_addrs[map_page] + map_off; 644 dst = (void *)map->m_page_addrs[map_page] + map_off;
645 for (k = 0; k < to_copy; k += 8) { 645 for (k = 0; k < to_copy; k += 8) {
646 /* Record ports that became uncongested, ie 646 /* Record ports that became uncongested, ie
647 * bits set in our copy but clear in the new map. */ 647 * bits set in our copy but clear in the new map. */
648 uncongested |= ~(*src) & *dst; 648 uncongested |= ~(*src) & *dst;
649 *dst++ = *src++; 649 *dst++ = *src++;
650 } 650 }
651 kunmap_atomic(addr, KM_SOFTIRQ0); 651 kunmap_atomic(addr, KM_SOFTIRQ0);
652 652
653 copied += to_copy; 653 copied += to_copy;
654 654
655 map_off += to_copy; 655 map_off += to_copy;
656 if (map_off == PAGE_SIZE) { 656 if (map_off == PAGE_SIZE) {
657 map_off = 0; 657 map_off = 0;
658 map_page++; 658 map_page++;
659 } 659 }
660 660
661 frag_off += to_copy; 661 frag_off += to_copy;
662 if (frag_off == RDS_FRAG_SIZE) { 662 if (frag_off == RDS_FRAG_SIZE) {
663 frag = list_entry(frag->f_item.next, 663 frag = list_entry(frag->f_item.next,
664 struct rds_page_frag, f_item); 664 struct rds_page_frag, f_item);
665 frag_off = 0; 665 frag_off = 0;
666 } 666 }
667 } 667 }
668 668
669 /* the congestion map is in little endian order */ 669 /* the congestion map is in little endian order */
670 uncongested = le64_to_cpu(uncongested); 670 uncongested = le64_to_cpu(uncongested);
671 671
672 rds_cong_map_updated(map, uncongested); 672 rds_cong_map_updated(map, uncongested);
673 } 673 }
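
The inner loop above copies the peer's congestion map over our stored copy one 64-bit word at a time while accumulating which bits differ. A standalone sketch of that merge, with the page-by-page walk collapsed into one flat buffer and the final le64_to_cpu fix-up omitted (function and variable names are illustrative, not the kernel's):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Copy the newly received map over the stored one, returning the bits
 * that were set in the stored copy but are clear in the new map -- the
 * value rds_ib_cong_recv() hands to rds_cong_map_updated().  Assumes
 * both buffers are 8-byte aligned and bytes is a multiple of 8, which
 * the BUG_ON(to_copy & 7) above enforces in the real code. */
static uint64_t merge_cong_map(uint64_t *stored, const uint64_t *incoming,
                               size_t bytes)
{
        uint64_t changed = 0;
        size_t k;

        for (k = 0; k < bytes / 8; k++) {
                changed |= ~incoming[k] & stored[k];
                stored[k] = incoming[k];
        }
        return changed;
}

int main(void)
{
        uint64_t stored[2]   = { 0xffULL, 0x1ULL };
        uint64_t incoming[2] = { 0x0fULL, 0x1ULL };
        uint64_t changed = merge_cong_map(stored, incoming, sizeof(stored));

        printf("newly cleared bits: 0x%llx\n", (unsigned long long)changed); /* 0xf0 */
        return 0;
}
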
674 674
675 /* 675 /*
676 * Rings are posted with all the allocations they'll need to queue the 676 * Rings are posted with all the allocations they'll need to queue the
677 * incoming message to the receiving socket so this can't fail. 677 * incoming message to the receiving socket so this can't fail.
678 * All fragments start with a header, so we can make sure we're not receiving 678 * All fragments start with a header, so we can make sure we're not receiving
679 * garbage, and we can tell a small 8 byte fragment from an ACK frame. 679 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
680 */ 680 */
681 struct rds_ib_ack_state { 681 struct rds_ib_ack_state {
682 u64 ack_next; 682 u64 ack_next;
683 u64 ack_recv; 683 u64 ack_recv;
684 unsigned int ack_required:1; 684 unsigned int ack_required:1;
685 unsigned int ack_next_valid:1; 685 unsigned int ack_next_valid:1;
686 unsigned int ack_recv_valid:1; 686 unsigned int ack_recv_valid:1;
687 }; 687 };
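
Why a separate ack-state struct rather than acting on each completion immediately: with receive processing moved into a tasklet, each completion only records what it implies for ACKs, and the tasklet applies the net result once after the queue is drained. A minimal userspace model of that batching (all names below are invented for illustration):

#include <stdint.h>
#include <stdio.h>

struct ack_state_model {
        uint64_t ack_recv;
        unsigned int ack_recv_valid:1;
        unsigned int ack_required:1;
};

/* Called per completion: only record, never send. */
static void note_completion(struct ack_state_model *s, uint64_t acked_seq,
                            int peer_wants_ack)
{
        s->ack_recv = acked_seq;        /* later completions supersede earlier ones */
        s->ack_recv_valid = 1;
        if (peer_wants_ack)
                s->ack_required = 1;
}

int main(void)
{
        struct ack_state_model s = { 0, };

        note_completion(&s, 100, 0);
        note_completion(&s, 101, 1);

        /* Applied once, after draining, like the tail of the recv tasklet. */
        if (s.ack_recv_valid)
                printf("drop acked up to %llu, ack_required=%u\n",
                       (unsigned long long)s.ack_recv, s.ack_required);
        return 0;
}
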
688 688
689 static void rds_ib_process_recv(struct rds_connection *conn, 689 static void rds_ib_process_recv(struct rds_connection *conn,
690 struct rds_ib_recv_work *recv, u32 data_len, 690 struct rds_ib_recv_work *recv, u32 data_len,
691 struct rds_ib_ack_state *state) 691 struct rds_ib_ack_state *state)
692 { 692 {
693 struct rds_ib_connection *ic = conn->c_transport_data; 693 struct rds_ib_connection *ic = conn->c_transport_data;
694 struct rds_ib_incoming *ibinc = ic->i_ibinc; 694 struct rds_ib_incoming *ibinc = ic->i_ibinc;
695 struct rds_header *ihdr, *hdr; 695 struct rds_header *ihdr, *hdr;
696 696
697 /* XXX shut down the connection if port 0,0 are seen? */ 697 /* XXX shut down the connection if port 0,0 are seen? */
698 698
699 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, 699 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
700 data_len); 700 data_len);
701 701
702 if (data_len < sizeof(struct rds_header)) { 702 if (data_len < sizeof(struct rds_header)) {
703 rds_ib_conn_error(conn, "incoming message " 703 rds_ib_conn_error(conn, "incoming message "
704 "from %pI4 didn't inclue a " 704 "from %pI4 didn't inclue a "
705 "header, disconnecting and " 705 "header, disconnecting and "
706 "reconnecting\n", 706 "reconnecting\n",
707 &conn->c_faddr); 707 &conn->c_faddr);
708 return; 708 return;
709 } 709 }
710 data_len -= sizeof(struct rds_header); 710 data_len -= sizeof(struct rds_header);
711 711
712 ihdr = rds_ib_get_header(conn, recv, data_len); 712 ihdr = rds_ib_get_header(conn, recv, data_len);
713 713
714 /* Validate the checksum. */ 714 /* Validate the checksum. */
715 if (!rds_message_verify_checksum(ihdr)) { 715 if (!rds_message_verify_checksum(ihdr)) {
716 rds_ib_conn_error(conn, "incoming message " 716 rds_ib_conn_error(conn, "incoming message "
717 "from %pI4 has corrupted header - " 717 "from %pI4 has corrupted header - "
718 "forcing a reconnect\n", 718 "forcing a reconnect\n",
719 &conn->c_faddr); 719 &conn->c_faddr);
720 rds_stats_inc(s_recv_drop_bad_checksum); 720 rds_stats_inc(s_recv_drop_bad_checksum);
721 return; 721 return;
722 } 722 }
723 723
724 /* Process the ACK sequence which comes with every packet */ 724 /* Process the ACK sequence which comes with every packet */
725 state->ack_recv = be64_to_cpu(ihdr->h_ack); 725 state->ack_recv = be64_to_cpu(ihdr->h_ack);
726 state->ack_recv_valid = 1; 726 state->ack_recv_valid = 1;
727 727
728 /* Process the credits update if there was one */ 728 /* Process the credits update if there was one */
729 if (ihdr->h_credit) 729 if (ihdr->h_credit)
730 rds_ib_send_add_credits(conn, ihdr->h_credit); 730 rds_ib_send_add_credits(conn, ihdr->h_credit);
731 731
732 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { 732 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
733 /* This is an ACK-only packet. It gets special 733 /* This is an ACK-only packet. It gets special
734 * treatment here because, historically, ACKs 734 * treatment here because, historically, ACKs
735 * were rather special beasts. 735 * were rather special beasts.
736 */ 736 */
737 rds_ib_stats_inc(s_ib_ack_received); 737 rds_ib_stats_inc(s_ib_ack_received);
738 738
739 /* 739 /*
740 * Usually the frags make their way on to incs and are then freed as 740 * Usually the frags make their way on to incs and are then freed as
741 * the inc is freed. We don't go that route, so we have to drop the 741 * the inc is freed. We don't go that route, so we have to drop the
742 * page ref ourselves. We can't just leave the page on the recv 742 * page ref ourselves. We can't just leave the page on the recv
743 * because that confuses the dma mapping of pages and each recv's use 743 * because that confuses the dma mapping of pages and each recv's use
744 * of a partial page. We can leave the frag, though, it will be 744 * of a partial page. We can leave the frag, though, it will be
745 * reused. 745 * reused.
746 * 746 *
747 * FIXME: Fold this into the code path below. 747 * FIXME: Fold this into the code path below.
748 */ 748 */
749 rds_ib_frag_drop_page(recv->r_frag); 749 rds_ib_frag_drop_page(recv->r_frag);
750 return; 750 return;
751 } 751 }
752 752
753 /* 753 /*
754 * If we don't already have an inc on the connection then this 754 * If we don't already have an inc on the connection then this
755 * fragment has a header and starts a message; copy its header 755 * fragment has a header and starts a message; copy its header
756 * into the inc and save the inc so we can hang upcoming fragments 756 * into the inc and save the inc so we can hang upcoming fragments
757 * off its list. 757 * off its list.
758 */ 758 */
759 if (ibinc == NULL) { 759 if (ibinc == NULL) {
760 ibinc = recv->r_ibinc; 760 ibinc = recv->r_ibinc;
761 recv->r_ibinc = NULL; 761 recv->r_ibinc = NULL;
762 ic->i_ibinc = ibinc; 762 ic->i_ibinc = ibinc;
763 763
764 hdr = &ibinc->ii_inc.i_hdr; 764 hdr = &ibinc->ii_inc.i_hdr;
765 memcpy(hdr, ihdr, sizeof(*hdr)); 765 memcpy(hdr, ihdr, sizeof(*hdr));
766 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); 766 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
767 767
768 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, 768 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
769 ic->i_recv_data_rem, hdr->h_flags); 769 ic->i_recv_data_rem, hdr->h_flags);
770 } else { 770 } else {
771 hdr = &ibinc->ii_inc.i_hdr; 771 hdr = &ibinc->ii_inc.i_hdr;
772 /* We can't just use memcmp here; fragments of a 772 /* We can't just use memcmp here; fragments of a
773 * single message may carry different ACKs */ 773 * single message may carry different ACKs */
774 if (hdr->h_sequence != ihdr->h_sequence 774 if (hdr->h_sequence != ihdr->h_sequence
775 || hdr->h_len != ihdr->h_len 775 || hdr->h_len != ihdr->h_len
776 || hdr->h_sport != ihdr->h_sport 776 || hdr->h_sport != ihdr->h_sport
777 || hdr->h_dport != ihdr->h_dport) { 777 || hdr->h_dport != ihdr->h_dport) {
778 rds_ib_conn_error(conn, 778 rds_ib_conn_error(conn,
779 "fragment header mismatch; forcing reconnect\n"); 779 "fragment header mismatch; forcing reconnect\n");
780 return; 780 return;
781 } 781 }
782 } 782 }
783 783
784 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); 784 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
785 recv->r_frag = NULL; 785 recv->r_frag = NULL;
786 786
787 if (ic->i_recv_data_rem > RDS_FRAG_SIZE) 787 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
788 ic->i_recv_data_rem -= RDS_FRAG_SIZE; 788 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
789 else { 789 else {
790 ic->i_recv_data_rem = 0; 790 ic->i_recv_data_rem = 0;
791 ic->i_ibinc = NULL; 791 ic->i_ibinc = NULL;
792 792
793 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 793 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
794 rds_ib_cong_recv(conn, ibinc); 794 rds_ib_cong_recv(conn, ibinc);
795 else { 795 else {
796 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, 796 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
797 &ibinc->ii_inc, GFP_ATOMIC, 797 &ibinc->ii_inc, GFP_ATOMIC,
798 KM_SOFTIRQ0); 798 KM_SOFTIRQ0);
799 state->ack_next = be64_to_cpu(hdr->h_sequence); 799 state->ack_next = be64_to_cpu(hdr->h_sequence);
800 state->ack_next_valid = 1; 800 state->ack_next_valid = 1;
801 } 801 }
802 802
803 /* Evaluate the ACK_REQUIRED flag *after* we received 803 /* Evaluate the ACK_REQUIRED flag *after* we received
804 * the complete frame, and after bumping the next_rx 804 * the complete frame, and after bumping the next_rx
805 * sequence. */ 805 * sequence. */
806 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { 806 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
807 rds_stats_inc(s_recv_ack_required); 807 rds_stats_inc(s_recv_ack_required);
808 state->ack_required = 1; 808 state->ack_required = 1;
809 } 809 }
810 810
811 rds_inc_put(&ibinc->ii_inc); 811 rds_inc_put(&ibinc->ii_inc);
812 } 812 }
813 } 813 }
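
A message of h_len bytes arrives as a run of RDS_FRAG_SIZE fragments, and i_recv_data_rem above counts down until the last fragment completes the inc. A small standalone model of that bookkeeping (the frag_size value and helper name are illustrative, not taken from this diff):

#include <stdio.h>
#include <stdint.h>

/* Returns 1 when the fragment just received is the last one of the
 * message, mirroring the i_recv_data_rem countdown above. */
static int frag_received(uint32_t *data_rem, uint32_t frag_size)
{
        if (*data_rem > frag_size) {
                *data_rem -= frag_size;
                return 0;
        }
        *data_rem = 0;
        return 1;
}

int main(void)
{
        uint32_t rem = 10000;           /* h_len from the first fragment's header */
        const uint32_t frag_size = 4096;
        int frags = 1;

        while (!frag_received(&rem, frag_size))
                frags++;
        printf("message delivered after %d fragments\n", frags);   /* prints 3 */
        return 0;
}
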
814 814
815 /* 815 /*
816 * Plucking the oldest entry from the ring can be done concurrently with 816 * Plucking the oldest entry from the ring can be done concurrently with
817 * the thread refilling the ring. Each ring operation is protected by 817 * the thread refilling the ring. Each ring operation is protected by
818 * spinlocks and the transient state of refilling doesn't change the 818 * spinlocks and the transient state of refilling doesn't change the
819 * recording of which entry is oldest. 819 * recording of which entry is oldest.
820 * 820 *
821 * This relies on IB only calling one cq comp_handler for each cq so that 821 * This relies on IB only calling one cq comp_handler for each cq so that
822 * there will only be one caller of rds_recv_incoming() per RDS connection. 822 * there will only be one caller of rds_recv_incoming() per RDS connection.
823 */ 823 */
824 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) 824 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
825 { 825 {
826 struct rds_connection *conn = context; 826 struct rds_connection *conn = context;
827 struct rds_ib_connection *ic = conn->c_transport_data; 827 struct rds_ib_connection *ic = conn->c_transport_data;
828 struct ib_wc wc;
829 struct rds_ib_ack_state state = { 0, };
830 struct rds_ib_recv_work *recv;
831 828
832 rdsdebug("conn %p cq %p\n", conn, cq); 829 rdsdebug("conn %p cq %p\n", conn, cq);
833 830
834 rds_ib_stats_inc(s_ib_rx_cq_call); 831 rds_ib_stats_inc(s_ib_rx_cq_call);
835 832
836 ib_req_notify_cq(cq, IB_CQ_SOLICITED); 833 tasklet_schedule(&ic->i_recv_tasklet);
834 }
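
The completion handler now only schedules ic->i_recv_tasklet; the tasklet itself has to be set up once when the connection is allocated. That hunk lives in ib_cm.c (one of the other files in this commit) and is not shown here, but it presumably amounts to the usual tasklet_init() call, sketched below rather than quoted from the diff:

	/* in rds_ib_conn_alloc(), alongside the other per-connection
	 * initialisation -- a sketch, not the literal hunk */
	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
		     (unsigned long) ic);
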
837 835
838 while (ib_poll_cq(cq, 1, &wc) > 0) { 836 static inline void rds_poll_cq(struct rds_ib_connection *ic,
837 struct rds_ib_ack_state *state)
838 {
839 struct rds_connection *conn = ic->conn;
840 struct ib_wc wc;
841 struct rds_ib_recv_work *recv;
842
843 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
839 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 844 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
840 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 845 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
841 be32_to_cpu(wc.ex.imm_data)); 846 be32_to_cpu(wc.ex.imm_data));
842 rds_ib_stats_inc(s_ib_rx_cq_event); 847 rds_ib_stats_inc(s_ib_rx_cq_event);
843 848
844 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; 849 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
845 850
846 rds_ib_recv_unmap_page(ic, recv); 851 rds_ib_recv_unmap_page(ic, recv);
847 852
848 /* 853 /*
849 * Also process recvs in connecting state because it is possible 854 * Also process recvs in connecting state because it is possible
850 * to get a recv completion _before_ the rdmacm ESTABLISHED 855 * to get a recv completion _before_ the rdmacm ESTABLISHED
851 * event is processed. 856 * event is processed.
852 */ 857 */
853 if (rds_conn_up(conn) || rds_conn_connecting(conn)) { 858 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
854 /* We expect errors as the qp is drained during shutdown */ 859 /* We expect errors as the qp is drained during shutdown */
855 if (wc.status == IB_WC_SUCCESS) { 860 if (wc.status == IB_WC_SUCCESS) {
856 rds_ib_process_recv(conn, recv, wc.byte_len, &state); 861 rds_ib_process_recv(conn, recv, wc.byte_len, state);
857 } else { 862 } else {
858 rds_ib_conn_error(conn, "recv completion on " 863 rds_ib_conn_error(conn, "recv completion on "
859 "%pI4 had status %u, disconnecting and " 864 "%pI4 had status %u, disconnecting and "
860 "reconnecting\n", &conn->c_faddr, 865 "reconnecting\n", &conn->c_faddr,
861 wc.status); 866 wc.status);
862 } 867 }
863 } 868 }
864 869
865 rds_ib_ring_free(&ic->i_recv_ring, 1); 870 rds_ib_ring_free(&ic->i_recv_ring, 1);
866 } 871 }
872 }
873
874 void rds_ib_recv_tasklet_fn(unsigned long data)
875 {
876 struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
877 struct rds_connection *conn = ic->conn;
878 struct rds_ib_ack_state state = { 0, };
879
880 rds_poll_cq(ic, &state);
881 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
882 rds_poll_cq(ic, &state);
867 883
868 if (state.ack_next_valid) 884 if (state.ack_next_valid)
869 rds_ib_set_ack(ic, state.ack_next, state.ack_required); 885 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
870 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { 886 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
871 rds_send_drop_acked(conn, state.ack_recv, NULL); 887 rds_send_drop_acked(conn, state.ack_recv, NULL);
872 ic->i_ack_recv = state.ack_recv; 888 ic->i_ack_recv = state.ack_recv;
873 } 889 }
874 if (rds_conn_up(conn)) 890 if (rds_conn_up(conn))
875 rds_ib_attempt_ack(ic); 891 rds_ib_attempt_ack(ic);
876 892
877 /* If we ever end up with a really empty receive ring, we're 893 /* If we ever end up with a really empty receive ring, we're
878 * in deep trouble, as the sender will definitely see RNR 894 * in deep trouble, as the sender will definitely see RNR
879 * timeouts. */ 895 * timeouts. */
880 if (rds_ib_ring_empty(&ic->i_recv_ring)) 896 if (rds_ib_ring_empty(&ic->i_recv_ring))
881 rds_ib_stats_inc(s_ib_rx_ring_empty); 897 rds_ib_stats_inc(s_ib_rx_ring_empty);
882 898
883 /* 899 /*
884 * If the ring is running low, then schedule the thread to refill. 900 * If the ring is running low, then schedule the thread to refill.
885 */ 901 */
886 if (rds_ib_ring_low(&ic->i_recv_ring)) 902 if (rds_ib_ring_low(&ic->i_recv_ring))
887 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 903 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
888 } 904 }
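
The ordering in the tasklet matters: drain the CQ, re-arm the notification, then drain once more, so a completion that lands between the first drain and ib_req_notify_cq() cannot sit unnoticed until some later, unrelated interrupt. A toy single-threaded model of why the second pass is needed (everything here is made up for illustration):

#include <stdio.h>

static int queued;      /* completions waiting in the CQ          */
static int armed;       /* will the next completion raise an IRQ? */

static void drain(void)      { queued = 0; }
static void arm(void)        { armed = 1; }
static void completion(void) { queued++; if (armed) armed = 0; /* IRQ fires */ }

int main(void)
{
        completion();            /* work already queued when the IRQ fired   */
        drain();                 /* first rds_poll_cq()                      */
        completion();            /* arrives before we re-arm: no IRQ raised  */
        arm();                   /* ib_req_notify_cq(..., IB_CQ_SOLICITED)   */
        drain();                 /* second rds_poll_cq() picks it up         */
        printf("left behind: %d completions\n", queued);    /* prints 0      */
        return 0;
}
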
889 905
890 int rds_ib_recv(struct rds_connection *conn) 906 int rds_ib_recv(struct rds_connection *conn)
891 { 907 {
892 struct rds_ib_connection *ic = conn->c_transport_data; 908 struct rds_ib_connection *ic = conn->c_transport_data;
893 int ret = 0; 909 int ret = 0;
894 910
895 rdsdebug("conn %p\n", conn); 911 rdsdebug("conn %p\n", conn);
896 912
897 /* 913 /*
898 * If we get a temporary posting failure in this context then 914 * If we get a temporary posting failure in this context then
899 * we're really low and we want the caller to back off for a bit. 915 * we're really low and we want the caller to back off for a bit.
900 */ 916 */
901 mutex_lock(&ic->i_recv_mutex); 917 mutex_lock(&ic->i_recv_mutex);
902 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) 918 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
903 ret = -ENOMEM; 919 ret = -ENOMEM;
904 else 920 else
905 rds_ib_stats_inc(s_ib_rx_refill_from_thread); 921 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
906 mutex_unlock(&ic->i_recv_mutex); 922 mutex_unlock(&ic->i_recv_mutex);
907 923
908 if (rds_conn_up(conn)) 924 if (rds_conn_up(conn))
909 rds_ib_attempt_ack(ic); 925 rds_ib_attempt_ack(ic);
910 926
911 return ret; 927 return ret;
912 } 928 }
913 929
914 int __init rds_ib_recv_init(void) 930 int __init rds_ib_recv_init(void)
915 { 931 {
916 struct sysinfo si; 932 struct sysinfo si;
917 int ret = -ENOMEM; 933 int ret = -ENOMEM;
918 934
919 /* Default to 30% of all available RAM for recv memory */ 935 /* Default to 30% of all available RAM for recv memory */
920 si_meminfo(&si); 936 si_meminfo(&si);
921 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; 937 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
922 938
923 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", 939 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
924 sizeof(struct rds_ib_incoming), 940 sizeof(struct rds_ib_incoming),
925 0, 0, NULL); 941 0, 0, NULL);
926 if (rds_ib_incoming_slab == NULL) 942 if (rds_ib_incoming_slab == NULL)
927 goto out; 943 goto out;
928 944
929 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", 945 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
930 sizeof(struct rds_page_frag), 946 sizeof(struct rds_page_frag),
931 0, 0, NULL); 947 0, 0, NULL);
932 if (rds_ib_frag_slab == NULL) 948 if (rds_ib_frag_slab == NULL)
933 kmem_cache_destroy(rds_ib_incoming_slab); 949 kmem_cache_destroy(rds_ib_incoming_slab);
934 else 950 else
935 ret = 0; 951 ret = 0;
936 out: 952 out:
937 return ret; 953 return ret;
938 } 954 }
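
For the sizing above, note that si.totalram is a count of pages, so the sysctl ends up expressed as a number of RDS_FRAG_SIZE fragments covering roughly a third of RAM. A quick standalone check of the arithmetic, using example values (the 4 KiB figures are assumptions, not taken from this diff):

#include <stdio.h>

int main(void)
{
        unsigned long totalram  = 1UL << 20;   /* pages: 1M x 4 KiB = 4 GiB  */
        unsigned long page_size = 4096;
        unsigned long frag_size = 4096;        /* assumed equal to PAGE_SIZE */

        unsigned long max_recv = totalram / 3 * page_size / frag_size;
        printf("max recv allocation: %lu fragments (~%lu MiB)\n",
               max_recv, max_recv * frag_size >> 20);
        return 0;
}
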
939 955
940 void rds_ib_recv_exit(void) 956 void rds_ib_recv_exit(void)
941 { 957 {
942 kmem_cache_destroy(rds_ib_incoming_slab); 958 kmem_cache_destroy(rds_ib_incoming_slab);
1 #ifndef _RDS_IW_H 1 #ifndef _RDS_IW_H
2 #define _RDS_IW_H 2 #define _RDS_IW_H
3 3
4 #include <rdma/ib_verbs.h> 4 #include <rdma/ib_verbs.h>
5 #include <rdma/rdma_cm.h> 5 #include <rdma/rdma_cm.h>
6 #include "rds.h" 6 #include "rds.h"
7 #include "rdma_transport.h" 7 #include "rdma_transport.h"
8 8
9 #define RDS_FASTREG_SIZE 20 9 #define RDS_FASTREG_SIZE 20
10 #define RDS_FASTREG_POOL_SIZE 2048 10 #define RDS_FASTREG_POOL_SIZE 2048
11 11
12 #define RDS_IW_MAX_SGE 8 12 #define RDS_IW_MAX_SGE 8
13 #define RDS_IW_RECV_SGE 2 13 #define RDS_IW_RECV_SGE 2
14 14
15 #define RDS_IW_DEFAULT_RECV_WR 1024 15 #define RDS_IW_DEFAULT_RECV_WR 1024
16 #define RDS_IW_DEFAULT_SEND_WR 256 16 #define RDS_IW_DEFAULT_SEND_WR 256
17 17
18 #define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 18 #define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19 19
20 extern struct list_head rds_iw_devices; 20 extern struct list_head rds_iw_devices;
21 21
22 /* 22 /*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to 23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try and minimize the amount of memory tied up in both the device and 24 * try and minimize the amount of memory tied up in both the device and
25 * socket receive queues. 25 * socket receive queues.
26 */ 26 */
27 /* page offset of the final full frag that fits in the page */ 27 /* page offset of the final full frag that fits in the page */
28 #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) 28 #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
29 struct rds_page_frag { 29 struct rds_page_frag {
30 struct list_head f_item; 30 struct list_head f_item;
31 struct page *f_page; 31 struct page *f_page;
32 unsigned long f_offset; 32 unsigned long f_offset;
33 dma_addr_t f_mapped; 33 dma_addr_t f_mapped;
34 }; 34 };
35 35
36 struct rds_iw_incoming { 36 struct rds_iw_incoming {
37 struct list_head ii_frags; 37 struct list_head ii_frags;
38 struct rds_incoming ii_inc; 38 struct rds_incoming ii_inc;
39 }; 39 };
40 40
41 struct rds_iw_connect_private { 41 struct rds_iw_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */ 42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr; 43 __be32 dp_saddr;
44 __be32 dp_daddr; 44 __be32 dp_daddr;
45 u8 dp_protocol_major; 45 u8 dp_protocol_major;
46 u8 dp_protocol_minor; 46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */ 47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1; 48 __be32 dp_reserved1;
49 __be64 dp_ack_seq; 49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */ 50 __be32 dp_credit; /* non-zero enables flow ctl */
51 }; 51 };
52 52
53 struct rds_iw_scatterlist { 53 struct rds_iw_scatterlist {
54 struct scatterlist *list; 54 struct scatterlist *list;
55 unsigned int len; 55 unsigned int len;
56 int dma_len; 56 int dma_len;
57 unsigned int dma_npages; 57 unsigned int dma_npages;
58 unsigned int bytes; 58 unsigned int bytes;
59 }; 59 };
60 60
61 struct rds_iw_mapping { 61 struct rds_iw_mapping {
62 spinlock_t m_lock; /* protect the mapping struct */ 62 spinlock_t m_lock; /* protect the mapping struct */
63 struct list_head m_list; 63 struct list_head m_list;
64 struct rds_iw_mr *m_mr; 64 struct rds_iw_mr *m_mr;
65 uint32_t m_rkey; 65 uint32_t m_rkey;
66 struct rds_iw_scatterlist m_sg; 66 struct rds_iw_scatterlist m_sg;
67 }; 67 };
68 68
69 struct rds_iw_send_work { 69 struct rds_iw_send_work {
70 struct rds_message *s_rm; 70 struct rds_message *s_rm;
71 71
72 /* We should really put these into a union: */ 72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op; 73 struct rds_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping; 74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr; 75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list; 76 struct ib_fast_reg_page_list *s_page_list;
77 unsigned char s_remap_count; 77 unsigned char s_remap_count;
78 78
79 struct ib_send_wr s_wr; 79 struct ib_send_wr s_wr;
80 struct ib_sge s_sge[RDS_IW_MAX_SGE]; 80 struct ib_sge s_sge[RDS_IW_MAX_SGE];
81 unsigned long s_queued; 81 unsigned long s_queued;
82 }; 82 };
83 83
84 struct rds_iw_recv_work { 84 struct rds_iw_recv_work {
85 struct rds_iw_incoming *r_iwinc; 85 struct rds_iw_incoming *r_iwinc;
86 struct rds_page_frag *r_frag; 86 struct rds_page_frag *r_frag;
87 struct ib_recv_wr r_wr; 87 struct ib_recv_wr r_wr;
88 struct ib_sge r_sge[2]; 88 struct ib_sge r_sge[2];
89 }; 89 };
90 90
91 struct rds_iw_work_ring { 91 struct rds_iw_work_ring {
92 u32 w_nr; 92 u32 w_nr;
93 u32 w_alloc_ptr; 93 u32 w_alloc_ptr;
94 u32 w_alloc_ctr; 94 u32 w_alloc_ctr;
95 u32 w_free_ptr; 95 u32 w_free_ptr;
96 atomic_t w_free_ctr; 96 atomic_t w_free_ctr;
97 }; 97 };
98 98
99 struct rds_iw_device; 99 struct rds_iw_device;
100 100
101 struct rds_iw_connection { 101 struct rds_iw_connection {
102 102
103 struct list_head iw_node; 103 struct list_head iw_node;
104 struct rds_iw_device *rds_iwdev; 104 struct rds_iw_device *rds_iwdev;
105 struct rds_connection *conn; 105 struct rds_connection *conn;
106 106
107 /* alphabet soup, IBTA style */ 107 /* alphabet soup, IBTA style */
108 struct rdma_cm_id *i_cm_id; 108 struct rdma_cm_id *i_cm_id;
109 struct ib_pd *i_pd; 109 struct ib_pd *i_pd;
110 struct ib_mr *i_mr; 110 struct ib_mr *i_mr;
111 struct ib_cq *i_send_cq; 111 struct ib_cq *i_send_cq;
112 struct ib_cq *i_recv_cq; 112 struct ib_cq *i_recv_cq;
113 113
114 /* tx */ 114 /* tx */
115 struct rds_iw_work_ring i_send_ring; 115 struct rds_iw_work_ring i_send_ring;
116 struct rds_message *i_rm; 116 struct rds_message *i_rm;
117 struct rds_header *i_send_hdrs; 117 struct rds_header *i_send_hdrs;
118 u64 i_send_hdrs_dma; 118 u64 i_send_hdrs_dma;
119 struct rds_iw_send_work *i_sends; 119 struct rds_iw_send_work *i_sends;
120 120
121 /* rx */ 121 /* rx */
122 struct tasklet_struct i_recv_tasklet;
122 struct mutex i_recv_mutex; 123 struct mutex i_recv_mutex;
123 struct rds_iw_work_ring i_recv_ring; 124 struct rds_iw_work_ring i_recv_ring;
124 struct rds_iw_incoming *i_iwinc; 125 struct rds_iw_incoming *i_iwinc;
125 u32 i_recv_data_rem; 126 u32 i_recv_data_rem;
126 struct rds_header *i_recv_hdrs; 127 struct rds_header *i_recv_hdrs;
127 u64 i_recv_hdrs_dma; 128 u64 i_recv_hdrs_dma;
128 struct rds_iw_recv_work *i_recvs; 129 struct rds_iw_recv_work *i_recvs;
129 struct rds_page_frag i_frag; 130 struct rds_page_frag i_frag;
130 u64 i_ack_recv; /* last ACK received */ 131 u64 i_ack_recv; /* last ACK received */
131 132
132 /* sending acks */ 133 /* sending acks */
133 unsigned long i_ack_flags; 134 unsigned long i_ack_flags;
134 #ifdef KERNEL_HAS_ATOMIC64 135 #ifdef KERNEL_HAS_ATOMIC64
135 atomic64_t i_ack_next; /* next ACK to send */ 136 atomic64_t i_ack_next; /* next ACK to send */
136 #else 137 #else
137 spinlock_t i_ack_lock; /* protect i_ack_next */ 138 spinlock_t i_ack_lock; /* protect i_ack_next */
138 u64 i_ack_next; /* next ACK to send */ 139 u64 i_ack_next; /* next ACK to send */
139 #endif 140 #endif
140 struct rds_header *i_ack; 141 struct rds_header *i_ack;
141 struct ib_send_wr i_ack_wr; 142 struct ib_send_wr i_ack_wr;
142 struct ib_sge i_ack_sge; 143 struct ib_sge i_ack_sge;
143 u64 i_ack_dma; 144 u64 i_ack_dma;
144 unsigned long i_ack_queued; 145 unsigned long i_ack_queued;
145 146
146 /* Flow control related information 147 /* Flow control related information
147 * 148 *
148 * Our algorithm uses a pair of variables that we need to access 149 * Our algorithm uses a pair of variables that we need to access
149 * atomically - one for the send credits, and one for the posted 150 * atomically - one for the send credits, and one for the posted
150 * recv credits we need to transfer to the remote. 151 * recv credits we need to transfer to the remote.
151 * Rather than protect them using a slow spinlock, we put both into 152 * Rather than protect them using a slow spinlock, we put both into
152 * a single atomic_t and update it using cmpxchg 153 * a single atomic_t and update it using cmpxchg
153 */ 154 */
154 atomic_t i_credits; 155 atomic_t i_credits;
155 156
156 /* Protocol version specific information */ 157 /* Protocol version specific information */
157 unsigned int i_flowctl:1; /* enable/disable flow ctl */ 158 unsigned int i_flowctl:1; /* enable/disable flow ctl */
158 unsigned int i_dma_local_lkey:1; 159 unsigned int i_dma_local_lkey:1;
159 unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ 160 unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
160 /* Batched completions */ 161 /* Batched completions */
161 unsigned int i_unsignaled_wrs; 162 unsigned int i_unsignaled_wrs;
162 long i_unsignaled_bytes; 163 long i_unsignaled_bytes;
163 }; 164 };
164 165
165 /* This assumes that atomic_t is at least 32 bits */ 166 /* This assumes that atomic_t is at least 32 bits */
166 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) 167 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
167 #define IB_GET_POST_CREDITS(v) ((v) >> 16) 168 #define IB_GET_POST_CREDITS(v) ((v) >> 16)
168 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) 169 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
169 #define IB_SET_POST_CREDITS(v) ((v) << 16) 170 #define IB_SET_POST_CREDITS(v) ((v) << 16)
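
The four macros above pack the send-credit count into the low 16 bits and the posted-credit count into the high 16 bits of the single atomic_t, so both can be updated in one cmpxchg. A standalone check of the packing (plain C, outside the kernel):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        /* IB_SET_SEND_CREDITS(16) | IB_SET_POST_CREDITS(4), expanded by hand */
        unsigned int v = (16 & 0xffff) | (4 << 16);

        assert((v & 0xffff) == 16);   /* IB_GET_SEND_CREDITS(v) */
        assert((v >> 16)    == 4);    /* IB_GET_POST_CREDITS(v) */
        printf("packed credits: 0x%08x\n", v);
        return 0;
}
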
170 171
171 struct rds_iw_cm_id { 172 struct rds_iw_cm_id {
172 struct list_head list; 173 struct list_head list;
173 struct rdma_cm_id *cm_id; 174 struct rdma_cm_id *cm_id;
174 }; 175 };
175 176
176 struct rds_iw_device { 177 struct rds_iw_device {
177 struct list_head list; 178 struct list_head list;
178 struct list_head cm_id_list; 179 struct list_head cm_id_list;
179 struct list_head conn_list; 180 struct list_head conn_list;
180 struct ib_device *dev; 181 struct ib_device *dev;
181 struct ib_pd *pd; 182 struct ib_pd *pd;
182 struct ib_mr *mr; 183 struct ib_mr *mr;
183 struct rds_iw_mr_pool *mr_pool; 184 struct rds_iw_mr_pool *mr_pool;
184 int max_sge; 185 int max_sge;
185 unsigned int max_wrs; 186 unsigned int max_wrs;
186 unsigned int dma_local_lkey:1; 187 unsigned int dma_local_lkey:1;
187 spinlock_t spinlock; /* protect the above */ 188 spinlock_t spinlock; /* protect the above */
188 }; 189 };
189 190
190 /* bits for i_ack_flags */ 191 /* bits for i_ack_flags */
191 #define IB_ACK_IN_FLIGHT 0 192 #define IB_ACK_IN_FLIGHT 0
192 #define IB_ACK_REQUESTED 1 193 #define IB_ACK_REQUESTED 1
193 194
194 /* Magic WR_ID for ACKs */ 195 /* Magic WR_ID for ACKs */
195 #define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) 196 #define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
196 #define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) 197 #define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
197 #define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) 198 #define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
198 199
199 struct rds_iw_statistics { 200 struct rds_iw_statistics {
200 uint64_t s_iw_connect_raced; 201 uint64_t s_iw_connect_raced;
201 uint64_t s_iw_listen_closed_stale; 202 uint64_t s_iw_listen_closed_stale;
202 uint64_t s_iw_tx_cq_call; 203 uint64_t s_iw_tx_cq_call;
203 uint64_t s_iw_tx_cq_event; 204 uint64_t s_iw_tx_cq_event;
204 uint64_t s_iw_tx_ring_full; 205 uint64_t s_iw_tx_ring_full;
205 uint64_t s_iw_tx_throttle; 206 uint64_t s_iw_tx_throttle;
206 uint64_t s_iw_tx_sg_mapping_failure; 207 uint64_t s_iw_tx_sg_mapping_failure;
207 uint64_t s_iw_tx_stalled; 208 uint64_t s_iw_tx_stalled;
208 uint64_t s_iw_tx_credit_updates; 209 uint64_t s_iw_tx_credit_updates;
209 uint64_t s_iw_rx_cq_call; 210 uint64_t s_iw_rx_cq_call;
210 uint64_t s_iw_rx_cq_event; 211 uint64_t s_iw_rx_cq_event;
211 uint64_t s_iw_rx_ring_empty; 212 uint64_t s_iw_rx_ring_empty;
212 uint64_t s_iw_rx_refill_from_cq; 213 uint64_t s_iw_rx_refill_from_cq;
213 uint64_t s_iw_rx_refill_from_thread; 214 uint64_t s_iw_rx_refill_from_thread;
214 uint64_t s_iw_rx_alloc_limit; 215 uint64_t s_iw_rx_alloc_limit;
215 uint64_t s_iw_rx_credit_updates; 216 uint64_t s_iw_rx_credit_updates;
216 uint64_t s_iw_ack_sent; 217 uint64_t s_iw_ack_sent;
217 uint64_t s_iw_ack_send_failure; 218 uint64_t s_iw_ack_send_failure;
218 uint64_t s_iw_ack_send_delayed; 219 uint64_t s_iw_ack_send_delayed;
219 uint64_t s_iw_ack_send_piggybacked; 220 uint64_t s_iw_ack_send_piggybacked;
220 uint64_t s_iw_ack_received; 221 uint64_t s_iw_ack_received;
221 uint64_t s_iw_rdma_mr_alloc; 222 uint64_t s_iw_rdma_mr_alloc;
222 uint64_t s_iw_rdma_mr_free; 223 uint64_t s_iw_rdma_mr_free;
223 uint64_t s_iw_rdma_mr_used; 224 uint64_t s_iw_rdma_mr_used;
224 uint64_t s_iw_rdma_mr_pool_flush; 225 uint64_t s_iw_rdma_mr_pool_flush;
225 uint64_t s_iw_rdma_mr_pool_wait; 226 uint64_t s_iw_rdma_mr_pool_wait;
226 uint64_t s_iw_rdma_mr_pool_depleted; 227 uint64_t s_iw_rdma_mr_pool_depleted;
227 }; 228 };
228 229
229 extern struct workqueue_struct *rds_iw_wq; 230 extern struct workqueue_struct *rds_iw_wq;
230 231
231 /* 232 /*
232 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h 233 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
233 * doesn't define it. 234 * doesn't define it.
234 */ 235 */
235 static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, 236 static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
236 struct scatterlist *sg, unsigned int sg_dma_len, int direction) 237 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
237 { 238 {
238 unsigned int i; 239 unsigned int i;
239 240
240 for (i = 0; i < sg_dma_len; ++i) { 241 for (i = 0; i < sg_dma_len; ++i) {
241 ib_dma_sync_single_for_cpu(dev, 242 ib_dma_sync_single_for_cpu(dev,
242 ib_sg_dma_address(dev, &sg[i]), 243 ib_sg_dma_address(dev, &sg[i]),
243 ib_sg_dma_len(dev, &sg[i]), 244 ib_sg_dma_len(dev, &sg[i]),
244 direction); 245 direction);
245 } 246 }
246 } 247 }
247 #define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu 248 #define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
248 249
249 static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, 250 static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
250 struct scatterlist *sg, unsigned int sg_dma_len, int direction) 251 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
251 { 252 {
252 unsigned int i; 253 unsigned int i;
253 254
254 for (i = 0; i < sg_dma_len; ++i) { 255 for (i = 0; i < sg_dma_len; ++i) {
255 ib_dma_sync_single_for_device(dev, 256 ib_dma_sync_single_for_device(dev,
256 ib_sg_dma_address(dev, &sg[i]), 257 ib_sg_dma_address(dev, &sg[i]),
257 ib_sg_dma_len(dev, &sg[i]), 258 ib_sg_dma_len(dev, &sg[i]),
258 direction); 259 direction);
259 } 260 }
260 } 261 }
261 #define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device 262 #define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
262 263
263 static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) 264 static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
264 { 265 {
265 return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; 266 return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
266 } 267 }
267 268
268 /* ib.c */ 269 /* ib.c */
269 extern struct rds_transport rds_iw_transport; 270 extern struct rds_transport rds_iw_transport;
270 extern void rds_iw_add_one(struct ib_device *device); 271 extern void rds_iw_add_one(struct ib_device *device);
271 extern void rds_iw_remove_one(struct ib_device *device); 272 extern void rds_iw_remove_one(struct ib_device *device);
272 extern struct ib_client rds_iw_client; 273 extern struct ib_client rds_iw_client;
273 274
274 extern unsigned int fastreg_pool_size; 275 extern unsigned int fastreg_pool_size;
275 extern unsigned int fastreg_message_size; 276 extern unsigned int fastreg_message_size;
276 277
277 extern spinlock_t iw_nodev_conns_lock; 278 extern spinlock_t iw_nodev_conns_lock;
278 extern struct list_head iw_nodev_conns; 279 extern struct list_head iw_nodev_conns;
279 280
280 /* ib_cm.c */ 281 /* ib_cm.c */
281 int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); 282 int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
282 void rds_iw_conn_free(void *arg); 283 void rds_iw_conn_free(void *arg);
283 int rds_iw_conn_connect(struct rds_connection *conn); 284 int rds_iw_conn_connect(struct rds_connection *conn);
284 void rds_iw_conn_shutdown(struct rds_connection *conn); 285 void rds_iw_conn_shutdown(struct rds_connection *conn);
285 void rds_iw_state_change(struct sock *sk); 286 void rds_iw_state_change(struct sock *sk);
286 int __init rds_iw_listen_init(void); 287 int __init rds_iw_listen_init(void);
287 void rds_iw_listen_stop(void); 288 void rds_iw_listen_stop(void);
288 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); 289 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
289 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, 290 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
290 struct rdma_cm_event *event); 291 struct rdma_cm_event *event);
291 int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); 292 int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
292 void rds_iw_cm_connect_complete(struct rds_connection *conn, 293 void rds_iw_cm_connect_complete(struct rds_connection *conn,
293 struct rdma_cm_event *event); 294 struct rdma_cm_event *event);
294 295
295 296
296 #define rds_iw_conn_error(conn, fmt...) \ 297 #define rds_iw_conn_error(conn, fmt...) \
297 __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) 298 __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
298 299
299 /* ib_rdma.c */ 300 /* ib_rdma.c */
300 int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); 301 int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
301 void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); 302 void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
302 void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); 303 void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
303 void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); 304 void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
304 static inline void rds_iw_destroy_nodev_conns(void) 305 static inline void rds_iw_destroy_nodev_conns(void)
305 { 306 {
306 __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); 307 __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
307 } 308 }
308 static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) 309 static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
309 { 310 {
310 __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); 311 __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
311 } 312 }
312 struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); 313 struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
313 void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); 314 void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
314 void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); 315 void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
315 void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, 316 void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
316 struct rds_sock *rs, u32 *key_ret); 317 struct rds_sock *rs, u32 *key_ret);
317 void rds_iw_sync_mr(void *trans_private, int dir); 318 void rds_iw_sync_mr(void *trans_private, int dir);
318 void rds_iw_free_mr(void *trans_private, int invalidate); 319 void rds_iw_free_mr(void *trans_private, int invalidate);
319 void rds_iw_flush_mrs(void); 320 void rds_iw_flush_mrs(void);
320 void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); 321 void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
321 322
322 /* ib_recv.c */ 323 /* ib_recv.c */
323 int __init rds_iw_recv_init(void); 324 int __init rds_iw_recv_init(void);
324 void rds_iw_recv_exit(void); 325 void rds_iw_recv_exit(void);
325 int rds_iw_recv(struct rds_connection *conn); 326 int rds_iw_recv(struct rds_connection *conn);
326 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 327 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
327 gfp_t page_gfp, int prefill); 328 gfp_t page_gfp, int prefill);
328 void rds_iw_inc_purge(struct rds_incoming *inc); 329 void rds_iw_inc_purge(struct rds_incoming *inc);
329 void rds_iw_inc_free(struct rds_incoming *inc); 330 void rds_iw_inc_free(struct rds_incoming *inc);
330 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 331 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
331 size_t size); 332 size_t size);
332 void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); 333 void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
334 void rds_iw_recv_tasklet_fn(unsigned long data);
333 void rds_iw_recv_init_ring(struct rds_iw_connection *ic); 335 void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
334 void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); 336 void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
335 void rds_iw_recv_init_ack(struct rds_iw_connection *ic); 337 void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
336 void rds_iw_attempt_ack(struct rds_iw_connection *ic); 338 void rds_iw_attempt_ack(struct rds_iw_connection *ic);
337 void rds_iw_ack_send_complete(struct rds_iw_connection *ic); 339 void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
338 u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); 340 u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
339 341
340 /* ib_ring.c */ 342 /* ib_ring.c */
341 void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); 343 void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
342 void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); 344 void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
343 u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); 345 u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
344 void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); 346 void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
345 void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); 347 void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
346 int rds_iw_ring_empty(struct rds_iw_work_ring *ring); 348 int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
347 int rds_iw_ring_low(struct rds_iw_work_ring *ring); 349 int rds_iw_ring_low(struct rds_iw_work_ring *ring);
348 u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); 350 u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
349 u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); 351 u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
350 extern wait_queue_head_t rds_iw_ring_empty_wait; 352 extern wait_queue_head_t rds_iw_ring_empty_wait;
351 353
352 /* ib_send.c */ 354 /* ib_send.c */
353 void rds_iw_xmit_complete(struct rds_connection *conn); 355 void rds_iw_xmit_complete(struct rds_connection *conn);
354 int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, 356 int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
355 unsigned int hdr_off, unsigned int sg, unsigned int off); 357 unsigned int hdr_off, unsigned int sg, unsigned int off);
356 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); 358 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
357 void rds_iw_send_init_ring(struct rds_iw_connection *ic); 359 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
358 void rds_iw_send_clear_ring(struct rds_iw_connection *ic); 360 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
359 int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 361 int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
360 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); 362 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
361 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); 363 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
362 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, 364 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
363 u32 *adv_credits, int need_posted, int max_posted); 365 u32 *adv_credits, int need_posted, int max_posted);
364 366
365 /* ib_stats.c */ 367 /* ib_stats.c */
366 DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); 368 DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
367 #define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) 369 #define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
368 unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, 370 unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
369 unsigned int avail); 371 unsigned int avail);
370 372
371 /* ib_sysctl.c */ 373 /* ib_sysctl.c */
372 int __init rds_iw_sysctl_init(void); 374 int __init rds_iw_sysctl_init(void);
373 void rds_iw_sysctl_exit(void); 375 void rds_iw_sysctl_exit(void);
374 extern unsigned long rds_iw_sysctl_max_send_wr; 376 extern unsigned long rds_iw_sysctl_max_send_wr;
375 extern unsigned long rds_iw_sysctl_max_recv_wr; 377 extern unsigned long rds_iw_sysctl_max_recv_wr;
376 extern unsigned long rds_iw_sysctl_max_unsig_wrs; 378 extern unsigned long rds_iw_sysctl_max_unsig_wrs;
377 extern unsigned long rds_iw_sysctl_max_unsig_bytes; 379 extern unsigned long rds_iw_sysctl_max_unsig_bytes;
378 extern unsigned long rds_iw_sysctl_max_recv_allocation; 380 extern unsigned long rds_iw_sysctl_max_recv_allocation;
379 extern unsigned int rds_iw_sysctl_flow_control; 381 extern unsigned int rds_iw_sysctl_flow_control;
380 extern ctl_table rds_iw_sysctl_table[]; 382 extern ctl_table rds_iw_sysctl_table[];
381 383
382 /* 384 /*
383 * Helper functions for getting/setting the header and data SGEs in 385 * Helper functions for getting/setting the header and data SGEs in
384 * RDS packets (not RDMA) 386 * RDS packets (not RDMA)
385 */ 387 */
386 static inline struct ib_sge * 388 static inline struct ib_sge *
387 rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) 389 rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
388 { 390 {
389 return &sge[0]; 391 return &sge[0];
390 } 392 }
391 393
392 static inline struct ib_sge * 394 static inline struct ib_sge *
393 rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) 395 rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
394 { 396 {
395 return &sge[1]; 397 return &sge[1];
396 } 398 }
397 399
398 #endif 400 #endif
399 401
1 /* 1 /*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006 Oracle. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file 6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the 7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below: 8 * OpenIB.org BSD license below:
9 * 9 *
10 * Redistribution and use in source and binary forms, with or 10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following 11 * without modification, are permitted provided that the following
12 * conditions are met: 12 * conditions are met:
13 * 13 *
14 * - Redistributions of source code must retain the above 14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following 15 * copyright notice, this list of conditions and the following
16 * disclaimer. 16 * disclaimer.
17 * 17 *
18 * - Redistributions in binary form must reproduce the above 18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following 19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials 20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution. 21 * provided with the distribution.
22 * 22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE. 30 * SOFTWARE.
31 * 31 *
32 */ 32 */
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/in.h> 34 #include <linux/in.h>
35 #include <linux/vmalloc.h> 35 #include <linux/vmalloc.h>
36 36
37 #include "rds.h" 37 #include "rds.h"
38 #include "iw.h" 38 #include "iw.h"
39 39
40 /* 40 /*
41 * Set the selected protocol version 41 * Set the selected protocol version
42 */ 42 */
43 static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) 43 static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
44 { 44 {
45 conn->c_version = version; 45 conn->c_version = version;
46 } 46 }
47 47
48 /* 48 /*
49 * Set up flow control 49 * Set up flow control
50 */ 50 */
51 static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) 51 static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
52 { 52 {
53 struct rds_iw_connection *ic = conn->c_transport_data; 53 struct rds_iw_connection *ic = conn->c_transport_data;
54 54
55 if (rds_iw_sysctl_flow_control && credits != 0) { 55 if (rds_iw_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */ 56 /* We're doing flow control */
57 ic->i_flowctl = 1; 57 ic->i_flowctl = 1;
58 rds_iw_send_add_credits(conn, credits); 58 rds_iw_send_add_credits(conn, credits);
59 } else { 59 } else {
60 ic->i_flowctl = 0; 60 ic->i_flowctl = 0;
61 } 61 }
62 } 62 }
63 63
64 /* 64 /*
65 * Connection established. 65 * Connection established.
66 * We get here for both outgoing and incoming connections. 66 * We get here for both outgoing and incoming connections.
67 */ 67 */
68 void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) 68 void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
69 { 69 {
70 const struct rds_iw_connect_private *dp = NULL; 70 const struct rds_iw_connect_private *dp = NULL;
71 struct rds_iw_connection *ic = conn->c_transport_data; 71 struct rds_iw_connection *ic = conn->c_transport_data;
72 struct rds_iw_device *rds_iwdev; 72 struct rds_iw_device *rds_iwdev;
73 int err; 73 int err;
74 74
75 if (event->param.conn.private_data_len) { 75 if (event->param.conn.private_data_len) {
76 dp = event->param.conn.private_data; 76 dp = event->param.conn.private_data;
77 77
78 rds_iw_set_protocol(conn, 78 rds_iw_set_protocol(conn,
79 RDS_PROTOCOL(dp->dp_protocol_major, 79 RDS_PROTOCOL(dp->dp_protocol_major,
80 dp->dp_protocol_minor)); 80 dp->dp_protocol_minor));
81 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 81 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
82 } 82 }
83 83
84 /* update ib_device with this local ipaddr & conn */ 84 /* update ib_device with this local ipaddr & conn */
85 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); 85 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
86 err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); 86 err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
87 if (err) 87 if (err)
88 printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); 88 printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
89 rds_iw_add_conn(rds_iwdev, conn); 89 rds_iw_add_conn(rds_iwdev, conn);
90 90
91 /* If the peer gave us the last packet it saw, process this as if 91 /* If the peer gave us the last packet it saw, process this as if
92 * we had received a regular ACK. */ 92 * we had received a regular ACK. */
93 if (dp && dp->dp_ack_seq) 93 if (dp && dp->dp_ack_seq)
94 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 94 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
95 95
96 printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", 96 printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
97 &conn->c_laddr, &conn->c_faddr, 97 &conn->c_laddr, &conn->c_faddr,
98 RDS_PROTOCOL_MAJOR(conn->c_version), 98 RDS_PROTOCOL_MAJOR(conn->c_version),
99 RDS_PROTOCOL_MINOR(conn->c_version), 99 RDS_PROTOCOL_MINOR(conn->c_version),
100 ic->i_flowctl ? ", flow control" : ""); 100 ic->i_flowctl ? ", flow control" : "");
101 101
102 rds_connect_complete(conn); 102 rds_connect_complete(conn);
103 } 103 }
104 104
105 static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, 105 static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
106 struct rdma_conn_param *conn_param, 106 struct rdma_conn_param *conn_param,
107 struct rds_iw_connect_private *dp, 107 struct rds_iw_connect_private *dp,
108 u32 protocol_version) 108 u32 protocol_version)
109 { 109 {
110 struct rds_iw_connection *ic = conn->c_transport_data; 110 struct rds_iw_connection *ic = conn->c_transport_data;
111 111
112 memset(conn_param, 0, sizeof(struct rdma_conn_param)); 112 memset(conn_param, 0, sizeof(struct rdma_conn_param));
113 /* XXX tune these? */ 113 /* XXX tune these? */
114 conn_param->responder_resources = 1; 114 conn_param->responder_resources = 1;
115 conn_param->initiator_depth = 1; 115 conn_param->initiator_depth = 1;
116 116
117 if (dp) { 117 if (dp) {
118 memset(dp, 0, sizeof(*dp)); 118 memset(dp, 0, sizeof(*dp));
119 dp->dp_saddr = conn->c_laddr; 119 dp->dp_saddr = conn->c_laddr;
120 dp->dp_daddr = conn->c_faddr; 120 dp->dp_daddr = conn->c_faddr;
121 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 121 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
122 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 122 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
123 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); 123 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
124 dp->dp_ack_seq = rds_iw_piggyb_ack(ic); 124 dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
125 125
126 /* Advertise flow control */ 126 /* Advertise flow control */
127 if (ic->i_flowctl) { 127 if (ic->i_flowctl) {
128 unsigned int credits; 128 unsigned int credits;
129 129
130 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); 130 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
131 dp->dp_credit = cpu_to_be32(credits); 131 dp->dp_credit = cpu_to_be32(credits);
132 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); 132 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
133 } 133 }
134 134
135 conn_param->private_data = dp; 135 conn_param->private_data = dp;
136 conn_param->private_data_len = sizeof(*dp); 136 conn_param->private_data_len = sizeof(*dp);
137 } 137 }
138 } 138 }
139 139
140 static void rds_iw_cq_event_handler(struct ib_event *event, void *data) 140 static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
141 { 141 {
142 rdsdebug("event %u data %p\n", event->event, data); 142 rdsdebug("event %u data %p\n", event->event, data);
143 } 143 }
144 144
145 static void rds_iw_qp_event_handler(struct ib_event *event, void *data) 145 static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
146 { 146 {
147 struct rds_connection *conn = data; 147 struct rds_connection *conn = data;
148 struct rds_iw_connection *ic = conn->c_transport_data; 148 struct rds_iw_connection *ic = conn->c_transport_data;
149 149
150 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); 150 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
151 151
152 switch (event->event) { 152 switch (event->event) {
153 case IB_EVENT_COMM_EST: 153 case IB_EVENT_COMM_EST:
154 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 154 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
155 break; 155 break;
156 case IB_EVENT_QP_REQ_ERR: 156 case IB_EVENT_QP_REQ_ERR:
157 case IB_EVENT_QP_FATAL: 157 case IB_EVENT_QP_FATAL:
158 default: 158 default:
159 rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", 159 rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
160 event->event, &conn->c_laddr, 160 event->event, &conn->c_laddr,
161 &conn->c_faddr); 161 &conn->c_faddr);
162 break; 162 break;
163 } 163 }
164 } 164 }
165 165
166 /* 166 /*
167 * Create a QP 167 * Create a QP
168 */ 168 */
169 static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, 169 static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
170 struct rds_iw_device *rds_iwdev, 170 struct rds_iw_device *rds_iwdev,
171 struct rds_iw_work_ring *send_ring, 171 struct rds_iw_work_ring *send_ring,
172 void (*send_cq_handler)(struct ib_cq *, void *), 172 void (*send_cq_handler)(struct ib_cq *, void *),
173 struct rds_iw_work_ring *recv_ring, 173 struct rds_iw_work_ring *recv_ring,
174 void (*recv_cq_handler)(struct ib_cq *, void *), 174 void (*recv_cq_handler)(struct ib_cq *, void *),
175 void *context) 175 void *context)
176 { 176 {
177 struct ib_device *dev = rds_iwdev->dev; 177 struct ib_device *dev = rds_iwdev->dev;
178 unsigned int send_size, recv_size; 178 unsigned int send_size, recv_size;
179 int ret; 179 int ret;
180 180
181 /* The offset of 1 is to accommodate the additional ACK WR. */ 181 /* The offset of 1 is to accommodate the additional ACK WR. */
182 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); 182 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
183 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); 183 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
184 rds_iw_ring_resize(send_ring, send_size - 1); 184 rds_iw_ring_resize(send_ring, send_size - 1);
185 rds_iw_ring_resize(recv_ring, recv_size - 1); 185 rds_iw_ring_resize(recv_ring, recv_size - 1);
186 186
187 memset(attr, 0, sizeof(*attr)); 187 memset(attr, 0, sizeof(*attr));
188 attr->event_handler = rds_iw_qp_event_handler; 188 attr->event_handler = rds_iw_qp_event_handler;
189 attr->qp_context = context; 189 attr->qp_context = context;
190 attr->cap.max_send_wr = send_size; 190 attr->cap.max_send_wr = send_size;
191 attr->cap.max_recv_wr = recv_size; 191 attr->cap.max_recv_wr = recv_size;
192 attr->cap.max_send_sge = rds_iwdev->max_sge; 192 attr->cap.max_send_sge = rds_iwdev->max_sge;
193 attr->cap.max_recv_sge = RDS_IW_RECV_SGE; 193 attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
194 attr->sq_sig_type = IB_SIGNAL_REQ_WR; 194 attr->sq_sig_type = IB_SIGNAL_REQ_WR;
195 attr->qp_type = IB_QPT_RC; 195 attr->qp_type = IB_QPT_RC;
196 196
197 attr->send_cq = ib_create_cq(dev, send_cq_handler, 197 attr->send_cq = ib_create_cq(dev, send_cq_handler,
198 rds_iw_cq_event_handler, 198 rds_iw_cq_event_handler,
199 context, send_size, 0); 199 context, send_size, 0);
200 if (IS_ERR(attr->send_cq)) { 200 if (IS_ERR(attr->send_cq)) {
201 ret = PTR_ERR(attr->send_cq); 201 ret = PTR_ERR(attr->send_cq);
202 attr->send_cq = NULL; 202 attr->send_cq = NULL;
203 rdsdebug("ib_create_cq send failed: %d\n", ret); 203 rdsdebug("ib_create_cq send failed: %d\n", ret);
204 goto out; 204 goto out;
205 } 205 }
206 206
207 attr->recv_cq = ib_create_cq(dev, recv_cq_handler, 207 attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
208 rds_iw_cq_event_handler, 208 rds_iw_cq_event_handler,
209 context, recv_size, 0); 209 context, recv_size, 0);
210 if (IS_ERR(attr->recv_cq)) { 210 if (IS_ERR(attr->recv_cq)) {
211 ret = PTR_ERR(attr->recv_cq); 211 ret = PTR_ERR(attr->recv_cq);
212 attr->recv_cq = NULL; 212 attr->recv_cq = NULL;
213 rdsdebug("ib_create_cq send failed: %d\n", ret); 213 rdsdebug("ib_create_cq send failed: %d\n", ret);
214 goto out; 214 goto out;
215 } 215 }
216 216
217 ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); 217 ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
218 if (ret) { 218 if (ret) {
219 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 219 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
220 goto out; 220 goto out;
221 } 221 }
222 222
223 ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); 223 ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
224 if (ret) { 224 if (ret) {
225 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); 225 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
226 goto out; 226 goto out;
227 } 227 }
228 228
229 out: 229 out:
230 if (ret) { 230 if (ret) {
231 if (attr->send_cq) 231 if (attr->send_cq)
232 ib_destroy_cq(attr->send_cq); 232 ib_destroy_cq(attr->send_cq);
233 if (attr->recv_cq) 233 if (attr->recv_cq)
234 ib_destroy_cq(attr->recv_cq); 234 ib_destroy_cq(attr->recv_cq);
235 } 235 }
236 return ret; 236 return ret;
237 } 237 }
238 238
239 /* 239 /*
240 * This needs to be very careful to not leave IS_ERR pointers around for 240 * This needs to be very careful to not leave IS_ERR pointers around for
241 * cleanup to trip over. 241 * cleanup to trip over.
242 */ 242 */
243 static int rds_iw_setup_qp(struct rds_connection *conn) 243 static int rds_iw_setup_qp(struct rds_connection *conn)
244 { 244 {
245 struct rds_iw_connection *ic = conn->c_transport_data; 245 struct rds_iw_connection *ic = conn->c_transport_data;
246 struct ib_device *dev = ic->i_cm_id->device; 246 struct ib_device *dev = ic->i_cm_id->device;
247 struct ib_qp_init_attr attr; 247 struct ib_qp_init_attr attr;
248 struct rds_iw_device *rds_iwdev; 248 struct rds_iw_device *rds_iwdev;
249 int ret; 249 int ret;
250 250
251 /* rds_iw_add_one creates a rds_iw_device object per IB device, 251 /* rds_iw_add_one creates a rds_iw_device object per IB device,
252 * and allocates a protection domain, memory range and MR pool 252 * and allocates a protection domain, memory range and MR pool
253 * for each. If that fails for any reason, it will not register 253 * for each. If that fails for any reason, it will not register
254 * the rds_iwdev at all. 254 * the rds_iwdev at all.
255 */ 255 */
256 rds_iwdev = ib_get_client_data(dev, &rds_iw_client); 256 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
257 if (rds_iwdev == NULL) { 257 if (rds_iwdev == NULL) {
258 if (printk_ratelimit()) 258 if (printk_ratelimit())
259 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", 259 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
260 dev->name); 260 dev->name);
261 return -EOPNOTSUPP; 261 return -EOPNOTSUPP;
262 } 262 }
263 263
264 /* Protection domain and memory range */ 264 /* Protection domain and memory range */
265 ic->i_pd = rds_iwdev->pd; 265 ic->i_pd = rds_iwdev->pd;
266 ic->i_mr = rds_iwdev->mr; 266 ic->i_mr = rds_iwdev->mr;
267 267
268 ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, 268 ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
269 &ic->i_send_ring, rds_iw_send_cq_comp_handler, 269 &ic->i_send_ring, rds_iw_send_cq_comp_handler,
270 &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, 270 &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
271 conn); 271 conn);
272 if (ret < 0) 272 if (ret < 0)
273 goto out; 273 goto out;
274 274
275 ic->i_send_cq = attr.send_cq; 275 ic->i_send_cq = attr.send_cq;
276 ic->i_recv_cq = attr.recv_cq; 276 ic->i_recv_cq = attr.recv_cq;
277 277
278 /* 278 /*
279 * XXX this can fail if max_*_wr is too large? Are we supposed 279 * XXX this can fail if max_*_wr is too large? Are we supposed
280 * to back off until we get a value that the hardware can support? 280 * to back off until we get a value that the hardware can support?
281 */ 281 */
282 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 282 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
283 if (ret) { 283 if (ret) {
284 rdsdebug("rdma_create_qp failed: %d\n", ret); 284 rdsdebug("rdma_create_qp failed: %d\n", ret);
285 goto out; 285 goto out;
286 } 286 }
287 287
288 ic->i_send_hdrs = ib_dma_alloc_coherent(dev, 288 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
289 ic->i_send_ring.w_nr * 289 ic->i_send_ring.w_nr *
290 sizeof(struct rds_header), 290 sizeof(struct rds_header),
291 &ic->i_send_hdrs_dma, GFP_KERNEL); 291 &ic->i_send_hdrs_dma, GFP_KERNEL);
292 if (ic->i_send_hdrs == NULL) { 292 if (ic->i_send_hdrs == NULL) {
293 ret = -ENOMEM; 293 ret = -ENOMEM;
294 rdsdebug("ib_dma_alloc_coherent send failed\n"); 294 rdsdebug("ib_dma_alloc_coherent send failed\n");
295 goto out; 295 goto out;
296 } 296 }
297 297
298 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, 298 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
299 ic->i_recv_ring.w_nr * 299 ic->i_recv_ring.w_nr *
300 sizeof(struct rds_header), 300 sizeof(struct rds_header),
301 &ic->i_recv_hdrs_dma, GFP_KERNEL); 301 &ic->i_recv_hdrs_dma, GFP_KERNEL);
302 if (ic->i_recv_hdrs == NULL) { 302 if (ic->i_recv_hdrs == NULL) {
303 ret = -ENOMEM; 303 ret = -ENOMEM;
304 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 304 rdsdebug("ib_dma_alloc_coherent recv failed\n");
305 goto out; 305 goto out;
306 } 306 }
307 307
308 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 308 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
309 &ic->i_ack_dma, GFP_KERNEL); 309 &ic->i_ack_dma, GFP_KERNEL);
310 if (ic->i_ack == NULL) { 310 if (ic->i_ack == NULL) {
311 ret = -ENOMEM; 311 ret = -ENOMEM;
312 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 312 rdsdebug("ib_dma_alloc_coherent ack failed\n");
313 goto out; 313 goto out;
314 } 314 }
315 315
316 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); 316 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
317 if (ic->i_sends == NULL) { 317 if (ic->i_sends == NULL) {
318 ret = -ENOMEM; 318 ret = -ENOMEM;
319 rdsdebug("send allocation failed\n"); 319 rdsdebug("send allocation failed\n");
320 goto out; 320 goto out;
321 } 321 }
322 rds_iw_send_init_ring(ic); 322 rds_iw_send_init_ring(ic);
323 323
324 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); 324 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
325 if (ic->i_recvs == NULL) { 325 if (ic->i_recvs == NULL) {
326 ret = -ENOMEM; 326 ret = -ENOMEM;
327 rdsdebug("recv allocation failed\n"); 327 rdsdebug("recv allocation failed\n");
328 goto out; 328 goto out;
329 } 329 }
330 330
331 rds_iw_recv_init_ring(ic); 331 rds_iw_recv_init_ring(ic);
332 rds_iw_recv_init_ack(ic); 332 rds_iw_recv_init_ack(ic);
333 333
334 /* Post receive buffers - as a side effect, this will update 334 /* Post receive buffers - as a side effect, this will update
335 * the posted credit count. */ 335 * the posted credit count. */
336 rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); 336 rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
337 337
338 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, 338 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
339 ic->i_send_cq, ic->i_recv_cq); 339 ic->i_send_cq, ic->i_recv_cq);
340 340
341 out: 341 out:
342 return ret; 342 return ret;
343 } 343 }
344 344
345 static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) 345 static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
346 { 346 {
347 u16 common; 347 u16 common;
348 u32 version = 0; 348 u32 version = 0;
349 349
350 /* rdma_cm private data is odd - when there is any private data in the 350 /* rdma_cm private data is odd - when there is any private data in the
351 * request, we will be given a pretty large buffer without telling us the 351 * request, we will be given a pretty large buffer without telling us the
352 * original size. The only way to tell the difference is by looking at 352 * original size. The only way to tell the difference is by looking at
353 * the contents, which are initialized to zero. 353 * the contents, which are initialized to zero.
354 * If the protocol version fields aren't set, this is a connection attempt 354 * If the protocol version fields aren't set, this is a connection attempt
355 * from an older version. This could be 3.0 or 2.0 - we can't tell. 355 * from an older version. This could be 3.0 or 2.0 - we can't tell.
356 * We really should have changed this for OFED 1.3 :-( */ 356 * We really should have changed this for OFED 1.3 :-( */
357 if (dp->dp_protocol_major == 0) 357 if (dp->dp_protocol_major == 0)
358 return RDS_PROTOCOL_3_0; 358 return RDS_PROTOCOL_3_0;
359 359
360 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; 360 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
361 if (dp->dp_protocol_major == 3 && common) { 361 if (dp->dp_protocol_major == 3 && common) {
362 version = RDS_PROTOCOL_3_0; 362 version = RDS_PROTOCOL_3_0;
363 while ((common >>= 1) != 0) 363 while ((common >>= 1) != 0)
364 version++; 364 version++;
365 } else if (printk_ratelimit()) { 365 } else if (printk_ratelimit()) {
366 printk(KERN_NOTICE "RDS: Connection from %pI4 using " 366 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
367 "incompatible protocol version %u.%u\n", 367 "incompatible protocol version %u.%u\n",
368 &dp->dp_saddr, 368 &dp->dp_saddr,
369 dp->dp_protocol_major, 369 dp->dp_protocol_major,
370 dp->dp_protocol_minor); 370 dp->dp_protocol_minor);
371 } 371 }
372 return version; 372 return version;
373 } 373 }
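The function above treats dp_protocol_minor_mask as a bitmask of supported minor versions and negotiates the highest bit both sides share. Here is a standalone sketch of that bit-scan; the RDS_PROTOCOL() encoding (major in the high byte, minor in the low byte) is an assumption made for illustration — only the loop mirrors the code above.

#include <stdio.h>
#include <stdint.h>

#define RDS_PROTOCOL(maj, min)  (((maj) << 8) | (min))
#define RDS_PROTOCOL_MAJOR(v)   ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)   ((v) & 0xff)
#define RDS_PROTOCOL_3_0        RDS_PROTOCOL(3, 0)

/* Each set bit in a mask advertises one supported minor version. */
static uint32_t negotiate(uint16_t peer_mask, uint16_t our_mask)
{
	uint16_t common = peer_mask & our_mask;
	uint32_t version = 0;

	if (common) {
		version = RDS_PROTOCOL_3_0;
		while ((common >>= 1) != 0)	/* index of the highest common bit */
			version++;
	}
	return version;
}

int main(void)
{
	/* Both sides support minors 0 and 1 (mask 0x3), so 3.1 wins. */
	uint32_t v = negotiate(0x0003, 0x0003);

	printf("negotiated RDSv%u.%u\n",
	       (unsigned)RDS_PROTOCOL_MAJOR(v), (unsigned)RDS_PROTOCOL_MINOR(v));
	return 0;
}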
374 374
375 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, 375 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
376 struct rdma_cm_event *event) 376 struct rdma_cm_event *event)
377 { 377 {
378 const struct rds_iw_connect_private *dp = event->param.conn.private_data; 378 const struct rds_iw_connect_private *dp = event->param.conn.private_data;
379 struct rds_iw_connect_private dp_rep; 379 struct rds_iw_connect_private dp_rep;
380 struct rds_connection *conn = NULL; 380 struct rds_connection *conn = NULL;
381 struct rds_iw_connection *ic = NULL; 381 struct rds_iw_connection *ic = NULL;
382 struct rdma_conn_param conn_param; 382 struct rdma_conn_param conn_param;
383 struct rds_iw_device *rds_iwdev; 383 struct rds_iw_device *rds_iwdev;
384 u32 version; 384 u32 version;
385 int err, destroy = 1; 385 int err, destroy = 1;
386 386
387 /* Check whether the remote protocol version matches ours. */ 387 /* Check whether the remote protocol version matches ours. */
388 version = rds_iw_protocol_compatible(dp); 388 version = rds_iw_protocol_compatible(dp);
389 if (!version) 389 if (!version)
390 goto out; 390 goto out;
391 391
392 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", 392 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
393 &dp->dp_saddr, &dp->dp_daddr, 393 &dp->dp_saddr, &dp->dp_daddr,
394 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); 394 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
395 395
396 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, 396 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
397 GFP_KERNEL); 397 GFP_KERNEL);
398 if (IS_ERR(conn)) { 398 if (IS_ERR(conn)) {
399 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 399 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
400 conn = NULL; 400 conn = NULL;
401 goto out; 401 goto out;
402 } 402 }
403 403
404 /* 404 /*
405 * The connection request may occur while the 405 * The connection request may occur while the
406 * previous connection exists, e.g. in case of failover. 406 * previous connection exists, e.g. in case of failover.
407 * But as connections may be initiated simultaneously 407 * But as connections may be initiated simultaneously
408 * by both hosts, we have a random backoff mechanism - 408 * by both hosts, we have a random backoff mechanism -
409 * see the comment above rds_queue_reconnect() 409 * see the comment above rds_queue_reconnect()
410 */ 410 */
411 mutex_lock(&conn->c_cm_lock); 411 mutex_lock(&conn->c_cm_lock);
412 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { 412 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
413 if (rds_conn_state(conn) == RDS_CONN_UP) { 413 if (rds_conn_state(conn) == RDS_CONN_UP) {
414 rdsdebug("incoming connect while connecting\n"); 414 rdsdebug("incoming connect while connecting\n");
415 rds_conn_drop(conn); 415 rds_conn_drop(conn);
416 rds_iw_stats_inc(s_iw_listen_closed_stale); 416 rds_iw_stats_inc(s_iw_listen_closed_stale);
417 } else 417 } else
418 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { 418 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
419 /* Wait and see - our connect may still be succeeding */ 419 /* Wait and see - our connect may still be succeeding */
420 rds_iw_stats_inc(s_iw_connect_raced); 420 rds_iw_stats_inc(s_iw_connect_raced);
421 } 421 }
422 mutex_unlock(&conn->c_cm_lock); 422 mutex_unlock(&conn->c_cm_lock);
423 goto out; 423 goto out;
424 } 424 }
425 425
426 ic = conn->c_transport_data; 426 ic = conn->c_transport_data;
427 427
428 rds_iw_set_protocol(conn, version); 428 rds_iw_set_protocol(conn, version);
429 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 429 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
430 430
431 /* If the peer gave us the last packet it saw, process this as if 431 /* If the peer gave us the last packet it saw, process this as if
432 * we had received a regular ACK. */ 432 * we had received a regular ACK. */
433 if (dp->dp_ack_seq) 433 if (dp->dp_ack_seq)
434 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 434 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
435 435
436 BUG_ON(cm_id->context); 436 BUG_ON(cm_id->context);
437 BUG_ON(ic->i_cm_id); 437 BUG_ON(ic->i_cm_id);
438 438
439 ic->i_cm_id = cm_id; 439 ic->i_cm_id = cm_id;
440 cm_id->context = conn; 440 cm_id->context = conn;
441 441
442 rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); 442 rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
443 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; 443 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
444 444
445 /* We got halfway through setting up the ib_connection, if we 445 /* We got halfway through setting up the ib_connection, if we
446 * fail now, we have to take the long route out of this mess. */ 446 * fail now, we have to take the long route out of this mess. */
447 destroy = 0; 447 destroy = 0;
448 448
449 err = rds_iw_setup_qp(conn); 449 err = rds_iw_setup_qp(conn);
450 if (err) { 450 if (err) {
451 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); 451 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
452 goto out; 452 goto out;
453 } 453 }
454 454
455 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 455 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
456 456
457 /* rdma_accept() calls rdma_reject() internally if it fails */ 457 /* rdma_accept() calls rdma_reject() internally if it fails */
458 err = rdma_accept(cm_id, &conn_param); 458 err = rdma_accept(cm_id, &conn_param);
459 mutex_unlock(&conn->c_cm_lock); 459 mutex_unlock(&conn->c_cm_lock);
460 if (err) { 460 if (err) {
461 rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); 461 rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
462 goto out; 462 goto out;
463 } 463 }
464 464
465 return 0; 465 return 0;
466 466
467 out: 467 out:
468 rdma_reject(cm_id, NULL, 0); 468 rdma_reject(cm_id, NULL, 0);
469 return destroy; 469 return destroy;
470 } 470 }
471 471
472 472
473 int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) 473 int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
474 { 474 {
475 struct rds_connection *conn = cm_id->context; 475 struct rds_connection *conn = cm_id->context;
476 struct rds_iw_connection *ic = conn->c_transport_data; 476 struct rds_iw_connection *ic = conn->c_transport_data;
477 struct rdma_conn_param conn_param; 477 struct rdma_conn_param conn_param;
478 struct rds_iw_connect_private dp; 478 struct rds_iw_connect_private dp;
479 int ret; 479 int ret;
480 480
481 /* If the peer doesn't do protocol negotiation, we must 481 /* If the peer doesn't do protocol negotiation, we must
482 * default to RDSv3.0 */ 482 * default to RDSv3.0 */
483 rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); 483 rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
484 ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ 484 ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
485 485
486 ret = rds_iw_setup_qp(conn); 486 ret = rds_iw_setup_qp(conn);
487 if (ret) { 487 if (ret) {
488 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); 488 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
489 goto out; 489 goto out;
490 } 490 }
491 491
492 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); 492 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
493 493
494 ret = rdma_connect(cm_id, &conn_param); 494 ret = rdma_connect(cm_id, &conn_param);
495 if (ret) 495 if (ret)
496 rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); 496 rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
497 497
498 out: 498 out:
499 /* Beware - returning non-zero tells the rdma_cm to destroy 499 /* Beware - returning non-zero tells the rdma_cm to destroy
500 * the cm_id. We should certainly not do it as long as we still 500 * the cm_id. We should certainly not do it as long as we still
501 * "own" the cm_id. */ 501 * "own" the cm_id. */
502 if (ret) { 502 if (ret) {
503 struct rds_iw_connection *ic = conn->c_transport_data; 503 struct rds_iw_connection *ic = conn->c_transport_data;
504 504
505 if (ic->i_cm_id == cm_id) 505 if (ic->i_cm_id == cm_id)
506 ret = 0; 506 ret = 0;
507 } 507 }
508 return ret; 508 return ret;
509 } 509 }
510 510
511 int rds_iw_conn_connect(struct rds_connection *conn) 511 int rds_iw_conn_connect(struct rds_connection *conn)
512 { 512 {
513 struct rds_iw_connection *ic = conn->c_transport_data; 513 struct rds_iw_connection *ic = conn->c_transport_data;
514 struct rds_iw_device *rds_iwdev; 514 struct rds_iw_device *rds_iwdev;
515 struct sockaddr_in src, dest; 515 struct sockaddr_in src, dest;
516 int ret; 516 int ret;
517 517
518 /* XXX I wonder what effect the port space has */ 518 /* XXX I wonder what effect the port space has */
519 /* delegate cm event handler to rdma_transport */ 519 /* delegate cm event handler to rdma_transport */
520 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, 520 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
521 RDMA_PS_TCP); 521 RDMA_PS_TCP);
522 if (IS_ERR(ic->i_cm_id)) { 522 if (IS_ERR(ic->i_cm_id)) {
523 ret = PTR_ERR(ic->i_cm_id); 523 ret = PTR_ERR(ic->i_cm_id);
524 ic->i_cm_id = NULL; 524 ic->i_cm_id = NULL;
525 rdsdebug("rdma_create_id() failed: %d\n", ret); 525 rdsdebug("rdma_create_id() failed: %d\n", ret);
526 goto out; 526 goto out;
527 } 527 }
528 528
529 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); 529 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
530 530
531 src.sin_family = AF_INET; 531 src.sin_family = AF_INET;
532 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 532 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
533 src.sin_port = (__force u16)htons(0); 533 src.sin_port = (__force u16)htons(0);
534 534
535 /* First, bind to the local address and device. */ 535 /* First, bind to the local address and device. */
536 ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); 536 ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
537 if (ret) { 537 if (ret) {
538 rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", 538 rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
539 &conn->c_laddr, ret); 539 &conn->c_laddr, ret);
540 rdma_destroy_id(ic->i_cm_id); 540 rdma_destroy_id(ic->i_cm_id);
541 ic->i_cm_id = NULL; 541 ic->i_cm_id = NULL;
542 goto out; 542 goto out;
543 } 543 }
544 544
545 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); 545 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
546 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; 546 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
547 547
548 dest.sin_family = AF_INET; 548 dest.sin_family = AF_INET;
549 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 549 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
550 dest.sin_port = (__force u16)htons(RDS_PORT); 550 dest.sin_port = (__force u16)htons(RDS_PORT);
551 551
552 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 552 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
553 (struct sockaddr *)&dest, 553 (struct sockaddr *)&dest,
554 RDS_RDMA_RESOLVE_TIMEOUT_MS); 554 RDS_RDMA_RESOLVE_TIMEOUT_MS);
555 if (ret) { 555 if (ret) {
556 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, 556 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
557 ret); 557 ret);
558 rdma_destroy_id(ic->i_cm_id); 558 rdma_destroy_id(ic->i_cm_id);
559 ic->i_cm_id = NULL; 559 ic->i_cm_id = NULL;
560 } 560 }
561 561
562 out: 562 out:
563 return ret; 563 return ret;
564 } 564 }
565 565
566 /* 566 /*
567 * This is so careful about only cleaning up resources that were built up 567 * This is so careful about only cleaning up resources that were built up
568 * so that it can be called at any point during startup. In fact it 568 * so that it can be called at any point during startup. In fact it
569 * can be called multiple times for a given connection. 569 * can be called multiple times for a given connection.
570 */ 570 */
571 void rds_iw_conn_shutdown(struct rds_connection *conn) 571 void rds_iw_conn_shutdown(struct rds_connection *conn)
572 { 572 {
573 struct rds_iw_connection *ic = conn->c_transport_data; 573 struct rds_iw_connection *ic = conn->c_transport_data;
574 int err = 0; 574 int err = 0;
575 struct ib_qp_attr qp_attr; 575 struct ib_qp_attr qp_attr;
576 576
577 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, 577 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
578 ic->i_pd, ic->i_send_cq, ic->i_recv_cq, 578 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
579 ic->i_cm_id ? ic->i_cm_id->qp : NULL); 579 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
580 580
581 if (ic->i_cm_id) { 581 if (ic->i_cm_id) {
582 struct ib_device *dev = ic->i_cm_id->device; 582 struct ib_device *dev = ic->i_cm_id->device;
583 583
584 rdsdebug("disconnecting cm %p\n", ic->i_cm_id); 584 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
585 err = rdma_disconnect(ic->i_cm_id); 585 err = rdma_disconnect(ic->i_cm_id);
586 if (err) { 586 if (err) {
587 /* Actually this may happen quite frequently, when 587 /* Actually this may happen quite frequently, when
588 * an outgoing connect raced with an incoming connect. 588 * an outgoing connect raced with an incoming connect.
589 */ 589 */
590 rdsdebug("rds_iw_conn_shutdown: failed to disconnect," 590 rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
591 " cm: %p err %d\n", ic->i_cm_id, err); 591 " cm: %p err %d\n", ic->i_cm_id, err);
592 } 592 }
593 593
594 if (ic->i_cm_id->qp) { 594 if (ic->i_cm_id->qp) {
595 qp_attr.qp_state = IB_QPS_ERR; 595 qp_attr.qp_state = IB_QPS_ERR;
596 ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); 596 ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
597 } 597 }
598 598
599 wait_event(rds_iw_ring_empty_wait, 599 wait_event(rds_iw_ring_empty_wait,
600 rds_iw_ring_empty(&ic->i_send_ring) && 600 rds_iw_ring_empty(&ic->i_send_ring) &&
601 rds_iw_ring_empty(&ic->i_recv_ring)); 601 rds_iw_ring_empty(&ic->i_recv_ring));
602 602
603 if (ic->i_send_hdrs) 603 if (ic->i_send_hdrs)
604 ib_dma_free_coherent(dev, 604 ib_dma_free_coherent(dev,
605 ic->i_send_ring.w_nr * 605 ic->i_send_ring.w_nr *
606 sizeof(struct rds_header), 606 sizeof(struct rds_header),
607 ic->i_send_hdrs, 607 ic->i_send_hdrs,
608 ic->i_send_hdrs_dma); 608 ic->i_send_hdrs_dma);
609 609
610 if (ic->i_recv_hdrs) 610 if (ic->i_recv_hdrs)
611 ib_dma_free_coherent(dev, 611 ib_dma_free_coherent(dev,
612 ic->i_recv_ring.w_nr * 612 ic->i_recv_ring.w_nr *
613 sizeof(struct rds_header), 613 sizeof(struct rds_header),
614 ic->i_recv_hdrs, 614 ic->i_recv_hdrs,
615 ic->i_recv_hdrs_dma); 615 ic->i_recv_hdrs_dma);
616 616
617 if (ic->i_ack) 617 if (ic->i_ack)
618 ib_dma_free_coherent(dev, sizeof(struct rds_header), 618 ib_dma_free_coherent(dev, sizeof(struct rds_header),
619 ic->i_ack, ic->i_ack_dma); 619 ic->i_ack, ic->i_ack_dma);
620 620
621 if (ic->i_sends) 621 if (ic->i_sends)
622 rds_iw_send_clear_ring(ic); 622 rds_iw_send_clear_ring(ic);
623 if (ic->i_recvs) 623 if (ic->i_recvs)
624 rds_iw_recv_clear_ring(ic); 624 rds_iw_recv_clear_ring(ic);
625 625
626 if (ic->i_cm_id->qp) 626 if (ic->i_cm_id->qp)
627 rdma_destroy_qp(ic->i_cm_id); 627 rdma_destroy_qp(ic->i_cm_id);
628 if (ic->i_send_cq) 628 if (ic->i_send_cq)
629 ib_destroy_cq(ic->i_send_cq); 629 ib_destroy_cq(ic->i_send_cq);
630 if (ic->i_recv_cq) 630 if (ic->i_recv_cq)
631 ib_destroy_cq(ic->i_recv_cq); 631 ib_destroy_cq(ic->i_recv_cq);
632 632
633 /* 633 /*
634 * If associated with an rds_iw_device: 634 * If associated with an rds_iw_device:
635 * Move connection back to the nodev list. 635 * Move connection back to the nodev list.
636 * Remove cm_id from the device cm_id list. 636 * Remove cm_id from the device cm_id list.
637 */ 637 */
638 if (ic->rds_iwdev) 638 if (ic->rds_iwdev)
639 rds_iw_remove_conn(ic->rds_iwdev, conn); 639 rds_iw_remove_conn(ic->rds_iwdev, conn);
640 640
641 rdma_destroy_id(ic->i_cm_id); 641 rdma_destroy_id(ic->i_cm_id);
642 642
643 ic->i_cm_id = NULL; 643 ic->i_cm_id = NULL;
644 ic->i_pd = NULL; 644 ic->i_pd = NULL;
645 ic->i_mr = NULL; 645 ic->i_mr = NULL;
646 ic->i_send_cq = NULL; 646 ic->i_send_cq = NULL;
647 ic->i_recv_cq = NULL; 647 ic->i_recv_cq = NULL;
648 ic->i_send_hdrs = NULL; 648 ic->i_send_hdrs = NULL;
649 ic->i_recv_hdrs = NULL; 649 ic->i_recv_hdrs = NULL;
650 ic->i_ack = NULL; 650 ic->i_ack = NULL;
651 } 651 }
652 BUG_ON(ic->rds_iwdev); 652 BUG_ON(ic->rds_iwdev);
653 653
654 /* Clear pending transmit */ 654 /* Clear pending transmit */
655 if (ic->i_rm) { 655 if (ic->i_rm) {
656 rds_message_put(ic->i_rm); 656 rds_message_put(ic->i_rm);
657 ic->i_rm = NULL; 657 ic->i_rm = NULL;
658 } 658 }
659 659
660 /* Clear the ACK state */ 660 /* Clear the ACK state */
661 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 661 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
662 #ifdef KERNEL_HAS_ATOMIC64 662 #ifdef KERNEL_HAS_ATOMIC64
663 atomic64_set(&ic->i_ack_next, 0); 663 atomic64_set(&ic->i_ack_next, 0);
664 #else 664 #else
665 ic->i_ack_next = 0; 665 ic->i_ack_next = 0;
666 #endif 666 #endif
667 ic->i_ack_recv = 0; 667 ic->i_ack_recv = 0;
668 668
669 /* Clear flow control state */ 669 /* Clear flow control state */
670 ic->i_flowctl = 0; 670 ic->i_flowctl = 0;
671 atomic_set(&ic->i_credits, 0); 671 atomic_set(&ic->i_credits, 0);
672 672
673 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); 673 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
674 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); 674 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
675 675
676 if (ic->i_iwinc) { 676 if (ic->i_iwinc) {
677 rds_inc_put(&ic->i_iwinc->ii_inc); 677 rds_inc_put(&ic->i_iwinc->ii_inc);
678 ic->i_iwinc = NULL; 678 ic->i_iwinc = NULL;
679 } 679 }
680 680
681 vfree(ic->i_sends); 681 vfree(ic->i_sends);
682 ic->i_sends = NULL; 682 ic->i_sends = NULL;
683 vfree(ic->i_recvs); 683 vfree(ic->i_recvs);
684 ic->i_recvs = NULL; 684 ic->i_recvs = NULL;
685 rdsdebug("shutdown complete\n"); 685 rdsdebug("shutdown complete\n");
686 } 686 }
687 687
688 int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) 688 int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
689 { 689 {
690 struct rds_iw_connection *ic; 690 struct rds_iw_connection *ic;
691 unsigned long flags; 691 unsigned long flags;
692 692
693 /* XXX too lazy? */ 693 /* XXX too lazy? */
694 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); 694 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
695 if (ic == NULL) 695 if (ic == NULL)
696 return -ENOMEM; 696 return -ENOMEM;
697 697
698 INIT_LIST_HEAD(&ic->iw_node); 698 INIT_LIST_HEAD(&ic->iw_node);
699 tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
700 (unsigned long) ic);
699 mutex_init(&ic->i_recv_mutex); 701 mutex_init(&ic->i_recv_mutex);
700 #ifndef KERNEL_HAS_ATOMIC64 702 #ifndef KERNEL_HAS_ATOMIC64
701 spin_lock_init(&ic->i_ack_lock); 703 spin_lock_init(&ic->i_ack_lock);
702 #endif 704 #endif
703 705
704 /* 706 /*
705 * rds_iw_conn_shutdown() waits for these to be emptied so they 707 * rds_iw_conn_shutdown() waits for these to be emptied so they
706 * must be initialized before it can be called. 708 * must be initialized before it can be called.
707 */ 709 */
708 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); 710 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
709 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); 711 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
710 712
711 ic->conn = conn; 713 ic->conn = conn;
712 conn->c_transport_data = ic; 714 conn->c_transport_data = ic;
713 715
714 spin_lock_irqsave(&iw_nodev_conns_lock, flags); 716 spin_lock_irqsave(&iw_nodev_conns_lock, flags);
715 list_add_tail(&ic->iw_node, &iw_nodev_conns); 717 list_add_tail(&ic->iw_node, &iw_nodev_conns);
716 spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); 718 spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
717 719
718 720
719 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); 721 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
720 return 0; 722 return 0;
721 } 723 }
722 724
723 /* 725 /*
724 * Free a connection. Connection must be shut down and not set for reconnect. 726 * Free a connection. Connection must be shut down and not set for reconnect.
725 */ 727 */
726 void rds_iw_conn_free(void *arg) 728 void rds_iw_conn_free(void *arg)
727 { 729 {
728 struct rds_iw_connection *ic = arg; 730 struct rds_iw_connection *ic = arg;
729 spinlock_t *lock_ptr; 731 spinlock_t *lock_ptr;
730 732
731 rdsdebug("ic %p\n", ic); 733 rdsdebug("ic %p\n", ic);
732 734
733 /* 735 /*
734 * Conn is either on a dev's list or on the nodev list. 736 * Conn is either on a dev's list or on the nodev list.
735 * A race with shutdown() or connect() would cause problems 737 * A race with shutdown() or connect() would cause problems
736 * (since rds_iwdev would change) but that should never happen. 738 * (since rds_iwdev would change) but that should never happen.
737 */ 739 */
738 lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; 740 lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
739 741
740 spin_lock_irq(lock_ptr); 742 spin_lock_irq(lock_ptr);
741 list_del(&ic->iw_node); 743 list_del(&ic->iw_node);
742 spin_unlock_irq(lock_ptr); 744 spin_unlock_irq(lock_ptr);
743 745
744 kfree(ic); 746 kfree(ic);
745 } 747 }
746 748
747 /* 749 /*
748 * An error occurred on the connection 750 * An error occurred on the connection
749 */ 751 */
750 void 752 void
751 __rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) 753 __rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
752 { 754 {
753 va_list ap; 755 va_list ap;
754 756
755 rds_conn_drop(conn); 757 rds_conn_drop(conn);
756 758
757 va_start(ap, fmt); 759 va_start(ap, fmt);
758 vprintk(fmt, ap); 760 vprintk(fmt, ap);
759 va_end(ap); 761 va_end(ap);
760 } 762 }
761 763
1 /* 1 /*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006 Oracle. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file 6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the 7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below: 8 * OpenIB.org BSD license below:
9 * 9 *
10 * Redistribution and use in source and binary forms, with or 10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following 11 * without modification, are permitted provided that the following
12 * conditions are met: 12 * conditions are met:
13 * 13 *
14 * - Redistributions of source code must retain the above 14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following 15 * copyright notice, this list of conditions and the following
16 * disclaimer. 16 * disclaimer.
17 * 17 *
18 * - Redistributions in binary form must reproduce the above 18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following 19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials 20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution. 21 * provided with the distribution.
22 * 22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE. 30 * SOFTWARE.
31 * 31 *
32 */ 32 */
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/pci.h> 34 #include <linux/pci.h>
35 #include <linux/dma-mapping.h> 35 #include <linux/dma-mapping.h>
36 #include <rdma/rdma_cm.h> 36 #include <rdma/rdma_cm.h>
37 37
38 #include "rds.h" 38 #include "rds.h"
39 #include "iw.h" 39 #include "iw.h"
40 40
41 static struct kmem_cache *rds_iw_incoming_slab; 41 static struct kmem_cache *rds_iw_incoming_slab;
42 static struct kmem_cache *rds_iw_frag_slab; 42 static struct kmem_cache *rds_iw_frag_slab;
43 static atomic_t rds_iw_allocation = ATOMIC_INIT(0); 43 static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
44 44
45 static void rds_iw_frag_drop_page(struct rds_page_frag *frag) 45 static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
46 { 46 {
47 rdsdebug("frag %p page %p\n", frag, frag->f_page); 47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page); 48 __free_page(frag->f_page);
49 frag->f_page = NULL; 49 frag->f_page = NULL;
50 } 50 }
51 51
52 static void rds_iw_frag_free(struct rds_page_frag *frag) 52 static void rds_iw_frag_free(struct rds_page_frag *frag)
53 { 53 {
54 rdsdebug("frag %p page %p\n", frag, frag->f_page); 54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL); 55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_iw_frag_slab, frag); 56 kmem_cache_free(rds_iw_frag_slab, frag);
57 } 57 }
58 58
59 /* 59 /*
60 * We map a page at a time. Its fragments are posted in order. This 60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get send completion events. 61 * is called in fragment order as the fragments get send completion events.
62 * Only the last frag in the page performs the unmapping. 62 * Only the last frag in the page performs the unmapping.
63 * 63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because 64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still 65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags. 66 * hold page references in their frags.
67 */ 67 */
68 static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, 68 static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
69 struct rds_iw_recv_work *recv) 69 struct rds_iw_recv_work *recv)
70 { 70 {
71 struct rds_page_frag *frag = recv->r_frag; 71 struct rds_page_frag *frag = recv->r_frag;
72 72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); 73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped) 74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device, 75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped, 76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE); 77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0; 78 frag->f_mapped = 0;
79 } 79 }
80 80
81 void rds_iw_recv_init_ring(struct rds_iw_connection *ic) 81 void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
82 { 82 {
83 struct rds_iw_recv_work *recv; 83 struct rds_iw_recv_work *recv;
84 u32 i; 84 u32 i;
85 85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { 86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge; 87 struct ib_sge *sge;
88 88
89 recv->r_iwinc = NULL; 89 recv->r_iwinc = NULL;
90 recv->r_frag = NULL; 90 recv->r_frag = NULL;
91 91
92 recv->r_wr.next = NULL; 92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i; 93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge; 94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IW_RECV_SGE; 95 recv->r_wr.num_sge = RDS_IW_RECV_SGE;
96 96
97 sge = rds_iw_data_sge(ic, recv->r_sge); 97 sge = rds_iw_data_sge(ic, recv->r_sge);
98 sge->addr = 0; 98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE; 99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = 0; 100 sge->lkey = 0;
101 101
102 sge = rds_iw_header_sge(ic, recv->r_sge); 102 sge = rds_iw_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); 103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header); 104 sge->length = sizeof(struct rds_header);
105 sge->lkey = 0; 105 sge->lkey = 0;
106 } 106 }
107 } 107 }
108 108
109 static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, 109 static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
110 struct rds_iw_recv_work *recv) 110 struct rds_iw_recv_work *recv)
111 { 111 {
112 if (recv->r_iwinc) { 112 if (recv->r_iwinc) {
113 rds_inc_put(&recv->r_iwinc->ii_inc); 113 rds_inc_put(&recv->r_iwinc->ii_inc);
114 recv->r_iwinc = NULL; 114 recv->r_iwinc = NULL;
115 } 115 }
116 if (recv->r_frag) { 116 if (recv->r_frag) {
117 rds_iw_recv_unmap_page(ic, recv); 117 rds_iw_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page) 118 if (recv->r_frag->f_page)
119 rds_iw_frag_drop_page(recv->r_frag); 119 rds_iw_frag_drop_page(recv->r_frag);
120 rds_iw_frag_free(recv->r_frag); 120 rds_iw_frag_free(recv->r_frag);
121 recv->r_frag = NULL; 121 recv->r_frag = NULL;
122 } 122 }
123 } 123 }
124 124
125 void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) 125 void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
126 { 126 {
127 u32 i; 127 u32 i;
128 128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++) 129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); 130 rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
131 131
132 if (ic->i_frag.f_page) 132 if (ic->i_frag.f_page)
133 rds_iw_frag_drop_page(&ic->i_frag); 133 rds_iw_frag_drop_page(&ic->i_frag);
134 } 134 }
135 135
136 static int rds_iw_recv_refill_one(struct rds_connection *conn, 136 static int rds_iw_recv_refill_one(struct rds_connection *conn,
137 struct rds_iw_recv_work *recv, 137 struct rds_iw_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp) 138 gfp_t kptr_gfp, gfp_t page_gfp)
139 { 139 {
140 struct rds_iw_connection *ic = conn->c_transport_data; 140 struct rds_iw_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr; 141 dma_addr_t dma_addr;
142 struct ib_sge *sge; 142 struct ib_sge *sge;
143 int ret = -ENOMEM; 143 int ret = -ENOMEM;
144 144
145 if (recv->r_iwinc == NULL) { 145 if (recv->r_iwinc == NULL) {
146 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { 146 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
147 rds_iw_stats_inc(s_iw_rx_alloc_limit); 147 rds_iw_stats_inc(s_iw_rx_alloc_limit);
148 goto out; 148 goto out;
149 } 149 }
150 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, 150 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
151 kptr_gfp); 151 kptr_gfp);
152 if (recv->r_iwinc == NULL) { 152 if (recv->r_iwinc == NULL) {
153 atomic_dec(&rds_iw_allocation); 153 atomic_dec(&rds_iw_allocation);
154 goto out; 154 goto out;
155 } 155 }
156 INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); 156 INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
157 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); 157 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
158 } 158 }
159 159
160 if (recv->r_frag == NULL) { 160 if (recv->r_frag == NULL) {
161 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); 161 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
162 if (recv->r_frag == NULL) 162 if (recv->r_frag == NULL)
163 goto out; 163 goto out;
164 INIT_LIST_HEAD(&recv->r_frag->f_item); 164 INIT_LIST_HEAD(&recv->r_frag->f_item);
165 recv->r_frag->f_page = NULL; 165 recv->r_frag->f_page = NULL;
166 } 166 }
167 167
168 if (ic->i_frag.f_page == NULL) { 168 if (ic->i_frag.f_page == NULL) {
169 ic->i_frag.f_page = alloc_page(page_gfp); 169 ic->i_frag.f_page = alloc_page(page_gfp);
170 if (ic->i_frag.f_page == NULL) 170 if (ic->i_frag.f_page == NULL)
171 goto out; 171 goto out;
172 ic->i_frag.f_offset = 0; 172 ic->i_frag.f_offset = 0;
173 } 173 }
174 174
175 dma_addr = ib_dma_map_page(ic->i_cm_id->device, 175 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
176 ic->i_frag.f_page, 176 ic->i_frag.f_page,
177 ic->i_frag.f_offset, 177 ic->i_frag.f_offset,
178 RDS_FRAG_SIZE, 178 RDS_FRAG_SIZE,
179 DMA_FROM_DEVICE); 179 DMA_FROM_DEVICE);
180 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) 180 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
181 goto out; 181 goto out;
182 182
183 /* 183 /*
184 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() 184 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
185 * must be called on this recv. This happens as completions hit 185 * must be called on this recv. This happens as completions hit
186 * in order or on connection shutdown. 186 * in order or on connection shutdown.
187 */ 187 */
188 recv->r_frag->f_page = ic->i_frag.f_page; 188 recv->r_frag->f_page = ic->i_frag.f_page;
189 recv->r_frag->f_offset = ic->i_frag.f_offset; 189 recv->r_frag->f_offset = ic->i_frag.f_offset;
190 recv->r_frag->f_mapped = dma_addr; 190 recv->r_frag->f_mapped = dma_addr;
191 191
192 sge = rds_iw_data_sge(ic, recv->r_sge); 192 sge = rds_iw_data_sge(ic, recv->r_sge);
193 sge->addr = dma_addr; 193 sge->addr = dma_addr;
194 sge->length = RDS_FRAG_SIZE; 194 sge->length = RDS_FRAG_SIZE;
195 195
196 sge = rds_iw_header_sge(ic, recv->r_sge); 196 sge = rds_iw_header_sge(ic, recv->r_sge);
197 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); 197 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
198 sge->length = sizeof(struct rds_header); 198 sge->length = sizeof(struct rds_header);
199 199
200 get_page(recv->r_frag->f_page); 200 get_page(recv->r_frag->f_page);
201 201
202 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { 202 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
203 ic->i_frag.f_offset += RDS_FRAG_SIZE; 203 ic->i_frag.f_offset += RDS_FRAG_SIZE;
204 } else { 204 } else {
205 put_page(ic->i_frag.f_page); 205 put_page(ic->i_frag.f_page);
206 ic->i_frag.f_page = NULL; 206 ic->i_frag.f_page = NULL;
207 ic->i_frag.f_offset = 0; 207 ic->i_frag.f_offset = 0;
208 } 208 }
209 209
210 ret = 0; 210 ret = 0;
211 out: 211 out:
212 return ret; 212 return ret;
213 } 213 }
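The refill path above carves the connection's current page (ic->i_frag) into RDS_FRAG_SIZE pieces, advancing f_offset for each posted recv and releasing the page once the RDS_PAGE_LAST_OFF fragment has been handed out. A self-contained sketch of just that offset bookkeeping follows; the 4K page / 2K fragment sizes are illustrative assumptions, not the values RDS actually uses.

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define RDS_FRAG_SIZE      2048UL
/* page offset of the final full frag that fits in the page */
#define RDS_PAGE_LAST_OFF  (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)

int main(void)
{
	unsigned long offset = 0;
	int page = 0;

	for (int recv = 0; recv < 6; recv++) {
		printf("recv %d -> page %d, offset %lu\n", recv, page, offset);

		if (offset < RDS_PAGE_LAST_OFF) {
			offset += RDS_FRAG_SIZE;	/* keep carving this page */
		} else {
			page++;				/* page fully used; start a new one */
			offset = 0;
		}
	}
	return 0;
}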
214 214
215 /* 215 /*
216 * This tries to allocate and post unused work requests after making sure that 216 * This tries to allocate and post unused work requests after making sure that
217 * they have all the allocations they need to queue received fragments into 217 * they have all the allocations they need to queue received fragments into
218 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc 218 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
219 * pairs don't go unmatched. 219 * pairs don't go unmatched.
220 * 220 *
221 * -1 is returned if posting fails due to temporary resource exhaustion. 221 * -1 is returned if posting fails due to temporary resource exhaustion.
222 */ 222 */
223 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 223 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
224 gfp_t page_gfp, int prefill) 224 gfp_t page_gfp, int prefill)
225 { 225 {
226 struct rds_iw_connection *ic = conn->c_transport_data; 226 struct rds_iw_connection *ic = conn->c_transport_data;
227 struct rds_iw_recv_work *recv; 227 struct rds_iw_recv_work *recv;
228 struct ib_recv_wr *failed_wr; 228 struct ib_recv_wr *failed_wr;
229 unsigned int posted = 0; 229 unsigned int posted = 0;
230 int ret = 0; 230 int ret = 0;
231 u32 pos; 231 u32 pos;
232 232
233 while ((prefill || rds_conn_up(conn)) 233 while ((prefill || rds_conn_up(conn))
234 && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { 234 && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
235 if (pos >= ic->i_recv_ring.w_nr) { 235 if (pos >= ic->i_recv_ring.w_nr) {
236 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", 236 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
237 pos); 237 pos);
238 ret = -EINVAL; 238 ret = -EINVAL;
239 break; 239 break;
240 } 240 }
241 241
242 recv = &ic->i_recvs[pos]; 242 recv = &ic->i_recvs[pos];
243 ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); 243 ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
244 if (ret) { 244 if (ret) {
245 ret = -1; 245 ret = -1;
246 break; 246 break;
247 } 247 }
248 248
249 /* XXX when can this fail? */ 249 /* XXX when can this fail? */
250 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 250 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
251 rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, 251 rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
252 recv->r_iwinc, recv->r_frag->f_page, 252 recv->r_iwinc, recv->r_frag->f_page,
253 (long) recv->r_frag->f_mapped, ret); 253 (long) recv->r_frag->f_mapped, ret);
254 if (ret) { 254 if (ret) {
255 rds_iw_conn_error(conn, "recv post on " 255 rds_iw_conn_error(conn, "recv post on "
256 "%pI4 returned %d, disconnecting and " 256 "%pI4 returned %d, disconnecting and "
257 "reconnecting\n", &conn->c_faddr, 257 "reconnecting\n", &conn->c_faddr,
258 ret); 258 ret);
259 ret = -1; 259 ret = -1;
260 break; 260 break;
261 } 261 }
262 262
263 posted++; 263 posted++;
264 } 264 }
265 265
266 /* We're doing flow control - update the window. */ 266 /* We're doing flow control - update the window. */
267 if (ic->i_flowctl && posted) 267 if (ic->i_flowctl && posted)
268 rds_iw_advertise_credits(conn, posted); 268 rds_iw_advertise_credits(conn, posted);
269 269
270 if (ret) 270 if (ret)
271 rds_iw_ring_unalloc(&ic->i_recv_ring, 1); 271 rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
272 return ret; 272 return ret;
273 } 273 }
274 274
275 void rds_iw_inc_purge(struct rds_incoming *inc) 275 void rds_iw_inc_purge(struct rds_incoming *inc)
276 { 276 {
277 struct rds_iw_incoming *iwinc; 277 struct rds_iw_incoming *iwinc;
278 struct rds_page_frag *frag; 278 struct rds_page_frag *frag;
279 struct rds_page_frag *pos; 279 struct rds_page_frag *pos;
280 280
281 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); 281 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
282 rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); 282 rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
283 283
284 list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { 284 list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
285 list_del_init(&frag->f_item); 285 list_del_init(&frag->f_item);
286 rds_iw_frag_drop_page(frag); 286 rds_iw_frag_drop_page(frag);
287 rds_iw_frag_free(frag); 287 rds_iw_frag_free(frag);
288 } 288 }
289 } 289 }
290 290
291 void rds_iw_inc_free(struct rds_incoming *inc) 291 void rds_iw_inc_free(struct rds_incoming *inc)
292 { 292 {
293 struct rds_iw_incoming *iwinc; 293 struct rds_iw_incoming *iwinc;
294 294
295 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); 295 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
296 296
297 rds_iw_inc_purge(inc); 297 rds_iw_inc_purge(inc);
298 rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); 298 rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
299 BUG_ON(!list_empty(&iwinc->ii_frags)); 299 BUG_ON(!list_empty(&iwinc->ii_frags));
300 kmem_cache_free(rds_iw_incoming_slab, iwinc); 300 kmem_cache_free(rds_iw_incoming_slab, iwinc);
301 atomic_dec(&rds_iw_allocation); 301 atomic_dec(&rds_iw_allocation);
302 BUG_ON(atomic_read(&rds_iw_allocation) < 0); 302 BUG_ON(atomic_read(&rds_iw_allocation) < 0);
303 } 303 }
304 304
305 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 305 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
306 size_t size) 306 size_t size)
307 { 307 {
308 struct rds_iw_incoming *iwinc; 308 struct rds_iw_incoming *iwinc;
309 struct rds_page_frag *frag; 309 struct rds_page_frag *frag;
310 struct iovec *iov = first_iov; 310 struct iovec *iov = first_iov;
311 unsigned long to_copy; 311 unsigned long to_copy;
312 unsigned long frag_off = 0; 312 unsigned long frag_off = 0;
313 unsigned long iov_off = 0; 313 unsigned long iov_off = 0;
314 int copied = 0; 314 int copied = 0;
315 int ret; 315 int ret;
316 u32 len; 316 u32 len;
317 317
318 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); 318 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
319 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); 319 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
320 len = be32_to_cpu(inc->i_hdr.h_len); 320 len = be32_to_cpu(inc->i_hdr.h_len);
321 321
322 while (copied < size && copied < len) { 322 while (copied < size && copied < len) {
323 if (frag_off == RDS_FRAG_SIZE) { 323 if (frag_off == RDS_FRAG_SIZE) {
324 frag = list_entry(frag->f_item.next, 324 frag = list_entry(frag->f_item.next,
325 struct rds_page_frag, f_item); 325 struct rds_page_frag, f_item);
326 frag_off = 0; 326 frag_off = 0;
327 } 327 }
328 while (iov_off == iov->iov_len) { 328 while (iov_off == iov->iov_len) {
329 iov_off = 0; 329 iov_off = 0;
330 iov++; 330 iov++;
331 } 331 }
332 332
333 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); 333 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
334 to_copy = min_t(size_t, to_copy, size - copied); 334 to_copy = min_t(size_t, to_copy, size - copied);
335 to_copy = min_t(unsigned long, to_copy, len - copied); 335 to_copy = min_t(unsigned long, to_copy, len - copied);
336 336
337 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " 337 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
338 "[%p, %lu] + %lu\n", 338 "[%p, %lu] + %lu\n",
339 to_copy, iov->iov_base, iov->iov_len, iov_off, 339 to_copy, iov->iov_base, iov->iov_len, iov_off,
340 frag->f_page, frag->f_offset, frag_off); 340 frag->f_page, frag->f_offset, frag_off);
341 341
342 /* XXX needs + offset for multiple recvs per page */ 342 /* XXX needs + offset for multiple recvs per page */
343 ret = rds_page_copy_to_user(frag->f_page, 343 ret = rds_page_copy_to_user(frag->f_page,
344 frag->f_offset + frag_off, 344 frag->f_offset + frag_off,
345 iov->iov_base + iov_off, 345 iov->iov_base + iov_off,
346 to_copy); 346 to_copy);
347 if (ret) { 347 if (ret) {
348 copied = ret; 348 copied = ret;
349 break; 349 break;
350 } 350 }
351 351
352 iov_off += to_copy; 352 iov_off += to_copy;
353 frag_off += to_copy; 353 frag_off += to_copy;
354 copied += to_copy; 354 copied += to_copy;
355 } 355 }
356 356
357 return copied; 357 return copied;
358 } 358 }
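Each pass of the copy loop above is clamped to the smallest of three remainders: space left in the current iovec, bytes left in the current fragment, and bytes left in the message. Below is a hedged userspace sketch of that clamp with made-up sizes (one 1500-byte iovec against a 3000-byte message); the fragment size is assumed, not taken from this diff.

#include <stdio.h>

#define RDS_FRAG_SIZE 4096UL	/* assumed fragment size for illustration */

static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned long iov_len = 1500, iov_off = 0;	/* one small iovec */
	unsigned long frag_off = 0;			/* inside the first fragment */
	unsigned long len = 3000, copied = 0;		/* message length */

	while (copied < len && iov_off < iov_len) {
		unsigned long to_copy = min3(iov_len - iov_off,
					     RDS_FRAG_SIZE - frag_off,
					     len - copied);

		printf("copy %lu bytes (iov_off %lu, frag_off %lu)\n",
		       to_copy, iov_off, frag_off);

		iov_off += to_copy;
		frag_off += to_copy;
		copied += to_copy;
	}
	printf("copied %lu of %lu; a real caller would move on to the next iovec\n",
	       copied, len);
	return 0;
}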
359 359
360 /* ic starts out kzalloc()ed */ 360 /* ic starts out kzalloc()ed */
361 void rds_iw_recv_init_ack(struct rds_iw_connection *ic) 361 void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
362 { 362 {
363 struct ib_send_wr *wr = &ic->i_ack_wr; 363 struct ib_send_wr *wr = &ic->i_ack_wr;
364 struct ib_sge *sge = &ic->i_ack_sge; 364 struct ib_sge *sge = &ic->i_ack_sge;
365 365
366 sge->addr = ic->i_ack_dma; 366 sge->addr = ic->i_ack_dma;
367 sge->length = sizeof(struct rds_header); 367 sge->length = sizeof(struct rds_header);
368 sge->lkey = rds_iw_local_dma_lkey(ic); 368 sge->lkey = rds_iw_local_dma_lkey(ic);
369 369
370 wr->sg_list = sge; 370 wr->sg_list = sge;
371 wr->num_sge = 1; 371 wr->num_sge = 1;
372 wr->opcode = IB_WR_SEND; 372 wr->opcode = IB_WR_SEND;
373 wr->wr_id = RDS_IW_ACK_WR_ID; 373 wr->wr_id = RDS_IW_ACK_WR_ID;
374 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; 374 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
375 } 375 }
376 376
377 /* 377 /*
378 * You'd think that with reliable IB connections you wouldn't need to ack 378 * You'd think that with reliable IB connections you wouldn't need to ack
379 * messages that have been received. The problem is that IB hardware generates 379 * messages that have been received. The problem is that IB hardware generates
380 * an ack message before it has DMAed the message into memory. This creates a 380 * an ack message before it has DMAed the message into memory. This creates a
381 * potential message loss if the HCA is disabled for any reason between when it 381 * potential message loss if the HCA is disabled for any reason between when it
382 * sends the ack and before the message is DMAed and processed. This is only a 382 * sends the ack and before the message is DMAed and processed. This is only a
383 * potential issue if another HCA is available for fail-over. 383 * potential issue if another HCA is available for fail-over.
384 * 384 *
385 * When the remote host receives our ack they'll free the sent message from 385 * When the remote host receives our ack they'll free the sent message from
386 * their send queue. To decrease the latency of this we always send an ack 386 * their send queue. To decrease the latency of this we always send an ack
387 * immediately after we've received messages. 387 * immediately after we've received messages.
388 * 388 *
389 * For simplicity, we only have one ack in flight at a time. This puts 389 * For simplicity, we only have one ack in flight at a time. This puts
390 * pressure on senders to have deep enough send queues to absorb the latency of 390 * pressure on senders to have deep enough send queues to absorb the latency of
391 * a single ack frame being in flight. This might not be good enough. 391 * a single ack frame being in flight. This might not be good enough.
392 * 392 *
393 * This is implemented by having a long-lived send_wr and sge which point to a 393 * This is implemented by having a long-lived send_wr and sge which point to a
394 * statically allocated ack frame. This ack wr does not fall under the ring 394 * statically allocated ack frame. This ack wr does not fall under the ring
395 * accounting that the tx and rx wrs do. The QP attribute specifically makes 395 * accounting that the tx and rx wrs do. The QP attribute specifically makes
396 * room for it beyond the ring size. Send completion notices its special 396 * room for it beyond the ring size. Send completion notices its special
397 * wr_id and avoids working with the ring in that case. 397 * wr_id and avoids working with the ring in that case.
398 */ 398 */
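
The comment above notes that the dedicated ack WR sits outside the normal tx/rx ring accounting and that the QP attributes reserve room for it beyond the ring size. A minimal sketch of what that sizing presumably looks like in the connection-setup path (iw_cm.c, not part of this hunk); the helper name and the ring-count parameters are placeholders, not code from this commit:

#include <rdma/ib_verbs.h>

/*
 * Hypothetical helper, not taken from this commit: size the QP's send
 * queue with one slot beyond the send ring so the long-lived ack WR
 * never competes with ring accounting.
 */
static void example_size_qp_for_ack(struct ib_qp_init_attr *attr,
				    u32 send_ring_entries,
				    u32 recv_ring_entries)
{
	attr->cap.max_send_wr = send_ring_entries + 1;	/* +1 for the ack WR */
	attr->cap.max_recv_wr = recv_ring_entries;	/* recv ring needs no slack */
}
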
399 #ifndef KERNEL_HAS_ATOMIC64 399 #ifndef KERNEL_HAS_ATOMIC64
400 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, 400 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
401 int ack_required) 401 int ack_required)
402 { 402 {
403 unsigned long flags; 403 unsigned long flags;
404 404
405 spin_lock_irqsave(&ic->i_ack_lock, flags); 405 spin_lock_irqsave(&ic->i_ack_lock, flags);
406 ic->i_ack_next = seq; 406 ic->i_ack_next = seq;
407 if (ack_required) 407 if (ack_required)
408 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 408 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
409 spin_unlock_irqrestore(&ic->i_ack_lock, flags); 409 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
410 } 410 }
411 411
412 static u64 rds_iw_get_ack(struct rds_iw_connection *ic) 412 static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
413 { 413 {
414 unsigned long flags; 414 unsigned long flags;
415 u64 seq; 415 u64 seq;
416 416
417 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 417 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
418 418
419 spin_lock_irqsave(&ic->i_ack_lock, flags); 419 spin_lock_irqsave(&ic->i_ack_lock, flags);
420 seq = ic->i_ack_next; 420 seq = ic->i_ack_next;
421 spin_unlock_irqrestore(&ic->i_ack_lock, flags); 421 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
422 422
423 return seq; 423 return seq;
424 } 424 }
425 #else 425 #else
426 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, 426 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
427 int ack_required) 427 int ack_required)
428 { 428 {
429 atomic64_set(&ic->i_ack_next, seq); 429 atomic64_set(&ic->i_ack_next, seq);
430 if (ack_required) { 430 if (ack_required) {
431 smp_mb__before_clear_bit(); 431 smp_mb__before_clear_bit();
432 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 432 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
433 } 433 }
434 } 434 }
435 435
436 static u64 rds_iw_get_ack(struct rds_iw_connection *ic) 436 static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
437 { 437 {
438 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 438 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439 smp_mb__after_clear_bit(); 439 smp_mb__after_clear_bit();
440 440
441 return atomic64_read(&ic->i_ack_next); 441 return atomic64_read(&ic->i_ack_next);
442 } 442 }
443 #endif 443 #endif
444 444
445 445
446 static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) 446 static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
447 { 447 {
448 struct rds_header *hdr = ic->i_ack; 448 struct rds_header *hdr = ic->i_ack;
449 struct ib_send_wr *failed_wr; 449 struct ib_send_wr *failed_wr;
450 u64 seq; 450 u64 seq;
451 int ret; 451 int ret;
452 452
453 seq = rds_iw_get_ack(ic); 453 seq = rds_iw_get_ack(ic);
454 454
455 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); 455 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
456 rds_message_populate_header(hdr, 0, 0, 0); 456 rds_message_populate_header(hdr, 0, 0, 0);
457 hdr->h_ack = cpu_to_be64(seq); 457 hdr->h_ack = cpu_to_be64(seq);
458 hdr->h_credit = adv_credits; 458 hdr->h_credit = adv_credits;
459 rds_message_make_checksum(hdr); 459 rds_message_make_checksum(hdr);
460 ic->i_ack_queued = jiffies; 460 ic->i_ack_queued = jiffies;
461 461
462 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); 462 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
463 if (unlikely(ret)) { 463 if (unlikely(ret)) {
464 /* Failed to send. Release the WR, and 464 /* Failed to send. Release the WR, and
465 * force another ACK. 465 * force another ACK.
466 */ 466 */
467 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 467 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
468 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 468 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
469 469
470 rds_iw_stats_inc(s_iw_ack_send_failure); 470 rds_iw_stats_inc(s_iw_ack_send_failure);
471 /* Need to finesse this later. */ 471 /* Need to finesse this later. */
472 BUG(); 472 BUG();
473 } else 473 } else
474 rds_iw_stats_inc(s_iw_ack_sent); 474 rds_iw_stats_inc(s_iw_ack_sent);
475 } 475 }
476 476
477 /* 477 /*
478 * There are 3 ways of getting acknowledgements to the peer: 478 * There are 3 ways of getting acknowledgements to the peer:
479 * 1. We call rds_iw_attempt_ack from the recv completion handler 479 * 1. We call rds_iw_attempt_ack from the recv completion handler
480 * to send an ACK-only frame. 480 * to send an ACK-only frame.
481 * However, there can be only one such frame in the send queue 481 * However, there can be only one such frame in the send queue
482 * at any time, so we may have to postpone it. 482 * at any time, so we may have to postpone it.
483 * 2. When another (data) packet is transmitted while there's 483 * 2. When another (data) packet is transmitted while there's
484 * an ACK in the queue, we piggyback the ACK sequence number 484 * an ACK in the queue, we piggyback the ACK sequence number
485 * on the data packet. 485 * on the data packet.
486 * 3. If the ACK WR is done sending, we get called from the 486 * 3. If the ACK WR is done sending, we get called from the
487 * send queue completion handler, and check whether there's 487 * send queue completion handler, and check whether there's
488 * another ACK pending (postponed because the WR was on the 488 * another ACK pending (postponed because the WR was on the
489 * queue). If so, we transmit it. 489 * queue). If so, we transmit it.
490 * 490 *
491 * We maintain 2 variables: 491 * We maintain 2 variables:
492 * - i_ack_flags, which keeps track of whether the ACK WR 492 * - i_ack_flags, which keeps track of whether the ACK WR
493 * is currently in the send queue or not (IB_ACK_IN_FLIGHT) 493 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
494 * - i_ack_next, which is the last sequence number we received 494 * - i_ack_next, which is the last sequence number we received
495 * 495 *
496 * Potentially, send queue and receive queue handlers can run concurrently. 496 * Potentially, send queue and receive queue handlers can run concurrently.
497 * It would be nice to not have to use a spinlock to synchronize things, 497 * It would be nice to not have to use a spinlock to synchronize things,
498 * but the one problem that rules this out is that 64bit updates are 498 * but the one problem that rules this out is that 64bit updates are
499 * not atomic on all platforms. Things would be a lot simpler if 499 * not atomic on all platforms. Things would be a lot simpler if
500 * we had atomic64 or maybe cmpxchg64 everywhere. 500 * we had atomic64 or maybe cmpxchg64 everywhere.
501 * 501 *
502 * Reconnecting complicates this picture just slightly. When we 502 * Reconnecting complicates this picture just slightly. When we
503 * reconnect, we may be seeing duplicate packets. The peer 503 * reconnect, we may be seeing duplicate packets. The peer
504 * is retransmitting them, because it hasn't seen an ACK for 504 * is retransmitting them, because it hasn't seen an ACK for
505 * them. It is important that we ACK these. 505 * them. It is important that we ACK these.
506 * 506 *
507 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with 507 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
508 * this flag set *MUST* be acknowledged immediately. 508 * this flag set *MUST* be acknowledged immediately.
509 */ 509 */
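
A minimal sketch of path 2 above (piggybacking), loosely modelled on what the xmit side is expected to do with rds_iw_piggyb_ack() defined later in this file; the helper and its call site are illustrative rather than lifted from iw_send.c:

/*
 * Illustrative only: fold the latest received sequence number into an
 * outgoing data header so the peer gets an ack without a separate
 * ACK-only frame.
 */
static void example_piggyback_ack(struct rds_iw_connection *ic,
				  struct rds_header *hdr)
{
	hdr->h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
	rds_message_make_checksum(hdr);	/* header changed, so re-checksum */
}
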
510 510
511 /* 511 /*
512 * When we get here, we're called from the recv queue handler. 512 * When we get here, we're called from the recv queue handler.
513 * Check whether we ought to transmit an ACK. 513 * Check whether we ought to transmit an ACK.
514 */ 514 */
515 void rds_iw_attempt_ack(struct rds_iw_connection *ic) 515 void rds_iw_attempt_ack(struct rds_iw_connection *ic)
516 { 516 {
517 unsigned int adv_credits; 517 unsigned int adv_credits;
518 518
519 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) 519 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
520 return; 520 return;
521 521
522 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { 522 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
523 rds_iw_stats_inc(s_iw_ack_send_delayed); 523 rds_iw_stats_inc(s_iw_ack_send_delayed);
524 return; 524 return;
525 } 525 }
526 526
527 /* Can we get a send credit? */ 527 /* Can we get a send credit? */
528 if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { 528 if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
529 rds_iw_stats_inc(s_iw_tx_throttle); 529 rds_iw_stats_inc(s_iw_tx_throttle);
530 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 530 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
531 return; 531 return;
532 } 532 }
533 533
534 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 534 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
535 rds_iw_send_ack(ic, adv_credits); 535 rds_iw_send_ack(ic, adv_credits);
536 } 536 }
537 537
538 /* 538 /*
539 * We get here from the send completion handler, when the 539 * We get here from the send completion handler, when the
540 * adapter tells us the ACK frame was sent. 540 * adapter tells us the ACK frame was sent.
541 */ 541 */
542 void rds_iw_ack_send_complete(struct rds_iw_connection *ic) 542 void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
543 { 543 {
544 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 544 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
545 rds_iw_attempt_ack(ic); 545 rds_iw_attempt_ack(ic);
546 } 546 }
547 547
548 /* 548 /*
549 * This is called by the regular xmit code when it wants to piggyback 549 * This is called by the regular xmit code when it wants to piggyback
550 * an ACK on an outgoing frame. 550 * an ACK on an outgoing frame.
551 */ 551 */
552 u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) 552 u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
553 { 553 {
554 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) 554 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
555 rds_iw_stats_inc(s_iw_ack_send_piggybacked); 555 rds_iw_stats_inc(s_iw_ack_send_piggybacked);
556 return rds_iw_get_ack(ic); 556 return rds_iw_get_ack(ic);
557 } 557 }
558 558
559 /* 559 /*
560 * It's kind of lame that we're copying from the posted receive pages into 560 * It's kind of lame that we're copying from the posted receive pages into
561 * long-lived bitmaps. We could have posted the bitmaps and rdma written into 561 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
562 * them. But receiving new congestion bitmaps should be a *rare* event, so 562 * them. But receiving new congestion bitmaps should be a *rare* event, so
563 * hopefully we won't need to invest that complexity in making it more 563 * hopefully we won't need to invest that complexity in making it more
564 * efficient. By copying we can share a simpler core with TCP which has to 564 * efficient. By copying we can share a simpler core with TCP which has to
565 * copy. 565 * copy.
566 */ 566 */
567 static void rds_iw_cong_recv(struct rds_connection *conn, 567 static void rds_iw_cong_recv(struct rds_connection *conn,
568 struct rds_iw_incoming *iwinc) 568 struct rds_iw_incoming *iwinc)
569 { 569 {
570 struct rds_cong_map *map; 570 struct rds_cong_map *map;
571 unsigned int map_off; 571 unsigned int map_off;
572 unsigned int map_page; 572 unsigned int map_page;
573 struct rds_page_frag *frag; 573 struct rds_page_frag *frag;
574 unsigned long frag_off; 574 unsigned long frag_off;
575 unsigned long to_copy; 575 unsigned long to_copy;
576 unsigned long copied; 576 unsigned long copied;
577 uint64_t uncongested = 0; 577 uint64_t uncongested = 0;
578 void *addr; 578 void *addr;
579 579
580 /* catch completely corrupt packets */ 580 /* catch completely corrupt packets */
581 if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 581 if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
582 return; 582 return;
583 583
584 map = conn->c_fcong; 584 map = conn->c_fcong;
585 map_page = 0; 585 map_page = 0;
586 map_off = 0; 586 map_off = 0;
587 587
588 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); 588 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
589 frag_off = 0; 589 frag_off = 0;
590 590
591 copied = 0; 591 copied = 0;
592 592
593 while (copied < RDS_CONG_MAP_BYTES) { 593 while (copied < RDS_CONG_MAP_BYTES) {
594 uint64_t *src, *dst; 594 uint64_t *src, *dst;
595 unsigned int k; 595 unsigned int k;
596 596
597 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); 597 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
598 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ 598 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
599 599
600 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); 600 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
601 601
602 src = addr + frag_off; 602 src = addr + frag_off;
603 dst = (void *)map->m_page_addrs[map_page] + map_off; 603 dst = (void *)map->m_page_addrs[map_page] + map_off;
604 for (k = 0; k < to_copy; k += 8) { 604 for (k = 0; k < to_copy; k += 8) {
605 /* Record ports that became uncongested, ie 605 /* Record ports that became uncongested, ie
606 * bits that changed from 0 to 1. */ 606 * bits that changed from 0 to 1. */
607 uncongested |= ~(*src) & *dst; 607 uncongested |= ~(*src) & *dst;
608 *dst++ = *src++; 608 *dst++ = *src++;
609 } 609 }
610 kunmap_atomic(addr, KM_SOFTIRQ0); 610 kunmap_atomic(addr, KM_SOFTIRQ0);
611 611
612 copied += to_copy; 612 copied += to_copy;
613 613
614 map_off += to_copy; 614 map_off += to_copy;
615 if (map_off == PAGE_SIZE) { 615 if (map_off == PAGE_SIZE) {
616 map_off = 0; 616 map_off = 0;
617 map_page++; 617 map_page++;
618 } 618 }
619 619
620 frag_off += to_copy; 620 frag_off += to_copy;
621 if (frag_off == RDS_FRAG_SIZE) { 621 if (frag_off == RDS_FRAG_SIZE) {
622 frag = list_entry(frag->f_item.next, 622 frag = list_entry(frag->f_item.next,
623 struct rds_page_frag, f_item); 623 struct rds_page_frag, f_item);
624 frag_off = 0; 624 frag_off = 0;
625 } 625 }
626 } 626 }
627 627
628 /* the congestion map is in little endian order */ 628 /* the congestion map is in little endian order */
629 uncongested = le64_to_cpu(uncongested); 629 uncongested = le64_to_cpu(uncongested);
630 630
631 rds_cong_map_updated(map, uncongested); 631 rds_cong_map_updated(map, uncongested);
632 } 632 }
633 633
634 /* 634 /*
635 * Rings are posted with all the allocations they'll need to queue the 635 * Rings are posted with all the allocations they'll need to queue the
636 * incoming message to the receiving socket so this can't fail. 636 * incoming message to the receiving socket so this can't fail.
637 * All fragments start with a header, so we can make sure we're not receiving 637 * All fragments start with a header, so we can make sure we're not receiving
638 * garbage, and we can tell a small 8 byte fragment from an ACK frame. 638 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
639 */ 639 */
640 struct rds_iw_ack_state { 640 struct rds_iw_ack_state {
641 u64 ack_next; 641 u64 ack_next;
642 u64 ack_recv; 642 u64 ack_recv;
643 unsigned int ack_required:1; 643 unsigned int ack_required:1;
644 unsigned int ack_next_valid:1; 644 unsigned int ack_next_valid:1;
645 unsigned int ack_recv_valid:1; 645 unsigned int ack_recv_valid:1;
646 }; 646 };
647 647
648 static void rds_iw_process_recv(struct rds_connection *conn, 648 static void rds_iw_process_recv(struct rds_connection *conn,
649 struct rds_iw_recv_work *recv, u32 byte_len, 649 struct rds_iw_recv_work *recv, u32 byte_len,
650 struct rds_iw_ack_state *state) 650 struct rds_iw_ack_state *state)
651 { 651 {
652 struct rds_iw_connection *ic = conn->c_transport_data; 652 struct rds_iw_connection *ic = conn->c_transport_data;
653 struct rds_iw_incoming *iwinc = ic->i_iwinc; 653 struct rds_iw_incoming *iwinc = ic->i_iwinc;
654 struct rds_header *ihdr, *hdr; 654 struct rds_header *ihdr, *hdr;
655 655
656 /* XXX shut down the connection if port 0,0 are seen? */ 656 /* XXX shut down the connection if port 0,0 are seen? */
657 657
658 rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, 658 rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
659 byte_len); 659 byte_len);
660 660
661 if (byte_len < sizeof(struct rds_header)) { 661 if (byte_len < sizeof(struct rds_header)) {
662 rds_iw_conn_error(conn, "incoming message " 662 rds_iw_conn_error(conn, "incoming message "
663 "from %pI4 didn't inclue a " 663 "from %pI4 didn't inclue a "
664 "header, disconnecting and " 664 "header, disconnecting and "
665 "reconnecting\n", 665 "reconnecting\n",
666 &conn->c_faddr); 666 &conn->c_faddr);
667 return; 667 return;
668 } 668 }
669 byte_len -= sizeof(struct rds_header); 669 byte_len -= sizeof(struct rds_header);
670 670
671 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; 671 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
672 672
673 /* Validate the checksum. */ 673 /* Validate the checksum. */
674 if (!rds_message_verify_checksum(ihdr)) { 674 if (!rds_message_verify_checksum(ihdr)) {
675 rds_iw_conn_error(conn, "incoming message " 675 rds_iw_conn_error(conn, "incoming message "
676 "from %pI4 has corrupted header - " 676 "from %pI4 has corrupted header - "
677 "forcing a reconnect\n", 677 "forcing a reconnect\n",
678 &conn->c_faddr); 678 &conn->c_faddr);
679 rds_stats_inc(s_recv_drop_bad_checksum); 679 rds_stats_inc(s_recv_drop_bad_checksum);
680 return; 680 return;
681 } 681 }
682 682
683 /* Process the ACK sequence which comes with every packet */ 683 /* Process the ACK sequence which comes with every packet */
684 state->ack_recv = be64_to_cpu(ihdr->h_ack); 684 state->ack_recv = be64_to_cpu(ihdr->h_ack);
685 state->ack_recv_valid = 1; 685 state->ack_recv_valid = 1;
686 686
687 /* Process the credits update if there was one */ 687 /* Process the credits update if there was one */
688 if (ihdr->h_credit) 688 if (ihdr->h_credit)
689 rds_iw_send_add_credits(conn, ihdr->h_credit); 689 rds_iw_send_add_credits(conn, ihdr->h_credit);
690 690
691 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { 691 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
692 /* This is an ACK-only packet. The fact that it gets 692 /* This is an ACK-only packet. The fact that it gets
693 * special treatment here is that historically, ACKs 693 * special treatment here is that historically, ACKs
694 * were rather special beasts. 694 * were rather special beasts.
695 */ 695 */
696 rds_iw_stats_inc(s_iw_ack_received); 696 rds_iw_stats_inc(s_iw_ack_received);
697 697
698 /* 698 /*
699 * Usually the frags make their way on to incs and are then freed as 699 * Usually the frags make their way on to incs and are then freed as
700 * the inc is freed. We don't go that route, so we have to drop the 700 * the inc is freed. We don't go that route, so we have to drop the
701 * page ref ourselves. We can't just leave the page on the recv 701 * page ref ourselves. We can't just leave the page on the recv
702 * because that confuses the dma mapping of pages and each recv's use 702 * because that confuses the dma mapping of pages and each recv's use
703 * of a partial page. We can leave the frag, though, it will be 703 * of a partial page. We can leave the frag, though, it will be
704 * reused. 704 * reused.
705 * 705 *
706 * FIXME: Fold this into the code path below. 706 * FIXME: Fold this into the code path below.
707 */ 707 */
708 rds_iw_frag_drop_page(recv->r_frag); 708 rds_iw_frag_drop_page(recv->r_frag);
709 return; 709 return;
710 } 710 }
711 711
712 /* 712 /*
713 * If we don't already have an inc on the connection then this 713 * If we don't already have an inc on the connection then this
714 * fragment has a header and starts a message.. copy its header 714 * fragment has a header and starts a message.. copy its header
715 * into the inc and save the inc so we can hang upcoming fragments 715 * into the inc and save the inc so we can hang upcoming fragments
716 * off its list. 716 * off its list.
717 */ 717 */
718 if (iwinc == NULL) { 718 if (iwinc == NULL) {
719 iwinc = recv->r_iwinc; 719 iwinc = recv->r_iwinc;
720 recv->r_iwinc = NULL; 720 recv->r_iwinc = NULL;
721 ic->i_iwinc = iwinc; 721 ic->i_iwinc = iwinc;
722 722
723 hdr = &iwinc->ii_inc.i_hdr; 723 hdr = &iwinc->ii_inc.i_hdr;
724 memcpy(hdr, ihdr, sizeof(*hdr)); 724 memcpy(hdr, ihdr, sizeof(*hdr));
725 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); 725 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
726 726
727 rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, 727 rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
728 ic->i_recv_data_rem, hdr->h_flags); 728 ic->i_recv_data_rem, hdr->h_flags);
729 } else { 729 } else {
730 hdr = &iwinc->ii_inc.i_hdr; 730 hdr = &iwinc->ii_inc.i_hdr;
731 /* We can't just use memcmp here; fragments of a 731 /* We can't just use memcmp here; fragments of a
732 * single message may carry different ACKs */ 732 * single message may carry different ACKs */
733 if (hdr->h_sequence != ihdr->h_sequence 733 if (hdr->h_sequence != ihdr->h_sequence
734 || hdr->h_len != ihdr->h_len 734 || hdr->h_len != ihdr->h_len
735 || hdr->h_sport != ihdr->h_sport 735 || hdr->h_sport != ihdr->h_sport
736 || hdr->h_dport != ihdr->h_dport) { 736 || hdr->h_dport != ihdr->h_dport) {
737 rds_iw_conn_error(conn, 737 rds_iw_conn_error(conn,
738 "fragment header mismatch; forcing reconnect\n"); 738 "fragment header mismatch; forcing reconnect\n");
739 return; 739 return;
740 } 740 }
741 } 741 }
742 742
743 list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); 743 list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
744 recv->r_frag = NULL; 744 recv->r_frag = NULL;
745 745
746 if (ic->i_recv_data_rem > RDS_FRAG_SIZE) 746 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
747 ic->i_recv_data_rem -= RDS_FRAG_SIZE; 747 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
748 else { 748 else {
749 ic->i_recv_data_rem = 0; 749 ic->i_recv_data_rem = 0;
750 ic->i_iwinc = NULL; 750 ic->i_iwinc = NULL;
751 751
752 if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 752 if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
753 rds_iw_cong_recv(conn, iwinc); 753 rds_iw_cong_recv(conn, iwinc);
754 else { 754 else {
755 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, 755 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
756 &iwinc->ii_inc, GFP_ATOMIC, 756 &iwinc->ii_inc, GFP_ATOMIC,
757 KM_SOFTIRQ0); 757 KM_SOFTIRQ0);
758 state->ack_next = be64_to_cpu(hdr->h_sequence); 758 state->ack_next = be64_to_cpu(hdr->h_sequence);
759 state->ack_next_valid = 1; 759 state->ack_next_valid = 1;
760 } 760 }
761 761
762 /* Evaluate the ACK_REQUIRED flag *after* we received 762 /* Evaluate the ACK_REQUIRED flag *after* we received
763 * the complete frame, and after bumping the next_rx 763 * the complete frame, and after bumping the next_rx
764 * sequence. */ 764 * sequence. */
765 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { 765 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
766 rds_stats_inc(s_recv_ack_required); 766 rds_stats_inc(s_recv_ack_required);
767 state->ack_required = 1; 767 state->ack_required = 1;
768 } 768 }
769 769
770 rds_inc_put(&iwinc->ii_inc); 770 rds_inc_put(&iwinc->ii_inc);
771 } 771 }
772 } 772 }
773 773
774 /* 774 /*
775 * Plucking the oldest entry from the ring can be done concurrently with 775 * Plucking the oldest entry from the ring can be done concurrently with
776 * the thread refilling the ring. Each ring operation is protected by 776 * the thread refilling the ring. Each ring operation is protected by
777 * spinlocks and the transient state of refilling doesn't change the 777 * spinlocks and the transient state of refilling doesn't change the
778 * recording of which entry is oldest. 778 * recording of which entry is oldest.
779 * 779 *
780 * This relies on IB only calling one cq comp_handler for each cq so that 780 * This relies on IB only calling one cq comp_handler for each cq so that
781 * there will only be one caller of rds_recv_incoming() per RDS connection. 781 * there will only be one caller of rds_recv_incoming() per RDS connection.
782 */ 782 */
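
The single-caller guarantee described above comes from registering exactly one completion handler on the recv CQ. A sketch of how that wiring presumably looks in the connection setup code (iw_cm.c, not shown in this hunk); the event handler and CQE count are placeholders:

/*
 * Hypothetical setup, using the ib_create_cq() signature of this kernel
 * generation: conn is passed as cq_context so the completion handler can
 * recover the rds_connection it was armed for.
 */
static struct ib_cq *example_create_recv_cq(struct ib_device *dev,
					    struct rds_connection *conn,
					    void (*event)(struct ib_event *, void *),
					    int cqe)
{
	return ib_create_cq(dev, rds_iw_recv_cq_comp_handler, event,
			    conn, cqe, 0);
}
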
783 void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) 783 void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
784 { 784 {
785 struct rds_connection *conn = context; 785 struct rds_connection *conn = context;
786 struct rds_iw_connection *ic = conn->c_transport_data; 786 struct rds_iw_connection *ic = conn->c_transport_data;
787 struct ib_wc wc;
788 struct rds_iw_ack_state state = { 0, };
789 struct rds_iw_recv_work *recv;
790 787
791 rdsdebug("conn %p cq %p\n", conn, cq); 788 rdsdebug("conn %p cq %p\n", conn, cq);
792 789
793 rds_iw_stats_inc(s_iw_rx_cq_call); 790 rds_iw_stats_inc(s_iw_rx_cq_call);
794 791
795 ib_req_notify_cq(cq, IB_CQ_SOLICITED); 792 tasklet_schedule(&ic->i_recv_tasklet);
793 }
796 794
797 while (ib_poll_cq(cq, 1, &wc) > 0) { 795 static inline void rds_poll_cq(struct rds_iw_connection *ic,
796 struct rds_iw_ack_state *state)
797 {
798 struct rds_connection *conn = ic->conn;
799 struct ib_wc wc;
800 struct rds_iw_recv_work *recv;
801
802 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
798 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 803 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
799 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 804 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
800 be32_to_cpu(wc.ex.imm_data)); 805 be32_to_cpu(wc.ex.imm_data));
801 rds_iw_stats_inc(s_iw_rx_cq_event); 806 rds_iw_stats_inc(s_iw_rx_cq_event);
802 807
803 recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; 808 recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
804 809
805 rds_iw_recv_unmap_page(ic, recv); 810 rds_iw_recv_unmap_page(ic, recv);
806 811
807 /* 812 /*
808 * Also process recvs in connecting state because it is possible 813 * Also process recvs in connecting state because it is possible
809 * to get a recv completion _before_ the rdmacm ESTABLISHED 814 * to get a recv completion _before_ the rdmacm ESTABLISHED
810 * event is processed. 815 * event is processed.
811 */ 816 */
812 if (rds_conn_up(conn) || rds_conn_connecting(conn)) { 817 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
813 /* We expect errors as the qp is drained during shutdown */ 818 /* We expect errors as the qp is drained during shutdown */
814 if (wc.status == IB_WC_SUCCESS) { 819 if (wc.status == IB_WC_SUCCESS) {
815 rds_iw_process_recv(conn, recv, wc.byte_len, &state); 820 rds_iw_process_recv(conn, recv, wc.byte_len, state);
816 } else { 821 } else {
817 rds_iw_conn_error(conn, "recv completion on " 822 rds_iw_conn_error(conn, "recv completion on "
818 "%pI4 had status %u, disconnecting and " 823 "%pI4 had status %u, disconnecting and "
819 "reconnecting\n", &conn->c_faddr, 824 "reconnecting\n", &conn->c_faddr,
820 wc.status); 825 wc.status);
821 } 826 }
822 } 827 }
823 828
824 rds_iw_ring_free(&ic->i_recv_ring, 1); 829 rds_iw_ring_free(&ic->i_recv_ring, 1);
825 } 830 }
831 }
832
833 void rds_iw_recv_tasklet_fn(unsigned long data)
834 {
835 struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
836 struct rds_connection *conn = ic->conn;
837 struct rds_iw_ack_state state = { 0, };
838
839 rds_poll_cq(ic, &state);
840 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
841 rds_poll_cq(ic, &state);
826 842
827 if (state.ack_next_valid) 843 if (state.ack_next_valid)
828 rds_iw_set_ack(ic, state.ack_next, state.ack_required); 844 rds_iw_set_ack(ic, state.ack_next, state.ack_required);
829 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { 845 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
830 rds_send_drop_acked(conn, state.ack_recv, NULL); 846 rds_send_drop_acked(conn, state.ack_recv, NULL);
831 ic->i_ack_recv = state.ack_recv; 847 ic->i_ack_recv = state.ack_recv;
832 } 848 }
833 if (rds_conn_up(conn)) 849 if (rds_conn_up(conn))
834 rds_iw_attempt_ack(ic); 850 rds_iw_attempt_ack(ic);
835 851
836 /* If we ever end up with a really empty receive ring, we're 852 /* If we ever end up with a really empty receive ring, we're
837 * in deep trouble, as the sender will definitely see RNR 853 * in deep trouble, as the sender will definitely see RNR
838 * timeouts. */ 854 * timeouts. */
839 if (rds_iw_ring_empty(&ic->i_recv_ring)) 855 if (rds_iw_ring_empty(&ic->i_recv_ring))
840 rds_iw_stats_inc(s_iw_rx_ring_empty); 856 rds_iw_stats_inc(s_iw_rx_ring_empty);
841 857
842 /* 858 /*
843 * If the ring is running low, then schedule the thread to refill. 859 * If the ring is running low, then schedule the thread to refill.
844 */ 860 */
845 if (rds_iw_ring_low(&ic->i_recv_ring)) 861 if (rds_iw_ring_low(&ic->i_recv_ring))
846 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 862 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
847 } 863 }
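
For the tasklet above to ever run, the connection-allocation path elsewhere in this commit has to initialize it with this function. A minimal sketch of that setup; the placement is illustrative since that hunk (in iw_cm.c) is not shown here:

#include <linux/interrupt.h>

/*
 * Sketch of the per-connection tasklet setup that
 * rds_iw_recv_cq_comp_handler() depends on; the equivalent call belongs
 * in the connection allocation code, outside this hunk.
 */
static void example_init_recv_tasklet(struct rds_iw_connection *ic)
{
	tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
		     (unsigned long) ic);
}
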
848 864
849 int rds_iw_recv(struct rds_connection *conn) 865 int rds_iw_recv(struct rds_connection *conn)
850 { 866 {
851 struct rds_iw_connection *ic = conn->c_transport_data; 867 struct rds_iw_connection *ic = conn->c_transport_data;
852 int ret = 0; 868 int ret = 0;
853 869
854 rdsdebug("conn %p\n", conn); 870 rdsdebug("conn %p\n", conn);
855 871
856 /* 872 /*
857 * If we get a temporary posting failure in this context then 873 * If we get a temporary posting failure in this context then
858 * we're really low and we want the caller to back off for a bit. 874 * we're really low and we want the caller to back off for a bit.
859 */ 875 */
860 mutex_lock(&ic->i_recv_mutex); 876 mutex_lock(&ic->i_recv_mutex);
861 if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) 877 if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
862 ret = -ENOMEM; 878 ret = -ENOMEM;
863 else 879 else
864 rds_iw_stats_inc(s_iw_rx_refill_from_thread); 880 rds_iw_stats_inc(s_iw_rx_refill_from_thread);
865 mutex_unlock(&ic->i_recv_mutex); 881 mutex_unlock(&ic->i_recv_mutex);
866 882
867 if (rds_conn_up(conn)) 883 if (rds_conn_up(conn))
868 rds_iw_attempt_ack(ic); 884 rds_iw_attempt_ack(ic);
869 885
870 return ret; 886 return ret;
871 } 887 }
872 888
873 int __init rds_iw_recv_init(void) 889 int __init rds_iw_recv_init(void)
874 { 890 {
875 struct sysinfo si; 891 struct sysinfo si;
876 int ret = -ENOMEM; 892 int ret = -ENOMEM;
877 893
878 /* Default to 30% of all available RAM for recv memory */ 894 /* Default to 30% of all available RAM for recv memory */
879 si_meminfo(&si); 895 si_meminfo(&si);
880 rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; 896 rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
881 897
882 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", 898 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
883 sizeof(struct rds_iw_incoming), 899 sizeof(struct rds_iw_incoming),
884 0, 0, NULL); 900 0, 0, NULL);
885 if (rds_iw_incoming_slab == NULL) 901 if (rds_iw_incoming_slab == NULL)
886 goto out; 902 goto out;
887 903
888 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", 904 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
889 sizeof(struct rds_page_frag), 905 sizeof(struct rds_page_frag),
890 0, 0, NULL); 906 0, 0, NULL);
891 if (rds_iw_frag_slab == NULL) 907 if (rds_iw_frag_slab == NULL)
892 kmem_cache_destroy(rds_iw_incoming_slab); 908 kmem_cache_destroy(rds_iw_incoming_slab);
893 else 909 else
894 ret = 0; 910 ret = 0;
895 out: 911 out:
896 return ret; 912 return ret;
897 } 913 }
898 914
899 void rds_iw_recv_exit(void) 915 void rds_iw_recv_exit(void)
900 { 916 {
901 kmem_cache_destroy(rds_iw_incoming_slab); 917 kmem_cache_destroy(rds_iw_incoming_slab);