Commit ab2910921064b657610a3b501358a305e13087ea

Authored by Jianjun Kong
Committed by David S. Miller
1 parent 6d9f239a1e

net: remove two duplicated #include

Removed the duplicated #include <rdma/ib_verbs.h> in net/9p/trans_rdma.c
and the duplicated #include <linux/thread_info.h> in net/socket.c.

Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
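
Duplicated includes like these are easy to find mechanically. As a rough sketch only (this is not the tool behind this patch; the program, its fixed limits and its output format are illustrative), the small standalone C program below scans a single source file and reports any header it sees included more than once. The kernel tree also carries scripts/checkincludes.pl for the same job.

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	/* headers already recorded; the limits are arbitrary illustration values */
	char seen[256][128];
	char line[512];
	int nseen = 0;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <source-file>\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "r");
	if (!f) {
		perror(argv[1]);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char hdr[128];
		int i, dup = 0;

		/* pick out the header token from lines of the form: #include <...> or #include "..." */
		if (sscanf(line, " #include %127s", hdr) != 1)
			continue;
		for (i = 0; i < nseen; i++) {
			if (strcmp(seen[i], hdr) == 0) {
				printf("%s: duplicate #include %s\n", argv[1], hdr);
				dup = 1;
				break;
			}
		}
		if (!dup && nseen < 256)
			strcpy(seen[nseen++], hdr);
	}
	fclose(f);
	return 0;
}

Run against net/9p/trans_rdma.c or net/socket.c as they stood before this commit, such a scan reports <rdma/ib_verbs.h> and <linux/thread_info.h> respectively as included twice.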

Showing 2 changed files with 0 additions and 2 deletions

1 /* 1 /*
2 * linux/fs/9p/trans_rdma.c 2 * linux/fs/9p/trans_rdma.c
3 * 3 *
4 * RDMA transport layer based on the trans_fd.c implementation. 4 * RDMA transport layer based on the trans_fd.c implementation.
5 * 5 *
6 * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> 6 * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
7 * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> 7 * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
8 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> 8 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
9 * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> 9 * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
10 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> 10 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
11 * 11 *
12 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 13 * it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation. 14 * as published by the Free Software Foundation.
15 * 15 *
16 * This program is distributed in the hope that it will be useful, 16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details. 19 * GNU General Public License for more details.
20 * 20 *
21 * You should have received a copy of the GNU General Public License 21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to: 22 * along with this program; if not, write to:
23 * Free Software Foundation 23 * Free Software Foundation
24 * 51 Franklin Street, Fifth Floor 24 * 51 Franklin Street, Fifth Floor
25 * Boston, MA 02111-1301 USA 25 * Boston, MA 02111-1301 USA
26 * 26 *
27 */ 27 */
28 28
29 #include <linux/in.h> 29 #include <linux/in.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/net.h> 31 #include <linux/net.h>
32 #include <linux/ipv6.h> 32 #include <linux/ipv6.h>
33 #include <linux/kthread.h> 33 #include <linux/kthread.h>
34 #include <linux/errno.h> 34 #include <linux/errno.h>
35 #include <linux/kernel.h> 35 #include <linux/kernel.h>
36 #include <linux/un.h> 36 #include <linux/un.h>
37 #include <linux/uaccess.h> 37 #include <linux/uaccess.h>
38 #include <linux/inet.h> 38 #include <linux/inet.h>
39 #include <linux/idr.h> 39 #include <linux/idr.h>
40 #include <linux/file.h> 40 #include <linux/file.h>
41 #include <linux/parser.h> 41 #include <linux/parser.h>
42 #include <linux/semaphore.h> 42 #include <linux/semaphore.h>
43 #include <net/9p/9p.h> 43 #include <net/9p/9p.h>
44 #include <net/9p/client.h> 44 #include <net/9p/client.h>
45 #include <net/9p/transport.h> 45 #include <net/9p/transport.h>
46 #include <rdma/ib_verbs.h> 46 #include <rdma/ib_verbs.h>
47 #include <rdma/rdma_cm.h> 47 #include <rdma/rdma_cm.h>
48 #include <rdma/ib_verbs.h>
49 48
50 #define P9_PORT 5640 49 #define P9_PORT 5640
51 #define P9_RDMA_SQ_DEPTH 32 50 #define P9_RDMA_SQ_DEPTH 32
52 #define P9_RDMA_RQ_DEPTH 32 51 #define P9_RDMA_RQ_DEPTH 32
53 #define P9_RDMA_SEND_SGE 4 52 #define P9_RDMA_SEND_SGE 4
54 #define P9_RDMA_RECV_SGE 4 53 #define P9_RDMA_RECV_SGE 4
55 #define P9_RDMA_IRD 0 54 #define P9_RDMA_IRD 0
56 #define P9_RDMA_ORD 0 55 #define P9_RDMA_ORD 0
57 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ 56 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
58 #define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can 57 #define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can
59 * safely advertise a maxsize 58 * safely advertise a maxsize
60 * of 64k */ 59 * of 64k */
61 60
62 #define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT) 61 #define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
63 /** 62 /**
64 * struct p9_trans_rdma - RDMA transport instance 63 * struct p9_trans_rdma - RDMA transport instance
65 * 64 *
66 * @state: tracks the transport state machine for connection setup and tear down 65 * @state: tracks the transport state machine for connection setup and tear down
67 * @cm_id: The RDMA CM ID 66 * @cm_id: The RDMA CM ID
68 * @pd: Protection Domain pointer 67 * @pd: Protection Domain pointer
69 * @qp: Queue Pair pointer 68 * @qp: Queue Pair pointer
70 * @cq: Completion Queue pointer 69 * @cq: Completion Queue pointer
71 * @lkey: The local access only memory region key 70 * @lkey: The local access only memory region key
72 * @timeout: Number of uSecs to wait for connection management events 71 * @timeout: Number of uSecs to wait for connection management events
73 * @sq_depth: The depth of the Send Queue 72 * @sq_depth: The depth of the Send Queue
74 * @sq_sem: Semaphore for the SQ 73 * @sq_sem: Semaphore for the SQ
75 * @rq_depth: The depth of the Receive Queue. 74 * @rq_depth: The depth of the Receive Queue.
76 * @addr: The remote peer's address 75 * @addr: The remote peer's address
77 * @req_lock: Protects the active request list 76 * @req_lock: Protects the active request list
78 * @send_wait: Wait list when the SQ fills up 77 * @send_wait: Wait list when the SQ fills up
79 * @cm_done: Completion event for connection management tracking 78 * @cm_done: Completion event for connection management tracking
80 */ 79 */
81 struct p9_trans_rdma { 80 struct p9_trans_rdma {
82 enum { 81 enum {
83 P9_RDMA_INIT, 82 P9_RDMA_INIT,
84 P9_RDMA_ADDR_RESOLVED, 83 P9_RDMA_ADDR_RESOLVED,
85 P9_RDMA_ROUTE_RESOLVED, 84 P9_RDMA_ROUTE_RESOLVED,
86 P9_RDMA_CONNECTED, 85 P9_RDMA_CONNECTED,
87 P9_RDMA_FLUSHING, 86 P9_RDMA_FLUSHING,
88 P9_RDMA_CLOSING, 87 P9_RDMA_CLOSING,
89 P9_RDMA_CLOSED, 88 P9_RDMA_CLOSED,
90 } state; 89 } state;
91 struct rdma_cm_id *cm_id; 90 struct rdma_cm_id *cm_id;
92 struct ib_pd *pd; 91 struct ib_pd *pd;
93 struct ib_qp *qp; 92 struct ib_qp *qp;
94 struct ib_cq *cq; 93 struct ib_cq *cq;
95 struct ib_mr *dma_mr; 94 struct ib_mr *dma_mr;
96 u32 lkey; 95 u32 lkey;
97 long timeout; 96 long timeout;
98 int sq_depth; 97 int sq_depth;
99 struct semaphore sq_sem; 98 struct semaphore sq_sem;
100 int rq_depth; 99 int rq_depth;
101 atomic_t rq_count; 100 atomic_t rq_count;
102 struct sockaddr_in addr; 101 struct sockaddr_in addr;
103 spinlock_t req_lock; 102 spinlock_t req_lock;
104 103
105 struct completion cm_done; 104 struct completion cm_done;
106 }; 105 };
107 106
108 /** 107 /**
109 * p9_rdma_context - Keeps track of in-process WR 108 * p9_rdma_context - Keeps track of in-process WR
110 * 109 *
111 * @wc_op: The original WR op for when the CQE completes in error. 110 * @wc_op: The original WR op for when the CQE completes in error.
112 * @busa: Bus address to unmap when the WR completes 111 * @busa: Bus address to unmap when the WR completes
113 * @req: Keeps track of requests (send) 112 * @req: Keeps track of requests (send)
114 * @rc: Keepts track of replies (receive) 113 * @rc: Keepts track of replies (receive)
115 */ 114 */
116 struct p9_rdma_req; 115 struct p9_rdma_req;
117 struct p9_rdma_context { 116 struct p9_rdma_context {
118 enum ib_wc_opcode wc_op; 117 enum ib_wc_opcode wc_op;
119 dma_addr_t busa; 118 dma_addr_t busa;
120 union { 119 union {
121 struct p9_req_t *req; 120 struct p9_req_t *req;
122 struct p9_fcall *rc; 121 struct p9_fcall *rc;
123 }; 122 };
124 }; 123 };
125 124
126 /** 125 /**
127 * p9_rdma_opts - Collection of mount options 126 * p9_rdma_opts - Collection of mount options
128 * @port: port of connection 127 * @port: port of connection
129 * @sq_depth: The requested depth of the SQ. This really doesn't need 128 * @sq_depth: The requested depth of the SQ. This really doesn't need
130 * to be any deeper than the number of threads used in the client 129 * to be any deeper than the number of threads used in the client
131 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth 130 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
132 * @timeout: Time to wait in msecs for CM events 131 * @timeout: Time to wait in msecs for CM events
133 */ 132 */
134 struct p9_rdma_opts { 133 struct p9_rdma_opts {
135 short port; 134 short port;
136 int sq_depth; 135 int sq_depth;
137 int rq_depth; 136 int rq_depth;
138 long timeout; 137 long timeout;
139 }; 138 };
140 139
141 /* 140 /*
142 * Option Parsing (code inspired by NFS code) 141 * Option Parsing (code inspired by NFS code)
143 */ 142 */
144 enum { 143 enum {
145 /* Options that take integer arguments */ 144 /* Options that take integer arguments */
146 Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, 145 Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
147 }; 146 };
148 147
149 static match_table_t tokens = { 148 static match_table_t tokens = {
150 {Opt_port, "port=%u"}, 149 {Opt_port, "port=%u"},
151 {Opt_sq_depth, "sq=%u"}, 150 {Opt_sq_depth, "sq=%u"},
152 {Opt_rq_depth, "rq=%u"}, 151 {Opt_rq_depth, "rq=%u"},
153 {Opt_timeout, "timeout=%u"}, 152 {Opt_timeout, "timeout=%u"},
154 {Opt_err, NULL}, 153 {Opt_err, NULL},
155 }; 154 };
156 155
157 /** 156 /**
158 * parse_options - parse mount options into session structure 157 * parse_options - parse mount options into session structure
159 * @options: options string passed from mount 158 * @options: options string passed from mount
160 * @opts: transport-specific structure to parse options into 159 * @opts: transport-specific structure to parse options into
161 * 160 *
162 * Returns 0 upon success, -ERRNO upon failure 161 * Returns 0 upon success, -ERRNO upon failure
163 */ 162 */
164 static int parse_opts(char *params, struct p9_rdma_opts *opts) 163 static int parse_opts(char *params, struct p9_rdma_opts *opts)
165 { 164 {
166 char *p; 165 char *p;
167 substring_t args[MAX_OPT_ARGS]; 166 substring_t args[MAX_OPT_ARGS];
168 int option; 167 int option;
169 char *options; 168 char *options;
170 int ret; 169 int ret;
171 170
172 opts->port = P9_PORT; 171 opts->port = P9_PORT;
173 opts->sq_depth = P9_RDMA_SQ_DEPTH; 172 opts->sq_depth = P9_RDMA_SQ_DEPTH;
174 opts->rq_depth = P9_RDMA_RQ_DEPTH; 173 opts->rq_depth = P9_RDMA_RQ_DEPTH;
175 opts->timeout = P9_RDMA_TIMEOUT; 174 opts->timeout = P9_RDMA_TIMEOUT;
176 175
177 if (!params) 176 if (!params)
178 return 0; 177 return 0;
179 178
180 options = kstrdup(params, GFP_KERNEL); 179 options = kstrdup(params, GFP_KERNEL);
181 if (!options) { 180 if (!options) {
182 P9_DPRINTK(P9_DEBUG_ERROR, 181 P9_DPRINTK(P9_DEBUG_ERROR,
183 "failed to allocate copy of option string\n"); 182 "failed to allocate copy of option string\n");
184 return -ENOMEM; 183 return -ENOMEM;
185 } 184 }
186 185
187 while ((p = strsep(&options, ",")) != NULL) { 186 while ((p = strsep(&options, ",")) != NULL) {
188 int token; 187 int token;
189 int r; 188 int r;
190 if (!*p) 189 if (!*p)
191 continue; 190 continue;
192 token = match_token(p, tokens, args); 191 token = match_token(p, tokens, args);
193 r = match_int(&args[0], &option); 192 r = match_int(&args[0], &option);
194 if (r < 0) { 193 if (r < 0) {
195 P9_DPRINTK(P9_DEBUG_ERROR, 194 P9_DPRINTK(P9_DEBUG_ERROR,
196 "integer field, but no integer?\n"); 195 "integer field, but no integer?\n");
197 ret = r; 196 ret = r;
198 continue; 197 continue;
199 } 198 }
200 switch (token) { 199 switch (token) {
201 case Opt_port: 200 case Opt_port:
202 opts->port = option; 201 opts->port = option;
203 break; 202 break;
204 case Opt_sq_depth: 203 case Opt_sq_depth:
205 opts->sq_depth = option; 204 opts->sq_depth = option;
206 break; 205 break;
207 case Opt_rq_depth: 206 case Opt_rq_depth:
208 opts->rq_depth = option; 207 opts->rq_depth = option;
209 break; 208 break;
210 case Opt_timeout: 209 case Opt_timeout:
211 opts->timeout = option; 210 opts->timeout = option;
212 break; 211 break;
213 default: 212 default:
214 continue; 213 continue;
215 } 214 }
216 } 215 }
217 /* RQ must be at least as large as the SQ */ 216 /* RQ must be at least as large as the SQ */
218 opts->rq_depth = max(opts->rq_depth, opts->sq_depth); 217 opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
219 kfree(options); 218 kfree(options);
220 return 0; 219 return 0;
221 } 220 }
222 221
223 static int 222 static int
224 p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) 223 p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
225 { 224 {
226 struct p9_client *c = id->context; 225 struct p9_client *c = id->context;
227 struct p9_trans_rdma *rdma = c->trans; 226 struct p9_trans_rdma *rdma = c->trans;
228 switch (event->event) { 227 switch (event->event) {
229 case RDMA_CM_EVENT_ADDR_RESOLVED: 228 case RDMA_CM_EVENT_ADDR_RESOLVED:
230 BUG_ON(rdma->state != P9_RDMA_INIT); 229 BUG_ON(rdma->state != P9_RDMA_INIT);
231 rdma->state = P9_RDMA_ADDR_RESOLVED; 230 rdma->state = P9_RDMA_ADDR_RESOLVED;
232 break; 231 break;
233 232
234 case RDMA_CM_EVENT_ROUTE_RESOLVED: 233 case RDMA_CM_EVENT_ROUTE_RESOLVED:
235 BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); 234 BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
236 rdma->state = P9_RDMA_ROUTE_RESOLVED; 235 rdma->state = P9_RDMA_ROUTE_RESOLVED;
237 break; 236 break;
238 237
239 case RDMA_CM_EVENT_ESTABLISHED: 238 case RDMA_CM_EVENT_ESTABLISHED:
240 BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); 239 BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
241 rdma->state = P9_RDMA_CONNECTED; 240 rdma->state = P9_RDMA_CONNECTED;
242 break; 241 break;
243 242
244 case RDMA_CM_EVENT_DISCONNECTED: 243 case RDMA_CM_EVENT_DISCONNECTED:
245 if (rdma) 244 if (rdma)
246 rdma->state = P9_RDMA_CLOSED; 245 rdma->state = P9_RDMA_CLOSED;
247 if (c) 246 if (c)
248 c->status = Disconnected; 247 c->status = Disconnected;
249 break; 248 break;
250 249
251 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 250 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
252 break; 251 break;
253 252
254 case RDMA_CM_EVENT_ADDR_CHANGE: 253 case RDMA_CM_EVENT_ADDR_CHANGE:
255 case RDMA_CM_EVENT_ROUTE_ERROR: 254 case RDMA_CM_EVENT_ROUTE_ERROR:
256 case RDMA_CM_EVENT_DEVICE_REMOVAL: 255 case RDMA_CM_EVENT_DEVICE_REMOVAL:
257 case RDMA_CM_EVENT_MULTICAST_JOIN: 256 case RDMA_CM_EVENT_MULTICAST_JOIN:
258 case RDMA_CM_EVENT_MULTICAST_ERROR: 257 case RDMA_CM_EVENT_MULTICAST_ERROR:
259 case RDMA_CM_EVENT_REJECTED: 258 case RDMA_CM_EVENT_REJECTED:
260 case RDMA_CM_EVENT_CONNECT_REQUEST: 259 case RDMA_CM_EVENT_CONNECT_REQUEST:
261 case RDMA_CM_EVENT_CONNECT_RESPONSE: 260 case RDMA_CM_EVENT_CONNECT_RESPONSE:
262 case RDMA_CM_EVENT_CONNECT_ERROR: 261 case RDMA_CM_EVENT_CONNECT_ERROR:
263 case RDMA_CM_EVENT_ADDR_ERROR: 262 case RDMA_CM_EVENT_ADDR_ERROR:
264 case RDMA_CM_EVENT_UNREACHABLE: 263 case RDMA_CM_EVENT_UNREACHABLE:
265 c->status = Disconnected; 264 c->status = Disconnected;
266 rdma_disconnect(rdma->cm_id); 265 rdma_disconnect(rdma->cm_id);
267 break; 266 break;
268 default: 267 default:
269 BUG(); 268 BUG();
270 } 269 }
271 complete(&rdma->cm_done); 270 complete(&rdma->cm_done);
272 return 0; 271 return 0;
273 } 272 }
274 273
275 static void 274 static void
276 handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, 275 handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
277 struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) 276 struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
278 { 277 {
279 struct p9_req_t *req; 278 struct p9_req_t *req;
280 int err = 0; 279 int err = 0;
281 int16_t tag; 280 int16_t tag;
282 281
283 req = NULL; 282 req = NULL;
284 ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, 283 ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
285 DMA_FROM_DEVICE); 284 DMA_FROM_DEVICE);
286 285
287 if (status != IB_WC_SUCCESS) 286 if (status != IB_WC_SUCCESS)
288 goto err_out; 287 goto err_out;
289 288
290 err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); 289 err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
291 if (err) 290 if (err)
292 goto err_out; 291 goto err_out;
293 292
294 req = p9_tag_lookup(client, tag); 293 req = p9_tag_lookup(client, tag);
295 if (!req) 294 if (!req)
296 goto err_out; 295 goto err_out;
297 296
298 req->rc = c->rc; 297 req->rc = c->rc;
299 p9_client_cb(client, req); 298 p9_client_cb(client, req);
300 299
301 return; 300 return;
302 301
303 err_out: 302 err_out:
304 P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n", 303 P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n",
305 req, err, status); 304 req, err, status);
306 rdma->state = P9_RDMA_FLUSHING; 305 rdma->state = P9_RDMA_FLUSHING;
307 client->status = Disconnected; 306 client->status = Disconnected;
308 return; 307 return;
309 } 308 }
310 309
311 static void 310 static void
312 handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, 311 handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
313 struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) 312 struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
314 { 313 {
315 ib_dma_unmap_single(rdma->cm_id->device, 314 ib_dma_unmap_single(rdma->cm_id->device,
316 c->busa, c->req->tc->size, 315 c->busa, c->req->tc->size,
317 DMA_TO_DEVICE); 316 DMA_TO_DEVICE);
318 } 317 }
319 318
320 static void qp_event_handler(struct ib_event *event, void *context) 319 static void qp_event_handler(struct ib_event *event, void *context)
321 { 320 {
322 P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, 321 P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event,
323 context); 322 context);
324 } 323 }
325 324
326 static void cq_comp_handler(struct ib_cq *cq, void *cq_context) 325 static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
327 { 326 {
328 struct p9_client *client = cq_context; 327 struct p9_client *client = cq_context;
329 struct p9_trans_rdma *rdma = client->trans; 328 struct p9_trans_rdma *rdma = client->trans;
330 int ret; 329 int ret;
331 struct ib_wc wc; 330 struct ib_wc wc;
332 331
333 ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); 332 ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
334 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { 333 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
335 struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; 334 struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;
336 335
337 switch (c->wc_op) { 336 switch (c->wc_op) {
338 case IB_WC_RECV: 337 case IB_WC_RECV:
339 atomic_dec(&rdma->rq_count); 338 atomic_dec(&rdma->rq_count);
340 handle_recv(client, rdma, c, wc.status, wc.byte_len); 339 handle_recv(client, rdma, c, wc.status, wc.byte_len);
341 break; 340 break;
342 341
343 case IB_WC_SEND: 342 case IB_WC_SEND:
344 handle_send(client, rdma, c, wc.status, wc.byte_len); 343 handle_send(client, rdma, c, wc.status, wc.byte_len);
345 up(&rdma->sq_sem); 344 up(&rdma->sq_sem);
346 break; 345 break;
347 346
348 default: 347 default:
349 printk(KERN_ERR "9prdma: unexpected completion type, " 348 printk(KERN_ERR "9prdma: unexpected completion type, "
350 "c->wc_op=%d, wc.opcode=%d, status=%d\n", 349 "c->wc_op=%d, wc.opcode=%d, status=%d\n",
351 c->wc_op, wc.opcode, wc.status); 350 c->wc_op, wc.opcode, wc.status);
352 break; 351 break;
353 } 352 }
354 kfree(c); 353 kfree(c);
355 } 354 }
356 } 355 }
357 356
358 static void cq_event_handler(struct ib_event *e, void *v) 357 static void cq_event_handler(struct ib_event *e, void *v)
359 { 358 {
360 P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); 359 P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
361 } 360 }
362 361
363 static void rdma_destroy_trans(struct p9_trans_rdma *rdma) 362 static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
364 { 363 {
365 if (!rdma) 364 if (!rdma)
366 return; 365 return;
367 366
368 if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) 367 if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
369 ib_dereg_mr(rdma->dma_mr); 368 ib_dereg_mr(rdma->dma_mr);
370 369
371 if (rdma->qp && !IS_ERR(rdma->qp)) 370 if (rdma->qp && !IS_ERR(rdma->qp))
372 ib_destroy_qp(rdma->qp); 371 ib_destroy_qp(rdma->qp);
373 372
374 if (rdma->pd && !IS_ERR(rdma->pd)) 373 if (rdma->pd && !IS_ERR(rdma->pd))
375 ib_dealloc_pd(rdma->pd); 374 ib_dealloc_pd(rdma->pd);
376 375
377 if (rdma->cq && !IS_ERR(rdma->cq)) 376 if (rdma->cq && !IS_ERR(rdma->cq))
378 ib_destroy_cq(rdma->cq); 377 ib_destroy_cq(rdma->cq);
379 378
380 if (rdma->cm_id && !IS_ERR(rdma->cm_id)) 379 if (rdma->cm_id && !IS_ERR(rdma->cm_id))
381 rdma_destroy_id(rdma->cm_id); 380 rdma_destroy_id(rdma->cm_id);
382 381
383 kfree(rdma); 382 kfree(rdma);
384 } 383 }
385 384
386 static int 385 static int
387 post_recv(struct p9_client *client, struct p9_rdma_context *c) 386 post_recv(struct p9_client *client, struct p9_rdma_context *c)
388 { 387 {
389 struct p9_trans_rdma *rdma = client->trans; 388 struct p9_trans_rdma *rdma = client->trans;
390 struct ib_recv_wr wr, *bad_wr; 389 struct ib_recv_wr wr, *bad_wr;
391 struct ib_sge sge; 390 struct ib_sge sge;
392 391
393 c->busa = ib_dma_map_single(rdma->cm_id->device, 392 c->busa = ib_dma_map_single(rdma->cm_id->device,
394 c->rc->sdata, client->msize, 393 c->rc->sdata, client->msize,
395 DMA_FROM_DEVICE); 394 DMA_FROM_DEVICE);
396 if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) 395 if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
397 goto error; 396 goto error;
398 397
399 sge.addr = c->busa; 398 sge.addr = c->busa;
400 sge.length = client->msize; 399 sge.length = client->msize;
401 sge.lkey = rdma->lkey; 400 sge.lkey = rdma->lkey;
402 401
403 wr.next = NULL; 402 wr.next = NULL;
404 c->wc_op = IB_WC_RECV; 403 c->wc_op = IB_WC_RECV;
405 wr.wr_id = (unsigned long) c; 404 wr.wr_id = (unsigned long) c;
406 wr.sg_list = &sge; 405 wr.sg_list = &sge;
407 wr.num_sge = 1; 406 wr.num_sge = 1;
408 return ib_post_recv(rdma->qp, &wr, &bad_wr); 407 return ib_post_recv(rdma->qp, &wr, &bad_wr);
409 408
410 error: 409 error:
411 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); 410 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
412 return -EIO; 411 return -EIO;
413 } 412 }
414 413
415 static int rdma_request(struct p9_client *client, struct p9_req_t *req) 414 static int rdma_request(struct p9_client *client, struct p9_req_t *req)
416 { 415 {
417 struct p9_trans_rdma *rdma = client->trans; 416 struct p9_trans_rdma *rdma = client->trans;
418 struct ib_send_wr wr, *bad_wr; 417 struct ib_send_wr wr, *bad_wr;
419 struct ib_sge sge; 418 struct ib_sge sge;
420 int err = 0; 419 int err = 0;
421 unsigned long flags; 420 unsigned long flags;
422 struct p9_rdma_context *c = NULL; 421 struct p9_rdma_context *c = NULL;
423 struct p9_rdma_context *rpl_context = NULL; 422 struct p9_rdma_context *rpl_context = NULL;
424 423
425 /* Allocate an fcall for the reply */ 424 /* Allocate an fcall for the reply */
426 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); 425 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
427 if (!rpl_context) 426 if (!rpl_context)
428 goto err_close; 427 goto err_close;
429 428
430 /* 429 /*
431 * If the request has a buffer, steal it, otherwise 430 * If the request has a buffer, steal it, otherwise
432 * allocate a new one. Typically, requests should already 431 * allocate a new one. Typically, requests should already
433 * have receive buffers allocated and just swap them around 432 * have receive buffers allocated and just swap them around
434 */ 433 */
435 if (!req->rc) { 434 if (!req->rc) {
436 req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, 435 req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize,
437 GFP_KERNEL); 436 GFP_KERNEL);
438 if (req->rc) { 437 if (req->rc) {
439 req->rc->sdata = (char *) req->rc + 438 req->rc->sdata = (char *) req->rc +
440 sizeof(struct p9_fcall); 439 sizeof(struct p9_fcall);
441 req->rc->capacity = client->msize; 440 req->rc->capacity = client->msize;
442 } 441 }
443 } 442 }
444 rpl_context->rc = req->rc; 443 rpl_context->rc = req->rc;
445 if (!rpl_context->rc) { 444 if (!rpl_context->rc) {
446 kfree(rpl_context); 445 kfree(rpl_context);
447 goto err_close; 446 goto err_close;
448 } 447 }
449 448
450 /* 449 /*
451 * Post a receive buffer for this request. We need to ensure 450 * Post a receive buffer for this request. We need to ensure
452 * there is a reply buffer available for every outstanding 451 * there is a reply buffer available for every outstanding
453 * request. A flushed request can result in no reply for an 452 * request. A flushed request can result in no reply for an
454 * outstanding request, so we must keep a count to avoid 453 * outstanding request, so we must keep a count to avoid
455 * overflowing the RQ. 454 * overflowing the RQ.
456 */ 455 */
457 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { 456 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
458 err = post_recv(client, rpl_context); 457 err = post_recv(client, rpl_context);
459 if (err) { 458 if (err) {
460 kfree(rpl_context->rc); 459 kfree(rpl_context->rc);
461 kfree(rpl_context); 460 kfree(rpl_context);
462 goto err_close; 461 goto err_close;
463 } 462 }
464 } else 463 } else
465 atomic_dec(&rdma->rq_count); 464 atomic_dec(&rdma->rq_count);
466 465
467 /* remove posted receive buffer from request structure */ 466 /* remove posted receive buffer from request structure */
468 req->rc = NULL; 467 req->rc = NULL;
469 468
470 /* Post the request */ 469 /* Post the request */
471 c = kmalloc(sizeof *c, GFP_KERNEL); 470 c = kmalloc(sizeof *c, GFP_KERNEL);
472 if (!c) 471 if (!c)
473 goto err_close; 472 goto err_close;
474 c->req = req; 473 c->req = req;
475 474
476 c->busa = ib_dma_map_single(rdma->cm_id->device, 475 c->busa = ib_dma_map_single(rdma->cm_id->device,
477 c->req->tc->sdata, c->req->tc->size, 476 c->req->tc->sdata, c->req->tc->size,
478 DMA_TO_DEVICE); 477 DMA_TO_DEVICE);
479 if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) 478 if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
480 goto error; 479 goto error;
481 480
482 sge.addr = c->busa; 481 sge.addr = c->busa;
483 sge.length = c->req->tc->size; 482 sge.length = c->req->tc->size;
484 sge.lkey = rdma->lkey; 483 sge.lkey = rdma->lkey;
485 484
486 wr.next = NULL; 485 wr.next = NULL;
487 c->wc_op = IB_WC_SEND; 486 c->wc_op = IB_WC_SEND;
488 wr.wr_id = (unsigned long) c; 487 wr.wr_id = (unsigned long) c;
489 wr.opcode = IB_WR_SEND; 488 wr.opcode = IB_WR_SEND;
490 wr.send_flags = IB_SEND_SIGNALED; 489 wr.send_flags = IB_SEND_SIGNALED;
491 wr.sg_list = &sge; 490 wr.sg_list = &sge;
492 wr.num_sge = 1; 491 wr.num_sge = 1;
493 492
494 if (down_interruptible(&rdma->sq_sem)) 493 if (down_interruptible(&rdma->sq_sem))
495 goto error; 494 goto error;
496 495
497 return ib_post_send(rdma->qp, &wr, &bad_wr); 496 return ib_post_send(rdma->qp, &wr, &bad_wr);
498 497
499 error: 498 error:
500 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); 499 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
501 return -EIO; 500 return -EIO;
502 501
503 err_close: 502 err_close:
504 spin_lock_irqsave(&rdma->req_lock, flags); 503 spin_lock_irqsave(&rdma->req_lock, flags);
505 if (rdma->state < P9_RDMA_CLOSING) { 504 if (rdma->state < P9_RDMA_CLOSING) {
506 rdma->state = P9_RDMA_CLOSING; 505 rdma->state = P9_RDMA_CLOSING;
507 spin_unlock_irqrestore(&rdma->req_lock, flags); 506 spin_unlock_irqrestore(&rdma->req_lock, flags);
508 rdma_disconnect(rdma->cm_id); 507 rdma_disconnect(rdma->cm_id);
509 } else 508 } else
510 spin_unlock_irqrestore(&rdma->req_lock, flags); 509 spin_unlock_irqrestore(&rdma->req_lock, flags);
511 return err; 510 return err;
512 } 511 }
513 512
514 static void rdma_close(struct p9_client *client) 513 static void rdma_close(struct p9_client *client)
515 { 514 {
516 struct p9_trans_rdma *rdma; 515 struct p9_trans_rdma *rdma;
517 516
518 if (!client) 517 if (!client)
519 return; 518 return;
520 519
521 rdma = client->trans; 520 rdma = client->trans;
522 if (!rdma) 521 if (!rdma)
523 return; 522 return;
524 523
525 client->status = Disconnected; 524 client->status = Disconnected;
526 rdma_disconnect(rdma->cm_id); 525 rdma_disconnect(rdma->cm_id);
527 rdma_destroy_trans(rdma); 526 rdma_destroy_trans(rdma);
528 } 527 }
529 528
530 /** 529 /**
531 * alloc_rdma - Allocate and initialize the rdma transport structure 530 * alloc_rdma - Allocate and initialize the rdma transport structure
532 * @msize: MTU 531 * @msize: MTU
533 * @dotu: Extension attribute 532 * @dotu: Extension attribute
534 * @opts: Mount options structure 533 * @opts: Mount options structure
535 */ 534 */
536 static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) 535 static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
537 { 536 {
538 struct p9_trans_rdma *rdma; 537 struct p9_trans_rdma *rdma;
539 538
540 rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); 539 rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
541 if (!rdma) 540 if (!rdma)
542 return NULL; 541 return NULL;
543 542
544 rdma->sq_depth = opts->sq_depth; 543 rdma->sq_depth = opts->sq_depth;
545 rdma->rq_depth = opts->rq_depth; 544 rdma->rq_depth = opts->rq_depth;
546 rdma->timeout = opts->timeout; 545 rdma->timeout = opts->timeout;
547 spin_lock_init(&rdma->req_lock); 546 spin_lock_init(&rdma->req_lock);
548 init_completion(&rdma->cm_done); 547 init_completion(&rdma->cm_done);
549 sema_init(&rdma->sq_sem, rdma->sq_depth); 548 sema_init(&rdma->sq_sem, rdma->sq_depth);
550 atomic_set(&rdma->rq_count, 0); 549 atomic_set(&rdma->rq_count, 0);
551 550
552 return rdma; 551 return rdma;
553 } 552 }
554 553
555 /* its not clear to me we can do anything after send has been posted */ 554 /* its not clear to me we can do anything after send has been posted */
556 static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) 555 static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
557 { 556 {
558 return 1; 557 return 1;
559 } 558 }
560 559
561 /** 560 /**
562 * trans_create_rdma - Transport method for creating atransport instance 561 * trans_create_rdma - Transport method for creating atransport instance
563 * @client: client instance 562 * @client: client instance
564 * @addr: IP address string 563 * @addr: IP address string
565 * @args: Mount options string 564 * @args: Mount options string
566 */ 565 */
567 static int 566 static int
568 rdma_create_trans(struct p9_client *client, const char *addr, char *args) 567 rdma_create_trans(struct p9_client *client, const char *addr, char *args)
569 { 568 {
570 int err; 569 int err;
571 struct p9_rdma_opts opts; 570 struct p9_rdma_opts opts;
572 struct p9_trans_rdma *rdma; 571 struct p9_trans_rdma *rdma;
573 struct rdma_conn_param conn_param; 572 struct rdma_conn_param conn_param;
574 struct ib_qp_init_attr qp_attr; 573 struct ib_qp_init_attr qp_attr;
575 struct ib_device_attr devattr; 574 struct ib_device_attr devattr;
576 575
577 /* Parse the transport specific mount options */ 576 /* Parse the transport specific mount options */
578 err = parse_opts(args, &opts); 577 err = parse_opts(args, &opts);
579 if (err < 0) 578 if (err < 0)
580 return err; 579 return err;
581 580
582 /* Create and initialize the RDMA transport structure */ 581 /* Create and initialize the RDMA transport structure */
583 rdma = alloc_rdma(&opts); 582 rdma = alloc_rdma(&opts);
584 if (!rdma) 583 if (!rdma)
585 return -ENOMEM; 584 return -ENOMEM;
586 585
587 /* Create the RDMA CM ID */ 586 /* Create the RDMA CM ID */
588 rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); 587 rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP);
589 if (IS_ERR(rdma->cm_id)) 588 if (IS_ERR(rdma->cm_id))
590 goto error; 589 goto error;
591 590
592 /* Resolve the server's address */ 591 /* Resolve the server's address */
593 rdma->addr.sin_family = AF_INET; 592 rdma->addr.sin_family = AF_INET;
594 rdma->addr.sin_addr.s_addr = in_aton(addr); 593 rdma->addr.sin_addr.s_addr = in_aton(addr);
595 rdma->addr.sin_port = htons(opts.port); 594 rdma->addr.sin_port = htons(opts.port);
596 err = rdma_resolve_addr(rdma->cm_id, NULL, 595 err = rdma_resolve_addr(rdma->cm_id, NULL,
597 (struct sockaddr *)&rdma->addr, 596 (struct sockaddr *)&rdma->addr,
598 rdma->timeout); 597 rdma->timeout);
599 if (err) 598 if (err)
600 goto error; 599 goto error;
601 err = wait_for_completion_interruptible(&rdma->cm_done); 600 err = wait_for_completion_interruptible(&rdma->cm_done);
602 if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) 601 if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
603 goto error; 602 goto error;
604 603
605 /* Resolve the route to the server */ 604 /* Resolve the route to the server */
606 err = rdma_resolve_route(rdma->cm_id, rdma->timeout); 605 err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
607 if (err) 606 if (err)
608 goto error; 607 goto error;
609 err = wait_for_completion_interruptible(&rdma->cm_done); 608 err = wait_for_completion_interruptible(&rdma->cm_done);
610 if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) 609 if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
611 goto error; 610 goto error;
612 611
613 /* Query the device attributes */ 612 /* Query the device attributes */
614 err = ib_query_device(rdma->cm_id->device, &devattr); 613 err = ib_query_device(rdma->cm_id->device, &devattr);
615 if (err) 614 if (err)
616 goto error; 615 goto error;
617 616
618 /* Create the Completion Queue */ 617 /* Create the Completion Queue */
619 rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, 618 rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
620 cq_event_handler, client, 619 cq_event_handler, client,
621 opts.sq_depth + opts.rq_depth + 1, 0); 620 opts.sq_depth + opts.rq_depth + 1, 0);
622 if (IS_ERR(rdma->cq)) 621 if (IS_ERR(rdma->cq))
623 goto error; 622 goto error;
624 ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); 623 ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
625 624
626 /* Create the Protection Domain */ 625 /* Create the Protection Domain */
627 rdma->pd = ib_alloc_pd(rdma->cm_id->device); 626 rdma->pd = ib_alloc_pd(rdma->cm_id->device);
628 if (IS_ERR(rdma->pd)) 627 if (IS_ERR(rdma->pd))
629 goto error; 628 goto error;
630 629
631 /* Cache the DMA lkey in the transport */ 630 /* Cache the DMA lkey in the transport */
632 rdma->dma_mr = NULL; 631 rdma->dma_mr = NULL;
633 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) 632 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
634 rdma->lkey = rdma->cm_id->device->local_dma_lkey; 633 rdma->lkey = rdma->cm_id->device->local_dma_lkey;
635 else { 634 else {
636 rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); 635 rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
637 if (IS_ERR(rdma->dma_mr)) 636 if (IS_ERR(rdma->dma_mr))
638 goto error; 637 goto error;
639 rdma->lkey = rdma->dma_mr->lkey; 638 rdma->lkey = rdma->dma_mr->lkey;
640 } 639 }
641 640
642 /* Create the Queue Pair */ 641 /* Create the Queue Pair */
643 memset(&qp_attr, 0, sizeof qp_attr); 642 memset(&qp_attr, 0, sizeof qp_attr);
644 qp_attr.event_handler = qp_event_handler; 643 qp_attr.event_handler = qp_event_handler;
645 qp_attr.qp_context = client; 644 qp_attr.qp_context = client;
646 qp_attr.cap.max_send_wr = opts.sq_depth; 645 qp_attr.cap.max_send_wr = opts.sq_depth;
647 qp_attr.cap.max_recv_wr = opts.rq_depth; 646 qp_attr.cap.max_recv_wr = opts.rq_depth;
648 qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; 647 qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
649 qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; 648 qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
650 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 649 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
651 qp_attr.qp_type = IB_QPT_RC; 650 qp_attr.qp_type = IB_QPT_RC;
652 qp_attr.send_cq = rdma->cq; 651 qp_attr.send_cq = rdma->cq;
653 qp_attr.recv_cq = rdma->cq; 652 qp_attr.recv_cq = rdma->cq;
654 err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); 653 err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
655 if (err) 654 if (err)
656 goto error; 655 goto error;
657 rdma->qp = rdma->cm_id->qp; 656 rdma->qp = rdma->cm_id->qp;
658 657
659 /* Request a connection */ 658 /* Request a connection */
660 memset(&conn_param, 0, sizeof(conn_param)); 659 memset(&conn_param, 0, sizeof(conn_param));
661 conn_param.private_data = NULL; 660 conn_param.private_data = NULL;
662 conn_param.private_data_len = 0; 661 conn_param.private_data_len = 0;
663 conn_param.responder_resources = P9_RDMA_IRD; 662 conn_param.responder_resources = P9_RDMA_IRD;
664 conn_param.initiator_depth = P9_RDMA_ORD; 663 conn_param.initiator_depth = P9_RDMA_ORD;
665 err = rdma_connect(rdma->cm_id, &conn_param); 664 err = rdma_connect(rdma->cm_id, &conn_param);
666 if (err) 665 if (err)
667 goto error; 666 goto error;
668 err = wait_for_completion_interruptible(&rdma->cm_done); 667 err = wait_for_completion_interruptible(&rdma->cm_done);
669 if (err || (rdma->state != P9_RDMA_CONNECTED)) 668 if (err || (rdma->state != P9_RDMA_CONNECTED))
670 goto error; 669 goto error;
671 670
672 client->trans = rdma; 671 client->trans = rdma;
673 client->status = Connected; 672 client->status = Connected;
674 673
675 return 0; 674 return 0;
676 675
677 error: 676 error:
678 rdma_destroy_trans(rdma); 677 rdma_destroy_trans(rdma);
679 return -ENOTCONN; 678 return -ENOTCONN;
680 } 679 }
681 680
682 static struct p9_trans_module p9_rdma_trans = { 681 static struct p9_trans_module p9_rdma_trans = {
683 .name = "rdma", 682 .name = "rdma",
684 .maxsize = P9_RDMA_MAXSIZE, 683 .maxsize = P9_RDMA_MAXSIZE,
685 .def = 0, 684 .def = 0,
686 .owner = THIS_MODULE, 685 .owner = THIS_MODULE,
687 .create = rdma_create_trans, 686 .create = rdma_create_trans,
688 .close = rdma_close, 687 .close = rdma_close,
689 .request = rdma_request, 688 .request = rdma_request,
690 .cancel = rdma_cancel, 689 .cancel = rdma_cancel,
691 }; 690 };
692 691
693 /** 692 /**
694 * p9_trans_rdma_init - Register the 9P RDMA transport driver 693 * p9_trans_rdma_init - Register the 9P RDMA transport driver
695 */ 694 */
696 static int __init p9_trans_rdma_init(void) 695 static int __init p9_trans_rdma_init(void)
697 { 696 {
698 v9fs_register_trans(&p9_rdma_trans); 697 v9fs_register_trans(&p9_rdma_trans);
699 return 0; 698 return 0;
700 } 699 }
701 700
702 static void __exit p9_trans_rdma_exit(void) 701 static void __exit p9_trans_rdma_exit(void)
703 { 702 {
704 v9fs_unregister_trans(&p9_rdma_trans); 703 v9fs_unregister_trans(&p9_rdma_trans);
705 } 704 }
706 705
707 module_init(p9_trans_rdma_init); 706 module_init(p9_trans_rdma_init);
708 module_exit(p9_trans_rdma_exit); 707 module_exit(p9_trans_rdma_exit);
709 708
710 MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); 709 MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
711 MODULE_DESCRIPTION("RDMA Transport for 9P"); 710 MODULE_DESCRIPTION("RDMA Transport for 9P");
712 MODULE_LICENSE("Dual BSD/GPL"); 711 MODULE_LICENSE("Dual BSD/GPL");
713 712
1 /* 1 /*
2 * NET An implementation of the SOCKET network access protocol. 2 * NET An implementation of the SOCKET network access protocol.
3 * 3 *
4 * Version: @(#)socket.c 1.1.93 18/02/95 4 * Version: @(#)socket.c 1.1.93 18/02/95
5 * 5 *
6 * Authors: Orest Zborowski, <obz@Kodak.COM> 6 * Authors: Orest Zborowski, <obz@Kodak.COM>
7 * Ross Biro 7 * Ross Biro
8 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 8 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
9 * 9 *
10 * Fixes: 10 * Fixes:
11 * Anonymous : NOTSOCK/BADF cleanup. Error fix in 11 * Anonymous : NOTSOCK/BADF cleanup. Error fix in
12 * shutdown() 12 * shutdown()
13 * Alan Cox : verify_area() fixes 13 * Alan Cox : verify_area() fixes
14 * Alan Cox : Removed DDI 14 * Alan Cox : Removed DDI
15 * Jonathan Kamens : SOCK_DGRAM reconnect bug 15 * Jonathan Kamens : SOCK_DGRAM reconnect bug
16 * Alan Cox : Moved a load of checks to the very 16 * Alan Cox : Moved a load of checks to the very
17 * top level. 17 * top level.
18 * Alan Cox : Move address structures to/from user 18 * Alan Cox : Move address structures to/from user
19 * mode above the protocol layers. 19 * mode above the protocol layers.
20 * Rob Janssen : Allow 0 length sends. 20 * Rob Janssen : Allow 0 length sends.
21 * Alan Cox : Asynchronous I/O support (cribbed from the 21 * Alan Cox : Asynchronous I/O support (cribbed from the
22 * tty drivers). 22 * tty drivers).
23 * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) 23 * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style)
24 * Jeff Uphoff : Made max number of sockets command-line 24 * Jeff Uphoff : Made max number of sockets command-line
25 * configurable. 25 * configurable.
26 * Matti Aarnio : Made the number of sockets dynamic, 26 * Matti Aarnio : Made the number of sockets dynamic,
27 * to be allocated when needed, and mr. 27 * to be allocated when needed, and mr.
28 * Uphoff's max is used as max to be 28 * Uphoff's max is used as max to be
29 * allowed to allocate. 29 * allowed to allocate.
30 * Linus : Argh. removed all the socket allocation 30 * Linus : Argh. removed all the socket allocation
31 * altogether: it's in the inode now. 31 * altogether: it's in the inode now.
32 * Alan Cox : Made sock_alloc()/sock_release() public 32 * Alan Cox : Made sock_alloc()/sock_release() public
33 * for NetROM and future kernel nfsd type 33 * for NetROM and future kernel nfsd type
34 * stuff. 34 * stuff.
35 * Alan Cox : sendmsg/recvmsg basics. 35 * Alan Cox : sendmsg/recvmsg basics.
36 * Tom Dyas : Export net symbols. 36 * Tom Dyas : Export net symbols.
37 * Marcin Dalecki : Fixed problems with CONFIG_NET="n". 37 * Marcin Dalecki : Fixed problems with CONFIG_NET="n".
38 * Alan Cox : Added thread locking to sys_* calls 38 * Alan Cox : Added thread locking to sys_* calls
39 * for sockets. May have errors at the 39 * for sockets. May have errors at the
40 * moment. 40 * moment.
41 * Kevin Buhr : Fixed the dumb errors in the above. 41 * Kevin Buhr : Fixed the dumb errors in the above.
42 * Andi Kleen : Some small cleanups, optimizations, 42 * Andi Kleen : Some small cleanups, optimizations,
43 * and fixed a copy_from_user() bug. 43 * and fixed a copy_from_user() bug.
44 * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) 44 * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0)
45 * Tigran Aivazian : Made listen(2) backlog sanity checks 45 * Tigran Aivazian : Made listen(2) backlog sanity checks
46 * protocol-independent 46 * protocol-independent
47 * 47 *
48 * 48 *
49 * This program is free software; you can redistribute it and/or 49 * This program is free software; you can redistribute it and/or
50 * modify it under the terms of the GNU General Public License 50 * modify it under the terms of the GNU General Public License
51 * as published by the Free Software Foundation; either version 51 * as published by the Free Software Foundation; either version
52 * 2 of the License, or (at your option) any later version. 52 * 2 of the License, or (at your option) any later version.
53 * 53 *
54 * 54 *
55 * This module is effectively the top level interface to the BSD socket 55 * This module is effectively the top level interface to the BSD socket
56 * paradigm. 56 * paradigm.
57 * 57 *
58 * Based upon Swansea University Computer Society NET3.039 58 * Based upon Swansea University Computer Society NET3.039
59 */ 59 */
60 60
61 #include <linux/mm.h> 61 #include <linux/mm.h>
62 #include <linux/socket.h> 62 #include <linux/socket.h>
63 #include <linux/file.h> 63 #include <linux/file.h>
64 #include <linux/net.h> 64 #include <linux/net.h>
65 #include <linux/interrupt.h> 65 #include <linux/interrupt.h>
66 #include <linux/thread_info.h> 66 #include <linux/thread_info.h>
67 #include <linux/rcupdate.h> 67 #include <linux/rcupdate.h>
68 #include <linux/netdevice.h> 68 #include <linux/netdevice.h>
69 #include <linux/proc_fs.h> 69 #include <linux/proc_fs.h>
70 #include <linux/seq_file.h> 70 #include <linux/seq_file.h>
71 #include <linux/mutex.h> 71 #include <linux/mutex.h>
72 #include <linux/thread_info.h>
73 #include <linux/wanrouter.h> 72 #include <linux/wanrouter.h>
74 #include <linux/if_bridge.h> 73 #include <linux/if_bridge.h>
75 #include <linux/if_frad.h> 74 #include <linux/if_frad.h>
76 #include <linux/if_vlan.h> 75 #include <linux/if_vlan.h>
77 #include <linux/init.h> 76 #include <linux/init.h>
78 #include <linux/poll.h> 77 #include <linux/poll.h>
79 #include <linux/cache.h> 78 #include <linux/cache.h>
80 #include <linux/module.h> 79 #include <linux/module.h>
81 #include <linux/highmem.h> 80 #include <linux/highmem.h>
82 #include <linux/mount.h> 81 #include <linux/mount.h>
83 #include <linux/security.h> 82 #include <linux/security.h>
84 #include <linux/syscalls.h> 83 #include <linux/syscalls.h>
85 #include <linux/compat.h> 84 #include <linux/compat.h>
86 #include <linux/kmod.h> 85 #include <linux/kmod.h>
87 #include <linux/audit.h> 86 #include <linux/audit.h>
88 #include <linux/wireless.h> 87 #include <linux/wireless.h>
89 #include <linux/nsproxy.h> 88 #include <linux/nsproxy.h>
90 89
91 #include <asm/uaccess.h> 90 #include <asm/uaccess.h>
92 #include <asm/unistd.h> 91 #include <asm/unistd.h>
93 92
94 #include <net/compat.h> 93 #include <net/compat.h>
95 #include <net/wext.h> 94 #include <net/wext.h>
96 95
97 #include <net/sock.h> 96 #include <net/sock.h>
98 #include <linux/netfilter.h> 97 #include <linux/netfilter.h>
99 98
100 static int sock_no_open(struct inode *irrelevant, struct file *dontcare); 99 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
101 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, 100 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
102 unsigned long nr_segs, loff_t pos); 101 unsigned long nr_segs, loff_t pos);
103 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, 102 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
104 unsigned long nr_segs, loff_t pos); 103 unsigned long nr_segs, loff_t pos);
105 static int sock_mmap(struct file *file, struct vm_area_struct *vma); 104 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
106 105
107 static int sock_close(struct inode *inode, struct file *file); 106 static int sock_close(struct inode *inode, struct file *file);
108 static unsigned int sock_poll(struct file *file, 107 static unsigned int sock_poll(struct file *file,
109 struct poll_table_struct *wait); 108 struct poll_table_struct *wait);
110 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 109 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
111 #ifdef CONFIG_COMPAT 110 #ifdef CONFIG_COMPAT
112 static long compat_sock_ioctl(struct file *file, 111 static long compat_sock_ioctl(struct file *file,
113 unsigned int cmd, unsigned long arg); 112 unsigned int cmd, unsigned long arg);
114 #endif 113 #endif
115 static int sock_fasync(int fd, struct file *filp, int on); 114 static int sock_fasync(int fd, struct file *filp, int on);
116 static ssize_t sock_sendpage(struct file *file, struct page *page, 115 static ssize_t sock_sendpage(struct file *file, struct page *page,
117 int offset, size_t size, loff_t *ppos, int more); 116 int offset, size_t size, loff_t *ppos, int more);
118 static ssize_t sock_splice_read(struct file *file, loff_t *ppos, 117 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
119 struct pipe_inode_info *pipe, size_t len, 118 struct pipe_inode_info *pipe, size_t len,
120 unsigned int flags); 119 unsigned int flags);
121 120
122 /* 121 /*
123 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear 122 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
124 * in the operation structures but are done directly via the socketcall() multiplexor. 123 * in the operation structures but are done directly via the socketcall() multiplexor.
125 */ 124 */
126 125
127 static const struct file_operations socket_file_ops = { 126 static const struct file_operations socket_file_ops = {
128 .owner = THIS_MODULE, 127 .owner = THIS_MODULE,
129 .llseek = no_llseek, 128 .llseek = no_llseek,
130 .aio_read = sock_aio_read, 129 .aio_read = sock_aio_read,
131 .aio_write = sock_aio_write, 130 .aio_write = sock_aio_write,
132 .poll = sock_poll, 131 .poll = sock_poll,
133 .unlocked_ioctl = sock_ioctl, 132 .unlocked_ioctl = sock_ioctl,
134 #ifdef CONFIG_COMPAT 133 #ifdef CONFIG_COMPAT
135 .compat_ioctl = compat_sock_ioctl, 134 .compat_ioctl = compat_sock_ioctl,
136 #endif 135 #endif
137 .mmap = sock_mmap, 136 .mmap = sock_mmap,
138 .open = sock_no_open, /* special open code to disallow open via /proc */ 137 .open = sock_no_open, /* special open code to disallow open via /proc */
139 .release = sock_close, 138 .release = sock_close,
140 .fasync = sock_fasync, 139 .fasync = sock_fasync,
141 .sendpage = sock_sendpage, 140 .sendpage = sock_sendpage,
142 .splice_write = generic_splice_sendpage, 141 .splice_write = generic_splice_sendpage,
143 .splice_read = sock_splice_read, 142 .splice_read = sock_splice_read,
144 }; 143 };
145 144
146 /* 145 /*
147 * The protocol list. Each protocol is registered in here. 146 * The protocol list. Each protocol is registered in here.
148 */ 147 */
149 148
150 static DEFINE_SPINLOCK(net_family_lock); 149 static DEFINE_SPINLOCK(net_family_lock);
151 static const struct net_proto_family *net_families[NPROTO] __read_mostly; 150 static const struct net_proto_family *net_families[NPROTO] __read_mostly;
152 151
153 /* 152 /*
154 * Statistics counters of the socket lists 153 * Statistics counters of the socket lists
155 */ 154 */
156 155
157 static DEFINE_PER_CPU(int, sockets_in_use) = 0; 156 static DEFINE_PER_CPU(int, sockets_in_use) = 0;
158 157
159 /* 158 /*
160 * Support routines. 159 * Support routines.
161 * Move socket addresses back and forth across the kernel/user 160 * Move socket addresses back and forth across the kernel/user
162 * divide and look after the messy bits. 161 * divide and look after the messy bits.
163 */ 162 */
164 163
165 #define MAX_SOCK_ADDR 128 /* 108 for Unix domain - 164 #define MAX_SOCK_ADDR 128 /* 108 for Unix domain -
166 16 for IP, 16 for IPX, 165 16 for IP, 16 for IPX,
167 24 for IPv6, 166 24 for IPv6,
168 about 80 for AX.25 167 about 80 for AX.25
169 must be at least one bigger than 168 must be at least one bigger than
170 the AF_UNIX size (see net/unix/af_unix.c 169 the AF_UNIX size (see net/unix/af_unix.c
171 :unix_mkname()). 170 :unix_mkname()).
172 */ 171 */
173 172
174 /** 173 /**
175 * move_addr_to_kernel - copy a socket address into kernel space 174 * move_addr_to_kernel - copy a socket address into kernel space
176 * @uaddr: Address in user space 175 * @uaddr: Address in user space
177 * @kaddr: Address in kernel space 176 * @kaddr: Address in kernel space
178 * @ulen: Length in user space 177 * @ulen: Length in user space
179 * 178 *
180 * The address is copied into kernel space. If the provided address is 179 * The address is copied into kernel space. If the provided address is
181 * too long an error code of -EINVAL is returned. If the copy gives 180 * too long an error code of -EINVAL is returned. If the copy gives
182 * invalid addresses -EFAULT is returned. On a success 0 is returned. 181 * invalid addresses -EFAULT is returned. On a success 0 is returned.
183 */ 182 */
184 183
185 int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr) 184 int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
186 { 185 {
187 if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) 186 if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
188 return -EINVAL; 187 return -EINVAL;
189 if (ulen == 0) 188 if (ulen == 0)
190 return 0; 189 return 0;
191 if (copy_from_user(kaddr, uaddr, ulen)) 190 if (copy_from_user(kaddr, uaddr, ulen))
192 return -EFAULT; 191 return -EFAULT;
193 return audit_sockaddr(ulen, kaddr); 192 return audit_sockaddr(ulen, kaddr);
194 } 193 }
195 194
196 /** 195 /**
197 * move_addr_to_user - copy an address to user space 196 * move_addr_to_user - copy an address to user space
198 * @kaddr: kernel space address 197 * @kaddr: kernel space address
199 * @klen: length of address in kernel 198 * @klen: length of address in kernel
200 * @uaddr: user space address 199 * @uaddr: user space address
201 * @ulen: pointer to user length field 200 * @ulen: pointer to user length field
202 * 201 *
203 * The value pointed to by ulen on entry is the buffer length available. 202 * The value pointed to by ulen on entry is the buffer length available.
204 * This is overwritten with the buffer space used. -EINVAL is returned 203 * This is overwritten with the buffer space used. -EINVAL is returned
205 * if an overlong buffer is specified or a negative buffer size. -EFAULT 204 * if an overlong buffer is specified or a negative buffer size. -EFAULT
206 * is returned if either the buffer or the length field are not 205 * is returned if either the buffer or the length field are not
207 * accessible. 206 * accessible.
208 * After copying the data up to the limit the user specifies, the true 207 * After copying the data up to the limit the user specifies, the true
209 * length of the data is written over the length limit the user 208 * length of the data is written over the length limit the user
210 * specified. Zero is returned for a success. 209 * specified. Zero is returned for a success.
211 */ 210 */
212 211
213 int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, 212 int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
214 int __user *ulen) 213 int __user *ulen)
215 { 214 {
216 int err; 215 int err;
217 int len; 216 int len;
218 217
219 err = get_user(len, ulen); 218 err = get_user(len, ulen);
220 if (err) 219 if (err)
221 return err; 220 return err;
222 if (len > klen) 221 if (len > klen)
223 len = klen; 222 len = klen;
224 if (len < 0 || len > sizeof(struct sockaddr_storage)) 223 if (len < 0 || len > sizeof(struct sockaddr_storage))
225 return -EINVAL; 224 return -EINVAL;
226 if (len) { 225 if (len) {
227 if (audit_sockaddr(klen, kaddr)) 226 if (audit_sockaddr(klen, kaddr))
228 return -ENOMEM; 227 return -ENOMEM;
229 if (copy_to_user(uaddr, kaddr, len)) 228 if (copy_to_user(uaddr, kaddr, len))
230 return -EFAULT; 229 return -EFAULT;
231 } 230 }
232 /* 231 /*
233 * "fromlen shall refer to the value before truncation.." 232 * "fromlen shall refer to the value before truncation.."
234 * 1003.1g 233 * 1003.1g
235 */ 234 */
236 return __put_user(klen, ulen); 235 return __put_user(klen, ulen);
237 } 236 }
238 237
239 #define SOCKFS_MAGIC 0x534F434B 238 #define SOCKFS_MAGIC 0x534F434B
240 239
241 static struct kmem_cache *sock_inode_cachep __read_mostly; 240 static struct kmem_cache *sock_inode_cachep __read_mostly;
242 241
243 static struct inode *sock_alloc_inode(struct super_block *sb) 242 static struct inode *sock_alloc_inode(struct super_block *sb)
244 { 243 {
245 struct socket_alloc *ei; 244 struct socket_alloc *ei;
246 245
247 ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); 246 ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
248 if (!ei) 247 if (!ei)
249 return NULL; 248 return NULL;
250 init_waitqueue_head(&ei->socket.wait); 249 init_waitqueue_head(&ei->socket.wait);
251 250
252 ei->socket.fasync_list = NULL; 251 ei->socket.fasync_list = NULL;
253 ei->socket.state = SS_UNCONNECTED; 252 ei->socket.state = SS_UNCONNECTED;
254 ei->socket.flags = 0; 253 ei->socket.flags = 0;
255 ei->socket.ops = NULL; 254 ei->socket.ops = NULL;
256 ei->socket.sk = NULL; 255 ei->socket.sk = NULL;
257 ei->socket.file = NULL; 256 ei->socket.file = NULL;
258 257
259 return &ei->vfs_inode; 258 return &ei->vfs_inode;
260 } 259 }
261 260
262 static void sock_destroy_inode(struct inode *inode) 261 static void sock_destroy_inode(struct inode *inode)
263 { 262 {
264 kmem_cache_free(sock_inode_cachep, 263 kmem_cache_free(sock_inode_cachep,
265 container_of(inode, struct socket_alloc, vfs_inode)); 264 container_of(inode, struct socket_alloc, vfs_inode));
266 } 265 }
267 266
268 static void init_once(void *foo) 267 static void init_once(void *foo)
269 { 268 {
270 struct socket_alloc *ei = (struct socket_alloc *)foo; 269 struct socket_alloc *ei = (struct socket_alloc *)foo;
271 270
272 inode_init_once(&ei->vfs_inode); 271 inode_init_once(&ei->vfs_inode);
273 } 272 }
274 273
275 static int init_inodecache(void) 274 static int init_inodecache(void)
276 { 275 {
277 sock_inode_cachep = kmem_cache_create("sock_inode_cache", 276 sock_inode_cachep = kmem_cache_create("sock_inode_cache",
278 sizeof(struct socket_alloc), 277 sizeof(struct socket_alloc),
279 0, 278 0,
280 (SLAB_HWCACHE_ALIGN | 279 (SLAB_HWCACHE_ALIGN |
281 SLAB_RECLAIM_ACCOUNT | 280 SLAB_RECLAIM_ACCOUNT |
282 SLAB_MEM_SPREAD), 281 SLAB_MEM_SPREAD),
283 init_once); 282 init_once);
284 if (sock_inode_cachep == NULL) 283 if (sock_inode_cachep == NULL)
285 return -ENOMEM; 284 return -ENOMEM;
286 return 0; 285 return 0;
287 } 286 }
288 287
289 static struct super_operations sockfs_ops = { 288 static struct super_operations sockfs_ops = {
290 .alloc_inode = sock_alloc_inode, 289 .alloc_inode = sock_alloc_inode,
291 .destroy_inode = sock_destroy_inode, 290 .destroy_inode = sock_destroy_inode,
292 .statfs = simple_statfs, 291 .statfs = simple_statfs,
293 }; 292 };
294 293
295 static int sockfs_get_sb(struct file_system_type *fs_type, 294 static int sockfs_get_sb(struct file_system_type *fs_type,
296 int flags, const char *dev_name, void *data, 295 int flags, const char *dev_name, void *data,
297 struct vfsmount *mnt) 296 struct vfsmount *mnt)
298 { 297 {
299 return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, 298 return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
300 mnt); 299 mnt);
301 } 300 }
302 301
303 static struct vfsmount *sock_mnt __read_mostly; 302 static struct vfsmount *sock_mnt __read_mostly;
304 303
305 static struct file_system_type sock_fs_type = { 304 static struct file_system_type sock_fs_type = {
306 .name = "sockfs", 305 .name = "sockfs",
307 .get_sb = sockfs_get_sb, 306 .get_sb = sockfs_get_sb,
308 .kill_sb = kill_anon_super, 307 .kill_sb = kill_anon_super,
309 }; 308 };
310 309
311 static int sockfs_delete_dentry(struct dentry *dentry) 310 static int sockfs_delete_dentry(struct dentry *dentry)
312 { 311 {
313 /* 312 /*
314 * At creation time, we pretended this dentry was hashed 313 * At creation time, we pretended this dentry was hashed
315 * (by clearing DCACHE_UNHASHED bit in d_flags) 314 * (by clearing DCACHE_UNHASHED bit in d_flags)
316 * At delete time, we restore the truth : not hashed. 315 * At delete time, we restore the truth : not hashed.
317 * (so that dput() can proceed correctly) 316 * (so that dput() can proceed correctly)
318 */ 317 */
319 dentry->d_flags |= DCACHE_UNHASHED; 318 dentry->d_flags |= DCACHE_UNHASHED;
320 return 0; 319 return 0;
321 } 320 }
322 321
323 /* 322 /*
324 * sockfs_dname() is called from d_path(). 323 * sockfs_dname() is called from d_path().
325 */ 324 */
326 static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) 325 static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
327 { 326 {
328 return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", 327 return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
329 dentry->d_inode->i_ino); 328 dentry->d_inode->i_ino);
330 } 329 }
331 330
332 static struct dentry_operations sockfs_dentry_operations = { 331 static struct dentry_operations sockfs_dentry_operations = {
333 .d_delete = sockfs_delete_dentry, 332 .d_delete = sockfs_delete_dentry,
334 .d_dname = sockfs_dname, 333 .d_dname = sockfs_dname,
335 }; 334 };
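
sockfs_dname() supplies the pseudo-name that d_path() reports for a socket dentry; together with the DCACHE_UNHASHED trick above, it is what makes the /proc/$pid/fd entries for sockets resolvable. A small userspace illustration (not part of this patch):

/* Illustration only: the "socket:[inode]" name produced by sockfs_dname()
 * is what readlink() on /proc/self/fd/<n> returns for a socket. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
        char path[64], target[64];
        int s = socket(AF_INET, SOCK_STREAM, 0);
        ssize_t n;

        snprintf(path, sizeof(path), "/proc/self/fd/%d", s);
        n = readlink(path, target, sizeof(target) - 1);
        if (n > 0) {
                target[n] = '\0';
                printf("%s -> %s\n", path, target);     /* e.g. socket:[12345] */
        }
        close(s);
        return 0;
}
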
336 335
337 /* 336 /*
338 * Obtains the first available file descriptor and sets it up for use. 337 * Obtains the first available file descriptor and sets it up for use.
339 * 338 *
340 * These functions create file structures and map them to the fd space 339 * These functions create file structures and map them to the fd space
341 * of the current process. On success it returns file descriptor 340 * of the current process. On success it returns file descriptor
342 * and file struct implicitly stored in sock->file. 341 * and file struct implicitly stored in sock->file.
343 * Note that another thread may close file descriptor before we return 342 * Note that another thread may close file descriptor before we return
344 * from this function. We use the fact that now we do not refer 343 * from this function. We use the fact that now we do not refer
345 * to the socket after mapping. If one day we need it, this 344 * to the socket after mapping. If one day we need it, this
346 * function will increment ref. count on file by 1. 345 * function will increment ref. count on file by 1.
347 * 346 *
348 * In any case returned fd MAY BE not valid! 347 * In any case returned fd MAY BE not valid!
349 * This race condition is unavoidable 348 * This race condition is unavoidable
350 * with shared fd spaces, we cannot solve it inside kernel, 349 * with shared fd spaces, we cannot solve it inside kernel,
351 * but we take care of internal coherence yet. 350 * but we take care of internal coherence yet.
352 */ 351 */
353 352
354 static int sock_alloc_fd(struct file **filep, int flags) 353 static int sock_alloc_fd(struct file **filep, int flags)
355 { 354 {
356 int fd; 355 int fd;
357 356
358 fd = get_unused_fd_flags(flags); 357 fd = get_unused_fd_flags(flags);
359 if (likely(fd >= 0)) { 358 if (likely(fd >= 0)) {
360 struct file *file = get_empty_filp(); 359 struct file *file = get_empty_filp();
361 360
362 *filep = file; 361 *filep = file;
363 if (unlikely(!file)) { 362 if (unlikely(!file)) {
364 put_unused_fd(fd); 363 put_unused_fd(fd);
365 return -ENFILE; 364 return -ENFILE;
366 } 365 }
367 } else 366 } else
368 *filep = NULL; 367 *filep = NULL;
369 return fd; 368 return fd;
370 } 369 }
371 370
372 static int sock_attach_fd(struct socket *sock, struct file *file, int flags) 371 static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
373 { 372 {
374 struct dentry *dentry; 373 struct dentry *dentry;
375 struct qstr name = { .name = "" }; 374 struct qstr name = { .name = "" };
376 375
377 dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); 376 dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
378 if (unlikely(!dentry)) 377 if (unlikely(!dentry))
379 return -ENOMEM; 378 return -ENOMEM;
380 379
381 dentry->d_op = &sockfs_dentry_operations; 380 dentry->d_op = &sockfs_dentry_operations;
382 /* 381 /*
383 * We don't want to push this dentry into the global dentry hash table. 382 * We don't want to push this dentry into the global dentry hash table.
384 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED 383 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
385 * This permits a working /proc/$pid/fd/XXX on sockets 384 * This permits a working /proc/$pid/fd/XXX on sockets
386 */ 385 */
387 dentry->d_flags &= ~DCACHE_UNHASHED; 386 dentry->d_flags &= ~DCACHE_UNHASHED;
388 d_instantiate(dentry, SOCK_INODE(sock)); 387 d_instantiate(dentry, SOCK_INODE(sock));
389 388
390 sock->file = file; 389 sock->file = file;
391 init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, 390 init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
392 &socket_file_ops); 391 &socket_file_ops);
393 SOCK_INODE(sock)->i_fop = &socket_file_ops; 392 SOCK_INODE(sock)->i_fop = &socket_file_ops;
394 file->f_flags = O_RDWR | (flags & O_NONBLOCK); 393 file->f_flags = O_RDWR | (flags & O_NONBLOCK);
395 file->f_pos = 0; 394 file->f_pos = 0;
396 file->private_data = sock; 395 file->private_data = sock;
397 396
398 return 0; 397 return 0;
399 } 398 }
400 399
401 int sock_map_fd(struct socket *sock, int flags) 400 int sock_map_fd(struct socket *sock, int flags)
402 { 401 {
403 struct file *newfile; 402 struct file *newfile;
404 int fd = sock_alloc_fd(&newfile, flags); 403 int fd = sock_alloc_fd(&newfile, flags);
405 404
406 if (likely(fd >= 0)) { 405 if (likely(fd >= 0)) {
407 int err = sock_attach_fd(sock, newfile, flags); 406 int err = sock_attach_fd(sock, newfile, flags);
408 407
409 if (unlikely(err < 0)) { 408 if (unlikely(err < 0)) {
410 put_filp(newfile); 409 put_filp(newfile);
411 put_unused_fd(fd); 410 put_unused_fd(fd);
412 return err; 411 return err;
413 } 412 }
414 fd_install(fd, newfile); 413 fd_install(fd, newfile);
415 } 414 }
416 return fd; 415 return fd;
417 } 416 }
418 417
419 static struct socket *sock_from_file(struct file *file, int *err) 418 static struct socket *sock_from_file(struct file *file, int *err)
420 { 419 {
421 if (file->f_op == &socket_file_ops) 420 if (file->f_op == &socket_file_ops)
422 return file->private_data; /* set in sock_map_fd */ 421 return file->private_data; /* set in sock_map_fd */
423 422
424 *err = -ENOTSOCK; 423 *err = -ENOTSOCK;
425 return NULL; 424 return NULL;
426 } 425 }
427 426
428 /** 427 /**
429 * sockfd_lookup - Go from a file number to its socket slot 428 * sockfd_lookup - Go from a file number to its socket slot
430 * @fd: file handle 429 * @fd: file handle
431 * @err: pointer to an error code return 430 * @err: pointer to an error code return
432 * 431 *
433 * The file handle passed in is locked and the socket it is bound 432 * The file handle passed in is locked and the socket it is bound
434 * too is returned. If an error occurs the err pointer is overwritten 433 * too is returned. If an error occurs the err pointer is overwritten
435 * with a negative errno code and NULL is returned. The function checks 434 * with a negative errno code and NULL is returned. The function checks
436 * for both invalid handles and passing a handle which is not a socket. 435 * for both invalid handles and passing a handle which is not a socket.
437 * 436 *
438 * On a success the socket object pointer is returned. 437 * On a success the socket object pointer is returned.
439 */ 438 */
440 439
441 struct socket *sockfd_lookup(int fd, int *err) 440 struct socket *sockfd_lookup(int fd, int *err)
442 { 441 {
443 struct file *file; 442 struct file *file;
444 struct socket *sock; 443 struct socket *sock;
445 444
446 file = fget(fd); 445 file = fget(fd);
447 if (!file) { 446 if (!file) {
448 *err = -EBADF; 447 *err = -EBADF;
449 return NULL; 448 return NULL;
450 } 449 }
451 450
452 sock = sock_from_file(file, err); 451 sock = sock_from_file(file, err);
453 if (!sock) 452 if (!sock)
454 fput(file); 453 fput(file);
455 return sock; 454 return sock;
456 } 455 }
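
A hedged sketch of the usual caller pattern for sockfd_lookup(): the lookup pins the backing struct file via fget(), so the caller must drop that reference with fput() once it is done with the socket. example_use_sockfd() is an illustrative name, not code from this file:

#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>

/* Sketch only, not part of this patch. */
static int example_use_sockfd(int fd)
{
        int err;
        struct socket *sock = sockfd_lookup(fd, &err);

        if (!sock)
                return err;             /* -EBADF or -ENOTSOCK */

        printk(KERN_DEBUG "fd %d refers to a type %d socket\n", fd, sock->type);

        fput(sock->file);               /* drop the reference taken by fget() */
        return 0;
}
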
457 456
458 static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) 457 static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
459 { 458 {
460 struct file *file; 459 struct file *file;
461 struct socket *sock; 460 struct socket *sock;
462 461
463 *err = -EBADF; 462 *err = -EBADF;
464 file = fget_light(fd, fput_needed); 463 file = fget_light(fd, fput_needed);
465 if (file) { 464 if (file) {
466 sock = sock_from_file(file, err); 465 sock = sock_from_file(file, err);
467 if (sock) 466 if (sock)
468 return sock; 467 return sock;
469 fput_light(file, *fput_needed); 468 fput_light(file, *fput_needed);
470 } 469 }
471 return NULL; 470 return NULL;
472 } 471 }
473 472
474 /** 473 /**
475 * sock_alloc - allocate a socket 474 * sock_alloc - allocate a socket
476 * 475 *
477 * Allocate a new inode and socket object. The two are bound together 476 * Allocate a new inode and socket object. The two are bound together
478 * and initialised. The socket is then returned. If we are out of inodes 477 * and initialised. The socket is then returned. If we are out of inodes
479 * NULL is returned. 478 * NULL is returned.
480 */ 479 */
481 480
482 static struct socket *sock_alloc(void) 481 static struct socket *sock_alloc(void)
483 { 482 {
484 struct inode *inode; 483 struct inode *inode;
485 struct socket *sock; 484 struct socket *sock;
486 485
487 inode = new_inode(sock_mnt->mnt_sb); 486 inode = new_inode(sock_mnt->mnt_sb);
488 if (!inode) 487 if (!inode)
489 return NULL; 488 return NULL;
490 489
491 sock = SOCKET_I(inode); 490 sock = SOCKET_I(inode);
492 491
493 inode->i_mode = S_IFSOCK | S_IRWXUGO; 492 inode->i_mode = S_IFSOCK | S_IRWXUGO;
494 inode->i_uid = current->fsuid; 493 inode->i_uid = current->fsuid;
495 inode->i_gid = current->fsgid; 494 inode->i_gid = current->fsgid;
496 495
497 get_cpu_var(sockets_in_use)++; 496 get_cpu_var(sockets_in_use)++;
498 put_cpu_var(sockets_in_use); 497 put_cpu_var(sockets_in_use);
499 return sock; 498 return sock;
500 } 499 }
501 500
502 /* 501 /*
503 * In theory you can't get an open on this inode, but /proc provides 502 * In theory you can't get an open on this inode, but /proc provides
504 * a back door. Remember to keep it shut otherwise you'll let the 503 * a back door. Remember to keep it shut otherwise you'll let the
505 * creepy crawlies in. 504 * creepy crawlies in.
506 */ 505 */
507 506
508 static int sock_no_open(struct inode *irrelevant, struct file *dontcare) 507 static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
509 { 508 {
510 return -ENXIO; 509 return -ENXIO;
511 } 510 }
512 511
513 const struct file_operations bad_sock_fops = { 512 const struct file_operations bad_sock_fops = {
514 .owner = THIS_MODULE, 513 .owner = THIS_MODULE,
515 .open = sock_no_open, 514 .open = sock_no_open,
516 }; 515 };
517 516
518 /** 517 /**
519 * sock_release - close a socket 518 * sock_release - close a socket
520 * @sock: socket to close 519 * @sock: socket to close
521 * 520 *
522 * The socket is released from the protocol stack if it has a release 521 * The socket is released from the protocol stack if it has a release
523 * callback, and the inode is then released if the socket is bound to 522 * callback, and the inode is then released if the socket is bound to
524 * an inode not a file. 523 * an inode not a file.
525 */ 524 */
526 525
527 void sock_release(struct socket *sock) 526 void sock_release(struct socket *sock)
528 { 527 {
529 if (sock->ops) { 528 if (sock->ops) {
530 struct module *owner = sock->ops->owner; 529 struct module *owner = sock->ops->owner;
531 530
532 sock->ops->release(sock); 531 sock->ops->release(sock);
533 sock->ops = NULL; 532 sock->ops = NULL;
534 module_put(owner); 533 module_put(owner);
535 } 534 }
536 535
537 if (sock->fasync_list) 536 if (sock->fasync_list)
538 printk(KERN_ERR "sock_release: fasync list not empty!\n"); 537 printk(KERN_ERR "sock_release: fasync list not empty!\n");
539 538
540 get_cpu_var(sockets_in_use)--; 539 get_cpu_var(sockets_in_use)--;
541 put_cpu_var(sockets_in_use); 540 put_cpu_var(sockets_in_use);
542 if (!sock->file) { 541 if (!sock->file) {
543 iput(SOCK_INODE(sock)); 542 iput(SOCK_INODE(sock));
544 return; 543 return;
545 } 544 }
546 sock->file = NULL; 545 sock->file = NULL;
547 } 546 }
548 547
549 static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, 548 static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
550 struct msghdr *msg, size_t size) 549 struct msghdr *msg, size_t size)
551 { 550 {
552 struct sock_iocb *si = kiocb_to_siocb(iocb); 551 struct sock_iocb *si = kiocb_to_siocb(iocb);
553 int err; 552 int err;
554 553
555 si->sock = sock; 554 si->sock = sock;
556 si->scm = NULL; 555 si->scm = NULL;
557 si->msg = msg; 556 si->msg = msg;
558 si->size = size; 557 si->size = size;
559 558
560 err = security_socket_sendmsg(sock, msg, size); 559 err = security_socket_sendmsg(sock, msg, size);
561 if (err) 560 if (err)
562 return err; 561 return err;
563 562
564 return sock->ops->sendmsg(iocb, sock, msg, size); 563 return sock->ops->sendmsg(iocb, sock, msg, size);
565 } 564 }
566 565
567 int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) 566 int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
568 { 567 {
569 struct kiocb iocb; 568 struct kiocb iocb;
570 struct sock_iocb siocb; 569 struct sock_iocb siocb;
571 int ret; 570 int ret;
572 571
573 init_sync_kiocb(&iocb, NULL); 572 init_sync_kiocb(&iocb, NULL);
574 iocb.private = &siocb; 573 iocb.private = &siocb;
575 ret = __sock_sendmsg(&iocb, sock, msg, size); 574 ret = __sock_sendmsg(&iocb, sock, msg, size);
576 if (-EIOCBQUEUED == ret) 575 if (-EIOCBQUEUED == ret)
577 ret = wait_on_sync_kiocb(&iocb); 576 ret = wait_on_sync_kiocb(&iocb);
578 return ret; 577 return ret;
579 } 578 }
580 579
581 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, 580 int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
582 struct kvec *vec, size_t num, size_t size) 581 struct kvec *vec, size_t num, size_t size)
583 { 582 {
584 mm_segment_t oldfs = get_fs(); 583 mm_segment_t oldfs = get_fs();
585 int result; 584 int result;
586 585
587 set_fs(KERNEL_DS); 586 set_fs(KERNEL_DS);
588 /* 587 /*
589 * the following is safe, since for compiler definitions of kvec and 588 * the following is safe, since for compiler definitions of kvec and
590 * iovec are identical, yielding the same in-core layout and alignment 589 * iovec are identical, yielding the same in-core layout and alignment
591 */ 590 */
592 msg->msg_iov = (struct iovec *)vec; 591 msg->msg_iov = (struct iovec *)vec;
593 msg->msg_iovlen = num; 592 msg->msg_iovlen = num;
594 result = sock_sendmsg(sock, msg, size); 593 result = sock_sendmsg(sock, msg, size);
595 set_fs(oldfs); 594 set_fs(oldfs);
596 return result; 595 return result;
597 } 596 }
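
kernel_sendmsg() exists so in-kernel users can push kernel buffers through the same iovec-based path: set_fs(KERNEL_DS) lifts the user-pointer check and the kvec is cast to an iovec because, as the comment says, the two layouts are identical. A hedged sketch of the typical caller, with error handling trimmed (not part of this patch):

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Sketch only: sending a kernel buffer over an in-kernel socket. */
static int example_kernel_send(struct socket *sock, void *buf, size_t len)
{
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec vec = { .iov_base = buf, .iov_len = len };

        /* kernel_sendmsg() fills in msg_iov/msg_iovlen from the kvec */
        return kernel_sendmsg(sock, &msg, &vec, 1, len);
}
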
598 597
599 /* 598 /*
600 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) 599 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
601 */ 600 */
602 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, 601 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
603 struct sk_buff *skb) 602 struct sk_buff *skb)
604 { 603 {
605 ktime_t kt = skb->tstamp; 604 ktime_t kt = skb->tstamp;
606 605
607 if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { 606 if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
608 struct timeval tv; 607 struct timeval tv;
609 /* Race occurred between timestamp enabling and packet 608 /* Race occurred between timestamp enabling and packet
610 receiving. Fill in the current time for now. */ 609 receiving. Fill in the current time for now. */
611 if (kt.tv64 == 0) 610 if (kt.tv64 == 0)
612 kt = ktime_get_real(); 611 kt = ktime_get_real();
613 skb->tstamp = kt; 612 skb->tstamp = kt;
614 tv = ktime_to_timeval(kt); 613 tv = ktime_to_timeval(kt);
615 put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); 614 put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv);
616 } else { 615 } else {
617 struct timespec ts; 616 struct timespec ts;
618 /* Race occurred between timestamp enabling and packet 617 /* Race occurred between timestamp enabling and packet
619 receiving. Fill in the current time for now. */ 618 receiving. Fill in the current time for now. */
620 if (kt.tv64 == 0) 619 if (kt.tv64 == 0)
621 kt = ktime_get_real(); 620 kt = ktime_get_real();
622 skb->tstamp = kt; 621 skb->tstamp = kt;
623 ts = ktime_to_timespec(kt); 622 ts = ktime_to_timespec(kt);
624 put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); 623 put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts);
625 } 624 }
626 } 625 }
627 626
628 EXPORT_SYMBOL_GPL(__sock_recv_timestamp); 627 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
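
__sock_recv_timestamp() is what attaches the SCM_TIMESTAMP (or SCM_TIMESTAMPNS) control message once SO_TIMESTAMP has been enabled on the socket. A userspace fragment reading it back, assuming s is a bound datagram socket that actually receives traffic (illustrative only):

/* Illustration only: enable SO_TIMESTAMP and read the SCM_TIMESTAMP
 * control message that __sock_recv_timestamp() attaches. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/uio.h>

static void recv_with_timestamp(int s)
{
        char data[1500], ctrl[CMSG_SPACE(sizeof(struct timeval))];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
        };
        struct cmsghdr *cm;
        int on = 1;

        setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
        if (recvmsg(s, &msg, 0) < 0)
                return;
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
                if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMP) {
                        struct timeval tv;

                        memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
                        printf("packet received at %ld.%06ld\n",
                               (long)tv.tv_sec, (long)tv.tv_usec);
                }
}
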
629 628
630 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, 629 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
631 struct msghdr *msg, size_t size, int flags) 630 struct msghdr *msg, size_t size, int flags)
632 { 631 {
633 int err; 632 int err;
634 struct sock_iocb *si = kiocb_to_siocb(iocb); 633 struct sock_iocb *si = kiocb_to_siocb(iocb);
635 634
636 si->sock = sock; 635 si->sock = sock;
637 si->scm = NULL; 636 si->scm = NULL;
638 si->msg = msg; 637 si->msg = msg;
639 si->size = size; 638 si->size = size;
640 si->flags = flags; 639 si->flags = flags;
641 640
642 err = security_socket_recvmsg(sock, msg, size, flags); 641 err = security_socket_recvmsg(sock, msg, size, flags);
643 if (err) 642 if (err)
644 return err; 643 return err;
645 644
646 return sock->ops->recvmsg(iocb, sock, msg, size, flags); 645 return sock->ops->recvmsg(iocb, sock, msg, size, flags);
647 } 646 }
648 647
649 int sock_recvmsg(struct socket *sock, struct msghdr *msg, 648 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
650 size_t size, int flags) 649 size_t size, int flags)
651 { 650 {
652 struct kiocb iocb; 651 struct kiocb iocb;
653 struct sock_iocb siocb; 652 struct sock_iocb siocb;
654 int ret; 653 int ret;
655 654
656 init_sync_kiocb(&iocb, NULL); 655 init_sync_kiocb(&iocb, NULL);
657 iocb.private = &siocb; 656 iocb.private = &siocb;
658 ret = __sock_recvmsg(&iocb, sock, msg, size, flags); 657 ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
659 if (-EIOCBQUEUED == ret) 658 if (-EIOCBQUEUED == ret)
660 ret = wait_on_sync_kiocb(&iocb); 659 ret = wait_on_sync_kiocb(&iocb);
661 return ret; 660 return ret;
662 } 661 }
663 662
664 int kernel_recvmsg(struct socket *sock, struct msghdr *msg, 663 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
665 struct kvec *vec, size_t num, size_t size, int flags) 664 struct kvec *vec, size_t num, size_t size, int flags)
666 { 665 {
667 mm_segment_t oldfs = get_fs(); 666 mm_segment_t oldfs = get_fs();
668 int result; 667 int result;
669 668
670 set_fs(KERNEL_DS); 669 set_fs(KERNEL_DS);
671 /* 670 /*
672 * the following is safe, since for compiler definitions of kvec and 671 * the following is safe, since for compiler definitions of kvec and
673 * iovec are identical, yielding the same in-core layout and alignment 672 * iovec are identical, yielding the same in-core layout and alignment
674 */ 673 */
675 msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num; 674 msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
676 result = sock_recvmsg(sock, msg, size, flags); 675 result = sock_recvmsg(sock, msg, size, flags);
677 set_fs(oldfs); 676 set_fs(oldfs);
678 return result; 677 return result;
679 } 678 }
680 679
681 static void sock_aio_dtor(struct kiocb *iocb) 680 static void sock_aio_dtor(struct kiocb *iocb)
682 { 681 {
683 kfree(iocb->private); 682 kfree(iocb->private);
684 } 683 }
685 684
686 static ssize_t sock_sendpage(struct file *file, struct page *page, 685 static ssize_t sock_sendpage(struct file *file, struct page *page,
687 int offset, size_t size, loff_t *ppos, int more) 686 int offset, size_t size, loff_t *ppos, int more)
688 { 687 {
689 struct socket *sock; 688 struct socket *sock;
690 int flags; 689 int flags;
691 690
692 sock = file->private_data; 691 sock = file->private_data;
693 692
694 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; 693 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
695 if (more) 694 if (more)
696 flags |= MSG_MORE; 695 flags |= MSG_MORE;
697 696
698 return sock->ops->sendpage(sock, page, offset, size, flags); 697 return sock->ops->sendpage(sock, page, offset, size, flags);
699 } 698 }
700 699
701 static ssize_t sock_splice_read(struct file *file, loff_t *ppos, 700 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
702 struct pipe_inode_info *pipe, size_t len, 701 struct pipe_inode_info *pipe, size_t len,
703 unsigned int flags) 702 unsigned int flags)
704 { 703 {
705 struct socket *sock = file->private_data; 704 struct socket *sock = file->private_data;
706 705
707 if (unlikely(!sock->ops->splice_read)) 706 if (unlikely(!sock->ops->splice_read))
708 return -EINVAL; 707 return -EINVAL;
709 708
710 return sock->ops->splice_read(sock, ppos, pipe, len, flags); 709 return sock->ops->splice_read(sock, ppos, pipe, len, flags);
711 } 710 }
712 711
713 static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, 712 static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
714 struct sock_iocb *siocb) 713 struct sock_iocb *siocb)
715 { 714 {
716 if (!is_sync_kiocb(iocb)) { 715 if (!is_sync_kiocb(iocb)) {
717 siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); 716 siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
718 if (!siocb) 717 if (!siocb)
719 return NULL; 718 return NULL;
720 iocb->ki_dtor = sock_aio_dtor; 719 iocb->ki_dtor = sock_aio_dtor;
721 } 720 }
722 721
723 siocb->kiocb = iocb; 722 siocb->kiocb = iocb;
724 iocb->private = siocb; 723 iocb->private = siocb;
725 return siocb; 724 return siocb;
726 } 725 }
727 726
728 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, 727 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
729 struct file *file, const struct iovec *iov, 728 struct file *file, const struct iovec *iov,
730 unsigned long nr_segs) 729 unsigned long nr_segs)
731 { 730 {
732 struct socket *sock = file->private_data; 731 struct socket *sock = file->private_data;
733 size_t size = 0; 732 size_t size = 0;
734 int i; 733 int i;
735 734
736 for (i = 0; i < nr_segs; i++) 735 for (i = 0; i < nr_segs; i++)
737 size += iov[i].iov_len; 736 size += iov[i].iov_len;
738 737
739 msg->msg_name = NULL; 738 msg->msg_name = NULL;
740 msg->msg_namelen = 0; 739 msg->msg_namelen = 0;
741 msg->msg_control = NULL; 740 msg->msg_control = NULL;
742 msg->msg_controllen = 0; 741 msg->msg_controllen = 0;
743 msg->msg_iov = (struct iovec *)iov; 742 msg->msg_iov = (struct iovec *)iov;
744 msg->msg_iovlen = nr_segs; 743 msg->msg_iovlen = nr_segs;
745 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; 744 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
746 745
747 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); 746 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
748 } 747 }
749 748
750 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, 749 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
751 unsigned long nr_segs, loff_t pos) 750 unsigned long nr_segs, loff_t pos)
752 { 751 {
753 struct sock_iocb siocb, *x; 752 struct sock_iocb siocb, *x;
754 753
755 if (pos != 0) 754 if (pos != 0)
756 return -ESPIPE; 755 return -ESPIPE;
757 756
758 if (iocb->ki_left == 0) /* Match SYS5 behaviour */ 757 if (iocb->ki_left == 0) /* Match SYS5 behaviour */
759 return 0; 758 return 0;
760 759
761 760
762 x = alloc_sock_iocb(iocb, &siocb); 761 x = alloc_sock_iocb(iocb, &siocb);
763 if (!x) 762 if (!x)
764 return -ENOMEM; 763 return -ENOMEM;
765 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); 764 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
766 } 765 }
767 766
768 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, 767 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
769 struct file *file, const struct iovec *iov, 768 struct file *file, const struct iovec *iov,
770 unsigned long nr_segs) 769 unsigned long nr_segs)
771 { 770 {
772 struct socket *sock = file->private_data; 771 struct socket *sock = file->private_data;
773 size_t size = 0; 772 size_t size = 0;
774 int i; 773 int i;
775 774
776 for (i = 0; i < nr_segs; i++) 775 for (i = 0; i < nr_segs; i++)
777 size += iov[i].iov_len; 776 size += iov[i].iov_len;
778 777
779 msg->msg_name = NULL; 778 msg->msg_name = NULL;
780 msg->msg_namelen = 0; 779 msg->msg_namelen = 0;
781 msg->msg_control = NULL; 780 msg->msg_control = NULL;
782 msg->msg_controllen = 0; 781 msg->msg_controllen = 0;
783 msg->msg_iov = (struct iovec *)iov; 782 msg->msg_iov = (struct iovec *)iov;
784 msg->msg_iovlen = nr_segs; 783 msg->msg_iovlen = nr_segs;
785 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; 784 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
786 if (sock->type == SOCK_SEQPACKET) 785 if (sock->type == SOCK_SEQPACKET)
787 msg->msg_flags |= MSG_EOR; 786 msg->msg_flags |= MSG_EOR;
788 787
789 return __sock_sendmsg(iocb, sock, msg, size); 788 return __sock_sendmsg(iocb, sock, msg, size);
790 } 789 }
791 790
792 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, 791 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
793 unsigned long nr_segs, loff_t pos) 792 unsigned long nr_segs, loff_t pos)
794 { 793 {
795 struct sock_iocb siocb, *x; 794 struct sock_iocb siocb, *x;
796 795
797 if (pos != 0) 796 if (pos != 0)
798 return -ESPIPE; 797 return -ESPIPE;
799 798
800 x = alloc_sock_iocb(iocb, &siocb); 799 x = alloc_sock_iocb(iocb, &siocb);
801 if (!x) 800 if (!x)
802 return -ENOMEM; 801 return -ENOMEM;
803 802
804 return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); 803 return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
805 } 804 }
806 805
807 /* 806 /*
808 * Atomic setting of ioctl hooks to avoid race 807 * Atomic setting of ioctl hooks to avoid race
809 * with module unload. 808 * with module unload.
810 */ 809 */
811 810
812 static DEFINE_MUTEX(br_ioctl_mutex); 811 static DEFINE_MUTEX(br_ioctl_mutex);
813 static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL; 812 static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL;
814 813
815 void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) 814 void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
816 { 815 {
817 mutex_lock(&br_ioctl_mutex); 816 mutex_lock(&br_ioctl_mutex);
818 br_ioctl_hook = hook; 817 br_ioctl_hook = hook;
819 mutex_unlock(&br_ioctl_mutex); 818 mutex_unlock(&br_ioctl_mutex);
820 } 819 }
821 820
822 EXPORT_SYMBOL(brioctl_set); 821 EXPORT_SYMBOL(brioctl_set);
823 822
824 static DEFINE_MUTEX(vlan_ioctl_mutex); 823 static DEFINE_MUTEX(vlan_ioctl_mutex);
825 static int (*vlan_ioctl_hook) (struct net *, void __user *arg); 824 static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
826 825
827 void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) 826 void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
828 { 827 {
829 mutex_lock(&vlan_ioctl_mutex); 828 mutex_lock(&vlan_ioctl_mutex);
830 vlan_ioctl_hook = hook; 829 vlan_ioctl_hook = hook;
831 mutex_unlock(&vlan_ioctl_mutex); 830 mutex_unlock(&vlan_ioctl_mutex);
832 } 831 }
833 832
834 EXPORT_SYMBOL(vlan_ioctl_set); 833 EXPORT_SYMBOL(vlan_ioctl_set);
835 834
836 static DEFINE_MUTEX(dlci_ioctl_mutex); 835 static DEFINE_MUTEX(dlci_ioctl_mutex);
837 static int (*dlci_ioctl_hook) (unsigned int, void __user *); 836 static int (*dlci_ioctl_hook) (unsigned int, void __user *);
838 837
839 void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) 838 void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
840 { 839 {
841 mutex_lock(&dlci_ioctl_mutex); 840 mutex_lock(&dlci_ioctl_mutex);
842 dlci_ioctl_hook = hook; 841 dlci_ioctl_hook = hook;
843 mutex_unlock(&dlci_ioctl_mutex); 842 mutex_unlock(&dlci_ioctl_mutex);
844 } 843 }
845 844
846 EXPORT_SYMBOL(dlci_ioctl_set); 845 EXPORT_SYMBOL(dlci_ioctl_set);
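
All three setters follow the same pattern: take the mutex, swap the hook pointer, release. A hedged sketch of how the owning module publishes and withdraws its handler around load and unload; my_bridge_ioctl() is a placeholder name, not code from the bridge module:

#include <linux/errno.h>
#include <linux/if_bridge.h>
#include <linux/module.h>

/* Sketch only: placeholder handler for SIOCGIFBR/SIOCSIFBR/... */
static int my_bridge_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
        return -EOPNOTSUPP;
}

static int __init my_module_init(void)
{
        brioctl_set(my_bridge_ioctl);
        return 0;
}

static void __exit my_module_exit(void)
{
        brioctl_set(NULL);      /* must clear the hook before the code goes away */
}

module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
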
847 846
848 /* 847 /*
849 * With an ioctl, arg may well be a user mode pointer, but we don't know 848 * With an ioctl, arg may well be a user mode pointer, but we don't know
850 * what to do with it - that's up to the protocol still. 849 * what to do with it - that's up to the protocol still.
851 */ 850 */
852 851
853 static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) 852 static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
854 { 853 {
855 struct socket *sock; 854 struct socket *sock;
856 struct sock *sk; 855 struct sock *sk;
857 void __user *argp = (void __user *)arg; 856 void __user *argp = (void __user *)arg;
858 int pid, err; 857 int pid, err;
859 struct net *net; 858 struct net *net;
860 859
861 sock = file->private_data; 860 sock = file->private_data;
862 sk = sock->sk; 861 sk = sock->sk;
863 net = sock_net(sk); 862 net = sock_net(sk);
864 if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { 863 if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
865 err = dev_ioctl(net, cmd, argp); 864 err = dev_ioctl(net, cmd, argp);
866 } else 865 } else
867 #ifdef CONFIG_WIRELESS_EXT 866 #ifdef CONFIG_WIRELESS_EXT
868 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { 867 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
869 err = dev_ioctl(net, cmd, argp); 868 err = dev_ioctl(net, cmd, argp);
870 } else 869 } else
871 #endif /* CONFIG_WIRELESS_EXT */ 870 #endif /* CONFIG_WIRELESS_EXT */
872 switch (cmd) { 871 switch (cmd) {
873 case FIOSETOWN: 872 case FIOSETOWN:
874 case SIOCSPGRP: 873 case SIOCSPGRP:
875 err = -EFAULT; 874 err = -EFAULT;
876 if (get_user(pid, (int __user *)argp)) 875 if (get_user(pid, (int __user *)argp))
877 break; 876 break;
878 err = f_setown(sock->file, pid, 1); 877 err = f_setown(sock->file, pid, 1);
879 break; 878 break;
880 case FIOGETOWN: 879 case FIOGETOWN:
881 case SIOCGPGRP: 880 case SIOCGPGRP:
882 err = put_user(f_getown(sock->file), 881 err = put_user(f_getown(sock->file),
883 (int __user *)argp); 882 (int __user *)argp);
884 break; 883 break;
885 case SIOCGIFBR: 884 case SIOCGIFBR:
886 case SIOCSIFBR: 885 case SIOCSIFBR:
887 case SIOCBRADDBR: 886 case SIOCBRADDBR:
888 case SIOCBRDELBR: 887 case SIOCBRDELBR:
889 err = -ENOPKG; 888 err = -ENOPKG;
890 if (!br_ioctl_hook) 889 if (!br_ioctl_hook)
891 request_module("bridge"); 890 request_module("bridge");
892 891
893 mutex_lock(&br_ioctl_mutex); 892 mutex_lock(&br_ioctl_mutex);
894 if (br_ioctl_hook) 893 if (br_ioctl_hook)
895 err = br_ioctl_hook(net, cmd, argp); 894 err = br_ioctl_hook(net, cmd, argp);
896 mutex_unlock(&br_ioctl_mutex); 895 mutex_unlock(&br_ioctl_mutex);
897 break; 896 break;
898 case SIOCGIFVLAN: 897 case SIOCGIFVLAN:
899 case SIOCSIFVLAN: 898 case SIOCSIFVLAN:
900 err = -ENOPKG; 899 err = -ENOPKG;
901 if (!vlan_ioctl_hook) 900 if (!vlan_ioctl_hook)
902 request_module("8021q"); 901 request_module("8021q");
903 902
904 mutex_lock(&vlan_ioctl_mutex); 903 mutex_lock(&vlan_ioctl_mutex);
905 if (vlan_ioctl_hook) 904 if (vlan_ioctl_hook)
906 err = vlan_ioctl_hook(net, argp); 905 err = vlan_ioctl_hook(net, argp);
907 mutex_unlock(&vlan_ioctl_mutex); 906 mutex_unlock(&vlan_ioctl_mutex);
908 break; 907 break;
909 case SIOCADDDLCI: 908 case SIOCADDDLCI:
910 case SIOCDELDLCI: 909 case SIOCDELDLCI:
911 err = -ENOPKG; 910 err = -ENOPKG;
912 if (!dlci_ioctl_hook) 911 if (!dlci_ioctl_hook)
913 request_module("dlci"); 912 request_module("dlci");
914 913
915 mutex_lock(&dlci_ioctl_mutex); 914 mutex_lock(&dlci_ioctl_mutex);
916 if (dlci_ioctl_hook) 915 if (dlci_ioctl_hook)
917 err = dlci_ioctl_hook(cmd, argp); 916 err = dlci_ioctl_hook(cmd, argp);
918 mutex_unlock(&dlci_ioctl_mutex); 917 mutex_unlock(&dlci_ioctl_mutex);
919 break; 918 break;
920 default: 919 default:
921 err = sock->ops->ioctl(sock, cmd, arg); 920 err = sock->ops->ioctl(sock, cmd, arg);
922 921
923 /* 922 /*
924 * If this ioctl is unknown try to hand it down 923 * If this ioctl is unknown try to hand it down
925 * to the NIC driver. 924 * to the NIC driver.
926 */ 925 */
927 if (err == -ENOIOCTLCMD) 926 if (err == -ENOIOCTLCMD)
928 err = dev_ioctl(net, cmd, argp); 927 err = dev_ioctl(net, cmd, argp);
929 break; 928 break;
930 } 929 }
931 return err; 930 return err;
932 } 931 }
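
Because unknown commands fall through to dev_ioctl(), any socket file descriptor can be used to issue generic interface ioctls, regardless of the protocol it was created for. A userspace illustration (not part of this patch):

/* Illustration only: SIOCGIFFLAGS is not a protocol ioctl, so sock_ioctl()
 * hands it down to dev_ioctl() for us. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int s = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
        if (ioctl(s, SIOCGIFFLAGS, &ifr) == 0)
                printf("lo flags: %#x (IFF_UP %s set)\n", ifr.ifr_flags,
                       (ifr.ifr_flags & IFF_UP) ? "is" : "not");
        close(s);
        return 0;
}
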
933 932
934 int sock_create_lite(int family, int type, int protocol, struct socket **res) 933 int sock_create_lite(int family, int type, int protocol, struct socket **res)
935 { 934 {
936 int err; 935 int err;
937 struct socket *sock = NULL; 936 struct socket *sock = NULL;
938 937
939 err = security_socket_create(family, type, protocol, 1); 938 err = security_socket_create(family, type, protocol, 1);
940 if (err) 939 if (err)
941 goto out; 940 goto out;
942 941
943 sock = sock_alloc(); 942 sock = sock_alloc();
944 if (!sock) { 943 if (!sock) {
945 err = -ENOMEM; 944 err = -ENOMEM;
946 goto out; 945 goto out;
947 } 946 }
948 947
949 sock->type = type; 948 sock->type = type;
950 err = security_socket_post_create(sock, family, type, protocol, 1); 949 err = security_socket_post_create(sock, family, type, protocol, 1);
951 if (err) 950 if (err)
952 goto out_release; 951 goto out_release;
953 952
954 out: 953 out:
955 *res = sock; 954 *res = sock;
956 return err; 955 return err;
957 out_release: 956 out_release:
958 sock_release(sock); 957 sock_release(sock);
959 sock = NULL; 958 sock = NULL;
960 goto out; 959 goto out;
961 } 960 }
962 961
963 /* No kernel lock held - perfect */ 962 /* No kernel lock held - perfect */
964 static unsigned int sock_poll(struct file *file, poll_table *wait) 963 static unsigned int sock_poll(struct file *file, poll_table *wait)
965 { 964 {
966 struct socket *sock; 965 struct socket *sock;
967 966
968 /* 967 /*
969 * We can't return errors to poll, so it's either yes or no. 968 * We can't return errors to poll, so it's either yes or no.
970 */ 969 */
971 sock = file->private_data; 970 sock = file->private_data;
972 return sock->ops->poll(file, sock, wait); 971 return sock->ops->poll(file, sock, wait);
973 } 972 }
974 973
975 static int sock_mmap(struct file *file, struct vm_area_struct *vma) 974 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
976 { 975 {
977 struct socket *sock = file->private_data; 976 struct socket *sock = file->private_data;
978 977
979 return sock->ops->mmap(file, sock, vma); 978 return sock->ops->mmap(file, sock, vma);
980 } 979 }
981 980
982 static int sock_close(struct inode *inode, struct file *filp) 981 static int sock_close(struct inode *inode, struct file *filp)
983 { 982 {
984 /* 983 /*
984 * It is possible the inode is NULL if we are 983 * It is possible the inode is NULL if we are
986 * closing an unfinished socket. 985 * closing an unfinished socket.
987 */ 986 */
988 987
989 if (!inode) { 988 if (!inode) {
990 printk(KERN_DEBUG "sock_close: NULL inode\n"); 989 printk(KERN_DEBUG "sock_close: NULL inode\n");
991 return 0; 990 return 0;
992 } 991 }
993 sock_fasync(-1, filp, 0); 992 sock_fasync(-1, filp, 0);
994 sock_release(SOCKET_I(inode)); 993 sock_release(SOCKET_I(inode));
995 return 0; 994 return 0;
996 } 995 }
997 996
998 /* 997 /*
999 * Update the socket async list 998 * Update the socket async list
1000 * 999 *
1001 * Fasync_list locking strategy. 1000 * Fasync_list locking strategy.
1002 * 1001 *
1003 * 1. fasync_list is modified only under process context socket lock 1002 * 1. fasync_list is modified only under process context socket lock
1004 * i.e. under semaphore. 1003 * i.e. under semaphore.
1005 * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) 1004 * 2. fasync_list is used under read_lock(&sk->sk_callback_lock)
1006 * or under socket lock. 1005 * or under socket lock.
1007 * 3. fasync_list can be used from softirq context, so that 1006 * 3. fasync_list can be used from softirq context, so that
1008 * modification under socket lock have to be enhanced with 1007 * modification under socket lock have to be enhanced with
1009 * write_lock_bh(&sk->sk_callback_lock). 1008 * write_lock_bh(&sk->sk_callback_lock).
1010 * --ANK (990710) 1009 * --ANK (990710)
1011 */ 1010 */
1012 1011
1013 static int sock_fasync(int fd, struct file *filp, int on) 1012 static int sock_fasync(int fd, struct file *filp, int on)
1014 { 1013 {
1015 struct fasync_struct *fa, *fna = NULL, **prev; 1014 struct fasync_struct *fa, *fna = NULL, **prev;
1016 struct socket *sock; 1015 struct socket *sock;
1017 struct sock *sk; 1016 struct sock *sk;
1018 1017
1019 if (on) { 1018 if (on) {
1020 fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); 1019 fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
1021 if (fna == NULL) 1020 if (fna == NULL)
1022 return -ENOMEM; 1021 return -ENOMEM;
1023 } 1022 }
1024 1023
1025 sock = filp->private_data; 1024 sock = filp->private_data;
1026 1025
1027 sk = sock->sk; 1026 sk = sock->sk;
1028 if (sk == NULL) { 1027 if (sk == NULL) {
1029 kfree(fna); 1028 kfree(fna);
1030 return -EINVAL; 1029 return -EINVAL;
1031 } 1030 }
1032 1031
1033 lock_sock(sk); 1032 lock_sock(sk);
1034 1033
1035 prev = &(sock->fasync_list); 1034 prev = &(sock->fasync_list);
1036 1035
1037 for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev) 1036 for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev)
1038 if (fa->fa_file == filp) 1037 if (fa->fa_file == filp)
1039 break; 1038 break;
1040 1039
1041 if (on) { 1040 if (on) {
1042 if (fa != NULL) { 1041 if (fa != NULL) {
1043 write_lock_bh(&sk->sk_callback_lock); 1042 write_lock_bh(&sk->sk_callback_lock);
1044 fa->fa_fd = fd; 1043 fa->fa_fd = fd;
1045 write_unlock_bh(&sk->sk_callback_lock); 1044 write_unlock_bh(&sk->sk_callback_lock);
1046 1045
1047 kfree(fna); 1046 kfree(fna);
1048 goto out; 1047 goto out;
1049 } 1048 }
1050 fna->fa_file = filp; 1049 fna->fa_file = filp;
1051 fna->fa_fd = fd; 1050 fna->fa_fd = fd;
1052 fna->magic = FASYNC_MAGIC; 1051 fna->magic = FASYNC_MAGIC;
1053 fna->fa_next = sock->fasync_list; 1052 fna->fa_next = sock->fasync_list;
1054 write_lock_bh(&sk->sk_callback_lock); 1053 write_lock_bh(&sk->sk_callback_lock);
1055 sock->fasync_list = fna; 1054 sock->fasync_list = fna;
1056 write_unlock_bh(&sk->sk_callback_lock); 1055 write_unlock_bh(&sk->sk_callback_lock);
1057 } else { 1056 } else {
1058 if (fa != NULL) { 1057 if (fa != NULL) {
1059 write_lock_bh(&sk->sk_callback_lock); 1058 write_lock_bh(&sk->sk_callback_lock);
1060 *prev = fa->fa_next; 1059 *prev = fa->fa_next;
1061 write_unlock_bh(&sk->sk_callback_lock); 1060 write_unlock_bh(&sk->sk_callback_lock);
1062 kfree(fa); 1061 kfree(fa);
1063 } 1062 }
1064 } 1063 }
1065 1064
1066 out: 1065 out:
1067 release_sock(sock->sk); 1066 release_sock(sock->sk);
1068 return 0; 1067 return 0;
1069 } 1068 }
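
sock_fasync() maintains the per-socket fasync list that sock_wake_async() below walks to raise SIGIO and SIGURG. From userspace the registration is driven by F_SETOWN plus O_ASYNC (or the FIOASYNC ioctl), as in this illustrative fragment:

/* Illustration only: fcntl(F_SETFL, ... | O_ASYNC) ends up in sock_fasync()
 * through the file's fasync operation. */
#include <fcntl.h>
#include <signal.h>
#include <sys/socket.h>
#include <unistd.h>

static void on_sigio(int sig)
{
        (void)sig;      /* data arrived or buffer space became available */
}

int main(void)
{
        int s = socket(AF_INET, SOCK_DGRAM, 0);

        signal(SIGIO, on_sigio);
        fcntl(s, F_SETOWN, getpid());                   /* who receives the signal */
        fcntl(s, F_SETFL, fcntl(s, F_GETFL) | O_ASYNC); /* registers on fasync_list */
        pause();                                        /* wait for SIGIO */
        close(s);
        return 0;
}
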
1070 1069
1071 /* This function may be called only under socket lock or callback_lock */ 1070 /* This function may be called only under socket lock or callback_lock */
1072 1071
1073 int sock_wake_async(struct socket *sock, int how, int band) 1072 int sock_wake_async(struct socket *sock, int how, int band)
1074 { 1073 {
1075 if (!sock || !sock->fasync_list) 1074 if (!sock || !sock->fasync_list)
1076 return -1; 1075 return -1;
1077 switch (how) { 1076 switch (how) {
1078 case SOCK_WAKE_WAITD: 1077 case SOCK_WAKE_WAITD:
1079 if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) 1078 if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
1080 break; 1079 break;
1081 goto call_kill; 1080 goto call_kill;
1082 case SOCK_WAKE_SPACE: 1081 case SOCK_WAKE_SPACE:
1083 if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) 1082 if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
1084 break; 1083 break;
1085 /* fall through */ 1084 /* fall through */
1086 case SOCK_WAKE_IO: 1085 case SOCK_WAKE_IO:
1087 call_kill: 1086 call_kill:
1088 __kill_fasync(sock->fasync_list, SIGIO, band); 1087 __kill_fasync(sock->fasync_list, SIGIO, band);
1089 break; 1088 break;
1090 case SOCK_WAKE_URG: 1089 case SOCK_WAKE_URG:
1091 __kill_fasync(sock->fasync_list, SIGURG, band); 1090 __kill_fasync(sock->fasync_list, SIGURG, band);
1092 } 1091 }
1093 return 0; 1092 return 0;
1094 } 1093 }
1095 1094
1096 static int __sock_create(struct net *net, int family, int type, int protocol, 1095 static int __sock_create(struct net *net, int family, int type, int protocol,
1097 struct socket **res, int kern) 1096 struct socket **res, int kern)
1098 { 1097 {
1099 int err; 1098 int err;
1100 struct socket *sock; 1099 struct socket *sock;
1101 const struct net_proto_family *pf; 1100 const struct net_proto_family *pf;
1102 1101
1103 /* 1102 /*
1104 * Check protocol is in range 1103 * Check protocol is in range
1105 */ 1104 */
1106 if (family < 0 || family >= NPROTO) 1105 if (family < 0 || family >= NPROTO)
1107 return -EAFNOSUPPORT; 1106 return -EAFNOSUPPORT;
1108 if (type < 0 || type >= SOCK_MAX) 1107 if (type < 0 || type >= SOCK_MAX)
1109 return -EINVAL; 1108 return -EINVAL;
1110 1109
1111 /* Compatibility. 1110 /* Compatibility.
1112 1111
1113 This uglymoron is moved from INET layer to here to avoid 1112 This uglymoron is moved from INET layer to here to avoid
1114 deadlock in module load. 1113 deadlock in module load.
1115 */ 1114 */
1116 if (family == PF_INET && type == SOCK_PACKET) { 1115 if (family == PF_INET && type == SOCK_PACKET) {
1117 static int warned; 1116 static int warned;
1118 if (!warned) { 1117 if (!warned) {
1119 warned = 1; 1118 warned = 1;
1120 printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", 1119 printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
1121 current->comm); 1120 current->comm);
1122 } 1121 }
1123 family = PF_PACKET; 1122 family = PF_PACKET;
1124 } 1123 }
1125 1124
1126 err = security_socket_create(family, type, protocol, kern); 1125 err = security_socket_create(family, type, protocol, kern);
1127 if (err) 1126 if (err)
1128 return err; 1127 return err;
1129 1128
1130 /* 1129 /*
1131 * Allocate the socket and allow the family to set things up. if 1130 * Allocate the socket and allow the family to set things up. if
1132 * the protocol is 0, the family is instructed to select an appropriate 1131 * the protocol is 0, the family is instructed to select an appropriate
1133 * default. 1132 * default.
1134 */ 1133 */
1135 sock = sock_alloc(); 1134 sock = sock_alloc();
1136 if (!sock) { 1135 if (!sock) {
1137 if (net_ratelimit()) 1136 if (net_ratelimit())
1138 printk(KERN_WARNING "socket: no more sockets\n"); 1137 printk(KERN_WARNING "socket: no more sockets\n");
1139 return -ENFILE; /* Not exactly a match, but it's the 1138 return -ENFILE; /* Not exactly a match, but it's the
1140 closest posix thing */ 1139 closest posix thing */
1141 } 1140 }
1142 1141
1143 sock->type = type; 1142 sock->type = type;
1144 1143
1145 #ifdef CONFIG_MODULES 1144 #ifdef CONFIG_MODULES
1146 /* Attempt to load a protocol module if the find failed. 1145 /* Attempt to load a protocol module if the find failed.
1147 * 1146 *
1148 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user 1147 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
1149 * requested real, full-featured networking support upon configuration. 1148 * requested real, full-featured networking support upon configuration.
1150 * Otherwise module support will break! 1149 * Otherwise module support will break!
1151 */ 1150 */
1152 if (net_families[family] == NULL) 1151 if (net_families[family] == NULL)
1153 request_module("net-pf-%d", family); 1152 request_module("net-pf-%d", family);
1154 #endif 1153 #endif
1155 1154
1156 rcu_read_lock(); 1155 rcu_read_lock();
1157 pf = rcu_dereference(net_families[family]); 1156 pf = rcu_dereference(net_families[family]);
1158 err = -EAFNOSUPPORT; 1157 err = -EAFNOSUPPORT;
1159 if (!pf) 1158 if (!pf)
1160 goto out_release; 1159 goto out_release;
1161 1160
1162 /* 1161 /*
1163 * We will call the ->create function, that possibly is in a loadable 1162 * We will call the ->create function, that possibly is in a loadable
1164 * module, so we have to bump that loadable module refcnt first. 1163 * module, so we have to bump that loadable module refcnt first.
1165 */ 1164 */
1166 if (!try_module_get(pf->owner)) 1165 if (!try_module_get(pf->owner))
1167 goto out_release; 1166 goto out_release;
1168 1167
1169 /* Now protected by module ref count */ 1168 /* Now protected by module ref count */
1170 rcu_read_unlock(); 1169 rcu_read_unlock();
1171 1170
1172 err = pf->create(net, sock, protocol); 1171 err = pf->create(net, sock, protocol);
1173 if (err < 0) 1172 if (err < 0)
1174 goto out_module_put; 1173 goto out_module_put;
1175 1174
1176 /* 1175 /*
1177 * Now to bump the refcnt of the [loadable] module that owns this 1176 * Now to bump the refcnt of the [loadable] module that owns this
1178 * socket at sock_release time we decrement its refcnt. 1177 * socket at sock_release time we decrement its refcnt.
1179 */ 1178 */
1180 if (!try_module_get(sock->ops->owner)) 1179 if (!try_module_get(sock->ops->owner))
1181 goto out_module_busy; 1180 goto out_module_busy;
1182 1181
1183 /* 1182 /*
1184 * Now that we're done with the ->create function, the [loadable] 1183 * Now that we're done with the ->create function, the [loadable]
1185 * module can have its refcnt decremented 1184 * module can have its refcnt decremented
1186 */ 1185 */
1187 module_put(pf->owner); 1186 module_put(pf->owner);
1188 err = security_socket_post_create(sock, family, type, protocol, kern); 1187 err = security_socket_post_create(sock, family, type, protocol, kern);
1189 if (err) 1188 if (err)
1190 goto out_sock_release; 1189 goto out_sock_release;
1191 *res = sock; 1190 *res = sock;
1192 1191
1193 return 0; 1192 return 0;
1194 1193
1195 out_module_busy: 1194 out_module_busy:
1196 err = -EAFNOSUPPORT; 1195 err = -EAFNOSUPPORT;
1197 out_module_put: 1196 out_module_put:
1198 sock->ops = NULL; 1197 sock->ops = NULL;
1199 module_put(pf->owner); 1198 module_put(pf->owner);
1200 out_sock_release: 1199 out_sock_release:
1201 sock_release(sock); 1200 sock_release(sock);
1202 return err; 1201 return err;
1203 1202
1204 out_release: 1203 out_release:
1205 rcu_read_unlock(); 1204 rcu_read_unlock();
1206 goto out_sock_release; 1205 goto out_sock_release;
1207 } 1206 }
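
When net_families[family] is still empty, __sock_create() asks modprobe for "net-pf-<family>". A hedged sketch of what a protocol-family module provides so that request can find it; PF_EXAMPLE, its number and example_create() are placeholders for illustration only:

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/net.h>

#define PF_EXAMPLE 27   /* placeholder family number, illustration only */

static int example_create(struct net *net, struct socket *sock, int protocol)
{
        return -EPROTONOSUPPORT;        /* a real family allocates a sock here */
}

static const struct net_proto_family example_family_ops = {
        .family = PF_EXAMPLE,
        .create = example_create,
        .owner  = THIS_MODULE,
};

static int __init example_init(void)
{
        return sock_register(&example_family_ops);
}

static void __exit example_exit(void)
{
        sock_unregister(PF_EXAMPLE);
}

module_init(example_init);
module_exit(example_exit);
MODULE_ALIAS_NETPROTO(PF_EXAMPLE);      /* answers request_module("net-pf-27") */
MODULE_LICENSE("GPL");
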
1208 1207
1209 int sock_create(int family, int type, int protocol, struct socket **res) 1208 int sock_create(int family, int type, int protocol, struct socket **res)
1210 { 1209 {
1211 return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); 1210 return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
1212 } 1211 }
1213 1212
1214 int sock_create_kern(int family, int type, int protocol, struct socket **res) 1213 int sock_create_kern(int family, int type, int protocol, struct socket **res)
1215 { 1214 {
1216 return __sock_create(&init_net, family, type, protocol, res, 1); 1215 return __sock_create(&init_net, family, type, protocol, res, 1);
1217 } 1216 }
1218 1217
1219 asmlinkage long sys_socket(int family, int type, int protocol) 1218 asmlinkage long sys_socket(int family, int type, int protocol)
1220 { 1219 {
1221 int retval; 1220 int retval;
1222 struct socket *sock; 1221 struct socket *sock;
1223 int flags; 1222 int flags;
1224 1223
1225 /* Check the SOCK_* constants for consistency. */ 1224 /* Check the SOCK_* constants for consistency. */
1226 BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); 1225 BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
1227 BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); 1226 BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
1228 BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); 1227 BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
1229 BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); 1228 BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
1230 1229
1231 flags = type & ~SOCK_TYPE_MASK; 1230 flags = type & ~SOCK_TYPE_MASK;
1232 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1231 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1233 return -EINVAL; 1232 return -EINVAL;
1234 type &= SOCK_TYPE_MASK; 1233 type &= SOCK_TYPE_MASK;
1235 1234
1236 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1235 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1237 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1236 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1238 1237
1239 retval = sock_create(family, type, protocol, &sock); 1238 retval = sock_create(family, type, protocol, &sock);
1240 if (retval < 0) 1239 if (retval < 0)
1241 goto out; 1240 goto out;
1242 1241
1243 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); 1242 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
1244 if (retval < 0) 1243 if (retval < 0)
1245 goto out_release; 1244 goto out_release;
1246 1245
1247 out: 1246 out:
1248 /* It may be already another descriptor 8) Not kernel problem. */ 1247 /* It may be already another descriptor 8) Not kernel problem. */
1249 return retval; 1248 return retval;
1250 1249
1251 out_release: 1250 out_release:
1252 sock_release(sock); 1251 sock_release(sock);
1253 return retval; 1252 return retval;
1254 } 1253 }
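
sys_socket() splits the type argument into the real SOCK_* type and the SOCK_CLOEXEC/SOCK_NONBLOCK flags, converting SOCK_NONBLOCK to O_NONBLOCK where the two constants differ. From userspace the flags are simply or-ed into the type (illustrative only):

/* Illustration only: SOCK_CLOEXEC and SOCK_NONBLOCK are stripped out of
 * the type by sys_socket() and applied to the new file. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);

        if (s >= 0) {
                printf("O_NONBLOCK set: %d, FD_CLOEXEC set: %d\n",
                       !!(fcntl(s, F_GETFL) & O_NONBLOCK),
                       !!(fcntl(s, F_GETFD) & FD_CLOEXEC));
                close(s);
        }
        return 0;
}
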
1255 1254
1256 /* 1255 /*
1257 * Create a pair of connected sockets. 1256 * Create a pair of connected sockets.
1258 */ 1257 */
1259 1258
1260 asmlinkage long sys_socketpair(int family, int type, int protocol, 1259 asmlinkage long sys_socketpair(int family, int type, int protocol,
1261 int __user *usockvec) 1260 int __user *usockvec)
1262 { 1261 {
1263 struct socket *sock1, *sock2; 1262 struct socket *sock1, *sock2;
1264 int fd1, fd2, err; 1263 int fd1, fd2, err;
1265 struct file *newfile1, *newfile2; 1264 struct file *newfile1, *newfile2;
1266 int flags; 1265 int flags;
1267 1266
1268 flags = type & ~SOCK_TYPE_MASK; 1267 flags = type & ~SOCK_TYPE_MASK;
1269 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1268 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1270 return -EINVAL; 1269 return -EINVAL;
1271 type &= SOCK_TYPE_MASK; 1270 type &= SOCK_TYPE_MASK;
1272 1271
1273 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1272 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1274 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1273 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1275 1274
1276 /* 1275 /*
1277 * Obtain the first socket and check if the underlying protocol 1276 * Obtain the first socket and check if the underlying protocol
1278 * supports the socketpair call. 1277 * supports the socketpair call.
1279 */ 1278 */
1280 1279
1281 err = sock_create(family, type, protocol, &sock1); 1280 err = sock_create(family, type, protocol, &sock1);
1282 if (err < 0) 1281 if (err < 0)
1283 goto out; 1282 goto out;
1284 1283
1285 err = sock_create(family, type, protocol, &sock2); 1284 err = sock_create(family, type, protocol, &sock2);
1286 if (err < 0) 1285 if (err < 0)
1287 goto out_release_1; 1286 goto out_release_1;
1288 1287
1289 err = sock1->ops->socketpair(sock1, sock2); 1288 err = sock1->ops->socketpair(sock1, sock2);
1290 if (err < 0) 1289 if (err < 0)
1291 goto out_release_both; 1290 goto out_release_both;
1292 1291
1293 fd1 = sock_alloc_fd(&newfile1, flags & O_CLOEXEC); 1292 fd1 = sock_alloc_fd(&newfile1, flags & O_CLOEXEC);
1294 if (unlikely(fd1 < 0)) { 1293 if (unlikely(fd1 < 0)) {
1295 err = fd1; 1294 err = fd1;
1296 goto out_release_both; 1295 goto out_release_both;
1297 } 1296 }
1298 1297
1299 fd2 = sock_alloc_fd(&newfile2, flags & O_CLOEXEC); 1298 fd2 = sock_alloc_fd(&newfile2, flags & O_CLOEXEC);
1300 if (unlikely(fd2 < 0)) { 1299 if (unlikely(fd2 < 0)) {
1301 err = fd2; 1300 err = fd2;
1302 put_filp(newfile1); 1301 put_filp(newfile1);
1303 put_unused_fd(fd1); 1302 put_unused_fd(fd1);
1304 goto out_release_both; 1303 goto out_release_both;
1305 } 1304 }
1306 1305
1307 err = sock_attach_fd(sock1, newfile1, flags & O_NONBLOCK); 1306 err = sock_attach_fd(sock1, newfile1, flags & O_NONBLOCK);
1308 if (unlikely(err < 0)) { 1307 if (unlikely(err < 0)) {
1309 goto out_fd2; 1308 goto out_fd2;
1310 } 1309 }
1311 1310
1312 err = sock_attach_fd(sock2, newfile2, flags & O_NONBLOCK); 1311 err = sock_attach_fd(sock2, newfile2, flags & O_NONBLOCK);
1313 if (unlikely(err < 0)) { 1312 if (unlikely(err < 0)) {
1314 fput(newfile1); 1313 fput(newfile1);
1315 goto out_fd1; 1314 goto out_fd1;
1316 } 1315 }
1317 1316
1318 err = audit_fd_pair(fd1, fd2); 1317 err = audit_fd_pair(fd1, fd2);
1319 if (err < 0) { 1318 if (err < 0) {
1320 fput(newfile1); 1319 fput(newfile1);
1321 fput(newfile2); 1320 fput(newfile2);
1322 goto out_fd; 1321 goto out_fd;
1323 } 1322 }
1324 1323
1325 fd_install(fd1, newfile1); 1324 fd_install(fd1, newfile1);
1326 fd_install(fd2, newfile2); 1325 fd_install(fd2, newfile2);
1327 /* fd1 and fd2 may be already another descriptors. 1326 /* fd1 and fd2 may be already another descriptors.
1328 * Not kernel problem. 1327 * Not kernel problem.
1329 */ 1328 */
1330 1329
1331 err = put_user(fd1, &usockvec[0]); 1330 err = put_user(fd1, &usockvec[0]);
1332 if (!err) 1331 if (!err)
1333 err = put_user(fd2, &usockvec[1]); 1332 err = put_user(fd2, &usockvec[1]);
1334 if (!err) 1333 if (!err)
1335 return 0; 1334 return 0;
1336 1335
1337 sys_close(fd2); 1336 sys_close(fd2);
1338 sys_close(fd1); 1337 sys_close(fd1);
1339 return err; 1338 return err;
1340 1339
1341 out_release_both: 1340 out_release_both:
1342 sock_release(sock2); 1341 sock_release(sock2);
1343 out_release_1: 1342 out_release_1:
1344 sock_release(sock1); 1343 sock_release(sock1);
1345 out: 1344 out:
1346 return err; 1345 return err;
1347 1346
1348 out_fd2: 1347 out_fd2:
1349 put_filp(newfile1); 1348 put_filp(newfile1);
1350 sock_release(sock1); 1349 sock_release(sock1);
1351 out_fd1: 1350 out_fd1:
1352 put_filp(newfile2); 1351 put_filp(newfile2);
1353 sock_release(sock2); 1352 sock_release(sock2);
1354 out_fd: 1353 out_fd:
1355 put_unused_fd(fd1); 1354 put_unused_fd(fd1);
1356 put_unused_fd(fd2); 1355 put_unused_fd(fd2);
1357 goto out; 1356 goto out;
1358 } 1357 }
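
The pairing itself is delegated to sock1->ops->socketpair(), which only a few families implement; AF_UNIX is the common case, and families that lack it typically return -EOPNOTSUPP. Userspace illustration (not part of this patch):

/* Illustration only: a connected AF_UNIX pair. */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int sv[2];
        char buf[8] = { 0 };

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
                return 1;
        write(sv[0], "ping", 5);
        read(sv[1], buf, sizeof(buf));
        printf("got \"%s\" over the pair\n", buf);
        close(sv[0]);
        close(sv[1]);
        return 0;
}
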
1359 1358
1360 /* 1359 /*
1361 * Bind a name to a socket. Nothing much to do here since it's 1360 * Bind a name to a socket. Nothing much to do here since it's
1362 * the protocol's responsibility to handle the local address. 1361 * the protocol's responsibility to handle the local address.
1363 * 1362 *
1364 * We move the socket address to kernel space before we call 1363 * We move the socket address to kernel space before we call
1365 * the protocol layer (having also checked the address is ok). 1364 * the protocol layer (having also checked the address is ok).
1366 */ 1365 */
1367 1366
1368 asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) 1367 asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
1369 { 1368 {
1370 struct socket *sock; 1369 struct socket *sock;
1371 struct sockaddr_storage address; 1370 struct sockaddr_storage address;
1372 int err, fput_needed; 1371 int err, fput_needed;
1373 1372
1374 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1373 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1375 if (sock) { 1374 if (sock) {
1376 err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address); 1375 err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
1377 if (err >= 0) { 1376 if (err >= 0) {
1378 err = security_socket_bind(sock, 1377 err = security_socket_bind(sock,
1379 (struct sockaddr *)&address, 1378 (struct sockaddr *)&address,
1380 addrlen); 1379 addrlen);
1381 if (!err) 1380 if (!err)
1382 err = sock->ops->bind(sock, 1381 err = sock->ops->bind(sock,
1383 (struct sockaddr *) 1382 (struct sockaddr *)
1384 &address, addrlen); 1383 &address, addrlen);
1385 } 1384 }
1386 fput_light(sock->file, fput_needed); 1385 fput_light(sock->file, fput_needed);
1387 } 1386 }
1388 return err; 1387 return err;
1389 } 1388 }
1390 1389
1391 /* 1390 /*
1392 * Perform a listen. Basically, we allow the protocol to do anything 1391 * Perform a listen. Basically, we allow the protocol to do anything
1393 * necessary for a listen, and if that works, we mark the socket as 1392 * necessary for a listen, and if that works, we mark the socket as
1394 * ready for listening. 1393 * ready for listening.
1395 */ 1394 */
1396 1395
1397 asmlinkage long sys_listen(int fd, int backlog) 1396 asmlinkage long sys_listen(int fd, int backlog)
1398 { 1397 {
1399 struct socket *sock; 1398 struct socket *sock;
1400 int err, fput_needed; 1399 int err, fput_needed;
1401 int somaxconn; 1400 int somaxconn;
1402 1401
1403 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1402 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1404 if (sock) { 1403 if (sock) {
1405 somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; 1404 somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1406 if ((unsigned)backlog > somaxconn) 1405 if ((unsigned)backlog > somaxconn)
1407 backlog = somaxconn; 1406 backlog = somaxconn;
1408 1407
1409 err = security_socket_listen(sock, backlog); 1408 err = security_socket_listen(sock, backlog);
1410 if (!err) 1409 if (!err)
1411 err = sock->ops->listen(sock, backlog); 1410 err = sock->ops->listen(sock, backlog);
1412 1411
1413 fput_light(sock->file, fput_needed); 1412 fput_light(sock->file, fput_needed);
1414 } 1413 }
1415 return err; 1414 return err;
1416 } 1415 }
1417 1416
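/*
 * Illustrative userspace sketch (not part of net/socket.c): sys_listen()
 * above silently clamps the requested backlog to net.core.somaxconn before
 * calling the protocol's ->listen().  Assumes tcp_fd is a bound TCP socket.
 */
#include <sys/socket.h>
#include <stdio.h>

int listen_demo(int tcp_fd)
{
	/* Ask for an oversized backlog; the kernel caps it at somaxconn
	 * (128 by default in this era), so this is harmless. */
	if (listen(tcp_fd, 1 << 20) < 0) {
		perror("listen");
		return -1;
	}
	return 0;
}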
1418 /* 1417 /*
1419 * For accept, we attempt to create a new socket, set up the link 1418 * For accept, we attempt to create a new socket, set up the link
1420 * with the client, wake up the client, then return the new 1419 * with the client, wake up the client, then return the new
1421 * connected fd. We collect the address of the connector in kernel 1420 * connected fd. We collect the address of the connector in kernel
1422 * space and move it to user at the very end. This is unclean because 1421 * space and move it to user at the very end. This is unclean because
1423 * we open the socket then return an error. 1422 * we open the socket then return an error.
1424 * 1423 *
1425 * 1003.1g adds the ability to recvmsg() to query connection pending 1424 * 1003.1g adds the ability to recvmsg() to query connection pending
1426 * status. We need to add that support in a way that's 1425 * status. We need to add that support in a way that's
1427 * clean when we restructure accept also. 1426 * clean when we restructure accept also.
1428 */ 1427 */
1429 1428
1430 long do_accept(int fd, struct sockaddr __user *upeer_sockaddr, 1429 long do_accept(int fd, struct sockaddr __user *upeer_sockaddr,
1431 int __user *upeer_addrlen, int flags) 1430 int __user *upeer_addrlen, int flags)
1432 { 1431 {
1433 struct socket *sock, *newsock; 1432 struct socket *sock, *newsock;
1434 struct file *newfile; 1433 struct file *newfile;
1435 int err, len, newfd, fput_needed; 1434 int err, len, newfd, fput_needed;
1436 struct sockaddr_storage address; 1435 struct sockaddr_storage address;
1437 1436
1438 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1437 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1439 return -EINVAL; 1438 return -EINVAL;
1440 1439
1441 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1440 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1442 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1441 flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1443 1442
1444 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1443 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1445 if (!sock) 1444 if (!sock)
1446 goto out; 1445 goto out;
1447 1446
1448 err = -ENFILE; 1447 err = -ENFILE;
1449 if (!(newsock = sock_alloc())) 1448 if (!(newsock = sock_alloc()))
1450 goto out_put; 1449 goto out_put;
1451 1450
1452 newsock->type = sock->type; 1451 newsock->type = sock->type;
1453 newsock->ops = sock->ops; 1452 newsock->ops = sock->ops;
1454 1453
1455 /* 1454 /*
1456 * We don't need try_module_get here, as the listening socket (sock) 1455 * We don't need try_module_get here, as the listening socket (sock)
1457 * has the protocol module (sock->ops->owner) held. 1456 * has the protocol module (sock->ops->owner) held.
1458 */ 1457 */
1459 __module_get(newsock->ops->owner); 1458 __module_get(newsock->ops->owner);
1460 1459
1461 newfd = sock_alloc_fd(&newfile, flags & O_CLOEXEC); 1460 newfd = sock_alloc_fd(&newfile, flags & O_CLOEXEC);
1462 if (unlikely(newfd < 0)) { 1461 if (unlikely(newfd < 0)) {
1463 err = newfd; 1462 err = newfd;
1464 sock_release(newsock); 1463 sock_release(newsock);
1465 goto out_put; 1464 goto out_put;
1466 } 1465 }
1467 1466
1468 err = sock_attach_fd(newsock, newfile, flags & O_NONBLOCK); 1467 err = sock_attach_fd(newsock, newfile, flags & O_NONBLOCK);
1469 if (err < 0) 1468 if (err < 0)
1470 goto out_fd_simple; 1469 goto out_fd_simple;
1471 1470
1472 err = security_socket_accept(sock, newsock); 1471 err = security_socket_accept(sock, newsock);
1473 if (err) 1472 if (err)
1474 goto out_fd; 1473 goto out_fd;
1475 1474
1476 err = sock->ops->accept(sock, newsock, sock->file->f_flags); 1475 err = sock->ops->accept(sock, newsock, sock->file->f_flags);
1477 if (err < 0) 1476 if (err < 0)
1478 goto out_fd; 1477 goto out_fd;
1479 1478
1480 if (upeer_sockaddr) { 1479 if (upeer_sockaddr) {
1481 if (newsock->ops->getname(newsock, (struct sockaddr *)&address, 1480 if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
1482 &len, 2) < 0) { 1481 &len, 2) < 0) {
1483 err = -ECONNABORTED; 1482 err = -ECONNABORTED;
1484 goto out_fd; 1483 goto out_fd;
1485 } 1484 }
1486 err = move_addr_to_user((struct sockaddr *)&address, 1485 err = move_addr_to_user((struct sockaddr *)&address,
1487 len, upeer_sockaddr, upeer_addrlen); 1486 len, upeer_sockaddr, upeer_addrlen);
1488 if (err < 0) 1487 if (err < 0)
1489 goto out_fd; 1488 goto out_fd;
1490 } 1489 }
1491 1490
1492 /* File flags are not inherited via accept(), unlike on some other OSes. */ 1491 /* File flags are not inherited via accept(), unlike on some other OSes. */
1493 1492
1494 fd_install(newfd, newfile); 1493 fd_install(newfd, newfile);
1495 err = newfd; 1494 err = newfd;
1496 1495
1497 security_socket_post_accept(sock, newsock); 1496 security_socket_post_accept(sock, newsock);
1498 1497
1499 out_put: 1498 out_put:
1500 fput_light(sock->file, fput_needed); 1499 fput_light(sock->file, fput_needed);
1501 out: 1500 out:
1502 return err; 1501 return err;
1503 out_fd_simple: 1502 out_fd_simple:
1504 sock_release(newsock); 1503 sock_release(newsock);
1505 put_filp(newfile); 1504 put_filp(newfile);
1506 put_unused_fd(newfd); 1505 put_unused_fd(newfd);
1507 goto out_put; 1506 goto out_put;
1508 out_fd: 1507 out_fd:
1509 fput(newfile); 1508 fput(newfile);
1510 put_unused_fd(newfd); 1509 put_unused_fd(newfd);
1511 goto out_put; 1510 goto out_put;
1512 } 1511 }
1513 1512
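/*
 * Illustrative userspace sketch (not part of net/socket.c): the do_accept()
 * path above returns a new descriptor and, when upeer_sockaddr is non-NULL,
 * copies the peer's address out through move_addr_to_user().  Assumes
 * listen_fd is an already bound and listening IPv4 TCP socket.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

int accept_demo(int listen_fd)
{
	struct sockaddr_in peer;
	socklen_t peerlen = sizeof(peer);	/* in/out, like upeer_addrlen */
	int conn_fd;

	conn_fd = accept(listen_fd, (struct sockaddr *)&peer, &peerlen);
	if (conn_fd < 0) {
		perror("accept");
		return -1;
	}

	/* peer now holds what newsock->ops->getname() reported. */
	printf("connection from %s:%d\n",
	       inet_ntoa(peer.sin_addr), ntohs(peer.sin_port));
	return conn_fd;
}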
1514 #if 0 1513 #if 0
1515 #ifdef HAVE_SET_RESTORE_SIGMASK 1514 #ifdef HAVE_SET_RESTORE_SIGMASK
1516 asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, 1515 asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr,
1517 int __user *upeer_addrlen, 1516 int __user *upeer_addrlen,
1518 const sigset_t __user *sigmask, 1517 const sigset_t __user *sigmask,
1519 size_t sigsetsize, int flags) 1518 size_t sigsetsize, int flags)
1520 { 1519 {
1521 sigset_t ksigmask, sigsaved; 1520 sigset_t ksigmask, sigsaved;
1522 int ret; 1521 int ret;
1523 1522
1524 if (sigmask) { 1523 if (sigmask) {
1525 /* XXX: Don't preclude handling different sized sigset_t's. */ 1524 /* XXX: Don't preclude handling different sized sigset_t's. */
1526 if (sigsetsize != sizeof(sigset_t)) 1525 if (sigsetsize != sizeof(sigset_t))
1527 return -EINVAL; 1526 return -EINVAL;
1528 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) 1527 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1529 return -EFAULT; 1528 return -EFAULT;
1530 1529
1531 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1530 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
1532 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 1531 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1533 } 1532 }
1534 1533
1535 ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); 1534 ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags);
1536 1535
1537 if (ret < 0 && signal_pending(current)) { 1536 if (ret < 0 && signal_pending(current)) {
1538 /* 1537 /*
1539 * Don't restore the signal mask yet. Let do_signal() deliver 1538 * Don't restore the signal mask yet. Let do_signal() deliver
1540 * the signal on the way back to userspace, before the signal 1539 * the signal on the way back to userspace, before the signal
1541 * mask is restored. 1540 * mask is restored.
1542 */ 1541 */
1543 if (sigmask) { 1542 if (sigmask) {
1544 memcpy(&current->saved_sigmask, &sigsaved, 1543 memcpy(&current->saved_sigmask, &sigsaved,
1545 sizeof(sigsaved)); 1544 sizeof(sigsaved));
1546 set_restore_sigmask(); 1545 set_restore_sigmask();
1547 } 1546 }
1548 } else if (sigmask) 1547 } else if (sigmask)
1549 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1548 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1550 1549
1551 return ret; 1550 return ret;
1552 } 1551 }
1553 #else 1552 #else
1554 asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, 1553 asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr,
1555 int __user *upeer_addrlen, 1554 int __user *upeer_addrlen,
1556 const sigset_t __user *sigmask, 1555 const sigset_t __user *sigmask,
1557 size_t sigsetsize, int flags) 1556 size_t sigsetsize, int flags)
1558 { 1557 {
1559 /* The platform does not support restoring the signal mask in the 1558 /* The platform does not support restoring the signal mask in the
1560 * return path. So we do not allow using paccept() with a signal 1559 * return path. So we do not allow using paccept() with a signal
1561 * mask. */ 1560 * mask. */
1562 if (sigmask) 1561 if (sigmask)
1563 return -EINVAL; 1562 return -EINVAL;
1564 1563
1565 return do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); 1564 return do_accept(fd, upeer_sockaddr, upeer_addrlen, flags);
1566 } 1565 }
1567 #endif 1566 #endif
1568 #endif 1567 #endif
1569 1568
1570 asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, 1569 asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
1571 int __user *upeer_addrlen) 1570 int __user *upeer_addrlen)
1572 { 1571 {
1573 return do_accept(fd, upeer_sockaddr, upeer_addrlen, 0); 1572 return do_accept(fd, upeer_sockaddr, upeer_addrlen, 0);
1574 } 1573 }
1575 1574
1576 /* 1575 /*
1577 * Attempt to connect to a socket with the server address. The address 1576 * Attempt to connect to a socket with the server address. The address
1578 * is in user space so we verify it is OK and move it to kernel space. 1577 * is in user space so we verify it is OK and move it to kernel space.
1579 * 1578 *
1580 * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to 1579 * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
1581 * break bindings 1580 * break bindings
1582 * 1581 *
1583 * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and 1582 * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
1584 * other SEQPACKET protocols that take time to connect() as it doesn't 1583 * other SEQPACKET protocols that take time to connect() as it doesn't
1585 * include the -EINPROGRESS status for such sockets. 1584 * include the -EINPROGRESS status for such sockets.
1586 */ 1585 */
1587 1586
1588 asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr, 1587 asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
1589 int addrlen) 1588 int addrlen)
1590 { 1589 {
1591 struct socket *sock; 1590 struct socket *sock;
1592 struct sockaddr_storage address; 1591 struct sockaddr_storage address;
1593 int err, fput_needed; 1592 int err, fput_needed;
1594 1593
1595 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1594 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1596 if (!sock) 1595 if (!sock)
1597 goto out; 1596 goto out;
1598 err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address); 1597 err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address);
1599 if (err < 0) 1598 if (err < 0)
1600 goto out_put; 1599 goto out_put;
1601 1600
1602 err = 1601 err =
1603 security_socket_connect(sock, (struct sockaddr *)&address, addrlen); 1602 security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
1604 if (err) 1603 if (err)
1605 goto out_put; 1604 goto out_put;
1606 1605
1607 err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, 1606 err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
1608 sock->file->f_flags); 1607 sock->file->f_flags);
1609 out_put: 1608 out_put:
1610 fput_light(sock->file, fput_needed); 1609 fput_light(sock->file, fput_needed);
1611 out: 1610 out:
1612 return err; 1611 return err;
1613 } 1612 }
1614 1613
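/*
 * Illustrative userspace sketch (not part of net/socket.c): with O_NONBLOCK
 * set, the ->connect() call made by sys_connect() above typically returns
 * -EINPROGRESS; completion is signalled by writability and the final status
 * is read back with SO_ERROR.  The 5 second timeout is an arbitrary choice.
 */
#include <sys/socket.h>
#include <fcntl.h>
#include <poll.h>
#include <errno.h>

int connect_nonblock(int fd, const struct sockaddr *sa, socklen_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	int err = 0;
	socklen_t errlen = sizeof(err);

	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

	if (connect(fd, sa, len) == 0)
		return 0;			/* connected immediately */
	if (errno != EINPROGRESS)
		return -1;			/* hard failure */

	if (poll(&pfd, 1, 5000) <= 0)		/* wait for completion */
		return -1;

	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen);
	return err ? -1 : 0;
}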
1615 /* 1614 /*
1616 * Get the local address ('name') of a socket object. Move the obtained 1615 * Get the local address ('name') of a socket object. Move the obtained
1617 * name to user space. 1616 * name to user space.
1618 */ 1617 */
1619 1618
1620 asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr, 1619 asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr,
1621 int __user *usockaddr_len) 1620 int __user *usockaddr_len)
1622 { 1621 {
1623 struct socket *sock; 1622 struct socket *sock;
1624 struct sockaddr_storage address; 1623 struct sockaddr_storage address;
1625 int len, err, fput_needed; 1624 int len, err, fput_needed;
1626 1625
1627 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1626 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1628 if (!sock) 1627 if (!sock)
1629 goto out; 1628 goto out;
1630 1629
1631 err = security_socket_getsockname(sock); 1630 err = security_socket_getsockname(sock);
1632 if (err) 1631 if (err)
1633 goto out_put; 1632 goto out_put;
1634 1633
1635 err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); 1634 err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
1636 if (err) 1635 if (err)
1637 goto out_put; 1636 goto out_put;
1638 err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len); 1637 err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len);
1639 1638
1640 out_put: 1639 out_put:
1641 fput_light(sock->file, fput_needed); 1640 fput_light(sock->file, fput_needed);
1642 out: 1641 out:
1643 return err; 1642 return err;
1644 } 1643 }
1645 1644
1646 /* 1645 /*
1647 * Get the remote address ('name') of a socket object. Move the obtained 1646 * Get the remote address ('name') of a socket object. Move the obtained
1648 * name to user space. 1647 * name to user space.
1649 */ 1648 */
1650 1649
1651 asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr, 1650 asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr,
1652 int __user *usockaddr_len) 1651 int __user *usockaddr_len)
1653 { 1652 {
1654 struct socket *sock; 1653 struct socket *sock;
1655 struct sockaddr_storage address; 1654 struct sockaddr_storage address;
1656 int len, err, fput_needed; 1655 int len, err, fput_needed;
1657 1656
1658 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1657 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1659 if (sock != NULL) { 1658 if (sock != NULL) {
1660 err = security_socket_getpeername(sock); 1659 err = security_socket_getpeername(sock);
1661 if (err) { 1660 if (err) {
1662 fput_light(sock->file, fput_needed); 1661 fput_light(sock->file, fput_needed);
1663 return err; 1662 return err;
1664 } 1663 }
1665 1664
1666 err = 1665 err =
1667 sock->ops->getname(sock, (struct sockaddr *)&address, &len, 1666 sock->ops->getname(sock, (struct sockaddr *)&address, &len,
1668 1); 1667 1);
1669 if (!err) 1668 if (!err)
1670 err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, 1669 err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr,
1671 usockaddr_len); 1670 usockaddr_len);
1672 fput_light(sock->file, fput_needed); 1671 fput_light(sock->file, fput_needed);
1673 } 1672 }
1674 return err; 1673 return err;
1675 } 1674 }
1676 1675
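/*
 * Illustrative userspace sketch (not part of net/socket.c): a common use of
 * sys_getsockname() above is discovering which ephemeral port the kernel
 * chose after a bind() to port 0.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int bound_port(int fd)
{
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);

	if (getsockname(fd, (struct sockaddr *)&addr, &len) < 0)
		return -1;
	return ntohs(addr.sin_port);	/* port in host byte order */
}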
1677 /* 1676 /*
1678 * Send a datagram to a given address. We move the address into kernel 1677 * Send a datagram to a given address. We move the address into kernel
1679 * space and check the user space data area is readable before invoking 1678 * space and check the user space data area is readable before invoking
1680 * the protocol. 1679 * the protocol.
1681 */ 1680 */
1682 1681
1683 asmlinkage long sys_sendto(int fd, void __user *buff, size_t len, 1682 asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
1684 unsigned flags, struct sockaddr __user *addr, 1683 unsigned flags, struct sockaddr __user *addr,
1685 int addr_len) 1684 int addr_len)
1686 { 1685 {
1687 struct socket *sock; 1686 struct socket *sock;
1688 struct sockaddr_storage address; 1687 struct sockaddr_storage address;
1689 int err; 1688 int err;
1690 struct msghdr msg; 1689 struct msghdr msg;
1691 struct iovec iov; 1690 struct iovec iov;
1692 int fput_needed; 1691 int fput_needed;
1693 1692
1694 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1693 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1695 if (!sock) 1694 if (!sock)
1696 goto out; 1695 goto out;
1697 1696
1698 iov.iov_base = buff; 1697 iov.iov_base = buff;
1699 iov.iov_len = len; 1698 iov.iov_len = len;
1700 msg.msg_name = NULL; 1699 msg.msg_name = NULL;
1701 msg.msg_iov = &iov; 1700 msg.msg_iov = &iov;
1702 msg.msg_iovlen = 1; 1701 msg.msg_iovlen = 1;
1703 msg.msg_control = NULL; 1702 msg.msg_control = NULL;
1704 msg.msg_controllen = 0; 1703 msg.msg_controllen = 0;
1705 msg.msg_namelen = 0; 1704 msg.msg_namelen = 0;
1706 if (addr) { 1705 if (addr) {
1707 err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address); 1706 err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address);
1708 if (err < 0) 1707 if (err < 0)
1709 goto out_put; 1708 goto out_put;
1710 msg.msg_name = (struct sockaddr *)&address; 1709 msg.msg_name = (struct sockaddr *)&address;
1711 msg.msg_namelen = addr_len; 1710 msg.msg_namelen = addr_len;
1712 } 1711 }
1713 if (sock->file->f_flags & O_NONBLOCK) 1712 if (sock->file->f_flags & O_NONBLOCK)
1714 flags |= MSG_DONTWAIT; 1713 flags |= MSG_DONTWAIT;
1715 msg.msg_flags = flags; 1714 msg.msg_flags = flags;
1716 err = sock_sendmsg(sock, &msg, len); 1715 err = sock_sendmsg(sock, &msg, len);
1717 1716
1718 out_put: 1717 out_put:
1719 fput_light(sock->file, fput_needed); 1718 fput_light(sock->file, fput_needed);
1720 out: 1719 out:
1721 return err; 1720 return err;
1722 } 1721 }
1723 1722
1724 /* 1723 /*
1725 * Send a datagram down a socket. 1724 * Send a datagram down a socket.
1726 */ 1725 */
1727 1726
1728 asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags) 1727 asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags)
1729 { 1728 {
1730 return sys_sendto(fd, buff, len, flags, NULL, 0); 1729 return sys_sendto(fd, buff, len, flags, NULL, 0);
1731 } 1730 }
1732 1731
1733 /* 1732 /*
1734 * Receive a frame from the socket and optionally record the address of the 1733 * Receive a frame from the socket and optionally record the address of the
1735 * sender. We verify the buffers are writable and if needed move the 1734 * sender. We verify the buffers are writable and if needed move the
1736 * sender address from kernel to user space. 1735 * sender address from kernel to user space.
1737 */ 1736 */
1738 1737
1739 asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size, 1738 asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size,
1740 unsigned flags, struct sockaddr __user *addr, 1739 unsigned flags, struct sockaddr __user *addr,
1741 int __user *addr_len) 1740 int __user *addr_len)
1742 { 1741 {
1743 struct socket *sock; 1742 struct socket *sock;
1744 struct iovec iov; 1743 struct iovec iov;
1745 struct msghdr msg; 1744 struct msghdr msg;
1746 struct sockaddr_storage address; 1745 struct sockaddr_storage address;
1747 int err, err2; 1746 int err, err2;
1748 int fput_needed; 1747 int fput_needed;
1749 1748
1750 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1749 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1751 if (!sock) 1750 if (!sock)
1752 goto out; 1751 goto out;
1753 1752
1754 msg.msg_control = NULL; 1753 msg.msg_control = NULL;
1755 msg.msg_controllen = 0; 1754 msg.msg_controllen = 0;
1756 msg.msg_iovlen = 1; 1755 msg.msg_iovlen = 1;
1757 msg.msg_iov = &iov; 1756 msg.msg_iov = &iov;
1758 iov.iov_len = size; 1757 iov.iov_len = size;
1759 iov.iov_base = ubuf; 1758 iov.iov_base = ubuf;
1760 msg.msg_name = (struct sockaddr *)&address; 1759 msg.msg_name = (struct sockaddr *)&address;
1761 msg.msg_namelen = sizeof(address); 1760 msg.msg_namelen = sizeof(address);
1762 if (sock->file->f_flags & O_NONBLOCK) 1761 if (sock->file->f_flags & O_NONBLOCK)
1763 flags |= MSG_DONTWAIT; 1762 flags |= MSG_DONTWAIT;
1764 err = sock_recvmsg(sock, &msg, size, flags); 1763 err = sock_recvmsg(sock, &msg, size, flags);
1765 1764
1766 if (err >= 0 && addr != NULL) { 1765 if (err >= 0 && addr != NULL) {
1767 err2 = move_addr_to_user((struct sockaddr *)&address, 1766 err2 = move_addr_to_user((struct sockaddr *)&address,
1768 msg.msg_namelen, addr, addr_len); 1767 msg.msg_namelen, addr, addr_len);
1769 if (err2 < 0) 1768 if (err2 < 0)
1770 err = err2; 1769 err = err2;
1771 } 1770 }
1772 1771
1773 fput_light(sock->file, fput_needed); 1772 fput_light(sock->file, fput_needed);
1774 out: 1773 out:
1775 return err; 1774 return err;
1776 } 1775 }
1777 1776
1778 /* 1777 /*
1779 * Receive a datagram from a socket. 1778 * Receive a datagram from a socket.
1780 */ 1779 */
1781 1780
1782 asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, 1781 asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
1783 unsigned flags) 1782 unsigned flags)
1784 { 1783 {
1785 return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); 1784 return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
1786 } 1785 }
1787 1786
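/*
 * Illustrative userspace sketch (not part of net/socket.c): sys_sendto() and
 * sys_recvfrom() above wrap a single-element iovec in a msghdr; a UDP round
 * trip exercises both, including the optional source-address copy-out on
 * receive.  The loopback address and port 9999 are made up for illustration.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

ssize_t udp_round_trip(int fd)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(9999),
	};
	struct sockaddr_in src;
	socklen_t srclen = sizeof(src);
	char buf[128];

	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	/* One datagram out: addr/addr_len become msg_name/msg_namelen. */
	sendto(fd, "hello", 5, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* One datagram back: the sender's address lands in src. */
	return recvfrom(fd, buf, sizeof(buf), 0,
			(struct sockaddr *)&src, &srclen);
}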
1788 /* 1787 /*
1789 * Set a socket option. Because we don't know the option lengths we have 1788 * Set a socket option. Because we don't know the option lengths we have
1790 * to pass the user mode parameter for the protocols to sort out. 1789 * to pass the user mode parameter for the protocols to sort out.
1791 */ 1790 */
1792 1791
1793 asmlinkage long sys_setsockopt(int fd, int level, int optname, 1792 asmlinkage long sys_setsockopt(int fd, int level, int optname,
1794 char __user *optval, int optlen) 1793 char __user *optval, int optlen)
1795 { 1794 {
1796 int err, fput_needed; 1795 int err, fput_needed;
1797 struct socket *sock; 1796 struct socket *sock;
1798 1797
1799 if (optlen < 0) 1798 if (optlen < 0)
1800 return -EINVAL; 1799 return -EINVAL;
1801 1800
1802 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1801 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1803 if (sock != NULL) { 1802 if (sock != NULL) {
1804 err = security_socket_setsockopt(sock, level, optname); 1803 err = security_socket_setsockopt(sock, level, optname);
1805 if (err) 1804 if (err)
1806 goto out_put; 1805 goto out_put;
1807 1806
1808 if (level == SOL_SOCKET) 1807 if (level == SOL_SOCKET)
1809 err = 1808 err =
1810 sock_setsockopt(sock, level, optname, optval, 1809 sock_setsockopt(sock, level, optname, optval,
1811 optlen); 1810 optlen);
1812 else 1811 else
1813 err = 1812 err =
1814 sock->ops->setsockopt(sock, level, optname, optval, 1813 sock->ops->setsockopt(sock, level, optname, optval,
1815 optlen); 1814 optlen);
1816 out_put: 1815 out_put:
1817 fput_light(sock->file, fput_needed); 1816 fput_light(sock->file, fput_needed);
1818 } 1817 }
1819 return err; 1818 return err;
1820 } 1819 }
1821 1820
1822 /* 1821 /*
1823 * Get a socket option. Because we don't know the option lengths we have 1822 * Get a socket option. Because we don't know the option lengths we have
1824 * to pass a user mode parameter for the protocols to sort out. 1823 * to pass a user mode parameter for the protocols to sort out.
1825 */ 1824 */
1826 1825
1827 asmlinkage long sys_getsockopt(int fd, int level, int optname, 1826 asmlinkage long sys_getsockopt(int fd, int level, int optname,
1828 char __user *optval, int __user *optlen) 1827 char __user *optval, int __user *optlen)
1829 { 1828 {
1830 int err, fput_needed; 1829 int err, fput_needed;
1831 struct socket *sock; 1830 struct socket *sock;
1832 1831
1833 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1832 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1834 if (sock != NULL) { 1833 if (sock != NULL) {
1835 err = security_socket_getsockopt(sock, level, optname); 1834 err = security_socket_getsockopt(sock, level, optname);
1836 if (err) 1835 if (err)
1837 goto out_put; 1836 goto out_put;
1838 1837
1839 if (level == SOL_SOCKET) 1838 if (level == SOL_SOCKET)
1840 err = 1839 err =
1841 sock_getsockopt(sock, level, optname, optval, 1840 sock_getsockopt(sock, level, optname, optval,
1842 optlen); 1841 optlen);
1843 else 1842 else
1844 err = 1843 err =
1845 sock->ops->getsockopt(sock, level, optname, optval, 1844 sock->ops->getsockopt(sock, level, optname, optval,
1846 optlen); 1845 optlen);
1847 out_put: 1846 out_put:
1848 fput_light(sock->file, fput_needed); 1847 fput_light(sock->file, fput_needed);
1849 } 1848 }
1850 return err; 1849 return err;
1851 } 1850 }
1852 1851
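/*
 * Illustrative userspace sketch (not part of net/socket.c): SOL_SOCKET
 * options such as SO_REUSEADDR are handled generically by sock_setsockopt()
 * and sock_getsockopt() in the code above; any other level is passed to the
 * protocol's own handlers.
 */
#include <sys/socket.h>

int set_and_check_reuseaddr(int fd)
{
	int on = 1, val = 0;
	socklen_t len = sizeof(val);

	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
		return -1;

	/* Read it back; optlen is an in/out parameter. */
	if (getsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, &len) < 0)
		return -1;

	return val;	/* non-zero when the option is set */
}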
1853 /* 1852 /*
1854 * Shutdown a socket. 1853 * Shutdown a socket.
1855 */ 1854 */
1856 1855
1857 asmlinkage long sys_shutdown(int fd, int how) 1856 asmlinkage long sys_shutdown(int fd, int how)
1858 { 1857 {
1859 int err, fput_needed; 1858 int err, fput_needed;
1860 struct socket *sock; 1859 struct socket *sock;
1861 1860
1862 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1861 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1863 if (sock != NULL) { 1862 if (sock != NULL) {
1864 err = security_socket_shutdown(sock, how); 1863 err = security_socket_shutdown(sock, how);
1865 if (!err) 1864 if (!err)
1866 err = sock->ops->shutdown(sock, how); 1865 err = sock->ops->shutdown(sock, how);
1867 fput_light(sock->file, fput_needed); 1866 fput_light(sock->file, fput_needed);
1868 } 1867 }
1869 return err; 1868 return err;
1870 } 1869 }
1871 1870
1872 /* A couple of helpful macros for getting the address of the 32/64 bit 1871 /* A couple of helpful macros for getting the address of the 32/64 bit
1873 * fields which are the same type (int / unsigned) on our platforms. 1872 * fields which are the same type (int / unsigned) on our platforms.
1874 */ 1873 */
1875 #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member) 1874 #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
1876 #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) 1875 #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen)
1877 #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) 1876 #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags)
1878 1877
1879 /* 1878 /*
1880 * BSD sendmsg interface 1879 * BSD sendmsg interface
1881 */ 1880 */
1882 1881
1883 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) 1882 asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
1884 { 1883 {
1885 struct compat_msghdr __user *msg_compat = 1884 struct compat_msghdr __user *msg_compat =
1886 (struct compat_msghdr __user *)msg; 1885 (struct compat_msghdr __user *)msg;
1887 struct socket *sock; 1886 struct socket *sock;
1888 struct sockaddr_storage address; 1887 struct sockaddr_storage address;
1889 struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; 1888 struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
1890 unsigned char ctl[sizeof(struct cmsghdr) + 20] 1889 unsigned char ctl[sizeof(struct cmsghdr) + 20]
1891 __attribute__ ((aligned(sizeof(__kernel_size_t)))); 1890 __attribute__ ((aligned(sizeof(__kernel_size_t))));
1892 /* 20 is size of ipv6_pktinfo */ 1891 /* 20 is size of ipv6_pktinfo */
1893 unsigned char *ctl_buf = ctl; 1892 unsigned char *ctl_buf = ctl;
1894 struct msghdr msg_sys; 1893 struct msghdr msg_sys;
1895 int err, ctl_len, iov_size, total_len; 1894 int err, ctl_len, iov_size, total_len;
1896 int fput_needed; 1895 int fput_needed;
1897 1896
1898 err = -EFAULT; 1897 err = -EFAULT;
1899 if (MSG_CMSG_COMPAT & flags) { 1898 if (MSG_CMSG_COMPAT & flags) {
1900 if (get_compat_msghdr(&msg_sys, msg_compat)) 1899 if (get_compat_msghdr(&msg_sys, msg_compat))
1901 return -EFAULT; 1900 return -EFAULT;
1902 } 1901 }
1903 else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) 1902 else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
1904 return -EFAULT; 1903 return -EFAULT;
1905 1904
1906 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1905 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1907 if (!sock) 1906 if (!sock)
1908 goto out; 1907 goto out;
1909 1908
1910 /* do not move before msg_sys is valid */ 1909 /* do not move before msg_sys is valid */
1911 err = -EMSGSIZE; 1910 err = -EMSGSIZE;
1912 if (msg_sys.msg_iovlen > UIO_MAXIOV) 1911 if (msg_sys.msg_iovlen > UIO_MAXIOV)
1913 goto out_put; 1912 goto out_put;
1914 1913
1915 /* Check whether to allocate the iovec area */ 1914 /* Check whether to allocate the iovec area */
1916 err = -ENOMEM; 1915 err = -ENOMEM;
1917 iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); 1916 iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
1918 if (msg_sys.msg_iovlen > UIO_FASTIOV) { 1917 if (msg_sys.msg_iovlen > UIO_FASTIOV) {
1919 iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); 1918 iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
1920 if (!iov) 1919 if (!iov)
1921 goto out_put; 1920 goto out_put;
1922 } 1921 }
1923 1922
1924 /* This will also move the address data into kernel space */ 1923 /* This will also move the address data into kernel space */
1925 if (MSG_CMSG_COMPAT & flags) { 1924 if (MSG_CMSG_COMPAT & flags) {
1926 err = verify_compat_iovec(&msg_sys, iov, 1925 err = verify_compat_iovec(&msg_sys, iov,
1927 (struct sockaddr *)&address, 1926 (struct sockaddr *)&address,
1928 VERIFY_READ); 1927 VERIFY_READ);
1929 } else 1928 } else
1930 err = verify_iovec(&msg_sys, iov, 1929 err = verify_iovec(&msg_sys, iov,
1931 (struct sockaddr *)&address, 1930 (struct sockaddr *)&address,
1932 VERIFY_READ); 1931 VERIFY_READ);
1933 if (err < 0) 1932 if (err < 0)
1934 goto out_freeiov; 1933 goto out_freeiov;
1935 total_len = err; 1934 total_len = err;
1936 1935
1937 err = -ENOBUFS; 1936 err = -ENOBUFS;
1938 1937
1939 if (msg_sys.msg_controllen > INT_MAX) 1938 if (msg_sys.msg_controllen > INT_MAX)
1940 goto out_freeiov; 1939 goto out_freeiov;
1941 ctl_len = msg_sys.msg_controllen; 1940 ctl_len = msg_sys.msg_controllen;
1942 if ((MSG_CMSG_COMPAT & flags) && ctl_len) { 1941 if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
1943 err = 1942 err =
1944 cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, 1943 cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl,
1945 sizeof(ctl)); 1944 sizeof(ctl));
1946 if (err) 1945 if (err)
1947 goto out_freeiov; 1946 goto out_freeiov;
1948 ctl_buf = msg_sys.msg_control; 1947 ctl_buf = msg_sys.msg_control;
1949 ctl_len = msg_sys.msg_controllen; 1948 ctl_len = msg_sys.msg_controllen;
1950 } else if (ctl_len) { 1949 } else if (ctl_len) {
1951 if (ctl_len > sizeof(ctl)) { 1950 if (ctl_len > sizeof(ctl)) {
1952 ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); 1951 ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
1953 if (ctl_buf == NULL) 1952 if (ctl_buf == NULL)
1954 goto out_freeiov; 1953 goto out_freeiov;
1955 } 1954 }
1956 err = -EFAULT; 1955 err = -EFAULT;
1957 /* 1956 /*
1958 * Careful! Before this, msg_sys.msg_control contains a user pointer. 1957 * Careful! Before this, msg_sys.msg_control contains a user pointer.
1959 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted 1958 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
1960 * checking falls down on this. 1959 * checking falls down on this.
1961 */ 1960 */
1962 if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, 1961 if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
1963 ctl_len)) 1962 ctl_len))
1964 goto out_freectl; 1963 goto out_freectl;
1965 msg_sys.msg_control = ctl_buf; 1964 msg_sys.msg_control = ctl_buf;
1966 } 1965 }
1967 msg_sys.msg_flags = flags; 1966 msg_sys.msg_flags = flags;
1968 1967
1969 if (sock->file->f_flags & O_NONBLOCK) 1968 if (sock->file->f_flags & O_NONBLOCK)
1970 msg_sys.msg_flags |= MSG_DONTWAIT; 1969 msg_sys.msg_flags |= MSG_DONTWAIT;
1971 err = sock_sendmsg(sock, &msg_sys, total_len); 1970 err = sock_sendmsg(sock, &msg_sys, total_len);
1972 1971
1973 out_freectl: 1972 out_freectl:
1974 if (ctl_buf != ctl) 1973 if (ctl_buf != ctl)
1975 sock_kfree_s(sock->sk, ctl_buf, ctl_len); 1974 sock_kfree_s(sock->sk, ctl_buf, ctl_len);
1976 out_freeiov: 1975 out_freeiov:
1977 if (iov != iovstack) 1976 if (iov != iovstack)
1978 sock_kfree_s(sock->sk, iov, iov_size); 1977 sock_kfree_s(sock->sk, iov, iov_size);
1979 out_put: 1978 out_put:
1980 fput_light(sock->file, fput_needed); 1979 fput_light(sock->file, fput_needed);
1981 out: 1980 out:
1982 return err; 1981 return err;
1983 } 1982 }
1984 1983
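/*
 * Illustrative userspace sketch (not part of net/socket.c): sys_sendmsg()
 * above copies in the user msghdr, verifies the iovec and any control
 * buffer, then calls sock_sendmsg().  A two-element iovec shows the
 * scatter-gather side of that interface; no ancillary data is sent here.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

ssize_t send_two_parts(int fd)
{
	struct iovec iov[2] = {
		{ .iov_base = "hello ", .iov_len = 6 },
		{ .iov_base = "world",  .iov_len = 5 },
	};
	struct msghdr msg;

	memset(&msg, 0, sizeof(msg));	/* no name, no control data */
	msg.msg_iov = iov;
	msg.msg_iovlen = 2;

	/* Both iovec elements go out as one logical message. */
	return sendmsg(fd, &msg, 0);
}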
1985 /* 1984 /*
1986 * BSD recvmsg interface 1985 * BSD recvmsg interface
1987 */ 1986 */
1988 1987
1989 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, 1988 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg,
1990 unsigned int flags) 1989 unsigned int flags)
1991 { 1990 {
1992 struct compat_msghdr __user *msg_compat = 1991 struct compat_msghdr __user *msg_compat =
1993 (struct compat_msghdr __user *)msg; 1992 (struct compat_msghdr __user *)msg;
1994 struct socket *sock; 1993 struct socket *sock;
1995 struct iovec iovstack[UIO_FASTIOV]; 1994 struct iovec iovstack[UIO_FASTIOV];
1996 struct iovec *iov = iovstack; 1995 struct iovec *iov = iovstack;
1997 struct msghdr msg_sys; 1996 struct msghdr msg_sys;
1998 unsigned long cmsg_ptr; 1997 unsigned long cmsg_ptr;
1999 int err, iov_size, total_len, len; 1998 int err, iov_size, total_len, len;
2000 int fput_needed; 1999 int fput_needed;
2001 2000
2002 /* kernel mode address */ 2001 /* kernel mode address */
2003 struct sockaddr_storage addr; 2002 struct sockaddr_storage addr;
2004 2003
2005 /* user mode address pointers */ 2004 /* user mode address pointers */
2006 struct sockaddr __user *uaddr; 2005 struct sockaddr __user *uaddr;
2007 int __user *uaddr_len; 2006 int __user *uaddr_len;
2008 2007
2009 if (MSG_CMSG_COMPAT & flags) { 2008 if (MSG_CMSG_COMPAT & flags) {
2010 if (get_compat_msghdr(&msg_sys, msg_compat)) 2009 if (get_compat_msghdr(&msg_sys, msg_compat))
2011 return -EFAULT; 2010 return -EFAULT;
2012 } 2011 }
2013 else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) 2012 else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
2014 return -EFAULT; 2013 return -EFAULT;
2015 2014
2016 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2015 sock = sockfd_lookup_light(fd, &err, &fput_needed);
2017 if (!sock) 2016 if (!sock)
2018 goto out; 2017 goto out;
2019 2018
2020 err = -EMSGSIZE; 2019 err = -EMSGSIZE;
2021 if (msg_sys.msg_iovlen > UIO_MAXIOV) 2020 if (msg_sys.msg_iovlen > UIO_MAXIOV)
2022 goto out_put; 2021 goto out_put;
2023 2022
2024 /* Check whether to allocate the iovec area */ 2023 /* Check whether to allocate the iovec area */
2025 err = -ENOMEM; 2024 err = -ENOMEM;
2026 iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); 2025 iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
2027 if (msg_sys.msg_iovlen > UIO_FASTIOV) { 2026 if (msg_sys.msg_iovlen > UIO_FASTIOV) {
2028 iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); 2027 iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
2029 if (!iov) 2028 if (!iov)
2030 goto out_put; 2029 goto out_put;
2031 } 2030 }
2032 2031
2033 /* 2032 /*
2034 * Save the user-mode address (verify_iovec will change the 2033 * Save the user-mode address (verify_iovec will change the
2035 * kernel msghdr to use the kernel address space) 2034 * kernel msghdr to use the kernel address space)
2036 */ 2035 */
2037 2036
2038 uaddr = (__force void __user *)msg_sys.msg_name; 2037 uaddr = (__force void __user *)msg_sys.msg_name;
2039 uaddr_len = COMPAT_NAMELEN(msg); 2038 uaddr_len = COMPAT_NAMELEN(msg);
2040 if (MSG_CMSG_COMPAT & flags) { 2039 if (MSG_CMSG_COMPAT & flags) {
2041 err = verify_compat_iovec(&msg_sys, iov, 2040 err = verify_compat_iovec(&msg_sys, iov,
2042 (struct sockaddr *)&addr, 2041 (struct sockaddr *)&addr,
2043 VERIFY_WRITE); 2042 VERIFY_WRITE);
2044 } else 2043 } else
2045 err = verify_iovec(&msg_sys, iov, 2044 err = verify_iovec(&msg_sys, iov,
2046 (struct sockaddr *)&addr, 2045 (struct sockaddr *)&addr,
2047 VERIFY_WRITE); 2046 VERIFY_WRITE);
2048 if (err < 0) 2047 if (err < 0)
2049 goto out_freeiov; 2048 goto out_freeiov;
2050 total_len = err; 2049 total_len = err;
2051 2050
2052 cmsg_ptr = (unsigned long)msg_sys.msg_control; 2051 cmsg_ptr = (unsigned long)msg_sys.msg_control;
2053 msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); 2052 msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
2054 2053
2055 if (sock->file->f_flags & O_NONBLOCK) 2054 if (sock->file->f_flags & O_NONBLOCK)
2056 flags |= MSG_DONTWAIT; 2055 flags |= MSG_DONTWAIT;
2057 err = sock_recvmsg(sock, &msg_sys, total_len, flags); 2056 err = sock_recvmsg(sock, &msg_sys, total_len, flags);
2058 if (err < 0) 2057 if (err < 0)
2059 goto out_freeiov; 2058 goto out_freeiov;
2060 len = err; 2059 len = err;
2061 2060
2062 if (uaddr != NULL) { 2061 if (uaddr != NULL) {
2063 err = move_addr_to_user((struct sockaddr *)&addr, 2062 err = move_addr_to_user((struct sockaddr *)&addr,
2064 msg_sys.msg_namelen, uaddr, 2063 msg_sys.msg_namelen, uaddr,
2065 uaddr_len); 2064 uaddr_len);
2066 if (err < 0) 2065 if (err < 0)
2067 goto out_freeiov; 2066 goto out_freeiov;
2068 } 2067 }
2069 err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT), 2068 err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
2070 COMPAT_FLAGS(msg)); 2069 COMPAT_FLAGS(msg));
2071 if (err) 2070 if (err)
2072 goto out_freeiov; 2071 goto out_freeiov;
2073 if (MSG_CMSG_COMPAT & flags) 2072 if (MSG_CMSG_COMPAT & flags)
2074 err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, 2073 err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
2075 &msg_compat->msg_controllen); 2074 &msg_compat->msg_controllen);
2076 else 2075 else
2077 err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, 2076 err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
2078 &msg->msg_controllen); 2077 &msg->msg_controllen);
2079 if (err) 2078 if (err)
2080 goto out_freeiov; 2079 goto out_freeiov;
2081 err = len; 2080 err = len;
2082 2081
2083 out_freeiov: 2082 out_freeiov:
2084 if (iov != iovstack) 2083 if (iov != iovstack)
2085 sock_kfree_s(sock->sk, iov, iov_size); 2084 sock_kfree_s(sock->sk, iov, iov_size);
2086 out_put: 2085 out_put:
2087 fput_light(sock->file, fput_needed); 2086 fput_light(sock->file, fput_needed);
2088 out: 2087 out:
2089 return err; 2088 return err;
2090 } 2089 }
2091 2090
2092 #ifdef __ARCH_WANT_SYS_SOCKETCALL 2091 #ifdef __ARCH_WANT_SYS_SOCKETCALL
2093 2092
2094 /* Argument list sizes for sys_socketcall */ 2093 /* Argument list sizes for sys_socketcall */
2095 #define AL(x) ((x) * sizeof(unsigned long)) 2094 #define AL(x) ((x) * sizeof(unsigned long))
2096 static const unsigned char nargs[19]={ 2095 static const unsigned char nargs[19]={
2097 AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), 2096 AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
2098 AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), 2097 AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
2099 AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), 2098 AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
2100 AL(6) 2099 AL(6)
2101 }; 2100 };
2102 2101
2103 #undef AL 2102 #undef AL
2104 2103
2105 /* 2104 /*
2106 * System call vectors. 2105 * System call vectors.
2107 * 2106 *
2108 * Argument checking cleaned up. Saved 20% in size. 2107 * Argument checking cleaned up. Saved 20% in size.
2109 * This function doesn't need to set the kernel lock because 2108 * This function doesn't need to set the kernel lock because
2110 * it is set by the callees. 2109 * it is set by the callees.
2111 */ 2110 */
2112 2111
2113 asmlinkage long sys_socketcall(int call, unsigned long __user *args) 2112 asmlinkage long sys_socketcall(int call, unsigned long __user *args)
2114 { 2113 {
2115 unsigned long a[6]; 2114 unsigned long a[6];
2116 unsigned long a0, a1; 2115 unsigned long a0, a1;
2117 int err; 2116 int err;
2118 2117
2119 if (call < 1 || call > SYS_PACCEPT) 2118 if (call < 1 || call > SYS_PACCEPT)
2120 return -EINVAL; 2119 return -EINVAL;
2121 2120
2122 /* copy_from_user should be SMP safe. */ 2121 /* copy_from_user should be SMP safe. */
2123 if (copy_from_user(a, args, nargs[call])) 2122 if (copy_from_user(a, args, nargs[call]))
2124 return -EFAULT; 2123 return -EFAULT;
2125 2124
2126 err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); 2125 err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
2127 if (err) 2126 if (err)
2128 return err; 2127 return err;
2129 2128
2130 a0 = a[0]; 2129 a0 = a[0];
2131 a1 = a[1]; 2130 a1 = a[1];
2132 2131
2133 switch (call) { 2132 switch (call) {
2134 case SYS_SOCKET: 2133 case SYS_SOCKET:
2135 err = sys_socket(a0, a1, a[2]); 2134 err = sys_socket(a0, a1, a[2]);
2136 break; 2135 break;
2137 case SYS_BIND: 2136 case SYS_BIND:
2138 err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); 2137 err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
2139 break; 2138 break;
2140 case SYS_CONNECT: 2139 case SYS_CONNECT:
2141 err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); 2140 err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
2142 break; 2141 break;
2143 case SYS_LISTEN: 2142 case SYS_LISTEN:
2144 err = sys_listen(a0, a1); 2143 err = sys_listen(a0, a1);
2145 break; 2144 break;
2146 case SYS_ACCEPT: 2145 case SYS_ACCEPT:
2147 err = 2146 err =
2148 do_accept(a0, (struct sockaddr __user *)a1, 2147 do_accept(a0, (struct sockaddr __user *)a1,
2149 (int __user *)a[2], 0); 2148 (int __user *)a[2], 0);
2150 break; 2149 break;
2151 case SYS_GETSOCKNAME: 2150 case SYS_GETSOCKNAME:
2152 err = 2151 err =
2153 sys_getsockname(a0, (struct sockaddr __user *)a1, 2152 sys_getsockname(a0, (struct sockaddr __user *)a1,
2154 (int __user *)a[2]); 2153 (int __user *)a[2]);
2155 break; 2154 break;
2156 case SYS_GETPEERNAME: 2155 case SYS_GETPEERNAME:
2157 err = 2156 err =
2158 sys_getpeername(a0, (struct sockaddr __user *)a1, 2157 sys_getpeername(a0, (struct sockaddr __user *)a1,
2159 (int __user *)a[2]); 2158 (int __user *)a[2]);
2160 break; 2159 break;
2161 case SYS_SOCKETPAIR: 2160 case SYS_SOCKETPAIR:
2162 err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); 2161 err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
2163 break; 2162 break;
2164 case SYS_SEND: 2163 case SYS_SEND:
2165 err = sys_send(a0, (void __user *)a1, a[2], a[3]); 2164 err = sys_send(a0, (void __user *)a1, a[2], a[3]);
2166 break; 2165 break;
2167 case SYS_SENDTO: 2166 case SYS_SENDTO:
2168 err = sys_sendto(a0, (void __user *)a1, a[2], a[3], 2167 err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
2169 (struct sockaddr __user *)a[4], a[5]); 2168 (struct sockaddr __user *)a[4], a[5]);
2170 break; 2169 break;
2171 case SYS_RECV: 2170 case SYS_RECV:
2172 err = sys_recv(a0, (void __user *)a1, a[2], a[3]); 2171 err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
2173 break; 2172 break;
2174 case SYS_RECVFROM: 2173 case SYS_RECVFROM:
2175 err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], 2174 err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
2176 (struct sockaddr __user *)a[4], 2175 (struct sockaddr __user *)a[4],
2177 (int __user *)a[5]); 2176 (int __user *)a[5]);
2178 break; 2177 break;
2179 case SYS_SHUTDOWN: 2178 case SYS_SHUTDOWN:
2180 err = sys_shutdown(a0, a1); 2179 err = sys_shutdown(a0, a1);
2181 break; 2180 break;
2182 case SYS_SETSOCKOPT: 2181 case SYS_SETSOCKOPT:
2183 err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); 2182 err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
2184 break; 2183 break;
2185 case SYS_GETSOCKOPT: 2184 case SYS_GETSOCKOPT:
2186 err = 2185 err =
2187 sys_getsockopt(a0, a1, a[2], (char __user *)a[3], 2186 sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
2188 (int __user *)a[4]); 2187 (int __user *)a[4]);
2189 break; 2188 break;
2190 case SYS_SENDMSG: 2189 case SYS_SENDMSG:
2191 err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); 2190 err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
2192 break; 2191 break;
2193 case SYS_RECVMSG: 2192 case SYS_RECVMSG:
2194 err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); 2193 err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
2195 break; 2194 break;
2196 case SYS_PACCEPT: 2195 case SYS_PACCEPT:
2197 err = 2196 err =
2198 sys_paccept(a0, (struct sockaddr __user *)a1, 2197 sys_paccept(a0, (struct sockaddr __user *)a1,
2199 (int __user *)a[2], 2198 (int __user *)a[2],
2200 (const sigset_t __user *) a[3], 2199 (const sigset_t __user *) a[3],
2201 a[4], a[5]); 2200 a[4], a[5]);
2202 break; 2201 break;
2203 default: 2202 default:
2204 err = -EINVAL; 2203 err = -EINVAL;
2205 break; 2204 break;
2206 } 2205 }
2207 return err; 2206 return err;
2208 } 2207 }
2209 2208
2210 #endif /* __ARCH_WANT_SYS_SOCKETCALL */ 2209 #endif /* __ARCH_WANT_SYS_SOCKETCALL */
2211 2210
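/*
 * Illustrative sketch (not part of net/socket.c): on architectures that
 * define __ARCH_WANT_SYS_SOCKETCALL -- classically 32-bit x86 -- userspace
 * reaches the handlers above through the single multiplexed socketcall
 * entry point.  SYS_SOCKET here mirrors the call number from <linux/net.h>;
 * normal programs simply use libc's socket() instead.
 */
#include <unistd.h>
#include <sys/syscall.h>

#ifdef __NR_socketcall
#define SYS_SOCKET 1	/* matches the dispatch table in sys_socketcall() */

int raw_socket_via_socketcall(int family, int type, int protocol)
{
	unsigned long args[3] = { family, type, protocol };

	/* sys_socketcall() copies nargs[SYS_SOCKET] bytes of args[]
	 * from userspace, then dispatches to sys_socket(). */
	return syscall(__NR_socketcall, SYS_SOCKET, args);
}
#endif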
2212 /** 2211 /**
2213 * sock_register - add a socket protocol handler 2212 * sock_register - add a socket protocol handler
2214 * @ops: description of protocol 2213 * @ops: description of protocol
2215 * 2214 *
2216 * This function is called by a protocol handler that wants to 2215 * This function is called by a protocol handler that wants to
2217 * advertise its address family, and have it linked into the 2216 * advertise its address family, and have it linked into the
2218 * socket interface. The value ops->family corresponds to the 2217 * socket interface. The value ops->family corresponds to the
2219 * socket system call protocol family. 2218 * socket system call protocol family.
2220 */ 2219 */
2221 int sock_register(const struct net_proto_family *ops) 2220 int sock_register(const struct net_proto_family *ops)
2222 { 2221 {
2223 int err; 2222 int err;
2224 2223
2225 if (ops->family >= NPROTO) { 2224 if (ops->family >= NPROTO) {
2226 printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, 2225 printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
2227 NPROTO); 2226 NPROTO);
2228 return -ENOBUFS; 2227 return -ENOBUFS;
2229 } 2228 }
2230 2229
2231 spin_lock(&net_family_lock); 2230 spin_lock(&net_family_lock);
2232 if (net_families[ops->family]) 2231 if (net_families[ops->family])
2233 err = -EEXIST; 2232 err = -EEXIST;
2234 else { 2233 else {
2235 net_families[ops->family] = ops; 2234 net_families[ops->family] = ops;
2236 err = 0; 2235 err = 0;
2237 } 2236 }
2238 spin_unlock(&net_family_lock); 2237 spin_unlock(&net_family_lock);
2239 2238
2240 printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); 2239 printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
2241 return err; 2240 return err;
2242 } 2241 }
2243 2242
2244 /** 2243 /**
2245 * sock_unregister - remove a protocol handler 2244 * sock_unregister - remove a protocol handler
2246 * @family: protocol family to remove 2245 * @family: protocol family to remove
2247 * 2246 *
2248 * This function is called by a protocol handler that wants to 2247 * This function is called by a protocol handler that wants to
2249 * remove its address family, and have it unlinked from the 2248 * remove its address family, and have it unlinked from the
2250 * new socket creation. 2249 * new socket creation.
2251 * 2250 *
2252 * If protocol handler is a module, then it can use module reference 2251 * If protocol handler is a module, then it can use module reference
2253 * counts to protect against new references. If protocol handler is not 2252 * counts to protect against new references. If protocol handler is not
2254 * a module then it needs to provide its own protection in 2253 * a module then it needs to provide its own protection in
2255 * the ops->create routine. 2254 * the ops->create routine.
2256 */ 2255 */
2257 void sock_unregister(int family) 2256 void sock_unregister(int family)
2258 { 2257 {
2259 BUG_ON(family < 0 || family >= NPROTO); 2258 BUG_ON(family < 0 || family >= NPROTO);
2260 2259
2261 spin_lock(&net_family_lock); 2260 spin_lock(&net_family_lock);
2262 net_families[family] = NULL; 2261 net_families[family] = NULL;
2263 spin_unlock(&net_family_lock); 2262 spin_unlock(&net_family_lock);
2264 2263
2265 synchronize_rcu(); 2264 synchronize_rcu();
2266 2265
2267 printk(KERN_INFO "NET: Unregistered protocol family %d\n", family); 2266 printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
2268 } 2267 }
2269 2268
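/*
 * Illustrative in-kernel sketch (not part of net/socket.c): the usual
 * pattern for the sock_register()/sock_unregister() pair documented above.
 * AF_EXAMPLE and example_create() are made-up names; a real address family
 * fills in sock->ops and allocates its struct sock in the create hook.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/net.h>

#define AF_EXAMPLE 27	/* hypothetical, otherwise unused family number */

static int example_create(struct net *net, struct socket *sock, int protocol)
{
	return -EAFNOSUPPORT;	/* placeholder: no real protocol behind it */
}

static const struct net_proto_family example_family_ops = {
	.family	= AF_EXAMPLE,
	.create	= example_create,
	.owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	return sock_register(&example_family_ops);	/* -EEXIST if taken */
}

static void __exit example_exit(void)
{
	sock_unregister(AF_EXAMPLE);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");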
2270 static int __init sock_init(void) 2269 static int __init sock_init(void)
2271 { 2270 {
2272 /* 2271 /*
2273 * Initialize sock SLAB cache. 2272 * Initialize sock SLAB cache.
2274 */ 2273 */
2275 2274
2276 sk_init(); 2275 sk_init();
2277 2276
2278 /* 2277 /*
2279 * Initialize skbuff SLAB cache 2278 * Initialize skbuff SLAB cache
2280 */ 2279 */
2281 skb_init(); 2280 skb_init();
2282 2281
2283 /* 2282 /*
2284 * Initialize the protocols module. 2283 * Initialize the protocols module.
2285 */ 2284 */
2286 2285
2287 init_inodecache(); 2286 init_inodecache();
2288 register_filesystem(&sock_fs_type); 2287 register_filesystem(&sock_fs_type);
2289 sock_mnt = kern_mount(&sock_fs_type); 2288 sock_mnt = kern_mount(&sock_fs_type);
2290 2289
2291 /* The real protocol initialization is performed in later initcalls. 2290 /* The real protocol initialization is performed in later initcalls.
2292 */ 2291 */
2293 2292
2294 #ifdef CONFIG_NETFILTER 2293 #ifdef CONFIG_NETFILTER
2295 netfilter_init(); 2294 netfilter_init();
2296 #endif 2295 #endif
2297 2296
2298 return 0; 2297 return 0;
2299 } 2298 }
2300 2299
2301 core_initcall(sock_init); /* early initcall */ 2300 core_initcall(sock_init); /* early initcall */
2302 2301
2303 #ifdef CONFIG_PROC_FS 2302 #ifdef CONFIG_PROC_FS
2304 void socket_seq_show(struct seq_file *seq) 2303 void socket_seq_show(struct seq_file *seq)
2305 { 2304 {
2306 int cpu; 2305 int cpu;
2307 int counter = 0; 2306 int counter = 0;
2308 2307
2309 for_each_possible_cpu(cpu) 2308 for_each_possible_cpu(cpu)
2310 counter += per_cpu(sockets_in_use, cpu); 2309 counter += per_cpu(sockets_in_use, cpu);
2311 2310
2312 /* It can be negative, by the way. 8) */ 2311 /* It can be negative, by the way. 8) */
2313 if (counter < 0) 2312 if (counter < 0)
2314 counter = 0; 2313 counter = 0;
2315 2314
2316 seq_printf(seq, "sockets: used %d\n", counter); 2315 seq_printf(seq, "sockets: used %d\n", counter);
2317 } 2316 }
2318 #endif /* CONFIG_PROC_FS */ 2317 #endif /* CONFIG_PROC_FS */
2319 2318
2320 #ifdef CONFIG_COMPAT 2319 #ifdef CONFIG_COMPAT
2321 static long compat_sock_ioctl(struct file *file, unsigned cmd, 2320 static long compat_sock_ioctl(struct file *file, unsigned cmd,
2322 unsigned long arg) 2321 unsigned long arg)
2323 { 2322 {
2324 struct socket *sock = file->private_data; 2323 struct socket *sock = file->private_data;
2325 int ret = -ENOIOCTLCMD; 2324 int ret = -ENOIOCTLCMD;
2326 struct sock *sk; 2325 struct sock *sk;
2327 struct net *net; 2326 struct net *net;
2328 2327
2329 sk = sock->sk; 2328 sk = sock->sk;
2330 net = sock_net(sk); 2329 net = sock_net(sk);
2331 2330
2332 if (sock->ops->compat_ioctl) 2331 if (sock->ops->compat_ioctl)
2333 ret = sock->ops->compat_ioctl(sock, cmd, arg); 2332 ret = sock->ops->compat_ioctl(sock, cmd, arg);
2334 2333
2335 if (ret == -ENOIOCTLCMD && 2334 if (ret == -ENOIOCTLCMD &&
2336 (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) 2335 (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST))
2337 ret = compat_wext_handle_ioctl(net, cmd, arg); 2336 ret = compat_wext_handle_ioctl(net, cmd, arg);
2338 2337
2339 return ret; 2338 return ret;
2340 } 2339 }
2341 #endif 2340 #endif
2342 2341
2343 int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) 2342 int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
2344 { 2343 {
2345 return sock->ops->bind(sock, addr, addrlen); 2344 return sock->ops->bind(sock, addr, addrlen);
2346 } 2345 }
2347 2346
2348 int kernel_listen(struct socket *sock, int backlog) 2347 int kernel_listen(struct socket *sock, int backlog)
2349 { 2348 {
2350 return sock->ops->listen(sock, backlog); 2349 return sock->ops->listen(sock, backlog);
2351 } 2350 }
2352 2351
2353 int kernel_accept(struct socket *sock, struct socket **newsock, int flags) 2352 int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
2354 { 2353 {
2355 struct sock *sk = sock->sk; 2354 struct sock *sk = sock->sk;
2356 int err; 2355 int err;
2357 2356
2358 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, 2357 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
2359 newsock); 2358 newsock);
2360 if (err < 0) 2359 if (err < 0)
2361 goto done; 2360 goto done;
2362 2361
2363 err = sock->ops->accept(sock, *newsock, flags); 2362 err = sock->ops->accept(sock, *newsock, flags);
2364 if (err < 0) { 2363 if (err < 0) {
2365 sock_release(*newsock); 2364 sock_release(*newsock);
2366 *newsock = NULL; 2365 *newsock = NULL;
2367 goto done; 2366 goto done;
2368 } 2367 }
2369 2368
2370 (*newsock)->ops = sock->ops; 2369 (*newsock)->ops = sock->ops;
2371 2370
2372 done: 2371 done:
2373 return err; 2372 return err;
2374 } 2373 }

int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
		   int flags)
{
	return sock->ops->connect(sock, addr, addrlen, flags);
}

int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
		       int *addrlen)
{
	return sock->ops->getname(sock, addr, addrlen, 0);
}

int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
		       int *addrlen)
{
	return sock->ops->getname(sock, addr, addrlen, 1);
}

int kernel_getsockopt(struct socket *sock, int level, int optname,
		      char *optval, int *optlen)
{
	mm_segment_t oldfs = get_fs();
	int err;

	set_fs(KERNEL_DS);
	if (level == SOL_SOCKET)
		err = sock_getsockopt(sock, level, optname, optval, optlen);
	else
		err = sock->ops->getsockopt(sock, level, optname, optval,
					    optlen);
	set_fs(oldfs);
	return err;
}

int kernel_setsockopt(struct socket *sock, int level, int optname,
		      char *optval, int optlen)
{
	mm_segment_t oldfs = get_fs();
	int err;

	set_fs(KERNEL_DS);
	if (level == SOL_SOCKET)
		err = sock_setsockopt(sock, level, optname, optval, optlen);
	else
		err = sock->ops->setsockopt(sock, level, optname, optval,
					    optlen);
	set_fs(oldfs);
	return err;
}
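
Note why kernel_getsockopt() and kernel_setsockopt() temporarily switch to KERNEL_DS: the underlying sock_setsockopt()/getsockopt paths expect user-space pointers and verify them against the current address limit, so widening the limit lets a plain kernel buffer pass those checks. A caller therefore just hands in an ordinary kernel variable; for example (the option chosen here is only illustrative):

	/* Illustrative sketch: turn on SO_REUSEADDR for an in-kernel socket. */
	static int demo_set_reuseaddr(struct socket *sock)
	{
		int one = 1;

		return kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
					 (char *)&one, sizeof(one));
	}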

int kernel_sendpage(struct socket *sock, struct page *page, int offset,
		    size_t size, int flags)
{
	if (sock->ops->sendpage)
		return sock->ops->sendpage(sock, page, offset, size, flags);

	return sock_no_sendpage(sock, page, offset, size, flags);
}

int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
{
	mm_segment_t oldfs = get_fs();
	int err;

	set_fs(KERNEL_DS);
	err = sock->ops->ioctl(sock, cmd, arg);
	set_fs(oldfs);

	return err;
}

int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
	return sock->ops->shutdown(sock, how);
}

EXPORT_SYMBOL(sock_create);
EXPORT_SYMBOL(sock_create_kern);
EXPORT_SYMBOL(sock_create_lite);
EXPORT_SYMBOL(sock_map_fd);
EXPORT_SYMBOL(sock_recvmsg);
EXPORT_SYMBOL(sock_register);
EXPORT_SYMBOL(sock_release);
EXPORT_SYMBOL(sock_sendmsg);
EXPORT_SYMBOL(sock_unregister);
EXPORT_SYMBOL(sock_wake_async);
EXPORT_SYMBOL(sockfd_lookup);
EXPORT_SYMBOL(kernel_sendmsg);
EXPORT_SYMBOL(kernel_recvmsg);
EXPORT_SYMBOL(kernel_bind);
EXPORT_SYMBOL(kernel_listen);
EXPORT_SYMBOL(kernel_accept);
EXPORT_SYMBOL(kernel_connect);
EXPORT_SYMBOL(kernel_getsockname);
EXPORT_SYMBOL(kernel_getpeername);
EXPORT_SYMBOL(kernel_getsockopt);
EXPORT_SYMBOL(kernel_setsockopt);
EXPORT_SYMBOL(kernel_sendpage);
EXPORT_SYMBOL(kernel_sock_ioctl);
EXPORT_SYMBOL(kernel_sock_shutdown);