Commit 7b2b1fee30df7e2165525cd03f7d1d01a3a56794

Authored by Greg Banks
Committed by Linus Torvalds
1 parent fce1456a19

[PATCH] knfsd: knfsd: cache ipmap per TCP socket

Speed up high call-rate workloads by caching the struct ip_map for the peer on
the connected struct svc_sock instead of looking it up in the ip_map cache
hashtable on every call.  This helps workloads using AUTH_SYS authentication
over TCP.
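In rough outline, the fast path this enables is: check a pointer cached on the
connected socket, validate it, and only fall back to the hash-table lookup on a
miss.  The sketch below is a simplified, combined rendition of the
ip_map_cached_get()/ip_map_cached_put() helpers added by the patch (see the
diff further down); the wrapper name ip_map_for_request() is illustrative only
and does not exist in the patch, and the real reference-transfer and
invalidation handling is trimmed.

    /* Illustrative sketch only: the real helpers are ip_map_cached_get()
     * and ip_map_cached_put() in the diff below.
     */
    static struct ip_map *ip_map_for_request(struct svc_rqst *rqstp)
    {
            struct svc_sock *svsk = rqstp->rq_sock;
            struct ip_map *ipm = svsk->sk_info_authunix;

            if (ipm != NULL && cache_valid(&ipm->h)) {
                    cache_get(&ipm->h);     /* fast path: reuse the cached entry */
                    return ipm;
            }

            /* slow path: hash-table lookup, then remember the result on the
             * connected (TCP) socket so later calls can skip the lookup
             */
            ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class,
                                rqstp->rq_addr.sin_addr);
            if (ipm != NULL && svsk->sk_sock->type == SOCK_STREAM &&
                svsk->sk_info_authunix == NULL) {
                    cache_get(&ipm->h);     /* the socket keeps its own reference */
                    svsk->sk_info_authunix = ipm;
            }
            return ipm;
    }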

Testing was on a 4 CPU 4 NIC Altix using 4 IRIX clients, each with 16
synthetic client threads simulating an rsync (i.e.  recursive directory
listing) workload reading from an i386 RH9 install image (161480 regular files
in 10841 directories) on the server.  That tree is small enough to fit in the
server's RAM, so no disk traffic was involved.  This setup gives a sustained
call rate in excess of 60000 calls/sec before being CPU-bound on the server.

Profiling showed strcmp(), called from ip_map_match(), was taking 4.8% of each
CPU, and ip_map_lookup() was taking 2.9%.  This patch drops both contributions
into the profile noise.

Note that the above result overstates the value of this patch for most
workloads.  The synthetic clients are all using separate IP addresses, so
there are 64 entries in the ip_map cache hash.  Because the kernel measured
contained the bug fixed in commit 1f1e030bf75774b6a283518e1534d598e14147d4,
and was running on a 64-bit little-endian machine, probably all of those 64
entries were on a single hash chain, thus increasing the cost of
ip_map_lookup().

With a modern kernel you would need more clients to see the same amount of
performance improvement.  This patch has helped to scale knfsd to handle a
deployment with 2000 NFS clients.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 5 changed files with 61 additions and 3 deletions

include/linux/sunrpc/cache.h
@@ -163,6 +163,17 @@
 	kref_put(&h->ref, cd->cache_put);
 }
 
+static inline int cache_valid(struct cache_head *h)
+{
+	/* If an item has been unhashed pending removal when
+	 * the refcount drops to 0, the expiry_time will be
+	 * set to 0.  We don't want to consider such items
+	 * valid in this context even though CACHE_VALID is
+	 * set.
+	 */
+	return (h->expiry_time != 0 && test_bit(CACHE_VALID, &h->flags));
+}
+
 extern int cache_check(struct cache_detail *detail,
 		       struct cache_head *h, struct cache_req *rqstp);
 extern void cache_flush(void);
include/linux/sunrpc/svcauth.h
@@ -126,6 +126,7 @@
 extern struct auth_domain *auth_unix_lookup(struct in_addr addr);
 extern int auth_unix_forget_old(struct auth_domain *dom);
 extern void svcauth_unix_purge(void);
+extern void svcauth_unix_info_release(void *);
 
 static inline unsigned long hash_str(char *name, int bits)
 {
include/linux/sunrpc/svcsock.h
@@ -54,6 +54,9 @@
 	int		sk_reclen;	/* length of record */
 	int		sk_tcplen;	/* current read length */
 	time_t		sk_lastrecv;	/* time of last received request */
+
+	/* cache of various info for TCP sockets */
+	void		*sk_info_authunix;
 };
 
 /*
net/sunrpc/svcauth_unix.c
1 #include <linux/types.h> 1 #include <linux/types.h>
2 #include <linux/sched.h> 2 #include <linux/sched.h>
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/sunrpc/types.h> 4 #include <linux/sunrpc/types.h>
5 #include <linux/sunrpc/xdr.h> 5 #include <linux/sunrpc/xdr.h>
6 #include <linux/sunrpc/svcsock.h> 6 #include <linux/sunrpc/svcsock.h>
7 #include <linux/sunrpc/svcauth.h> 7 #include <linux/sunrpc/svcauth.h>
8 #include <linux/err.h> 8 #include <linux/err.h>
9 #include <linux/seq_file.h> 9 #include <linux/seq_file.h>
10 #include <linux/hash.h> 10 #include <linux/hash.h>
11 #include <linux/string.h> 11 #include <linux/string.h>
12 #include <net/sock.h>
12 13
13 #define RPCDBG_FACILITY RPCDBG_AUTH 14 #define RPCDBG_FACILITY RPCDBG_AUTH
14 15
15 16
16 /* 17 /*
17 * AUTHUNIX and AUTHNULL credentials are both handled here. 18 * AUTHUNIX and AUTHNULL credentials are both handled here.
18 * AUTHNULL is treated just like AUTHUNIX except that the uid/gid 19 * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
19 * are always nobody (-2). i.e. we do the same IP address checks for 20 * are always nobody (-2). i.e. we do the same IP address checks for
20 * AUTHNULL as for AUTHUNIX, and that is done here. 21 * AUTHNULL as for AUTHUNIX, and that is done here.
21 */ 22 */
22 23
23 24
24 struct unix_domain { 25 struct unix_domain {
25 struct auth_domain h; 26 struct auth_domain h;
26 int addr_changes; 27 int addr_changes;
27 /* other stuff later */ 28 /* other stuff later */
28 }; 29 };
29 30
30 extern struct auth_ops svcauth_unix; 31 extern struct auth_ops svcauth_unix;
31 32
32 struct auth_domain *unix_domain_find(char *name) 33 struct auth_domain *unix_domain_find(char *name)
33 { 34 {
34 struct auth_domain *rv; 35 struct auth_domain *rv;
35 struct unix_domain *new = NULL; 36 struct unix_domain *new = NULL;
36 37
37 rv = auth_domain_lookup(name, NULL); 38 rv = auth_domain_lookup(name, NULL);
38 while(1) { 39 while(1) {
39 if (rv) { 40 if (rv) {
40 if (new && rv != &new->h) 41 if (new && rv != &new->h)
41 auth_domain_put(&new->h); 42 auth_domain_put(&new->h);
42 43
43 if (rv->flavour != &svcauth_unix) { 44 if (rv->flavour != &svcauth_unix) {
44 auth_domain_put(rv); 45 auth_domain_put(rv);
45 return NULL; 46 return NULL;
46 } 47 }
47 return rv; 48 return rv;
48 } 49 }
49 50
50 new = kmalloc(sizeof(*new), GFP_KERNEL); 51 new = kmalloc(sizeof(*new), GFP_KERNEL);
51 if (new == NULL) 52 if (new == NULL)
52 return NULL; 53 return NULL;
53 kref_init(&new->h.ref); 54 kref_init(&new->h.ref);
54 new->h.name = kstrdup(name, GFP_KERNEL); 55 new->h.name = kstrdup(name, GFP_KERNEL);
55 new->h.flavour = &svcauth_unix; 56 new->h.flavour = &svcauth_unix;
56 new->addr_changes = 0; 57 new->addr_changes = 0;
57 rv = auth_domain_lookup(name, &new->h); 58 rv = auth_domain_lookup(name, &new->h);
58 } 59 }
59 } 60 }
60 61
61 static void svcauth_unix_domain_release(struct auth_domain *dom) 62 static void svcauth_unix_domain_release(struct auth_domain *dom)
62 { 63 {
63 struct unix_domain *ud = container_of(dom, struct unix_domain, h); 64 struct unix_domain *ud = container_of(dom, struct unix_domain, h);
64 65
65 kfree(dom->name); 66 kfree(dom->name);
66 kfree(ud); 67 kfree(ud);
67 } 68 }
68 69
69 70
70 /************************************************** 71 /**************************************************
71 * cache for IP address to unix_domain 72 * cache for IP address to unix_domain
72 * as needed by AUTH_UNIX 73 * as needed by AUTH_UNIX
73 */ 74 */
74 #define IP_HASHBITS 8 75 #define IP_HASHBITS 8
75 #define IP_HASHMAX (1<<IP_HASHBITS) 76 #define IP_HASHMAX (1<<IP_HASHBITS)
76 #define IP_HASHMASK (IP_HASHMAX-1) 77 #define IP_HASHMASK (IP_HASHMAX-1)
77 78
78 struct ip_map { 79 struct ip_map {
79 struct cache_head h; 80 struct cache_head h;
80 char m_class[8]; /* e.g. "nfsd" */ 81 char m_class[8]; /* e.g. "nfsd" */
81 struct in_addr m_addr; 82 struct in_addr m_addr;
82 struct unix_domain *m_client; 83 struct unix_domain *m_client;
83 int m_add_change; 84 int m_add_change;
84 }; 85 };
85 static struct cache_head *ip_table[IP_HASHMAX]; 86 static struct cache_head *ip_table[IP_HASHMAX];
86 87
87 static void ip_map_put(struct kref *kref) 88 static void ip_map_put(struct kref *kref)
88 { 89 {
89 struct cache_head *item = container_of(kref, struct cache_head, ref); 90 struct cache_head *item = container_of(kref, struct cache_head, ref);
90 struct ip_map *im = container_of(item, struct ip_map,h); 91 struct ip_map *im = container_of(item, struct ip_map,h);
91 92
92 if (test_bit(CACHE_VALID, &item->flags) && 93 if (test_bit(CACHE_VALID, &item->flags) &&
93 !test_bit(CACHE_NEGATIVE, &item->flags)) 94 !test_bit(CACHE_NEGATIVE, &item->flags))
94 auth_domain_put(&im->m_client->h); 95 auth_domain_put(&im->m_client->h);
95 kfree(im); 96 kfree(im);
96 } 97 }
97 98
98 #if IP_HASHBITS == 8 99 #if IP_HASHBITS == 8
99 /* hash_long on a 64 bit machine is currently REALLY BAD for 100 /* hash_long on a 64 bit machine is currently REALLY BAD for
100 * IP addresses in reverse-endian (i.e. on a little-endian machine). 101 * IP addresses in reverse-endian (i.e. on a little-endian machine).
101 * So use a trivial but reliable hash instead 102 * So use a trivial but reliable hash instead
102 */ 103 */
103 static inline int hash_ip(unsigned long ip) 104 static inline int hash_ip(unsigned long ip)
104 { 105 {
105 int hash = ip ^ (ip>>16); 106 int hash = ip ^ (ip>>16);
106 return (hash ^ (hash>>8)) & 0xff; 107 return (hash ^ (hash>>8)) & 0xff;
107 } 108 }
108 #endif 109 #endif
109 static int ip_map_match(struct cache_head *corig, struct cache_head *cnew) 110 static int ip_map_match(struct cache_head *corig, struct cache_head *cnew)
110 { 111 {
111 struct ip_map *orig = container_of(corig, struct ip_map, h); 112 struct ip_map *orig = container_of(corig, struct ip_map, h);
112 struct ip_map *new = container_of(cnew, struct ip_map, h); 113 struct ip_map *new = container_of(cnew, struct ip_map, h);
113 return strcmp(orig->m_class, new->m_class) == 0 114 return strcmp(orig->m_class, new->m_class) == 0
114 && orig->m_addr.s_addr == new->m_addr.s_addr; 115 && orig->m_addr.s_addr == new->m_addr.s_addr;
115 } 116 }
116 static void ip_map_init(struct cache_head *cnew, struct cache_head *citem) 117 static void ip_map_init(struct cache_head *cnew, struct cache_head *citem)
117 { 118 {
118 struct ip_map *new = container_of(cnew, struct ip_map, h); 119 struct ip_map *new = container_of(cnew, struct ip_map, h);
119 struct ip_map *item = container_of(citem, struct ip_map, h); 120 struct ip_map *item = container_of(citem, struct ip_map, h);
120 121
121 strcpy(new->m_class, item->m_class); 122 strcpy(new->m_class, item->m_class);
122 new->m_addr.s_addr = item->m_addr.s_addr; 123 new->m_addr.s_addr = item->m_addr.s_addr;
123 } 124 }
124 static void update(struct cache_head *cnew, struct cache_head *citem) 125 static void update(struct cache_head *cnew, struct cache_head *citem)
125 { 126 {
126 struct ip_map *new = container_of(cnew, struct ip_map, h); 127 struct ip_map *new = container_of(cnew, struct ip_map, h);
127 struct ip_map *item = container_of(citem, struct ip_map, h); 128 struct ip_map *item = container_of(citem, struct ip_map, h);
128 129
129 kref_get(&item->m_client->h.ref); 130 kref_get(&item->m_client->h.ref);
130 new->m_client = item->m_client; 131 new->m_client = item->m_client;
131 new->m_add_change = item->m_add_change; 132 new->m_add_change = item->m_add_change;
132 } 133 }
133 static struct cache_head *ip_map_alloc(void) 134 static struct cache_head *ip_map_alloc(void)
134 { 135 {
135 struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL); 136 struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL);
136 if (i) 137 if (i)
137 return &i->h; 138 return &i->h;
138 else 139 else
139 return NULL; 140 return NULL;
140 } 141 }
141 142
142 static void ip_map_request(struct cache_detail *cd, 143 static void ip_map_request(struct cache_detail *cd,
143 struct cache_head *h, 144 struct cache_head *h,
144 char **bpp, int *blen) 145 char **bpp, int *blen)
145 { 146 {
146 char text_addr[20]; 147 char text_addr[20];
147 struct ip_map *im = container_of(h, struct ip_map, h); 148 struct ip_map *im = container_of(h, struct ip_map, h);
148 __be32 addr = im->m_addr.s_addr; 149 __be32 addr = im->m_addr.s_addr;
149 150
150 snprintf(text_addr, 20, "%u.%u.%u.%u", 151 snprintf(text_addr, 20, "%u.%u.%u.%u",
151 ntohl(addr) >> 24 & 0xff, 152 ntohl(addr) >> 24 & 0xff,
152 ntohl(addr) >> 16 & 0xff, 153 ntohl(addr) >> 16 & 0xff,
153 ntohl(addr) >> 8 & 0xff, 154 ntohl(addr) >> 8 & 0xff,
154 ntohl(addr) >> 0 & 0xff); 155 ntohl(addr) >> 0 & 0xff);
155 156
156 qword_add(bpp, blen, im->m_class); 157 qword_add(bpp, blen, im->m_class);
157 qword_add(bpp, blen, text_addr); 158 qword_add(bpp, blen, text_addr);
158 (*bpp)[-1] = '\n'; 159 (*bpp)[-1] = '\n';
159 } 160 }
160 161
161 static struct ip_map *ip_map_lookup(char *class, struct in_addr addr); 162 static struct ip_map *ip_map_lookup(char *class, struct in_addr addr);
162 static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry); 163 static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
163 164
164 static int ip_map_parse(struct cache_detail *cd, 165 static int ip_map_parse(struct cache_detail *cd,
165 char *mesg, int mlen) 166 char *mesg, int mlen)
166 { 167 {
167 /* class ipaddress [domainname] */ 168 /* class ipaddress [domainname] */
168 /* should be safe just to use the start of the input buffer 169 /* should be safe just to use the start of the input buffer
169 * for scratch: */ 170 * for scratch: */
170 char *buf = mesg; 171 char *buf = mesg;
171 int len; 172 int len;
172 int b1,b2,b3,b4; 173 int b1,b2,b3,b4;
173 char c; 174 char c;
174 char class[8]; 175 char class[8];
175 struct in_addr addr; 176 struct in_addr addr;
176 int err; 177 int err;
177 178
178 struct ip_map *ipmp; 179 struct ip_map *ipmp;
179 struct auth_domain *dom; 180 struct auth_domain *dom;
180 time_t expiry; 181 time_t expiry;
181 182
182 if (mesg[mlen-1] != '\n') 183 if (mesg[mlen-1] != '\n')
183 return -EINVAL; 184 return -EINVAL;
184 mesg[mlen-1] = 0; 185 mesg[mlen-1] = 0;
185 186
186 /* class */ 187 /* class */
187 len = qword_get(&mesg, class, sizeof(class)); 188 len = qword_get(&mesg, class, sizeof(class));
188 if (len <= 0) return -EINVAL; 189 if (len <= 0) return -EINVAL;
189 190
190 /* ip address */ 191 /* ip address */
191 len = qword_get(&mesg, buf, mlen); 192 len = qword_get(&mesg, buf, mlen);
192 if (len <= 0) return -EINVAL; 193 if (len <= 0) return -EINVAL;
193 194
194 if (sscanf(buf, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) 195 if (sscanf(buf, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
195 return -EINVAL; 196 return -EINVAL;
196 197
197 expiry = get_expiry(&mesg); 198 expiry = get_expiry(&mesg);
198 if (expiry ==0) 199 if (expiry ==0)
199 return -EINVAL; 200 return -EINVAL;
200 201
201 /* domainname, or empty for NEGATIVE */ 202 /* domainname, or empty for NEGATIVE */
202 len = qword_get(&mesg, buf, mlen); 203 len = qword_get(&mesg, buf, mlen);
203 if (len < 0) return -EINVAL; 204 if (len < 0) return -EINVAL;
204 205
205 if (len) { 206 if (len) {
206 dom = unix_domain_find(buf); 207 dom = unix_domain_find(buf);
207 if (dom == NULL) 208 if (dom == NULL)
208 return -ENOENT; 209 return -ENOENT;
209 } else 210 } else
210 dom = NULL; 211 dom = NULL;
211 212
212 addr.s_addr = 213 addr.s_addr =
213 htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); 214 htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4);
214 215
215 ipmp = ip_map_lookup(class,addr); 216 ipmp = ip_map_lookup(class,addr);
216 if (ipmp) { 217 if (ipmp) {
217 err = ip_map_update(ipmp, 218 err = ip_map_update(ipmp,
218 container_of(dom, struct unix_domain, h), 219 container_of(dom, struct unix_domain, h),
219 expiry); 220 expiry);
220 } else 221 } else
221 err = -ENOMEM; 222 err = -ENOMEM;
222 223
223 if (dom) 224 if (dom)
224 auth_domain_put(dom); 225 auth_domain_put(dom);
225 226
226 cache_flush(); 227 cache_flush();
227 return err; 228 return err;
228 } 229 }
229 230
230 static int ip_map_show(struct seq_file *m, 231 static int ip_map_show(struct seq_file *m,
231 struct cache_detail *cd, 232 struct cache_detail *cd,
232 struct cache_head *h) 233 struct cache_head *h)
233 { 234 {
234 struct ip_map *im; 235 struct ip_map *im;
235 struct in_addr addr; 236 struct in_addr addr;
236 char *dom = "-no-domain-"; 237 char *dom = "-no-domain-";
237 238
238 if (h == NULL) { 239 if (h == NULL) {
239 seq_puts(m, "#class IP domain\n"); 240 seq_puts(m, "#class IP domain\n");
240 return 0; 241 return 0;
241 } 242 }
242 im = container_of(h, struct ip_map, h); 243 im = container_of(h, struct ip_map, h);
243 /* class addr domain */ 244 /* class addr domain */
244 addr = im->m_addr; 245 addr = im->m_addr;
245 246
246 if (test_bit(CACHE_VALID, &h->flags) && 247 if (test_bit(CACHE_VALID, &h->flags) &&
247 !test_bit(CACHE_NEGATIVE, &h->flags)) 248 !test_bit(CACHE_NEGATIVE, &h->flags))
248 dom = im->m_client->h.name; 249 dom = im->m_client->h.name;
249 250
250 seq_printf(m, "%s %d.%d.%d.%d %s\n", 251 seq_printf(m, "%s %d.%d.%d.%d %s\n",
251 im->m_class, 252 im->m_class,
252 ntohl(addr.s_addr) >> 24 & 0xff, 253 ntohl(addr.s_addr) >> 24 & 0xff,
253 ntohl(addr.s_addr) >> 16 & 0xff, 254 ntohl(addr.s_addr) >> 16 & 0xff,
254 ntohl(addr.s_addr) >> 8 & 0xff, 255 ntohl(addr.s_addr) >> 8 & 0xff,
255 ntohl(addr.s_addr) >> 0 & 0xff, 256 ntohl(addr.s_addr) >> 0 & 0xff,
256 dom 257 dom
257 ); 258 );
258 return 0; 259 return 0;
259 } 260 }
260 261
261 262
262 struct cache_detail ip_map_cache = { 263 struct cache_detail ip_map_cache = {
263 .owner = THIS_MODULE, 264 .owner = THIS_MODULE,
264 .hash_size = IP_HASHMAX, 265 .hash_size = IP_HASHMAX,
265 .hash_table = ip_table, 266 .hash_table = ip_table,
266 .name = "auth.unix.ip", 267 .name = "auth.unix.ip",
267 .cache_put = ip_map_put, 268 .cache_put = ip_map_put,
268 .cache_request = ip_map_request, 269 .cache_request = ip_map_request,
269 .cache_parse = ip_map_parse, 270 .cache_parse = ip_map_parse,
270 .cache_show = ip_map_show, 271 .cache_show = ip_map_show,
271 .match = ip_map_match, 272 .match = ip_map_match,
272 .init = ip_map_init, 273 .init = ip_map_init,
273 .update = update, 274 .update = update,
274 .alloc = ip_map_alloc, 275 .alloc = ip_map_alloc,
275 }; 276 };
276 277
277 static struct ip_map *ip_map_lookup(char *class, struct in_addr addr) 278 static struct ip_map *ip_map_lookup(char *class, struct in_addr addr)
278 { 279 {
279 struct ip_map ip; 280 struct ip_map ip;
280 struct cache_head *ch; 281 struct cache_head *ch;
281 282
282 strcpy(ip.m_class, class); 283 strcpy(ip.m_class, class);
283 ip.m_addr = addr; 284 ip.m_addr = addr;
284 ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h, 285 ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h,
285 hash_str(class, IP_HASHBITS) ^ 286 hash_str(class, IP_HASHBITS) ^
286 hash_ip((unsigned long)addr.s_addr)); 287 hash_ip((unsigned long)addr.s_addr));
287 288
288 if (ch) 289 if (ch)
289 return container_of(ch, struct ip_map, h); 290 return container_of(ch, struct ip_map, h);
290 else 291 else
291 return NULL; 292 return NULL;
292 } 293 }
293 294
294 static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry) 295 static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry)
295 { 296 {
296 struct ip_map ip; 297 struct ip_map ip;
297 struct cache_head *ch; 298 struct cache_head *ch;
298 299
299 ip.m_client = udom; 300 ip.m_client = udom;
300 ip.h.flags = 0; 301 ip.h.flags = 0;
301 if (!udom) 302 if (!udom)
302 set_bit(CACHE_NEGATIVE, &ip.h.flags); 303 set_bit(CACHE_NEGATIVE, &ip.h.flags);
303 else { 304 else {
304 ip.m_add_change = udom->addr_changes; 305 ip.m_add_change = udom->addr_changes;
305 /* if this is from the legacy set_client system call, 306 /* if this is from the legacy set_client system call,
306 * we need m_add_change to be one higher 307 * we need m_add_change to be one higher
307 */ 308 */
308 if (expiry == NEVER) 309 if (expiry == NEVER)
309 ip.m_add_change++; 310 ip.m_add_change++;
310 } 311 }
311 ip.h.expiry_time = expiry; 312 ip.h.expiry_time = expiry;
312 ch = sunrpc_cache_update(&ip_map_cache, 313 ch = sunrpc_cache_update(&ip_map_cache,
313 &ip.h, &ipm->h, 314 &ip.h, &ipm->h,
314 hash_str(ipm->m_class, IP_HASHBITS) ^ 315 hash_str(ipm->m_class, IP_HASHBITS) ^
315 hash_ip((unsigned long)ipm->m_addr.s_addr)); 316 hash_ip((unsigned long)ipm->m_addr.s_addr));
316 if (!ch) 317 if (!ch)
317 return -ENOMEM; 318 return -ENOMEM;
318 cache_put(ch, &ip_map_cache); 319 cache_put(ch, &ip_map_cache);
319 return 0; 320 return 0;
320 } 321 }
321 322
322 int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) 323 int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
323 { 324 {
324 struct unix_domain *udom; 325 struct unix_domain *udom;
325 struct ip_map *ipmp; 326 struct ip_map *ipmp;
326 327
327 if (dom->flavour != &svcauth_unix) 328 if (dom->flavour != &svcauth_unix)
328 return -EINVAL; 329 return -EINVAL;
329 udom = container_of(dom, struct unix_domain, h); 330 udom = container_of(dom, struct unix_domain, h);
330 ipmp = ip_map_lookup("nfsd", addr); 331 ipmp = ip_map_lookup("nfsd", addr);
331 332
332 if (ipmp) 333 if (ipmp)
333 return ip_map_update(ipmp, udom, NEVER); 334 return ip_map_update(ipmp, udom, NEVER);
334 else 335 else
335 return -ENOMEM; 336 return -ENOMEM;
336 } 337 }
337 338
338 int auth_unix_forget_old(struct auth_domain *dom) 339 int auth_unix_forget_old(struct auth_domain *dom)
339 { 340 {
340 struct unix_domain *udom; 341 struct unix_domain *udom;
341 342
342 if (dom->flavour != &svcauth_unix) 343 if (dom->flavour != &svcauth_unix)
343 return -EINVAL; 344 return -EINVAL;
344 udom = container_of(dom, struct unix_domain, h); 345 udom = container_of(dom, struct unix_domain, h);
345 udom->addr_changes++; 346 udom->addr_changes++;
346 return 0; 347 return 0;
347 } 348 }
348 349
349 struct auth_domain *auth_unix_lookup(struct in_addr addr) 350 struct auth_domain *auth_unix_lookup(struct in_addr addr)
350 { 351 {
351 struct ip_map *ipm; 352 struct ip_map *ipm;
352 struct auth_domain *rv; 353 struct auth_domain *rv;
353 354
354 ipm = ip_map_lookup("nfsd", addr); 355 ipm = ip_map_lookup("nfsd", addr);
355 356
356 if (!ipm) 357 if (!ipm)
357 return NULL; 358 return NULL;
358 if (cache_check(&ip_map_cache, &ipm->h, NULL)) 359 if (cache_check(&ip_map_cache, &ipm->h, NULL))
359 return NULL; 360 return NULL;
360 361
361 if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { 362 if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
362 if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0) 363 if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0)
363 auth_domain_put(&ipm->m_client->h); 364 auth_domain_put(&ipm->m_client->h);
364 rv = NULL; 365 rv = NULL;
365 } else { 366 } else {
366 rv = &ipm->m_client->h; 367 rv = &ipm->m_client->h;
367 kref_get(&rv->ref); 368 kref_get(&rv->ref);
368 } 369 }
369 cache_put(&ipm->h, &ip_map_cache); 370 cache_put(&ipm->h, &ip_map_cache);
370 return rv; 371 return rv;
371 } 372 }
372 373
373 void svcauth_unix_purge(void) 374 void svcauth_unix_purge(void)
374 { 375 {
375 cache_purge(&ip_map_cache); 376 cache_purge(&ip_map_cache);
376 } 377 }
377 378
379 static inline struct ip_map *
380 ip_map_cached_get(struct svc_rqst *rqstp)
381 {
382 struct ip_map *ipm = rqstp->rq_sock->sk_info_authunix;
383 if (ipm != NULL) {
384 if (!cache_valid(&ipm->h)) {
385 /*
386 * The entry has been invalidated since it was
387 * remembered, e.g. by a second mount from the
388 * same IP address.
389 */
390 rqstp->rq_sock->sk_info_authunix = NULL;
391 cache_put(&ipm->h, &ip_map_cache);
392 return NULL;
393 }
394 cache_get(&ipm->h);
395 }
396 return ipm;
397 }
398
399 static inline void
400 ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
401 {
402 struct svc_sock *svsk = rqstp->rq_sock;
403
404 if (svsk->sk_sock->type == SOCK_STREAM && svsk->sk_info_authunix == NULL)
405 svsk->sk_info_authunix = ipm; /* newly cached, keep the reference */
406 else
407 cache_put(&ipm->h, &ip_map_cache);
408 }
409
410 void
411 svcauth_unix_info_release(void *info)
412 {
413 struct ip_map *ipm = info;
414 cache_put(&ipm->h, &ip_map_cache);
415 }
416
378 static int 417 static int
379 svcauth_unix_set_client(struct svc_rqst *rqstp) 418 svcauth_unix_set_client(struct svc_rqst *rqstp)
380 { 419 {
381 struct ip_map *ipm; 420 struct ip_map *ipm;
382 421
383 rqstp->rq_client = NULL; 422 rqstp->rq_client = NULL;
384 if (rqstp->rq_proc == 0) 423 if (rqstp->rq_proc == 0)
385 return SVC_OK; 424 return SVC_OK;
386 425
387 ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, 426 ipm = ip_map_cached_get(rqstp);
388 rqstp->rq_addr.sin_addr); 427 if (ipm == NULL)
428 ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class,
429 rqstp->rq_addr.sin_addr);
389 430
390 if (ipm == NULL) 431 if (ipm == NULL)
391 return SVC_DENIED; 432 return SVC_DENIED;
392 433
393 switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { 434 switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
394 default: 435 default:
395 BUG(); 436 BUG();
396 case -EAGAIN: 437 case -EAGAIN:
397 return SVC_DROP; 438 return SVC_DROP;
398 case -ENOENT: 439 case -ENOENT:
399 return SVC_DENIED; 440 return SVC_DENIED;
400 case 0: 441 case 0:
401 rqstp->rq_client = &ipm->m_client->h; 442 rqstp->rq_client = &ipm->m_client->h;
402 kref_get(&rqstp->rq_client->ref); 443 kref_get(&rqstp->rq_client->ref);
403 cache_put(&ipm->h, &ip_map_cache); 444 ip_map_cached_put(rqstp, ipm);
404 break; 445 break;
405 } 446 }
406 return SVC_OK; 447 return SVC_OK;
407 } 448 }
408 449
409 static int 450 static int
410 svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) 451 svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
411 { 452 {
412 struct kvec *argv = &rqstp->rq_arg.head[0]; 453 struct kvec *argv = &rqstp->rq_arg.head[0];
413 struct kvec *resv = &rqstp->rq_res.head[0]; 454 struct kvec *resv = &rqstp->rq_res.head[0];
414 struct svc_cred *cred = &rqstp->rq_cred; 455 struct svc_cred *cred = &rqstp->rq_cred;
415 456
416 cred->cr_group_info = NULL; 457 cred->cr_group_info = NULL;
417 rqstp->rq_client = NULL; 458 rqstp->rq_client = NULL;
418 459
419 if (argv->iov_len < 3*4) 460 if (argv->iov_len < 3*4)
420 return SVC_GARBAGE; 461 return SVC_GARBAGE;
421 462
422 if (svc_getu32(argv) != 0) { 463 if (svc_getu32(argv) != 0) {
423 dprintk("svc: bad null cred\n"); 464 dprintk("svc: bad null cred\n");
424 *authp = rpc_autherr_badcred; 465 *authp = rpc_autherr_badcred;
425 return SVC_DENIED; 466 return SVC_DENIED;
426 } 467 }
427 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { 468 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
428 dprintk("svc: bad null verf\n"); 469 dprintk("svc: bad null verf\n");
429 *authp = rpc_autherr_badverf; 470 *authp = rpc_autherr_badverf;
430 return SVC_DENIED; 471 return SVC_DENIED;
431 } 472 }
432 473
433 /* Signal that mapping to nobody uid/gid is required */ 474 /* Signal that mapping to nobody uid/gid is required */
434 cred->cr_uid = (uid_t) -1; 475 cred->cr_uid = (uid_t) -1;
435 cred->cr_gid = (gid_t) -1; 476 cred->cr_gid = (gid_t) -1;
436 cred->cr_group_info = groups_alloc(0); 477 cred->cr_group_info = groups_alloc(0);
437 if (cred->cr_group_info == NULL) 478 if (cred->cr_group_info == NULL)
438 return SVC_DROP; /* kmalloc failure - client must retry */ 479 return SVC_DROP; /* kmalloc failure - client must retry */
439 480
440 /* Put NULL verifier */ 481 /* Put NULL verifier */
441 svc_putnl(resv, RPC_AUTH_NULL); 482 svc_putnl(resv, RPC_AUTH_NULL);
442 svc_putnl(resv, 0); 483 svc_putnl(resv, 0);
443 484
444 return SVC_OK; 485 return SVC_OK;
445 } 486 }
446 487
447 static int 488 static int
448 svcauth_null_release(struct svc_rqst *rqstp) 489 svcauth_null_release(struct svc_rqst *rqstp)
449 { 490 {
450 if (rqstp->rq_client) 491 if (rqstp->rq_client)
451 auth_domain_put(rqstp->rq_client); 492 auth_domain_put(rqstp->rq_client);
452 rqstp->rq_client = NULL; 493 rqstp->rq_client = NULL;
453 if (rqstp->rq_cred.cr_group_info) 494 if (rqstp->rq_cred.cr_group_info)
454 put_group_info(rqstp->rq_cred.cr_group_info); 495 put_group_info(rqstp->rq_cred.cr_group_info);
455 rqstp->rq_cred.cr_group_info = NULL; 496 rqstp->rq_cred.cr_group_info = NULL;
456 497
457 return 0; /* don't drop */ 498 return 0; /* don't drop */
458 } 499 }
459 500
460 501
461 struct auth_ops svcauth_null = { 502 struct auth_ops svcauth_null = {
462 .name = "null", 503 .name = "null",
463 .owner = THIS_MODULE, 504 .owner = THIS_MODULE,
464 .flavour = RPC_AUTH_NULL, 505 .flavour = RPC_AUTH_NULL,
465 .accept = svcauth_null_accept, 506 .accept = svcauth_null_accept,
466 .release = svcauth_null_release, 507 .release = svcauth_null_release,
467 .set_client = svcauth_unix_set_client, 508 .set_client = svcauth_unix_set_client,
468 }; 509 };
469 510
470 511
471 static int 512 static int
472 svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) 513 svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
473 { 514 {
474 struct kvec *argv = &rqstp->rq_arg.head[0]; 515 struct kvec *argv = &rqstp->rq_arg.head[0];
475 struct kvec *resv = &rqstp->rq_res.head[0]; 516 struct kvec *resv = &rqstp->rq_res.head[0];
476 struct svc_cred *cred = &rqstp->rq_cred; 517 struct svc_cred *cred = &rqstp->rq_cred;
477 u32 slen, i; 518 u32 slen, i;
478 int len = argv->iov_len; 519 int len = argv->iov_len;
479 520
480 cred->cr_group_info = NULL; 521 cred->cr_group_info = NULL;
481 rqstp->rq_client = NULL; 522 rqstp->rq_client = NULL;
482 523
483 if ((len -= 3*4) < 0) 524 if ((len -= 3*4) < 0)
484 return SVC_GARBAGE; 525 return SVC_GARBAGE;
485 526
486 svc_getu32(argv); /* length */ 527 svc_getu32(argv); /* length */
487 svc_getu32(argv); /* time stamp */ 528 svc_getu32(argv); /* time stamp */
488 slen = XDR_QUADLEN(svc_getnl(argv)); /* machname length */ 529 slen = XDR_QUADLEN(svc_getnl(argv)); /* machname length */
489 if (slen > 64 || (len -= (slen + 3)*4) < 0) 530 if (slen > 64 || (len -= (slen + 3)*4) < 0)
490 goto badcred; 531 goto badcred;
491 argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */ 532 argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */
492 argv->iov_len -= slen*4; 533 argv->iov_len -= slen*4;
493 534
494 cred->cr_uid = svc_getnl(argv); /* uid */ 535 cred->cr_uid = svc_getnl(argv); /* uid */
495 cred->cr_gid = svc_getnl(argv); /* gid */ 536 cred->cr_gid = svc_getnl(argv); /* gid */
496 slen = svc_getnl(argv); /* gids length */ 537 slen = svc_getnl(argv); /* gids length */
497 if (slen > 16 || (len -= (slen + 2)*4) < 0) 538 if (slen > 16 || (len -= (slen + 2)*4) < 0)
498 goto badcred; 539 goto badcred;
499 cred->cr_group_info = groups_alloc(slen); 540 cred->cr_group_info = groups_alloc(slen);
500 if (cred->cr_group_info == NULL) 541 if (cred->cr_group_info == NULL)
501 return SVC_DROP; 542 return SVC_DROP;
502 for (i = 0; i < slen; i++) 543 for (i = 0; i < slen; i++)
503 GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv); 544 GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv);
504 545
505 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { 546 if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
506 *authp = rpc_autherr_badverf; 547 *authp = rpc_autherr_badverf;
507 return SVC_DENIED; 548 return SVC_DENIED;
508 } 549 }
509 550
510 /* Put NULL verifier */ 551 /* Put NULL verifier */
511 svc_putnl(resv, RPC_AUTH_NULL); 552 svc_putnl(resv, RPC_AUTH_NULL);
512 svc_putnl(resv, 0); 553 svc_putnl(resv, 0);
513 554
514 return SVC_OK; 555 return SVC_OK;
515 556
516 badcred: 557 badcred:
517 *authp = rpc_autherr_badcred; 558 *authp = rpc_autherr_badcred;
518 return SVC_DENIED; 559 return SVC_DENIED;
519 } 560 }
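
The parser above walks the AUTH_UNIX credential body field by field. The sketch below is a hypothetical userspace encoder that lays out a body those checks would accept; the helper, buffer layout and sample uid/gid values are made up for illustration and are not libtirpc or kernel API:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t *put32(uint32_t *p, uint32_t v)
{
	*p++ = htonl(v);
	return p;
}

int main(void)
{
	uint32_t buf[64], *p = buf, *lenp;
	const char *name = "client1";
	uint32_t gids[2] = { 100, 101 };
	size_t nlen = strlen(name), quads = (nlen + 3) / 4;	/* XDR_QUADLEN */
	unsigned int i;

	lenp = p++;				/* credential body length, filled in below */
	p = put32(p, 0);			/* stamp (time stamp, not checked by the server) */
	p = put32(p, (uint32_t)nlen);		/* machine name length: quads must be <= 64 */
	memset(p, 0, quads * 4);
	memcpy(p, name, nlen);			/* machine name, zero-padded to a 4-byte boundary */
	p += quads;
	p = put32(p, 500);			/* uid */
	p = put32(p, 500);			/* gid */
	p = put32(p, 2);			/* number of supplementary gids, must be <= 16 */
	for (i = 0; i < 2; i++)
		p = put32(p, gids[i]);
	*lenp = htonl((uint32_t)((p - (lenp + 1)) * 4));	/* credential body length in bytes */

	p = put32(p, 0);			/* verifier flavour: RPC_AUTH_NULL */
	p = put32(p, 0);			/* verifier length: 0 */

	printf("%zu XDR words\n", (size_t)(p - buf));
	return 0;
}
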
520 561
521 static int 562 static int
522 svcauth_unix_release(struct svc_rqst *rqstp) 563 svcauth_unix_release(struct svc_rqst *rqstp)
523 { 564 {
524 /* Verifier (such as it is) is already in place. 565 /* Verifier (such as it is) is already in place.
525 */ 566 */
526 if (rqstp->rq_client) 567 if (rqstp->rq_client)
527 auth_domain_put(rqstp->rq_client); 568 auth_domain_put(rqstp->rq_client);
528 rqstp->rq_client = NULL; 569 rqstp->rq_client = NULL;
529 if (rqstp->rq_cred.cr_group_info) 570 if (rqstp->rq_cred.cr_group_info)
530 put_group_info(rqstp->rq_cred.cr_group_info); 571 put_group_info(rqstp->rq_cred.cr_group_info);
531 rqstp->rq_cred.cr_group_info = NULL; 572 rqstp->rq_cred.cr_group_info = NULL;
532 573
533 return 0; 574 return 0;
534 } 575 }
535 576
536 577
537 struct auth_ops svcauth_unix = { 578 struct auth_ops svcauth_unix = {
538 .name = "unix", 579 .name = "unix",
539 .owner = THIS_MODULE, 580 .owner = THIS_MODULE,
540 .flavour = RPC_AUTH_UNIX, 581 .flavour = RPC_AUTH_UNIX,
541 .accept = svcauth_unix_accept, 582 .accept = svcauth_unix_accept,
542 .release = svcauth_unix_release, 583 .release = svcauth_unix_release,
543 .domain_release = svcauth_unix_domain_release, 584 .domain_release = svcauth_unix_domain_release,
544 .set_client = svcauth_unix_set_client, 585 .set_client = svcauth_unix_set_client,
545 }; 586 };
546 587
547 588
net/sunrpc/svcsock.c
1 /* 1 /*
2 * linux/net/sunrpc/svcsock.c 2 * linux/net/sunrpc/svcsock.c
3 * 3 *
4 * These are the RPC server socket internals. 4 * These are the RPC server socket internals.
5 * 5 *
6 * The server scheduling algorithm does not always distribute the load 6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the 7 * evenly when servicing a single client. May need to modify the
8 * svc_sock_enqueue procedure... 8 * svc_sock_enqueue procedure...
9 * 9 *
10 * TCP support is largely untested and may be a little slow. The problem 10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte 11 * is that we currently do two separate recvfrom's, one for the 4-byte
12 * record length, and the second for the actual record. This could possibly 12 * record length, and the second for the actual record. This could possibly
13 * be improved by always reading a minimum size of around 100 bytes and 13 * be improved by always reading a minimum size of around 100 bytes and
14 * tucking any superfluous bytes away in a temporary store. Still, that 14 * tucking any superfluous bytes away in a temporary store. Still, that
15 * leaves write requests out in the rain. An alternative may be to peek at 15 * leaves write requests out in the rain. An alternative may be to peek at
16 * the first skb in the queue, and if it matches the next TCP sequence 16 * the first skb in the queue, and if it matches the next TCP sequence
17 * number, to extract the record marker. Yuck. 17 * number, to extract the record marker. Yuck.
18 * 18 *
19 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 19 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
20 */ 20 */
21 21
22 #include <linux/sched.h> 22 #include <linux/sched.h>
23 #include <linux/errno.h> 23 #include <linux/errno.h>
24 #include <linux/fcntl.h> 24 #include <linux/fcntl.h>
25 #include <linux/net.h> 25 #include <linux/net.h>
26 #include <linux/in.h> 26 #include <linux/in.h>
27 #include <linux/inet.h> 27 #include <linux/inet.h>
28 #include <linux/udp.h> 28 #include <linux/udp.h>
29 #include <linux/tcp.h> 29 #include <linux/tcp.h>
30 #include <linux/unistd.h> 30 #include <linux/unistd.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/netdevice.h> 32 #include <linux/netdevice.h>
33 #include <linux/skbuff.h> 33 #include <linux/skbuff.h>
34 #include <linux/file.h> 34 #include <linux/file.h>
35 #include <net/sock.h> 35 #include <net/sock.h>
36 #include <net/checksum.h> 36 #include <net/checksum.h>
37 #include <net/ip.h> 37 #include <net/ip.h>
38 #include <net/tcp_states.h> 38 #include <net/tcp_states.h>
39 #include <asm/uaccess.h> 39 #include <asm/uaccess.h>
40 #include <asm/ioctls.h> 40 #include <asm/ioctls.h>
41 41
42 #include <linux/sunrpc/types.h> 42 #include <linux/sunrpc/types.h>
43 #include <linux/sunrpc/xdr.h> 43 #include <linux/sunrpc/xdr.h>
44 #include <linux/sunrpc/svcsock.h> 44 #include <linux/sunrpc/svcsock.h>
45 #include <linux/sunrpc/stats.h> 45 #include <linux/sunrpc/stats.h>
46 46
47 /* SMP locking strategy: 47 /* SMP locking strategy:
48 * 48 *
49 * svc_pool->sp_lock protects most of the fields of that pool. 49 * svc_pool->sp_lock protects most of the fields of that pool.
50 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 50 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
51 * when both need to be taken (rare), svc_serv->sv_lock is first. 51 * when both need to be taken (rare), svc_serv->sv_lock is first.
52 * BKL protects svc_serv->sv_nrthread. 52 * BKL protects svc_serv->sv_nrthread.
53 * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list 53 * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list
54 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued more than once. 54 * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued more than once.
55 * 55 *
56 * Some flags can be set to certain values at any time 56 * Some flags can be set to certain values at any time
57 * providing that certain rules are followed: 57 * providing that certain rules are followed:
58 * 58 *
59 * SK_CONN, SK_DATA, can be set or cleared at any time. 59 * SK_CONN, SK_DATA, can be set or cleared at any time.
60 * after a set, svc_sock_enqueue must be called. 60 * after a set, svc_sock_enqueue must be called.
61 * after a clear, the socket must be read/accepted 61 * after a clear, the socket must be read/accepted
62 * if this succeeds, it must be set again. 62 * if this succeeds, it must be set again.
63 * SK_CLOSE can be set at any time. It is never cleared. 63 * SK_CLOSE can be set at any time. It is never cleared.
64 * 64 *
65 */ 65 */
66 66
67 #define RPCDBG_FACILITY RPCDBG_SVCSOCK 67 #define RPCDBG_FACILITY RPCDBG_SVCSOCK
68 68
69 69
70 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 70 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
71 int *errp, int pmap_reg); 71 int *errp, int pmap_reg);
72 static void svc_udp_data_ready(struct sock *, int); 72 static void svc_udp_data_ready(struct sock *, int);
73 static int svc_udp_recvfrom(struct svc_rqst *); 73 static int svc_udp_recvfrom(struct svc_rqst *);
74 static int svc_udp_sendto(struct svc_rqst *); 74 static int svc_udp_sendto(struct svc_rqst *);
75 75
76 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); 76 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
77 static int svc_deferred_recv(struct svc_rqst *rqstp); 77 static int svc_deferred_recv(struct svc_rqst *rqstp);
78 static struct cache_deferred_req *svc_defer(struct cache_req *req); 78 static struct cache_deferred_req *svc_defer(struct cache_req *req);
79 79
80 /* apparently the "standard" is that clients close 80 /* apparently the "standard" is that clients close
81 * idle connections after 5 minutes, servers after 81 * idle connections after 5 minutes, servers after
82 * 6 minutes 82 * 6 minutes
83 * http://www.connectathon.org/talks96/nfstcp.pdf 83 * http://www.connectathon.org/talks96/nfstcp.pdf
84 */ 84 */
85 static int svc_conn_age_period = 6*60; 85 static int svc_conn_age_period = 6*60;
86 86
87 /* 87 /*
88 * Queue up an idle server thread. Must have pool->sp_lock held. 88 * Queue up an idle server thread. Must have pool->sp_lock held.
89 * Note: this is really a stack rather than a queue, so that we only 89 * Note: this is really a stack rather than a queue, so that we only
90 * use as many different threads as we need, and the rest don't pollute 90 * use as many different threads as we need, and the rest don't pollute
91 * the cache. 91 * the cache.
92 */ 92 */
93 static inline void 93 static inline void
94 svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) 94 svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
95 { 95 {
96 list_add(&rqstp->rq_list, &pool->sp_threads); 96 list_add(&rqstp->rq_list, &pool->sp_threads);
97 } 97 }
98 98
99 /* 99 /*
100 * Dequeue an nfsd thread. Must have pool->sp_lock held. 100 * Dequeue an nfsd thread. Must have pool->sp_lock held.
101 */ 101 */
102 static inline void 102 static inline void
103 svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) 103 svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
104 { 104 {
105 list_del(&rqstp->rq_list); 105 list_del(&rqstp->rq_list);
106 } 106 }
107 107
108 /* 108 /*
109 * Release an skbuff after use 109 * Release an skbuff after use
110 */ 110 */
111 static inline void 111 static inline void
112 svc_release_skb(struct svc_rqst *rqstp) 112 svc_release_skb(struct svc_rqst *rqstp)
113 { 113 {
114 struct sk_buff *skb = rqstp->rq_skbuff; 114 struct sk_buff *skb = rqstp->rq_skbuff;
115 struct svc_deferred_req *dr = rqstp->rq_deferred; 115 struct svc_deferred_req *dr = rqstp->rq_deferred;
116 116
117 if (skb) { 117 if (skb) {
118 rqstp->rq_skbuff = NULL; 118 rqstp->rq_skbuff = NULL;
119 119
120 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 120 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
121 skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 121 skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
122 } 122 }
123 if (dr) { 123 if (dr) {
124 rqstp->rq_deferred = NULL; 124 rqstp->rq_deferred = NULL;
125 kfree(dr); 125 kfree(dr);
126 } 126 }
127 } 127 }
128 128
129 /* 129 /*
130 * Any space to write? 130 * Any space to write?
131 */ 131 */
132 static inline unsigned long 132 static inline unsigned long
133 svc_sock_wspace(struct svc_sock *svsk) 133 svc_sock_wspace(struct svc_sock *svsk)
134 { 134 {
135 int wspace; 135 int wspace;
136 136
137 if (svsk->sk_sock->type == SOCK_STREAM) 137 if (svsk->sk_sock->type == SOCK_STREAM)
138 wspace = sk_stream_wspace(svsk->sk_sk); 138 wspace = sk_stream_wspace(svsk->sk_sk);
139 else 139 else
140 wspace = sock_wspace(svsk->sk_sk); 140 wspace = sock_wspace(svsk->sk_sk);
141 141
142 return wspace; 142 return wspace;
143 } 143 }
144 144
145 /* 145 /*
146 * Queue up a socket with data pending. If there are idle nfsd 146 * Queue up a socket with data pending. If there are idle nfsd
147 * processes, wake 'em up. 147 * processes, wake 'em up.
148 * 148 *
149 */ 149 */
150 static void 150 static void
151 svc_sock_enqueue(struct svc_sock *svsk) 151 svc_sock_enqueue(struct svc_sock *svsk)
152 { 152 {
153 struct svc_serv *serv = svsk->sk_server; 153 struct svc_serv *serv = svsk->sk_server;
154 struct svc_pool *pool; 154 struct svc_pool *pool;
155 struct svc_rqst *rqstp; 155 struct svc_rqst *rqstp;
156 int cpu; 156 int cpu;
157 157
158 if (!(svsk->sk_flags & 158 if (!(svsk->sk_flags &
159 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) 159 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
160 return; 160 return;
161 if (test_bit(SK_DEAD, &svsk->sk_flags)) 161 if (test_bit(SK_DEAD, &svsk->sk_flags))
162 return; 162 return;
163 163
164 cpu = get_cpu(); 164 cpu = get_cpu();
165 pool = svc_pool_for_cpu(svsk->sk_server, cpu); 165 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
166 put_cpu(); 166 put_cpu();
167 167
168 spin_lock_bh(&pool->sp_lock); 168 spin_lock_bh(&pool->sp_lock);
169 169
170 if (!list_empty(&pool->sp_threads) && 170 if (!list_empty(&pool->sp_threads) &&
171 !list_empty(&pool->sp_sockets)) 171 !list_empty(&pool->sp_sockets))
172 printk(KERN_ERR 172 printk(KERN_ERR
173 "svc_sock_enqueue: threads and sockets both waiting??\n"); 173 "svc_sock_enqueue: threads and sockets both waiting??\n");
174 174
175 if (test_bit(SK_DEAD, &svsk->sk_flags)) { 175 if (test_bit(SK_DEAD, &svsk->sk_flags)) {
176 /* Don't enqueue dead sockets */ 176 /* Don't enqueue dead sockets */
177 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); 177 dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
178 goto out_unlock; 178 goto out_unlock;
179 } 179 }
180 180
181 /* Mark socket as busy. It will remain in this state until the 181 /* Mark socket as busy. It will remain in this state until the
182 * server has processed all pending data and put the socket back 182 * server has processed all pending data and put the socket back
183 * on the idle list. We update SK_BUSY atomically because 183 * on the idle list. We update SK_BUSY atomically because
184 * it also guards against trying to enqueue the svc_sock twice. 184 * it also guards against trying to enqueue the svc_sock twice.
185 */ 185 */
186 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { 186 if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
187 /* Don't enqueue socket while already enqueued */ 187 /* Don't enqueue socket while already enqueued */
188 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); 188 dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
189 goto out_unlock; 189 goto out_unlock;
190 } 190 }
191 BUG_ON(svsk->sk_pool != NULL); 191 BUG_ON(svsk->sk_pool != NULL);
192 svsk->sk_pool = pool; 192 svsk->sk_pool = pool;
193 193
194 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 194 set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
195 if (((atomic_read(&svsk->sk_reserved) + serv->sv_bufsz)*2 195 if (((atomic_read(&svsk->sk_reserved) + serv->sv_bufsz)*2
196 > svc_sock_wspace(svsk)) 196 > svc_sock_wspace(svsk))
197 && !test_bit(SK_CLOSE, &svsk->sk_flags) 197 && !test_bit(SK_CLOSE, &svsk->sk_flags)
198 && !test_bit(SK_CONN, &svsk->sk_flags)) { 198 && !test_bit(SK_CONN, &svsk->sk_flags)) {
199 /* Don't enqueue while not enough space for reply */ 199 /* Don't enqueue while not enough space for reply */
200 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", 200 dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
201 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_bufsz, 201 svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_bufsz,
202 svc_sock_wspace(svsk)); 202 svc_sock_wspace(svsk));
203 svsk->sk_pool = NULL; 203 svsk->sk_pool = NULL;
204 clear_bit(SK_BUSY, &svsk->sk_flags); 204 clear_bit(SK_BUSY, &svsk->sk_flags);
205 goto out_unlock; 205 goto out_unlock;
206 } 206 }
207 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 207 clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
208 208
209 209
210 if (!list_empty(&pool->sp_threads)) { 210 if (!list_empty(&pool->sp_threads)) {
211 rqstp = list_entry(pool->sp_threads.next, 211 rqstp = list_entry(pool->sp_threads.next,
212 struct svc_rqst, 212 struct svc_rqst,
213 rq_list); 213 rq_list);
214 dprintk("svc: socket %p served by daemon %p\n", 214 dprintk("svc: socket %p served by daemon %p\n",
215 svsk->sk_sk, rqstp); 215 svsk->sk_sk, rqstp);
216 svc_thread_dequeue(pool, rqstp); 216 svc_thread_dequeue(pool, rqstp);
217 if (rqstp->rq_sock) 217 if (rqstp->rq_sock)
218 printk(KERN_ERR 218 printk(KERN_ERR
219 "svc_sock_enqueue: server %p, rq_sock=%p!\n", 219 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
220 rqstp, rqstp->rq_sock); 220 rqstp, rqstp->rq_sock);
221 rqstp->rq_sock = svsk; 221 rqstp->rq_sock = svsk;
222 atomic_inc(&svsk->sk_inuse); 222 atomic_inc(&svsk->sk_inuse);
223 rqstp->rq_reserved = serv->sv_bufsz; 223 rqstp->rq_reserved = serv->sv_bufsz;
224 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 224 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
225 BUG_ON(svsk->sk_pool != pool); 225 BUG_ON(svsk->sk_pool != pool);
226 wake_up(&rqstp->rq_wait); 226 wake_up(&rqstp->rq_wait);
227 } else { 227 } else {
228 dprintk("svc: socket %p put into queue\n", svsk->sk_sk); 228 dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
229 list_add_tail(&svsk->sk_ready, &pool->sp_sockets); 229 list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
230 BUG_ON(svsk->sk_pool != pool); 230 BUG_ON(svsk->sk_pool != pool);
231 } 231 }
232 232
233 out_unlock: 233 out_unlock:
234 spin_unlock_bh(&pool->sp_lock); 234 spin_unlock_bh(&pool->sp_lock);
235 } 235 }
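
The write-space test above amounts to: only hand the socket to a thread if twice the worst-case outstanding reply space (what is already reserved plus one more sv_bufsz-sized reply) still fits in the socket's send buffer. A standalone sketch of that arithmetic, with made-up example numbers:

#include <stdio.h>

static int has_room(int reserved, int bufsz, long wspace)
{
	/* mirrors the check in svc_sock_enqueue(): enqueue only when
	 * (reserved + one worst-case reply) * 2 fits in the write space */
	return (long)(reserved + bufsz) * 2 <= wspace;
}

int main(void)
{
	/* e.g. a 32KB sv_bufsz, one reply already reserved, 96KB of write space */
	printf("%d\n", has_room(32768, 32768, 96 * 1024));	/* 0: not enough room, stay off the queue */
	printf("%d\n", has_room(0, 32768, 96 * 1024));		/* 1: enqueue is allowed */
	return 0;
}
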
236 236
237 /* 237 /*
238 * Dequeue the first socket. Must be called with the pool->sp_lock held. 238 * Dequeue the first socket. Must be called with the pool->sp_lock held.
239 */ 239 */
240 static inline struct svc_sock * 240 static inline struct svc_sock *
241 svc_sock_dequeue(struct svc_pool *pool) 241 svc_sock_dequeue(struct svc_pool *pool)
242 { 242 {
243 struct svc_sock *svsk; 243 struct svc_sock *svsk;
244 244
245 if (list_empty(&pool->sp_sockets)) 245 if (list_empty(&pool->sp_sockets))
246 return NULL; 246 return NULL;
247 247
248 svsk = list_entry(pool->sp_sockets.next, 248 svsk = list_entry(pool->sp_sockets.next,
249 struct svc_sock, sk_ready); 249 struct svc_sock, sk_ready);
250 list_del_init(&svsk->sk_ready); 250 list_del_init(&svsk->sk_ready);
251 251
252 dprintk("svc: socket %p dequeued, inuse=%d\n", 252 dprintk("svc: socket %p dequeued, inuse=%d\n",
253 svsk->sk_sk, atomic_read(&svsk->sk_inuse)); 253 svsk->sk_sk, atomic_read(&svsk->sk_inuse));
254 254
255 return svsk; 255 return svsk;
256 } 256 }
257 257
258 /* 258 /*
259 * Having read something from a socket, check whether it 259 * Having read something from a socket, check whether it
260 * needs to be re-enqueued. 260 * needs to be re-enqueued.
261 * Note: SK_DATA only gets cleared when a read-attempt finds 261 * Note: SK_DATA only gets cleared when a read-attempt finds
262 * no (or insufficient) data. 262 * no (or insufficient) data.
263 */ 263 */
264 static inline void 264 static inline void
265 svc_sock_received(struct svc_sock *svsk) 265 svc_sock_received(struct svc_sock *svsk)
266 { 266 {
267 svsk->sk_pool = NULL; 267 svsk->sk_pool = NULL;
268 clear_bit(SK_BUSY, &svsk->sk_flags); 268 clear_bit(SK_BUSY, &svsk->sk_flags);
269 svc_sock_enqueue(svsk); 269 svc_sock_enqueue(svsk);
270 } 270 }
271 271
272 272
273 /** 273 /**
274 * svc_reserve - change the space reserved for the reply to a request. 274 * svc_reserve - change the space reserved for the reply to a request.
275 * @rqstp: The request in question 275 * @rqstp: The request in question
276 * @space: new max space to reserve 276 * @space: new max space to reserve
277 * 277 *
278 * Each request reserves some space on the output queue of the socket 278 * Each request reserves some space on the output queue of the socket
279 * to make sure the reply fits. This function reduces that reserved 279 * to make sure the reply fits. This function reduces that reserved
280 * space to be the amount of space used already, plus @space. 280 * space to be the amount of space used already, plus @space.
281 * 281 *
282 */ 282 */
283 void svc_reserve(struct svc_rqst *rqstp, int space) 283 void svc_reserve(struct svc_rqst *rqstp, int space)
284 { 284 {
285 space += rqstp->rq_res.head[0].iov_len; 285 space += rqstp->rq_res.head[0].iov_len;
286 286
287 if (space < rqstp->rq_reserved) { 287 if (space < rqstp->rq_reserved) {
288 struct svc_sock *svsk = rqstp->rq_sock; 288 struct svc_sock *svsk = rqstp->rq_sock;
289 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); 289 atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
290 rqstp->rq_reserved = space; 290 rqstp->rq_reserved = space;
291 291
292 svc_sock_enqueue(svsk); 292 svc_sock_enqueue(svsk);
293 } 293 }
294 } 294 }
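
svc_reserve() is meant to be called once a handler knows its reply will be smaller than the worst case, so the sk_reserved accounting above releases back-pressure earlier. A simplified userspace model of that accounting (struct and field names are stand-ins, not the kernel's):

#include <stdio.h>

struct model_sock { int sk_reserved; };
struct model_rqst { struct model_sock *sock; int rq_reserved; int head_len; };

static void model_reserve(struct model_rqst *rq, int space)
{
	space += rq->head_len;			/* keep what the reply header already uses */
	if (space < rq->rq_reserved) {
		/* give the difference back to the socket-wide counter */
		rq->sock->sk_reserved -= rq->rq_reserved - space;
		rq->rq_reserved = space;
	}
}

int main(void)
{
	struct model_sock sk = { .sk_reserved = 32768 };
	struct model_rqst rq = { .sock = &sk, .rq_reserved = 32768, .head_len = 128 };

	model_reserve(&rq, 512);		/* handler knows the reply body is tiny */
	printf("rq_reserved=%d sk_reserved=%d\n", rq.rq_reserved, sk.sk_reserved);
	return 0;
}
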
295 295
296 /* 296 /*
297 * Release a socket after use. 297 * Release a socket after use.
298 */ 298 */
299 static inline void 299 static inline void
300 svc_sock_put(struct svc_sock *svsk) 300 svc_sock_put(struct svc_sock *svsk)
301 { 301 {
302 if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { 302 if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) {
303 dprintk("svc: releasing dead socket\n"); 303 dprintk("svc: releasing dead socket\n");
304 sock_release(svsk->sk_sock); 304 sock_release(svsk->sk_sock);
305 kfree(svsk); 305 kfree(svsk);
306 } 306 }
307 } 307 }
308 308
309 static void 309 static void
310 svc_sock_release(struct svc_rqst *rqstp) 310 svc_sock_release(struct svc_rqst *rqstp)
311 { 311 {
312 struct svc_sock *svsk = rqstp->rq_sock; 312 struct svc_sock *svsk = rqstp->rq_sock;
313 313
314 svc_release_skb(rqstp); 314 svc_release_skb(rqstp);
315 315
316 svc_free_res_pages(rqstp); 316 svc_free_res_pages(rqstp);
317 rqstp->rq_res.page_len = 0; 317 rqstp->rq_res.page_len = 0;
318 rqstp->rq_res.page_base = 0; 318 rqstp->rq_res.page_base = 0;
319 319
320 320
321 /* Reset response buffer and release 321 /* Reset response buffer and release
322 * the reservation. 322 * the reservation.
323 * But first, check that enough space was reserved 323 * But first, check that enough space was reserved
324 * for the reply, otherwise we have a bug! 324 * for the reply, otherwise we have a bug!
325 */ 325 */
326 if ((rqstp->rq_res.len) > rqstp->rq_reserved) 326 if ((rqstp->rq_res.len) > rqstp->rq_reserved)
327 printk(KERN_ERR "RPC request reserved %d but used %d\n", 327 printk(KERN_ERR "RPC request reserved %d but used %d\n",
328 rqstp->rq_reserved, 328 rqstp->rq_reserved,
329 rqstp->rq_res.len); 329 rqstp->rq_res.len);
330 330
331 rqstp->rq_res.head[0].iov_len = 0; 331 rqstp->rq_res.head[0].iov_len = 0;
332 svc_reserve(rqstp, 0); 332 svc_reserve(rqstp, 0);
333 rqstp->rq_sock = NULL; 333 rqstp->rq_sock = NULL;
334 334
335 svc_sock_put(svsk); 335 svc_sock_put(svsk);
336 } 336 }
337 337
338 /* 338 /*
339 * External function to wake up a server waiting for data 339 * External function to wake up a server waiting for data
340 * This really only makes sense for services like lockd 340 * This really only makes sense for services like lockd
341 * which have exactly one thread anyway. 341 * which have exactly one thread anyway.
342 */ 342 */
343 void 343 void
344 svc_wake_up(struct svc_serv *serv) 344 svc_wake_up(struct svc_serv *serv)
345 { 345 {
346 struct svc_rqst *rqstp; 346 struct svc_rqst *rqstp;
347 unsigned int i; 347 unsigned int i;
348 struct svc_pool *pool; 348 struct svc_pool *pool;
349 349
350 for (i = 0; i < serv->sv_nrpools; i++) { 350 for (i = 0; i < serv->sv_nrpools; i++) {
351 pool = &serv->sv_pools[i]; 351 pool = &serv->sv_pools[i];
352 352
353 spin_lock_bh(&pool->sp_lock); 353 spin_lock_bh(&pool->sp_lock);
354 if (!list_empty(&pool->sp_threads)) { 354 if (!list_empty(&pool->sp_threads)) {
355 rqstp = list_entry(pool->sp_threads.next, 355 rqstp = list_entry(pool->sp_threads.next,
356 struct svc_rqst, 356 struct svc_rqst,
357 rq_list); 357 rq_list);
358 dprintk("svc: daemon %p woken up.\n", rqstp); 358 dprintk("svc: daemon %p woken up.\n", rqstp);
359 /* 359 /*
360 svc_thread_dequeue(pool, rqstp); 360 svc_thread_dequeue(pool, rqstp);
361 rqstp->rq_sock = NULL; 361 rqstp->rq_sock = NULL;
362 */ 362 */
363 wake_up(&rqstp->rq_wait); 363 wake_up(&rqstp->rq_wait);
364 } 364 }
365 spin_unlock_bh(&pool->sp_lock); 365 spin_unlock_bh(&pool->sp_lock);
366 } 366 }
367 } 367 }
368 368
369 /* 369 /*
370 * Generic sendto routine 370 * Generic sendto routine
371 */ 371 */
372 static int 372 static int
373 svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) 373 svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
374 { 374 {
375 struct svc_sock *svsk = rqstp->rq_sock; 375 struct svc_sock *svsk = rqstp->rq_sock;
376 struct socket *sock = svsk->sk_sock; 376 struct socket *sock = svsk->sk_sock;
377 int slen; 377 int slen;
378 char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))]; 378 char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))];
379 struct cmsghdr *cmh = (struct cmsghdr *)buffer; 379 struct cmsghdr *cmh = (struct cmsghdr *)buffer;
380 struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh); 380 struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh);
381 int len = 0; 381 int len = 0;
382 int result; 382 int result;
383 int size; 383 int size;
384 struct page **ppage = xdr->pages; 384 struct page **ppage = xdr->pages;
385 size_t base = xdr->page_base; 385 size_t base = xdr->page_base;
386 unsigned int pglen = xdr->page_len; 386 unsigned int pglen = xdr->page_len;
387 unsigned int flags = MSG_MORE; 387 unsigned int flags = MSG_MORE;
388 388
389 slen = xdr->len; 389 slen = xdr->len;
390 390
391 if (rqstp->rq_prot == IPPROTO_UDP) { 391 if (rqstp->rq_prot == IPPROTO_UDP) {
392 /* set the source and destination */ 392 /* set the source and destination */
393 struct msghdr msg; 393 struct msghdr msg;
394 msg.msg_name = &rqstp->rq_addr; 394 msg.msg_name = &rqstp->rq_addr;
395 msg.msg_namelen = sizeof(rqstp->rq_addr); 395 msg.msg_namelen = sizeof(rqstp->rq_addr);
396 msg.msg_iov = NULL; 396 msg.msg_iov = NULL;
397 msg.msg_iovlen = 0; 397 msg.msg_iovlen = 0;
398 msg.msg_flags = MSG_MORE; 398 msg.msg_flags = MSG_MORE;
399 399
400 msg.msg_control = cmh; 400 msg.msg_control = cmh;
401 msg.msg_controllen = sizeof(buffer); 401 msg.msg_controllen = sizeof(buffer);
402 cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); 402 cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
403 cmh->cmsg_level = SOL_IP; 403 cmh->cmsg_level = SOL_IP;
404 cmh->cmsg_type = IP_PKTINFO; 404 cmh->cmsg_type = IP_PKTINFO;
405 pki->ipi_ifindex = 0; 405 pki->ipi_ifindex = 0;
406 pki->ipi_spec_dst.s_addr = rqstp->rq_daddr; 406 pki->ipi_spec_dst.s_addr = rqstp->rq_daddr;
407 407
408 if (sock_sendmsg(sock, &msg, 0) < 0) 408 if (sock_sendmsg(sock, &msg, 0) < 0)
409 goto out; 409 goto out;
410 } 410 }
411 411
412 /* send head */ 412 /* send head */
413 if (slen == xdr->head[0].iov_len) 413 if (slen == xdr->head[0].iov_len)
414 flags = 0; 414 flags = 0;
415 len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, 415 len = kernel_sendpage(sock, rqstp->rq_respages[0], 0,
416 xdr->head[0].iov_len, flags); 416 xdr->head[0].iov_len, flags);
417 if (len != xdr->head[0].iov_len) 417 if (len != xdr->head[0].iov_len)
418 goto out; 418 goto out;
419 slen -= xdr->head[0].iov_len; 419 slen -= xdr->head[0].iov_len;
420 if (slen == 0) 420 if (slen == 0)
421 goto out; 421 goto out;
422 422
423 /* send page data */ 423 /* send page data */
424 size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; 424 size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
425 while (pglen > 0) { 425 while (pglen > 0) {
426 if (slen == size) 426 if (slen == size)
427 flags = 0; 427 flags = 0;
428 result = kernel_sendpage(sock, *ppage, base, size, flags); 428 result = kernel_sendpage(sock, *ppage, base, size, flags);
429 if (result > 0) 429 if (result > 0)
430 len += result; 430 len += result;
431 if (result != size) 431 if (result != size)
432 goto out; 432 goto out;
433 slen -= size; 433 slen -= size;
434 pglen -= size; 434 pglen -= size;
435 size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; 435 size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
436 base = 0; 436 base = 0;
437 ppage++; 437 ppage++;
438 } 438 }
439 /* send tail */ 439 /* send tail */
440 if (xdr->tail[0].iov_len) { 440 if (xdr->tail[0].iov_len) {
441 result = kernel_sendpage(sock, rqstp->rq_respages[0], 441 result = kernel_sendpage(sock, rqstp->rq_respages[0],
442 ((unsigned long)xdr->tail[0].iov_base) 442 ((unsigned long)xdr->tail[0].iov_base)
443 & (PAGE_SIZE-1), 443 & (PAGE_SIZE-1),
444 xdr->tail[0].iov_len, 0); 444 xdr->tail[0].iov_len, 0);
445 445
446 if (result > 0) 446 if (result > 0)
447 len += result; 447 len += result;
448 } 448 }
449 out: 449 out:
450 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", 450 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n",
451 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, 451 rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
452 rqstp->rq_addr.sin_addr.s_addr); 452 rqstp->rq_addr.sin_addr.s_addr);
453 453
454 return len; 454 return len;
455 } 455 }
456 456
457 /* 457 /*
458 * Report socket names for nfsdfs 458 * Report socket names for nfsdfs
459 */ 459 */
460 static int one_sock_name(char *buf, struct svc_sock *svsk) 460 static int one_sock_name(char *buf, struct svc_sock *svsk)
461 { 461 {
462 int len; 462 int len;
463 463
464 switch(svsk->sk_sk->sk_family) { 464 switch(svsk->sk_sk->sk_family) {
465 case AF_INET: 465 case AF_INET:
466 len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", 466 len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n",
467 svsk->sk_sk->sk_protocol==IPPROTO_UDP? 467 svsk->sk_sk->sk_protocol==IPPROTO_UDP?
468 "udp" : "tcp", 468 "udp" : "tcp",
469 NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), 469 NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr),
470 inet_sk(svsk->sk_sk)->num); 470 inet_sk(svsk->sk_sk)->num);
471 break; 471 break;
472 default: 472 default:
473 len = sprintf(buf, "*unknown-%d*\n", 473 len = sprintf(buf, "*unknown-%d*\n",
474 svsk->sk_sk->sk_family); 474 svsk->sk_sk->sk_family);
475 } 475 }
476 return len; 476 return len;
477 } 477 }
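
Each line one_sock_name() emits has the form "ipv4 <proto> <address> <port>". A hypothetical userspace reproduction of the same format for a made-up IPv4 UDP socket (the kernel's NIPQUAD() macro expands an address into the four dotted-quad bytes printed here by hand):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t saddr = (192u << 24) | (168u << 16) | (1u << 8) | 10u;	/* 192.168.1.10, host order */
	int port = 2049;

	printf("ipv4 %s %u.%u.%u.%u %d\n", "udp",
	       (unsigned)(saddr >> 24) & 0xff, (unsigned)(saddr >> 16) & 0xff,
	       (unsigned)(saddr >> 8) & 0xff, (unsigned)saddr & 0xff, port);
	return 0;
}
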
478 478
479 int 479 int
480 svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) 480 svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
481 { 481 {
482 struct svc_sock *svsk, *closesk = NULL; 482 struct svc_sock *svsk, *closesk = NULL;
483 int len = 0; 483 int len = 0;
484 484
485 if (!serv) 485 if (!serv)
486 return 0; 486 return 0;
487 spin_lock(&serv->sv_lock); 487 spin_lock(&serv->sv_lock);
488 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 488 list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) {
489 int onelen = one_sock_name(buf+len, svsk); 489 int onelen = one_sock_name(buf+len, svsk);
490 if (toclose && strcmp(toclose, buf+len) == 0) 490 if (toclose && strcmp(toclose, buf+len) == 0)
491 closesk = svsk; 491 closesk = svsk;
492 else 492 else
493 len += onelen; 493 len += onelen;
494 } 494 }
495 spin_unlock(&serv->sv_lock); 495 spin_unlock(&serv->sv_lock);
496 if (closesk) 496 if (closesk)
497 /* Should unregister with portmap, but you cannot 497 /* Should unregister with portmap, but you cannot
498 * unregister just one protocol... 498 * unregister just one protocol...
499 */ 499 */
500 svc_delete_socket(closesk); 500 svc_delete_socket(closesk);
501 else if (toclose) 501 else if (toclose)
502 return -ENOENT; 502 return -ENOENT;
503 return len; 503 return len;
504 } 504 }
505 EXPORT_SYMBOL(svc_sock_names); 505 EXPORT_SYMBOL(svc_sock_names);
506 506
507 /* 507 /*
508 * Check input queue length 508 * Check input queue length
509 */ 509 */
510 static int 510 static int
511 svc_recv_available(struct svc_sock *svsk) 511 svc_recv_available(struct svc_sock *svsk)
512 { 512 {
513 struct socket *sock = svsk->sk_sock; 513 struct socket *sock = svsk->sk_sock;
514 int avail, err; 514 int avail, err;
515 515
516 err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail); 516 err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail);
517 517
518 return (err >= 0)? avail : err; 518 return (err >= 0)? avail : err;
519 } 519 }
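
TIOCINQ is the same "bytes waiting in the receive queue" ioctl that userspace knows as FIONREAD, so the check above can be reproduced outside the kernel. A small illustrative program, where the AF_UNIX socketpair is just a stand-in for an RPC socket:

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

static int recv_available(int fd)
{
	int avail = 0;

	if (ioctl(fd, FIONREAD, &avail) < 0)	/* TIOCINQ and FIONREAD name the same ioctl on Linux */
		return -1;
	return avail;
}

int main(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;
	if (write(sv[1], "hello", 5) != 5)
		return 1;
	printf("%d bytes queued\n", recv_available(sv[0]));	/* prints "5 bytes queued" */
	return 0;
}
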
520 520
521 /* 521 /*
522 * Generic recvfrom routine. 522 * Generic recvfrom routine.
523 */ 523 */
524 static int 524 static int
525 svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 525 svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
526 { 526 {
527 struct msghdr msg; 527 struct msghdr msg;
528 struct socket *sock; 528 struct socket *sock;
529 int len, alen; 529 int len, alen;
530 530
531 rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 531 rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
532 sock = rqstp->rq_sock->sk_sock; 532 sock = rqstp->rq_sock->sk_sock;
533 533
534 msg.msg_name = &rqstp->rq_addr; 534 msg.msg_name = &rqstp->rq_addr;
535 msg.msg_namelen = sizeof(rqstp->rq_addr); 535 msg.msg_namelen = sizeof(rqstp->rq_addr);
536 msg.msg_control = NULL; 536 msg.msg_control = NULL;
537 msg.msg_controllen = 0; 537 msg.msg_controllen = 0;
538 538
539 msg.msg_flags = MSG_DONTWAIT; 539 msg.msg_flags = MSG_DONTWAIT;
540 540
541 len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); 541 len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT);
542 542
543 /* sock_recvmsg doesn't fill in the name/namelen, so we must.. 543 /* sock_recvmsg doesn't fill in the name/namelen, so we must..
544 * possibly we should cache this in the svc_sock structure 544 * possibly we should cache this in the svc_sock structure
545 * at accept time. FIXME 545 * at accept time. FIXME
546 */ 546 */
547 alen = sizeof(rqstp->rq_addr); 547 alen = sizeof(rqstp->rq_addr);
548 kernel_getpeername(sock, (struct sockaddr *)&rqstp->rq_addr, &alen); 548 kernel_getpeername(sock, (struct sockaddr *)&rqstp->rq_addr, &alen);
549 549
550 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 550 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
551 rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); 551 rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len);
552 552
553 return len; 553 return len;
554 } 554 }
555 555
556 /* 556 /*
557 * Set socket snd and rcv buffer lengths 557 * Set socket snd and rcv buffer lengths
558 */ 558 */
559 static inline void 559 static inline void
560 svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 560 svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
561 { 561 {
562 #if 0 562 #if 0
563 mm_segment_t oldfs; 563 mm_segment_t oldfs;
564 oldfs = get_fs(); set_fs(KERNEL_DS); 564 oldfs = get_fs(); set_fs(KERNEL_DS);
565 sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, 565 sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
566 (char*)&snd, sizeof(snd)); 566 (char*)&snd, sizeof(snd));
567 sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, 567 sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
568 (char*)&rcv, sizeof(rcv)); 568 (char*)&rcv, sizeof(rcv));
569 #else 569 #else
570 /* sock_setsockopt limits use to sysctl_?mem_max, 570 /* sock_setsockopt limits use to sysctl_?mem_max,
571 * which isn't acceptable. Until that is made conditional 571 * which isn't acceptable. Until that is made conditional
572 * on not having CAP_SYS_RESOURCE or similar, we go direct... 572 * on not having CAP_SYS_RESOURCE or similar, we go direct...
573 * DaveM said I could! 573 * DaveM said I could!
574 */ 574 */
575 lock_sock(sock->sk); 575 lock_sock(sock->sk);
576 sock->sk->sk_sndbuf = snd * 2; 576 sock->sk->sk_sndbuf = snd * 2;
577 sock->sk->sk_rcvbuf = rcv * 2; 577 sock->sk->sk_rcvbuf = rcv * 2;
578 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; 578 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
579 release_sock(sock->sk); 579 release_sock(sock->sk);
580 #endif 580 #endif
581 } 581 }
582 /* 582 /*
583 * INET callback when data has been received on the socket. 583 * INET callback when data has been received on the socket.
584 */ 584 */
585 static void 585 static void
586 svc_udp_data_ready(struct sock *sk, int count) 586 svc_udp_data_ready(struct sock *sk, int count)
587 { 587 {
588 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 588 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
589 589
590 if (svsk) { 590 if (svsk) {
591 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 591 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
592 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 592 svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
593 set_bit(SK_DATA, &svsk->sk_flags); 593 set_bit(SK_DATA, &svsk->sk_flags);
594 svc_sock_enqueue(svsk); 594 svc_sock_enqueue(svsk);
595 } 595 }
596 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 596 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
597 wake_up_interruptible(sk->sk_sleep); 597 wake_up_interruptible(sk->sk_sleep);
598 } 598 }
599 599
600 /* 600 /*
601 * INET callback when space is newly available on the socket. 601 * INET callback when space is newly available on the socket.
602 */ 602 */
603 static void 603 static void
604 svc_write_space(struct sock *sk) 604 svc_write_space(struct sock *sk)
605 { 605 {
606 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 606 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
607 607
608 if (svsk) { 608 if (svsk) {
609 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 609 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
610 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 610 svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
611 svc_sock_enqueue(svsk); 611 svc_sock_enqueue(svsk);
612 } 612 }
613 613
614 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { 614 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
615 dprintk("RPC svc_write_space: someone sleeping on %p\n", 615 dprintk("RPC svc_write_space: someone sleeping on %p\n",
616 svsk); 616 svsk);
617 wake_up_interruptible(sk->sk_sleep); 617 wake_up_interruptible(sk->sk_sleep);
618 } 618 }
619 } 619 }
620 620
621 /* 621 /*
622 * Receive a datagram from a UDP socket. 622 * Receive a datagram from a UDP socket.
623 */ 623 */
624 static int 624 static int
625 svc_udp_recvfrom(struct svc_rqst *rqstp) 625 svc_udp_recvfrom(struct svc_rqst *rqstp)
626 { 626 {
627 struct svc_sock *svsk = rqstp->rq_sock; 627 struct svc_sock *svsk = rqstp->rq_sock;
628 struct svc_serv *serv = svsk->sk_server; 628 struct svc_serv *serv = svsk->sk_server;
629 struct sk_buff *skb; 629 struct sk_buff *skb;
630 int err, len; 630 int err, len;
631 631
632 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 632 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
633 /* udp sockets need large rcvbuf as all pending 633 /* udp sockets need large rcvbuf as all pending
634 * requests are still in that buffer. sndbuf must 634 * requests are still in that buffer. sndbuf must
635 * also be large enough that there is enough space 635 * also be large enough that there is enough space
636 * for one reply per thread. We count all threads 636 * for one reply per thread. We count all threads
637 * rather than threads in a particular pool, which 637 * rather than threads in a particular pool, which
638 * provides an upper bound on the number of threads 638 * provides an upper bound on the number of threads
639 * which will access the socket. 639 * which will access the socket.
640 */ 640 */
641 svc_sock_setbufsize(svsk->sk_sock, 641 svc_sock_setbufsize(svsk->sk_sock,
642 (serv->sv_nrthreads+3) * serv->sv_bufsz, 642 (serv->sv_nrthreads+3) * serv->sv_bufsz,
643 (serv->sv_nrthreads+3) * serv->sv_bufsz); 643 (serv->sv_nrthreads+3) * serv->sv_bufsz);
644 644
645 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 645 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
646 svc_sock_received(svsk); 646 svc_sock_received(svsk);
647 return svc_deferred_recv(rqstp); 647 return svc_deferred_recv(rqstp);
648 } 648 }
649 649
650 clear_bit(SK_DATA, &svsk->sk_flags); 650 clear_bit(SK_DATA, &svsk->sk_flags);
651 while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { 651 while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
652 if (err == -EAGAIN) { 652 if (err == -EAGAIN) {
653 svc_sock_received(svsk); 653 svc_sock_received(svsk);
654 return err; 654 return err;
655 } 655 }
656 /* possibly an icmp error */ 656 /* possibly an icmp error */
657 dprintk("svc: recvfrom returned error %d\n", -err); 657 dprintk("svc: recvfrom returned error %d\n", -err);
658 } 658 }
659 if (skb->tstamp.off_sec == 0) { 659 if (skb->tstamp.off_sec == 0) {
660 struct timeval tv; 660 struct timeval tv;
661 661
662 tv.tv_sec = xtime.tv_sec; 662 tv.tv_sec = xtime.tv_sec;
663 tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; 663 tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
664 skb_set_timestamp(skb, &tv); 664 skb_set_timestamp(skb, &tv);
665 /* Don't enable netstamp, sunrpc doesn't 665 /* Don't enable netstamp, sunrpc doesn't
666 need that much accuracy */ 666 need that much accuracy */
667 } 667 }
668 skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); 668 skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
669 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 669 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
670 670
671 /* 671 /*
672 * Maybe more packets - kick another thread ASAP. 672 * Maybe more packets - kick another thread ASAP.
673 */ 673 */
674 svc_sock_received(svsk); 674 svc_sock_received(svsk);
675 675
676 len = skb->len - sizeof(struct udphdr); 676 len = skb->len - sizeof(struct udphdr);
677 rqstp->rq_arg.len = len; 677 rqstp->rq_arg.len = len;
678 678
679 rqstp->rq_prot = IPPROTO_UDP; 679 rqstp->rq_prot = IPPROTO_UDP;
680 680
681 /* Get sender address */ 681 /* Get sender address */
682 rqstp->rq_addr.sin_family = AF_INET; 682 rqstp->rq_addr.sin_family = AF_INET;
683 rqstp->rq_addr.sin_port = skb->h.uh->source; 683 rqstp->rq_addr.sin_port = skb->h.uh->source;
684 rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; 684 rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr;
685 rqstp->rq_daddr = skb->nh.iph->daddr; 685 rqstp->rq_daddr = skb->nh.iph->daddr;
686 686
687 if (skb_is_nonlinear(skb)) { 687 if (skb_is_nonlinear(skb)) {
688 /* we have to copy */ 688 /* we have to copy */
689 local_bh_disable(); 689 local_bh_disable();
690 if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { 690 if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
691 local_bh_enable(); 691 local_bh_enable();
692 /* checksum error */ 692 /* checksum error */
693 skb_free_datagram(svsk->sk_sk, skb); 693 skb_free_datagram(svsk->sk_sk, skb);
694 return 0; 694 return 0;
695 } 695 }
696 local_bh_enable(); 696 local_bh_enable();
697 skb_free_datagram(svsk->sk_sk, skb); 697 skb_free_datagram(svsk->sk_sk, skb);
698 } else { 698 } else {
699 /* we can use it in-place */ 699 /* we can use it in-place */
700 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 700 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
701 rqstp->rq_arg.head[0].iov_len = len; 701 rqstp->rq_arg.head[0].iov_len = len;
702 if (skb_checksum_complete(skb)) { 702 if (skb_checksum_complete(skb)) {
703 skb_free_datagram(svsk->sk_sk, skb); 703 skb_free_datagram(svsk->sk_sk, skb);
704 return 0; 704 return 0;
705 } 705 }
706 rqstp->rq_skbuff = skb; 706 rqstp->rq_skbuff = skb;
707 } 707 }
708 708
709 rqstp->rq_arg.page_base = 0; 709 rqstp->rq_arg.page_base = 0;
710 if (len <= rqstp->rq_arg.head[0].iov_len) { 710 if (len <= rqstp->rq_arg.head[0].iov_len) {
711 rqstp->rq_arg.head[0].iov_len = len; 711 rqstp->rq_arg.head[0].iov_len = len;
712 rqstp->rq_arg.page_len = 0; 712 rqstp->rq_arg.page_len = 0;
713 rqstp->rq_respages = rqstp->rq_pages+1; 713 rqstp->rq_respages = rqstp->rq_pages+1;
714 } else { 714 } else {
715 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 715 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
716 rqstp->rq_respages = rqstp->rq_pages + 1 + 716 rqstp->rq_respages = rqstp->rq_pages + 1 +
717 (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; 717 (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE;
718 } 718 }
719 719
720 if (serv->sv_stats) 720 if (serv->sv_stats)
721 serv->sv_stats->netudpcnt++; 721 serv->sv_stats->netudpcnt++;
722 722
723 return len; 723 return len;
724 } 724 }
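
The tail of svc_udp_recvfrom() splits the datagram between the head kvec and whole argument pages, then points rq_respages just past the last page holding call data. A standalone sketch of that page arithmetic, with example sizes and a 4096-byte page assumed:

#include <stdio.h>

#define EX_PAGE_SIZE 4096

static int arg_pages(int len, int head_len)
{
	if (len <= head_len)
		return 0;			/* everything fits in the head kvec */
	/* remaining bytes rounded up to whole pages, as in the rq_respages computation */
	return (len - head_len + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;
}

int main(void)
{
	printf("%d\n", arg_pages(700, 4096));	/* 0: small call, head only */
	printf("%d\n", arg_pages(9000, 4096));	/* 2: head page plus two argument pages */
	return 0;
}
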
725 725
726 static int 726 static int
727 svc_udp_sendto(struct svc_rqst *rqstp) 727 svc_udp_sendto(struct svc_rqst *rqstp)
728 { 728 {
729 int error; 729 int error;
730 730
731 error = svc_sendto(rqstp, &rqstp->rq_res); 731 error = svc_sendto(rqstp, &rqstp->rq_res);
732 if (error == -ECONNREFUSED) 732 if (error == -ECONNREFUSED)
733 /* ICMP error on earlier request. */ 733 /* ICMP error on earlier request. */
734 error = svc_sendto(rqstp, &rqstp->rq_res); 734 error = svc_sendto(rqstp, &rqstp->rq_res);
735 735
736 return error; 736 return error;
737 } 737 }
738 738
739 static void 739 static void
740 svc_udp_init(struct svc_sock *svsk) 740 svc_udp_init(struct svc_sock *svsk)
741 { 741 {
742 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 742 svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
743 svsk->sk_sk->sk_write_space = svc_write_space; 743 svsk->sk_sk->sk_write_space = svc_write_space;
744 svsk->sk_recvfrom = svc_udp_recvfrom; 744 svsk->sk_recvfrom = svc_udp_recvfrom;
745 svsk->sk_sendto = svc_udp_sendto; 745 svsk->sk_sendto = svc_udp_sendto;
746 746
747 /* initial setting: must have enough space to 747 /* initial setting: must have enough space to
748 * receive and respond to one request. 748 * receive and respond to one request.
749 * svc_udp_recvfrom will re-adjust if necessary 749 * svc_udp_recvfrom will re-adjust if necessary
750 */ 750 */
751 svc_sock_setbufsize(svsk->sk_sock, 751 svc_sock_setbufsize(svsk->sk_sock,
752 3 * svsk->sk_server->sv_bufsz, 752 3 * svsk->sk_server->sv_bufsz,
753 3 * svsk->sk_server->sv_bufsz); 753 3 * svsk->sk_server->sv_bufsz);
754 754
755 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 755 set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
756 set_bit(SK_CHNGBUF, &svsk->sk_flags); 756 set_bit(SK_CHNGBUF, &svsk->sk_flags);
757 } 757 }
758 758
759 /* 759 /*
760 * A data_ready event on a listening socket means there's a connection 760 * A data_ready event on a listening socket means there's a connection
761 * pending. Do not use state_change as a substitute for it. 761 * pending. Do not use state_change as a substitute for it.
762 */ 762 */
763 static void 763 static void
764 svc_tcp_listen_data_ready(struct sock *sk, int count_unused) 764 svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
765 { 765 {
766 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 766 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
767 767
768 dprintk("svc: socket %p TCP (listen) state change %d\n", 768 dprintk("svc: socket %p TCP (listen) state change %d\n",
769 sk, sk->sk_state); 769 sk, sk->sk_state);
770 770
771 /* 771 /*
772 * This callback may be called twice when a new connection 772 * This callback may be called twice when a new connection
773 * is established as a child socket inherits everything 773 * is established as a child socket inherits everything
774 * from a parent LISTEN socket. 774 * from a parent LISTEN socket.
775 * 1) data_ready method of the parent socket will be called 775 * 1) data_ready method of the parent socket will be called
776 * when one of the child sockets becomes ESTABLISHED. 776 * when one of the child sockets becomes ESTABLISHED.
777 * 2) data_ready method of the child socket may be called 777 * 2) data_ready method of the child socket may be called
778 * when it receives data before the socket is accepted. 778 * when it receives data before the socket is accepted.
779 * In case of 2, we should ignore it silently. 779 * In case of 2, we should ignore it silently.
780 */ 780 */
781 if (sk->sk_state == TCP_LISTEN) { 781 if (sk->sk_state == TCP_LISTEN) {
782 if (svsk) { 782 if (svsk) {
783 set_bit(SK_CONN, &svsk->sk_flags); 783 set_bit(SK_CONN, &svsk->sk_flags);
784 svc_sock_enqueue(svsk); 784 svc_sock_enqueue(svsk);
785 } else 785 } else
786 printk("svc: socket %p: no user data\n", sk); 786 printk("svc: socket %p: no user data\n", sk);
787 } 787 }
788 788
789 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 789 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
790 wake_up_interruptible_all(sk->sk_sleep); 790 wake_up_interruptible_all(sk->sk_sleep);
791 } 791 }
792 792
793 /* 793 /*
794 * A state change on a connected socket means it's dying or dead. 794 * A state change on a connected socket means it's dying or dead.
795 */ 795 */
796 static void 796 static void
797 svc_tcp_state_change(struct sock *sk) 797 svc_tcp_state_change(struct sock *sk)
798 { 798 {
799 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 799 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
800 800
801 dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", 801 dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
802 sk, sk->sk_state, sk->sk_user_data); 802 sk, sk->sk_state, sk->sk_user_data);
803 803
804 if (!svsk) 804 if (!svsk)
805 printk("svc: socket %p: no user data\n", sk); 805 printk("svc: socket %p: no user data\n", sk);
806 else { 806 else {
807 set_bit(SK_CLOSE, &svsk->sk_flags); 807 set_bit(SK_CLOSE, &svsk->sk_flags);
808 svc_sock_enqueue(svsk); 808 svc_sock_enqueue(svsk);
809 } 809 }
810 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 810 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
811 wake_up_interruptible_all(sk->sk_sleep); 811 wake_up_interruptible_all(sk->sk_sleep);
812 } 812 }
813 813
814 static void 814 static void
815 svc_tcp_data_ready(struct sock *sk, int count) 815 svc_tcp_data_ready(struct sock *sk, int count)
816 { 816 {
817 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 817 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
818 818
819 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 819 dprintk("svc: socket %p TCP data ready (svsk %p)\n",
820 sk, sk->sk_user_data); 820 sk, sk->sk_user_data);
821 if (svsk) { 821 if (svsk) {
822 set_bit(SK_DATA, &svsk->sk_flags); 822 set_bit(SK_DATA, &svsk->sk_flags);
823 svc_sock_enqueue(svsk); 823 svc_sock_enqueue(svsk);
824 } 824 }
825 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 825 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
826 wake_up_interruptible(sk->sk_sleep); 826 wake_up_interruptible(sk->sk_sleep);
827 } 827 }
828 828
829 /* 829 /*
830 * Accept a TCP connection 830 * Accept a TCP connection
831 */ 831 */
832 static void 832 static void
833 svc_tcp_accept(struct svc_sock *svsk) 833 svc_tcp_accept(struct svc_sock *svsk)
834 { 834 {
835 struct sockaddr_in sin; 835 struct sockaddr_in sin;
836 struct svc_serv *serv = svsk->sk_server; 836 struct svc_serv *serv = svsk->sk_server;
837 struct socket *sock = svsk->sk_sock; 837 struct socket *sock = svsk->sk_sock;
838 struct socket *newsock; 838 struct socket *newsock;
839 struct svc_sock *newsvsk; 839 struct svc_sock *newsvsk;
840 int err, slen; 840 int err, slen;
841 841
842 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 842 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
843 if (!sock) 843 if (!sock)
844 return; 844 return;
845 845
846 clear_bit(SK_CONN, &svsk->sk_flags); 846 clear_bit(SK_CONN, &svsk->sk_flags);
847 err = kernel_accept(sock, &newsock, O_NONBLOCK); 847 err = kernel_accept(sock, &newsock, O_NONBLOCK);
848 if (err < 0) { 848 if (err < 0) {
849 if (err == -ENOMEM) 849 if (err == -ENOMEM)
850 printk(KERN_WARNING "%s: no more sockets!\n", 850 printk(KERN_WARNING "%s: no more sockets!\n",
851 serv->sv_name); 851 serv->sv_name);
852 else if (err != -EAGAIN && net_ratelimit()) 852 else if (err != -EAGAIN && net_ratelimit())
853 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 853 printk(KERN_WARNING "%s: accept failed (err %d)!\n",
854 serv->sv_name, -err); 854 serv->sv_name, -err);
855 return; 855 return;
856 } 856 }
857 857
858 set_bit(SK_CONN, &svsk->sk_flags); 858 set_bit(SK_CONN, &svsk->sk_flags);
859 svc_sock_enqueue(svsk); 859 svc_sock_enqueue(svsk);
860 860
861 slen = sizeof(sin); 861 slen = sizeof(sin);
862 err = kernel_getpeername(newsock, (struct sockaddr *) &sin, &slen); 862 err = kernel_getpeername(newsock, (struct sockaddr *) &sin, &slen);
863 if (err < 0) { 863 if (err < 0) {
864 if (net_ratelimit()) 864 if (net_ratelimit())
865 printk(KERN_WARNING "%s: peername failed (err %d)!\n", 865 printk(KERN_WARNING "%s: peername failed (err %d)!\n",
866 serv->sv_name, -err); 866 serv->sv_name, -err);
867 goto failed; /* aborted connection or whatever */ 867 goto failed; /* aborted connection or whatever */
868 } 868 }
869 869
870 /* Ideally, we would want to reject connections from unauthorized 870 /* Ideally, we would want to reject connections from unauthorized
871 * hosts here, but when we get encryption, the IP of the host won't 871 * hosts here, but when we get encryption, the IP of the host won't
872 * tell us anything. For now just warn about unpriv connections. 872 * tell us anything. For now just warn about unpriv connections.
873 */ 873 */
874 if (ntohs(sin.sin_port) >= 1024) { 874 if (ntohs(sin.sin_port) >= 1024) {
875 dprintk(KERN_WARNING 875 dprintk(KERN_WARNING
876 "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", 876 "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n",
877 serv->sv_name, 877 serv->sv_name,
878 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 878 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
879 } 879 }
880 880
881 dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, 881 dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name,
882 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 882 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
883 883
884 /* make sure that a write doesn't block forever when 884 /* make sure that a write doesn't block forever when
885 * low on memory 885 * low on memory
886 */ 886 */
887 newsock->sk->sk_sndtimeo = HZ*30; 887 newsock->sk->sk_sndtimeo = HZ*30;
888 888
889 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) 889 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0)))
890 goto failed; 890 goto failed;
891 891
892 892
893 /* make sure that we don't have too many active connections. 893 /* make sure that we don't have too many active connections.
894 * If we have, something must be dropped. 894 * If we have, something must be dropped.
895 * 895 *
896 * There's no point in trying to do random drop here for 896 * There's no point in trying to do random drop here for
897 * DoS prevention. The NFS clients do 1 reconnect in 15 897 * DoS prevention. The NFS clients do 1 reconnect in 15
898 * seconds. An attacker can easily beat that. 898 * seconds. An attacker can easily beat that.
899 * 899 *
900 * The only somewhat efficient mechanism would be to drop 900 * The only somewhat efficient mechanism would be to drop
901 * old connections from the same IP first. But right now 901 * old connections from the same IP first. But right now
902 * we don't even record the client IP in svc_sock. 902 * we don't even record the client IP in svc_sock.
903 */ 903 */
904 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { 904 if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
905 struct svc_sock *svsk = NULL; 905 struct svc_sock *svsk = NULL;
906 spin_lock_bh(&serv->sv_lock); 906 spin_lock_bh(&serv->sv_lock);
907 if (!list_empty(&serv->sv_tempsocks)) { 907 if (!list_empty(&serv->sv_tempsocks)) {
908 if (net_ratelimit()) { 908 if (net_ratelimit()) {
909 /* Try to help the admin */ 909 /* Try to help the admin */
910 printk(KERN_NOTICE "%s: too many open TCP " 910 printk(KERN_NOTICE "%s: too many open TCP "
911 "sockets, consider increasing the " 911 "sockets, consider increasing the "
912 "number of nfsd threads\n", 912 "number of nfsd threads\n",
913 serv->sv_name); 913 serv->sv_name);
914 printk(KERN_NOTICE "%s: last TCP connect from " 914 printk(KERN_NOTICE "%s: last TCP connect from "
915 "%u.%u.%u.%u:%d\n", 915 "%u.%u.%u.%u:%d\n",
916 serv->sv_name, 916 serv->sv_name,
917 NIPQUAD(sin.sin_addr.s_addr), 917 NIPQUAD(sin.sin_addr.s_addr),
918 ntohs(sin.sin_port)); 918 ntohs(sin.sin_port));
919 } 919 }
920 /* 920 /*
921 * Always select the oldest socket. It's not fair, 921 * Always select the oldest socket. It's not fair,
922 * but so is life 922 * but so is life
923 */ 923 */
924 svsk = list_entry(serv->sv_tempsocks.prev, 924 svsk = list_entry(serv->sv_tempsocks.prev,
925 struct svc_sock, 925 struct svc_sock,
926 sk_list); 926 sk_list);
927 set_bit(SK_CLOSE, &svsk->sk_flags); 927 set_bit(SK_CLOSE, &svsk->sk_flags);
928 atomic_inc(&svsk->sk_inuse); 928 atomic_inc(&svsk->sk_inuse);
929 } 929 }
930 spin_unlock_bh(&serv->sv_lock); 930 spin_unlock_bh(&serv->sv_lock);
931 931
932 if (svsk) { 932 if (svsk) {
933 svc_sock_enqueue(svsk); 933 svc_sock_enqueue(svsk);
934 svc_sock_put(svsk); 934 svc_sock_put(svsk);
935 } 935 }
936 936
937 } 937 }
938 938
939 if (serv->sv_stats) 939 if (serv->sv_stats)
940 serv->sv_stats->nettcpconn++; 940 serv->sv_stats->nettcpconn++;
941 941
942 return; 942 return;
943 943
944 failed: 944 failed:
945 sock_release(newsock); 945 sock_release(newsock);
946 return; 946 return;
947 } 947 }
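The comment above spells out the connection-limiting policy: temporary (per-connection) sockets are capped at roughly 20 per nfsd thread plus a little slack, and when the cap is exceeded the oldest entry on sv_tempsocks is marked SK_CLOSE rather than attempting random drop. A minimal standalone sketch of the same arithmetic, with hypothetical names (not part of this diff):

    /* With 8 nfsd threads, for example, the cap works out to (8 + 3) * 20 = 220. */
    static int too_many_tcp_connections(unsigned int tmp_socks, unsigned int threads)
    {
            /* mirrors: serv->sv_tmpcnt > (serv->sv_nrthreads + 3) * 20 */
            return tmp_socks > (threads + 3) * 20;
    }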
948 948
949 /* 949 /*
950 * Receive data from a TCP socket. 950 * Receive data from a TCP socket.
951 */ 951 */
952 static int 952 static int
953 svc_tcp_recvfrom(struct svc_rqst *rqstp) 953 svc_tcp_recvfrom(struct svc_rqst *rqstp)
954 { 954 {
955 struct svc_sock *svsk = rqstp->rq_sock; 955 struct svc_sock *svsk = rqstp->rq_sock;
956 struct svc_serv *serv = svsk->sk_server; 956 struct svc_serv *serv = svsk->sk_server;
957 int len; 957 int len;
958 struct kvec *vec; 958 struct kvec *vec;
959 int pnum, vlen; 959 int pnum, vlen;
960 960
961 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 961 dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
962 svsk, test_bit(SK_DATA, &svsk->sk_flags), 962 svsk, test_bit(SK_DATA, &svsk->sk_flags),
963 test_bit(SK_CONN, &svsk->sk_flags), 963 test_bit(SK_CONN, &svsk->sk_flags),
964 test_bit(SK_CLOSE, &svsk->sk_flags)); 964 test_bit(SK_CLOSE, &svsk->sk_flags));
965 965
966 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 966 if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
967 svc_sock_received(svsk); 967 svc_sock_received(svsk);
968 return svc_deferred_recv(rqstp); 968 return svc_deferred_recv(rqstp);
969 } 969 }
970 970
971 if (test_bit(SK_CLOSE, &svsk->sk_flags)) { 971 if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
972 svc_delete_socket(svsk); 972 svc_delete_socket(svsk);
973 return 0; 973 return 0;
974 } 974 }
975 975
976 if (test_bit(SK_CONN, &svsk->sk_flags)) { 976 if (test_bit(SK_CONN, &svsk->sk_flags)) {
977 svc_tcp_accept(svsk); 977 svc_tcp_accept(svsk);
978 svc_sock_received(svsk); 978 svc_sock_received(svsk);
979 return 0; 979 return 0;
980 } 980 }
981 981
982 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 982 if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
983 /* sndbuf needs to have room for one request 983 /* sndbuf needs to have room for one request
984 * per thread, otherwise we can stall even when the 984 * per thread, otherwise we can stall even when the
985 * network isn't a bottleneck. 985 * network isn't a bottleneck.
986 * 986 *
987 * We count all threads rather than threads in a 987 * We count all threads rather than threads in a
988 * particular pool, which provides an upper bound 988 * particular pool, which provides an upper bound
989 * on the number of threads which will access the socket. 989 * on the number of threads which will access the socket.
990 * 990 *
991 * rcvbuf just needs to be able to hold a few requests. 991 * rcvbuf just needs to be able to hold a few requests.
992 * Normally they will be removed from the queue 992 * Normally they will be removed from the queue
993 * as soon as a complete request arrives. 993 * as soon as a complete request arrives.
994 */ 994 */
995 svc_sock_setbufsize(svsk->sk_sock, 995 svc_sock_setbufsize(svsk->sk_sock,
996 (serv->sv_nrthreads+3) * serv->sv_bufsz, 996 (serv->sv_nrthreads+3) * serv->sv_bufsz,
997 3 * serv->sv_bufsz); 997 3 * serv->sv_bufsz);
998 998
999 clear_bit(SK_DATA, &svsk->sk_flags); 999 clear_bit(SK_DATA, &svsk->sk_flags);
1000 1000
1001 /* Receive data. If we haven't got the record length yet, get 1001 /* Receive data. If we haven't got the record length yet, get
1002 * the next four bytes. Otherwise try to gobble up as much as 1002 * the next four bytes. Otherwise try to gobble up as much as
1003 * possible up to the complete record length. 1003 * possible up to the complete record length.
1004 */ 1004 */
1005 if (svsk->sk_tcplen < 4) { 1005 if (svsk->sk_tcplen < 4) {
1006 unsigned long want = 4 - svsk->sk_tcplen; 1006 unsigned long want = 4 - svsk->sk_tcplen;
1007 struct kvec iov; 1007 struct kvec iov;
1008 1008
1009 iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; 1009 iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
1010 iov.iov_len = want; 1010 iov.iov_len = want;
1011 if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) 1011 if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
1012 goto error; 1012 goto error;
1013 svsk->sk_tcplen += len; 1013 svsk->sk_tcplen += len;
1014 1014
1015 if (len < want) { 1015 if (len < want) {
1016 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 1016 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
1017 len, want); 1017 len, want);
1018 svc_sock_received(svsk); 1018 svc_sock_received(svsk);
1019 return -EAGAIN; /* record header not complete */ 1019 return -EAGAIN; /* record header not complete */
1020 } 1020 }
1021 1021
1022 svsk->sk_reclen = ntohl(svsk->sk_reclen); 1022 svsk->sk_reclen = ntohl(svsk->sk_reclen);
1023 if (!(svsk->sk_reclen & 0x80000000)) { 1023 if (!(svsk->sk_reclen & 0x80000000)) {
1024 /* FIXME: technically, a record can be fragmented, 1024 /* FIXME: technically, a record can be fragmented,
1025 * and non-terminal fragments will not have the top 1025 * and non-terminal fragments will not have the top
1026 * bit set in the fragment length header. 1026 * bit set in the fragment length header.
1027 * But apparently no known nfs clients send fragmented 1027 * But apparently no known nfs clients send fragmented
1028 * records. */ 1028 * records. */
1029 printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n", 1029 printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n",
1030 (unsigned long) svsk->sk_reclen); 1030 (unsigned long) svsk->sk_reclen);
1031 goto err_delete; 1031 goto err_delete;
1032 } 1032 }
1033 svsk->sk_reclen &= 0x7fffffff; 1033 svsk->sk_reclen &= 0x7fffffff;
1034 dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); 1034 dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
1035 if (svsk->sk_reclen > serv->sv_bufsz) { 1035 if (svsk->sk_reclen > serv->sv_bufsz) {
1036 printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", 1036 printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n",
1037 (unsigned long) svsk->sk_reclen); 1037 (unsigned long) svsk->sk_reclen);
1038 goto err_delete; 1038 goto err_delete;
1039 } 1039 }
1040 } 1040 }
1041 1041
1042 /* Check whether enough data is available */ 1042 /* Check whether enough data is available */
1043 len = svc_recv_available(svsk); 1043 len = svc_recv_available(svsk);
1044 if (len < 0) 1044 if (len < 0)
1045 goto error; 1045 goto error;
1046 1046
1047 if (len < svsk->sk_reclen) { 1047 if (len < svsk->sk_reclen) {
1048 dprintk("svc: incomplete TCP record (%d of %d)\n", 1048 dprintk("svc: incomplete TCP record (%d of %d)\n",
1049 len, svsk->sk_reclen); 1049 len, svsk->sk_reclen);
1050 svc_sock_received(svsk); 1050 svc_sock_received(svsk);
1051 return -EAGAIN; /* record not complete */ 1051 return -EAGAIN; /* record not complete */
1052 } 1052 }
1053 len = svsk->sk_reclen; 1053 len = svsk->sk_reclen;
1054 set_bit(SK_DATA, &svsk->sk_flags); 1054 set_bit(SK_DATA, &svsk->sk_flags);
1055 1055
1056 vec = rqstp->rq_vec; 1056 vec = rqstp->rq_vec;
1057 vec[0] = rqstp->rq_arg.head[0]; 1057 vec[0] = rqstp->rq_arg.head[0];
1058 vlen = PAGE_SIZE; 1058 vlen = PAGE_SIZE;
1059 pnum = 1; 1059 pnum = 1;
1060 while (vlen < len) { 1060 while (vlen < len) {
1061 vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]); 1061 vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]);
1062 vec[pnum].iov_len = PAGE_SIZE; 1062 vec[pnum].iov_len = PAGE_SIZE;
1063 pnum++; 1063 pnum++;
1064 vlen += PAGE_SIZE; 1064 vlen += PAGE_SIZE;
1065 } 1065 }
1066 rqstp->rq_respages = &rqstp->rq_pages[pnum]; 1066 rqstp->rq_respages = &rqstp->rq_pages[pnum];
1067 1067
1068 /* Now receive data */ 1068 /* Now receive data */
1069 len = svc_recvfrom(rqstp, vec, pnum, len); 1069 len = svc_recvfrom(rqstp, vec, pnum, len);
1070 if (len < 0) 1070 if (len < 0)
1071 goto error; 1071 goto error;
1072 1072
1073 dprintk("svc: TCP complete record (%d bytes)\n", len); 1073 dprintk("svc: TCP complete record (%d bytes)\n", len);
1074 rqstp->rq_arg.len = len; 1074 rqstp->rq_arg.len = len;
1075 rqstp->rq_arg.page_base = 0; 1075 rqstp->rq_arg.page_base = 0;
1076 if (len <= rqstp->rq_arg.head[0].iov_len) { 1076 if (len <= rqstp->rq_arg.head[0].iov_len) {
1077 rqstp->rq_arg.head[0].iov_len = len; 1077 rqstp->rq_arg.head[0].iov_len = len;
1078 rqstp->rq_arg.page_len = 0; 1078 rqstp->rq_arg.page_len = 0;
1079 } else { 1079 } else {
1080 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 1080 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
1081 } 1081 }
1082 1082
1083 rqstp->rq_skbuff = NULL; 1083 rqstp->rq_skbuff = NULL;
1084 rqstp->rq_prot = IPPROTO_TCP; 1084 rqstp->rq_prot = IPPROTO_TCP;
1085 1085
1086 /* Reset TCP read info */ 1086 /* Reset TCP read info */
1087 svsk->sk_reclen = 0; 1087 svsk->sk_reclen = 0;
1088 svsk->sk_tcplen = 0; 1088 svsk->sk_tcplen = 0;
1089 1089
1090 svc_sock_received(svsk); 1090 svc_sock_received(svsk);
1091 if (serv->sv_stats) 1091 if (serv->sv_stats)
1092 serv->sv_stats->nettcpcnt++; 1092 serv->sv_stats->nettcpcnt++;
1093 1093
1094 return len; 1094 return len;
1095 1095
1096 err_delete: 1096 err_delete:
1097 svc_delete_socket(svsk); 1097 svc_delete_socket(svsk);
1098 return -EAGAIN; 1098 return -EAGAIN;
1099 1099
1100 error: 1100 error:
1101 if (len == -EAGAIN) { 1101 if (len == -EAGAIN) {
1102 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 1102 dprintk("RPC: TCP recvfrom got EAGAIN\n");
1103 svc_sock_received(svsk); 1103 svc_sock_received(svsk);
1104 } else { 1104 } else {
1105 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 1105 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1106 svsk->sk_server->sv_name, -len); 1106 svsk->sk_server->sv_name, -len);
1107 goto err_delete; 1107 goto err_delete;
1108 } 1108 }
1109 1109
1110 return len; 1110 return len;
1111 } 1111 }
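The four-byte value that svc_tcp_recvfrom() reads into sk_reclen is the standard RPC-over-TCP record marker: a big-endian word whose top bit flags the last fragment and whose low 31 bits give the fragment length, which is why the code tests 0x80000000 and masks with 0x7fffffff. A small userspace sketch of the decoding, for reference only:

    #include <stdint.h>

    struct rec_marker {
            int      last;  /* non-zero if this is the final fragment */
            uint32_t len;   /* fragment length in bytes */
    };

    /* Decode the big-endian marker, mirroring ntohl() plus the bit tests above. */
    static struct rec_marker decode_marker(const unsigned char hdr[4])
    {
            uint32_t raw = ((uint32_t)hdr[0] << 24) | ((uint32_t)hdr[1] << 16) |
                           ((uint32_t)hdr[2] << 8)  |  (uint32_t)hdr[3];
            struct rec_marker m;

            m.last = (raw & 0x80000000u) != 0;  /* top bit: last fragment */
            m.len  = raw & 0x7fffffffu;         /* low 31 bits: length    */
            return m;
    }

A marker of 0x800000a4, for instance, announces a single final fragment of 0xa4 = 164 bytes.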
1112 1112
1113 /* 1113 /*
1114 * Send out data on TCP socket. 1114 * Send out data on TCP socket.
1115 */ 1115 */
1116 static int 1116 static int
1117 svc_tcp_sendto(struct svc_rqst *rqstp) 1117 svc_tcp_sendto(struct svc_rqst *rqstp)
1118 { 1118 {
1119 struct xdr_buf *xbufp = &rqstp->rq_res; 1119 struct xdr_buf *xbufp = &rqstp->rq_res;
1120 int sent; 1120 int sent;
1121 __be32 reclen; 1121 __be32 reclen;
1122 1122
1123 /* Set up the first element of the reply kvec. 1123 /* Set up the first element of the reply kvec.
1124 * Any other kvecs that may be in use have been taken 1124 * Any other kvecs that may be in use have been taken
1125 * care of by the server implementation itself. 1125 * care of by the server implementation itself.
1126 */ 1126 */
1127 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 1127 reclen = htonl(0x80000000|((xbufp->len ) - 4));
1128 memcpy(xbufp->head[0].iov_base, &reclen, 4); 1128 memcpy(xbufp->head[0].iov_base, &reclen, 4);
1129 1129
1130 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 1130 if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags))
1131 return -ENOTCONN; 1131 return -ENOTCONN;
1132 1132
1133 sent = svc_sendto(rqstp, &rqstp->rq_res); 1133 sent = svc_sendto(rqstp, &rqstp->rq_res);
1134 if (sent != xbufp->len) { 1134 if (sent != xbufp->len) {
1135 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 1135 printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
1136 rqstp->rq_sock->sk_server->sv_name, 1136 rqstp->rq_sock->sk_server->sv_name,
1137 (sent<0)?"got error":"sent only", 1137 (sent<0)?"got error":"sent only",
1138 sent, xbufp->len); 1138 sent, xbufp->len);
1139 svc_delete_socket(rqstp->rq_sock); 1139 svc_delete_socket(rqstp->rq_sock);
1140 sent = -EAGAIN; 1140 sent = -EAGAIN;
1141 } 1141 }
1142 return sent; 1142 return sent;
1143 } 1143 }
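On the send side the same record marker occupies the first four bytes of the reply's head iovec, so svc_tcp_sendto() encodes the buffer length minus those four bytes and sets the final-fragment bit. A matching sketch of the encoder, illustration only:

    #include <stdint.h>

    /* Encode a single, final fragment of payload_len bytes, as svc_tcp_sendto()
     * does with htonl(0x80000000 | (xbufp->len - 4)). */
    static void encode_marker(unsigned char hdr[4], uint32_t payload_len)
    {
            uint32_t raw = 0x80000000u | payload_len;

            hdr[0] = (unsigned char)(raw >> 24);
            hdr[1] = (unsigned char)(raw >> 16);
            hdr[2] = (unsigned char)(raw >> 8);
            hdr[3] = (unsigned char)raw;
    }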
1144 1144
1145 static void 1145 static void
1146 svc_tcp_init(struct svc_sock *svsk) 1146 svc_tcp_init(struct svc_sock *svsk)
1147 { 1147 {
1148 struct sock *sk = svsk->sk_sk; 1148 struct sock *sk = svsk->sk_sk;
1149 struct tcp_sock *tp = tcp_sk(sk); 1149 struct tcp_sock *tp = tcp_sk(sk);
1150 1150
1151 svsk->sk_recvfrom = svc_tcp_recvfrom; 1151 svsk->sk_recvfrom = svc_tcp_recvfrom;
1152 svsk->sk_sendto = svc_tcp_sendto; 1152 svsk->sk_sendto = svc_tcp_sendto;
1153 1153
1154 if (sk->sk_state == TCP_LISTEN) { 1154 if (sk->sk_state == TCP_LISTEN) {
1155 dprintk("setting up TCP socket for listening\n"); 1155 dprintk("setting up TCP socket for listening\n");
1156 sk->sk_data_ready = svc_tcp_listen_data_ready; 1156 sk->sk_data_ready = svc_tcp_listen_data_ready;
1157 set_bit(SK_CONN, &svsk->sk_flags); 1157 set_bit(SK_CONN, &svsk->sk_flags);
1158 } else { 1158 } else {
1159 dprintk("setting up TCP socket for reading\n"); 1159 dprintk("setting up TCP socket for reading\n");
1160 sk->sk_state_change = svc_tcp_state_change; 1160 sk->sk_state_change = svc_tcp_state_change;
1161 sk->sk_data_ready = svc_tcp_data_ready; 1161 sk->sk_data_ready = svc_tcp_data_ready;
1162 sk->sk_write_space = svc_write_space; 1162 sk->sk_write_space = svc_write_space;
1163 1163
1164 svsk->sk_reclen = 0; 1164 svsk->sk_reclen = 0;
1165 svsk->sk_tcplen = 0; 1165 svsk->sk_tcplen = 0;
1166 1166
1167 tp->nonagle = 1; /* disable Nagle's algorithm */ 1167 tp->nonagle = 1; /* disable Nagle's algorithm */
1168 1168
1169 /* initial setting must have enough space to 1169 /* initial setting must have enough space to
1170 * receive and respond to one request. 1170 * receive and respond to one request.
1171 * svc_tcp_recvfrom will re-adjust if necessary 1171 * svc_tcp_recvfrom will re-adjust if necessary
1172 */ 1172 */
1173 svc_sock_setbufsize(svsk->sk_sock, 1173 svc_sock_setbufsize(svsk->sk_sock,
1174 3 * svsk->sk_server->sv_bufsz, 1174 3 * svsk->sk_server->sv_bufsz,
1175 3 * svsk->sk_server->sv_bufsz); 1175 3 * svsk->sk_server->sv_bufsz);
1176 1176
1177 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1177 set_bit(SK_CHNGBUF, &svsk->sk_flags);
1178 set_bit(SK_DATA, &svsk->sk_flags); 1178 set_bit(SK_DATA, &svsk->sk_flags);
1179 if (sk->sk_state != TCP_ESTABLISHED) 1179 if (sk->sk_state != TCP_ESTABLISHED)
1180 set_bit(SK_CLOSE, &svsk->sk_flags); 1180 set_bit(SK_CLOSE, &svsk->sk_flags);
1181 } 1181 }
1182 } 1182 }
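Setting tp->nonagle here disables Nagle's algorithm so that small RPC replies go out immediately instead of waiting to be coalesced. The kernel pokes the tcp_sock directly; a userspace server would ask for the same behaviour with TCP_NODELAY, roughly as sketched below:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Ask for the same effect as tp->nonagle = 1 on an ordinary socket fd. */
    static int disable_nagle(int fd)
    {
            int one = 1;

            return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
    }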
1183 1183
1184 void 1184 void
1185 svc_sock_update_bufs(struct svc_serv *serv) 1185 svc_sock_update_bufs(struct svc_serv *serv)
1186 { 1186 {
1187 /* 1187 /*
1188 * The number of server threads has changed. Update 1188 * The number of server threads has changed. Update
1189 * rcvbuf and sndbuf accordingly on all sockets 1189 * rcvbuf and sndbuf accordingly on all sockets
1190 */ 1190 */
1191 struct list_head *le; 1191 struct list_head *le;
1192 1192
1193 spin_lock_bh(&serv->sv_lock); 1193 spin_lock_bh(&serv->sv_lock);
1194 list_for_each(le, &serv->sv_permsocks) { 1194 list_for_each(le, &serv->sv_permsocks) {
1195 struct svc_sock *svsk = 1195 struct svc_sock *svsk =
1196 list_entry(le, struct svc_sock, sk_list); 1196 list_entry(le, struct svc_sock, sk_list);
1197 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1197 set_bit(SK_CHNGBUF, &svsk->sk_flags);
1198 } 1198 }
1199 list_for_each(le, &serv->sv_tempsocks) { 1199 list_for_each(le, &serv->sv_tempsocks) {
1200 struct svc_sock *svsk = 1200 struct svc_sock *svsk =
1201 list_entry(le, struct svc_sock, sk_list); 1201 list_entry(le, struct svc_sock, sk_list);
1202 set_bit(SK_CHNGBUF, &svsk->sk_flags); 1202 set_bit(SK_CHNGBUF, &svsk->sk_flags);
1203 } 1203 }
1204 spin_unlock_bh(&serv->sv_lock); 1204 spin_unlock_bh(&serv->sv_lock);
1205 } 1205 }
1206 1206
1207 /* 1207 /*
1208 * Receive the next request on any socket. This code is carefully 1208 * Receive the next request on any socket. This code is carefully
1209 * organised not to touch any cachelines in the shared svc_serv 1209 * organised not to touch any cachelines in the shared svc_serv
1210 * structure, only cachelines in the local svc_pool. 1210 * structure, only cachelines in the local svc_pool.
1211 */ 1211 */
1212 int 1212 int
1213 svc_recv(struct svc_rqst *rqstp, long timeout) 1213 svc_recv(struct svc_rqst *rqstp, long timeout)
1214 { 1214 {
1215 struct svc_sock *svsk =NULL; 1215 struct svc_sock *svsk =NULL;
1216 struct svc_serv *serv = rqstp->rq_server; 1216 struct svc_serv *serv = rqstp->rq_server;
1217 struct svc_pool *pool = rqstp->rq_pool; 1217 struct svc_pool *pool = rqstp->rq_pool;
1218 int len, i; 1218 int len, i;
1219 int pages; 1219 int pages;
1220 struct xdr_buf *arg; 1220 struct xdr_buf *arg;
1221 DECLARE_WAITQUEUE(wait, current); 1221 DECLARE_WAITQUEUE(wait, current);
1222 1222
1223 dprintk("svc: server %p waiting for data (to = %ld)\n", 1223 dprintk("svc: server %p waiting for data (to = %ld)\n",
1224 rqstp, timeout); 1224 rqstp, timeout);
1225 1225
1226 if (rqstp->rq_sock) 1226 if (rqstp->rq_sock)
1227 printk(KERN_ERR 1227 printk(KERN_ERR
1228 "svc_recv: service %p, socket not NULL!\n", 1228 "svc_recv: service %p, socket not NULL!\n",
1229 rqstp); 1229 rqstp);
1230 if (waitqueue_active(&rqstp->rq_wait)) 1230 if (waitqueue_active(&rqstp->rq_wait))
1231 printk(KERN_ERR 1231 printk(KERN_ERR
1232 "svc_recv: service %p, wait queue active!\n", 1232 "svc_recv: service %p, wait queue active!\n",
1233 rqstp); 1233 rqstp);
1234 1234
1235 1235
1236 /* now allocate needed pages. If we get a failure, sleep briefly */ 1236 /* now allocate needed pages. If we get a failure, sleep briefly */
1237 pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; 1237 pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE;
1238 for (i=0; i < pages ; i++) 1238 for (i=0; i < pages ; i++)
1239 while (rqstp->rq_pages[i] == NULL) { 1239 while (rqstp->rq_pages[i] == NULL) {
1240 struct page *p = alloc_page(GFP_KERNEL); 1240 struct page *p = alloc_page(GFP_KERNEL);
1241 if (!p) 1241 if (!p)
1242 schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 1242 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
1243 rqstp->rq_pages[i] = p; 1243 rqstp->rq_pages[i] = p;
1244 } 1244 }
1245 1245
1246 /* Make arg->head point to first page and arg->pages point to rest */ 1246 /* Make arg->head point to first page and arg->pages point to rest */
1247 arg = &rqstp->rq_arg; 1247 arg = &rqstp->rq_arg;
1248 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); 1248 arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
1249 arg->head[0].iov_len = PAGE_SIZE; 1249 arg->head[0].iov_len = PAGE_SIZE;
1250 arg->pages = rqstp->rq_pages + 1; 1250 arg->pages = rqstp->rq_pages + 1;
1251 arg->page_base = 0; 1251 arg->page_base = 0;
1252 /* save at least one page for response */ 1252 /* save at least one page for response */
1253 arg->page_len = (pages-2)*PAGE_SIZE; 1253 arg->page_len = (pages-2)*PAGE_SIZE;
1254 arg->len = (pages-1)*PAGE_SIZE; 1254 arg->len = (pages-1)*PAGE_SIZE;
1255 arg->tail[0].iov_len = 0; 1255 arg->tail[0].iov_len = 0;
1256 1256
1257 try_to_freeze(); 1257 try_to_freeze();
1258 cond_resched(); 1258 cond_resched();
1259 if (signalled()) 1259 if (signalled())
1260 return -EINTR; 1260 return -EINTR;
1261 1261
1262 spin_lock_bh(&pool->sp_lock); 1262 spin_lock_bh(&pool->sp_lock);
1263 if ((svsk = svc_sock_dequeue(pool)) != NULL) { 1263 if ((svsk = svc_sock_dequeue(pool)) != NULL) {
1264 rqstp->rq_sock = svsk; 1264 rqstp->rq_sock = svsk;
1265 atomic_inc(&svsk->sk_inuse); 1265 atomic_inc(&svsk->sk_inuse);
1266 rqstp->rq_reserved = serv->sv_bufsz; 1266 rqstp->rq_reserved = serv->sv_bufsz;
1267 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 1267 atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
1268 } else { 1268 } else {
1269 /* No data pending. Go to sleep */ 1269 /* No data pending. Go to sleep */
1270 svc_thread_enqueue(pool, rqstp); 1270 svc_thread_enqueue(pool, rqstp);
1271 1271
1272 /* 1272 /*
1273 * We have to be able to interrupt this wait 1273 * We have to be able to interrupt this wait
1274 * to bring down the daemons ... 1274 * to bring down the daemons ...
1275 */ 1275 */
1276 set_current_state(TASK_INTERRUPTIBLE); 1276 set_current_state(TASK_INTERRUPTIBLE);
1277 add_wait_queue(&rqstp->rq_wait, &wait); 1277 add_wait_queue(&rqstp->rq_wait, &wait);
1278 spin_unlock_bh(&pool->sp_lock); 1278 spin_unlock_bh(&pool->sp_lock);
1279 1279
1280 schedule_timeout(timeout); 1280 schedule_timeout(timeout);
1281 1281
1282 try_to_freeze(); 1282 try_to_freeze();
1283 1283
1284 spin_lock_bh(&pool->sp_lock); 1284 spin_lock_bh(&pool->sp_lock);
1285 remove_wait_queue(&rqstp->rq_wait, &wait); 1285 remove_wait_queue(&rqstp->rq_wait, &wait);
1286 1286
1287 if (!(svsk = rqstp->rq_sock)) { 1287 if (!(svsk = rqstp->rq_sock)) {
1288 svc_thread_dequeue(pool, rqstp); 1288 svc_thread_dequeue(pool, rqstp);
1289 spin_unlock_bh(&pool->sp_lock); 1289 spin_unlock_bh(&pool->sp_lock);
1290 dprintk("svc: server %p, no data yet\n", rqstp); 1290 dprintk("svc: server %p, no data yet\n", rqstp);
1291 return signalled()? -EINTR : -EAGAIN; 1291 return signalled()? -EINTR : -EAGAIN;
1292 } 1292 }
1293 } 1293 }
1294 spin_unlock_bh(&pool->sp_lock); 1294 spin_unlock_bh(&pool->sp_lock);
1295 1295
1296 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", 1296 dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
1297 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); 1297 rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
1298 len = svsk->sk_recvfrom(rqstp); 1298 len = svsk->sk_recvfrom(rqstp);
1299 dprintk("svc: got len=%d\n", len); 1299 dprintk("svc: got len=%d\n", len);
1300 1300
1301 /* No data, incomplete (TCP) read, or accept() */ 1301 /* No data, incomplete (TCP) read, or accept() */
1302 if (len == 0 || len == -EAGAIN) { 1302 if (len == 0 || len == -EAGAIN) {
1303 rqstp->rq_res.len = 0; 1303 rqstp->rq_res.len = 0;
1304 svc_sock_release(rqstp); 1304 svc_sock_release(rqstp);
1305 return -EAGAIN; 1305 return -EAGAIN;
1306 } 1306 }
1307 svsk->sk_lastrecv = get_seconds(); 1307 svsk->sk_lastrecv = get_seconds();
1308 clear_bit(SK_OLD, &svsk->sk_flags); 1308 clear_bit(SK_OLD, &svsk->sk_flags);
1309 1309
1310 rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; 1310 rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024;
1311 rqstp->rq_chandle.defer = svc_defer; 1311 rqstp->rq_chandle.defer = svc_defer;
1312 1312
1313 if (serv->sv_stats) 1313 if (serv->sv_stats)
1314 serv->sv_stats->netcnt++; 1314 serv->sv_stats->netcnt++;
1315 return len; 1315 return len;
1316 } 1316 }
1317 1317
1318 /* 1318 /*
1319 * Drop request 1319 * Drop request
1320 */ 1320 */
1321 void 1321 void
1322 svc_drop(struct svc_rqst *rqstp) 1322 svc_drop(struct svc_rqst *rqstp)
1323 { 1323 {
1324 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); 1324 dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
1325 svc_sock_release(rqstp); 1325 svc_sock_release(rqstp);
1326 } 1326 }
1327 1327
1328 /* 1328 /*
1329 * Return reply to client. 1329 * Return reply to client.
1330 */ 1330 */
1331 int 1331 int
1332 svc_send(struct svc_rqst *rqstp) 1332 svc_send(struct svc_rqst *rqstp)
1333 { 1333 {
1334 struct svc_sock *svsk; 1334 struct svc_sock *svsk;
1335 int len; 1335 int len;
1336 struct xdr_buf *xb; 1336 struct xdr_buf *xb;
1337 1337
1338 if ((svsk = rqstp->rq_sock) == NULL) { 1338 if ((svsk = rqstp->rq_sock) == NULL) {
1339 printk(KERN_WARNING "NULL socket pointer in %s:%d\n", 1339 printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
1340 __FILE__, __LINE__); 1340 __FILE__, __LINE__);
1341 return -EFAULT; 1341 return -EFAULT;
1342 } 1342 }
1343 1343
1344 /* release the receive skb before sending the reply */ 1344 /* release the receive skb before sending the reply */
1345 svc_release_skb(rqstp); 1345 svc_release_skb(rqstp);
1346 1346
1347 /* calculate over-all length */ 1347 /* calculate over-all length */
1348 xb = & rqstp->rq_res; 1348 xb = & rqstp->rq_res;
1349 xb->len = xb->head[0].iov_len + 1349 xb->len = xb->head[0].iov_len +
1350 xb->page_len + 1350 xb->page_len +
1351 xb->tail[0].iov_len; 1351 xb->tail[0].iov_len;
1352 1352
1353 /* Grab svsk->sk_mutex to serialize outgoing data. */ 1353 /* Grab svsk->sk_mutex to serialize outgoing data. */
1354 mutex_lock(&svsk->sk_mutex); 1354 mutex_lock(&svsk->sk_mutex);
1355 if (test_bit(SK_DEAD, &svsk->sk_flags)) 1355 if (test_bit(SK_DEAD, &svsk->sk_flags))
1356 len = -ENOTCONN; 1356 len = -ENOTCONN;
1357 else 1357 else
1358 len = svsk->sk_sendto(rqstp); 1358 len = svsk->sk_sendto(rqstp);
1359 mutex_unlock(&svsk->sk_mutex); 1359 mutex_unlock(&svsk->sk_mutex);
1360 svc_sock_release(rqstp); 1360 svc_sock_release(rqstp);
1361 1361
1362 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) 1362 if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
1363 return 0; 1363 return 0;
1364 return len; 1364 return len;
1365 } 1365 }
1366 1366
1367 /* 1367 /*
1368 * Timer function to close old temporary sockets, using 1368 * Timer function to close old temporary sockets, using
1369 * a mark-and-sweep algorithm. 1369 * a mark-and-sweep algorithm.
1370 */ 1370 */
1371 static void 1371 static void
1372 svc_age_temp_sockets(unsigned long closure) 1372 svc_age_temp_sockets(unsigned long closure)
1373 { 1373 {
1374 struct svc_serv *serv = (struct svc_serv *)closure; 1374 struct svc_serv *serv = (struct svc_serv *)closure;
1375 struct svc_sock *svsk; 1375 struct svc_sock *svsk;
1376 struct list_head *le, *next; 1376 struct list_head *le, *next;
1377 LIST_HEAD(to_be_aged); 1377 LIST_HEAD(to_be_aged);
1378 1378
1379 dprintk("svc_age_temp_sockets\n"); 1379 dprintk("svc_age_temp_sockets\n");
1380 1380
1381 if (!spin_trylock_bh(&serv->sv_lock)) { 1381 if (!spin_trylock_bh(&serv->sv_lock)) {
1382 /* busy, try again 1 sec later */ 1382 /* busy, try again 1 sec later */
1383 dprintk("svc_age_temp_sockets: busy\n"); 1383 dprintk("svc_age_temp_sockets: busy\n");
1384 mod_timer(&serv->sv_temptimer, jiffies + HZ); 1384 mod_timer(&serv->sv_temptimer, jiffies + HZ);
1385 return; 1385 return;
1386 } 1386 }
1387 1387
1388 list_for_each_safe(le, next, &serv->sv_tempsocks) { 1388 list_for_each_safe(le, next, &serv->sv_tempsocks) {
1389 svsk = list_entry(le, struct svc_sock, sk_list); 1389 svsk = list_entry(le, struct svc_sock, sk_list);
1390 1390
1391 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) 1391 if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
1392 continue; 1392 continue;
1393 if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags)) 1393 if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags))
1394 continue; 1394 continue;
1395 atomic_inc(&svsk->sk_inuse); 1395 atomic_inc(&svsk->sk_inuse);
1396 list_move(le, &to_be_aged); 1396 list_move(le, &to_be_aged);
1397 set_bit(SK_CLOSE, &svsk->sk_flags); 1397 set_bit(SK_CLOSE, &svsk->sk_flags);
1398 set_bit(SK_DETACHED, &svsk->sk_flags); 1398 set_bit(SK_DETACHED, &svsk->sk_flags);
1399 } 1399 }
1400 spin_unlock_bh(&serv->sv_lock); 1400 spin_unlock_bh(&serv->sv_lock);
1401 1401
1402 while (!list_empty(&to_be_aged)) { 1402 while (!list_empty(&to_be_aged)) {
1403 le = to_be_aged.next; 1403 le = to_be_aged.next;
1404 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ 1404 /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
1405 list_del_init(le); 1405 list_del_init(le);
1406 svsk = list_entry(le, struct svc_sock, sk_list); 1406 svsk = list_entry(le, struct svc_sock, sk_list);
1407 1407
1408 dprintk("queuing svsk %p for closing, %lu seconds old\n", 1408 dprintk("queuing svsk %p for closing, %lu seconds old\n",
1409 svsk, get_seconds() - svsk->sk_lastrecv); 1409 svsk, get_seconds() - svsk->sk_lastrecv);
1410 1410
1411 /* a thread will dequeue and close it soon */ 1411 /* a thread will dequeue and close it soon */
1412 svc_sock_enqueue(svsk); 1412 svc_sock_enqueue(svsk);
1413 svc_sock_put(svsk); 1413 svc_sock_put(svsk);
1414 } 1414 }
1415 1415
1416 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); 1416 mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
1417 } 1417 }
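The SK_OLD flag implements a simple mark-and-sweep: svc_recv() clears it on every successful receive, each timer pass sets it, and a socket that is still marked on the following pass has been idle for a full period and gets queued for closing. A compact sketch of that two-pass logic, with hypothetical names:

    #include <stdbool.h>

    struct conn_age {
            bool old;    /* like SK_OLD   */
            bool close;  /* like SK_CLOSE */
    };

    static void on_receive(struct conn_age *c)
    {
            c->old = false;              /* svc_recv(): clear_bit(SK_OLD, ...) */
    }

    static void on_sweep(struct conn_age *c)
    {
            if (c->old)                  /* untouched since the previous sweep */
                    c->close = true;     /* queue it for closing               */
            else
                    c->old = true;       /* mark; grant one more period        */
    }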
1418 1418
1419 /* 1419 /*
1420 * Initialize socket for RPC use and create svc_sock struct 1420 * Initialize socket for RPC use and create svc_sock struct
1421 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 1421 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
1422 */ 1422 */
1423 static struct svc_sock * 1423 static struct svc_sock *
1424 svc_setup_socket(struct svc_serv *serv, struct socket *sock, 1424 svc_setup_socket(struct svc_serv *serv, struct socket *sock,
1425 int *errp, int pmap_register) 1425 int *errp, int pmap_register)
1426 { 1426 {
1427 struct svc_sock *svsk; 1427 struct svc_sock *svsk;
1428 struct sock *inet; 1428 struct sock *inet;
1429 1429
1430 dprintk("svc: svc_setup_socket %p\n", sock); 1430 dprintk("svc: svc_setup_socket %p\n", sock);
1431 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1431 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
1432 *errp = -ENOMEM; 1432 *errp = -ENOMEM;
1433 return NULL; 1433 return NULL;
1434 } 1434 }
1435 1435
1436 inet = sock->sk; 1436 inet = sock->sk;
1437 1437
1438 /* Register socket with portmapper */ 1438 /* Register socket with portmapper */
1439 if (*errp >= 0 && pmap_register) 1439 if (*errp >= 0 && pmap_register)
1440 *errp = svc_register(serv, inet->sk_protocol, 1440 *errp = svc_register(serv, inet->sk_protocol,
1441 ntohs(inet_sk(inet)->sport)); 1441 ntohs(inet_sk(inet)->sport));
1442 1442
1443 if (*errp < 0) { 1443 if (*errp < 0) {
1444 kfree(svsk); 1444 kfree(svsk);
1445 return NULL; 1445 return NULL;
1446 } 1446 }
1447 1447
1448 set_bit(SK_BUSY, &svsk->sk_flags); 1448 set_bit(SK_BUSY, &svsk->sk_flags);
1449 inet->sk_user_data = svsk; 1449 inet->sk_user_data = svsk;
1450 svsk->sk_sock = sock; 1450 svsk->sk_sock = sock;
1451 svsk->sk_sk = inet; 1451 svsk->sk_sk = inet;
1452 svsk->sk_ostate = inet->sk_state_change; 1452 svsk->sk_ostate = inet->sk_state_change;
1453 svsk->sk_odata = inet->sk_data_ready; 1453 svsk->sk_odata = inet->sk_data_ready;
1454 svsk->sk_owspace = inet->sk_write_space; 1454 svsk->sk_owspace = inet->sk_write_space;
1455 svsk->sk_server = serv; 1455 svsk->sk_server = serv;
1456 atomic_set(&svsk->sk_inuse, 0); 1456 atomic_set(&svsk->sk_inuse, 0);
1457 svsk->sk_lastrecv = get_seconds(); 1457 svsk->sk_lastrecv = get_seconds();
1458 spin_lock_init(&svsk->sk_defer_lock); 1458 spin_lock_init(&svsk->sk_defer_lock);
1459 INIT_LIST_HEAD(&svsk->sk_deferred); 1459 INIT_LIST_HEAD(&svsk->sk_deferred);
1460 INIT_LIST_HEAD(&svsk->sk_ready); 1460 INIT_LIST_HEAD(&svsk->sk_ready);
1461 mutex_init(&svsk->sk_mutex); 1461 mutex_init(&svsk->sk_mutex);
1462 1462
1463 /* Initialize the socket */ 1463 /* Initialize the socket */
1464 if (sock->type == SOCK_DGRAM) 1464 if (sock->type == SOCK_DGRAM)
1465 svc_udp_init(svsk); 1465 svc_udp_init(svsk);
1466 else 1466 else
1467 svc_tcp_init(svsk); 1467 svc_tcp_init(svsk);
1468 1468
1469 spin_lock_bh(&serv->sv_lock); 1469 spin_lock_bh(&serv->sv_lock);
1470 if (!pmap_register) { 1470 if (!pmap_register) {
1471 set_bit(SK_TEMP, &svsk->sk_flags); 1471 set_bit(SK_TEMP, &svsk->sk_flags);
1472 list_add(&svsk->sk_list, &serv->sv_tempsocks); 1472 list_add(&svsk->sk_list, &serv->sv_tempsocks);
1473 serv->sv_tmpcnt++; 1473 serv->sv_tmpcnt++;
1474 if (serv->sv_temptimer.function == NULL) { 1474 if (serv->sv_temptimer.function == NULL) {
1475 /* setup timer to age temp sockets */ 1475 /* setup timer to age temp sockets */
1476 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, 1476 setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
1477 (unsigned long)serv); 1477 (unsigned long)serv);
1478 mod_timer(&serv->sv_temptimer, 1478 mod_timer(&serv->sv_temptimer,
1479 jiffies + svc_conn_age_period * HZ); 1479 jiffies + svc_conn_age_period * HZ);
1480 } 1480 }
1481 } else { 1481 } else {
1482 clear_bit(SK_TEMP, &svsk->sk_flags); 1482 clear_bit(SK_TEMP, &svsk->sk_flags);
1483 list_add(&svsk->sk_list, &serv->sv_permsocks); 1483 list_add(&svsk->sk_list, &serv->sv_permsocks);
1484 } 1484 }
1485 spin_unlock_bh(&serv->sv_lock); 1485 spin_unlock_bh(&serv->sv_lock);
1486 1486
1487 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1487 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1488 svsk, svsk->sk_sk); 1488 svsk, svsk->sk_sk);
1489 1489
1490 clear_bit(SK_BUSY, &svsk->sk_flags); 1490 clear_bit(SK_BUSY, &svsk->sk_flags);
1491 svc_sock_enqueue(svsk); 1491 svc_sock_enqueue(svsk);
1492 return svsk; 1492 return svsk;
1493 } 1493 }
1494 1494
1495 int svc_addsock(struct svc_serv *serv, 1495 int svc_addsock(struct svc_serv *serv,
1496 int fd, 1496 int fd,
1497 char *name_return, 1497 char *name_return,
1498 int *proto) 1498 int *proto)
1499 { 1499 {
1500 int err = 0; 1500 int err = 0;
1501 struct socket *so = sockfd_lookup(fd, &err); 1501 struct socket *so = sockfd_lookup(fd, &err);
1502 struct svc_sock *svsk = NULL; 1502 struct svc_sock *svsk = NULL;
1503 1503
1504 if (!so) 1504 if (!so)
1505 return err; 1505 return err;
1506 if (so->sk->sk_family != AF_INET) 1506 if (so->sk->sk_family != AF_INET)
1507 err = -EAFNOSUPPORT; 1507 err = -EAFNOSUPPORT;
1508 else if (so->sk->sk_protocol != IPPROTO_TCP && 1508 else if (so->sk->sk_protocol != IPPROTO_TCP &&
1509 so->sk->sk_protocol != IPPROTO_UDP) 1509 so->sk->sk_protocol != IPPROTO_UDP)
1510 err = -EPROTONOSUPPORT; 1510 err = -EPROTONOSUPPORT;
1511 else if (so->state > SS_UNCONNECTED) 1511 else if (so->state > SS_UNCONNECTED)
1512 err = -EISCONN; 1512 err = -EISCONN;
1513 else { 1513 else {
1514 svsk = svc_setup_socket(serv, so, &err, 1); 1514 svsk = svc_setup_socket(serv, so, &err, 1);
1515 if (svsk) 1515 if (svsk)
1516 err = 0; 1516 err = 0;
1517 } 1517 }
1518 if (err) { 1518 if (err) {
1519 sockfd_put(so); 1519 sockfd_put(so);
1520 return err; 1520 return err;
1521 } 1521 }
1522 if (proto) *proto = so->sk->sk_protocol; 1522 if (proto) *proto = so->sk->sk_protocol;
1523 return one_sock_name(name_return, svsk); 1523 return one_sock_name(name_return, svsk);
1524 } 1524 }
1525 EXPORT_SYMBOL_GPL(svc_addsock); 1525 EXPORT_SYMBOL_GPL(svc_addsock);
1526 1526
1527 /* 1527 /*
1528 * Create socket for RPC service. 1528 * Create socket for RPC service.
1529 */ 1529 */
1530 static int 1530 static int
1531 svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) 1531 svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
1532 { 1532 {
1533 struct svc_sock *svsk; 1533 struct svc_sock *svsk;
1534 struct socket *sock; 1534 struct socket *sock;
1535 int error; 1535 int error;
1536 int type; 1536 int type;
1537 1537
1538 dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", 1538 dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
1539 serv->sv_program->pg_name, protocol, 1539 serv->sv_program->pg_name, protocol,
1540 NIPQUAD(sin->sin_addr.s_addr), 1540 NIPQUAD(sin->sin_addr.s_addr),
1541 ntohs(sin->sin_port)); 1541 ntohs(sin->sin_port));
1542 1542
1543 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1543 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1544 printk(KERN_WARNING "svc: only UDP and TCP " 1544 printk(KERN_WARNING "svc: only UDP and TCP "
1545 "sockets supported\n"); 1545 "sockets supported\n");
1546 return -EINVAL; 1546 return -EINVAL;
1547 } 1547 }
1548 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1548 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1549 1549
1550 if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) 1550 if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
1551 return error; 1551 return error;
1552 1552
1553 if (type == SOCK_STREAM) 1553 if (type == SOCK_STREAM)
1554 sock->sk->sk_reuse = 1; /* allow address reuse */ 1554 sock->sk->sk_reuse = 1; /* allow address reuse */
1555 error = kernel_bind(sock, (struct sockaddr *) sin, 1555 error = kernel_bind(sock, (struct sockaddr *) sin,
1556 sizeof(*sin)); 1556 sizeof(*sin));
1557 if (error < 0) 1557 if (error < 0)
1558 goto bummer; 1558 goto bummer;
1559 1559
1560 if (protocol == IPPROTO_TCP) { 1560 if (protocol == IPPROTO_TCP) {
1561 if ((error = kernel_listen(sock, 64)) < 0) 1561 if ((error = kernel_listen(sock, 64)) < 0)
1562 goto bummer; 1562 goto bummer;
1563 } 1563 }
1564 1564
1565 if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) 1565 if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
1566 return 0; 1566 return 0;
1567 1567
1568 bummer: 1568 bummer:
1569 dprintk("svc: svc_create_socket error = %d\n", -error); 1569 dprintk("svc: svc_create_socket error = %d\n", -error);
1570 sock_release(sock); 1570 sock_release(sock);
1571 return error; 1571 return error;
1572 } 1572 }
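svc_create_socket() follows the usual create/bind/listen sequence, enabling address reuse for TCP streams and using a listen backlog of 64. A userspace analogue, for reference only:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static int make_listener(unsigned short port)
    {
            struct sockaddr_in sin;
            int one = 1;
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0)
                    return -1;
            /* like sk->sk_reuse = 1 above */
            setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

            memset(&sin, 0, sizeof(sin));
            sin.sin_family      = AF_INET;
            sin.sin_addr.s_addr = htonl(INADDR_ANY);
            sin.sin_port        = htons(port);

            if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
                listen(fd, 64) < 0) {   /* same backlog as kernel_listen(sock, 64) */
                    close(fd);
                    return -1;
            }
            return fd;
    }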
1573 1573
1574 /* 1574 /*
1575 * Remove a dead socket 1575 * Remove a dead socket
1576 */ 1576 */
1577 void 1577 void
1578 svc_delete_socket(struct svc_sock *svsk) 1578 svc_delete_socket(struct svc_sock *svsk)
1579 { 1579 {
1580 struct svc_serv *serv; 1580 struct svc_serv *serv;
1581 struct sock *sk; 1581 struct sock *sk;
1582 1582
1583 dprintk("svc: svc_delete_socket(%p)\n", svsk); 1583 dprintk("svc: svc_delete_socket(%p)\n", svsk);
1584 1584
1585 serv = svsk->sk_server; 1585 serv = svsk->sk_server;
1586 sk = svsk->sk_sk; 1586 sk = svsk->sk_sk;
1587 1587
1588 sk->sk_state_change = svsk->sk_ostate; 1588 sk->sk_state_change = svsk->sk_ostate;
1589 sk->sk_data_ready = svsk->sk_odata; 1589 sk->sk_data_ready = svsk->sk_odata;
1590 sk->sk_write_space = svsk->sk_owspace; 1590 sk->sk_write_space = svsk->sk_owspace;
1591 1591
1592 spin_lock_bh(&serv->sv_lock); 1592 spin_lock_bh(&serv->sv_lock);
1593 1593
1594 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) 1594 if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
1595 list_del_init(&svsk->sk_list); 1595 list_del_init(&svsk->sk_list);
1596 /* 1596 /*
1597 * We used to delete the svc_sock from whichever list 1597 * We used to delete the svc_sock from whichever list
1598 * its sk_ready node was on, but we don't actually 1598 * its sk_ready node was on, but we don't actually
1599 * need to. This is because the only time we're called 1599 * need to. This is because the only time we're called
1600 * while still attached to a queue, the queue itself 1600 * while still attached to a queue, the queue itself
1601 * is about to be destroyed (in svc_destroy). 1601 * is about to be destroyed (in svc_destroy).
1602 */ 1602 */
1603 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) 1603 if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
1604 if (test_bit(SK_TEMP, &svsk->sk_flags)) 1604 if (test_bit(SK_TEMP, &svsk->sk_flags))
1605 serv->sv_tmpcnt--; 1605 serv->sv_tmpcnt--;
1606 1606
1607 if (!atomic_read(&svsk->sk_inuse)) { 1607 if (!atomic_read(&svsk->sk_inuse)) {
1608 spin_unlock_bh(&serv->sv_lock); 1608 spin_unlock_bh(&serv->sv_lock);
1609 if (svsk->sk_sock->file) 1609 if (svsk->sk_sock->file)
1610 sockfd_put(svsk->sk_sock); 1610 sockfd_put(svsk->sk_sock);
1611 else 1611 else
1612 sock_release(svsk->sk_sock); 1612 sock_release(svsk->sk_sock);
1613 if (svsk->sk_info_authunix != NULL)
1614 svcauth_unix_info_release(svsk->sk_info_authunix);
1613 kfree(svsk); 1615 kfree(svsk);
1614 } else { 1616 } else {
1615 spin_unlock_bh(&serv->sv_lock); 1617 spin_unlock_bh(&serv->sv_lock);
1616 dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); 1618 dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
1617 /* svsk->sk_server = NULL; */ 1619 /* svsk->sk_server = NULL; */
1618 } 1620 }
1619 } 1621 }
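The two lines added in this hunk are the teardown half of the per-socket cache: once the socket is really being freed, any authentication-cache reference still held in sk_info_authunix is dropped through svcauth_unix_info_release(). The general pattern, keeping one reference to a refcounted lookup result on the connection and releasing it exactly once at destroy time, looks roughly like this sketch (hypothetical types, not the kernel's):

    #include <stdlib.h>

    struct auth_entry {
            int refcount;                 /* refcounted lookup result */
    };

    struct conn {
            struct auth_entry *cached;    /* like svsk->sk_info_authunix */
    };

    static void auth_entry_put(struct auth_entry *e)
    {
            if (e && --e->refcount == 0)
                    free(e);
    }

    static void conn_destroy(struct conn *c)
    {
            /* mirrors the added check-and-release before kfree(svsk) */
            auth_entry_put(c->cached);
            free(c);
    }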
1620 1622
1621 /* 1623 /*
1622 * Make a socket for nfsd and lockd 1624 * Make a socket for nfsd and lockd
1623 */ 1625 */
1624 int 1626 int
1625 svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) 1627 svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
1626 { 1628 {
1627 struct sockaddr_in sin; 1629 struct sockaddr_in sin;
1628 1630
1629 dprintk("svc: creating socket proto = %d\n", protocol); 1631 dprintk("svc: creating socket proto = %d\n", protocol);
1630 sin.sin_family = AF_INET; 1632 sin.sin_family = AF_INET;
1631 sin.sin_addr.s_addr = INADDR_ANY; 1633 sin.sin_addr.s_addr = INADDR_ANY;
1632 sin.sin_port = htons(port); 1634 sin.sin_port = htons(port);
1633 return svc_create_socket(serv, protocol, &sin); 1635 return svc_create_socket(serv, protocol, &sin);
1634 } 1636 }
1635 1637
1636 /* 1638 /*
1637 * Handle defer and revisit of requests 1639 * Handle defer and revisit of requests
1638 */ 1640 */
1639 1641
1640 static void svc_revisit(struct cache_deferred_req *dreq, int too_many) 1642 static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
1641 { 1643 {
1642 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1644 struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
1643 struct svc_sock *svsk; 1645 struct svc_sock *svsk;
1644 1646
1645 if (too_many) { 1647 if (too_many) {
1646 svc_sock_put(dr->svsk); 1648 svc_sock_put(dr->svsk);
1647 kfree(dr); 1649 kfree(dr);
1648 return; 1650 return;
1649 } 1651 }
1650 dprintk("revisit queued\n"); 1652 dprintk("revisit queued\n");
1651 svsk = dr->svsk; 1653 svsk = dr->svsk;
1652 dr->svsk = NULL; 1654 dr->svsk = NULL;
1653 spin_lock_bh(&svsk->sk_defer_lock); 1655 spin_lock_bh(&svsk->sk_defer_lock);
1654 list_add(&dr->handle.recent, &svsk->sk_deferred); 1656 list_add(&dr->handle.recent, &svsk->sk_deferred);
1655 spin_unlock_bh(&svsk->sk_defer_lock); 1657 spin_unlock_bh(&svsk->sk_defer_lock);
1656 set_bit(SK_DEFERRED, &svsk->sk_flags); 1658 set_bit(SK_DEFERRED, &svsk->sk_flags);
1657 svc_sock_enqueue(svsk); 1659 svc_sock_enqueue(svsk);
1658 svc_sock_put(svsk); 1660 svc_sock_put(svsk);
1659 } 1661 }
1660 1662
1661 static struct cache_deferred_req * 1663 static struct cache_deferred_req *
1662 svc_defer(struct cache_req *req) 1664 svc_defer(struct cache_req *req)
1663 { 1665 {
1664 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); 1666 struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
1665 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); 1667 int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
1666 struct svc_deferred_req *dr; 1668 struct svc_deferred_req *dr;
1667 1669
1668 if (rqstp->rq_arg.page_len) 1670 if (rqstp->rq_arg.page_len)
1669 return NULL; /* if more than a page, give up FIXME */ 1671 return NULL; /* if more than a page, give up FIXME */
1670 if (rqstp->rq_deferred) { 1672 if (rqstp->rq_deferred) {
1671 dr = rqstp->rq_deferred; 1673 dr = rqstp->rq_deferred;
1672 rqstp->rq_deferred = NULL; 1674 rqstp->rq_deferred = NULL;
1673 } else { 1675 } else {
1674 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; 1676 int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
1675 /* FIXME maybe discard if size too large */ 1677 /* FIXME maybe discard if size too large */
1676 dr = kmalloc(size, GFP_KERNEL); 1678 dr = kmalloc(size, GFP_KERNEL);
1677 if (dr == NULL) 1679 if (dr == NULL)
1678 return NULL; 1680 return NULL;
1679 1681
1680 dr->handle.owner = rqstp->rq_server; 1682 dr->handle.owner = rqstp->rq_server;
1681 dr->prot = rqstp->rq_prot; 1683 dr->prot = rqstp->rq_prot;
1682 dr->addr = rqstp->rq_addr; 1684 dr->addr = rqstp->rq_addr;
1683 dr->daddr = rqstp->rq_daddr; 1685 dr->daddr = rqstp->rq_daddr;
1684 dr->argslen = rqstp->rq_arg.len >> 2; 1686 dr->argslen = rqstp->rq_arg.len >> 2;
1685 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); 1687 memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
1686 } 1688 }
1687 atomic_inc(&rqstp->rq_sock->sk_inuse); 1689 atomic_inc(&rqstp->rq_sock->sk_inuse);
1688 dr->svsk = rqstp->rq_sock; 1690 dr->svsk = rqstp->rq_sock;
1689 1691
1690 dr->handle.revisit = svc_revisit; 1692 dr->handle.revisit = svc_revisit;
1691 return &dr->handle; 1693 return &dr->handle;
1692 } 1694 }
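svc_defer() snapshots the request so it can be replayed later: the argument length is stored in 32-bit XDR words (argslen = len >> 2) and the bytes are copied into the deferred-request structure, to be handed back by svc_deferred_recv(). A hedged sketch of that snapshot step, with hypothetical structures:

    #include <stdlib.h>
    #include <string.h>

    struct deferred_req {
            size_t words;                 /* length in 32-bit XDR words (argslen) */
            unsigned char args[];         /* copied argument bytes                */
    };

    static struct deferred_req *defer_snapshot(const void *buf, size_t byte_len)
    {
            struct deferred_req *dr = malloc(sizeof(*dr) + byte_len);

            if (!dr)
                    return NULL;
            dr->words = byte_len >> 2;             /* XDR data is 4-byte aligned */
            memcpy(dr->args, buf, dr->words << 2); /* replayed later as words<<2 */
            return dr;
    }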
1693 1695
1694 /* 1696 /*
1695 * recv data from a deferred request into an active one 1697 * recv data from a deferred request into an active one
1696 */ 1698 */
1697 static int svc_deferred_recv(struct svc_rqst *rqstp) 1699 static int svc_deferred_recv(struct svc_rqst *rqstp)
1698 { 1700 {
1699 struct svc_deferred_req *dr = rqstp->rq_deferred; 1701 struct svc_deferred_req *dr = rqstp->rq_deferred;
1700 1702
1701 rqstp->rq_arg.head[0].iov_base = dr->args; 1703 rqstp->rq_arg.head[0].iov_base = dr->args;
1702 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; 1704 rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
1703 rqstp->rq_arg.page_len = 0; 1705 rqstp->rq_arg.page_len = 0;
1704 rqstp->rq_arg.len = dr->argslen<<2; 1706 rqstp->rq_arg.len = dr->argslen<<2;
1705 rqstp->rq_prot = dr->prot; 1707 rqstp->rq_prot = dr->prot;
1706 rqstp->rq_addr = dr->addr; 1708 rqstp->rq_addr = dr->addr;
1707 rqstp->rq_daddr = dr->daddr; 1709 rqstp->rq_daddr = dr->daddr;
1708 rqstp->rq_respages = rqstp->rq_pages; 1710 rqstp->rq_respages = rqstp->rq_pages;
1709 return dr->argslen<<2; 1711 return dr->argslen<<2;
1710 } 1712 }
1711 1713
1712 1714
1713 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) 1715 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
1714 { 1716 {
1715 struct svc_deferred_req *dr = NULL; 1717 struct svc_deferred_req *dr = NULL;
1716 1718
1717 if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) 1719 if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
1718 return NULL; 1720 return NULL;
1719 spin_lock_bh(&svsk->sk_defer_lock); 1721 spin_lock_bh(&svsk->sk_defer_lock);
1720 clear_bit(SK_DEFERRED, &svsk->sk_flags); 1722 clear_bit(SK_DEFERRED, &svsk->sk_flags);
1721 if (!list_empty(&svsk->sk_deferred)) { 1723 if (!list_empty(&svsk->sk_deferred)) {
1722 dr = list_entry(svsk->sk_deferred.next, 1724 dr = list_entry(svsk->sk_deferred.next,
1723 struct svc_deferred_req, 1725 struct svc_deferred_req,
1724 handle.recent); 1726 handle.recent);
1725 list_del_init(&dr->handle.recent); 1727 list_del_init(&dr->handle.recent);
1726 set_bit(SK_DEFERRED, &svsk->sk_flags); 1728 set_bit(SK_DEFERRED, &svsk->sk_flags);
1727 } 1729 }
1728 spin_unlock_bh(&svsk->sk_defer_lock); 1730 spin_unlock_bh(&svsk->sk_defer_lock);
1729 return dr; 1731 return dr;
1730 } 1732 }
1731 1733