Blame view

net/ipv4/inetpeer.c 15.7 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
  /*
   *		INETPEER - A storage for permanent information about peers
   *
   *  This source is covered by the GNU GPL, the same as all kernel sources.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
6
7
8
9
10
11
12
13
14
   *  Authors:	Andrey V. Savochkin <saw@msu.ru>
   */
  
  #include <linux/module.h>
  #include <linux/types.h>
  #include <linux/slab.h>
  #include <linux/interrupt.h>
  #include <linux/spinlock.h>
  #include <linux/random.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
15
16
17
18
19
  #include <linux/timer.h>
  #include <linux/time.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
  #include <linux/net.h>
20380731b   Arnaldo Carvalho de Melo   [NET]: Fix sparse...
20
  #include <net/ip.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21
  #include <net/inetpeer.h>
6e5714eaf   David S. Miller   net: Compute prot...
22
  #include <net/secure_seq.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
  
  /*
   *  Theory of operations.
   *  We keep one entry for each peer IP address.  Each node contains long-lived
   *  information about the peer which doesn't depend on routes.
   *  At this moment this information consists only of ID field for the next
   *  outgoing IP packet.  This field is incremented with each packet as encoded
   *  in inet_getid() function (include/net/inetpeer.h).
   *  At the time of writing these notes, the identifier of IP packets is
   *  generated to be unpredictable using this code only for packets subjected
   *  (actually or potentially) to defragmentation.  I.e. DF packets smaller than
   *  the PMTU use a constant ID and do not use this code (see
   *  ip_select_ident() in include/net/ip.h).
   *
   *  Route cache entries hold references to our nodes.
   *  New cache entries get references via lookup by destination IP address in
   *  the avl tree.  The reference is grabbed only when it's needed i.e. only
   *  when we try to output IP packet which needs an unpredictable ID (see
   *  __ip_select_ident() in net/ipv4/route.c).
   *  Nodes are removed only when the reference counter goes to 0.
   *  Once that has happened, the node may be removed when a sufficient amount
   *  of time has passed since its last use.  A less-recently-used entry can
   *  also be removed if the pool is overloaded, i.e. if the total number of
   *  entries is greater than or equal to the threshold.
   *
   *  Node pool is organised as an AVL tree.
   *  Such an implementation has been chosen not just for fun.  It's a way to
   *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
   *  amount of long living nodes in a single hash slot would significantly delay
   *  lookups performed with disabled BHs.
   *
   *  Serialisation issues.
aa1039e73   Eric Dumazet   inetpeer: RCU con...
55
56
   *  1.  Nodes may appear in the tree only with the pool lock held.
   *  2.  Nodes may disappear from the tree only with the pool lock held
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
57
   *      AND reference count being 0.
4b9d9be83   Eric Dumazet   inetpeer: remove ...
58
59
   *  3.  Global variable peer_total is modified under the pool lock.
   *  4.  struct inet_peer fields modification:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
60
   *		avl_left, avl_right, avl_parent, avl_height: pool lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
61
62
   *		refcnt: atomically against modifications on other CPU;
   *		   usually under some other lock to prevent node disappearing
582a72da9   David S. Miller   inetpeer: Introdu...
63
   *		daddr: unchangeable
317fe0e6c   Eric Dumazet   inetpeer: restore...
64
   *		ip_id_count: atomic value (no lock needed)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
65
   */
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
66
  /*
   * Slab cache that struct inet_peer entries are allocated from and freed to;
   * created once in inet_initpeers().
   */
  static struct kmem_cache *peer_cachep __read_mostly;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
67
68
  
  /*
   * Height of the AVL subtree rooted at node x.
   * The argument and the whole expansion are parenthesised so that any
   * pointer expression (for example &node or ptr + 1) expands correctly.
   * The result is still an lvalue.
   */
  #define node_height(x) ((x)->avl_height)
d6cc1d642   Eric Dumazet   inetpeer: various...
69
70
  
  /*
   * Sentinel used instead of NULL for "no subtree".  Both macros yield the
   * address of peer_fake_node; the _rcu variant carries the __rcu annotation
   * so it can be stored in / compared with __rcu child pointers.
   */
  #define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
71
  #define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
d6cc1d642   Eric Dumazet   inetpeer: various...
72
  /*
   * The sentinel node itself: its children point back at the sentinel and its
   * height is 0, so tree code may read the height or children of an empty
   * subtree without a special case.
   */
  static const struct inet_peer peer_fake_node = {
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
73
74
  	.avl_left	= peer_avl_empty_rcu,
  	.avl_right	= peer_avl_empty_rcu,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
75
76
  	.avl_height	= 0
  };
d6cc1d642   Eric Dumazet   inetpeer: various...
77

021e92991   David S. Miller   inetpeer: Add v6 ...
78
  /* One pool of peers: an AVL tree plus the lock and counter that go with it. */
  struct inet_peer_base {
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
79
  	/* Root of the AVL tree; peer_avl_empty_rcu when the tree is empty. */
  	struct inet_peer __rcu *root;
65e8354ec   Eric Dumazet   inetpeer: seqlock...
80
  	/*
  	 * Write-locked around tree changes in inet_getpeer(); the lockless
  	 * lookup samples its sequence to detect a concurrent writer.
  	 */
  	seqlock_t	lock;
d6cc1d642   Eric Dumazet   inetpeer: various...
81
  	/* Number of nodes currently linked into the tree. */
  	int		total;
021e92991   David S. Miller   inetpeer: Add v6 ...
82
83
84
  };
  
  /* Pool returned by family_to_base() for AF_INET; starts out empty. */
  static struct inet_peer_base v4_peers = {
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
85
  	.root		= peer_avl_empty_rcu,
65e8354ec   Eric Dumazet   inetpeer: seqlock...
86
  	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
d6cc1d642   Eric Dumazet   inetpeer: various...
87
88
  	.total		= 0,
  };
021e92991   David S. Miller   inetpeer: Add v6 ...
89
90
91
  
  /*
   * Pool returned by family_to_base() for every family other than AF_INET;
   * starts out empty.
   */
  static struct inet_peer_base v6_peers = {
  	.root		= peer_avl_empty_rcu,
65e8354ec   Eric Dumazet   inetpeer: seqlock...
92
  	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
021e92991   David S. Miller   inetpeer: Add v6 ...
93
94
  	.total		= 0,
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
95
  /*
   * Number of slots in the descent-path stack used by the locked tree
   * walkers, and the number of links lookup_rcu() follows before giving up.
   */
  #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
96
  /* Exported for sysctl_net_ipv4.  */
243bbcaa0   Eric Dumazet   [IPV4]: Optimize ...
97
  /*
   * Pool size at which entries start to be thrown away more aggressively:
   * once a base holds at least this many entries, inet_peer_gc() uses a
   * TTL of 0.
   */
  int inet_peer_threshold __read_mostly = 65536 + 128;
243bbcaa0   Eric Dumazet   [IPV4]: Optimize ...
99
100
  int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
  int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
101

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
  
  /*
   * Boot-time setup.  Called from ip_output.c:ip_init.
   *
   * Scales inet_peer_threshold down on machines with little RAM and creates
   * the slab cache that inet_peer entries are allocated from.
   */
  void __init inet_initpeers(void)
  {
  	struct sysinfo meminfo;
  	unsigned int shrink = 0;

  	/* Use the straight interface to information about memory. */
  	si_meminfo(&meminfo);

  	/*
  	 * The limits below were suggested by Alexey Kuznetsov
  	 * <kuznet@ms2.inr.ac.ru>.  Every limit that applies adds to the
  	 * shift, so the reductions accumulate on the smallest machines.
  	 */
  	if (meminfo.totalram <= (32768*1024)/PAGE_SIZE)
  		shrink += 1;	/* max pool size about 1MB on IA32 */
  	if (meminfo.totalram <= (16384*1024)/PAGE_SIZE)
  		shrink += 1;	/* about 512KB */
  	if (meminfo.totalram <= (8192*1024)/PAGE_SIZE)
  		shrink += 2;	/* about 128KB */
  	inet_peer_threshold >>= shrink;

  	peer_cachep = kmem_cache_create("inet_peer_cache",
  					sizeof(struct inet_peer), 0,
  					SLAB_HWCACHE_ALIGN | SLAB_PANIC,
  					NULL);
  }
8790ca172   David S. Miller   inetpeer: Kill us...
127
128
  static int addr_compare(const struct inetpeer_addr *a,
  			const struct inetpeer_addr *b)
026630450   David S. Miller   inetpeer: Abstrac...
129
130
131
132
  {
  	int i, n = (a->family == AF_INET ? 1 : 4);
  
  	for (i = 0; i < n; i++) {
7a71ed899   David S. Miller   inetpeer: Abstrac...
133
  		if (a->addr.a6[i] == b->addr.a6[i])
026630450   David S. Miller   inetpeer: Abstrac...
134
  			continue;
7a71ed899   David S. Miller   inetpeer: Abstrac...
135
  		if (a->addr.a6[i] < b->addr.a6[i])
026630450   David S. Miller   inetpeer: Abstrac...
136
137
138
139
140
141
  			return -1;
  		return 1;
  	}
  
  	return 0;
  }
65e8354ec   Eric Dumazet   inetpeer: seqlock...
142
143
  /*
   * Update-side dereference of the __rcu pointer X.  The lockdep condition
   * passed to rcu_dereference_protected() is that the lock member embedded
   * in BASE's seqlock is held.
   */
  #define rcu_deref_locked(X, BASE)				\
  	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
243bbcaa0   Eric Dumazet   [IPV4]: Optimize ...
144
145
  /*
   * Locked descent of the AVL tree of _base, looking for _daddr.
   * Called with local BH disabled and the pool lock held.
   *
   * Evaluates to the matching node, or to peer_avl_empty if there is none.
   *
   * Side effect: the caller's `stackptr' variable is reset to _stack and
   * every link followed (starting with &_base->root) is pushed through it,
   * so the recorded path can be used afterwards for insertion, removal and
   * rebalancing.
   *
   * All macro arguments are parenthesised so that arbitrary expressions can
   * be passed; note that _base is still evaluated more than once.
   */
  #define lookup(_daddr, _stack, _base)				\
  ({								\
  	struct inet_peer *u;					\
  	struct inet_peer __rcu **v;				\
  								\
  	stackptr = (_stack);					\
  	*stackptr++ = &(_base)->root;				\
  	for (u = rcu_deref_locked((_base)->root, (_base));	\
  	     u != peer_avl_empty; ) {				\
  		int cmp = addr_compare((_daddr), &u->daddr);	\
  		if (cmp == 0)					\
  			break;					\
  		if (cmp == -1)					\
  			v = &u->avl_left;			\
  		else						\
  			v = &u->avl_right;			\
  		*stackptr++ = v;				\
  		u = rcu_deref_locked(*v, (_base));		\
  	}							\
  	u;							\
  })
aa1039e73   Eric Dumazet   inetpeer: RCU con...
168
  /*
7b46ac4e7   David S. Miller   inetpeer: Don't d...
169
   * Called with rcu_read_lock()
aa1039e73   Eric Dumazet   inetpeer: RCU con...
170
171
172
173
174
   * Because we hold no lock against a writer, its quite possible we fall
   * in an endless loop.
   * But every pointer we follow is guaranteed to be valid thanks to RCU.
   * We exit from this function if number of links exceeds PEER_MAXDEPTH
   */
7b46ac4e7   David S. Miller   inetpeer: Don't d...
175
  static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
4b9d9be83   Eric Dumazet   inetpeer: remove ...
176
  				    struct inet_peer_base *base)
aa1039e73   Eric Dumazet   inetpeer: RCU con...
177
  {
7b46ac4e7   David S. Miller   inetpeer: Don't d...
178
  	struct inet_peer *u = rcu_dereference(base->root);
aa1039e73   Eric Dumazet   inetpeer: RCU con...
179
180
181
  	int count = 0;
  
  	while (u != peer_avl_empty) {
026630450   David S. Miller   inetpeer: Abstrac...
182
183
  		int cmp = addr_compare(daddr, &u->daddr);
  		if (cmp == 0) {
5f2f89209   Eric Dumazet   inetpeer: do not ...
184
  			/* Before taking a reference, check if this entry was
4b9d9be83   Eric Dumazet   inetpeer: remove ...
185
  			 * deleted (refcnt=-1)
5f2f89209   Eric Dumazet   inetpeer: do not ...
186
  			 */
4b9d9be83   Eric Dumazet   inetpeer: remove ...
187
  			if (!atomic_add_unless(&u->refcnt, 1, -1))
aa1039e73   Eric Dumazet   inetpeer: RCU con...
188
189
190
  				u = NULL;
  			return u;
  		}
026630450   David S. Miller   inetpeer: Abstrac...
191
  		if (cmp == -1)
7b46ac4e7   David S. Miller   inetpeer: Don't d...
192
  			u = rcu_dereference(u->avl_left);
aa1039e73   Eric Dumazet   inetpeer: RCU con...
193
  		else
7b46ac4e7   David S. Miller   inetpeer: Don't d...
194
  			u = rcu_dereference(u->avl_right);
aa1039e73   Eric Dumazet   inetpeer: RCU con...
195
196
197
198
199
200
201
  		if (unlikely(++count == PEER_MAXDEPTH))
  			break;
  	}
  	return NULL;
  }
  
  /*
   * Called with local BH disabled and the pool lock held.
   *
   * Starting at the left child of `start', follow avl_right links until a
   * node without a right child is reached; evaluates to that node.
   *
   * Side effect: every link followed (beginning with &start->avl_left) is
   * pushed through the caller's `stackptr' variable.
   *
   * Macro arguments are parenthesised so that arbitrary expressions can be
   * passed; note that `start' and `base' are evaluated more than once.
   */
  #define lookup_rightempty(start, base)				\
  ({								\
  	struct inet_peer *u;					\
  	struct inet_peer __rcu **v;				\
  	*stackptr++ = &(start)->avl_left;			\
  	v = &(start)->avl_left;					\
  	for (u = rcu_deref_locked(*v, (base));			\
  	     u->avl_right != peer_avl_empty_rcu; ) {		\
  		v = &u->avl_right;				\
  		*stackptr++ = v;				\
  		u = rcu_deref_locked(*v, (base));		\
  	}							\
  	u;							\
  })
aa1039e73   Eric Dumazet   inetpeer: RCU con...
216
  /*
   * Restore the AVL balance along a recorded descent path.
   * Called with local BH disabled and the pool lock held.
   *
   * stack[] .. stackend is the list of links pushed by a preceding lookup.
   * The links are processed from the last one pushed back to stack[0].  For
   * each link the node it points to either gets its height recomputed, or,
   * when its subtree heights differ by more than one, is rotated (single or
   * double rotation) and the new subtree root is stored through the link.
   *
   * Variable names are the proof of operation correctness.
   * Look into mm/map_avl.c for a more detailed description of the ideas.
   */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
220
  static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
98158f5a8   David S. Miller   inetpeer: Abstrac...
221
222
  			       struct inet_peer __rcu ***stackend,
  			       struct inet_peer_base *base)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
223
  {
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
224
225
  	struct inet_peer __rcu **nodep;
  	struct inet_peer *node, *l, *r;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
226
227
228
229
  	int lh, rh;
  
  	while (stackend > stack) {
  		nodep = *--stackend;
65e8354ec   Eric Dumazet   inetpeer: seqlock...
230
231
232
  		node = rcu_deref_locked(*nodep, base);
  		l = rcu_deref_locked(node->avl_left, base);
  		r = rcu_deref_locked(node->avl_right, base);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
233
234
235
236
237
  		lh = node_height(l);
  		rh = node_height(r);
  		/* Left subtree too tall: rotate towards the right. */
  		if (lh > rh + 1) { /* l: RH+2 */
  			struct inet_peer *ll, *lr, *lrl, *lrr;
  			int lrh;
65e8354ec   Eric Dumazet   inetpeer: seqlock...
238
239
  			ll = rcu_deref_locked(l->avl_left, base);
  			lr = rcu_deref_locked(l->avl_right, base);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
240
241
  			lrh = node_height(lr);
  			/* Single rotation: l becomes the subtree root. */
  			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
242
243
  				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
  				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
244
  				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
245
246
  				RCU_INIT_POINTER(l->avl_left, ll);       /* ll: RH+1 */
  				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
247
  				l->avl_height = node->avl_height + 1;
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
248
  				RCU_INIT_POINTER(*nodep, l);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
249
  			/* Double rotation: lr becomes the subtree root. */
  			} else { /* ll: RH, lr: RH+1 */
65e8354ec   Eric Dumazet   inetpeer: seqlock...
250
251
  				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
  				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
252
253
  				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
  				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
254
  				node->avl_height = rh + 1; /* node: RH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
255
256
  				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
  				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
257
  				l->avl_height = rh + 1;	/* l: RH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
258
259
  				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
  				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
260
  				lr->avl_height = rh + 2;
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
261
  				RCU_INIT_POINTER(*nodep, lr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
262
263
264
265
  			}
  		/* Right subtree too tall: mirror image of the case above. */
  		} else if (rh > lh + 1) { /* r: LH+2 */
  			struct inet_peer *rr, *rl, *rlr, *rll;
  			int rlh;
65e8354ec   Eric Dumazet   inetpeer: seqlock...
266
267
  			rr = rcu_deref_locked(r->avl_right, base);
  			rl = rcu_deref_locked(r->avl_left, base);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
268
269
  			rlh = node_height(rl);
  			/* Single rotation: r becomes the subtree root. */
  			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
270
271
  				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
  				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
272
  				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
273
274
  				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
  				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
275
  				r->avl_height = node->avl_height + 1;
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
276
  				RCU_INIT_POINTER(*nodep, r);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
277
  			/* Double rotation: rl becomes the subtree root. */
  			} else { /* rr: RH, rl: RH+1 */
65e8354ec   Eric Dumazet   inetpeer: seqlock...
278
279
  				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
  				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
280
281
  				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
  				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
282
  				node->avl_height = lh + 1; /* node: LH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
283
284
  				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
  				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
285
  				r->avl_height = lh + 1;	/* r: LH+1 */
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
286
287
  				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
  				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
288
  				rl->avl_height = lh + 2;
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
289
  				RCU_INIT_POINTER(*nodep, rl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
290
291
292
293
294
295
  			}
  		/* Already balanced here: only refresh the stored height. */
  		} else {
  			node->avl_height = (lh > rh ? lh : rh) + 1;
  		}
  	}
  }
aa1039e73   Eric Dumazet   inetpeer: RCU con...
296
  /*
   * Insert node n as a new leaf and rebalance.
   * Called with local BH disabled and the pool lock held.
   *
   * Relies on the caller's `stack' and `stackptr' variables as left by a
   * preceding lookup() that did not find the address: the last link pushed
   * is the empty slot n is stored into.  n is initialised as a height-1 leaf
   * before it is published with rcu_assign_pointer().
   *
   * Macro arguments are parenthesised so that arbitrary expressions can be
   * passed; note that n is evaluated more than once.
   */
  #define link_to_pool(n, base)					\
  do {								\
  	(n)->avl_height = 1;					\
  	(n)->avl_left = peer_avl_empty_rcu;			\
  	(n)->avl_right = peer_avl_empty_rcu;			\
  	/* lockless readers can catch us now */			\
  	rcu_assign_pointer(**--stackptr, (n));			\
  	peer_avl_rebalance(stack, stackptr, (base));		\
  } while (0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306

aa1039e73   Eric Dumazet   inetpeer: RCU con...
307
308
309
310
  /*
   * RCU callback: recover the inet_peer that embeds `head' as its rcu member
   * and hand it back to the peer slab cache.
   */
  static void inetpeer_free_rcu(struct rcu_head *head)
  {
  	struct inet_peer *peer = container_of(head, struct inet_peer, rcu);

  	kmem_cache_free(peer_cachep, peer);
  }
66944e1c5   Eric Dumazet   inetpeer: reduce ...
311
312
  /*
   * Remove node p from the tree of base, rebalance, and queue p for freeing
   * through call_rcu().
   *
   * `stack' is scratch space for the descent path; it is refilled here by
   * lookup(), which also sets the local stackptr.  BUG()s if p is not found
   * in the tree.
   */
  static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
  			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
313
  {
4b9d9be83   Eric Dumazet   inetpeer: remove ...
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
  	struct inet_peer __rcu ***stackptr, ***delp;
  
  	if (lookup(&p->daddr, stack, base) != p)
  		BUG();
  	/* The last link pushed by lookup() is the one that points at p. */
  	delp = stackptr - 1; /* *delp[0] == p */
  	if (p->avl_left == peer_avl_empty_rcu) {
  		/* No left subtree: the right child simply takes p's place. */
  		*delp[0] = p->avl_right;
  		--stackptr;
  	} else {
  		/* look for a node to insert instead of p */
  		struct inet_peer *t;
  		t = lookup_rightempty(p, base);
  		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
  		/* Detach t: its left child moves into the link that held t. */
  		**--stackptr = t->avl_left;
  		/* t is removed, t->daddr > x->daddr for any
  		 * x in p->avl_left subtree.
  		 * Put t in the old place of p. */
  		RCU_INIT_POINTER(*delp[0], t);
  		t->avl_left = p->avl_left;
  		t->avl_right = p->avl_right;
  		t->avl_height = p->avl_height;
  		BUG_ON(delp[1] != &p->avl_left);
  		/* Fix the recorded path so that rebalancing walks through t. */
  		delp[1] = &t->avl_left; /* was &p->avl_left */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
337
  	}
4b9d9be83   Eric Dumazet   inetpeer: remove ...
338
339
340
  	peer_avl_rebalance(stack, stackptr, base);
  	base->total--;
  	/* Freeing is deferred through call_rcu(). */
  	call_rcu(&p->rcu, inetpeer_free_rcu);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
341
  }
021e92991   David S. Miller   inetpeer: Add v6 ...
342
343
  static struct inet_peer_base *family_to_base(int family)
  {
4b9d9be83   Eric Dumazet   inetpeer: remove ...
344
  	return family == AF_INET ? &v4_peers : &v6_peers;
021e92991   David S. Miller   inetpeer: Add v6 ...
345
  }
4b9d9be83   Eric Dumazet   inetpeer: remove ...
346
347
348
349
  /*
   * Perform garbage collection on all items stacked during a lookup.
   *
   * Every node on the recorded path that is unreferenced and has been idle
   * for at least the current TTL is claimed by switching its refcnt from 0
   * to -1, then unlinked from the pool.  The TTL is 0 once the pool has
   * reached inet_peer_threshold; below that it shrinks from
   * inet_peer_maxttl towards inet_peer_minttl as the pool fills up.
   *
   * Returns the number of nodes removed.
   */
  static int inet_peer_gc(struct inet_peer_base *base,
  			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
  			struct inet_peer __rcu ***stackptr)
  {
  	struct inet_peer *victims = NULL;
  	struct inet_peer *p;
  	__u32 ttl = 0;	/* be aggressive unless below the threshold */
  	__u32 idle;
  	int reclaimed = 0;

  	if (base->total < inet_peer_threshold)
  		ttl = inet_peer_maxttl
  				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
  					base->total / inet_peer_threshold * HZ;

  	/* last stack slot is peer_avl_empty, so step over it first */
  	for (stackptr--; stackptr > stack; ) {
  		stackptr--;
  		p = rcu_deref_locked(**stackptr, base);
  		if (atomic_read(&p->refcnt) != 0)
  			continue;
  		smp_rmb();
  		idle = (__u32)jiffies - p->dtime;
  		if (idle < ttl)
  			continue;
  		if (atomic_cmpxchg(&p->refcnt, 0, -1) != 0)
  			continue;
  		/* claimed: chain it for removal below */
  		p->gc_next = victims;
  		victims = p;
  	}

  	while (victims != NULL) {
  		p = victims;
  		victims = p->gc_next;
  		reclaimed++;
  		unlink_from_pool(p, base, stack);
  	}

  	return reclaimed;
  }
87c48fa3b   Eric Dumazet   ipv6: make fragme...
382
  struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
383
  {
b914c4ea9   Eric Dumazet   inetpeer: __rcu a...
384
  	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
3408404a4   David S. Miller   inetpeer: Use cor...
385
  	struct inet_peer_base *base = family_to_base(daddr->family);
98158f5a8   David S. Miller   inetpeer: Abstrac...
386
  	struct inet_peer *p;
65e8354ec   Eric Dumazet   inetpeer: seqlock...
387
  	unsigned int sequence;
4b9d9be83   Eric Dumazet   inetpeer: remove ...
388
  	int invalidated, gccnt = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
389

4b9d9be83   Eric Dumazet   inetpeer: remove ...
390
  	/* Attempt a lockless lookup first.
aa1039e73   Eric Dumazet   inetpeer: RCU con...
391
392
  	 * Because of a concurrent writer, we might not find an existing entry.
  	 */
7b46ac4e7   David S. Miller   inetpeer: Don't d...
393
  	rcu_read_lock();
65e8354ec   Eric Dumazet   inetpeer: seqlock...
394
  	sequence = read_seqbegin(&base->lock);
4b9d9be83   Eric Dumazet   inetpeer: remove ...
395
  	p = lookup_rcu(daddr, base);
65e8354ec   Eric Dumazet   inetpeer: seqlock...
396
  	invalidated = read_seqretry(&base->lock, sequence);
7b46ac4e7   David S. Miller   inetpeer: Don't d...
397
  	rcu_read_unlock();
aa1039e73   Eric Dumazet   inetpeer: RCU con...
398

4b9d9be83   Eric Dumazet   inetpeer: remove ...
399
  	if (p)
aa1039e73   Eric Dumazet   inetpeer: RCU con...
400
  		return p;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
401

65e8354ec   Eric Dumazet   inetpeer: seqlock...
402
403
404
  	/* If no writer did a change during our lookup, we can return early. */
  	if (!create && !invalidated)
  		return NULL;
aa1039e73   Eric Dumazet   inetpeer: RCU con...
405
406
407
  	/* retry an exact lookup, taking the lock before.
  	 * At least, nodes should be hot in our cache.
  	 */
65e8354ec   Eric Dumazet   inetpeer: seqlock...
408
  	write_seqlock_bh(&base->lock);
4b9d9be83   Eric Dumazet   inetpeer: remove ...
409
  relookup:
026630450   David S. Miller   inetpeer: Abstrac...
410
  	p = lookup(daddr, stack, base);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
411
  	if (p != peer_avl_empty) {
4b9d9be83   Eric Dumazet   inetpeer: remove ...
412
  		atomic_inc(&p->refcnt);
65e8354ec   Eric Dumazet   inetpeer: seqlock...
413
  		write_sequnlock_bh(&base->lock);
4b9d9be83   Eric Dumazet   inetpeer: remove ...
414
415
416
417
418
419
  		return p;
  	}
  	if (!gccnt) {
  		gccnt = inet_peer_gc(base, stack, stackptr);
  		if (gccnt && create)
  			goto relookup;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
420
  	}
aa1039e73   Eric Dumazet   inetpeer: RCU con...
421
422
  	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
  	if (p) {
b534ecf1c   David S. Miller   inetpeer: Make in...
423
  		p->daddr = *daddr;
aa1039e73   Eric Dumazet   inetpeer: RCU con...
424
425
  		atomic_set(&p->refcnt, 1);
  		atomic_set(&p->rid, 0);
87c48fa3b   Eric Dumazet   ipv6: make fragme...
426
427
428
429
  		atomic_set(&p->ip_id_count,
  				(daddr->family == AF_INET) ?
  					secure_ip_id(daddr->addr.a4) :
  					secure_ipv6_id(daddr->addr.a6));
aa1039e73   Eric Dumazet   inetpeer: RCU con...
430
  		p->tcp_ts_stamp = 0;
144001bdd   David S. Miller   inetpeer: Mark me...
431
  		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
92d868292   David S. Miller   inetpeer: Move IC...
432
433
  		p->rate_tokens = 0;
  		p->rate_last = 0;
ddd4aa424   David S. Miller   inetpeer: Add red...
434
  		p->pmtu_expires = 0;
46af31800   Hiroaki SHIMODA   ipv4: Fix PMTU up...
435
  		p->pmtu_orig = 0;
ddd4aa424   David S. Miller   inetpeer: Add red...
436
  		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
aa1039e73   Eric Dumazet   inetpeer: RCU con...
437
438
439
  
  
  		/* Link the node. */
98158f5a8   David S. Miller   inetpeer: Abstrac...
440
441
  		link_to_pool(p, base);
  		base->total++;
aa1039e73   Eric Dumazet   inetpeer: RCU con...
442
  	}
65e8354ec   Eric Dumazet   inetpeer: seqlock...
443
  	write_sequnlock_bh(&base->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
444

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
445
446
  	return p;
  }
b34193638   David S. Miller   ipv6: Add infrast...
447
  EXPORT_SYMBOL_GPL(inet_getpeer);
98158f5a8   David S. Miller   inetpeer: Abstrac...
448

4663afe2c   Eric Dumazet   [NET]: reduce siz...
449
450
  /*
   * Drop one reference on peer p.
   *
   * The release time is stored in p->dtime before the reference count is
   * decremented, with a barrier in between; inet_peer_gc() does the matching
   * reads in the opposite order (refcnt, smp_rmb(), then dtime).
   */
  void inet_putpeer(struct inet_peer *p)
  {
4b9d9be83   Eric Dumazet   inetpeer: remove ...
451
  	/* Timestamp of this release, in jiffies truncated to 32 bits. */
  	p->dtime = (__u32)jiffies;
6d1a3e042   Eric Dumazet   inetpeer: kill in...
452
  	/* Order the dtime store before the decrement below. */
  	smp_mb__before_atomic_dec();
4b9d9be83   Eric Dumazet   inetpeer: remove ...
453
  	atomic_dec(&p->refcnt);
4663afe2c   Eric Dumazet   [NET]: reduce siz...
454
  }
b34193638   David S. Miller   ipv6: Add infrast...
455
  EXPORT_SYMBOL_GPL(inet_putpeer);
92d868292   David S. Miller   inetpeer: Move IC...
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
  
  /*
   *	Check transmit rate limitation for given message.
   *	The rate information is held in the inet_peer entries now.
   *	This function is generic and could be used for other purposes
   *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
   *
   *	Note that the same inet_peer fields are modified by functions in
   *	route.c too, but these work for packet destinations while xrlim_allow
   *	works for icmp destinations. This means the rate limiting information
   *	for one "ip object" is shared - and these ICMPs are twice limited:
   *	by source and by destination.
   *
   *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
   *			  SHOULD allow setting of rate limits
   *
   * 	Shared between ICMPv4 and ICMPv6.
   */
  /* The bucket holds at most this many `timeout' intervals worth of tokens. */
  #define XRLIM_BURST_FACTOR 6

  /*
   * Token-bucket check for one peer.
   *
   * Tokens accumulate at one per jiffy since peer->rate_last, capped at
   * XRLIM_BURST_FACTOR * timeout.  A message is allowed when at least
   * `timeout' tokens are available, and is then charged that many.
   * peer->rate_last and peer->rate_tokens are updated on every call.
   *
   * Returns true if the message may be sent.  A NULL peer is never limited.
   */
  bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
  {
  	unsigned long now, token, burst;
  	bool allowed;

  	if (!peer)
  		return true;

  	token = peer->rate_tokens;
  	now = jiffies;
  	token += now - peer->rate_last;
  	peer->rate_last = now;

  	burst = XRLIM_BURST_FACTOR * timeout;
  	if (token > burst)
  		token = burst;

  	allowed = token >= timeout;
  	if (allowed)
  		token -= timeout;

  	peer->rate_tokens = token;
  	return allowed;
  }
  EXPORT_SYMBOL(inet_peer_xrlim_allow);