Commit f2c31e32b378a6653f8de606149d963baf11d7d3

Authored by Eric Dumazet
Committed by David S. Miller
1 parent 28f4881cbf

net: fix NULL dereferences in check_peer_redir()

Gergely Kalman reported crashes in check_peer_redir().

It appears commit f39925dbde778 (ipv4: Cache learned redirect
information in inetpeer.) added a race, leading to a possible NULL
pointer dereference.

Since the dst neighbour can now change, we should make sure a reader
can safely use it.

Add RCU protection to the dst neighbour, and make sure check_peer_redir()
can be called safely by different CPUs in parallel.

As neighbours are already freed after one RCU grace period, this patch
should not add the typical RCU penalty (cache-cold effects).

Many thanks to Gergely for providing a pretty detailed report pointing
to the bug.

Reported-by: Gergely Kalman <synapse@hippy.csoma.elte.hu>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
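
The core of the change is the standard RCU publish/read pattern: the
_neighbour field gains an __rcu annotation, the writer publishes through
rcu_assign_pointer(), and readers dereference the pointer only inside an
rcu_read_lock() section, which keeps the neighbour alive at least until
rcu_read_unlock(). Below is a minimal sketch of that pattern; struct obj
and the obj_* helpers are illustrative names, not kernel APIs.

#include <linux/rcupdate.h>
#include <net/neighbour.h>

struct obj {
        struct neighbour __rcu *_neighbour;
};

/* Writer side: publish a new neighbour. rcu_assign_pointer() orders
 * the neighbour's initialization before the pointer store, so a reader
 * never sees a half-initialized object.
 */
static void obj_set_neighbour(struct obj *o, struct neighbour *n)
{
        rcu_assign_pointer(o->_neighbour, n);
}

/* Reader side: the pointer returned by rcu_dereference() is only
 * guaranteed to stay valid inside the read-side critical section,
 * because the old neighbour is freed after one RCU grace period.
 */
static void obj_confirm_neighbour(struct obj *o)
{
        struct neighbour *n;

        rcu_read_lock();
        n = rcu_dereference(o->_neighbour);
        if (n)
                neigh_confirm(n);
        rcu_read_unlock();
}

This mirrors the dst_set_neighbour()/dst_get_neighbour() changes and the
dst_confirm() and ip_finish_output2() hunks in the diff below.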

Showing 7 changed files with 67 additions and 26 deletions

include/net/dst.h
1 /* 1 /*
2 * net/dst.h Protocol independent destination cache definitions. 2 * net/dst.h Protocol independent destination cache definitions.
3 * 3 *
4 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 4 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
5 * 5 *
6 */ 6 */
7 7
8 #ifndef _NET_DST_H 8 #ifndef _NET_DST_H
9 #define _NET_DST_H 9 #define _NET_DST_H
10 10
11 #include <net/dst_ops.h> 11 #include <net/dst_ops.h>
12 #include <linux/netdevice.h> 12 #include <linux/netdevice.h>
13 #include <linux/rtnetlink.h> 13 #include <linux/rtnetlink.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/jiffies.h> 15 #include <linux/jiffies.h>
16 #include <net/neighbour.h> 16 #include <net/neighbour.h>
17 #include <asm/processor.h> 17 #include <asm/processor.h>
18 18
19 #define DST_GC_MIN (HZ/10) 19 #define DST_GC_MIN (HZ/10)
20 #define DST_GC_INC (HZ/2) 20 #define DST_GC_INC (HZ/2)
21 #define DST_GC_MAX (120*HZ) 21 #define DST_GC_MAX (120*HZ)
22 22
23 /* Each dst_entry has reference count and sits in some parent list(s). 23 /* Each dst_entry has reference count and sits in some parent list(s).
24 * When it is removed from parent list, it is "freed" (dst_free). 24 * When it is removed from parent list, it is "freed" (dst_free).
25 * After this it enters dead state (dst->obsolete > 0) and if its refcnt 25 * After this it enters dead state (dst->obsolete > 0) and if its refcnt
26 * is zero, it can be destroyed immediately, otherwise it is added 26 * is zero, it can be destroyed immediately, otherwise it is added
27 * to gc list and garbage collector periodically checks the refcnt. 27 * to gc list and garbage collector periodically checks the refcnt.
28 */ 28 */
29 29
30 struct sk_buff; 30 struct sk_buff;
31 31
32 struct dst_entry { 32 struct dst_entry {
33 struct rcu_head rcu_head; 33 struct rcu_head rcu_head;
34 struct dst_entry *child; 34 struct dst_entry *child;
35 struct net_device *dev; 35 struct net_device *dev;
36 struct dst_ops *ops; 36 struct dst_ops *ops;
37 unsigned long _metrics; 37 unsigned long _metrics;
38 unsigned long expires; 38 unsigned long expires;
39 struct dst_entry *path; 39 struct dst_entry *path;
40 struct neighbour *_neighbour; 40 struct neighbour __rcu *_neighbour;
41 #ifdef CONFIG_XFRM 41 #ifdef CONFIG_XFRM
42 struct xfrm_state *xfrm; 42 struct xfrm_state *xfrm;
43 #else 43 #else
44 void *__pad1; 44 void *__pad1;
45 #endif 45 #endif
46 int (*input)(struct sk_buff*); 46 int (*input)(struct sk_buff*);
47 int (*output)(struct sk_buff*); 47 int (*output)(struct sk_buff*);
48 48
49 int flags; 49 int flags;
50 #define DST_HOST 0x0001 50 #define DST_HOST 0x0001
51 #define DST_NOXFRM 0x0002 51 #define DST_NOXFRM 0x0002
52 #define DST_NOPOLICY 0x0004 52 #define DST_NOPOLICY 0x0004
53 #define DST_NOHASH 0x0008 53 #define DST_NOHASH 0x0008
54 #define DST_NOCACHE 0x0010 54 #define DST_NOCACHE 0x0010
55 #define DST_NOCOUNT 0x0020 55 #define DST_NOCOUNT 0x0020
56 56
57 short error; 57 short error;
58 short obsolete; 58 short obsolete;
59 unsigned short header_len; /* more space at head required */ 59 unsigned short header_len; /* more space at head required */
60 unsigned short trailer_len; /* space to reserve at tail */ 60 unsigned short trailer_len; /* space to reserve at tail */
61 #ifdef CONFIG_IP_ROUTE_CLASSID 61 #ifdef CONFIG_IP_ROUTE_CLASSID
62 __u32 tclassid; 62 __u32 tclassid;
63 #else 63 #else
64 __u32 __pad2; 64 __u32 __pad2;
65 #endif 65 #endif
66 66
67 /* 67 /*
68 * Align __refcnt to a 64 bytes alignment 68 * Align __refcnt to a 64 bytes alignment
69 * (L1_CACHE_SIZE would be too much) 69 * (L1_CACHE_SIZE would be too much)
70 */ 70 */
71 #ifdef CONFIG_64BIT 71 #ifdef CONFIG_64BIT
72 long __pad_to_align_refcnt[2]; 72 long __pad_to_align_refcnt[2];
73 #endif 73 #endif
74 /* 74 /*
75 * __refcnt wants to be on a different cache line from 75 * __refcnt wants to be on a different cache line from
76 * input/output/ops or performance tanks badly 76 * input/output/ops or performance tanks badly
77 */ 77 */
78 atomic_t __refcnt; /* client references */ 78 atomic_t __refcnt; /* client references */
79 int __use; 79 int __use;
80 unsigned long lastuse; 80 unsigned long lastuse;
81 union { 81 union {
82 struct dst_entry *next; 82 struct dst_entry *next;
83 struct rtable __rcu *rt_next; 83 struct rtable __rcu *rt_next;
84 struct rt6_info *rt6_next; 84 struct rt6_info *rt6_next;
85 struct dn_route __rcu *dn_next; 85 struct dn_route __rcu *dn_next;
86 }; 86 };
87 }; 87 };
88 88
89 static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst) 89 static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
90 { 90 {
91 return dst->_neighbour; 91 return rcu_dereference(dst->_neighbour);
92 } 92 }
93 93
94 static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
95 {
96 return rcu_dereference_raw(dst->_neighbour);
97 }
98
94 static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh) 99 static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh)
95 { 100 {
96 dst->_neighbour = neigh; 101 rcu_assign_pointer(dst->_neighbour, neigh);
97 } 102 }
98 103
99 extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); 104 extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
100 extern const u32 dst_default_metrics[RTAX_MAX]; 105 extern const u32 dst_default_metrics[RTAX_MAX];
101 106
102 #define DST_METRICS_READ_ONLY 0x1UL 107 #define DST_METRICS_READ_ONLY 0x1UL
103 #define __DST_METRICS_PTR(Y) \ 108 #define __DST_METRICS_PTR(Y) \
104 ((u32 *)((Y) & ~DST_METRICS_READ_ONLY)) 109 ((u32 *)((Y) & ~DST_METRICS_READ_ONLY))
105 #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) 110 #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics)
106 111
107 static inline bool dst_metrics_read_only(const struct dst_entry *dst) 112 static inline bool dst_metrics_read_only(const struct dst_entry *dst)
108 { 113 {
109 return dst->_metrics & DST_METRICS_READ_ONLY; 114 return dst->_metrics & DST_METRICS_READ_ONLY;
110 } 115 }
111 116
112 extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); 117 extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);
113 118
114 static inline void dst_destroy_metrics_generic(struct dst_entry *dst) 119 static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
115 { 120 {
116 unsigned long val = dst->_metrics; 121 unsigned long val = dst->_metrics;
117 if (!(val & DST_METRICS_READ_ONLY)) 122 if (!(val & DST_METRICS_READ_ONLY))
118 __dst_destroy_metrics_generic(dst, val); 123 __dst_destroy_metrics_generic(dst, val);
119 } 124 }
120 125
121 static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) 126 static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
122 { 127 {
123 unsigned long p = dst->_metrics; 128 unsigned long p = dst->_metrics;
124 129
125 BUG_ON(!p); 130 BUG_ON(!p);
126 131
127 if (p & DST_METRICS_READ_ONLY) 132 if (p & DST_METRICS_READ_ONLY)
128 return dst->ops->cow_metrics(dst, p); 133 return dst->ops->cow_metrics(dst, p);
129 return __DST_METRICS_PTR(p); 134 return __DST_METRICS_PTR(p);
130 } 135 }
131 136
132 /* This may only be invoked before the entry has reached global 137 /* This may only be invoked before the entry has reached global
133 * visibility. 138 * visibility.
134 */ 139 */
135 static inline void dst_init_metrics(struct dst_entry *dst, 140 static inline void dst_init_metrics(struct dst_entry *dst,
136 const u32 *src_metrics, 141 const u32 *src_metrics,
137 bool read_only) 142 bool read_only)
138 { 143 {
139 dst->_metrics = ((unsigned long) src_metrics) | 144 dst->_metrics = ((unsigned long) src_metrics) |
140 (read_only ? DST_METRICS_READ_ONLY : 0); 145 (read_only ? DST_METRICS_READ_ONLY : 0);
141 } 146 }
142 147
143 static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) 148 static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
144 { 149 {
145 u32 *dst_metrics = dst_metrics_write_ptr(dest); 150 u32 *dst_metrics = dst_metrics_write_ptr(dest);
146 151
147 if (dst_metrics) { 152 if (dst_metrics) {
148 u32 *src_metrics = DST_METRICS_PTR(src); 153 u32 *src_metrics = DST_METRICS_PTR(src);
149 154
150 memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); 155 memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32));
151 } 156 }
152 } 157 }
153 158
154 static inline u32 *dst_metrics_ptr(struct dst_entry *dst) 159 static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
155 { 160 {
156 return DST_METRICS_PTR(dst); 161 return DST_METRICS_PTR(dst);
157 } 162 }
158 163
159 static inline u32 164 static inline u32
160 dst_metric_raw(const struct dst_entry *dst, const int metric) 165 dst_metric_raw(const struct dst_entry *dst, const int metric)
161 { 166 {
162 u32 *p = DST_METRICS_PTR(dst); 167 u32 *p = DST_METRICS_PTR(dst);
163 168
164 return p[metric-1]; 169 return p[metric-1];
165 } 170 }
166 171
167 static inline u32 172 static inline u32
168 dst_metric(const struct dst_entry *dst, const int metric) 173 dst_metric(const struct dst_entry *dst, const int metric)
169 { 174 {
170 WARN_ON_ONCE(metric == RTAX_HOPLIMIT || 175 WARN_ON_ONCE(metric == RTAX_HOPLIMIT ||
171 metric == RTAX_ADVMSS || 176 metric == RTAX_ADVMSS ||
172 metric == RTAX_MTU); 177 metric == RTAX_MTU);
173 return dst_metric_raw(dst, metric); 178 return dst_metric_raw(dst, metric);
174 } 179 }
175 180
176 static inline u32 181 static inline u32
177 dst_metric_advmss(const struct dst_entry *dst) 182 dst_metric_advmss(const struct dst_entry *dst)
178 { 183 {
179 u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); 184 u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS);
180 185
181 if (!advmss) 186 if (!advmss)
182 advmss = dst->ops->default_advmss(dst); 187 advmss = dst->ops->default_advmss(dst);
183 188
184 return advmss; 189 return advmss;
185 } 190 }
186 191
187 static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) 192 static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
188 { 193 {
189 u32 *p = dst_metrics_write_ptr(dst); 194 u32 *p = dst_metrics_write_ptr(dst);
190 195
191 if (p) 196 if (p)
192 p[metric-1] = val; 197 p[metric-1] = val;
193 } 198 }
194 199
195 static inline u32 200 static inline u32
196 dst_feature(const struct dst_entry *dst, u32 feature) 201 dst_feature(const struct dst_entry *dst, u32 feature)
197 { 202 {
198 return dst_metric(dst, RTAX_FEATURES) & feature; 203 return dst_metric(dst, RTAX_FEATURES) & feature;
199 } 204 }
200 205
201 static inline u32 dst_mtu(const struct dst_entry *dst) 206 static inline u32 dst_mtu(const struct dst_entry *dst)
202 { 207 {
203 u32 mtu = dst_metric_raw(dst, RTAX_MTU); 208 u32 mtu = dst_metric_raw(dst, RTAX_MTU);
204 209
205 if (!mtu) 210 if (!mtu)
206 mtu = dst->ops->default_mtu(dst); 211 mtu = dst->ops->default_mtu(dst);
207 212
208 return mtu; 213 return mtu;
209 } 214 }
210 215
211 /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ 216 /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
212 static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) 217 static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
213 { 218 {
214 return msecs_to_jiffies(dst_metric(dst, metric)); 219 return msecs_to_jiffies(dst_metric(dst, metric));
215 } 220 }
216 221
217 static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric, 222 static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
218 unsigned long rtt) 223 unsigned long rtt)
219 { 224 {
220 dst_metric_set(dst, metric, jiffies_to_msecs(rtt)); 225 dst_metric_set(dst, metric, jiffies_to_msecs(rtt));
221 } 226 }
222 227
223 static inline u32 228 static inline u32
224 dst_allfrag(const struct dst_entry *dst) 229 dst_allfrag(const struct dst_entry *dst)
225 { 230 {
226 int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); 231 int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG);
227 return ret; 232 return ret;
228 } 233 }
229 234
230 static inline int 235 static inline int
231 dst_metric_locked(const struct dst_entry *dst, int metric) 236 dst_metric_locked(const struct dst_entry *dst, int metric)
232 { 237 {
233 return dst_metric(dst, RTAX_LOCK) & (1<<metric); 238 return dst_metric(dst, RTAX_LOCK) & (1<<metric);
234 } 239 }
235 240
236 static inline void dst_hold(struct dst_entry * dst) 241 static inline void dst_hold(struct dst_entry * dst)
237 { 242 {
238 /* 243 /*
239 * If your kernel compilation stops here, please check 244 * If your kernel compilation stops here, please check
240 * __pad_to_align_refcnt declaration in struct dst_entry 245 * __pad_to_align_refcnt declaration in struct dst_entry
241 */ 246 */
242 BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); 247 BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
243 atomic_inc(&dst->__refcnt); 248 atomic_inc(&dst->__refcnt);
244 } 249 }
245 250
246 static inline void dst_use(struct dst_entry *dst, unsigned long time) 251 static inline void dst_use(struct dst_entry *dst, unsigned long time)
247 { 252 {
248 dst_hold(dst); 253 dst_hold(dst);
249 dst->__use++; 254 dst->__use++;
250 dst->lastuse = time; 255 dst->lastuse = time;
251 } 256 }
252 257
253 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) 258 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
254 { 259 {
255 dst->__use++; 260 dst->__use++;
256 dst->lastuse = time; 261 dst->lastuse = time;
257 } 262 }
258 263
259 static inline 264 static inline
260 struct dst_entry * dst_clone(struct dst_entry * dst) 265 struct dst_entry * dst_clone(struct dst_entry * dst)
261 { 266 {
262 if (dst) 267 if (dst)
263 atomic_inc(&dst->__refcnt); 268 atomic_inc(&dst->__refcnt);
264 return dst; 269 return dst;
265 } 270 }
266 271
267 extern void dst_release(struct dst_entry *dst); 272 extern void dst_release(struct dst_entry *dst);
268 273
269 static inline void refdst_drop(unsigned long refdst) 274 static inline void refdst_drop(unsigned long refdst)
270 { 275 {
271 if (!(refdst & SKB_DST_NOREF)) 276 if (!(refdst & SKB_DST_NOREF))
272 dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); 277 dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
273 } 278 }
274 279
275 /** 280 /**
276 * skb_dst_drop - drops skb dst 281 * skb_dst_drop - drops skb dst
277 * @skb: buffer 282 * @skb: buffer
278 * 283 *
279 * Drops dst reference count if a reference was taken. 284 * Drops dst reference count if a reference was taken.
280 */ 285 */
281 static inline void skb_dst_drop(struct sk_buff *skb) 286 static inline void skb_dst_drop(struct sk_buff *skb)
282 { 287 {
283 if (skb->_skb_refdst) { 288 if (skb->_skb_refdst) {
284 refdst_drop(skb->_skb_refdst); 289 refdst_drop(skb->_skb_refdst);
285 skb->_skb_refdst = 0UL; 290 skb->_skb_refdst = 0UL;
286 } 291 }
287 } 292 }
288 293
289 static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) 294 static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
290 { 295 {
291 nskb->_skb_refdst = oskb->_skb_refdst; 296 nskb->_skb_refdst = oskb->_skb_refdst;
292 if (!(nskb->_skb_refdst & SKB_DST_NOREF)) 297 if (!(nskb->_skb_refdst & SKB_DST_NOREF))
293 dst_clone(skb_dst(nskb)); 298 dst_clone(skb_dst(nskb));
294 } 299 }
295 300
296 /** 301 /**
297 * skb_dst_force - makes sure skb dst is refcounted 302 * skb_dst_force - makes sure skb dst is refcounted
298 * @skb: buffer 303 * @skb: buffer
299 * 304 *
300 * If dst is not yet refcounted, let's do it 305 * If dst is not yet refcounted, let's do it
301 */ 306 */
302 static inline void skb_dst_force(struct sk_buff *skb) 307 static inline void skb_dst_force(struct sk_buff *skb)
303 { 308 {
304 if (skb_dst_is_noref(skb)) { 309 if (skb_dst_is_noref(skb)) {
305 WARN_ON(!rcu_read_lock_held()); 310 WARN_ON(!rcu_read_lock_held());
306 skb->_skb_refdst &= ~SKB_DST_NOREF; 311 skb->_skb_refdst &= ~SKB_DST_NOREF;
307 dst_clone(skb_dst(skb)); 312 dst_clone(skb_dst(skb));
308 } 313 }
309 } 314 }
310 315
311 316
312 /** 317 /**
313 * __skb_tunnel_rx - prepare skb for rx reinsert 318 * __skb_tunnel_rx - prepare skb for rx reinsert
314 * @skb: buffer 319 * @skb: buffer
315 * @dev: tunnel device 320 * @dev: tunnel device
316 * 321 *
317 * After decapsulation, packet is going to re-enter (netif_rx()) our stack, 322 * After decapsulation, packet is going to re-enter (netif_rx()) our stack,
318 * so make some cleanups. (no accounting done) 323 * so make some cleanups. (no accounting done)
319 */ 324 */
320 static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) 325 static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
321 { 326 {
322 skb->dev = dev; 327 skb->dev = dev;
323 skb->rxhash = 0; 328 skb->rxhash = 0;
324 skb_set_queue_mapping(skb, 0); 329 skb_set_queue_mapping(skb, 0);
325 skb_dst_drop(skb); 330 skb_dst_drop(skb);
326 nf_reset(skb); 331 nf_reset(skb);
327 } 332 }
328 333
329 /** 334 /**
330 * skb_tunnel_rx - prepare skb for rx reinsert 335 * skb_tunnel_rx - prepare skb for rx reinsert
331 * @skb: buffer 336 * @skb: buffer
332 * @dev: tunnel device 337 * @dev: tunnel device
333 * 338 *
334 * After decapsulation, packet is going to re-enter (netif_rx()) our stack, 339 * After decapsulation, packet is going to re-enter (netif_rx()) our stack,
335 * so make some cleanups, and perform accounting. 340 * so make some cleanups, and perform accounting.
336 * Note: this accounting is not SMP safe. 341 * Note: this accounting is not SMP safe.
337 */ 342 */
338 static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) 343 static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev)
339 { 344 {
340 /* TODO : stats should be SMP safe */ 345 /* TODO : stats should be SMP safe */
341 dev->stats.rx_packets++; 346 dev->stats.rx_packets++;
342 dev->stats.rx_bytes += skb->len; 347 dev->stats.rx_bytes += skb->len;
343 __skb_tunnel_rx(skb, dev); 348 __skb_tunnel_rx(skb, dev);
344 } 349 }
345 350
346 /* Children define the path of the packet through the 351 /* Children define the path of the packet through the
347 * Linux networking. Thus, destinations are stackable. 352 * Linux networking. Thus, destinations are stackable.
348 */ 353 */
349 354
350 static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) 355 static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb)
351 { 356 {
352 struct dst_entry *child = dst_clone(skb_dst(skb)->child); 357 struct dst_entry *child = dst_clone(skb_dst(skb)->child);
353 358
354 skb_dst_drop(skb); 359 skb_dst_drop(skb);
355 return child; 360 return child;
356 } 361 }
357 362
358 extern int dst_discard(struct sk_buff *skb); 363 extern int dst_discard(struct sk_buff *skb);
359 extern void *dst_alloc(struct dst_ops * ops, struct net_device *dev, 364 extern void *dst_alloc(struct dst_ops * ops, struct net_device *dev,
360 int initial_ref, int initial_obsolete, int flags); 365 int initial_ref, int initial_obsolete, int flags);
361 extern void __dst_free(struct dst_entry * dst); 366 extern void __dst_free(struct dst_entry * dst);
362 extern struct dst_entry *dst_destroy(struct dst_entry * dst); 367 extern struct dst_entry *dst_destroy(struct dst_entry * dst);
363 368
364 static inline void dst_free(struct dst_entry * dst) 369 static inline void dst_free(struct dst_entry * dst)
365 { 370 {
366 if (dst->obsolete > 1) 371 if (dst->obsolete > 1)
367 return; 372 return;
368 if (!atomic_read(&dst->__refcnt)) { 373 if (!atomic_read(&dst->__refcnt)) {
369 dst = dst_destroy(dst); 374 dst = dst_destroy(dst);
370 if (!dst) 375 if (!dst)
371 return; 376 return;
372 } 377 }
373 __dst_free(dst); 378 __dst_free(dst);
374 } 379 }
375 380
376 static inline void dst_rcu_free(struct rcu_head *head) 381 static inline void dst_rcu_free(struct rcu_head *head)
377 { 382 {
378 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); 383 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
379 dst_free(dst); 384 dst_free(dst);
380 } 385 }
381 386
382 static inline void dst_confirm(struct dst_entry *dst) 387 static inline void dst_confirm(struct dst_entry *dst)
383 { 388 {
384 if (dst) { 389 if (dst) {
385 struct neighbour *n = dst_get_neighbour(dst); 390 struct neighbour *n;
391
392 rcu_read_lock();
393 n = dst_get_neighbour(dst);
386 neigh_confirm(n); 394 neigh_confirm(n);
395 rcu_read_unlock();
387 } 396 }
388 } 397 }
389 398
390 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) 399 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
391 { 400 {
392 return dst->ops->neigh_lookup(dst, daddr); 401 return dst->ops->neigh_lookup(dst, daddr);
393 } 402 }
394 403
395 static inline void dst_link_failure(struct sk_buff *skb) 404 static inline void dst_link_failure(struct sk_buff *skb)
396 { 405 {
397 struct dst_entry *dst = skb_dst(skb); 406 struct dst_entry *dst = skb_dst(skb);
398 if (dst && dst->ops && dst->ops->link_failure) 407 if (dst && dst->ops && dst->ops->link_failure)
399 dst->ops->link_failure(skb); 408 dst->ops->link_failure(skb);
400 } 409 }
401 410
402 static inline void dst_set_expires(struct dst_entry *dst, int timeout) 411 static inline void dst_set_expires(struct dst_entry *dst, int timeout)
403 { 412 {
404 unsigned long expires = jiffies + timeout; 413 unsigned long expires = jiffies + timeout;
405 414
406 if (expires == 0) 415 if (expires == 0)
407 expires = 1; 416 expires = 1;
408 417
409 if (dst->expires == 0 || time_before(expires, dst->expires)) 418 if (dst->expires == 0 || time_before(expires, dst->expires))
410 dst->expires = expires; 419 dst->expires = expires;
411 } 420 }
412 421
413 /* Output packet to network from transport. */ 422 /* Output packet to network from transport. */
414 static inline int dst_output(struct sk_buff *skb) 423 static inline int dst_output(struct sk_buff *skb)
415 { 424 {
416 return skb_dst(skb)->output(skb); 425 return skb_dst(skb)->output(skb);
417 } 426 }
418 427
419 /* Input packet from network to transport. */ 428 /* Input packet from network to transport. */
420 static inline int dst_input(struct sk_buff *skb) 429 static inline int dst_input(struct sk_buff *skb)
421 { 430 {
422 return skb_dst(skb)->input(skb); 431 return skb_dst(skb)->input(skb);
423 } 432 }
424 433
425 static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) 434 static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
426 { 435 {
427 if (dst->obsolete) 436 if (dst->obsolete)
428 dst = dst->ops->check(dst, cookie); 437 dst = dst->ops->check(dst, cookie);
429 return dst; 438 return dst;
430 } 439 }
431 440
432 extern void dst_init(void); 441 extern void dst_init(void);
433 442
434 /* Flags for xfrm_lookup flags argument. */ 443 /* Flags for xfrm_lookup flags argument. */
435 enum { 444 enum {
436 XFRM_LOOKUP_ICMP = 1 << 0, 445 XFRM_LOOKUP_ICMP = 1 << 0,
437 }; 446 };
438 447
439 struct flowi; 448 struct flowi;
440 #ifndef CONFIG_XFRM 449 #ifndef CONFIG_XFRM
441 static inline struct dst_entry *xfrm_lookup(struct net *net, 450 static inline struct dst_entry *xfrm_lookup(struct net *net,
442 struct dst_entry *dst_orig, 451 struct dst_entry *dst_orig,
443 const struct flowi *fl, struct sock *sk, 452 const struct flowi *fl, struct sock *sk,
444 int flags) 453 int flags)
445 { 454 {
446 return dst_orig; 455 return dst_orig;
447 } 456 }
448 #else 457 #else
449 extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, 458 extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
450 const struct flowi *fl, struct sock *sk, 459 const struct flowi *fl, struct sock *sk,
451 int flags); 460 int flags);
452 #endif 461 #endif
453 462
454 #endif /* _NET_DST_H */ 463 #endif /* _NET_DST_H */
455 464
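
Note the new dst_get_neighbour_raw() accessor added above: it wraps
rcu_dereference_raw(), which skips the lockdep check that plain
rcu_dereference() performs. It is meant for call sites that are safe by
construction and therefore take no rcu_read_lock() of their own. A
sketch of such a caller follows; obj_teardown() is a hypothetical
function, assuming the dst can no longer be reached by other CPUs.

static void obj_teardown(struct dst_entry *dst)
{
        /* Safe without rcu_read_lock(): nothing else can reach this
         * dst anymore, so _neighbour cannot change under us.
         */
        struct neighbour *n = dst_get_neighbour_raw(dst);

        if (n)
                neigh_release(n);
}
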
net/ipv4/ip_output.c
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * The Internet Protocol (IP) output module. 6 * The Internet Protocol (IP) output module.
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org> 10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org> 11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood 12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de> 13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net> 14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp> 16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 * 17 *
18 * See ip_input.c for original log 18 * See ip_input.c for original log
19 * 19 *
20 * Fixes: 20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit. 21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit. 22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when 23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found. 24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit 25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by 26 * (in case if packet not accepted by
27 * output firewall rules) 27 * output firewall rules)
28 * Mike McLagan : Routing by source 28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache 29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove 30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests. 31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma. 32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply. 33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path 34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86 35 * for decreased register pressure on x86
36 * and more readibility. 36 * and more readibility.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE, 37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM. 38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments. 39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP 40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams. 41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now. 42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */ 43 */
44 44
45 #include <asm/uaccess.h> 45 #include <asm/uaccess.h>
46 #include <asm/system.h> 46 #include <asm/system.h>
47 #include <linux/module.h> 47 #include <linux/module.h>
48 #include <linux/types.h> 48 #include <linux/types.h>
49 #include <linux/kernel.h> 49 #include <linux/kernel.h>
50 #include <linux/mm.h> 50 #include <linux/mm.h>
51 #include <linux/string.h> 51 #include <linux/string.h>
52 #include <linux/errno.h> 52 #include <linux/errno.h>
53 #include <linux/highmem.h> 53 #include <linux/highmem.h>
54 #include <linux/slab.h> 54 #include <linux/slab.h>
55 55
56 #include <linux/socket.h> 56 #include <linux/socket.h>
57 #include <linux/sockios.h> 57 #include <linux/sockios.h>
58 #include <linux/in.h> 58 #include <linux/in.h>
59 #include <linux/inet.h> 59 #include <linux/inet.h>
60 #include <linux/netdevice.h> 60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h> 61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h> 62 #include <linux/proc_fs.h>
63 #include <linux/stat.h> 63 #include <linux/stat.h>
64 #include <linux/init.h> 64 #include <linux/init.h>
65 65
66 #include <net/snmp.h> 66 #include <net/snmp.h>
67 #include <net/ip.h> 67 #include <net/ip.h>
68 #include <net/protocol.h> 68 #include <net/protocol.h>
69 #include <net/route.h> 69 #include <net/route.h>
70 #include <net/xfrm.h> 70 #include <net/xfrm.h>
71 #include <linux/skbuff.h> 71 #include <linux/skbuff.h>
72 #include <net/sock.h> 72 #include <net/sock.h>
73 #include <net/arp.h> 73 #include <net/arp.h>
74 #include <net/icmp.h> 74 #include <net/icmp.h>
75 #include <net/checksum.h> 75 #include <net/checksum.h>
76 #include <net/inetpeer.h> 76 #include <net/inetpeer.h>
77 #include <linux/igmp.h> 77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h> 78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h> 79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h> 80 #include <linux/mroute.h>
81 #include <linux/netlink.h> 81 #include <linux/netlink.h>
82 #include <linux/tcp.h> 82 #include <linux/tcp.h>
83 83
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl); 85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 86
87 /* Generate a checksum for an outgoing IP datagram. */ 87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph) 88 __inline__ void ip_send_check(struct iphdr *iph)
89 { 89 {
90 iph->check = 0; 90 iph->check = 0;
91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 } 92 }
93 EXPORT_SYMBOL(ip_send_check); 93 EXPORT_SYMBOL(ip_send_check);
94 94
95 int __ip_local_out(struct sk_buff *skb) 95 int __ip_local_out(struct sk_buff *skb)
96 { 96 {
97 struct iphdr *iph = ip_hdr(skb); 97 struct iphdr *iph = ip_hdr(skb);
98 98
99 iph->tot_len = htons(skb->len); 99 iph->tot_len = htons(skb->len);
100 ip_send_check(iph); 100 ip_send_check(iph);
101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, 101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 skb_dst(skb)->dev, dst_output); 102 skb_dst(skb)->dev, dst_output);
103 } 103 }
104 104
105 int ip_local_out(struct sk_buff *skb) 105 int ip_local_out(struct sk_buff *skb)
106 { 106 {
107 int err; 107 int err;
108 108
109 err = __ip_local_out(skb); 109 err = __ip_local_out(skb);
110 if (likely(err == 1)) 110 if (likely(err == 1))
111 err = dst_output(skb); 111 err = dst_output(skb);
112 112
113 return err; 113 return err;
114 } 114 }
115 EXPORT_SYMBOL_GPL(ip_local_out); 115 EXPORT_SYMBOL_GPL(ip_local_out);
116 116
117 /* dev_loopback_xmit for use with netfilter. */ 117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb) 118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 { 119 {
120 skb_reset_mac_header(newskb); 120 skb_reset_mac_header(newskb);
121 __skb_pull(newskb, skb_network_offset(newskb)); 121 __skb_pull(newskb, skb_network_offset(newskb));
122 newskb->pkt_type = PACKET_LOOPBACK; 122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY; 123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 WARN_ON(!skb_dst(newskb)); 124 WARN_ON(!skb_dst(newskb));
125 netif_rx_ni(newskb); 125 netif_rx_ni(newskb);
126 return 0; 126 return 0;
127 } 127 }
128 128
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 { 130 {
131 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
132 132
133 if (ttl < 0) 133 if (ttl < 0)
134 ttl = ip4_dst_hoplimit(dst); 134 ttl = ip4_dst_hoplimit(dst);
135 return ttl; 135 return ttl;
136 } 136 }
137 137
138 /* 138 /*
139 * Add an ip header to a skbuff and send it out. 139 * Add an ip header to a skbuff and send it out.
140 * 140 *
141 */ 141 */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) 143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144 { 144 {
145 struct inet_sock *inet = inet_sk(sk); 145 struct inet_sock *inet = inet_sk(sk);
146 struct rtable *rt = skb_rtable(skb); 146 struct rtable *rt = skb_rtable(skb);
147 struct iphdr *iph; 147 struct iphdr *iph;
148 148
149 /* Build the IP header. */ 149 /* Build the IP header. */
150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); 150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 skb_reset_network_header(skb); 151 skb_reset_network_header(skb);
152 iph = ip_hdr(skb); 152 iph = ip_hdr(skb);
153 iph->version = 4; 153 iph->version = 4;
154 iph->ihl = 5; 154 iph->ihl = 5;
155 iph->tos = inet->tos; 155 iph->tos = inet->tos;
156 if (ip_dont_fragment(sk, &rt->dst)) 156 if (ip_dont_fragment(sk, &rt->dst))
157 iph->frag_off = htons(IP_DF); 157 iph->frag_off = htons(IP_DF);
158 else 158 else
159 iph->frag_off = 0; 159 iph->frag_off = 0;
160 iph->ttl = ip_select_ttl(inet, &rt->dst); 160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); 161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr; 162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol; 163 iph->protocol = sk->sk_protocol;
164 ip_select_ident(iph, &rt->dst, sk); 164 ip_select_ident(iph, &rt->dst, sk);
165 165
166 if (opt && opt->opt.optlen) { 166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2; 167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0); 168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 } 169 }
170 170
171 skb->priority = sk->sk_priority; 171 skb->priority = sk->sk_priority;
172 skb->mark = sk->sk_mark; 172 skb->mark = sk->sk_mark;
173 173
174 /* Send it out. */ 174 /* Send it out. */
175 return ip_local_out(skb); 175 return ip_local_out(skb);
176 } 176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 178
179 static inline int ip_finish_output2(struct sk_buff *skb) 179 static inline int ip_finish_output2(struct sk_buff *skb)
180 { 180 {
181 struct dst_entry *dst = skb_dst(skb); 181 struct dst_entry *dst = skb_dst(skb);
182 struct rtable *rt = (struct rtable *)dst; 182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev; 183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev); 184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh; 185 struct neighbour *neigh;
186 186
187 if (rt->rt_type == RTN_MULTICAST) { 187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST) 189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); 190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191 191
192 /* Be paranoid, rather than too clever. */ 192 /* Be paranoid, rather than too clever. */
193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 struct sk_buff *skb2; 194 struct sk_buff *skb2;
195 195
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) { 197 if (skb2 == NULL) {
198 kfree_skb(skb); 198 kfree_skb(skb);
199 return -ENOMEM; 199 return -ENOMEM;
200 } 200 }
201 if (skb->sk) 201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk); 202 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb); 203 kfree_skb(skb);
204 skb = skb2; 204 skb = skb2;
205 } 205 }
206 206
207 rcu_read_lock();
207 neigh = dst_get_neighbour(dst); 208 neigh = dst_get_neighbour(dst);
208 if (neigh) 209 if (neigh) {
209 return neigh_output(neigh, skb); 210 int res = neigh_output(neigh, skb);
211
212 rcu_read_unlock();
213 return res;
214 }
215 rcu_read_unlock();
210 216
211 if (net_ratelimit()) 217 if (net_ratelimit())
212 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); 218 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 kfree_skb(skb); 219 kfree_skb(skb);
214 return -EINVAL; 220 return -EINVAL;
215 } 221 }
216 222
217 static inline int ip_skb_dst_mtu(struct sk_buff *skb) 223 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218 { 224 {
219 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; 225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220 226
221 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? 227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); 228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223 } 229 }
224 230
225 static int ip_finish_output(struct sk_buff *skb) 231 static int ip_finish_output(struct sk_buff *skb)
226 { 232 {
227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 233 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228 /* Policy lookup after SNAT yielded a new policy */ 234 /* Policy lookup after SNAT yielded a new policy */
229 if (skb_dst(skb)->xfrm != NULL) { 235 if (skb_dst(skb)->xfrm != NULL) {
230 IPCB(skb)->flags |= IPSKB_REROUTED; 236 IPCB(skb)->flags |= IPSKB_REROUTED;
231 return dst_output(skb); 237 return dst_output(skb);
232 } 238 }
233 #endif 239 #endif
234 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) 240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235 return ip_fragment(skb, ip_finish_output2); 241 return ip_fragment(skb, ip_finish_output2);
236 else 242 else
237 return ip_finish_output2(skb); 243 return ip_finish_output2(skb);
238 } 244 }
239 245
240 int ip_mc_output(struct sk_buff *skb) 246 int ip_mc_output(struct sk_buff *skb)
241 { 247 {
242 struct sock *sk = skb->sk; 248 struct sock *sk = skb->sk;
243 struct rtable *rt = skb_rtable(skb); 249 struct rtable *rt = skb_rtable(skb);
244 struct net_device *dev = rt->dst.dev; 250 struct net_device *dev = rt->dst.dev;
245 251
246 /* 252 /*
247 * If the indicated interface is up and running, send the packet. 253 * If the indicated interface is up and running, send the packet.
248 */ 254 */
249 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); 255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250 256
251 skb->dev = dev; 257 skb->dev = dev;
252 skb->protocol = htons(ETH_P_IP); 258 skb->protocol = htons(ETH_P_IP);
253 259
254 /* 260 /*
255 * Multicasts are looped back for other local users 261 * Multicasts are looped back for other local users
256 */ 262 */
257 263
258 if (rt->rt_flags&RTCF_MULTICAST) { 264 if (rt->rt_flags&RTCF_MULTICAST) {
259 if (sk_mc_loop(sk) 265 if (sk_mc_loop(sk)
260 #ifdef CONFIG_IP_MROUTE 266 #ifdef CONFIG_IP_MROUTE
261 /* Small optimization: do not loopback not local frames, 267 /* Small optimization: do not loopback not local frames,
262 which returned after forwarding; they will be dropped 268 which returned after forwarding; they will be dropped
263 by ip_mr_input in any case. 269 by ip_mr_input in any case.
264 Note, that local frames are looped back to be delivered 270 Note, that local frames are looped back to be delivered
265 to local recipients. 271 to local recipients.
266 272
267 This check is duplicated in ip_mr_input at the moment. 273 This check is duplicated in ip_mr_input at the moment.
268 */ 274 */
269 && 275 &&
270 ((rt->rt_flags & RTCF_LOCAL) || 276 ((rt->rt_flags & RTCF_LOCAL) ||
271 !(IPCB(skb)->flags & IPSKB_FORWARDED)) 277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
272 #endif 278 #endif
273 ) { 279 ) {
274 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275 if (newskb) 281 if (newskb)
276 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277 newskb, NULL, newskb->dev, 283 newskb, NULL, newskb->dev,
278 ip_dev_loopback_xmit); 284 ip_dev_loopback_xmit);
279 } 285 }
280 286
281 /* Multicasts with ttl 0 must not go beyond the host */ 287 /* Multicasts with ttl 0 must not go beyond the host */
282 288
283 if (ip_hdr(skb)->ttl == 0) { 289 if (ip_hdr(skb)->ttl == 0) {
284 kfree_skb(skb); 290 kfree_skb(skb);
285 return 0; 291 return 0;
286 } 292 }
287 } 293 }
288 294
289 if (rt->rt_flags&RTCF_BROADCAST) { 295 if (rt->rt_flags&RTCF_BROADCAST) {
290 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291 if (newskb) 297 if (newskb)
292 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293 NULL, newskb->dev, ip_dev_loopback_xmit); 299 NULL, newskb->dev, ip_dev_loopback_xmit);
294 } 300 }
295 301
296 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297 skb->dev, ip_finish_output, 303 skb->dev, ip_finish_output,
298 !(IPCB(skb)->flags & IPSKB_REROUTED)); 304 !(IPCB(skb)->flags & IPSKB_REROUTED));
299 } 305 }
300 306
301 int ip_output(struct sk_buff *skb) 307 int ip_output(struct sk_buff *skb)
302 { 308 {
303 struct net_device *dev = skb_dst(skb)->dev; 309 struct net_device *dev = skb_dst(skb)->dev;
304 310
305 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); 311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306 312
307 skb->dev = dev; 313 skb->dev = dev;
308 skb->protocol = htons(ETH_P_IP); 314 skb->protocol = htons(ETH_P_IP);
309 315
310 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, 316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311 ip_finish_output, 317 ip_finish_output,
312 !(IPCB(skb)->flags & IPSKB_REROUTED)); 318 !(IPCB(skb)->flags & IPSKB_REROUTED));
313 } 319 }
314 320
315 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) 321 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
316 { 322 {
317 struct sock *sk = skb->sk; 323 struct sock *sk = skb->sk;
318 struct inet_sock *inet = inet_sk(sk); 324 struct inet_sock *inet = inet_sk(sk);
319 struct ip_options_rcu *inet_opt; 325 struct ip_options_rcu *inet_opt;
320 struct flowi4 *fl4; 326 struct flowi4 *fl4;
321 struct rtable *rt; 327 struct rtable *rt;
322 struct iphdr *iph; 328 struct iphdr *iph;
323 int res; 329 int res;
324 330
325 /* Skip all of this if the packet is already routed, 331 /* Skip all of this if the packet is already routed,
326 * f.e. by something like SCTP. 332 * f.e. by something like SCTP.
327 */ 333 */
328 rcu_read_lock(); 334 rcu_read_lock();
329 inet_opt = rcu_dereference(inet->inet_opt); 335 inet_opt = rcu_dereference(inet->inet_opt);
330 fl4 = &fl->u.ip4; 336 fl4 = &fl->u.ip4;
331 rt = skb_rtable(skb); 337 rt = skb_rtable(skb);
332 if (rt != NULL) 338 if (rt != NULL)
333 goto packet_routed; 339 goto packet_routed;
334 340
335 /* Make sure we can route this packet. */ 341 /* Make sure we can route this packet. */
336 rt = (struct rtable *)__sk_dst_check(sk, 0); 342 rt = (struct rtable *)__sk_dst_check(sk, 0);
337 if (rt == NULL) { 343 if (rt == NULL) {
338 __be32 daddr; 344 __be32 daddr;
339 345
340 /* Use correct destination address if we have options. */ 346 /* Use correct destination address if we have options. */
341 daddr = inet->inet_daddr; 347 daddr = inet->inet_daddr;
342 if (inet_opt && inet_opt->opt.srr) 348 if (inet_opt && inet_opt->opt.srr)
343 daddr = inet_opt->opt.faddr; 349 daddr = inet_opt->opt.faddr;
344 350
345 /* If this fails, retransmit mechanism of transport layer will 351 /* If this fails, retransmit mechanism of transport layer will
346 * keep trying until route appears or the connection times 352 * keep trying until route appears or the connection times
347 * itself out. 353 * itself out.
348 */ 354 */
349 rt = ip_route_output_ports(sock_net(sk), fl4, sk, 355 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
350 daddr, inet->inet_saddr, 356 daddr, inet->inet_saddr,
351 inet->inet_dport, 357 inet->inet_dport,
352 inet->inet_sport, 358 inet->inet_sport,
353 sk->sk_protocol, 359 sk->sk_protocol,
354 RT_CONN_FLAGS(sk), 360 RT_CONN_FLAGS(sk),
355 sk->sk_bound_dev_if); 361 sk->sk_bound_dev_if);
356 if (IS_ERR(rt)) 362 if (IS_ERR(rt))
357 goto no_route; 363 goto no_route;
358 sk_setup_caps(sk, &rt->dst); 364 sk_setup_caps(sk, &rt->dst);
359 } 365 }
360 skb_dst_set_noref(skb, &rt->dst); 366 skb_dst_set_noref(skb, &rt->dst);
361 367
362 packet_routed: 368 packet_routed:
363 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 369 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
364 goto no_route; 370 goto no_route;
365 371
366 /* OK, we know where to send it, allocate and build IP header. */ 372 /* OK, we know where to send it, allocate and build IP header. */
367 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); 373 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
368 skb_reset_network_header(skb); 374 skb_reset_network_header(skb);
369 iph = ip_hdr(skb); 375 iph = ip_hdr(skb);
370 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 376 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
371 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) 377 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
372 iph->frag_off = htons(IP_DF); 378 iph->frag_off = htons(IP_DF);
373 else 379 else
374 iph->frag_off = 0; 380 iph->frag_off = 0;
375 iph->ttl = ip_select_ttl(inet, &rt->dst); 381 iph->ttl = ip_select_ttl(inet, &rt->dst);
376 iph->protocol = sk->sk_protocol; 382 iph->protocol = sk->sk_protocol;
377 iph->saddr = fl4->saddr; 383 iph->saddr = fl4->saddr;
378 iph->daddr = fl4->daddr; 384 iph->daddr = fl4->daddr;
379 /* Transport layer set skb->h.foo itself. */ 385 /* Transport layer set skb->h.foo itself. */
380 386
381 if (inet_opt && inet_opt->opt.optlen) { 387 if (inet_opt && inet_opt->opt.optlen) {
382 iph->ihl += inet_opt->opt.optlen >> 2; 388 iph->ihl += inet_opt->opt.optlen >> 2;
383 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); 389 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
384 } 390 }
385 391
386 ip_select_ident_more(iph, &rt->dst, sk, 392 ip_select_ident_more(iph, &rt->dst, sk,
387 (skb_shinfo(skb)->gso_segs ?: 1) - 1); 393 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
388 394
389 skb->priority = sk->sk_priority; 395 skb->priority = sk->sk_priority;
390 skb->mark = sk->sk_mark; 396 skb->mark = sk->sk_mark;
391 397
392 res = ip_local_out(skb); 398 res = ip_local_out(skb);
393 rcu_read_unlock(); 399 rcu_read_unlock();
394 return res; 400 return res;
395 401
396 no_route: 402 no_route:
397 rcu_read_unlock(); 403 rcu_read_unlock();
398 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 404 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
399 kfree_skb(skb); 405 kfree_skb(skb);
400 return -EHOSTUNREACH; 406 return -EHOSTUNREACH;
401 } 407 }
402 EXPORT_SYMBOL(ip_queue_xmit); 408 EXPORT_SYMBOL(ip_queue_xmit);
403 409
404 410
405 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 411 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
406 { 412 {
407 to->pkt_type = from->pkt_type; 413 to->pkt_type = from->pkt_type;
408 to->priority = from->priority; 414 to->priority = from->priority;
409 to->protocol = from->protocol; 415 to->protocol = from->protocol;
410 skb_dst_drop(to); 416 skb_dst_drop(to);
411 skb_dst_copy(to, from); 417 skb_dst_copy(to, from);
412 to->dev = from->dev; 418 to->dev = from->dev;
413 to->mark = from->mark; 419 to->mark = from->mark;
414 420
415 /* Copy the flags to each fragment. */ 421 /* Copy the flags to each fragment. */
416 IPCB(to)->flags = IPCB(from)->flags; 422 IPCB(to)->flags = IPCB(from)->flags;
417 423
418 #ifdef CONFIG_NET_SCHED 424 #ifdef CONFIG_NET_SCHED
419 to->tc_index = from->tc_index; 425 to->tc_index = from->tc_index;
420 #endif 426 #endif
421 nf_copy(to, from); 427 nf_copy(to, from);
422 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ 428 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
423 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) 429 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
424 to->nf_trace = from->nf_trace; 430 to->nf_trace = from->nf_trace;
425 #endif 431 #endif
426 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 432 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
427 to->ipvs_property = from->ipvs_property; 433 to->ipvs_property = from->ipvs_property;
428 #endif 434 #endif
429 skb_copy_secmark(to, from); 435 skb_copy_secmark(to, from);
430 } 436 }
431 437
432 /* 438 /*
433 * This IP datagram is too large to be sent in one piece. Break it up into 439 * This IP datagram is too large to be sent in one piece. Break it up into
434 * smaller pieces (each of size equal to IP header plus 440 * smaller pieces (each of size equal to IP header plus
435 * a block of the data of the original IP data part) that will yet fit in a 441 * a block of the data of the original IP data part) that will yet fit in a
436 * single device frame, and queue such a frame for sending. 442 * single device frame, and queue such a frame for sending.
437 */ 443 */
438 444
439 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 445 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
440 { 446 {
441 struct iphdr *iph; 447 struct iphdr *iph;
442 int ptr; 448 int ptr;
443 struct net_device *dev; 449 struct net_device *dev;
444 struct sk_buff *skb2; 450 struct sk_buff *skb2;
445 unsigned int mtu, hlen, left, len, ll_rs; 451 unsigned int mtu, hlen, left, len, ll_rs;
446 int offset; 452 int offset;
447 __be16 not_last_frag; 453 __be16 not_last_frag;
448 struct rtable *rt = skb_rtable(skb); 454 struct rtable *rt = skb_rtable(skb);
449 int err = 0; 455 int err = 0;
450 456
451 dev = rt->dst.dev; 457 dev = rt->dst.dev;
452 458
453 /* 459 /*
454 * Point into the IP datagram header. 460 * Point into the IP datagram header.
455 */ 461 */
456 462
457 iph = ip_hdr(skb); 463 iph = ip_hdr(skb);
458 464
459 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 465 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
460 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 466 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
461 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 467 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
462 htonl(ip_skb_dst_mtu(skb))); 468 htonl(ip_skb_dst_mtu(skb)));
463 kfree_skb(skb); 469 kfree_skb(skb);
464 return -EMSGSIZE; 470 return -EMSGSIZE;
465 } 471 }
466 472
467 /* 473 /*
468 * Setup starting values. 474 * Setup starting values.
469 */ 475 */
470 476
471 hlen = iph->ihl * 4; 477 hlen = iph->ihl * 4;
472 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ 478 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
473 #ifdef CONFIG_BRIDGE_NETFILTER 479 #ifdef CONFIG_BRIDGE_NETFILTER
474 if (skb->nf_bridge) 480 if (skb->nf_bridge)
475 mtu -= nf_bridge_mtu_reduction(skb); 481 mtu -= nf_bridge_mtu_reduction(skb);
476 #endif 482 #endif
477 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 483 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
478 484
479 /* When frag_list is given, use it. First, check its validity: 485 /* When frag_list is given, use it. First, check its validity:
480 * some transformers could create wrong frag_list or break existing 486 * some transformers could create wrong frag_list or break existing
481 * one, it is not prohibited. In this case fall back to copying. 487 * one, it is not prohibited. In this case fall back to copying.
482 * 488 *
483 * LATER: this step can be merged to real generation of fragments, 489 * LATER: this step can be merged to real generation of fragments,
484 * we can switch to copy when see the first bad fragment. 490 * we can switch to copy when see the first bad fragment.
485 */ 491 */
486 if (skb_has_frag_list(skb)) { 492 if (skb_has_frag_list(skb)) {
487 struct sk_buff *frag, *frag2; 493 struct sk_buff *frag, *frag2;
488 int first_len = skb_pagelen(skb); 494 int first_len = skb_pagelen(skb);
489 495
490 if (first_len - hlen > mtu || 496 if (first_len - hlen > mtu ||
491 ((first_len - hlen) & 7) || 497 ((first_len - hlen) & 7) ||
492 ip_is_fragment(iph) || 498 ip_is_fragment(iph) ||
493 skb_cloned(skb)) 499 skb_cloned(skb))
494 goto slow_path; 500 goto slow_path;
495 501
496 skb_walk_frags(skb, frag) { 502 skb_walk_frags(skb, frag) {
497 /* Correct geometry. */ 503 /* Correct geometry. */
498 if (frag->len > mtu || 504 if (frag->len > mtu ||
499 ((frag->len & 7) && frag->next) || 505 ((frag->len & 7) && frag->next) ||
500 skb_headroom(frag) < hlen) 506 skb_headroom(frag) < hlen)
501 goto slow_path_clean; 507 goto slow_path_clean;
502 508
503 /* Partially cloned skb? */ 509 /* Partially cloned skb? */
504 if (skb_shared(frag)) 510 if (skb_shared(frag))
505 goto slow_path_clean; 511 goto slow_path_clean;
506 512
507 BUG_ON(frag->sk); 513 BUG_ON(frag->sk);
508 if (skb->sk) { 514 if (skb->sk) {
509 frag->sk = skb->sk; 515 frag->sk = skb->sk;
510 frag->destructor = sock_wfree; 516 frag->destructor = sock_wfree;
511 } 517 }
512 skb->truesize -= frag->truesize; 518 skb->truesize -= frag->truesize;
513 } 519 }
514 520
515 /* Everything is OK. Generate! */ 521 /* Everything is OK. Generate! */
516 522
517 err = 0; 523 err = 0;
518 offset = 0; 524 offset = 0;
519 frag = skb_shinfo(skb)->frag_list; 525 frag = skb_shinfo(skb)->frag_list;
520 skb_frag_list_init(skb); 526 skb_frag_list_init(skb);
521 skb->data_len = first_len - skb_headlen(skb); 527 skb->data_len = first_len - skb_headlen(skb);
522 skb->len = first_len; 528 skb->len = first_len;
523 iph->tot_len = htons(first_len); 529 iph->tot_len = htons(first_len);
524 iph->frag_off = htons(IP_MF); 530 iph->frag_off = htons(IP_MF);
525 ip_send_check(iph); 531 ip_send_check(iph);
526 532
527 for (;;) { 533 for (;;) {
528 /* Prepare header of the next frame, 534 /* Prepare header of the next frame,
529 * before previous one went down. */ 535 * before previous one went down. */
530 if (frag) { 536 if (frag) {
531 frag->ip_summed = CHECKSUM_NONE; 537 frag->ip_summed = CHECKSUM_NONE;
532 skb_reset_transport_header(frag); 538 skb_reset_transport_header(frag);
533 __skb_push(frag, hlen); 539 __skb_push(frag, hlen);
534 skb_reset_network_header(frag); 540 skb_reset_network_header(frag);
535 memcpy(skb_network_header(frag), iph, hlen); 541 memcpy(skb_network_header(frag), iph, hlen);
536 iph = ip_hdr(frag); 542 iph = ip_hdr(frag);
537 iph->tot_len = htons(frag->len); 543 iph->tot_len = htons(frag->len);
538 ip_copy_metadata(frag, skb); 544 ip_copy_metadata(frag, skb);
539 if (offset == 0) 545 if (offset == 0)
540 ip_options_fragment(frag); 546 ip_options_fragment(frag);
541 offset += skb->len - hlen; 547 offset += skb->len - hlen;
542 iph->frag_off = htons(offset>>3); 548 iph->frag_off = htons(offset>>3);
543 if (frag->next != NULL) 549 if (frag->next != NULL)
544 iph->frag_off |= htons(IP_MF); 550 iph->frag_off |= htons(IP_MF);
545 /* Ready, complete checksum */ 551 /* Ready, complete checksum */
546 ip_send_check(iph); 552 ip_send_check(iph);
547 } 553 }
548 554
549 err = output(skb); 555 err = output(skb);
550 556
551 if (!err) 557 if (!err)
552 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 558 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
553 if (err || !frag) 559 if (err || !frag)
554 break; 560 break;
555 561
556 skb = frag; 562 skb = frag;
557 frag = skb->next; 563 frag = skb->next;
558 skb->next = NULL; 564 skb->next = NULL;
559 } 565 }
560 566
561 if (err == 0) { 567 if (err == 0) {
562 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 568 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
563 return 0; 569 return 0;
564 } 570 }
565 571
566 while (frag) { 572 while (frag) {
567 skb = frag->next; 573 skb = frag->next;
568 kfree_skb(frag); 574 kfree_skb(frag);
569 frag = skb; 575 frag = skb;
570 } 576 }
571 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
572 return err; 578 return err;
573 579
574 slow_path_clean: 580 slow_path_clean:
575 skb_walk_frags(skb, frag2) { 581 skb_walk_frags(skb, frag2) {
576 if (frag2 == frag) 582 if (frag2 == frag)
577 break; 583 break;
578 frag2->sk = NULL; 584 frag2->sk = NULL;
579 frag2->destructor = NULL; 585 frag2->destructor = NULL;
580 skb->truesize += frag2->truesize; 586 skb->truesize += frag2->truesize;
581 } 587 }
582 } 588 }
583 589
584 slow_path: 590 slow_path:
585 left = skb->len - hlen; /* Space per frame */ 591 left = skb->len - hlen; /* Space per frame */
586 ptr = hlen; /* Where to start from */ 592 ptr = hlen; /* Where to start from */
587 593
588 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 594 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
589 * we need to make room for the encapsulating header 595 * we need to make room for the encapsulating header
590 */ 596 */
591 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); 597 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
592 598
593 /* 599 /*
594 * Fragment the datagram. 600 * Fragment the datagram.
595 */ 601 */
596 602
597 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; 603 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
598 not_last_frag = iph->frag_off & htons(IP_MF); 604 not_last_frag = iph->frag_off & htons(IP_MF);
599 605
600 /* 606 /*
601 * Keep copying data until we run out. 607 * Keep copying data until we run out.
602 */ 608 */
603 609
604 while (left > 0) { 610 while (left > 0) {
605 len = left; 611 len = left;
606 /* IF: it doesn't fit, use 'mtu' - the data space left */ 612 /* IF: it doesn't fit, use 'mtu' - the data space left */
607 if (len > mtu) 613 if (len > mtu)
608 len = mtu; 614 len = mtu;
609 /* IF: we are not sending up to and including the packet end 615 /* IF: we are not sending up to and including the packet end
610 then align the next start on an eight byte boundary */ 616 then align the next start on an eight byte boundary */
611 if (len < left) { 617 if (len < left) {
612 len &= ~7; 618 len &= ~7;
613 } 619 }
614 /* 620 /*
615 * Allocate buffer. 621 * Allocate buffer.
616 */ 622 */
617 623
618 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 624 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
619 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); 625 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
620 err = -ENOMEM; 626 err = -ENOMEM;
621 goto fail; 627 goto fail;
622 } 628 }
623 629
624 /* 630 /*
625 * Set up data on packet 631 * Set up data on packet
626 */ 632 */
627 633
628 ip_copy_metadata(skb2, skb); 634 ip_copy_metadata(skb2, skb);
629 skb_reserve(skb2, ll_rs); 635 skb_reserve(skb2, ll_rs);
630 skb_put(skb2, len + hlen); 636 skb_put(skb2, len + hlen);
631 skb_reset_network_header(skb2); 637 skb_reset_network_header(skb2);
632 skb2->transport_header = skb2->network_header + hlen; 638 skb2->transport_header = skb2->network_header + hlen;
633 639
634 /* 640 /*
635 * Charge the memory for the fragment to any owner 641 * Charge the memory for the fragment to any owner
636 * it might possess 642 * it might possess
637 */ 643 */
638 644
639 if (skb->sk) 645 if (skb->sk)
640 skb_set_owner_w(skb2, skb->sk); 646 skb_set_owner_w(skb2, skb->sk);
641 647
642 /* 648 /*
643 * Copy the packet header into the new buffer. 649 * Copy the packet header into the new buffer.
644 */ 650 */
645 651
646 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); 652 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
647 653
648 /* 654 /*
649 * Copy a block of the IP datagram. 655 * Copy a block of the IP datagram.
650 */ 656 */
651 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) 657 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
652 BUG(); 658 BUG();
653 left -= len; 659 left -= len;
654 660
655 /* 661 /*
656 * Fill in the new header fields. 662 * Fill in the new header fields.
657 */ 663 */
658 iph = ip_hdr(skb2); 664 iph = ip_hdr(skb2);
659 iph->frag_off = htons((offset >> 3)); 665 iph->frag_off = htons((offset >> 3));
660 666
661 /* ANK: dirty, but effective trick. Upgrade options only if 667 /* ANK: dirty, but effective trick. Upgrade options only if
662 * the segment to be fragmented was THE FIRST (otherwise, 668 * the segment to be fragmented was THE FIRST (otherwise,
663 * options are already fixed) and make it ONCE 669 * options are already fixed) and make it ONCE
664 * on the initial skb, so that all the following fragments 670 * on the initial skb, so that all the following fragments
665 * will inherit fixed options. 671 * will inherit fixed options.
666 */ 672 */
667 if (offset == 0) 673 if (offset == 0)
668 ip_options_fragment(skb); 674 ip_options_fragment(skb);
669 675
670 /* 676 /*
671 * Added AC : If we are fragmenting a fragment that's not the 677 * Added AC : If we are fragmenting a fragment that's not the
672 * last fragment then keep the MF bit set on each fragment 678 * last fragment then keep the MF bit set on each fragment
673 */ 679 */
674 if (left > 0 || not_last_frag) 680 if (left > 0 || not_last_frag)
675 iph->frag_off |= htons(IP_MF); 681 iph->frag_off |= htons(IP_MF);
676 ptr += len; 682 ptr += len;
677 offset += len; 683 offset += len;
678 684
679 /* 685 /*
680 * Put this fragment into the sending queue. 686 * Put this fragment into the sending queue.
681 */ 687 */
682 iph->tot_len = htons(len + hlen); 688 iph->tot_len = htons(len + hlen);
683 689
684 ip_send_check(iph); 690 ip_send_check(iph);
685 691
686 err = output(skb2); 692 err = output(skb2);
687 if (err) 693 if (err)
688 goto fail; 694 goto fail;
689 695
690 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 696 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
691 } 697 }
692 kfree_skb(skb); 698 kfree_skb(skb);
693 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 699 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
694 return err; 700 return err;
695 701
696 fail: 702 fail:
697 kfree_skb(skb); 703 kfree_skb(skb);
698 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
699 return err; 705 return err;
700 } 706 }
701 EXPORT_SYMBOL(ip_fragment); 707 EXPORT_SYMBOL(ip_fragment);
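
The slow path above is driven by two invariants of IPv4 fragmentation: iph->frag_off stores the payload offset in 8-byte units, so every fragment except the last is trimmed with len &= ~7, and every fragment except the last carries IP_MF. A minimal user-space sketch of that arithmetic (mtu_data stands for the data space per fragment, i.e. the MTU minus the IP header; the numbers are illustrative):

#include <stdio.h>

#define IP_MF 0x2000	/* "more fragments" flag */

static void fragment_plan(unsigned int payload, unsigned int mtu_data)
{
	unsigned int offset = 0;

	while (payload > 0) {
		unsigned int len = payload > mtu_data ? mtu_data : payload;

		if (len < payload)
			len &= ~7u;	/* non-final fragments end on an 8-byte boundary */
		payload -= len;
		/* frag_off carries offset/8, plus IP_MF unless this is the last piece */
		printf("frag_off=0x%04x len=%u\n",
		       (offset >> 3) | (payload ? IP_MF : 0), len);
		offset += len;
	}
}

int main(void)
{
	fragment_plan(3000, 1480);	/* 3000 payload bytes behind a 1500-byte MTU */
	return 0;
}

For these inputs the plan is three fragments at 8-byte offsets 0, 185 and 370, with IP_MF set on the first two only.
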
702 708
703 int 709 int
704 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) 710 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
705 { 711 {
706 struct iovec *iov = from; 712 struct iovec *iov = from;
707 713
708 if (skb->ip_summed == CHECKSUM_PARTIAL) { 714 if (skb->ip_summed == CHECKSUM_PARTIAL) {
709 if (memcpy_fromiovecend(to, iov, offset, len) < 0) 715 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
710 return -EFAULT; 716 return -EFAULT;
711 } else { 717 } else {
712 __wsum csum = 0; 718 __wsum csum = 0;
713 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) 719 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
714 return -EFAULT; 720 return -EFAULT;
715 skb->csum = csum_block_add(skb->csum, csum, odd); 721 skb->csum = csum_block_add(skb->csum, csum, odd);
716 } 722 }
717 return 0; 723 return 0;
718 } 724 }
719 EXPORT_SYMBOL(ip_generic_getfrag); 725 EXPORT_SYMBOL(ip_generic_getfrag);
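
The odd argument above matters because the Internet checksum is sensitive to byte position: a partial sum over a block that actually starts at an odd offset has its bytes in the wrong lanes and must be byte-swapped before being merged, which is the job csum_block_add() does here. A rough user-space sketch of that rule, using RFC 1071's byte-swap identity (names are illustrative, not kernel API):

#include <stdint.h>
#include <stddef.h>

static uint16_t fold16(uint32_t sum)
{
	while (sum >> 16)			/* end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Merge sub-block sum 'sum2' (computed as if its block began at offset 0)
 * into 'sum', where the block really begins at byte offset 'odd'. */
static uint32_t block_add(uint32_t sum, uint32_t sum2, size_t odd)
{
	uint16_t s2 = fold16(sum2);

	if (odd & 1)				/* odd start: swap the byte lanes */
		s2 = (uint16_t)((s2 << 8) | (s2 >> 8));
	return sum + s2;			/* caller folds the final result */
}
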
720 726
721 static inline __wsum 727 static inline __wsum
722 csum_page(struct page *page, int offset, int copy) 728 csum_page(struct page *page, int offset, int copy)
723 { 729 {
724 char *kaddr; 730 char *kaddr;
725 __wsum csum; 731 __wsum csum;
726 kaddr = kmap(page); 732 kaddr = kmap(page);
727 csum = csum_partial(kaddr + offset, copy, 0); 733 csum = csum_partial(kaddr + offset, copy, 0);
728 kunmap(page); 734 kunmap(page);
729 return csum; 735 return csum;
730 } 736 }
731 737
732 static inline int ip_ufo_append_data(struct sock *sk, 738 static inline int ip_ufo_append_data(struct sock *sk,
733 struct sk_buff_head *queue, 739 struct sk_buff_head *queue,
734 int getfrag(void *from, char *to, int offset, int len, 740 int getfrag(void *from, char *to, int offset, int len,
735 int odd, struct sk_buff *skb), 741 int odd, struct sk_buff *skb),
736 void *from, int length, int hh_len, int fragheaderlen, 742 void *from, int length, int hh_len, int fragheaderlen,
737 int transhdrlen, int maxfraglen, unsigned int flags) 743 int transhdrlen, int maxfraglen, unsigned int flags)
738 { 744 {
739 struct sk_buff *skb; 745 struct sk_buff *skb;
740 int err; 746 int err;
741 747
742 /* There is support for UDP fragmentation offload in the network 748 /* There is support for UDP fragmentation offload in the network
743 * device, so create a single skb containing the complete 749 * device, so create a single skb containing the complete
744 * UDP datagram. 750 * UDP datagram.
745 */ 751 */
746 if ((skb = skb_peek_tail(queue)) == NULL) { 752 if ((skb = skb_peek_tail(queue)) == NULL) {
747 skb = sock_alloc_send_skb(sk, 753 skb = sock_alloc_send_skb(sk,
748 hh_len + fragheaderlen + transhdrlen + 20, 754 hh_len + fragheaderlen + transhdrlen + 20,
749 (flags & MSG_DONTWAIT), &err); 755 (flags & MSG_DONTWAIT), &err);
750 756
751 if (skb == NULL) 757 if (skb == NULL)
752 return err; 758 return err;
753 759
754 /* reserve space for Hardware header */ 760 /* reserve space for Hardware header */
755 skb_reserve(skb, hh_len); 761 skb_reserve(skb, hh_len);
756 762
757 /* create space for UDP/IP header */ 763 /* create space for UDP/IP header */
758 skb_put(skb, fragheaderlen + transhdrlen); 764 skb_put(skb, fragheaderlen + transhdrlen);
759 765
760 /* initialize network header pointer */ 766 /* initialize network header pointer */
761 skb_reset_network_header(skb); 767 skb_reset_network_header(skb);
762 768
763 /* initialize protocol header pointer */ 769 /* initialize protocol header pointer */
764 skb->transport_header = skb->network_header + fragheaderlen; 770 skb->transport_header = skb->network_header + fragheaderlen;
765 771
766 skb->ip_summed = CHECKSUM_PARTIAL; 772 skb->ip_summed = CHECKSUM_PARTIAL;
767 skb->csum = 0; 773 skb->csum = 0;
768 774
769 /* specify the length of each IP datagram fragment */ 775 /* specify the length of each IP datagram fragment */
770 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; 776 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
771 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 777 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
772 __skb_queue_tail(queue, skb); 778 __skb_queue_tail(queue, skb);
773 } 779 }
774 780
775 return skb_append_datato_frags(sk, skb, getfrag, from, 781 return skb_append_datato_frags(sk, skb, getfrag, from,
776 (length - transhdrlen)); 782 (length - transhdrlen));
777 } 783 }
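
With UFO the device, not the stack, slices the one oversized skb into on-wire fragments, so gso_size must equal the per-fragment payload. Worked numbers for the assumed common case of a 1500-byte MTU and a 20-byte IP header:

	unsigned int mtu = 1500, fragheaderlen = 20;	/* assumed values */
	unsigned int maxfraglen = ((mtu - fragheaderlen) & ~7u) + fragheaderlen;	/* 1500 */
	unsigned int gso_size = maxfraglen - fragheaderlen;	/* 1480 data bytes per fragment */
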
778 784
779 static int __ip_append_data(struct sock *sk, 785 static int __ip_append_data(struct sock *sk,
780 struct flowi4 *fl4, 786 struct flowi4 *fl4,
781 struct sk_buff_head *queue, 787 struct sk_buff_head *queue,
782 struct inet_cork *cork, 788 struct inet_cork *cork,
783 int getfrag(void *from, char *to, int offset, 789 int getfrag(void *from, char *to, int offset,
784 int len, int odd, struct sk_buff *skb), 790 int len, int odd, struct sk_buff *skb),
785 void *from, int length, int transhdrlen, 791 void *from, int length, int transhdrlen,
786 unsigned int flags) 792 unsigned int flags)
787 { 793 {
788 struct inet_sock *inet = inet_sk(sk); 794 struct inet_sock *inet = inet_sk(sk);
789 struct sk_buff *skb; 795 struct sk_buff *skb;
790 796
791 struct ip_options *opt = cork->opt; 797 struct ip_options *opt = cork->opt;
792 int hh_len; 798 int hh_len;
793 int exthdrlen; 799 int exthdrlen;
794 int mtu; 800 int mtu;
795 int copy; 801 int copy;
796 int err; 802 int err;
797 int offset = 0; 803 int offset = 0;
798 unsigned int maxfraglen, fragheaderlen; 804 unsigned int maxfraglen, fragheaderlen;
799 int csummode = CHECKSUM_NONE; 805 int csummode = CHECKSUM_NONE;
800 struct rtable *rt = (struct rtable *)cork->dst; 806 struct rtable *rt = (struct rtable *)cork->dst;
801 807
802 skb = skb_peek_tail(queue); 808 skb = skb_peek_tail(queue);
803 809
804 exthdrlen = !skb ? rt->dst.header_len : 0; 810 exthdrlen = !skb ? rt->dst.header_len : 0;
805 mtu = cork->fragsize; 811 mtu = cork->fragsize;
806 812
807 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 813 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
808 814
809 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 815 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
810 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 816 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
811 817
812 if (cork->length + length > 0xFFFF - fragheaderlen) { 818 if (cork->length + length > 0xFFFF - fragheaderlen) {
813 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 819 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
814 mtu-exthdrlen); 820 mtu-exthdrlen);
815 return -EMSGSIZE; 821 return -EMSGSIZE;
816 } 822 }
817 823
818 /* 824 /*
819 * transhdrlen > 0 means that this is the first fragment and we want 825 * transhdrlen > 0 means that this is the first fragment and we want
820 * it not to be fragmented later. 826 * it not to be fragmented later.
821 */ 827 */
822 if (transhdrlen && 828 if (transhdrlen &&
823 length + fragheaderlen <= mtu && 829 length + fragheaderlen <= mtu &&
824 rt->dst.dev->features & NETIF_F_V4_CSUM && 830 rt->dst.dev->features & NETIF_F_V4_CSUM &&
825 !exthdrlen) 831 !exthdrlen)
826 csummode = CHECKSUM_PARTIAL; 832 csummode = CHECKSUM_PARTIAL;
827 833
828 cork->length += length; 834 cork->length += length;
829 if (((length > mtu) || (skb && skb_is_gso(skb))) && 835 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
830 (sk->sk_protocol == IPPROTO_UDP) && 836 (sk->sk_protocol == IPPROTO_UDP) &&
831 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { 837 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
832 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 838 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
833 hh_len, fragheaderlen, transhdrlen, 839 hh_len, fragheaderlen, transhdrlen,
834 maxfraglen, flags); 840 maxfraglen, flags);
835 if (err) 841 if (err)
836 goto error; 842 goto error;
837 return 0; 843 return 0;
838 } 844 }
839 845
840 /* So, what's going on in the loop below? 846 /* So, what's going on in the loop below?
841 * 847 *
842 * We use the calculated fragment length to generate a chained skb; 848 * We use the calculated fragment length to generate a chained skb;
843 * each segment is an IP fragment ready for sending to the network 849 * each segment is an IP fragment ready for sending to the network
844 * once the appropriate IP header is added. 850 * once the appropriate IP header is added.
845 */ 851 */
846 852
847 if (!skb) 853 if (!skb)
848 goto alloc_new_skb; 854 goto alloc_new_skb;
849 855
850 while (length > 0) { 856 while (length > 0) {
851 /* Check if the remaining data fits into current packet. */ 857 /* Check if the remaining data fits into current packet. */
852 copy = mtu - skb->len; 858 copy = mtu - skb->len;
853 if (copy < length) 859 if (copy < length)
854 copy = maxfraglen - skb->len; 860 copy = maxfraglen - skb->len;
855 if (copy <= 0) { 861 if (copy <= 0) {
856 char *data; 862 char *data;
857 unsigned int datalen; 863 unsigned int datalen;
858 unsigned int fraglen; 864 unsigned int fraglen;
859 unsigned int fraggap; 865 unsigned int fraggap;
860 unsigned int alloclen; 866 unsigned int alloclen;
861 struct sk_buff *skb_prev; 867 struct sk_buff *skb_prev;
862 alloc_new_skb: 868 alloc_new_skb:
863 skb_prev = skb; 869 skb_prev = skb;
864 if (skb_prev) 870 if (skb_prev)
865 fraggap = skb_prev->len - maxfraglen; 871 fraggap = skb_prev->len - maxfraglen;
866 else 872 else
867 fraggap = 0; 873 fraggap = 0;
868 874
869 /* 875 /*
870 * If remaining data exceeds the mtu, 876 * If remaining data exceeds the mtu,
871 * we know we need more fragment(s). 877 * we know we need more fragment(s).
872 */ 878 */
873 datalen = length + fraggap; 879 datalen = length + fraggap;
874 if (datalen > mtu - fragheaderlen) 880 if (datalen > mtu - fragheaderlen)
875 datalen = maxfraglen - fragheaderlen; 881 datalen = maxfraglen - fragheaderlen;
876 fraglen = datalen + fragheaderlen; 882 fraglen = datalen + fragheaderlen;
877 883
878 if ((flags & MSG_MORE) && 884 if ((flags & MSG_MORE) &&
879 !(rt->dst.dev->features&NETIF_F_SG)) 885 !(rt->dst.dev->features&NETIF_F_SG))
880 alloclen = mtu; 886 alloclen = mtu;
881 else 887 else
882 alloclen = fraglen; 888 alloclen = fraglen;
883 889
884 alloclen += exthdrlen; 890 alloclen += exthdrlen;
885 891
886 /* The last fragment gets additional space at tail. 892 /* The last fragment gets additional space at tail.
887 * Note, with MSG_MORE we overallocate on fragments, 893 * Note, with MSG_MORE we overallocate on fragments,
888 * because we have no idea which fragment will be 894 * because we have no idea which fragment will be
889 * the last. 895 * the last.
890 */ 896 */
891 if (datalen == length + fraggap) 897 if (datalen == length + fraggap)
892 alloclen += rt->dst.trailer_len; 898 alloclen += rt->dst.trailer_len;
893 899
894 if (transhdrlen) { 900 if (transhdrlen) {
895 skb = sock_alloc_send_skb(sk, 901 skb = sock_alloc_send_skb(sk,
896 alloclen + hh_len + 15, 902 alloclen + hh_len + 15,
897 (flags & MSG_DONTWAIT), &err); 903 (flags & MSG_DONTWAIT), &err);
898 } else { 904 } else {
899 skb = NULL; 905 skb = NULL;
900 if (atomic_read(&sk->sk_wmem_alloc) <= 906 if (atomic_read(&sk->sk_wmem_alloc) <=
901 2 * sk->sk_sndbuf) 907 2 * sk->sk_sndbuf)
902 skb = sock_wmalloc(sk, 908 skb = sock_wmalloc(sk,
903 alloclen + hh_len + 15, 1, 909 alloclen + hh_len + 15, 1,
904 sk->sk_allocation); 910 sk->sk_allocation);
905 if (unlikely(skb == NULL)) 911 if (unlikely(skb == NULL))
906 err = -ENOBUFS; 912 err = -ENOBUFS;
907 else 913 else
908 /* only the initial fragment is 914 /* only the initial fragment is
909 time stamped */ 915 time stamped */
910 cork->tx_flags = 0; 916 cork->tx_flags = 0;
911 } 917 }
912 if (skb == NULL) 918 if (skb == NULL)
913 goto error; 919 goto error;
914 920
915 /* 921 /*
916 * Fill in the control structures 922 * Fill in the control structures
917 */ 923 */
918 skb->ip_summed = csummode; 924 skb->ip_summed = csummode;
919 skb->csum = 0; 925 skb->csum = 0;
920 skb_reserve(skb, hh_len); 926 skb_reserve(skb, hh_len);
921 skb_shinfo(skb)->tx_flags = cork->tx_flags; 927 skb_shinfo(skb)->tx_flags = cork->tx_flags;
922 928
923 /* 929 /*
924 * Find where to start putting bytes. 930 * Find where to start putting bytes.
925 */ 931 */
926 data = skb_put(skb, fraglen + exthdrlen); 932 data = skb_put(skb, fraglen + exthdrlen);
927 skb_set_network_header(skb, exthdrlen); 933 skb_set_network_header(skb, exthdrlen);
928 skb->transport_header = (skb->network_header + 934 skb->transport_header = (skb->network_header +
929 fragheaderlen); 935 fragheaderlen);
930 data += fragheaderlen + exthdrlen; 936 data += fragheaderlen + exthdrlen;
931 937
932 if (fraggap) { 938 if (fraggap) {
933 skb->csum = skb_copy_and_csum_bits( 939 skb->csum = skb_copy_and_csum_bits(
934 skb_prev, maxfraglen, 940 skb_prev, maxfraglen,
935 data + transhdrlen, fraggap, 0); 941 data + transhdrlen, fraggap, 0);
936 skb_prev->csum = csum_sub(skb_prev->csum, 942 skb_prev->csum = csum_sub(skb_prev->csum,
937 skb->csum); 943 skb->csum);
938 data += fraggap; 944 data += fraggap;
939 pskb_trim_unique(skb_prev, maxfraglen); 945 pskb_trim_unique(skb_prev, maxfraglen);
940 } 946 }
941 947
942 copy = datalen - transhdrlen - fraggap; 948 copy = datalen - transhdrlen - fraggap;
943 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { 949 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
944 err = -EFAULT; 950 err = -EFAULT;
945 kfree_skb(skb); 951 kfree_skb(skb);
946 goto error; 952 goto error;
947 } 953 }
948 954
949 offset += copy; 955 offset += copy;
950 length -= datalen - fraggap; 956 length -= datalen - fraggap;
951 transhdrlen = 0; 957 transhdrlen = 0;
952 exthdrlen = 0; 958 exthdrlen = 0;
953 csummode = CHECKSUM_NONE; 959 csummode = CHECKSUM_NONE;
954 960
955 /* 961 /*
956 * Put the packet on the pending queue. 962 * Put the packet on the pending queue.
957 */ 963 */
958 __skb_queue_tail(queue, skb); 964 __skb_queue_tail(queue, skb);
959 continue; 965 continue;
960 } 966 }
961 967
962 if (copy > length) 968 if (copy > length)
963 copy = length; 969 copy = length;
964 970
965 if (!(rt->dst.dev->features&NETIF_F_SG)) { 971 if (!(rt->dst.dev->features&NETIF_F_SG)) {
966 unsigned int off; 972 unsigned int off;
967 973
968 off = skb->len; 974 off = skb->len;
969 if (getfrag(from, skb_put(skb, copy), 975 if (getfrag(from, skb_put(skb, copy),
970 offset, copy, off, skb) < 0) { 976 offset, copy, off, skb) < 0) {
971 __skb_trim(skb, off); 977 __skb_trim(skb, off);
972 err = -EFAULT; 978 err = -EFAULT;
973 goto error; 979 goto error;
974 } 980 }
975 } else { 981 } else {
976 int i = skb_shinfo(skb)->nr_frags; 982 int i = skb_shinfo(skb)->nr_frags;
977 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 983 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
978 struct page *page = cork->page; 984 struct page *page = cork->page;
979 int off = cork->off; 985 int off = cork->off;
980 unsigned int left; 986 unsigned int left;
981 987
982 if (page && (left = PAGE_SIZE - off) > 0) { 988 if (page && (left = PAGE_SIZE - off) > 0) {
983 if (copy >= left) 989 if (copy >= left)
984 copy = left; 990 copy = left;
985 if (page != frag->page) { 991 if (page != frag->page) {
986 if (i == MAX_SKB_FRAGS) { 992 if (i == MAX_SKB_FRAGS) {
987 err = -EMSGSIZE; 993 err = -EMSGSIZE;
988 goto error; 994 goto error;
989 } 995 }
990 get_page(page); 996 get_page(page);
991 skb_fill_page_desc(skb, i, page, off, 0); 997 skb_fill_page_desc(skb, i, page, off, 0);
992 frag = &skb_shinfo(skb)->frags[i]; 998 frag = &skb_shinfo(skb)->frags[i];
993 } 999 }
994 } else if (i < MAX_SKB_FRAGS) { 1000 } else if (i < MAX_SKB_FRAGS) {
995 if (copy > PAGE_SIZE) 1001 if (copy > PAGE_SIZE)
996 copy = PAGE_SIZE; 1002 copy = PAGE_SIZE;
997 page = alloc_pages(sk->sk_allocation, 0); 1003 page = alloc_pages(sk->sk_allocation, 0);
998 if (page == NULL) { 1004 if (page == NULL) {
999 err = -ENOMEM; 1005 err = -ENOMEM;
1000 goto error; 1006 goto error;
1001 } 1007 }
1002 cork->page = page; 1008 cork->page = page;
1003 cork->off = 0; 1009 cork->off = 0;
1004 1010
1005 skb_fill_page_desc(skb, i, page, 0, 0); 1011 skb_fill_page_desc(skb, i, page, 0, 0);
1006 frag = &skb_shinfo(skb)->frags[i]; 1012 frag = &skb_shinfo(skb)->frags[i];
1007 } else { 1013 } else {
1008 err = -EMSGSIZE; 1014 err = -EMSGSIZE;
1009 goto error; 1015 goto error;
1010 } 1016 }
1011 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { 1017 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012 err = -EFAULT; 1018 err = -EFAULT;
1013 goto error; 1019 goto error;
1014 } 1020 }
1015 cork->off += copy; 1021 cork->off += copy;
1016 frag->size += copy; 1022 frag->size += copy;
1017 skb->len += copy; 1023 skb->len += copy;
1018 skb->data_len += copy; 1024 skb->data_len += copy;
1019 skb->truesize += copy; 1025 skb->truesize += copy;
1020 atomic_add(copy, &sk->sk_wmem_alloc); 1026 atomic_add(copy, &sk->sk_wmem_alloc);
1021 } 1027 }
1022 offset += copy; 1028 offset += copy;
1023 length -= copy; 1029 length -= copy;
1024 } 1030 }
1025 1031
1026 return 0; 1032 return 0;
1027 1033
1028 error: 1034 error:
1029 cork->length -= length; 1035 cork->length -= length;
1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1036 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1031 return err; 1037 return err;
1032 } 1038 }
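
Two quantities drive the loop above: maxfraglen, the largest fragment whose payload is still a multiple of 8, and fraggap, the tail bytes of the previous skb that spill past that boundary and must migrate (with their checksum subtracted via csum_sub()) into the next skb. A small standalone sketch with assumed values, a 1500-byte MTU and a 24-byte header (20 bytes plus 4 bytes of options):

#include <stdio.h>

int main(void)
{
	unsigned int mtu = 1500, fragheaderlen = 24;	/* assumed */
	unsigned int maxfraglen = ((mtu - fragheaderlen) & ~7u) + fragheaderlen;
	unsigned int skb_prev_len = mtu;	/* filled to the MTU under MSG_MORE */
	unsigned int fraggap = skb_prev_len - maxfraglen;

	/* prints maxfraglen=1496 fraggap=4: those 4 tail bytes move to
	 * the next skb so every fragment payload stays a multiple of 8 */
	printf("maxfraglen=%u fraggap=%u\n", maxfraglen, fraggap);
	return 0;
}
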
1033 1039
1034 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, 1040 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035 struct ipcm_cookie *ipc, struct rtable **rtp) 1041 struct ipcm_cookie *ipc, struct rtable **rtp)
1036 { 1042 {
1037 struct inet_sock *inet = inet_sk(sk); 1043 struct inet_sock *inet = inet_sk(sk);
1038 struct ip_options_rcu *opt; 1044 struct ip_options_rcu *opt;
1039 struct rtable *rt; 1045 struct rtable *rt;
1040 1046
1041 /* 1047 /*
1042 * setup for corking. 1048 * setup for corking.
1043 */ 1049 */
1044 opt = ipc->opt; 1050 opt = ipc->opt;
1045 if (opt) { 1051 if (opt) {
1046 if (cork->opt == NULL) { 1052 if (cork->opt == NULL) {
1047 cork->opt = kmalloc(sizeof(struct ip_options) + 40, 1053 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048 sk->sk_allocation); 1054 sk->sk_allocation);
1049 if (unlikely(cork->opt == NULL)) 1055 if (unlikely(cork->opt == NULL))
1050 return -ENOBUFS; 1056 return -ENOBUFS;
1051 } 1057 }
1052 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); 1058 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053 cork->flags |= IPCORK_OPT; 1059 cork->flags |= IPCORK_OPT;
1054 cork->addr = ipc->addr; 1060 cork->addr = ipc->addr;
1055 } 1061 }
1056 rt = *rtp; 1062 rt = *rtp;
1057 if (unlikely(!rt)) 1063 if (unlikely(!rt))
1058 return -EFAULT; 1064 return -EFAULT;
1059 /* 1065 /*
1060 * We steal the reference to this route; the caller must not release it 1066 * We steal the reference to this route; the caller must not release it
1061 */ 1067 */
1062 *rtp = NULL; 1068 *rtp = NULL;
1063 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? 1069 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064 rt->dst.dev->mtu : dst_mtu(&rt->dst); 1070 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065 cork->dst = &rt->dst; 1071 cork->dst = &rt->dst;
1066 cork->length = 0; 1072 cork->length = 0;
1067 cork->tx_flags = ipc->tx_flags; 1073 cork->tx_flags = ipc->tx_flags;
1068 cork->page = NULL; 1074 cork->page = NULL;
1069 cork->off = 0; 1075 cork->off = 0;
1070 1076
1071 return 0; 1077 return 0;
1072 } 1078 }
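
The fragsize choice above is what makes IP_PMTUDISC_PROBE useful: such a socket deliberately sizes packets by the device MTU instead of the cached path MTU, so it can emit probes larger than the path is currently believed to support. Schematically (variable names are illustrative):

	unsigned int fragsize = (pmtudisc == IP_PMTUDISC_PROBE)
				? dev_mtu	/* probe past the cached path MTU */
				: path_mtu;	/* normal case: dst_mtu(&rt->dst) */
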
1073 1079
1074 /* 1080 /*
1075 * ip_append_data() and ip_append_page() can make one large IP datagram 1081 * ip_append_data() and ip_append_page() can make one large IP datagram
1076 * from many pieces of data. Each piece will be held on the socket 1082 * from many pieces of data. Each piece will be held on the socket
1077 * until ip_push_pending_frames() is called. Each piece can be a page 1083 * until ip_push_pending_frames() is called. Each piece can be a page
1078 * or non-page data. 1084 * or non-page data.
1079 * 1085 *
1080 * Not only UDP; other transport protocols - e.g. raw sockets - can 1086 * Not only UDP; other transport protocols - e.g. raw sockets - can
1081 * potentially use this interface. 1087 * potentially use this interface.
1082 * 1088 *
1083 * LATER: length must be adjusted for tail padding when required. 1089 * LATER: length must be adjusted for tail padding when required.
1084 */ 1090 */
1085 int ip_append_data(struct sock *sk, struct flowi4 *fl4, 1091 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086 int getfrag(void *from, char *to, int offset, int len, 1092 int getfrag(void *from, char *to, int offset, int len,
1087 int odd, struct sk_buff *skb), 1093 int odd, struct sk_buff *skb),
1088 void *from, int length, int transhdrlen, 1094 void *from, int length, int transhdrlen,
1089 struct ipcm_cookie *ipc, struct rtable **rtp, 1095 struct ipcm_cookie *ipc, struct rtable **rtp,
1090 unsigned int flags) 1096 unsigned int flags)
1091 { 1097 {
1092 struct inet_sock *inet = inet_sk(sk); 1098 struct inet_sock *inet = inet_sk(sk);
1093 int err; 1099 int err;
1094 1100
1095 if (flags&MSG_PROBE) 1101 if (flags&MSG_PROBE)
1096 return 0; 1102 return 0;
1097 1103
1098 if (skb_queue_empty(&sk->sk_write_queue)) { 1104 if (skb_queue_empty(&sk->sk_write_queue)) {
1099 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); 1105 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100 if (err) 1106 if (err)
1101 return err; 1107 return err;
1102 } else { 1108 } else {
1103 transhdrlen = 0; 1109 transhdrlen = 0;
1104 } 1110 }
1105 1111
1106 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, 1112 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107 from, length, transhdrlen, flags); 1113 from, length, transhdrlen, flags);
1108 } 1114 }
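
A hedged sketch, not real kernel code, of how a transport protocol of this era drives the corking API (loosely modeled on the UDP send path): append one or more pieces with ip_append_data(), then either push them out as a single datagram or flush on error.

static int xmit_datagram(struct sock *sk, struct flowi4 *fl4,
			 struct msghdr *msg, int len,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	int err;

	lock_sock(sk);
	err = ip_append_data(sk, fl4, ip_generic_getfrag, msg->msg_iov,
			     len, 0, ipc, rtp, msg->msg_flags);
	if (err)
		ip_flush_pending_frames(sk);	/* drop whatever was queued */
	else
		err = ip_push_pending_frames(sk, fl4);
	release_sock(sk);
	return err;
}
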
1109 1115
1110 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, 1116 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1111 int offset, size_t size, int flags) 1117 int offset, size_t size, int flags)
1112 { 1118 {
1113 struct inet_sock *inet = inet_sk(sk); 1119 struct inet_sock *inet = inet_sk(sk);
1114 struct sk_buff *skb; 1120 struct sk_buff *skb;
1115 struct rtable *rt; 1121 struct rtable *rt;
1116 struct ip_options *opt = NULL; 1122 struct ip_options *opt = NULL;
1117 struct inet_cork *cork; 1123 struct inet_cork *cork;
1118 int hh_len; 1124 int hh_len;
1119 int mtu; 1125 int mtu;
1120 int len; 1126 int len;
1121 int err; 1127 int err;
1122 unsigned int maxfraglen, fragheaderlen, fraggap; 1128 unsigned int maxfraglen, fragheaderlen, fraggap;
1123 1129
1124 if (inet->hdrincl) 1130 if (inet->hdrincl)
1125 return -EPERM; 1131 return -EPERM;
1126 1132
1127 if (flags&MSG_PROBE) 1133 if (flags&MSG_PROBE)
1128 return 0; 1134 return 0;
1129 1135
1130 if (skb_queue_empty(&sk->sk_write_queue)) 1136 if (skb_queue_empty(&sk->sk_write_queue))
1131 return -EINVAL; 1137 return -EINVAL;
1132 1138
1133 cork = &inet->cork.base; 1139 cork = &inet->cork.base;
1134 rt = (struct rtable *)cork->dst; 1140 rt = (struct rtable *)cork->dst;
1135 if (cork->flags & IPCORK_OPT) 1141 if (cork->flags & IPCORK_OPT)
1136 opt = cork->opt; 1142 opt = cork->opt;
1137 1143
1138 if (!(rt->dst.dev->features&NETIF_F_SG)) 1144 if (!(rt->dst.dev->features&NETIF_F_SG))
1139 return -EOPNOTSUPP; 1145 return -EOPNOTSUPP;
1140 1146
1141 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1147 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1142 mtu = cork->fragsize; 1148 mtu = cork->fragsize;
1143 1149
1144 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1150 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1145 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1151 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1146 1152
1147 if (cork->length + size > 0xFFFF - fragheaderlen) { 1153 if (cork->length + size > 0xFFFF - fragheaderlen) {
1148 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); 1154 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1149 return -EMSGSIZE; 1155 return -EMSGSIZE;
1150 } 1156 }
1151 1157
1152 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1158 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1153 return -EINVAL; 1159 return -EINVAL;
1154 1160
1155 cork->length += size; 1161 cork->length += size;
1156 if ((size + skb->len > mtu) && 1162 if ((size + skb->len > mtu) &&
1157 (sk->sk_protocol == IPPROTO_UDP) && 1163 (sk->sk_protocol == IPPROTO_UDP) &&
1158 (rt->dst.dev->features & NETIF_F_UFO)) { 1164 (rt->dst.dev->features & NETIF_F_UFO)) {
1159 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 1165 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1160 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1166 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161 } 1167 }
1162 1168
1163 1169
1164 while (size > 0) { 1170 while (size > 0) {
1165 int i; 1171 int i;
1166 1172
1167 if (skb_is_gso(skb)) 1173 if (skb_is_gso(skb))
1168 len = size; 1174 len = size;
1169 else { 1175 else {
1170 1176
1171 /* Check if the remaining data fits into current packet. */ 1177 /* Check if the remaining data fits into current packet. */
1172 len = mtu - skb->len; 1178 len = mtu - skb->len;
1173 if (len < size) 1179 if (len < size)
1174 len = maxfraglen - skb->len; 1180 len = maxfraglen - skb->len;
1175 } 1181 }
1176 if (len <= 0) { 1182 if (len <= 0) {
1177 struct sk_buff *skb_prev; 1183 struct sk_buff *skb_prev;
1178 int alloclen; 1184 int alloclen;
1179 1185
1180 skb_prev = skb; 1186 skb_prev = skb;
1181 fraggap = skb_prev->len - maxfraglen; 1187 fraggap = skb_prev->len - maxfraglen;
1182 1188
1183 alloclen = fragheaderlen + hh_len + fraggap + 15; 1189 alloclen = fragheaderlen + hh_len + fraggap + 15;
1184 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); 1190 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1185 if (unlikely(!skb)) { 1191 if (unlikely(!skb)) {
1186 err = -ENOBUFS; 1192 err = -ENOBUFS;
1187 goto error; 1193 goto error;
1188 } 1194 }
1189 1195
1190 /* 1196 /*
1191 * Fill in the control structures 1197 * Fill in the control structures
1192 */ 1198 */
1193 skb->ip_summed = CHECKSUM_NONE; 1199 skb->ip_summed = CHECKSUM_NONE;
1194 skb->csum = 0; 1200 skb->csum = 0;
1195 skb_reserve(skb, hh_len); 1201 skb_reserve(skb, hh_len);
1196 1202
1197 /* 1203 /*
1198 * Find where to start putting bytes. 1204 * Find where to start putting bytes.
1199 */ 1205 */
1200 skb_put(skb, fragheaderlen + fraggap); 1206 skb_put(skb, fragheaderlen + fraggap);
1201 skb_reset_network_header(skb); 1207 skb_reset_network_header(skb);
1202 skb->transport_header = (skb->network_header + 1208 skb->transport_header = (skb->network_header +
1203 fragheaderlen); 1209 fragheaderlen);
1204 if (fraggap) { 1210 if (fraggap) {
1205 skb->csum = skb_copy_and_csum_bits(skb_prev, 1211 skb->csum = skb_copy_and_csum_bits(skb_prev,
1206 maxfraglen, 1212 maxfraglen,
1207 skb_transport_header(skb), 1213 skb_transport_header(skb),
1208 fraggap, 0); 1214 fraggap, 0);
1209 skb_prev->csum = csum_sub(skb_prev->csum, 1215 skb_prev->csum = csum_sub(skb_prev->csum,
1210 skb->csum); 1216 skb->csum);
1211 pskb_trim_unique(skb_prev, maxfraglen); 1217 pskb_trim_unique(skb_prev, maxfraglen);
1212 } 1218 }
1213 1219
1214 /* 1220 /*
1215 * Put the packet on the pending queue. 1221 * Put the packet on the pending queue.
1216 */ 1222 */
1217 __skb_queue_tail(&sk->sk_write_queue, skb); 1223 __skb_queue_tail(&sk->sk_write_queue, skb);
1218 continue; 1224 continue;
1219 } 1225 }
1220 1226
1221 i = skb_shinfo(skb)->nr_frags; 1227 i = skb_shinfo(skb)->nr_frags;
1222 if (len > size) 1228 if (len > size)
1223 len = size; 1229 len = size;
1224 if (skb_can_coalesce(skb, i, page, offset)) { 1230 if (skb_can_coalesce(skb, i, page, offset)) {
1225 skb_shinfo(skb)->frags[i-1].size += len; 1231 skb_shinfo(skb)->frags[i-1].size += len;
1226 } else if (i < MAX_SKB_FRAGS) { 1232 } else if (i < MAX_SKB_FRAGS) {
1227 get_page(page); 1233 get_page(page);
1228 skb_fill_page_desc(skb, i, page, offset, len); 1234 skb_fill_page_desc(skb, i, page, offset, len);
1229 } else { 1235 } else {
1230 err = -EMSGSIZE; 1236 err = -EMSGSIZE;
1231 goto error; 1237 goto error;
1232 } 1238 }
1233 1239
1234 if (skb->ip_summed == CHECKSUM_NONE) { 1240 if (skb->ip_summed == CHECKSUM_NONE) {
1235 __wsum csum; 1241 __wsum csum;
1236 csum = csum_page(page, offset, len); 1242 csum = csum_page(page, offset, len);
1237 skb->csum = csum_block_add(skb->csum, csum, skb->len); 1243 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1238 } 1244 }
1239 1245
1240 skb->len += len; 1246 skb->len += len;
1241 skb->data_len += len; 1247 skb->data_len += len;
1242 skb->truesize += len; 1248 skb->truesize += len;
1243 atomic_add(len, &sk->sk_wmem_alloc); 1249 atomic_add(len, &sk->sk_wmem_alloc);
1244 offset += len; 1250 offset += len;
1245 size -= len; 1251 size -= len;
1246 } 1252 }
1247 return 0; 1253 return 0;
1248 1254
1249 error: 1255 error:
1250 cork->length -= size; 1256 cork->length -= size;
1251 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1257 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1252 return err; 1258 return err;
1253 } 1259 }
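
The skb_can_coalesce() test in the loop above decides whether a new chunk can simply grow the previous page fragment rather than consume another frags[] slot. A simplified sketch of that condition, using the skb_frag_t fields visible in this file (page, page_offset, size):

/* True when 'page'/'off' continues exactly where the last fragment ends. */
static int can_coalesce_sketch(const skb_frag_t *last,
			       const struct page *page, int off)
{
	return last->page == page &&
	       off == (int)(last->page_offset + last->size);
}
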
1254 1260
1255 static void ip_cork_release(struct inet_cork *cork) 1261 static void ip_cork_release(struct inet_cork *cork)
1256 { 1262 {
1257 cork->flags &= ~IPCORK_OPT; 1263 cork->flags &= ~IPCORK_OPT;
1258 kfree(cork->opt); 1264 kfree(cork->opt);
1259 cork->opt = NULL; 1265 cork->opt = NULL;
1260 dst_release(cork->dst); 1266 dst_release(cork->dst);
1261 cork->dst = NULL; 1267 cork->dst = NULL;
1262 } 1268 }
1263 1269
1264 /* 1270 /*
1265 * Combine all pending IP fragments on the socket into one IP datagram 1271 * Combine all pending IP fragments on the socket into one IP datagram
1266 * and push them out. 1272 * and push them out.
1267 */ 1273 */
1268 struct sk_buff *__ip_make_skb(struct sock *sk, 1274 struct sk_buff *__ip_make_skb(struct sock *sk,
1269 struct flowi4 *fl4, 1275 struct flowi4 *fl4,
1270 struct sk_buff_head *queue, 1276 struct sk_buff_head *queue,
1271 struct inet_cork *cork) 1277 struct inet_cork *cork)
1272 { 1278 {
1273 struct sk_buff *skb, *tmp_skb; 1279 struct sk_buff *skb, *tmp_skb;
1274 struct sk_buff **tail_skb; 1280 struct sk_buff **tail_skb;
1275 struct inet_sock *inet = inet_sk(sk); 1281 struct inet_sock *inet = inet_sk(sk);
1276 struct net *net = sock_net(sk); 1282 struct net *net = sock_net(sk);
1277 struct ip_options *opt = NULL; 1283 struct ip_options *opt = NULL;
1278 struct rtable *rt = (struct rtable *)cork->dst; 1284 struct rtable *rt = (struct rtable *)cork->dst;
1279 struct iphdr *iph; 1285 struct iphdr *iph;
1280 __be16 df = 0; 1286 __be16 df = 0;
1281 __u8 ttl; 1287 __u8 ttl;
1282 1288
1283 if ((skb = __skb_dequeue(queue)) == NULL) 1289 if ((skb = __skb_dequeue(queue)) == NULL)
1284 goto out; 1290 goto out;
1285 tail_skb = &(skb_shinfo(skb)->frag_list); 1291 tail_skb = &(skb_shinfo(skb)->frag_list);
1286 1292
1287 /* move skb->data to ip header from ext header */ 1293 /* move skb->data to ip header from ext header */
1288 if (skb->data < skb_network_header(skb)) 1294 if (skb->data < skb_network_header(skb))
1289 __skb_pull(skb, skb_network_offset(skb)); 1295 __skb_pull(skb, skb_network_offset(skb));
1290 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1296 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1291 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1297 __skb_pull(tmp_skb, skb_network_header_len(skb));
1292 *tail_skb = tmp_skb; 1298 *tail_skb = tmp_skb;
1293 tail_skb = &(tmp_skb->next); 1299 tail_skb = &(tmp_skb->next);
1294 skb->len += tmp_skb->len; 1300 skb->len += tmp_skb->len;
1295 skb->data_len += tmp_skb->len; 1301 skb->data_len += tmp_skb->len;
1296 skb->truesize += tmp_skb->truesize; 1302 skb->truesize += tmp_skb->truesize;
1297 tmp_skb->destructor = NULL; 1303 tmp_skb->destructor = NULL;
1298 tmp_skb->sk = NULL; 1304 tmp_skb->sk = NULL;
1299 } 1305 }
1300 1306
1301 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow 1307 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1302 * the frame generated here to be fragmented. No matter how transforms 1308 * the frame generated here to be fragmented. No matter how transforms
1303 * change the size of the packet, it will come out. 1309 * change the size of the packet, it will come out.
1304 */ 1310 */
1305 if (inet->pmtudisc < IP_PMTUDISC_DO) 1311 if (inet->pmtudisc < IP_PMTUDISC_DO)
1306 skb->local_df = 1; 1312 skb->local_df = 1;
1307 1313
1308 /* DF bit is set when we want to see DF on outgoing frames. 1314 /* DF bit is set when we want to see DF on outgoing frames.
1309 * If local_df is set too, we still allow this frame to be fragmented 1315 * If local_df is set too, we still allow this frame to be fragmented
1310 * locally. */ 1316 * locally. */
1311 if (inet->pmtudisc >= IP_PMTUDISC_DO || 1317 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1312 (skb->len <= dst_mtu(&rt->dst) && 1318 (skb->len <= dst_mtu(&rt->dst) &&
1313 ip_dont_fragment(sk, &rt->dst))) 1319 ip_dont_fragment(sk, &rt->dst)))
1314 df = htons(IP_DF); 1320 df = htons(IP_DF);
1315 1321
1316 if (cork->flags & IPCORK_OPT) 1322 if (cork->flags & IPCORK_OPT)
1317 opt = cork->opt; 1323 opt = cork->opt;
1318 1324
1319 if (rt->rt_type == RTN_MULTICAST) 1325 if (rt->rt_type == RTN_MULTICAST)
1320 ttl = inet->mc_ttl; 1326 ttl = inet->mc_ttl;
1321 else 1327 else
1322 ttl = ip_select_ttl(inet, &rt->dst); 1328 ttl = ip_select_ttl(inet, &rt->dst);
1323 1329
1324 iph = (struct iphdr *)skb->data; 1330 iph = (struct iphdr *)skb->data;
1325 iph->version = 4; 1331 iph->version = 4;
1326 iph->ihl = 5; 1332 iph->ihl = 5;
1327 iph->tos = inet->tos; 1333 iph->tos = inet->tos;
1328 iph->frag_off = df; 1334 iph->frag_off = df;
1329 ip_select_ident(iph, &rt->dst, sk); 1335 ip_select_ident(iph, &rt->dst, sk);
1330 iph->ttl = ttl; 1336 iph->ttl = ttl;
1331 iph->protocol = sk->sk_protocol; 1337 iph->protocol = sk->sk_protocol;
1332 iph->saddr = fl4->saddr; 1338 iph->saddr = fl4->saddr;
1333 iph->daddr = fl4->daddr; 1339 iph->daddr = fl4->daddr;
1334 1340
1335 if (opt) { 1341 if (opt) {
1336 iph->ihl += opt->optlen>>2; 1342 iph->ihl += opt->optlen>>2;
1337 ip_options_build(skb, opt, cork->addr, rt, 0); 1343 ip_options_build(skb, opt, cork->addr, rt, 0);
1338 } 1344 }
1339 1345
1340 skb->priority = sk->sk_priority; 1346 skb->priority = sk->sk_priority;
1341 skb->mark = sk->sk_mark; 1347 skb->mark = sk->sk_mark;
1342 /* 1348 /*
1343 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1349 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1344 * on dst refcount 1350 * on dst refcount
1345 */ 1351 */
1346 cork->dst = NULL; 1352 cork->dst = NULL;
1347 skb_dst_set(skb, &rt->dst); 1353 skb_dst_set(skb, &rt->dst);
1348 1354
1349 if (iph->protocol == IPPROTO_ICMP) 1355 if (iph->protocol == IPPROTO_ICMP)
1350 icmp_out_count(net, ((struct icmphdr *) 1356 icmp_out_count(net, ((struct icmphdr *)
1351 skb_transport_header(skb))->type); 1357 skb_transport_header(skb))->type);
1352 1358
1353 ip_cork_release(cork); 1359 ip_cork_release(cork);
1354 out: 1360 out:
1355 return skb; 1361 return skb;
1356 } 1362 }
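
The dequeue loop above glues every queued skb onto the head skb's frag_list and folds its size in; only len, data_len and truesize grow on the head, so skb_headlen() (len - data_len) still describes the head's own linear data. Worked numbers, assuming a head skb of 1496 bytes and two queued skbs of 1496 and 308 bytes:

	unsigned int head = 1496, q1 = 1496, q2 = 308;	/* assumed sizes */
	unsigned int len = head + q1 + q2;		/* 3300: the whole datagram */
	unsigned int data_len = q1 + q2;		/* 1804: bytes on frag_list */
	unsigned int headlen = len - data_len;		/* 1496: the head's linear data */
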
1357 1363
1358 int ip_send_skb(struct sk_buff *skb) 1364 int ip_send_skb(struct sk_buff *skb)
1359 { 1365 {
1360 struct net *net = sock_net(skb->sk); 1366 struct net *net = sock_net(skb->sk);
1361 int err; 1367 int err;
1362 1368
1363 err = ip_local_out(skb); 1369 err = ip_local_out(skb);
1364 if (err) { 1370 if (err) {
1365 if (err > 0) 1371 if (err > 0)
1366 err = net_xmit_errno(err); 1372 err = net_xmit_errno(err);
1367 if (err) 1373 if (err)
1368 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1374 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1369 } 1375 }
1370 1376
1371 return err; 1377 return err;
1372 } 1378 }
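
Positive returns from ip_local_out() are qdisc verdicts, not errnos, which is why ip_send_skb() passes them through net_xmit_errno() before deciding whether to count a discard. A sketch of that mapping, assuming the definition of this era:

static inline int xmit_errno_sketch(int verdict)
{
	/* congestion notification still counts as sent; any other
	 * positive verdict is reported as a buffer shortage */
	return verdict == NET_XMIT_CN ? 0 : -ENOBUFS;
}
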
1373 1379
1374 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) 1380 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1375 { 1381 {
1376 struct sk_buff *skb; 1382 struct sk_buff *skb;
1377 1383
1378 skb = ip_finish_skb(sk, fl4); 1384 skb = ip_finish_skb(sk, fl4);
1379 if (!skb) 1385 if (!skb)
1380 return 0; 1386 return 0;
1381 1387
1382 /* Netfilter gets the whole, unfragmented skb. */ 1388 /* Netfilter gets the whole, unfragmented skb. */
1383 return ip_send_skb(skb); 1389 return ip_send_skb(skb);
1384 } 1390 }
1385 1391
1386 /* 1392 /*
1387 * Throw away all pending data on the socket. 1393 * Throw away all pending data on the socket.
1388 */ 1394 */
1389 static void __ip_flush_pending_frames(struct sock *sk, 1395 static void __ip_flush_pending_frames(struct sock *sk,
1390 struct sk_buff_head *queue, 1396 struct sk_buff_head *queue,
1391 struct inet_cork *cork) 1397 struct inet_cork *cork)
1392 { 1398 {
1393 struct sk_buff *skb; 1399 struct sk_buff *skb;
1394 1400
1395 while ((skb = __skb_dequeue_tail(queue)) != NULL) 1401 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1396 kfree_skb(skb); 1402 kfree_skb(skb);
1397 1403
1398 ip_cork_release(cork); 1404 ip_cork_release(cork);
1399 } 1405 }
1400 1406
1401 void ip_flush_pending_frames(struct sock *sk) 1407 void ip_flush_pending_frames(struct sock *sk)
1402 { 1408 {
1403 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); 1409 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1404 } 1410 }
1405 1411
1406 struct sk_buff *ip_make_skb(struct sock *sk, 1412 struct sk_buff *ip_make_skb(struct sock *sk,
1407 struct flowi4 *fl4, 1413 struct flowi4 *fl4,
1408 int getfrag(void *from, char *to, int offset, 1414 int getfrag(void *from, char *to, int offset,
1409 int len, int odd, struct sk_buff *skb), 1415 int len, int odd, struct sk_buff *skb),
1410 void *from, int length, int transhdrlen, 1416 void *from, int length, int transhdrlen,
1411 struct ipcm_cookie *ipc, struct rtable **rtp, 1417 struct ipcm_cookie *ipc, struct rtable **rtp,
1412 unsigned int flags) 1418 unsigned int flags)
1413 { 1419 {
1414 struct inet_cork cork; 1420 struct inet_cork cork;
1415 struct sk_buff_head queue; 1421 struct sk_buff_head queue;
1416 int err; 1422 int err;
1417 1423
1418 if (flags & MSG_PROBE) 1424 if (flags & MSG_PROBE)
1419 return NULL; 1425 return NULL;
1420 1426
1421 __skb_queue_head_init(&queue); 1427 __skb_queue_head_init(&queue);
1422 1428
1423 cork.flags = 0; 1429 cork.flags = 0;
1424 cork.addr = 0; 1430 cork.addr = 0;
1425 cork.opt = NULL; 1431 cork.opt = NULL;
1426 err = ip_setup_cork(sk, &cork, ipc, rtp); 1432 err = ip_setup_cork(sk, &cork, ipc, rtp);
1427 if (err) 1433 if (err)
1428 return ERR_PTR(err); 1434 return ERR_PTR(err);
1429 1435
1430 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1436 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431 from, length, transhdrlen, flags); 1437 from, length, transhdrlen, flags);
1432 if (err) { 1438 if (err) {
1433 __ip_flush_pending_frames(sk, &queue, &cork); 1439 __ip_flush_pending_frames(sk, &queue, &cork);
1434 return ERR_PTR(err); 1440 return ERR_PTR(err);
1435 } 1441 }
1436 1442
1437 return __ip_make_skb(sk, fl4, &queue, &cork); 1443 return __ip_make_skb(sk, fl4, &queue, &cork);
1438 } 1444 }
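
ip_make_skb() is the lock-free counterpart of the corking pair above: it builds the entire datagram on a private queue and returns one finished skb, so a caller can transmit without touching sk->sk_write_queue or the socket lock. A hedged usage sketch (not real kernel code):

	skb = ip_make_skb(sk, &fl4, ip_generic_getfrag, msg->msg_iov,
			  len, 0, &ipc, &rt, msg->msg_flags);
	if (!IS_ERR_OR_NULL(skb))	/* NULL on MSG_PROBE, ERR_PTR() on failure */
		err = ip_send_skb(skb);
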
1439 1445
1440 /* 1446 /*
1441 * Fetch data from kernel space and fill in checksum if needed. 1447 * Fetch data from kernel space and fill in checksum if needed.
1442 */ 1448 */
1443 static int ip_reply_glue_bits(void *dptr, char *to, int offset, 1449 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1444 int len, int odd, struct sk_buff *skb) 1450 int len, int odd, struct sk_buff *skb)
1445 { 1451 {
1446 __wsum csum; 1452 __wsum csum;
1447 1453
1448 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); 1454 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1449 skb->csum = csum_block_add(skb->csum, csum, odd); 1455 skb->csum = csum_block_add(skb->csum, csum, odd);
1450 return 0; 1456 return 0;
1451 } 1457 }
1452 1458
1453 /* 1459 /*
1454 * Generic function to send a packet as a reply to another packet. 1460 * Generic function to send a packet as a reply to another packet.
1455 * So far it is used to send TCP resets; ICMP should use it too. 1461 * So far it is used to send TCP resets; ICMP should use it too.
1456 * 1462 *
1457 * Should run single-threaded per socket because it uses the sock 1463 * Should run single-threaded per socket because it uses the sock
1458 * structure to pass arguments. 1464 * structure to pass arguments.
1459 */ 1465 */
1460 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1466 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1461 struct ip_reply_arg *arg, unsigned int len) 1467 struct ip_reply_arg *arg, unsigned int len)
1462 { 1468 {
1463 struct inet_sock *inet = inet_sk(sk); 1469 struct inet_sock *inet = inet_sk(sk);
1464 struct ip_options_data replyopts; 1470 struct ip_options_data replyopts;
1465 struct ipcm_cookie ipc; 1471 struct ipcm_cookie ipc;
1466 struct flowi4 fl4; 1472 struct flowi4 fl4;
1467 struct rtable *rt = skb_rtable(skb); 1473 struct rtable *rt = skb_rtable(skb);
1468 1474
1469 if (ip_options_echo(&replyopts.opt.opt, skb)) 1475 if (ip_options_echo(&replyopts.opt.opt, skb))
1470 return; 1476 return;
1471 1477
1472 ipc.addr = daddr; 1478 ipc.addr = daddr;
1473 ipc.opt = NULL; 1479 ipc.opt = NULL;
1474 ipc.tx_flags = 0; 1480 ipc.tx_flags = 0;
1475 1481
1476 if (replyopts.opt.opt.optlen) { 1482 if (replyopts.opt.opt.optlen) {
1477 ipc.opt = &replyopts.opt; 1483 ipc.opt = &replyopts.opt;
1478 1484
1479 if (replyopts.opt.opt.srr) 1485 if (replyopts.opt.opt.srr)
1480 daddr = replyopts.opt.opt.faddr; 1486 daddr = replyopts.opt.opt.faddr;
1481 } 1487 }
1482 1488
1483 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1489 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1484 RT_TOS(ip_hdr(skb)->tos), 1490 RT_TOS(ip_hdr(skb)->tos),
1485 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1491 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1486 ip_reply_arg_flowi_flags(arg), 1492 ip_reply_arg_flowi_flags(arg),
1487 daddr, rt->rt_spec_dst, 1493 daddr, rt->rt_spec_dst,
1488 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1494 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1489 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1495 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1490 rt = ip_route_output_key(sock_net(sk), &fl4); 1496 rt = ip_route_output_key(sock_net(sk), &fl4);
1491 if (IS_ERR(rt)) 1497 if (IS_ERR(rt))
1492 return; 1498 return;
1493 1499
1494 /* And let IP do all the hard work. 1500 /* And let IP do all the hard work.
1495 1501
1496 This chunk is not reentrant, hence the spinlock. 1502 This chunk is not reentrant, hence the spinlock.
1497 Note that it relies on the fact that this function is called 1503 Note that it relies on the fact that this function is called
1498 with BH disabled locally and that sk cannot already be spinlocked. 1504 with BH disabled locally and that sk cannot already be spinlocked.
1499 */ 1505 */
1500 bh_lock_sock(sk); 1506 bh_lock_sock(sk);
1501 inet->tos = ip_hdr(skb)->tos; 1507 inet->tos = ip_hdr(skb)->tos;
1502 sk->sk_priority = skb->priority; 1508 sk->sk_priority = skb->priority;
1503 sk->sk_protocol = ip_hdr(skb)->protocol; 1509 sk->sk_protocol = ip_hdr(skb)->protocol;
1504 sk->sk_bound_dev_if = arg->bound_dev_if; 1510 sk->sk_bound_dev_if = arg->bound_dev_if;
1505 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1511 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1506 &ipc, &rt, MSG_DONTWAIT); 1512 &ipc, &rt, MSG_DONTWAIT);
1507 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1513 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1508 if (arg->csumoffset >= 0) 1514 if (arg->csumoffset >= 0)
1509 *((__sum16 *)skb_transport_header(skb) + 1515 *((__sum16 *)skb_transport_header(skb) +
1510 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1516 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1511 arg->csum)); 1517 arg->csum));
1512 skb->ip_summed = CHECKSUM_NONE; 1518 skb->ip_summed = CHECKSUM_NONE;
1513 ip_push_pending_frames(sk, &fl4); 1519 ip_push_pending_frames(sk, &fl4);
1514 } 1520 }
1515 1521
1516 bh_unlock_sock(sk); 1522 bh_unlock_sock(sk);
1517 1523
1518 ip_rt_put(rt); 1524 ip_rt_put(rt);
1519 } 1525 }
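
A sketch of how a caller prepares struct ip_reply_arg for ip_send_reply() above, loosely modeled on the TCP reset path: csum carries a precomputed pseudo-header sum, and csumoffset locates the transport checksum field in 16-bit words, which is where the csum_fold() result is written back. Here ctl_sk, payload, payload_len and pseudo_hdr_csum are stand-ins for the caller's own state:

	struct ip_reply_arg arg = { };

	arg.iov[0].iov_base = payload;		/* e.g. a prebuilt TCP header for the RST */
	arg.iov[0].iov_len  = payload_len;
	arg.csum	    = pseudo_hdr_csum;	/* assumed precomputed pseudo-header sum */
	arg.csumoffset	    = offsetof(struct tcphdr, check) / 2;  /* == 8 16-bit words */
	ip_send_reply(ctl_sk, skb, ip_hdr(skb)->saddr, &arg, payload_len);
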
1520 1526
1521 void __init ip_init(void) 1527 void __init ip_init(void)
1522 { 1528 {
1523 ip_rt_init(); 1529 ip_rt_init();
1524 inet_initpeers(); 1530 inet_initpeers();
1525 1531
1526 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) 1532 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1527 igmp_mc_proc_init(); 1533 igmp_mc_proc_init();
1528 #endif 1534 #endif
1529 } 1535 }
1530 1536
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * ROUTE - implementation of the IP router. 6 * ROUTE - implementation of the IP router.
7 * 7 *
8 * Authors: Ross Biro 8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org> 10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi> 11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 * 13 *
14 * Fixes: 14 * Fixes:
15 * Alan Cox : Verify area fixes. 15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes 16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates 17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update 18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible 19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics 20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K 21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table 22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window 23 * Alan Cox : MSS actually. Also added the window
24 * clamper. 24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del() 25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support. 26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft. 27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support. 28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support. 29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support. 30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes. 31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics. 32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly 33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD 34 * Alan Cox : Aligned routing errors more closely with BSD
35 * though our system is still very different. 35 * though our system is still very different.
36 * Alan Cox : Faster /proc handling 36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing, 37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour. 38 * routing caches and better behaviour.
39 * 39 *
40 * Olaf Erb : irtt wasn't being copied right. 40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support. 41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope) 42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed 43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source 44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and 45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch. 46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages. 47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma. 48 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow. 49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. 50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful. 51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark 52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics 53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file 54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect 56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations 57 * Ilia Sotnikov : Removed TOS from hash calculations
58 * 58 *
59 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version 61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version. 62 * 2 of the License, or (at your option) any later version.
63 */ 63 */
64 64
65 #include <linux/module.h> 65 #include <linux/module.h>
66 #include <asm/uaccess.h> 66 #include <asm/uaccess.h>
67 #include <asm/system.h> 67 #include <asm/system.h>
68 #include <linux/bitops.h> 68 #include <linux/bitops.h>
69 #include <linux/types.h> 69 #include <linux/types.h>
70 #include <linux/kernel.h> 70 #include <linux/kernel.h>
71 #include <linux/mm.h> 71 #include <linux/mm.h>
72 #include <linux/bootmem.h> 72 #include <linux/bootmem.h>
73 #include <linux/string.h> 73 #include <linux/string.h>
74 #include <linux/socket.h> 74 #include <linux/socket.h>
75 #include <linux/sockios.h> 75 #include <linux/sockios.h>
76 #include <linux/errno.h> 76 #include <linux/errno.h>
77 #include <linux/in.h> 77 #include <linux/in.h>
78 #include <linux/inet.h> 78 #include <linux/inet.h>
79 #include <linux/netdevice.h> 79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h> 80 #include <linux/proc_fs.h>
81 #include <linux/init.h> 81 #include <linux/init.h>
82 #include <linux/workqueue.h> 82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h> 83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h> 84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h> 85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h> 86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h> 87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h> 88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h> 89 #include <linux/random.h>
90 #include <linux/jhash.h> 90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h> 91 #include <linux/rcupdate.h>
92 #include <linux/times.h> 92 #include <linux/times.h>
93 #include <linux/slab.h> 93 #include <linux/slab.h>
94 #include <net/dst.h> 94 #include <net/dst.h>
95 #include <net/net_namespace.h> 95 #include <net/net_namespace.h>
96 #include <net/protocol.h> 96 #include <net/protocol.h>
97 #include <net/ip.h> 97 #include <net/ip.h>
98 #include <net/route.h> 98 #include <net/route.h>
99 #include <net/inetpeer.h> 99 #include <net/inetpeer.h>
100 #include <net/sock.h> 100 #include <net/sock.h>
101 #include <net/ip_fib.h> 101 #include <net/ip_fib.h>
102 #include <net/arp.h> 102 #include <net/arp.h>
103 #include <net/tcp.h> 103 #include <net/tcp.h>
104 #include <net/icmp.h> 104 #include <net/icmp.h>
105 #include <net/xfrm.h> 105 #include <net/xfrm.h>
106 #include <net/netevent.h> 106 #include <net/netevent.h>
107 #include <net/rtnetlink.h> 107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL 108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h> 109 #include <linux/sysctl.h>
110 #endif 110 #endif
111 #include <net/atmclip.h> 111 #include <net/atmclip.h>
112 112
113 #define RT_FL_TOS(oldflp4) \ 113 #define RT_FL_TOS(oldflp4) \
114 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 114 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 115
116 #define IP_MAX_MTU 0xFFF0 116 #define IP_MAX_MTU 0xFFF0
117 117
118 #define RT_GC_TIMEOUT (300*HZ) 118 #define RT_GC_TIMEOUT (300*HZ)
119 119
120 static int ip_rt_max_size; 120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; 121 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 122 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 123 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly = 9; 124 static int ip_rt_redirect_number __read_mostly = 9;
125 static int ip_rt_redirect_load __read_mostly = HZ / 50; 125 static int ip_rt_redirect_load __read_mostly = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly = HZ; 127 static int ip_rt_error_cost __read_mostly = HZ;
128 static int ip_rt_error_burst __read_mostly = 5 * HZ; 128 static int ip_rt_error_burst __read_mostly = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly = 8; 129 static int ip_rt_gc_elasticity __read_mostly = 8;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 131 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256; 132 static int ip_rt_min_advmss __read_mostly = 256;
133 static int rt_chain_length_max __read_mostly = 20; 133 static int rt_chain_length_max __read_mostly = 20;
134 134
135 /* 135 /*
136 * Interface to generic destination cache. 136 * Interface to generic destination cache.
137 */ 137 */
138 138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 140 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int ipv4_default_mtu(const struct dst_entry *dst); 141 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142 static void ipv4_dst_destroy(struct dst_entry *dst); 142 static void ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void ipv4_link_failure(struct sk_buff *skb); 144 static void ipv4_link_failure(struct sk_buff *skb);
145 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 145 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops); 146 static int rt_garbage_collect(struct dst_ops *ops);
147 147
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 int how) 149 int how)
150 { 150 {
151 } 151 }
152 152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 { 154 {
155 struct rtable *rt = (struct rtable *) dst; 155 struct rtable *rt = (struct rtable *) dst;
156 struct inet_peer *peer; 156 struct inet_peer *peer;
157 u32 *p = NULL; 157 u32 *p = NULL;
158 158
159 if (!rt->peer) 159 if (!rt->peer)
160 rt_bind_peer(rt, rt->rt_dst, 1); 160 rt_bind_peer(rt, rt->rt_dst, 1);
161 161
162 peer = rt->peer; 162 peer = rt->peer;
163 if (peer) { 163 if (peer) {
164 u32 *old_p = __DST_METRICS_PTR(old); 164 u32 *old_p = __DST_METRICS_PTR(old);
165 unsigned long prev, new; 165 unsigned long prev, new;
166 166
167 p = peer->metrics; 167 p = peer->metrics;
168 if (inet_metrics_new(peer)) 168 if (inet_metrics_new(peer))
169 memcpy(p, old_p, sizeof(u32) * RTAX_MAX); 169 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170 170
171 new = (unsigned long) p; 171 new = (unsigned long) p;
172 prev = cmpxchg(&dst->_metrics, old, new); 172 prev = cmpxchg(&dst->_metrics, old, new);
173 173
174 if (prev != old) { 174 if (prev != old) {
175 p = __DST_METRICS_PTR(prev); 175 p = __DST_METRICS_PTR(prev);
176 if (prev & DST_METRICS_READ_ONLY) 176 if (prev & DST_METRICS_READ_ONLY)
177 p = NULL; 177 p = NULL;
178 } else { 178 } else {
179 if (rt->fi) { 179 if (rt->fi) {
180 fib_info_put(rt->fi); 180 fib_info_put(rt->fi);
181 rt->fi = NULL; 181 rt->fi = NULL;
182 } 182 }
183 } 183 }
184 } 184 }
185 return p; 185 return p;
186 } 186 }
187 187
188 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); 188 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
189 189
190 static struct dst_ops ipv4_dst_ops = { 190 static struct dst_ops ipv4_dst_ops = {
191 .family = AF_INET, 191 .family = AF_INET,
192 .protocol = cpu_to_be16(ETH_P_IP), 192 .protocol = cpu_to_be16(ETH_P_IP),
193 .gc = rt_garbage_collect, 193 .gc = rt_garbage_collect,
194 .check = ipv4_dst_check, 194 .check = ipv4_dst_check,
195 .default_advmss = ipv4_default_advmss, 195 .default_advmss = ipv4_default_advmss,
196 .default_mtu = ipv4_default_mtu, 196 .default_mtu = ipv4_default_mtu,
197 .cow_metrics = ipv4_cow_metrics, 197 .cow_metrics = ipv4_cow_metrics,
198 .destroy = ipv4_dst_destroy, 198 .destroy = ipv4_dst_destroy,
199 .ifdown = ipv4_dst_ifdown, 199 .ifdown = ipv4_dst_ifdown,
200 .negative_advice = ipv4_negative_advice, 200 .negative_advice = ipv4_negative_advice,
201 .link_failure = ipv4_link_failure, 201 .link_failure = ipv4_link_failure,
202 .update_pmtu = ip_rt_update_pmtu, 202 .update_pmtu = ip_rt_update_pmtu,
203 .local_out = __ip_local_out, 203 .local_out = __ip_local_out,
204 .neigh_lookup = ipv4_neigh_lookup, 204 .neigh_lookup = ipv4_neigh_lookup,
205 }; 205 };
206 206
207 #define ECN_OR_COST(class) TC_PRIO_##class 207 #define ECN_OR_COST(class) TC_PRIO_##class
208 208
209 const __u8 ip_tos2prio[16] = { 209 const __u8 ip_tos2prio[16] = {
210 TC_PRIO_BESTEFFORT, 210 TC_PRIO_BESTEFFORT,
211 ECN_OR_COST(BESTEFFORT), 211 ECN_OR_COST(BESTEFFORT),
212 TC_PRIO_BESTEFFORT, 212 TC_PRIO_BESTEFFORT,
213 ECN_OR_COST(BESTEFFORT), 213 ECN_OR_COST(BESTEFFORT),
214 TC_PRIO_BULK, 214 TC_PRIO_BULK,
215 ECN_OR_COST(BULK), 215 ECN_OR_COST(BULK),
216 TC_PRIO_BULK, 216 TC_PRIO_BULK,
217 ECN_OR_COST(BULK), 217 ECN_OR_COST(BULK),
218 TC_PRIO_INTERACTIVE, 218 TC_PRIO_INTERACTIVE,
219 ECN_OR_COST(INTERACTIVE), 219 ECN_OR_COST(INTERACTIVE),
220 TC_PRIO_INTERACTIVE, 220 TC_PRIO_INTERACTIVE,
221 ECN_OR_COST(INTERACTIVE), 221 ECN_OR_COST(INTERACTIVE),
222 TC_PRIO_INTERACTIVE_BULK, 222 TC_PRIO_INTERACTIVE_BULK,
223 ECN_OR_COST(INTERACTIVE_BULK), 223 ECN_OR_COST(INTERACTIVE_BULK),
224 TC_PRIO_INTERACTIVE_BULK, 224 TC_PRIO_INTERACTIVE_BULK,
225 ECN_OR_COST(INTERACTIVE_BULK) 225 ECN_OR_COST(INTERACTIVE_BULK)
226 }; 226 };
227 227
228 228
229 /* 229 /*
230 * Route cache. 230 * Route cache.
231 */ 231 */
232 232
233 /* The locking scheme is rather straightforward: 233 /* The locking scheme is rather straightforward:
234 * 234 *
235 * 1) Read-Copy Update protects the buckets of the central route hash. 235 * 1) Read-Copy Update protects the buckets of the central route hash.
236 * 2) Only writers remove entries, and they hold the lock 236 * 2) Only writers remove entries, and they hold the lock
237 * as they look at rtable reference counts. 237 * as they look at rtable reference counts.
238 * 3) Only readers acquire references to rtable entries, 238 * 3) Only readers acquire references to rtable entries,
239 * they do so with atomic increments and with the 239 * they do so with atomic increments and with the
240 * lock held. 240 * lock held.
241 */ 241 */
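/* A minimal sketch, not part of this commit, of the lockless read side
 * that the scheme above enables. The name rt_lookup_sketch and the
 * expiry-only match are hypothetical simplifications of the real
 * lookup paths later in this file:
 */
static struct rtable *rt_lookup_sketch(struct net *net, unsigned int hash)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain);
	     rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (!rt_is_expired(rth)) {		/* real code compares full keys */
			dst_use(&rth->dst, jiffies);	/* atomic ref taken, no lock held */
			rcu_read_unlock_bh();
			return rth;
		}
	}
	rcu_read_unlock_bh();
	return NULL;
}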
242 242
243 struct rt_hash_bucket { 243 struct rt_hash_bucket {
244 struct rtable __rcu *chain; 244 struct rtable __rcu *chain;
245 }; 245 };
246 246
247 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 247 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
248 defined(CONFIG_PROVE_LOCKING) 248 defined(CONFIG_PROVE_LOCKING)
249 /* 249 /*
250 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks. 250 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
251 * The size of this table is a power of two and depends on the number of CPUs. 251 * The size of this table is a power of two and depends on the number of CPUs.
252 * (on lockdep we have quite a big spinlock_t, so keep the size down there) 252 * (on lockdep we have quite a big spinlock_t, so keep the size down there)
253 */ 253 */
254 #ifdef CONFIG_LOCKDEP 254 #ifdef CONFIG_LOCKDEP
255 # define RT_HASH_LOCK_SZ 256 255 # define RT_HASH_LOCK_SZ 256
256 #else 256 #else
257 # if NR_CPUS >= 32 257 # if NR_CPUS >= 32
258 # define RT_HASH_LOCK_SZ 4096 258 # define RT_HASH_LOCK_SZ 4096
259 # elif NR_CPUS >= 16 259 # elif NR_CPUS >= 16
260 # define RT_HASH_LOCK_SZ 2048 260 # define RT_HASH_LOCK_SZ 2048
261 # elif NR_CPUS >= 8 261 # elif NR_CPUS >= 8
262 # define RT_HASH_LOCK_SZ 1024 262 # define RT_HASH_LOCK_SZ 1024
263 # elif NR_CPUS >= 4 263 # elif NR_CPUS >= 4
264 # define RT_HASH_LOCK_SZ 512 264 # define RT_HASH_LOCK_SZ 512
265 # else 265 # else
266 # define RT_HASH_LOCK_SZ 256 266 # define RT_HASH_LOCK_SZ 256
267 # endif 267 # endif
268 #endif 268 #endif
269 269
270 static spinlock_t *rt_hash_locks; 270 static spinlock_t *rt_hash_locks;
271 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] 271 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
272 272
273 static __init void rt_hash_lock_init(void) 273 static __init void rt_hash_lock_init(void)
274 { 274 {
275 int i; 275 int i;
276 276
277 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, 277 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
278 GFP_KERNEL); 278 GFP_KERNEL);
279 if (!rt_hash_locks) 279 if (!rt_hash_locks)
280 panic("IP: failed to allocate rt_hash_locks\n"); 280 panic("IP: failed to allocate rt_hash_locks\n");
281 281
282 for (i = 0; i < RT_HASH_LOCK_SZ; i++) 282 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
283 spin_lock_init(&rt_hash_locks[i]); 283 spin_lock_init(&rt_hash_locks[i]);
284 } 284 }
285 #else 285 #else
286 # define rt_hash_lock_addr(slot) NULL 286 # define rt_hash_lock_addr(slot) NULL
287 287
288 static inline void rt_hash_lock_init(void) 288 static inline void rt_hash_lock_init(void)
289 { 289 {
290 } 290 }
291 #endif 291 #endif
292 292
293 static struct rt_hash_bucket *rt_hash_table __read_mostly; 293 static struct rt_hash_bucket *rt_hash_table __read_mostly;
294 static unsigned rt_hash_mask __read_mostly; 294 static unsigned rt_hash_mask __read_mostly;
295 static unsigned int rt_hash_log __read_mostly; 295 static unsigned int rt_hash_log __read_mostly;
296 296
297 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 297 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
298 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 298 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
299 299
300 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, 300 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
301 int genid) 301 int genid)
302 { 302 {
303 return jhash_3words((__force u32)daddr, (__force u32)saddr, 303 return jhash_3words((__force u32)daddr, (__force u32)saddr,
304 idx, genid) 304 idx, genid)
305 & rt_hash_mask; 305 & rt_hash_mask;
306 } 306 }
307 307
308 static inline int rt_genid(struct net *net) 308 static inline int rt_genid(struct net *net)
309 { 309 {
310 return atomic_read(&net->ipv4.rt_genid); 310 return atomic_read(&net->ipv4.rt_genid);
311 } 311 }
312 312
313 #ifdef CONFIG_PROC_FS 313 #ifdef CONFIG_PROC_FS
314 struct rt_cache_iter_state { 314 struct rt_cache_iter_state {
315 struct seq_net_private p; 315 struct seq_net_private p;
316 int bucket; 316 int bucket;
317 int genid; 317 int genid;
318 }; 318 };
319 319
320 static struct rtable *rt_cache_get_first(struct seq_file *seq) 320 static struct rtable *rt_cache_get_first(struct seq_file *seq)
321 { 321 {
322 struct rt_cache_iter_state *st = seq->private; 322 struct rt_cache_iter_state *st = seq->private;
323 struct rtable *r = NULL; 323 struct rtable *r = NULL;
324 324
325 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 325 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
326 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) 326 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
327 continue; 327 continue;
328 rcu_read_lock_bh(); 328 rcu_read_lock_bh();
329 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 329 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
330 while (r) { 330 while (r) {
331 if (dev_net(r->dst.dev) == seq_file_net(seq) && 331 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
332 r->rt_genid == st->genid) 332 r->rt_genid == st->genid)
333 return r; 333 return r;
334 r = rcu_dereference_bh(r->dst.rt_next); 334 r = rcu_dereference_bh(r->dst.rt_next);
335 } 335 }
336 rcu_read_unlock_bh(); 336 rcu_read_unlock_bh();
337 } 337 }
338 return r; 338 return r;
339 } 339 }
340 340
341 static struct rtable *__rt_cache_get_next(struct seq_file *seq, 341 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
342 struct rtable *r) 342 struct rtable *r)
343 { 343 {
344 struct rt_cache_iter_state *st = seq->private; 344 struct rt_cache_iter_state *st = seq->private;
345 345
346 r = rcu_dereference_bh(r->dst.rt_next); 346 r = rcu_dereference_bh(r->dst.rt_next);
347 while (!r) { 347 while (!r) {
348 rcu_read_unlock_bh(); 348 rcu_read_unlock_bh();
349 do { 349 do {
350 if (--st->bucket < 0) 350 if (--st->bucket < 0)
351 return NULL; 351 return NULL;
352 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); 352 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
353 rcu_read_lock_bh(); 353 rcu_read_lock_bh();
354 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 354 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
355 } 355 }
356 return r; 356 return r;
357 } 357 }
358 358
359 static struct rtable *rt_cache_get_next(struct seq_file *seq, 359 static struct rtable *rt_cache_get_next(struct seq_file *seq,
360 struct rtable *r) 360 struct rtable *r)
361 { 361 {
362 struct rt_cache_iter_state *st = seq->private; 362 struct rt_cache_iter_state *st = seq->private;
363 while ((r = __rt_cache_get_next(seq, r)) != NULL) { 363 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
364 if (dev_net(r->dst.dev) != seq_file_net(seq)) 364 if (dev_net(r->dst.dev) != seq_file_net(seq))
365 continue; 365 continue;
366 if (r->rt_genid == st->genid) 366 if (r->rt_genid == st->genid)
367 break; 367 break;
368 } 368 }
369 return r; 369 return r;
370 } 370 }
371 371
372 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) 372 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
373 { 373 {
374 struct rtable *r = rt_cache_get_first(seq); 374 struct rtable *r = rt_cache_get_first(seq);
375 375
376 if (r) 376 if (r)
377 while (pos && (r = rt_cache_get_next(seq, r))) 377 while (pos && (r = rt_cache_get_next(seq, r)))
378 --pos; 378 --pos;
379 return pos ? NULL : r; 379 return pos ? NULL : r;
380 } 380 }
381 381
382 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 382 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
383 { 383 {
384 struct rt_cache_iter_state *st = seq->private; 384 struct rt_cache_iter_state *st = seq->private;
385 if (*pos) 385 if (*pos)
386 return rt_cache_get_idx(seq, *pos - 1); 386 return rt_cache_get_idx(seq, *pos - 1);
387 st->genid = rt_genid(seq_file_net(seq)); 387 st->genid = rt_genid(seq_file_net(seq));
388 return SEQ_START_TOKEN; 388 return SEQ_START_TOKEN;
389 } 389 }
390 390
391 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 391 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
392 { 392 {
393 struct rtable *r; 393 struct rtable *r;
394 394
395 if (v == SEQ_START_TOKEN) 395 if (v == SEQ_START_TOKEN)
396 r = rt_cache_get_first(seq); 396 r = rt_cache_get_first(seq);
397 else 397 else
398 r = rt_cache_get_next(seq, v); 398 r = rt_cache_get_next(seq, v);
399 ++*pos; 399 ++*pos;
400 return r; 400 return r;
401 } 401 }
402 402
403 static void rt_cache_seq_stop(struct seq_file *seq, void *v) 403 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
404 { 404 {
405 if (v && v != SEQ_START_TOKEN) 405 if (v && v != SEQ_START_TOKEN)
406 rcu_read_unlock_bh(); 406 rcu_read_unlock_bh();
407 } 407 }
408 408
409 static int rt_cache_seq_show(struct seq_file *seq, void *v) 409 static int rt_cache_seq_show(struct seq_file *seq, void *v)
410 { 410 {
411 if (v == SEQ_START_TOKEN) 411 if (v == SEQ_START_TOKEN)
412 seq_printf(seq, "%-127s\n", 412 seq_printf(seq, "%-127s\n",
413 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 413 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
414 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 414 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
415 "HHUptod\tSpecDst"); 415 "HHUptod\tSpecDst");
416 else { 416 else {
417 struct rtable *r = v; 417 struct rtable *r = v;
418 struct neighbour *n; 418 struct neighbour *n;
419 int len; 419 int len;
420 420
421 n = dst_get_neighbour(&r->dst); 421 n = dst_get_neighbour(&r->dst);
422 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" 422 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
423 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 423 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
424 r->dst.dev ? r->dst.dev->name : "*", 424 r->dst.dev ? r->dst.dev->name : "*",
425 (__force u32)r->rt_dst, 425 (__force u32)r->rt_dst,
426 (__force u32)r->rt_gateway, 426 (__force u32)r->rt_gateway,
427 r->rt_flags, atomic_read(&r->dst.__refcnt), 427 r->rt_flags, atomic_read(&r->dst.__refcnt),
428 r->dst.__use, 0, (__force u32)r->rt_src, 428 r->dst.__use, 0, (__force u32)r->rt_src,
429 dst_metric_advmss(&r->dst) + 40, 429 dst_metric_advmss(&r->dst) + 40,
430 dst_metric(&r->dst, RTAX_WINDOW), 430 dst_metric(&r->dst, RTAX_WINDOW),
431 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 431 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
432 dst_metric(&r->dst, RTAX_RTTVAR)), 432 dst_metric(&r->dst, RTAX_RTTVAR)),
433 r->rt_key_tos, 433 r->rt_key_tos,
434 -1, 434 -1,
435 (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0, 435 (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
436 r->rt_spec_dst, &len); 436 r->rt_spec_dst, &len);
437 437
438 seq_printf(seq, "%*s\n", 127 - len, ""); 438 seq_printf(seq, "%*s\n", 127 - len, "");
439 } 439 }
440 return 0; 440 return 0;
441 } 441 }
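/* The dst_get_neighbour() call in rt_cache_seq_show() above is safe only
 * because the iterator holds rcu_read_lock_bh() across ->show() (taken in
 * rt_cache_get_first()/__rt_cache_get_next() above). With _neighbour now
 * tagged __rcu by this commit, the accessor is, in sketch form (the real
 * definition lives in include/net/dst.h):
 *
 * static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
 * {
 *	return rcu_dereference(dst->_neighbour);
 * }
 */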
442 442
443 static const struct seq_operations rt_cache_seq_ops = { 443 static const struct seq_operations rt_cache_seq_ops = {
444 .start = rt_cache_seq_start, 444 .start = rt_cache_seq_start,
445 .next = rt_cache_seq_next, 445 .next = rt_cache_seq_next,
446 .stop = rt_cache_seq_stop, 446 .stop = rt_cache_seq_stop,
447 .show = rt_cache_seq_show, 447 .show = rt_cache_seq_show,
448 }; 448 };
449 449
450 static int rt_cache_seq_open(struct inode *inode, struct file *file) 450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
451 { 451 {
452 return seq_open_net(inode, file, &rt_cache_seq_ops, 452 return seq_open_net(inode, file, &rt_cache_seq_ops,
453 sizeof(struct rt_cache_iter_state)); 453 sizeof(struct rt_cache_iter_state));
454 } 454 }
455 455
456 static const struct file_operations rt_cache_seq_fops = { 456 static const struct file_operations rt_cache_seq_fops = {
457 .owner = THIS_MODULE, 457 .owner = THIS_MODULE,
458 .open = rt_cache_seq_open, 458 .open = rt_cache_seq_open,
459 .read = seq_read, 459 .read = seq_read,
460 .llseek = seq_lseek, 460 .llseek = seq_lseek,
461 .release = seq_release_net, 461 .release = seq_release_net,
462 }; 462 };
463 463
464 464
465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) 465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
466 { 466 {
467 int cpu; 467 int cpu;
468 468
469 if (*pos == 0) 469 if (*pos == 0)
470 return SEQ_START_TOKEN; 470 return SEQ_START_TOKEN;
471 471
472 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { 472 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
473 if (!cpu_possible(cpu)) 473 if (!cpu_possible(cpu))
474 continue; 474 continue;
475 *pos = cpu+1; 475 *pos = cpu+1;
476 return &per_cpu(rt_cache_stat, cpu); 476 return &per_cpu(rt_cache_stat, cpu);
477 } 477 }
478 return NULL; 478 return NULL;
479 } 479 }
480 480
481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) 481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
482 { 482 {
483 int cpu; 483 int cpu;
484 484
485 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { 485 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
486 if (!cpu_possible(cpu)) 486 if (!cpu_possible(cpu))
487 continue; 487 continue;
488 *pos = cpu+1; 488 *pos = cpu+1;
489 return &per_cpu(rt_cache_stat, cpu); 489 return &per_cpu(rt_cache_stat, cpu);
490 } 490 }
491 return NULL; 491 return NULL;
492 492
493 } 493 }
494 494
495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v) 495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
496 { 496 {
497 497
498 } 498 }
499 499
500 static int rt_cpu_seq_show(struct seq_file *seq, void *v) 500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
501 { 501 {
502 struct rt_cache_stat *st = v; 502 struct rt_cache_stat *st = v;
503 503
504 if (v == SEQ_START_TOKEN) { 504 if (v == SEQ_START_TOKEN) {
505 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); 505 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
506 return 0; 506 return 0;
507 } 507 }
508 508
509 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 509 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
510 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 510 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
511 dst_entries_get_slow(&ipv4_dst_ops), 511 dst_entries_get_slow(&ipv4_dst_ops),
512 st->in_hit, 512 st->in_hit,
513 st->in_slow_tot, 513 st->in_slow_tot,
514 st->in_slow_mc, 514 st->in_slow_mc,
515 st->in_no_route, 515 st->in_no_route,
516 st->in_brd, 516 st->in_brd,
517 st->in_martian_dst, 517 st->in_martian_dst,
518 st->in_martian_src, 518 st->in_martian_src,
519 519
520 st->out_hit, 520 st->out_hit,
521 st->out_slow_tot, 521 st->out_slow_tot,
522 st->out_slow_mc, 522 st->out_slow_mc,
523 523
524 st->gc_total, 524 st->gc_total,
525 st->gc_ignored, 525 st->gc_ignored,
526 st->gc_goal_miss, 526 st->gc_goal_miss,
527 st->gc_dst_overflow, 527 st->gc_dst_overflow,
528 st->in_hlist_search, 528 st->in_hlist_search,
529 st->out_hlist_search 529 st->out_hlist_search
530 ); 530 );
531 return 0; 531 return 0;
532 } 532 }
533 533
534 static const struct seq_operations rt_cpu_seq_ops = { 534 static const struct seq_operations rt_cpu_seq_ops = {
535 .start = rt_cpu_seq_start, 535 .start = rt_cpu_seq_start,
536 .next = rt_cpu_seq_next, 536 .next = rt_cpu_seq_next,
537 .stop = rt_cpu_seq_stop, 537 .stop = rt_cpu_seq_stop,
538 .show = rt_cpu_seq_show, 538 .show = rt_cpu_seq_show,
539 }; 539 };
540 540
541 541
542 static int rt_cpu_seq_open(struct inode *inode, struct file *file) 542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
543 { 543 {
544 return seq_open(file, &rt_cpu_seq_ops); 544 return seq_open(file, &rt_cpu_seq_ops);
545 } 545 }
546 546
547 static const struct file_operations rt_cpu_seq_fops = { 547 static const struct file_operations rt_cpu_seq_fops = {
548 .owner = THIS_MODULE, 548 .owner = THIS_MODULE,
549 .open = rt_cpu_seq_open, 549 .open = rt_cpu_seq_open,
550 .read = seq_read, 550 .read = seq_read,
551 .llseek = seq_lseek, 551 .llseek = seq_lseek,
552 .release = seq_release, 552 .release = seq_release,
553 }; 553 };
554 554
555 #ifdef CONFIG_IP_ROUTE_CLASSID 555 #ifdef CONFIG_IP_ROUTE_CLASSID
556 static int rt_acct_proc_show(struct seq_file *m, void *v) 556 static int rt_acct_proc_show(struct seq_file *m, void *v)
557 { 557 {
558 struct ip_rt_acct *dst, *src; 558 struct ip_rt_acct *dst, *src;
559 unsigned int i, j; 559 unsigned int i, j;
560 560
561 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); 561 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
562 if (!dst) 562 if (!dst)
563 return -ENOMEM; 563 return -ENOMEM;
564 564
565 for_each_possible_cpu(i) { 565 for_each_possible_cpu(i) {
566 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); 566 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
567 for (j = 0; j < 256; j++) { 567 for (j = 0; j < 256; j++) {
568 dst[j].o_bytes += src[j].o_bytes; 568 dst[j].o_bytes += src[j].o_bytes;
569 dst[j].o_packets += src[j].o_packets; 569 dst[j].o_packets += src[j].o_packets;
570 dst[j].i_bytes += src[j].i_bytes; 570 dst[j].i_bytes += src[j].i_bytes;
571 dst[j].i_packets += src[j].i_packets; 571 dst[j].i_packets += src[j].i_packets;
572 } 572 }
573 } 573 }
574 574
575 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); 575 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
576 kfree(dst); 576 kfree(dst);
577 return 0; 577 return 0;
578 } 578 }
579 579
580 static int rt_acct_proc_open(struct inode *inode, struct file *file) 580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
581 { 581 {
582 return single_open(file, rt_acct_proc_show, NULL); 582 return single_open(file, rt_acct_proc_show, NULL);
583 } 583 }
584 584
585 static const struct file_operations rt_acct_proc_fops = { 585 static const struct file_operations rt_acct_proc_fops = {
586 .owner = THIS_MODULE, 586 .owner = THIS_MODULE,
587 .open = rt_acct_proc_open, 587 .open = rt_acct_proc_open,
588 .read = seq_read, 588 .read = seq_read,
589 .llseek = seq_lseek, 589 .llseek = seq_lseek,
590 .release = single_release, 590 .release = single_release,
591 }; 591 };
592 #endif 592 #endif
593 593
594 static int __net_init ip_rt_do_proc_init(struct net *net) 594 static int __net_init ip_rt_do_proc_init(struct net *net)
595 { 595 {
596 struct proc_dir_entry *pde; 596 struct proc_dir_entry *pde;
597 597
598 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, 598 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
599 &rt_cache_seq_fops); 599 &rt_cache_seq_fops);
600 if (!pde) 600 if (!pde)
601 goto err1; 601 goto err1;
602 602
603 pde = proc_create("rt_cache", S_IRUGO, 603 pde = proc_create("rt_cache", S_IRUGO,
604 net->proc_net_stat, &rt_cpu_seq_fops); 604 net->proc_net_stat, &rt_cpu_seq_fops);
605 if (!pde) 605 if (!pde)
606 goto err2; 606 goto err2;
607 607
608 #ifdef CONFIG_IP_ROUTE_CLASSID 608 #ifdef CONFIG_IP_ROUTE_CLASSID
609 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 609 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
610 if (!pde) 610 if (!pde)
611 goto err3; 611 goto err3;
612 #endif 612 #endif
613 return 0; 613 return 0;
614 614
615 #ifdef CONFIG_IP_ROUTE_CLASSID 615 #ifdef CONFIG_IP_ROUTE_CLASSID
616 err3: 616 err3:
617 remove_proc_entry("rt_cache", net->proc_net_stat); 617 remove_proc_entry("rt_cache", net->proc_net_stat);
618 #endif 618 #endif
619 err2: 619 err2:
620 remove_proc_entry("rt_cache", net->proc_net); 620 remove_proc_entry("rt_cache", net->proc_net);
621 err1: 621 err1:
622 return -ENOMEM; 622 return -ENOMEM;
623 } 623 }
624 624
625 static void __net_exit ip_rt_do_proc_exit(struct net *net) 625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
626 { 626 {
627 remove_proc_entry("rt_cache", net->proc_net_stat); 627 remove_proc_entry("rt_cache", net->proc_net_stat);
628 remove_proc_entry("rt_cache", net->proc_net); 628 remove_proc_entry("rt_cache", net->proc_net);
629 #ifdef CONFIG_IP_ROUTE_CLASSID 629 #ifdef CONFIG_IP_ROUTE_CLASSID
630 remove_proc_entry("rt_acct", net->proc_net); 630 remove_proc_entry("rt_acct", net->proc_net);
631 #endif 631 #endif
632 } 632 }
633 633
634 static struct pernet_operations ip_rt_proc_ops __net_initdata = { 634 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
635 .init = ip_rt_do_proc_init, 635 .init = ip_rt_do_proc_init,
636 .exit = ip_rt_do_proc_exit, 636 .exit = ip_rt_do_proc_exit,
637 }; 637 };
638 638
639 static int __init ip_rt_proc_init(void) 639 static int __init ip_rt_proc_init(void)
640 { 640 {
641 return register_pernet_subsys(&ip_rt_proc_ops); 641 return register_pernet_subsys(&ip_rt_proc_ops);
642 } 642 }
643 643
644 #else 644 #else
645 static inline int ip_rt_proc_init(void) 645 static inline int ip_rt_proc_init(void)
646 { 646 {
647 return 0; 647 return 0;
648 } 648 }
649 #endif /* CONFIG_PROC_FS */ 649 #endif /* CONFIG_PROC_FS */
650 650
651 static inline void rt_free(struct rtable *rt) 651 static inline void rt_free(struct rtable *rt)
652 { 652 {
653 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); 653 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
654 } 654 }
655 655
656 static inline void rt_drop(struct rtable *rt) 656 static inline void rt_drop(struct rtable *rt)
657 { 657 {
658 ip_rt_put(rt); 658 ip_rt_put(rt);
659 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); 659 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
660 } 660 }
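/* Both helpers above defer the actual free by one RCU grace period
 * (call_rcu_bh), which is what makes the lock-free lookups safe: a
 * reader that found this rtable under rcu_read_lock_bh() may keep
 * dereferencing it until it unlocks. A sketch of the ordering,
 * assuming one writer unlinking and one concurrent reader:
 *
 *	writer:	*rthp = rth->dst.rt_next;	unlink from the chain
 *	writer:	rt_free(rth);			queue dst_rcu_free()
 *	reader:	keeps using rth inside rcu_read_lock_bh()
 *	RCU:	grace period ends after all such readers unlock
 *	RCU:	dst_rcu_free() runs, memory can be reclaimed
 */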
661 661
662 static inline int rt_fast_clean(struct rtable *rth) 662 static inline int rt_fast_clean(struct rtable *rth)
663 { 663 {
664 /* Kill broadcast/multicast entries very aggressively, if they 664 /* Kill broadcast/multicast entries very aggressively, if they
665 collide in the hash table with more useful entries */ 665 collide in the hash table with more useful entries */
666 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 666 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
667 rt_is_input_route(rth) && rth->dst.rt_next; 667 rt_is_input_route(rth) && rth->dst.rt_next;
668 } 668 }
669 669
670 static inline int rt_valuable(struct rtable *rth) 670 static inline int rt_valuable(struct rtable *rth)
671 { 671 {
672 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 672 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
673 (rth->peer && rth->peer->pmtu_expires); 673 (rth->peer && rth->peer->pmtu_expires);
674 } 674 }
675 675
676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
677 { 677 {
678 unsigned long age; 678 unsigned long age;
679 int ret = 0; 679 int ret = 0;
680 680
681 if (atomic_read(&rth->dst.__refcnt)) 681 if (atomic_read(&rth->dst.__refcnt))
682 goto out; 682 goto out;
683 683
684 age = jiffies - rth->dst.lastuse; 684 age = jiffies - rth->dst.lastuse;
685 if ((age <= tmo1 && !rt_fast_clean(rth)) || 685 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
686 (age <= tmo2 && rt_valuable(rth))) 686 (age <= tmo2 && rt_valuable(rth)))
687 goto out; 687 goto out;
688 ret = 1; 688 ret = 1;
689 out: return ret; 689 out: return ret;
690 } 690 }
691 691
692 /* Bits of score are: 692 /* Bits of score are:
693 * 31: very valuable 693 * 31: very valuable
694 * 30: not quite useless 694 * 30: not quite useless
695 * 29..0: usage counter 695 * 29..0: usage counter
696 */ 696 */
697 static inline u32 rt_score(struct rtable *rt) 697 static inline u32 rt_score(struct rtable *rt)
698 { 698 {
699 u32 score = jiffies - rt->dst.lastuse; 699 u32 score = jiffies - rt->dst.lastuse;
700 700
701 score = ~score & ~(3<<30); 701 score = ~score & ~(3<<30);
702 702
703 if (rt_valuable(rt)) 703 if (rt_valuable(rt))
704 score |= (1<<31); 704 score |= (1<<31);
705 705
706 if (rt_is_output_route(rt) || 706 if (rt_is_output_route(rt) ||
707 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 707 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
708 score |= (1<<30); 708 score |= (1<<30);
709 709
710 return score; 710 return score;
711 } 711 }
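/* Worked example of the layout above: an output route, last used
 * 16 jiffies ago, with a pending PMTU expiry (so rt_valuable()):
 *
 *	score = ~16UL & ~(3 << 30);	low 30 bits = aged-down counter
 *	score |= (1 << 31);		very valuable
 *	score |= (1 << 30);		not quite useless
 *
 * Both top bits end up set, and the low 30 bits shrink as the entry
 * ages, so among equally-classed entries the older one scores lower
 * and becomes the eviction candidate in rt_intern_hash().
 */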
712 712
713 static inline bool rt_caching(const struct net *net) 713 static inline bool rt_caching(const struct net *net)
714 { 714 {
715 return net->ipv4.current_rt_cache_rebuild_count <= 715 return net->ipv4.current_rt_cache_rebuild_count <=
716 net->ipv4.sysctl_rt_cache_rebuild_count; 716 net->ipv4.sysctl_rt_cache_rebuild_count;
717 } 717 }
718 718
719 static inline bool compare_hash_inputs(const struct rtable *rt1, 719 static inline bool compare_hash_inputs(const struct rtable *rt1,
720 const struct rtable *rt2) 720 const struct rtable *rt2)
721 { 721 {
722 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | 722 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
723 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | 723 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
724 (rt1->rt_iif ^ rt2->rt_iif)) == 0); 724 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
725 } 725 }
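/* The OR-of-XORs idiom above (and in compare_keys() below) is a
 * branch-free multi-field compare: x ^ y is zero only when x == y,
 * so OR-ing the XORs of every pair is zero only when all pairs
 * match. For two fields:
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0
 *
 * is equivalent to (a1 == a2 && b1 == b2), evaluated with a single
 * test and no conditional branches.
 */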
726 726
727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) 727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
728 { 728 {
729 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | 729 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | 730 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731 (rt1->rt_mark ^ rt2->rt_mark) | 731 (rt1->rt_mark ^ rt2->rt_mark) |
732 (rt1->rt_key_tos ^ rt2->rt_key_tos) | 732 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
733 (rt1->rt_oif ^ rt2->rt_oif) | 733 (rt1->rt_oif ^ rt2->rt_oif) |
734 (rt1->rt_iif ^ rt2->rt_iif)) == 0; 734 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
735 } 735 }
736 736
737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
738 { 738 {
739 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); 739 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
740 } 740 }
741 741
742 static inline int rt_is_expired(struct rtable *rth) 742 static inline int rt_is_expired(struct rtable *rth)
743 { 743 {
744 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 744 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
745 } 745 }
746 746
747 /* 747 /*
748 * Perform a full scan of the hash table and free all entries. 748 * Perform a full scan of the hash table and free all entries.
749 * Can be called by a softirq or a process. 749 * Can be called by a softirq or a process.
750 * In the latter case, we want to reschedule if necessary. 750 * In the latter case, we want to reschedule if necessary.
751 */ 751 */
752 static void rt_do_flush(struct net *net, int process_context) 752 static void rt_do_flush(struct net *net, int process_context)
753 { 753 {
754 unsigned int i; 754 unsigned int i;
755 struct rtable *rth, *next; 755 struct rtable *rth, *next;
756 756
757 for (i = 0; i <= rt_hash_mask; i++) { 757 for (i = 0; i <= rt_hash_mask; i++) {
758 struct rtable __rcu **pprev; 758 struct rtable __rcu **pprev;
759 struct rtable *list; 759 struct rtable *list;
760 760
761 if (process_context && need_resched()) 761 if (process_context && need_resched())
762 cond_resched(); 762 cond_resched();
763 rth = rcu_dereference_raw(rt_hash_table[i].chain); 763 rth = rcu_dereference_raw(rt_hash_table[i].chain);
764 if (!rth) 764 if (!rth)
765 continue; 765 continue;
766 766
767 spin_lock_bh(rt_hash_lock_addr(i)); 767 spin_lock_bh(rt_hash_lock_addr(i));
768 768
769 list = NULL; 769 list = NULL;
770 pprev = &rt_hash_table[i].chain; 770 pprev = &rt_hash_table[i].chain;
771 rth = rcu_dereference_protected(*pprev, 771 rth = rcu_dereference_protected(*pprev,
772 lockdep_is_held(rt_hash_lock_addr(i))); 772 lockdep_is_held(rt_hash_lock_addr(i)));
773 773
774 while (rth) { 774 while (rth) {
775 next = rcu_dereference_protected(rth->dst.rt_next, 775 next = rcu_dereference_protected(rth->dst.rt_next,
776 lockdep_is_held(rt_hash_lock_addr(i))); 776 lockdep_is_held(rt_hash_lock_addr(i)));
777 777
778 if (!net || 778 if (!net ||
779 net_eq(dev_net(rth->dst.dev), net)) { 779 net_eq(dev_net(rth->dst.dev), net)) {
780 rcu_assign_pointer(*pprev, next); 780 rcu_assign_pointer(*pprev, next);
781 rcu_assign_pointer(rth->dst.rt_next, list); 781 rcu_assign_pointer(rth->dst.rt_next, list);
782 list = rth; 782 list = rth;
783 } else { 783 } else {
784 pprev = &rth->dst.rt_next; 784 pprev = &rth->dst.rt_next;
785 } 785 }
786 rth = next; 786 rth = next;
787 } 787 }
788 788
789 spin_unlock_bh(rt_hash_lock_addr(i)); 789 spin_unlock_bh(rt_hash_lock_addr(i));
790 790
791 for (; list; list = next) { 791 for (; list; list = next) {
792 next = rcu_dereference_protected(list->dst.rt_next, 1); 792 next = rcu_dereference_protected(list->dst.rt_next, 1);
793 rt_free(list); 793 rt_free(list);
794 } 794 }
795 } 795 }
796 } 796 }
797 797
798 /* 798 /*
799 * While freeing expired entries, we compute the average chain length 799 * While freeing expired entries, we compute the average chain length
800 * and standard deviation, using fixed-point arithmetic. 800 * and standard deviation, using fixed-point arithmetic.
801 * This gives an estimate of rt_chain_length_max: 801 * This gives an estimate of rt_chain_length_max:
802 * rt_chain_length_max = max(elasticity, AVG + 4*SD) 802 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
803 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude. 803 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
804 */ 804 */
805 805
806 #define FRACT_BITS 3 806 #define FRACT_BITS 3
807 #define ONE (1UL << FRACT_BITS) 807 #define ONE (1UL << FRACT_BITS)
808 808
809 /* 809 /*
810 * Given a hash chain and an item in this hash chain, 810 * Given a hash chain and an item in this hash chain,
811 * check whether a previous entry has the same hash_inputs 811 * check whether a previous entry has the same hash_inputs
812 * (but differs on tos, mark or oif) 812 * (but differs on tos, mark or oif)
813 * Returns 0 if an alias is found. 813 * Returns 0 if an alias is found.
814 * Returns ONE if rth has no alias before itself. 814 * Returns ONE if rth has no alias before itself.
815 */ 815 */
816 static int has_noalias(const struct rtable *head, const struct rtable *rth) 816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
817 { 817 {
818 const struct rtable *aux = head; 818 const struct rtable *aux = head;
819 819
820 while (aux != rth) { 820 while (aux != rth) {
821 if (compare_hash_inputs(aux, rth)) 821 if (compare_hash_inputs(aux, rth))
822 return 0; 822 return 0;
823 aux = rcu_dereference_protected(aux->dst.rt_next, 1); 823 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
824 } 824 }
825 return ONE; 825 return ONE;
826 } 826 }
827 827
828 /* 828 /*
829 * Perturbation of rt_genid by a small quantity in [1..256]. 829 * Perturbation of rt_genid by a small quantity in [1..256].
830 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate() 830 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
831 * many times (2^24) without repeating a recent rt_genid. 831 * many times (2^24) without repeating a recent rt_genid.
832 * The Jenkins hash is strong enough that little changes of rt_genid are OK. 832 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
833 */ 833 */
834 static void rt_cache_invalidate(struct net *net) 834 static void rt_cache_invalidate(struct net *net)
835 { 835 {
836 unsigned char shuffle; 836 unsigned char shuffle;
837 837
838 get_random_bytes(&shuffle, sizeof(shuffle)); 838 get_random_bytes(&shuffle, sizeof(shuffle));
839 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 839 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
840 } 840 }
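/* Worked example of the bound stated above: each call advances
 * rt_genid by a random value in [1, 256], so after k invalidations
 * the counter has moved by at least k and at most 256 * k. An entry
 * is stale as soon as rt_is_expired() sees
 * rth->rt_genid != rt_genid(net); for a stale entry to match again,
 * the 32-bit counter would have to wrap completely, which needs at
 * least 2^32 / 256 = 2^24 further invalidations.
 */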
841 841
842 /* 842 /*
843 * delay < 0 : invalidate cache (fast : entries will be deleted later) 843 * delay < 0 : invalidate cache (fast : entries will be deleted later)
844 * delay >= 0 : invalidate & flush cache (can be long) 844 * delay >= 0 : invalidate & flush cache (can be long)
845 */ 845 */
846 void rt_cache_flush(struct net *net, int delay) 846 void rt_cache_flush(struct net *net, int delay)
847 { 847 {
848 rt_cache_invalidate(net); 848 rt_cache_invalidate(net);
849 if (delay >= 0) 849 if (delay >= 0)
850 rt_do_flush(net, !in_softirq()); 850 rt_do_flush(net, !in_softirq());
851 } 851 }
852 852
853 /* Flush previously invalidated entries from the cache */ 853 /* Flush previously invalidated entries from the cache */
854 void rt_cache_flush_batch(struct net *net) 854 void rt_cache_flush_batch(struct net *net)
855 { 855 {
856 rt_do_flush(net, !in_softirq()); 856 rt_do_flush(net, !in_softirq());
857 } 857 }
858 858
859 static void rt_emergency_hash_rebuild(struct net *net) 859 static void rt_emergency_hash_rebuild(struct net *net)
860 { 860 {
861 if (net_ratelimit()) 861 if (net_ratelimit())
862 printk(KERN_WARNING "Route hash chain too long!\n"); 862 printk(KERN_WARNING "Route hash chain too long!\n");
863 rt_cache_invalidate(net); 863 rt_cache_invalidate(net);
864 } 864 }
865 865
866 /* 866 /*
867 Short description of GC goals. 867 Short description of GC goals.
868 868
869 We want to build an algorithm which keeps the routing cache 869 We want to build an algorithm which keeps the routing cache
870 at an equilibrium point, where the number of aged-off entries 870 at an equilibrium point, where the number of aged-off entries
871 stays approximately equal to the number of newly generated ones. 871 stays approximately equal to the number of newly generated ones.
872 872
873 The current expiration strength is the variable "expire". 873 The current expiration strength is the variable "expire".
874 We try to adjust it dynamically, so that when the network 874 We try to adjust it dynamically, so that when the network
875 is idle, expire is large enough to keep enough warm entries, 875 is idle, expire is large enough to keep enough warm entries,
876 and when load increases, it shrinks to limit the cache size. 876 and when load increases, it shrinks to limit the cache size.
877 */ 877 */
878 878
879 static int rt_garbage_collect(struct dst_ops *ops) 879 static int rt_garbage_collect(struct dst_ops *ops)
880 { 880 {
881 static unsigned long expire = RT_GC_TIMEOUT; 881 static unsigned long expire = RT_GC_TIMEOUT;
882 static unsigned long last_gc; 882 static unsigned long last_gc;
883 static int rover; 883 static int rover;
884 static int equilibrium; 884 static int equilibrium;
885 struct rtable *rth; 885 struct rtable *rth;
886 struct rtable __rcu **rthp; 886 struct rtable __rcu **rthp;
887 unsigned long now = jiffies; 887 unsigned long now = jiffies;
888 int goal; 888 int goal;
889 int entries = dst_entries_get_fast(&ipv4_dst_ops); 889 int entries = dst_entries_get_fast(&ipv4_dst_ops);
890 890
891 /* 891 /*
892 * Garbage collection is pretty expensive, 892 * Garbage collection is pretty expensive,
893 * do not make it too frequently. 893 * do not make it too frequently.
894 */ 894 */
895 895
896 RT_CACHE_STAT_INC(gc_total); 896 RT_CACHE_STAT_INC(gc_total);
897 897
898 if (now - last_gc < ip_rt_gc_min_interval && 898 if (now - last_gc < ip_rt_gc_min_interval &&
899 entries < ip_rt_max_size) { 899 entries < ip_rt_max_size) {
900 RT_CACHE_STAT_INC(gc_ignored); 900 RT_CACHE_STAT_INC(gc_ignored);
901 goto out; 901 goto out;
902 } 902 }
903 903
904 entries = dst_entries_get_slow(&ipv4_dst_ops); 904 entries = dst_entries_get_slow(&ipv4_dst_ops);
905 /* Calculate the number of entries which we want to expire now. */ 905 /* Calculate the number of entries which we want to expire now. */
906 goal = entries - (ip_rt_gc_elasticity << rt_hash_log); 906 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
907 if (goal <= 0) { 907 if (goal <= 0) {
908 if (equilibrium < ipv4_dst_ops.gc_thresh) 908 if (equilibrium < ipv4_dst_ops.gc_thresh)
909 equilibrium = ipv4_dst_ops.gc_thresh; 909 equilibrium = ipv4_dst_ops.gc_thresh;
910 goal = entries - equilibrium; 910 goal = entries - equilibrium;
911 if (goal > 0) { 911 if (goal > 0) {
912 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 912 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
913 goal = entries - equilibrium; 913 goal = entries - equilibrium;
914 } 914 }
915 } else { 915 } else {
916 /* We are in a dangerous area. Try to reduce the cache really 916 /* We are in a dangerous area. Try to reduce the cache really
917 * aggressively. 917 * aggressively.
918 */ 918 */
919 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 919 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
920 equilibrium = entries - goal; 920 equilibrium = entries - goal;
921 } 921 }
922 922
923 if (now - last_gc >= ip_rt_gc_min_interval) 923 if (now - last_gc >= ip_rt_gc_min_interval)
924 last_gc = now; 924 last_gc = now;
925 925
926 if (goal <= 0) { 926 if (goal <= 0) {
927 equilibrium += goal; 927 equilibrium += goal;
928 goto work_done; 928 goto work_done;
929 } 929 }
930 930
931 do { 931 do {
932 int i, k; 932 int i, k;
933 933
934 for (i = rt_hash_mask, k = rover; i >= 0; i--) { 934 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
935 unsigned long tmo = expire; 935 unsigned long tmo = expire;
936 936
937 k = (k + 1) & rt_hash_mask; 937 k = (k + 1) & rt_hash_mask;
938 rthp = &rt_hash_table[k].chain; 938 rthp = &rt_hash_table[k].chain;
939 spin_lock_bh(rt_hash_lock_addr(k)); 939 spin_lock_bh(rt_hash_lock_addr(k));
940 while ((rth = rcu_dereference_protected(*rthp, 940 while ((rth = rcu_dereference_protected(*rthp,
941 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { 941 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
942 if (!rt_is_expired(rth) && 942 if (!rt_is_expired(rth) &&
943 !rt_may_expire(rth, tmo, expire)) { 943 !rt_may_expire(rth, tmo, expire)) {
944 tmo >>= 1; 944 tmo >>= 1;
945 rthp = &rth->dst.rt_next; 945 rthp = &rth->dst.rt_next;
946 continue; 946 continue;
947 } 947 }
948 *rthp = rth->dst.rt_next; 948 *rthp = rth->dst.rt_next;
949 rt_free(rth); 949 rt_free(rth);
950 goal--; 950 goal--;
951 } 951 }
952 spin_unlock_bh(rt_hash_lock_addr(k)); 952 spin_unlock_bh(rt_hash_lock_addr(k));
953 if (goal <= 0) 953 if (goal <= 0)
954 break; 954 break;
955 } 955 }
956 rover = k; 956 rover = k;
957 957
958 if (goal <= 0) 958 if (goal <= 0)
959 goto work_done; 959 goto work_done;
960 960
961 /* The goal was not achieved. We stop the process if: 961 /* The goal was not achieved. We stop the process if:
962 962
963 - expire has been reduced to zero (otherwise, expire is halved); 963 - expire has been reduced to zero (otherwise, expire is halved);
964 - the table is not full; 964 - the table is not full;
965 - we are called from interrupt context. 965 - we are called from interrupt context.
966 The jiffies check is just a fallback/debug loop breaker; 966 The jiffies check is just a fallback/debug loop breaker;
967 we will not spin here for a long time in any case. 967 we will not spin here for a long time in any case.
968 */ 968 */
969 969
970 RT_CACHE_STAT_INC(gc_goal_miss); 970 RT_CACHE_STAT_INC(gc_goal_miss);
971 971
972 if (expire == 0) 972 if (expire == 0)
973 break; 973 break;
974 974
975 expire >>= 1; 975 expire >>= 1;
976 976
977 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) 977 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978 goto out; 978 goto out;
979 } while (!in_softirq() && time_before_eq(jiffies, now)); 979 } while (!in_softirq() && time_before_eq(jiffies, now));
980 980
981 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) 981 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
982 goto out; 982 goto out;
983 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) 983 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
984 goto out; 984 goto out;
985 if (net_ratelimit()) 985 if (net_ratelimit())
986 printk(KERN_WARNING "dst cache overflow\n"); 986 printk(KERN_WARNING "dst cache overflow\n");
987 RT_CACHE_STAT_INC(gc_dst_overflow); 987 RT_CACHE_STAT_INC(gc_dst_overflow);
988 return 1; 988 return 1;
989 989
990 work_done: 990 work_done:
991 expire += ip_rt_gc_min_interval; 991 expire += ip_rt_gc_min_interval;
992 if (expire > ip_rt_gc_timeout || 992 if (expire > ip_rt_gc_timeout ||
993 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || 993 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
994 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) 994 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
995 expire = ip_rt_gc_timeout; 995 expire = ip_rt_gc_timeout;
996 out: return 0; 996 out: return 0;
997 } 997 }
998 998
999 /* 999 /*
1000 * Returns the number of entries in a hash chain that have different hash_inputs 1000 * Returns the number of entries in a hash chain that have different hash_inputs
1001 */ 1001 */
1002 static int slow_chain_length(const struct rtable *head) 1002 static int slow_chain_length(const struct rtable *head)
1003 { 1003 {
1004 int length = 0; 1004 int length = 0;
1005 const struct rtable *rth = head; 1005 const struct rtable *rth = head;
1006 1006
1007 while (rth) { 1007 while (rth) {
1008 length += has_noalias(head, rth); 1008 length += has_noalias(head, rth);
1009 rth = rcu_dereference_protected(rth->dst.rt_next, 1); 1009 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1010 } 1010 }
1011 return length >> FRACT_BITS; 1011 return length >> FRACT_BITS;
1012 } 1012 }
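/* Worked example, with FRACT_BITS = 3 and hence ONE = 8: for a chain
 * of five entries in which two are aliases of earlier ones,
 * has_noalias() contributes 8 for each of the three distinct entries
 * and 0 for the two aliases, so slow_chain_length() returns
 * (3 * 8) >> 3 = 3, i.e. the number of distinct hash_inputs in the chain.
 */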
1013 1013
1014 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) 1014 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1015 { 1015 {
1016 struct neigh_table *tbl = &arp_tbl; 1016 struct neigh_table *tbl = &arp_tbl;
1017 static const __be32 inaddr_any = 0; 1017 static const __be32 inaddr_any = 0;
1018 struct net_device *dev = dst->dev; 1018 struct net_device *dev = dst->dev;
1019 const __be32 *pkey = daddr; 1019 const __be32 *pkey = daddr;
1020 struct neighbour *n; 1020 struct neighbour *n;
1021 1021
1022 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 1022 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1023 if (dev->type == ARPHRD_ATM) 1023 if (dev->type == ARPHRD_ATM)
1024 tbl = clip_tbl_hook; 1024 tbl = clip_tbl_hook;
1025 #endif 1025 #endif
1026 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) 1026 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1027 pkey = &inaddr_any; 1027 pkey = &inaddr_any;
1028 1028
1029 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); 1029 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1030 if (n) 1030 if (n)
1031 return n; 1031 return n;
1032 return neigh_create(tbl, pkey, dev); 1032 return neigh_create(tbl, pkey, dev);
1033 } 1033 }
1034 1034
1035 static int rt_bind_neighbour(struct rtable *rt) 1035 static int rt_bind_neighbour(struct rtable *rt)
1036 { 1036 {
1037 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); 1037 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1038 if (IS_ERR(n)) 1038 if (IS_ERR(n))
1039 return PTR_ERR(n); 1039 return PTR_ERR(n);
1040 dst_set_neighbour(&rt->dst, n); 1040 dst_set_neighbour(&rt->dst, n);
1041 1041
1042 return 0; 1042 return 0;
1043 } 1043 }
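/* dst_set_neighbour() used above is the write side of the new RCU
 * protocol: it must publish the pointer with rcu_assign_pointer() so
 * that a concurrent dst_get_neighbour() sees either the old or the
 * new neighbour, never a half-initialized one. In sketch form (the
 * real definition lives in include/net/dst.h):
 *
 * static inline void dst_set_neighbour(struct dst_entry *dst,
 *				      struct neighbour *neigh)
 * {
 *	rcu_assign_pointer(dst->_neighbour, neigh);
 * }
 */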
1044 1044
1045 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, 1045 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1046 struct sk_buff *skb, int ifindex) 1046 struct sk_buff *skb, int ifindex)
1047 { 1047 {
1048 struct rtable *rth, *cand; 1048 struct rtable *rth, *cand;
1049 struct rtable __rcu **rthp, **candp; 1049 struct rtable __rcu **rthp, **candp;
1050 unsigned long now; 1050 unsigned long now;
1051 u32 min_score; 1051 u32 min_score;
1052 int chain_length; 1052 int chain_length;
1053 int attempts = !in_softirq(); 1053 int attempts = !in_softirq();
1054 1054
1055 restart: 1055 restart:
1056 chain_length = 0; 1056 chain_length = 0;
1057 min_score = ~(u32)0; 1057 min_score = ~(u32)0;
1058 cand = NULL; 1058 cand = NULL;
1059 candp = NULL; 1059 candp = NULL;
1060 now = jiffies; 1060 now = jiffies;
1061 1061
1062 if (!rt_caching(dev_net(rt->dst.dev))) { 1062 if (!rt_caching(dev_net(rt->dst.dev))) {
1063 /* 1063 /*
1064 * If we're not caching, just tell the caller we 1064 * If we're not caching, just tell the caller we
1065 * were successful and don't touch the route. The 1065 * were successful and don't touch the route. The
1066 * caller holds the sole reference to the cache entry, and 1066 * caller holds the sole reference to the cache entry, and
1067 * it will be released when the caller is done with it. 1067 * it will be released when the caller is done with it.
1068 * If we drop it here, the callers have no way to resolve routes 1068 * If we drop it here, the callers have no way to resolve routes
1069 * when we're not caching. Instead, just point *rp at rt, so 1069 * when we're not caching. Instead, just point *rp at rt, so
1070 * the caller gets a single use out of the route. 1070 * the caller gets a single use out of the route.
1071 * Note that we do rt_free on this new route entry, so that 1071 * Note that we do rt_free on this new route entry, so that
1072 * once its refcount hits zero, we are still able to reap it 1072 * once its refcount hits zero, we are still able to reap it
1073 * (Thanks Alexey) 1073 * (Thanks Alexey)
1074 * Note: To avoid expensive rcu stuff for this uncached dst, 1074 * Note: To avoid expensive rcu stuff for this uncached dst,
1075 * we set DST_NOCACHE so that dst_release() can free dst without 1075 * we set DST_NOCACHE so that dst_release() can free dst without
1076 * waiting for a grace period. 1076 * waiting for a grace period.
1077 */ 1077 */
1078 1078
1079 rt->dst.flags |= DST_NOCACHE; 1079 rt->dst.flags |= DST_NOCACHE;
1080 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { 1080 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1081 int err = rt_bind_neighbour(rt); 1081 int err = rt_bind_neighbour(rt);
1082 if (err) { 1082 if (err) {
1083 if (net_ratelimit()) 1083 if (net_ratelimit())
1084 printk(KERN_WARNING 1084 printk(KERN_WARNING
1085 "Neighbour table failure & not caching routes.\n"); 1085 "Neighbour table failure & not caching routes.\n");
1086 ip_rt_put(rt); 1086 ip_rt_put(rt);
1087 return ERR_PTR(err); 1087 return ERR_PTR(err);
1088 } 1088 }
1089 } 1089 }
1090 1090
1091 goto skip_hashing; 1091 goto skip_hashing;
1092 } 1092 }
1093 1093
1094 rthp = &rt_hash_table[hash].chain; 1094 rthp = &rt_hash_table[hash].chain;
1095 1095
1096 spin_lock_bh(rt_hash_lock_addr(hash)); 1096 spin_lock_bh(rt_hash_lock_addr(hash));
1097 while ((rth = rcu_dereference_protected(*rthp, 1097 while ((rth = rcu_dereference_protected(*rthp,
1098 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { 1098 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1099 if (rt_is_expired(rth)) { 1099 if (rt_is_expired(rth)) {
1100 *rthp = rth->dst.rt_next; 1100 *rthp = rth->dst.rt_next;
1101 rt_free(rth); 1101 rt_free(rth);
1102 continue; 1102 continue;
1103 } 1103 }
1104 if (compare_keys(rth, rt) && compare_netns(rth, rt)) { 1104 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1105 /* Put it first */ 1105 /* Put it first */
1106 *rthp = rth->dst.rt_next; 1106 *rthp = rth->dst.rt_next;
1107 /* 1107 /*
1108 * Since lookup is lockfree, the deletion 1108 * Since lookup is lockfree, the deletion
1109 * must be visible to another weakly ordered CPU before 1109 * must be visible to another weakly ordered CPU before
1110 * the insertion at the start of the hash chain. 1110 * the insertion at the start of the hash chain.
1111 */ 1111 */
1112 rcu_assign_pointer(rth->dst.rt_next, 1112 rcu_assign_pointer(rth->dst.rt_next,
1113 rt_hash_table[hash].chain); 1113 rt_hash_table[hash].chain);
1114 /* 1114 /*
1115 * Since lookup is lockfree, the update writes 1115 * Since lookup is lockfree, the update writes
1116 * must be ordered for consistency on SMP. 1116 * must be ordered for consistency on SMP.
1117 */ 1117 */
1118 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 1118 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1119 1119
1120 dst_use(&rth->dst, now); 1120 dst_use(&rth->dst, now);
1121 spin_unlock_bh(rt_hash_lock_addr(hash)); 1121 spin_unlock_bh(rt_hash_lock_addr(hash));
1122 1122
1123 rt_drop(rt); 1123 rt_drop(rt);
1124 if (skb) 1124 if (skb)
1125 skb_dst_set(skb, &rth->dst); 1125 skb_dst_set(skb, &rth->dst);
1126 return rth; 1126 return rth;
1127 } 1127 }
1128 1128
1129 if (!atomic_read(&rth->dst.__refcnt)) { 1129 if (!atomic_read(&rth->dst.__refcnt)) {
1130 u32 score = rt_score(rth); 1130 u32 score = rt_score(rth);
1131 1131
1132 if (score <= min_score) { 1132 if (score <= min_score) {
1133 cand = rth; 1133 cand = rth;
1134 candp = rthp; 1134 candp = rthp;
1135 min_score = score; 1135 min_score = score;
1136 } 1136 }
1137 } 1137 }
1138 1138
1139 chain_length++; 1139 chain_length++;
1140 1140
1141 rthp = &rth->dst.rt_next; 1141 rthp = &rth->dst.rt_next;
1142 } 1142 }
1143 1143
1144 if (cand) { 1144 if (cand) {
1145 /* ip_rt_gc_elasticity used to be the average chain 1145 /* ip_rt_gc_elasticity used to be the average chain
1146 * length; when exceeded, gc becomes really aggressive. 1146 * length; when exceeded, gc becomes really aggressive.
1147 * 1147 *
1148 * The second limit is less certain. At the moment it allows 1148 * The second limit is less certain. At the moment it allows
1149 * only 2 entries per bucket. We will see. 1149 * only 2 entries per bucket. We will see.
1150 */ 1150 */
1151 if (chain_length > ip_rt_gc_elasticity) { 1151 if (chain_length > ip_rt_gc_elasticity) {
1152 *candp = cand->dst.rt_next; 1152 *candp = cand->dst.rt_next;
1153 rt_free(cand); 1153 rt_free(cand);
1154 } 1154 }
1155 } else { 1155 } else {
1156 if (chain_length > rt_chain_length_max && 1156 if (chain_length > rt_chain_length_max &&
1157 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { 1157 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1158 struct net *net = dev_net(rt->dst.dev); 1158 struct net *net = dev_net(rt->dst.dev);
1159 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1159 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1160 if (!rt_caching(net)) { 1160 if (!rt_caching(net)) {
1161 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1161 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1162 rt->dst.dev->name, num); 1162 rt->dst.dev->name, num);
1163 } 1163 }
1164 rt_emergency_hash_rebuild(net); 1164 rt_emergency_hash_rebuild(net);
1165 spin_unlock_bh(rt_hash_lock_addr(hash)); 1165 spin_unlock_bh(rt_hash_lock_addr(hash));
1166 1166
1167 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1167 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1168 ifindex, rt_genid(net)); 1168 ifindex, rt_genid(net));
1169 goto restart; 1169 goto restart;
1170 } 1170 }
1171 } 1171 }
1172 1172
1173 /* Try to bind the route to arp only if it is an output 1173 /* Try to bind the route to arp only if it is an output
1174 route or a unicast forwarding path. 1174 route or a unicast forwarding path.
1175 */ 1175 */
1176 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { 1176 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177 int err = rt_bind_neighbour(rt); 1177 int err = rt_bind_neighbour(rt);
1178 if (err) { 1178 if (err) {
1179 spin_unlock_bh(rt_hash_lock_addr(hash)); 1179 spin_unlock_bh(rt_hash_lock_addr(hash));
1180 1180
1181 if (err != -ENOBUFS) { 1181 if (err != -ENOBUFS) {
1182 rt_drop(rt); 1182 rt_drop(rt);
1183 return ERR_PTR(err); 1183 return ERR_PTR(err);
1184 } 1184 }
1185 1185
1186 /* Neighbour tables are full and nothing 1186 /* Neighbour tables are full and nothing
1187 can be released. Try to shrink the route cache; 1187 can be released. Try to shrink the route cache;
1188 it most likely holds some neighbour records. 1188 it most likely holds some neighbour records.
1189 */ 1189 */
1190 if (attempts-- > 0) { 1190 if (attempts-- > 0) {
1191 int saved_elasticity = ip_rt_gc_elasticity; 1191 int saved_elasticity = ip_rt_gc_elasticity;
1192 int saved_int = ip_rt_gc_min_interval; 1192 int saved_int = ip_rt_gc_min_interval;
1193 ip_rt_gc_elasticity = 1; 1193 ip_rt_gc_elasticity = 1;
1194 ip_rt_gc_min_interval = 0; 1194 ip_rt_gc_min_interval = 0;
1195 rt_garbage_collect(&ipv4_dst_ops); 1195 rt_garbage_collect(&ipv4_dst_ops);
1196 ip_rt_gc_min_interval = saved_int; 1196 ip_rt_gc_min_interval = saved_int;
1197 ip_rt_gc_elasticity = saved_elasticity; 1197 ip_rt_gc_elasticity = saved_elasticity;
1198 goto restart; 1198 goto restart;
1199 } 1199 }
1200 1200
1201 if (net_ratelimit()) 1201 if (net_ratelimit())
1202 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1202 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1203 rt_drop(rt); 1203 rt_drop(rt);
1204 return ERR_PTR(-ENOBUFS); 1204 return ERR_PTR(-ENOBUFS);
1205 } 1205 }
1206 } 1206 }
1207 1207
1208 rt->dst.rt_next = rt_hash_table[hash].chain; 1208 rt->dst.rt_next = rt_hash_table[hash].chain;
1209 1209
1210 /* 1210 /*
1211 * Since lookup is lockfree, we must make sure 1211 * Since lookup is lockfree, we must make sure
1212 * previous writes to rt are committed to memory 1212 * previous writes to rt are committed to memory
1213 * before making rt visible to other CPUs. 1213 * before making rt visible to other CPUs.
1214 */ 1214 */
1215 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1215 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1216 1216
1217 spin_unlock_bh(rt_hash_lock_addr(hash)); 1217 spin_unlock_bh(rt_hash_lock_addr(hash));
1218 1218
1219 skip_hashing: 1219 skip_hashing:
1220 if (skb) 1220 if (skb)
1221 skb_dst_set(skb, &rt->dst); 1221 skb_dst_set(skb, &rt->dst);
1222 return rt; 1222 return rt;
1223 } 1223 }
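
The rcu_assign_pointer() calls in rt_intern_hash() enforce the lockfree-publish rule: a node is fully initialised first and only then made reachable, with release semantics, so lockless readers can never observe a half-built entry. A compilable C11 sketch of the same ordering — an acquire load stands in for rcu_dereference(), which is cheaper in the kernel, and all names are illustrative.

#include <stdatomic.h>
#include <stddef.h>

struct node {
	int key;
	struct node *next;
};

static _Atomic(struct node *) chain_head;

/* Writer: initialise everything, then publish (rcu_assign_pointer). */
static void publish(struct node *n, int key)
{
	n->key = key;
	n->next = atomic_load_explicit(&chain_head, memory_order_relaxed);
	atomic_store_explicit(&chain_head, n, memory_order_release);
}

/* Lockless reader: the acquire load pairs with the release store. */
static struct node *lookup(int key)
{
	struct node *n = atomic_load_explicit(&chain_head,
					       memory_order_acquire);
	while (n && n->key != key)
		n = n->next;
	return n;
}
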
1224 1224
1225 static atomic_t __rt_peer_genid = ATOMIC_INIT(0); 1225 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1226 1226
1227 static u32 rt_peer_genid(void) 1227 static u32 rt_peer_genid(void)
1228 { 1228 {
1229 return atomic_read(&__rt_peer_genid); 1229 return atomic_read(&__rt_peer_genid);
1230 } 1230 }
1231 1231
1232 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) 1232 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1233 { 1233 {
1234 struct inet_peer *peer; 1234 struct inet_peer *peer;
1235 1235
1236 peer = inet_getpeer_v4(daddr, create); 1236 peer = inet_getpeer_v4(daddr, create);
1237 1237
1238 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1238 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1239 inet_putpeer(peer); 1239 inet_putpeer(peer);
1240 else 1240 else
1241 rt->rt_peer_genid = rt_peer_genid(); 1241 rt->rt_peer_genid = rt_peer_genid();
1242 } 1242 }
1243 1243
1244 /* 1244 /*
1245 * Peer allocation may fail only in serious out-of-memory conditions. However 1245 * Peer allocation may fail only in serious out-of-memory conditions. However
1246 * we still can generate some output. 1246 * we still can generate some output.
1247 * Random ID selection looks a bit dangerous because we have no chance to 1247 * Random ID selection looks a bit dangerous because we have no chance to
1248 * select an ID that stays unique for a reasonable period of time. 1248 * select an ID that stays unique for a reasonable period of time.
1249 * But a broken packet identifier may be better than no packet at all. 1249 * But a broken packet identifier may be better than no packet at all.
1250 */ 1250 */
1251 static void ip_select_fb_ident(struct iphdr *iph) 1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 { 1252 {
1253 static DEFINE_SPINLOCK(ip_fb_id_lock); 1253 static DEFINE_SPINLOCK(ip_fb_id_lock);
1254 static u32 ip_fallback_id; 1254 static u32 ip_fallback_id;
1255 u32 salt; 1255 u32 salt;
1256 1256
1257 spin_lock_bh(&ip_fb_id_lock); 1257 spin_lock_bh(&ip_fb_id_lock);
1258 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); 1258 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259 iph->id = htons(salt & 0xFFFF); 1259 iph->id = htons(salt & 0xFFFF);
1260 ip_fallback_id = salt; 1260 ip_fallback_id = salt;
1261 spin_unlock_bh(&ip_fb_id_lock); 1261 spin_unlock_bh(&ip_fb_id_lock);
1262 } 1262 }
1263 1263
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 { 1265 {
1266 struct rtable *rt = (struct rtable *) dst; 1266 struct rtable *rt = (struct rtable *) dst;
1267 1267
1268 if (rt) { 1268 if (rt) {
1269 if (rt->peer == NULL) 1269 if (rt->peer == NULL)
1270 rt_bind_peer(rt, rt->rt_dst, 1); 1270 rt_bind_peer(rt, rt->rt_dst, 1);
1271 1271
1272 /* If a peer is attached to the destination, it is never detached, 1272 /* If a peer is attached to the destination, it is never detached,
1273 so we need not grab a lock to dereference it. 1273 so we need not grab a lock to dereference it.
1274 */ 1274 */
1275 if (rt->peer) { 1275 if (rt->peer) {
1276 iph->id = htons(inet_getid(rt->peer, more)); 1276 iph->id = htons(inet_getid(rt->peer, more));
1277 return; 1277 return;
1278 } 1278 }
1279 } else 1279 } else
1280 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 1280 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281 __builtin_return_address(0)); 1281 __builtin_return_address(0));
1282 1282
1283 ip_select_fb_ident(iph); 1283 ip_select_fb_ident(iph);
1284 } 1284 }
1285 EXPORT_SYMBOL(__ip_select_ident); 1285 EXPORT_SYMBOL(__ip_select_ident);
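
ip_select_fb_ident() chains each new ID off the previous salt, so even without per-destination state consecutive datagrams are unlikely to repeat an ID immediately. A userspace sketch of that generator; hash_mix() is only a stand-in for secure_ip_id() (which is keyed), and the address is made up.

#include <stdint.h>
#include <stdio.h>

static uint32_t hash_mix(uint32_t x)	/* illustrative stand-in, NOT secure_ip_id() */
{
	x ^= x >> 16;
	x *= 0x7feb352d;
	x ^= x >> 15;
	x *= 0x846ca68b;
	x ^= x >> 16;
	return x;
}

int main(void)
{
	static uint32_t fallback_id;		/* models the ip_fallback_id static */
	const uint32_t daddr = 0x0a000001;	/* 10.0.0.1, illustrative */

	for (int i = 0; i < 3; i++) {
		uint32_t salt = hash_mix(fallback_id ^ daddr);
		printf("id = 0x%04x\n", salt & 0xFFFF);	/* low 16 bits -> iph->id */
		fallback_id = salt;			/* carry the salt forward */
	}
	return 0;
}
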
1286 1286
1287 static void rt_del(unsigned hash, struct rtable *rt) 1287 static void rt_del(unsigned hash, struct rtable *rt)
1288 { 1288 {
1289 struct rtable __rcu **rthp; 1289 struct rtable __rcu **rthp;
1290 struct rtable *aux; 1290 struct rtable *aux;
1291 1291
1292 rthp = &rt_hash_table[hash].chain; 1292 rthp = &rt_hash_table[hash].chain;
1293 spin_lock_bh(rt_hash_lock_addr(hash)); 1293 spin_lock_bh(rt_hash_lock_addr(hash));
1294 ip_rt_put(rt); 1294 ip_rt_put(rt);
1295 while ((aux = rcu_dereference_protected(*rthp, 1295 while ((aux = rcu_dereference_protected(*rthp,
1296 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { 1296 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1297 if (aux == rt || rt_is_expired(aux)) { 1297 if (aux == rt || rt_is_expired(aux)) {
1298 *rthp = aux->dst.rt_next; 1298 *rthp = aux->dst.rt_next;
1299 rt_free(aux); 1299 rt_free(aux);
1300 continue; 1300 continue;
1301 } 1301 }
1302 rthp = &aux->dst.rt_next; 1302 rthp = &aux->dst.rt_next;
1303 } 1303 }
1304 spin_unlock_bh(rt_hash_lock_addr(hash)); 1304 spin_unlock_bh(rt_hash_lock_addr(hash));
1305 } 1305 }
1306 1306
1307 /* called in rcu_read_lock() section */ 1307 /* called in rcu_read_lock() section */
1308 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1308 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1309 __be32 saddr, struct net_device *dev) 1309 __be32 saddr, struct net_device *dev)
1310 { 1310 {
1311 struct in_device *in_dev = __in_dev_get_rcu(dev); 1311 struct in_device *in_dev = __in_dev_get_rcu(dev);
1312 struct inet_peer *peer; 1312 struct inet_peer *peer;
1313 struct net *net; 1313 struct net *net;
1314 1314
1315 if (!in_dev) 1315 if (!in_dev)
1316 return; 1316 return;
1317 1317
1318 net = dev_net(dev); 1318 net = dev_net(dev);
1319 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || 1319 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1320 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || 1320 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1321 ipv4_is_zeronet(new_gw)) 1321 ipv4_is_zeronet(new_gw))
1322 goto reject_redirect; 1322 goto reject_redirect;
1323 1323
1324 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1324 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1325 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1325 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1326 goto reject_redirect; 1326 goto reject_redirect;
1327 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) 1327 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1328 goto reject_redirect; 1328 goto reject_redirect;
1329 } else { 1329 } else {
1330 if (inet_addr_type(net, new_gw) != RTN_UNICAST) 1330 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1331 goto reject_redirect; 1331 goto reject_redirect;
1332 } 1332 }
1333 1333
1334 peer = inet_getpeer_v4(daddr, 1); 1334 peer = inet_getpeer_v4(daddr, 1);
1335 if (peer) { 1335 if (peer) {
1336 peer->redirect_learned.a4 = new_gw; 1336 peer->redirect_learned.a4 = new_gw;
1337 1337
1338 inet_putpeer(peer); 1338 inet_putpeer(peer);
1339 1339
1340 atomic_inc(&__rt_peer_genid); 1340 atomic_inc(&__rt_peer_genid);
1341 } 1341 }
1342 return; 1342 return;
1343 1343
1344 reject_redirect: 1344 reject_redirect:
1345 #ifdef CONFIG_IP_ROUTE_VERBOSE 1345 #ifdef CONFIG_IP_ROUTE_VERBOSE
1346 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 1346 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1347 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" 1347 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1348 " Advised path = %pI4 -> %pI4\n", 1348 " Advised path = %pI4 -> %pI4\n",
1349 &old_gw, dev->name, &new_gw, 1349 &old_gw, dev->name, &new_gw,
1350 &saddr, &daddr); 1350 &saddr, &daddr);
1351 #endif 1351 #endif
1352 ; 1352 ;
1353 } 1353 }
1354 1354
1355 static bool peer_pmtu_expired(struct inet_peer *peer) 1355 static bool peer_pmtu_expired(struct inet_peer *peer)
1356 { 1356 {
1357 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1357 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1358 1358
1359 return orig && 1359 return orig &&
1360 time_after_eq(jiffies, orig) && 1360 time_after_eq(jiffies, orig) &&
1361 cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1361 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1362 } 1362 }
1363 1363
1364 static bool peer_pmtu_cleaned(struct inet_peer *peer) 1364 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1365 { 1365 {
1366 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1366 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1367 1367
1368 return orig && 1368 return orig &&
1369 cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1369 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1370 } 1370 }
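
Both helpers use the same claim-once idiom: any number of CPUs may read a non-zero expiry stamp, but only the one whose cmpxchg() swings it to zero wins the right to act on it. A C11 sketch of the idiom — the kernel compares times with time_after_eq() to survive jiffies wraparound, which the plain >= below ignores.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long pmtu_expires;

/* Returns true for exactly one caller once the stamp has expired. */
static bool claim_expiry(unsigned long now)
{
	unsigned long orig = atomic_load(&pmtu_expires);

	return orig && now >= orig &&
	       atomic_compare_exchange_strong(&pmtu_expires, &orig, 0UL);
}
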
1371 1371
1372 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1372 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1373 { 1373 {
1374 struct rtable *rt = (struct rtable *)dst; 1374 struct rtable *rt = (struct rtable *)dst;
1375 struct dst_entry *ret = dst; 1375 struct dst_entry *ret = dst;
1376 1376
1377 if (rt) { 1377 if (rt) {
1378 if (dst->obsolete > 0) { 1378 if (dst->obsolete > 0) {
1379 ip_rt_put(rt); 1379 ip_rt_put(rt);
1380 ret = NULL; 1380 ret = NULL;
1381 } else if (rt->rt_flags & RTCF_REDIRECTED) { 1381 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1382 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1382 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1383 rt->rt_oif, 1383 rt->rt_oif,
1384 rt_genid(dev_net(dst->dev))); 1384 rt_genid(dev_net(dst->dev)));
1385 rt_del(hash, rt); 1385 rt_del(hash, rt);
1386 ret = NULL; 1386 ret = NULL;
1387 } else if (rt->peer && peer_pmtu_expired(rt->peer)) { 1387 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1388 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); 1388 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1389 } 1389 }
1390 } 1390 }
1391 return ret; 1391 return ret;
1392 } 1392 }
1393 1393
1394 /* 1394 /*
1395 * Algorithm: 1395 * Algorithm:
1396 * 1. The first ip_rt_redirect_number redirects are sent 1396 * 1. The first ip_rt_redirect_number redirects are sent
1397 * with exponential backoff, then we stop sending them at all, 1397 * with exponential backoff, then we stop sending them at all,
1398 * assuming that the host ignores our redirects. 1398 * assuming that the host ignores our redirects.
1399 * 2. If we did not see packets requiring redirects 1399 * 2. If we did not see packets requiring redirects
1400 * during ip_rt_redirect_silence, we assume that the host 1400 * during ip_rt_redirect_silence, we assume that the host
1401 * forgot the redirected route and start sending redirects again. 1401 * forgot the redirected route and start sending redirects again.
1402 * 1402 *
1403 * This algorithm is much cheaper and more intelligent than dumb load limiting 1403 * This algorithm is much cheaper and more intelligent than dumb load limiting
1404 * in icmp.c. 1404 * in icmp.c.
1405 * 1405 *
1406 * NOTE. Do not forget to inhibit load limiting for redirects (redundant) 1406 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1407 * and "frag. need" (breaks PMTU discovery) in icmp.c. 1407 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1408 */ 1408 */
1409 1409
1410 void ip_rt_send_redirect(struct sk_buff *skb) 1410 void ip_rt_send_redirect(struct sk_buff *skb)
1411 { 1411 {
1412 struct rtable *rt = skb_rtable(skb); 1412 struct rtable *rt = skb_rtable(skb);
1413 struct in_device *in_dev; 1413 struct in_device *in_dev;
1414 struct inet_peer *peer; 1414 struct inet_peer *peer;
1415 int log_martians; 1415 int log_martians;
1416 1416
1417 rcu_read_lock(); 1417 rcu_read_lock();
1418 in_dev = __in_dev_get_rcu(rt->dst.dev); 1418 in_dev = __in_dev_get_rcu(rt->dst.dev);
1419 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { 1419 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1420 rcu_read_unlock(); 1420 rcu_read_unlock();
1421 return; 1421 return;
1422 } 1422 }
1423 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1423 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1424 rcu_read_unlock(); 1424 rcu_read_unlock();
1425 1425
1426 if (!rt->peer) 1426 if (!rt->peer)
1427 rt_bind_peer(rt, rt->rt_dst, 1); 1427 rt_bind_peer(rt, rt->rt_dst, 1);
1428 peer = rt->peer; 1428 peer = rt->peer;
1429 if (!peer) { 1429 if (!peer) {
1430 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1430 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1431 return; 1431 return;
1432 } 1432 }
1433 1433
1434 /* No redirected packets during ip_rt_redirect_silence; 1434 /* No redirected packets during ip_rt_redirect_silence;
1435 * reset the algorithm. 1435 * reset the algorithm.
1436 */ 1436 */
1437 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) 1437 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1438 peer->rate_tokens = 0; 1438 peer->rate_tokens = 0;
1439 1439
1440 /* Too many ignored redirects; do not send anything; 1440 /* Too many ignored redirects; do not send anything;
1441 * set dst.rate_last to the last seen redirected packet. 1441 * set dst.rate_last to the last seen redirected packet.
1442 */ 1442 */
1443 if (peer->rate_tokens >= ip_rt_redirect_number) { 1443 if (peer->rate_tokens >= ip_rt_redirect_number) {
1444 peer->rate_last = jiffies; 1444 peer->rate_last = jiffies;
1445 return; 1445 return;
1446 } 1446 }
1447 1447
1448 /* Check for load limit; set rate_last to the latest sent 1448 /* Check for load limit; set rate_last to the latest sent
1449 * redirect. 1449 * redirect.
1450 */ 1450 */
1451 if (peer->rate_tokens == 0 || 1451 if (peer->rate_tokens == 0 ||
1452 time_after(jiffies, 1452 time_after(jiffies,
1453 (peer->rate_last + 1453 (peer->rate_last +
1454 (ip_rt_redirect_load << peer->rate_tokens)))) { 1454 (ip_rt_redirect_load << peer->rate_tokens)))) {
1455 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1455 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1456 peer->rate_last = jiffies; 1456 peer->rate_last = jiffies;
1457 ++peer->rate_tokens; 1457 ++peer->rate_tokens;
1458 #ifdef CONFIG_IP_ROUTE_VERBOSE 1458 #ifdef CONFIG_IP_ROUTE_VERBOSE
1459 if (log_martians && 1459 if (log_martians &&
1460 peer->rate_tokens == ip_rt_redirect_number && 1460 peer->rate_tokens == ip_rt_redirect_number &&
1461 net_ratelimit()) 1461 net_ratelimit())
1462 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1462 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1463 &ip_hdr(skb)->saddr, rt->rt_iif, 1463 &ip_hdr(skb)->saddr, rt->rt_iif,
1464 &rt->rt_dst, &rt->rt_gateway); 1464 &rt->rt_dst, &rt->rt_gateway);
1465 #endif 1465 #endif
1466 } 1466 }
1467 } 1467 }
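
The load-limit test gates the next redirect behind ip_rt_redirect_load << rate_tokens jiffies, so the silence between redirects doubles with every one sent until ip_rt_redirect_number shuts them off. A quick sketch of that schedule — the load value is illustrative, not the sysctl default.

#include <stdio.h>

int main(void)
{
	const unsigned long load = 20;	/* jiffies between the first two; illustrative */
	const int number = 9;		/* redirects sent before giving up entirely */

	for (int tokens = 1; tokens < number; tokens++)
		printf("gap before redirect %d: %lu jiffies\n",
		       tokens + 1, load << tokens);
	return 0;
}
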
1468 1468
1469 static int ip_error(struct sk_buff *skb) 1469 static int ip_error(struct sk_buff *skb)
1470 { 1470 {
1471 struct rtable *rt = skb_rtable(skb); 1471 struct rtable *rt = skb_rtable(skb);
1472 struct inet_peer *peer; 1472 struct inet_peer *peer;
1473 unsigned long now; 1473 unsigned long now;
1474 bool send; 1474 bool send;
1475 int code; 1475 int code;
1476 1476
1477 switch (rt->dst.error) { 1477 switch (rt->dst.error) {
1478 case EINVAL: 1478 case EINVAL:
1479 default: 1479 default:
1480 goto out; 1480 goto out;
1481 case EHOSTUNREACH: 1481 case EHOSTUNREACH:
1482 code = ICMP_HOST_UNREACH; 1482 code = ICMP_HOST_UNREACH;
1483 break; 1483 break;
1484 case ENETUNREACH: 1484 case ENETUNREACH:
1485 code = ICMP_NET_UNREACH; 1485 code = ICMP_NET_UNREACH;
1486 IP_INC_STATS_BH(dev_net(rt->dst.dev), 1486 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1487 IPSTATS_MIB_INNOROUTES); 1487 IPSTATS_MIB_INNOROUTES);
1488 break; 1488 break;
1489 case EACCES: 1489 case EACCES:
1490 code = ICMP_PKT_FILTERED; 1490 code = ICMP_PKT_FILTERED;
1491 break; 1491 break;
1492 } 1492 }
1493 1493
1494 if (!rt->peer) 1494 if (!rt->peer)
1495 rt_bind_peer(rt, rt->rt_dst, 1); 1495 rt_bind_peer(rt, rt->rt_dst, 1);
1496 peer = rt->peer; 1496 peer = rt->peer;
1497 1497
1498 send = true; 1498 send = true;
1499 if (peer) { 1499 if (peer) {
1500 now = jiffies; 1500 now = jiffies;
1501 peer->rate_tokens += now - peer->rate_last; 1501 peer->rate_tokens += now - peer->rate_last;
1502 if (peer->rate_tokens > ip_rt_error_burst) 1502 if (peer->rate_tokens > ip_rt_error_burst)
1503 peer->rate_tokens = ip_rt_error_burst; 1503 peer->rate_tokens = ip_rt_error_burst;
1504 peer->rate_last = now; 1504 peer->rate_last = now;
1505 if (peer->rate_tokens >= ip_rt_error_cost) 1505 if (peer->rate_tokens >= ip_rt_error_cost)
1506 peer->rate_tokens -= ip_rt_error_cost; 1506 peer->rate_tokens -= ip_rt_error_cost;
1507 else 1507 else
1508 send = false; 1508 send = false;
1509 } 1509 }
1510 if (send) 1510 if (send)
1511 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1511 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1512 1512
1513 out: kfree_skb(skb); 1513 out: kfree_skb(skb);
1514 return 0; 1514 return 0;
1515 } 1515 }
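
ip_error() meters ICMP errors with a classic token bucket: tokens accrue one per elapsed jiffy, are capped at ip_rt_error_burst, and each error sent spends ip_rt_error_cost. A self-contained sketch of the same bucket; the struct and parameter names are illustrative.

#include <stdbool.h>

struct bucket {
	unsigned long tokens;
	unsigned long last;	/* timestamp of the previous call */
};

static bool may_send(struct bucket *b, unsigned long now,
		     unsigned long burst, unsigned long cost)
{
	b->tokens += now - b->last;	/* accrue one token per time unit */
	if (b->tokens > burst)
		b->tokens = burst;	/* cap the burst */
	b->last = now;
	if (b->tokens >= cost) {
		b->tokens -= cost;	/* spend tokens on this error */
		return true;
	}
	return false;
}
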
1516 1516
1517 /* 1517 /*
1518 * The last two values are not from the RFC but 1518 * The last two values are not from the RFC but
1519 * are needed for AMPRnet AX.25 paths. 1519 * are needed for AMPRnet AX.25 paths.
1520 */ 1520 */
1521 1521
1522 static const unsigned short mtu_plateau[] = 1522 static const unsigned short mtu_plateau[] =
1523 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 1523 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1524 1524
1525 static inline unsigned short guess_mtu(unsigned short old_mtu) 1525 static inline unsigned short guess_mtu(unsigned short old_mtu)
1526 { 1526 {
1527 int i; 1527 int i;
1528 1528
1529 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) 1529 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1530 if (old_mtu > mtu_plateau[i]) 1530 if (old_mtu > mtu_plateau[i])
1531 return mtu_plateau[i]; 1531 return mtu_plateau[i];
1532 return 68; 1532 return 68;
1533 } 1533 }
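
The guess is simply the largest plateau strictly below the failing packet size, bottoming out at 68, the minimum IPv4 MTU. The sketch below replays the search in userspace to show a few values.

#include <stdio.h>

static const unsigned short plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128};

static unsigned short guess(unsigned short old_mtu)
{
	for (unsigned int i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];
	return 68;
}

int main(void)
{
	printf("%d %d %d\n", guess(1500), guess(576), guess(68));
	/* prints: 1492 296 68 */
	return 0;
}
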
1534 1534
1535 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, 1535 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1536 unsigned short new_mtu, 1536 unsigned short new_mtu,
1537 struct net_device *dev) 1537 struct net_device *dev)
1538 { 1538 {
1539 unsigned short old_mtu = ntohs(iph->tot_len); 1539 unsigned short old_mtu = ntohs(iph->tot_len);
1540 unsigned short est_mtu = 0; 1540 unsigned short est_mtu = 0;
1541 struct inet_peer *peer; 1541 struct inet_peer *peer;
1542 1542
1543 peer = inet_getpeer_v4(iph->daddr, 1); 1543 peer = inet_getpeer_v4(iph->daddr, 1);
1544 if (peer) { 1544 if (peer) {
1545 unsigned short mtu = new_mtu; 1545 unsigned short mtu = new_mtu;
1546 1546
1547 if (new_mtu < 68 || new_mtu >= old_mtu) { 1547 if (new_mtu < 68 || new_mtu >= old_mtu) {
1548 /* BSD 4.2 derived systems incorrectly adjust 1548 /* BSD 4.2 derived systems incorrectly adjust
1549 * tot_len by the IP header length, and report 1549 * tot_len by the IP header length, and report
1550 * a zero MTU in the ICMP message. 1550 * a zero MTU in the ICMP message.
1551 */ 1551 */
1552 if (mtu == 0 && 1552 if (mtu == 0 &&
1553 old_mtu >= 68 + (iph->ihl << 2)) 1553 old_mtu >= 68 + (iph->ihl << 2))
1554 old_mtu -= iph->ihl << 2; 1554 old_mtu -= iph->ihl << 2;
1555 mtu = guess_mtu(old_mtu); 1555 mtu = guess_mtu(old_mtu);
1556 } 1556 }
1557 1557
1558 if (mtu < ip_rt_min_pmtu) 1558 if (mtu < ip_rt_min_pmtu)
1559 mtu = ip_rt_min_pmtu; 1559 mtu = ip_rt_min_pmtu;
1560 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { 1560 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1561 unsigned long pmtu_expires; 1561 unsigned long pmtu_expires;
1562 1562
1563 pmtu_expires = jiffies + ip_rt_mtu_expires; 1563 pmtu_expires = jiffies + ip_rt_mtu_expires;
1564 if (!pmtu_expires) 1564 if (!pmtu_expires)
1565 pmtu_expires = 1UL; 1565 pmtu_expires = 1UL;
1566 1566
1567 est_mtu = mtu; 1567 est_mtu = mtu;
1568 peer->pmtu_learned = mtu; 1568 peer->pmtu_learned = mtu;
1569 peer->pmtu_expires = pmtu_expires; 1569 peer->pmtu_expires = pmtu_expires;
1570 } 1570 }
1571 1571
1572 inet_putpeer(peer); 1572 inet_putpeer(peer);
1573 1573
1574 atomic_inc(&__rt_peer_genid); 1574 atomic_inc(&__rt_peer_genid);
1575 } 1575 }
1576 return est_mtu ? : new_mtu; 1576 return est_mtu ? : new_mtu;
1577 } 1577 }
1578 1578
1579 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) 1579 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1580 { 1580 {
1581 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); 1581 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1582 1582
1583 if (!expires) 1583 if (!expires)
1584 return; 1584 return;
1585 if (time_before(jiffies, expires)) { 1585 if (time_before(jiffies, expires)) {
1586 u32 orig_dst_mtu = dst_mtu(dst); 1586 u32 orig_dst_mtu = dst_mtu(dst);
1587 if (peer->pmtu_learned < orig_dst_mtu) { 1587 if (peer->pmtu_learned < orig_dst_mtu) {
1588 if (!peer->pmtu_orig) 1588 if (!peer->pmtu_orig)
1589 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); 1589 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1590 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); 1590 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1591 } 1591 }
1592 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) 1592 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1593 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); 1593 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1594 } 1594 }
1595 1595
1596 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1596 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1597 { 1597 {
1598 struct rtable *rt = (struct rtable *) dst; 1598 struct rtable *rt = (struct rtable *) dst;
1599 struct inet_peer *peer; 1599 struct inet_peer *peer;
1600 1600
1601 dst_confirm(dst); 1601 dst_confirm(dst);
1602 1602
1603 if (!rt->peer) 1603 if (!rt->peer)
1604 rt_bind_peer(rt, rt->rt_dst, 1); 1604 rt_bind_peer(rt, rt->rt_dst, 1);
1605 peer = rt->peer; 1605 peer = rt->peer;
1606 if (peer) { 1606 if (peer) {
1607 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); 1607 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1608 1608
1609 if (mtu < ip_rt_min_pmtu) 1609 if (mtu < ip_rt_min_pmtu)
1610 mtu = ip_rt_min_pmtu; 1610 mtu = ip_rt_min_pmtu;
1611 if (!pmtu_expires || mtu < peer->pmtu_learned) { 1611 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1612 1612
1613 pmtu_expires = jiffies + ip_rt_mtu_expires; 1613 pmtu_expires = jiffies + ip_rt_mtu_expires;
1614 if (!pmtu_expires) 1614 if (!pmtu_expires)
1615 pmtu_expires = 1UL; 1615 pmtu_expires = 1UL;
1616 1616
1617 peer->pmtu_learned = mtu; 1617 peer->pmtu_learned = mtu;
1618 peer->pmtu_expires = pmtu_expires; 1618 peer->pmtu_expires = pmtu_expires;
1619 1619
1620 atomic_inc(&__rt_peer_genid); 1620 atomic_inc(&__rt_peer_genid);
1621 rt->rt_peer_genid = rt_peer_genid(); 1621 rt->rt_peer_genid = rt_peer_genid();
1622 } 1622 }
1623 check_peer_pmtu(dst, peer); 1623 check_peer_pmtu(dst, peer);
1624 } 1624 }
1625 } 1625 }
1626 1626
1627 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) 1627 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1628 { 1628 {
1629 struct rtable *rt = (struct rtable *) dst; 1629 struct rtable *rt = (struct rtable *) dst;
1630 __be32 orig_gw = rt->rt_gateway; 1630 __be32 orig_gw = rt->rt_gateway;
1631 struct neighbour *n; 1631 struct neighbour *n, *old_n;
1632 1632
1633 dst_confirm(&rt->dst); 1633 dst_confirm(&rt->dst);
1634 1634
1635 neigh_release(dst_get_neighbour(&rt->dst));
1636 dst_set_neighbour(&rt->dst, NULL);
1637
1638 rt->rt_gateway = peer->redirect_learned.a4; 1635 rt->rt_gateway = peer->redirect_learned.a4;
1639 rt_bind_neighbour(rt); 1636
1640 n = dst_get_neighbour(&rt->dst); 1637 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1638 if (IS_ERR(n))
1639 return PTR_ERR(n);
1640 old_n = xchg(&rt->dst._neighbour, n);
1641 if (old_n)
1642 neigh_release(old_n);
1641 if (!n || !(n->nud_state & NUD_VALID)) { 1643 if (!n || !(n->nud_state & NUD_VALID)) {
1642 if (n) 1644 if (n)
1643 neigh_event_send(n, NULL); 1645 neigh_event_send(n, NULL);
1644 rt->rt_gateway = orig_gw; 1646 rt->rt_gateway = orig_gw;
1645 return -EAGAIN; 1647 return -EAGAIN;
1646 } else { 1648 } else {
1647 rt->rt_flags |= RTCF_REDIRECTED; 1649 rt->rt_flags |= RTCF_REDIRECTED;
1648 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 1650 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1649 } 1651 }
1650 return 0; 1652 return 0;
1651 } 1653 }
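
The xchg() above is the heart of this fix: the replacement neighbour is published atomically, and only the displaced pointer is released, so a reader running concurrently under RCU sees either the old neighbour (kept alive for a grace period) or the new one, never a freed pointer. Below is a userspace model of the publish-then-release pattern, with plain reference counts standing in for RCU-deferred freeing; all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct neigh {
	atomic_int refcnt;
	int id;
};

static _Atomic(struct neigh *) cur_neigh;

static struct neigh *neigh_alloc(int id)
{
	struct neigh *n = malloc(sizeof(*n));

	atomic_init(&n->refcnt, 1);
	n->id = id;
	return n;
}

static void neigh_put(struct neigh *n)
{
	if (n && atomic_fetch_sub(&n->refcnt, 1) == 1) {
		printf("freeing neigh %d\n", n->id);
		free(n);	/* the kernel defers this past a grace period */
	}
}

/* Publish the new neighbour first, then drop the old reference,
 * mirroring the xchg()/neigh_release() pair in check_peer_redir(). */
static void set_neigh(struct neigh *n)
{
	struct neigh *old = atomic_exchange(&cur_neigh, n);

	neigh_put(old);
}

int main(void)
{
	set_neigh(neigh_alloc(1));
	set_neigh(neigh_alloc(2));	/* atomically displaces and frees neigh 1 */
	neigh_put(atomic_exchange(&cur_neigh, NULL));
	return 0;
}
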
1652 1654
1653 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1655 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1654 { 1656 {
1655 struct rtable *rt = (struct rtable *) dst; 1657 struct rtable *rt = (struct rtable *) dst;
1656 1658
1657 if (rt_is_expired(rt)) 1659 if (rt_is_expired(rt))
1658 return NULL; 1660 return NULL;
1659 if (rt->rt_peer_genid != rt_peer_genid()) { 1661 if (rt->rt_peer_genid != rt_peer_genid()) {
1660 struct inet_peer *peer; 1662 struct inet_peer *peer;
1661 1663
1662 if (!rt->peer) 1664 if (!rt->peer)
1663 rt_bind_peer(rt, rt->rt_dst, 0); 1665 rt_bind_peer(rt, rt->rt_dst, 0);
1664 1666
1665 peer = rt->peer; 1667 peer = rt->peer;
1666 if (peer) { 1668 if (peer) {
1667 check_peer_pmtu(dst, peer); 1669 check_peer_pmtu(dst, peer);
1668 1670
1669 if (peer->redirect_learned.a4 && 1671 if (peer->redirect_learned.a4 &&
1670 peer->redirect_learned.a4 != rt->rt_gateway) { 1672 peer->redirect_learned.a4 != rt->rt_gateway) {
1671 if (check_peer_redir(dst, peer)) 1673 if (check_peer_redir(dst, peer))
1672 return NULL; 1674 return NULL;
1673 } 1675 }
1674 } 1676 }
1675 1677
1676 rt->rt_peer_genid = rt_peer_genid(); 1678 rt->rt_peer_genid = rt_peer_genid();
1677 } 1679 }
1678 return dst; 1680 return dst;
1679 } 1681 }
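
ipv4_dst_check() revalidates lazily: one atomic bump of __rt_peer_genid marks every cached route stale at once, and each route re-syncs the next time it is looked up. A minimal sketch of the generation-count idiom, with illustrative names.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint peer_genid;		/* models __rt_peer_genid */

struct cached {
	unsigned int genid;		/* generation this entry last saw */
};

static bool needs_revalidate(const struct cached *c)
{
	return c->genid != atomic_load(&peer_genid);
}

static void mark_revalidated(struct cached *c)
{
	c->genid = atomic_load(&peer_genid);
}

/* One global increment lazily invalidates every cached entry. */
static void invalidate_all(void)
{
	atomic_fetch_add(&peer_genid, 1);
}
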
1680 1682
1681 static void ipv4_dst_destroy(struct dst_entry *dst) 1683 static void ipv4_dst_destroy(struct dst_entry *dst)
1682 { 1684 {
1683 struct rtable *rt = (struct rtable *) dst; 1685 struct rtable *rt = (struct rtable *) dst;
1684 struct inet_peer *peer = rt->peer; 1686 struct inet_peer *peer = rt->peer;
1685 1687
1686 if (rt->fi) { 1688 if (rt->fi) {
1687 fib_info_put(rt->fi); 1689 fib_info_put(rt->fi);
1688 rt->fi = NULL; 1690 rt->fi = NULL;
1689 } 1691 }
1690 if (peer) { 1692 if (peer) {
1691 rt->peer = NULL; 1693 rt->peer = NULL;
1692 inet_putpeer(peer); 1694 inet_putpeer(peer);
1693 } 1695 }
1694 } 1696 }
1695 1697
1696 1698
1697 static void ipv4_link_failure(struct sk_buff *skb) 1699 static void ipv4_link_failure(struct sk_buff *skb)
1698 { 1700 {
1699 struct rtable *rt; 1701 struct rtable *rt;
1700 1702
1701 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1703 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1702 1704
1703 rt = skb_rtable(skb); 1705 rt = skb_rtable(skb);
1704 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) 1706 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1705 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); 1707 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1706 } 1708 }
1707 1709
1708 static int ip_rt_bug(struct sk_buff *skb) 1710 static int ip_rt_bug(struct sk_buff *skb)
1709 { 1711 {
1710 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", 1712 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1711 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1713 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1712 skb->dev ? skb->dev->name : "?"); 1714 skb->dev ? skb->dev->name : "?");
1713 kfree_skb(skb); 1715 kfree_skb(skb);
1714 WARN_ON(1); 1716 WARN_ON(1);
1715 return 0; 1717 return 0;
1716 } 1718 }
1717 1719
1718 /* 1720 /*
1719 We do not cache the source address of the outgoing interface, 1721 We do not cache the source address of the outgoing interface,
1720 because it is used only by the IP RR, TS and SRR options, 1722 because it is used only by the IP RR, TS and SRR options,
1721 so it is out of the fast path. 1723 so it is out of the fast path.
1722 1724
1723 BTW remember: "addr" is allowed to be not aligned 1725 BTW remember: "addr" is allowed to be not aligned
1724 in IP options! 1726 in IP options!
1725 */ 1727 */
1726 1728
1727 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) 1729 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1728 { 1730 {
1729 __be32 src; 1731 __be32 src;
1730 1732
1731 if (rt_is_output_route(rt)) 1733 if (rt_is_output_route(rt))
1732 src = ip_hdr(skb)->saddr; 1734 src = ip_hdr(skb)->saddr;
1733 else { 1735 else {
1734 struct fib_result res; 1736 struct fib_result res;
1735 struct flowi4 fl4; 1737 struct flowi4 fl4;
1736 struct iphdr *iph; 1738 struct iphdr *iph;
1737 1739
1738 iph = ip_hdr(skb); 1740 iph = ip_hdr(skb);
1739 1741
1740 memset(&fl4, 0, sizeof(fl4)); 1742 memset(&fl4, 0, sizeof(fl4));
1741 fl4.daddr = iph->daddr; 1743 fl4.daddr = iph->daddr;
1742 fl4.saddr = iph->saddr; 1744 fl4.saddr = iph->saddr;
1743 fl4.flowi4_tos = RT_TOS(iph->tos); 1745 fl4.flowi4_tos = RT_TOS(iph->tos);
1744 fl4.flowi4_oif = rt->dst.dev->ifindex; 1746 fl4.flowi4_oif = rt->dst.dev->ifindex;
1745 fl4.flowi4_iif = skb->dev->ifindex; 1747 fl4.flowi4_iif = skb->dev->ifindex;
1746 fl4.flowi4_mark = skb->mark; 1748 fl4.flowi4_mark = skb->mark;
1747 1749
1748 rcu_read_lock(); 1750 rcu_read_lock();
1749 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1751 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1750 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1752 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1751 else 1753 else
1752 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1754 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1753 RT_SCOPE_UNIVERSE); 1755 RT_SCOPE_UNIVERSE);
1754 rcu_read_unlock(); 1756 rcu_read_unlock();
1755 } 1757 }
1756 memcpy(addr, &src, 4); 1758 memcpy(addr, &src, 4);
1757 } 1759 }
1758 1760
1759 #ifdef CONFIG_IP_ROUTE_CLASSID 1761 #ifdef CONFIG_IP_ROUTE_CLASSID
1760 static void set_class_tag(struct rtable *rt, u32 tag) 1762 static void set_class_tag(struct rtable *rt, u32 tag)
1761 { 1763 {
1762 if (!(rt->dst.tclassid & 0xFFFF)) 1764 if (!(rt->dst.tclassid & 0xFFFF))
1763 rt->dst.tclassid |= tag & 0xFFFF; 1765 rt->dst.tclassid |= tag & 0xFFFF;
1764 if (!(rt->dst.tclassid & 0xFFFF0000)) 1766 if (!(rt->dst.tclassid & 0xFFFF0000))
1765 rt->dst.tclassid |= tag & 0xFFFF0000; 1767 rt->dst.tclassid |= tag & 0xFFFF0000;
1766 } 1768 }
1767 #endif 1769 #endif
1768 1770
1769 static unsigned int ipv4_default_advmss(const struct dst_entry *dst) 1771 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1770 { 1772 {
1771 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); 1773 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1772 1774
1773 if (advmss == 0) { 1775 if (advmss == 0) {
1774 advmss = max_t(unsigned int, dst->dev->mtu - 40, 1776 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1775 ip_rt_min_advmss); 1777 ip_rt_min_advmss);
1776 if (advmss > 65535 - 40) 1778 if (advmss > 65535 - 40)
1777 advmss = 65535 - 40; 1779 advmss = 65535 - 40;
1778 } 1780 }
1779 return advmss; 1781 return advmss;
1780 } 1782 }
1781 1783
1782 static unsigned int ipv4_default_mtu(const struct dst_entry *dst) 1784 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1783 { 1785 {
1784 unsigned int mtu = dst->dev->mtu; 1786 unsigned int mtu = dst->dev->mtu;
1785 1787
1786 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1788 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1787 const struct rtable *rt = (const struct rtable *) dst; 1789 const struct rtable *rt = (const struct rtable *) dst;
1788 1790
1789 if (rt->rt_gateway != rt->rt_dst && mtu > 576) 1791 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1790 mtu = 576; 1792 mtu = 576;
1791 } 1793 }
1792 1794
1793 if (mtu > IP_MAX_MTU) 1795 if (mtu > IP_MAX_MTU)
1794 mtu = IP_MAX_MTU; 1796 mtu = IP_MAX_MTU;
1795 1797
1796 return mtu; 1798 return mtu;
1797 } 1799 }
1798 1800
1799 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1801 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1800 struct fib_info *fi) 1802 struct fib_info *fi)
1801 { 1803 {
1802 struct inet_peer *peer; 1804 struct inet_peer *peer;
1803 int create = 0; 1805 int create = 0;
1804 1806
1805 /* If a peer entry exists for this destination, we must hook 1807 /* If a peer entry exists for this destination, we must hook
1806 * it up in order to get at cached metrics. 1808 * it up in order to get at cached metrics.
1807 */ 1809 */
1808 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) 1810 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1809 create = 1; 1811 create = 1;
1810 1812
1811 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); 1813 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1812 if (peer) { 1814 if (peer) {
1813 rt->rt_peer_genid = rt_peer_genid(); 1815 rt->rt_peer_genid = rt_peer_genid();
1814 if (inet_metrics_new(peer)) 1816 if (inet_metrics_new(peer))
1815 memcpy(peer->metrics, fi->fib_metrics, 1817 memcpy(peer->metrics, fi->fib_metrics,
1816 sizeof(u32) * RTAX_MAX); 1818 sizeof(u32) * RTAX_MAX);
1817 dst_init_metrics(&rt->dst, peer->metrics, false); 1819 dst_init_metrics(&rt->dst, peer->metrics, false);
1818 1820
1819 check_peer_pmtu(&rt->dst, peer); 1821 check_peer_pmtu(&rt->dst, peer);
1820 if (peer->redirect_learned.a4 && 1822 if (peer->redirect_learned.a4 &&
1821 peer->redirect_learned.a4 != rt->rt_gateway) { 1823 peer->redirect_learned.a4 != rt->rt_gateway) {
1822 rt->rt_gateway = peer->redirect_learned.a4; 1824 rt->rt_gateway = peer->redirect_learned.a4;
1823 rt->rt_flags |= RTCF_REDIRECTED; 1825 rt->rt_flags |= RTCF_REDIRECTED;
1824 } 1826 }
1825 } else { 1827 } else {
1826 if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1828 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1827 rt->fi = fi; 1829 rt->fi = fi;
1828 atomic_inc(&fi->fib_clntref); 1830 atomic_inc(&fi->fib_clntref);
1829 } 1831 }
1830 dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1832 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1831 } 1833 }
1832 } 1834 }
1833 1835
1834 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1836 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1835 const struct fib_result *res, 1837 const struct fib_result *res,
1836 struct fib_info *fi, u16 type, u32 itag) 1838 struct fib_info *fi, u16 type, u32 itag)
1837 { 1839 {
1838 struct dst_entry *dst = &rt->dst; 1840 struct dst_entry *dst = &rt->dst;
1839 1841
1840 if (fi) { 1842 if (fi) {
1841 if (FIB_RES_GW(*res) && 1843 if (FIB_RES_GW(*res) &&
1842 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1844 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1843 rt->rt_gateway = FIB_RES_GW(*res); 1845 rt->rt_gateway = FIB_RES_GW(*res);
1844 rt_init_metrics(rt, fl4, fi); 1846 rt_init_metrics(rt, fl4, fi);
1845 #ifdef CONFIG_IP_ROUTE_CLASSID 1847 #ifdef CONFIG_IP_ROUTE_CLASSID
1846 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1848 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1847 #endif 1849 #endif
1848 } 1850 }
1849 1851
1850 if (dst_mtu(dst) > IP_MAX_MTU) 1852 if (dst_mtu(dst) > IP_MAX_MTU)
1851 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); 1853 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1852 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1854 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1853 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1855 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1854 1856
1855 #ifdef CONFIG_IP_ROUTE_CLASSID 1857 #ifdef CONFIG_IP_ROUTE_CLASSID
1856 #ifdef CONFIG_IP_MULTIPLE_TABLES 1858 #ifdef CONFIG_IP_MULTIPLE_TABLES
1857 set_class_tag(rt, fib_rules_tclass(res)); 1859 set_class_tag(rt, fib_rules_tclass(res));
1858 #endif 1860 #endif
1859 set_class_tag(rt, itag); 1861 set_class_tag(rt, itag);
1860 #endif 1862 #endif
1861 } 1863 }
1862 1864
1863 static struct rtable *rt_dst_alloc(struct net_device *dev, 1865 static struct rtable *rt_dst_alloc(struct net_device *dev,
1864 bool nopolicy, bool noxfrm) 1866 bool nopolicy, bool noxfrm)
1865 { 1867 {
1866 return dst_alloc(&ipv4_dst_ops, dev, 1, -1, 1868 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1867 DST_HOST | 1869 DST_HOST |
1868 (nopolicy ? DST_NOPOLICY : 0) | 1870 (nopolicy ? DST_NOPOLICY : 0) |
1869 (noxfrm ? DST_NOXFRM : 0)); 1871 (noxfrm ? DST_NOXFRM : 0));
1870 } 1872 }
1871 1873
1872 /* called in rcu_read_lock() section */ 1874 /* called in rcu_read_lock() section */
1873 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1875 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874 u8 tos, struct net_device *dev, int our) 1876 u8 tos, struct net_device *dev, int our)
1875 { 1877 {
1876 unsigned int hash; 1878 unsigned int hash;
1877 struct rtable *rth; 1879 struct rtable *rth;
1878 __be32 spec_dst; 1880 __be32 spec_dst;
1879 struct in_device *in_dev = __in_dev_get_rcu(dev); 1881 struct in_device *in_dev = __in_dev_get_rcu(dev);
1880 u32 itag = 0; 1882 u32 itag = 0;
1881 int err; 1883 int err;
1882 1884
1883 /* Primary sanity checks. */ 1885 /* Primary sanity checks. */
1884 1886
1885 if (in_dev == NULL) 1887 if (in_dev == NULL)
1886 return -EINVAL; 1888 return -EINVAL;
1887 1889
1888 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1890 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1889 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) 1891 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1890 goto e_inval; 1892 goto e_inval;
1891 1893
1892 if (ipv4_is_zeronet(saddr)) { 1894 if (ipv4_is_zeronet(saddr)) {
1893 if (!ipv4_is_local_multicast(daddr)) 1895 if (!ipv4_is_local_multicast(daddr))
1894 goto e_inval; 1896 goto e_inval;
1895 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1897 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1896 } else { 1898 } else {
1897 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 1899 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1898 &itag); 1900 &itag);
1899 if (err < 0) 1901 if (err < 0)
1900 goto e_err; 1902 goto e_err;
1901 } 1903 }
1902 rth = rt_dst_alloc(init_net.loopback_dev, 1904 rth = rt_dst_alloc(init_net.loopback_dev,
1903 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1905 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1904 if (!rth) 1906 if (!rth)
1905 goto e_nobufs; 1907 goto e_nobufs;
1906 1908
1907 #ifdef CONFIG_IP_ROUTE_CLASSID 1909 #ifdef CONFIG_IP_ROUTE_CLASSID
1908 rth->dst.tclassid = itag; 1910 rth->dst.tclassid = itag;
1909 #endif 1911 #endif
1910 rth->dst.output = ip_rt_bug; 1912 rth->dst.output = ip_rt_bug;
1911 1913
1912 rth->rt_key_dst = daddr; 1914 rth->rt_key_dst = daddr;
1913 rth->rt_key_src = saddr; 1915 rth->rt_key_src = saddr;
1914 rth->rt_genid = rt_genid(dev_net(dev)); 1916 rth->rt_genid = rt_genid(dev_net(dev));
1915 rth->rt_flags = RTCF_MULTICAST; 1917 rth->rt_flags = RTCF_MULTICAST;
1916 rth->rt_type = RTN_MULTICAST; 1918 rth->rt_type = RTN_MULTICAST;
1917 rth->rt_key_tos = tos; 1919 rth->rt_key_tos = tos;
1918 rth->rt_dst = daddr; 1920 rth->rt_dst = daddr;
1919 rth->rt_src = saddr; 1921 rth->rt_src = saddr;
1920 rth->rt_route_iif = dev->ifindex; 1922 rth->rt_route_iif = dev->ifindex;
1921 rth->rt_iif = dev->ifindex; 1923 rth->rt_iif = dev->ifindex;
1922 rth->rt_oif = 0; 1924 rth->rt_oif = 0;
1923 rth->rt_mark = skb->mark; 1925 rth->rt_mark = skb->mark;
1924 rth->rt_gateway = daddr; 1926 rth->rt_gateway = daddr;
1925 rth->rt_spec_dst= spec_dst; 1927 rth->rt_spec_dst= spec_dst;
1926 rth->rt_peer_genid = 0; 1928 rth->rt_peer_genid = 0;
1927 rth->peer = NULL; 1929 rth->peer = NULL;
1928 rth->fi = NULL; 1930 rth->fi = NULL;
1929 if (our) { 1931 if (our) {
1930 rth->dst.input= ip_local_deliver; 1932 rth->dst.input= ip_local_deliver;
1931 rth->rt_flags |= RTCF_LOCAL; 1933 rth->rt_flags |= RTCF_LOCAL;
1932 } 1934 }
1933 1935
1934 #ifdef CONFIG_IP_MROUTE 1936 #ifdef CONFIG_IP_MROUTE
1935 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) 1937 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1936 rth->dst.input = ip_mr_input; 1938 rth->dst.input = ip_mr_input;
1937 #endif 1939 #endif
1938 RT_CACHE_STAT_INC(in_slow_mc); 1940 RT_CACHE_STAT_INC(in_slow_mc);
1939 1941
1940 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1942 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1941 rth = rt_intern_hash(hash, rth, skb, dev->ifindex); 1943 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1942 return IS_ERR(rth) ? PTR_ERR(rth) : 0; 1944 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1943 1945
1944 e_nobufs: 1946 e_nobufs:
1945 return -ENOBUFS; 1947 return -ENOBUFS;
1946 e_inval: 1948 e_inval:
1947 return -EINVAL; 1949 return -EINVAL;
1948 e_err: 1950 e_err:
1949 return err; 1951 return err;
1950 } 1952 }
1951 1953
1952 1954
1953 static void ip_handle_martian_source(struct net_device *dev, 1955 static void ip_handle_martian_source(struct net_device *dev,
1954 struct in_device *in_dev, 1956 struct in_device *in_dev,
1955 struct sk_buff *skb, 1957 struct sk_buff *skb,
1956 __be32 daddr, 1958 __be32 daddr,
1957 __be32 saddr) 1959 __be32 saddr)
1958 { 1960 {
1959 RT_CACHE_STAT_INC(in_martian_src); 1961 RT_CACHE_STAT_INC(in_martian_src);
1960 #ifdef CONFIG_IP_ROUTE_VERBOSE 1962 #ifdef CONFIG_IP_ROUTE_VERBOSE
1961 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { 1963 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1962 /* 1964 /*
1963 * RFC1812 recommendation: if the source is martian, 1965 * RFC1812 recommendation: if the source is martian,
1964 * the only hint is the MAC header. 1966 * the only hint is the MAC header.
1965 */ 1967 */
1966 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", 1968 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1967 &daddr, &saddr, dev->name); 1969 &daddr, &saddr, dev->name);
1968 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 1970 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1969 int i; 1971 int i;
1970 const unsigned char *p = skb_mac_header(skb); 1972 const unsigned char *p = skb_mac_header(skb);
1971 printk(KERN_WARNING "ll header: "); 1973 printk(KERN_WARNING "ll header: ");
1972 for (i = 0; i < dev->hard_header_len; i++, p++) { 1974 for (i = 0; i < dev->hard_header_len; i++, p++) {
1973 printk("%02x", *p); 1975 printk("%02x", *p);
1974 if (i < (dev->hard_header_len - 1)) 1976 if (i < (dev->hard_header_len - 1))
1975 printk(":"); 1977 printk(":");
1976 } 1978 }
1977 printk("\n"); 1979 printk("\n");
1978 } 1980 }
1979 } 1981 }
1980 #endif 1982 #endif
1981 } 1983 }
1982 1984
1983 /* called in rcu_read_lock() section */ 1985 /* called in rcu_read_lock() section */
1984 static int __mkroute_input(struct sk_buff *skb, 1986 static int __mkroute_input(struct sk_buff *skb,
1985 const struct fib_result *res, 1987 const struct fib_result *res,
1986 struct in_device *in_dev, 1988 struct in_device *in_dev,
1987 __be32 daddr, __be32 saddr, u32 tos, 1989 __be32 daddr, __be32 saddr, u32 tos,
1988 struct rtable **result) 1990 struct rtable **result)
1989 { 1991 {
1990 struct rtable *rth; 1992 struct rtable *rth;
1991 int err; 1993 int err;
1992 struct in_device *out_dev; 1994 struct in_device *out_dev;
1993 unsigned int flags = 0; 1995 unsigned int flags = 0;
1994 __be32 spec_dst; 1996 __be32 spec_dst;
1995 u32 itag; 1997 u32 itag;
1996 1998
1997 /* get a working reference to the output device */ 1999 /* get a working reference to the output device */
1998 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); 2000 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1999 if (out_dev == NULL) { 2001 if (out_dev == NULL) {
2000 if (net_ratelimit()) 2002 if (net_ratelimit())
2001 printk(KERN_CRIT "Bug in ip_route_input" \ 2003 printk(KERN_CRIT "Bug in ip_route_input" \
2002 "_slow(). Please, report\n"); 2004 "_slow(). Please, report\n");
2003 return -EINVAL; 2005 return -EINVAL;
2004 } 2006 }
2005 2007
2006 2008
2007 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 2009 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2008 in_dev->dev, &spec_dst, &itag); 2010 in_dev->dev, &spec_dst, &itag);
2009 if (err < 0) { 2011 if (err < 0) {
2010 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 2012 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2011 saddr); 2013 saddr);
2012 2014
2013 goto cleanup; 2015 goto cleanup;
2014 } 2016 }
2015 2017
2016 if (err) 2018 if (err)
2017 flags |= RTCF_DIRECTSRC; 2019 flags |= RTCF_DIRECTSRC;
2018 2020
2019 if (out_dev == in_dev && err && 2021 if (out_dev == in_dev && err &&
2020 (IN_DEV_SHARED_MEDIA(out_dev) || 2022 (IN_DEV_SHARED_MEDIA(out_dev) ||
2021 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) 2023 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2022 flags |= RTCF_DOREDIRECT; 2024 flags |= RTCF_DOREDIRECT;
2023 2025
2024 if (skb->protocol != htons(ETH_P_IP)) { 2026 if (skb->protocol != htons(ETH_P_IP)) {
2025 /* Not IP (i.e. ARP). Do not create a route if it is 2027 /* Not IP (i.e. ARP). Do not create a route if it is
2026 * invalid for proxy arp. DNAT routes are always valid. 2028 * invalid for proxy arp. DNAT routes are always valid.
2027 * 2029 *
2028 * The proxy arp feature has been extended to allow ARP 2030 * The proxy arp feature has been extended to allow ARP
2029 * replies back to the same interface, to support 2031 * replies back to the same interface, to support
2030 * Private VLAN switch technologies. See arp.c. 2032 * Private VLAN switch technologies. See arp.c.
2031 */ 2033 */
2032 if (out_dev == in_dev && 2034 if (out_dev == in_dev &&
2033 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { 2035 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2034 err = -EINVAL; 2036 err = -EINVAL;
2035 goto cleanup; 2037 goto cleanup;
2036 } 2038 }
2037 } 2039 }
2038 2040
2039 rth = rt_dst_alloc(out_dev->dev, 2041 rth = rt_dst_alloc(out_dev->dev,
2040 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2042 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2041 IN_DEV_CONF_GET(out_dev, NOXFRM)); 2043 IN_DEV_CONF_GET(out_dev, NOXFRM));
2042 if (!rth) { 2044 if (!rth) {
2043 err = -ENOBUFS; 2045 err = -ENOBUFS;
2044 goto cleanup; 2046 goto cleanup;
2045 } 2047 }
2046 2048
2047 rth->rt_key_dst = daddr; 2049 rth->rt_key_dst = daddr;
2048 rth->rt_key_src = saddr; 2050 rth->rt_key_src = saddr;
2049 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2051 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2050 rth->rt_flags = flags; 2052 rth->rt_flags = flags;
2051 rth->rt_type = res->type; 2053 rth->rt_type = res->type;
2052 rth->rt_key_tos = tos; 2054 rth->rt_key_tos = tos;
2053 rth->rt_dst = daddr; 2055 rth->rt_dst = daddr;
2054 rth->rt_src = saddr; 2056 rth->rt_src = saddr;
2055 rth->rt_route_iif = in_dev->dev->ifindex; 2057 rth->rt_route_iif = in_dev->dev->ifindex;
2056 rth->rt_iif = in_dev->dev->ifindex; 2058 rth->rt_iif = in_dev->dev->ifindex;
2057 rth->rt_oif = 0; 2059 rth->rt_oif = 0;
2058 rth->rt_mark = skb->mark; 2060 rth->rt_mark = skb->mark;
2059 rth->rt_gateway = daddr; 2061 rth->rt_gateway = daddr;
2060 rth->rt_spec_dst= spec_dst; 2062 rth->rt_spec_dst= spec_dst;
2061 rth->rt_peer_genid = 0; 2063 rth->rt_peer_genid = 0;
2062 rth->peer = NULL; 2064 rth->peer = NULL;
2063 rth->fi = NULL; 2065 rth->fi = NULL;
2064 2066
2065 rth->dst.input = ip_forward; 2067 rth->dst.input = ip_forward;
2066 rth->dst.output = ip_output; 2068 rth->dst.output = ip_output;
2067 2069
2068 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); 2070 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2069 2071
2070 *result = rth; 2072 *result = rth;
2071 err = 0; 2073 err = 0;
2072 cleanup: 2074 cleanup:
2073 return err; 2075 return err;
2074 } 2076 }
2075 2077
2076 static int ip_mkroute_input(struct sk_buff *skb, 2078 static int ip_mkroute_input(struct sk_buff *skb,
2077 struct fib_result *res, 2079 struct fib_result *res,
2078 const struct flowi4 *fl4, 2080 const struct flowi4 *fl4,
2079 struct in_device *in_dev, 2081 struct in_device *in_dev,
2080 __be32 daddr, __be32 saddr, u32 tos) 2082 __be32 daddr, __be32 saddr, u32 tos)
2081 { 2083 {
2082 struct rtable* rth = NULL; 2084 struct rtable* rth = NULL;
2083 int err; 2085 int err;
2084 unsigned hash; 2086 unsigned hash;
2085 2087
2086 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2087 if (res->fi && res->fi->fib_nhs > 1) 2089 if (res->fi && res->fi->fib_nhs > 1)
2088 fib_select_multipath(res); 2090 fib_select_multipath(res);
2089 #endif 2091 #endif
2090 2092
2091 /* create a routing cache entry */ 2093 /* create a routing cache entry */
2092 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); 2094 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2093 if (err) 2095 if (err)
2094 return err; 2096 return err;
2095 2097
2096 /* put it into the cache */ 2098 /* put it into the cache */
2097 hash = rt_hash(daddr, saddr, fl4->flowi4_iif, 2099 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2098 rt_genid(dev_net(rth->dst.dev))); 2100 rt_genid(dev_net(rth->dst.dev)));
2099 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); 2101 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2100 if (IS_ERR(rth)) 2102 if (IS_ERR(rth))
2101 return PTR_ERR(rth); 2103 return PTR_ERR(rth);
2102 return 0; 2104 return 0;
2103 } 2105 }
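
ip_mkroute_input() above files the freshly built rtable under rt_hash(daddr, saddr, iif, genid) before handing it to rt_intern_hash(). The sketch below is a toy userspace restatement of that idea only: toy_rt_hash() is a made-up mixer standing in for the kernel's rt_hash() (which is jhash-based and salted with a random value), and the 256-bucket table size is likewise an assumption for illustration.

#include <stdint.h>
#include <stdio.h>

/* toy_rt_hash() is a made-up mixer standing in for the kernel's
 * rt_hash(); only the shape of the computation is the point here. */
static unsigned int toy_rt_hash(uint32_t daddr, uint32_t saddr,
                                int iif, uint32_t genid)
{
        uint32_t h = daddr ^ saddr ^ (uint32_t)iif ^ genid;

        h ^= h >> 16;
        h *= 0x45d9f3bu;
        h ^= h >> 16;
        return h & 255;                 /* toy 256-bucket table */
}

int main(void)
{
        /* Identical flow keys always pick the same chain... */
        printf("bucket=%u\n", toy_rt_hash(0x0a000001, 0x0a000002, 3, 7));
        /* ...and bumping the generation id remaps every key at once. */
        printf("bucket=%u\n", toy_rt_hash(0x0a000001, 0x0a000002, 3, 8));
        return 0;
}

Folding the generation id into the key is what lets a cache flush invalidate everything by bumping rt_genid instead of walking every chain.
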
2104 2106
2105 /* 2107 /*
2106 * NOTE. We drop all the packets that have local source 2108 * NOTE. We drop all the packets that have local source
2107 * addresses, because every properly looped back packet 2109 * addresses, because every properly looped back packet
2108 * must have correct destination already attached by output routine. 2110 * must have correct destination already attached by output routine.
2109 * 2111 *
2110 * This approach solves two big problems: 2112 * This approach solves two big problems:
2111 * 1. Non-simplex devices are handled properly. 2113 * 1. Non-simplex devices are handled properly.
2112 * 2. IP spoofing attempts are filtered with a 100% guarantee. 2114 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2113 * called with rcu_read_lock() 2115 * called with rcu_read_lock()
2114 */ 2116 */
2115 2117
2116 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2118 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2117 u8 tos, struct net_device *dev) 2119 u8 tos, struct net_device *dev)
2118 { 2120 {
2119 struct fib_result res; 2121 struct fib_result res;
2120 struct in_device *in_dev = __in_dev_get_rcu(dev); 2122 struct in_device *in_dev = __in_dev_get_rcu(dev);
2121 struct flowi4 fl4; 2123 struct flowi4 fl4;
2122 unsigned flags = 0; 2124 unsigned flags = 0;
2123 u32 itag = 0; 2125 u32 itag = 0;
2124 struct rtable * rth; 2126 struct rtable * rth;
2125 unsigned hash; 2127 unsigned hash;
2126 __be32 spec_dst; 2128 __be32 spec_dst;
2127 int err = -EINVAL; 2129 int err = -EINVAL;
2128 struct net * net = dev_net(dev); 2130 struct net * net = dev_net(dev);
2129 2131
2130 /* IP on this device is disabled. */ 2132 /* IP on this device is disabled. */
2131 2133
2132 if (!in_dev) 2134 if (!in_dev)
2133 goto out; 2135 goto out;
2134 2136
2135 /* Check for the weirdest martians, which cannot be detected 2137 /* Check for the weirdest martians, which cannot be detected
2136 by fib_lookup. 2138 by fib_lookup.
2137 */ 2139 */
2138 2140
2139 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 2141 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2140 ipv4_is_loopback(saddr)) 2142 ipv4_is_loopback(saddr))
2141 goto martian_source; 2143 goto martian_source;
2142 2144
2143 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2145 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2144 goto brd_input; 2146 goto brd_input;
2145 2147
2146 /* Accept zero addresses only to limited broadcast; 2148 /* Accept zero addresses only to limited broadcast;
2147 * I do not even know whether to fix it or not. Waiting for complaints :-) 2149 * I do not even know whether to fix it or not. Waiting for complaints :-)
2148 */ 2150 */
2149 if (ipv4_is_zeronet(saddr)) 2151 if (ipv4_is_zeronet(saddr))
2150 goto martian_source; 2152 goto martian_source;
2151 2153
2152 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) 2154 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2153 goto martian_destination; 2155 goto martian_destination;
2154 2156
2155 /* 2157 /*
2156 * Now we are ready to route the packet. 2158 * Now we are ready to route the packet.
2157 */ 2159 */
2158 fl4.flowi4_oif = 0; 2160 fl4.flowi4_oif = 0;
2159 fl4.flowi4_iif = dev->ifindex; 2161 fl4.flowi4_iif = dev->ifindex;
2160 fl4.flowi4_mark = skb->mark; 2162 fl4.flowi4_mark = skb->mark;
2161 fl4.flowi4_tos = tos; 2163 fl4.flowi4_tos = tos;
2162 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 2164 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2163 fl4.daddr = daddr; 2165 fl4.daddr = daddr;
2164 fl4.saddr = saddr; 2166 fl4.saddr = saddr;
2165 err = fib_lookup(net, &fl4, &res); 2167 err = fib_lookup(net, &fl4, &res);
2166 if (err != 0) { 2168 if (err != 0) {
2167 if (!IN_DEV_FORWARD(in_dev)) 2169 if (!IN_DEV_FORWARD(in_dev))
2168 goto e_hostunreach; 2170 goto e_hostunreach;
2169 goto no_route; 2171 goto no_route;
2170 } 2172 }
2171 2173
2172 RT_CACHE_STAT_INC(in_slow_tot); 2174 RT_CACHE_STAT_INC(in_slow_tot);
2173 2175
2174 if (res.type == RTN_BROADCAST) 2176 if (res.type == RTN_BROADCAST)
2175 goto brd_input; 2177 goto brd_input;
2176 2178
2177 if (res.type == RTN_LOCAL) { 2179 if (res.type == RTN_LOCAL) {
2178 err = fib_validate_source(skb, saddr, daddr, tos, 2180 err = fib_validate_source(skb, saddr, daddr, tos,
2179 net->loopback_dev->ifindex, 2181 net->loopback_dev->ifindex,
2180 dev, &spec_dst, &itag); 2182 dev, &spec_dst, &itag);
2181 if (err < 0) 2183 if (err < 0)
2182 goto martian_source_keep_err; 2184 goto martian_source_keep_err;
2183 if (err) 2185 if (err)
2184 flags |= RTCF_DIRECTSRC; 2186 flags |= RTCF_DIRECTSRC;
2185 spec_dst = daddr; 2187 spec_dst = daddr;
2186 goto local_input; 2188 goto local_input;
2187 } 2189 }
2188 2190
2189 if (!IN_DEV_FORWARD(in_dev)) 2191 if (!IN_DEV_FORWARD(in_dev))
2190 goto e_hostunreach; 2192 goto e_hostunreach;
2191 if (res.type != RTN_UNICAST) 2193 if (res.type != RTN_UNICAST)
2192 goto martian_destination; 2194 goto martian_destination;
2193 2195
2194 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); 2196 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2195 out: return err; 2197 out: return err;
2196 2198
2197 brd_input: 2199 brd_input:
2198 if (skb->protocol != htons(ETH_P_IP)) 2200 if (skb->protocol != htons(ETH_P_IP))
2199 goto e_inval; 2201 goto e_inval;
2200 2202
2201 if (ipv4_is_zeronet(saddr)) 2203 if (ipv4_is_zeronet(saddr))
2202 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2204 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2203 else { 2205 else {
2204 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 2206 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2205 &itag); 2207 &itag);
2206 if (err < 0) 2208 if (err < 0)
2207 goto martian_source_keep_err; 2209 goto martian_source_keep_err;
2208 if (err) 2210 if (err)
2209 flags |= RTCF_DIRECTSRC; 2211 flags |= RTCF_DIRECTSRC;
2210 } 2212 }
2211 flags |= RTCF_BROADCAST; 2213 flags |= RTCF_BROADCAST;
2212 res.type = RTN_BROADCAST; 2214 res.type = RTN_BROADCAST;
2213 RT_CACHE_STAT_INC(in_brd); 2215 RT_CACHE_STAT_INC(in_brd);
2214 2216
2215 local_input: 2217 local_input:
2216 rth = rt_dst_alloc(net->loopback_dev, 2218 rth = rt_dst_alloc(net->loopback_dev,
2217 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 2219 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2218 if (!rth) 2220 if (!rth)
2219 goto e_nobufs; 2221 goto e_nobufs;
2220 2222
2221 rth->dst.input= ip_local_deliver; 2223 rth->dst.input= ip_local_deliver;
2222 rth->dst.output= ip_rt_bug; 2224 rth->dst.output= ip_rt_bug;
2223 #ifdef CONFIG_IP_ROUTE_CLASSID 2225 #ifdef CONFIG_IP_ROUTE_CLASSID
2224 rth->dst.tclassid = itag; 2226 rth->dst.tclassid = itag;
2225 #endif 2227 #endif
2226 2228
2227 rth->rt_key_dst = daddr; 2229 rth->rt_key_dst = daddr;
2228 rth->rt_key_src = saddr; 2230 rth->rt_key_src = saddr;
2229 rth->rt_genid = rt_genid(net); 2231 rth->rt_genid = rt_genid(net);
2230 rth->rt_flags = flags|RTCF_LOCAL; 2232 rth->rt_flags = flags|RTCF_LOCAL;
2231 rth->rt_type = res.type; 2233 rth->rt_type = res.type;
2232 rth->rt_key_tos = tos; 2234 rth->rt_key_tos = tos;
2233 rth->rt_dst = daddr; 2235 rth->rt_dst = daddr;
2234 rth->rt_src = saddr; 2236 rth->rt_src = saddr;
2235 #ifdef CONFIG_IP_ROUTE_CLASSID 2237 #ifdef CONFIG_IP_ROUTE_CLASSID
2236 rth->dst.tclassid = itag; 2238 rth->dst.tclassid = itag;
2237 #endif 2239 #endif
2238 rth->rt_route_iif = dev->ifindex; 2240 rth->rt_route_iif = dev->ifindex;
2239 rth->rt_iif = dev->ifindex; 2241 rth->rt_iif = dev->ifindex;
2240 rth->rt_oif = 0; 2242 rth->rt_oif = 0;
2241 rth->rt_mark = skb->mark; 2243 rth->rt_mark = skb->mark;
2242 rth->rt_gateway = daddr; 2244 rth->rt_gateway = daddr;
2243 rth->rt_spec_dst= spec_dst; 2245 rth->rt_spec_dst= spec_dst;
2244 rth->rt_peer_genid = 0; 2246 rth->rt_peer_genid = 0;
2245 rth->peer = NULL; 2247 rth->peer = NULL;
2246 rth->fi = NULL; 2248 rth->fi = NULL;
2247 if (res.type == RTN_UNREACHABLE) { 2249 if (res.type == RTN_UNREACHABLE) {
2248 rth->dst.input= ip_error; 2250 rth->dst.input= ip_error;
2249 rth->dst.error= -err; 2251 rth->dst.error= -err;
2250 rth->rt_flags &= ~RTCF_LOCAL; 2252 rth->rt_flags &= ~RTCF_LOCAL;
2251 } 2253 }
2252 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); 2254 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2253 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); 2255 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2254 err = 0; 2256 err = 0;
2255 if (IS_ERR(rth)) 2257 if (IS_ERR(rth))
2256 err = PTR_ERR(rth); 2258 err = PTR_ERR(rth);
2257 goto out; 2259 goto out;
2258 2260
2259 no_route: 2261 no_route:
2260 RT_CACHE_STAT_INC(in_no_route); 2262 RT_CACHE_STAT_INC(in_no_route);
2261 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 2263 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2262 res.type = RTN_UNREACHABLE; 2264 res.type = RTN_UNREACHABLE;
2263 if (err == -ESRCH) 2265 if (err == -ESRCH)
2264 err = -ENETUNREACH; 2266 err = -ENETUNREACH;
2265 goto local_input; 2267 goto local_input;
2266 2268
2267 /* 2269 /*
2268 * Do not cache martian addresses: they should be logged (RFC1812) 2270 * Do not cache martian addresses: they should be logged (RFC1812)
2269 */ 2271 */
2270 martian_destination: 2272 martian_destination:
2271 RT_CACHE_STAT_INC(in_martian_dst); 2273 RT_CACHE_STAT_INC(in_martian_dst);
2272 #ifdef CONFIG_IP_ROUTE_VERBOSE 2274 #ifdef CONFIG_IP_ROUTE_VERBOSE
2273 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 2275 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2274 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", 2276 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2275 &daddr, &saddr, dev->name); 2277 &daddr, &saddr, dev->name);
2276 #endif 2278 #endif
2277 2279
2278 e_hostunreach: 2280 e_hostunreach:
2279 err = -EHOSTUNREACH; 2281 err = -EHOSTUNREACH;
2280 goto out; 2282 goto out;
2281 2283
2282 e_inval: 2284 e_inval:
2283 err = -EINVAL; 2285 err = -EINVAL;
2284 goto out; 2286 goto out;
2285 2287
2286 e_nobufs: 2288 e_nobufs:
2287 err = -ENOBUFS; 2289 err = -ENOBUFS;
2288 goto out; 2290 goto out;
2289 2291
2290 martian_source: 2292 martian_source:
2291 err = -EINVAL; 2293 err = -EINVAL;
2292 martian_source_keep_err: 2294 martian_source_keep_err:
2293 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2295 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2294 goto out; 2296 goto out;
2295 } 2297 }
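
The martian checks at the top of ip_route_input_slow() are pure address-class tests. Below is a minimal userspace restatement of the predicates involved, mirroring the kernel's ipv4_is_*() helpers (which take network-byte-order __be32 values); it is illustrative only.

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace restatements of the ipv4_is_*() predicates; addresses are
 * in network byte order, like the kernel's __be32 arguments. */
static bool is_multicast(uint32_t a) { return (ntohl(a) & 0xf0000000) == 0xe0000000; }
static bool is_lbcast(uint32_t a)    { return a == htonl(0xffffffff); }
static bool is_loopback(uint32_t a)  { return (ntohl(a) & 0xff000000) == 0x7f000000; }
static bool is_zeronet(uint32_t a)   { return (ntohl(a) & 0xff000000) == 0; }

int main(void)
{
        uint32_t saddr;

        inet_pton(AF_INET, "127.0.0.1", &saddr);
        /* A loopback source seen on a real device is a martian. */
        printf("martian source: %d\n",
               is_multicast(saddr) || is_lbcast(saddr) || is_loopback(saddr));
        printf("zeronet source: %d\n", is_zeronet(saddr));
        return 0;
}
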
2296 2298
2297 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2299 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2298 u8 tos, struct net_device *dev, bool noref) 2300 u8 tos, struct net_device *dev, bool noref)
2299 { 2301 {
2300 struct rtable * rth; 2302 struct rtable * rth;
2301 unsigned hash; 2303 unsigned hash;
2302 int iif = dev->ifindex; 2304 int iif = dev->ifindex;
2303 struct net *net; 2305 struct net *net;
2304 int res; 2306 int res;
2305 2307
2306 net = dev_net(dev); 2308 net = dev_net(dev);
2307 2309
2308 rcu_read_lock(); 2310 rcu_read_lock();
2309 2311
2310 if (!rt_caching(net)) 2312 if (!rt_caching(net))
2311 goto skip_cache; 2313 goto skip_cache;
2312 2314
2313 tos &= IPTOS_RT_MASK; 2315 tos &= IPTOS_RT_MASK;
2314 hash = rt_hash(daddr, saddr, iif, rt_genid(net)); 2316 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2315 2317
2316 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2318 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2317 rth = rcu_dereference(rth->dst.rt_next)) { 2319 rth = rcu_dereference(rth->dst.rt_next)) {
2318 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | 2320 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2319 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | 2321 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2320 (rth->rt_iif ^ iif) | 2322 (rth->rt_iif ^ iif) |
2321 rth->rt_oif | 2323 rth->rt_oif |
2322 (rth->rt_key_tos ^ tos)) == 0 && 2324 (rth->rt_key_tos ^ tos)) == 0 &&
2323 rth->rt_mark == skb->mark && 2325 rth->rt_mark == skb->mark &&
2324 net_eq(dev_net(rth->dst.dev), net) && 2326 net_eq(dev_net(rth->dst.dev), net) &&
2325 !rt_is_expired(rth)) { 2327 !rt_is_expired(rth)) {
2326 if (noref) { 2328 if (noref) {
2327 dst_use_noref(&rth->dst, jiffies); 2329 dst_use_noref(&rth->dst, jiffies);
2328 skb_dst_set_noref(skb, &rth->dst); 2330 skb_dst_set_noref(skb, &rth->dst);
2329 } else { 2331 } else {
2330 dst_use(&rth->dst, jiffies); 2332 dst_use(&rth->dst, jiffies);
2331 skb_dst_set(skb, &rth->dst); 2333 skb_dst_set(skb, &rth->dst);
2332 } 2334 }
2333 RT_CACHE_STAT_INC(in_hit); 2335 RT_CACHE_STAT_INC(in_hit);
2334 rcu_read_unlock(); 2336 rcu_read_unlock();
2335 return 0; 2337 return 0;
2336 } 2338 }
2337 RT_CACHE_STAT_INC(in_hlist_search); 2339 RT_CACHE_STAT_INC(in_hlist_search);
2338 } 2340 }
2339 2341
2340 skip_cache: 2342 skip_cache:
2341 /* Multicast recognition logic is moved from route cache to here. 2343 /* Multicast recognition logic is moved from route cache to here.
2342 The problem was that too many Ethernet cards have broken/missing 2344 The problem was that too many Ethernet cards have broken/missing
2343 hardware multicast filters :-( As a result, a host on a multicast 2345 hardware multicast filters :-( As a result, a host on a multicast
2344 network acquires a lot of useless route cache entries, e.g. 2346 network acquires a lot of useless route cache entries, e.g.
2345 SDR messages from all over the world. Now we try to get rid of them. 2347 SDR messages from all over the world. Now we try to get rid of them.
2346 Really, provided the software IP multicast filter is organized 2348 Really, provided the software IP multicast filter is organized
2347 reasonably (at least, hashed), it does not result in a slowdown 2349 reasonably (at least, hashed), it does not result in a slowdown
2348 compared with route cache reject entries. 2350 compared with route cache reject entries.
2349 Note that multicast routers are not affected, because 2351 Note that multicast routers are not affected, because
2350 a route cache entry is created eventually. 2352 a route cache entry is created eventually.
2351 */ 2353 */
2352 if (ipv4_is_multicast(daddr)) { 2354 if (ipv4_is_multicast(daddr)) {
2353 struct in_device *in_dev = __in_dev_get_rcu(dev); 2355 struct in_device *in_dev = __in_dev_get_rcu(dev);
2354 2356
2355 if (in_dev) { 2357 if (in_dev) {
2356 int our = ip_check_mc_rcu(in_dev, daddr, saddr, 2358 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2357 ip_hdr(skb)->protocol); 2359 ip_hdr(skb)->protocol);
2358 if (our 2360 if (our
2359 #ifdef CONFIG_IP_MROUTE 2361 #ifdef CONFIG_IP_MROUTE
2360 || 2362 ||
2361 (!ipv4_is_local_multicast(daddr) && 2363 (!ipv4_is_local_multicast(daddr) &&
2362 IN_DEV_MFORWARD(in_dev)) 2364 IN_DEV_MFORWARD(in_dev))
2363 #endif 2365 #endif
2364 ) { 2366 ) {
2365 int res = ip_route_input_mc(skb, daddr, saddr, 2367 int res = ip_route_input_mc(skb, daddr, saddr,
2366 tos, dev, our); 2368 tos, dev, our);
2367 rcu_read_unlock(); 2369 rcu_read_unlock();
2368 return res; 2370 return res;
2369 } 2371 }
2370 } 2372 }
2371 rcu_read_unlock(); 2373 rcu_read_unlock();
2372 return -EINVAL; 2374 return -EINVAL;
2373 } 2375 }
2374 res = ip_route_input_slow(skb, daddr, saddr, tos, dev); 2376 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2375 rcu_read_unlock(); 2377 rcu_read_unlock();
2376 return res; 2378 return res;
2377 } 2379 }
2378 EXPORT_SYMBOL(ip_route_input_common); 2380 EXPORT_SYMBOL(ip_route_input_common);
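
The cache-hit test in ip_route_input_common() folds the key comparison into one OR of XORs, so a match costs a single compare-to-zero rather than a branch per field; rt_oif is OR-ed in raw because a cached input route must have oif == 0. The standalone sketch below demonstrates just that fold (the real test additionally checks rt_mark, the namespace, and expiry):

#include <stdint.h>
#include <stdio.h>

struct key { uint32_t dst, src, tos; int iif, oif; };

/* Mirrors the fold in ip_route_input_common(): OR the XOR of every
 * field pair; the result is zero iff all fields match. */
static int keys_match(const struct key *k, uint32_t dst, uint32_t src,
                      int iif, uint32_t tos)
{
        uint32_t diff = (k->dst ^ dst) |
                        (k->src ^ src) |
                        (uint32_t)(k->iif ^ iif) |
                        (uint32_t)k->oif |
                        (k->tos ^ tos);

        return diff == 0;
}

int main(void)
{
        struct key k = { .dst = 1, .src = 2, .tos = 0x10, .iif = 3, .oif = 0 };

        printf("%d\n", keys_match(&k, 1, 2, 3, 0x10));  /* 1: hit  */
        printf("%d\n", keys_match(&k, 1, 2, 4, 0x10));  /* 0: miss */
        return 0;
}
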
2379 2381
2380 /* called with rcu_read_lock() */ 2382 /* called with rcu_read_lock() */
2381 static struct rtable *__mkroute_output(const struct fib_result *res, 2383 static struct rtable *__mkroute_output(const struct fib_result *res,
2382 const struct flowi4 *fl4, 2384 const struct flowi4 *fl4,
2383 __be32 orig_daddr, __be32 orig_saddr, 2385 __be32 orig_daddr, __be32 orig_saddr,
2384 int orig_oif, struct net_device *dev_out, 2386 int orig_oif, struct net_device *dev_out,
2385 unsigned int flags) 2387 unsigned int flags)
2386 { 2388 {
2387 struct fib_info *fi = res->fi; 2389 struct fib_info *fi = res->fi;
2388 u32 tos = RT_FL_TOS(fl4); 2390 u32 tos = RT_FL_TOS(fl4);
2389 struct in_device *in_dev; 2391 struct in_device *in_dev;
2390 u16 type = res->type; 2392 u16 type = res->type;
2391 struct rtable *rth; 2393 struct rtable *rth;
2392 2394
2393 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 2395 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2394 return ERR_PTR(-EINVAL); 2396 return ERR_PTR(-EINVAL);
2395 2397
2396 if (ipv4_is_lbcast(fl4->daddr)) 2398 if (ipv4_is_lbcast(fl4->daddr))
2397 type = RTN_BROADCAST; 2399 type = RTN_BROADCAST;
2398 else if (ipv4_is_multicast(fl4->daddr)) 2400 else if (ipv4_is_multicast(fl4->daddr))
2399 type = RTN_MULTICAST; 2401 type = RTN_MULTICAST;
2400 else if (ipv4_is_zeronet(fl4->daddr)) 2402 else if (ipv4_is_zeronet(fl4->daddr))
2401 return ERR_PTR(-EINVAL); 2403 return ERR_PTR(-EINVAL);
2402 2404
2403 if (dev_out->flags & IFF_LOOPBACK) 2405 if (dev_out->flags & IFF_LOOPBACK)
2404 flags |= RTCF_LOCAL; 2406 flags |= RTCF_LOCAL;
2405 2407
2406 in_dev = __in_dev_get_rcu(dev_out); 2408 in_dev = __in_dev_get_rcu(dev_out);
2407 if (!in_dev) 2409 if (!in_dev)
2408 return ERR_PTR(-EINVAL); 2410 return ERR_PTR(-EINVAL);
2409 2411
2410 if (type == RTN_BROADCAST) { 2412 if (type == RTN_BROADCAST) {
2411 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2413 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2412 fi = NULL; 2414 fi = NULL;
2413 } else if (type == RTN_MULTICAST) { 2415 } else if (type == RTN_MULTICAST) {
2414 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2416 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2415 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2417 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2416 fl4->flowi4_proto)) 2418 fl4->flowi4_proto))
2417 flags &= ~RTCF_LOCAL; 2419 flags &= ~RTCF_LOCAL;
2418 /* If a multicast route does not exist, use the 2420 /* If a multicast route does not exist, use the
2419 * default one, but do not gateway in this case. 2421 * default one, but do not gateway in this case.
2420 * Yes, it is a hack. 2422 * Yes, it is a hack.
2421 */ 2423 */
2422 if (fi && res->prefixlen < 4) 2424 if (fi && res->prefixlen < 4)
2423 fi = NULL; 2425 fi = NULL;
2424 } 2426 }
2425 2427
2426 rth = rt_dst_alloc(dev_out, 2428 rth = rt_dst_alloc(dev_out,
2427 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2429 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2428 IN_DEV_CONF_GET(in_dev, NOXFRM)); 2430 IN_DEV_CONF_GET(in_dev, NOXFRM));
2429 if (!rth) 2431 if (!rth)
2430 return ERR_PTR(-ENOBUFS); 2432 return ERR_PTR(-ENOBUFS);
2431 2433
2432 rth->dst.output = ip_output; 2434 rth->dst.output = ip_output;
2433 2435
2434 rth->rt_key_dst = orig_daddr; 2436 rth->rt_key_dst = orig_daddr;
2435 rth->rt_key_src = orig_saddr; 2437 rth->rt_key_src = orig_saddr;
2436 rth->rt_genid = rt_genid(dev_net(dev_out)); 2438 rth->rt_genid = rt_genid(dev_net(dev_out));
2437 rth->rt_flags = flags; 2439 rth->rt_flags = flags;
2438 rth->rt_type = type; 2440 rth->rt_type = type;
2439 rth->rt_key_tos = tos; 2441 rth->rt_key_tos = tos;
2440 rth->rt_dst = fl4->daddr; 2442 rth->rt_dst = fl4->daddr;
2441 rth->rt_src = fl4->saddr; 2443 rth->rt_src = fl4->saddr;
2442 rth->rt_route_iif = 0; 2444 rth->rt_route_iif = 0;
2443 rth->rt_iif = orig_oif ? : dev_out->ifindex; 2445 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2444 rth->rt_oif = orig_oif; 2446 rth->rt_oif = orig_oif;
2445 rth->rt_mark = fl4->flowi4_mark; 2447 rth->rt_mark = fl4->flowi4_mark;
2446 rth->rt_gateway = fl4->daddr; 2448 rth->rt_gateway = fl4->daddr;
2447 rth->rt_spec_dst= fl4->saddr; 2449 rth->rt_spec_dst= fl4->saddr;
2448 rth->rt_peer_genid = 0; 2450 rth->rt_peer_genid = 0;
2449 rth->peer = NULL; 2451 rth->peer = NULL;
2450 rth->fi = NULL; 2452 rth->fi = NULL;
2451 2453
2452 RT_CACHE_STAT_INC(out_slow_tot); 2454 RT_CACHE_STAT_INC(out_slow_tot);
2453 2455
2454 if (flags & RTCF_LOCAL) { 2456 if (flags & RTCF_LOCAL) {
2455 rth->dst.input = ip_local_deliver; 2457 rth->dst.input = ip_local_deliver;
2456 rth->rt_spec_dst = fl4->daddr; 2458 rth->rt_spec_dst = fl4->daddr;
2457 } 2459 }
2458 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2460 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2459 rth->rt_spec_dst = fl4->saddr; 2461 rth->rt_spec_dst = fl4->saddr;
2460 if (flags & RTCF_LOCAL && 2462 if (flags & RTCF_LOCAL &&
2461 !(dev_out->flags & IFF_LOOPBACK)) { 2463 !(dev_out->flags & IFF_LOOPBACK)) {
2462 rth->dst.output = ip_mc_output; 2464 rth->dst.output = ip_mc_output;
2463 RT_CACHE_STAT_INC(out_slow_mc); 2465 RT_CACHE_STAT_INC(out_slow_mc);
2464 } 2466 }
2465 #ifdef CONFIG_IP_MROUTE 2467 #ifdef CONFIG_IP_MROUTE
2466 if (type == RTN_MULTICAST) { 2468 if (type == RTN_MULTICAST) {
2467 if (IN_DEV_MFORWARD(in_dev) && 2469 if (IN_DEV_MFORWARD(in_dev) &&
2468 !ipv4_is_local_multicast(fl4->daddr)) { 2470 !ipv4_is_local_multicast(fl4->daddr)) {
2469 rth->dst.input = ip_mr_input; 2471 rth->dst.input = ip_mr_input;
2470 rth->dst.output = ip_mc_output; 2472 rth->dst.output = ip_mc_output;
2471 } 2473 }
2472 } 2474 }
2473 #endif 2475 #endif
2474 } 2476 }
2475 2477
2476 rt_set_nexthop(rth, fl4, res, fi, type, 0); 2478 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2477 2479
2478 return rth; 2480 return rth;
2479 } 2481 }
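
__mkroute_output() starts by re-deriving the route type from the destination address itself, overriding the FIB answer for limited broadcast and class-D addresses and rejecting zeronet outright. A compact restatement, with string labels standing in for the RTN_* constants:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* The destination triage at the top of __mkroute_output(), restated
 * over a host-order copy of the address. */
static const char *classify_daddr(uint32_t daddr_be)
{
        uint32_t a = ntohl(daddr_be);

        if (a == 0xffffffffu)
                return "RTN_BROADCAST";         /* limited broadcast */
        if ((a & 0xf0000000u) == 0xe0000000u)
                return "RTN_MULTICAST";         /* 224.0.0.0/4 */
        if ((a & 0xff000000u) == 0)
                return "-EINVAL";               /* zeronet is rejected */
        return "RTN_UNICAST";                   /* keep res->type otherwise */
}

int main(void)
{
        uint32_t d;

        inet_pton(AF_INET, "239.1.2.3", &d);
        printf("%s\n", classify_daddr(d));      /* RTN_MULTICAST */
        return 0;
}
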
2480 2482
2481 /* 2483 /*
2482 * Major route resolver routine. 2484 * Major route resolver routine.
2483 * called with rcu_read_lock(); 2485 * called with rcu_read_lock();
2484 */ 2486 */
2485 2487
2486 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) 2488 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2487 { 2489 {
2488 struct net_device *dev_out = NULL; 2490 struct net_device *dev_out = NULL;
2489 u32 tos = RT_FL_TOS(fl4); 2491 u32 tos = RT_FL_TOS(fl4);
2490 unsigned int flags = 0; 2492 unsigned int flags = 0;
2491 struct fib_result res; 2493 struct fib_result res;
2492 struct rtable *rth; 2494 struct rtable *rth;
2493 __be32 orig_daddr; 2495 __be32 orig_daddr;
2494 __be32 orig_saddr; 2496 __be32 orig_saddr;
2495 int orig_oif; 2497 int orig_oif;
2496 2498
2497 res.fi = NULL; 2499 res.fi = NULL;
2498 #ifdef CONFIG_IP_MULTIPLE_TABLES 2500 #ifdef CONFIG_IP_MULTIPLE_TABLES
2499 res.r = NULL; 2501 res.r = NULL;
2500 #endif 2502 #endif
2501 2503
2502 orig_daddr = fl4->daddr; 2504 orig_daddr = fl4->daddr;
2503 orig_saddr = fl4->saddr; 2505 orig_saddr = fl4->saddr;
2504 orig_oif = fl4->flowi4_oif; 2506 orig_oif = fl4->flowi4_oif;
2505 2507
2506 fl4->flowi4_iif = net->loopback_dev->ifindex; 2508 fl4->flowi4_iif = net->loopback_dev->ifindex;
2507 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2509 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2508 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2510 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2509 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2511 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2510 2512
2511 rcu_read_lock(); 2513 rcu_read_lock();
2512 if (fl4->saddr) { 2514 if (fl4->saddr) {
2513 rth = ERR_PTR(-EINVAL); 2515 rth = ERR_PTR(-EINVAL);
2514 if (ipv4_is_multicast(fl4->saddr) || 2516 if (ipv4_is_multicast(fl4->saddr) ||
2515 ipv4_is_lbcast(fl4->saddr) || 2517 ipv4_is_lbcast(fl4->saddr) ||
2516 ipv4_is_zeronet(fl4->saddr)) 2518 ipv4_is_zeronet(fl4->saddr))
2517 goto out; 2519 goto out;
2518 2520
2519 /* I removed the check for oif == dev_out->oif here. 2521 /* I removed the check for oif == dev_out->oif here.
2520 It was wrong for two reasons: 2522 It was wrong for two reasons:
2521 1. ip_dev_find(net, saddr) can return the wrong iface if saddr 2523 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2522 is assigned to multiple interfaces. 2524 is assigned to multiple interfaces.
2523 2. Moreover, we are allowed to send packets with saddr 2525 2. Moreover, we are allowed to send packets with saddr
2524 of another iface. --ANK 2526 of another iface. --ANK
2525 */ 2527 */
2526 2528
2527 if (fl4->flowi4_oif == 0 && 2529 if (fl4->flowi4_oif == 0 &&
2528 (ipv4_is_multicast(fl4->daddr) || 2530 (ipv4_is_multicast(fl4->daddr) ||
2529 ipv4_is_lbcast(fl4->daddr))) { 2531 ipv4_is_lbcast(fl4->daddr))) {
2530 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2532 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2531 dev_out = __ip_dev_find(net, fl4->saddr, false); 2533 dev_out = __ip_dev_find(net, fl4->saddr, false);
2532 if (dev_out == NULL) 2534 if (dev_out == NULL)
2533 goto out; 2535 goto out;
2534 2536
2535 /* Special hack: user can direct multicasts 2537 /* Special hack: user can direct multicasts
2536 and limited broadcast via the necessary interface 2538 and limited broadcast via the necessary interface
2537 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2539 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2538 This hack is not just for fun; it allows 2540 This hack is not just for fun; it allows
2539 vic, vat and friends to work. 2541 vic, vat and friends to work.
2540 They bind a socket to loopback, set the ttl to zero 2542 They bind a socket to loopback, set the ttl to zero
2541 and expect that it will work. 2543 and expect that it will work.
2542 From the viewpoint of the routing cache they are broken, 2544 From the viewpoint of the routing cache they are broken,
2543 because we are not allowed to build a multicast path 2545 because we are not allowed to build a multicast path
2544 with a loopback source addr (the routing cache 2546 with a loopback source addr (the routing cache
2545 cannot know that the ttl is zero, so the packet 2547 cannot know that the ttl is zero, so the packet
2546 will not leave this host and the route is valid). 2548 will not leave this host and the route is valid).
2547 Luckily, this hack is a good workaround. 2549 Luckily, this hack is a good workaround.
2548 */ 2550 */
2549 2551
2550 fl4->flowi4_oif = dev_out->ifindex; 2552 fl4->flowi4_oif = dev_out->ifindex;
2551 goto make_route; 2553 goto make_route;
2552 } 2554 }
2553 2555
2554 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2556 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2555 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2557 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2556 if (!__ip_dev_find(net, fl4->saddr, false)) 2558 if (!__ip_dev_find(net, fl4->saddr, false))
2557 goto out; 2559 goto out;
2558 } 2560 }
2559 } 2561 }
2560 2562
2561 2563
2562 if (fl4->flowi4_oif) { 2564 if (fl4->flowi4_oif) {
2563 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2565 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2564 rth = ERR_PTR(-ENODEV); 2566 rth = ERR_PTR(-ENODEV);
2565 if (dev_out == NULL) 2567 if (dev_out == NULL)
2566 goto out; 2568 goto out;
2567 2569
2568 /* RACE: Check return value of inet_select_addr instead. */ 2570 /* RACE: Check return value of inet_select_addr instead. */
2569 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2571 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2570 rth = ERR_PTR(-ENETUNREACH); 2572 rth = ERR_PTR(-ENETUNREACH);
2571 goto out; 2573 goto out;
2572 } 2574 }
2573 if (ipv4_is_local_multicast(fl4->daddr) || 2575 if (ipv4_is_local_multicast(fl4->daddr) ||
2574 ipv4_is_lbcast(fl4->daddr)) { 2576 ipv4_is_lbcast(fl4->daddr)) {
2575 if (!fl4->saddr) 2577 if (!fl4->saddr)
2576 fl4->saddr = inet_select_addr(dev_out, 0, 2578 fl4->saddr = inet_select_addr(dev_out, 0,
2577 RT_SCOPE_LINK); 2579 RT_SCOPE_LINK);
2578 goto make_route; 2580 goto make_route;
2579 } 2581 }
2580 if (fl4->saddr) { 2582 if (fl4->saddr) {
2581 if (ipv4_is_multicast(fl4->daddr)) 2583 if (ipv4_is_multicast(fl4->daddr))
2582 fl4->saddr = inet_select_addr(dev_out, 0, 2584 fl4->saddr = inet_select_addr(dev_out, 0,
2583 fl4->flowi4_scope); 2585 fl4->flowi4_scope);
2584 else if (!fl4->daddr) 2586 else if (!fl4->daddr)
2585 fl4->saddr = inet_select_addr(dev_out, 0, 2587 fl4->saddr = inet_select_addr(dev_out, 0,
2586 RT_SCOPE_HOST); 2588 RT_SCOPE_HOST);
2587 } 2589 }
2588 } 2590 }
2589 2591
2590 if (!fl4->daddr) { 2592 if (!fl4->daddr) {
2591 fl4->daddr = fl4->saddr; 2593 fl4->daddr = fl4->saddr;
2592 if (!fl4->daddr) 2594 if (!fl4->daddr)
2593 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2595 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2594 dev_out = net->loopback_dev; 2596 dev_out = net->loopback_dev;
2595 fl4->flowi4_oif = net->loopback_dev->ifindex; 2597 fl4->flowi4_oif = net->loopback_dev->ifindex;
2596 res.type = RTN_LOCAL; 2598 res.type = RTN_LOCAL;
2597 flags |= RTCF_LOCAL; 2599 flags |= RTCF_LOCAL;
2598 goto make_route; 2600 goto make_route;
2599 } 2601 }
2600 2602
2601 if (fib_lookup(net, fl4, &res)) { 2603 if (fib_lookup(net, fl4, &res)) {
2602 res.fi = NULL; 2604 res.fi = NULL;
2603 if (fl4->flowi4_oif) { 2605 if (fl4->flowi4_oif) {
2604 /* Apparently, routing tables are wrong. Assume 2606 /* Apparently, routing tables are wrong. Assume
2605 that the destination is on-link. 2607 that the destination is on-link.
2606 2608
2607 WHY? DW. 2609 WHY? DW.
2608 Because we are allowed to send to an iface 2610 Because we are allowed to send to an iface
2609 even if it has NO routes and NO assigned 2611 even if it has NO routes and NO assigned
2610 addresses. When oif is specified, routing 2612 addresses. When oif is specified, routing
2611 tables are looked up with only one purpose: 2613 tables are looked up with only one purpose:
2612 to check whether the destination is gatewayed, rather than 2614 to check whether the destination is gatewayed, rather than
2613 direct. Moreover, if MSG_DONTROUTE is set, 2615 direct. Moreover, if MSG_DONTROUTE is set,
2614 we send the packet, ignoring both routing tables 2616 we send the packet, ignoring both routing tables
2615 and ifaddr state. --ANK 2617 and ifaddr state. --ANK
2616 2618
2617 2619
2618 We could do this even if oif is unknown, 2620 We could do this even if oif is unknown,
2619 as IPv6 likely does, but we do not. 2621 as IPv6 likely does, but we do not.
2620 */ 2622 */
2621 2623
2622 if (fl4->saddr == 0) 2624 if (fl4->saddr == 0)
2623 fl4->saddr = inet_select_addr(dev_out, 0, 2625 fl4->saddr = inet_select_addr(dev_out, 0,
2624 RT_SCOPE_LINK); 2626 RT_SCOPE_LINK);
2625 res.type = RTN_UNICAST; 2627 res.type = RTN_UNICAST;
2626 goto make_route; 2628 goto make_route;
2627 } 2629 }
2628 rth = ERR_PTR(-ENETUNREACH); 2630 rth = ERR_PTR(-ENETUNREACH);
2629 goto out; 2631 goto out;
2630 } 2632 }
2631 2633
2632 if (res.type == RTN_LOCAL) { 2634 if (res.type == RTN_LOCAL) {
2633 if (!fl4->saddr) { 2635 if (!fl4->saddr) {
2634 if (res.fi->fib_prefsrc) 2636 if (res.fi->fib_prefsrc)
2635 fl4->saddr = res.fi->fib_prefsrc; 2637 fl4->saddr = res.fi->fib_prefsrc;
2636 else 2638 else
2637 fl4->saddr = fl4->daddr; 2639 fl4->saddr = fl4->daddr;
2638 } 2640 }
2639 dev_out = net->loopback_dev; 2641 dev_out = net->loopback_dev;
2640 fl4->flowi4_oif = dev_out->ifindex; 2642 fl4->flowi4_oif = dev_out->ifindex;
2641 res.fi = NULL; 2643 res.fi = NULL;
2642 flags |= RTCF_LOCAL; 2644 flags |= RTCF_LOCAL;
2643 goto make_route; 2645 goto make_route;
2644 } 2646 }
2645 2647
2646 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2648 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2647 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) 2649 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2648 fib_select_multipath(&res); 2650 fib_select_multipath(&res);
2649 else 2651 else
2650 #endif 2652 #endif
2651 if (!res.prefixlen && 2653 if (!res.prefixlen &&
2652 res.table->tb_num_default > 1 && 2654 res.table->tb_num_default > 1 &&
2653 res.type == RTN_UNICAST && !fl4->flowi4_oif) 2655 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2654 fib_select_default(&res); 2656 fib_select_default(&res);
2655 2657
2656 if (!fl4->saddr) 2658 if (!fl4->saddr)
2657 fl4->saddr = FIB_RES_PREFSRC(net, res); 2659 fl4->saddr = FIB_RES_PREFSRC(net, res);
2658 2660
2659 dev_out = FIB_RES_DEV(res); 2661 dev_out = FIB_RES_DEV(res);
2660 fl4->flowi4_oif = dev_out->ifindex; 2662 fl4->flowi4_oif = dev_out->ifindex;
2661 2663
2662 2664
2663 make_route: 2665 make_route:
2664 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, 2666 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2665 dev_out, flags); 2667 dev_out, flags);
2666 if (!IS_ERR(rth)) { 2668 if (!IS_ERR(rth)) {
2667 unsigned int hash; 2669 unsigned int hash;
2668 2670
2669 hash = rt_hash(orig_daddr, orig_saddr, orig_oif, 2671 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2670 rt_genid(dev_net(dev_out))); 2672 rt_genid(dev_net(dev_out)));
2671 rth = rt_intern_hash(hash, rth, NULL, orig_oif); 2673 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2672 } 2674 }
2673 2675
2674 out: 2676 out:
2675 rcu_read_unlock(); 2677 rcu_read_unlock();
2676 return rth; 2678 return rth;
2677 } 2679 }
2678 2680
2679 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) 2681 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2680 { 2682 {
2681 struct rtable *rth; 2683 struct rtable *rth;
2682 unsigned int hash; 2684 unsigned int hash;
2683 2685
2684 if (!rt_caching(net)) 2686 if (!rt_caching(net))
2685 goto slow_output; 2687 goto slow_output;
2686 2688
2687 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); 2689 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2688 2690
2689 rcu_read_lock_bh(); 2691 rcu_read_lock_bh();
2690 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2692 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2691 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2693 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2692 if (rth->rt_key_dst == flp4->daddr && 2694 if (rth->rt_key_dst == flp4->daddr &&
2693 rth->rt_key_src == flp4->saddr && 2695 rth->rt_key_src == flp4->saddr &&
2694 rt_is_output_route(rth) && 2696 rt_is_output_route(rth) &&
2695 rth->rt_oif == flp4->flowi4_oif && 2697 rth->rt_oif == flp4->flowi4_oif &&
2696 rth->rt_mark == flp4->flowi4_mark && 2698 rth->rt_mark == flp4->flowi4_mark &&
2697 !((rth->rt_key_tos ^ flp4->flowi4_tos) & 2699 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2698 (IPTOS_RT_MASK | RTO_ONLINK)) && 2700 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2699 net_eq(dev_net(rth->dst.dev), net) && 2701 net_eq(dev_net(rth->dst.dev), net) &&
2700 !rt_is_expired(rth)) { 2702 !rt_is_expired(rth)) {
2701 dst_use(&rth->dst, jiffies); 2703 dst_use(&rth->dst, jiffies);
2702 RT_CACHE_STAT_INC(out_hit); 2704 RT_CACHE_STAT_INC(out_hit);
2703 rcu_read_unlock_bh(); 2705 rcu_read_unlock_bh();
2704 if (!flp4->saddr) 2706 if (!flp4->saddr)
2705 flp4->saddr = rth->rt_src; 2707 flp4->saddr = rth->rt_src;
2706 if (!flp4->daddr) 2708 if (!flp4->daddr)
2707 flp4->daddr = rth->rt_dst; 2709 flp4->daddr = rth->rt_dst;
2708 return rth; 2710 return rth;
2709 } 2711 }
2710 RT_CACHE_STAT_INC(out_hlist_search); 2712 RT_CACHE_STAT_INC(out_hlist_search);
2711 } 2713 }
2712 rcu_read_unlock_bh(); 2714 rcu_read_unlock_bh();
2713 2715
2714 slow_output: 2716 slow_output:
2715 return ip_route_output_slow(net, flp4); 2717 return ip_route_output_slow(net, flp4);
2716 } 2718 }
2717 EXPORT_SYMBOL_GPL(__ip_route_output_key); 2719 EXPORT_SYMBOL_GPL(__ip_route_output_key);
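
Note how the output-cache walk in __ip_route_output_key() compares TOS through a mask, so bits the routing code does not key on (e.g. ECN) cannot cause spurious misses, while the RTO_ONLINK flag smuggled into the tos field does participate. A sketch with the constant values these kernel headers use (0x1c and 0x01; treat them as assumptions here):

#include <stdint.h>
#include <stdio.h>

/* Constant values as in this kernel's headers (an assumption here):
 * IPTOS_RT_MASK keeps the TOS bits routing keys on; RTO_ONLINK is an
 * internal flag carried in the tos field. */
#define IPTOS_RT_MASK  0x1c
#define RTO_ONLINK     0x01

static int tos_match(uint32_t cached, uint32_t wanted)
{
        return !((cached ^ wanted) & (IPTOS_RT_MASK | RTO_ONLINK));
}

int main(void)
{
        printf("%d\n", tos_match(0x10, 0x10));  /* 1: identical        */
        printf("%d\n", tos_match(0x10, 0x11));  /* 0: RTO_ONLINK flips */
        printf("%d\n", tos_match(0x10, 0x30));  /* 1: 0x20 is masked   */
        return 0;
}
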
2718 2720
2719 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2721 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2720 { 2722 {
2721 return NULL; 2723 return NULL;
2722 } 2724 }
2723 2725
2724 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) 2726 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2725 { 2727 {
2726 return 0; 2728 return 0;
2727 } 2729 }
2728 2730
2729 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2731 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2730 { 2732 {
2731 } 2733 }
2732 2734
2733 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2735 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2734 unsigned long old) 2736 unsigned long old)
2735 { 2737 {
2736 return NULL; 2738 return NULL;
2737 } 2739 }
2738 2740
2739 static struct dst_ops ipv4_dst_blackhole_ops = { 2741 static struct dst_ops ipv4_dst_blackhole_ops = {
2740 .family = AF_INET, 2742 .family = AF_INET,
2741 .protocol = cpu_to_be16(ETH_P_IP), 2743 .protocol = cpu_to_be16(ETH_P_IP),
2742 .destroy = ipv4_dst_destroy, 2744 .destroy = ipv4_dst_destroy,
2743 .check = ipv4_blackhole_dst_check, 2745 .check = ipv4_blackhole_dst_check,
2744 .default_mtu = ipv4_blackhole_default_mtu, 2746 .default_mtu = ipv4_blackhole_default_mtu,
2745 .default_advmss = ipv4_default_advmss, 2747 .default_advmss = ipv4_default_advmss,
2746 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2748 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2747 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2749 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2748 .neigh_lookup = ipv4_neigh_lookup, 2750 .neigh_lookup = ipv4_neigh_lookup,
2749 }; 2751 };
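
ipv4_dst_blackhole_ops is the null-object pattern: the blackhole dst keeps the full dst_ops shape but points every mutating hook at a stub, so callers such as PMTU updaters need no special-casing. A generic userspace illustration of the same pattern (toy types, not the kernel structs):

#include <stdio.h>

/* Same ops signature, every hook a stub; callers stay unconditional. */
struct toy_dst_ops {
        unsigned int (*default_mtu)(void);
        void (*update_pmtu)(unsigned int mtu);
};

static unsigned int blackhole_mtu(void)             { return 0; }
static void blackhole_update_pmtu(unsigned int mtu) { (void)mtu; }

static const struct toy_dst_ops blackhole_ops = {
        .default_mtu = blackhole_mtu,
        .update_pmtu = blackhole_update_pmtu,
};

int main(void)
{
        /* Callers invoke the hooks unconditionally; the stubs absorb it. */
        blackhole_ops.update_pmtu(1400);
        printf("mtu=%u\n", blackhole_ops.default_mtu());
        return 0;
}
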
2750 2752
2751 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2753 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2752 { 2754 {
2753 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); 2755 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2754 struct rtable *ort = (struct rtable *) dst_orig; 2756 struct rtable *ort = (struct rtable *) dst_orig;
2755 2757
2756 if (rt) { 2758 if (rt) {
2757 struct dst_entry *new = &rt->dst; 2759 struct dst_entry *new = &rt->dst;
2758 2760
2759 new->__use = 1; 2761 new->__use = 1;
2760 new->input = dst_discard; 2762 new->input = dst_discard;
2761 new->output = dst_discard; 2763 new->output = dst_discard;
2762 dst_copy_metrics(new, &ort->dst); 2764 dst_copy_metrics(new, &ort->dst);
2763 2765
2764 new->dev = ort->dst.dev; 2766 new->dev = ort->dst.dev;
2765 if (new->dev) 2767 if (new->dev)
2766 dev_hold(new->dev); 2768 dev_hold(new->dev);
2767 2769
2768 rt->rt_key_dst = ort->rt_key_dst; 2770 rt->rt_key_dst = ort->rt_key_dst;
2769 rt->rt_key_src = ort->rt_key_src; 2771 rt->rt_key_src = ort->rt_key_src;
2770 rt->rt_key_tos = ort->rt_key_tos; 2772 rt->rt_key_tos = ort->rt_key_tos;
2771 rt->rt_route_iif = ort->rt_route_iif; 2773 rt->rt_route_iif = ort->rt_route_iif;
2772 rt->rt_iif = ort->rt_iif; 2774 rt->rt_iif = ort->rt_iif;
2773 rt->rt_oif = ort->rt_oif; 2775 rt->rt_oif = ort->rt_oif;
2774 rt->rt_mark = ort->rt_mark; 2776 rt->rt_mark = ort->rt_mark;
2775 2777
2776 rt->rt_genid = rt_genid(net); 2778 rt->rt_genid = rt_genid(net);
2777 rt->rt_flags = ort->rt_flags; 2779 rt->rt_flags = ort->rt_flags;
2778 rt->rt_type = ort->rt_type; 2780 rt->rt_type = ort->rt_type;
2779 rt->rt_dst = ort->rt_dst; 2781 rt->rt_dst = ort->rt_dst;
2780 rt->rt_src = ort->rt_src; 2782 rt->rt_src = ort->rt_src;
2781 rt->rt_gateway = ort->rt_gateway; 2783 rt->rt_gateway = ort->rt_gateway;
2782 rt->rt_spec_dst = ort->rt_spec_dst; 2784 rt->rt_spec_dst = ort->rt_spec_dst;
2783 rt->peer = ort->peer; 2785 rt->peer = ort->peer;
2784 if (rt->peer) 2786 if (rt->peer)
2785 atomic_inc(&rt->peer->refcnt); 2787 atomic_inc(&rt->peer->refcnt);
2786 rt->fi = ort->fi; 2788 rt->fi = ort->fi;
2787 if (rt->fi) 2789 if (rt->fi)
2788 atomic_inc(&rt->fi->fib_clntref); 2790 atomic_inc(&rt->fi->fib_clntref);
2789 2791
2790 dst_free(new); 2792 dst_free(new);
2791 } 2793 }
2792 2794
2793 dst_release(dst_orig); 2795 dst_release(dst_orig);
2794 2796
2795 return rt ? &rt->dst : ERR_PTR(-ENOMEM); 2797 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2796 } 2798 }
2797 2799
2798 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2800 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2799 struct sock *sk) 2801 struct sock *sk)
2800 { 2802 {
2801 struct rtable *rt = __ip_route_output_key(net, flp4); 2803 struct rtable *rt = __ip_route_output_key(net, flp4);
2802 2804
2803 if (IS_ERR(rt)) 2805 if (IS_ERR(rt))
2804 return rt; 2806 return rt;
2805 2807
2806 if (flp4->flowi4_proto) 2808 if (flp4->flowi4_proto)
2807 rt = (struct rtable *) xfrm_lookup(net, &rt->dst, 2809 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2808 flowi4_to_flowi(flp4), 2810 flowi4_to_flowi(flp4),
2809 sk, 0); 2811 sk, 0);
2810 2812
2811 return rt; 2813 return rt;
2812 } 2814 }
2813 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2815 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2814 2816
2815 static int rt_fill_info(struct net *net, 2817 static int rt_fill_info(struct net *net,
2816 struct sk_buff *skb, u32 pid, u32 seq, int event, 2818 struct sk_buff *skb, u32 pid, u32 seq, int event,
2817 int nowait, unsigned int flags) 2819 int nowait, unsigned int flags)
2818 { 2820 {
2819 struct rtable *rt = skb_rtable(skb); 2821 struct rtable *rt = skb_rtable(skb);
2820 struct rtmsg *r; 2822 struct rtmsg *r;
2821 struct nlmsghdr *nlh; 2823 struct nlmsghdr *nlh;
2822 long expires = 0; 2824 long expires = 0;
2823 const struct inet_peer *peer = rt->peer; 2825 const struct inet_peer *peer = rt->peer;
2824 u32 id = 0, ts = 0, tsage = 0, error; 2826 u32 id = 0, ts = 0, tsage = 0, error;
2825 2827
2826 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2828 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2827 if (nlh == NULL) 2829 if (nlh == NULL)
2828 return -EMSGSIZE; 2830 return -EMSGSIZE;
2829 2831
2830 r = nlmsg_data(nlh); 2832 r = nlmsg_data(nlh);
2831 r->rtm_family = AF_INET; 2833 r->rtm_family = AF_INET;
2832 r->rtm_dst_len = 32; 2834 r->rtm_dst_len = 32;
2833 r->rtm_src_len = 0; 2835 r->rtm_src_len = 0;
2834 r->rtm_tos = rt->rt_key_tos; 2836 r->rtm_tos = rt->rt_key_tos;
2835 r->rtm_table = RT_TABLE_MAIN; 2837 r->rtm_table = RT_TABLE_MAIN;
2836 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2838 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2837 r->rtm_type = rt->rt_type; 2839 r->rtm_type = rt->rt_type;
2838 r->rtm_scope = RT_SCOPE_UNIVERSE; 2840 r->rtm_scope = RT_SCOPE_UNIVERSE;
2839 r->rtm_protocol = RTPROT_UNSPEC; 2841 r->rtm_protocol = RTPROT_UNSPEC;
2840 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2842 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2841 if (rt->rt_flags & RTCF_NOTIFY) 2843 if (rt->rt_flags & RTCF_NOTIFY)
2842 r->rtm_flags |= RTM_F_NOTIFY; 2844 r->rtm_flags |= RTM_F_NOTIFY;
2843 2845
2844 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2846 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2845 2847
2846 if (rt->rt_key_src) { 2848 if (rt->rt_key_src) {
2847 r->rtm_src_len = 32; 2849 r->rtm_src_len = 32;
2848 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); 2850 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2849 } 2851 }
2850 if (rt->dst.dev) 2852 if (rt->dst.dev)
2851 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2853 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2852 #ifdef CONFIG_IP_ROUTE_CLASSID 2854 #ifdef CONFIG_IP_ROUTE_CLASSID
2853 if (rt->dst.tclassid) 2855 if (rt->dst.tclassid)
2854 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2856 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2855 #endif 2857 #endif
2856 if (rt_is_input_route(rt)) 2858 if (rt_is_input_route(rt))
2857 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2859 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2858 else if (rt->rt_src != rt->rt_key_src) 2860 else if (rt->rt_src != rt->rt_key_src)
2859 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2861 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2860 2862
2861 if (rt->rt_dst != rt->rt_gateway) 2863 if (rt->rt_dst != rt->rt_gateway)
2862 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2864 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2863 2865
2864 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2866 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2865 goto nla_put_failure; 2867 goto nla_put_failure;
2866 2868
2867 if (rt->rt_mark) 2869 if (rt->rt_mark)
2868 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); 2870 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2869 2871
2870 error = rt->dst.error; 2872 error = rt->dst.error;
2871 if (peer) { 2873 if (peer) {
2872 inet_peer_refcheck(rt->peer); 2874 inet_peer_refcheck(rt->peer);
2873 id = atomic_read(&peer->ip_id_count) & 0xffff; 2875 id = atomic_read(&peer->ip_id_count) & 0xffff;
2874 if (peer->tcp_ts_stamp) { 2876 if (peer->tcp_ts_stamp) {
2875 ts = peer->tcp_ts; 2877 ts = peer->tcp_ts;
2876 tsage = get_seconds() - peer->tcp_ts_stamp; 2878 tsage = get_seconds() - peer->tcp_ts_stamp;
2877 } 2879 }
2878 expires = ACCESS_ONCE(peer->pmtu_expires); 2880 expires = ACCESS_ONCE(peer->pmtu_expires);
2879 if (expires) 2881 if (expires)
2880 expires -= jiffies; 2882 expires -= jiffies;
2881 } 2883 }
2882 2884
2883 if (rt_is_input_route(rt)) { 2885 if (rt_is_input_route(rt)) {
2884 #ifdef CONFIG_IP_MROUTE 2886 #ifdef CONFIG_IP_MROUTE
2885 __be32 dst = rt->rt_dst; 2887 __be32 dst = rt->rt_dst;
2886 2888
2887 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2889 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2888 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2890 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2889 int err = ipmr_get_route(net, skb, 2891 int err = ipmr_get_route(net, skb,
2890 rt->rt_src, rt->rt_dst, 2892 rt->rt_src, rt->rt_dst,
2891 r, nowait); 2893 r, nowait);
2892 if (err <= 0) { 2894 if (err <= 0) {
2893 if (!nowait) { 2895 if (!nowait) {
2894 if (err == 0) 2896 if (err == 0)
2895 return 0; 2897 return 0;
2896 goto nla_put_failure; 2898 goto nla_put_failure;
2897 } else { 2899 } else {
2898 if (err == -EMSGSIZE) 2900 if (err == -EMSGSIZE)
2899 goto nla_put_failure; 2901 goto nla_put_failure;
2900 error = err; 2902 error = err;
2901 } 2903 }
2902 } 2904 }
2903 } else 2905 } else
2904 #endif 2906 #endif
2905 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); 2907 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2906 } 2908 }
2907 2909
2908 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2910 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2909 expires, error) < 0) 2911 expires, error) < 0)
2910 goto nla_put_failure; 2912 goto nla_put_failure;
2911 2913
2912 return nlmsg_end(skb, nlh); 2914 return nlmsg_end(skb, nlh);
2913 2915
2914 nla_put_failure: 2916 nla_put_failure:
2915 nlmsg_cancel(skb, nlh); 2917 nlmsg_cancel(skb, nlh);
2916 return -EMSGSIZE; 2918 return -EMSGSIZE;
2917 } 2919 }
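
rt_fill_info() is built around the NLA_PUT_*() macros, which append one netlink attribute and jump to nla_put_failure when the skb runs out of tailroom, at which point the half-built message is cancelled. The toy below reproduces that discipline over a fixed buffer; the 16-bit len/type header matches struct nlattr's layout, but everything else (struct buf, put_u32) is invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf { unsigned char data[64]; size_t len; };

/* Append one u32 TLV attribute, failing when the buffer would overflow;
 * len includes the 4-byte header, as in struct nlattr. */
static int put_u32(struct buf *b, uint16_t type, uint32_t value)
{
        if (b->len + 8 > sizeof(b->data))
                return -1;              /* caller does goto nla_put_failure */
        uint16_t len = 8;
        memcpy(b->data + b->len, &len, 2);
        memcpy(b->data + b->len + 2, &type, 2);
        memcpy(b->data + b->len + 4, &value, 4);
        b->len += 8;
        return 0;
}

int main(void)
{
        struct buf b = { .len = 0 };

        if (put_u32(&b, 1 /* RTA_DST-ish */, 0x0a000001) < 0)
                goto nla_put_failure;
        printf("msg len=%zu\n", b.len);
        return 0;

nla_put_failure:
        fprintf(stderr, "message cancelled\n");
        return 1;
}
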
2918 2920
2919 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2921 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2920 { 2922 {
2921 struct net *net = sock_net(in_skb->sk); 2923 struct net *net = sock_net(in_skb->sk);
2922 struct rtmsg *rtm; 2924 struct rtmsg *rtm;
2923 struct nlattr *tb[RTA_MAX+1]; 2925 struct nlattr *tb[RTA_MAX+1];
2924 struct rtable *rt = NULL; 2926 struct rtable *rt = NULL;
2925 __be32 dst = 0; 2927 __be32 dst = 0;
2926 __be32 src = 0; 2928 __be32 src = 0;
2927 u32 iif; 2929 u32 iif;
2928 int err; 2930 int err;
2929 int mark; 2931 int mark;
2930 struct sk_buff *skb; 2932 struct sk_buff *skb;
2931 2933
2932 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); 2934 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2933 if (err < 0) 2935 if (err < 0)
2934 goto errout; 2936 goto errout;
2935 2937
2936 rtm = nlmsg_data(nlh); 2938 rtm = nlmsg_data(nlh);
2937 2939
2938 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2940 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2939 if (skb == NULL) { 2941 if (skb == NULL) {
2940 err = -ENOBUFS; 2942 err = -ENOBUFS;
2941 goto errout; 2943 goto errout;
2942 } 2944 }
2943 2945
2944 /* Reserve room for dummy headers; this skb can pass 2946 /* Reserve room for dummy headers; this skb can pass
2945 through a good chunk of the routing engine. 2947 through a good chunk of the routing engine.
2946 */ 2948 */
2947 skb_reset_mac_header(skb); 2949 skb_reset_mac_header(skb);
2948 skb_reset_network_header(skb); 2950 skb_reset_network_header(skb);
2949 2951
2950 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2952 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2951 ip_hdr(skb)->protocol = IPPROTO_ICMP; 2953 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2952 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2954 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2953 2955
2954 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; 2956 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2955 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; 2957 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2956 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2958 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2957 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2959 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2958 2960
2959 if (iif) { 2961 if (iif) {
2960 struct net_device *dev; 2962 struct net_device *dev;
2961 2963
2962 dev = __dev_get_by_index(net, iif); 2964 dev = __dev_get_by_index(net, iif);
2963 if (dev == NULL) { 2965 if (dev == NULL) {
2964 err = -ENODEV; 2966 err = -ENODEV;
2965 goto errout_free; 2967 goto errout_free;
2966 } 2968 }
2967 2969
2968 skb->protocol = htons(ETH_P_IP); 2970 skb->protocol = htons(ETH_P_IP);
2969 skb->dev = dev; 2971 skb->dev = dev;
2970 skb->mark = mark; 2972 skb->mark = mark;
2971 local_bh_disable(); 2973 local_bh_disable();
2972 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2974 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2973 local_bh_enable(); 2975 local_bh_enable();
2974 2976
2975 rt = skb_rtable(skb); 2977 rt = skb_rtable(skb);
2976 if (err == 0 && rt->dst.error) 2978 if (err == 0 && rt->dst.error)
2977 err = -rt->dst.error; 2979 err = -rt->dst.error;
2978 } else { 2980 } else {
2979 struct flowi4 fl4 = { 2981 struct flowi4 fl4 = {
2980 .daddr = dst, 2982 .daddr = dst,
2981 .saddr = src, 2983 .saddr = src,
2982 .flowi4_tos = rtm->rtm_tos, 2984 .flowi4_tos = rtm->rtm_tos,
2983 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2985 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2984 .flowi4_mark = mark, 2986 .flowi4_mark = mark,
2985 }; 2987 };
2986 rt = ip_route_output_key(net, &fl4); 2988 rt = ip_route_output_key(net, &fl4);
2987 2989
2988 err = 0; 2990 err = 0;
2989 if (IS_ERR(rt)) 2991 if (IS_ERR(rt))
2990 err = PTR_ERR(rt); 2992 err = PTR_ERR(rt);
2991 } 2993 }
2992 2994
2993 if (err) 2995 if (err)
2994 goto errout_free; 2996 goto errout_free;
2995 2997
2996 skb_dst_set(skb, &rt->dst); 2998 skb_dst_set(skb, &rt->dst);
2997 if (rtm->rtm_flags & RTM_F_NOTIFY) 2999 if (rtm->rtm_flags & RTM_F_NOTIFY)
2998 rt->rt_flags |= RTCF_NOTIFY; 3000 rt->rt_flags |= RTCF_NOTIFY;
2999 3001
3000 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 3002 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3001 RTM_NEWROUTE, 0, 0); 3003 RTM_NEWROUTE, 0, 0);
3002 if (err <= 0) 3004 if (err <= 0)
3003 goto errout_free; 3005 goto errout_free;
3004 3006
3005 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 3007 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3006 errout: 3008 errout:
3007 return err; 3009 return err;
3008 3010
3009 errout_free: 3011 errout_free:
3010 kfree_skb(skb); 3012 kfree_skb(skb);
3011 goto errout; 3013 goto errout;
3012 } 3014 }
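
inet_rtm_getroute() is the kernel half of `ip route get`. A minimal userspace counterpart that exercises it, sending one RTM_GETROUTE with a single RTA_DST attribute and merely reporting the size of the RTM_NEWROUTE reply (real code would parse the attributes and check for NLMSG_ERROR; 8.8.8.8 is an arbitrary example destination):

#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                struct rtattr   rta;
                uint32_t        dst;
        } req;
        char reply[4096];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return 1;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len   = sizeof(req);
        req.nlh.nlmsg_type  = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;
        req.rtm.rtm_dst_len = 32;
        req.rta.rta_type    = RTA_DST;
        req.rta.rta_len     = RTA_LENGTH(sizeof(req.dst));
        inet_pton(AF_INET, "8.8.8.8", &req.dst);

        if (send(fd, &req, sizeof(req), 0) < 0)
                return 1;
        printf("reply: %zd bytes\n", recv(fd, reply, sizeof(reply), 0));
        close(fd);
        return 0;
}
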
3013 3015
3014 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 3016 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3015 { 3017 {
3016 struct rtable *rt; 3018 struct rtable *rt;
3017 int h, s_h; 3019 int h, s_h;
3018 int idx, s_idx; 3020 int idx, s_idx;
3019 struct net *net; 3021 struct net *net;
3020 3022
3021 net = sock_net(skb->sk); 3023 net = sock_net(skb->sk);
3022 3024
3023 s_h = cb->args[0]; 3025 s_h = cb->args[0];
3024 if (s_h < 0) 3026 if (s_h < 0)
3025 s_h = 0; 3027 s_h = 0;
3026 s_idx = idx = cb->args[1]; 3028 s_idx = idx = cb->args[1];
3027 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { 3029 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3028 if (!rt_hash_table[h].chain) 3030 if (!rt_hash_table[h].chain)
3029 continue; 3031 continue;
3030 rcu_read_lock_bh(); 3032 rcu_read_lock_bh();
3031 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; 3033 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3032 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { 3034 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3033 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) 3035 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3034 continue; 3036 continue;
3035 if (rt_is_expired(rt)) 3037 if (rt_is_expired(rt))
3036 continue; 3038 continue;
3037 skb_dst_set_noref(skb, &rt->dst); 3039 skb_dst_set_noref(skb, &rt->dst);
3038 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 3040 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3039 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 3041 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3040 1, NLM_F_MULTI) <= 0) { 3042 1, NLM_F_MULTI) <= 0) {
3041 skb_dst_drop(skb); 3043 skb_dst_drop(skb);
3042 rcu_read_unlock_bh(); 3044 rcu_read_unlock_bh();
3043 goto done; 3045 goto done;
3044 } 3046 }
3045 skb_dst_drop(skb); 3047 skb_dst_drop(skb);
3046 } 3048 }
3047 rcu_read_unlock_bh(); 3049 rcu_read_unlock_bh();
3048 } 3050 }
3049 3051
3050 done: 3052 done:
3051 cb->args[0] = h; 3053 cb->args[0] = h;
3052 cb->args[1] = idx; 3054 cb->args[1] = idx;
3053 return skb->len; 3055 return skb->len;
3054 } 3056 }
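
ip_rt_dump() is restartable: when rt_fill_info() fails for lack of room, the function records the current hash chain and index in cb->args[] and returns, and the next netlink dump callback resumes from exactly that entry. A toy of the two-level cursor, with printf standing in for emitting one route and a fixed budget standing in for the skb filling up:

#include <stdio.h>

#define CHAINS 4
#define PER    3

/* Resume from (args[0], args[1]), stop when the budget is spent, and
 * record where to pick up next time; args[] plays cb->args. */
static int dump(long args[2], int budget)
{
        int emitted = 0;
        int h = args[0], idx = args[1];

        for (; h < CHAINS; h++, idx = 0) {
                for (; idx < PER; idx++) {
                        if (emitted == budget)
                                goto done;      /* "skb" full: stop here */
                        printf("entry %d.%d\n", h, idx);
                        emitted++;
                }
        }
done:
        args[0] = h;
        args[1] = idx;
        return emitted;
}

int main(void)
{
        long args[2] = { 0, 0 };

        while (dump(args, 4) > 0)       /* each call is one recvmsg() worth */
                printf("-- resume at %ld.%ld --\n", args[0], args[1]);
        return 0;
}
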
3055 3057
3056 void ip_rt_multicast_event(struct in_device *in_dev) 3058 void ip_rt_multicast_event(struct in_device *in_dev)
3057 { 3059 {
3058 rt_cache_flush(dev_net(in_dev->dev), 0); 3060 rt_cache_flush(dev_net(in_dev->dev), 0);
3059 } 3061 }
3060 3062
3061 #ifdef CONFIG_SYSCTL 3063 #ifdef CONFIG_SYSCTL
3062 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, 3064 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3063 void __user *buffer, 3065 void __user *buffer,
3064 size_t *lenp, loff_t *ppos) 3066 size_t *lenp, loff_t *ppos)
3065 { 3067 {
3066 if (write) { 3068 if (write) {
3067 int flush_delay; 3069 int flush_delay;
3068 ctl_table ctl; 3070 ctl_table ctl;
3069 struct net *net; 3071 struct net *net;
3070 3072
3071 memcpy(&ctl, __ctl, sizeof(ctl)); 3073 memcpy(&ctl, __ctl, sizeof(ctl));
3072 ctl.data = &flush_delay; 3074 ctl.data = &flush_delay;
3073 proc_dointvec(&ctl, write, buffer, lenp, ppos); 3075 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3074 3076
3075 net = (struct net *)__ctl->extra1; 3077 net = (struct net *)__ctl->extra1;
3076 rt_cache_flush(net, flush_delay); 3078 rt_cache_flush(net, flush_delay);
3077 return 0; 3079 return 0;
3078 } 3080 }
3079 3081
3080 return -EINVAL; 3082 return -EINVAL;
3081 } 3083 }
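
ipv4_sysctl_rtcache_flush() copies the ctl_table so that proc_dointvec() parses the user's write into a stack-local flush_delay rather than into shared data, then flushes the per-namespace cache. From userspace the handler is conventionally reached through /proc/sys/net/ipv4/route/flush (the table entry wiring it up is outside this hunk, so treat the path as an assumption); for example:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write a flush delay (seconds) to the route-cache flush sysctl;
 * requires root. */
int main(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "0\n", 2) != 2)   /* 0 = flush immediately */
                perror("write");
        close(fd);
        return 0;
}
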
3082 3084
3083 static ctl_table ipv4_route_table[] = { 3085 static ctl_table ipv4_route_table[] = {
3084 { 3086 {
3085 .procname = "gc_thresh", 3087 .procname = "gc_thresh",
3086 .data = &ipv4_dst_ops.gc_thresh, 3088 .data = &ipv4_dst_ops.gc_thresh,
3087 .maxlen = sizeof(int), 3089 .maxlen = sizeof(int),
3088 .mode = 0644, 3090 .mode = 0644,
3089 .proc_handler = proc_dointvec, 3091 .proc_handler = proc_dointvec,
3090 }, 3092 },
3091 { 3093 {
3092 .procname = "max_size", 3094 .procname = "max_size",
3093 .data = &ip_rt_max_size, 3095 .data = &ip_rt_max_size,
3094 .maxlen = sizeof(int), 3096 .maxlen = sizeof(int),
3095 .mode = 0644, 3097 .mode = 0644,
3096 .proc_handler = proc_dointvec, 3098 .proc_handler = proc_dointvec,
3097 }, 3099 },
3098 { 3100 {
3099 /* Deprecated. Use gc_min_interval_ms */ 3101 /* Deprecated. Use gc_min_interval_ms */
3100 3102
3101 .procname = "gc_min_interval", 3103 .procname = "gc_min_interval",
3102 .data = &ip_rt_gc_min_interval, 3104 .data = &ip_rt_gc_min_interval,
3103 .maxlen = sizeof(int), 3105 .maxlen = sizeof(int),
3104 .mode = 0644, 3106 .mode = 0644,
3105 .proc_handler = proc_dointvec_jiffies, 3107 .proc_handler = proc_dointvec_jiffies,
3106 }, 3108 },
3107 { 3109 {
3108 .procname = "gc_min_interval_ms", 3110 .procname = "gc_min_interval_ms",
3109 .data = &ip_rt_gc_min_interval, 3111 .data = &ip_rt_gc_min_interval,
3110 .maxlen = sizeof(int), 3112 .maxlen = sizeof(int),
3111 .mode = 0644, 3113 .mode = 0644,
3112 .proc_handler = proc_dointvec_ms_jiffies, 3114 .proc_handler = proc_dointvec_ms_jiffies,
3113 }, 3115 },
3114 { 3116 {
3115 .procname = "gc_timeout", 3117 .procname = "gc_timeout",
3116 .data = &ip_rt_gc_timeout, 3118 .data = &ip_rt_gc_timeout,
3117 .maxlen = sizeof(int), 3119 .maxlen = sizeof(int),
3118 .mode = 0644, 3120 .mode = 0644,
3119 .proc_handler = proc_dointvec_jiffies, 3121 .proc_handler = proc_dointvec_jiffies,
3120 }, 3122 },
3121 { 3123 {
3122 .procname = "gc_interval", 3124 .procname = "gc_interval",
3123 .data = &ip_rt_gc_interval, 3125 .data = &ip_rt_gc_interval,
3124 .maxlen = sizeof(int), 3126 .maxlen = sizeof(int),
3125 .mode = 0644, 3127 .mode = 0644,
3126 .proc_handler = proc_dointvec_jiffies, 3128 .proc_handler = proc_dointvec_jiffies,
3127 }, 3129 },
3128 { 3130 {
3129 .procname = "redirect_load", 3131 .procname = "redirect_load",
3130 .data = &ip_rt_redirect_load, 3132 .data = &ip_rt_redirect_load,
3131 .maxlen = sizeof(int), 3133 .maxlen = sizeof(int),
3132 .mode = 0644, 3134 .mode = 0644,
3133 .proc_handler = proc_dointvec, 3135 .proc_handler = proc_dointvec,
3134 }, 3136 },
3135 { 3137 {
3136 .procname = "redirect_number", 3138 .procname = "redirect_number",
3137 .data = &ip_rt_redirect_number, 3139 .data = &ip_rt_redirect_number,
3138 .maxlen = sizeof(int), 3140 .maxlen = sizeof(int),
3139 .mode = 0644, 3141 .mode = 0644,
3140 .proc_handler = proc_dointvec, 3142 .proc_handler = proc_dointvec,
3141 }, 3143 },
3142 { 3144 {
3143 .procname = "redirect_silence", 3145 .procname = "redirect_silence",
3144 .data = &ip_rt_redirect_silence, 3146 .data = &ip_rt_redirect_silence,
3145 .maxlen = sizeof(int), 3147 .maxlen = sizeof(int),
3146 .mode = 0644, 3148 .mode = 0644,
3147 .proc_handler = proc_dointvec, 3149 .proc_handler = proc_dointvec,
3148 }, 3150 },
3149 { 3151 {
3150 .procname = "error_cost", 3152 .procname = "error_cost",
3151 .data = &ip_rt_error_cost, 3153 .data = &ip_rt_error_cost,
3152 .maxlen = sizeof(int), 3154 .maxlen = sizeof(int),
3153 .mode = 0644, 3155 .mode = 0644,
3154 .proc_handler = proc_dointvec, 3156 .proc_handler = proc_dointvec,
3155 }, 3157 },
3156 { 3158 {
3157 .procname = "error_burst", 3159 .procname = "error_burst",
3158 .data = &ip_rt_error_burst, 3160 .data = &ip_rt_error_burst,
3159 .maxlen = sizeof(int), 3161 .maxlen = sizeof(int),
3160 .mode = 0644, 3162 .mode = 0644,
3161 .proc_handler = proc_dointvec, 3163 .proc_handler = proc_dointvec,
3162 }, 3164 },
3163 { 3165 {
3164 .procname = "gc_elasticity", 3166 .procname = "gc_elasticity",
3165 .data = &ip_rt_gc_elasticity, 3167 .data = &ip_rt_gc_elasticity,
3166 .maxlen = sizeof(int), 3168 .maxlen = sizeof(int),
3167 .mode = 0644, 3169 .mode = 0644,
3168 .proc_handler = proc_dointvec, 3170 .proc_handler = proc_dointvec,
3169 }, 3171 },
3170 { 3172 {
3171 .procname = "mtu_expires", 3173 .procname = "mtu_expires",
3172 .data = &ip_rt_mtu_expires, 3174 .data = &ip_rt_mtu_expires,
3173 .maxlen = sizeof(int), 3175 .maxlen = sizeof(int),
3174 .mode = 0644, 3176 .mode = 0644,
3175 .proc_handler = proc_dointvec_jiffies, 3177 .proc_handler = proc_dointvec_jiffies,
3176 }, 3178 },
3177 { 3179 {
3178 .procname = "min_pmtu", 3180 .procname = "min_pmtu",
3179 .data = &ip_rt_min_pmtu, 3181 .data = &ip_rt_min_pmtu,
3180 .maxlen = sizeof(int), 3182 .maxlen = sizeof(int),
3181 .mode = 0644, 3183 .mode = 0644,
3182 .proc_handler = proc_dointvec, 3184 .proc_handler = proc_dointvec,
3183 }, 3185 },
3184 { 3186 {
3185 .procname = "min_adv_mss", 3187 .procname = "min_adv_mss",
3186 .data = &ip_rt_min_advmss, 3188 .data = &ip_rt_min_advmss,
3187 .maxlen = sizeof(int), 3189 .maxlen = sizeof(int),
3188 .mode = 0644, 3190 .mode = 0644,
3189 .proc_handler = proc_dointvec, 3191 .proc_handler = proc_dointvec,
3190 }, 3192 },
3191 { } 3193 { }
3192 }; 3194 };
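
Two details are worth noting in this table. First, gc_min_interval is kept only for compatibility and shares its backing variable (ip_rt_gc_min_interval) with gc_min_interval_ms; the entries differ only in their handler, proc_dointvec_jiffies (seconds to jiffies) versus proc_dointvec_ms_jiffies (milliseconds to jiffies), so writing either knob updates the same jiffies value. Second, every entry is a plain int parsed by a proc_dointvec variant, so no range clamping happens here.
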
3193 3195
3194 static struct ctl_table empty[1]; 3196 static struct ctl_table empty[1];
3195 3197
3196 static struct ctl_table ipv4_skeleton[] = 3198 static struct ctl_table ipv4_skeleton[] =
3197 { 3199 {
3198 { .procname = "route", 3200 { .procname = "route",
3199 .mode = 0555, .child = ipv4_route_table}, 3201 .mode = 0555, .child = ipv4_route_table},
3200 { .procname = "neigh", 3202 { .procname = "neigh",
3201 .mode = 0555, .child = empty}, 3203 .mode = 0555, .child = empty},
3202 { } 3204 { }
3203 }; 3205 };
3204 3206
3205 static __net_initdata struct ctl_path ipv4_path[] = { 3207 static __net_initdata struct ctl_path ipv4_path[] = {
3206 { .procname = "net", }, 3208 { .procname = "net", },
3207 { .procname = "ipv4", }, 3209 { .procname = "ipv4", },
3208 { }, 3210 { },
3209 }; 3211 };
3210 3212
3211 static struct ctl_table ipv4_route_flush_table[] = { 3213 static struct ctl_table ipv4_route_flush_table[] = {
3212 { 3214 {
3213 .procname = "flush", 3215 .procname = "flush",
3214 .maxlen = sizeof(int), 3216 .maxlen = sizeof(int),
3215 .mode = 0200, 3217 .mode = 0200,
3216 .proc_handler = ipv4_sysctl_rtcache_flush, 3218 .proc_handler = ipv4_sysctl_rtcache_flush,
3217 }, 3219 },
3218 { }, 3220 { },
3219 }; 3221 };
3220 3222
3221 static __net_initdata struct ctl_path ipv4_route_path[] = { 3223 static __net_initdata struct ctl_path ipv4_route_path[] = {
3222 { .procname = "net", }, 3224 { .procname = "net", },
3223 { .procname = "ipv4", }, 3225 { .procname = "ipv4", },
3224 { .procname = "route", }, 3226 { .procname = "route", },
3225 { }, 3227 { },
3226 }; 3228 };
3227 3229
3228 static __net_init int sysctl_route_net_init(struct net *net) 3230 static __net_init int sysctl_route_net_init(struct net *net)
3229 { 3231 {
3230 struct ctl_table *tbl; 3232 struct ctl_table *tbl;
3231 3233
3232 tbl = ipv4_route_flush_table; 3234 tbl = ipv4_route_flush_table;
3233 if (!net_eq(net, &init_net)) { 3235 if (!net_eq(net, &init_net)) {
3234 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3236 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3235 if (tbl == NULL) 3237 if (tbl == NULL)
3236 goto err_dup; 3238 goto err_dup;
3237 } 3239 }
3238 tbl[0].extra1 = net; 3240 tbl[0].extra1 = net;
3239 3241
3240 net->ipv4.route_hdr = 3242 net->ipv4.route_hdr =
3241 register_net_sysctl_table(net, ipv4_route_path, tbl); 3243 register_net_sysctl_table(net, ipv4_route_path, tbl);
3242 if (net->ipv4.route_hdr == NULL) 3244 if (net->ipv4.route_hdr == NULL)
3243 goto err_reg; 3245 goto err_reg;
3244 return 0; 3246 return 0;
3245 3247
3246 err_reg: 3248 err_reg:
3247 if (tbl != ipv4_route_flush_table) 3249 if (tbl != ipv4_route_flush_table)
3248 kfree(tbl); 3250 kfree(tbl);
3249 err_dup: 3251 err_dup:
3250 return -ENOMEM; 3252 return -ENOMEM;
3251 } 3253 }
3252 3254
3253 static __net_exit void sysctl_route_net_exit(struct net *net) 3255 static __net_exit void sysctl_route_net_exit(struct net *net)
3254 { 3256 {
3255 struct ctl_table *tbl; 3257 struct ctl_table *tbl;
3256 3258
3257 tbl = net->ipv4.route_hdr->ctl_table_arg; 3259 tbl = net->ipv4.route_hdr->ctl_table_arg;
3258 unregister_net_sysctl_table(net->ipv4.route_hdr); 3260 unregister_net_sysctl_table(net->ipv4.route_hdr);
3259 BUG_ON(tbl == ipv4_route_flush_table); 3261 BUG_ON(tbl == ipv4_route_flush_table);
3260 kfree(tbl); 3262 kfree(tbl);
3261 } 3263 }
3262 3264
3263 static __net_initdata struct pernet_operations sysctl_route_ops = { 3265 static __net_initdata struct pernet_operations sysctl_route_ops = {
3264 .init = sysctl_route_net_init, 3266 .init = sysctl_route_net_init,
3265 .exit = sysctl_route_net_exit, 3267 .exit = sysctl_route_net_exit,
3266 }; 3268 };
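
The init/exit pair implements the usual per-namespace sysctl pattern: init_net uses the static ipv4_route_flush_table directly, while every other namespace gets a kmemdup() copy so its own struct net can be stowed in extra1 (read back by ipv4_sysctl_rtcache_flush() above). Since init_net is never torn down, the exit path asserts via BUG_ON() that it only ever frees a duplicated table. A condensed sketch of the same pattern, with made-up names:

	/* Sketch (hypothetical names): per-namespace sysctl registration,
	 * mirroring sysctl_route_net_init() above.
	 */
	#include <linux/slab.h>
	#include <linux/sysctl.h>
	#include <net/net_namespace.h>

	static int demo_value;

	static struct ctl_table demo_table[] = {
		{
			.procname	= "demo",
			.data		= &demo_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ },
	};

	static __net_init int demo_net_init(struct net *net)
	{
		struct ctl_table *tbl = demo_table;

		if (!net_eq(net, &init_net)) {
			tbl = kmemdup(demo_table, sizeof(demo_table), GFP_KERNEL);
			if (!tbl)
				return -ENOMEM;
		}
		tbl[0].extra1 = net;	/* per-namespace back-pointer */
		/* ...register with register_net_sysctl_table() as above... */
		return 0;
	}
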
3267 #endif 3269 #endif
3268 3270
3269 static __net_init int rt_genid_init(struct net *net) 3271 static __net_init int rt_genid_init(struct net *net)
3270 { 3272 {
3271 get_random_bytes(&net->ipv4.rt_genid, 3273 get_random_bytes(&net->ipv4.rt_genid,
3272 sizeof(net->ipv4.rt_genid)); 3274 sizeof(net->ipv4.rt_genid));
3273 get_random_bytes(&net->ipv4.dev_addr_genid, 3275 get_random_bytes(&net->ipv4.dev_addr_genid,
3274 sizeof(net->ipv4.dev_addr_genid)); 3276 sizeof(net->ipv4.dev_addr_genid));
3275 return 0; 3277 return 0;
3276 } 3278 }
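
Seeding rt_genid with random bytes gives each namespace an unpredictable starting generation; cached routes record the genid in force when they were created, so bumping the namespace genid invalidates the whole cache in O(1) instead of walking the hash table. A sketch of the consumer side, assuming the rt_is_expired() helper defined earlier in this file:

	/* Entries whose recorded genid no longer matches the namespace
	 * genid are stale and are skipped or reaped lazily.
	 */
	static inline int rt_is_expired(struct rtable *rth)
	{
		return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
	}
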
3277 3279
3278 static __net_initdata struct pernet_operations rt_genid_ops = { 3280 static __net_initdata struct pernet_operations rt_genid_ops = {
3279 .init = rt_genid_init, 3281 .init = rt_genid_init,
3280 }; 3282 };
3281 3283
3282 3284
3283 #ifdef CONFIG_IP_ROUTE_CLASSID 3285 #ifdef CONFIG_IP_ROUTE_CLASSID
3284 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3286 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3285 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3287 #endif /* CONFIG_IP_ROUTE_CLASSID */
3286 3288
3287 static __initdata unsigned long rhash_entries; 3289 static __initdata unsigned long rhash_entries;
3288 static int __init set_rhash_entries(char *str) 3290 static int __init set_rhash_entries(char *str)
3289 { 3291 {
3290 if (!str) 3292 if (!str)
3291 return 0; 3293 return 0;
3292 rhash_entries = simple_strtoul(str, &str, 0); 3294 rhash_entries = simple_strtoul(str, &str, 0);
3293 return 1; 3295 return 1;
3294 } 3296 }
3295 __setup("rhash_entries=", set_rhash_entries); 3297 __setup("rhash_entries=", set_rhash_entries);
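
The rhash_entries= kernel parameter overrides the memory-based sizing done in ip_rt_init() below; for example, booting with rhash_entries=262144 requests 256K hash buckets (alloc_large_system_hash() rounds to a power of two). Because the value goes through simple_strtoul(), a malformed string silently parses as 0 and falls back to automatic sizing.
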
3296 3298
3297 int __init ip_rt_init(void) 3299 int __init ip_rt_init(void)
3298 { 3300 {
3299 int rc = 0; 3301 int rc = 0;
3300 3302
3301 #ifdef CONFIG_IP_ROUTE_CLASSID 3303 #ifdef CONFIG_IP_ROUTE_CLASSID
3302 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3304 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3303 if (!ip_rt_acct) 3305 if (!ip_rt_acct)
3304 panic("IP: failed to allocate ip_rt_acct\n"); 3306 panic("IP: failed to allocate ip_rt_acct\n");
3305 #endif 3307 #endif
3306 3308
3307 ipv4_dst_ops.kmem_cachep = 3309 ipv4_dst_ops.kmem_cachep =
3308 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3310 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3309 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3311 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3310 3312
3311 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3313 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3312 3314
3313 if (dst_entries_init(&ipv4_dst_ops) < 0) 3315 if (dst_entries_init(&ipv4_dst_ops) < 0)
3314 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3316 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3315 3317
3316 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3318 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3317 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3319 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3318 3320
3319 rt_hash_table = (struct rt_hash_bucket *) 3321 rt_hash_table = (struct rt_hash_bucket *)
3320 alloc_large_system_hash("IP route cache", 3322 alloc_large_system_hash("IP route cache",
3321 sizeof(struct rt_hash_bucket), 3323 sizeof(struct rt_hash_bucket),
3322 rhash_entries, 3324 rhash_entries,
3323 (totalram_pages >= 128 * 1024) ? 3325 (totalram_pages >= 128 * 1024) ?
3324 15 : 17, 3326 15 : 17,
3325 0, 3327 0,
3326 &rt_hash_log, 3328 &rt_hash_log,
3327 &rt_hash_mask, 3329 &rt_hash_mask,
3328 rhash_entries ? 0 : 512 * 1024); 3330 rhash_entries ? 0 : 512 * 1024);
3329 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); 3331 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3330 rt_hash_lock_init(); 3332 rt_hash_lock_init();
3331 3333
3332 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3334 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3333 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3335 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3334 3336
3335 devinet_init(); 3337 devinet_init();
3336 ip_fib_init(); 3338 ip_fib_init();
3337 3339
3338 if (ip_rt_proc_init()) 3340 if (ip_rt_proc_init())
3339 printk(KERN_ERR "Unable to create route proc files\n"); 3341 printk(KERN_ERR "Unable to create route proc files\n");
3340 #ifdef CONFIG_XFRM 3342 #ifdef CONFIG_XFRM
3341 xfrm_init(); 3343 xfrm_init();
3342 xfrm4_init(ip_rt_max_size); 3344 xfrm4_init(ip_rt_max_size);
3343 #endif 3345 #endif
3344 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); 3346 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3345 3347
3346 #ifdef CONFIG_SYSCTL 3348 #ifdef CONFIG_SYSCTL
3347 register_pernet_subsys(&sysctl_route_ops); 3349 register_pernet_subsys(&sysctl_route_ops);
3348 #endif 3350 #endif
3349 register_pernet_subsys(&rt_genid_ops); 3351 register_pernet_subsys(&rt_genid_ops);
3350 return rc; 3352 return rc;
3351 } 3353 }
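
The hash sizing deserves a gloss: alloc_large_system_hash() scales the bucket count with physical memory, using scale 15 (roughly one bucket per 32 KB of RAM) on machines with at least 128K pages (512 MB with 4 KB pages) and 17 (one per 128 KB) below that, capped at 512K buckets unless rhash_entries was given on the command line. On a 1 GB box with 4 KB pages that works out to about 2^30 / 2^15 = 32768 buckets, hence rt_hash_mask = 32767, gc_thresh = 32768 and ip_rt_max_size = 524288, i.e. an average of 16 cached routes per bucket before the cache refuses to grow.
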
3352 3354
3353 #ifdef CONFIG_SYSCTL 3355 #ifdef CONFIG_SYSCTL
3354 /* 3356 /*
3355 * We really need to sanitize the damn ipv4 init order, then all 3357 * We really need to sanitize the damn ipv4 init order, then all
3356 * this nonsense will go away. 3358 * this nonsense will go away.
3357 */ 3359 */
3358 void __init ip_static_sysctl_init(void) 3360 void __init ip_static_sysctl_init(void)
3359 { 3361 {
3360 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3362 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3361 } 3363 }
3362 #endif 3364 #endif
1 /* 1 /*
2 * IPv6 Address [auto]configuration 2 * IPv6 Address [auto]configuration
3 * Linux INET6 implementation 3 * Linux INET6 implementation
4 * 4 *
5 * Authors: 5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt> 6 * Pedro Roque <roque@di.fc.ul.pt>
7 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 7 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
8 * 8 *
9 * This program is free software; you can redistribute it and/or 9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License 10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version. 12 * 2 of the License, or (at your option) any later version.
13 */ 13 */
14 14
15 /* 15 /*
16 * Changes: 16 * Changes:
17 * 17 *
18 * Janos Farkas : delete timer on ifdown 18 * Janos Farkas : delete timer on ifdown
19 * <chexum@bankinf.banki.hu> 19 * <chexum@bankinf.banki.hu>
20 * Andi Kleen : kill double kfree on module 20 * Andi Kleen : kill double kfree on module
21 * unload. 21 * unload.
22 * Maciej W. Rozycki : FDDI support 22 * Maciej W. Rozycki : FDDI support
23 * sekiya@USAGI : Don't send too many RS 23 * sekiya@USAGI : Don't send too many RS
24 * packets. 24 * packets.
25 * yoshfuji@USAGI : Fixed interval between DAD 25 * yoshfuji@USAGI : Fixed interval between DAD
26 * packets. 26 * packets.
27 * YOSHIFUJI Hideaki @USAGI : improved accuracy of 27 * YOSHIFUJI Hideaki @USAGI : improved accuracy of
28 * address validation timer. 28 * address validation timer.
29 * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) 29 * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041)
30 * support. 30 * support.
31 * Yuji SEKIYA @USAGI : Don't assign a same IPv6 31 * Yuji SEKIYA @USAGI : Don't assign a same IPv6
32 * address on a same interface. 32 * address on a same interface.
33 * YOSHIFUJI Hideaki @USAGI : ARCnet support 33 * YOSHIFUJI Hideaki @USAGI : ARCnet support
34 * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to 34 * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to
35 * seq_file. 35 * seq_file.
36 * YOSHIFUJI Hideaki @USAGI : improved source address 36 * YOSHIFUJI Hideaki @USAGI : improved source address
37 * selection; consider scope, 37 * selection; consider scope,
38 * status etc. 38 * status etc.
39 */ 39 */
40 40
41 #include <linux/errno.h> 41 #include <linux/errno.h>
42 #include <linux/types.h> 42 #include <linux/types.h>
43 #include <linux/kernel.h> 43 #include <linux/kernel.h>
44 #include <linux/socket.h> 44 #include <linux/socket.h>
45 #include <linux/sockios.h> 45 #include <linux/sockios.h>
46 #include <linux/net.h> 46 #include <linux/net.h>
47 #include <linux/in6.h> 47 #include <linux/in6.h>
48 #include <linux/netdevice.h> 48 #include <linux/netdevice.h>
49 #include <linux/if_addr.h> 49 #include <linux/if_addr.h>
50 #include <linux/if_arp.h> 50 #include <linux/if_arp.h>
51 #include <linux/if_arcnet.h> 51 #include <linux/if_arcnet.h>
52 #include <linux/if_infiniband.h> 52 #include <linux/if_infiniband.h>
53 #include <linux/route.h> 53 #include <linux/route.h>
54 #include <linux/inetdevice.h> 54 #include <linux/inetdevice.h>
55 #include <linux/init.h> 55 #include <linux/init.h>
56 #include <linux/slab.h> 56 #include <linux/slab.h>
57 #ifdef CONFIG_SYSCTL 57 #ifdef CONFIG_SYSCTL
58 #include <linux/sysctl.h> 58 #include <linux/sysctl.h>
59 #endif 59 #endif
60 #include <linux/capability.h> 60 #include <linux/capability.h>
61 #include <linux/delay.h> 61 #include <linux/delay.h>
62 #include <linux/notifier.h> 62 #include <linux/notifier.h>
63 #include <linux/string.h> 63 #include <linux/string.h>
64 64
65 #include <net/net_namespace.h> 65 #include <net/net_namespace.h>
66 #include <net/sock.h> 66 #include <net/sock.h>
67 #include <net/snmp.h> 67 #include <net/snmp.h>
68 68
69 #include <net/ipv6.h> 69 #include <net/ipv6.h>
70 #include <net/protocol.h> 70 #include <net/protocol.h>
71 #include <net/ndisc.h> 71 #include <net/ndisc.h>
72 #include <net/ip6_route.h> 72 #include <net/ip6_route.h>
73 #include <net/addrconf.h> 73 #include <net/addrconf.h>
74 #include <net/tcp.h> 74 #include <net/tcp.h>
75 #include <net/ip.h> 75 #include <net/ip.h>
76 #include <net/netlink.h> 76 #include <net/netlink.h>
77 #include <net/pkt_sched.h> 77 #include <net/pkt_sched.h>
78 #include <linux/if_tunnel.h> 78 #include <linux/if_tunnel.h>
79 #include <linux/rtnetlink.h> 79 #include <linux/rtnetlink.h>
80 80
81 #ifdef CONFIG_IPV6_PRIVACY 81 #ifdef CONFIG_IPV6_PRIVACY
82 #include <linux/random.h> 82 #include <linux/random.h>
83 #endif 83 #endif
84 84
85 #include <linux/uaccess.h> 85 #include <linux/uaccess.h>
86 #include <asm/unaligned.h> 86 #include <asm/unaligned.h>
87 87
88 #include <linux/proc_fs.h> 88 #include <linux/proc_fs.h>
89 #include <linux/seq_file.h> 89 #include <linux/seq_file.h>
90 90
91 /* Set to 3 to get tracing... */ 91 /* Set to 3 to get tracing... */
92 #define ACONF_DEBUG 2 92 #define ACONF_DEBUG 2
93 93
94 #if ACONF_DEBUG >= 3 94 #if ACONF_DEBUG >= 3
95 #define ADBG(x) printk x 95 #define ADBG(x) printk x
96 #else 96 #else
97 #define ADBG(x) 97 #define ADBG(x)
98 #endif 98 #endif
99 99
100 #define INFINITY_LIFE_TIME 0xFFFFFFFF 100 #define INFINITY_LIFE_TIME 0xFFFFFFFF
101 101
102 static inline u32 cstamp_delta(unsigned long cstamp) 102 static inline u32 cstamp_delta(unsigned long cstamp)
103 { 103 {
104 return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; 104 return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
105 } 105 }
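
cstamp_delta() converts a jiffies timestamp into hundredths of a second since boot, the unit reported in the ifa cacheinfo netlink attributes. For example, with HZ=1000 a timestamp 2500 jiffies past INITIAL_JIFFIES yields 2500 * 100 / 1000 = 250. Multiplying before dividing, in unsigned long via 100UL, preserves sub-HZ precision that (cstamp / HZ) * 100 would throw away.
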
106 106
107 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1) 107 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1)
108 #define ADDRCONF_TIMER_FUZZ (HZ / 4) 108 #define ADDRCONF_TIMER_FUZZ (HZ / 4)
109 #define ADDRCONF_TIMER_FUZZ_MAX (HZ) 109 #define ADDRCONF_TIMER_FUZZ_MAX (HZ)
110 110
111 #ifdef CONFIG_SYSCTL 111 #ifdef CONFIG_SYSCTL
112 static void addrconf_sysctl_register(struct inet6_dev *idev); 112 static void addrconf_sysctl_register(struct inet6_dev *idev);
113 static void addrconf_sysctl_unregister(struct inet6_dev *idev); 113 static void addrconf_sysctl_unregister(struct inet6_dev *idev);
114 #else 114 #else
115 static inline void addrconf_sysctl_register(struct inet6_dev *idev) 115 static inline void addrconf_sysctl_register(struct inet6_dev *idev)
116 { 116 {
117 } 117 }
118 118
119 static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) 119 static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
120 { 120 {
121 } 121 }
122 #endif 122 #endif
123 123
124 #ifdef CONFIG_IPV6_PRIVACY 124 #ifdef CONFIG_IPV6_PRIVACY
125 static int __ipv6_regen_rndid(struct inet6_dev *idev); 125 static int __ipv6_regen_rndid(struct inet6_dev *idev);
126 static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); 126 static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
127 static void ipv6_regen_rndid(unsigned long data); 127 static void ipv6_regen_rndid(unsigned long data);
128 #endif 128 #endif
129 129
130 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); 130 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
131 static int ipv6_count_addresses(struct inet6_dev *idev); 131 static int ipv6_count_addresses(struct inet6_dev *idev);
132 132
133 /* 133 /*
134 * Configured unicast address hash table 134 * Configured unicast address hash table
135 */ 135 */
136 static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; 136 static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
137 static DEFINE_SPINLOCK(addrconf_hash_lock); 137 static DEFINE_SPINLOCK(addrconf_hash_lock);
138 138
139 static void addrconf_verify(unsigned long); 139 static void addrconf_verify(unsigned long);
140 140
141 static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); 141 static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
142 static DEFINE_SPINLOCK(addrconf_verify_lock); 142 static DEFINE_SPINLOCK(addrconf_verify_lock);
143 143
144 static void addrconf_join_anycast(struct inet6_ifaddr *ifp); 144 static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
145 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); 145 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
146 146
147 static void addrconf_type_change(struct net_device *dev, 147 static void addrconf_type_change(struct net_device *dev,
148 unsigned long event); 148 unsigned long event);
149 static int addrconf_ifdown(struct net_device *dev, int how); 149 static int addrconf_ifdown(struct net_device *dev, int how);
150 150
151 static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); 151 static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
152 static void addrconf_dad_timer(unsigned long data); 152 static void addrconf_dad_timer(unsigned long data);
153 static void addrconf_dad_completed(struct inet6_ifaddr *ifp); 153 static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
154 static void addrconf_dad_run(struct inet6_dev *idev); 154 static void addrconf_dad_run(struct inet6_dev *idev);
155 static void addrconf_rs_timer(unsigned long data); 155 static void addrconf_rs_timer(unsigned long data);
156 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); 156 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
157 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); 157 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
158 158
159 static void inet6_prefix_notify(int event, struct inet6_dev *idev, 159 static void inet6_prefix_notify(int event, struct inet6_dev *idev,
160 struct prefix_info *pinfo); 160 struct prefix_info *pinfo);
161 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, 161 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
162 struct net_device *dev); 162 struct net_device *dev);
163 163
164 static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); 164 static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
165 165
166 static struct ipv6_devconf ipv6_devconf __read_mostly = { 166 static struct ipv6_devconf ipv6_devconf __read_mostly = {
167 .forwarding = 0, 167 .forwarding = 0,
168 .hop_limit = IPV6_DEFAULT_HOPLIMIT, 168 .hop_limit = IPV6_DEFAULT_HOPLIMIT,
169 .mtu6 = IPV6_MIN_MTU, 169 .mtu6 = IPV6_MIN_MTU,
170 .accept_ra = 1, 170 .accept_ra = 1,
171 .accept_redirects = 1, 171 .accept_redirects = 1,
172 .autoconf = 1, 172 .autoconf = 1,
173 .force_mld_version = 0, 173 .force_mld_version = 0,
174 .dad_transmits = 1, 174 .dad_transmits = 1,
175 .rtr_solicits = MAX_RTR_SOLICITATIONS, 175 .rtr_solicits = MAX_RTR_SOLICITATIONS,
176 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, 176 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
177 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, 177 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
178 #ifdef CONFIG_IPV6_PRIVACY 178 #ifdef CONFIG_IPV6_PRIVACY
179 .use_tempaddr = 0, 179 .use_tempaddr = 0,
180 .temp_valid_lft = TEMP_VALID_LIFETIME, 180 .temp_valid_lft = TEMP_VALID_LIFETIME,
181 .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, 181 .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
182 .regen_max_retry = REGEN_MAX_RETRY, 182 .regen_max_retry = REGEN_MAX_RETRY,
183 .max_desync_factor = MAX_DESYNC_FACTOR, 183 .max_desync_factor = MAX_DESYNC_FACTOR,
184 #endif 184 #endif
185 .max_addresses = IPV6_MAX_ADDRESSES, 185 .max_addresses = IPV6_MAX_ADDRESSES,
186 .accept_ra_defrtr = 1, 186 .accept_ra_defrtr = 1,
187 .accept_ra_pinfo = 1, 187 .accept_ra_pinfo = 1,
188 #ifdef CONFIG_IPV6_ROUTER_PREF 188 #ifdef CONFIG_IPV6_ROUTER_PREF
189 .accept_ra_rtr_pref = 1, 189 .accept_ra_rtr_pref = 1,
190 .rtr_probe_interval = 60 * HZ, 190 .rtr_probe_interval = 60 * HZ,
191 #ifdef CONFIG_IPV6_ROUTE_INFO 191 #ifdef CONFIG_IPV6_ROUTE_INFO
192 .accept_ra_rt_info_max_plen = 0, 192 .accept_ra_rt_info_max_plen = 0,
193 #endif 193 #endif
194 #endif 194 #endif
195 .proxy_ndp = 0, 195 .proxy_ndp = 0,
196 .accept_source_route = 0, /* we do not accept RH0 by default. */ 196 .accept_source_route = 0, /* we do not accept RH0 by default. */
197 .disable_ipv6 = 0, 197 .disable_ipv6 = 0,
198 .accept_dad = 1, 198 .accept_dad = 1,
199 }; 199 };
200 200
201 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { 201 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
202 .forwarding = 0, 202 .forwarding = 0,
203 .hop_limit = IPV6_DEFAULT_HOPLIMIT, 203 .hop_limit = IPV6_DEFAULT_HOPLIMIT,
204 .mtu6 = IPV6_MIN_MTU, 204 .mtu6 = IPV6_MIN_MTU,
205 .accept_ra = 1, 205 .accept_ra = 1,
206 .accept_redirects = 1, 206 .accept_redirects = 1,
207 .autoconf = 1, 207 .autoconf = 1,
208 .dad_transmits = 1, 208 .dad_transmits = 1,
209 .rtr_solicits = MAX_RTR_SOLICITATIONS, 209 .rtr_solicits = MAX_RTR_SOLICITATIONS,
210 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, 210 .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
211 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, 211 .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
212 #ifdef CONFIG_IPV6_PRIVACY 212 #ifdef CONFIG_IPV6_PRIVACY
213 .use_tempaddr = 0, 213 .use_tempaddr = 0,
214 .temp_valid_lft = TEMP_VALID_LIFETIME, 214 .temp_valid_lft = TEMP_VALID_LIFETIME,
215 .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, 215 .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
216 .regen_max_retry = REGEN_MAX_RETRY, 216 .regen_max_retry = REGEN_MAX_RETRY,
217 .max_desync_factor = MAX_DESYNC_FACTOR, 217 .max_desync_factor = MAX_DESYNC_FACTOR,
218 #endif 218 #endif
219 .max_addresses = IPV6_MAX_ADDRESSES, 219 .max_addresses = IPV6_MAX_ADDRESSES,
220 .accept_ra_defrtr = 1, 220 .accept_ra_defrtr = 1,
221 .accept_ra_pinfo = 1, 221 .accept_ra_pinfo = 1,
222 #ifdef CONFIG_IPV6_ROUTER_PREF 222 #ifdef CONFIG_IPV6_ROUTER_PREF
223 .accept_ra_rtr_pref = 1, 223 .accept_ra_rtr_pref = 1,
224 .rtr_probe_interval = 60 * HZ, 224 .rtr_probe_interval = 60 * HZ,
225 #ifdef CONFIG_IPV6_ROUTE_INFO 225 #ifdef CONFIG_IPV6_ROUTE_INFO
226 .accept_ra_rt_info_max_plen = 0, 226 .accept_ra_rt_info_max_plen = 0,
227 #endif 227 #endif
228 #endif 228 #endif
229 .proxy_ndp = 0, 229 .proxy_ndp = 0,
230 .accept_source_route = 0, /* we do not accept RH0 by default. */ 230 .accept_source_route = 0, /* we do not accept RH0 by default. */
231 .disable_ipv6 = 0, 231 .disable_ipv6 = 0,
232 .accept_dad = 1, 232 .accept_dad = 1,
233 }; 233 };
234 234
235 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ 235 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
236 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; 236 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
237 const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; 237 const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
238 const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; 238 const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
239 const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; 239 const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
240 240
241 /* Check if a valid qdisc is available */ 241 /* Check if a valid qdisc is available */
242 static inline bool addrconf_qdisc_ok(const struct net_device *dev) 242 static inline bool addrconf_qdisc_ok(const struct net_device *dev)
243 { 243 {
244 return !qdisc_tx_is_noop(dev); 244 return !qdisc_tx_is_noop(dev);
245 } 245 }
246 246
247 /* Check if a route is valid prefix route */ 247 /* Check if a route is valid prefix route */
248 static inline int addrconf_is_prefix_route(const struct rt6_info *rt) 248 static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
249 { 249 {
250 return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0; 250 return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0;
251 } 251 }
252 252
253 static void addrconf_del_timer(struct inet6_ifaddr *ifp) 253 static void addrconf_del_timer(struct inet6_ifaddr *ifp)
254 { 254 {
255 if (del_timer(&ifp->timer)) 255 if (del_timer(&ifp->timer))
256 __in6_ifa_put(ifp); 256 __in6_ifa_put(ifp);
257 } 257 }
258 258
259 enum addrconf_timer_t { 259 enum addrconf_timer_t {
260 AC_NONE, 260 AC_NONE,
261 AC_DAD, 261 AC_DAD,
262 AC_RS, 262 AC_RS,
263 }; 263 };
264 264
265 static void addrconf_mod_timer(struct inet6_ifaddr *ifp, 265 static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
266 enum addrconf_timer_t what, 266 enum addrconf_timer_t what,
267 unsigned long when) 267 unsigned long when)
268 { 268 {
269 if (!del_timer(&ifp->timer)) 269 if (!del_timer(&ifp->timer))
270 in6_ifa_hold(ifp); 270 in6_ifa_hold(ifp);
271 271
272 switch (what) { 272 switch (what) {
273 case AC_DAD: 273 case AC_DAD:
274 ifp->timer.function = addrconf_dad_timer; 274 ifp->timer.function = addrconf_dad_timer;
275 break; 275 break;
276 case AC_RS: 276 case AC_RS:
277 ifp->timer.function = addrconf_rs_timer; 277 ifp->timer.function = addrconf_rs_timer;
278 break; 278 break;
279 default: 279 default:
280 break; 280 break;
281 } 281 }
282 ifp->timer.expires = jiffies + when; 282 ifp->timer.expires = jiffies + when;
283 add_timer(&ifp->timer); 283 add_timer(&ifp->timer);
284 } 284 }
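
The del_timer() return value drives the reference counting here: if the timer was still pending, the reference taken when it was armed is simply recycled for the new expiry, and only if the timer was idle does the function take a fresh in6_ifa_hold(). addrconf_del_timer() above is the mirror image, dropping a reference only when it actually deactivated a pending timer, so the invariant is exactly one ifp reference per armed timer.
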
285 285
286 static int snmp6_alloc_dev(struct inet6_dev *idev) 286 static int snmp6_alloc_dev(struct inet6_dev *idev)
287 { 287 {
288 if (snmp_mib_init((void __percpu **)idev->stats.ipv6, 288 if (snmp_mib_init((void __percpu **)idev->stats.ipv6,
289 sizeof(struct ipstats_mib), 289 sizeof(struct ipstats_mib),
290 __alignof__(struct ipstats_mib)) < 0) 290 __alignof__(struct ipstats_mib)) < 0)
291 goto err_ip; 291 goto err_ip;
292 idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), 292 idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
293 GFP_KERNEL); 293 GFP_KERNEL);
294 if (!idev->stats.icmpv6dev) 294 if (!idev->stats.icmpv6dev)
295 goto err_icmp; 295 goto err_icmp;
296 idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), 296 idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
297 GFP_KERNEL); 297 GFP_KERNEL);
298 if (!idev->stats.icmpv6msgdev) 298 if (!idev->stats.icmpv6msgdev)
299 goto err_icmpmsg; 299 goto err_icmpmsg;
300 300
301 return 0; 301 return 0;
302 302
303 err_icmpmsg: 303 err_icmpmsg:
304 kfree(idev->stats.icmpv6dev); 304 kfree(idev->stats.icmpv6dev);
305 err_icmp: 305 err_icmp:
306 snmp_mib_free((void __percpu **)idev->stats.ipv6); 306 snmp_mib_free((void __percpu **)idev->stats.ipv6);
307 err_ip: 307 err_ip:
308 return -ENOMEM; 308 return -ENOMEM;
309 } 309 }
310 310
311 static void snmp6_free_dev(struct inet6_dev *idev) 311 static void snmp6_free_dev(struct inet6_dev *idev)
312 { 312 {
313 kfree(idev->stats.icmpv6msgdev); 313 kfree(idev->stats.icmpv6msgdev);
314 kfree(idev->stats.icmpv6dev); 314 kfree(idev->stats.icmpv6dev);
315 snmp_mib_free((void __percpu **)idev->stats.ipv6); 315 snmp_mib_free((void __percpu **)idev->stats.ipv6);
316 } 316 }
317 317
318 /* Nobody refers to this device, we may destroy it. */ 318 /* Nobody refers to this device, we may destroy it. */
319 319
320 void in6_dev_finish_destroy(struct inet6_dev *idev) 320 void in6_dev_finish_destroy(struct inet6_dev *idev)
321 { 321 {
322 struct net_device *dev = idev->dev; 322 struct net_device *dev = idev->dev;
323 323
324 WARN_ON(!list_empty(&idev->addr_list)); 324 WARN_ON(!list_empty(&idev->addr_list));
325 WARN_ON(idev->mc_list != NULL); 325 WARN_ON(idev->mc_list != NULL);
326 326
327 #ifdef NET_REFCNT_DEBUG 327 #ifdef NET_REFCNT_DEBUG
328 printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL"); 328 printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL");
329 #endif 329 #endif
330 dev_put(dev); 330 dev_put(dev);
331 if (!idev->dead) { 331 if (!idev->dead) {
332 pr_warning("Freeing alive inet6 device %p\n", idev); 332 pr_warning("Freeing alive inet6 device %p\n", idev);
333 return; 333 return;
334 } 334 }
335 snmp6_free_dev(idev); 335 snmp6_free_dev(idev);
336 kfree_rcu(idev, rcu); 336 kfree_rcu(idev, rcu);
337 } 337 }
338 338
339 EXPORT_SYMBOL(in6_dev_finish_destroy); 339 EXPORT_SYMBOL(in6_dev_finish_destroy);
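
kfree_rcu() defers the actual free until an RCU grace period has elapsed, so a reader that looked the device up under rcu_read_lock() may keep dereferencing it until it unlocks. This is the same guarantee the changelog leans on for neighbours: because they are already freed after one RCU grace period, annotating dst->_neighbour as __rcu adds no new free-path latency.
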
340 340
341 static struct inet6_dev * ipv6_add_dev(struct net_device *dev) 341 static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
342 { 342 {
343 struct inet6_dev *ndev; 343 struct inet6_dev *ndev;
344 344
345 ASSERT_RTNL(); 345 ASSERT_RTNL();
346 346
347 if (dev->mtu < IPV6_MIN_MTU) 347 if (dev->mtu < IPV6_MIN_MTU)
348 return NULL; 348 return NULL;
349 349
350 ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); 350 ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
351 351
352 if (ndev == NULL) 352 if (ndev == NULL)
353 return NULL; 353 return NULL;
354 354
355 rwlock_init(&ndev->lock); 355 rwlock_init(&ndev->lock);
356 ndev->dev = dev; 356 ndev->dev = dev;
357 INIT_LIST_HEAD(&ndev->addr_list); 357 INIT_LIST_HEAD(&ndev->addr_list);
358 358
359 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); 359 memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
360 ndev->cnf.mtu6 = dev->mtu; 360 ndev->cnf.mtu6 = dev->mtu;
361 ndev->cnf.sysctl = NULL; 361 ndev->cnf.sysctl = NULL;
362 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); 362 ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
363 if (ndev->nd_parms == NULL) { 363 if (ndev->nd_parms == NULL) {
364 kfree(ndev); 364 kfree(ndev);
365 return NULL; 365 return NULL;
366 } 366 }
367 if (ndev->cnf.forwarding) 367 if (ndev->cnf.forwarding)
368 dev_disable_lro(dev); 368 dev_disable_lro(dev);
369 /* We refer to the device */ 369 /* We refer to the device */
370 dev_hold(dev); 370 dev_hold(dev);
371 371
372 if (snmp6_alloc_dev(ndev) < 0) { 372 if (snmp6_alloc_dev(ndev) < 0) {
373 ADBG((KERN_WARNING 373 ADBG((KERN_WARNING
374 "%s(): cannot allocate memory for statistics; dev=%s.\n", 374 "%s(): cannot allocate memory for statistics; dev=%s.\n",
375 __func__, dev->name)); 375 __func__, dev->name));
376 neigh_parms_release(&nd_tbl, ndev->nd_parms); 376 neigh_parms_release(&nd_tbl, ndev->nd_parms);
377 ndev->dead = 1; 377 ndev->dead = 1;
378 in6_dev_finish_destroy(ndev); 378 in6_dev_finish_destroy(ndev);
379 return NULL; 379 return NULL;
380 } 380 }
381 381
382 if (snmp6_register_dev(ndev) < 0) { 382 if (snmp6_register_dev(ndev) < 0) {
383 ADBG((KERN_WARNING 383 ADBG((KERN_WARNING
384 "%s(): cannot create /proc/net/dev_snmp6/%s\n", 384 "%s(): cannot create /proc/net/dev_snmp6/%s\n",
385 __func__, dev->name)); 385 __func__, dev->name));
386 neigh_parms_release(&nd_tbl, ndev->nd_parms); 386 neigh_parms_release(&nd_tbl, ndev->nd_parms);
387 ndev->dead = 1; 387 ndev->dead = 1;
388 in6_dev_finish_destroy(ndev); 388 in6_dev_finish_destroy(ndev);
389 return NULL; 389 return NULL;
390 } 390 }
391 391
392 /* One reference from device. We must do this before 392 /* One reference from device. We must do this before
393 * we invoke __ipv6_regen_rndid(). 393 * we invoke __ipv6_regen_rndid().
394 */ 394 */
395 in6_dev_hold(ndev); 395 in6_dev_hold(ndev);
396 396
397 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) 397 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
398 ndev->cnf.accept_dad = -1; 398 ndev->cnf.accept_dad = -1;
399 399
400 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 400 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
401 if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { 401 if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
402 printk(KERN_INFO 402 printk(KERN_INFO
403 "%s: Disabled Multicast RS\n", 403 "%s: Disabled Multicast RS\n",
404 dev->name); 404 dev->name);
405 ndev->cnf.rtr_solicits = 0; 405 ndev->cnf.rtr_solicits = 0;
406 } 406 }
407 #endif 407 #endif
408 408
409 #ifdef CONFIG_IPV6_PRIVACY 409 #ifdef CONFIG_IPV6_PRIVACY
410 INIT_LIST_HEAD(&ndev->tempaddr_list); 410 INIT_LIST_HEAD(&ndev->tempaddr_list);
411 setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); 411 setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
412 if ((dev->flags&IFF_LOOPBACK) || 412 if ((dev->flags&IFF_LOOPBACK) ||
413 dev->type == ARPHRD_TUNNEL || 413 dev->type == ARPHRD_TUNNEL ||
414 dev->type == ARPHRD_TUNNEL6 || 414 dev->type == ARPHRD_TUNNEL6 ||
415 dev->type == ARPHRD_SIT || 415 dev->type == ARPHRD_SIT ||
416 dev->type == ARPHRD_NONE) { 416 dev->type == ARPHRD_NONE) {
417 ndev->cnf.use_tempaddr = -1; 417 ndev->cnf.use_tempaddr = -1;
418 } else { 418 } else {
419 in6_dev_hold(ndev); 419 in6_dev_hold(ndev);
420 ipv6_regen_rndid((unsigned long) ndev); 420 ipv6_regen_rndid((unsigned long) ndev);
421 } 421 }
422 #endif 422 #endif
423 423
424 if (netif_running(dev) && addrconf_qdisc_ok(dev)) 424 if (netif_running(dev) && addrconf_qdisc_ok(dev))
425 ndev->if_flags |= IF_READY; 425 ndev->if_flags |= IF_READY;
426 426
427 ipv6_mc_init_dev(ndev); 427 ipv6_mc_init_dev(ndev);
428 ndev->tstamp = jiffies; 428 ndev->tstamp = jiffies;
429 addrconf_sysctl_register(ndev); 429 addrconf_sysctl_register(ndev);
430 /* protected by rtnl_lock */ 430 /* protected by rtnl_lock */
431 rcu_assign_pointer(dev->ip6_ptr, ndev); 431 rcu_assign_pointer(dev->ip6_ptr, ndev);
432 432
433 /* Join all-node multicast group */ 433 /* Join all-node multicast group */
434 ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); 434 ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
435 435
436 return ndev; 436 return ndev;
437 } 437 }
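
Note the publication order: ndev is fully constructed (stats, timers, sysctl, multicast state) before rcu_assign_pointer() makes it reachable through dev->ip6_ptr, so RCU readers using __in6_dev_get() can never observe a half-initialized device. The "protected by rtnl_lock" comment describes only the writer side; readers need nothing but rcu_read_lock().
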
438 438
439 static struct inet6_dev * ipv6_find_idev(struct net_device *dev) 439 static struct inet6_dev * ipv6_find_idev(struct net_device *dev)
440 { 440 {
441 struct inet6_dev *idev; 441 struct inet6_dev *idev;
442 442
443 ASSERT_RTNL(); 443 ASSERT_RTNL();
444 444
445 idev = __in6_dev_get(dev); 445 idev = __in6_dev_get(dev);
446 if (!idev) { 446 if (!idev) {
447 idev = ipv6_add_dev(dev); 447 idev = ipv6_add_dev(dev);
448 if (!idev) 448 if (!idev)
449 return NULL; 449 return NULL;
450 } 450 }
451 451
452 if (dev->flags&IFF_UP) 452 if (dev->flags&IFF_UP)
453 ipv6_mc_up(idev); 453 ipv6_mc_up(idev);
454 return idev; 454 return idev;
455 } 455 }
456 456
457 #ifdef CONFIG_SYSCTL 457 #ifdef CONFIG_SYSCTL
458 static void dev_forward_change(struct inet6_dev *idev) 458 static void dev_forward_change(struct inet6_dev *idev)
459 { 459 {
460 struct net_device *dev; 460 struct net_device *dev;
461 struct inet6_ifaddr *ifa; 461 struct inet6_ifaddr *ifa;
462 462
463 if (!idev) 463 if (!idev)
464 return; 464 return;
465 dev = idev->dev; 465 dev = idev->dev;
466 if (idev->cnf.forwarding) 466 if (idev->cnf.forwarding)
467 dev_disable_lro(dev); 467 dev_disable_lro(dev);
468 if (dev && (dev->flags & IFF_MULTICAST)) { 468 if (dev && (dev->flags & IFF_MULTICAST)) {
469 if (idev->cnf.forwarding) 469 if (idev->cnf.forwarding)
470 ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); 470 ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
471 else 471 else
472 ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters); 472 ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
473 } 473 }
474 474
475 list_for_each_entry(ifa, &idev->addr_list, if_list) { 475 list_for_each_entry(ifa, &idev->addr_list, if_list) {
476 if (ifa->flags&IFA_F_TENTATIVE) 476 if (ifa->flags&IFA_F_TENTATIVE)
477 continue; 477 continue;
478 if (idev->cnf.forwarding) 478 if (idev->cnf.forwarding)
479 addrconf_join_anycast(ifa); 479 addrconf_join_anycast(ifa);
480 else 480 else
481 addrconf_leave_anycast(ifa); 481 addrconf_leave_anycast(ifa);
482 } 482 }
483 } 483 }
484 484
485 485
486 static void addrconf_forward_change(struct net *net, __s32 newf) 486 static void addrconf_forward_change(struct net *net, __s32 newf)
487 { 487 {
488 struct net_device *dev; 488 struct net_device *dev;
489 struct inet6_dev *idev; 489 struct inet6_dev *idev;
490 490
491 rcu_read_lock(); 491 rcu_read_lock();
492 for_each_netdev_rcu(net, dev) { 492 for_each_netdev_rcu(net, dev) {
493 idev = __in6_dev_get(dev); 493 idev = __in6_dev_get(dev);
494 if (idev) { 494 if (idev) {
495 int changed = (!idev->cnf.forwarding) ^ (!newf); 495 int changed = (!idev->cnf.forwarding) ^ (!newf);
496 idev->cnf.forwarding = newf; 496 idev->cnf.forwarding = newf;
497 if (changed) 497 if (changed)
498 dev_forward_change(idev); 498 dev_forward_change(idev);
499 } 499 }
500 } 500 }
501 rcu_read_unlock(); 501 rcu_read_unlock();
502 } 502 }
503 503
504 static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) 504 static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
505 { 505 {
506 struct net *net; 506 struct net *net;
507 507
508 net = (struct net *)table->extra2; 508 net = (struct net *)table->extra2;
509 if (p == &net->ipv6.devconf_dflt->forwarding) 509 if (p == &net->ipv6.devconf_dflt->forwarding)
510 return 0; 510 return 0;
511 511
512 if (!rtnl_trylock()) { 512 if (!rtnl_trylock()) {
513 /* Restore the original values before restarting */ 513 /* Restore the original values before restarting */
514 *p = old; 514 *p = old;
515 return restart_syscall(); 515 return restart_syscall();
516 } 516 }
517 517
518 if (p == &net->ipv6.devconf_all->forwarding) { 518 if (p == &net->ipv6.devconf_all->forwarding) {
519 __s32 newf = net->ipv6.devconf_all->forwarding; 519 __s32 newf = net->ipv6.devconf_all->forwarding;
520 net->ipv6.devconf_dflt->forwarding = newf; 520 net->ipv6.devconf_dflt->forwarding = newf;
521 addrconf_forward_change(net, newf); 521 addrconf_forward_change(net, newf);
522 } else if ((!*p) ^ (!old)) 522 } else if ((!*p) ^ (!old))
523 dev_forward_change((struct inet6_dev *)table->extra1); 523 dev_forward_change((struct inet6_dev *)table->extra1);
524 rtnl_unlock(); 524 rtnl_unlock();
525 525
526 if (*p) 526 if (*p)
527 rt6_purge_dflt_routers(net); 527 rt6_purge_dflt_routers(net);
528 return 1; 528 return 1;
529 } 529 }
530 #endif 530 #endif
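
The rtnl_trylock()/restart_syscall() dance in addrconf_fixup_forwarding() is the standard escape from a lock-ordering trap: rather than risk deadlocking against a task that already holds RTNL and is waiting on sysctl teardown, the handler restores the previous value and has the write(2) restarted from a context where sleeping on RTNL is safe.
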
531 531
532 /* Nobody refers to this ifaddr, destroy it */ 532 /* Nobody refers to this ifaddr, destroy it */
533 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) 533 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
534 { 534 {
535 WARN_ON(!hlist_unhashed(&ifp->addr_lst)); 535 WARN_ON(!hlist_unhashed(&ifp->addr_lst));
536 536
537 #ifdef NET_REFCNT_DEBUG 537 #ifdef NET_REFCNT_DEBUG
538 printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); 538 printk(KERN_DEBUG "inet6_ifa_finish_destroy\n");
539 #endif 539 #endif
540 540
541 in6_dev_put(ifp->idev); 541 in6_dev_put(ifp->idev);
542 542
543 if (del_timer(&ifp->timer)) 543 if (del_timer(&ifp->timer))
544 pr_notice("Timer is still running, when freeing ifa=%p\n", ifp); 544 pr_notice("Timer is still running, when freeing ifa=%p\n", ifp);
545 545
546 if (ifp->state != INET6_IFADDR_STATE_DEAD) { 546 if (ifp->state != INET6_IFADDR_STATE_DEAD) {
547 pr_warning("Freeing alive inet6 address %p\n", ifp); 547 pr_warning("Freeing alive inet6 address %p\n", ifp);
548 return; 548 return;
549 } 549 }
550 dst_release(&ifp->rt->dst); 550 dst_release(&ifp->rt->dst);
551 551
552 kfree_rcu(ifp, rcu); 552 kfree_rcu(ifp, rcu);
553 } 553 }
554 554
555 static void 555 static void
556 ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) 556 ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
557 { 557 {
558 struct list_head *p; 558 struct list_head *p;
559 int ifp_scope = ipv6_addr_src_scope(&ifp->addr); 559 int ifp_scope = ipv6_addr_src_scope(&ifp->addr);
560 560
561 /* 561 /*
562 * Each device address list is sorted in order of scope - 562 * Each device address list is sorted in order of scope -
563 * global before linklocal. 563 * global before linklocal.
564 */ 564 */
565 list_for_each(p, &idev->addr_list) { 565 list_for_each(p, &idev->addr_list) {
566 struct inet6_ifaddr *ifa 566 struct inet6_ifaddr *ifa
567 = list_entry(p, struct inet6_ifaddr, if_list); 567 = list_entry(p, struct inet6_ifaddr, if_list);
568 if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) 568 if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
569 break; 569 break;
570 } 570 }
571 571
572 list_add_tail(&ifp->if_list, p); 572 list_add_tail(&ifp->if_list, p);
573 } 573 }
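
This is an insertion sort keyed on source-address scope: since list_add_tail() on an interior node links the new entry just before it, the walk stops at the first address whose scope is less than or equal to the new one's and inserts ahead of it, keeping wider scopes first. A global address therefore always precedes the interface's link-local address in the list.
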
574 574
575 static u32 ipv6_addr_hash(const struct in6_addr *addr) 575 static u32 ipv6_addr_hash(const struct in6_addr *addr)
576 { 576 {
577 /* 577 /*
578 * We perform the hash function over the last 64 bits of the address 578 * We perform the hash function over the last 64 bits of the address
579 * This will include the IEEE address token on links that support it. 579 * This will include the IEEE address token on links that support it.
580 */ 580 */
581 return jhash_2words((__force u32)addr->s6_addr32[2], 581 return jhash_2words((__force u32)addr->s6_addr32[2],
582 (__force u32)addr->s6_addr32[3], 0) 582 (__force u32)addr->s6_addr32[3], 0)
583 & (IN6_ADDR_HSIZE - 1); 583 & (IN6_ADDR_HSIZE - 1);
584 } 584 }
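
Hashing only s6_addr32[2] and s6_addr32[3] means the hash covers just the low 64 bits, typically the interface identifier, so the same IID under different prefixes collides by design; that is cheap and acceptable for a table whose lookups compare full addresses anyway. The final mask assumes IN6_ADDR_HSIZE is a power of two.
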
585 585
586 /* On success it returns ifp with increased reference count */ 586 /* On success it returns ifp with increased reference count */
587 587
588 static struct inet6_ifaddr * 588 static struct inet6_ifaddr *
589 ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, 589 ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
590 int scope, u32 flags) 590 int scope, u32 flags)
591 { 591 {
592 struct inet6_ifaddr *ifa = NULL; 592 struct inet6_ifaddr *ifa = NULL;
593 struct rt6_info *rt; 593 struct rt6_info *rt;
594 unsigned int hash; 594 unsigned int hash;
595 int err = 0; 595 int err = 0;
596 int addr_type = ipv6_addr_type(addr); 596 int addr_type = ipv6_addr_type(addr);
597 597
598 if (addr_type == IPV6_ADDR_ANY || 598 if (addr_type == IPV6_ADDR_ANY ||
599 addr_type & IPV6_ADDR_MULTICAST || 599 addr_type & IPV6_ADDR_MULTICAST ||
600 (!(idev->dev->flags & IFF_LOOPBACK) && 600 (!(idev->dev->flags & IFF_LOOPBACK) &&
601 addr_type & IPV6_ADDR_LOOPBACK)) 601 addr_type & IPV6_ADDR_LOOPBACK))
602 return ERR_PTR(-EADDRNOTAVAIL); 602 return ERR_PTR(-EADDRNOTAVAIL);
603 603
604 rcu_read_lock_bh(); 604 rcu_read_lock_bh();
605 if (idev->dead) { 605 if (idev->dead) {
606 err = -ENODEV; /*XXX*/ 606 err = -ENODEV; /*XXX*/
607 goto out2; 607 goto out2;
608 } 608 }
609 609
610 if (idev->cnf.disable_ipv6) { 610 if (idev->cnf.disable_ipv6) {
611 err = -EACCES; 611 err = -EACCES;
612 goto out2; 612 goto out2;
613 } 613 }
614 614
615 spin_lock(&addrconf_hash_lock); 615 spin_lock(&addrconf_hash_lock);
616 616
617 /* Ignore adding duplicate addresses on an interface */ 617 /* Ignore adding duplicate addresses on an interface */
618 if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { 618 if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) {
619 ADBG(("ipv6_add_addr: already assigned\n")); 619 ADBG(("ipv6_add_addr: already assigned\n"));
620 err = -EEXIST; 620 err = -EEXIST;
621 goto out; 621 goto out;
622 } 622 }
623 623
624 ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); 624 ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
625 625
626 if (ifa == NULL) { 626 if (ifa == NULL) {
627 ADBG(("ipv6_add_addr: malloc failed\n")); 627 ADBG(("ipv6_add_addr: malloc failed\n"));
628 err = -ENOBUFS; 628 err = -ENOBUFS;
629 goto out; 629 goto out;
630 } 630 }
631 631
632 rt = addrconf_dst_alloc(idev, addr, 0); 632 rt = addrconf_dst_alloc(idev, addr, 0);
633 if (IS_ERR(rt)) { 633 if (IS_ERR(rt)) {
634 err = PTR_ERR(rt); 634 err = PTR_ERR(rt);
635 goto out; 635 goto out;
636 } 636 }
637 637
638 ipv6_addr_copy(&ifa->addr, addr); 638 ipv6_addr_copy(&ifa->addr, addr);
639 639
640 spin_lock_init(&ifa->lock); 640 spin_lock_init(&ifa->lock);
641 spin_lock_init(&ifa->state_lock); 641 spin_lock_init(&ifa->state_lock);
642 init_timer(&ifa->timer); 642 init_timer(&ifa->timer);
643 INIT_HLIST_NODE(&ifa->addr_lst); 643 INIT_HLIST_NODE(&ifa->addr_lst);
644 ifa->timer.data = (unsigned long) ifa; 644 ifa->timer.data = (unsigned long) ifa;
645 ifa->scope = scope; 645 ifa->scope = scope;
646 ifa->prefix_len = pfxlen; 646 ifa->prefix_len = pfxlen;
647 ifa->flags = flags | IFA_F_TENTATIVE; 647 ifa->flags = flags | IFA_F_TENTATIVE;
648 ifa->cstamp = ifa->tstamp = jiffies; 648 ifa->cstamp = ifa->tstamp = jiffies;
649 649
650 ifa->rt = rt; 650 ifa->rt = rt;
651 651
652 /* 652 /*
653 * part one of RFC 4429, section 3.3 653 * part one of RFC 4429, section 3.3
654 * We should not configure an address as 654 * We should not configure an address as
655 * optimistic if we do not yet know the link 655 * optimistic if we do not yet know the link
656 * layer address of our nexthop router 656 * layer address of our nexthop router
657 */ 657 */
658 658
659 if (dst_get_neighbour(&rt->dst) == NULL) 659 if (dst_get_neighbour_raw(&rt->dst) == NULL)
660 ifa->flags &= ~IFA_F_OPTIMISTIC; 660 ifa->flags &= ~IFA_F_OPTIMISTIC;
661 661
662 ifa->idev = idev; 662 ifa->idev = idev;
663 in6_dev_hold(idev); 663 in6_dev_hold(idev);
664 /* For caller */ 664 /* For caller */
665 in6_ifa_hold(ifa); 665 in6_ifa_hold(ifa);
666 666
667 /* Add to big hash table */ 667 /* Add to big hash table */
668 hash = ipv6_addr_hash(addr); 668 hash = ipv6_addr_hash(addr);
669 669
670 hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); 670 hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
671 spin_unlock(&addrconf_hash_lock); 671 spin_unlock(&addrconf_hash_lock);
672 672
673 write_lock(&idev->lock); 673 write_lock(&idev->lock);
674 /* Add to inet6_dev unicast addr list. */ 674 /* Add to inet6_dev unicast addr list. */
675 ipv6_link_dev_addr(idev, ifa); 675 ipv6_link_dev_addr(idev, ifa);
676 676
677 #ifdef CONFIG_IPV6_PRIVACY 677 #ifdef CONFIG_IPV6_PRIVACY
678 if (ifa->flags&IFA_F_TEMPORARY) { 678 if (ifa->flags&IFA_F_TEMPORARY) {
679 list_add(&ifa->tmp_list, &idev->tempaddr_list); 679 list_add(&ifa->tmp_list, &idev->tempaddr_list);
680 in6_ifa_hold(ifa); 680 in6_ifa_hold(ifa);
681 } 681 }
682 #endif 682 #endif
683 683
684 in6_ifa_hold(ifa); 684 in6_ifa_hold(ifa);
685 write_unlock(&idev->lock); 685 write_unlock(&idev->lock);
686 out2: 686 out2:
687 rcu_read_unlock_bh(); 687 rcu_read_unlock_bh();
688 688
689 if (likely(err == 0)) 689 if (likely(err == 0))
690 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); 690 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
691 else { 691 else {
692 kfree(ifa); 692 kfree(ifa);
693 ifa = ERR_PTR(err); 693 ifa = ERR_PTR(err);
694 } 694 }
695 695
696 return ifa; 696 return ifa;
697 out: 697 out:
698 spin_unlock(&addrconf_hash_lock); 698 spin_unlock(&addrconf_hash_lock);
699 goto out2; 699 goto out2;
700 } 700 }
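
The one-line change in this hunk is the addrconf half of the fix. dst->_neighbour is now annotated __rcu (see the net/dst.h hunk at the top), so every read must go through an RCU accessor; this caller already runs inside rcu_read_lock_bh() and only compares the pointer against NULL, never dereferencing it, so the _raw flavour suffices and avoids a spurious lockdep splat. For reference, a sketch of the accessor trio this commit introduces in net/dst.h (reconstructed; the header excerpt shown earlier contains only the field annotation):

	static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
	{
		return rcu_dereference(dst->_neighbour);
	}

	static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
	{
		return rcu_dereference_raw(dst->_neighbour);
	}

	static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh)
	{
		rcu_assign_pointer(dst->_neighbour, neigh);
	}

Writers go through dst_set_neighbour(), whose rcu_assign_pointer() pairs with these reads, which is what lets check_peer_redir() swap the neighbour safely while other CPUs are looking at it.
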
701 701
702 /* This function wants to get referenced ifp and releases it before return */ 702 /* This function wants to get referenced ifp and releases it before return */
703 703
704 static void ipv6_del_addr(struct inet6_ifaddr *ifp) 704 static void ipv6_del_addr(struct inet6_ifaddr *ifp)
705 { 705 {
706 struct inet6_ifaddr *ifa, *ifn; 706 struct inet6_ifaddr *ifa, *ifn;
707 struct inet6_dev *idev = ifp->idev; 707 struct inet6_dev *idev = ifp->idev;
708 int state; 708 int state;
709 int deleted = 0, onlink = 0; 709 int deleted = 0, onlink = 0;
710 unsigned long expires = jiffies; 710 unsigned long expires = jiffies;
711 711
712 spin_lock_bh(&ifp->state_lock); 712 spin_lock_bh(&ifp->state_lock);
713 state = ifp->state; 713 state = ifp->state;
714 ifp->state = INET6_IFADDR_STATE_DEAD; 714 ifp->state = INET6_IFADDR_STATE_DEAD;
715 spin_unlock_bh(&ifp->state_lock); 715 spin_unlock_bh(&ifp->state_lock);
716 716
717 if (state == INET6_IFADDR_STATE_DEAD) 717 if (state == INET6_IFADDR_STATE_DEAD)
718 goto out; 718 goto out;
719 719
720 spin_lock_bh(&addrconf_hash_lock); 720 spin_lock_bh(&addrconf_hash_lock);
721 hlist_del_init_rcu(&ifp->addr_lst); 721 hlist_del_init_rcu(&ifp->addr_lst);
722 spin_unlock_bh(&addrconf_hash_lock); 722 spin_unlock_bh(&addrconf_hash_lock);
723 723
724 write_lock_bh(&idev->lock); 724 write_lock_bh(&idev->lock);
725 #ifdef CONFIG_IPV6_PRIVACY 725 #ifdef CONFIG_IPV6_PRIVACY
726 if (ifp->flags&IFA_F_TEMPORARY) { 726 if (ifp->flags&IFA_F_TEMPORARY) {
727 list_del(&ifp->tmp_list); 727 list_del(&ifp->tmp_list);
728 if (ifp->ifpub) { 728 if (ifp->ifpub) {
729 in6_ifa_put(ifp->ifpub); 729 in6_ifa_put(ifp->ifpub);
730 ifp->ifpub = NULL; 730 ifp->ifpub = NULL;
731 } 731 }
732 __in6_ifa_put(ifp); 732 __in6_ifa_put(ifp);
733 } 733 }
734 #endif 734 #endif
735 735
736 list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) { 736 list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) {
737 if (ifa == ifp) { 737 if (ifa == ifp) {
738 list_del_init(&ifp->if_list); 738 list_del_init(&ifp->if_list);
739 __in6_ifa_put(ifp); 739 __in6_ifa_put(ifp);
740 740
741 if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) 741 if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0)
742 break; 742 break;
743 deleted = 1; 743 deleted = 1;
744 continue; 744 continue;
745 } else if (ifp->flags & IFA_F_PERMANENT) { 745 } else if (ifp->flags & IFA_F_PERMANENT) {
746 if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, 746 if (ipv6_prefix_equal(&ifa->addr, &ifp->addr,
747 ifp->prefix_len)) { 747 ifp->prefix_len)) {
748 if (ifa->flags & IFA_F_PERMANENT) { 748 if (ifa->flags & IFA_F_PERMANENT) {
749 onlink = 1; 749 onlink = 1;
750 if (deleted) 750 if (deleted)
751 break; 751 break;
752 } else { 752 } else {
753 unsigned long lifetime; 753 unsigned long lifetime;
754 754
755 if (!onlink) 755 if (!onlink)
756 onlink = -1; 756 onlink = -1;
757 757
758 spin_lock(&ifa->lock); 758 spin_lock(&ifa->lock);
759 759
760 lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); 760 lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
761 /* 761 /*
762 * Note: Because this address is 762 * Note: Because this address is
763 * not permanent, lifetime < 763 * not permanent, lifetime <
764 * LONG_MAX / HZ here. 764 * LONG_MAX / HZ here.
765 */ 765 */
766 if (time_before(expires, 766 if (time_before(expires,
767 ifa->tstamp + lifetime * HZ)) 767 ifa->tstamp + lifetime * HZ))
768 expires = ifa->tstamp + lifetime * HZ; 768 expires = ifa->tstamp + lifetime * HZ;
769 spin_unlock(&ifa->lock); 769 spin_unlock(&ifa->lock);
770 } 770 }
771 } 771 }
772 } 772 }
773 } 773 }
774 write_unlock_bh(&idev->lock); 774 write_unlock_bh(&idev->lock);
775 775
776 addrconf_del_timer(ifp); 776 addrconf_del_timer(ifp);
777 777
778 ipv6_ifa_notify(RTM_DELADDR, ifp); 778 ipv6_ifa_notify(RTM_DELADDR, ifp);
779 779
780 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); 780 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
781 781
782 /* 782 /*
783 * Purge or update corresponding prefix 783 * Purge or update corresponding prefix
784 * 784 *
785 * 1) we don't purge prefix here if address was not permanent. 785 * 1) we don't purge prefix here if address was not permanent.
786 * prefix is managed by its own lifetime. 786 * prefix is managed by its own lifetime.
787 * 2) if there're no addresses, delete prefix. 787 * 2) if there're no addresses, delete prefix.
788 * 3) if there're still other permanent address(es), 788 * 3) if there're still other permanent address(es),
789 * corresponding prefix is still permanent. 789 * corresponding prefix is still permanent.
790 * 4) otherwise, update prefix lifetime to the 790 * 4) otherwise, update prefix lifetime to the
791 * longest valid lifetime among the corresponding 791 * longest valid lifetime among the corresponding
792 * addresses on the device. 792 * addresses on the device.
793 * Note: subsequent RA will update lifetime. 793 * Note: subsequent RA will update lifetime.
794 * 794 *
795 * --yoshfuji 795 * --yoshfuji
796 */ 796 */
797 if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { 797 if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) {
798 struct in6_addr prefix; 798 struct in6_addr prefix;
799 struct rt6_info *rt; 799 struct rt6_info *rt;
800 struct net *net = dev_net(ifp->idev->dev); 800 struct net *net = dev_net(ifp->idev->dev);
801 ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); 801 ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len);
802 rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1); 802 rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1);
803 803
804 if (rt && addrconf_is_prefix_route(rt)) { 804 if (rt && addrconf_is_prefix_route(rt)) {
805 if (onlink == 0) { 805 if (onlink == 0) {
806 ip6_del_rt(rt); 806 ip6_del_rt(rt);
807 rt = NULL; 807 rt = NULL;
808 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { 808 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
809 rt->rt6i_expires = expires; 809 rt->rt6i_expires = expires;
810 rt->rt6i_flags |= RTF_EXPIRES; 810 rt->rt6i_flags |= RTF_EXPIRES;
811 } 811 }
812 } 812 }
813 dst_release(&rt->dst); 813 dst_release(&rt->dst);
814 } 814 }
815 815
816 /* clean up prefsrc entries */ 816 /* clean up prefsrc entries */
817 rt6_remove_prefsrc(ifp); 817 rt6_remove_prefsrc(ifp);
818 out: 818 out:
819 in6_ifa_put(ifp); 819 in6_ifa_put(ifp);
820 } 820 }
821 821
822 #ifdef CONFIG_IPV6_PRIVACY 822 #ifdef CONFIG_IPV6_PRIVACY
823 static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) 823 static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift)
824 { 824 {
825 struct inet6_dev *idev = ifp->idev; 825 struct inet6_dev *idev = ifp->idev;
826 struct in6_addr addr, *tmpaddr; 826 struct in6_addr addr, *tmpaddr;
827 unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age; 827 unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age;
828 unsigned long regen_advance; 828 unsigned long regen_advance;
829 int tmp_plen; 829 int tmp_plen;
830 int ret = 0; 830 int ret = 0;
831 int max_addresses; 831 int max_addresses;
832 u32 addr_flags; 832 u32 addr_flags;
833 833
834 write_lock(&idev->lock); 834 write_lock(&idev->lock);
835 if (ift) { 835 if (ift) {
836 spin_lock_bh(&ift->lock); 836 spin_lock_bh(&ift->lock);
837 memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); 837 memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
838 spin_unlock_bh(&ift->lock); 838 spin_unlock_bh(&ift->lock);
839 tmpaddr = &addr; 839 tmpaddr = &addr;
840 } else { 840 } else {
841 tmpaddr = NULL; 841 tmpaddr = NULL;
842 } 842 }
843 retry: 843 retry:
844 in6_dev_hold(idev); 844 in6_dev_hold(idev);
845 if (idev->cnf.use_tempaddr <= 0) { 845 if (idev->cnf.use_tempaddr <= 0) {
846 write_unlock(&idev->lock); 846 write_unlock(&idev->lock);
847 printk(KERN_INFO 847 printk(KERN_INFO
848 "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); 848 "ipv6_create_tempaddr(): use_tempaddr is disabled.\n");
849 in6_dev_put(idev); 849 in6_dev_put(idev);
850 ret = -1; 850 ret = -1;
851 goto out; 851 goto out;
852 } 852 }
853 spin_lock_bh(&ifp->lock); 853 spin_lock_bh(&ifp->lock);
854 if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { 854 if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
855 idev->cnf.use_tempaddr = -1; /*XXX*/ 855 idev->cnf.use_tempaddr = -1; /*XXX*/
856 spin_unlock_bh(&ifp->lock); 856 spin_unlock_bh(&ifp->lock);
857 write_unlock(&idev->lock); 857 write_unlock(&idev->lock);
858 printk(KERN_WARNING 858 printk(KERN_WARNING
859 "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); 859 "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n");
860 in6_dev_put(idev); 860 in6_dev_put(idev);
861 ret = -1; 861 ret = -1;
862 goto out; 862 goto out;
863 } 863 }
864 in6_ifa_hold(ifp); 864 in6_ifa_hold(ifp);
865 memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); 865 memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
866 if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { 866 if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) {
867 spin_unlock_bh(&ifp->lock); 867 spin_unlock_bh(&ifp->lock);
868 write_unlock(&idev->lock); 868 write_unlock(&idev->lock);
869 printk(KERN_WARNING 869 printk(KERN_WARNING
870 "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); 870 "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n");
871 in6_ifa_put(ifp); 871 in6_ifa_put(ifp);
872 in6_dev_put(idev); 872 in6_dev_put(idev);
873 ret = -1; 873 ret = -1;
874 goto out; 874 goto out;
875 } 875 }
876 memcpy(&addr.s6_addr[8], idev->rndid, 8); 876 memcpy(&addr.s6_addr[8], idev->rndid, 8);
877 age = (jiffies - ifp->tstamp) / HZ; 877 age = (jiffies - ifp->tstamp) / HZ;
878 tmp_valid_lft = min_t(__u32, 878 tmp_valid_lft = min_t(__u32,
879 ifp->valid_lft, 879 ifp->valid_lft,
880 idev->cnf.temp_valid_lft + age); 880 idev->cnf.temp_valid_lft + age);
881 tmp_prefered_lft = min_t(__u32, 881 tmp_prefered_lft = min_t(__u32,
882 ifp->prefered_lft, 882 ifp->prefered_lft,
883 idev->cnf.temp_prefered_lft + age - 883 idev->cnf.temp_prefered_lft + age -
884 idev->cnf.max_desync_factor); 884 idev->cnf.max_desync_factor);
885 tmp_plen = ifp->prefix_len; 885 tmp_plen = ifp->prefix_len;
886 max_addresses = idev->cnf.max_addresses; 886 max_addresses = idev->cnf.max_addresses;
887 tmp_cstamp = ifp->cstamp; 887 tmp_cstamp = ifp->cstamp;
888 tmp_tstamp = ifp->tstamp; 888 tmp_tstamp = ifp->tstamp;
889 spin_unlock_bh(&ifp->lock); 889 spin_unlock_bh(&ifp->lock);
890 890
891 regen_advance = idev->cnf.regen_max_retry * 891 regen_advance = idev->cnf.regen_max_retry *
892 idev->cnf.dad_transmits * 892 idev->cnf.dad_transmits *
893 idev->nd_parms->retrans_time / HZ; 893 idev->nd_parms->retrans_time / HZ;
894 write_unlock(&idev->lock); 894 write_unlock(&idev->lock);
895 895
896 /* A temporary address is created only if this calculated Preferred 896 /* A temporary address is created only if this calculated Preferred
897 * Lifetime is greater than REGEN_ADVANCE time units. In particular, 897 * Lifetime is greater than REGEN_ADVANCE time units. In particular,
898 * an implementation must not create a temporary address with a zero 898 * an implementation must not create a temporary address with a zero
899 * Preferred Lifetime. 899 * Preferred Lifetime.
900 */ 900 */
901 if (tmp_prefered_lft <= regen_advance) { 901 if (tmp_prefered_lft <= regen_advance) {
902 in6_ifa_put(ifp); 902 in6_ifa_put(ifp);
903 in6_dev_put(idev); 903 in6_dev_put(idev);
904 ret = -1; 904 ret = -1;
905 goto out; 905 goto out;
906 } 906 }
907 907
908 addr_flags = IFA_F_TEMPORARY; 908 addr_flags = IFA_F_TEMPORARY;
909 /* set in addrconf_prefix_rcv() */ 909 /* set in addrconf_prefix_rcv() */
910 if (ifp->flags & IFA_F_OPTIMISTIC) 910 if (ifp->flags & IFA_F_OPTIMISTIC)
911 addr_flags |= IFA_F_OPTIMISTIC; 911 addr_flags |= IFA_F_OPTIMISTIC;
912 912
913 ift = !max_addresses || 913 ift = !max_addresses ||
914 ipv6_count_addresses(idev) < max_addresses ? 914 ipv6_count_addresses(idev) < max_addresses ?
915 ipv6_add_addr(idev, &addr, tmp_plen, 915 ipv6_add_addr(idev, &addr, tmp_plen,
916 ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, 916 ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK,
917 addr_flags) : NULL; 917 addr_flags) : NULL;
918 if (!ift || IS_ERR(ift)) { 918 if (!ift || IS_ERR(ift)) {
919 in6_ifa_put(ifp); 919 in6_ifa_put(ifp);
920 in6_dev_put(idev); 920 in6_dev_put(idev);
921 printk(KERN_INFO 921 printk(KERN_INFO
922 "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); 922 "ipv6_create_tempaddr(): retry temporary address regeneration.\n");
923 tmpaddr = &addr; 923 tmpaddr = &addr;
924 write_lock(&idev->lock); 924 write_lock(&idev->lock);
925 goto retry; 925 goto retry;
926 } 926 }
927 927
928 spin_lock_bh(&ift->lock); 928 spin_lock_bh(&ift->lock);
929 ift->ifpub = ifp; 929 ift->ifpub = ifp;
930 ift->valid_lft = tmp_valid_lft; 930 ift->valid_lft = tmp_valid_lft;
931 ift->prefered_lft = tmp_prefered_lft; 931 ift->prefered_lft = tmp_prefered_lft;
932 ift->cstamp = tmp_cstamp; 932 ift->cstamp = tmp_cstamp;
933 ift->tstamp = tmp_tstamp; 933 ift->tstamp = tmp_tstamp;
934 spin_unlock_bh(&ift->lock); 934 spin_unlock_bh(&ift->lock);
935 935
936 addrconf_dad_start(ift, 0); 936 addrconf_dad_start(ift, 0);
937 in6_ifa_put(ift); 937 in6_ifa_put(ift);
938 in6_dev_put(idev); 938 in6_dev_put(idev);
939 out: 939 out:
940 return ret; 940 return ret;
941 } 941 }
942 #endif 942 #endif
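
To make the min_t() clamping in ipv6_create_tempaddr() concrete, here is a small, self-contained sketch of the lifetime arithmetic. The sysctl values are assumptions (the usual compiled-in defaults), not taken from this patch.

	#include <stdio.h>

	#define MIN(a, b)	((a) < (b) ? (a) : (b))

	int main(void)
	{
		unsigned int age = 3600;		/* public address is 1 hour old */
		unsigned int valid_lft = 2592000;	/* from the RA: 30 days */
		unsigned int prefered_lft = 604800;	/* from the RA: 7 days  */

		/* assumed defaults */
		unsigned int temp_valid_lft = 604800;	/* 7 days  */
		unsigned int temp_prefered_lft = 86400;	/* 1 day   */
		unsigned int max_desync_factor = 600;	/* 10 min  */
		unsigned int regen_advance = 3 * 1 * 1;	/* retry * dad * retrans, in s */

		unsigned int tmp_valid = MIN(valid_lft, temp_valid_lft + age);
		unsigned int tmp_prefered = MIN(prefered_lft,
						temp_prefered_lft + age -
						max_desync_factor);

		/* ipv6_create_tempaddr() bails out unless this holds */
		printf("valid=%us prefered=%us created=%s\n",
		       tmp_valid, tmp_prefered,
		       tmp_prefered > regen_advance ? "yes" : "no");
		return 0;
	}

With these numbers the sketch prints valid=608400s prefered=89400s created=yes: the temporary address is clamped well below the public address's remaining lifetimes.
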
943 943
944 /* 944 /*
945 * Choose an appropriate source address (RFC3484) 945 * Choose an appropriate source address (RFC3484)
946 */ 946 */
947 enum { 947 enum {
948 IPV6_SADDR_RULE_INIT = 0, 948 IPV6_SADDR_RULE_INIT = 0,
949 IPV6_SADDR_RULE_LOCAL, 949 IPV6_SADDR_RULE_LOCAL,
950 IPV6_SADDR_RULE_SCOPE, 950 IPV6_SADDR_RULE_SCOPE,
951 IPV6_SADDR_RULE_PREFERRED, 951 IPV6_SADDR_RULE_PREFERRED,
952 #ifdef CONFIG_IPV6_MIP6 952 #ifdef CONFIG_IPV6_MIP6
953 IPV6_SADDR_RULE_HOA, 953 IPV6_SADDR_RULE_HOA,
954 #endif 954 #endif
955 IPV6_SADDR_RULE_OIF, 955 IPV6_SADDR_RULE_OIF,
956 IPV6_SADDR_RULE_LABEL, 956 IPV6_SADDR_RULE_LABEL,
957 #ifdef CONFIG_IPV6_PRIVACY 957 #ifdef CONFIG_IPV6_PRIVACY
958 IPV6_SADDR_RULE_PRIVACY, 958 IPV6_SADDR_RULE_PRIVACY,
959 #endif 959 #endif
960 IPV6_SADDR_RULE_ORCHID, 960 IPV6_SADDR_RULE_ORCHID,
961 IPV6_SADDR_RULE_PREFIX, 961 IPV6_SADDR_RULE_PREFIX,
962 IPV6_SADDR_RULE_MAX 962 IPV6_SADDR_RULE_MAX
963 }; 963 };
964 964
965 struct ipv6_saddr_score { 965 struct ipv6_saddr_score {
966 int rule; 966 int rule;
967 int addr_type; 967 int addr_type;
968 struct inet6_ifaddr *ifa; 968 struct inet6_ifaddr *ifa;
969 DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX); 969 DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX);
970 int scopedist; 970 int scopedist;
971 int matchlen; 971 int matchlen;
972 }; 972 };
973 973
974 struct ipv6_saddr_dst { 974 struct ipv6_saddr_dst {
975 const struct in6_addr *addr; 975 const struct in6_addr *addr;
976 int ifindex; 976 int ifindex;
977 int scope; 977 int scope;
978 int label; 978 int label;
979 unsigned int prefs; 979 unsigned int prefs;
980 }; 980 };
981 981
982 static inline int ipv6_saddr_preferred(int type) 982 static inline int ipv6_saddr_preferred(int type)
983 { 983 {
984 if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK)) 984 if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK))
985 return 1; 985 return 1;
986 return 0; 986 return 0;
987 } 987 }
988 988
989 static int ipv6_get_saddr_eval(struct net *net, 989 static int ipv6_get_saddr_eval(struct net *net,
990 struct ipv6_saddr_score *score, 990 struct ipv6_saddr_score *score,
991 struct ipv6_saddr_dst *dst, 991 struct ipv6_saddr_dst *dst,
992 int i) 992 int i)
993 { 993 {
994 int ret; 994 int ret;
995 995
996 if (i <= score->rule) { 996 if (i <= score->rule) {
997 switch (i) { 997 switch (i) {
998 case IPV6_SADDR_RULE_SCOPE: 998 case IPV6_SADDR_RULE_SCOPE:
999 ret = score->scopedist; 999 ret = score->scopedist;
1000 break; 1000 break;
1001 case IPV6_SADDR_RULE_PREFIX: 1001 case IPV6_SADDR_RULE_PREFIX:
1002 ret = score->matchlen; 1002 ret = score->matchlen;
1003 break; 1003 break;
1004 default: 1004 default:
1005 ret = !!test_bit(i, score->scorebits); 1005 ret = !!test_bit(i, score->scorebits);
1006 } 1006 }
1007 goto out; 1007 goto out;
1008 } 1008 }
1009 1009
1010 switch (i) { 1010 switch (i) {
1011 case IPV6_SADDR_RULE_INIT: 1011 case IPV6_SADDR_RULE_INIT:
1012 /* Rule 0: remember if hiscore is not ready yet */ 1012 /* Rule 0: remember if hiscore is not ready yet */
1013 ret = !!score->ifa; 1013 ret = !!score->ifa;
1014 break; 1014 break;
1015 case IPV6_SADDR_RULE_LOCAL: 1015 case IPV6_SADDR_RULE_LOCAL:
1016 /* Rule 1: Prefer same address */ 1016 /* Rule 1: Prefer same address */
1017 ret = ipv6_addr_equal(&score->ifa->addr, dst->addr); 1017 ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
1018 break; 1018 break;
1019 case IPV6_SADDR_RULE_SCOPE: 1019 case IPV6_SADDR_RULE_SCOPE:
1020 /* Rule 2: Prefer appropriate scope 1020 /* Rule 2: Prefer appropriate scope
1021 * 1021 *
1022 * ret 1022 * ret
1023 * ^ 1023 * ^
1024 * -1 | d 15 1024 * -1 | d 15
1025 * ---+--+-+---> scope 1025 * ---+--+-+---> scope
1026 * | 1026 * |
1027 * | d is scope of the destination. 1027 * | d is scope of the destination.
1028 * B-d | \ 1028 * B-d | \
1029 * | \ <- smaller scope is better if 1029 * | \ <- smaller scope is better if
1030 	 *  B-15 |    \        scope is enough for destination. 1030 	 *  B-15 |    \        scope is enough for destination.
1031 	 *       |             ret = B - scope (-1 <= d <= scope <= 15). 1031 	 *       |             ret = B - scope (-1 <= d <= scope <= 15).
1032 * d-C-1 | / 1032 * d-C-1 | /
1033 * |/ <- greater is better 1033 * |/ <- greater is better
1034 * -C / if scope is not enough for destination. 1034 * -C / if scope is not enough for destination.
1035 * /| ret = scope - C (-1 <= d < scope <= 15). 1035 * /| ret = scope - C (-1 <= d < scope <= 15).
1036 * 1036 *
1037 	 * d - C - 1 < B - 15 (for all -1 <= d <= 15). 1037 	 * d - C - 1 < B - 15 (for all -1 <= d <= 15).
1038 	 * C > d + 14 - B for all d, so C > 15 + 14 - B = 29 - B. 1038 	 * C > d + 14 - B for all d, so C > 15 + 14 - B = 29 - B.
1039 * Assume B = 0 and we get C > 29. 1039 * Assume B = 0 and we get C > 29.
1040 */ 1040 */
1041 ret = __ipv6_addr_src_scope(score->addr_type); 1041 ret = __ipv6_addr_src_scope(score->addr_type);
1042 if (ret >= dst->scope) 1042 if (ret >= dst->scope)
1043 ret = -ret; 1043 ret = -ret;
1044 else 1044 else
1045 ret -= 128; /* 30 is enough */ 1045 ret -= 128; /* 30 is enough */
1046 score->scopedist = ret; 1046 score->scopedist = ret;
1047 break; 1047 break;
1048 case IPV6_SADDR_RULE_PREFERRED: 1048 case IPV6_SADDR_RULE_PREFERRED:
1049 /* Rule 3: Avoid deprecated and optimistic addresses */ 1049 /* Rule 3: Avoid deprecated and optimistic addresses */
1050 ret = ipv6_saddr_preferred(score->addr_type) || 1050 ret = ipv6_saddr_preferred(score->addr_type) ||
1051 !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); 1051 !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
1052 break; 1052 break;
1053 #ifdef CONFIG_IPV6_MIP6 1053 #ifdef CONFIG_IPV6_MIP6
1054 case IPV6_SADDR_RULE_HOA: 1054 case IPV6_SADDR_RULE_HOA:
1055 { 1055 {
1056 /* Rule 4: Prefer home address */ 1056 /* Rule 4: Prefer home address */
1057 int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA); 1057 int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
1058 ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome; 1058 ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
1059 break; 1059 break;
1060 } 1060 }
1061 #endif 1061 #endif
1062 case IPV6_SADDR_RULE_OIF: 1062 case IPV6_SADDR_RULE_OIF:
1063 /* Rule 5: Prefer outgoing interface */ 1063 /* Rule 5: Prefer outgoing interface */
1064 ret = (!dst->ifindex || 1064 ret = (!dst->ifindex ||
1065 dst->ifindex == score->ifa->idev->dev->ifindex); 1065 dst->ifindex == score->ifa->idev->dev->ifindex);
1066 break; 1066 break;
1067 case IPV6_SADDR_RULE_LABEL: 1067 case IPV6_SADDR_RULE_LABEL:
1068 /* Rule 6: Prefer matching label */ 1068 /* Rule 6: Prefer matching label */
1069 ret = ipv6_addr_label(net, 1069 ret = ipv6_addr_label(net,
1070 &score->ifa->addr, score->addr_type, 1070 &score->ifa->addr, score->addr_type,
1071 score->ifa->idev->dev->ifindex) == dst->label; 1071 score->ifa->idev->dev->ifindex) == dst->label;
1072 break; 1072 break;
1073 #ifdef CONFIG_IPV6_PRIVACY 1073 #ifdef CONFIG_IPV6_PRIVACY
1074 case IPV6_SADDR_RULE_PRIVACY: 1074 case IPV6_SADDR_RULE_PRIVACY:
1075 { 1075 {
1076 /* Rule 7: Prefer public address 1076 /* Rule 7: Prefer public address
1077 * Note: prefer temporary address if use_tempaddr >= 2 1077 * Note: prefer temporary address if use_tempaddr >= 2
1078 */ 1078 */
1079 int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? 1079 int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
1080 !!(dst->prefs & IPV6_PREFER_SRC_TMP) : 1080 !!(dst->prefs & IPV6_PREFER_SRC_TMP) :
1081 score->ifa->idev->cnf.use_tempaddr >= 2; 1081 score->ifa->idev->cnf.use_tempaddr >= 2;
1082 ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; 1082 ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
1083 break; 1083 break;
1084 } 1084 }
1085 #endif 1085 #endif
1086 case IPV6_SADDR_RULE_ORCHID: 1086 case IPV6_SADDR_RULE_ORCHID:
1087 /* Rule 8-: Prefer ORCHID vs ORCHID or 1087 /* Rule 8-: Prefer ORCHID vs ORCHID or
1088 * non-ORCHID vs non-ORCHID 1088 * non-ORCHID vs non-ORCHID
1089 */ 1089 */
1090 ret = !(ipv6_addr_orchid(&score->ifa->addr) ^ 1090 ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
1091 ipv6_addr_orchid(dst->addr)); 1091 ipv6_addr_orchid(dst->addr));
1092 break; 1092 break;
1093 case IPV6_SADDR_RULE_PREFIX: 1093 case IPV6_SADDR_RULE_PREFIX:
1094 /* Rule 8: Use longest matching prefix */ 1094 /* Rule 8: Use longest matching prefix */
1095 score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr, 1095 score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr,
1096 dst->addr); 1096 dst->addr);
1097 break; 1097 break;
1098 default: 1098 default:
1099 ret = 0; 1099 ret = 0;
1100 } 1100 }
1101 1101
1102 if (ret) 1102 if (ret)
1103 __set_bit(i, score->scorebits); 1103 __set_bit(i, score->scorebits);
1104 score->rule = i; 1104 score->rule = i;
1105 out: 1105 out:
1106 return ret; 1106 return ret;
1107 } 1107 }
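
The B/C diagram in Rule 2 above is easier to verify with numbers. A minimal sketch of the scoring, assuming the kernel's scope encoding (link-local = 2, site-local = 5, global = 14) and the B = 0, C = 128 constants used in the code:

	/* Rule 2 in isolation: larger return values are preferred. */
	static int rule2_score(int src_scope, int dst_scope)
	{
		int ret = src_scope;

		if (ret >= dst_scope)
			ret = -ret;	/* scope suffices: smaller scope wins */
		else
			ret -= 128;	/* too small: larger scope loses less */
		return ret;
	}

	/*
	 * For a global destination (d = 14):
	 *	global source (14)	-> -14
	 *	site-local source (5)	->  5 - 128 = -123
	 *	link-local source (2)	->  2 - 128 = -126
	 * so a sufficiently-scoped source always beats an insufficient one.
	 */
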
1108 1108
1109 int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, 1109 int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
1110 const struct in6_addr *daddr, unsigned int prefs, 1110 const struct in6_addr *daddr, unsigned int prefs,
1111 struct in6_addr *saddr) 1111 struct in6_addr *saddr)
1112 { 1112 {
1113 struct ipv6_saddr_score scores[2], 1113 struct ipv6_saddr_score scores[2],
1114 *score = &scores[0], *hiscore = &scores[1]; 1114 *score = &scores[0], *hiscore = &scores[1];
1115 struct ipv6_saddr_dst dst; 1115 struct ipv6_saddr_dst dst;
1116 struct net_device *dev; 1116 struct net_device *dev;
1117 int dst_type; 1117 int dst_type;
1118 1118
1119 dst_type = __ipv6_addr_type(daddr); 1119 dst_type = __ipv6_addr_type(daddr);
1120 dst.addr = daddr; 1120 dst.addr = daddr;
1121 dst.ifindex = dst_dev ? dst_dev->ifindex : 0; 1121 dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
1122 dst.scope = __ipv6_addr_src_scope(dst_type); 1122 dst.scope = __ipv6_addr_src_scope(dst_type);
1123 dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); 1123 dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
1124 dst.prefs = prefs; 1124 dst.prefs = prefs;
1125 1125
1126 hiscore->rule = -1; 1126 hiscore->rule = -1;
1127 hiscore->ifa = NULL; 1127 hiscore->ifa = NULL;
1128 1128
1129 rcu_read_lock(); 1129 rcu_read_lock();
1130 1130
1131 for_each_netdev_rcu(net, dev) { 1131 for_each_netdev_rcu(net, dev) {
1132 struct inet6_dev *idev; 1132 struct inet6_dev *idev;
1133 1133
1134 /* Candidate Source Address (section 4) 1134 /* Candidate Source Address (section 4)
1135 		 *  - for multicast and link-local destination addresses, 1135 		 *  - for multicast and link-local destination addresses,
1136 		 *    the set of candidate source addresses MUST only 1136 		 *    the set of candidate source addresses MUST only
1137 * include addresses assigned to interfaces 1137 * include addresses assigned to interfaces
1138 * belonging to the same link as the outgoing 1138 * belonging to the same link as the outgoing
1139 * interface. 1139 * interface.
1140 * (- For site-local destination addresses, the 1140 * (- For site-local destination addresses, the
1141 * set of candidate source addresses MUST only 1141 * set of candidate source addresses MUST only
1142 * include addresses assigned to interfaces 1142 * include addresses assigned to interfaces
1143 * belonging to the same site as the outgoing 1143 * belonging to the same site as the outgoing
1144 * interface.) 1144 * interface.)
1145 */ 1145 */
1146 if (((dst_type & IPV6_ADDR_MULTICAST) || 1146 if (((dst_type & IPV6_ADDR_MULTICAST) ||
1147 dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && 1147 dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
1148 dst.ifindex && dev->ifindex != dst.ifindex) 1148 dst.ifindex && dev->ifindex != dst.ifindex)
1149 continue; 1149 continue;
1150 1150
1151 idev = __in6_dev_get(dev); 1151 idev = __in6_dev_get(dev);
1152 if (!idev) 1152 if (!idev)
1153 continue; 1153 continue;
1154 1154
1155 read_lock_bh(&idev->lock); 1155 read_lock_bh(&idev->lock);
1156 list_for_each_entry(score->ifa, &idev->addr_list, if_list) { 1156 list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
1157 int i; 1157 int i;
1158 1158
1159 /* 1159 /*
1160 * - Tentative Address (RFC2462 section 5.4) 1160 * - Tentative Address (RFC2462 section 5.4)
1161 * - A tentative address is not considered 1161 * - A tentative address is not considered
1162 * "assigned to an interface" in the traditional 1162 * "assigned to an interface" in the traditional
1163 * sense, unless it is also flagged as optimistic. 1163 * sense, unless it is also flagged as optimistic.
1164 * - Candidate Source Address (section 4) 1164 * - Candidate Source Address (section 4)
1165 * - In any case, anycast addresses, multicast 1165 * - In any case, anycast addresses, multicast
1166 * addresses, and the unspecified address MUST 1166 * addresses, and the unspecified address MUST
1167 * NOT be included in a candidate set. 1167 * NOT be included in a candidate set.
1168 */ 1168 */
1169 if ((score->ifa->flags & IFA_F_TENTATIVE) && 1169 if ((score->ifa->flags & IFA_F_TENTATIVE) &&
1170 (!(score->ifa->flags & IFA_F_OPTIMISTIC))) 1170 (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
1171 continue; 1171 continue;
1172 1172
1173 score->addr_type = __ipv6_addr_type(&score->ifa->addr); 1173 score->addr_type = __ipv6_addr_type(&score->ifa->addr);
1174 1174
1175 if (unlikely(score->addr_type == IPV6_ADDR_ANY || 1175 if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
1176 score->addr_type & IPV6_ADDR_MULTICAST)) { 1176 score->addr_type & IPV6_ADDR_MULTICAST)) {
1177 LIMIT_NETDEBUG(KERN_DEBUG 1177 LIMIT_NETDEBUG(KERN_DEBUG
1178 "ADDRCONF: unspecified / multicast address " 1178 "ADDRCONF: unspecified / multicast address "
1179 "assigned as unicast address on %s", 1179 "assigned as unicast address on %s",
1180 dev->name); 1180 dev->name);
1181 continue; 1181 continue;
1182 } 1182 }
1183 1183
1184 score->rule = -1; 1184 score->rule = -1;
1185 bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); 1185 bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
1186 1186
1187 for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { 1187 for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
1188 int minihiscore, miniscore; 1188 int minihiscore, miniscore;
1189 1189
1190 minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); 1190 minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
1191 miniscore = ipv6_get_saddr_eval(net, score, &dst, i); 1191 miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
1192 1192
1193 if (minihiscore > miniscore) { 1193 if (minihiscore > miniscore) {
1194 if (i == IPV6_SADDR_RULE_SCOPE && 1194 if (i == IPV6_SADDR_RULE_SCOPE &&
1195 score->scopedist > 0) { 1195 score->scopedist > 0) {
1196 /* 1196 /*
1197 * special case: 1197 * special case:
1198 * each remaining entry 1198 * each remaining entry
1199 * has too small (not enough) 1199 * has too small (not enough)
1200 * scope, because ifa entries 1200 * scope, because ifa entries
1201 * are sorted by their scope 1201 * are sorted by their scope
1202 * values. 1202 * values.
1203 */ 1203 */
1204 goto try_nextdev; 1204 goto try_nextdev;
1205 } 1205 }
1206 break; 1206 break;
1207 } else if (minihiscore < miniscore) { 1207 } else if (minihiscore < miniscore) {
1208 if (hiscore->ifa) 1208 if (hiscore->ifa)
1209 in6_ifa_put(hiscore->ifa); 1209 in6_ifa_put(hiscore->ifa);
1210 1210
1211 in6_ifa_hold(score->ifa); 1211 in6_ifa_hold(score->ifa);
1212 1212
1213 swap(hiscore, score); 1213 swap(hiscore, score);
1214 1214
1215 /* restore our iterator */ 1215 /* restore our iterator */
1216 score->ifa = hiscore->ifa; 1216 score->ifa = hiscore->ifa;
1217 1217
1218 break; 1218 break;
1219 } 1219 }
1220 } 1220 }
1221 } 1221 }
1222 try_nextdev: 1222 try_nextdev:
1223 read_unlock_bh(&idev->lock); 1223 read_unlock_bh(&idev->lock);
1224 } 1224 }
1225 rcu_read_unlock(); 1225 rcu_read_unlock();
1226 1226
1227 if (!hiscore->ifa) 1227 if (!hiscore->ifa)
1228 return -EADDRNOTAVAIL; 1228 return -EADDRNOTAVAIL;
1229 1229
1230 ipv6_addr_copy(saddr, &hiscore->ifa->addr); 1230 ipv6_addr_copy(saddr, &hiscore->ifa->addr);
1231 in6_ifa_put(hiscore->ifa); 1231 in6_ifa_put(hiscore->ifa);
1232 return 0; 1232 return 0;
1233 } 1233 }
1234 EXPORT_SYMBOL(ipv6_dev_get_saddr); 1234 EXPORT_SYMBOL(ipv6_dev_get_saddr);
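
Since the symbol is exported, a typical call site looks like the following sketch (a hypothetical helper with assumed names, not from this patch):

	static int pick_source(struct net_device *dev,
			       const struct in6_addr *daddr,
			       struct in6_addr *saddr)
	{
		/* prefer a public source toward daddr via dev;
		 * -EADDRNOTAVAIL means no usable candidate was found.
		 */
		return ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
					  IPV6_PREFER_SRC_PUBLIC, saddr);
	}
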
1235 1235
1236 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, 1236 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
1237 unsigned char banned_flags) 1237 unsigned char banned_flags)
1238 { 1238 {
1239 struct inet6_dev *idev; 1239 struct inet6_dev *idev;
1240 int err = -EADDRNOTAVAIL; 1240 int err = -EADDRNOTAVAIL;
1241 1241
1242 rcu_read_lock(); 1242 rcu_read_lock();
1243 idev = __in6_dev_get(dev); 1243 idev = __in6_dev_get(dev);
1244 if (idev) { 1244 if (idev) {
1245 struct inet6_ifaddr *ifp; 1245 struct inet6_ifaddr *ifp;
1246 1246
1247 read_lock_bh(&idev->lock); 1247 read_lock_bh(&idev->lock);
1248 list_for_each_entry(ifp, &idev->addr_list, if_list) { 1248 list_for_each_entry(ifp, &idev->addr_list, if_list) {
1249 if (ifp->scope == IFA_LINK && 1249 if (ifp->scope == IFA_LINK &&
1250 !(ifp->flags & banned_flags)) { 1250 !(ifp->flags & banned_flags)) {
1251 ipv6_addr_copy(addr, &ifp->addr); 1251 ipv6_addr_copy(addr, &ifp->addr);
1252 err = 0; 1252 err = 0;
1253 break; 1253 break;
1254 } 1254 }
1255 } 1255 }
1256 read_unlock_bh(&idev->lock); 1256 read_unlock_bh(&idev->lock);
1257 } 1257 }
1258 rcu_read_unlock(); 1258 rcu_read_unlock();
1259 return err; 1259 return err;
1260 } 1260 }
1261 1261
1262 static int ipv6_count_addresses(struct inet6_dev *idev) 1262 static int ipv6_count_addresses(struct inet6_dev *idev)
1263 { 1263 {
1264 int cnt = 0; 1264 int cnt = 0;
1265 struct inet6_ifaddr *ifp; 1265 struct inet6_ifaddr *ifp;
1266 1266
1267 read_lock_bh(&idev->lock); 1267 read_lock_bh(&idev->lock);
1268 list_for_each_entry(ifp, &idev->addr_list, if_list) 1268 list_for_each_entry(ifp, &idev->addr_list, if_list)
1269 cnt++; 1269 cnt++;
1270 read_unlock_bh(&idev->lock); 1270 read_unlock_bh(&idev->lock);
1271 return cnt; 1271 return cnt;
1272 } 1272 }
1273 1273
1274 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, 1274 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
1275 struct net_device *dev, int strict) 1275 struct net_device *dev, int strict)
1276 { 1276 {
1277 struct inet6_ifaddr *ifp; 1277 struct inet6_ifaddr *ifp;
1278 struct hlist_node *node; 1278 struct hlist_node *node;
1279 unsigned int hash = ipv6_addr_hash(addr); 1279 unsigned int hash = ipv6_addr_hash(addr);
1280 1280
1281 rcu_read_lock_bh(); 1281 rcu_read_lock_bh();
1282 hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) { 1282 hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) {
1283 if (!net_eq(dev_net(ifp->idev->dev), net)) 1283 if (!net_eq(dev_net(ifp->idev->dev), net))
1284 continue; 1284 continue;
1285 if (ipv6_addr_equal(&ifp->addr, addr) && 1285 if (ipv6_addr_equal(&ifp->addr, addr) &&
1286 !(ifp->flags&IFA_F_TENTATIVE) && 1286 !(ifp->flags&IFA_F_TENTATIVE) &&
1287 (dev == NULL || ifp->idev->dev == dev || 1287 (dev == NULL || ifp->idev->dev == dev ||
1288 !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { 1288 !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
1289 rcu_read_unlock_bh(); 1289 rcu_read_unlock_bh();
1290 return 1; 1290 return 1;
1291 } 1291 }
1292 } 1292 }
1293 1293
1294 rcu_read_unlock_bh(); 1294 rcu_read_unlock_bh();
1295 return 0; 1295 return 0;
1296 } 1296 }
1297 EXPORT_SYMBOL(ipv6_chk_addr); 1297 EXPORT_SYMBOL(ipv6_chk_addr);
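
A usage sketch for the exported helper (hypothetical call site): dev == NULL accepts a match on any device, while with a device given, strict forces the device to match for link- and host-scoped addresses.

	static bool addr_is_local(struct net *net, const struct in6_addr *addr)
	{
		/* hypothetical helper: non-strict, any-device check */
		return ipv6_chk_addr(net, addr, NULL, 0) != 0;
	}
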
1298 1298
1299 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, 1299 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
1300 struct net_device *dev) 1300 struct net_device *dev)
1301 { 1301 {
1302 unsigned int hash = ipv6_addr_hash(addr); 1302 unsigned int hash = ipv6_addr_hash(addr);
1303 struct inet6_ifaddr *ifp; 1303 struct inet6_ifaddr *ifp;
1304 struct hlist_node *node; 1304 struct hlist_node *node;
1305 1305
1306 hlist_for_each_entry(ifp, node, &inet6_addr_lst[hash], addr_lst) { 1306 hlist_for_each_entry(ifp, node, &inet6_addr_lst[hash], addr_lst) {
1307 if (!net_eq(dev_net(ifp->idev->dev), net)) 1307 if (!net_eq(dev_net(ifp->idev->dev), net))
1308 continue; 1308 continue;
1309 if (ipv6_addr_equal(&ifp->addr, addr)) { 1309 if (ipv6_addr_equal(&ifp->addr, addr)) {
1310 if (dev == NULL || ifp->idev->dev == dev) 1310 if (dev == NULL || ifp->idev->dev == dev)
1311 return true; 1311 return true;
1312 } 1312 }
1313 } 1313 }
1314 return false; 1314 return false;
1315 } 1315 }
1316 1316
1317 int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) 1317 int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
1318 { 1318 {
1319 struct inet6_dev *idev; 1319 struct inet6_dev *idev;
1320 struct inet6_ifaddr *ifa; 1320 struct inet6_ifaddr *ifa;
1321 int onlink; 1321 int onlink;
1322 1322
1323 onlink = 0; 1323 onlink = 0;
1324 rcu_read_lock(); 1324 rcu_read_lock();
1325 idev = __in6_dev_get(dev); 1325 idev = __in6_dev_get(dev);
1326 if (idev) { 1326 if (idev) {
1327 read_lock_bh(&idev->lock); 1327 read_lock_bh(&idev->lock);
1328 list_for_each_entry(ifa, &idev->addr_list, if_list) { 1328 list_for_each_entry(ifa, &idev->addr_list, if_list) {
1329 onlink = ipv6_prefix_equal(addr, &ifa->addr, 1329 onlink = ipv6_prefix_equal(addr, &ifa->addr,
1330 ifa->prefix_len); 1330 ifa->prefix_len);
1331 if (onlink) 1331 if (onlink)
1332 break; 1332 break;
1333 } 1333 }
1334 read_unlock_bh(&idev->lock); 1334 read_unlock_bh(&idev->lock);
1335 } 1335 }
1336 rcu_read_unlock(); 1336 rcu_read_unlock();
1337 return onlink; 1337 return onlink;
1338 } 1338 }
1339 1339
1340 EXPORT_SYMBOL(ipv6_chk_prefix); 1340 EXPORT_SYMBOL(ipv6_chk_prefix);
1341 1341
1342 struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, 1342 struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
1343 struct net_device *dev, int strict) 1343 struct net_device *dev, int strict)
1344 { 1344 {
1345 struct inet6_ifaddr *ifp, *result = NULL; 1345 struct inet6_ifaddr *ifp, *result = NULL;
1346 unsigned int hash = ipv6_addr_hash(addr); 1346 unsigned int hash = ipv6_addr_hash(addr);
1347 struct hlist_node *node; 1347 struct hlist_node *node;
1348 1348
1349 rcu_read_lock_bh(); 1349 rcu_read_lock_bh();
1350 hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) { 1350 hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) {
1351 if (!net_eq(dev_net(ifp->idev->dev), net)) 1351 if (!net_eq(dev_net(ifp->idev->dev), net))
1352 continue; 1352 continue;
1353 if (ipv6_addr_equal(&ifp->addr, addr)) { 1353 if (ipv6_addr_equal(&ifp->addr, addr)) {
1354 if (dev == NULL || ifp->idev->dev == dev || 1354 if (dev == NULL || ifp->idev->dev == dev ||
1355 !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { 1355 !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
1356 result = ifp; 1356 result = ifp;
1357 in6_ifa_hold(ifp); 1357 in6_ifa_hold(ifp);
1358 break; 1358 break;
1359 } 1359 }
1360 } 1360 }
1361 } 1361 }
1362 rcu_read_unlock_bh(); 1362 rcu_read_unlock_bh();
1363 1363
1364 return result; 1364 return result;
1365 } 1365 }
1366 1366
1367 /* Gets referenced address, destroys ifaddr */ 1367 /* Gets referenced address, destroys ifaddr */
1368 1368
1369 static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) 1369 static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
1370 { 1370 {
1371 if (ifp->flags&IFA_F_PERMANENT) { 1371 if (ifp->flags&IFA_F_PERMANENT) {
1372 spin_lock_bh(&ifp->lock); 1372 spin_lock_bh(&ifp->lock);
1373 addrconf_del_timer(ifp); 1373 addrconf_del_timer(ifp);
1374 ifp->flags |= IFA_F_TENTATIVE; 1374 ifp->flags |= IFA_F_TENTATIVE;
1375 if (dad_failed) 1375 if (dad_failed)
1376 ifp->flags |= IFA_F_DADFAILED; 1376 ifp->flags |= IFA_F_DADFAILED;
1377 spin_unlock_bh(&ifp->lock); 1377 spin_unlock_bh(&ifp->lock);
1378 if (dad_failed) 1378 if (dad_failed)
1379 ipv6_ifa_notify(0, ifp); 1379 ipv6_ifa_notify(0, ifp);
1380 in6_ifa_put(ifp); 1380 in6_ifa_put(ifp);
1381 #ifdef CONFIG_IPV6_PRIVACY 1381 #ifdef CONFIG_IPV6_PRIVACY
1382 } else if (ifp->flags&IFA_F_TEMPORARY) { 1382 } else if (ifp->flags&IFA_F_TEMPORARY) {
1383 struct inet6_ifaddr *ifpub; 1383 struct inet6_ifaddr *ifpub;
1384 spin_lock_bh(&ifp->lock); 1384 spin_lock_bh(&ifp->lock);
1385 ifpub = ifp->ifpub; 1385 ifpub = ifp->ifpub;
1386 if (ifpub) { 1386 if (ifpub) {
1387 in6_ifa_hold(ifpub); 1387 in6_ifa_hold(ifpub);
1388 spin_unlock_bh(&ifp->lock); 1388 spin_unlock_bh(&ifp->lock);
1389 ipv6_create_tempaddr(ifpub, ifp); 1389 ipv6_create_tempaddr(ifpub, ifp);
1390 in6_ifa_put(ifpub); 1390 in6_ifa_put(ifpub);
1391 } else { 1391 } else {
1392 spin_unlock_bh(&ifp->lock); 1392 spin_unlock_bh(&ifp->lock);
1393 } 1393 }
1394 ipv6_del_addr(ifp); 1394 ipv6_del_addr(ifp);
1395 #endif 1395 #endif
1396 } else 1396 } else
1397 ipv6_del_addr(ifp); 1397 ipv6_del_addr(ifp);
1398 } 1398 }
1399 1399
1400 static int addrconf_dad_end(struct inet6_ifaddr *ifp) 1400 static int addrconf_dad_end(struct inet6_ifaddr *ifp)
1401 { 1401 {
1402 int err = -ENOENT; 1402 int err = -ENOENT;
1403 1403
1404 spin_lock(&ifp->state_lock); 1404 spin_lock(&ifp->state_lock);
1405 if (ifp->state == INET6_IFADDR_STATE_DAD) { 1405 if (ifp->state == INET6_IFADDR_STATE_DAD) {
1406 ifp->state = INET6_IFADDR_STATE_POSTDAD; 1406 ifp->state = INET6_IFADDR_STATE_POSTDAD;
1407 err = 0; 1407 err = 0;
1408 } 1408 }
1409 spin_unlock(&ifp->state_lock); 1409 spin_unlock(&ifp->state_lock);
1410 1410
1411 return err; 1411 return err;
1412 } 1412 }
1413 1413
1414 void addrconf_dad_failure(struct inet6_ifaddr *ifp) 1414 void addrconf_dad_failure(struct inet6_ifaddr *ifp)
1415 { 1415 {
1416 struct inet6_dev *idev = ifp->idev; 1416 struct inet6_dev *idev = ifp->idev;
1417 1417
1418 if (addrconf_dad_end(ifp)) { 1418 if (addrconf_dad_end(ifp)) {
1419 in6_ifa_put(ifp); 1419 in6_ifa_put(ifp);
1420 return; 1420 return;
1421 } 1421 }
1422 1422
1423 if (net_ratelimit()) 1423 if (net_ratelimit())
1424 printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", 1424 printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n",
1425 ifp->idev->dev->name, &ifp->addr); 1425 ifp->idev->dev->name, &ifp->addr);
1426 1426
1427 if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { 1427 if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
1428 struct in6_addr addr; 1428 struct in6_addr addr;
1429 1429
1430 addr.s6_addr32[0] = htonl(0xfe800000); 1430 addr.s6_addr32[0] = htonl(0xfe800000);
1431 addr.s6_addr32[1] = 0; 1431 addr.s6_addr32[1] = 0;
1432 1432
1433 if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && 1433 if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
1434 ipv6_addr_equal(&ifp->addr, &addr)) { 1434 ipv6_addr_equal(&ifp->addr, &addr)) {
1435 /* DAD failed for link-local based on MAC address */ 1435 /* DAD failed for link-local based on MAC address */
1436 idev->cnf.disable_ipv6 = 1; 1436 idev->cnf.disable_ipv6 = 1;
1437 1437
1438 printk(KERN_INFO "%s: IPv6 being disabled!\n", 1438 printk(KERN_INFO "%s: IPv6 being disabled!\n",
1439 ifp->idev->dev->name); 1439 ifp->idev->dev->name);
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 addrconf_dad_stop(ifp, 1); 1443 addrconf_dad_stop(ifp, 1);
1444 } 1444 }
1445 1445
1446 /* Join the solicited-node multicast group for this address. */ 1446 /* Join the solicited-node multicast group for this address. */
1447 1447
1448 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) 1448 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
1449 { 1449 {
1450 struct in6_addr maddr; 1450 struct in6_addr maddr;
1451 1451
1452 if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) 1452 if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
1453 return; 1453 return;
1454 1454
1455 addrconf_addr_solict_mult(addr, &maddr); 1455 addrconf_addr_solict_mult(addr, &maddr);
1456 ipv6_dev_mc_inc(dev, &maddr); 1456 ipv6_dev_mc_inc(dev, &maddr);
1457 } 1457 }
1458 1458
1459 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) 1459 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
1460 { 1460 {
1461 struct in6_addr maddr; 1461 struct in6_addr maddr;
1462 1462
1463 if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) 1463 if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
1464 return; 1464 return;
1465 1465
1466 addrconf_addr_solict_mult(addr, &maddr); 1466 addrconf_addr_solict_mult(addr, &maddr);
1467 __ipv6_dev_mc_dec(idev, &maddr); 1467 __ipv6_dev_mc_dec(idev, &maddr);
1468 } 1468 }
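
As an annotation to the two helpers above: addrconf_addr_solict_mult() derives the RFC 4291 solicited-node group from the low 24 bits of the address, for example:

	/*
	 *	address	2001:db8::123:4567
	 *	group	ff02::1:ff23:4567	(ff02::1:ffXX:XXXX,
	 *					 X = low 24 bits)
	 */
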
1469 1469
1470 static void addrconf_join_anycast(struct inet6_ifaddr *ifp) 1470 static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
1471 { 1471 {
1472 struct in6_addr addr; 1472 struct in6_addr addr;
1473 if (ifp->prefix_len == 127) /* RFC 6164 */ 1473 if (ifp->prefix_len == 127) /* RFC 6164 */
1474 return; 1474 return;
1475 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); 1475 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
1476 if (ipv6_addr_any(&addr)) 1476 if (ipv6_addr_any(&addr))
1477 return; 1477 return;
1478 ipv6_dev_ac_inc(ifp->idev->dev, &addr); 1478 ipv6_dev_ac_inc(ifp->idev->dev, &addr);
1479 } 1479 }
1480 1480
1481 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) 1481 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
1482 { 1482 {
1483 struct in6_addr addr; 1483 struct in6_addr addr;
1484 if (ifp->prefix_len == 127) /* RFC 6164 */ 1484 if (ifp->prefix_len == 127) /* RFC 6164 */
1485 return; 1485 return;
1486 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); 1486 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
1487 if (ipv6_addr_any(&addr)) 1487 if (ipv6_addr_any(&addr))
1488 return; 1488 return;
1489 __ipv6_dev_ac_dec(ifp->idev, &addr); 1489 __ipv6_dev_ac_dec(ifp->idev, &addr);
1490 } 1490 }
1491 1491
1492 static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) 1492 static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
1493 { 1493 {
1494 if (dev->addr_len != ETH_ALEN) 1494 if (dev->addr_len != ETH_ALEN)
1495 return -1; 1495 return -1;
1496 memcpy(eui, dev->dev_addr, 3); 1496 memcpy(eui, dev->dev_addr, 3);
1497 memcpy(eui + 5, dev->dev_addr + 3, 3); 1497 memcpy(eui + 5, dev->dev_addr + 3, 3);
1498 1498
1499 /* 1499 /*
1500 * The zSeries OSA network cards can be shared among various 1500 * The zSeries OSA network cards can be shared among various
1501 * OS instances, but the OSA cards have only one MAC address. 1501 * OS instances, but the OSA cards have only one MAC address.
1502 * This leads to duplicate address conflicts in conjunction 1502 * This leads to duplicate address conflicts in conjunction
1503 * with IPv6 if more than one instance uses the same card. 1503 * with IPv6 if more than one instance uses the same card.
1504 * 1504 *
1505 * The driver for these cards can deliver a unique 16-bit 1505 * The driver for these cards can deliver a unique 16-bit
1506 * identifier for each instance sharing the same card. It is 1506 * identifier for each instance sharing the same card. It is
1507 * placed instead of 0xFFFE in the interface identifier. The 1507 * placed instead of 0xFFFE in the interface identifier. The
1508 * "u" bit of the interface identifier is not inverted in this 1508 * "u" bit of the interface identifier is not inverted in this
1509 * case. Hence the resulting interface identifier has local 1509 * case. Hence the resulting interface identifier has local
1510 * scope according to RFC2373. 1510 * scope according to RFC2373.
1511 */ 1511 */
1512 if (dev->dev_id) { 1512 if (dev->dev_id) {
1513 eui[3] = (dev->dev_id >> 8) & 0xFF; 1513 eui[3] = (dev->dev_id >> 8) & 0xFF;
1514 eui[4] = dev->dev_id & 0xFF; 1514 eui[4] = dev->dev_id & 0xFF;
1515 } else { 1515 } else {
1516 eui[3] = 0xFF; 1516 eui[3] = 0xFF;
1517 eui[4] = 0xFE; 1517 eui[4] = 0xFE;
1518 eui[0] ^= 2; 1518 eui[0] ^= 2;
1519 } 1519 }
1520 return 0; 1520 return 0;
1521 } 1521 }
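
A worked expansion of the EUI-48 to interface-identifier mapping above (annotation, not part of the file):

	/*
	 *	MAC			00:11:22:33:44:55   (dev_id == 0)
	 *	split + 0xfffe		00:11:22:ff:fe:33:44:55
	 *	flip universal bit	02:11:22:ff:fe:33:44:55
	 *
	 * giving, for example, the link-local address
	 * fe80::211:22ff:fe33:4455.
	 */
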
1522 1522
1523 static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) 1523 static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
1524 { 1524 {
1525 /* XXX: inherit EUI-64 from other interface -- yoshfuji */ 1525 /* XXX: inherit EUI-64 from other interface -- yoshfuji */
1526 if (dev->addr_len != ARCNET_ALEN) 1526 if (dev->addr_len != ARCNET_ALEN)
1527 return -1; 1527 return -1;
1528 memset(eui, 0, 7); 1528 memset(eui, 0, 7);
1529 eui[7] = *(u8*)dev->dev_addr; 1529 eui[7] = *(u8*)dev->dev_addr;
1530 return 0; 1530 return 0;
1531 } 1531 }
1532 1532
1533 static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) 1533 static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
1534 { 1534 {
1535 if (dev->addr_len != INFINIBAND_ALEN) 1535 if (dev->addr_len != INFINIBAND_ALEN)
1536 return -1; 1536 return -1;
1537 memcpy(eui, dev->dev_addr + 12, 8); 1537 memcpy(eui, dev->dev_addr + 12, 8);
1538 eui[0] |= 2; 1538 eui[0] |= 2;
1539 return 0; 1539 return 0;
1540 } 1540 }
1541 1541
1542 static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) 1542 static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
1543 { 1543 {
1544 if (addr == 0) 1544 if (addr == 0)
1545 return -1; 1545 return -1;
1546 eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) || 1546 eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) ||
1547 ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) || 1547 ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) ||
1548 ipv4_is_private_172(addr) || ipv4_is_test_192(addr) || 1548 ipv4_is_private_172(addr) || ipv4_is_test_192(addr) ||
1549 ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) || 1549 ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) ||
1550 ipv4_is_test_198(addr) || ipv4_is_multicast(addr) || 1550 ipv4_is_test_198(addr) || ipv4_is_multicast(addr) ||
1551 ipv4_is_lbcast(addr)) ? 0x00 : 0x02; 1551 ipv4_is_lbcast(addr)) ? 0x00 : 0x02;
1552 eui[1] = 0; 1552 eui[1] = 0;
1553 eui[2] = 0x5E; 1553 eui[2] = 0x5E;
1554 eui[3] = 0xFE; 1554 eui[3] = 0xFE;
1555 memcpy(eui + 4, &addr, 4); 1555 memcpy(eui + 4, &addr, 4);
1556 return 0; 1556 return 0;
1557 } 1557 }
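
Two worked inputs for the identifier construction above (annotation; dotted quads shown for readability, the function takes a __be32):

	/*
	 *	10.0.0.1  -> 00:00:5e:fe:0a:00:00:01	(private range,
	 *						 "local" form)
	 *	8.8.8.8   -> 02:00:5e:fe:08:08:08:08	(no special-range
	 *						 match, "universal")
	 */
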
1558 1558
1559 static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) 1559 static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
1560 { 1560 {
1561 if (dev->priv_flags & IFF_ISATAP) 1561 if (dev->priv_flags & IFF_ISATAP)
1562 return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); 1562 return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
1563 return -1; 1563 return -1;
1564 } 1564 }
1565 1565
1566 static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) 1566 static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
1567 { 1567 {
1568 return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); 1568 return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
1569 } 1569 }
1570 1570
1571 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) 1571 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
1572 { 1572 {
1573 switch (dev->type) { 1573 switch (dev->type) {
1574 case ARPHRD_ETHER: 1574 case ARPHRD_ETHER:
1575 case ARPHRD_FDDI: 1575 case ARPHRD_FDDI:
1576 case ARPHRD_IEEE802_TR: 1576 case ARPHRD_IEEE802_TR:
1577 return addrconf_ifid_eui48(eui, dev); 1577 return addrconf_ifid_eui48(eui, dev);
1578 case ARPHRD_ARCNET: 1578 case ARPHRD_ARCNET:
1579 return addrconf_ifid_arcnet(eui, dev); 1579 return addrconf_ifid_arcnet(eui, dev);
1580 case ARPHRD_INFINIBAND: 1580 case ARPHRD_INFINIBAND:
1581 return addrconf_ifid_infiniband(eui, dev); 1581 return addrconf_ifid_infiniband(eui, dev);
1582 case ARPHRD_SIT: 1582 case ARPHRD_SIT:
1583 return addrconf_ifid_sit(eui, dev); 1583 return addrconf_ifid_sit(eui, dev);
1584 case ARPHRD_IPGRE: 1584 case ARPHRD_IPGRE:
1585 return addrconf_ifid_gre(eui, dev); 1585 return addrconf_ifid_gre(eui, dev);
1586 } 1586 }
1587 return -1; 1587 return -1;
1588 } 1588 }
1589 1589
1590 static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) 1590 static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
1591 { 1591 {
1592 int err = -1; 1592 int err = -1;
1593 struct inet6_ifaddr *ifp; 1593 struct inet6_ifaddr *ifp;
1594 1594
1595 read_lock_bh(&idev->lock); 1595 read_lock_bh(&idev->lock);
1596 list_for_each_entry(ifp, &idev->addr_list, if_list) { 1596 list_for_each_entry(ifp, &idev->addr_list, if_list) {
1597 if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { 1597 if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
1598 memcpy(eui, ifp->addr.s6_addr+8, 8); 1598 memcpy(eui, ifp->addr.s6_addr+8, 8);
1599 err = 0; 1599 err = 0;
1600 break; 1600 break;
1601 } 1601 }
1602 } 1602 }
1603 read_unlock_bh(&idev->lock); 1603 read_unlock_bh(&idev->lock);
1604 return err; 1604 return err;
1605 } 1605 }
1606 1606
1607 #ifdef CONFIG_IPV6_PRIVACY 1607 #ifdef CONFIG_IPV6_PRIVACY
1608 /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ 1608 /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
1609 static int __ipv6_regen_rndid(struct inet6_dev *idev) 1609 static int __ipv6_regen_rndid(struct inet6_dev *idev)
1610 { 1610 {
1611 regen: 1611 regen:
1612 get_random_bytes(idev->rndid, sizeof(idev->rndid)); 1612 get_random_bytes(idev->rndid, sizeof(idev->rndid));
1613 idev->rndid[0] &= ~0x02; 1613 idev->rndid[0] &= ~0x02;
1614 1614
1615 /* 1615 /*
1616 * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>: 1616 * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
1617 * check if generated address is not inappropriate 1617 * check if generated address is not inappropriate
1618 * 1618 *
1619 * - Reserved subnet anycast (RFC 2526) 1619 * - Reserved subnet anycast (RFC 2526)
1620 * 11111101 11....11 1xxxxxxx 1620 * 11111101 11....11 1xxxxxxx
1621 * - ISATAP (RFC4214) 6.1 1621 * - ISATAP (RFC4214) 6.1
1622 * 00-00-5E-FE-xx-xx-xx-xx 1622 * 00-00-5E-FE-xx-xx-xx-xx
1623 * - value 0 1623 * - value 0
1624 * - XXX: already assigned to an address on the device 1624 * - XXX: already assigned to an address on the device
1625 */ 1625 */
1626 if (idev->rndid[0] == 0xfd && 1626 if (idev->rndid[0] == 0xfd &&
1627 (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && 1627 (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
1628 (idev->rndid[7]&0x80)) 1628 (idev->rndid[7]&0x80))
1629 goto regen; 1629 goto regen;
1630 if ((idev->rndid[0]|idev->rndid[1]) == 0) { 1630 if ((idev->rndid[0]|idev->rndid[1]) == 0) {
1631 if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) 1631 if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
1632 goto regen; 1632 goto regen;
1633 if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) 1633 if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
1634 goto regen; 1634 goto regen;
1635 } 1635 }
1636 1636
1637 return 0; 1637 return 0;
1638 } 1638 }
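
Examples of identifiers the checks above send back to the regen label (annotation only):

	/*
	 *	fd:ff:ff:ff:ff:ff:ff:80		reserved subnet anycast pattern
	 *	00:00:5e:fe:c0:00:02:01		ISATAP-format identifier
	 *	00:00:00:00:00:00:00:00		all-zero value
	 */
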
1639 1639
1640 static void ipv6_regen_rndid(unsigned long data) 1640 static void ipv6_regen_rndid(unsigned long data)
1641 { 1641 {
1642 struct inet6_dev *idev = (struct inet6_dev *) data; 1642 struct inet6_dev *idev = (struct inet6_dev *) data;
1643 unsigned long expires; 1643 unsigned long expires;
1644 1644
1645 rcu_read_lock_bh(); 1645 rcu_read_lock_bh();
1646 write_lock_bh(&idev->lock); 1646 write_lock_bh(&idev->lock);
1647 1647
1648 if (idev->dead) 1648 if (idev->dead)
1649 goto out; 1649 goto out;
1650 1650
1651 if (__ipv6_regen_rndid(idev) < 0) 1651 if (__ipv6_regen_rndid(idev) < 0)
1652 goto out; 1652 goto out;
1653 1653
1654 expires = jiffies + 1654 expires = jiffies +
1655 idev->cnf.temp_prefered_lft * HZ - 1655 idev->cnf.temp_prefered_lft * HZ -
1656 idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - 1656 idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time -
1657 idev->cnf.max_desync_factor * HZ; 1657 idev->cnf.max_desync_factor * HZ;
1658 if (time_before(expires, jiffies)) { 1658 if (time_before(expires, jiffies)) {
1659 printk(KERN_WARNING 1659 printk(KERN_WARNING
1660 "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", 1660 "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n",
1661 idev->dev->name); 1661 idev->dev->name);
1662 goto out; 1662 goto out;
1663 } 1663 }
1664 1664
1665 if (!mod_timer(&idev->regen_timer, expires)) 1665 if (!mod_timer(&idev->regen_timer, expires))
1666 in6_dev_hold(idev); 1666 in6_dev_hold(idev);
1667 1667
1668 out: 1668 out:
1669 write_unlock_bh(&idev->lock); 1669 write_unlock_bh(&idev->lock);
1670 rcu_read_unlock_bh(); 1670 rcu_read_unlock_bh();
1671 in6_dev_put(idev); 1671 in6_dev_put(idev);
1672 } 1672 }
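
Worked arithmetic for the rearm interval above, assuming the same defaults as the ipv6_create_tempaddr() sketch earlier (temp_prefered_lft = 86400 s, regen_max_retry = 3, dad_transmits = 1, retrans_time = 1 s, max_desync_factor = 600 s):

	/*
	 *	expires - jiffies = (86400 - 3*1*1 - 600) * HZ = 85797 * HZ
	 *
	 * i.e. the timer refires roughly ten minutes (plus DAD slack)
	 * before the current temporary address stops being preferred.
	 */
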
1673 1673
1674 static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { 1674 static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) {
1675 int ret = 0; 1675 int ret = 0;
1676 1676
1677 if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) 1677 if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
1678 ret = __ipv6_regen_rndid(idev); 1678 ret = __ipv6_regen_rndid(idev);
1679 return ret; 1679 return ret;
1680 } 1680 }
1681 #endif 1681 #endif
1682 1682
1683 /* 1683 /*
1684 * Add prefix route. 1684 * Add prefix route.
1685 */ 1685 */
1686 1686
1687 static void 1687 static void
1688 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, 1688 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
1689 unsigned long expires, u32 flags) 1689 unsigned long expires, u32 flags)
1690 { 1690 {
1691 struct fib6_config cfg = { 1691 struct fib6_config cfg = {
1692 .fc_table = RT6_TABLE_PREFIX, 1692 .fc_table = RT6_TABLE_PREFIX,
1693 .fc_metric = IP6_RT_PRIO_ADDRCONF, 1693 .fc_metric = IP6_RT_PRIO_ADDRCONF,
1694 .fc_ifindex = dev->ifindex, 1694 .fc_ifindex = dev->ifindex,
1695 .fc_expires = expires, 1695 .fc_expires = expires,
1696 .fc_dst_len = plen, 1696 .fc_dst_len = plen,
1697 .fc_flags = RTF_UP | flags, 1697 .fc_flags = RTF_UP | flags,
1698 .fc_nlinfo.nl_net = dev_net(dev), 1698 .fc_nlinfo.nl_net = dev_net(dev),
1699 .fc_protocol = RTPROT_KERNEL, 1699 .fc_protocol = RTPROT_KERNEL,
1700 }; 1700 };
1701 1701
1702 ipv6_addr_copy(&cfg.fc_dst, pfx); 1702 ipv6_addr_copy(&cfg.fc_dst, pfx);
1703 1703
1704 /* Prevent useless cloning on PtP SIT. 1704 /* Prevent useless cloning on PtP SIT.
1705 This thing is done here expecting that the whole 1705 This thing is done here expecting that the whole
1706 	   class of non-broadcast devices does not need cloning. 1706 	   class of non-broadcast devices does not need cloning.
1707 */ 1707 */
1708 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 1708 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
1709 if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT)) 1709 if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT))
1710 cfg.fc_flags |= RTF_NONEXTHOP; 1710 cfg.fc_flags |= RTF_NONEXTHOP;
1711 #endif 1711 #endif
1712 1712
1713 ip6_route_add(&cfg); 1713 ip6_route_add(&cfg);
1714 } 1714 }
1715 1715
1716 /* Create "default" multicast route to the interface */ 1716 /* Create "default" multicast route to the interface */
1717 1717
1718 static void addrconf_add_mroute(struct net_device *dev) 1718 static void addrconf_add_mroute(struct net_device *dev)
1719 { 1719 {
1720 struct fib6_config cfg = { 1720 struct fib6_config cfg = {
1721 .fc_table = RT6_TABLE_LOCAL, 1721 .fc_table = RT6_TABLE_LOCAL,
1722 .fc_metric = IP6_RT_PRIO_ADDRCONF, 1722 .fc_metric = IP6_RT_PRIO_ADDRCONF,
1723 .fc_ifindex = dev->ifindex, 1723 .fc_ifindex = dev->ifindex,
1724 .fc_dst_len = 8, 1724 .fc_dst_len = 8,
1725 .fc_flags = RTF_UP, 1725 .fc_flags = RTF_UP,
1726 .fc_nlinfo.nl_net = dev_net(dev), 1726 .fc_nlinfo.nl_net = dev_net(dev),
1727 }; 1727 };
1728 1728
1729 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); 1729 ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
1730 1730
1731 ip6_route_add(&cfg); 1731 ip6_route_add(&cfg);
1732 } 1732 }
1733 1733
1734 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 1734 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
1735 static void sit_route_add(struct net_device *dev) 1735 static void sit_route_add(struct net_device *dev)
1736 { 1736 {
1737 struct fib6_config cfg = { 1737 struct fib6_config cfg = {
1738 .fc_table = RT6_TABLE_MAIN, 1738 .fc_table = RT6_TABLE_MAIN,
1739 .fc_metric = IP6_RT_PRIO_ADDRCONF, 1739 .fc_metric = IP6_RT_PRIO_ADDRCONF,
1740 .fc_ifindex = dev->ifindex, 1740 .fc_ifindex = dev->ifindex,
1741 .fc_dst_len = 96, 1741 .fc_dst_len = 96,
1742 .fc_flags = RTF_UP | RTF_NONEXTHOP, 1742 .fc_flags = RTF_UP | RTF_NONEXTHOP,
1743 .fc_nlinfo.nl_net = dev_net(dev), 1743 .fc_nlinfo.nl_net = dev_net(dev),
1744 }; 1744 };
1745 1745
1746 /* prefix length - 96 bits "::d.d.d.d" */ 1746 /* prefix length - 96 bits "::d.d.d.d" */
1747 ip6_route_add(&cfg); 1747 ip6_route_add(&cfg);
1748 } 1748 }
1749 #endif 1749 #endif
1750 1750
1751 static void addrconf_add_lroute(struct net_device *dev) 1751 static void addrconf_add_lroute(struct net_device *dev)
1752 { 1752 {
1753 struct in6_addr addr; 1753 struct in6_addr addr;
1754 1754
1755 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); 1755 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
1756 addrconf_prefix_route(&addr, 64, dev, 0, 0); 1756 addrconf_prefix_route(&addr, 64, dev, 0, 0);
1757 } 1757 }
1758 1758
1759 static struct inet6_dev *addrconf_add_dev(struct net_device *dev) 1759 static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
1760 { 1760 {
1761 struct inet6_dev *idev; 1761 struct inet6_dev *idev;
1762 1762
1763 ASSERT_RTNL(); 1763 ASSERT_RTNL();
1764 1764
1765 idev = ipv6_find_idev(dev); 1765 idev = ipv6_find_idev(dev);
1766 if (!idev) 1766 if (!idev)
1767 return ERR_PTR(-ENOBUFS); 1767 return ERR_PTR(-ENOBUFS);
1768 1768
1769 if (idev->cnf.disable_ipv6) 1769 if (idev->cnf.disable_ipv6)
1770 return ERR_PTR(-EACCES); 1770 return ERR_PTR(-EACCES);
1771 1771
1772 /* Add default multicast route */ 1772 /* Add default multicast route */
1773 addrconf_add_mroute(dev); 1773 addrconf_add_mroute(dev);
1774 1774
1775 /* Add link local route */ 1775 /* Add link local route */
1776 addrconf_add_lroute(dev); 1776 addrconf_add_lroute(dev);
1777 return idev; 1777 return idev;
1778 } 1778 }
1779 1779
1780 void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) 1780 void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
1781 { 1781 {
1782 struct prefix_info *pinfo; 1782 struct prefix_info *pinfo;
1783 __u32 valid_lft; 1783 __u32 valid_lft;
1784 __u32 prefered_lft; 1784 __u32 prefered_lft;
1785 int addr_type; 1785 int addr_type;
1786 struct inet6_dev *in6_dev; 1786 struct inet6_dev *in6_dev;
1787 struct net *net = dev_net(dev); 1787 struct net *net = dev_net(dev);
1788 1788
1789 pinfo = (struct prefix_info *) opt; 1789 pinfo = (struct prefix_info *) opt;
1790 1790
1791 if (len < sizeof(struct prefix_info)) { 1791 if (len < sizeof(struct prefix_info)) {
1792 ADBG(("addrconf: prefix option too short\n")); 1792 ADBG(("addrconf: prefix option too short\n"));
1793 return; 1793 return;
1794 } 1794 }
1795 1795
1796 /* 1796 /*
1797 * Validation checks ([ADDRCONF], page 19) 1797 * Validation checks ([ADDRCONF], page 19)
1798 */ 1798 */
1799 1799
1800 addr_type = ipv6_addr_type(&pinfo->prefix); 1800 addr_type = ipv6_addr_type(&pinfo->prefix);
1801 1801
1802 if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) 1802 if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL))
1803 return; 1803 return;
1804 1804
1805 valid_lft = ntohl(pinfo->valid); 1805 valid_lft = ntohl(pinfo->valid);
1806 prefered_lft = ntohl(pinfo->prefered); 1806 prefered_lft = ntohl(pinfo->prefered);
1807 1807
1808 if (prefered_lft > valid_lft) { 1808 if (prefered_lft > valid_lft) {
1809 if (net_ratelimit()) 1809 if (net_ratelimit())
1810 printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); 1810 printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n");
1811 return; 1811 return;
1812 } 1812 }
1813 1813
1814 in6_dev = in6_dev_get(dev); 1814 in6_dev = in6_dev_get(dev);
1815 1815
1816 if (in6_dev == NULL) { 1816 if (in6_dev == NULL) {
1817 if (net_ratelimit()) 1817 if (net_ratelimit())
1818 printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); 1818 printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name);
1819 return; 1819 return;
1820 } 1820 }
1821 1821
1822 /* 1822 /*
1823 * Two things going on here: 1823 * Two things going on here:
1824 * 1) Add routes for on-link prefixes 1824 * 1) Add routes for on-link prefixes
1825 * 2) Configure prefixes with the auto flag set 1825 * 2) Configure prefixes with the auto flag set
1826 */ 1826 */
1827 1827
1828 if (pinfo->onlink) { 1828 if (pinfo->onlink) {
1829 struct rt6_info *rt; 1829 struct rt6_info *rt;
1830 unsigned long rt_expires; 1830 unsigned long rt_expires;
1831 1831
1832 /* Avoid arithmetic overflow. Really, we could 1832 /* Avoid arithmetic overflow. Really, we could
1833 * save rt_expires in seconds, likely valid_lft, 1833 * save rt_expires in seconds, likely valid_lft,
1834 		 * save rt_expires in seconds, likely valid_lft, 1834 		 * save rt_expires in seconds, likely valid_lft,
1835 		 * but it would require division in fib gc, which is not good. 1835 		 * but it would require division in fib gc, which is not good.
1836 */ 1836 */
1837 if (HZ > USER_HZ) 1837 if (HZ > USER_HZ)
1838 rt_expires = addrconf_timeout_fixup(valid_lft, HZ); 1838 rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
1839 else 1839 else
1840 rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ); 1840 rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
1841 1841
1842 if (addrconf_finite_timeout(rt_expires)) 1842 if (addrconf_finite_timeout(rt_expires))
1843 rt_expires *= HZ; 1843 rt_expires *= HZ;
1844 1844
1845 rt = rt6_lookup(net, &pinfo->prefix, NULL, 1845 rt = rt6_lookup(net, &pinfo->prefix, NULL,
1846 dev->ifindex, 1); 1846 dev->ifindex, 1);
1847 1847
1848 if (rt && addrconf_is_prefix_route(rt)) { 1848 if (rt && addrconf_is_prefix_route(rt)) {
1849 /* Autoconf prefix route */ 1849 /* Autoconf prefix route */
1850 if (valid_lft == 0) { 1850 if (valid_lft == 0) {
1851 ip6_del_rt(rt); 1851 ip6_del_rt(rt);
1852 rt = NULL; 1852 rt = NULL;
1853 } else if (addrconf_finite_timeout(rt_expires)) { 1853 } else if (addrconf_finite_timeout(rt_expires)) {
1854 /* not infinity */ 1854 /* not infinity */
1855 rt->rt6i_expires = jiffies + rt_expires; 1855 rt->rt6i_expires = jiffies + rt_expires;
1856 rt->rt6i_flags |= RTF_EXPIRES; 1856 rt->rt6i_flags |= RTF_EXPIRES;
1857 } else { 1857 } else {
1858 rt->rt6i_flags &= ~RTF_EXPIRES; 1858 rt->rt6i_flags &= ~RTF_EXPIRES;
1859 rt->rt6i_expires = 0; 1859 rt->rt6i_expires = 0;
1860 } 1860 }
1861 } else if (valid_lft) { 1861 } else if (valid_lft) {
1862 clock_t expires = 0; 1862 clock_t expires = 0;
1863 int flags = RTF_ADDRCONF | RTF_PREFIX_RT; 1863 int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
1864 if (addrconf_finite_timeout(rt_expires)) { 1864 if (addrconf_finite_timeout(rt_expires)) {
1865 /* not infinity */ 1865 /* not infinity */
1866 flags |= RTF_EXPIRES; 1866 flags |= RTF_EXPIRES;
1867 expires = jiffies_to_clock_t(rt_expires); 1867 expires = jiffies_to_clock_t(rt_expires);
1868 } 1868 }
1869 addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, 1869 addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
1870 dev, expires, flags); 1870 dev, expires, flags);
1871 } 1871 }
1872 if (rt) 1872 if (rt)
1873 dst_release(&rt->dst); 1873 dst_release(&rt->dst);
1874 } 1874 }
1875 1875
1876 /* Try to figure out our local address for this prefix */ 1876 /* Try to figure out our local address for this prefix */
1877 1877
1878 if (pinfo->autoconf && in6_dev->cnf.autoconf) { 1878 if (pinfo->autoconf && in6_dev->cnf.autoconf) {
1879 struct inet6_ifaddr * ifp; 1879 struct inet6_ifaddr * ifp;
1880 struct in6_addr addr; 1880 struct in6_addr addr;
1881 int create = 0, update_lft = 0; 1881 int create = 0, update_lft = 0;
1882 1882
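		/* SLAAC (RFC 4862): only a /64 prefix can carry a 64-bit
		 * interface identifier.  The address is the advertised
		 * prefix plus an EUI-64 identifier derived from the
		 * device, falling back to one inherited from the inet6_dev.
		 */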
1883 if (pinfo->prefix_len == 64) { 1883 if (pinfo->prefix_len == 64) {
1884 memcpy(&addr, &pinfo->prefix, 8); 1884 memcpy(&addr, &pinfo->prefix, 8);
1885 if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && 1885 if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
1886 ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { 1886 ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
1887 in6_dev_put(in6_dev); 1887 in6_dev_put(in6_dev);
1888 return; 1888 return;
1889 } 1889 }
1890 goto ok; 1890 goto ok;
1891 } 1891 }
1892 if (net_ratelimit()) 1892 if (net_ratelimit())
1893 printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", 1893 printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n",
1894 pinfo->prefix_len); 1894 pinfo->prefix_len);
1895 in6_dev_put(in6_dev); 1895 in6_dev_put(in6_dev);
1896 return; 1896 return;
1897 1897
1898 ok: 1898 ok:
1899 1899
1900 ifp = ipv6_get_ifaddr(net, &addr, dev, 1); 1900 ifp = ipv6_get_ifaddr(net, &addr, dev, 1);
1901 1901
1902 if (ifp == NULL && valid_lft) { 1902 if (ifp == NULL && valid_lft) {
1903 int max_addresses = in6_dev->cnf.max_addresses; 1903 int max_addresses = in6_dev->cnf.max_addresses;
1904 u32 addr_flags = 0; 1904 u32 addr_flags = 0;
1905 1905
1906 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1906 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1907 if (in6_dev->cnf.optimistic_dad && 1907 if (in6_dev->cnf.optimistic_dad &&
1908 !net->ipv6.devconf_all->forwarding) 1908 !net->ipv6.devconf_all->forwarding)
1909 addr_flags = IFA_F_OPTIMISTIC; 1909 addr_flags = IFA_F_OPTIMISTIC;
1910 #endif 1910 #endif
1911 1911
 1912 /* Do not allow the creation of too many autoconfigured 1912 /* Do not allow the creation of too many autoconfigured
 1913 * addresses; that would be too easy a way to crash the kernel. 1913 * addresses; that would be too easy a way to crash the kernel.
1914 */ 1914 */
1915 if (!max_addresses || 1915 if (!max_addresses ||
1916 ipv6_count_addresses(in6_dev) < max_addresses) 1916 ipv6_count_addresses(in6_dev) < max_addresses)
1917 ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, 1917 ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len,
1918 addr_type&IPV6_ADDR_SCOPE_MASK, 1918 addr_type&IPV6_ADDR_SCOPE_MASK,
1919 addr_flags); 1919 addr_flags);
1920 1920
1921 if (!ifp || IS_ERR(ifp)) { 1921 if (!ifp || IS_ERR(ifp)) {
1922 in6_dev_put(in6_dev); 1922 in6_dev_put(in6_dev);
1923 return; 1923 return;
1924 } 1924 }
1925 1925
1926 update_lft = create = 1; 1926 update_lft = create = 1;
1927 ifp->cstamp = jiffies; 1927 ifp->cstamp = jiffies;
1928 addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); 1928 addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT);
1929 } 1929 }
1930 1930
1931 if (ifp) { 1931 if (ifp) {
1932 int flags; 1932 int flags;
1933 unsigned long now; 1933 unsigned long now;
1934 #ifdef CONFIG_IPV6_PRIVACY 1934 #ifdef CONFIG_IPV6_PRIVACY
1935 struct inet6_ifaddr *ift; 1935 struct inet6_ifaddr *ift;
1936 #endif 1936 #endif
1937 u32 stored_lft; 1937 u32 stored_lft;
1938 1938
1939 /* update lifetime (RFC2462 5.5.3 e) */ 1939 /* update lifetime (RFC2462 5.5.3 e) */
1940 spin_lock(&ifp->lock); 1940 spin_lock(&ifp->lock);
1941 now = jiffies; 1941 now = jiffies;
1942 if (ifp->valid_lft > (now - ifp->tstamp) / HZ) 1942 if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
1943 stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; 1943 stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
1944 else 1944 else
1945 stored_lft = 0; 1945 stored_lft = 0;
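			/* RFC 4862 5.5.3e "two hours" rule: accept the
			 * advertised valid lifetime if it exceeds
			 * MIN_VALID_LIFETIME (two hours) or the lifetime
			 * still remaining; smaller values are clamped so a
			 * spoofed RA cannot expire addresses prematurely.
			 */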
1946 if (!update_lft && stored_lft) { 1946 if (!update_lft && stored_lft) {
1947 if (valid_lft > MIN_VALID_LIFETIME || 1947 if (valid_lft > MIN_VALID_LIFETIME ||
1948 valid_lft > stored_lft) 1948 valid_lft > stored_lft)
1949 update_lft = 1; 1949 update_lft = 1;
1950 else if (stored_lft <= MIN_VALID_LIFETIME) { 1950 else if (stored_lft <= MIN_VALID_LIFETIME) {
1951 /* valid_lft <= stored_lft is always true */ 1951 /* valid_lft <= stored_lft is always true */
1952 /* 1952 /*
1953 * RFC 4862 Section 5.5.3e: 1953 * RFC 4862 Section 5.5.3e:
1954 * "Note that the preferred lifetime of 1954 * "Note that the preferred lifetime of
1955 * the corresponding address is always 1955 * the corresponding address is always
1956 * reset to the Preferred Lifetime in 1956 * reset to the Preferred Lifetime in
1957 * the received Prefix Information 1957 * the received Prefix Information
1958 * option, regardless of whether the 1958 * option, regardless of whether the
1959 * valid lifetime is also reset or 1959 * valid lifetime is also reset or
1960 * ignored." 1960 * ignored."
1961 * 1961 *
1962 * So if the preferred lifetime in 1962 * So if the preferred lifetime in
1963 * this advertisement is different 1963 * this advertisement is different
1964 * than what we have stored, but the 1964 * than what we have stored, but the
1965 * valid lifetime is invalid, just 1965 * valid lifetime is invalid, just
1966 * reset prefered_lft. 1966 * reset prefered_lft.
1967 * 1967 *
1968 * We must set the valid lifetime 1968 * We must set the valid lifetime
1969 * to the stored lifetime since we'll 1969 * to the stored lifetime since we'll
1970 * be updating the timestamp below, 1970 * be updating the timestamp below,
1971 * else we'll set it back to the 1971 * else we'll set it back to the
1972 * minimum. 1972 * minimum.
1973 */ 1973 */
1974 if (prefered_lft != ifp->prefered_lft) { 1974 if (prefered_lft != ifp->prefered_lft) {
1975 valid_lft = stored_lft; 1975 valid_lft = stored_lft;
1976 update_lft = 1; 1976 update_lft = 1;
1977 } 1977 }
1978 } else { 1978 } else {
1979 valid_lft = MIN_VALID_LIFETIME; 1979 valid_lft = MIN_VALID_LIFETIME;
1980 if (valid_lft < prefered_lft) 1980 if (valid_lft < prefered_lft)
1981 prefered_lft = valid_lft; 1981 prefered_lft = valid_lft;
1982 update_lft = 1; 1982 update_lft = 1;
1983 } 1983 }
1984 } 1984 }
1985 1985
1986 if (update_lft) { 1986 if (update_lft) {
1987 ifp->valid_lft = valid_lft; 1987 ifp->valid_lft = valid_lft;
1988 ifp->prefered_lft = prefered_lft; 1988 ifp->prefered_lft = prefered_lft;
1989 ifp->tstamp = now; 1989 ifp->tstamp = now;
1990 flags = ifp->flags; 1990 flags = ifp->flags;
1991 ifp->flags &= ~IFA_F_DEPRECATED; 1991 ifp->flags &= ~IFA_F_DEPRECATED;
1992 spin_unlock(&ifp->lock); 1992 spin_unlock(&ifp->lock);
1993 1993
1994 if (!(flags&IFA_F_TENTATIVE)) 1994 if (!(flags&IFA_F_TENTATIVE))
1995 ipv6_ifa_notify(0, ifp); 1995 ipv6_ifa_notify(0, ifp);
1996 } else 1996 } else
1997 spin_unlock(&ifp->lock); 1997 spin_unlock(&ifp->lock);
1998 1998
1999 #ifdef CONFIG_IPV6_PRIVACY 1999 #ifdef CONFIG_IPV6_PRIVACY
2000 read_lock_bh(&in6_dev->lock); 2000 read_lock_bh(&in6_dev->lock);
2001 /* update all temporary addresses in the list */ 2001 /* update all temporary addresses in the list */
2002 list_for_each_entry(ift, &in6_dev->tempaddr_list, tmp_list) { 2002 list_for_each_entry(ift, &in6_dev->tempaddr_list, tmp_list) {
2003 /* 2003 /*
2004 * When adjusting the lifetimes of an existing 2004 * When adjusting the lifetimes of an existing
2005 * temporary address, only lower the lifetimes. 2005 * temporary address, only lower the lifetimes.
2006 * Implementations must not increase the 2006 * Implementations must not increase the
2007 * lifetimes of an existing temporary address 2007 * lifetimes of an existing temporary address
2008 * when processing a Prefix Information Option. 2008 * when processing a Prefix Information Option.
2009 */ 2009 */
2010 if (ifp != ift->ifpub) 2010 if (ifp != ift->ifpub)
2011 continue; 2011 continue;
2012 2012
2013 spin_lock(&ift->lock); 2013 spin_lock(&ift->lock);
2014 flags = ift->flags; 2014 flags = ift->flags;
2015 if (ift->valid_lft > valid_lft && 2015 if (ift->valid_lft > valid_lft &&
2016 ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) 2016 ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ)
2017 ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; 2017 ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ;
2018 if (ift->prefered_lft > prefered_lft && 2018 if (ift->prefered_lft > prefered_lft &&
2019 ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) 2019 ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ)
2020 ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; 2020 ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ;
2021 spin_unlock(&ift->lock); 2021 spin_unlock(&ift->lock);
2022 if (!(flags&IFA_F_TENTATIVE)) 2022 if (!(flags&IFA_F_TENTATIVE))
2023 ipv6_ifa_notify(0, ift); 2023 ipv6_ifa_notify(0, ift);
2024 } 2024 }
2025 2025
2026 if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) { 2026 if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) {
2027 /* 2027 /*
2028 * When a new public address is created as described in [ADDRCONF], 2028 * When a new public address is created as described in [ADDRCONF],
2029 * also create a new temporary address. Also create a temporary 2029 * also create a new temporary address. Also create a temporary
2030 * address if it's enabled but no temporary address currently exists. 2030 * address if it's enabled but no temporary address currently exists.
2031 */ 2031 */
2032 read_unlock_bh(&in6_dev->lock); 2032 read_unlock_bh(&in6_dev->lock);
2033 ipv6_create_tempaddr(ifp, NULL); 2033 ipv6_create_tempaddr(ifp, NULL);
2034 } else { 2034 } else {
2035 read_unlock_bh(&in6_dev->lock); 2035 read_unlock_bh(&in6_dev->lock);
2036 } 2036 }
2037 #endif 2037 #endif
2038 in6_ifa_put(ifp); 2038 in6_ifa_put(ifp);
2039 addrconf_verify(0); 2039 addrconf_verify(0);
2040 } 2040 }
2041 } 2041 }
2042 inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); 2042 inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
2043 in6_dev_put(in6_dev); 2043 in6_dev_put(in6_dev);
2044 } 2044 }
2045 2045
2046 /* 2046 /*
2047 * Set destination address. 2047 * Set destination address.
2048 * Special case for SIT interfaces where we create a new "virtual" 2048 * Special case for SIT interfaces where we create a new "virtual"
2049 * device. 2049 * device.
2050 */ 2050 */
2051 int addrconf_set_dstaddr(struct net *net, void __user *arg) 2051 int addrconf_set_dstaddr(struct net *net, void __user *arg)
2052 { 2052 {
2053 struct in6_ifreq ireq; 2053 struct in6_ifreq ireq;
2054 struct net_device *dev; 2054 struct net_device *dev;
2055 int err = -EINVAL; 2055 int err = -EINVAL;
2056 2056
2057 rtnl_lock(); 2057 rtnl_lock();
2058 2058
2059 err = -EFAULT; 2059 err = -EFAULT;
2060 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) 2060 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
2061 goto err_exit; 2061 goto err_exit;
2062 2062
2063 dev = __dev_get_by_index(net, ireq.ifr6_ifindex); 2063 dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
2064 2064
2065 err = -ENODEV; 2065 err = -ENODEV;
2066 if (dev == NULL) 2066 if (dev == NULL)
2067 goto err_exit; 2067 goto err_exit;
2068 2068
2069 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 2069 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
2070 if (dev->type == ARPHRD_SIT) { 2070 if (dev->type == ARPHRD_SIT) {
2071 const struct net_device_ops *ops = dev->netdev_ops; 2071 const struct net_device_ops *ops = dev->netdev_ops;
2072 struct ifreq ifr; 2072 struct ifreq ifr;
2073 struct ip_tunnel_parm p; 2073 struct ip_tunnel_parm p;
2074 2074
2075 err = -EADDRNOTAVAIL; 2075 err = -EADDRNOTAVAIL;
2076 if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) 2076 if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
2077 goto err_exit; 2077 goto err_exit;
2078 2078
2079 memset(&p, 0, sizeof(p)); 2079 memset(&p, 0, sizeof(p));
2080 p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; 2080 p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
2081 p.iph.saddr = 0; 2081 p.iph.saddr = 0;
2082 p.iph.version = 4; 2082 p.iph.version = 4;
2083 p.iph.ihl = 5; 2083 p.iph.ihl = 5;
2084 p.iph.protocol = IPPROTO_IPV6; 2084 p.iph.protocol = IPPROTO_IPV6;
2085 p.iph.ttl = 64; 2085 p.iph.ttl = 64;
2086 ifr.ifr_ifru.ifru_data = (__force void __user *)&p; 2086 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
2087 2087
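		/* The tunnel ioctl expects a __user pointer, but &p is a
		 * kernel buffer; the temporary set_fs(KERNEL_DS) below
		 * widens the address-space check so the copy inside the
		 * handler accepts it.
		 */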
2088 if (ops->ndo_do_ioctl) { 2088 if (ops->ndo_do_ioctl) {
2089 mm_segment_t oldfs = get_fs(); 2089 mm_segment_t oldfs = get_fs();
2090 2090
2091 set_fs(KERNEL_DS); 2091 set_fs(KERNEL_DS);
2092 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 2092 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
2093 set_fs(oldfs); 2093 set_fs(oldfs);
2094 } else 2094 } else
2095 err = -EOPNOTSUPP; 2095 err = -EOPNOTSUPP;
2096 2096
2097 if (err == 0) { 2097 if (err == 0) {
2098 err = -ENOBUFS; 2098 err = -ENOBUFS;
2099 dev = __dev_get_by_name(net, p.name); 2099 dev = __dev_get_by_name(net, p.name);
2100 if (!dev) 2100 if (!dev)
2101 goto err_exit; 2101 goto err_exit;
2102 err = dev_open(dev); 2102 err = dev_open(dev);
2103 } 2103 }
2104 } 2104 }
2105 #endif 2105 #endif
2106 2106
2107 err_exit: 2107 err_exit:
2108 rtnl_unlock(); 2108 rtnl_unlock();
2109 return err; 2109 return err;
2110 } 2110 }
2111 2111
2112 /* 2112 /*
2113 * Manual configuration of address on an interface 2113 * Manual configuration of address on an interface
2114 */ 2114 */
2115 static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, 2115 static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx,
2116 unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, 2116 unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
2117 __u32 valid_lft) 2117 __u32 valid_lft)
2118 { 2118 {
2119 struct inet6_ifaddr *ifp; 2119 struct inet6_ifaddr *ifp;
2120 struct inet6_dev *idev; 2120 struct inet6_dev *idev;
2121 struct net_device *dev; 2121 struct net_device *dev;
2122 int scope; 2122 int scope;
2123 u32 flags; 2123 u32 flags;
2124 clock_t expires; 2124 clock_t expires;
2125 unsigned long timeout; 2125 unsigned long timeout;
2126 2126
2127 ASSERT_RTNL(); 2127 ASSERT_RTNL();
2128 2128
2129 if (plen > 128) 2129 if (plen > 128)
2130 return -EINVAL; 2130 return -EINVAL;
2131 2131
2132 /* check the lifetime */ 2132 /* check the lifetime */
2133 if (!valid_lft || prefered_lft > valid_lft) 2133 if (!valid_lft || prefered_lft > valid_lft)
2134 return -EINVAL; 2134 return -EINVAL;
2135 2135
2136 dev = __dev_get_by_index(net, ifindex); 2136 dev = __dev_get_by_index(net, ifindex);
2137 if (!dev) 2137 if (!dev)
2138 return -ENODEV; 2138 return -ENODEV;
2139 2139
2140 idev = addrconf_add_dev(dev); 2140 idev = addrconf_add_dev(dev);
2141 if (IS_ERR(idev)) 2141 if (IS_ERR(idev))
2142 return PTR_ERR(idev); 2142 return PTR_ERR(idev);
2143 2143
2144 scope = ipv6_addr_scope(pfx); 2144 scope = ipv6_addr_scope(pfx);
2145 2145
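	/* Map the requested lifetimes onto kernel state: a finite
	 * valid lifetime puts RTF_EXPIRES on the prefix route, an
	 * infinite one makes the address IFA_F_PERMANENT, and a
	 * preferred lifetime of zero deprecates it immediately.
	 */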
2146 timeout = addrconf_timeout_fixup(valid_lft, HZ); 2146 timeout = addrconf_timeout_fixup(valid_lft, HZ);
2147 if (addrconf_finite_timeout(timeout)) { 2147 if (addrconf_finite_timeout(timeout)) {
2148 expires = jiffies_to_clock_t(timeout * HZ); 2148 expires = jiffies_to_clock_t(timeout * HZ);
2149 valid_lft = timeout; 2149 valid_lft = timeout;
2150 flags = RTF_EXPIRES; 2150 flags = RTF_EXPIRES;
2151 } else { 2151 } else {
2152 expires = 0; 2152 expires = 0;
2153 flags = 0; 2153 flags = 0;
2154 ifa_flags |= IFA_F_PERMANENT; 2154 ifa_flags |= IFA_F_PERMANENT;
2155 } 2155 }
2156 2156
2157 timeout = addrconf_timeout_fixup(prefered_lft, HZ); 2157 timeout = addrconf_timeout_fixup(prefered_lft, HZ);
2158 if (addrconf_finite_timeout(timeout)) { 2158 if (addrconf_finite_timeout(timeout)) {
2159 if (timeout == 0) 2159 if (timeout == 0)
2160 ifa_flags |= IFA_F_DEPRECATED; 2160 ifa_flags |= IFA_F_DEPRECATED;
2161 prefered_lft = timeout; 2161 prefered_lft = timeout;
2162 } 2162 }
2163 2163
2164 ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); 2164 ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
2165 2165
2166 if (!IS_ERR(ifp)) { 2166 if (!IS_ERR(ifp)) {
2167 spin_lock_bh(&ifp->lock); 2167 spin_lock_bh(&ifp->lock);
2168 ifp->valid_lft = valid_lft; 2168 ifp->valid_lft = valid_lft;
2169 ifp->prefered_lft = prefered_lft; 2169 ifp->prefered_lft = prefered_lft;
2170 ifp->tstamp = jiffies; 2170 ifp->tstamp = jiffies;
2171 spin_unlock_bh(&ifp->lock); 2171 spin_unlock_bh(&ifp->lock);
2172 2172
2173 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 2173 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
2174 expires, flags); 2174 expires, flags);
2175 /* 2175 /*
2176 * Note that section 3.1 of RFC 4429 indicates 2176 * Note that section 3.1 of RFC 4429 indicates
2177 * that the Optimistic flag should not be set for 2177 * that the Optimistic flag should not be set for
2178 * manually configured addresses 2178 * manually configured addresses
2179 */ 2179 */
2180 addrconf_dad_start(ifp, 0); 2180 addrconf_dad_start(ifp, 0);
2181 in6_ifa_put(ifp); 2181 in6_ifa_put(ifp);
2182 addrconf_verify(0); 2182 addrconf_verify(0);
2183 return 0; 2183 return 0;
2184 } 2184 }
2185 2185
2186 return PTR_ERR(ifp); 2186 return PTR_ERR(ifp);
2187 } 2187 }
2188 2188
2189 static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx, 2189 static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx,
2190 unsigned int plen) 2190 unsigned int plen)
2191 { 2191 {
2192 struct inet6_ifaddr *ifp; 2192 struct inet6_ifaddr *ifp;
2193 struct inet6_dev *idev; 2193 struct inet6_dev *idev;
2194 struct net_device *dev; 2194 struct net_device *dev;
2195 2195
2196 if (plen > 128) 2196 if (plen > 128)
2197 return -EINVAL; 2197 return -EINVAL;
2198 2198
2199 dev = __dev_get_by_index(net, ifindex); 2199 dev = __dev_get_by_index(net, ifindex);
2200 if (!dev) 2200 if (!dev)
2201 return -ENODEV; 2201 return -ENODEV;
2202 2202
2203 if ((idev = __in6_dev_get(dev)) == NULL) 2203 if ((idev = __in6_dev_get(dev)) == NULL)
2204 return -ENXIO; 2204 return -ENXIO;
2205 2205
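	/* Hold a reference on the matching address before dropping
	 * the idev lock, so it stays valid across ipv6_del_addr().
	 */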
2206 read_lock_bh(&idev->lock); 2206 read_lock_bh(&idev->lock);
2207 list_for_each_entry(ifp, &idev->addr_list, if_list) { 2207 list_for_each_entry(ifp, &idev->addr_list, if_list) {
2208 if (ifp->prefix_len == plen && 2208 if (ifp->prefix_len == plen &&
2209 ipv6_addr_equal(pfx, &ifp->addr)) { 2209 ipv6_addr_equal(pfx, &ifp->addr)) {
2210 in6_ifa_hold(ifp); 2210 in6_ifa_hold(ifp);
2211 read_unlock_bh(&idev->lock); 2211 read_unlock_bh(&idev->lock);
2212 2212
2213 ipv6_del_addr(ifp); 2213 ipv6_del_addr(ifp);
2214 2214
2215 /* If the last address is deleted administratively, 2215 /* If the last address is deleted administratively,
2216 disable IPv6 on this interface. 2216 disable IPv6 on this interface.
2217 */ 2217 */
2218 if (list_empty(&idev->addr_list)) 2218 if (list_empty(&idev->addr_list))
2219 addrconf_ifdown(idev->dev, 1); 2219 addrconf_ifdown(idev->dev, 1);
2220 return 0; 2220 return 0;
2221 } 2221 }
2222 } 2222 }
2223 read_unlock_bh(&idev->lock); 2223 read_unlock_bh(&idev->lock);
2224 return -EADDRNOTAVAIL; 2224 return -EADDRNOTAVAIL;
2225 } 2225 }
2226 2226
2227 2227
2228 int addrconf_add_ifaddr(struct net *net, void __user *arg) 2228 int addrconf_add_ifaddr(struct net *net, void __user *arg)
2229 { 2229 {
2230 struct in6_ifreq ireq; 2230 struct in6_ifreq ireq;
2231 int err; 2231 int err;
2232 2232
2233 if (!capable(CAP_NET_ADMIN)) 2233 if (!capable(CAP_NET_ADMIN))
2234 return -EPERM; 2234 return -EPERM;
2235 2235
2236 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) 2236 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
2237 return -EFAULT; 2237 return -EFAULT;
2238 2238
2239 rtnl_lock(); 2239 rtnl_lock();
2240 err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, 2240 err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr,
2241 ireq.ifr6_prefixlen, IFA_F_PERMANENT, 2241 ireq.ifr6_prefixlen, IFA_F_PERMANENT,
2242 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); 2242 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
2243 rtnl_unlock(); 2243 rtnl_unlock();
2244 return err; 2244 return err;
2245 } 2245 }
2246 2246
2247 int addrconf_del_ifaddr(struct net *net, void __user *arg) 2247 int addrconf_del_ifaddr(struct net *net, void __user *arg)
2248 { 2248 {
2249 struct in6_ifreq ireq; 2249 struct in6_ifreq ireq;
2250 int err; 2250 int err;
2251 2251
2252 if (!capable(CAP_NET_ADMIN)) 2252 if (!capable(CAP_NET_ADMIN))
2253 return -EPERM; 2253 return -EPERM;
2254 2254
2255 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) 2255 if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
2256 return -EFAULT; 2256 return -EFAULT;
2257 2257
2258 rtnl_lock(); 2258 rtnl_lock();
2259 err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, 2259 err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr,
2260 ireq.ifr6_prefixlen); 2260 ireq.ifr6_prefixlen);
2261 rtnl_unlock(); 2261 rtnl_unlock();
2262 return err; 2262 return err;
2263 } 2263 }
2264 2264
2265 static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, 2265 static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
2266 int plen, int scope) 2266 int plen, int scope)
2267 { 2267 {
2268 struct inet6_ifaddr *ifp; 2268 struct inet6_ifaddr *ifp;
2269 2269
2270 ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT); 2270 ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT);
2271 if (!IS_ERR(ifp)) { 2271 if (!IS_ERR(ifp)) {
2272 spin_lock_bh(&ifp->lock); 2272 spin_lock_bh(&ifp->lock);
2273 ifp->flags &= ~IFA_F_TENTATIVE; 2273 ifp->flags &= ~IFA_F_TENTATIVE;
2274 spin_unlock_bh(&ifp->lock); 2274 spin_unlock_bh(&ifp->lock);
2275 ipv6_ifa_notify(RTM_NEWADDR, ifp); 2275 ipv6_ifa_notify(RTM_NEWADDR, ifp);
2276 in6_ifa_put(ifp); 2276 in6_ifa_put(ifp);
2277 } 2277 }
2278 } 2278 }
2279 2279
2280 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 2280 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
2281 static void sit_add_v4_addrs(struct inet6_dev *idev) 2281 static void sit_add_v4_addrs(struct inet6_dev *idev)
2282 { 2282 {
2283 struct in6_addr addr; 2283 struct in6_addr addr;
2284 struct net_device *dev; 2284 struct net_device *dev;
2285 struct net *net = dev_net(idev->dev); 2285 struct net *net = dev_net(idev->dev);
2286 int scope; 2286 int scope;
2287 2287
2288 ASSERT_RTNL(); 2288 ASSERT_RTNL();
2289 2289
2290 memset(&addr, 0, sizeof(struct in6_addr)); 2290 memset(&addr, 0, sizeof(struct in6_addr));
2291 memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); 2291 memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
2292 2292
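	/* Derive an IPv6 address from the tunnel's IPv4 endpoint:
	 * fe80::<v4-addr> with link scope on point-to-point links,
	 * otherwise an IPv4-compatible ::<v4-addr> address.
	 */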
2293 if (idev->dev->flags&IFF_POINTOPOINT) { 2293 if (idev->dev->flags&IFF_POINTOPOINT) {
2294 addr.s6_addr32[0] = htonl(0xfe800000); 2294 addr.s6_addr32[0] = htonl(0xfe800000);
2295 scope = IFA_LINK; 2295 scope = IFA_LINK;
2296 } else { 2296 } else {
2297 scope = IPV6_ADDR_COMPATv4; 2297 scope = IPV6_ADDR_COMPATv4;
2298 } 2298 }
2299 2299
2300 if (addr.s6_addr32[3]) { 2300 if (addr.s6_addr32[3]) {
2301 add_addr(idev, &addr, 128, scope); 2301 add_addr(idev, &addr, 128, scope);
2302 return; 2302 return;
2303 } 2303 }
2304 2304
2305 for_each_netdev(net, dev) { 2305 for_each_netdev(net, dev) {
2306 struct in_device * in_dev = __in_dev_get_rtnl(dev); 2306 struct in_device * in_dev = __in_dev_get_rtnl(dev);
2307 if (in_dev && (dev->flags & IFF_UP)) { 2307 if (in_dev && (dev->flags & IFF_UP)) {
2308 struct in_ifaddr * ifa; 2308 struct in_ifaddr * ifa;
2309 2309
2310 int flag = scope; 2310 int flag = scope;
2311 2311
2312 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { 2312 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
2313 int plen; 2313 int plen;
2314 2314
2315 addr.s6_addr32[3] = ifa->ifa_local; 2315 addr.s6_addr32[3] = ifa->ifa_local;
2316 2316
2317 if (ifa->ifa_scope == RT_SCOPE_LINK) 2317 if (ifa->ifa_scope == RT_SCOPE_LINK)
2318 continue; 2318 continue;
2319 if (ifa->ifa_scope >= RT_SCOPE_HOST) { 2319 if (ifa->ifa_scope >= RT_SCOPE_HOST) {
2320 if (idev->dev->flags&IFF_POINTOPOINT) 2320 if (idev->dev->flags&IFF_POINTOPOINT)
2321 continue; 2321 continue;
2322 flag |= IFA_HOST; 2322 flag |= IFA_HOST;
2323 } 2323 }
2324 if (idev->dev->flags&IFF_POINTOPOINT) 2324 if (idev->dev->flags&IFF_POINTOPOINT)
2325 plen = 64; 2325 plen = 64;
2326 else 2326 else
2327 plen = 96; 2327 plen = 96;
2328 2328
2329 add_addr(idev, &addr, plen, flag); 2329 add_addr(idev, &addr, plen, flag);
2330 } 2330 }
2331 } 2331 }
2332 } 2332 }
2333 } 2333 }
2334 #endif 2334 #endif
2335 2335
2336 static void init_loopback(struct net_device *dev) 2336 static void init_loopback(struct net_device *dev)
2337 { 2337 {
2338 struct inet6_dev *idev; 2338 struct inet6_dev *idev;
2339 2339
2340 /* ::1 */ 2340 /* ::1 */
2341 2341
2342 ASSERT_RTNL(); 2342 ASSERT_RTNL();
2343 2343
2344 if ((idev = ipv6_find_idev(dev)) == NULL) { 2344 if ((idev = ipv6_find_idev(dev)) == NULL) {
2345 printk(KERN_DEBUG "init loopback: add_dev failed\n"); 2345 printk(KERN_DEBUG "init loopback: add_dev failed\n");
2346 return; 2346 return;
2347 } 2347 }
2348 2348
2349 add_addr(idev, &in6addr_loopback, 128, IFA_HOST); 2349 add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
2350 } 2350 }
2351 2351
2352 static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) 2352 static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr)
2353 { 2353 {
2354 struct inet6_ifaddr * ifp; 2354 struct inet6_ifaddr * ifp;
2355 u32 addr_flags = IFA_F_PERMANENT; 2355 u32 addr_flags = IFA_F_PERMANENT;
2356 2356
2357 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 2357 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
2358 if (idev->cnf.optimistic_dad && 2358 if (idev->cnf.optimistic_dad &&
2359 !dev_net(idev->dev)->ipv6.devconf_all->forwarding) 2359 !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
2360 addr_flags |= IFA_F_OPTIMISTIC; 2360 addr_flags |= IFA_F_OPTIMISTIC;
2361 #endif 2361 #endif
2362 2362
2363 2363
2364 ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); 2364 ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags);
2365 if (!IS_ERR(ifp)) { 2365 if (!IS_ERR(ifp)) {
2366 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); 2366 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
2367 addrconf_dad_start(ifp, 0); 2367 addrconf_dad_start(ifp, 0);
2368 in6_ifa_put(ifp); 2368 in6_ifa_put(ifp);
2369 } 2369 }
2370 } 2370 }
2371 2371
2372 static void addrconf_dev_config(struct net_device *dev) 2372 static void addrconf_dev_config(struct net_device *dev)
2373 { 2373 {
2374 struct in6_addr addr; 2374 struct in6_addr addr;
2375 struct inet6_dev * idev; 2375 struct inet6_dev * idev;
2376 2376
2377 ASSERT_RTNL(); 2377 ASSERT_RTNL();
2378 2378
2379 if ((dev->type != ARPHRD_ETHER) && 2379 if ((dev->type != ARPHRD_ETHER) &&
2380 (dev->type != ARPHRD_FDDI) && 2380 (dev->type != ARPHRD_FDDI) &&
2381 (dev->type != ARPHRD_IEEE802_TR) && 2381 (dev->type != ARPHRD_IEEE802_TR) &&
2382 (dev->type != ARPHRD_ARCNET) && 2382 (dev->type != ARPHRD_ARCNET) &&
2383 (dev->type != ARPHRD_INFINIBAND)) { 2383 (dev->type != ARPHRD_INFINIBAND)) {
2384 /* Alas, we support only Ethernet autoconfiguration. */ 2384 /* Alas, we support only Ethernet autoconfiguration. */
2385 return; 2385 return;
2386 } 2386 }
2387 2387
2388 idev = addrconf_add_dev(dev); 2388 idev = addrconf_add_dev(dev);
2389 if (IS_ERR(idev)) 2389 if (IS_ERR(idev))
2390 return; 2390 return;
2391 2391
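	/* Build the link-local address: the fe80::/64 prefix plus an
	 * EUI-64 interface identifier derived from the hardware
	 * address.
	 */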
2392 memset(&addr, 0, sizeof(struct in6_addr)); 2392 memset(&addr, 0, sizeof(struct in6_addr));
2393 addr.s6_addr32[0] = htonl(0xFE800000); 2393 addr.s6_addr32[0] = htonl(0xFE800000);
2394 2394
2395 if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) 2395 if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0)
2396 addrconf_add_linklocal(idev, &addr); 2396 addrconf_add_linklocal(idev, &addr);
2397 } 2397 }
2398 2398
2399 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 2399 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
2400 static void addrconf_sit_config(struct net_device *dev) 2400 static void addrconf_sit_config(struct net_device *dev)
2401 { 2401 {
2402 struct inet6_dev *idev; 2402 struct inet6_dev *idev;
2403 2403
2404 ASSERT_RTNL(); 2404 ASSERT_RTNL();
2405 2405
2406 /* 2406 /*
2407 * Configure the tunnel with one of our IPv4 2407 * Configure the tunnel with one of our IPv4
2408 * addresses... we should configure all of 2408 * addresses... we should configure all of
2409 * our v4 addrs in the tunnel 2409 * our v4 addrs in the tunnel
2410 */ 2410 */
2411 2411
2412 if ((idev = ipv6_find_idev(dev)) == NULL) { 2412 if ((idev = ipv6_find_idev(dev)) == NULL) {
2413 printk(KERN_DEBUG "init sit: add_dev failed\n"); 2413 printk(KERN_DEBUG "init sit: add_dev failed\n");
2414 return; 2414 return;
2415 } 2415 }
2416 2416
2417 if (dev->priv_flags & IFF_ISATAP) { 2417 if (dev->priv_flags & IFF_ISATAP) {
2418 struct in6_addr addr; 2418 struct in6_addr addr;
2419 2419
2420 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); 2420 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
2421 addrconf_prefix_route(&addr, 64, dev, 0, 0); 2421 addrconf_prefix_route(&addr, 64, dev, 0, 0);
2422 if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) 2422 if (!ipv6_generate_eui64(addr.s6_addr + 8, dev))
2423 addrconf_add_linklocal(idev, &addr); 2423 addrconf_add_linklocal(idev, &addr);
2424 return; 2424 return;
2425 } 2425 }
2426 2426
2427 sit_add_v4_addrs(idev); 2427 sit_add_v4_addrs(idev);
2428 2428
2429 if (dev->flags&IFF_POINTOPOINT) { 2429 if (dev->flags&IFF_POINTOPOINT) {
2430 addrconf_add_mroute(dev); 2430 addrconf_add_mroute(dev);
2431 addrconf_add_lroute(dev); 2431 addrconf_add_lroute(dev);
2432 } else 2432 } else
2433 sit_route_add(dev); 2433 sit_route_add(dev);
2434 } 2434 }
2435 #endif 2435 #endif
2436 2436
2437 #if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE) 2437 #if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE)
2438 static void addrconf_gre_config(struct net_device *dev) 2438 static void addrconf_gre_config(struct net_device *dev)
2439 { 2439 {
2440 struct inet6_dev *idev; 2440 struct inet6_dev *idev;
2441 struct in6_addr addr; 2441 struct in6_addr addr;
2442 2442
2443 pr_info("ipv6: addrconf_gre_config(%s)\n", dev->name); 2443 pr_info("ipv6: addrconf_gre_config(%s)\n", dev->name);
2444 2444
2445 ASSERT_RTNL(); 2445 ASSERT_RTNL();
2446 2446
2447 if ((idev = ipv6_find_idev(dev)) == NULL) { 2447 if ((idev = ipv6_find_idev(dev)) == NULL) {
2448 printk(KERN_DEBUG "init gre: add_dev failed\n"); 2448 printk(KERN_DEBUG "init gre: add_dev failed\n");
2449 return; 2449 return;
2450 } 2450 }
2451 2451
2452 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); 2452 ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
2453 addrconf_prefix_route(&addr, 64, dev, 0, 0); 2453 addrconf_prefix_route(&addr, 64, dev, 0, 0);
2454 2454
2455 if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) 2455 if (!ipv6_generate_eui64(addr.s6_addr + 8, dev))
2456 addrconf_add_linklocal(idev, &addr); 2456 addrconf_add_linklocal(idev, &addr);
2457 } 2457 }
2458 #endif 2458 #endif
2459 2459
2460 static inline int 2460 static inline int
2461 ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) 2461 ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
2462 { 2462 {
2463 struct in6_addr lladdr; 2463 struct in6_addr lladdr;
2464 2464
2465 if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { 2465 if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) {
2466 addrconf_add_linklocal(idev, &lladdr); 2466 addrconf_add_linklocal(idev, &lladdr);
2467 return 0; 2467 return 0;
2468 } 2468 }
2469 return -1; 2469 return -1;
2470 } 2470 }
2471 2471
2472 static void ip6_tnl_add_linklocal(struct inet6_dev *idev) 2472 static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
2473 { 2473 {
2474 struct net_device *link_dev; 2474 struct net_device *link_dev;
2475 struct net *net = dev_net(idev->dev); 2475 struct net *net = dev_net(idev->dev);
2476 2476
2477 /* first try to inherit the link-local address from the link device */ 2477 /* first try to inherit the link-local address from the link device */
2478 if (idev->dev->iflink && 2478 if (idev->dev->iflink &&
2479 (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { 2479 (link_dev = __dev_get_by_index(net, idev->dev->iflink))) {
2480 if (!ipv6_inherit_linklocal(idev, link_dev)) 2480 if (!ipv6_inherit_linklocal(idev, link_dev))
2481 return; 2481 return;
2482 } 2482 }
2483 /* then try to inherit it from any device */ 2483 /* then try to inherit it from any device */
2484 for_each_netdev(net, link_dev) { 2484 for_each_netdev(net, link_dev) {
2485 if (!ipv6_inherit_linklocal(idev, link_dev)) 2485 if (!ipv6_inherit_linklocal(idev, link_dev))
2486 return; 2486 return;
2487 } 2487 }
2488 printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); 2488 printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n");
2489 } 2489 }
2490 2490
2491 /* 2491 /*
2492 * Autoconfigure tunnel with a link-local address so routing protocols, 2492 * Autoconfigure tunnel with a link-local address so routing protocols,
2493 * DHCPv6, MLD etc. can be run over the virtual link 2493 * DHCPv6, MLD etc. can be run over the virtual link
2494 */ 2494 */
2495 2495
2496 static void addrconf_ip6_tnl_config(struct net_device *dev) 2496 static void addrconf_ip6_tnl_config(struct net_device *dev)
2497 { 2497 {
2498 struct inet6_dev *idev; 2498 struct inet6_dev *idev;
2499 2499
2500 ASSERT_RTNL(); 2500 ASSERT_RTNL();
2501 2501
2502 idev = addrconf_add_dev(dev); 2502 idev = addrconf_add_dev(dev);
2503 if (IS_ERR(idev)) { 2503 if (IS_ERR(idev)) {
2504 printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); 2504 printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n");
2505 return; 2505 return;
2506 } 2506 }
2507 ip6_tnl_add_linklocal(idev); 2507 ip6_tnl_add_linklocal(idev);
2508 } 2508 }
2509 2509
2510 static int addrconf_notify(struct notifier_block *this, unsigned long event, 2510 static int addrconf_notify(struct notifier_block *this, unsigned long event,
2511 void * data) 2511 void * data)
2512 { 2512 {
2513 struct net_device *dev = (struct net_device *) data; 2513 struct net_device *dev = (struct net_device *) data;
2514 struct inet6_dev *idev = __in6_dev_get(dev); 2514 struct inet6_dev *idev = __in6_dev_get(dev);
2515 int run_pending = 0; 2515 int run_pending = 0;
2516 int err; 2516 int err;
2517 2517
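	/* React to netdevice lifecycle events: create the inet6_dev
	 * on REGISTER, (re)start autoconfiguration on UP/CHANGE once
	 * the link is ready, and tear the state down on
	 * DOWN/UNREGISTER.
	 */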
2518 switch (event) { 2518 switch (event) {
2519 case NETDEV_REGISTER: 2519 case NETDEV_REGISTER:
2520 if (!idev && dev->mtu >= IPV6_MIN_MTU) { 2520 if (!idev && dev->mtu >= IPV6_MIN_MTU) {
2521 idev = ipv6_add_dev(dev); 2521 idev = ipv6_add_dev(dev);
2522 if (!idev) 2522 if (!idev)
2523 return notifier_from_errno(-ENOMEM); 2523 return notifier_from_errno(-ENOMEM);
2524 } 2524 }
2525 break; 2525 break;
2526 2526
2527 case NETDEV_UP: 2527 case NETDEV_UP:
2528 case NETDEV_CHANGE: 2528 case NETDEV_CHANGE:
2529 if (dev->flags & IFF_SLAVE) 2529 if (dev->flags & IFF_SLAVE)
2530 break; 2530 break;
2531 2531
2532 if (event == NETDEV_UP) { 2532 if (event == NETDEV_UP) {
2533 if (!addrconf_qdisc_ok(dev)) { 2533 if (!addrconf_qdisc_ok(dev)) {
2534 /* device is not ready yet. */ 2534 /* device is not ready yet. */
2535 printk(KERN_INFO 2535 printk(KERN_INFO
2536 "ADDRCONF(NETDEV_UP): %s: " 2536 "ADDRCONF(NETDEV_UP): %s: "
2537 "link is not ready\n", 2537 "link is not ready\n",
2538 dev->name); 2538 dev->name);
2539 break; 2539 break;
2540 } 2540 }
2541 2541
2542 if (!idev && dev->mtu >= IPV6_MIN_MTU) 2542 if (!idev && dev->mtu >= IPV6_MIN_MTU)
2543 idev = ipv6_add_dev(dev); 2543 idev = ipv6_add_dev(dev);
2544 2544
2545 if (idev) { 2545 if (idev) {
2546 idev->if_flags |= IF_READY; 2546 idev->if_flags |= IF_READY;
2547 run_pending = 1; 2547 run_pending = 1;
2548 } 2548 }
2549 } else { 2549 } else {
2550 if (!addrconf_qdisc_ok(dev)) { 2550 if (!addrconf_qdisc_ok(dev)) {
2551 /* device is still not ready. */ 2551 /* device is still not ready. */
2552 break; 2552 break;
2553 } 2553 }
2554 2554
2555 if (idev) { 2555 if (idev) {
2556 if (idev->if_flags & IF_READY) 2556 if (idev->if_flags & IF_READY)
2557 /* device is already configured. */ 2557 /* device is already configured. */
2558 break; 2558 break;
2559 idev->if_flags |= IF_READY; 2559 idev->if_flags |= IF_READY;
2560 } 2560 }
2561 2561
2562 printk(KERN_INFO 2562 printk(KERN_INFO
2563 "ADDRCONF(NETDEV_CHANGE): %s: " 2563 "ADDRCONF(NETDEV_CHANGE): %s: "
2564 "link becomes ready\n", 2564 "link becomes ready\n",
2565 dev->name); 2565 dev->name);
2566 2566
2567 run_pending = 1; 2567 run_pending = 1;
2568 } 2568 }
2569 2569
2570 switch (dev->type) { 2570 switch (dev->type) {
2571 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) 2571 #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
2572 case ARPHRD_SIT: 2572 case ARPHRD_SIT:
2573 addrconf_sit_config(dev); 2573 addrconf_sit_config(dev);
2574 break; 2574 break;
2575 #endif 2575 #endif
2576 #if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE) 2576 #if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE)
2577 case ARPHRD_IPGRE: 2577 case ARPHRD_IPGRE:
2578 addrconf_gre_config(dev); 2578 addrconf_gre_config(dev);
2579 break; 2579 break;
2580 #endif 2580 #endif
2581 case ARPHRD_TUNNEL6: 2581 case ARPHRD_TUNNEL6:
2582 addrconf_ip6_tnl_config(dev); 2582 addrconf_ip6_tnl_config(dev);
2583 break; 2583 break;
2584 case ARPHRD_LOOPBACK: 2584 case ARPHRD_LOOPBACK:
2585 init_loopback(dev); 2585 init_loopback(dev);
2586 break; 2586 break;
2587 2587
2588 default: 2588 default:
2589 addrconf_dev_config(dev); 2589 addrconf_dev_config(dev);
2590 break; 2590 break;
2591 } 2591 }
2592 2592
2593 if (idev) { 2593 if (idev) {
2594 if (run_pending) 2594 if (run_pending)
2595 addrconf_dad_run(idev); 2595 addrconf_dad_run(idev);
2596 2596
2597 /* 2597 /*
 2598 * If the MTU changed while the interface was down, 2598 * If the MTU changed while the interface was down,
 2599 * the new value must be reflected in the idev as 2599 * the new value must be reflected in the idev as
 2600 * well as in the routes once it comes back up. 2600 * well as in the routes once it comes back up.
2601 */ 2601 */
2602 if (idev->cnf.mtu6 != dev->mtu && 2602 if (idev->cnf.mtu6 != dev->mtu &&
2603 dev->mtu >= IPV6_MIN_MTU) { 2603 dev->mtu >= IPV6_MIN_MTU) {
2604 rt6_mtu_change(dev, dev->mtu); 2604 rt6_mtu_change(dev, dev->mtu);
2605 idev->cnf.mtu6 = dev->mtu; 2605 idev->cnf.mtu6 = dev->mtu;
2606 } 2606 }
2607 idev->tstamp = jiffies; 2607 idev->tstamp = jiffies;
2608 inet6_ifinfo_notify(RTM_NEWLINK, idev); 2608 inet6_ifinfo_notify(RTM_NEWLINK, idev);
2609 2609
2610 /* 2610 /*
 2611 * If the MTU was changed below IPV6_MIN_MTU while 2611 * If the MTU was changed below IPV6_MIN_MTU while
 2612 * the interface was down, stop IPv6 on it. 2612 * the interface was down, stop IPv6 on it.
2613 */ 2613 */
2614 if (dev->mtu < IPV6_MIN_MTU) 2614 if (dev->mtu < IPV6_MIN_MTU)
2615 addrconf_ifdown(dev, 1); 2615 addrconf_ifdown(dev, 1);
2616 } 2616 }
2617 break; 2617 break;
2618 2618
2619 case NETDEV_CHANGEMTU: 2619 case NETDEV_CHANGEMTU:
2620 if (idev && dev->mtu >= IPV6_MIN_MTU) { 2620 if (idev && dev->mtu >= IPV6_MIN_MTU) {
2621 rt6_mtu_change(dev, dev->mtu); 2621 rt6_mtu_change(dev, dev->mtu);
2622 idev->cnf.mtu6 = dev->mtu; 2622 idev->cnf.mtu6 = dev->mtu;
2623 break; 2623 break;
2624 } 2624 }
2625 2625
2626 if (!idev && dev->mtu >= IPV6_MIN_MTU) { 2626 if (!idev && dev->mtu >= IPV6_MIN_MTU) {
2627 idev = ipv6_add_dev(dev); 2627 idev = ipv6_add_dev(dev);
2628 if (idev) 2628 if (idev)
2629 break; 2629 break;
2630 } 2630 }
2631 2631
2632 /* 2632 /*
 2633 * MTU fell below IPV6_MIN_MTU. 2633 * MTU fell below IPV6_MIN_MTU.
 2634 * Stop IPv6 on this interface (fall through). 2634 * Stop IPv6 on this interface (fall through).
2635 */ 2635 */
2636 2636
2637 case NETDEV_DOWN: 2637 case NETDEV_DOWN:
2638 case NETDEV_UNREGISTER: 2638 case NETDEV_UNREGISTER:
2639 /* 2639 /*
2640 * Remove all addresses from this interface. 2640 * Remove all addresses from this interface.
2641 */ 2641 */
2642 addrconf_ifdown(dev, event != NETDEV_DOWN); 2642 addrconf_ifdown(dev, event != NETDEV_DOWN);
2643 break; 2643 break;
2644 2644
2645 case NETDEV_CHANGENAME: 2645 case NETDEV_CHANGENAME:
2646 if (idev) { 2646 if (idev) {
2647 snmp6_unregister_dev(idev); 2647 snmp6_unregister_dev(idev);
2648 addrconf_sysctl_unregister(idev); 2648 addrconf_sysctl_unregister(idev);
2649 addrconf_sysctl_register(idev); 2649 addrconf_sysctl_register(idev);
2650 err = snmp6_register_dev(idev); 2650 err = snmp6_register_dev(idev);
2651 if (err) 2651 if (err)
2652 return notifier_from_errno(err); 2652 return notifier_from_errno(err);
2653 } 2653 }
2654 break; 2654 break;
2655 2655
2656 case NETDEV_PRE_TYPE_CHANGE: 2656 case NETDEV_PRE_TYPE_CHANGE:
2657 case NETDEV_POST_TYPE_CHANGE: 2657 case NETDEV_POST_TYPE_CHANGE:
2658 addrconf_type_change(dev, event); 2658 addrconf_type_change(dev, event);
2659 break; 2659 break;
2660 } 2660 }
2661 2661
2662 return NOTIFY_OK; 2662 return NOTIFY_OK;
2663 } 2663 }
2664 2664
2665 /* 2665 /*
2666 * addrconf module should be notified of a device going up 2666 * addrconf module should be notified of a device going up
2667 */ 2667 */
2668 static struct notifier_block ipv6_dev_notf = { 2668 static struct notifier_block ipv6_dev_notf = {
2669 .notifier_call = addrconf_notify, 2669 .notifier_call = addrconf_notify,
2670 }; 2670 };
2671 2671
2672 static void addrconf_type_change(struct net_device *dev, unsigned long event) 2672 static void addrconf_type_change(struct net_device *dev, unsigned long event)
2673 { 2673 {
2674 struct inet6_dev *idev; 2674 struct inet6_dev *idev;
2675 ASSERT_RTNL(); 2675 ASSERT_RTNL();
2676 2676
2677 idev = __in6_dev_get(dev); 2677 idev = __in6_dev_get(dev);
2678 2678
2679 if (event == NETDEV_POST_TYPE_CHANGE) 2679 if (event == NETDEV_POST_TYPE_CHANGE)
2680 ipv6_mc_remap(idev); 2680 ipv6_mc_remap(idev);
2681 else if (event == NETDEV_PRE_TYPE_CHANGE) 2681 else if (event == NETDEV_PRE_TYPE_CHANGE)
2682 ipv6_mc_unmap(idev); 2682 ipv6_mc_unmap(idev);
2683 } 2683 }
2684 2684
2685 static int addrconf_ifdown(struct net_device *dev, int how) 2685 static int addrconf_ifdown(struct net_device *dev, int how)
2686 { 2686 {
2687 struct net *net = dev_net(dev); 2687 struct net *net = dev_net(dev);
2688 struct inet6_dev *idev; 2688 struct inet6_dev *idev;
2689 struct inet6_ifaddr *ifa; 2689 struct inet6_ifaddr *ifa;
2690 int state, i; 2690 int state, i;
2691 2691
2692 ASSERT_RTNL(); 2692 ASSERT_RTNL();
2693 2693
2694 rt6_ifdown(net, dev); 2694 rt6_ifdown(net, dev);
2695 neigh_ifdown(&nd_tbl, dev); 2695 neigh_ifdown(&nd_tbl, dev);
2696 2696
2697 idev = __in6_dev_get(dev); 2697 idev = __in6_dev_get(dev);
2698 if (idev == NULL) 2698 if (idev == NULL)
2699 return -ENODEV; 2699 return -ENODEV;
2700 2700
2701 /* 2701 /*
2702 * Step 1: remove reference to ipv6 device from parent device. 2702 * Step 1: remove reference to ipv6 device from parent device.
2703 * Do not dev_put! 2703 * Do not dev_put!
2704 */ 2704 */
2705 if (how) { 2705 if (how) {
2706 idev->dead = 1; 2706 idev->dead = 1;
2707 2707
2708 /* protected by rtnl_lock */ 2708 /* protected by rtnl_lock */
2709 rcu_assign_pointer(dev->ip6_ptr, NULL); 2709 rcu_assign_pointer(dev->ip6_ptr, NULL);
2710 2710
2711 /* Step 1.5: remove snmp6 entry */ 2711 /* Step 1.5: remove snmp6 entry */
2712 snmp6_unregister_dev(idev); 2712 snmp6_unregister_dev(idev);
2713 2713
2714 } 2714 }
2715 2715
2716 /* Step 2: clear hash table */ 2716 /* Step 2: clear hash table */
2717 for (i = 0; i < IN6_ADDR_HSIZE; i++) { 2717 for (i = 0; i < IN6_ADDR_HSIZE; i++) {
2718 struct hlist_head *h = &inet6_addr_lst[i]; 2718 struct hlist_head *h = &inet6_addr_lst[i];
2719 struct hlist_node *n; 2719 struct hlist_node *n;
2720 2720
2721 spin_lock_bh(&addrconf_hash_lock); 2721 spin_lock_bh(&addrconf_hash_lock);
2722 restart: 2722 restart:
2723 hlist_for_each_entry_rcu(ifa, n, h, addr_lst) { 2723 hlist_for_each_entry_rcu(ifa, n, h, addr_lst) {
2724 if (ifa->idev == idev) { 2724 if (ifa->idev == idev) {
2725 hlist_del_init_rcu(&ifa->addr_lst); 2725 hlist_del_init_rcu(&ifa->addr_lst);
2726 addrconf_del_timer(ifa); 2726 addrconf_del_timer(ifa);
2727 goto restart; 2727 goto restart;
2728 } 2728 }
2729 } 2729 }
2730 spin_unlock_bh(&addrconf_hash_lock); 2730 spin_unlock_bh(&addrconf_hash_lock);
2731 } 2731 }
2732 2732
2733 write_lock_bh(&idev->lock); 2733 write_lock_bh(&idev->lock);
2734 2734
 2735 /* Step 3: clear flags for stateless addrconf */ 2735 /* Step 3: clear flags for stateless addrconf */
2736 if (!how) 2736 if (!how)
2737 idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); 2737 idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
2738 2738
2739 #ifdef CONFIG_IPV6_PRIVACY 2739 #ifdef CONFIG_IPV6_PRIVACY
2740 if (how && del_timer(&idev->regen_timer)) 2740 if (how && del_timer(&idev->regen_timer))
2741 in6_dev_put(idev); 2741 in6_dev_put(idev);
2742 2742
 2743 /* Step 4: clear tempaddr list */ 2743 /* Step 4: clear tempaddr list */
2744 while (!list_empty(&idev->tempaddr_list)) { 2744 while (!list_empty(&idev->tempaddr_list)) {
2745 ifa = list_first_entry(&idev->tempaddr_list, 2745 ifa = list_first_entry(&idev->tempaddr_list,
2746 struct inet6_ifaddr, tmp_list); 2746 struct inet6_ifaddr, tmp_list);
2747 list_del(&ifa->tmp_list); 2747 list_del(&ifa->tmp_list);
2748 write_unlock_bh(&idev->lock); 2748 write_unlock_bh(&idev->lock);
2749 spin_lock_bh(&ifa->lock); 2749 spin_lock_bh(&ifa->lock);
2750 2750
2751 if (ifa->ifpub) { 2751 if (ifa->ifpub) {
2752 in6_ifa_put(ifa->ifpub); 2752 in6_ifa_put(ifa->ifpub);
2753 ifa->ifpub = NULL; 2753 ifa->ifpub = NULL;
2754 } 2754 }
2755 spin_unlock_bh(&ifa->lock); 2755 spin_unlock_bh(&ifa->lock);
2756 in6_ifa_put(ifa); 2756 in6_ifa_put(ifa);
2757 write_lock_bh(&idev->lock); 2757 write_lock_bh(&idev->lock);
2758 } 2758 }
2759 #endif 2759 #endif
2760 2760
2761 while (!list_empty(&idev->addr_list)) { 2761 while (!list_empty(&idev->addr_list)) {
2762 ifa = list_first_entry(&idev->addr_list, 2762 ifa = list_first_entry(&idev->addr_list,
2763 struct inet6_ifaddr, if_list); 2763 struct inet6_ifaddr, if_list);
2764 addrconf_del_timer(ifa); 2764 addrconf_del_timer(ifa);
2765 2765
2766 list_del(&ifa->if_list); 2766 list_del(&ifa->if_list);
2767 2767
2768 write_unlock_bh(&idev->lock); 2768 write_unlock_bh(&idev->lock);
2769 2769
2770 spin_lock_bh(&ifa->state_lock); 2770 spin_lock_bh(&ifa->state_lock);
2771 state = ifa->state; 2771 state = ifa->state;
2772 ifa->state = INET6_IFADDR_STATE_DEAD; 2772 ifa->state = INET6_IFADDR_STATE_DEAD;
2773 spin_unlock_bh(&ifa->state_lock); 2773 spin_unlock_bh(&ifa->state_lock);
2774 2774
2775 if (state != INET6_IFADDR_STATE_DEAD) { 2775 if (state != INET6_IFADDR_STATE_DEAD) {
2776 __ipv6_ifa_notify(RTM_DELADDR, ifa); 2776 __ipv6_ifa_notify(RTM_DELADDR, ifa);
2777 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); 2777 atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
2778 } 2778 }
2779 in6_ifa_put(ifa); 2779 in6_ifa_put(ifa);
2780 2780
2781 write_lock_bh(&idev->lock); 2781 write_lock_bh(&idev->lock);
2782 } 2782 }
2783 2783
2784 write_unlock_bh(&idev->lock); 2784 write_unlock_bh(&idev->lock);
2785 2785
2786 /* Step 5: Discard multicast list */ 2786 /* Step 5: Discard multicast list */
2787 if (how) 2787 if (how)
2788 ipv6_mc_destroy_dev(idev); 2788 ipv6_mc_destroy_dev(idev);
2789 else 2789 else
2790 ipv6_mc_down(idev); 2790 ipv6_mc_down(idev);
2791 2791
2792 idev->tstamp = jiffies; 2792 idev->tstamp = jiffies;
2793 2793
 2794 /* Last: Shoot the device (if it was unregistered) */ 2794 /* Last: Shoot the device (if it was unregistered) */
2795 if (how) { 2795 if (how) {
2796 addrconf_sysctl_unregister(idev); 2796 addrconf_sysctl_unregister(idev);
2797 neigh_parms_release(&nd_tbl, idev->nd_parms); 2797 neigh_parms_release(&nd_tbl, idev->nd_parms);
2798 neigh_ifdown(&nd_tbl, dev); 2798 neigh_ifdown(&nd_tbl, dev);
2799 in6_dev_put(idev); 2799 in6_dev_put(idev);
2800 } 2800 }
2801 return 0; 2801 return 0;
2802 } 2802 }
2803 2803
2804 static void addrconf_rs_timer(unsigned long data) 2804 static void addrconf_rs_timer(unsigned long data)
2805 { 2805 {
2806 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; 2806 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
2807 struct inet6_dev *idev = ifp->idev; 2807 struct inet6_dev *idev = ifp->idev;
2808 2808
2809 read_lock(&idev->lock); 2809 read_lock(&idev->lock);
2810 if (idev->dead || !(idev->if_flags & IF_READY)) 2810 if (idev->dead || !(idev->if_flags & IF_READY))
2811 goto out; 2811 goto out;
2812 2812
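	/* Routers do not solicit routers: once forwarding is enabled,
	 * this interface must stop sending Router Solicitations
	 * (RFC 4861).
	 */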
2813 if (idev->cnf.forwarding) 2813 if (idev->cnf.forwarding)
2814 goto out; 2814 goto out;
2815 2815
2816 /* Announcement received after solicitation was sent */ 2816 /* Announcement received after solicitation was sent */
2817 if (idev->if_flags & IF_RA_RCVD) 2817 if (idev->if_flags & IF_RA_RCVD)
2818 goto out; 2818 goto out;
2819 2819
2820 spin_lock(&ifp->lock); 2820 spin_lock(&ifp->lock);
2821 if (ifp->probes++ < idev->cnf.rtr_solicits) { 2821 if (ifp->probes++ < idev->cnf.rtr_solicits) {
2822 /* The wait after the last probe can be shorter */ 2822 /* The wait after the last probe can be shorter */
2823 addrconf_mod_timer(ifp, AC_RS, 2823 addrconf_mod_timer(ifp, AC_RS,
2824 (ifp->probes == idev->cnf.rtr_solicits) ? 2824 (ifp->probes == idev->cnf.rtr_solicits) ?
2825 idev->cnf.rtr_solicit_delay : 2825 idev->cnf.rtr_solicit_delay :
2826 idev->cnf.rtr_solicit_interval); 2826 idev->cnf.rtr_solicit_interval);
2827 spin_unlock(&ifp->lock); 2827 spin_unlock(&ifp->lock);
2828 2828
2829 ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); 2829 ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters);
2830 } else { 2830 } else {
2831 spin_unlock(&ifp->lock); 2831 spin_unlock(&ifp->lock);
2832 /* 2832 /*
2833 * Note: we do not support deprecated "all on-link" 2833 * Note: we do not support deprecated "all on-link"
2834 * assumption any longer. 2834 * assumption any longer.
2835 */ 2835 */
2836 printk(KERN_DEBUG "%s: no IPv6 routers present\n", 2836 printk(KERN_DEBUG "%s: no IPv6 routers present\n",
2837 idev->dev->name); 2837 idev->dev->name);
2838 } 2838 }
2839 2839
2840 out: 2840 out:
2841 read_unlock(&idev->lock); 2841 read_unlock(&idev->lock);
2842 in6_ifa_put(ifp); 2842 in6_ifa_put(ifp);
2843 } 2843 }
2844 2844
2845 /* 2845 /*
2846 * Duplicate Address Detection 2846 * Duplicate Address Detection
2847 */ 2847 */
2848 static void addrconf_dad_kick(struct inet6_ifaddr *ifp) 2848 static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
2849 { 2849 {
2850 unsigned long rand_num; 2850 unsigned long rand_num;
2851 struct inet6_dev *idev = ifp->idev; 2851 struct inet6_dev *idev = ifp->idev;
2852 2852
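	/* RFC 4862: delay the first DAD probe by a random amount to
	 * desynchronise hosts booting simultaneously; optimistic
	 * addresses (RFC 4429) probe without delay.
	 */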
2853 if (ifp->flags & IFA_F_OPTIMISTIC) 2853 if (ifp->flags & IFA_F_OPTIMISTIC)
2854 rand_num = 0; 2854 rand_num = 0;
2855 else 2855 else
2856 rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); 2856 rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
2857 2857
2858 ifp->probes = idev->cnf.dad_transmits; 2858 ifp->probes = idev->cnf.dad_transmits;
2859 addrconf_mod_timer(ifp, AC_DAD, rand_num); 2859 addrconf_mod_timer(ifp, AC_DAD, rand_num);
2860 } 2860 }
2861 2861
2862 static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) 2862 static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
2863 { 2863 {
2864 struct inet6_dev *idev = ifp->idev; 2864 struct inet6_dev *idev = ifp->idev;
2865 struct net_device *dev = idev->dev; 2865 struct net_device *dev = idev->dev;
2866 2866
2867 addrconf_join_solict(dev, &ifp->addr); 2867 addrconf_join_solict(dev, &ifp->addr);
2868 2868
2869 net_srandom(ifp->addr.s6_addr32[3]); 2869 net_srandom(ifp->addr.s6_addr32[3]);
2870 2870
2871 read_lock_bh(&idev->lock); 2871 read_lock_bh(&idev->lock);
2872 spin_lock(&ifp->lock); 2872 spin_lock(&ifp->lock);
2873 if (ifp->state == INET6_IFADDR_STATE_DEAD) 2873 if (ifp->state == INET6_IFADDR_STATE_DEAD)
2874 goto out; 2874 goto out;
2875 2875
2876 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || 2876 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
2877 idev->cnf.accept_dad < 1 || 2877 idev->cnf.accept_dad < 1 ||
2878 !(ifp->flags&IFA_F_TENTATIVE) || 2878 !(ifp->flags&IFA_F_TENTATIVE) ||
2879 ifp->flags & IFA_F_NODAD) { 2879 ifp->flags & IFA_F_NODAD) {
2880 ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); 2880 ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
2881 spin_unlock(&ifp->lock); 2881 spin_unlock(&ifp->lock);
2882 read_unlock_bh(&idev->lock); 2882 read_unlock_bh(&idev->lock);
2883 2883
2884 addrconf_dad_completed(ifp); 2884 addrconf_dad_completed(ifp);
2885 return; 2885 return;
2886 } 2886 }
2887 2887
2888 if (!(idev->if_flags & IF_READY)) { 2888 if (!(idev->if_flags & IF_READY)) {
2889 spin_unlock(&ifp->lock); 2889 spin_unlock(&ifp->lock);
2890 read_unlock_bh(&idev->lock); 2890 read_unlock_bh(&idev->lock);
2891 /* 2891 /*
2892 * If the device is not ready: 2892 * If the device is not ready:
2893 * - keep it tentative if it is a permanent address. 2893 * - keep it tentative if it is a permanent address.
2894 * - otherwise, kill it. 2894 * - otherwise, kill it.
2895 */ 2895 */
2896 in6_ifa_hold(ifp); 2896 in6_ifa_hold(ifp);
2897 addrconf_dad_stop(ifp, 0); 2897 addrconf_dad_stop(ifp, 0);
2898 return; 2898 return;
2899 } 2899 }
2900 2900
2901 /* 2901 /*
2902 * Optimistic nodes can start receiving 2902 * Optimistic nodes can start receiving
 2903 * frames right away 2903 * frames right away
2904 */ 2904 */
2905 if (ifp->flags & IFA_F_OPTIMISTIC) 2905 if (ifp->flags & IFA_F_OPTIMISTIC)
2906 ip6_ins_rt(ifp->rt); 2906 ip6_ins_rt(ifp->rt);
2907 2907
2908 addrconf_dad_kick(ifp); 2908 addrconf_dad_kick(ifp);
2909 out: 2909 out:
2910 spin_unlock(&ifp->lock); 2910 spin_unlock(&ifp->lock);
2911 read_unlock_bh(&idev->lock); 2911 read_unlock_bh(&idev->lock);
2912 } 2912 }
2913 2913
2914 static void addrconf_dad_timer(unsigned long data) 2914 static void addrconf_dad_timer(unsigned long data)
2915 { 2915 {
2916 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; 2916 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
2917 struct inet6_dev *idev = ifp->idev; 2917 struct inet6_dev *idev = ifp->idev;
2918 struct in6_addr mcaddr; 2918 struct in6_addr mcaddr;
2919 2919
2920 if (!ifp->probes && addrconf_dad_end(ifp)) 2920 if (!ifp->probes && addrconf_dad_end(ifp))
2921 goto out; 2921 goto out;
2922 2922
2923 read_lock(&idev->lock); 2923 read_lock(&idev->lock);
2924 if (idev->dead || !(idev->if_flags & IF_READY)) { 2924 if (idev->dead || !(idev->if_flags & IF_READY)) {
2925 read_unlock(&idev->lock); 2925 read_unlock(&idev->lock);
2926 goto out; 2926 goto out;
2927 } 2927 }
2928 2928
2929 spin_lock(&ifp->lock); 2929 spin_lock(&ifp->lock);
2930 if (ifp->state == INET6_IFADDR_STATE_DEAD) { 2930 if (ifp->state == INET6_IFADDR_STATE_DEAD) {
2931 spin_unlock(&ifp->lock); 2931 spin_unlock(&ifp->lock);
2932 read_unlock(&idev->lock); 2932 read_unlock(&idev->lock);
2933 goto out; 2933 goto out;
2934 } 2934 }
2935 2935
2936 if (ifp->probes == 0) { 2936 if (ifp->probes == 0) {
2937 /* 2937 /*
2938 * DAD was successful 2938 * DAD was successful
2939 */ 2939 */
2940 2940
2941 ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); 2941 ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
2942 spin_unlock(&ifp->lock); 2942 spin_unlock(&ifp->lock);
2943 read_unlock(&idev->lock); 2943 read_unlock(&idev->lock);
2944 2944
2945 addrconf_dad_completed(ifp); 2945 addrconf_dad_completed(ifp);
2946 2946
2947 goto out; 2947 goto out;
2948 } 2948 }
2949 2949
2950 ifp->probes--; 2950 ifp->probes--;
2951 addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); 2951 addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time);
2952 spin_unlock(&ifp->lock); 2952 spin_unlock(&ifp->lock);
2953 read_unlock(&idev->lock); 2953 read_unlock(&idev->lock);
2954 2954
2955 /* send a neighbour solicitation for our addr */ 2955 /* send a neighbour solicitation for our addr */
2956 addrconf_addr_solict_mult(&ifp->addr, &mcaddr); 2956 addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
2957 ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); 2957 ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
2958 out: 2958 out:
2959 in6_ifa_put(ifp); 2959 in6_ifa_put(ifp);
2960 } 2960 }
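
The timer above counts ifp->probes down once per retrans_time; reaching zero with no conflicting answer means DAD succeeded. A minimal userspace sketch of that countdown, assuming three probes and a one-second interval (send_probe() and probes_left are illustrative stand-ins, not kernel symbols):

/* Sketch: DAD-style probe countdown, userspace approximation.
 * Assumes 3 probes and a 1s retransmit interval; in the kernel these
 * come from idev->cnf.dad_transmits and nd_parms->retrans_time.
 */
#include <stdio.h>
#include <unistd.h>

static void send_probe(int n)          /* stand-in for ndisc_send_ns() */
{
        printf("solicit #%d for tentative address\n", n);
}

int main(void)
{
        int probes_left = 3;           /* cnf.dad_transmits */

        while (probes_left > 0) {
                probes_left--;
                send_probe(3 - probes_left);
                sleep(1);              /* retrans_time */
                /* a received NS/NA for this address would abort DAD here */
        }
        printf("DAD succeeded: address leaves tentative state\n");
        return 0;
}
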
2961 2961
2962 static void addrconf_dad_completed(struct inet6_ifaddr *ifp) 2962 static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
2963 { 2963 {
2964 struct net_device *dev = ifp->idev->dev; 2964 struct net_device *dev = ifp->idev->dev;
2965 2965
2966 /* 2966 /*
2967 * Configure the address for reception. Now it is valid. 2967 * Configure the address for reception. Now it is valid.
2968 */ 2968 */
2969 2969
2970 ipv6_ifa_notify(RTM_NEWADDR, ifp); 2970 ipv6_ifa_notify(RTM_NEWADDR, ifp);
2971 2971
2972 /* If added prefix is link local and forwarding is off, 2972 /* If added prefix is link local and forwarding is off,
2973 start sending router solicitations. 2973 start sending router solicitations.
2974 */ 2974 */
2975 2975
2976 if ((ifp->idev->cnf.forwarding == 0 || 2976 if ((ifp->idev->cnf.forwarding == 0 ||
2977 ifp->idev->cnf.forwarding == 2) && 2977 ifp->idev->cnf.forwarding == 2) &&
2978 ifp->idev->cnf.rtr_solicits > 0 && 2978 ifp->idev->cnf.rtr_solicits > 0 &&
2979 (dev->flags&IFF_LOOPBACK) == 0 && 2979 (dev->flags&IFF_LOOPBACK) == 0 &&
2980 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { 2980 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
2981 /* 2981 /*
2982 * If a host has already performed a random delay 2982 * If a host has already performed a random delay
2983 * [...] as part of DAD [...] there is no need 2983 * [...] as part of DAD [...] there is no need
2984 * to delay again before sending the first RS 2984 * to delay again before sending the first RS
2985 */ 2985 */
2986 ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); 2986 ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters);
2987 2987
2988 spin_lock_bh(&ifp->lock); 2988 spin_lock_bh(&ifp->lock);
2989 ifp->probes = 1; 2989 ifp->probes = 1;
2990 ifp->idev->if_flags |= IF_RS_SENT; 2990 ifp->idev->if_flags |= IF_RS_SENT;
2991 addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); 2991 addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval);
2992 spin_unlock_bh(&ifp->lock); 2992 spin_unlock_bh(&ifp->lock);
2993 } 2993 }
2994 } 2994 }
2995 2995
2996 static void addrconf_dad_run(struct inet6_dev *idev) 2996 static void addrconf_dad_run(struct inet6_dev *idev)
2997 { 2997 {
2998 struct inet6_ifaddr *ifp; 2998 struct inet6_ifaddr *ifp;
2999 2999
3000 read_lock_bh(&idev->lock); 3000 read_lock_bh(&idev->lock);
3001 list_for_each_entry(ifp, &idev->addr_list, if_list) { 3001 list_for_each_entry(ifp, &idev->addr_list, if_list) {
3002 spin_lock(&ifp->lock); 3002 spin_lock(&ifp->lock);
3003 if (ifp->flags & IFA_F_TENTATIVE && 3003 if (ifp->flags & IFA_F_TENTATIVE &&
3004 ifp->state == INET6_IFADDR_STATE_DAD) 3004 ifp->state == INET6_IFADDR_STATE_DAD)
3005 addrconf_dad_kick(ifp); 3005 addrconf_dad_kick(ifp);
3006 spin_unlock(&ifp->lock); 3006 spin_unlock(&ifp->lock);
3007 } 3007 }
3008 read_unlock_bh(&idev->lock); 3008 read_unlock_bh(&idev->lock);
3009 } 3009 }
3010 3010
3011 #ifdef CONFIG_PROC_FS 3011 #ifdef CONFIG_PROC_FS
3012 struct if6_iter_state { 3012 struct if6_iter_state {
3013 struct seq_net_private p; 3013 struct seq_net_private p;
3014 int bucket; 3014 int bucket;
3015 }; 3015 };
3016 3016
3017 static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) 3017 static struct inet6_ifaddr *if6_get_first(struct seq_file *seq)
3018 { 3018 {
3019 struct inet6_ifaddr *ifa = NULL; 3019 struct inet6_ifaddr *ifa = NULL;
3020 struct if6_iter_state *state = seq->private; 3020 struct if6_iter_state *state = seq->private;
3021 struct net *net = seq_file_net(seq); 3021 struct net *net = seq_file_net(seq);
3022 3022
3023 for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { 3023 for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
3024 struct hlist_node *n; 3024 struct hlist_node *n;
3025 hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], 3025 hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket],
3026 addr_lst) 3026 addr_lst)
3027 if (net_eq(dev_net(ifa->idev->dev), net)) 3027 if (net_eq(dev_net(ifa->idev->dev), net))
3028 return ifa; 3028 return ifa;
3029 } 3029 }
3030 return NULL; 3030 return NULL;
3031 } 3031 }
3032 3032
3033 static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, 3033 static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
3034 struct inet6_ifaddr *ifa) 3034 struct inet6_ifaddr *ifa)
3035 { 3035 {
3036 struct if6_iter_state *state = seq->private; 3036 struct if6_iter_state *state = seq->private;
3037 struct net *net = seq_file_net(seq); 3037 struct net *net = seq_file_net(seq);
3038 struct hlist_node *n = &ifa->addr_lst; 3038 struct hlist_node *n = &ifa->addr_lst;
3039 3039
3040 hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) 3040 hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst)
3041 if (net_eq(dev_net(ifa->idev->dev), net)) 3041 if (net_eq(dev_net(ifa->idev->dev), net))
3042 return ifa; 3042 return ifa;
3043 3043
3044 while (++state->bucket < IN6_ADDR_HSIZE) { 3044 while (++state->bucket < IN6_ADDR_HSIZE) {
3045 hlist_for_each_entry_rcu_bh(ifa, n, 3045 hlist_for_each_entry_rcu_bh(ifa, n,
3046 &inet6_addr_lst[state->bucket], addr_lst) { 3046 &inet6_addr_lst[state->bucket], addr_lst) {
3047 if (net_eq(dev_net(ifa->idev->dev), net)) 3047 if (net_eq(dev_net(ifa->idev->dev), net))
3048 return ifa; 3048 return ifa;
3049 } 3049 }
3050 } 3050 }
3051 3051
3052 return NULL; 3052 return NULL;
3053 } 3053 }
3054 3054
3055 static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) 3055 static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos)
3056 { 3056 {
3057 struct inet6_ifaddr *ifa = if6_get_first(seq); 3057 struct inet6_ifaddr *ifa = if6_get_first(seq);
3058 3058
3059 if (ifa) 3059 if (ifa)
3060 while (pos && (ifa = if6_get_next(seq, ifa)) != NULL) 3060 while (pos && (ifa = if6_get_next(seq, ifa)) != NULL)
3061 --pos; 3061 --pos;
3062 return pos ? NULL : ifa; 3062 return pos ? NULL : ifa;
3063 } 3063 }
3064 3064
3065 static void *if6_seq_start(struct seq_file *seq, loff_t *pos) 3065 static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
3066 __acquires(rcu_bh) 3066 __acquires(rcu_bh)
3067 { 3067 {
3068 rcu_read_lock_bh(); 3068 rcu_read_lock_bh();
3069 return if6_get_idx(seq, *pos); 3069 return if6_get_idx(seq, *pos);
3070 } 3070 }
3071 3071
3072 static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3072 static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3073 { 3073 {
3074 struct inet6_ifaddr *ifa; 3074 struct inet6_ifaddr *ifa;
3075 3075
3076 ifa = if6_get_next(seq, v); 3076 ifa = if6_get_next(seq, v);
3077 ++*pos; 3077 ++*pos;
3078 return ifa; 3078 return ifa;
3079 } 3079 }
3080 3080
3081 static void if6_seq_stop(struct seq_file *seq, void *v) 3081 static void if6_seq_stop(struct seq_file *seq, void *v)
3082 __releases(rcu_bh) 3082 __releases(rcu_bh)
3083 { 3083 {
3084 rcu_read_unlock_bh(); 3084 rcu_read_unlock_bh();
3085 } 3085 }
3086 3086
3087 static int if6_seq_show(struct seq_file *seq, void *v) 3087 static int if6_seq_show(struct seq_file *seq, void *v)
3088 { 3088 {
3089 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; 3089 struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
3090 seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", 3090 seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
3091 &ifp->addr, 3091 &ifp->addr,
3092 ifp->idev->dev->ifindex, 3092 ifp->idev->dev->ifindex,
3093 ifp->prefix_len, 3093 ifp->prefix_len,
3094 ifp->scope, 3094 ifp->scope,
3095 ifp->flags, 3095 ifp->flags,
3096 ifp->idev->dev->name); 3096 ifp->idev->dev->name);
3097 return 0; 3097 return 0;
3098 } 3098 }
3099 3099
3100 static const struct seq_operations if6_seq_ops = { 3100 static const struct seq_operations if6_seq_ops = {
3101 .start = if6_seq_start, 3101 .start = if6_seq_start,
3102 .next = if6_seq_next, 3102 .next = if6_seq_next,
3103 .show = if6_seq_show, 3103 .show = if6_seq_show,
3104 .stop = if6_seq_stop, 3104 .stop = if6_seq_stop,
3105 }; 3105 };
3106 3106
3107 static int if6_seq_open(struct inode *inode, struct file *file) 3107 static int if6_seq_open(struct inode *inode, struct file *file)
3108 { 3108 {
3109 return seq_open_net(inode, file, &if6_seq_ops, 3109 return seq_open_net(inode, file, &if6_seq_ops,
3110 sizeof(struct if6_iter_state)); 3110 sizeof(struct if6_iter_state));
3111 } 3111 }
3112 3112
3113 static const struct file_operations if6_fops = { 3113 static const struct file_operations if6_fops = {
3114 .owner = THIS_MODULE, 3114 .owner = THIS_MODULE,
3115 .open = if6_seq_open, 3115 .open = if6_seq_open,
3116 .read = seq_read, 3116 .read = seq_read,
3117 .llseek = seq_lseek, 3117 .llseek = seq_lseek,
3118 .release = seq_release_net, 3118 .release = seq_release_net,
3119 }; 3119 };
3120 3120
3121 static int __net_init if6_proc_net_init(struct net *net) 3121 static int __net_init if6_proc_net_init(struct net *net)
3122 { 3122 {
3123 if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) 3123 if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops))
3124 return -ENOMEM; 3124 return -ENOMEM;
3125 return 0; 3125 return 0;
3126 } 3126 }
3127 3127
3128 static void __net_exit if6_proc_net_exit(struct net *net) 3128 static void __net_exit if6_proc_net_exit(struct net *net)
3129 { 3129 {
3130 proc_net_remove(net, "if_inet6"); 3130 proc_net_remove(net, "if_inet6");
3131 } 3131 }
3132 3132
3133 static struct pernet_operations if6_proc_net_ops = { 3133 static struct pernet_operations if6_proc_net_ops = {
3134 .init = if6_proc_net_init, 3134 .init = if6_proc_net_init,
3135 .exit = if6_proc_net_exit, 3135 .exit = if6_proc_net_exit,
3136 }; 3136 };
3137 3137
3138 int __init if6_proc_init(void) 3138 int __init if6_proc_init(void)
3139 { 3139 {
3140 return register_pernet_subsys(&if6_proc_net_ops); 3140 return register_pernet_subsys(&if6_proc_net_ops);
3141 } 3141 }
3142 3142
3143 void if6_proc_exit(void) 3143 void if6_proc_exit(void)
3144 { 3144 {
3145 unregister_pernet_subsys(&if6_proc_net_ops); 3145 unregister_pernet_subsys(&if6_proc_net_ops);
3146 } 3146 }
3147 #endif /* CONFIG_PROC_FS */ 3147 #endif /* CONFIG_PROC_FS */
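
Because if6_seq_show() prints each address with %pi6 (32 hex digits, no colons) followed by four fixed-width hex fields and the device name, /proc/net/if_inet6 can be consumed with one fscanf() per line. A small reader sketch, assuming that layout:

/* Sketch: parse /proc/net/if_inet6 as written by if6_seq_show(). */
#include <stdio.h>

int main(void)
{
        char addr[40], dev[32];
        unsigned int ifindex, plen, scope, flags;
        FILE *f = fopen("/proc/net/if_inet6", "r");

        if (!f)
                return 1;
        while (fscanf(f, "%39s %x %x %x %x %31s",
                      addr, &ifindex, &plen, &scope, &flags, dev) == 6)
                printf("%s/%u on %s (flags %02x)\n", addr, plen, dev, flags);
        fclose(f);
        return 0;
}
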
3148 3148
3149 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) 3149 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
3150 /* Check if address is a home address configured on any interface. */ 3150 /* Check if address is a home address configured on any interface. */
3151 int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) 3151 int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
3152 { 3152 {
3153 int ret = 0; 3153 int ret = 0;
3154 struct inet6_ifaddr *ifp = NULL; 3154 struct inet6_ifaddr *ifp = NULL;
3155 struct hlist_node *n; 3155 struct hlist_node *n;
3156 unsigned int hash = ipv6_addr_hash(addr); 3156 unsigned int hash = ipv6_addr_hash(addr);
3157 3157
3158 rcu_read_lock_bh(); 3158 rcu_read_lock_bh();
3159 hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) { 3159 hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) {
3160 if (!net_eq(dev_net(ifp->idev->dev), net)) 3160 if (!net_eq(dev_net(ifp->idev->dev), net))
3161 continue; 3161 continue;
3162 if (ipv6_addr_equal(&ifp->addr, addr) && 3162 if (ipv6_addr_equal(&ifp->addr, addr) &&
3163 (ifp->flags & IFA_F_HOMEADDRESS)) { 3163 (ifp->flags & IFA_F_HOMEADDRESS)) {
3164 ret = 1; 3164 ret = 1;
3165 break; 3165 break;
3166 } 3166 }
3167 } 3167 }
3168 rcu_read_unlock_bh(); 3168 rcu_read_unlock_bh();
3169 return ret; 3169 return ret;
3170 } 3170 }
3171 #endif 3171 #endif
3172 3172
3173 /* 3173 /*
3174 * Periodic address status verification 3174 * Periodic address status verification
3175 */ 3175 */
3176 3176
3177 static void addrconf_verify(unsigned long foo) 3177 static void addrconf_verify(unsigned long foo)
3178 { 3178 {
3179 unsigned long now, next, next_sec, next_sched; 3179 unsigned long now, next, next_sec, next_sched;
3180 struct inet6_ifaddr *ifp; 3180 struct inet6_ifaddr *ifp;
3181 struct hlist_node *node; 3181 struct hlist_node *node;
3182 int i; 3182 int i;
3183 3183
3184 rcu_read_lock_bh(); 3184 rcu_read_lock_bh();
3185 spin_lock(&addrconf_verify_lock); 3185 spin_lock(&addrconf_verify_lock);
3186 now = jiffies; 3186 now = jiffies;
3187 next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); 3187 next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
3188 3188
3189 del_timer(&addr_chk_timer); 3189 del_timer(&addr_chk_timer);
3190 3190
3191 for (i = 0; i < IN6_ADDR_HSIZE; i++) { 3191 for (i = 0; i < IN6_ADDR_HSIZE; i++) {
3192 restart: 3192 restart:
3193 hlist_for_each_entry_rcu_bh(ifp, node, 3193 hlist_for_each_entry_rcu_bh(ifp, node,
3194 &inet6_addr_lst[i], addr_lst) { 3194 &inet6_addr_lst[i], addr_lst) {
3195 unsigned long age; 3195 unsigned long age;
3196 3196
3197 if (ifp->flags & IFA_F_PERMANENT) 3197 if (ifp->flags & IFA_F_PERMANENT)
3198 continue; 3198 continue;
3199 3199
3200 spin_lock(&ifp->lock); 3200 spin_lock(&ifp->lock);
3201 /* We try to batch several events at once. */ 3201 /* We try to batch several events at once. */
3202 age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; 3202 age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
3203 3203
3204 if (ifp->valid_lft != INFINITY_LIFE_TIME && 3204 if (ifp->valid_lft != INFINITY_LIFE_TIME &&
3205 age >= ifp->valid_lft) { 3205 age >= ifp->valid_lft) {
3206 spin_unlock(&ifp->lock); 3206 spin_unlock(&ifp->lock);
3207 in6_ifa_hold(ifp); 3207 in6_ifa_hold(ifp);
3208 ipv6_del_addr(ifp); 3208 ipv6_del_addr(ifp);
3209 goto restart; 3209 goto restart;
3210 } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { 3210 } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
3211 spin_unlock(&ifp->lock); 3211 spin_unlock(&ifp->lock);
3212 continue; 3212 continue;
3213 } else if (age >= ifp->prefered_lft) { 3213 } else if (age >= ifp->prefered_lft) {
3214 /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ 3214 /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
3215 int deprecate = 0; 3215 int deprecate = 0;
3216 3216
3217 if (!(ifp->flags&IFA_F_DEPRECATED)) { 3217 if (!(ifp->flags&IFA_F_DEPRECATED)) {
3218 deprecate = 1; 3218 deprecate = 1;
3219 ifp->flags |= IFA_F_DEPRECATED; 3219 ifp->flags |= IFA_F_DEPRECATED;
3220 } 3220 }
3221 3221
3222 if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) 3222 if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))
3223 next = ifp->tstamp + ifp->valid_lft * HZ; 3223 next = ifp->tstamp + ifp->valid_lft * HZ;
3224 3224
3225 spin_unlock(&ifp->lock); 3225 spin_unlock(&ifp->lock);
3226 3226
3227 if (deprecate) { 3227 if (deprecate) {
3228 in6_ifa_hold(ifp); 3228 in6_ifa_hold(ifp);
3229 3229
3230 ipv6_ifa_notify(0, ifp); 3230 ipv6_ifa_notify(0, ifp);
3231 in6_ifa_put(ifp); 3231 in6_ifa_put(ifp);
3232 goto restart; 3232 goto restart;
3233 } 3233 }
3234 #ifdef CONFIG_IPV6_PRIVACY 3234 #ifdef CONFIG_IPV6_PRIVACY
3235 } else if ((ifp->flags&IFA_F_TEMPORARY) && 3235 } else if ((ifp->flags&IFA_F_TEMPORARY) &&
3236 !(ifp->flags&IFA_F_TENTATIVE)) { 3236 !(ifp->flags&IFA_F_TENTATIVE)) {
3237 unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * 3237 unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
3238 ifp->idev->cnf.dad_transmits * 3238 ifp->idev->cnf.dad_transmits *
3239 ifp->idev->nd_parms->retrans_time / HZ; 3239 ifp->idev->nd_parms->retrans_time / HZ;
3240 3240
3241 if (age >= ifp->prefered_lft - regen_advance) { 3241 if (age >= ifp->prefered_lft - regen_advance) {
3242 struct inet6_ifaddr *ifpub = ifp->ifpub; 3242 struct inet6_ifaddr *ifpub = ifp->ifpub;
3243 if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) 3243 if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
3244 next = ifp->tstamp + ifp->prefered_lft * HZ; 3244 next = ifp->tstamp + ifp->prefered_lft * HZ;
3245 if (!ifp->regen_count && ifpub) { 3245 if (!ifp->regen_count && ifpub) {
3246 ifp->regen_count++; 3246 ifp->regen_count++;
3247 in6_ifa_hold(ifp); 3247 in6_ifa_hold(ifp);
3248 in6_ifa_hold(ifpub); 3248 in6_ifa_hold(ifpub);
3249 spin_unlock(&ifp->lock); 3249 spin_unlock(&ifp->lock);
3250 3250
3251 spin_lock(&ifpub->lock); 3251 spin_lock(&ifpub->lock);
3252 ifpub->regen_count = 0; 3252 ifpub->regen_count = 0;
3253 spin_unlock(&ifpub->lock); 3253 spin_unlock(&ifpub->lock);
3254 ipv6_create_tempaddr(ifpub, ifp); 3254 ipv6_create_tempaddr(ifpub, ifp);
3255 in6_ifa_put(ifpub); 3255 in6_ifa_put(ifpub);
3256 in6_ifa_put(ifp); 3256 in6_ifa_put(ifp);
3257 goto restart; 3257 goto restart;
3258 } 3258 }
3259 } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) 3259 } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
3260 next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; 3260 next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
3261 spin_unlock(&ifp->lock); 3261 spin_unlock(&ifp->lock);
3262 #endif 3262 #endif
3263 } else { 3263 } else {
3264 /* ifp->prefered_lft <= ifp->valid_lft */ 3264 /* ifp->prefered_lft <= ifp->valid_lft */
3265 if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) 3265 if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
3266 next = ifp->tstamp + ifp->prefered_lft * HZ; 3266 next = ifp->tstamp + ifp->prefered_lft * HZ;
3267 spin_unlock(&ifp->lock); 3267 spin_unlock(&ifp->lock);
3268 } 3268 }
3269 } 3269 }
3270 } 3270 }
3271 3271
3272 next_sec = round_jiffies_up(next); 3272 next_sec = round_jiffies_up(next);
3273 next_sched = next; 3273 next_sched = next;
3274 3274
3275 /* If rounded timeout is accurate enough, accept it. */ 3275 /* If rounded timeout is accurate enough, accept it. */
3276 if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) 3276 if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
3277 next_sched = next_sec; 3277 next_sched = next_sec;
3278 3278
3279 /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ 3279 /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
3280 if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) 3280 if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
3281 next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; 3281 next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
3282 3282
3283 ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", 3283 ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
3284 now, next, next_sec, next_sched)); 3284 now, next, next_sec, next_sched));
3285 3285
3286 addr_chk_timer.expires = next_sched; 3286 addr_chk_timer.expires = next_sched;
3287 add_timer(&addr_chk_timer); 3287 add_timer(&addr_chk_timer);
3288 spin_unlock(&addrconf_verify_lock); 3288 spin_unlock(&addrconf_verify_lock);
3289 rcu_read_unlock_bh(); 3289 rcu_read_unlock_bh();
3290 } 3290 }
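
The scheduling tail of addrconf_verify() trades precision for batching: the wakeup is rounded up to a whole second when that stays within ADDRCONF_TIMER_FUZZ of the exact deadline, and is never armed closer than ADDRCONF_TIMER_FUZZ_MAX from now. A plain-C restatement of that arithmetic, with illustrative fuzz constants and a simplified round_jiffies_up() (the kernel version also compensates per-CPU and uses the wrap-safe time_before()):

/* Sketch: the next_sched selection logic, with made-up constants. */
#include <stdio.h>

#define HZ                       100
#define ADDRCONF_TIMER_FUZZ      (HZ / 4)   /* illustrative */
#define ADDRCONF_TIMER_FUZZ_MAX  (HZ)       /* illustrative */

static unsigned long round_jiffies_up_like(unsigned long j)
{
        return (j + HZ - 1) / HZ * HZ;      /* round up to a whole second */
}

int main(void)
{
        unsigned long jiffies = 100000, next = jiffies + 37;
        unsigned long next_sec = round_jiffies_up_like(next);
        unsigned long next_sched = next;

        if (next_sec < next + ADDRCONF_TIMER_FUZZ)  /* rounding costs < fuzz: accept it */
                next_sched = next_sec;
        if (next_sched < jiffies + ADDRCONF_TIMER_FUZZ_MAX)  /* enforce minimum interval */
                next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
        printf("next=%lu rounded=%lu scheduled=%lu\n", next, next_sec, next_sched);
        return 0;
}
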
3291 3291
3292 static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) 3292 static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local)
3293 { 3293 {
3294 struct in6_addr *pfx = NULL; 3294 struct in6_addr *pfx = NULL;
3295 3295
3296 if (addr) 3296 if (addr)
3297 pfx = nla_data(addr); 3297 pfx = nla_data(addr);
3298 3298
3299 if (local) { 3299 if (local) {
3300 if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) 3300 if (pfx && nla_memcmp(local, pfx, sizeof(*pfx)))
3301 pfx = NULL; 3301 pfx = NULL;
3302 else 3302 else
3303 pfx = nla_data(local); 3303 pfx = nla_data(local);
3304 } 3304 }
3305 3305
3306 return pfx; 3306 return pfx;
3307 } 3307 }
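
extract_addr() lets IFA_LOCAL win over IFA_ADDRESS and rejects the pair when both are supplied but disagree. A standalone sketch of the same decision, using strings in place of nlattr payloads (pick() is a hypothetical helper):

/* Sketch: extract_addr() decision logic over optional attributes. */
#include <stdio.h>
#include <string.h>

static const char *pick(const char *address, const char *local)
{
        const char *pfx = address;              /* may be NULL */

        if (local) {
                if (pfx && strcmp(local, pfx))  /* both given, different */
                        pfx = NULL;             /* caller returns -EINVAL */
                else
                        pfx = local;            /* IFA_LOCAL wins */
        }
        return pfx;
}

int main(void)
{
        printf("%s\n", pick("2001:db8::1", NULL));          /* address only */
        printf("%s\n", pick(NULL, "2001:db8::1"));          /* local only */
        printf("%s\n", pick("2001:db8::1", "2001:db8::1")); /* agree */
        printf("%p\n", (const void *)pick("2001:db8::1", "2001:db8::2")); /* conflict */
        return 0;
}
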
3308 3308
3309 static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { 3309 static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
3310 [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) }, 3310 [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) },
3311 [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, 3311 [IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
3312 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, 3312 [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
3313 }; 3313 };
3314 3314
3315 static int 3315 static int
3316 inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) 3316 inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
3317 { 3317 {
3318 struct net *net = sock_net(skb->sk); 3318 struct net *net = sock_net(skb->sk);
3319 struct ifaddrmsg *ifm; 3319 struct ifaddrmsg *ifm;
3320 struct nlattr *tb[IFA_MAX+1]; 3320 struct nlattr *tb[IFA_MAX+1];
3321 struct in6_addr *pfx; 3321 struct in6_addr *pfx;
3322 int err; 3322 int err;
3323 3323
3324 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); 3324 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
3325 if (err < 0) 3325 if (err < 0)
3326 return err; 3326 return err;
3327 3327
3328 ifm = nlmsg_data(nlh); 3328 ifm = nlmsg_data(nlh);
3329 pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); 3329 pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
3330 if (pfx == NULL) 3330 if (pfx == NULL)
3331 return -EINVAL; 3331 return -EINVAL;
3332 3332
3333 return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen); 3333 return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen);
3334 } 3334 }
3335 3335
3336 static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, 3336 static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
3337 u32 prefered_lft, u32 valid_lft) 3337 u32 prefered_lft, u32 valid_lft)
3338 { 3338 {
3339 u32 flags; 3339 u32 flags;
3340 clock_t expires; 3340 clock_t expires;
3341 unsigned long timeout; 3341 unsigned long timeout;
3342 3342
3343 if (!valid_lft || (prefered_lft > valid_lft)) 3343 if (!valid_lft || (prefered_lft > valid_lft))
3344 return -EINVAL; 3344 return -EINVAL;
3345 3345
3346 timeout = addrconf_timeout_fixup(valid_lft, HZ); 3346 timeout = addrconf_timeout_fixup(valid_lft, HZ);
3347 if (addrconf_finite_timeout(timeout)) { 3347 if (addrconf_finite_timeout(timeout)) {
3348 expires = jiffies_to_clock_t(timeout * HZ); 3348 expires = jiffies_to_clock_t(timeout * HZ);
3349 valid_lft = timeout; 3349 valid_lft = timeout;
3350 flags = RTF_EXPIRES; 3350 flags = RTF_EXPIRES;
3351 } else { 3351 } else {
3352 expires = 0; 3352 expires = 0;
3353 flags = 0; 3353 flags = 0;
3354 ifa_flags |= IFA_F_PERMANENT; 3354 ifa_flags |= IFA_F_PERMANENT;
3355 } 3355 }
3356 3356
3357 timeout = addrconf_timeout_fixup(prefered_lft, HZ); 3357 timeout = addrconf_timeout_fixup(prefered_lft, HZ);
3358 if (addrconf_finite_timeout(timeout)) { 3358 if (addrconf_finite_timeout(timeout)) {
3359 if (timeout == 0) 3359 if (timeout == 0)
3360 ifa_flags |= IFA_F_DEPRECATED; 3360 ifa_flags |= IFA_F_DEPRECATED;
3361 prefered_lft = timeout; 3361 prefered_lft = timeout;
3362 } 3362 }
3363 3363
3364 spin_lock_bh(&ifp->lock); 3364 spin_lock_bh(&ifp->lock);
3365 ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; 3365 ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
3366 ifp->tstamp = jiffies; 3366 ifp->tstamp = jiffies;
3367 ifp->valid_lft = valid_lft; 3367 ifp->valid_lft = valid_lft;
3368 ifp->prefered_lft = prefered_lft; 3368 ifp->prefered_lft = prefered_lft;
3369 3369
3370 spin_unlock_bh(&ifp->lock); 3370 spin_unlock_bh(&ifp->lock);
3371 if (!(ifp->flags&IFA_F_TENTATIVE)) 3371 if (!(ifp->flags&IFA_F_TENTATIVE))
3372 ipv6_ifa_notify(0, ifp); 3372 ipv6_ifa_notify(0, ifp);
3373 3373
3374 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 3374 addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
3375 expires, flags); 3375 expires, flags);
3376 addrconf_verify(0); 3376 addrconf_verify(0);
3377 3377
3378 return 0; 3378 return 0;
3379 } 3379 }
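
inet6_addr_modify() folds the two lifetimes into flags: an infinite valid_lft marks the address IFA_F_PERMANENT, a finite one arms an RTF_EXPIRES prefix route, and a preferred lifetime of zero deprecates the address immediately. A compact sketch of that classification (classify() and the flag values are illustrative, not the kernel's):

/* Sketch: lifetime classification as in inet6_addr_modify().
 * INFINITY is the all-ones sentinel (INFINITY_LIFE_TIME).
 */
#include <stdio.h>
#include <stdint.h>

#define INFINITY     0xffffffffu
#define F_PERMANENT  0x1
#define F_DEPRECATED 0x2

static unsigned classify(uint32_t valid_lft, uint32_t prefered_lft)
{
        unsigned flags = 0;

        if (!valid_lft || prefered_lft > valid_lft)
                return (unsigned)-1;            /* -EINVAL in the kernel */
        if (valid_lft == INFINITY)
                flags |= F_PERMANENT;           /* no RTF_EXPIRES route */
        if (prefered_lft == 0)
                flags |= F_DEPRECATED;          /* still valid, not preferred */
        return flags;
}

int main(void)
{
        printf("%x %x %x\n",
               classify(INFINITY, INFINITY),    /* permanent */
               classify(3600, 0),               /* deprecated right away */
               classify(3600, 1800));           /* normal finite lifetimes */
        return 0;
}
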
3380 3380
3381 static int 3381 static int
3382 inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) 3382 inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
3383 { 3383 {
3384 struct net *net = sock_net(skb->sk); 3384 struct net *net = sock_net(skb->sk);
3385 struct ifaddrmsg *ifm; 3385 struct ifaddrmsg *ifm;
3386 struct nlattr *tb[IFA_MAX+1]; 3386 struct nlattr *tb[IFA_MAX+1];
3387 struct in6_addr *pfx; 3387 struct in6_addr *pfx;
3388 struct inet6_ifaddr *ifa; 3388 struct inet6_ifaddr *ifa;
3389 struct net_device *dev; 3389 struct net_device *dev;
3390 u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; 3390 u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
3391 u8 ifa_flags; 3391 u8 ifa_flags;
3392 int err; 3392 int err;
3393 3393
3394 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); 3394 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
3395 if (err < 0) 3395 if (err < 0)
3396 return err; 3396 return err;
3397 3397
3398 ifm = nlmsg_data(nlh); 3398 ifm = nlmsg_data(nlh);
3399 pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); 3399 pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
3400 if (pfx == NULL) 3400 if (pfx == NULL)
3401 return -EINVAL; 3401 return -EINVAL;
3402 3402
3403 if (tb[IFA_CACHEINFO]) { 3403 if (tb[IFA_CACHEINFO]) {
3404 struct ifa_cacheinfo *ci; 3404 struct ifa_cacheinfo *ci;
3405 3405
3406 ci = nla_data(tb[IFA_CACHEINFO]); 3406 ci = nla_data(tb[IFA_CACHEINFO]);
3407 valid_lft = ci->ifa_valid; 3407 valid_lft = ci->ifa_valid;
3408 preferred_lft = ci->ifa_prefered; 3408 preferred_lft = ci->ifa_prefered;
3409 } else { 3409 } else {
3410 preferred_lft = INFINITY_LIFE_TIME; 3410 preferred_lft = INFINITY_LIFE_TIME;
3411 valid_lft = INFINITY_LIFE_TIME; 3411 valid_lft = INFINITY_LIFE_TIME;
3412 } 3412 }
3413 3413
3414 dev = __dev_get_by_index(net, ifm->ifa_index); 3414 dev = __dev_get_by_index(net, ifm->ifa_index);
3415 if (dev == NULL) 3415 if (dev == NULL)
3416 return -ENODEV; 3416 return -ENODEV;
3417 3417
3418 /* We ignore other flags so far. */ 3418 /* We ignore other flags so far. */
3419 ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); 3419 ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS);
3420 3420
3421 ifa = ipv6_get_ifaddr(net, pfx, dev, 1); 3421 ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
3422 if (ifa == NULL) { 3422 if (ifa == NULL) {
3423 /* 3423 /*
3424 * It would be best to check for !NLM_F_CREATE here but 3424 * It would be best to check for !NLM_F_CREATE here but
3425 * userspace already relies on not having to provide this. 3425 * userspace already relies on not having to provide this.
3426 */ 3426 */
3427 return inet6_addr_add(net, ifm->ifa_index, pfx, 3427 return inet6_addr_add(net, ifm->ifa_index, pfx,
3428 ifm->ifa_prefixlen, ifa_flags, 3428 ifm->ifa_prefixlen, ifa_flags,
3429 preferred_lft, valid_lft); 3429 preferred_lft, valid_lft);
3430 } 3430 }
3431 3431
3432 if (nlh->nlmsg_flags & NLM_F_EXCL || 3432 if (nlh->nlmsg_flags & NLM_F_EXCL ||
3433 !(nlh->nlmsg_flags & NLM_F_REPLACE)) 3433 !(nlh->nlmsg_flags & NLM_F_REPLACE))
3434 err = -EEXIST; 3434 err = -EEXIST;
3435 else 3435 else
3436 err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft); 3436 err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
3437 3437
3438 in6_ifa_put(ifa); 3438 in6_ifa_put(ifa);
3439 3439
3440 return err; 3440 return err;
3441 } 3441 }
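
From userspace this handler is reached with RTM_NEWADDR; as the flag check shows, modifying an existing address requires NLM_F_REPLACE, otherwise -EEXIST comes back. A raw-netlink sketch of a replace request, error handling trimmed and the address and ifindex as placeholders:

/* Sketch: RTM_NEWADDR with NLM_F_REPLACE over a raw netlink socket. */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_addr.h>

int main(void)
{
        struct {
                struct nlmsghdr  nh;
                struct ifaddrmsg ifa;
                char             attrs[64];
        } req;
        struct rtattr *rta;
        struct in6_addr addr;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(&req, 0, sizeof(req));
        inet_pton(AF_INET6, "2001:db8::2", &addr);      /* placeholder address */

        req.nh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.ifa));
        req.nh.nlmsg_type  = RTM_NEWADDR;
        req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE;
        req.ifa.ifa_family    = AF_INET6;
        req.ifa.ifa_prefixlen = 64;
        req.ifa.ifa_index     = 2;                      /* placeholder ifindex */

        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
        rta->rta_type = IFA_LOCAL;
        rta->rta_len  = RTA_LENGTH(sizeof(addr));
        memcpy(RTA_DATA(rta), &addr, sizeof(addr));
        req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(addr));

        send(fd, &req, req.nh.nlmsg_len, 0);    /* kernel acks/errs via NLMSG_ERROR */
        close(fd);
        return 0;
}
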
3442 3442
3443 static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags, 3443 static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags,
3444 u8 scope, int ifindex) 3444 u8 scope, int ifindex)
3445 { 3445 {
3446 struct ifaddrmsg *ifm; 3446 struct ifaddrmsg *ifm;
3447 3447
3448 ifm = nlmsg_data(nlh); 3448 ifm = nlmsg_data(nlh);
3449 ifm->ifa_family = AF_INET6; 3449 ifm->ifa_family = AF_INET6;
3450 ifm->ifa_prefixlen = prefixlen; 3450 ifm->ifa_prefixlen = prefixlen;
3451 ifm->ifa_flags = flags; 3451 ifm->ifa_flags = flags;
3452 ifm->ifa_scope = scope; 3452 ifm->ifa_scope = scope;
3453 ifm->ifa_index = ifindex; 3453 ifm->ifa_index = ifindex;
3454 } 3454 }
3455 3455
3456 static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, 3456 static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
3457 unsigned long tstamp, u32 preferred, u32 valid) 3457 unsigned long tstamp, u32 preferred, u32 valid)
3458 { 3458 {
3459 struct ifa_cacheinfo ci; 3459 struct ifa_cacheinfo ci;
3460 3460
3461 ci.cstamp = cstamp_delta(cstamp); 3461 ci.cstamp = cstamp_delta(cstamp);
3462 ci.tstamp = cstamp_delta(tstamp); 3462 ci.tstamp = cstamp_delta(tstamp);
3463 ci.ifa_prefered = preferred; 3463 ci.ifa_prefered = preferred;
3464 ci.ifa_valid = valid; 3464 ci.ifa_valid = valid;
3465 3465
3466 return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); 3466 return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
3467 } 3467 }
3468 3468
3469 static inline int rt_scope(int ifa_scope) 3469 static inline int rt_scope(int ifa_scope)
3470 { 3470 {
3471 if (ifa_scope & IFA_HOST) 3471 if (ifa_scope & IFA_HOST)
3472 return RT_SCOPE_HOST; 3472 return RT_SCOPE_HOST;
3473 else if (ifa_scope & IFA_LINK) 3473 else if (ifa_scope & IFA_LINK)
3474 return RT_SCOPE_LINK; 3474 return RT_SCOPE_LINK;
3475 else if (ifa_scope & IFA_SITE) 3475 else if (ifa_scope & IFA_SITE)
3476 return RT_SCOPE_SITE; 3476 return RT_SCOPE_SITE;
3477 else 3477 else
3478 return RT_SCOPE_UNIVERSE; 3478 return RT_SCOPE_UNIVERSE;
3479 } 3479 }
3480 3480
3481 static inline int inet6_ifaddr_msgsize(void) 3481 static inline int inet6_ifaddr_msgsize(void)
3482 { 3482 {
3483 return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) 3483 return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
3484 + nla_total_size(16) /* IFA_ADDRESS */ 3484 + nla_total_size(16) /* IFA_ADDRESS */
3485 + nla_total_size(sizeof(struct ifa_cacheinfo)); 3485 + nla_total_size(sizeof(struct ifa_cacheinfo));
3486 } 3486 }
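
inet6_ifaddr_msgsize() just pre-adds the aligned fixed header and the aligned header-plus-payload of each attribute so the skb can be sized exactly once. The same sum with the uapi macros (nla_total_size_user() is a local stand-in for the kernel's nla_total_size()):

/* Sketch: reproduce inet6_ifaddr_msgsize() with uapi macros. */
#include <stdio.h>
#include <linux/netlink.h>
#include <linux/if_addr.h>

#define nla_total_size_user(payload) NLA_ALIGN(NLA_HDRLEN + (payload))

int main(void)
{
        size_t sz = NLMSG_ALIGN(sizeof(struct ifaddrmsg))
                  + nla_total_size_user(16)                      /* IFA_ADDRESS */
                  + nla_total_size_user(sizeof(struct ifa_cacheinfo));
        printf("per-address message payload: %zu bytes\n", sz);
        return 0;
}
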
3487 3487
3488 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, 3488 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
3489 u32 pid, u32 seq, int event, unsigned int flags) 3489 u32 pid, u32 seq, int event, unsigned int flags)
3490 { 3490 {
3491 struct nlmsghdr *nlh; 3491 struct nlmsghdr *nlh;
3492 u32 preferred, valid; 3492 u32 preferred, valid;
3493 3493
3494 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); 3494 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
3495 if (nlh == NULL) 3495 if (nlh == NULL)
3496 return -EMSGSIZE; 3496 return -EMSGSIZE;
3497 3497
3498 put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), 3498 put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
3499 ifa->idev->dev->ifindex); 3499 ifa->idev->dev->ifindex);
3500 3500
3501 if (!(ifa->flags&IFA_F_PERMANENT)) { 3501 if (!(ifa->flags&IFA_F_PERMANENT)) {
3502 preferred = ifa->prefered_lft; 3502 preferred = ifa->prefered_lft;
3503 valid = ifa->valid_lft; 3503 valid = ifa->valid_lft;
3504 if (preferred != INFINITY_LIFE_TIME) { 3504 if (preferred != INFINITY_LIFE_TIME) {
3505 long tval = (jiffies - ifa->tstamp)/HZ; 3505 long tval = (jiffies - ifa->tstamp)/HZ;
3506 if (preferred > tval) 3506 if (preferred > tval)
3507 preferred -= tval; 3507 preferred -= tval;
3508 else 3508 else
3509 preferred = 0; 3509 preferred = 0;
3510 if (valid != INFINITY_LIFE_TIME) { 3510 if (valid != INFINITY_LIFE_TIME) {
3511 if (valid > tval) 3511 if (valid > tval)
3512 valid -= tval; 3512 valid -= tval;
3513 else 3513 else
3514 valid = 0; 3514 valid = 0;
3515 } 3515 }
3516 } 3516 }
3517 } else { 3517 } else {
3518 preferred = INFINITY_LIFE_TIME; 3518 preferred = INFINITY_LIFE_TIME;
3519 valid = INFINITY_LIFE_TIME; 3519 valid = INFINITY_LIFE_TIME;
3520 } 3520 }
3521 3521
3522 if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || 3522 if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 ||
3523 put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { 3523 put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) {
3524 nlmsg_cancel(skb, nlh); 3524 nlmsg_cancel(skb, nlh);
3525 return -EMSGSIZE; 3525 return -EMSGSIZE;
3526 } 3526 }
3527 3527
3528 return nlmsg_end(skb, nlh); 3528 return nlmsg_end(skb, nlh);
3529 } 3529 }
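
Note that for non-permanent addresses inet6_fill_ifaddr() reports lifetimes remaining rather than configured: the elapsed seconds since ifa->tstamp are subtracted, clamping at zero. A self-contained restatement of that clamp with made-up numbers:

/* Sketch: remaining-lifetime clamp as in inet6_fill_ifaddr(). */
#include <stdio.h>
#include <stdint.h>

#define INFINITY 0xffffffffu

static uint32_t remaining(uint32_t lft, long elapsed_sec)
{
        if (lft == INFINITY)
                return INFINITY;                 /* infinite stays infinite */
        return lft > (uint32_t)elapsed_sec ? lft - (uint32_t)elapsed_sec : 0;
}

int main(void)
{
        long tval = 700;                         /* (jiffies - tstamp) / HZ */
        printf("preferred %u valid %u\n",
               remaining(600, tval),             /* already expired -> 0 */
               remaining(3600, tval));           /* 2900 left */
        return 0;
}
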
3530 3530
3531 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, 3531 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
3532 u32 pid, u32 seq, int event, u16 flags) 3532 u32 pid, u32 seq, int event, u16 flags)
3533 { 3533 {
3534 struct nlmsghdr *nlh; 3534 struct nlmsghdr *nlh;
3535 u8 scope = RT_SCOPE_UNIVERSE; 3535 u8 scope = RT_SCOPE_UNIVERSE;
3536 int ifindex = ifmca->idev->dev->ifindex; 3536 int ifindex = ifmca->idev->dev->ifindex;
3537 3537
3538 if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) 3538 if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
3539 scope = RT_SCOPE_SITE; 3539 scope = RT_SCOPE_SITE;
3540 3540
3541 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); 3541 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
3542 if (nlh == NULL) 3542 if (nlh == NULL)
3543 return -EMSGSIZE; 3543 return -EMSGSIZE;
3544 3544
3545 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); 3545 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
3546 if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 || 3546 if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 ||
3547 put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, 3547 put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
3548 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { 3548 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
3549 nlmsg_cancel(skb, nlh); 3549 nlmsg_cancel(skb, nlh);
3550 return -EMSGSIZE; 3550 return -EMSGSIZE;
3551 } 3551 }
3552 3552
3553 return nlmsg_end(skb, nlh); 3553 return nlmsg_end(skb, nlh);
3554 } 3554 }
3555 3555
3556 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, 3556 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
3557 u32 pid, u32 seq, int event, unsigned int flags) 3557 u32 pid, u32 seq, int event, unsigned int flags)
3558 { 3558 {
3559 struct nlmsghdr *nlh; 3559 struct nlmsghdr *nlh;
3560 u8 scope = RT_SCOPE_UNIVERSE; 3560 u8 scope = RT_SCOPE_UNIVERSE;
3561 int ifindex = ifaca->aca_idev->dev->ifindex; 3561 int ifindex = ifaca->aca_idev->dev->ifindex;
3562 3562
3563 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) 3563 if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
3564 scope = RT_SCOPE_SITE; 3564 scope = RT_SCOPE_SITE;
3565 3565
3566 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); 3566 nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
3567 if (nlh == NULL) 3567 if (nlh == NULL)
3568 return -EMSGSIZE; 3568 return -EMSGSIZE;
3569 3569
3570 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); 3570 put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
3571 if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 || 3571 if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 ||
3572 put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, 3572 put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
3573 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { 3573 INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
3574 nlmsg_cancel(skb, nlh); 3574 nlmsg_cancel(skb, nlh);
3575 return -EMSGSIZE; 3575 return -EMSGSIZE;
3576 } 3576 }
3577 3577
3578 return nlmsg_end(skb, nlh); 3578 return nlmsg_end(skb, nlh);
3579 } 3579 }
3580 3580
3581 enum addr_type_t { 3581 enum addr_type_t {
3582 UNICAST_ADDR, 3582 UNICAST_ADDR,
3583 MULTICAST_ADDR, 3583 MULTICAST_ADDR,
3584 ANYCAST_ADDR, 3584 ANYCAST_ADDR,
3585 }; 3585 };
3586 3586
3587 /* called with rcu_read_lock() */ 3587 /* called with rcu_read_lock() */
3588 static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, 3588 static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
3589 struct netlink_callback *cb, enum addr_type_t type, 3589 struct netlink_callback *cb, enum addr_type_t type,
3590 int s_ip_idx, int *p_ip_idx) 3590 int s_ip_idx, int *p_ip_idx)
3591 { 3591 {
3592 struct ifmcaddr6 *ifmca; 3592 struct ifmcaddr6 *ifmca;
3593 struct ifacaddr6 *ifaca; 3593 struct ifacaddr6 *ifaca;
3594 int err = 1; 3594 int err = 1;
3595 int ip_idx = *p_ip_idx; 3595 int ip_idx = *p_ip_idx;
3596 3596
3597 read_lock_bh(&idev->lock); 3597 read_lock_bh(&idev->lock);
3598 switch (type) { 3598 switch (type) {
3599 case UNICAST_ADDR: { 3599 case UNICAST_ADDR: {
3600 struct inet6_ifaddr *ifa; 3600 struct inet6_ifaddr *ifa;
3601 3601
3602 /* unicast address incl. temp addr */ 3602 /* unicast address incl. temp addr */
3603 list_for_each_entry(ifa, &idev->addr_list, if_list) { 3603 list_for_each_entry(ifa, &idev->addr_list, if_list) {
3604 if (++ip_idx < s_ip_idx) 3604 if (++ip_idx < s_ip_idx)
3605 continue; 3605 continue;
3606 err = inet6_fill_ifaddr(skb, ifa, 3606 err = inet6_fill_ifaddr(skb, ifa,
3607 NETLINK_CB(cb->skb).pid, 3607 NETLINK_CB(cb->skb).pid,
3608 cb->nlh->nlmsg_seq, 3608 cb->nlh->nlmsg_seq,
3609 RTM_NEWADDR, 3609 RTM_NEWADDR,
3610 NLM_F_MULTI); 3610 NLM_F_MULTI);
3611 if (err <= 0) 3611 if (err <= 0)
3612 break; 3612 break;
3613 } 3613 }
3614 break; 3614 break;
3615 } 3615 }
3616 case MULTICAST_ADDR: 3616 case MULTICAST_ADDR:
3617 /* multicast address */ 3617 /* multicast address */
3618 for (ifmca = idev->mc_list; ifmca; 3618 for (ifmca = idev->mc_list; ifmca;
3619 ifmca = ifmca->next, ip_idx++) { 3619 ifmca = ifmca->next, ip_idx++) {
3620 if (ip_idx < s_ip_idx) 3620 if (ip_idx < s_ip_idx)
3621 continue; 3621 continue;
3622 err = inet6_fill_ifmcaddr(skb, ifmca, 3622 err = inet6_fill_ifmcaddr(skb, ifmca,
3623 NETLINK_CB(cb->skb).pid, 3623 NETLINK_CB(cb->skb).pid,
3624 cb->nlh->nlmsg_seq, 3624 cb->nlh->nlmsg_seq,
3625 RTM_GETMULTICAST, 3625 RTM_GETMULTICAST,
3626 NLM_F_MULTI); 3626 NLM_F_MULTI);
3627 if (err <= 0) 3627 if (err <= 0)
3628 break; 3628 break;
3629 } 3629 }
3630 break; 3630 break;
3631 case ANYCAST_ADDR: 3631 case ANYCAST_ADDR:
3632 /* anycast address */ 3632 /* anycast address */
3633 for (ifaca = idev->ac_list; ifaca; 3633 for (ifaca = idev->ac_list; ifaca;
3634 ifaca = ifaca->aca_next, ip_idx++) { 3634 ifaca = ifaca->aca_next, ip_idx++) {
3635 if (ip_idx < s_ip_idx) 3635 if (ip_idx < s_ip_idx)
3636 continue; 3636 continue;
3637 err = inet6_fill_ifacaddr(skb, ifaca, 3637 err = inet6_fill_ifacaddr(skb, ifaca,
3638 NETLINK_CB(cb->skb).pid, 3638 NETLINK_CB(cb->skb).pid,
3639 cb->nlh->nlmsg_seq, 3639 cb->nlh->nlmsg_seq,
3640 RTM_GETANYCAST, 3640 RTM_GETANYCAST,
3641 NLM_F_MULTI); 3641 NLM_F_MULTI);
3642 if (err <= 0) 3642 if (err <= 0)
3643 break; 3643 break;
3644 } 3644 }
3645 break; 3645 break;
3646 default: 3646 default:
3647 break; 3647 break;
3648 } 3648 }
3649 read_unlock_bh(&idev->lock); 3649 read_unlock_bh(&idev->lock);
3650 *p_ip_idx = ip_idx; 3650 *p_ip_idx = ip_idx;
3651 return err; 3651 return err;
3652 } 3652 }
3653 3653
3654 static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, 3654 static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
3655 enum addr_type_t type) 3655 enum addr_type_t type)
3656 { 3656 {
3657 struct net *net = sock_net(skb->sk); 3657 struct net *net = sock_net(skb->sk);
3658 int h, s_h; 3658 int h, s_h;
3659 int idx, ip_idx; 3659 int idx, ip_idx;
3660 int s_idx, s_ip_idx; 3660 int s_idx, s_ip_idx;
3661 struct net_device *dev; 3661 struct net_device *dev;
3662 struct inet6_dev *idev; 3662 struct inet6_dev *idev;
3663 struct hlist_head *head; 3663 struct hlist_head *head;
3664 struct hlist_node *node; 3664 struct hlist_node *node;
3665 3665
3666 s_h = cb->args[0]; 3666 s_h = cb->args[0];
3667 s_idx = idx = cb->args[1]; 3667 s_idx = idx = cb->args[1];
3668 s_ip_idx = ip_idx = cb->args[2]; 3668 s_ip_idx = ip_idx = cb->args[2];
3669 3669
3670 rcu_read_lock(); 3670 rcu_read_lock();
3671 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 3671 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
3672 idx = 0; 3672 idx = 0;
3673 head = &net->dev_index_head[h]; 3673 head = &net->dev_index_head[h];
3674 hlist_for_each_entry_rcu(dev, node, head, index_hlist) { 3674 hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
3675 if (idx < s_idx) 3675 if (idx < s_idx)
3676 goto cont; 3676 goto cont;
3677 if (h > s_h || idx > s_idx) 3677 if (h > s_h || idx > s_idx)
3678 s_ip_idx = 0; 3678 s_ip_idx = 0;
3679 ip_idx = 0; 3679 ip_idx = 0;
3680 idev = __in6_dev_get(dev); 3680 idev = __in6_dev_get(dev);
3681 if (!idev) 3681 if (!idev)
3682 goto cont; 3682 goto cont;
3683 3683
3684 if (in6_dump_addrs(idev, skb, cb, type, 3684 if (in6_dump_addrs(idev, skb, cb, type,
3685 s_ip_idx, &ip_idx) <= 0) 3685 s_ip_idx, &ip_idx) <= 0)
3686 goto done; 3686 goto done;
3687 cont: 3687 cont:
3688 idx++; 3688 idx++;
3689 } 3689 }
3690 } 3690 }
3691 done: 3691 done:
3692 rcu_read_unlock(); 3692 rcu_read_unlock();
3693 cb->args[0] = h; 3693 cb->args[0] = h;
3694 cb->args[1] = idx; 3694 cb->args[1] = idx;
3695 cb->args[2] = ip_idx; 3695 cb->args[2] = ip_idx;
3696 3696
3697 return skb->len; 3697 return skb->len;
3698 } 3698 }
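
The dump stashes its position in cb->args[0..2] (device hash bucket, device index within the bucket, address index within the device), so a resumed callback skips straight to where the previous skb filled up; the s_h/s_idx/s_ip_idx comparisons above replay exactly that state. Userspace drives the whole walk with one NLM_F_DUMP request, roughly:

/* Sketch: trigger inet6_dump_ifaddr() with RTM_GETADDR + NLM_F_DUMP. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_addr.h>

int main(void)
{
        struct {
                struct nlmsghdr  nh;
                struct ifaddrmsg ifa;
        } req;
        char buf[8192];
        ssize_t n;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(&req, 0, sizeof(req));
        req.nh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.ifa));
        req.nh.nlmsg_type  = RTM_GETADDR;
        req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        req.ifa.ifa_family = AF_INET6;
        send(fd, &req, req.nh.nlmsg_len, 0);

        /* Each recv() returns one skb's worth; NLMSG_DONE ends the dump. */
        while ((n = recv(fd, buf, sizeof(buf), 0)) > 0) {
                int len = (int)n;
                struct nlmsghdr *nh = (struct nlmsghdr *)buf;

                for (; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) {
                        if (nh->nlmsg_type == NLMSG_DONE) {
                                close(fd);
                                return 0;
                        }
                        /* nh points at one RTM_NEWADDR reply: parse here */
                }
        }
        close(fd);
        return 0;
}
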
3699 3699
3700 static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 3700 static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
3701 { 3701 {
3702 enum addr_type_t type = UNICAST_ADDR; 3702 enum addr_type_t type = UNICAST_ADDR;
3703 3703
3704 return inet6_dump_addr(skb, cb, type); 3704 return inet6_dump_addr(skb, cb, type);
3705 } 3705 }
3706 3706
3707 static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) 3707 static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
3708 { 3708 {
3709 enum addr_type_t type = MULTICAST_ADDR; 3709 enum addr_type_t type = MULTICAST_ADDR;
3710 3710
3711 return inet6_dump_addr(skb, cb, type); 3711 return inet6_dump_addr(skb, cb, type);
3712 } 3712 }
3713 3713
3714 3714
3715 static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) 3715 static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
3716 { 3716 {
3717 enum addr_type_t type = ANYCAST_ADDR; 3717 enum addr_type_t type = ANYCAST_ADDR;
3718 3718
3719 return inet6_dump_addr(skb, cb, type); 3719 return inet6_dump_addr(skb, cb, type);
3720 } 3720 }
3721 3721
3722 static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, 3722 static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh,
3723 void *arg) 3723 void *arg)
3724 { 3724 {
3725 struct net *net = sock_net(in_skb->sk); 3725 struct net *net = sock_net(in_skb->sk);
3726 struct ifaddrmsg *ifm; 3726 struct ifaddrmsg *ifm;
3727 struct nlattr *tb[IFA_MAX+1]; 3727 struct nlattr *tb[IFA_MAX+1];
3728 struct in6_addr *addr = NULL; 3728 struct in6_addr *addr = NULL;
3729 struct net_device *dev = NULL; 3729 struct net_device *dev = NULL;
3730 struct inet6_ifaddr *ifa; 3730 struct inet6_ifaddr *ifa;
3731 struct sk_buff *skb; 3731 struct sk_buff *skb;
3732 int err; 3732 int err;
3733 3733
3734 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); 3734 err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
3735 if (err < 0) 3735 if (err < 0)
3736 goto errout; 3736 goto errout;
3737 3737
3738 addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); 3738 addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
3739 if (addr == NULL) { 3739 if (addr == NULL) {
3740 err = -EINVAL; 3740 err = -EINVAL;
3741 goto errout; 3741 goto errout;
3742 } 3742 }
3743 3743
3744 ifm = nlmsg_data(nlh); 3744 ifm = nlmsg_data(nlh);
3745 if (ifm->ifa_index) 3745 if (ifm->ifa_index)
3746 dev = __dev_get_by_index(net, ifm->ifa_index); 3746 dev = __dev_get_by_index(net, ifm->ifa_index);
3747 3747
3748 ifa = ipv6_get_ifaddr(net, addr, dev, 1); 3748 ifa = ipv6_get_ifaddr(net, addr, dev, 1);
3749 if (!ifa) { 3749 if (!ifa) {
3750 err = -EADDRNOTAVAIL; 3750 err = -EADDRNOTAVAIL;
3751 goto errout; 3751 goto errout;
3752 } 3752 }
3753 3753
3754 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL); 3754 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
3755 if (!skb) { 3755 if (!skb) {
3756 err = -ENOBUFS; 3756 err = -ENOBUFS;
3757 goto errout_ifa; 3757 goto errout_ifa;
3758 } 3758 }
3759 3759
3760 err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, 3760 err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid,
3761 nlh->nlmsg_seq, RTM_NEWADDR, 0); 3761 nlh->nlmsg_seq, RTM_NEWADDR, 0);
3762 if (err < 0) { 3762 if (err < 0) {
3763 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ 3763 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
3764 WARN_ON(err == -EMSGSIZE); 3764 WARN_ON(err == -EMSGSIZE);
3765 kfree_skb(skb); 3765 kfree_skb(skb);
3766 goto errout_ifa; 3766 goto errout_ifa;
3767 } 3767 }
3768 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 3768 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3769 errout_ifa: 3769 errout_ifa:
3770 in6_ifa_put(ifa); 3770 in6_ifa_put(ifa);
3771 errout: 3771 errout:
3772 return err; 3772 return err;
3773 } 3773 }
3774 3774
3775 static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) 3775 static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
3776 { 3776 {
3777 struct sk_buff *skb; 3777 struct sk_buff *skb;
3778 struct net *net = dev_net(ifa->idev->dev); 3778 struct net *net = dev_net(ifa->idev->dev);
3779 int err = -ENOBUFS; 3779 int err = -ENOBUFS;
3780 3780
3781 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); 3781 skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
3782 if (skb == NULL) 3782 if (skb == NULL)
3783 goto errout; 3783 goto errout;
3784 3784
3785 err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); 3785 err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
3786 if (err < 0) { 3786 if (err < 0) {
3787 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ 3787 /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
3788 WARN_ON(err == -EMSGSIZE); 3788 WARN_ON(err == -EMSGSIZE);
3789 kfree_skb(skb); 3789 kfree_skb(skb);
3790 goto errout; 3790 goto errout;
3791 } 3791 }
3792 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); 3792 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
3793 return; 3793 return;
3794 errout: 3794 errout:
3795 if (err < 0) 3795 if (err < 0)
3796 rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); 3796 rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
3797 } 3797 }
3798 3798
3799 static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, 3799 static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
3800 __s32 *array, int bytes) 3800 __s32 *array, int bytes)
3801 { 3801 {
3802 BUG_ON(bytes < (DEVCONF_MAX * 4)); 3802 BUG_ON(bytes < (DEVCONF_MAX * 4));
3803 3803
3804 memset(array, 0, bytes); 3804 memset(array, 0, bytes);
3805 array[DEVCONF_FORWARDING] = cnf->forwarding; 3805 array[DEVCONF_FORWARDING] = cnf->forwarding;
3806 array[DEVCONF_HOPLIMIT] = cnf->hop_limit; 3806 array[DEVCONF_HOPLIMIT] = cnf->hop_limit;
3807 array[DEVCONF_MTU6] = cnf->mtu6; 3807 array[DEVCONF_MTU6] = cnf->mtu6;
3808 array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; 3808 array[DEVCONF_ACCEPT_RA] = cnf->accept_ra;
3809 array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; 3809 array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects;
3810 array[DEVCONF_AUTOCONF] = cnf->autoconf; 3810 array[DEVCONF_AUTOCONF] = cnf->autoconf;
3811 array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; 3811 array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits;
3812 array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; 3812 array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
3813 array[DEVCONF_RTR_SOLICIT_INTERVAL] = 3813 array[DEVCONF_RTR_SOLICIT_INTERVAL] =
3814 jiffies_to_msecs(cnf->rtr_solicit_interval); 3814 jiffies_to_msecs(cnf->rtr_solicit_interval);
3815 array[DEVCONF_RTR_SOLICIT_DELAY] = 3815 array[DEVCONF_RTR_SOLICIT_DELAY] =
3816 jiffies_to_msecs(cnf->rtr_solicit_delay); 3816 jiffies_to_msecs(cnf->rtr_solicit_delay);
3817 array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; 3817 array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
3818 #ifdef CONFIG_IPV6_PRIVACY 3818 #ifdef CONFIG_IPV6_PRIVACY
3819 array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; 3819 array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
3820 array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; 3820 array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
3821 array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; 3821 array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft;
3822 array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; 3822 array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry;
3823 array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; 3823 array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
3824 #endif 3824 #endif
3825 array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; 3825 array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
3826 array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; 3826 array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
3827 array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; 3827 array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
3828 #ifdef CONFIG_IPV6_ROUTER_PREF 3828 #ifdef CONFIG_IPV6_ROUTER_PREF
3829 array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; 3829 array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
3830 array[DEVCONF_RTR_PROBE_INTERVAL] = 3830 array[DEVCONF_RTR_PROBE_INTERVAL] =
3831 jiffies_to_msecs(cnf->rtr_probe_interval); 3831 jiffies_to_msecs(cnf->rtr_probe_interval);
3832 #ifdef CONFIG_IPV6_ROUTE_INFO 3832 #ifdef CONFIG_IPV6_ROUTE_INFO
3833 array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; 3833 array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
3834 #endif 3834 #endif
3835 #endif 3835 #endif
3836 array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; 3836 array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
3837 array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; 3837 array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
3838 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 3838 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
3839 array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; 3839 array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
3840 #endif 3840 #endif
3841 #ifdef CONFIG_IPV6_MROUTE 3841 #ifdef CONFIG_IPV6_MROUTE
3842 array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; 3842 array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
3843 #endif 3843 #endif
3844 array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; 3844 array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
3845 array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; 3845 array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
3846 array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; 3846 array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
3847 } 3847 }
3848 3848
3849 static inline size_t inet6_ifla6_size(void) 3849 static inline size_t inet6_ifla6_size(void)
3850 { 3850 {
3851 return nla_total_size(4) /* IFLA_INET6_FLAGS */ 3851 return nla_total_size(4) /* IFLA_INET6_FLAGS */
3852 + nla_total_size(sizeof(struct ifla_cacheinfo)) 3852 + nla_total_size(sizeof(struct ifla_cacheinfo))
3853 + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ 3853 + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
3854 + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ 3854 + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
3855 + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */ 3855 + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */
3856 } 3856 }
3857 3857
3858 static inline size_t inet6_if_nlmsg_size(void) 3858 static inline size_t inet6_if_nlmsg_size(void)
3859 { 3859 {
3860 return NLMSG_ALIGN(sizeof(struct ifinfomsg)) 3860 return NLMSG_ALIGN(sizeof(struct ifinfomsg))
3861 + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ 3861 + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
3862 + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ 3862 + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
3863 + nla_total_size(4) /* IFLA_MTU */ 3863 + nla_total_size(4) /* IFLA_MTU */
3864 + nla_total_size(4) /* IFLA_LINK */ 3864 + nla_total_size(4) /* IFLA_LINK */
3865 + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ 3865 + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
3866 } 3866 }
3867 3867
3868 static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, 3868 static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
3869 int items, int bytes) 3869 int items, int bytes)
3870 { 3870 {
3871 int i; 3871 int i;
3872 int pad = bytes - sizeof(u64) * items; 3872 int pad = bytes - sizeof(u64) * items;
3873 BUG_ON(pad < 0); 3873 BUG_ON(pad < 0);
3874 3874
3875 /* Use put_unaligned() because stats may not be aligned for u64. */ 3875 /* Use put_unaligned() because stats may not be aligned for u64. */
3876 put_unaligned(items, &stats[0]); 3876 put_unaligned(items, &stats[0]);
3877 for (i = 1; i < items; i++) 3877 for (i = 1; i < items; i++)
3878 put_unaligned(atomic_long_read(&mib[i]), &stats[i]); 3878 put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
3879 3879
3880 memset(&stats[items], 0, pad); 3880 memset(&stats[items], 0, pad);
3881 } 3881 }
3882 3882
3883 static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib, 3883 static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
3884 int items, int bytes, size_t syncpoff) 3884 int items, int bytes, size_t syncpoff)
3885 { 3885 {
3886 int i; 3886 int i;
3887 int pad = bytes - sizeof(u64) * items; 3887 int pad = bytes - sizeof(u64) * items;
3888 BUG_ON(pad < 0); 3888 BUG_ON(pad < 0);
3889 3889
3890 /* Use put_unaligned() because stats may not be aligned for u64. */ 3890 /* Use put_unaligned() because stats may not be aligned for u64. */
3891 put_unaligned(items, &stats[0]); 3891 put_unaligned(items, &stats[0]);
3892 for (i = 1; i < items; i++) 3892 for (i = 1; i < items; i++)
3893 put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); 3893 put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
3894 3894
3895 memset(&stats[items], 0, pad); 3895 memset(&stats[items], 0, pad);
3896 } 3896 }
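
Both fillers store through put_unaligned() because the destination is a blob reserved with nla_reserve(), where the 8-byte counters need not land on 8-byte boundaries. The portable userspace analogue is a memcpy-based store, e.g.:

/* Sketch: unaligned u64 store, the portable analogue of put_unaligned(). */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void put_unaligned_u64(uint64_t v, void *p)
{
        memcpy(p, &v, sizeof(v));       /* byte-wise copy: no alignment trap */
}

int main(void)
{
        char buf[12];
        uint64_t v;

        put_unaligned_u64(0x1122334455667788ull, buf + 1);  /* odd offset */
        memcpy(&v, buf + 1, sizeof(v));
        printf("%llx\n", (unsigned long long)v);
        return 0;
}
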
3897 3897
3898 static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, 3898 static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
3899 int bytes) 3899 int bytes)
3900 { 3900 {
3901 switch (attrtype) { 3901 switch (attrtype) {
3902 case IFLA_INET6_STATS: 3902 case IFLA_INET6_STATS:
3903 __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6, 3903 __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6,
3904 IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); 3904 IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
3905 break; 3905 break;
3906 case IFLA_INET6_ICMP6STATS: 3906 case IFLA_INET6_ICMP6STATS:
3907 __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); 3907 __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
3908 break; 3908 break;
3909 } 3909 }
3910 } 3910 }
3911 3911
3912 static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) 3912 static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
3913 { 3913 {
3914 struct nlattr *nla; 3914 struct nlattr *nla;
3915 struct ifla_cacheinfo ci; 3915 struct ifla_cacheinfo ci;
3916 3916
3917 NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); 3917 NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags);
3918 3918
3919 ci.max_reasm_len = IPV6_MAXPLEN; 3919 ci.max_reasm_len = IPV6_MAXPLEN;
3920 ci.tstamp = cstamp_delta(idev->tstamp); 3920 ci.tstamp = cstamp_delta(idev->tstamp);
3921 ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); 3921 ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
3922 ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time); 3922 ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time);
3923 NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); 3923 NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci);
3924 3924
3925 nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); 3925 nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
3926 if (nla == NULL) 3926 if (nla == NULL)
3927 goto nla_put_failure; 3927 goto nla_put_failure;
3928 ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); 3928 ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla));
3929 3929
3930 /* XXX - MC not implemented */ 3930 /* XXX - MC not implemented */
3931 3931
3932 nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); 3932 nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
3933 if (nla == NULL) 3933 if (nla == NULL)
3934 goto nla_put_failure; 3934 goto nla_put_failure;
3935 snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); 3935 snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
3936 3936
3937 nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); 3937 nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
3938 if (nla == NULL) 3938 if (nla == NULL)
3939 goto nla_put_failure; 3939 goto nla_put_failure;
3940 snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); 3940 snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
3941 3941
3942 return 0; 3942 return 0;
3943 3943
3944 nla_put_failure: 3944 nla_put_failure:
3945 return -EMSGSIZE; 3945 return -EMSGSIZE;
3946 } 3946 }
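
Note that inet6_fill_ifla6_attrs() never jumps to nla_put_failure explicitly; the jump is hidden inside the NLA_PUT* macros of this kernel generation, which is why the label must exist in any function that uses them. Roughly, from include/net/netlink.h of the same era (shown as a reminder, not verbatim):

	#define NLA_PUT(skb, attrtype, attrlen, data)				\
		do {								\
			if (unlikely(nla_put(skb, attrtype, attrlen, data) < 0)) \
				goto nla_put_failure;				\
		} while (0)

So every NLA_PUT_U32()/NLA_PUT() above is a conditional goto: if the skb runs out of tailroom mid-fill, control lands on the label and the function returns -EMSGSIZE so the message can be retried with a larger buffer.
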
3947 3947
3948 static size_t inet6_get_link_af_size(const struct net_device *dev) 3948 static size_t inet6_get_link_af_size(const struct net_device *dev)
3949 { 3949 {
3950 if (!__in6_dev_get(dev)) 3950 if (!__in6_dev_get(dev))
3951 return 0; 3951 return 0;
3952 3952
3953 return inet6_ifla6_size(); 3953 return inet6_ifla6_size();
3954 } 3954 }
3955 3955
3956 static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) 3956 static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
3957 { 3957 {
3958 struct inet6_dev *idev = __in6_dev_get(dev); 3958 struct inet6_dev *idev = __in6_dev_get(dev);
3959 3959
3960 if (!idev) 3960 if (!idev)
3961 return -ENODATA; 3961 return -ENODATA;
3962 3962
3963 if (inet6_fill_ifla6_attrs(skb, idev) < 0) 3963 if (inet6_fill_ifla6_attrs(skb, idev) < 0)
3964 return -EMSGSIZE; 3964 return -EMSGSIZE;
3965 3965
3966 return 0; 3966 return 0;
3967 } 3967 }
3968 3968
3969 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 3969 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
3970 u32 pid, u32 seq, int event, unsigned int flags) 3970 u32 pid, u32 seq, int event, unsigned int flags)
3971 { 3971 {
3972 struct net_device *dev = idev->dev; 3972 struct net_device *dev = idev->dev;
3973 struct ifinfomsg *hdr; 3973 struct ifinfomsg *hdr;
3974 struct nlmsghdr *nlh; 3974 struct nlmsghdr *nlh;
3975 void *protoinfo; 3975 void *protoinfo;
3976 3976
3977 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); 3977 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
3978 if (nlh == NULL) 3978 if (nlh == NULL)
3979 return -EMSGSIZE; 3979 return -EMSGSIZE;
3980 3980
3981 hdr = nlmsg_data(nlh); 3981 hdr = nlmsg_data(nlh);
3982 hdr->ifi_family = AF_INET6; 3982 hdr->ifi_family = AF_INET6;
3983 hdr->__ifi_pad = 0; 3983 hdr->__ifi_pad = 0;
3984 hdr->ifi_type = dev->type; 3984 hdr->ifi_type = dev->type;
3985 hdr->ifi_index = dev->ifindex; 3985 hdr->ifi_index = dev->ifindex;
3986 hdr->ifi_flags = dev_get_flags(dev); 3986 hdr->ifi_flags = dev_get_flags(dev);
3987 hdr->ifi_change = 0; 3987 hdr->ifi_change = 0;
3988 3988
3989 NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); 3989 NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
3990 3990
3991 if (dev->addr_len) 3991 if (dev->addr_len)
3992 NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); 3992 NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
3993 3993
3994 NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); 3994 NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
3995 if (dev->ifindex != dev->iflink) 3995 if (dev->ifindex != dev->iflink)
3996 NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); 3996 NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
3997 3997
3998 protoinfo = nla_nest_start(skb, IFLA_PROTINFO); 3998 protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
3999 if (protoinfo == NULL) 3999 if (protoinfo == NULL)
4000 goto nla_put_failure; 4000 goto nla_put_failure;
4001 4001
4002 if (inet6_fill_ifla6_attrs(skb, idev) < 0) 4002 if (inet6_fill_ifla6_attrs(skb, idev) < 0)
4003 goto nla_put_failure; 4003 goto nla_put_failure;
4004 4004
4005 nla_nest_end(skb, protoinfo); 4005 nla_nest_end(skb, protoinfo);
4006 return nlmsg_end(skb, nlh); 4006 return nlmsg_end(skb, nlh);
4007 4007
4008 nla_put_failure: 4008 nla_put_failure:
4009 nlmsg_cancel(skb, nlh); 4009 nlmsg_cancel(skb, nlh);
4010 return -EMSGSIZE; 4010 return -EMSGSIZE;
4011 } 4011 }
4012 4012
4013 static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) 4013 static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
4014 { 4014 {
4015 struct net *net = sock_net(skb->sk); 4015 struct net *net = sock_net(skb->sk);
4016 int h, s_h; 4016 int h, s_h;
4017 int idx = 0, s_idx; 4017 int idx = 0, s_idx;
4018 struct net_device *dev; 4018 struct net_device *dev;
4019 struct inet6_dev *idev; 4019 struct inet6_dev *idev;
4020 struct hlist_head *head; 4020 struct hlist_head *head;
4021 struct hlist_node *node; 4021 struct hlist_node *node;
4022 4022
4023 s_h = cb->args[0]; 4023 s_h = cb->args[0];
4024 s_idx = cb->args[1]; 4024 s_idx = cb->args[1];
4025 4025
4026 rcu_read_lock(); 4026 rcu_read_lock();
4027 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 4027 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
4028 idx = 0; 4028 idx = 0;
4029 head = &net->dev_index_head[h]; 4029 head = &net->dev_index_head[h];
4030 hlist_for_each_entry_rcu(dev, node, head, index_hlist) { 4030 hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
4031 if (idx < s_idx) 4031 if (idx < s_idx)
4032 goto cont; 4032 goto cont;
4033 idev = __in6_dev_get(dev); 4033 idev = __in6_dev_get(dev);
4034 if (!idev) 4034 if (!idev)
4035 goto cont; 4035 goto cont;
4036 if (inet6_fill_ifinfo(skb, idev, 4036 if (inet6_fill_ifinfo(skb, idev,
4037 NETLINK_CB(cb->skb).pid, 4037 NETLINK_CB(cb->skb).pid,
4038 cb->nlh->nlmsg_seq, 4038 cb->nlh->nlmsg_seq,
4039 RTM_NEWLINK, NLM_F_MULTI) <= 0) 4039 RTM_NEWLINK, NLM_F_MULTI) <= 0)
4040 goto out; 4040 goto out;
4041 cont: 4041 cont:
4042 idx++; 4042 idx++;
4043 } 4043 }
4044 } 4044 }
4045 out: 4045 out:
4046 rcu_read_unlock(); 4046 rcu_read_unlock();
4047 cb->args[1] = idx; 4047 cb->args[1] = idx;
4048 cb->args[0] = h; 4048 cb->args[0] = h;
4049 4049
4050 return skb->len; 4050 return skb->len;
4051 } 4051 }
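
A netlink dump callback is invoked repeatedly until it returns no data, and cb->args[] is the per-dump scratch state that survives between invocations. Here args[0] remembers the hash bucket and args[1] the index within it, so when the skb fills up the walk suspends and the next invocation skips everything already emitted. The same pattern in self-contained form (plain C; emit() is a hypothetical callback returning 0 when the output buffer is full):

	#include <stddef.h>

	struct cursor { int bucket; int index; };	/* stands in for cb->args */

	static void dump(struct cursor *c, int nbuckets, int bucket_len,
			 int (*emit)(int bucket, int index))
	{
		int h, idx = 0;

		for (h = c->bucket; h < nbuckets; h++, c->index = 0) {
			for (idx = 0; idx < bucket_len; idx++) {
				if (idx < c->index)
					continue;	/* sent on a previous pass */
				if (!emit(h, idx))
					goto out;	/* buffer full: suspend */
			}
		}
	out:
		c->bucket = h;		/* resume point for the next call */
		c->index = idx;
	}
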
4052 4052
4053 void inet6_ifinfo_notify(int event, struct inet6_dev *idev) 4053 void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
4054 { 4054 {
4055 struct sk_buff *skb; 4055 struct sk_buff *skb;
4056 struct net *net = dev_net(idev->dev); 4056 struct net *net = dev_net(idev->dev);
4057 int err = -ENOBUFS; 4057 int err = -ENOBUFS;
4058 4058
4059 skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC); 4059 skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
4060 if (skb == NULL) 4060 if (skb == NULL)
4061 goto errout; 4061 goto errout;
4062 4062
4063 err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0); 4063 err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
4064 if (err < 0) { 4064 if (err < 0) {
4065 /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */ 4065 /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
4066 WARN_ON(err == -EMSGSIZE); 4066 WARN_ON(err == -EMSGSIZE);
4067 kfree_skb(skb); 4067 kfree_skb(skb);
4068 goto errout; 4068 goto errout;
4069 } 4069 }
4070 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); 4070 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
4071 return; 4071 return;
4072 errout: 4072 errout:
4073 if (err < 0) 4073 if (err < 0)
4074 rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); 4074 rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
4075 } 4075 }
4076 4076
4077 static inline size_t inet6_prefix_nlmsg_size(void) 4077 static inline size_t inet6_prefix_nlmsg_size(void)
4078 { 4078 {
4079 return NLMSG_ALIGN(sizeof(struct prefixmsg)) 4079 return NLMSG_ALIGN(sizeof(struct prefixmsg))
4080 + nla_total_size(sizeof(struct in6_addr)) 4080 + nla_total_size(sizeof(struct in6_addr))
4081 + nla_total_size(sizeof(struct prefix_cacheinfo)); 4081 + nla_total_size(sizeof(struct prefix_cacheinfo));
4082 } 4082 }
4083 4083
4084 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, 4084 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
4085 struct prefix_info *pinfo, u32 pid, u32 seq, 4085 struct prefix_info *pinfo, u32 pid, u32 seq,
4086 int event, unsigned int flags) 4086 int event, unsigned int flags)
4087 { 4087 {
4088 struct prefixmsg *pmsg; 4088 struct prefixmsg *pmsg;
4089 struct nlmsghdr *nlh; 4089 struct nlmsghdr *nlh;
4090 struct prefix_cacheinfo ci; 4090 struct prefix_cacheinfo ci;
4091 4091
4092 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags); 4092 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags);
4093 if (nlh == NULL) 4093 if (nlh == NULL)
4094 return -EMSGSIZE; 4094 return -EMSGSIZE;
4095 4095
4096 pmsg = nlmsg_data(nlh); 4096 pmsg = nlmsg_data(nlh);
4097 pmsg->prefix_family = AF_INET6; 4097 pmsg->prefix_family = AF_INET6;
4098 pmsg->prefix_pad1 = 0; 4098 pmsg->prefix_pad1 = 0;
4099 pmsg->prefix_pad2 = 0; 4099 pmsg->prefix_pad2 = 0;
4100 pmsg->prefix_ifindex = idev->dev->ifindex; 4100 pmsg->prefix_ifindex = idev->dev->ifindex;
4101 pmsg->prefix_len = pinfo->prefix_len; 4101 pmsg->prefix_len = pinfo->prefix_len;
4102 pmsg->prefix_type = pinfo->type; 4102 pmsg->prefix_type = pinfo->type;
4103 pmsg->prefix_pad3 = 0; 4103 pmsg->prefix_pad3 = 0;
4104 pmsg->prefix_flags = 0; 4104 pmsg->prefix_flags = 0;
4105 if (pinfo->onlink) 4105 if (pinfo->onlink)
4106 pmsg->prefix_flags |= IF_PREFIX_ONLINK; 4106 pmsg->prefix_flags |= IF_PREFIX_ONLINK;
4107 if (pinfo->autoconf) 4107 if (pinfo->autoconf)
4108 pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; 4108 pmsg->prefix_flags |= IF_PREFIX_AUTOCONF;
4109 4109
4110 NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); 4110 NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix);
4111 4111
4112 ci.preferred_time = ntohl(pinfo->prefered); 4112 ci.preferred_time = ntohl(pinfo->prefered);
4113 ci.valid_time = ntohl(pinfo->valid); 4113 ci.valid_time = ntohl(pinfo->valid);
4114 NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); 4114 NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci);
4115 4115
4116 return nlmsg_end(skb, nlh); 4116 return nlmsg_end(skb, nlh);
4117 4117
4118 nla_put_failure: 4118 nla_put_failure:
4119 nlmsg_cancel(skb, nlh); 4119 nlmsg_cancel(skb, nlh);
4120 return -EMSGSIZE; 4120 return -EMSGSIZE;
4121 } 4121 }
4122 4122
4123 static void inet6_prefix_notify(int event, struct inet6_dev *idev, 4123 static void inet6_prefix_notify(int event, struct inet6_dev *idev,
4124 struct prefix_info *pinfo) 4124 struct prefix_info *pinfo)
4125 { 4125 {
4126 struct sk_buff *skb; 4126 struct sk_buff *skb;
4127 struct net *net = dev_net(idev->dev); 4127 struct net *net = dev_net(idev->dev);
4128 int err = -ENOBUFS; 4128 int err = -ENOBUFS;
4129 4129
4130 skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC); 4130 skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
4131 if (skb == NULL) 4131 if (skb == NULL)
4132 goto errout; 4132 goto errout;
4133 4133
4134 err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0); 4134 err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
4135 if (err < 0) { 4135 if (err < 0) {
4136 /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */ 4136 /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
4137 WARN_ON(err == -EMSGSIZE); 4137 WARN_ON(err == -EMSGSIZE);
4138 kfree_skb(skb); 4138 kfree_skb(skb);
4139 goto errout; 4139 goto errout;
4140 } 4140 }
4141 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); 4141 rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
4142 return; 4142 return;
4143 errout: 4143 errout:
4144 if (err < 0) 4144 if (err < 0)
4145 rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); 4145 rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
4146 } 4146 }
4147 4147
4148 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) 4148 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
4149 { 4149 {
4150 inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); 4150 inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);
4151 4151
4152 switch (event) { 4152 switch (event) {
4153 case RTM_NEWADDR: 4153 case RTM_NEWADDR:
4154 /* 4154 /*
4155 * If the address was optimistic 4155 * If the address was optimistic
4156 * we inserted the route at the start of 4156 * we inserted the route at the start of
4157 * our DAD process, so we don't need 4157 * our DAD process, so we don't need
4158 * to do it again 4158 * to do it again
4159 */ 4159 */
4160 if (!(ifp->rt->rt6i_node)) 4160 if (!(ifp->rt->rt6i_node))
4161 ip6_ins_rt(ifp->rt); 4161 ip6_ins_rt(ifp->rt);
4162 if (ifp->idev->cnf.forwarding) 4162 if (ifp->idev->cnf.forwarding)
4163 addrconf_join_anycast(ifp); 4163 addrconf_join_anycast(ifp);
4164 break; 4164 break;
4165 case RTM_DELADDR: 4165 case RTM_DELADDR:
4166 if (ifp->idev->cnf.forwarding) 4166 if (ifp->idev->cnf.forwarding)
4167 addrconf_leave_anycast(ifp); 4167 addrconf_leave_anycast(ifp);
4168 addrconf_leave_solict(ifp->idev, &ifp->addr); 4168 addrconf_leave_solict(ifp->idev, &ifp->addr);
4169 dst_hold(&ifp->rt->dst); 4169 dst_hold(&ifp->rt->dst);
4170 4170
4171 if (ip6_del_rt(ifp->rt)) 4171 if (ip6_del_rt(ifp->rt))
4172 dst_free(&ifp->rt->dst); 4172 dst_free(&ifp->rt->dst);
4173 break; 4173 break;
4174 } 4174 }
4175 } 4175 }
4176 4176
4177 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) 4177 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
4178 { 4178 {
4179 rcu_read_lock_bh(); 4179 rcu_read_lock_bh();
4180 if (likely(ifp->idev->dead == 0)) 4180 if (likely(ifp->idev->dead == 0))
4181 __ipv6_ifa_notify(event, ifp); 4181 __ipv6_ifa_notify(event, ifp);
4182 rcu_read_unlock_bh(); 4182 rcu_read_unlock_bh();
4183 } 4183 }
4184 4184
4185 #ifdef CONFIG_SYSCTL 4185 #ifdef CONFIG_SYSCTL
4186 4186
4187 static 4187 static
4188 int addrconf_sysctl_forward(ctl_table *ctl, int write, 4188 int addrconf_sysctl_forward(ctl_table *ctl, int write,
4189 void __user *buffer, size_t *lenp, loff_t *ppos) 4189 void __user *buffer, size_t *lenp, loff_t *ppos)
4190 { 4190 {
4191 int *valp = ctl->data; 4191 int *valp = ctl->data;
4192 int val = *valp; 4192 int val = *valp;
4193 loff_t pos = *ppos; 4193 loff_t pos = *ppos;
4194 int ret; 4194 int ret;
4195 4195
4196 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 4196 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
4197 4197
4198 if (write) 4198 if (write)
4199 ret = addrconf_fixup_forwarding(ctl, valp, val); 4199 ret = addrconf_fixup_forwarding(ctl, valp, val);
4200 if (ret) 4200 if (ret)
4201 *ppos = pos; 4201 *ppos = pos;
4202 return ret; 4202 return ret;
4203 } 4203 }
4204 4204
4205 static void dev_disable_change(struct inet6_dev *idev) 4205 static void dev_disable_change(struct inet6_dev *idev)
4206 { 4206 {
4207 if (!idev || !idev->dev) 4207 if (!idev || !idev->dev)
4208 return; 4208 return;
4209 4209
4210 if (idev->cnf.disable_ipv6) 4210 if (idev->cnf.disable_ipv6)
4211 addrconf_notify(NULL, NETDEV_DOWN, idev->dev); 4211 addrconf_notify(NULL, NETDEV_DOWN, idev->dev);
4212 else 4212 else
4213 addrconf_notify(NULL, NETDEV_UP, idev->dev); 4213 addrconf_notify(NULL, NETDEV_UP, idev->dev);
4214 } 4214 }
4215 4215
4216 static void addrconf_disable_change(struct net *net, __s32 newf) 4216 static void addrconf_disable_change(struct net *net, __s32 newf)
4217 { 4217 {
4218 struct net_device *dev; 4218 struct net_device *dev;
4219 struct inet6_dev *idev; 4219 struct inet6_dev *idev;
4220 4220
4221 rcu_read_lock(); 4221 rcu_read_lock();
4222 for_each_netdev_rcu(net, dev) { 4222 for_each_netdev_rcu(net, dev) {
4223 idev = __in6_dev_get(dev); 4223 idev = __in6_dev_get(dev);
4224 if (idev) { 4224 if (idev) {
4225 int changed = (!idev->cnf.disable_ipv6) ^ (!newf); 4225 int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
4226 idev->cnf.disable_ipv6 = newf; 4226 idev->cnf.disable_ipv6 = newf;
4227 if (changed) 4227 if (changed)
4228 dev_disable_change(idev); 4228 dev_disable_change(idev);
4229 } 4229 }
4230 } 4230 }
4231 rcu_read_unlock(); 4231 rcu_read_unlock();
4232 } 4232 }
4233 4233
4234 static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) 4234 static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
4235 { 4235 {
4236 struct net *net; 4236 struct net *net;
4237 4237
4238 net = (struct net *)table->extra2; 4238 net = (struct net *)table->extra2;
4239 4239
4240 if (p == &net->ipv6.devconf_dflt->disable_ipv6) 4240 if (p == &net->ipv6.devconf_dflt->disable_ipv6)
4241 return 0; 4241 return 0;
4242 4242
4243 if (!rtnl_trylock()) { 4243 if (!rtnl_trylock()) {
4244 /* Restore the original values before restarting */ 4244 /* Restore the original values before restarting */
4245 *p = old; 4245 *p = old;
4246 return restart_syscall(); 4246 return restart_syscall();
4247 } 4247 }
4248 4248
4249 if (p == &net->ipv6.devconf_all->disable_ipv6) { 4249 if (p == &net->ipv6.devconf_all->disable_ipv6) {
4250 __s32 newf = net->ipv6.devconf_all->disable_ipv6; 4250 __s32 newf = net->ipv6.devconf_all->disable_ipv6;
4251 net->ipv6.devconf_dflt->disable_ipv6 = newf; 4251 net->ipv6.devconf_dflt->disable_ipv6 = newf;
4252 addrconf_disable_change(net, newf); 4252 addrconf_disable_change(net, newf);
4253 } else if ((!*p) ^ (!old)) 4253 } else if ((!*p) ^ (!old))
4254 dev_disable_change((struct inet6_dev *)table->extra1); 4254 dev_disable_change((struct inet6_dev *)table->extra1);
4255 4255
4256 rtnl_unlock(); 4256 rtnl_unlock();
4257 return 0; 4257 return 0;
4258 } 4258 }
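
addrconf_disable_ipv6() needs the RTNL while it is already running under the sysctl machinery's own locking, and other code paths take those locks in the opposite order. rtnl_trylock() plus restart_syscall() breaks the potential ABBA deadlock: if the RTNL is contended, the handler rolls back the value proc_dointvec() already stored and lets the whole write(2) restart from the top. A user-space analogue of the idiom (a hedged sketch; -EAGAIN stands in for the syscall restart):

	#include <pthread.h>
	#include <errno.h>

	static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

	/* Called with lock A already held by our caller. */
	static int set_value(int *val, int newval, int oldval)
	{
		*val = newval;				/* speculative store */
		if (pthread_mutex_trylock(&lock_b)) {	/* nonzero: contended */
			*val = oldval;			/* roll back */
			return -EAGAIN;			/* caller restarts */
		}
		/* newval may now be propagated under both locks */
		pthread_mutex_unlock(&lock_b);
		return 0;
	}
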
4259 4259
4260 static 4260 static
4261 int addrconf_sysctl_disable(ctl_table *ctl, int write, 4261 int addrconf_sysctl_disable(ctl_table *ctl, int write,
4262 void __user *buffer, size_t *lenp, loff_t *ppos) 4262 void __user *buffer, size_t *lenp, loff_t *ppos)
4263 { 4263 {
4264 int *valp = ctl->data; 4264 int *valp = ctl->data;
4265 int val = *valp; 4265 int val = *valp;
4266 loff_t pos = *ppos; 4266 loff_t pos = *ppos;
4267 int ret; 4267 int ret;
4268 4268
4269 ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 4269 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
4270 4270
4271 if (write) 4271 if (write)
4272 ret = addrconf_disable_ipv6(ctl, valp, val); 4272 ret = addrconf_disable_ipv6(ctl, valp, val);
4273 if (ret) 4273 if (ret)
4274 *ppos = pos; 4274 *ppos = pos;
4275 return ret; 4275 return ret;
4276 } 4276 }
4277 4277
4278 static struct addrconf_sysctl_table 4278 static struct addrconf_sysctl_table
4279 { 4279 {
4280 struct ctl_table_header *sysctl_header; 4280 struct ctl_table_header *sysctl_header;
4281 ctl_table addrconf_vars[DEVCONF_MAX+1]; 4281 ctl_table addrconf_vars[DEVCONF_MAX+1];
4282 char *dev_name; 4282 char *dev_name;
4283 } addrconf_sysctl __read_mostly = { 4283 } addrconf_sysctl __read_mostly = {
4284 .sysctl_header = NULL, 4284 .sysctl_header = NULL,
4285 .addrconf_vars = { 4285 .addrconf_vars = {
4286 { 4286 {
4287 .procname = "forwarding", 4287 .procname = "forwarding",
4288 .data = &ipv6_devconf.forwarding, 4288 .data = &ipv6_devconf.forwarding,
4289 .maxlen = sizeof(int), 4289 .maxlen = sizeof(int),
4290 .mode = 0644, 4290 .mode = 0644,
4291 .proc_handler = addrconf_sysctl_forward, 4291 .proc_handler = addrconf_sysctl_forward,
4292 }, 4292 },
4293 { 4293 {
4294 .procname = "hop_limit", 4294 .procname = "hop_limit",
4295 .data = &ipv6_devconf.hop_limit, 4295 .data = &ipv6_devconf.hop_limit,
4296 .maxlen = sizeof(int), 4296 .maxlen = sizeof(int),
4297 .mode = 0644, 4297 .mode = 0644,
4298 .proc_handler = proc_dointvec, 4298 .proc_handler = proc_dointvec,
4299 }, 4299 },
4300 { 4300 {
4301 .procname = "mtu", 4301 .procname = "mtu",
4302 .data = &ipv6_devconf.mtu6, 4302 .data = &ipv6_devconf.mtu6,
4303 .maxlen = sizeof(int), 4303 .maxlen = sizeof(int),
4304 .mode = 0644, 4304 .mode = 0644,
4305 .proc_handler = proc_dointvec, 4305 .proc_handler = proc_dointvec,
4306 }, 4306 },
4307 { 4307 {
4308 .procname = "accept_ra", 4308 .procname = "accept_ra",
4309 .data = &ipv6_devconf.accept_ra, 4309 .data = &ipv6_devconf.accept_ra,
4310 .maxlen = sizeof(int), 4310 .maxlen = sizeof(int),
4311 .mode = 0644, 4311 .mode = 0644,
4312 .proc_handler = proc_dointvec, 4312 .proc_handler = proc_dointvec,
4313 }, 4313 },
4314 { 4314 {
4315 .procname = "accept_redirects", 4315 .procname = "accept_redirects",
4316 .data = &ipv6_devconf.accept_redirects, 4316 .data = &ipv6_devconf.accept_redirects,
4317 .maxlen = sizeof(int), 4317 .maxlen = sizeof(int),
4318 .mode = 0644, 4318 .mode = 0644,
4319 .proc_handler = proc_dointvec, 4319 .proc_handler = proc_dointvec,
4320 }, 4320 },
4321 { 4321 {
4322 .procname = "autoconf", 4322 .procname = "autoconf",
4323 .data = &ipv6_devconf.autoconf, 4323 .data = &ipv6_devconf.autoconf,
4324 .maxlen = sizeof(int), 4324 .maxlen = sizeof(int),
4325 .mode = 0644, 4325 .mode = 0644,
4326 .proc_handler = proc_dointvec, 4326 .proc_handler = proc_dointvec,
4327 }, 4327 },
4328 { 4328 {
4329 .procname = "dad_transmits", 4329 .procname = "dad_transmits",
4330 .data = &ipv6_devconf.dad_transmits, 4330 .data = &ipv6_devconf.dad_transmits,
4331 .maxlen = sizeof(int), 4331 .maxlen = sizeof(int),
4332 .mode = 0644, 4332 .mode = 0644,
4333 .proc_handler = proc_dointvec, 4333 .proc_handler = proc_dointvec,
4334 }, 4334 },
4335 { 4335 {
4336 .procname = "router_solicitations", 4336 .procname = "router_solicitations",
4337 .data = &ipv6_devconf.rtr_solicits, 4337 .data = &ipv6_devconf.rtr_solicits,
4338 .maxlen = sizeof(int), 4338 .maxlen = sizeof(int),
4339 .mode = 0644, 4339 .mode = 0644,
4340 .proc_handler = proc_dointvec, 4340 .proc_handler = proc_dointvec,
4341 }, 4341 },
4342 { 4342 {
4343 .procname = "router_solicitation_interval", 4343 .procname = "router_solicitation_interval",
4344 .data = &ipv6_devconf.rtr_solicit_interval, 4344 .data = &ipv6_devconf.rtr_solicit_interval,
4345 .maxlen = sizeof(int), 4345 .maxlen = sizeof(int),
4346 .mode = 0644, 4346 .mode = 0644,
4347 .proc_handler = proc_dointvec_jiffies, 4347 .proc_handler = proc_dointvec_jiffies,
4348 }, 4348 },
4349 { 4349 {
4350 .procname = "router_solicitation_delay", 4350 .procname = "router_solicitation_delay",
4351 .data = &ipv6_devconf.rtr_solicit_delay, 4351 .data = &ipv6_devconf.rtr_solicit_delay,
4352 .maxlen = sizeof(int), 4352 .maxlen = sizeof(int),
4353 .mode = 0644, 4353 .mode = 0644,
4354 .proc_handler = proc_dointvec_jiffies, 4354 .proc_handler = proc_dointvec_jiffies,
4355 }, 4355 },
4356 { 4356 {
4357 .procname = "force_mld_version", 4357 .procname = "force_mld_version",
4358 .data = &ipv6_devconf.force_mld_version, 4358 .data = &ipv6_devconf.force_mld_version,
4359 .maxlen = sizeof(int), 4359 .maxlen = sizeof(int),
4360 .mode = 0644, 4360 .mode = 0644,
4361 .proc_handler = proc_dointvec, 4361 .proc_handler = proc_dointvec,
4362 }, 4362 },
4363 #ifdef CONFIG_IPV6_PRIVACY 4363 #ifdef CONFIG_IPV6_PRIVACY
4364 { 4364 {
4365 .procname = "use_tempaddr", 4365 .procname = "use_tempaddr",
4366 .data = &ipv6_devconf.use_tempaddr, 4366 .data = &ipv6_devconf.use_tempaddr,
4367 .maxlen = sizeof(int), 4367 .maxlen = sizeof(int),
4368 .mode = 0644, 4368 .mode = 0644,
4369 .proc_handler = proc_dointvec, 4369 .proc_handler = proc_dointvec,
4370 }, 4370 },
4371 { 4371 {
4372 .procname = "temp_valid_lft", 4372 .procname = "temp_valid_lft",
4373 .data = &ipv6_devconf.temp_valid_lft, 4373 .data = &ipv6_devconf.temp_valid_lft,
4374 .maxlen = sizeof(int), 4374 .maxlen = sizeof(int),
4375 .mode = 0644, 4375 .mode = 0644,
4376 .proc_handler = proc_dointvec, 4376 .proc_handler = proc_dointvec,
4377 }, 4377 },
4378 { 4378 {
4379 .procname = "temp_prefered_lft", 4379 .procname = "temp_prefered_lft",
4380 .data = &ipv6_devconf.temp_prefered_lft, 4380 .data = &ipv6_devconf.temp_prefered_lft,
4381 .maxlen = sizeof(int), 4381 .maxlen = sizeof(int),
4382 .mode = 0644, 4382 .mode = 0644,
4383 .proc_handler = proc_dointvec, 4383 .proc_handler = proc_dointvec,
4384 }, 4384 },
4385 { 4385 {
4386 .procname = "regen_max_retry", 4386 .procname = "regen_max_retry",
4387 .data = &ipv6_devconf.regen_max_retry, 4387 .data = &ipv6_devconf.regen_max_retry,
4388 .maxlen = sizeof(int), 4388 .maxlen = sizeof(int),
4389 .mode = 0644, 4389 .mode = 0644,
4390 .proc_handler = proc_dointvec, 4390 .proc_handler = proc_dointvec,
4391 }, 4391 },
4392 { 4392 {
4393 .procname = "max_desync_factor", 4393 .procname = "max_desync_factor",
4394 .data = &ipv6_devconf.max_desync_factor, 4394 .data = &ipv6_devconf.max_desync_factor,
4395 .maxlen = sizeof(int), 4395 .maxlen = sizeof(int),
4396 .mode = 0644, 4396 .mode = 0644,
4397 .proc_handler = proc_dointvec, 4397 .proc_handler = proc_dointvec,
4398 }, 4398 },
4399 #endif 4399 #endif
4400 { 4400 {
4401 .procname = "max_addresses", 4401 .procname = "max_addresses",
4402 .data = &ipv6_devconf.max_addresses, 4402 .data = &ipv6_devconf.max_addresses,
4403 .maxlen = sizeof(int), 4403 .maxlen = sizeof(int),
4404 .mode = 0644, 4404 .mode = 0644,
4405 .proc_handler = proc_dointvec, 4405 .proc_handler = proc_dointvec,
4406 }, 4406 },
4407 { 4407 {
4408 .procname = "accept_ra_defrtr", 4408 .procname = "accept_ra_defrtr",
4409 .data = &ipv6_devconf.accept_ra_defrtr, 4409 .data = &ipv6_devconf.accept_ra_defrtr,
4410 .maxlen = sizeof(int), 4410 .maxlen = sizeof(int),
4411 .mode = 0644, 4411 .mode = 0644,
4412 .proc_handler = proc_dointvec, 4412 .proc_handler = proc_dointvec,
4413 }, 4413 },
4414 { 4414 {
4415 .procname = "accept_ra_pinfo", 4415 .procname = "accept_ra_pinfo",
4416 .data = &ipv6_devconf.accept_ra_pinfo, 4416 .data = &ipv6_devconf.accept_ra_pinfo,
4417 .maxlen = sizeof(int), 4417 .maxlen = sizeof(int),
4418 .mode = 0644, 4418 .mode = 0644,
4419 .proc_handler = proc_dointvec, 4419 .proc_handler = proc_dointvec,
4420 }, 4420 },
4421 #ifdef CONFIG_IPV6_ROUTER_PREF 4421 #ifdef CONFIG_IPV6_ROUTER_PREF
4422 { 4422 {
4423 .procname = "accept_ra_rtr_pref", 4423 .procname = "accept_ra_rtr_pref",
4424 .data = &ipv6_devconf.accept_ra_rtr_pref, 4424 .data = &ipv6_devconf.accept_ra_rtr_pref,
4425 .maxlen = sizeof(int), 4425 .maxlen = sizeof(int),
4426 .mode = 0644, 4426 .mode = 0644,
4427 .proc_handler = proc_dointvec, 4427 .proc_handler = proc_dointvec,
4428 }, 4428 },
4429 { 4429 {
4430 .procname = "router_probe_interval", 4430 .procname = "router_probe_interval",
4431 .data = &ipv6_devconf.rtr_probe_interval, 4431 .data = &ipv6_devconf.rtr_probe_interval,
4432 .maxlen = sizeof(int), 4432 .maxlen = sizeof(int),
4433 .mode = 0644, 4433 .mode = 0644,
4434 .proc_handler = proc_dointvec_jiffies, 4434 .proc_handler = proc_dointvec_jiffies,
4435 }, 4435 },
4436 #ifdef CONFIG_IPV6_ROUTE_INFO 4436 #ifdef CONFIG_IPV6_ROUTE_INFO
4437 { 4437 {
4438 .procname = "accept_ra_rt_info_max_plen", 4438 .procname = "accept_ra_rt_info_max_plen",
4439 .data = &ipv6_devconf.accept_ra_rt_info_max_plen, 4439 .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
4440 .maxlen = sizeof(int), 4440 .maxlen = sizeof(int),
4441 .mode = 0644, 4441 .mode = 0644,
4442 .proc_handler = proc_dointvec, 4442 .proc_handler = proc_dointvec,
4443 }, 4443 },
4444 #endif 4444 #endif
4445 #endif 4445 #endif
4446 { 4446 {
4447 .procname = "proxy_ndp", 4447 .procname = "proxy_ndp",
4448 .data = &ipv6_devconf.proxy_ndp, 4448 .data = &ipv6_devconf.proxy_ndp,
4449 .maxlen = sizeof(int), 4449 .maxlen = sizeof(int),
4450 .mode = 0644, 4450 .mode = 0644,
4451 .proc_handler = proc_dointvec, 4451 .proc_handler = proc_dointvec,
4452 }, 4452 },
4453 { 4453 {
4454 .procname = "accept_source_route", 4454 .procname = "accept_source_route",
4455 .data = &ipv6_devconf.accept_source_route, 4455 .data = &ipv6_devconf.accept_source_route,
4456 .maxlen = sizeof(int), 4456 .maxlen = sizeof(int),
4457 .mode = 0644, 4457 .mode = 0644,
4458 .proc_handler = proc_dointvec, 4458 .proc_handler = proc_dointvec,
4459 }, 4459 },
4460 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 4460 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
4461 { 4461 {
4462 .procname = "optimistic_dad", 4462 .procname = "optimistic_dad",
4463 .data = &ipv6_devconf.optimistic_dad, 4463 .data = &ipv6_devconf.optimistic_dad,
4464 .maxlen = sizeof(int), 4464 .maxlen = sizeof(int),
4465 .mode = 0644, 4465 .mode = 0644,
4466 .proc_handler = proc_dointvec, 4466 .proc_handler = proc_dointvec,
4467 4467
4468 }, 4468 },
4469 #endif 4469 #endif
4470 #ifdef CONFIG_IPV6_MROUTE 4470 #ifdef CONFIG_IPV6_MROUTE
4471 { 4471 {
4472 .procname = "mc_forwarding", 4472 .procname = "mc_forwarding",
4473 .data = &ipv6_devconf.mc_forwarding, 4473 .data = &ipv6_devconf.mc_forwarding,
4474 .maxlen = sizeof(int), 4474 .maxlen = sizeof(int),
4475 .mode = 0444, 4475 .mode = 0444,
4476 .proc_handler = proc_dointvec, 4476 .proc_handler = proc_dointvec,
4477 }, 4477 },
4478 #endif 4478 #endif
4479 { 4479 {
4480 .procname = "disable_ipv6", 4480 .procname = "disable_ipv6",
4481 .data = &ipv6_devconf.disable_ipv6, 4481 .data = &ipv6_devconf.disable_ipv6,
4482 .maxlen = sizeof(int), 4482 .maxlen = sizeof(int),
4483 .mode = 0644, 4483 .mode = 0644,
4484 .proc_handler = addrconf_sysctl_disable, 4484 .proc_handler = addrconf_sysctl_disable,
4485 }, 4485 },
4486 { 4486 {
4487 .procname = "accept_dad", 4487 .procname = "accept_dad",
4488 .data = &ipv6_devconf.accept_dad, 4488 .data = &ipv6_devconf.accept_dad,
4489 .maxlen = sizeof(int), 4489 .maxlen = sizeof(int),
4490 .mode = 0644, 4490 .mode = 0644,
4491 .proc_handler = proc_dointvec, 4491 .proc_handler = proc_dointvec,
4492 }, 4492 },
4493 { 4493 {
4494 .procname = "force_tllao", 4494 .procname = "force_tllao",
4495 .data = &ipv6_devconf.force_tllao, 4495 .data = &ipv6_devconf.force_tllao,
4496 .maxlen = sizeof(int), 4496 .maxlen = sizeof(int),
4497 .mode = 0644, 4497 .mode = 0644,
4498 .proc_handler = proc_dointvec 4498 .proc_handler = proc_dointvec
4499 }, 4499 },
4500 { 4500 {
4501 /* sentinel */ 4501 /* sentinel */
4502 } 4502 }
4503 }, 4503 },
4504 }; 4504 };
4505 4505
4506 static int __addrconf_sysctl_register(struct net *net, char *dev_name, 4506 static int __addrconf_sysctl_register(struct net *net, char *dev_name,
4507 struct inet6_dev *idev, struct ipv6_devconf *p) 4507 struct inet6_dev *idev, struct ipv6_devconf *p)
4508 { 4508 {
4509 int i; 4509 int i;
4510 struct addrconf_sysctl_table *t; 4510 struct addrconf_sysctl_table *t;
4511 4511
4512 #define ADDRCONF_CTL_PATH_DEV 3 4512 #define ADDRCONF_CTL_PATH_DEV 3
4513 4513
4514 struct ctl_path addrconf_ctl_path[] = { 4514 struct ctl_path addrconf_ctl_path[] = {
4515 { .procname = "net", }, 4515 { .procname = "net", },
4516 { .procname = "ipv6", }, 4516 { .procname = "ipv6", },
4517 { .procname = "conf", }, 4517 { .procname = "conf", },
4518 { /* to be set */ }, 4518 { /* to be set */ },
4519 { }, 4519 { },
4520 }; 4520 };
4521 4521
4522 4522
4523 t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); 4523 t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
4524 if (t == NULL) 4524 if (t == NULL)
4525 goto out; 4525 goto out;
4526 4526
4527 for (i = 0; t->addrconf_vars[i].data; i++) { 4527 for (i = 0; t->addrconf_vars[i].data; i++) {
4528 t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; 4528 t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
4529 t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ 4529 t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
4530 t->addrconf_vars[i].extra2 = net; 4530 t->addrconf_vars[i].extra2 = net;
4531 } 4531 }
4532 4532
4533 /* 4533 /*
4534 * Make a copy of dev_name, because '.procname' is regarded as const 4534 * Make a copy of dev_name, because '.procname' is regarded as const
4535 * by sysctl and we wouldn't want anyone to change it under our feet 4535 * by sysctl and we wouldn't want anyone to change it under our feet
4536 * (see SIOCSIFNAME). 4536 * (see SIOCSIFNAME).
4537 */ 4537 */
4538 t->dev_name = kstrdup(dev_name, GFP_KERNEL); 4538 t->dev_name = kstrdup(dev_name, GFP_KERNEL);
4539 if (!t->dev_name) 4539 if (!t->dev_name)
4540 goto free; 4540 goto free;
4541 4541
4542 addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; 4542 addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name;
4543 4543
4544 t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, 4544 t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path,
4545 t->addrconf_vars); 4545 t->addrconf_vars);
4546 if (t->sysctl_header == NULL) 4546 if (t->sysctl_header == NULL)
4547 goto free_procname; 4547 goto free_procname;
4548 4548
4549 p->sysctl = t; 4549 p->sysctl = t;
4550 return 0; 4550 return 0;
4551 4551
4552 free_procname: 4552 free_procname:
4553 kfree(t->dev_name); 4553 kfree(t->dev_name);
4554 free: 4554 free:
4555 kfree(t); 4555 kfree(t);
4556 out: 4556 out:
4557 return -ENOBUFS; 4557 return -ENOBUFS;
4558 } 4558 }
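
The interesting line above is the .data fixup: every entry in the static template points into the global ipv6_devconf, and adding the byte distance between the per-net/per-device copy p and that global retargets each pointer at the corresponding field of p, since the two structs share one layout. An isolated demonstration of the same arithmetic in plain C:

	#include <stdio.h>

	struct conf { int forwarding; int mtu; };

	static struct conf global = { .forwarding = 1, .mtu = 1500 };

	int main(void)
	{
		int *vars[] = { &global.forwarding, &global.mtu };	/* template */
		struct conf copy = { .forwarding = 0, .mtu = 1280 };
		int i;

		for (i = 0; i < 2; i++)		/* rebase, as above */
			vars[i] = (int *)((char *)vars[i] +
					  ((char *)&copy - (char *)&global));

		printf("%d %d\n", *vars[0], *vars[1]);	/* prints: 0 1280 */
		return 0;
	}
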
4559 4559
4560 static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) 4560 static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
4561 { 4561 {
4562 struct addrconf_sysctl_table *t; 4562 struct addrconf_sysctl_table *t;
4563 4563
4564 if (p->sysctl == NULL) 4564 if (p->sysctl == NULL)
4565 return; 4565 return;
4566 4566
4567 t = p->sysctl; 4567 t = p->sysctl;
4568 p->sysctl = NULL; 4568 p->sysctl = NULL;
4569 unregister_net_sysctl_table(t->sysctl_header); 4569 unregister_net_sysctl_table(t->sysctl_header);
4570 kfree(t->dev_name); 4570 kfree(t->dev_name);
4571 kfree(t); 4571 kfree(t);
4572 } 4572 }
4573 4573
4574 static void addrconf_sysctl_register(struct inet6_dev *idev) 4574 static void addrconf_sysctl_register(struct inet6_dev *idev)
4575 { 4575 {
4576 neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6", 4576 neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6",
4577 &ndisc_ifinfo_sysctl_change); 4577 &ndisc_ifinfo_sysctl_change);
4578 __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, 4578 __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
4579 idev, &idev->cnf); 4579 idev, &idev->cnf);
4580 } 4580 }
4581 4581
4582 static void addrconf_sysctl_unregister(struct inet6_dev *idev) 4582 static void addrconf_sysctl_unregister(struct inet6_dev *idev)
4583 { 4583 {
4584 __addrconf_sysctl_unregister(&idev->cnf); 4584 __addrconf_sysctl_unregister(&idev->cnf);
4585 neigh_sysctl_unregister(idev->nd_parms); 4585 neigh_sysctl_unregister(idev->nd_parms);
4586 } 4586 }
4587 4587
4588 4588
4589 #endif 4589 #endif
4590 4590
4591 static int __net_init addrconf_init_net(struct net *net) 4591 static int __net_init addrconf_init_net(struct net *net)
4592 { 4592 {
4593 int err; 4593 int err;
4594 struct ipv6_devconf *all, *dflt; 4594 struct ipv6_devconf *all, *dflt;
4595 4595
4596 err = -ENOMEM; 4596 err = -ENOMEM;
4597 all = &ipv6_devconf; 4597 all = &ipv6_devconf;
4598 dflt = &ipv6_devconf_dflt; 4598 dflt = &ipv6_devconf_dflt;
4599 4599
4600 if (!net_eq(net, &init_net)) { 4600 if (!net_eq(net, &init_net)) {
4601 all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); 4601 all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL);
4602 if (all == NULL) 4602 if (all == NULL)
4603 goto err_alloc_all; 4603 goto err_alloc_all;
4604 4604
4605 dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); 4605 dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
4606 if (dflt == NULL) 4606 if (dflt == NULL)
4607 goto err_alloc_dflt; 4607 goto err_alloc_dflt;
4608 } else { 4608 } else {
4609 /* these will be inherited by all namespaces */ 4609 /* these will be inherited by all namespaces */
4610 dflt->autoconf = ipv6_defaults.autoconf; 4610 dflt->autoconf = ipv6_defaults.autoconf;
4611 dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; 4611 dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
4612 } 4612 }
4613 4613
4614 net->ipv6.devconf_all = all; 4614 net->ipv6.devconf_all = all;
4615 net->ipv6.devconf_dflt = dflt; 4615 net->ipv6.devconf_dflt = dflt;
4616 4616
4617 #ifdef CONFIG_SYSCTL 4617 #ifdef CONFIG_SYSCTL
4618 err = __addrconf_sysctl_register(net, "all", NULL, all); 4618 err = __addrconf_sysctl_register(net, "all", NULL, all);
4619 if (err < 0) 4619 if (err < 0)
4620 goto err_reg_all; 4620 goto err_reg_all;
4621 4621
4622 err = __addrconf_sysctl_register(net, "default", NULL, dflt); 4622 err = __addrconf_sysctl_register(net, "default", NULL, dflt);
4623 if (err < 0) 4623 if (err < 0)
4624 goto err_reg_dflt; 4624 goto err_reg_dflt;
4625 #endif 4625 #endif
4626 return 0; 4626 return 0;
4627 4627
4628 #ifdef CONFIG_SYSCTL 4628 #ifdef CONFIG_SYSCTL
4629 err_reg_dflt: 4629 err_reg_dflt:
4630 __addrconf_sysctl_unregister(all); 4630 __addrconf_sysctl_unregister(all);
4631 err_reg_all: 4631 err_reg_all:
4632 kfree(dflt); 4632 kfree(dflt);
4633 #endif 4633 #endif
4634 err_alloc_dflt: 4634 err_alloc_dflt:
4635 kfree(all); 4635 kfree(all);
4636 err_alloc_all: 4636 err_alloc_all:
4637 return err; 4637 return err;
4638 } 4638 }
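
The error path above is the usual inverted-goto unwind: the labels appear in reverse order of the setup steps, so a failure at step N falls through exactly the cleanups for steps N-1..1 and nothing else. Reduced to its skeleton:

	#include <stdlib.h>

	static int init_two(int **a, int **b)
	{
		int err = -1;			/* modelling -ENOMEM */

		*a = malloc(sizeof(**a));
		if (!*a)
			goto err_a;
		*b = malloc(sizeof(**b));
		if (!*b)
			goto err_b;
		return 0;

	err_b:
		free(*a);	/* undo only what succeeded */
	err_a:
		return err;
	}
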
4639 4639
4640 static void __net_exit addrconf_exit_net(struct net *net) 4640 static void __net_exit addrconf_exit_net(struct net *net)
4641 { 4641 {
4642 #ifdef CONFIG_SYSCTL 4642 #ifdef CONFIG_SYSCTL
4643 __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); 4643 __addrconf_sysctl_unregister(net->ipv6.devconf_dflt);
4644 __addrconf_sysctl_unregister(net->ipv6.devconf_all); 4644 __addrconf_sysctl_unregister(net->ipv6.devconf_all);
4645 #endif 4645 #endif
4646 if (!net_eq(net, &init_net)) { 4646 if (!net_eq(net, &init_net)) {
4647 kfree(net->ipv6.devconf_dflt); 4647 kfree(net->ipv6.devconf_dflt);
4648 kfree(net->ipv6.devconf_all); 4648 kfree(net->ipv6.devconf_all);
4649 } 4649 }
4650 } 4650 }
4651 4651
4652 static struct pernet_operations addrconf_ops = { 4652 static struct pernet_operations addrconf_ops = {
4653 .init = addrconf_init_net, 4653 .init = addrconf_init_net,
4654 .exit = addrconf_exit_net, 4654 .exit = addrconf_exit_net,
4655 }; 4655 };
4656 4656
4657 /* 4657 /*
4658 * Device notifier 4658 * Device notifier
4659 */ 4659 */
4660 4660
4661 int register_inet6addr_notifier(struct notifier_block *nb) 4661 int register_inet6addr_notifier(struct notifier_block *nb)
4662 { 4662 {
4663 return atomic_notifier_chain_register(&inet6addr_chain, nb); 4663 return atomic_notifier_chain_register(&inet6addr_chain, nb);
4664 } 4664 }
4665 EXPORT_SYMBOL(register_inet6addr_notifier); 4665 EXPORT_SYMBOL(register_inet6addr_notifier);
4666 4666
4667 int unregister_inet6addr_notifier(struct notifier_block *nb) 4667 int unregister_inet6addr_notifier(struct notifier_block *nb)
4668 { 4668 {
4669 return atomic_notifier_chain_unregister(&inet6addr_chain, nb); 4669 return atomic_notifier_chain_unregister(&inet6addr_chain, nb);
4670 } 4670 }
4671 EXPORT_SYMBOL(unregister_inet6addr_notifier); 4671 EXPORT_SYMBOL(unregister_inet6addr_notifier);
4672 4672
4673 static struct rtnl_af_ops inet6_ops = { 4673 static struct rtnl_af_ops inet6_ops = {
4674 .family = AF_INET6, 4674 .family = AF_INET6,
4675 .fill_link_af = inet6_fill_link_af, 4675 .fill_link_af = inet6_fill_link_af,
4676 .get_link_af_size = inet6_get_link_af_size, 4676 .get_link_af_size = inet6_get_link_af_size,
4677 }; 4677 };
4678 4678
4679 /* 4679 /*
4680 * Init / cleanup code 4680 * Init / cleanup code
4681 */ 4681 */
4682 4682
4683 int __init addrconf_init(void) 4683 int __init addrconf_init(void)
4684 { 4684 {
4685 int i, err; 4685 int i, err;
4686 4686
4687 err = ipv6_addr_label_init(); 4687 err = ipv6_addr_label_init();
4688 if (err < 0) { 4688 if (err < 0) {
4689 printk(KERN_CRIT "IPv6 Addrconf:" 4689 printk(KERN_CRIT "IPv6 Addrconf:"
4690 " cannot initialize default policy table: %d.\n", err); 4690 " cannot initialize default policy table: %d.\n", err);
4691 goto out; 4691 goto out;
4692 } 4692 }
4693 4693
4694 err = register_pernet_subsys(&addrconf_ops); 4694 err = register_pernet_subsys(&addrconf_ops);
4695 if (err < 0) 4695 if (err < 0)
4696 goto out_addrlabel; 4696 goto out_addrlabel;
4697 4697
4698 /* The addrconf netdev notifier requires that loopback_dev 4698 /* The addrconf netdev notifier requires that loopback_dev
4699 * has its ipv6 private information allocated and set up 4699 * has its ipv6 private information allocated and set up

4700 * before it can bring up and give link-local addresses 4700 * before it can bring up and give link-local addresses
4701 * to other devices which are up. 4701 * to other devices which are up.
4702 * 4702 *
4703 * Unfortunately, loopback_dev is not necessarily the first 4703 * Unfortunately, loopback_dev is not necessarily the first
4704 * entry in the global dev_base list of net devices. In fact, 4704 * entry in the global dev_base list of net devices. In fact,
4705 * it is likely to be the very last entry on that list. 4705 * it is likely to be the very last entry on that list.
4706 * So this causes the notifier registration below to try to 4706 * So this causes the notifier registration below to try to
4707 * give link-local addresses to all devices besides loopback_dev 4707 * give link-local addresses to all devices besides loopback_dev
4708 * first, then loopback_dev, which causes all the non-loopback_dev 4708 * first, then loopback_dev, which causes all the non-loopback_dev
4709 * devices to fail to get a link-local address. 4709 * devices to fail to get a link-local address.
4710 * 4710 *
4711 * So, as a temporary fix, allocate the ipv6 structure for 4711 * So, as a temporary fix, allocate the ipv6 structure for
4712 * loopback_dev first by hand. 4712 * loopback_dev first by hand.
4713 * Longer term, all of the dependencies ipv6 has upon the loopback 4713 * Longer term, all of the dependencies ipv6 has upon the loopback
4714 * device and it being up should be removed. 4714 * device and it being up should be removed.
4715 */ 4715 */
4716 rtnl_lock(); 4716 rtnl_lock();
4717 if (!ipv6_add_dev(init_net.loopback_dev)) 4717 if (!ipv6_add_dev(init_net.loopback_dev))
4718 err = -ENOMEM; 4718 err = -ENOMEM;
4719 rtnl_unlock(); 4719 rtnl_unlock();
4720 if (err) 4720 if (err)
4721 goto errlo; 4721 goto errlo;
4722 4722
4723 for (i = 0; i < IN6_ADDR_HSIZE; i++) 4723 for (i = 0; i < IN6_ADDR_HSIZE; i++)
4724 INIT_HLIST_HEAD(&inet6_addr_lst[i]); 4724 INIT_HLIST_HEAD(&inet6_addr_lst[i]);
4725 4725
4726 register_netdevice_notifier(&ipv6_dev_notf); 4726 register_netdevice_notifier(&ipv6_dev_notf);
4727 4727
4728 addrconf_verify(0); 4728 addrconf_verify(0);
4729 4729
4730 err = rtnl_af_register(&inet6_ops); 4730 err = rtnl_af_register(&inet6_ops);
4731 if (err < 0) 4731 if (err < 0)
4732 goto errout_af; 4732 goto errout_af;
4733 4733
4734 err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, 4734 err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
4735 NULL); 4735 NULL);
4736 if (err < 0) 4736 if (err < 0)
4737 goto errout; 4737 goto errout;
4738 4738
4739 /* Only the first call to __rtnl_register can fail */ 4739 /* Only the first call to __rtnl_register can fail */
4740 __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); 4740 __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
4741 __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); 4741 __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
4742 __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, 4742 __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
4743 inet6_dump_ifaddr, NULL); 4743 inet6_dump_ifaddr, NULL);
4744 __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, 4744 __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
4745 inet6_dump_ifmcaddr, NULL); 4745 inet6_dump_ifmcaddr, NULL);
4746 __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, 4746 __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
4747 inet6_dump_ifacaddr, NULL); 4747 inet6_dump_ifacaddr, NULL);
4748 4748
4749 ipv6_addr_label_rtnl_register(); 4749 ipv6_addr_label_rtnl_register();
4750 4750
4751 return 0; 4751 return 0;
4752 errout: 4752 errout:
4753 rtnl_af_unregister(&inet6_ops); 4753 rtnl_af_unregister(&inet6_ops);
4754 errout_af: 4754 errout_af:
4755 unregister_netdevice_notifier(&ipv6_dev_notf); 4755 unregister_netdevice_notifier(&ipv6_dev_notf);
4756 errlo: 4756 errlo:
4757 unregister_pernet_subsys(&addrconf_ops); 4757 unregister_pernet_subsys(&addrconf_ops);
4758 out_addrlabel: 4758 out_addrlabel:
4759 ipv6_addr_label_cleanup(); 4759 ipv6_addr_label_cleanup();
4760 out: 4760 out:
4761 return err; 4761 return err;
4762 } 4762 }
4763 4763
4764 void addrconf_cleanup(void) 4764 void addrconf_cleanup(void)
4765 { 4765 {
4766 struct net_device *dev; 4766 struct net_device *dev;
4767 int i; 4767 int i;
4768 4768
4769 unregister_netdevice_notifier(&ipv6_dev_notf); 4769 unregister_netdevice_notifier(&ipv6_dev_notf);
4770 unregister_pernet_subsys(&addrconf_ops); 4770 unregister_pernet_subsys(&addrconf_ops);
4771 ipv6_addr_label_cleanup(); 4771 ipv6_addr_label_cleanup();
4772 4772
4773 rtnl_lock(); 4773 rtnl_lock();
4774 4774
4775 __rtnl_af_unregister(&inet6_ops); 4775 __rtnl_af_unregister(&inet6_ops);
4776 4776
4777 /* clean dev list */ 4777 /* clean dev list */
4778 for_each_netdev(&init_net, dev) { 4778 for_each_netdev(&init_net, dev) {
4779 if (__in6_dev_get(dev) == NULL) 4779 if (__in6_dev_get(dev) == NULL)
4780 continue; 4780 continue;
4781 addrconf_ifdown(dev, 1); 4781 addrconf_ifdown(dev, 1);
4782 } 4782 }
4783 addrconf_ifdown(init_net.loopback_dev, 2); 4783 addrconf_ifdown(init_net.loopback_dev, 2);
4784 4784
4785 /* 4785 /*
4786 * Check hash table. 4786 * Check hash table.
4787 */ 4787 */
4788 spin_lock_bh(&addrconf_hash_lock); 4788 spin_lock_bh(&addrconf_hash_lock);
4789 for (i = 0; i < IN6_ADDR_HSIZE; i++) 4789 for (i = 0; i < IN6_ADDR_HSIZE; i++)
4790 WARN_ON(!hlist_empty(&inet6_addr_lst[i])); 4790 WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
4791 spin_unlock_bh(&addrconf_hash_lock); 4791 spin_unlock_bh(&addrconf_hash_lock);
4792 4792
4793 del_timer(&addr_chk_timer); 4793 del_timer(&addr_chk_timer);
4794 rtnl_unlock(); 4794 rtnl_unlock();
4795 } 4795 }
4796 4796
1 /* 1 /*
2 * Linux INET6 implementation 2 * Linux INET6 implementation
3 * Forwarding Information Database 3 * Forwarding Information Database
4 * 4 *
5 * Authors: 5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt> 6 * Pedro Roque <roque@di.fc.ul.pt>
7 * 7 *
8 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version. 11 * 2 of the License, or (at your option) any later version.
12 */ 12 */
13 13
14 /* 14 /*
15 * Changes: 15 * Changes:
16 * Yuji SEKIYA @USAGI: Support default route on router node; 16 * Yuji SEKIYA @USAGI: Support default route on router node;
17 * remove ip6_null_entry from the top of 17 * remove ip6_null_entry from the top of
18 * the routing table. 18 * the routing table.
19 * Ville Nuorvala: Fixed routing subtrees. 19 * Ville Nuorvala: Fixed routing subtrees.
20 */ 20 */
21 #include <linux/errno.h> 21 #include <linux/errno.h>
22 #include <linux/types.h> 22 #include <linux/types.h>
23 #include <linux/net.h> 23 #include <linux/net.h>
24 #include <linux/route.h> 24 #include <linux/route.h>
25 #include <linux/netdevice.h> 25 #include <linux/netdevice.h>
26 #include <linux/in6.h> 26 #include <linux/in6.h>
27 #include <linux/init.h> 27 #include <linux/init.h>
28 #include <linux/list.h> 28 #include <linux/list.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 30
31 #ifdef CONFIG_PROC_FS 31 #ifdef CONFIG_PROC_FS
32 #include <linux/proc_fs.h> 32 #include <linux/proc_fs.h>
33 #endif 33 #endif
34 34
35 #include <net/ipv6.h> 35 #include <net/ipv6.h>
36 #include <net/ndisc.h> 36 #include <net/ndisc.h>
37 #include <net/addrconf.h> 37 #include <net/addrconf.h>
38 38
39 #include <net/ip6_fib.h> 39 #include <net/ip6_fib.h>
40 #include <net/ip6_route.h> 40 #include <net/ip6_route.h>
41 41
42 #define RT6_DEBUG 2 42 #define RT6_DEBUG 2
43 43
44 #if RT6_DEBUG >= 3 44 #if RT6_DEBUG >= 3
45 #define RT6_TRACE(x...) printk(KERN_DEBUG x) 45 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
46 #else 46 #else
47 #define RT6_TRACE(x...) do { ; } while (0) 47 #define RT6_TRACE(x...) do { ; } while (0)
48 #endif 48 #endif
49 49
50 static struct kmem_cache * fib6_node_kmem __read_mostly; 50 static struct kmem_cache * fib6_node_kmem __read_mostly;
51 51
52 enum fib_walk_state_t 52 enum fib_walk_state_t
53 { 53 {
54 #ifdef CONFIG_IPV6_SUBTREES 54 #ifdef CONFIG_IPV6_SUBTREES
55 FWS_S, 55 FWS_S,
56 #endif 56 #endif
57 FWS_L, 57 FWS_L,
58 FWS_R, 58 FWS_R,
59 FWS_C, 59 FWS_C,
60 FWS_U 60 FWS_U
61 }; 61 };
62 62
63 struct fib6_cleaner_t 63 struct fib6_cleaner_t
64 { 64 {
65 struct fib6_walker_t w; 65 struct fib6_walker_t w;
66 struct net *net; 66 struct net *net;
67 int (*func)(struct rt6_info *, void *arg); 67 int (*func)(struct rt6_info *, void *arg);
68 void *arg; 68 void *arg;
69 }; 69 };
70 70
71 static DEFINE_RWLOCK(fib6_walker_lock); 71 static DEFINE_RWLOCK(fib6_walker_lock);
72 72
73 #ifdef CONFIG_IPV6_SUBTREES 73 #ifdef CONFIG_IPV6_SUBTREES
74 #define FWS_INIT FWS_S 74 #define FWS_INIT FWS_S
75 #else 75 #else
76 #define FWS_INIT FWS_L 76 #define FWS_INIT FWS_L
77 #endif 77 #endif
78 78
79 static void fib6_prune_clones(struct net *net, struct fib6_node *fn, 79 static void fib6_prune_clones(struct net *net, struct fib6_node *fn,
80 struct rt6_info *rt); 80 struct rt6_info *rt);
81 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); 81 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
82 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); 82 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
83 static int fib6_walk(struct fib6_walker_t *w); 83 static int fib6_walk(struct fib6_walker_t *w);
84 static int fib6_walk_continue(struct fib6_walker_t *w); 84 static int fib6_walk_continue(struct fib6_walker_t *w);
85 85
86 /* 86 /*
87 * A routing update causes an increase of the serial number on the 87 * A routing update causes an increase of the serial number on the
88 * affected subtree. This allows for cached routes to be asynchronously 88 * affected subtree. This allows for cached routes to be asynchronously
89 * tested when modifications are made to the destination cache as a 89 * tested when modifications are made to the destination cache as a
90 * result of redirects, path MTU changes, etc. 90 * result of redirects, path MTU changes, etc.
91 */ 91 */
92 92
93 static __u32 rt_sernum; 93 static __u32 rt_sernum;
94 94
95 static void fib6_gc_timer_cb(unsigned long arg); 95 static void fib6_gc_timer_cb(unsigned long arg);
96 96
97 static LIST_HEAD(fib6_walkers); 97 static LIST_HEAD(fib6_walkers);
98 #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) 98 #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
99 99
100 static inline void fib6_walker_link(struct fib6_walker_t *w) 100 static inline void fib6_walker_link(struct fib6_walker_t *w)
101 { 101 {
102 write_lock_bh(&fib6_walker_lock); 102 write_lock_bh(&fib6_walker_lock);
103 list_add(&w->lh, &fib6_walkers); 103 list_add(&w->lh, &fib6_walkers);
104 write_unlock_bh(&fib6_walker_lock); 104 write_unlock_bh(&fib6_walker_lock);
105 } 105 }
106 106
107 static inline void fib6_walker_unlink(struct fib6_walker_t *w) 107 static inline void fib6_walker_unlink(struct fib6_walker_t *w)
108 { 108 {
109 write_lock_bh(&fib6_walker_lock); 109 write_lock_bh(&fib6_walker_lock);
110 list_del(&w->lh); 110 list_del(&w->lh);
111 write_unlock_bh(&fib6_walker_lock); 111 write_unlock_bh(&fib6_walker_lock);
112 } 112 }
113 static __inline__ u32 fib6_new_sernum(void) 113 static __inline__ u32 fib6_new_sernum(void)
114 { 114 {
115 u32 n = ++rt_sernum; 115 u32 n = ++rt_sernum;
116 if ((__s32)n <= 0) 116 if ((__s32)n <= 0)
117 rt_sernum = n = 1; 117 rt_sernum = n = 1;
118 return n; 118 return n;
119 } 119 }
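
fib6_new_sernum() hands out route-serial numbers that must stay strictly positive when read as a signed 32-bit value (non-positive values serve as sentinels elsewhere), so on wraparound it restarts at 1 rather than passing through 0 or the negative half of the range. The boundary behaviour, spelled out:

	#include <assert.h>
	#include <stdint.h>

	static uint32_t sernum = 0x7fffffff;	/* last positive signed value */

	static uint32_t new_sernum(void)
	{
		uint32_t n = ++sernum;
		if ((int32_t)n <= 0)		/* 0 or sign bit set: wrapped */
			sernum = n = 1;
		return n;
	}

	int main(void)
	{
		assert(new_sernum() == 1);	/* 0x80000000 is remapped to 1 */
		assert(new_sernum() == 2);
		return 0;
	}
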
120 120
121 /* 121 /*
122 * Auxiliary address test functions for the radix tree. 122 * Auxiliary address test functions for the radix tree.
123 * 123 *
124 * These assume a 32bit processor (although they will work on 124 * These assume a 32bit processor (although they will work on
125 * 64bit processors) 125 * 64bit processors)
126 */ 126 */
127 127
128 /* 128 /*
129 * test bit 129 * test bit
130 */ 130 */
131 #if defined(__LITTLE_ENDIAN) 131 #if defined(__LITTLE_ENDIAN)
132 # define BITOP_BE32_SWIZZLE (0x1F & ~7) 132 # define BITOP_BE32_SWIZZLE (0x1F & ~7)
133 #else 133 #else
134 # define BITOP_BE32_SWIZZLE 0 134 # define BITOP_BE32_SWIZZLE 0
135 #endif 135 #endif
136 136
137 static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) 137 static __inline__ __be32 addr_bit_set(const void *token, int fn_bit)
138 { 138 {
139 const __be32 *addr = token; 139 const __be32 *addr = token;
140 /* 140 /*
141 * Here, 141 * Here,
142 * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) 142 * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
143 * is optimized version of 143 * is optimized version of
144 * htonl(1 << ((~fn_bit)&0x1F)) 144 * htonl(1 << ((~fn_bit)&0x1F))
145 * See include/asm-generic/bitops/le.h. 145 * See include/asm-generic/bitops/le.h.
146 */ 146 */
147 return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & 147 return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
148 addr[fn_bit >> 5]; 148 addr[fn_bit >> 5];
149 } 149 }
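
The comment's claimed identity is easy to check: on a little-endian host, XOR-ing the bit index with BITOP_BE32_SWIZZLE (0x18) swaps which byte of the word the bit lands in, which is exactly what htonl() does to the shifted value. A quick exhaustive verification (little-endian only; on big-endian the swizzle constant is 0 and htonl() is the identity, so the two sides trivially agree):

	#include <arpa/inet.h>
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		int fn_bit;

		for (fn_bit = 0; fn_bit < 128; fn_bit++) {
			uint32_t fast = 1u << ((~fn_bit ^ 0x18) & 0x1f);
			uint32_t slow = htonl(1u << (~fn_bit & 0x1f));
			assert(fast == slow);
		}
		return 0;
	}
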
150 150
151 static __inline__ struct fib6_node * node_alloc(void) 151 static __inline__ struct fib6_node * node_alloc(void)
152 { 152 {
153 struct fib6_node *fn; 153 struct fib6_node *fn;
154 154
155 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); 155 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
156 156
157 return fn; 157 return fn;
158 } 158 }
159 159
160 static __inline__ void node_free(struct fib6_node * fn) 160 static __inline__ void node_free(struct fib6_node * fn)
161 { 161 {
162 kmem_cache_free(fib6_node_kmem, fn); 162 kmem_cache_free(fib6_node_kmem, fn);
163 } 163 }
164 164
165 static __inline__ void rt6_release(struct rt6_info *rt) 165 static __inline__ void rt6_release(struct rt6_info *rt)
166 { 166 {
167 if (atomic_dec_and_test(&rt->rt6i_ref)) 167 if (atomic_dec_and_test(&rt->rt6i_ref))
168 dst_free(&rt->dst); 168 dst_free(&rt->dst);
169 } 169 }
170 170
171 static void fib6_link_table(struct net *net, struct fib6_table *tb) 171 static void fib6_link_table(struct net *net, struct fib6_table *tb)
172 { 172 {
173 unsigned int h; 173 unsigned int h;
174 174
175 /* 175 /*
176 * Initialize table lock at a single place to give lockdep a key, 176 * Initialize table lock at a single place to give lockdep a key,
177 * tables aren't visible prior to being linked to the list. 177 * tables aren't visible prior to being linked to the list.
178 */ 178 */
179 rwlock_init(&tb->tb6_lock); 179 rwlock_init(&tb->tb6_lock);
180 180
181 h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); 181 h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
182 182
183 /* 183 /*
184 * No protection necessary, this is the only list mutation 184 * No protection necessary, this is the only list mutation
185 * operation; tables never disappear once they exist. 185 * operation; tables never disappear once they exist.
186 */ 186 */
187 hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); 187 hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
188 } 188 }
189 189
190 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 190 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
191 191
192 static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) 192 static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
193 { 193 {
194 struct fib6_table *table; 194 struct fib6_table *table;
195 195
196 table = kzalloc(sizeof(*table), GFP_ATOMIC); 196 table = kzalloc(sizeof(*table), GFP_ATOMIC);
197 if (table != NULL) { 197 if (table != NULL) {
198 table->tb6_id = id; 198 table->tb6_id = id;
199 table->tb6_root.leaf = net->ipv6.ip6_null_entry; 199 table->tb6_root.leaf = net->ipv6.ip6_null_entry;
200 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 200 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
201 } 201 }
202 202
203 return table; 203 return table;
204 } 204 }
205 205
206 struct fib6_table *fib6_new_table(struct net *net, u32 id) 206 struct fib6_table *fib6_new_table(struct net *net, u32 id)
207 { 207 {
208 struct fib6_table *tb; 208 struct fib6_table *tb;
209 209
210 if (id == 0) 210 if (id == 0)
211 id = RT6_TABLE_MAIN; 211 id = RT6_TABLE_MAIN;
212 tb = fib6_get_table(net, id); 212 tb = fib6_get_table(net, id);
213 if (tb) 213 if (tb)
214 return tb; 214 return tb;
215 215
216 tb = fib6_alloc_table(net, id); 216 tb = fib6_alloc_table(net, id);
217 if (tb != NULL) 217 if (tb != NULL)
218 fib6_link_table(net, tb); 218 fib6_link_table(net, tb);
219 219
220 return tb; 220 return tb;
221 } 221 }
222 222
223 struct fib6_table *fib6_get_table(struct net *net, u32 id) 223 struct fib6_table *fib6_get_table(struct net *net, u32 id)
224 { 224 {
225 struct fib6_table *tb; 225 struct fib6_table *tb;
226 struct hlist_head *head; 226 struct hlist_head *head;
227 struct hlist_node *node; 227 struct hlist_node *node;
228 unsigned int h; 228 unsigned int h;
229 229
230 if (id == 0) 230 if (id == 0)
231 id = RT6_TABLE_MAIN; 231 id = RT6_TABLE_MAIN;
232 h = id & (FIB6_TABLE_HASHSZ - 1); 232 h = id & (FIB6_TABLE_HASHSZ - 1);
233 rcu_read_lock(); 233 rcu_read_lock();
234 head = &net->ipv6.fib_table_hash[h]; 234 head = &net->ipv6.fib_table_hash[h];
235 hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { 235 hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
236 if (tb->tb6_id == id) { 236 if (tb->tb6_id == id) {
237 rcu_read_unlock(); 237 rcu_read_unlock();
238 return tb; 238 return tb;
239 } 239 }
240 } 240 }
241 rcu_read_unlock(); 241 rcu_read_unlock();
242 242
243 return NULL; 243 return NULL;
244 } 244 }
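
Editor's note: both fib6_link_table() and fib6_get_table() pick a bucket with "id & (FIB6_TABLE_HASHSZ - 1)", a mask instead of a modulo, which is only equivalent because the table size is a power of two. A quick stand-alone check of that equivalence (the size and id range are arbitrary):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t hashsz = 256;    /* must be a power of two */
        uint32_t id;

        for (id = 0; id < 1000000; id++)
            assert((id & (hashsz - 1)) == (id % hashsz));
        return 0;
    }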
245 245
246 static void __net_init fib6_tables_init(struct net *net) 246 static void __net_init fib6_tables_init(struct net *net)
247 { 247 {
248 fib6_link_table(net, net->ipv6.fib6_main_tbl); 248 fib6_link_table(net, net->ipv6.fib6_main_tbl);
249 fib6_link_table(net, net->ipv6.fib6_local_tbl); 249 fib6_link_table(net, net->ipv6.fib6_local_tbl);
250 } 250 }
251 #else 251 #else
252 252
253 struct fib6_table *fib6_new_table(struct net *net, u32 id) 253 struct fib6_table *fib6_new_table(struct net *net, u32 id)
254 { 254 {
255 return fib6_get_table(net, id); 255 return fib6_get_table(net, id);
256 } 256 }
257 257
258 struct fib6_table *fib6_get_table(struct net *net, u32 id) 258 struct fib6_table *fib6_get_table(struct net *net, u32 id)
259 { 259 {
260 return net->ipv6.fib6_main_tbl; 260 return net->ipv6.fib6_main_tbl;
261 } 261 }
262 262
263 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, 263 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
264 int flags, pol_lookup_t lookup) 264 int flags, pol_lookup_t lookup)
265 { 265 {
266 return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); 266 return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
267 } 267 }
268 268
269 static void __net_init fib6_tables_init(struct net *net) 269 static void __net_init fib6_tables_init(struct net *net)
270 { 270 {
271 fib6_link_table(net, net->ipv6.fib6_main_tbl); 271 fib6_link_table(net, net->ipv6.fib6_main_tbl);
272 } 272 }
273 273
274 #endif 274 #endif
275 275
276 static int fib6_dump_node(struct fib6_walker_t *w) 276 static int fib6_dump_node(struct fib6_walker_t *w)
277 { 277 {
278 int res; 278 int res;
279 struct rt6_info *rt; 279 struct rt6_info *rt;
280 280
281 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { 281 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
282 res = rt6_dump_route(rt, w->args); 282 res = rt6_dump_route(rt, w->args);
283 if (res < 0) { 283 if (res < 0) {
284 /* Frame is full, suspend walking */ 284 /* Frame is full, suspend walking */
285 w->leaf = rt; 285 w->leaf = rt;
286 return 1; 286 return 1;
287 } 287 }
288 WARN_ON(res == 0); 288 WARN_ON(res == 0);
289 } 289 }
290 w->leaf = NULL; 290 w->leaf = NULL;
291 return 0; 291 return 0;
292 } 292 }
293 293
294 static void fib6_dump_end(struct netlink_callback *cb) 294 static void fib6_dump_end(struct netlink_callback *cb)
295 { 295 {
296 struct fib6_walker_t *w = (void*)cb->args[2]; 296 struct fib6_walker_t *w = (void*)cb->args[2];
297 297
298 if (w) { 298 if (w) {
299 if (cb->args[4]) { 299 if (cb->args[4]) {
300 cb->args[4] = 0; 300 cb->args[4] = 0;
301 fib6_walker_unlink(w); 301 fib6_walker_unlink(w);
302 } 302 }
303 cb->args[2] = 0; 303 cb->args[2] = 0;
304 kfree(w); 304 kfree(w);
305 } 305 }
306 cb->done = (void*)cb->args[3]; 306 cb->done = (void*)cb->args[3];
307 cb->args[1] = 3; 307 cb->args[1] = 3;
308 } 308 }
309 309
310 static int fib6_dump_done(struct netlink_callback *cb) 310 static int fib6_dump_done(struct netlink_callback *cb)
311 { 311 {
312 fib6_dump_end(cb); 312 fib6_dump_end(cb);
313 return cb->done ? cb->done(cb) : 0; 313 return cb->done ? cb->done(cb) : 0;
314 } 314 }
315 315
316 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, 316 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
317 struct netlink_callback *cb) 317 struct netlink_callback *cb)
318 { 318 {
319 struct fib6_walker_t *w; 319 struct fib6_walker_t *w;
320 int res; 320 int res;
321 321
322 w = (void *)cb->args[2]; 322 w = (void *)cb->args[2];
323 w->root = &table->tb6_root; 323 w->root = &table->tb6_root;
324 324
325 if (cb->args[4] == 0) { 325 if (cb->args[4] == 0) {
326 w->count = 0; 326 w->count = 0;
327 w->skip = 0; 327 w->skip = 0;
328 328
329 read_lock_bh(&table->tb6_lock); 329 read_lock_bh(&table->tb6_lock);
330 res = fib6_walk(w); 330 res = fib6_walk(w);
331 read_unlock_bh(&table->tb6_lock); 331 read_unlock_bh(&table->tb6_lock);
332 if (res > 0) { 332 if (res > 0) {
333 cb->args[4] = 1; 333 cb->args[4] = 1;
334 cb->args[5] = w->root->fn_sernum; 334 cb->args[5] = w->root->fn_sernum;
335 } 335 }
336 } else { 336 } else {
337 if (cb->args[5] != w->root->fn_sernum) { 337 if (cb->args[5] != w->root->fn_sernum) {
338 /* Begin at the root if the tree changed */ 338 /* Begin at the root if the tree changed */
339 cb->args[5] = w->root->fn_sernum; 339 cb->args[5] = w->root->fn_sernum;
340 w->state = FWS_INIT; 340 w->state = FWS_INIT;
341 w->node = w->root; 341 w->node = w->root;
342 w->skip = w->count; 342 w->skip = w->count;
343 } else 343 } else
344 w->skip = 0; 344 w->skip = 0;
345 345
346 read_lock_bh(&table->tb6_lock); 346 read_lock_bh(&table->tb6_lock);
347 res = fib6_walk_continue(w); 347 res = fib6_walk_continue(w);
348 read_unlock_bh(&table->tb6_lock); 348 read_unlock_bh(&table->tb6_lock);
349 if (res <= 0) { 349 if (res <= 0) {
350 fib6_walker_unlink(w); 350 fib6_walker_unlink(w);
351 cb->args[4] = 0; 351 cb->args[4] = 0;
352 } 352 }
353 } 353 }
354 354
355 return res; 355 return res;
356 } 356 }
357 357
358 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 358 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
359 { 359 {
360 struct net *net = sock_net(skb->sk); 360 struct net *net = sock_net(skb->sk);
361 unsigned int h, s_h; 361 unsigned int h, s_h;
362 unsigned int e = 0, s_e; 362 unsigned int e = 0, s_e;
363 struct rt6_rtnl_dump_arg arg; 363 struct rt6_rtnl_dump_arg arg;
364 struct fib6_walker_t *w; 364 struct fib6_walker_t *w;
365 struct fib6_table *tb; 365 struct fib6_table *tb;
366 struct hlist_node *node; 366 struct hlist_node *node;
367 struct hlist_head *head; 367 struct hlist_head *head;
368 int res = 0; 368 int res = 0;
369 369
370 s_h = cb->args[0]; 370 s_h = cb->args[0];
371 s_e = cb->args[1]; 371 s_e = cb->args[1];
372 372
373 w = (void *)cb->args[2]; 373 w = (void *)cb->args[2];
374 if (w == NULL) { 374 if (w == NULL) {
375 /* New dump: 375 /* New dump:
376 * 376 *
377 * 1. hook callback destructor. 377 * 1. hook callback destructor.
378 */ 378 */
379 cb->args[3] = (long)cb->done; 379 cb->args[3] = (long)cb->done;
380 cb->done = fib6_dump_done; 380 cb->done = fib6_dump_done;
381 381
382 /* 382 /*
383 * 2. allocate and initialize walker. 383 * 2. allocate and initialize walker.
384 */ 384 */
385 w = kzalloc(sizeof(*w), GFP_ATOMIC); 385 w = kzalloc(sizeof(*w), GFP_ATOMIC);
386 if (w == NULL) 386 if (w == NULL)
387 return -ENOMEM; 387 return -ENOMEM;
388 w->func = fib6_dump_node; 388 w->func = fib6_dump_node;
389 cb->args[2] = (long)w; 389 cb->args[2] = (long)w;
390 } 390 }
391 391
392 arg.skb = skb; 392 arg.skb = skb;
393 arg.cb = cb; 393 arg.cb = cb;
394 arg.net = net; 394 arg.net = net;
395 w->args = &arg; 395 w->args = &arg;
396 396
397 rcu_read_lock(); 397 rcu_read_lock();
398 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { 398 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
399 e = 0; 399 e = 0;
400 head = &net->ipv6.fib_table_hash[h]; 400 head = &net->ipv6.fib_table_hash[h];
401 hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { 401 hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
402 if (e < s_e) 402 if (e < s_e)
403 goto next; 403 goto next;
404 res = fib6_dump_table(tb, skb, cb); 404 res = fib6_dump_table(tb, skb, cb);
405 if (res != 0) 405 if (res != 0)
406 goto out; 406 goto out;
407 next: 407 next:
408 e++; 408 e++;
409 } 409 }
410 } 410 }
411 out: 411 out:
412 rcu_read_unlock(); 412 rcu_read_unlock();
413 cb->args[1] = e; 413 cb->args[1] = e;
414 cb->args[0] = h; 414 cb->args[0] = h;
415 415
416 res = res < 0 ? res : skb->len; 416 res = res < 0 ? res : skb->len;
417 if (res <= 0) 417 if (res <= 0)
418 fib6_dump_end(cb); 418 fib6_dump_end(cb);
419 return res; 419 return res;
420 } 420 }
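
Editor's note: a netlink dump may need several passes, so inet6_dump_fib() keeps its position in cb->args[0] and cb->args[1] (hash bucket and entry index within the bucket) and resumes from there on the next call. The shape of that cursor logic, reduced to a stand-alone sketch (the cursor struct and fixed table sizes are illustrative):

    struct cursor { unsigned int h, e; };

    /* Copy entries into out[], at most 'room' of them; when the
     * buffer fills, record (bucket, entry) in the cursor and stop,
     * so the next call resumes exactly where this one left off. */
    static int dump_some(int table[4][8], struct cursor *c,
                         int out[], int room)
    {
        int n = 0;
        unsigned int h, e;

        for (h = c->h; h < 4; h++, c->e = 0) {
            for (e = c->e; e < 8; e++) {
                if (n == room) {
                    c->h = h;
                    c->e = e;
                    return n;       /* suspended */
                }
                out[n++] = table[h][e];
            }
        }
        c->h = 4;       /* walked everything */
        return n;
    }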
421 421
422 /* 422 /*
423 * Routing Table 423 * Routing Table
424 * 424 *
425 * return the appropriate node for a routing tree "add" operation 425 * return the appropriate node for a routing tree "add" operation
426 * by either creating and inserting or by returning an existing 426 * by either creating and inserting or by returning an existing
427 * node. 427 * node.
428 */ 428 */
429 429
430 static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, 430 static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
431 int addrlen, int plen, 431 int addrlen, int plen,
432 int offset) 432 int offset)
433 { 433 {
434 struct fib6_node *fn, *in, *ln; 434 struct fib6_node *fn, *in, *ln;
435 struct fib6_node *pn = NULL; 435 struct fib6_node *pn = NULL;
436 struct rt6key *key; 436 struct rt6key *key;
437 int bit; 437 int bit;
438 __be32 dir = 0; 438 __be32 dir = 0;
439 __u32 sernum = fib6_new_sernum(); 439 __u32 sernum = fib6_new_sernum();
440 440
441 RT6_TRACE("fib6_add_1\n"); 441 RT6_TRACE("fib6_add_1\n");
442 442
443 /* insert node in tree */ 443 /* insert node in tree */
444 444
445 fn = root; 445 fn = root;
446 446
447 do { 447 do {
448 key = (struct rt6key *)((u8 *)fn->leaf + offset); 448 key = (struct rt6key *)((u8 *)fn->leaf + offset);
449 449
450 /* 450 /*
451 * Prefix match 451 * Prefix match
452 */ 452 */
453 if (plen < fn->fn_bit || 453 if (plen < fn->fn_bit ||
454 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) 454 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
455 goto insert_above; 455 goto insert_above;
456 456
457 /* 457 /*
458 * Exact match ? 458 * Exact match ?
459 */ 459 */
460 460
461 if (plen == fn->fn_bit) { 461 if (plen == fn->fn_bit) {
462 /* clean up an intermediate node */ 462 /* clean up an intermediate node */
463 if ((fn->fn_flags & RTN_RTINFO) == 0) { 463 if ((fn->fn_flags & RTN_RTINFO) == 0) {
464 rt6_release(fn->leaf); 464 rt6_release(fn->leaf);
465 fn->leaf = NULL; 465 fn->leaf = NULL;
466 } 466 }
467 467
468 fn->fn_sernum = sernum; 468 fn->fn_sernum = sernum;
469 469
470 return fn; 470 return fn;
471 } 471 }
472 472
473 /* 473 /*
474 * We have more bits to go 474 * We have more bits to go
475 */ 475 */
476 476
477 /* Try to walk down on tree. */ 477 /* Try to walk down on tree. */
478 fn->fn_sernum = sernum; 478 fn->fn_sernum = sernum;
479 dir = addr_bit_set(addr, fn->fn_bit); 479 dir = addr_bit_set(addr, fn->fn_bit);
480 pn = fn; 480 pn = fn;
481 fn = dir ? fn->right: fn->left; 481 fn = dir ? fn->right: fn->left;
482 } while (fn); 482 } while (fn);
483 483
484 /* 484 /*
485 * We walked to the bottom of tree. 485 * We walked to the bottom of tree.
486 * Create new leaf node without children. 486 * Create new leaf node without children.
487 */ 487 */
488 488
489 ln = node_alloc(); 489 ln = node_alloc();
490 490
491 if (ln == NULL) 491 if (ln == NULL)
492 return NULL; 492 return NULL;
493 ln->fn_bit = plen; 493 ln->fn_bit = plen;
494 494
495 ln->parent = pn; 495 ln->parent = pn;
496 ln->fn_sernum = sernum; 496 ln->fn_sernum = sernum;
497 497
498 if (dir) 498 if (dir)
499 pn->right = ln; 499 pn->right = ln;
500 else 500 else
501 pn->left = ln; 501 pn->left = ln;
502 502
503 return ln; 503 return ln;
504 504
505 505
506 insert_above: 506 insert_above:
507 /* 507 /*
508 * split, since we don't have a common prefix anymore or 508 * split, since we don't have a common prefix anymore or
509 * we have a less significant route. 509 * we have a less significant route.
510 * we have to insert an intermediate node in the tree; 510 * we have to insert an intermediate node in the tree;
511 * this new node will point to the one we need to create 511 * this new node will point to the one we need to create
512 * and to the current one. 512 * and to the current one.
513 */ 513 */
514 514
515 pn = fn->parent; 515 pn = fn->parent;
516 516
517 /* find 1st bit in difference between the 2 addrs. 517 /* find 1st bit in difference between the 2 addrs.
518 518
519 See comment in __ipv6_addr_diff: bit may be an invalid value, 519 See comment in __ipv6_addr_diff: bit may be an invalid value,
520 but if it is >= plen, the value is ignored in any case. 520 but if it is >= plen, the value is ignored in any case.
521 */ 521 */
522 522
523 bit = __ipv6_addr_diff(addr, &key->addr, addrlen); 523 bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
524 524
525 /* 525 /*
526 * (intermediate)[in] 526 * (intermediate)[in]
527 * / \ 527 * / \
528 * (new leaf node)[ln] (old node)[fn] 528 * (new leaf node)[ln] (old node)[fn]
529 */ 529 */
530 if (plen > bit) { 530 if (plen > bit) {
531 in = node_alloc(); 531 in = node_alloc();
532 ln = node_alloc(); 532 ln = node_alloc();
533 533
534 if (in == NULL || ln == NULL) { 534 if (in == NULL || ln == NULL) {
535 if (in) 535 if (in)
536 node_free(in); 536 node_free(in);
537 if (ln) 537 if (ln)
538 node_free(ln); 538 node_free(ln);
539 return NULL; 539 return NULL;
540 } 540 }
541 541
542 /* 542 /*
543 * new intermediate node. 543 * new intermediate node.
544 * RTN_RTINFO will 544 * RTN_RTINFO will
545 * be off since an address that chooses one of 545 * be off since an address that chooses one of
546 * the branches would not match less specific routes 546 * the branches would not match less specific routes
547 * in the other branch 547 * in the other branch
548 */ 548 */
549 549
550 in->fn_bit = bit; 550 in->fn_bit = bit;
551 551
552 in->parent = pn; 552 in->parent = pn;
553 in->leaf = fn->leaf; 553 in->leaf = fn->leaf;
554 atomic_inc(&in->leaf->rt6i_ref); 554 atomic_inc(&in->leaf->rt6i_ref);
555 555
556 in->fn_sernum = sernum; 556 in->fn_sernum = sernum;
557 557
558 /* update parent pointer */ 558 /* update parent pointer */
559 if (dir) 559 if (dir)
560 pn->right = in; 560 pn->right = in;
561 else 561 else
562 pn->left = in; 562 pn->left = in;
563 563
564 ln->fn_bit = plen; 564 ln->fn_bit = plen;
565 565
566 ln->parent = in; 566 ln->parent = in;
567 fn->parent = in; 567 fn->parent = in;
568 568
569 ln->fn_sernum = sernum; 569 ln->fn_sernum = sernum;
570 570
571 if (addr_bit_set(addr, bit)) { 571 if (addr_bit_set(addr, bit)) {
572 in->right = ln; 572 in->right = ln;
573 in->left = fn; 573 in->left = fn;
574 } else { 574 } else {
575 in->left = ln; 575 in->left = ln;
576 in->right = fn; 576 in->right = fn;
577 } 577 }
578 } else { /* plen <= bit */ 578 } else { /* plen <= bit */
579 579
580 /* 580 /*
581 * (new leaf node)[ln] 581 * (new leaf node)[ln]
582 * / \ 582 * / \
583 * (old node)[fn] NULL 583 * (old node)[fn] NULL
584 */ 584 */
585 585
586 ln = node_alloc(); 586 ln = node_alloc();
587 587
588 if (ln == NULL) 588 if (ln == NULL)
589 return NULL; 589 return NULL;
590 590
591 ln->fn_bit = plen; 591 ln->fn_bit = plen;
592 592
593 ln->parent = pn; 593 ln->parent = pn;
594 594
595 ln->fn_sernum = sernum; 595 ln->fn_sernum = sernum;
596 596
597 if (dir) 597 if (dir)
598 pn->right = ln; 598 pn->right = ln;
599 else 599 else
600 pn->left = ln; 600 pn->left = ln;
601 601
602 if (addr_bit_set(&key->addr, plen)) 602 if (addr_bit_set(&key->addr, plen))
603 ln->right = fn; 603 ln->right = fn;
604 else 604 else
605 ln->left = fn; 605 ln->left = fn;
606 606
607 fn->parent = ln; 607 fn->parent = ln;
608 } 608 }
609 return ln; 609 return ln;
610 } 610 }
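
Editor's note: the split point used by insert_above comes from __ipv6_addr_diff(), which returns the index of the first bit where the new address and the existing key disagree. A simplified stand-in for it (byte-wise, without the kernel's word-at-a-time optimization; the function name is ours):

    /* Index of the first differing bit between a and b, counting
     * from the most significant bit of a[0]; returns len * 8 when
     * the buffers are equal. */
    static int first_diff_bit(const unsigned char *a,
                              const unsigned char *b, int len)
    {
        int i;

        for (i = 0; i < len; i++) {
            unsigned char x = a[i] ^ b[i];
            int j = 0;

            if (!x)
                continue;
            while (!(x & 0x80)) {
                x <<= 1;
                j++;
            }
            return i * 8 + j;
        }
        return len * 8;
    }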
611 611
612 /* 612 /*
613 * Insert routing information in a node. 613 * Insert routing information in a node.
614 */ 614 */
615 615
616 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 616 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
617 struct nl_info *info) 617 struct nl_info *info)
618 { 618 {
619 struct rt6_info *iter = NULL; 619 struct rt6_info *iter = NULL;
620 struct rt6_info **ins; 620 struct rt6_info **ins;
621 621
622 ins = &fn->leaf; 622 ins = &fn->leaf;
623 623
624 for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) { 624 for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) {
625 /* 625 /*
626 * Search for duplicates 626 * Search for duplicates
627 */ 627 */
628 628
629 if (iter->rt6i_metric == rt->rt6i_metric) { 629 if (iter->rt6i_metric == rt->rt6i_metric) {
630 /* 630 /*
631 * Same priority level 631 * Same priority level
632 */ 632 */
633 633
634 if (iter->rt6i_dev == rt->rt6i_dev && 634 if (iter->rt6i_dev == rt->rt6i_dev &&
635 iter->rt6i_idev == rt->rt6i_idev && 635 iter->rt6i_idev == rt->rt6i_idev &&
636 ipv6_addr_equal(&iter->rt6i_gateway, 636 ipv6_addr_equal(&iter->rt6i_gateway,
637 &rt->rt6i_gateway)) { 637 &rt->rt6i_gateway)) {
638 if (!(iter->rt6i_flags&RTF_EXPIRES)) 638 if (!(iter->rt6i_flags&RTF_EXPIRES))
639 return -EEXIST; 639 return -EEXIST;
640 iter->rt6i_expires = rt->rt6i_expires; 640 iter->rt6i_expires = rt->rt6i_expires;
641 if (!(rt->rt6i_flags&RTF_EXPIRES)) { 641 if (!(rt->rt6i_flags&RTF_EXPIRES)) {
642 iter->rt6i_flags &= ~RTF_EXPIRES; 642 iter->rt6i_flags &= ~RTF_EXPIRES;
643 iter->rt6i_expires = 0; 643 iter->rt6i_expires = 0;
644 } 644 }
645 return -EEXIST; 645 return -EEXIST;
646 } 646 }
647 } 647 }
648 648
649 if (iter->rt6i_metric > rt->rt6i_metric) 649 if (iter->rt6i_metric > rt->rt6i_metric)
650 break; 650 break;
651 651
652 ins = &iter->dst.rt6_next; 652 ins = &iter->dst.rt6_next;
653 } 653 }
654 654
655 /* Reset round-robin state, if necessary */ 655 /* Reset round-robin state, if necessary */
656 if (ins == &fn->leaf) 656 if (ins == &fn->leaf)
657 fn->rr_ptr = NULL; 657 fn->rr_ptr = NULL;
658 658
659 /* 659 /*
660 * insert node 660 * insert node
661 */ 661 */
662 662
663 rt->dst.rt6_next = iter; 663 rt->dst.rt6_next = iter;
664 *ins = rt; 664 *ins = rt;
665 rt->rt6i_node = fn; 665 rt->rt6i_node = fn;
666 atomic_inc(&rt->rt6i_ref); 666 atomic_inc(&rt->rt6i_ref);
667 inet6_rt_notify(RTM_NEWROUTE, rt, info); 667 inet6_rt_notify(RTM_NEWROUTE, rt, info);
668 info->nl_net->ipv6.rt6_stats->fib_rt_entries++; 668 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
669 669
670 if ((fn->fn_flags & RTN_RTINFO) == 0) { 670 if ((fn->fn_flags & RTN_RTINFO) == 0) {
671 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; 671 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
672 fn->fn_flags |= RTN_RTINFO; 672 fn->fn_flags |= RTN_RTINFO;
673 } 673 }
674 674
675 return 0; 675 return 0;
676 } 676 }
677 677
678 static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) 678 static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt)
679 { 679 {
680 if (!timer_pending(&net->ipv6.ip6_fib_timer) && 680 if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
681 (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) 681 (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
682 mod_timer(&net->ipv6.ip6_fib_timer, 682 mod_timer(&net->ipv6.ip6_fib_timer,
683 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); 683 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
684 } 684 }
685 685
686 void fib6_force_start_gc(struct net *net) 686 void fib6_force_start_gc(struct net *net)
687 { 687 {
688 if (!timer_pending(&net->ipv6.ip6_fib_timer)) 688 if (!timer_pending(&net->ipv6.ip6_fib_timer))
689 mod_timer(&net->ipv6.ip6_fib_timer, 689 mod_timer(&net->ipv6.ip6_fib_timer,
690 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); 690 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
691 } 691 }
692 692
693 /* 693 /*
694 * Add routing information to the routing tree. 694 * Add routing information to the routing tree.
695 * <destination addr>/<source addr> 695 * <destination addr>/<source addr>
696 * with source addr info in sub-trees 696 * with source addr info in sub-trees
697 */ 697 */
698 698
699 int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) 699 int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
700 { 700 {
701 struct fib6_node *fn, *pn = NULL; 701 struct fib6_node *fn, *pn = NULL;
702 int err = -ENOMEM; 702 int err = -ENOMEM;
703 703
704 fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), 704 fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
705 rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); 705 rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));
706 706
707 if (fn == NULL) 707 if (fn == NULL)
708 goto out; 708 goto out;
709 709
710 pn = fn; 710 pn = fn;
711 711
712 #ifdef CONFIG_IPV6_SUBTREES 712 #ifdef CONFIG_IPV6_SUBTREES
713 if (rt->rt6i_src.plen) { 713 if (rt->rt6i_src.plen) {
714 struct fib6_node *sn; 714 struct fib6_node *sn;
715 715
716 if (fn->subtree == NULL) { 716 if (fn->subtree == NULL) {
717 struct fib6_node *sfn; 717 struct fib6_node *sfn;
718 718
719 /* 719 /*
720 * Create subtree. 720 * Create subtree.
721 * 721 *
722 * fn[main tree] 722 * fn[main tree]
723 * | 723 * |
724 * sfn[subtree root] 724 * sfn[subtree root]
725 * \ 725 * \
726 * sn[new leaf node] 726 * sn[new leaf node]
727 */ 727 */
728 728
729 /* Create subtree root node */ 729 /* Create subtree root node */
730 sfn = node_alloc(); 730 sfn = node_alloc();
731 if (sfn == NULL) 731 if (sfn == NULL)
732 goto st_failure; 732 goto st_failure;
733 733
734 sfn->leaf = info->nl_net->ipv6.ip6_null_entry; 734 sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
735 atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); 735 atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
736 sfn->fn_flags = RTN_ROOT; 736 sfn->fn_flags = RTN_ROOT;
737 sfn->fn_sernum = fib6_new_sernum(); 737 sfn->fn_sernum = fib6_new_sernum();
738 738
739 /* Now add the first leaf node to new subtree */ 739 /* Now add the first leaf node to new subtree */
740 740
741 sn = fib6_add_1(sfn, &rt->rt6i_src.addr, 741 sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
742 sizeof(struct in6_addr), rt->rt6i_src.plen, 742 sizeof(struct in6_addr), rt->rt6i_src.plen,
743 offsetof(struct rt6_info, rt6i_src)); 743 offsetof(struct rt6_info, rt6i_src));
744 744
745 if (sn == NULL) { 745 if (sn == NULL) {
746 /* If it fails, discard the just-allocated 746 /* If it fails, discard the just-allocated
747 root, and then (in st_failure) the stale 747 root, and then (in st_failure) the stale
748 node in the main tree. 748 node in the main tree.
749 */ 749 */
750 node_free(sfn); 750 node_free(sfn);
751 goto st_failure; 751 goto st_failure;
752 } 752 }
753 753
754 /* Now link new subtree to main tree */ 754 /* Now link new subtree to main tree */
755 sfn->parent = fn; 755 sfn->parent = fn;
756 fn->subtree = sfn; 756 fn->subtree = sfn;
757 } else { 757 } else {
758 sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, 758 sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
759 sizeof(struct in6_addr), rt->rt6i_src.plen, 759 sizeof(struct in6_addr), rt->rt6i_src.plen,
760 offsetof(struct rt6_info, rt6i_src)); 760 offsetof(struct rt6_info, rt6i_src));
761 761
762 if (sn == NULL) 762 if (sn == NULL)
763 goto st_failure; 763 goto st_failure;
764 } 764 }
765 765
766 if (fn->leaf == NULL) { 766 if (fn->leaf == NULL) {
767 fn->leaf = rt; 767 fn->leaf = rt;
768 atomic_inc(&rt->rt6i_ref); 768 atomic_inc(&rt->rt6i_ref);
769 } 769 }
770 fn = sn; 770 fn = sn;
771 } 771 }
772 #endif 772 #endif
773 773
774 err = fib6_add_rt2node(fn, rt, info); 774 err = fib6_add_rt2node(fn, rt, info);
775 775
776 if (err == 0) { 776 if (err == 0) {
777 fib6_start_gc(info->nl_net, rt); 777 fib6_start_gc(info->nl_net, rt);
778 if (!(rt->rt6i_flags&RTF_CACHE)) 778 if (!(rt->rt6i_flags&RTF_CACHE))
779 fib6_prune_clones(info->nl_net, pn, rt); 779 fib6_prune_clones(info->nl_net, pn, rt);
780 } 780 }
781 781
782 out: 782 out:
783 if (err) { 783 if (err) {
784 #ifdef CONFIG_IPV6_SUBTREES 784 #ifdef CONFIG_IPV6_SUBTREES
785 /* 785 /*
786 * If fib6_add_1 has cleared the old leaf pointer in the 786 * If fib6_add_1 has cleared the old leaf pointer in the
787 * super-tree leaf node we have to find a new one for it. 787 * super-tree leaf node we have to find a new one for it.
788 */ 788 */
789 if (pn != fn && pn->leaf == rt) { 789 if (pn != fn && pn->leaf == rt) {
790 pn->leaf = NULL; 790 pn->leaf = NULL;
791 atomic_dec(&rt->rt6i_ref); 791 atomic_dec(&rt->rt6i_ref);
792 } 792 }
793 if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { 793 if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
794 pn->leaf = fib6_find_prefix(info->nl_net, pn); 794 pn->leaf = fib6_find_prefix(info->nl_net, pn);
795 #if RT6_DEBUG >= 2 795 #if RT6_DEBUG >= 2
796 if (!pn->leaf) { 796 if (!pn->leaf) {
797 WARN_ON(pn->leaf == NULL); 797 WARN_ON(pn->leaf == NULL);
798 pn->leaf = info->nl_net->ipv6.ip6_null_entry; 798 pn->leaf = info->nl_net->ipv6.ip6_null_entry;
799 } 799 }
800 #endif 800 #endif
801 atomic_inc(&pn->leaf->rt6i_ref); 801 atomic_inc(&pn->leaf->rt6i_ref);
802 } 802 }
803 #endif 803 #endif
804 dst_free(&rt->dst); 804 dst_free(&rt->dst);
805 } 805 }
806 return err; 806 return err;
807 807
808 #ifdef CONFIG_IPV6_SUBTREES 808 #ifdef CONFIG_IPV6_SUBTREES
809 /* Subtree creation failed, probably the main tree 809 /* Subtree creation failed, probably the main tree
810 node is an orphan. If it is, shoot it. 810 node is an orphan. If it is, shoot it.
811 */ 811 */
812 st_failure: 812 st_failure:
813 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) 813 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
814 fib6_repair_tree(info->nl_net, fn); 814 fib6_repair_tree(info->nl_net, fn);
815 dst_free(&rt->dst); 815 dst_free(&rt->dst);
816 return err; 816 return err;
817 #endif 817 #endif
818 } 818 }
819 819
820 /* 820 /*
821 * Routing tree lookup 821 * Routing tree lookup
822 * 822 *
823 */ 823 */
824 824
825 struct lookup_args { 825 struct lookup_args {
826 int offset; /* key offset on rt6_info */ 826 int offset; /* key offset on rt6_info */
827 const struct in6_addr *addr; /* search key */ 827 const struct in6_addr *addr; /* search key */
828 }; 828 };
829 829
830 static struct fib6_node * fib6_lookup_1(struct fib6_node *root, 830 static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
831 struct lookup_args *args) 831 struct lookup_args *args)
832 { 832 {
833 struct fib6_node *fn; 833 struct fib6_node *fn;
834 __be32 dir; 834 __be32 dir;
835 835
836 if (unlikely(args->offset == 0)) 836 if (unlikely(args->offset == 0))
837 return NULL; 837 return NULL;
838 838
839 /* 839 /*
840 * Descend the tree 840 * Descend the tree
841 */ 841 */
842 842
843 fn = root; 843 fn = root;
844 844
845 for (;;) { 845 for (;;) {
846 struct fib6_node *next; 846 struct fib6_node *next;
847 847
848 dir = addr_bit_set(args->addr, fn->fn_bit); 848 dir = addr_bit_set(args->addr, fn->fn_bit);
849 849
850 next = dir ? fn->right : fn->left; 850 next = dir ? fn->right : fn->left;
851 851
852 if (next) { 852 if (next) {
853 fn = next; 853 fn = next;
854 continue; 854 continue;
855 } 855 }
856 856
857 break; 857 break;
858 } 858 }
859 859
860 while(fn) { 860 while(fn) {
861 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { 861 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
862 struct rt6key *key; 862 struct rt6key *key;
863 863
864 key = (struct rt6key *) ((u8 *) fn->leaf + 864 key = (struct rt6key *) ((u8 *) fn->leaf +
865 args->offset); 865 args->offset);
866 866
867 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { 867 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
868 #ifdef CONFIG_IPV6_SUBTREES 868 #ifdef CONFIG_IPV6_SUBTREES
869 if (fn->subtree) 869 if (fn->subtree)
870 fn = fib6_lookup_1(fn->subtree, args + 1); 870 fn = fib6_lookup_1(fn->subtree, args + 1);
871 #endif 871 #endif
872 if (!fn || fn->fn_flags & RTN_RTINFO) 872 if (!fn || fn->fn_flags & RTN_RTINFO)
873 return fn; 873 return fn;
874 } 874 }
875 } 875 }
876 876
877 if (fn->fn_flags & RTN_ROOT) 877 if (fn->fn_flags & RTN_ROOT)
878 break; 878 break;
879 879
880 fn = fn->parent; 880 fn = fn->parent;
881 } 881 }
882 882
883 return NULL; 883 return NULL;
884 } 884 }
885 885
886 struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, 886 struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
887 const struct in6_addr *saddr) 887 const struct in6_addr *saddr)
888 { 888 {
889 struct fib6_node *fn; 889 struct fib6_node *fn;
890 struct lookup_args args[] = { 890 struct lookup_args args[] = {
891 { 891 {
892 .offset = offsetof(struct rt6_info, rt6i_dst), 892 .offset = offsetof(struct rt6_info, rt6i_dst),
893 .addr = daddr, 893 .addr = daddr,
894 }, 894 },
895 #ifdef CONFIG_IPV6_SUBTREES 895 #ifdef CONFIG_IPV6_SUBTREES
896 { 896 {
897 .offset = offsetof(struct rt6_info, rt6i_src), 897 .offset = offsetof(struct rt6_info, rt6i_src),
898 .addr = saddr, 898 .addr = saddr,
899 }, 899 },
900 #endif 900 #endif
901 { 901 {
902 .offset = 0, /* sentinel */ 902 .offset = 0, /* sentinel */
903 } 903 }
904 }; 904 };
905 905
906 fn = fib6_lookup_1(root, daddr ? args : args + 1); 906 fn = fib6_lookup_1(root, daddr ? args : args + 1);
907 907
908 if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) 908 if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
909 fn = root; 909 fn = root;
910 910
911 return fn; 911 return fn;
912 } 912 }
913 913
914 /* 914 /*
915 * Get node with specified destination prefix (and source prefix, 915 * Get node with specified destination prefix (and source prefix,
916 * if subtrees are used) 916 * if subtrees are used)
917 */ 917 */
918 918
919 919
920 static struct fib6_node * fib6_locate_1(struct fib6_node *root, 920 static struct fib6_node * fib6_locate_1(struct fib6_node *root,
921 const struct in6_addr *addr, 921 const struct in6_addr *addr,
922 int plen, int offset) 922 int plen, int offset)
923 { 923 {
924 struct fib6_node *fn; 924 struct fib6_node *fn;
925 925
926 for (fn = root; fn ; ) { 926 for (fn = root; fn ; ) {
927 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); 927 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
928 928
929 /* 929 /*
930 * Prefix match 930 * Prefix match
931 */ 931 */
932 if (plen < fn->fn_bit || 932 if (plen < fn->fn_bit ||
933 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) 933 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
934 return NULL; 934 return NULL;
935 935
936 if (plen == fn->fn_bit) 936 if (plen == fn->fn_bit)
937 return fn; 937 return fn;
938 938
939 /* 939 /*
940 * We have more bits to go 940 * We have more bits to go
941 */ 941 */
942 if (addr_bit_set(addr, fn->fn_bit)) 942 if (addr_bit_set(addr, fn->fn_bit))
943 fn = fn->right; 943 fn = fn->right;
944 else 944 else
945 fn = fn->left; 945 fn = fn->left;
946 } 946 }
947 return NULL; 947 return NULL;
948 } 948 }
949 949
950 struct fib6_node * fib6_locate(struct fib6_node *root, 950 struct fib6_node * fib6_locate(struct fib6_node *root,
951 const struct in6_addr *daddr, int dst_len, 951 const struct in6_addr *daddr, int dst_len,
952 const struct in6_addr *saddr, int src_len) 952 const struct in6_addr *saddr, int src_len)
953 { 953 {
954 struct fib6_node *fn; 954 struct fib6_node *fn;
955 955
956 fn = fib6_locate_1(root, daddr, dst_len, 956 fn = fib6_locate_1(root, daddr, dst_len,
957 offsetof(struct rt6_info, rt6i_dst)); 957 offsetof(struct rt6_info, rt6i_dst));
958 958
959 #ifdef CONFIG_IPV6_SUBTREES 959 #ifdef CONFIG_IPV6_SUBTREES
960 if (src_len) { 960 if (src_len) {
961 WARN_ON(saddr == NULL); 961 WARN_ON(saddr == NULL);
962 if (fn && fn->subtree) 962 if (fn && fn->subtree)
963 fn = fib6_locate_1(fn->subtree, saddr, src_len, 963 fn = fib6_locate_1(fn->subtree, saddr, src_len,
964 offsetof(struct rt6_info, rt6i_src)); 964 offsetof(struct rt6_info, rt6i_src));
965 } 965 }
966 #endif 966 #endif
967 967
968 if (fn && fn->fn_flags&RTN_RTINFO) 968 if (fn && fn->fn_flags&RTN_RTINFO)
969 return fn; 969 return fn;
970 970
971 return NULL; 971 return NULL;
972 } 972 }
973 973
974 974
975 /* 975 /*
976 * Deletion 976 * Deletion
977 * 977 *
978 */ 978 */
979 979
980 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) 980 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
981 { 981 {
982 if (fn->fn_flags&RTN_ROOT) 982 if (fn->fn_flags&RTN_ROOT)
983 return net->ipv6.ip6_null_entry; 983 return net->ipv6.ip6_null_entry;
984 984
985 while(fn) { 985 while(fn) {
986 if(fn->left) 986 if(fn->left)
987 return fn->left->leaf; 987 return fn->left->leaf;
988 988
989 if(fn->right) 989 if(fn->right)
990 return fn->right->leaf; 990 return fn->right->leaf;
991 991
992 fn = FIB6_SUBTREE(fn); 992 fn = FIB6_SUBTREE(fn);
993 } 993 }
994 return NULL; 994 return NULL;
995 } 995 }
996 996
997 /* 997 /*
998 * Called to trim the tree of intermediate nodes when possible. "fn" 998 * Called to trim the tree of intermediate nodes when possible. "fn"
999 * is the node we want to try and remove. 999 * is the node we want to try and remove.
1000 */ 1000 */
1001 1001
1002 static struct fib6_node *fib6_repair_tree(struct net *net, 1002 static struct fib6_node *fib6_repair_tree(struct net *net,
1003 struct fib6_node *fn) 1003 struct fib6_node *fn)
1004 { 1004 {
1005 int children; 1005 int children;
1006 int nstate; 1006 int nstate;
1007 struct fib6_node *child, *pn; 1007 struct fib6_node *child, *pn;
1008 struct fib6_walker_t *w; 1008 struct fib6_walker_t *w;
1009 int iter = 0; 1009 int iter = 0;
1010 1010
1011 for (;;) { 1011 for (;;) {
1012 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); 1012 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1013 iter++; 1013 iter++;
1014 1014
1015 WARN_ON(fn->fn_flags & RTN_RTINFO); 1015 WARN_ON(fn->fn_flags & RTN_RTINFO);
1016 WARN_ON(fn->fn_flags & RTN_TL_ROOT); 1016 WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1017 WARN_ON(fn->leaf != NULL); 1017 WARN_ON(fn->leaf != NULL);
1018 1018
1019 children = 0; 1019 children = 0;
1020 child = NULL; 1020 child = NULL;
1021 if (fn->right) child = fn->right, children |= 1; 1021 if (fn->right) child = fn->right, children |= 1;
1022 if (fn->left) child = fn->left, children |= 2; 1022 if (fn->left) child = fn->left, children |= 2;
1023 1023
1024 if (children == 3 || FIB6_SUBTREE(fn) 1024 if (children == 3 || FIB6_SUBTREE(fn)
1025 #ifdef CONFIG_IPV6_SUBTREES 1025 #ifdef CONFIG_IPV6_SUBTREES
1026 /* Subtree root (i.e. fn) may have one child */ 1026 /* Subtree root (i.e. fn) may have one child */
1027 || (children && fn->fn_flags&RTN_ROOT) 1027 || (children && fn->fn_flags&RTN_ROOT)
1028 #endif 1028 #endif
1029 ) { 1029 ) {
1030 fn->leaf = fib6_find_prefix(net, fn); 1030 fn->leaf = fib6_find_prefix(net, fn);
1031 #if RT6_DEBUG >= 2 1031 #if RT6_DEBUG >= 2
1032 if (fn->leaf==NULL) { 1032 if (fn->leaf==NULL) {
1033 WARN_ON(!fn->leaf); 1033 WARN_ON(!fn->leaf);
1034 fn->leaf = net->ipv6.ip6_null_entry; 1034 fn->leaf = net->ipv6.ip6_null_entry;
1035 } 1035 }
1036 #endif 1036 #endif
1037 atomic_inc(&fn->leaf->rt6i_ref); 1037 atomic_inc(&fn->leaf->rt6i_ref);
1038 return fn->parent; 1038 return fn->parent;
1039 } 1039 }
1040 1040
1041 pn = fn->parent; 1041 pn = fn->parent;
1042 #ifdef CONFIG_IPV6_SUBTREES 1042 #ifdef CONFIG_IPV6_SUBTREES
1043 if (FIB6_SUBTREE(pn) == fn) { 1043 if (FIB6_SUBTREE(pn) == fn) {
1044 WARN_ON(!(fn->fn_flags & RTN_ROOT)); 1044 WARN_ON(!(fn->fn_flags & RTN_ROOT));
1045 FIB6_SUBTREE(pn) = NULL; 1045 FIB6_SUBTREE(pn) = NULL;
1046 nstate = FWS_L; 1046 nstate = FWS_L;
1047 } else { 1047 } else {
1048 WARN_ON(fn->fn_flags & RTN_ROOT); 1048 WARN_ON(fn->fn_flags & RTN_ROOT);
1049 #endif 1049 #endif
1050 if (pn->right == fn) pn->right = child; 1050 if (pn->right == fn) pn->right = child;
1051 else if (pn->left == fn) pn->left = child; 1051 else if (pn->left == fn) pn->left = child;
1052 #if RT6_DEBUG >= 2 1052 #if RT6_DEBUG >= 2
1053 else 1053 else
1054 WARN_ON(1); 1054 WARN_ON(1);
1055 #endif 1055 #endif
1056 if (child) 1056 if (child)
1057 child->parent = pn; 1057 child->parent = pn;
1058 nstate = FWS_R; 1058 nstate = FWS_R;
1059 #ifdef CONFIG_IPV6_SUBTREES 1059 #ifdef CONFIG_IPV6_SUBTREES
1060 } 1060 }
1061 #endif 1061 #endif
1062 1062
1063 read_lock(&fib6_walker_lock); 1063 read_lock(&fib6_walker_lock);
1064 FOR_WALKERS(w) { 1064 FOR_WALKERS(w) {
1065 if (child == NULL) { 1065 if (child == NULL) {
1066 if (w->root == fn) { 1066 if (w->root == fn) {
1067 w->root = w->node = NULL; 1067 w->root = w->node = NULL;
1068 RT6_TRACE("W %p adjusted by delroot 1\n", w); 1068 RT6_TRACE("W %p adjusted by delroot 1\n", w);
1069 } else if (w->node == fn) { 1069 } else if (w->node == fn) {
1070 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); 1070 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1071 w->node = pn; 1071 w->node = pn;
1072 w->state = nstate; 1072 w->state = nstate;
1073 } 1073 }
1074 } else { 1074 } else {
1075 if (w->root == fn) { 1075 if (w->root == fn) {
1076 w->root = child; 1076 w->root = child;
1077 RT6_TRACE("W %p adjusted by delroot 2\n", w); 1077 RT6_TRACE("W %p adjusted by delroot 2\n", w);
1078 } 1078 }
1079 if (w->node == fn) { 1079 if (w->node == fn) {
1080 w->node = child; 1080 w->node = child;
1081 if (children&2) { 1081 if (children&2) {
1082 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 1082 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1083 w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; 1083 w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
1084 } else { 1084 } else {
1085 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 1085 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1086 w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; 1086 w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
1087 } 1087 }
1088 } 1088 }
1089 } 1089 }
1090 } 1090 }
1091 read_unlock(&fib6_walker_lock); 1091 read_unlock(&fib6_walker_lock);
1092 1092
1093 node_free(fn); 1093 node_free(fn);
1094 if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) 1094 if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn))
1095 return pn; 1095 return pn;
1096 1096
1097 rt6_release(pn->leaf); 1097 rt6_release(pn->leaf);
1098 pn->leaf = NULL; 1098 pn->leaf = NULL;
1099 fn = pn; 1099 fn = pn;
1100 } 1100 }
1101 } 1101 }
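
Editor's note: the 'children' bookkeeping above packs "which children exist" into two bits (1 = right, 2 = left), so the value 3 means both are present and, when exactly one bit is set, 'child' is already the lone survivor to splice into the parent. The same trick as a hypothetical helper (not kernel code):

    struct tnode { struct tnode *left, *right; };

    /* Returns a mask (bit 0 = right child, bit 1 = left child);
     * *only ends up as the sole child when exactly one bit is set. */
    static int child_mask(const struct tnode *fn, struct tnode **only)
    {
        int children = 0;

        *only = NULL;
        if (fn->right) { *only = fn->right; children |= 1; }
        if (fn->left)  { *only = fn->left;  children |= 2; }
        return children;
    }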
1102 1102
1103 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, 1103 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1104 struct nl_info *info) 1104 struct nl_info *info)
1105 { 1105 {
1106 struct fib6_walker_t *w; 1106 struct fib6_walker_t *w;
1107 struct rt6_info *rt = *rtp; 1107 struct rt6_info *rt = *rtp;
1108 struct net *net = info->nl_net; 1108 struct net *net = info->nl_net;
1109 1109
1110 RT6_TRACE("fib6_del_route\n"); 1110 RT6_TRACE("fib6_del_route\n");
1111 1111
1112 /* Unlink it */ 1112 /* Unlink it */
1113 *rtp = rt->dst.rt6_next; 1113 *rtp = rt->dst.rt6_next;
1114 rt->rt6i_node = NULL; 1114 rt->rt6i_node = NULL;
1115 net->ipv6.rt6_stats->fib_rt_entries--; 1115 net->ipv6.rt6_stats->fib_rt_entries--;
1116 net->ipv6.rt6_stats->fib_discarded_routes++; 1116 net->ipv6.rt6_stats->fib_discarded_routes++;
1117 1117
1118 /* Reset round-robin state, if necessary */ 1118 /* Reset round-robin state, if necessary */
1119 if (fn->rr_ptr == rt) 1119 if (fn->rr_ptr == rt)
1120 fn->rr_ptr = NULL; 1120 fn->rr_ptr = NULL;
1121 1121
1122 /* Adjust walkers */ 1122 /* Adjust walkers */
1123 read_lock(&fib6_walker_lock); 1123 read_lock(&fib6_walker_lock);
1124 FOR_WALKERS(w) { 1124 FOR_WALKERS(w) {
1125 if (w->state == FWS_C && w->leaf == rt) { 1125 if (w->state == FWS_C && w->leaf == rt) {
1126 RT6_TRACE("walker %p adjusted by delroute\n", w); 1126 RT6_TRACE("walker %p adjusted by delroute\n", w);
1127 w->leaf = rt->dst.rt6_next; 1127 w->leaf = rt->dst.rt6_next;
1128 if (w->leaf == NULL) 1128 if (w->leaf == NULL)
1129 w->state = FWS_U; 1129 w->state = FWS_U;
1130 } 1130 }
1131 } 1131 }
1132 read_unlock(&fib6_walker_lock); 1132 read_unlock(&fib6_walker_lock);
1133 1133
1134 rt->dst.rt6_next = NULL; 1134 rt->dst.rt6_next = NULL;
1135 1135
1136 /* If it was the last route, expunge its radix tree node 1136 /* If it was the last route, expunge its radix tree node
1137 if (fn->leaf == NULL) { 1137 if (fn->leaf == NULL) {
1138 fn->fn_flags &= ~RTN_RTINFO; 1138 fn->fn_flags &= ~RTN_RTINFO;
1139 net->ipv6.rt6_stats->fib_route_nodes--; 1139 net->ipv6.rt6_stats->fib_route_nodes--;
1140 fn = fib6_repair_tree(net, fn); 1140 fn = fib6_repair_tree(net, fn);
1141 } 1141 }
1142 1142
1143 if (atomic_read(&rt->rt6i_ref) != 1) { 1143 if (atomic_read(&rt->rt6i_ref) != 1) {
1144 /* This route is used as a dummy address holder in some split 1144 /* This route is used as a dummy address holder in some split
1145 * nodes. It is not leaked, but it still holds other resources, 1145 * nodes. It is not leaked, but it still holds other resources,
1146 * which must be released in time. So, scan ancestor nodes 1146 * which must be released in time. So, scan ancestor nodes
1147 * and replace dummy references to this route with references 1147 * and replace dummy references to this route with references
1148 * to ones that are still alive. 1148 * to ones that are still alive.
1149 */ 1149 */
1150 while (fn) { 1150 while (fn) {
1151 if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { 1151 if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
1152 fn->leaf = fib6_find_prefix(net, fn); 1152 fn->leaf = fib6_find_prefix(net, fn);
1153 atomic_inc(&fn->leaf->rt6i_ref); 1153 atomic_inc(&fn->leaf->rt6i_ref);
1154 rt6_release(rt); 1154 rt6_release(rt);
1155 } 1155 }
1156 fn = fn->parent; 1156 fn = fn->parent;
1157 } 1157 }
1158 /* No more references are possible at this point. */ 1158 /* No more references are possible at this point. */
1159 BUG_ON(atomic_read(&rt->rt6i_ref) != 1); 1159 BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
1160 } 1160 }
1161 1161
1162 inet6_rt_notify(RTM_DELROUTE, rt, info); 1162 inet6_rt_notify(RTM_DELROUTE, rt, info);
1163 rt6_release(rt); 1163 rt6_release(rt);
1164 } 1164 }
1165 1165
1166 int fib6_del(struct rt6_info *rt, struct nl_info *info) 1166 int fib6_del(struct rt6_info *rt, struct nl_info *info)
1167 { 1167 {
1168 struct net *net = info->nl_net; 1168 struct net *net = info->nl_net;
1169 struct fib6_node *fn = rt->rt6i_node; 1169 struct fib6_node *fn = rt->rt6i_node;
1170 struct rt6_info **rtp; 1170 struct rt6_info **rtp;
1171 1171
1172 #if RT6_DEBUG >= 2 1172 #if RT6_DEBUG >= 2
1173 if (rt->dst.obsolete>0) { 1173 if (rt->dst.obsolete>0) {
1174 WARN_ON(fn != NULL); 1174 WARN_ON(fn != NULL);
1175 return -ENOENT; 1175 return -ENOENT;
1176 } 1176 }
1177 #endif 1177 #endif
1178 if (fn == NULL || rt == net->ipv6.ip6_null_entry) 1178 if (fn == NULL || rt == net->ipv6.ip6_null_entry)
1179 return -ENOENT; 1179 return -ENOENT;
1180 1180
1181 WARN_ON(!(fn->fn_flags & RTN_RTINFO)); 1181 WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1182 1182
1183 if (!(rt->rt6i_flags&RTF_CACHE)) { 1183 if (!(rt->rt6i_flags&RTF_CACHE)) {
1184 struct fib6_node *pn = fn; 1184 struct fib6_node *pn = fn;
1185 #ifdef CONFIG_IPV6_SUBTREES 1185 #ifdef CONFIG_IPV6_SUBTREES
1186 /* clones of this route might be in another subtree */ 1186 /* clones of this route might be in another subtree */
1187 if (rt->rt6i_src.plen) { 1187 if (rt->rt6i_src.plen) {
1188 while (!(pn->fn_flags&RTN_ROOT)) 1188 while (!(pn->fn_flags&RTN_ROOT))
1189 pn = pn->parent; 1189 pn = pn->parent;
1190 pn = pn->parent; 1190 pn = pn->parent;
1191 } 1191 }
1192 #endif 1192 #endif
1193 fib6_prune_clones(info->nl_net, pn, rt); 1193 fib6_prune_clones(info->nl_net, pn, rt);
1194 } 1194 }
1195 1195
1196 /* 1196 /*
1197 * Walk the leaf entries looking for ourselves 1197 * Walk the leaf entries looking for ourselves
1198 */ 1198 */
1199 1199
1200 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { 1200 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
1201 if (*rtp == rt) { 1201 if (*rtp == rt) {
1202 fib6_del_route(fn, rtp, info); 1202 fib6_del_route(fn, rtp, info);
1203 return 0; 1203 return 0;
1204 } 1204 }
1205 } 1205 }
1206 return -ENOENT; 1206 return -ENOENT;
1207 } 1207 }
1208 1208
1209 /* 1209 /*
1210 * Tree traversal function. 1210 * Tree traversal function.
1211 * 1211 *
1212 * Certainly, it is not interrupt safe. 1212 * Certainly, it is not interrupt safe.
1213 * However, it is internally reentrant wrt itself and fib6_add/fib6_del. 1213 * However, it is internally reentrant wrt itself and fib6_add/fib6_del.
1214 * This means that we can modify the tree during walking, 1214 * This means that we can modify the tree during walking,
1215 * and use this function for garbage collection, clone pruning, 1215 * and use this function for garbage collection, clone pruning,
1216 * or cleaning the tree when a device goes down, etc. 1216 * or cleaning the tree when a device goes down, etc.
1217 * 1217 *
1218 * It guarantees that every node will be traversed, 1218 * It guarantees that every node will be traversed,
1219 * and that it will be traversed only once. 1219 * and that it will be traversed only once.
1220 * 1220 *
1221 * Callback function w->func may return: 1221 * Callback function w->func may return:
1222 * 0 -> continue walking. 1222 * 0 -> continue walking.
1223 * positive value -> walking is suspended (used by tree dumps, 1223 * positive value -> walking is suspended (used by tree dumps,
1224 * and probably by gc, if it is ever split into several slices) 1224 * and probably by gc, if it is ever split into several slices)
1225 * negative value -> terminate walking. 1225 * negative value -> terminate walking.
1226 * 1226 *
1227 * The function itself returns: 1227 * The function itself returns:
1228 * 0 -> walk is complete. 1228 * 0 -> walk is complete.
1229 * >0 -> walk is incomplete (i.e. suspended) 1229 * >0 -> walk is incomplete (i.e. suspended)
1230 * <0 -> walk is terminated by an error. 1230 * <0 -> walk is terminated by an error.
1231 */ 1231 */
1232 1232
1233 static int fib6_walk_continue(struct fib6_walker_t *w) 1233 static int fib6_walk_continue(struct fib6_walker_t *w)
1234 { 1234 {
1235 struct fib6_node *fn, *pn; 1235 struct fib6_node *fn, *pn;
1236 1236
1237 for (;;) { 1237 for (;;) {
1238 fn = w->node; 1238 fn = w->node;
1239 if (fn == NULL) 1239 if (fn == NULL)
1240 return 0; 1240 return 0;
1241 1241
1242 if (w->prune && fn != w->root && 1242 if (w->prune && fn != w->root &&
1243 fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { 1243 fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
1244 w->state = FWS_C; 1244 w->state = FWS_C;
1245 w->leaf = fn->leaf; 1245 w->leaf = fn->leaf;
1246 } 1246 }
1247 switch (w->state) { 1247 switch (w->state) {
1248 #ifdef CONFIG_IPV6_SUBTREES 1248 #ifdef CONFIG_IPV6_SUBTREES
1249 case FWS_S: 1249 case FWS_S:
1250 if (FIB6_SUBTREE(fn)) { 1250 if (FIB6_SUBTREE(fn)) {
1251 w->node = FIB6_SUBTREE(fn); 1251 w->node = FIB6_SUBTREE(fn);
1252 continue; 1252 continue;
1253 } 1253 }
1254 w->state = FWS_L; 1254 w->state = FWS_L;
1255 #endif 1255 #endif
1256 case FWS_L: 1256 case FWS_L:
1257 if (fn->left) { 1257 if (fn->left) {
1258 w->node = fn->left; 1258 w->node = fn->left;
1259 w->state = FWS_INIT; 1259 w->state = FWS_INIT;
1260 continue; 1260 continue;
1261 } 1261 }
1262 w->state = FWS_R; 1262 w->state = FWS_R;
1263 case FWS_R: 1263 case FWS_R:
1264 if (fn->right) { 1264 if (fn->right) {
1265 w->node = fn->right; 1265 w->node = fn->right;
1266 w->state = FWS_INIT; 1266 w->state = FWS_INIT;
1267 continue; 1267 continue;
1268 } 1268 }
1269 w->state = FWS_C; 1269 w->state = FWS_C;
1270 w->leaf = fn->leaf; 1270 w->leaf = fn->leaf;
1271 case FWS_C: 1271 case FWS_C:
1272 if (w->leaf && fn->fn_flags&RTN_RTINFO) { 1272 if (w->leaf && fn->fn_flags&RTN_RTINFO) {
1273 int err; 1273 int err;
1274 1274
1275 if (w->count < w->skip) { 1275 if (w->count < w->skip) {
1276 w->count++; 1276 w->count++;
1277 continue; 1277 continue;
1278 } 1278 }
1279 1279
1280 err = w->func(w); 1280 err = w->func(w);
1281 if (err) 1281 if (err)
1282 return err; 1282 return err;
1283 1283
1284 w->count++; 1284 w->count++;
1285 continue; 1285 continue;
1286 } 1286 }
1287 w->state = FWS_U; 1287 w->state = FWS_U;
1288 case FWS_U: 1288 case FWS_U:
1289 if (fn == w->root) 1289 if (fn == w->root)
1290 return 0; 1290 return 0;
1291 pn = fn->parent; 1291 pn = fn->parent;
1292 w->node = pn; 1292 w->node = pn;
1293 #ifdef CONFIG_IPV6_SUBTREES 1293 #ifdef CONFIG_IPV6_SUBTREES
1294 if (FIB6_SUBTREE(pn) == fn) { 1294 if (FIB6_SUBTREE(pn) == fn) {
1295 WARN_ON(!(fn->fn_flags & RTN_ROOT)); 1295 WARN_ON(!(fn->fn_flags & RTN_ROOT));
1296 w->state = FWS_L; 1296 w->state = FWS_L;
1297 continue; 1297 continue;
1298 } 1298 }
1299 #endif 1299 #endif
1300 if (pn->left == fn) { 1300 if (pn->left == fn) {
1301 w->state = FWS_R; 1301 w->state = FWS_R;
1302 continue; 1302 continue;
1303 } 1303 }
1304 if (pn->right == fn) { 1304 if (pn->right == fn) {
1305 w->state = FWS_C; 1305 w->state = FWS_C;
1306 w->leaf = w->node->leaf; 1306 w->leaf = w->node->leaf;
1307 continue; 1307 continue;
1308 } 1308 }
1309 #if RT6_DEBUG >= 2 1309 #if RT6_DEBUG >= 2
1310 WARN_ON(1); 1310 WARN_ON(1);
1311 #endif 1311 #endif
1312 } 1312 }
1313 } 1313 }
1314 } 1314 }
1315 1315
1316 static int fib6_walk(struct fib6_walker_t *w) 1316 static int fib6_walk(struct fib6_walker_t *w)
1317 { 1317 {
1318 int res; 1318 int res;
1319 1319
1320 w->state = FWS_INIT; 1320 w->state = FWS_INIT;
1321 w->node = w->root; 1321 w->node = w->root;
1322 1322
1323 fib6_walker_link(w); 1323 fib6_walker_link(w);
1324 res = fib6_walk_continue(w); 1324 res = fib6_walk_continue(w);
1325 if (res <= 0) 1325 if (res <= 0)
1326 fib6_walker_unlink(w); 1326 fib6_walker_unlink(w);
1327 return res; 1327 return res;
1328 } 1328 }
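
Editor's note: the FWS_* states exist so that the entire traversal position lives in the walker struct rather than on the stack; that is what lets fib6_dump_table() suspend a walk mid-way and continue it on a later call. A minimal userspace model of the same machine (node layout and names are ours, the subtree and prune states are omitted, and the "visit" is just a printf):

    #include <stdio.h>

    struct node {
        struct node *left, *right, *parent;
        int val;
    };

    enum ws { WS_L, WS_R, WS_C, WS_U };

    /* Iterative postorder walk driven by explicit state, mirroring
     * fib6_walk_continue(): no recursion, so the loop could stop
     * after any visit and pick up again from (node, state). */
    static void walk(struct node *root)
    {
        struct node *n = root;
        enum ws state = WS_L;

        for (;;) {
            switch (state) {
            case WS_L:
                if (n->left) {
                    n = n->left;        /* state stays WS_L */
                    continue;
                }
                state = WS_R;
                /* fall through */
            case WS_R:
                if (n->right) {
                    n = n->right;
                    state = WS_L;
                    continue;
                }
                state = WS_C;
                /* fall through */
            case WS_C:
                printf("visit %d\n", n->val);   /* w->func() runs here */
                state = WS_U;
                /* fall through */
            case WS_U:
                if (n == root)
                    return;
                state = (n->parent->left == n) ? WS_R : WS_C;
                n = n->parent;
            }
        }
    }

    int main(void)
    {
        struct node a = { 0 }, b = { 0 }, c = { 0 };

        a.val = 1; b.val = 2; c.val = 3;
        a.left = &b; a.right = &c;
        b.parent = &a; c.parent = &a;
        walk(&a);       /* prints 2, 3, 1 */
        return 0;
    }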
1329 1329
1330 static int fib6_clean_node(struct fib6_walker_t *w) 1330 static int fib6_clean_node(struct fib6_walker_t *w)
1331 { 1331 {
1332 int res; 1332 int res;
1333 struct rt6_info *rt; 1333 struct rt6_info *rt;
1334 struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); 1334 struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w);
1335 struct nl_info info = { 1335 struct nl_info info = {
1336 .nl_net = c->net, 1336 .nl_net = c->net,
1337 }; 1337 };
1338 1338
1339 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { 1339 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
1340 res = c->func(rt, c->arg); 1340 res = c->func(rt, c->arg);
1341 if (res < 0) { 1341 if (res < 0) {
1342 w->leaf = rt; 1342 w->leaf = rt;
1343 res = fib6_del(rt, &info); 1343 res = fib6_del(rt, &info);
1344 if (res) { 1344 if (res) {
1345 #if RT6_DEBUG >= 2 1345 #if RT6_DEBUG >= 2
1346 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); 1346 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
1347 #endif 1347 #endif
1348 continue; 1348 continue;
1349 } 1349 }
1350 return 0; 1350 return 0;
1351 } 1351 }
1352 WARN_ON(res != 0); 1352 WARN_ON(res != 0);
1353 } 1353 }
1354 w->leaf = rt; 1354 w->leaf = rt;
1355 return 0; 1355 return 0;
1356 } 1356 }
1357 1357
1358 /* 1358 /*
1359 * Convenient frontend to tree walker. 1359 * Convenient frontend to tree walker.
1360 * 1360 *
1361 * func is called on each route. 1361 * func is called on each route.
1362 * It may return -1 -> delete this route. 1362 * It may return -1 -> delete this route.
1363 * 0 -> continue walking 1363 * 0 -> continue walking
1364 * 1364 *
1365 * prune==1 -> only immediate children of node (certainly, 1365 * prune==1 -> only immediate children of node (certainly,
1366 * ignoring pure split nodes) will be scanned. 1366 * ignoring pure split nodes) will be scanned.
1367 */ 1367 */
1368 1368
1369 static void fib6_clean_tree(struct net *net, struct fib6_node *root, 1369 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
1370 int (*func)(struct rt6_info *, void *arg), 1370 int (*func)(struct rt6_info *, void *arg),
1371 int prune, void *arg) 1371 int prune, void *arg)
1372 { 1372 {
1373 struct fib6_cleaner_t c; 1373 struct fib6_cleaner_t c;
1374 1374
1375 c.w.root = root; 1375 c.w.root = root;
1376 c.w.func = fib6_clean_node; 1376 c.w.func = fib6_clean_node;
1377 c.w.prune = prune; 1377 c.w.prune = prune;
1378 c.w.count = 0; 1378 c.w.count = 0;
1379 c.w.skip = 0; 1379 c.w.skip = 0;
1380 c.func = func; 1380 c.func = func;
1381 c.arg = arg; 1381 c.arg = arg;
1382 c.net = net; 1382 c.net = net;
1383 1383
1384 fib6_walk(&c.w); 1384 fib6_walk(&c.w);
1385 } 1385 }
1386 1386
1387 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), 1387 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
1388 int prune, void *arg) 1388 int prune, void *arg)
1389 { 1389 {
1390 struct fib6_table *table; 1390 struct fib6_table *table;
1391 struct hlist_node *node; 1391 struct hlist_node *node;
1392 struct hlist_head *head; 1392 struct hlist_head *head;
1393 unsigned int h; 1393 unsigned int h;
1394 1394
1395 rcu_read_lock(); 1395 rcu_read_lock();
1396 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 1396 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
1397 head = &net->ipv6.fib_table_hash[h]; 1397 head = &net->ipv6.fib_table_hash[h];
1398 hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { 1398 hlist_for_each_entry_rcu(table, node, head, tb6_hlist) {
1399 write_lock_bh(&table->tb6_lock); 1399 write_lock_bh(&table->tb6_lock);
1400 fib6_clean_tree(net, &table->tb6_root, 1400 fib6_clean_tree(net, &table->tb6_root,
1401 func, prune, arg); 1401 func, prune, arg);
1402 write_unlock_bh(&table->tb6_lock); 1402 write_unlock_bh(&table->tb6_lock);
1403 } 1403 }
1404 } 1404 }
1405 rcu_read_unlock(); 1405 rcu_read_unlock();
1406 } 1406 }
1407 1407
1408 static int fib6_prune_clone(struct rt6_info *rt, void *arg) 1408 static int fib6_prune_clone(struct rt6_info *rt, void *arg)
1409 { 1409 {
1410 if (rt->rt6i_flags & RTF_CACHE) { 1410 if (rt->rt6i_flags & RTF_CACHE) {
1411 RT6_TRACE("pruning clone %p\n", rt); 1411 RT6_TRACE("pruning clone %p\n", rt);
1412 return -1; 1412 return -1;
1413 } 1413 }
1414 1414
1415 return 0; 1415 return 0;
1416 } 1416 }
1417 1417
1418 static void fib6_prune_clones(struct net *net, struct fib6_node *fn, 1418 static void fib6_prune_clones(struct net *net, struct fib6_node *fn,
1419 struct rt6_info *rt) 1419 struct rt6_info *rt)
1420 { 1420 {
1421 fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); 1421 fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt);
1422 } 1422 }
1423 1423
1424 /* 1424 /*
1425 * Garbage collection 1425 * Garbage collection
1426 */ 1426 */
1427 1427
1428 static struct fib6_gc_args 1428 static struct fib6_gc_args
1429 { 1429 {
1430 int timeout; 1430 int timeout;
1431 int more; 1431 int more;
1432 } gc_args; 1432 } gc_args;
1433 1433
1434 static int fib6_age(struct rt6_info *rt, void *arg) 1434 static int fib6_age(struct rt6_info *rt, void *arg)
1435 { 1435 {
1436 unsigned long now = jiffies; 1436 unsigned long now = jiffies;
1437 1437
1438 /* 1438 /*
1439 * check addrconf expiration here. 1439 * check addrconf expiration here.
1440 * Routes are expired even if they are in use. 1440 * Routes are expired even if they are in use.
1441 * 1441 *
1442 * Also age clones. Note, that clones are aged out 1442 * Also age clones. Note, that clones are aged out
1443 * only if they are not in use now. 1443 * only if they are not in use now.
1444 */ 1444 */
1445 1445
1446 if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { 1446 if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
1447 if (time_after(now, rt->rt6i_expires)) { 1447 if (time_after(now, rt->rt6i_expires)) {
1448 RT6_TRACE("expiring %p\n", rt); 1448 RT6_TRACE("expiring %p\n", rt);
1449 return -1; 1449 return -1;
1450 } 1450 }
1451 gc_args.more++; 1451 gc_args.more++;
1452 } else if (rt->rt6i_flags & RTF_CACHE) { 1452 } else if (rt->rt6i_flags & RTF_CACHE) {
1453 if (atomic_read(&rt->dst.__refcnt) == 0 && 1453 if (atomic_read(&rt->dst.__refcnt) == 0 &&
1454 time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { 1454 time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
1455 RT6_TRACE("aging clone %p\n", rt); 1455 RT6_TRACE("aging clone %p\n", rt);
1456 return -1; 1456 return -1;
1457 } else if ((rt->rt6i_flags & RTF_GATEWAY) && 1457 } else if ((rt->rt6i_flags & RTF_GATEWAY) &&
1458 (!(dst_get_neighbour(&rt->dst)->flags & NTF_ROUTER))) { 1458 (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) {
1459 RT6_TRACE("purging route %p via non-router but gateway\n", 1459 RT6_TRACE("purging route %p via non-router but gateway\n",
1460 rt); 1460 rt);
1461 return -1; 1461 return -1;
1462 } 1462 }
1463 gc_args.more++; 1463 gc_args.more++;
1464 } 1464 }
1465 1465
1466 return 0; 1466 return 0;
1467 } 1467 }
1468 1468
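The single change in fib6_age() is at line 1458: dst_get_neighbour() becomes dst_get_neighbour_raw(). With dst->_neighbour now annotated __rcu, the plain accessor presumably wraps rcu_dereference(), which lockdep only accepts inside an RCU read-side critical section; here the neighbour is pinned by the surrounding locking instead (the table write lock, plus the rcu_read_lock() taken in fib6_clean_all()), so a raw variant that skips the check is used. A sketch of the accessor pair, consistent with the net/dst.h hunks earlier in this commit but not a verbatim copy:

    static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
    {
            /* legal only between rcu_read_lock()/rcu_read_unlock() */
            return rcu_dereference(dst->_neighbour);
    }

    static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
    {
            /* caller pins the neighbour by other means (here the
             * fib6 table lock), so skip the lockdep-RCU check */
            return rcu_dereference_raw(dst->_neighbour);
    }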
1469 static DEFINE_SPINLOCK(fib6_gc_lock); 1469 static DEFINE_SPINLOCK(fib6_gc_lock);
1470 1470
1471 void fib6_run_gc(unsigned long expires, struct net *net) 1471 void fib6_run_gc(unsigned long expires, struct net *net)
1472 { 1472 {
1473 if (expires != ~0UL) { 1473 if (expires != ~0UL) {
1474 spin_lock_bh(&fib6_gc_lock); 1474 spin_lock_bh(&fib6_gc_lock);
1475 gc_args.timeout = expires ? (int)expires : 1475 gc_args.timeout = expires ? (int)expires :
1476 net->ipv6.sysctl.ip6_rt_gc_interval; 1476 net->ipv6.sysctl.ip6_rt_gc_interval;
1477 } else { 1477 } else {
1478 if (!spin_trylock_bh(&fib6_gc_lock)) { 1478 if (!spin_trylock_bh(&fib6_gc_lock)) {
1479 mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); 1479 mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
1480 return; 1480 return;
1481 } 1481 }
1482 gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval; 1482 gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval;
1483 } 1483 }
1484 1484
1485 gc_args.more = icmp6_dst_gc(); 1485 gc_args.more = icmp6_dst_gc();
1486 1486
1487 fib6_clean_all(net, fib6_age, 0, NULL); 1487 fib6_clean_all(net, fib6_age, 0, NULL);
1488 1488
1489 if (gc_args.more) 1489 if (gc_args.more)
1490 mod_timer(&net->ipv6.ip6_fib_timer, 1490 mod_timer(&net->ipv6.ip6_fib_timer,
1491 round_jiffies(jiffies 1491 round_jiffies(jiffies
1492 + net->ipv6.sysctl.ip6_rt_gc_interval)); 1492 + net->ipv6.sysctl.ip6_rt_gc_interval));
1493 else 1493 else
1494 del_timer(&net->ipv6.ip6_fib_timer); 1494 del_timer(&net->ipv6.ip6_fib_timer);
1495 spin_unlock_bh(&fib6_gc_lock); 1495 spin_unlock_bh(&fib6_gc_lock);
1496 } 1496 }
1497 1497
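fib6_run_gc() encodes its calling convention in expires: an explicit timeout (or 0, meaning "use the ip6_rt_gc_interval sysctl") is a mandatory run and takes fib6_gc_lock unconditionally, whereas ~0UL asks for a best-effort run that must not wait, so it trylocks and, on contention, simply re-arms the one-shot timer. The idiom in isolation:

    /* Best-effort deferred work: never contend, retry later instead. */
    if (!spin_trylock_bh(&fib6_gc_lock)) {
            mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
            return;
    }
    /* ... collect garbage ... */
    spin_unlock_bh(&fib6_gc_lock);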
1498 static void fib6_gc_timer_cb(unsigned long arg) 1498 static void fib6_gc_timer_cb(unsigned long arg)
1499 { 1499 {
1500 fib6_run_gc(0, (struct net *)arg); 1500 fib6_run_gc(0, (struct net *)arg);
1501 } 1501 }
1502 1502
1503 static int __net_init fib6_net_init(struct net *net) 1503 static int __net_init fib6_net_init(struct net *net)
1504 { 1504 {
1505 size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; 1505 size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
1506 1506
1507 setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); 1507 setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
1508 1508
1509 net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); 1509 net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
1510 if (!net->ipv6.rt6_stats) 1510 if (!net->ipv6.rt6_stats)
1511 goto out_timer; 1511 goto out_timer;
1512 1512
1513 /* Avoid false sharing : Use at least a full cache line */ 1513 /* Avoid false sharing : Use at least a full cache line */
1514 size = max_t(size_t, size, L1_CACHE_BYTES); 1514 size = max_t(size_t, size, L1_CACHE_BYTES);
1515 1515
1516 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); 1516 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
1517 if (!net->ipv6.fib_table_hash) 1517 if (!net->ipv6.fib_table_hash)
1518 goto out_rt6_stats; 1518 goto out_rt6_stats;
1519 1519
1520 net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), 1520 net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
1521 GFP_KERNEL); 1521 GFP_KERNEL);
1522 if (!net->ipv6.fib6_main_tbl) 1522 if (!net->ipv6.fib6_main_tbl)
1523 goto out_fib_table_hash; 1523 goto out_fib_table_hash;
1524 1524
1525 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; 1525 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
1526 net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; 1526 net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1527 net->ipv6.fib6_main_tbl->tb6_root.fn_flags = 1527 net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
1528 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 1528 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1529 1529
1530 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1530 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1531 net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), 1531 net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
1532 GFP_KERNEL); 1532 GFP_KERNEL);
1533 if (!net->ipv6.fib6_local_tbl) 1533 if (!net->ipv6.fib6_local_tbl)
1534 goto out_fib6_main_tbl; 1534 goto out_fib6_main_tbl;
1535 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; 1535 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
1536 net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; 1536 net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1537 net->ipv6.fib6_local_tbl->tb6_root.fn_flags = 1537 net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
1538 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 1538 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1539 #endif 1539 #endif
1540 fib6_tables_init(net); 1540 fib6_tables_init(net);
1541 1541
1542 return 0; 1542 return 0;
1543 1543
1544 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1544 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1545 out_fib6_main_tbl: 1545 out_fib6_main_tbl:
1546 kfree(net->ipv6.fib6_main_tbl); 1546 kfree(net->ipv6.fib6_main_tbl);
1547 #endif 1547 #endif
1548 out_fib_table_hash: 1548 out_fib_table_hash:
1549 kfree(net->ipv6.fib_table_hash); 1549 kfree(net->ipv6.fib_table_hash);
1550 out_rt6_stats: 1550 out_rt6_stats:
1551 kfree(net->ipv6.rt6_stats); 1551 kfree(net->ipv6.rt6_stats);
1552 out_timer: 1552 out_timer:
1553 return -ENOMEM; 1553 return -ENOMEM;
1554 } 1554 }
1555 1555
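fib6_net_init() is a textbook instance of the kernel's goto-unwind error handling: resources are acquired in order, and each failure jumps to a label that releases, in reverse order, exactly what was acquired so far. Stripped to its skeleton (the fields a and b are hypothetical, not part of struct net):

    static int example_init(struct net *net)
    {
            net->a = kzalloc(sizeof(*net->a), GFP_KERNEL);
            if (!net->a)
                    goto out;

            net->b = kzalloc(sizeof(*net->b), GFP_KERNEL);
            if (!net->b)
                    goto out_free_a;

            return 0;

    out_free_a:
            kfree(net->a);
    out:
            return -ENOMEM;
    }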
1556 static void fib6_net_exit(struct net *net) 1556 static void fib6_net_exit(struct net *net)
1557 { 1557 {
1558 rt6_ifdown(net, NULL); 1558 rt6_ifdown(net, NULL);
1559 del_timer_sync(&net->ipv6.ip6_fib_timer); 1559 del_timer_sync(&net->ipv6.ip6_fib_timer);
1560 1560
1561 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1561 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1562 kfree(net->ipv6.fib6_local_tbl); 1562 kfree(net->ipv6.fib6_local_tbl);
1563 #endif 1563 #endif
1564 kfree(net->ipv6.fib6_main_tbl); 1564 kfree(net->ipv6.fib6_main_tbl);
1565 kfree(net->ipv6.fib_table_hash); 1565 kfree(net->ipv6.fib_table_hash);
1566 kfree(net->ipv6.rt6_stats); 1566 kfree(net->ipv6.rt6_stats);
1567 } 1567 }
1568 1568
1569 static struct pernet_operations fib6_net_ops = { 1569 static struct pernet_operations fib6_net_ops = {
1570 .init = fib6_net_init, 1570 .init = fib6_net_init,
1571 .exit = fib6_net_exit, 1571 .exit = fib6_net_exit,
1572 }; 1572 };
1573 1573
1574 int __init fib6_init(void) 1574 int __init fib6_init(void)
1575 { 1575 {
1576 int ret = -ENOMEM; 1576 int ret = -ENOMEM;
1577 1577
1578 fib6_node_kmem = kmem_cache_create("fib6_nodes", 1578 fib6_node_kmem = kmem_cache_create("fib6_nodes",
1579 sizeof(struct fib6_node), 1579 sizeof(struct fib6_node),
1580 0, SLAB_HWCACHE_ALIGN, 1580 0, SLAB_HWCACHE_ALIGN,
1581 NULL); 1581 NULL);
1582 if (!fib6_node_kmem) 1582 if (!fib6_node_kmem)
1583 goto out; 1583 goto out;
1584 1584
1585 ret = register_pernet_subsys(&fib6_net_ops); 1585 ret = register_pernet_subsys(&fib6_net_ops);
1586 if (ret) 1586 if (ret)
1587 goto out_kmem_cache_create; 1587 goto out_kmem_cache_create;
1588 1588
1589 ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 1589 ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
1590 NULL); 1590 NULL);
1591 if (ret) 1591 if (ret)
1592 goto out_unregister_subsys; 1592 goto out_unregister_subsys;
1593 out: 1593 out:
1594 return ret; 1594 return ret;
1595 1595
1596 out_unregister_subsys: 1596 out_unregister_subsys:
1597 unregister_pernet_subsys(&fib6_net_ops); 1597 unregister_pernet_subsys(&fib6_net_ops);
1598 out_kmem_cache_create: 1598 out_kmem_cache_create:
1599 kmem_cache_destroy(fib6_node_kmem); 1599 kmem_cache_destroy(fib6_node_kmem);
1600 goto out; 1600 goto out;
1601 } 1601 }
1602 1602
1603 void fib6_gc_cleanup(void) 1603 void fib6_gc_cleanup(void)
1604 { 1604 {
1605 unregister_pernet_subsys(&fib6_net_ops); 1605 unregister_pernet_subsys(&fib6_net_ops);
1606 kmem_cache_destroy(fib6_node_kmem); 1606 kmem_cache_destroy(fib6_node_kmem);
1607 } 1607 }
1608 1608
net/ipv6/ip6_output.c
1 /* 1 /*
2 * IPv6 output functions 2 * IPv6 output functions
3 * Linux INET6 implementation 3 * Linux INET6 implementation
4 * 4 *
5 * Authors: 5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt> 6 * Pedro Roque <roque@di.fc.ul.pt>
7 * 7 *
8 * Based on linux/net/ipv4/ip_output.c 8 * Based on linux/net/ipv4/ip_output.c
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version 12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 * 14 *
15 * Changes: 15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation. 16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented. 17 * extension headers are implemented.
18 * route changes now work. 18 * route changes now work.
19 * ip6_forward does not confuse sniffers. 19 * ip6_forward does not confuse sniffers.
20 * etc. 20 * etc.
21 * 21 *
22 * H. von Brand : Added missing #include <linux/string.h> 22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO 23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI 24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions 25 * : add ip6_append_data and related functions
26 * for datagram xmit 26 * for datagram xmit
27 */ 27 */
28 28
29 #include <linux/errno.h> 29 #include <linux/errno.h>
30 #include <linux/kernel.h> 30 #include <linux/kernel.h>
31 #include <linux/string.h> 31 #include <linux/string.h>
32 #include <linux/socket.h> 32 #include <linux/socket.h>
33 #include <linux/net.h> 33 #include <linux/net.h>
34 #include <linux/netdevice.h> 34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h> 35 #include <linux/if_arp.h>
36 #include <linux/in6.h> 36 #include <linux/in6.h>
37 #include <linux/tcp.h> 37 #include <linux/tcp.h>
38 #include <linux/route.h> 38 #include <linux/route.h>
39 #include <linux/module.h> 39 #include <linux/module.h>
40 #include <linux/slab.h> 40 #include <linux/slab.h>
41 41
42 #include <linux/netfilter.h> 42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h> 43 #include <linux/netfilter_ipv6.h>
44 44
45 #include <net/sock.h> 45 #include <net/sock.h>
46 #include <net/snmp.h> 46 #include <net/snmp.h>
47 47
48 #include <net/ipv6.h> 48 #include <net/ipv6.h>
49 #include <net/ndisc.h> 49 #include <net/ndisc.h>
50 #include <net/protocol.h> 50 #include <net/protocol.h>
51 #include <net/ip6_route.h> 51 #include <net/ip6_route.h>
52 #include <net/addrconf.h> 52 #include <net/addrconf.h>
53 #include <net/rawv6.h> 53 #include <net/rawv6.h>
54 #include <net/icmp.h> 54 #include <net/icmp.h>
55 #include <net/xfrm.h> 55 #include <net/xfrm.h>
56 #include <net/checksum.h> 56 #include <net/checksum.h>
57 #include <linux/mroute6.h> 57 #include <linux/mroute6.h>
58 58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); 59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 60
61 int __ip6_local_out(struct sk_buff *skb) 61 int __ip6_local_out(struct sk_buff *skb)
62 { 62 {
63 int len; 63 int len;
64 64
65 len = skb->len - sizeof(struct ipv6hdr); 65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN) 66 if (len > IPV6_MAXPLEN)
67 len = 0; 67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len); 68 ipv6_hdr(skb)->payload_len = htons(len);
69 69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, 70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output); 71 skb_dst(skb)->dev, dst_output);
72 } 72 }
73 73
74 int ip6_local_out(struct sk_buff *skb) 74 int ip6_local_out(struct sk_buff *skb)
75 { 75 {
76 int err; 76 int err;
77 77
78 err = __ip6_local_out(skb); 78 err = __ip6_local_out(skb);
79 if (likely(err == 1)) 79 if (likely(err == 1))
80 err = dst_output(skb); 80 err = dst_output(skb);
81 81
82 return err; 82 return err;
83 } 83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out); 84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 85
86 /* dev_loopback_xmit for use with netfilter. */ 86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb) 87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 { 88 {
89 skb_reset_mac_header(newskb); 89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb)); 90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK; 91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY; 92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb)); 93 WARN_ON(!skb_dst(newskb));
94 94
95 netif_rx_ni(newskb); 95 netif_rx_ni(newskb);
96 return 0; 96 return 0;
97 } 97 }
98 98
99 static int ip6_finish_output2(struct sk_buff *skb) 99 static int ip6_finish_output2(struct sk_buff *skb)
100 { 100 {
101 struct dst_entry *dst = skb_dst(skb); 101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev; 102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh; 103 struct neighbour *neigh;
104 104
105 skb->protocol = htons(ETH_P_IPV6); 105 skb->protocol = htons(ETH_P_IPV6);
106 skb->dev = dev; 106 skb->dev = dev;
107 107
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { 108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 110
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && 111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) && 112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, 114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) { 115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 117
118 /* Do not check for IFF_ALLMULTI; multicast routing 118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case. 119 is not supported in any case.
120 */ 120 */
121 if (newskb) 121 if (newskb)
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev, 123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit); 124 ip6_dev_loopback_xmit);
125 125
126 if (ipv6_hdr(skb)->hop_limit == 0) { 126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev, 127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS); 128 IPSTATS_MIB_OUTDISCARDS);
129 kfree_skb(skb); 129 kfree_skb(skb);
130 return 0; 130 return 0;
131 } 131 }
132 } 132 }
133 133
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, 134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 skb->len); 135 skb->len);
136 } 136 }
137 137
138 rcu_read_lock();
138 neigh = dst_get_neighbour(dst); 139 neigh = dst_get_neighbour(dst);
139 if (neigh) 140 if (neigh) {
140 return neigh_output(neigh, skb); 141 int res = neigh_output(neigh, skb);
141 142
143 rcu_read_unlock();
144 return res;
145 }
146 rcu_read_unlock();
142 IP6_INC_STATS_BH(dev_net(dst->dev), 147 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 kfree_skb(skb); 149 kfree_skb(skb);
145 return -EINVAL; 150 return -EINVAL;
146 } 151 }
147 152
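This hunk shows the shape of the whole fix on the IPv6 output path: the neighbour returned by dst_get_neighbour() is only guaranteed to stay alive while the caller remains inside an RCU read-side critical section, so the section must span both the lookup and the call that dereferences the pointer. Condensed:

    /* Before (racy): the neighbour may be replaced and freed
     * between the lookup and neigh_output().
     */
    neigh = dst_get_neighbour(dst);
    if (neigh)
            return neigh_output(neigh, skb);

    /* After: hold the RCU read lock across lookup *and* use. */
    rcu_read_lock();
    neigh = dst_get_neighbour(dst);
    if (neigh) {
            int res = neigh_output(neigh, skb);

            rcu_read_unlock();
            return res;
    }
    rcu_read_unlock();

On common configurations rcu_read_lock() compiles to nothing or to a per-CPU counter increment, so widening the critical section should cost the hot output path essentially nothing.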
148 static int ip6_finish_output(struct sk_buff *skb) 153 static int ip6_finish_output(struct sk_buff *skb)
149 { 154 {
150 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || 155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 dst_allfrag(skb_dst(skb))) 156 dst_allfrag(skb_dst(skb)))
152 return ip6_fragment(skb, ip6_finish_output2); 157 return ip6_fragment(skb, ip6_finish_output2);
153 else 158 else
154 return ip6_finish_output2(skb); 159 return ip6_finish_output2(skb);
155 } 160 }
156 161
157 int ip6_output(struct sk_buff *skb) 162 int ip6_output(struct sk_buff *skb)
158 { 163 {
159 struct net_device *dev = skb_dst(skb)->dev; 164 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 if (unlikely(idev->cnf.disable_ipv6)) { 166 if (unlikely(idev->cnf.disable_ipv6)) {
162 IP6_INC_STATS(dev_net(dev), idev, 167 IP6_INC_STATS(dev_net(dev), idev,
163 IPSTATS_MIB_OUTDISCARDS); 168 IPSTATS_MIB_OUTDISCARDS);
164 kfree_skb(skb); 169 kfree_skb(skb);
165 return 0; 170 return 0;
166 } 171 }
167 172
168 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, 173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169 ip6_finish_output, 174 ip6_finish_output,
170 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 } 176 }
172 177
173 /* 178 /*
174 * xmit an sk_buff (used by TCP, SCTP and DCCP) 179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
175 */ 180 */
176 181
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
178 struct ipv6_txoptions *opt) 183 struct ipv6_txoptions *opt)
179 { 184 {
180 struct net *net = sock_net(sk); 185 struct net *net = sock_net(sk);
181 struct ipv6_pinfo *np = inet6_sk(sk); 186 struct ipv6_pinfo *np = inet6_sk(sk);
182 struct in6_addr *first_hop = &fl6->daddr; 187 struct in6_addr *first_hop = &fl6->daddr;
183 struct dst_entry *dst = skb_dst(skb); 188 struct dst_entry *dst = skb_dst(skb);
184 struct ipv6hdr *hdr; 189 struct ipv6hdr *hdr;
185 u8 proto = fl6->flowi6_proto; 190 u8 proto = fl6->flowi6_proto;
186 int seg_len = skb->len; 191 int seg_len = skb->len;
187 int hlimit = -1; 192 int hlimit = -1;
188 int tclass = 0; 193 int tclass = 0;
189 u32 mtu; 194 u32 mtu;
190 195
191 if (opt) { 196 if (opt) {
192 unsigned int head_room; 197 unsigned int head_room;
193 198
194 /* First: exthdrs may take lots of space (~8K for now) 199 /* First: exthdrs may take lots of space (~8K for now)
195 MAX_HEADER is not enough. 200 MAX_HEADER is not enough.
196 */ 201 */
197 head_room = opt->opt_nflen + opt->opt_flen; 202 head_room = opt->opt_nflen + opt->opt_flen;
198 seg_len += head_room; 203 seg_len += head_room;
199 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); 204 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200 205
201 if (skb_headroom(skb) < head_room) { 206 if (skb_headroom(skb) < head_room) {
202 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); 207 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203 if (skb2 == NULL) { 208 if (skb2 == NULL) {
204 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 209 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 IPSTATS_MIB_OUTDISCARDS); 210 IPSTATS_MIB_OUTDISCARDS);
206 kfree_skb(skb); 211 kfree_skb(skb);
207 return -ENOBUFS; 212 return -ENOBUFS;
208 } 213 }
209 kfree_skb(skb); 214 kfree_skb(skb);
210 skb = skb2; 215 skb = skb2;
211 skb_set_owner_w(skb, sk); 216 skb_set_owner_w(skb, sk);
212 } 217 }
213 if (opt->opt_flen) 218 if (opt->opt_flen)
214 ipv6_push_frag_opts(skb, opt, &proto); 219 ipv6_push_frag_opts(skb, opt, &proto);
215 if (opt->opt_nflen) 220 if (opt->opt_nflen)
216 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); 221 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217 } 222 }
218 223
219 skb_push(skb, sizeof(struct ipv6hdr)); 224 skb_push(skb, sizeof(struct ipv6hdr));
220 skb_reset_network_header(skb); 225 skb_reset_network_header(skb);
221 hdr = ipv6_hdr(skb); 226 hdr = ipv6_hdr(skb);
222 227
223 /* 228 /*
224 * Fill in the IPv6 header 229 * Fill in the IPv6 header
225 */ 230 */
226 if (np) { 231 if (np) {
227 tclass = np->tclass; 232 tclass = np->tclass;
228 hlimit = np->hop_limit; 233 hlimit = np->hop_limit;
229 } 234 }
230 if (hlimit < 0) 235 if (hlimit < 0)
231 hlimit = ip6_dst_hoplimit(dst); 236 hlimit = ip6_dst_hoplimit(dst);
232 237
233 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel; 238 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
234 239
235 hdr->payload_len = htons(seg_len); 240 hdr->payload_len = htons(seg_len);
236 hdr->nexthdr = proto; 241 hdr->nexthdr = proto;
237 hdr->hop_limit = hlimit; 242 hdr->hop_limit = hlimit;
238 243
239 ipv6_addr_copy(&hdr->saddr, &fl6->saddr); 244 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
240 ipv6_addr_copy(&hdr->daddr, first_hop); 245 ipv6_addr_copy(&hdr->daddr, first_hop);
241 246
242 skb->priority = sk->sk_priority; 247 skb->priority = sk->sk_priority;
243 skb->mark = sk->sk_mark; 248 skb->mark = sk->sk_mark;
244 249
245 mtu = dst_mtu(dst); 250 mtu = dst_mtu(dst);
246 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { 251 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), 252 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 IPSTATS_MIB_OUT, skb->len); 253 IPSTATS_MIB_OUT, skb->len);
249 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, 254 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 dst->dev, dst_output); 255 dst->dev, dst_output);
251 } 256 }
252 257
253 if (net_ratelimit()) 258 if (net_ratelimit())
254 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); 259 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255 skb->dev = dst->dev; 260 skb->dev = dst->dev;
256 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 261 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); 262 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258 kfree_skb(skb); 263 kfree_skb(skb);
259 return -EMSGSIZE; 264 return -EMSGSIZE;
260 } 265 }
261 266
262 EXPORT_SYMBOL(ip6_xmit); 267 EXPORT_SYMBOL(ip6_xmit);
263 268
264 /* 269 /*
265 * To avoid extra problems ND packets are sent through this 270 * To avoid extra problems ND packets are sent through this
266 * routine. It's code duplication but I really want to avoid 271 * routine. It's code duplication but I really want to avoid
267 * extra checks since ipv6_build_header is used by TCP (which 272 * extra checks since ipv6_build_header is used by TCP (which
268 * is for us performance critical) 273 * is for us performance critical)
269 */ 274 */
270 275
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, 276 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 const struct in6_addr *saddr, const struct in6_addr *daddr, 277 const struct in6_addr *saddr, const struct in6_addr *daddr,
273 int proto, int len) 278 int proto, int len)
274 { 279 {
275 struct ipv6_pinfo *np = inet6_sk(sk); 280 struct ipv6_pinfo *np = inet6_sk(sk);
276 struct ipv6hdr *hdr; 281 struct ipv6hdr *hdr;
277 282
278 skb->protocol = htons(ETH_P_IPV6); 283 skb->protocol = htons(ETH_P_IPV6);
279 skb->dev = dev; 284 skb->dev = dev;
280 285
281 skb_reset_network_header(skb); 286 skb_reset_network_header(skb);
282 skb_put(skb, sizeof(struct ipv6hdr)); 287 skb_put(skb, sizeof(struct ipv6hdr));
283 hdr = ipv6_hdr(skb); 288 hdr = ipv6_hdr(skb);
284 289
285 *(__be32*)hdr = htonl(0x60000000); 290 *(__be32*)hdr = htonl(0x60000000);
286 291
287 hdr->payload_len = htons(len); 292 hdr->payload_len = htons(len);
288 hdr->nexthdr = proto; 293 hdr->nexthdr = proto;
289 hdr->hop_limit = np->hop_limit; 294 hdr->hop_limit = np->hop_limit;
290 295
291 ipv6_addr_copy(&hdr->saddr, saddr); 296 ipv6_addr_copy(&hdr->saddr, saddr);
292 ipv6_addr_copy(&hdr->daddr, daddr); 297 ipv6_addr_copy(&hdr->daddr, daddr);
293 298
294 return 0; 299 return 0;
295 } 300 }
296 301
297 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 302 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
298 { 303 {
299 struct ip6_ra_chain *ra; 304 struct ip6_ra_chain *ra;
300 struct sock *last = NULL; 305 struct sock *last = NULL;
301 306
302 read_lock(&ip6_ra_lock); 307 read_lock(&ip6_ra_lock);
303 for (ra = ip6_ra_chain; ra; ra = ra->next) { 308 for (ra = ip6_ra_chain; ra; ra = ra->next) {
304 struct sock *sk = ra->sk; 309 struct sock *sk = ra->sk;
305 if (sk && ra->sel == sel && 310 if (sk && ra->sel == sel &&
306 (!sk->sk_bound_dev_if || 311 (!sk->sk_bound_dev_if ||
307 sk->sk_bound_dev_if == skb->dev->ifindex)) { 312 sk->sk_bound_dev_if == skb->dev->ifindex)) {
308 if (last) { 313 if (last) {
309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 314 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
310 if (skb2) 315 if (skb2)
311 rawv6_rcv(last, skb2); 316 rawv6_rcv(last, skb2);
312 } 317 }
313 last = sk; 318 last = sk;
314 } 319 }
315 } 320 }
316 321
317 if (last) { 322 if (last) {
318 rawv6_rcv(last, skb); 323 rawv6_rcv(last, skb);
319 read_unlock(&ip6_ra_lock); 324 read_unlock(&ip6_ra_lock);
320 return 1; 325 return 1;
321 } 326 }
322 read_unlock(&ip6_ra_lock); 327 read_unlock(&ip6_ra_lock);
323 return 0; 328 return 0;
324 } 329 }
325 330
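ip6_call_ra_chain() uses the usual deliver-to-many trick: while scanning for matching Router Alert sockets it hands a clone to the previously found socket and remembers the current one, so the last match consumes the original skb and one clone is saved. The skeleton, with the matching and delivery logic abstracted away (matches() and deliver() are placeholders, not kernel APIs):

    struct sock *last = NULL;

    for (ra = ip6_ra_chain; ra; ra = ra->next) {
            if (!matches(ra, sel))
                    continue;
            if (last) {
                    struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                    if (skb2)
                            deliver(last, skb2);
            }
            last = ra->sk;
    }
    if (last) {
            deliver(last, skb);     /* original skb, no extra clone */
            return 1;
    }
    return 0;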
326 static int ip6_forward_proxy_check(struct sk_buff *skb) 331 static int ip6_forward_proxy_check(struct sk_buff *skb)
327 { 332 {
328 struct ipv6hdr *hdr = ipv6_hdr(skb); 333 struct ipv6hdr *hdr = ipv6_hdr(skb);
329 u8 nexthdr = hdr->nexthdr; 334 u8 nexthdr = hdr->nexthdr;
330 int offset; 335 int offset;
331 336
332 if (ipv6_ext_hdr(nexthdr)) { 337 if (ipv6_ext_hdr(nexthdr)) {
333 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr); 338 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
334 if (offset < 0) 339 if (offset < 0)
335 return 0; 340 return 0;
336 } else 341 } else
337 offset = sizeof(struct ipv6hdr); 342 offset = sizeof(struct ipv6hdr);
338 343
339 if (nexthdr == IPPROTO_ICMPV6) { 344 if (nexthdr == IPPROTO_ICMPV6) {
340 struct icmp6hdr *icmp6; 345 struct icmp6hdr *icmp6;
341 346
342 if (!pskb_may_pull(skb, (skb_network_header(skb) + 347 if (!pskb_may_pull(skb, (skb_network_header(skb) +
343 offset + 1 - skb->data))) 348 offset + 1 - skb->data)))
344 return 0; 349 return 0;
345 350
346 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 351 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
347 352
348 switch (icmp6->icmp6_type) { 353 switch (icmp6->icmp6_type) {
349 case NDISC_ROUTER_SOLICITATION: 354 case NDISC_ROUTER_SOLICITATION:
350 case NDISC_ROUTER_ADVERTISEMENT: 355 case NDISC_ROUTER_ADVERTISEMENT:
351 case NDISC_NEIGHBOUR_SOLICITATION: 356 case NDISC_NEIGHBOUR_SOLICITATION:
352 case NDISC_NEIGHBOUR_ADVERTISEMENT: 357 case NDISC_NEIGHBOUR_ADVERTISEMENT:
353 case NDISC_REDIRECT: 358 case NDISC_REDIRECT:
354 /* For reaction involving unicast neighbor discovery 359 /* For reaction involving unicast neighbor discovery
355 * message destined to the proxied address, pass it to 360 * message destined to the proxied address, pass it to
356 * input function. 361 * input function.
357 */ 362 */
358 return 1; 363 return 1;
359 default: 364 default:
360 break; 365 break;
361 } 366 }
362 } 367 }
363 368
364 /* 369 /*
365 * The proxying router can't forward traffic sent to a link-local 370 * The proxying router can't forward traffic sent to a link-local
366 * address, so signal the sender and discard the packet. This 371 * address, so signal the sender and discard the packet. This
367 * behavior is clarified by the MIPv6 specification. 372 * behavior is clarified by the MIPv6 specification.
368 */ 373 */
369 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 374 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
370 dst_link_failure(skb); 375 dst_link_failure(skb);
371 return -1; 376 return -1;
372 } 377 }
373 378
374 return 0; 379 return 0;
375 } 380 }
376 381
377 static inline int ip6_forward_finish(struct sk_buff *skb) 382 static inline int ip6_forward_finish(struct sk_buff *skb)
378 { 383 {
379 return dst_output(skb); 384 return dst_output(skb);
380 } 385 }
381 386
382 int ip6_forward(struct sk_buff *skb) 387 int ip6_forward(struct sk_buff *skb)
383 { 388 {
384 struct dst_entry *dst = skb_dst(skb); 389 struct dst_entry *dst = skb_dst(skb);
385 struct ipv6hdr *hdr = ipv6_hdr(skb); 390 struct ipv6hdr *hdr = ipv6_hdr(skb);
386 struct inet6_skb_parm *opt = IP6CB(skb); 391 struct inet6_skb_parm *opt = IP6CB(skb);
387 struct net *net = dev_net(dst->dev); 392 struct net *net = dev_net(dst->dev);
388 struct neighbour *n; 393 struct neighbour *n;
389 u32 mtu; 394 u32 mtu;
390 395
391 if (net->ipv6.devconf_all->forwarding == 0) 396 if (net->ipv6.devconf_all->forwarding == 0)
392 goto error; 397 goto error;
393 398
394 if (skb_warn_if_lro(skb)) 399 if (skb_warn_if_lro(skb))
395 goto drop; 400 goto drop;
396 401
397 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 402 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
398 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); 403 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
399 goto drop; 404 goto drop;
400 } 405 }
401 406
402 if (skb->pkt_type != PACKET_HOST) 407 if (skb->pkt_type != PACKET_HOST)
403 goto drop; 408 goto drop;
404 409
405 skb_forward_csum(skb); 410 skb_forward_csum(skb);
406 411
407 /* 412 /*
408 * We DO NOT make any processing on 413 * We DO NOT make any processing on
409 * RA packets, pushing them to user level AS IS 414 * RA packets, pushing them to user level AS IS
410 * without any WARRANTY that application will be able 415 * without any WARRANTY that application will be able
411 * to interpret them. The reason is that we 416 * to interpret them. The reason is that we
412 * cannot make anything clever here. 417 * cannot make anything clever here.
413 * 418 *
414 * We are not end-node, so that if packet contains 419 * We are not end-node, so that if packet contains
415 * AH/ESP, we cannot make anything. 420 * AH/ESP, we cannot make anything.
416 * Defragmentation also would be a mistake, RA packets 421 * Defragmentation also would be a mistake, RA packets
417 * cannot be fragmented, because there is no warranty 422 * cannot be fragmented, because there is no warranty
418 * that different fragments will go along one path. --ANK 423 * that different fragments will go along one path. --ANK
419 */ 424 */
420 if (opt->ra) { 425 if (opt->ra) {
421 u8 *ptr = skb_network_header(skb) + opt->ra; 426 u8 *ptr = skb_network_header(skb) + opt->ra;
422 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) 427 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
423 return 0; 428 return 0;
424 } 429 }
425 430
426 /* 431 /*
427 * check and decrement ttl 432 * check and decrement ttl
428 */ 433 */
429 if (hdr->hop_limit <= 1) { 434 if (hdr->hop_limit <= 1) {
430 /* Force OUTPUT device used as source address */ 435 /* Force OUTPUT device used as source address */
431 skb->dev = dst->dev; 436 skb->dev = dst->dev;
432 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 437 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
433 IP6_INC_STATS_BH(net, 438 IP6_INC_STATS_BH(net,
434 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); 439 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
435 440
436 kfree_skb(skb); 441 kfree_skb(skb);
437 return -ETIMEDOUT; 442 return -ETIMEDOUT;
438 } 443 }
439 444
440 /* XXX: idev->cnf.proxy_ndp? */ 445 /* XXX: idev->cnf.proxy_ndp? */
441 if (net->ipv6.devconf_all->proxy_ndp && 446 if (net->ipv6.devconf_all->proxy_ndp &&
442 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { 447 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
443 int proxied = ip6_forward_proxy_check(skb); 448 int proxied = ip6_forward_proxy_check(skb);
444 if (proxied > 0) 449 if (proxied > 0)
445 return ip6_input(skb); 450 return ip6_input(skb);
446 else if (proxied < 0) { 451 else if (proxied < 0) {
447 IP6_INC_STATS(net, ip6_dst_idev(dst), 452 IP6_INC_STATS(net, ip6_dst_idev(dst),
448 IPSTATS_MIB_INDISCARDS); 453 IPSTATS_MIB_INDISCARDS);
449 goto drop; 454 goto drop;
450 } 455 }
451 } 456 }
452 457
453 if (!xfrm6_route_forward(skb)) { 458 if (!xfrm6_route_forward(skb)) {
454 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); 459 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
455 goto drop; 460 goto drop;
456 } 461 }
457 dst = skb_dst(skb); 462 dst = skb_dst(skb);
458 463
459 /* IPv6 specs say nothing about it, but it is clear that we cannot 464 /* IPv6 specs say nothing about it, but it is clear that we cannot
460 send redirects to source routed frames. 465 send redirects to source routed frames.
461 We don't send redirects to frames decapsulated from IPsec. 466 We don't send redirects to frames decapsulated from IPsec.
462 */ 467 */
463 n = dst_get_neighbour(dst); 468 n = dst_get_neighbour(dst);
464 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) { 469 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 struct in6_addr *target = NULL; 470 struct in6_addr *target = NULL;
466 struct rt6_info *rt; 471 struct rt6_info *rt;
467 472
468 /* 473 /*
469 * incoming and outgoing devices are the same 474 * incoming and outgoing devices are the same
470 * send a redirect. 475 * send a redirect.
471 */ 476 */
472 477
473 rt = (struct rt6_info *) dst; 478 rt = (struct rt6_info *) dst;
474 if ((rt->rt6i_flags & RTF_GATEWAY)) 479 if ((rt->rt6i_flags & RTF_GATEWAY))
475 target = (struct in6_addr*)&n->primary_key; 480 target = (struct in6_addr*)&n->primary_key;
476 else 481 else
477 target = &hdr->daddr; 482 target = &hdr->daddr;
478 483
479 if (!rt->rt6i_peer) 484 if (!rt->rt6i_peer)
480 rt6_bind_peer(rt, 1); 485 rt6_bind_peer(rt, 1);
481 486
482 /* Limit redirects both by destination (here) 487 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect) 488 and by source (inside ndisc_send_redirect)
484 */ 489 */
485 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) 490 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 ndisc_send_redirect(skb, n, target); 491 ndisc_send_redirect(skb, n, target);
487 } else { 492 } else {
488 int addrtype = ipv6_addr_type(&hdr->saddr); 493 int addrtype = ipv6_addr_type(&hdr->saddr);
489 494
490 /* This check is security critical. */ 495 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY || 496 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 497 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 goto error; 498 goto error;
494 if (addrtype & IPV6_ADDR_LINKLOCAL) { 499 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 500 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0); 501 ICMPV6_NOT_NEIGHBOUR, 0);
497 goto error; 502 goto error;
498 } 503 }
499 } 504 }
500 505
501 mtu = dst_mtu(dst); 506 mtu = dst_mtu(dst);
502 if (mtu < IPV6_MIN_MTU) 507 if (mtu < IPV6_MIN_MTU)
503 mtu = IPV6_MIN_MTU; 508 mtu = IPV6_MIN_MTU;
504 509
505 if (skb->len > mtu && !skb_is_gso(skb)) { 510 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */ 511 /* Again, force OUTPUT device used as source address */
507 skb->dev = dst->dev; 512 skb->dev = dst->dev;
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 513 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net, 514 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); 515 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net, 516 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); 517 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 kfree_skb(skb); 518 kfree_skb(skb);
514 return -EMSGSIZE; 519 return -EMSGSIZE;
515 } 520 }
516 521
517 if (skb_cow(skb, dst->dev->hard_header_len)) { 522 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); 523 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 goto drop; 524 goto drop;
520 } 525 }
521 526
522 hdr = ipv6_hdr(skb); 527 hdr = ipv6_hdr(skb);
523 528
524 /* Mangling hops number delayed to point after skb COW */ 529 /* Mangling hops number delayed to point after skb COW */
525 530
526 hdr->hop_limit--; 531 hdr->hop_limit--;
527 532
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, 534 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 ip6_forward_finish); 535 ip6_forward_finish);
531 536
532 error: 537 error:
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); 538 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop: 539 drop:
535 kfree_skb(skb); 540 kfree_skb(skb);
536 return -EINVAL; 541 return -EINVAL;
537 } 542 }
538 543
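Note that ip6_forward() keeps calling dst_get_neighbour() without an rcu_read_lock() of its own. That is presumably fine because forwarding runs on the receive path, which the network core already executes inside an RCU read-side critical section, so the dereference is protected by the enclosing section rather than a local one. Roughly (an assumed call chain, for orientation only):

    /*
     * net_rx_action() / netif_receive_skb()   <- RCU read side entered
     *   -> ipv6_rcv()
     *     -> ip6_forward()
     *       -> dst_get_neighbour(dst)         <- already protected
     */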
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 { 545 {
541 to->pkt_type = from->pkt_type; 546 to->pkt_type = from->pkt_type;
542 to->priority = from->priority; 547 to->priority = from->priority;
543 to->protocol = from->protocol; 548 to->protocol = from->protocol;
544 skb_dst_drop(to); 549 skb_dst_drop(to);
545 skb_dst_set(to, dst_clone(skb_dst(from))); 550 skb_dst_set(to, dst_clone(skb_dst(from)));
546 to->dev = from->dev; 551 to->dev = from->dev;
547 to->mark = from->mark; 552 to->mark = from->mark;
548 553
549 #ifdef CONFIG_NET_SCHED 554 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index; 555 to->tc_index = from->tc_index;
551 #endif 556 #endif
552 nf_copy(to, from); 557 nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) 559 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace; 560 to->nf_trace = from->nf_trace;
556 #endif 561 #endif
557 skb_copy_secmark(to, from); 562 skb_copy_secmark(to, from);
558 } 563 }
559 564
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 { 566 {
562 u16 offset = sizeof(struct ipv6hdr); 567 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr = 568 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); 569 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header; 570 unsigned int packet_len = skb->tail - skb->network_header;
566 int found_rhdr = 0; 571 int found_rhdr = 0;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr; 572 *nexthdr = &ipv6_hdr(skb)->nexthdr;
568 573
569 while (offset + 1 <= packet_len) { 574 while (offset + 1 <= packet_len) {
570 575
571 switch (**nexthdr) { 576 switch (**nexthdr) {
572 577
573 case NEXTHDR_HOP: 578 case NEXTHDR_HOP:
574 break; 579 break;
575 case NEXTHDR_ROUTING: 580 case NEXTHDR_ROUTING:
576 found_rhdr = 1; 581 found_rhdr = 1;
577 break; 582 break;
578 case NEXTHDR_DEST: 583 case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) 585 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 break; 586 break;
582 #endif 587 #endif
583 if (found_rhdr) 588 if (found_rhdr)
584 return offset; 589 return offset;
585 break; 590 break;
586 default : 591 default :
587 return offset; 592 return offset;
588 } 593 }
589 594
590 offset += ipv6_optlen(exthdr); 595 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr; 596 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + 597 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 offset); 598 offset);
594 } 599 }
595 600
596 return offset; 601 return offset;
597 } 602 }
598 603
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) 604 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 { 605 {
601 static atomic_t ipv6_fragmentation_id; 606 static atomic_t ipv6_fragmentation_id;
602 int old, new; 607 int old, new;
603 608
604 if (rt) { 609 if (rt) {
605 struct inet_peer *peer; 610 struct inet_peer *peer;
606 611
607 if (!rt->rt6i_peer) 612 if (!rt->rt6i_peer)
608 rt6_bind_peer(rt, 1); 613 rt6_bind_peer(rt, 1);
609 peer = rt->rt6i_peer; 614 peer = rt->rt6i_peer;
610 if (peer) { 615 if (peer) {
611 fhdr->identification = htonl(inet_getid(peer, 0)); 616 fhdr->identification = htonl(inet_getid(peer, 0));
612 return; 617 return;
613 } 618 }
614 } 619 }
615 do { 620 do {
616 old = atomic_read(&ipv6_fragmentation_id); 621 old = atomic_read(&ipv6_fragmentation_id);
617 new = old + 1; 622 new = old + 1;
618 if (!new) 623 if (!new)
619 new = 1; 624 new = 1;
620 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old); 625 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621 fhdr->identification = htonl(new); 626 fhdr->identification = htonl(new);
622 } 627 }
623 628
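When no inet_peer is bound to the route, ipv6_select_ident() falls back to a global lock-free counter, and the cmpxchg loop deliberately skips 0: in ip6_fragment() below, frag_id == 0 means "no identification chosen yet", so 0 must never be handed out as a real ID. The same loop as a runnable user-space C11 analogue:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint32_t frag_id_counter;

    /* Wrapping counter that never yields 0, mirroring the
     * atomic_cmpxchg() loop in ipv6_select_ident().
     */
    static uint32_t next_frag_id(void)
    {
            uint32_t old = atomic_load(&frag_id_counter);
            uint32_t next;

            do {
                    next = old + 1;
                    if (!next)      /* skip 0: it means "unassigned" */
                            next = 1;
            } while (!atomic_compare_exchange_weak(&frag_id_counter,
                                                   &old, next));
            return next;
    }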
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 629 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 { 630 {
626 struct sk_buff *frag; 631 struct sk_buff *frag;
627 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); 632 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; 633 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 struct ipv6hdr *tmp_hdr; 634 struct ipv6hdr *tmp_hdr;
630 struct frag_hdr *fh; 635 struct frag_hdr *fh;
631 unsigned int mtu, hlen, left, len; 636 unsigned int mtu, hlen, left, len;
632 __be32 frag_id = 0; 637 __be32 frag_id = 0;
633 int ptr, offset = 0, err=0; 638 int ptr, offset = 0, err=0;
634 u8 *prevhdr, nexthdr = 0; 639 u8 *prevhdr, nexthdr = 0;
635 struct net *net = dev_net(skb_dst(skb)->dev); 640 struct net *net = dev_net(skb_dst(skb)->dev);
636 641
637 hlen = ip6_find_1stfragopt(skb, &prevhdr); 642 hlen = ip6_find_1stfragopt(skb, &prevhdr);
638 nexthdr = *prevhdr; 643 nexthdr = *prevhdr;
639 644
640 mtu = ip6_skb_dst_mtu(skb); 645 mtu = ip6_skb_dst_mtu(skb);
641 646
642 /* We must not fragment if the socket is set to force MTU discovery 647 /* We must not fragment if the socket is set to force MTU discovery
643 * or if the skb is not generated by a local socket. 648 * or if the skb is not generated by a local socket.
644 */ 649 */
645 if (!skb->local_df && skb->len > mtu) { 650 if (!skb->local_df && skb->len > mtu) {
646 skb->dev = skb_dst(skb)->dev; 651 skb->dev = skb_dst(skb)->dev;
647 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 652 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
648 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 653 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
649 IPSTATS_MIB_FRAGFAILS); 654 IPSTATS_MIB_FRAGFAILS);
650 kfree_skb(skb); 655 kfree_skb(skb);
651 return -EMSGSIZE; 656 return -EMSGSIZE;
652 } 657 }
653 658
654 if (np && np->frag_size < mtu) { 659 if (np && np->frag_size < mtu) {
655 if (np->frag_size) 660 if (np->frag_size)
656 mtu = np->frag_size; 661 mtu = np->frag_size;
657 } 662 }
658 mtu -= hlen + sizeof(struct frag_hdr); 663 mtu -= hlen + sizeof(struct frag_hdr);
659 664
660 if (skb_has_frag_list(skb)) { 665 if (skb_has_frag_list(skb)) {
661 int first_len = skb_pagelen(skb); 666 int first_len = skb_pagelen(skb);
662 struct sk_buff *frag2; 667 struct sk_buff *frag2;
663 668
664 if (first_len - hlen > mtu || 669 if (first_len - hlen > mtu ||
665 ((first_len - hlen) & 7) || 670 ((first_len - hlen) & 7) ||
666 skb_cloned(skb)) 671 skb_cloned(skb))
667 goto slow_path; 672 goto slow_path;
668 673
669 skb_walk_frags(skb, frag) { 674 skb_walk_frags(skb, frag) {
670 /* Correct geometry. */ 675 /* Correct geometry. */
671 if (frag->len > mtu || 676 if (frag->len > mtu ||
672 ((frag->len & 7) && frag->next) || 677 ((frag->len & 7) && frag->next) ||
673 skb_headroom(frag) < hlen) 678 skb_headroom(frag) < hlen)
674 goto slow_path_clean; 679 goto slow_path_clean;
675 680
676 /* Partially cloned skb? */ 681 /* Partially cloned skb? */
677 if (skb_shared(frag)) 682 if (skb_shared(frag))
678 goto slow_path_clean; 683 goto slow_path_clean;
679 684
680 BUG_ON(frag->sk); 685 BUG_ON(frag->sk);
681 if (skb->sk) { 686 if (skb->sk) {
682 frag->sk = skb->sk; 687 frag->sk = skb->sk;
683 frag->destructor = sock_wfree; 688 frag->destructor = sock_wfree;
684 } 689 }
685 skb->truesize -= frag->truesize; 690 skb->truesize -= frag->truesize;
686 } 691 }
687 692
688 err = 0; 693 err = 0;
689 offset = 0; 694 offset = 0;
690 frag = skb_shinfo(skb)->frag_list; 695 frag = skb_shinfo(skb)->frag_list;
691 skb_frag_list_init(skb); 696 skb_frag_list_init(skb);
692 /* BUILD HEADER */ 697 /* BUILD HEADER */
693 698
694 *prevhdr = NEXTHDR_FRAGMENT; 699 *prevhdr = NEXTHDR_FRAGMENT;
695 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 700 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
696 if (!tmp_hdr) { 701 if (!tmp_hdr) {
697 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 702 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
698 IPSTATS_MIB_FRAGFAILS); 703 IPSTATS_MIB_FRAGFAILS);
699 return -ENOMEM; 704 return -ENOMEM;
700 } 705 }
701 706
702 __skb_pull(skb, hlen); 707 __skb_pull(skb, hlen);
703 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); 708 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
704 __skb_push(skb, hlen); 709 __skb_push(skb, hlen);
705 skb_reset_network_header(skb); 710 skb_reset_network_header(skb);
706 memcpy(skb_network_header(skb), tmp_hdr, hlen); 711 memcpy(skb_network_header(skb), tmp_hdr, hlen);
707 712
708 ipv6_select_ident(fh, rt); 713 ipv6_select_ident(fh, rt);
709 fh->nexthdr = nexthdr; 714 fh->nexthdr = nexthdr;
710 fh->reserved = 0; 715 fh->reserved = 0;
711 fh->frag_off = htons(IP6_MF); 716 fh->frag_off = htons(IP6_MF);
712 frag_id = fh->identification; 717 frag_id = fh->identification;
713 718
714 first_len = skb_pagelen(skb); 719 first_len = skb_pagelen(skb);
715 skb->data_len = first_len - skb_headlen(skb); 720 skb->data_len = first_len - skb_headlen(skb);
716 skb->len = first_len; 721 skb->len = first_len;
717 ipv6_hdr(skb)->payload_len = htons(first_len - 722 ipv6_hdr(skb)->payload_len = htons(first_len -
718 sizeof(struct ipv6hdr)); 723 sizeof(struct ipv6hdr));
719 724
720 dst_hold(&rt->dst); 725 dst_hold(&rt->dst);
721 726
722 for (;;) { 727 for (;;) {
723 /* Prepare header of the next frame, 728 /* Prepare header of the next frame,
724 * before previous one went down. */ 729 * before previous one went down. */
725 if (frag) { 730 if (frag) {
726 frag->ip_summed = CHECKSUM_NONE; 731 frag->ip_summed = CHECKSUM_NONE;
727 skb_reset_transport_header(frag); 732 skb_reset_transport_header(frag);
728 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); 733 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
729 __skb_push(frag, hlen); 734 __skb_push(frag, hlen);
730 skb_reset_network_header(frag); 735 skb_reset_network_header(frag);
731 memcpy(skb_network_header(frag), tmp_hdr, 736 memcpy(skb_network_header(frag), tmp_hdr,
732 hlen); 737 hlen);
733 offset += skb->len - hlen - sizeof(struct frag_hdr); 738 offset += skb->len - hlen - sizeof(struct frag_hdr);
734 fh->nexthdr = nexthdr; 739 fh->nexthdr = nexthdr;
735 fh->reserved = 0; 740 fh->reserved = 0;
736 fh->frag_off = htons(offset); 741 fh->frag_off = htons(offset);
737 if (frag->next != NULL) 742 if (frag->next != NULL)
738 fh->frag_off |= htons(IP6_MF); 743 fh->frag_off |= htons(IP6_MF);
739 fh->identification = frag_id; 744 fh->identification = frag_id;
740 ipv6_hdr(frag)->payload_len = 745 ipv6_hdr(frag)->payload_len =
741 htons(frag->len - 746 htons(frag->len -
742 sizeof(struct ipv6hdr)); 747 sizeof(struct ipv6hdr));
743 ip6_copy_metadata(frag, skb); 748 ip6_copy_metadata(frag, skb);
744 } 749 }
745 750
746 err = output(skb); 751 err = output(skb);
747 if(!err) 752 if(!err)
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 753 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749 IPSTATS_MIB_FRAGCREATES); 754 IPSTATS_MIB_FRAGCREATES);
750 755
751 if (err || !frag) 756 if (err || !frag)
752 break; 757 break;
753 758
754 skb = frag; 759 skb = frag;
755 frag = skb->next; 760 frag = skb->next;
756 skb->next = NULL; 761 skb->next = NULL;
757 } 762 }
758 763
759 kfree(tmp_hdr); 764 kfree(tmp_hdr);
760 765
761 if (err == 0) { 766 if (err == 0) {
762 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 767 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 IPSTATS_MIB_FRAGOKS); 768 IPSTATS_MIB_FRAGOKS);
764 dst_release(&rt->dst); 769 dst_release(&rt->dst);
765 return 0; 770 return 0;
766 } 771 }
767 772
768 while (frag) { 773 while (frag) {
769 skb = frag->next; 774 skb = frag->next;
770 kfree_skb(frag); 775 kfree_skb(frag);
771 frag = skb; 776 frag = skb;
772 } 777 }
773 778
774 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 779 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
775 IPSTATS_MIB_FRAGFAILS); 780 IPSTATS_MIB_FRAGFAILS);
776 dst_release(&rt->dst); 781 dst_release(&rt->dst);
777 return err; 782 return err;
778 783
779 slow_path_clean: 784 slow_path_clean:
780 skb_walk_frags(skb, frag2) { 785 skb_walk_frags(skb, frag2) {
781 if (frag2 == frag) 786 if (frag2 == frag)
782 break; 787 break;
783 frag2->sk = NULL; 788 frag2->sk = NULL;
784 frag2->destructor = NULL; 789 frag2->destructor = NULL;
785 skb->truesize += frag2->truesize; 790 skb->truesize += frag2->truesize;
786 } 791 }
787 } 792 }
788 793
789 slow_path: 794 slow_path:
790 left = skb->len - hlen; /* Space per frame */ 795 left = skb->len - hlen; /* Space per frame */
791 ptr = hlen; /* Where to start from */ 796 ptr = hlen; /* Where to start from */
792 797
793 /* 798 /*
794 * Fragment the datagram. 799 * Fragment the datagram.
795 */ 800 */
796 801
797 *prevhdr = NEXTHDR_FRAGMENT; 802 *prevhdr = NEXTHDR_FRAGMENT;
798 803
799 /* 804 /*
800 * Keep copying data until we run out. 805 * Keep copying data until we run out.
801 */ 806 */
802 while(left > 0) { 807 while(left > 0) {
803 len = left; 808 len = left;
804 /* IF: it doesn't fit, use 'mtu' - the data space left */ 809 /* IF: it doesn't fit, use 'mtu' - the data space left */
805 if (len > mtu) 810 if (len > mtu)
806 len = mtu; 811 len = mtu;
807 /* IF: we are not sending up to and including the packet end 812 /* IF: we are not sending up to and including the packet end
808 then align the next start on an eight byte boundary */ 813 then align the next start on an eight byte boundary */
809 if (len < left) { 814 if (len < left) {
810 len &= ~7; 815 len &= ~7;
811 } 816 }
812 /* 817 /*
813 * Allocate buffer. 818 * Allocate buffer.
814 */ 819 */
815 820
816 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) { 821 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
817 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); 822 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
818 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 823 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
819 IPSTATS_MIB_FRAGFAILS); 824 IPSTATS_MIB_FRAGFAILS);
820 err = -ENOMEM; 825 err = -ENOMEM;
821 goto fail; 826 goto fail;
822 } 827 }
823 828
824 /* 829 /*
825 * Set up data on packet 830 * Set up data on packet
826 */ 831 */
827 832
828 ip6_copy_metadata(frag, skb); 833 ip6_copy_metadata(frag, skb);
829 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev)); 834 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
830 skb_put(frag, len + hlen + sizeof(struct frag_hdr)); 835 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
831 skb_reset_network_header(frag); 836 skb_reset_network_header(frag);
832 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); 837 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
833 frag->transport_header = (frag->network_header + hlen + 838 frag->transport_header = (frag->network_header + hlen +
834 sizeof(struct frag_hdr)); 839 sizeof(struct frag_hdr));
835 840
836 /* 841 /*
837 * Charge the memory for the fragment to any owner 842 * Charge the memory for the fragment to any owner
838 * it might possess 843 * it might possess
839 */ 844 */
840 if (skb->sk) 845 if (skb->sk)
841 skb_set_owner_w(frag, skb->sk); 846 skb_set_owner_w(frag, skb->sk);
842 847
843 /* 848 /*
844 * Copy the packet header into the new buffer. 849 * Copy the packet header into the new buffer.
845 */ 850 */
846 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); 851 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
847 852
848 /* 853 /*
849 * Build fragment header. 854 * Build fragment header.
850 */ 855 */
851 fh->nexthdr = nexthdr; 856 fh->nexthdr = nexthdr;
852 fh->reserved = 0; 857 fh->reserved = 0;
853 if (!frag_id) { 858 if (!frag_id) {
854 ipv6_select_ident(fh, rt); 859 ipv6_select_ident(fh, rt);
855 frag_id = fh->identification; 860 frag_id = fh->identification;
856 } else 861 } else
857 fh->identification = frag_id; 862 fh->identification = frag_id;
858 863
859 /* 864 /*
860 * Copy a block of the IP datagram. 865 * Copy a block of the IP datagram.
861 */ 866 */
862 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) 867 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
863 BUG(); 868 BUG();
864 left -= len; 869 left -= len;
865 870
866 fh->frag_off = htons(offset); 871 fh->frag_off = htons(offset);
867 if (left > 0) 872 if (left > 0)
868 fh->frag_off |= htons(IP6_MF); 873 fh->frag_off |= htons(IP6_MF);
869 ipv6_hdr(frag)->payload_len = htons(frag->len - 874 ipv6_hdr(frag)->payload_len = htons(frag->len -
870 sizeof(struct ipv6hdr)); 875 sizeof(struct ipv6hdr));
871 876
872 ptr += len; 877 ptr += len;
873 offset += len; 878 offset += len;
874 879
875 /* 880 /*
876 * Put this fragment into the sending queue. 881 * Put this fragment into the sending queue.
877 */ 882 */
878 err = output(frag); 883 err = output(frag);
879 if (err) 884 if (err)
880 goto fail; 885 goto fail;
881 886
882 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
883 IPSTATS_MIB_FRAGCREATES); 888 IPSTATS_MIB_FRAGCREATES);
884 } 889 }
885 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 890 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886 IPSTATS_MIB_FRAGOKS); 891 IPSTATS_MIB_FRAGOKS);
887 kfree_skb(skb); 892 kfree_skb(skb);
888 return err; 893 return err;
889 894
890 fail: 895 fail:
891 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 896 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
892 IPSTATS_MIB_FRAGFAILS); 897 IPSTATS_MIB_FRAGFAILS);
893 kfree_skb(skb); 898 kfree_skb(skb);
894 return err; 899 return err;
895 } 900 }
896 901
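The size arithmetic in ip6_fragment() rewards a worked example. mtu is reduced up front by hlen + sizeof(struct frag_hdr), and every non-final fragment payload is rounded down to a multiple of 8 because the fragment header's offset field counts 8-byte units. With a 1500-byte link MTU and a bare 40-byte IPv6 header, the budget is 1500 - 40 - 8 = 1452 bytes, and non-final fragments carry 1452 & ~7 = 1448. A helper restating the slow path's length computation:

    /* Payload for the next fragment; mirrors the slow path above. */
    static unsigned int frag_payload_len(unsigned int budget, /* mtu - hlen - 8 */
                                         unsigned int left)   /* bytes remaining */
    {
            unsigned int len = left;

            if (len > budget)
                    len = budget;
            if (len < left)         /* not the last fragment */
                    len &= ~7U;     /* offsets count 8-byte units */
            return len;
    }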
897 static inline int ip6_rt_check(const struct rt6key *rt_key, 902 static inline int ip6_rt_check(const struct rt6key *rt_key,
898 const struct in6_addr *fl_addr, 903 const struct in6_addr *fl_addr,
899 const struct in6_addr *addr_cache) 904 const struct in6_addr *addr_cache)
900 { 905 {
901 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 906 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
902 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); 907 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
903 } 908 }
904 909
905 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
906 struct dst_entry *dst, 911 struct dst_entry *dst,
907 const struct flowi6 *fl6) 912 const struct flowi6 *fl6)
908 { 913 {
909 struct ipv6_pinfo *np = inet6_sk(sk); 914 struct ipv6_pinfo *np = inet6_sk(sk);
910 struct rt6_info *rt = (struct rt6_info *)dst; 915 struct rt6_info *rt = (struct rt6_info *)dst;
911 916
912 if (!dst) 917 if (!dst)
913 goto out; 918 goto out;
914 919
920 /* Yes, checking route validity in the not-connected 925 /* Yes, checking route validity in the not-connected
916 * case is not very simple. Take into account, 921 * case is not very simple. Take into account,
917 * that we do not support routing by source, TOS, 922 * that we do not support routing by source, TOS,
918 * and MSG_DONTROUTE --ANK (980726) 923 * and MSG_DONTROUTE --ANK (980726)
919 * 924 *
920 * 1. ip6_rt_check(): If route was host route, 925 * 1. ip6_rt_check(): If route was host route,
921 * check that cached destination is current. 926 * check that cached destination is current.
922 * If it is network route, we still may 927 * If it is network route, we still may
923 * check its validity using saved pointer 928 * check its validity using saved pointer
924 * to the last used address: daddr_cache. 929 * to the last used address: daddr_cache.
925 * We do not want to save the whole address now, 930 * We do not want to save the whole address now,
926 * (because the main consumer of this service 931 * (because the main consumer of this service
927 * is TCP, which does not have this problem), 932 * is TCP, which does not have this problem),
928 * so that the last trick works only on connected 933 * so that the last trick works only on connected
929 * sockets. 934 * sockets.
930 * 2. oif also should be the same. 935 * 2. oif also should be the same.
931 */ 936 */
932 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 937 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES 938 #ifdef CONFIG_IPV6_SUBTREES
934 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 939 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif 940 #endif
936 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { 941 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
937 dst_release(dst); 942 dst_release(dst);
938 dst = NULL; 943 dst = NULL;
939 } 944 }
940 945
941 out: 946 out:
942 return dst; 947 return dst;
943 } 948 }
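ip6_rt_check() above reports a cached route as stale (nonzero) when the flow's address is neither matched by a /128 host route nor by the socket's last-used address cache. A self-contained sketch of the same predicate, with simplified stand-in types (in6_s, rt6key_s, in6_eq and rt_stale are hypothetical names):

#include <stdbool.h>
#include <string.h>

struct in6_s { unsigned char s6[16]; };
struct rt6key_s { struct in6_s addr; int plen; };

static bool in6_eq(const struct in6_s *a, const struct in6_s *b)
{
        return memcmp(a, b, sizeof(*a)) == 0;
}

/* true means "stale": not a matching /128 host route, and the cached
 * last-used address (if any) no longer matches either. */
static bool rt_stale(const struct rt6key_s *key,
                     const struct in6_s *fl_addr,
                     const struct in6_s *addr_cache)
{
        return (key->plen != 128 || !in6_eq(fl_addr, &key->addr)) &&
               (addr_cache == NULL || !in6_eq(fl_addr, addr_cache));
}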
944 949
945 static int ip6_dst_lookup_tail(struct sock *sk, 950 static int ip6_dst_lookup_tail(struct sock *sk,
946 struct dst_entry **dst, struct flowi6 *fl6) 951 struct dst_entry **dst, struct flowi6 *fl6)
947 { 952 {
948 struct net *net = sock_net(sk); 953 struct net *net = sock_net(sk);
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950 struct neighbour *n; 955 struct neighbour *n;
951 #endif 956 #endif
952 int err; 957 int err;
953 958
954 if (*dst == NULL) 959 if (*dst == NULL)
955 *dst = ip6_route_output(net, sk, fl6); 960 *dst = ip6_route_output(net, sk, fl6);
956 961
957 if ((err = (*dst)->error)) 962 if ((err = (*dst)->error))
958 goto out_err_release; 963 goto out_err_release;
959 964
960 if (ipv6_addr_any(&fl6->saddr)) { 965 if (ipv6_addr_any(&fl6->saddr)) {
961 struct rt6_info *rt = (struct rt6_info *) *dst; 966 struct rt6_info *rt = (struct rt6_info *) *dst;
962 err = ip6_route_get_saddr(net, rt, &fl6->daddr, 967 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
963 sk ? inet6_sk(sk)->srcprefs : 0, 968 sk ? inet6_sk(sk)->srcprefs : 0,
964 &fl6->saddr); 969 &fl6->saddr);
965 if (err) 970 if (err)
966 goto out_err_release; 971 goto out_err_release;
967 } 972 }
968 973
969 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
970 /* 975 /*
971 * Here if the dst entry we've looked up 976 * Here if the dst entry we've looked up
972 * has a neighbour entry that is in the INCOMPLETE 977 * has a neighbour entry that is in the INCOMPLETE
973 * state and the src address from the flow is 978 * state and the src address from the flow is
974 * marked as OPTIMISTIC, we release the found 979 * marked as OPTIMISTIC, we release the found
975 * dst entry and replace it with the 980 * dst entry and replace it with the
976 * dst entry of the nexthop router 981 * dst entry of the nexthop router
977 */ 982 */
983 rcu_read_lock();
978 n = dst_get_neighbour(*dst); 984 n = dst_get_neighbour(*dst);
979 if (n && !(n->nud_state & NUD_VALID)) { 985 if (n && !(n->nud_state & NUD_VALID)) {
980 struct inet6_ifaddr *ifp; 986 struct inet6_ifaddr *ifp;
981 struct flowi6 fl_gw6; 987 struct flowi6 fl_gw6;
982 int redirect; 988 int redirect;
983 989
990 rcu_read_unlock();
984 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 991 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
985 (*dst)->dev, 1); 992 (*dst)->dev, 1);
986 993
987 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 994 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
988 if (ifp) 995 if (ifp)
989 in6_ifa_put(ifp); 996 in6_ifa_put(ifp);
990 997
991 if (redirect) { 998 if (redirect) {
992 /* 999 /*
993 * We need to get the dst entry for the 1000 * We need to get the dst entry for the
994 * default router instead 1001 * default router instead
995 */ 1002 */
996 dst_release(*dst); 1003 dst_release(*dst);
997 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1004 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
998 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1005 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
999 *dst = ip6_route_output(net, sk, &fl_gw6); 1006 *dst = ip6_route_output(net, sk, &fl_gw6);
1000 if ((err = (*dst)->error)) 1007 if ((err = (*dst)->error))
1001 goto out_err_release; 1008 goto out_err_release;
1002 } 1009 }
1010 } else {
1011 rcu_read_unlock();
1003 } 1012 }
1004 #endif 1013 #endif
1005 1014
1006 return 0; 1015 return 0;
1007 1016
1008 out_err_release: 1017 out_err_release:
1009 if (err == -ENETUNREACH) 1018 if (err == -ENETUNREACH)
1010 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1019 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1011 dst_release(*dst); 1020 dst_release(*dst);
1012 *dst = NULL; 1021 *dst = NULL;
1013 return err; 1022 return err;
1014 } 1023 }
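The hunk above carries this commit's ip6_output.c half of the fix: a dst's neighbour can now be replaced (for instance by check_peer_redir()) while another CPU inspects it, so readers must bracket dst_get_neighbour() with rcu_read_lock()/rcu_read_unlock(), and drop the lock before the slow path that re-resolves the route. A condensed sketch of the reader discipline (sketch_neigh_state is a hypothetical name, not a drop-in kernel function):

#include <linux/rcupdate.h>
#include <net/dst.h>
#include <net/neighbour.h>

static int sketch_neigh_state(struct dst_entry *dst)
{
        struct neighbour *n;
        int state = 0;

        rcu_read_lock();
        n = dst_get_neighbour(dst);     /* rcu_dereference() inside */
        if (n)
                state = n->nud_state;
        /* n must not be used past the unlock unless an extra
         * reference (neigh_hold()) was taken while locked. */
        rcu_read_unlock();

        return state;
}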
1015 1024
1016 /** 1025 /**
1017 * ip6_dst_lookup - perform route lookup on flow 1026 * ip6_dst_lookup - perform route lookup on flow
1018 * @sk: socket which provides route info 1027 * @sk: socket which provides route info
1019 * @dst: pointer to dst_entry * for result 1028 * @dst: pointer to dst_entry * for result
1020 * @fl6: flow to lookup 1029 * @fl6: flow to lookup
1021 * 1030 *
1022 * This function performs a route lookup on the given flow. 1031 * This function performs a route lookup on the given flow.
1023 * 1032 *
1024 * It returns zero on success, or a standard errno code on error. 1033 * It returns zero on success, or a standard errno code on error.
1025 */ 1034 */
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) 1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1027 { 1036 {
1028 *dst = NULL; 1037 *dst = NULL;
1029 return ip6_dst_lookup_tail(sk, dst, fl6); 1038 return ip6_dst_lookup_tail(sk, dst, fl6);
1030 } 1039 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1032 1041
1033 /** 1042 /**
1034 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1043 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1035 * @sk: socket which provides route info 1044 * @sk: socket which provides route info
1036 * @fl6: flow to lookup 1045 * @fl6: flow to lookup
1037 * @final_dst: final destination address for ipsec lookup 1046 * @final_dst: final destination address for ipsec lookup
1038 * @can_sleep: we are in a sleepable context 1047 * @can_sleep: we are in a sleepable context
1039 * 1048 *
1040 * This function performs a route lookup on the given flow. 1049 * This function performs a route lookup on the given flow.
1041 * 1050 *
1042 * It returns a valid dst pointer on success, or a pointer encoded 1051 * It returns a valid dst pointer on success, or a pointer encoded
1043 * error code. 1052 * error code.
1044 */ 1053 */
1045 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046 const struct in6_addr *final_dst, 1055 const struct in6_addr *final_dst,
1047 bool can_sleep) 1056 bool can_sleep)
1048 { 1057 {
1049 struct dst_entry *dst = NULL; 1058 struct dst_entry *dst = NULL;
1050 int err; 1059 int err;
1051 1060
1052 err = ip6_dst_lookup_tail(sk, &dst, fl6); 1061 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1053 if (err) 1062 if (err)
1054 return ERR_PTR(err); 1063 return ERR_PTR(err);
1055 if (final_dst) 1064 if (final_dst)
1056 ipv6_addr_copy(&fl6->daddr, final_dst); 1065 ipv6_addr_copy(&fl6->daddr, final_dst);
1057 if (can_sleep) 1066 if (can_sleep)
1058 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; 1067 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1059 1068
1060 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); 1069 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061 } 1070 }
1062 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
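Since ip6_dst_lookup_flow() returns either a valid dst or an ERR_PTR()-encoded errno, never NULL, callers test with IS_ERR() and release nothing on failure. A hedged caller sketch (sketch_xmit_route is a hypothetical name):

#include <linux/err.h>
#include <net/dst.h>
#include <net/ipv6.h>

static int sketch_xmit_route(struct sock *sk, struct flowi6 *fl6)
{
        struct dst_entry *dst;

        dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
        if (IS_ERR(dst))
                return PTR_ERR(dst);    /* no dst to release on error */

        /* ... transmit using dst ... */
        dst_release(dst);
        return 0;
}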
1063 1072
1064 /** 1073 /**
1065 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1074 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1066 * @sk: socket which provides the dst cache and route info 1075 * @sk: socket which provides the dst cache and route info
1067 * @fl6: flow to lookup 1076 * @fl6: flow to lookup
1068 * @final_dst: final destination address for ipsec lookup 1077 * @final_dst: final destination address for ipsec lookup
1069 * @can_sleep: we are in a sleepable context 1078 * @can_sleep: we are in a sleepable context
1070 * 1079 *
1071 * This function performs a route lookup on the given flow with the 1080 * This function performs a route lookup on the given flow with the
1072 * possibility of using the cached route in the socket if it is valid. 1081 * possibility of using the cached route in the socket if it is valid.
1073 * It will take the socket dst lock when operating on the dst cache. 1082 * It will take the socket dst lock when operating on the dst cache.
1074 * As a result, this function can only be used in process context. 1083 * As a result, this function can only be used in process context.
1075 * 1084 *
1076 * It returns a valid dst pointer on success, or a pointer encoded 1085 * It returns a valid dst pointer on success, or a pointer encoded
1077 * error code. 1086 * error code.
1078 */ 1087 */
1079 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1080 const struct in6_addr *final_dst, 1089 const struct in6_addr *final_dst,
1081 bool can_sleep) 1090 bool can_sleep)
1082 { 1091 {
1083 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1092 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1084 int err; 1093 int err;
1085 1094
1086 dst = ip6_sk_dst_check(sk, dst, fl6); 1095 dst = ip6_sk_dst_check(sk, dst, fl6);
1087 1096
1088 err = ip6_dst_lookup_tail(sk, &dst, fl6); 1097 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1089 if (err) 1098 if (err)
1090 return ERR_PTR(err); 1099 return ERR_PTR(err);
1091 if (final_dst) 1100 if (final_dst)
1092 ipv6_addr_copy(&fl6->daddr, final_dst); 1101 ipv6_addr_copy(&fl6->daddr, final_dst);
1093 if (can_sleep) 1102 if (can_sleep)
1094 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; 1103 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1095 1104
1096 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); 1105 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 } 1106 }
1098 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1099 1108
1100 static inline int ip6_ufo_append_data(struct sock *sk, 1109 static inline int ip6_ufo_append_data(struct sock *sk,
1101 int getfrag(void *from, char *to, int offset, int len, 1110 int getfrag(void *from, char *to, int offset, int len,
1102 int odd, struct sk_buff *skb), 1111 int odd, struct sk_buff *skb),
1103 void *from, int length, int hh_len, int fragheaderlen, 1112 void *from, int length, int hh_len, int fragheaderlen,
1104 int transhdrlen, int mtu,unsigned int flags, 1113 int transhdrlen, int mtu,unsigned int flags,
1105 struct rt6_info *rt) 1114 struct rt6_info *rt)
1106 1115
1107 { 1116 {
1108 struct sk_buff *skb; 1117 struct sk_buff *skb;
1109 int err; 1118 int err;
1110 1119
1111 /* The network device supports UDP large send offload, 1120 /* The network device supports UDP large send offload,
1112 * so create a single skb containing the complete 1121 * so create a single skb containing the complete
1113 * UDP datagram 1122 * UDP datagram
1114 */ 1123 */
1115 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 1124 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1116 skb = sock_alloc_send_skb(sk, 1125 skb = sock_alloc_send_skb(sk,
1117 hh_len + fragheaderlen + transhdrlen + 20, 1126 hh_len + fragheaderlen + transhdrlen + 20,
1118 (flags & MSG_DONTWAIT), &err); 1127 (flags & MSG_DONTWAIT), &err);
1119 if (skb == NULL) 1128 if (skb == NULL)
1120 return -ENOMEM; 1129 return -ENOMEM;
1121 1130
1122 /* reserve space for Hardware header */ 1131 /* reserve space for Hardware header */
1123 skb_reserve(skb, hh_len); 1132 skb_reserve(skb, hh_len);
1124 1133
1125 /* create space for UDP/IP header */ 1134 /* create space for UDP/IP header */
1126 skb_put(skb,fragheaderlen + transhdrlen); 1135 skb_put(skb,fragheaderlen + transhdrlen);
1127 1136
1128 /* initialize network header pointer */ 1137 /* initialize network header pointer */
1129 skb_reset_network_header(skb); 1138 skb_reset_network_header(skb);
1130 1139
1131 /* initialize protocol header pointer */ 1140 /* initialize protocol header pointer */
1132 skb->transport_header = skb->network_header + fragheaderlen; 1141 skb->transport_header = skb->network_header + fragheaderlen;
1133 1142
1134 skb->ip_summed = CHECKSUM_PARTIAL; 1143 skb->ip_summed = CHECKSUM_PARTIAL;
1135 skb->csum = 0; 1144 skb->csum = 0;
1136 } 1145 }
1137 1146
1138 err = skb_append_datato_frags(sk,skb, getfrag, from, 1147 err = skb_append_datato_frags(sk,skb, getfrag, from,
1139 (length - transhdrlen)); 1148 (length - transhdrlen));
1140 if (!err) { 1149 if (!err) {
1141 struct frag_hdr fhdr; 1150 struct frag_hdr fhdr;
1142 1151
1143 /* Specify the length of each IPv6 datagram fragment. 1152 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8. 1153 * It has to be a multiple of 8.
1145 */ 1154 */
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - 1155 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7; 1156 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1157 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149 ipv6_select_ident(&fhdr, rt); 1158 ipv6_select_ident(&fhdr, rt);
1150 skb_shinfo(skb)->ip6_frag_id = fhdr.identification; 1159 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151 __skb_queue_tail(&sk->sk_write_queue, skb); 1160 __skb_queue_tail(&sk->sk_write_queue, skb);
1152 1161
1153 return 0; 1162 return 0;
1154 } 1163 }
1155 /* There is not enough support to do UDP LSO, 1164 /* There is not enough support to do UDP LSO,
1156 * so follow the normal path 1165 * so follow the normal path
1157 */ 1166 */
1158 kfree_skb(skb); 1167 kfree_skb(skb);
1159 1168
1160 return err; 1169 return err;
1161 } 1170 }
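The gso_size set above must be a multiple of 8, because IPv6 fragment offsets are expressed in 8-octet units; the & ~7 mask rounds the per-fragment payload down accordingly. A worked check, assuming a 1500-byte MTU and a bare 40-byte IPv6 header (no extension headers):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned mtu = 1500;            /* assumed link MTU */
        unsigned fragheaderlen = 40;    /* IPv6 header only */
        unsigned frag_hdr = 8;          /* sizeof(struct frag_hdr) */
        unsigned gso_size = (mtu - fragheaderlen - frag_hdr) & ~7u;

        assert(gso_size % 8 == 0);
        printf("gso_size = %u\n", gso_size);    /* 1448, not 1452 */
        return 0;
}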
1162 1171
1163 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164 gfp_t gfp) 1173 gfp_t gfp)
1165 { 1174 {
1166 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1175 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1167 } 1176 }
1168 1177
1169 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170 gfp_t gfp) 1179 gfp_t gfp)
1171 { 1180 {
1172 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1181 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 } 1182 }
1174 1183
1175 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, 1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1176 int offset, int len, int odd, struct sk_buff *skb), 1185 int offset, int len, int odd, struct sk_buff *skb),
1177 void *from, int length, int transhdrlen, 1186 void *from, int length, int transhdrlen,
1178 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, 1187 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1179 struct rt6_info *rt, unsigned int flags, int dontfrag) 1188 struct rt6_info *rt, unsigned int flags, int dontfrag)
1180 { 1189 {
1181 struct inet_sock *inet = inet_sk(sk); 1190 struct inet_sock *inet = inet_sk(sk);
1182 struct ipv6_pinfo *np = inet6_sk(sk); 1191 struct ipv6_pinfo *np = inet6_sk(sk);
1183 struct inet_cork *cork; 1192 struct inet_cork *cork;
1184 struct sk_buff *skb; 1193 struct sk_buff *skb;
1185 unsigned int maxfraglen, fragheaderlen; 1194 unsigned int maxfraglen, fragheaderlen;
1186 int exthdrlen; 1195 int exthdrlen;
1187 int hh_len; 1196 int hh_len;
1188 int mtu; 1197 int mtu;
1189 int copy; 1198 int copy;
1190 int err; 1199 int err;
1191 int offset = 0; 1200 int offset = 0;
1192 int csummode = CHECKSUM_NONE; 1201 int csummode = CHECKSUM_NONE;
1193 __u8 tx_flags = 0; 1202 __u8 tx_flags = 0;
1194 1203
1195 if (flags&MSG_PROBE) 1204 if (flags&MSG_PROBE)
1196 return 0; 1205 return 0;
1197 cork = &inet->cork.base; 1206 cork = &inet->cork.base;
1198 if (skb_queue_empty(&sk->sk_write_queue)) { 1207 if (skb_queue_empty(&sk->sk_write_queue)) {
1199 /* 1208 /*
1200 * setup for corking 1209 * setup for corking
1201 */ 1210 */
1202 if (opt) { 1211 if (opt) {
1203 if (WARN_ON(np->cork.opt)) 1212 if (WARN_ON(np->cork.opt))
1204 return -EINVAL; 1213 return -EINVAL;
1205 1214
1206 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); 1215 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1207 if (unlikely(np->cork.opt == NULL)) 1216 if (unlikely(np->cork.opt == NULL))
1208 return -ENOBUFS; 1217 return -ENOBUFS;
1209 1218
1210 np->cork.opt->tot_len = opt->tot_len; 1219 np->cork.opt->tot_len = opt->tot_len;
1211 np->cork.opt->opt_flen = opt->opt_flen; 1220 np->cork.opt->opt_flen = opt->opt_flen;
1212 np->cork.opt->opt_nflen = opt->opt_nflen; 1221 np->cork.opt->opt_nflen = opt->opt_nflen;
1213 1222
1214 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, 1223 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1215 sk->sk_allocation); 1224 sk->sk_allocation);
1216 if (opt->dst0opt && !np->cork.opt->dst0opt) 1225 if (opt->dst0opt && !np->cork.opt->dst0opt)
1217 return -ENOBUFS; 1226 return -ENOBUFS;
1218 1227
1219 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, 1228 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1220 sk->sk_allocation); 1229 sk->sk_allocation);
1221 if (opt->dst1opt && !np->cork.opt->dst1opt) 1230 if (opt->dst1opt && !np->cork.opt->dst1opt)
1222 return -ENOBUFS; 1231 return -ENOBUFS;
1223 1232
1224 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, 1233 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1225 sk->sk_allocation); 1234 sk->sk_allocation);
1226 if (opt->hopopt && !np->cork.opt->hopopt) 1235 if (opt->hopopt && !np->cork.opt->hopopt)
1227 return -ENOBUFS; 1236 return -ENOBUFS;
1228 1237
1229 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, 1238 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1230 sk->sk_allocation); 1239 sk->sk_allocation);
1231 if (opt->srcrt && !np->cork.opt->srcrt) 1240 if (opt->srcrt && !np->cork.opt->srcrt)
1232 return -ENOBUFS; 1241 return -ENOBUFS;
1233 1242
1234 /* need source address above --miyazawa */ 1243 /* need source address above --miyazawa */
1235 } 1244 }
1236 dst_hold(&rt->dst); 1245 dst_hold(&rt->dst);
1237 cork->dst = &rt->dst; 1246 cork->dst = &rt->dst;
1238 inet->cork.fl.u.ip6 = *fl6; 1247 inet->cork.fl.u.ip6 = *fl6;
1239 np->cork.hop_limit = hlimit; 1248 np->cork.hop_limit = hlimit;
1240 np->cork.tclass = tclass; 1249 np->cork.tclass = tclass;
1241 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? 1250 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1242 rt->dst.dev->mtu : dst_mtu(rt->dst.path); 1251 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1243 if (np->frag_size < mtu) { 1252 if (np->frag_size < mtu) {
1244 if (np->frag_size) 1253 if (np->frag_size)
1245 mtu = np->frag_size; 1254 mtu = np->frag_size;
1246 } 1255 }
1247 cork->fragsize = mtu; 1256 cork->fragsize = mtu;
1248 if (dst_allfrag(rt->dst.path)) 1257 if (dst_allfrag(rt->dst.path))
1249 cork->flags |= IPCORK_ALLFRAG; 1258 cork->flags |= IPCORK_ALLFRAG;
1250 cork->length = 0; 1259 cork->length = 0;
1251 sk->sk_sndmsg_page = NULL; 1260 sk->sk_sndmsg_page = NULL;
1252 sk->sk_sndmsg_off = 0; 1261 sk->sk_sndmsg_off = 0;
1253 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) - 1262 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1254 rt->rt6i_nfheader_len; 1263 rt->rt6i_nfheader_len;
1255 length += exthdrlen; 1264 length += exthdrlen;
1256 transhdrlen += exthdrlen; 1265 transhdrlen += exthdrlen;
1257 } else { 1266 } else {
1258 rt = (struct rt6_info *)cork->dst; 1267 rt = (struct rt6_info *)cork->dst;
1259 fl6 = &inet->cork.fl.u.ip6; 1268 fl6 = &inet->cork.fl.u.ip6;
1260 opt = np->cork.opt; 1269 opt = np->cork.opt;
1261 transhdrlen = 0; 1270 transhdrlen = 0;
1262 exthdrlen = 0; 1271 exthdrlen = 0;
1263 mtu = cork->fragsize; 1272 mtu = cork->fragsize;
1264 } 1273 }
1265 1274
1266 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1275 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1267 1276
1268 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1277 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1269 (opt ? opt->opt_nflen : 0); 1278 (opt ? opt->opt_nflen : 0);
1270 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); 1279 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1271 1280
1272 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { 1281 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1273 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { 1282 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1274 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); 1283 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1275 return -EMSGSIZE; 1284 return -EMSGSIZE;
1276 } 1285 }
1277 } 1286 }
1278 1287
1279 /* For UDP, check if TX timestamp is enabled */ 1288 /* For UDP, check if TX timestamp is enabled */
1280 if (sk->sk_type == SOCK_DGRAM) { 1289 if (sk->sk_type == SOCK_DGRAM) {
1281 err = sock_tx_timestamp(sk, &tx_flags); 1290 err = sock_tx_timestamp(sk, &tx_flags);
1282 if (err) 1291 if (err)
1283 goto error; 1292 goto error;
1284 } 1293 }
1285 1294
1286 /* 1295 /*
1287 * Let's try using as much space as possible. 1296 * Let's try using as much space as possible.
1288 * Use MTU if total length of the message fits into the MTU. 1297 * Use MTU if total length of the message fits into the MTU.
1289 * Otherwise, we need to reserve fragment header and 1298 * Otherwise, we need to reserve fragment header and
1290 * fragment alignment (= 8-15 octets, in total). 1299 * fragment alignment (= 8-15 octets, in total).
1291 * 1300 *
1292 * Note that we may need to "move" the data from the tail of 1301 * Note that we may need to "move" the data from the tail of
1293 * the buffer to the new fragment when we split 1302 * the buffer to the new fragment when we split
1294 * the message. 1303 * the message.
1295 * 1304 *
1296 * FIXME: It may be fragmented into multiple chunks 1305 * FIXME: It may be fragmented into multiple chunks
1297 * at once if non-fragmentable extension headers 1306 * at once if non-fragmentable extension headers
1298 * are too large. 1307 * are too large.
1299 * --yoshfuji 1308 * --yoshfuji
1300 */ 1309 */
1301 1310
1302 cork->length += length; 1311 cork->length += length;
1303 if (length > mtu) { 1312 if (length > mtu) {
1304 int proto = sk->sk_protocol; 1313 int proto = sk->sk_protocol;
1305 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){ 1314 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1306 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); 1315 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1307 return -EMSGSIZE; 1316 return -EMSGSIZE;
1308 } 1317 }
1309 1318
1310 if (proto == IPPROTO_UDP && 1319 if (proto == IPPROTO_UDP &&
1311 (rt->dst.dev->features & NETIF_F_UFO)) { 1320 (rt->dst.dev->features & NETIF_F_UFO)) {
1312 1321
1313 err = ip6_ufo_append_data(sk, getfrag, from, length, 1322 err = ip6_ufo_append_data(sk, getfrag, from, length,
1314 hh_len, fragheaderlen, 1323 hh_len, fragheaderlen,
1315 transhdrlen, mtu, flags, rt); 1324 transhdrlen, mtu, flags, rt);
1316 if (err) 1325 if (err)
1317 goto error; 1326 goto error;
1318 return 0; 1327 return 0;
1319 } 1328 }
1320 } 1329 }
1321 1330
1322 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1331 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1323 goto alloc_new_skb; 1332 goto alloc_new_skb;
1324 1333
1325 while (length > 0) { 1334 while (length > 0) {
1326 /* Check if the remaining data fits into current packet. */ 1335 /* Check if the remaining data fits into current packet. */
1327 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; 1336 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1328 if (copy < length) 1337 if (copy < length)
1329 copy = maxfraglen - skb->len; 1338 copy = maxfraglen - skb->len;
1330 1339
1331 if (copy <= 0) { 1340 if (copy <= 0) {
1332 char *data; 1341 char *data;
1333 unsigned int datalen; 1342 unsigned int datalen;
1334 unsigned int fraglen; 1343 unsigned int fraglen;
1335 unsigned int fraggap; 1344 unsigned int fraggap;
1336 unsigned int alloclen; 1345 unsigned int alloclen;
1337 struct sk_buff *skb_prev; 1346 struct sk_buff *skb_prev;
1338 alloc_new_skb: 1347 alloc_new_skb:
1339 skb_prev = skb; 1348 skb_prev = skb;
1340 1349
1341 /* There's no room in the current skb */ 1350 /* There's no room in the current skb */
1342 if (skb_prev) 1351 if (skb_prev)
1343 fraggap = skb_prev->len - maxfraglen; 1352 fraggap = skb_prev->len - maxfraglen;
1344 else 1353 else
1345 fraggap = 0; 1354 fraggap = 0;
1346 1355
1347 /* 1356 /*
1348 * If remaining data exceeds the mtu, 1357 * If remaining data exceeds the mtu,
1349 * we know we need more fragment(s). 1358 * we know we need more fragment(s).
1350 */ 1359 */
1351 datalen = length + fraggap; 1360 datalen = length + fraggap;
1352 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) 1361 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1353 datalen = maxfraglen - fragheaderlen; 1362 datalen = maxfraglen - fragheaderlen;
1354 1363
1355 fraglen = datalen + fragheaderlen; 1364 fraglen = datalen + fragheaderlen;
1356 if ((flags & MSG_MORE) && 1365 if ((flags & MSG_MORE) &&
1357 !(rt->dst.dev->features&NETIF_F_SG)) 1366 !(rt->dst.dev->features&NETIF_F_SG))
1358 alloclen = mtu; 1367 alloclen = mtu;
1359 else 1368 else
1360 alloclen = datalen + fragheaderlen; 1369 alloclen = datalen + fragheaderlen;
1361 1370
1362 /* 1371 /*
1363 * The last fragment gets additional space at tail. 1372 * The last fragment gets additional space at tail.
1364 * Note: we overallocate on fragments with MSG_MORE 1373 * Note: we overallocate on fragments with MSG_MORE
1365 * because we have no idea if we're the last one. 1374 * because we have no idea if we're the last one.
1366 */ 1375 */
1367 if (datalen == length + fraggap) 1376 if (datalen == length + fraggap)
1368 alloclen += rt->dst.trailer_len; 1377 alloclen += rt->dst.trailer_len;
1369 1378
1370 /* 1379 /*
1371 * We just reserve space for fragment header. 1380 * We just reserve space for fragment header.
1372 * Note: this may be overallocation if the message 1381 * Note: this may be overallocation if the message
1373 * (without MSG_MORE) fits into the MTU. 1382 * (without MSG_MORE) fits into the MTU.
1374 */ 1383 */
1375 alloclen += sizeof(struct frag_hdr); 1384 alloclen += sizeof(struct frag_hdr);
1376 1385
1377 if (transhdrlen) { 1386 if (transhdrlen) {
1378 skb = sock_alloc_send_skb(sk, 1387 skb = sock_alloc_send_skb(sk,
1379 alloclen + hh_len, 1388 alloclen + hh_len,
1380 (flags & MSG_DONTWAIT), &err); 1389 (flags & MSG_DONTWAIT), &err);
1381 } else { 1390 } else {
1382 skb = NULL; 1391 skb = NULL;
1383 if (atomic_read(&sk->sk_wmem_alloc) <= 1392 if (atomic_read(&sk->sk_wmem_alloc) <=
1384 2 * sk->sk_sndbuf) 1393 2 * sk->sk_sndbuf)
1385 skb = sock_wmalloc(sk, 1394 skb = sock_wmalloc(sk,
1386 alloclen + hh_len, 1, 1395 alloclen + hh_len, 1,
1387 sk->sk_allocation); 1396 sk->sk_allocation);
1388 if (unlikely(skb == NULL)) 1397 if (unlikely(skb == NULL))
1389 err = -ENOBUFS; 1398 err = -ENOBUFS;
1390 else { 1399 else {
1391 /* Only the initial fragment 1400 /* Only the initial fragment
1392 * is time stamped. 1401 * is time stamped.
1393 */ 1402 */
1394 tx_flags = 0; 1403 tx_flags = 0;
1395 } 1404 }
1396 } 1405 }
1397 if (skb == NULL) 1406 if (skb == NULL)
1398 goto error; 1407 goto error;
1399 /* 1408 /*
1400 * Fill in the control structures 1409 * Fill in the control structures
1401 */ 1410 */
1402 skb->ip_summed = csummode; 1411 skb->ip_summed = csummode;
1403 skb->csum = 0; 1412 skb->csum = 0;
1404 /* reserve for fragmentation */ 1413 /* reserve for fragmentation */
1405 skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); 1414 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1406 1415
1407 if (sk->sk_type == SOCK_DGRAM) 1416 if (sk->sk_type == SOCK_DGRAM)
1408 skb_shinfo(skb)->tx_flags = tx_flags; 1417 skb_shinfo(skb)->tx_flags = tx_flags;
1409 1418
1410 /* 1419 /*
1411 * Find where to start putting bytes 1420 * Find where to start putting bytes
1412 */ 1421 */
1413 data = skb_put(skb, fraglen); 1422 data = skb_put(skb, fraglen);
1414 skb_set_network_header(skb, exthdrlen); 1423 skb_set_network_header(skb, exthdrlen);
1415 data += fragheaderlen; 1424 data += fragheaderlen;
1416 skb->transport_header = (skb->network_header + 1425 skb->transport_header = (skb->network_header +
1417 fragheaderlen); 1426 fragheaderlen);
1418 if (fraggap) { 1427 if (fraggap) {
1419 skb->csum = skb_copy_and_csum_bits( 1428 skb->csum = skb_copy_and_csum_bits(
1420 skb_prev, maxfraglen, 1429 skb_prev, maxfraglen,
1421 data + transhdrlen, fraggap, 0); 1430 data + transhdrlen, fraggap, 0);
1422 skb_prev->csum = csum_sub(skb_prev->csum, 1431 skb_prev->csum = csum_sub(skb_prev->csum,
1423 skb->csum); 1432 skb->csum);
1424 data += fraggap; 1433 data += fraggap;
1425 pskb_trim_unique(skb_prev, maxfraglen); 1434 pskb_trim_unique(skb_prev, maxfraglen);
1426 } 1435 }
1427 copy = datalen - transhdrlen - fraggap; 1436 copy = datalen - transhdrlen - fraggap;
1428 if (copy < 0) { 1437 if (copy < 0) {
1429 err = -EINVAL; 1438 err = -EINVAL;
1430 kfree_skb(skb); 1439 kfree_skb(skb);
1431 goto error; 1440 goto error;
1432 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { 1441 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1433 err = -EFAULT; 1442 err = -EFAULT;
1434 kfree_skb(skb); 1443 kfree_skb(skb);
1435 goto error; 1444 goto error;
1436 } 1445 }
1437 1446
1438 offset += copy; 1447 offset += copy;
1439 length -= datalen - fraggap; 1448 length -= datalen - fraggap;
1440 transhdrlen = 0; 1449 transhdrlen = 0;
1441 exthdrlen = 0; 1450 exthdrlen = 0;
1442 csummode = CHECKSUM_NONE; 1451 csummode = CHECKSUM_NONE;
1443 1452
1444 /* 1453 /*
1445 * Put the packet on the pending queue 1454 * Put the packet on the pending queue
1446 */ 1455 */
1447 __skb_queue_tail(&sk->sk_write_queue, skb); 1456 __skb_queue_tail(&sk->sk_write_queue, skb);
1448 continue; 1457 continue;
1449 } 1458 }
1450 1459
1451 if (copy > length) 1460 if (copy > length)
1452 copy = length; 1461 copy = length;
1453 1462
1454 if (!(rt->dst.dev->features&NETIF_F_SG)) { 1463 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1455 unsigned int off; 1464 unsigned int off;
1456 1465
1457 off = skb->len; 1466 off = skb->len;
1458 if (getfrag(from, skb_put(skb, copy), 1467 if (getfrag(from, skb_put(skb, copy),
1459 offset, copy, off, skb) < 0) { 1468 offset, copy, off, skb) < 0) {
1460 __skb_trim(skb, off); 1469 __skb_trim(skb, off);
1461 err = -EFAULT; 1470 err = -EFAULT;
1462 goto error; 1471 goto error;
1463 } 1472 }
1464 } else { 1473 } else {
1465 int i = skb_shinfo(skb)->nr_frags; 1474 int i = skb_shinfo(skb)->nr_frags;
1466 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 1475 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1467 struct page *page = sk->sk_sndmsg_page; 1476 struct page *page = sk->sk_sndmsg_page;
1468 int off = sk->sk_sndmsg_off; 1477 int off = sk->sk_sndmsg_off;
1469 unsigned int left; 1478 unsigned int left;
1470 1479
1471 if (page && (left = PAGE_SIZE - off) > 0) { 1480 if (page && (left = PAGE_SIZE - off) > 0) {
1472 if (copy >= left) 1481 if (copy >= left)
1473 copy = left; 1482 copy = left;
1474 if (page != frag->page) { 1483 if (page != frag->page) {
1475 if (i == MAX_SKB_FRAGS) { 1484 if (i == MAX_SKB_FRAGS) {
1476 err = -EMSGSIZE; 1485 err = -EMSGSIZE;
1477 goto error; 1486 goto error;
1478 } 1487 }
1479 get_page(page); 1488 get_page(page);
1480 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 1489 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1481 frag = &skb_shinfo(skb)->frags[i]; 1490 frag = &skb_shinfo(skb)->frags[i];
1482 } 1491 }
1483 } else if(i < MAX_SKB_FRAGS) { 1492 } else if(i < MAX_SKB_FRAGS) {
1484 if (copy > PAGE_SIZE) 1493 if (copy > PAGE_SIZE)
1485 copy = PAGE_SIZE; 1494 copy = PAGE_SIZE;
1486 page = alloc_pages(sk->sk_allocation, 0); 1495 page = alloc_pages(sk->sk_allocation, 0);
1487 if (page == NULL) { 1496 if (page == NULL) {
1488 err = -ENOMEM; 1497 err = -ENOMEM;
1489 goto error; 1498 goto error;
1490 } 1499 }
1491 sk->sk_sndmsg_page = page; 1500 sk->sk_sndmsg_page = page;
1492 sk->sk_sndmsg_off = 0; 1501 sk->sk_sndmsg_off = 0;
1493 1502
1494 skb_fill_page_desc(skb, i, page, 0, 0); 1503 skb_fill_page_desc(skb, i, page, 0, 0);
1495 frag = &skb_shinfo(skb)->frags[i]; 1504 frag = &skb_shinfo(skb)->frags[i];
1496 } else { 1505 } else {
1497 err = -EMSGSIZE; 1506 err = -EMSGSIZE;
1498 goto error; 1507 goto error;
1499 } 1508 }
1500 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { 1509 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1501 err = -EFAULT; 1510 err = -EFAULT;
1502 goto error; 1511 goto error;
1503 } 1512 }
1504 sk->sk_sndmsg_off += copy; 1513 sk->sk_sndmsg_off += copy;
1505 frag->size += copy; 1514 frag->size += copy;
1506 skb->len += copy; 1515 skb->len += copy;
1507 skb->data_len += copy; 1516 skb->data_len += copy;
1508 skb->truesize += copy; 1517 skb->truesize += copy;
1509 atomic_add(copy, &sk->sk_wmem_alloc); 1518 atomic_add(copy, &sk->sk_wmem_alloc);
1510 } 1519 }
1511 offset += copy; 1520 offset += copy;
1512 length -= copy; 1521 length -= copy;
1513 } 1522 }
1514 return 0; 1523 return 0;
1515 error: 1524 error:
1516 cork->length -= length; 1525 cork->length -= length;
1517 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1526 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1518 return err; 1527 return err;
1519 } 1528 }
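As a worked example of the sizing near the top of ip6_append_data(): maxfraglen bounds how large a queued skb may grow before a new fragment is started, rounding the space after the IPv6 header down to a multiple of 8 and reserving the 8-byte fragment header. Assuming a 1500-byte MTU and fragheaderlen = 40 (no extension headers), maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488 bytes.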
1520 1529
1521 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) 1530 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1522 { 1531 {
1523 if (np->cork.opt) { 1532 if (np->cork.opt) {
1524 kfree(np->cork.opt->dst0opt); 1533 kfree(np->cork.opt->dst0opt);
1525 kfree(np->cork.opt->dst1opt); 1534 kfree(np->cork.opt->dst1opt);
1526 kfree(np->cork.opt->hopopt); 1535 kfree(np->cork.opt->hopopt);
1527 kfree(np->cork.opt->srcrt); 1536 kfree(np->cork.opt->srcrt);
1528 kfree(np->cork.opt); 1537 kfree(np->cork.opt);
1529 np->cork.opt = NULL; 1538 np->cork.opt = NULL;
1530 } 1539 }
1531 1540
1532 if (inet->cork.base.dst) { 1541 if (inet->cork.base.dst) {
1533 dst_release(inet->cork.base.dst); 1542 dst_release(inet->cork.base.dst);
1534 inet->cork.base.dst = NULL; 1543 inet->cork.base.dst = NULL;
1535 inet->cork.base.flags &= ~IPCORK_ALLFRAG; 1544 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1536 } 1545 }
1537 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); 1546 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1538 } 1547 }
1539 1548
1540 int ip6_push_pending_frames(struct sock *sk) 1549 int ip6_push_pending_frames(struct sock *sk)
1541 { 1550 {
1542 struct sk_buff *skb, *tmp_skb; 1551 struct sk_buff *skb, *tmp_skb;
1543 struct sk_buff **tail_skb; 1552 struct sk_buff **tail_skb;
1544 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; 1553 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1545 struct inet_sock *inet = inet_sk(sk); 1554 struct inet_sock *inet = inet_sk(sk);
1546 struct ipv6_pinfo *np = inet6_sk(sk); 1555 struct ipv6_pinfo *np = inet6_sk(sk);
1547 struct net *net = sock_net(sk); 1556 struct net *net = sock_net(sk);
1548 struct ipv6hdr *hdr; 1557 struct ipv6hdr *hdr;
1549 struct ipv6_txoptions *opt = np->cork.opt; 1558 struct ipv6_txoptions *opt = np->cork.opt;
1550 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; 1559 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1551 struct flowi6 *fl6 = &inet->cork.fl.u.ip6; 1560 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1552 unsigned char proto = fl6->flowi6_proto; 1561 unsigned char proto = fl6->flowi6_proto;
1553 int err = 0; 1562 int err = 0;
1554 1563
1555 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1564 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1556 goto out; 1565 goto out;
1557 tail_skb = &(skb_shinfo(skb)->frag_list); 1566 tail_skb = &(skb_shinfo(skb)->frag_list);
1558 1567
1559 /* move skb->data to ip header from ext header */ 1568 /* move skb->data to ip header from ext header */
1560 if (skb->data < skb_network_header(skb)) 1569 if (skb->data < skb_network_header(skb))
1561 __skb_pull(skb, skb_network_offset(skb)); 1570 __skb_pull(skb, skb_network_offset(skb));
1562 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1571 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1563 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1572 __skb_pull(tmp_skb, skb_network_header_len(skb));
1564 *tail_skb = tmp_skb; 1573 *tail_skb = tmp_skb;
1565 tail_skb = &(tmp_skb->next); 1574 tail_skb = &(tmp_skb->next);
1566 skb->len += tmp_skb->len; 1575 skb->len += tmp_skb->len;
1567 skb->data_len += tmp_skb->len; 1576 skb->data_len += tmp_skb->len;
1568 skb->truesize += tmp_skb->truesize; 1577 skb->truesize += tmp_skb->truesize;
1569 tmp_skb->destructor = NULL; 1578 tmp_skb->destructor = NULL;
1570 tmp_skb->sk = NULL; 1579 tmp_skb->sk = NULL;
1571 } 1580 }
1572 1581
1573 /* Allow local fragmentation. */ 1582 /* Allow local fragmentation. */
1574 if (np->pmtudisc < IPV6_PMTUDISC_DO) 1583 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1575 skb->local_df = 1; 1584 skb->local_df = 1;
1576 1585
1577 ipv6_addr_copy(final_dst, &fl6->daddr); 1586 ipv6_addr_copy(final_dst, &fl6->daddr);
1578 __skb_pull(skb, skb_network_header_len(skb)); 1587 __skb_pull(skb, skb_network_header_len(skb));
1579 if (opt && opt->opt_flen) 1588 if (opt && opt->opt_flen)
1580 ipv6_push_frag_opts(skb, opt, &proto); 1589 ipv6_push_frag_opts(skb, opt, &proto);
1581 if (opt && opt->opt_nflen) 1590 if (opt && opt->opt_nflen)
1582 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); 1591 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1583 1592
1584 skb_push(skb, sizeof(struct ipv6hdr)); 1593 skb_push(skb, sizeof(struct ipv6hdr));
1585 skb_reset_network_header(skb); 1594 skb_reset_network_header(skb);
1586 hdr = ipv6_hdr(skb); 1595 hdr = ipv6_hdr(skb);
1587 1596
1588 *(__be32*)hdr = fl6->flowlabel | 1597 *(__be32*)hdr = fl6->flowlabel |
1589 htonl(0x60000000 | ((int)np->cork.tclass << 20)); 1598 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1590 1599
1591 hdr->hop_limit = np->cork.hop_limit; 1600 hdr->hop_limit = np->cork.hop_limit;
1592 hdr->nexthdr = proto; 1601 hdr->nexthdr = proto;
1593 ipv6_addr_copy(&hdr->saddr, &fl6->saddr); 1602 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1594 ipv6_addr_copy(&hdr->daddr, final_dst); 1603 ipv6_addr_copy(&hdr->daddr, final_dst);
1595 1604
1596 skb->priority = sk->sk_priority; 1605 skb->priority = sk->sk_priority;
1597 skb->mark = sk->sk_mark; 1606 skb->mark = sk->sk_mark;
1598 1607
1599 skb_dst_set(skb, dst_clone(&rt->dst)); 1608 skb_dst_set(skb, dst_clone(&rt->dst));
1600 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); 1609 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1601 if (proto == IPPROTO_ICMPV6) { 1610 if (proto == IPPROTO_ICMPV6) {
1602 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1611 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1603 1612
1604 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); 1613 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1605 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); 1614 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1606 } 1615 }
1607 1616
1608 err = ip6_local_out(skb); 1617 err = ip6_local_out(skb);
1609 if (err) { 1618 if (err) {
1610 if (err > 0) 1619 if (err > 0)
1611 err = net_xmit_errno(err); 1620 err = net_xmit_errno(err);
1612 if (err) 1621 if (err)
1613 goto error; 1622 goto error;
1614 } 1623 }
1615 1624
1616 out: 1625 out:
1617 ip6_cork_release(inet, np); 1626 ip6_cork_release(inet, np);
1618 return err; 1627 return err;
1619 error: 1628 error:
1620 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1629 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1621 goto out; 1630 goto out;
1622 } 1631 }
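The header construction above writes the first 32 bits of the IPv6 header in one store: version 6 in the top nibble, the 8-bit traffic class, then the 20-bit flow label. fl6->flowlabel is already kept in network byte order with the class bits clear, so a plain OR suffices. A userspace sketch of the same packing (ipv6_word0 is a hypothetical helper):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t ipv6_word0(uint32_t flowlabel_be, uint8_t tclass)
{
        return flowlabel_be | htonl(0x60000000 | ((uint32_t)tclass << 20));
}

int main(void)
{
        /* tclass 0, no flow label: just the version nibble. */
        printf("0x%08x\n", (unsigned)ntohl(ipv6_word0(0, 0)));
        return 0;
}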
1623 1632
1624 void ip6_flush_pending_frames(struct sock *sk) 1633 void ip6_flush_pending_frames(struct sock *sk)
1625 { 1634 {
1626 struct sk_buff *skb; 1635 struct sk_buff *skb;
1627 1636
1628 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { 1637 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1629 if (skb_dst(skb)) 1638 if (skb_dst(skb))
1630 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 1639 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1631 IPSTATS_MIB_OUTDISCARDS); 1640 IPSTATS_MIB_OUTDISCARDS);
1632 kfree_skb(skb); 1641 kfree_skb(skb);
1633 } 1642 }
1634 1643
1635 ip6_cork_release(inet_sk(sk), inet6_sk(sk)); 1644 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1636 } 1645 }
1637 1646
1 /* 1 /*
2 * Linux INET6 implementation 2 * Linux INET6 implementation
3 * FIB front-end. 3 * FIB front-end.
4 * 4 *
5 * Authors: 5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt> 6 * Pedro Roque <roque@di.fc.ul.pt>
7 * 7 *
8 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version. 11 * 2 of the License, or (at your option) any later version.
12 */ 12 */
13 13
14 /* Changes: 14 /* Changes:
15 * 15 *
16 * YOSHIFUJI Hideaki @USAGI 16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection. 17 * reworked default router selection.
18 * - respect outgoing interface 18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e. 19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states). 20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably) 21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list. 22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala 23 * Ville Nuorvala
24 * Fixed routing subtrees. 24 * Fixed routing subtrees.
25 */ 25 */
26 26
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/errno.h> 28 #include <linux/errno.h>
29 #include <linux/types.h> 29 #include <linux/types.h>
30 #include <linux/times.h> 30 #include <linux/times.h>
31 #include <linux/socket.h> 31 #include <linux/socket.h>
32 #include <linux/sockios.h> 32 #include <linux/sockios.h>
33 #include <linux/net.h> 33 #include <linux/net.h>
34 #include <linux/route.h> 34 #include <linux/route.h>
35 #include <linux/netdevice.h> 35 #include <linux/netdevice.h>
36 #include <linux/in6.h> 36 #include <linux/in6.h>
37 #include <linux/mroute6.h> 37 #include <linux/mroute6.h>
38 #include <linux/init.h> 38 #include <linux/init.h>
39 #include <linux/if_arp.h> 39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h> 40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h> 41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h> 42 #include <linux/nsproxy.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <net/net_namespace.h> 44 #include <net/net_namespace.h>
45 #include <net/snmp.h> 45 #include <net/snmp.h>
46 #include <net/ipv6.h> 46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h> 47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h> 48 #include <net/ip6_route.h>
49 #include <net/ndisc.h> 49 #include <net/ndisc.h>
50 #include <net/addrconf.h> 50 #include <net/addrconf.h>
51 #include <net/tcp.h> 51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h> 52 #include <linux/rtnetlink.h>
53 #include <net/dst.h> 53 #include <net/dst.h>
54 #include <net/xfrm.h> 54 #include <net/xfrm.h>
55 #include <net/netevent.h> 55 #include <net/netevent.h>
56 #include <net/netlink.h> 56 #include <net/netlink.h>
57 57
58 #include <asm/uaccess.h> 58 #include <asm/uaccess.h>
59 59
60 #ifdef CONFIG_SYSCTL 60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h> 61 #include <linux/sysctl.h>
62 #endif 62 #endif
63 63
64 /* Set to 3 to get tracing. */ 64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2 65 #define RT6_DEBUG 2
66 66
67 #if RT6_DEBUG >= 3 67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x 68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x) 69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else 70 #else
71 #define RDBG(x) 71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0) 72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif 73 #endif
74 74
75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, 75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
76 const struct in6_addr *dest); 76 const struct in6_addr *dest);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); 77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_default_mtu(const struct dst_entry *dst); 79 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *); 80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *); 81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *, 82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how); 83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops); 84 static int ip6_dst_gc(struct dst_ops *ops);
85 85
86 static int ip6_pkt_discard(struct sk_buff *skb); 86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sk_buff *skb); 87 static int ip6_pkt_discard_out(struct sk_buff *skb);
88 static void ip6_link_failure(struct sk_buff *skb); 88 static void ip6_link_failure(struct sk_buff *skb);
89 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 89 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 90
91 #ifdef CONFIG_IPV6_ROUTE_INFO 91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net, 92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 const struct in6_addr *prefix, int prefixlen, 93 const struct in6_addr *prefix, int prefixlen,
94 const struct in6_addr *gwaddr, int ifindex, 94 const struct in6_addr *gwaddr, int ifindex,
95 unsigned pref); 95 unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net, 96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 const struct in6_addr *prefix, int prefixlen, 97 const struct in6_addr *prefix, int prefixlen,
98 const struct in6_addr *gwaddr, int ifindex); 98 const struct in6_addr *gwaddr, int ifindex);
99 #endif 99 #endif
100 100
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) 101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 { 102 {
103 struct rt6_info *rt = (struct rt6_info *) dst; 103 struct rt6_info *rt = (struct rt6_info *) dst;
104 struct inet_peer *peer; 104 struct inet_peer *peer;
105 u32 *p = NULL; 105 u32 *p = NULL;
106 106
107 if (!rt->rt6i_peer) 107 if (!rt->rt6i_peer)
108 rt6_bind_peer(rt, 1); 108 rt6_bind_peer(rt, 1);
109 109
110 peer = rt->rt6i_peer; 110 peer = rt->rt6i_peer;
111 if (peer) { 111 if (peer) {
112 u32 *old_p = __DST_METRICS_PTR(old); 112 u32 *old_p = __DST_METRICS_PTR(old);
113 unsigned long prev, new; 113 unsigned long prev, new;
114 114
115 p = peer->metrics; 115 p = peer->metrics;
116 if (inet_metrics_new(peer)) 116 if (inet_metrics_new(peer))
117 memcpy(p, old_p, sizeof(u32) * RTAX_MAX); 117 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118 118
119 new = (unsigned long) p; 119 new = (unsigned long) p;
120 prev = cmpxchg(&dst->_metrics, old, new); 120 prev = cmpxchg(&dst->_metrics, old, new);
121 121
122 if (prev != old) { 122 if (prev != old) {
123 p = __DST_METRICS_PTR(prev); 123 p = __DST_METRICS_PTR(prev);
124 if (prev & DST_METRICS_READ_ONLY) 124 if (prev & DST_METRICS_READ_ONLY)
125 p = NULL; 125 p = NULL;
126 } 126 }
127 } 127 }
128 return p; 128 return p;
129 } 129 }
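ipv6_cow_metrics() above publishes a writable metrics array with a single cmpxchg: losers of the race fall back to whichever pointer won, and give up (returning NULL) if the installed value is still marked read-only. A condensed C11 userspace sketch of the pattern (cow_publish, READ_ONLY and NMETRICS are stand-in names):

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

#define NMETRICS  16
#define READ_ONLY ((uintptr_t)1)  /* stand-in for DST_METRICS_READ_ONLY */

static uint32_t *cow_publish(_Atomic uintptr_t *slot, uintptr_t old,
                             uint32_t *writable)
{
        uint32_t *old_p = (uint32_t *)(old & ~READ_ONLY);
        uintptr_t expected = old;

        /* Seed the writable copy from the (possibly read-only) original. */
        memcpy(writable, old_p, sizeof(uint32_t) * NMETRICS);
        if (atomic_compare_exchange_strong(slot, &expected,
                                           (uintptr_t)writable))
                return writable;
        /* Lost the race; 'expected' now holds the winner's value. */
        return (expected & READ_ONLY) ? NULL : (uint32_t *)expected;
}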
130 130
131 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr) 131 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
132 { 132 {
133 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev); 133 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
134 } 134 }
135 135
136 static struct dst_ops ip6_dst_ops_template = { 136 static struct dst_ops ip6_dst_ops_template = {
137 .family = AF_INET6, 137 .family = AF_INET6,
138 .protocol = cpu_to_be16(ETH_P_IPV6), 138 .protocol = cpu_to_be16(ETH_P_IPV6),
139 .gc = ip6_dst_gc, 139 .gc = ip6_dst_gc,
140 .gc_thresh = 1024, 140 .gc_thresh = 1024,
141 .check = ip6_dst_check, 141 .check = ip6_dst_check,
142 .default_advmss = ip6_default_advmss, 142 .default_advmss = ip6_default_advmss,
143 .default_mtu = ip6_default_mtu, 143 .default_mtu = ip6_default_mtu,
144 .cow_metrics = ipv6_cow_metrics, 144 .cow_metrics = ipv6_cow_metrics,
145 .destroy = ip6_dst_destroy, 145 .destroy = ip6_dst_destroy,
146 .ifdown = ip6_dst_ifdown, 146 .ifdown = ip6_dst_ifdown,
147 .negative_advice = ip6_negative_advice, 147 .negative_advice = ip6_negative_advice,
148 .link_failure = ip6_link_failure, 148 .link_failure = ip6_link_failure,
149 .update_pmtu = ip6_rt_update_pmtu, 149 .update_pmtu = ip6_rt_update_pmtu,
150 .local_out = __ip6_local_out, 150 .local_out = __ip6_local_out,
151 .neigh_lookup = ip6_neigh_lookup, 151 .neigh_lookup = ip6_neigh_lookup,
152 }; 152 };
153 153
154 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst) 154 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
155 { 155 {
156 return 0; 156 return 0;
157 } 157 }
158 158
159 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 159 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
160 { 160 {
161 } 161 }
162 162
163 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, 163 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
164 unsigned long old) 164 unsigned long old)
165 { 165 {
166 return NULL; 166 return NULL;
167 } 167 }
168 168
169 static struct dst_ops ip6_dst_blackhole_ops = { 169 static struct dst_ops ip6_dst_blackhole_ops = {
170 .family = AF_INET6, 170 .family = AF_INET6,
171 .protocol = cpu_to_be16(ETH_P_IPV6), 171 .protocol = cpu_to_be16(ETH_P_IPV6),
172 .destroy = ip6_dst_destroy, 172 .destroy = ip6_dst_destroy,
173 .check = ip6_dst_check, 173 .check = ip6_dst_check,
174 .default_mtu = ip6_blackhole_default_mtu, 174 .default_mtu = ip6_blackhole_default_mtu,
175 .default_advmss = ip6_default_advmss, 175 .default_advmss = ip6_default_advmss,
176 .update_pmtu = ip6_rt_blackhole_update_pmtu, 176 .update_pmtu = ip6_rt_blackhole_update_pmtu,
177 .cow_metrics = ip6_rt_blackhole_cow_metrics, 177 .cow_metrics = ip6_rt_blackhole_cow_metrics,
178 .neigh_lookup = ip6_neigh_lookup, 178 .neigh_lookup = ip6_neigh_lookup,
179 }; 179 };
180 180
181 static const u32 ip6_template_metrics[RTAX_MAX] = { 181 static const u32 ip6_template_metrics[RTAX_MAX] = {
182 [RTAX_HOPLIMIT - 1] = 255, 182 [RTAX_HOPLIMIT - 1] = 255,
183 }; 183 };
184 184
185 static struct rt6_info ip6_null_entry_template = { 185 static struct rt6_info ip6_null_entry_template = {
186 .dst = { 186 .dst = {
187 .__refcnt = ATOMIC_INIT(1), 187 .__refcnt = ATOMIC_INIT(1),
188 .__use = 1, 188 .__use = 1,
189 .obsolete = -1, 189 .obsolete = -1,
190 .error = -ENETUNREACH, 190 .error = -ENETUNREACH,
191 .input = ip6_pkt_discard, 191 .input = ip6_pkt_discard,
192 .output = ip6_pkt_discard_out, 192 .output = ip6_pkt_discard_out,
193 }, 193 },
194 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 194 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
195 .rt6i_protocol = RTPROT_KERNEL, 195 .rt6i_protocol = RTPROT_KERNEL,
196 .rt6i_metric = ~(u32) 0, 196 .rt6i_metric = ~(u32) 0,
197 .rt6i_ref = ATOMIC_INIT(1), 197 .rt6i_ref = ATOMIC_INIT(1),
198 }; 198 };
199 199
200 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 200 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
201 201
202 static int ip6_pkt_prohibit(struct sk_buff *skb); 202 static int ip6_pkt_prohibit(struct sk_buff *skb);
203 static int ip6_pkt_prohibit_out(struct sk_buff *skb); 203 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
204 204
205 static struct rt6_info ip6_prohibit_entry_template = { 205 static struct rt6_info ip6_prohibit_entry_template = {
206 .dst = { 206 .dst = {
207 .__refcnt = ATOMIC_INIT(1), 207 .__refcnt = ATOMIC_INIT(1),
208 .__use = 1, 208 .__use = 1,
209 .obsolete = -1, 209 .obsolete = -1,
210 .error = -EACCES, 210 .error = -EACCES,
211 .input = ip6_pkt_prohibit, 211 .input = ip6_pkt_prohibit,
212 .output = ip6_pkt_prohibit_out, 212 .output = ip6_pkt_prohibit_out,
213 }, 213 },
214 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 214 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
215 .rt6i_protocol = RTPROT_KERNEL, 215 .rt6i_protocol = RTPROT_KERNEL,
216 .rt6i_metric = ~(u32) 0, 216 .rt6i_metric = ~(u32) 0,
217 .rt6i_ref = ATOMIC_INIT(1), 217 .rt6i_ref = ATOMIC_INIT(1),
218 }; 218 };
219 219
220 static struct rt6_info ip6_blk_hole_entry_template = { 220 static struct rt6_info ip6_blk_hole_entry_template = {
221 .dst = { 221 .dst = {
222 .__refcnt = ATOMIC_INIT(1), 222 .__refcnt = ATOMIC_INIT(1),
223 .__use = 1, 223 .__use = 1,
224 .obsolete = -1, 224 .obsolete = -1,
225 .error = -EINVAL, 225 .error = -EINVAL,
226 .input = dst_discard, 226 .input = dst_discard,
227 .output = dst_discard, 227 .output = dst_discard,
228 }, 228 },
229 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), 229 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
230 .rt6i_protocol = RTPROT_KERNEL, 230 .rt6i_protocol = RTPROT_KERNEL,
231 .rt6i_metric = ~(u32) 0, 231 .rt6i_metric = ~(u32) 0,
232 .rt6i_ref = ATOMIC_INIT(1), 232 .rt6i_ref = ATOMIC_INIT(1),
233 }; 233 };
234 234
235 #endif 235 #endif
236 236
237 /* allocate dst with ip6_dst_ops */ 237 /* allocate dst with ip6_dst_ops */
238 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops, 238 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
239 struct net_device *dev, 239 struct net_device *dev,
240 int flags) 240 int flags)
241 { 241 {
242 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags); 242 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
243 243
244 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); 244 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
245 245
246 return rt; 246 return rt;
247 } 247 }
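ip6_dst_alloc() above clears only the rt6-private tail of the route: dst_alloc() has already initialised the embedded struct dst_entry, so the memset starts at the first private member (rt6i_table). A tiny sketch of the idiom with hypothetical types (it relies on that member directly following the embedded base):

#include <string.h>

struct base_s { int refcnt; };
struct obj_s  { struct base_s base; int a; void *b; };

static void clear_private_tail(struct obj_s *o)
{
        /* Zero everything after the embedded base. */
        memset(&o->a, 0, sizeof(*o) - sizeof(struct base_s));
}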
248 248
249 static void ip6_dst_destroy(struct dst_entry *dst) 249 static void ip6_dst_destroy(struct dst_entry *dst)
250 { 250 {
251 struct rt6_info *rt = (struct rt6_info *)dst; 251 struct rt6_info *rt = (struct rt6_info *)dst;
252 struct inet6_dev *idev = rt->rt6i_idev; 252 struct inet6_dev *idev = rt->rt6i_idev;
253 struct inet_peer *peer = rt->rt6i_peer; 253 struct inet_peer *peer = rt->rt6i_peer;
254 254
255 if (idev != NULL) { 255 if (idev != NULL) {
256 rt->rt6i_idev = NULL; 256 rt->rt6i_idev = NULL;
257 in6_dev_put(idev); 257 in6_dev_put(idev);
258 } 258 }
259 if (peer) { 259 if (peer) {
260 rt->rt6i_peer = NULL; 260 rt->rt6i_peer = NULL;
261 inet_putpeer(peer); 261 inet_putpeer(peer);
262 } 262 }
263 } 263 }
264 264
265 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0); 265 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
266 266
267 static u32 rt6_peer_genid(void) 267 static u32 rt6_peer_genid(void)
268 { 268 {
269 return atomic_read(&__rt6_peer_genid); 269 return atomic_read(&__rt6_peer_genid);
270 } 270 }
271 271
272 void rt6_bind_peer(struct rt6_info *rt, int create) 272 void rt6_bind_peer(struct rt6_info *rt, int create)
273 { 273 {
274 struct inet_peer *peer; 274 struct inet_peer *peer;
275 275
276 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create); 276 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
277 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL) 277 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
278 inet_putpeer(peer); 278 inet_putpeer(peer);
279 else 279 else
280 rt->rt6i_peer_genid = rt6_peer_genid(); 280 rt->rt6i_peer_genid = rt6_peer_genid();
281 } 281 }
282 282
283 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 283 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
284 int how) 284 int how)
285 { 285 {
286 struct rt6_info *rt = (struct rt6_info *)dst; 286 struct rt6_info *rt = (struct rt6_info *)dst;
287 struct inet6_dev *idev = rt->rt6i_idev; 287 struct inet6_dev *idev = rt->rt6i_idev;
288 struct net_device *loopback_dev = 288 struct net_device *loopback_dev =
289 dev_net(dev)->loopback_dev; 289 dev_net(dev)->loopback_dev;
290 290
291 if (dev != loopback_dev && idev != NULL && idev->dev == dev) { 291 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
292 struct inet6_dev *loopback_idev = 292 struct inet6_dev *loopback_idev =
293 in6_dev_get(loopback_dev); 293 in6_dev_get(loopback_dev);
294 if (loopback_idev != NULL) { 294 if (loopback_idev != NULL) {
295 rt->rt6i_idev = loopback_idev; 295 rt->rt6i_idev = loopback_idev;
296 in6_dev_put(idev); 296 in6_dev_put(idev);
297 } 297 }
298 } 298 }
299 } 299 }
300 300
301 static __inline__ int rt6_check_expired(const struct rt6_info *rt) 301 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
302 { 302 {
303 return (rt->rt6i_flags & RTF_EXPIRES) && 303 return (rt->rt6i_flags & RTF_EXPIRES) &&
304 time_after(jiffies, rt->rt6i_expires); 304 time_after(jiffies, rt->rt6i_expires);
305 } 305 }
306 306
307 static inline int rt6_need_strict(const struct in6_addr *daddr) 307 static inline int rt6_need_strict(const struct in6_addr *daddr)
308 { 308 {
309 return ipv6_addr_type(daddr) & 309 return ipv6_addr_type(daddr) &
310 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); 310 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
311 } 311 }
312 312
313 /* 313 /*
314 * Route lookup. The relevant table->tb6_lock is assumed held. 314 * Route lookup. The relevant table->tb6_lock is assumed held.
315 */ 315 */
316 316
317 static inline struct rt6_info *rt6_device_match(struct net *net, 317 static inline struct rt6_info *rt6_device_match(struct net *net,
318 struct rt6_info *rt, 318 struct rt6_info *rt,
319 const struct in6_addr *saddr, 319 const struct in6_addr *saddr,
320 int oif, 320 int oif,
321 int flags) 321 int flags)
322 { 322 {
323 struct rt6_info *local = NULL; 323 struct rt6_info *local = NULL;
324 struct rt6_info *sprt; 324 struct rt6_info *sprt;
325 325
326 if (!oif && ipv6_addr_any(saddr)) 326 if (!oif && ipv6_addr_any(saddr))
327 goto out; 327 goto out;
328 328
329 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { 329 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
330 struct net_device *dev = sprt->rt6i_dev; 330 struct net_device *dev = sprt->rt6i_dev;
331 331
332 if (oif) { 332 if (oif) {
333 if (dev->ifindex == oif) 333 if (dev->ifindex == oif)
334 return sprt; 334 return sprt;
335 if (dev->flags & IFF_LOOPBACK) { 335 if (dev->flags & IFF_LOOPBACK) {
336 if (sprt->rt6i_idev == NULL || 336 if (sprt->rt6i_idev == NULL ||
337 sprt->rt6i_idev->dev->ifindex != oif) { 337 sprt->rt6i_idev->dev->ifindex != oif) {
338 if (flags & RT6_LOOKUP_F_IFACE && oif) 338 if (flags & RT6_LOOKUP_F_IFACE && oif)
339 continue; 339 continue;
340 if (local && (!oif || 340 if (local && (!oif ||
341 local->rt6i_idev->dev->ifindex == oif)) 341 local->rt6i_idev->dev->ifindex == oif))
342 continue; 342 continue;
343 } 343 }
344 local = sprt; 344 local = sprt;
345 } 345 }
346 } else { 346 } else {
347 if (ipv6_chk_addr(net, saddr, dev, 347 if (ipv6_chk_addr(net, saddr, dev,
348 flags & RT6_LOOKUP_F_IFACE)) 348 flags & RT6_LOOKUP_F_IFACE))
349 return sprt; 349 return sprt;
350 } 350 }
351 } 351 }
352 352
353 if (oif) { 353 if (oif) {
354 if (local) 354 if (local)
355 return local; 355 return local;
356 356
357 if (flags & RT6_LOOKUP_F_IFACE) 357 if (flags & RT6_LOOKUP_F_IFACE)
358 return net->ipv6.ip6_null_entry; 358 return net->ipv6.ip6_null_entry;
359 } 359 }
360 out: 360 out:
361 return rt; 361 return rt;
362 } 362 }
363 363
364 #ifdef CONFIG_IPV6_ROUTER_PREF 364 #ifdef CONFIG_IPV6_ROUTER_PREF
365 static void rt6_probe(struct rt6_info *rt) 365 static void rt6_probe(struct rt6_info *rt)
366 { 366 {
367 struct neighbour *neigh = rt ? dst_get_neighbour(&rt->dst) : NULL; 367 struct neighbour *neigh;
368 /* 368 /*
369 * Okay, this does not seem to be appropriate 369 * Okay, this does not seem to be appropriate
370 * for now, however, we need to check if it 370 * for now, however, we need to check if it
371 * is really so; aka Router Reachability Probing. 371 * is really so; aka Router Reachability Probing.
372 * 372 *
373 * Router Reachability Probe MUST be rate-limited 373 * Router Reachability Probe MUST be rate-limited
374 * to no more than one per minute. 374 * to no more than one per minute.
375 */ 375 */
376 rcu_read_lock();
377 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
376 if (!neigh || (neigh->nud_state & NUD_VALID)) 378 if (!neigh || (neigh->nud_state & NUD_VALID))
377 return; 379 goto out;
378 read_lock_bh(&neigh->lock); 380 read_lock_bh(&neigh->lock);
379 if (!(neigh->nud_state & NUD_VALID) && 381 if (!(neigh->nud_state & NUD_VALID) &&
380 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { 382 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
381 struct in6_addr mcaddr; 383 struct in6_addr mcaddr;
382 struct in6_addr *target; 384 struct in6_addr *target;
383 385
384 neigh->updated = jiffies; 386 neigh->updated = jiffies;
385 read_unlock_bh(&neigh->lock); 387 read_unlock_bh(&neigh->lock);
386 388
387 target = (struct in6_addr *)&neigh->primary_key; 389 target = (struct in6_addr *)&neigh->primary_key;
388 addrconf_addr_solict_mult(target, &mcaddr); 390 addrconf_addr_solict_mult(target, &mcaddr);
389 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL); 391 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
390 } else 392 } else {
391 read_unlock_bh(&neigh->lock); 393 read_unlock_bh(&neigh->lock);
394 }
395 out:
396 rcu_read_unlock();
392 } 397 }
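
The hunk above converts rt6_probe() to the discipline this commit introduces for every reader of a dst neighbour: fetch the pointer and dereference it only inside an RCU read-side critical section, since a writer may now replace and free the neighbour concurrently. A minimal sketch of that pattern follows; rt6_neigh_is_valid() is a hypothetical helper, not part of the commit.

/* Sketch only: the RCU read-side discipline used above.
 * dst_get_neighbour() is rcu_dereference() based, so the result is
 * only guaranteed to stay alive until rcu_read_unlock().
 */
static bool rt6_neigh_is_valid(struct rt6_info *rt)
{
	struct neighbour *neigh;
	bool valid = false;

	rcu_read_lock();
	neigh = dst_get_neighbour(&rt->dst);
	if (neigh)
		valid = !!(neigh->nud_state & NUD_VALID);
	rcu_read_unlock();	/* neigh must not be touched past here */

	return valid;
}
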
393 #else 398 #else
394 static inline void rt6_probe(struct rt6_info *rt) 399 static inline void rt6_probe(struct rt6_info *rt)
395 { 400 {
396 } 401 }
397 #endif 402 #endif
398 403
399 /* 404 /*
400 * Default Router Selection (RFC 2461 6.3.6) 405 * Default Router Selection (RFC 2461 6.3.6)
401 */ 406 */
402 static inline int rt6_check_dev(struct rt6_info *rt, int oif) 407 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
403 { 408 {
404 struct net_device *dev = rt->rt6i_dev; 409 struct net_device *dev = rt->rt6i_dev;
405 if (!oif || dev->ifindex == oif) 410 if (!oif || dev->ifindex == oif)
406 return 2; 411 return 2;
407 if ((dev->flags & IFF_LOOPBACK) && 412 if ((dev->flags & IFF_LOOPBACK) &&
408 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) 413 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
409 return 1; 414 return 1;
410 return 0; 415 return 0;
411 } 416 }
412 417
413 static inline int rt6_check_neigh(struct rt6_info *rt) 418 static inline int rt6_check_neigh(struct rt6_info *rt)
414 { 419 {
415 struct neighbour *neigh = dst_get_neighbour(&rt->dst); 420 struct neighbour *neigh;
416 int m; 421 int m;
422
423 rcu_read_lock();
424 neigh = dst_get_neighbour(&rt->dst);
417 if (rt->rt6i_flags & RTF_NONEXTHOP || 425 if (rt->rt6i_flags & RTF_NONEXTHOP ||
418 !(rt->rt6i_flags & RTF_GATEWAY)) 426 !(rt->rt6i_flags & RTF_GATEWAY))
419 m = 1; 427 m = 1;
420 else if (neigh) { 428 else if (neigh) {
421 read_lock_bh(&neigh->lock); 429 read_lock_bh(&neigh->lock);
422 if (neigh->nud_state & NUD_VALID) 430 if (neigh->nud_state & NUD_VALID)
423 m = 2; 431 m = 2;
424 #ifdef CONFIG_IPV6_ROUTER_PREF 432 #ifdef CONFIG_IPV6_ROUTER_PREF
425 else if (neigh->nud_state & NUD_FAILED) 433 else if (neigh->nud_state & NUD_FAILED)
426 m = 0; 434 m = 0;
427 #endif 435 #endif
428 else 436 else
429 m = 1; 437 m = 1;
430 read_unlock_bh(&neigh->lock); 438 read_unlock_bh(&neigh->lock);
431 } else 439 } else
432 m = 0; 440 m = 0;
441 rcu_read_unlock();
433 return m; 442 return m;
434 } 443 }
435 444
436 static int rt6_score_route(struct rt6_info *rt, int oif, 445 static int rt6_score_route(struct rt6_info *rt, int oif,
437 int strict) 446 int strict)
438 { 447 {
439 int m, n; 448 int m, n;
440 449
441 m = rt6_check_dev(rt, oif); 450 m = rt6_check_dev(rt, oif);
442 if (!m && (strict & RT6_LOOKUP_F_IFACE)) 451 if (!m && (strict & RT6_LOOKUP_F_IFACE))
443 return -1; 452 return -1;
444 #ifdef CONFIG_IPV6_ROUTER_PREF 453 #ifdef CONFIG_IPV6_ROUTER_PREF
445 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; 454 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
446 #endif 455 #endif
447 n = rt6_check_neigh(rt); 456 n = rt6_check_neigh(rt);
448 if (!n && (strict & RT6_LOOKUP_F_REACHABLE)) 457 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
449 return -1; 458 return -1;
450 return m; 459 return m;
451 } 460 }
452 461
453 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, 462 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
454 int *mpri, struct rt6_info *match) 463 int *mpri, struct rt6_info *match)
455 { 464 {
456 int m; 465 int m;
457 466
458 if (rt6_check_expired(rt)) 467 if (rt6_check_expired(rt))
459 goto out; 468 goto out;
460 469
461 m = rt6_score_route(rt, oif, strict); 470 m = rt6_score_route(rt, oif, strict);
462 if (m < 0) 471 if (m < 0)
463 goto out; 472 goto out;
464 473
465 if (m > *mpri) { 474 if (m > *mpri) {
466 if (strict & RT6_LOOKUP_F_REACHABLE) 475 if (strict & RT6_LOOKUP_F_REACHABLE)
467 rt6_probe(match); 476 rt6_probe(match);
468 *mpri = m; 477 *mpri = m;
469 match = rt; 478 match = rt;
470 } else if (strict & RT6_LOOKUP_F_REACHABLE) { 479 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
471 rt6_probe(rt); 480 rt6_probe(rt);
472 } 481 }
473 482
474 out: 483 out:
475 return match; 484 return match;
476 } 485 }
477 486
478 static struct rt6_info *find_rr_leaf(struct fib6_node *fn, 487 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
479 struct rt6_info *rr_head, 488 struct rt6_info *rr_head,
480 u32 metric, int oif, int strict) 489 u32 metric, int oif, int strict)
481 { 490 {
482 struct rt6_info *rt, *match; 491 struct rt6_info *rt, *match;
483 int mpri = -1; 492 int mpri = -1;
484 493
485 match = NULL; 494 match = NULL;
486 for (rt = rr_head; rt && rt->rt6i_metric == metric; 495 for (rt = rr_head; rt && rt->rt6i_metric == metric;
487 rt = rt->dst.rt6_next) 496 rt = rt->dst.rt6_next)
488 match = find_match(rt, oif, strict, &mpri, match); 497 match = find_match(rt, oif, strict, &mpri, match);
489 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; 498 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
490 rt = rt->dst.rt6_next) 499 rt = rt->dst.rt6_next)
491 match = find_match(rt, oif, strict, &mpri, match); 500 match = find_match(rt, oif, strict, &mpri, match);
492 501
493 return match; 502 return match;
494 } 503 }
495 504
496 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) 505 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
497 { 506 {
498 struct rt6_info *match, *rt0; 507 struct rt6_info *match, *rt0;
499 struct net *net; 508 struct net *net;
500 509
501 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n", 510 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
502 __func__, fn->leaf, oif); 511 __func__, fn->leaf, oif);
503 512
504 rt0 = fn->rr_ptr; 513 rt0 = fn->rr_ptr;
505 if (!rt0) 514 if (!rt0)
506 fn->rr_ptr = rt0 = fn->leaf; 515 fn->rr_ptr = rt0 = fn->leaf;
507 516
508 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); 517 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
509 518
510 if (!match && 519 if (!match &&
511 (strict & RT6_LOOKUP_F_REACHABLE)) { 520 (strict & RT6_LOOKUP_F_REACHABLE)) {
512 struct rt6_info *next = rt0->dst.rt6_next; 521 struct rt6_info *next = rt0->dst.rt6_next;
513 522
514 /* no entries matched; do round-robin */ 523 /* no entries matched; do round-robin */
515 if (!next || next->rt6i_metric != rt0->rt6i_metric) 524 if (!next || next->rt6i_metric != rt0->rt6i_metric)
516 next = fn->leaf; 525 next = fn->leaf;
517 526
518 if (next != rt0) 527 if (next != rt0)
519 fn->rr_ptr = next; 528 fn->rr_ptr = next;
520 } 529 }
521 530
522 RT6_TRACE("%s() => %p\n", 531 RT6_TRACE("%s() => %p\n",
523 __func__, match); 532 __func__, match);
524 533
525 net = dev_net(rt0->rt6i_dev); 534 net = dev_net(rt0->rt6i_dev);
526 return match ? match : net->ipv6.ip6_null_entry; 535 return match ? match : net->ipv6.ip6_null_entry;
527 } 536 }
528 537
529 #ifdef CONFIG_IPV6_ROUTE_INFO 538 #ifdef CONFIG_IPV6_ROUTE_INFO
530 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, 539 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
531 const struct in6_addr *gwaddr) 540 const struct in6_addr *gwaddr)
532 { 541 {
533 struct net *net = dev_net(dev); 542 struct net *net = dev_net(dev);
534 struct route_info *rinfo = (struct route_info *) opt; 543 struct route_info *rinfo = (struct route_info *) opt;
535 struct in6_addr prefix_buf, *prefix; 544 struct in6_addr prefix_buf, *prefix;
536 unsigned int pref; 545 unsigned int pref;
537 unsigned long lifetime; 546 unsigned long lifetime;
538 struct rt6_info *rt; 547 struct rt6_info *rt;
539 548
540 if (len < sizeof(struct route_info)) { 549 if (len < sizeof(struct route_info)) {
541 return -EINVAL; 550 return -EINVAL;
542 } 551 }
543 552
544 /* Sanity check for prefix_len and length */ 553 /* Sanity check for prefix_len and length */
545 if (rinfo->length > 3) { 554 if (rinfo->length > 3) {
546 return -EINVAL; 555 return -EINVAL;
547 } else if (rinfo->prefix_len > 128) { 556 } else if (rinfo->prefix_len > 128) {
548 return -EINVAL; 557 return -EINVAL;
549 } else if (rinfo->prefix_len > 64) { 558 } else if (rinfo->prefix_len > 64) {
550 if (rinfo->length < 2) { 559 if (rinfo->length < 2) {
551 return -EINVAL; 560 return -EINVAL;
552 } 561 }
553 } else if (rinfo->prefix_len > 0) { 562 } else if (rinfo->prefix_len > 0) {
554 if (rinfo->length < 1) { 563 if (rinfo->length < 1) {
555 return -EINVAL; 564 return -EINVAL;
556 } 565 }
557 } 566 }
558 567
559 pref = rinfo->route_pref; 568 pref = rinfo->route_pref;
560 if (pref == ICMPV6_ROUTER_PREF_INVALID) 569 if (pref == ICMPV6_ROUTER_PREF_INVALID)
561 return -EINVAL; 570 return -EINVAL;
562 571
563 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); 572 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
564 573
565 if (rinfo->length == 3) 574 if (rinfo->length == 3)
566 prefix = (struct in6_addr *)rinfo->prefix; 575 prefix = (struct in6_addr *)rinfo->prefix;
567 else { 576 else {
568 /* ipv6_addr_prefix() is safe: prefix_len was validated above */ 577 /* ipv6_addr_prefix() is safe: prefix_len was validated above */
569 ipv6_addr_prefix(&prefix_buf, 578 ipv6_addr_prefix(&prefix_buf,
570 (struct in6_addr *)rinfo->prefix, 579 (struct in6_addr *)rinfo->prefix,
571 rinfo->prefix_len); 580 rinfo->prefix_len);
572 prefix = &prefix_buf; 581 prefix = &prefix_buf;
573 } 582 }
574 583
575 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, 584 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
576 dev->ifindex); 585 dev->ifindex);
577 586
578 if (rt && !lifetime) { 587 if (rt && !lifetime) {
579 ip6_del_rt(rt); 588 ip6_del_rt(rt);
580 rt = NULL; 589 rt = NULL;
581 } 590 }
582 591
583 if (!rt && lifetime) 592 if (!rt && lifetime)
584 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex, 593 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
585 pref); 594 pref);
586 else if (rt) 595 else if (rt)
587 rt->rt6i_flags = RTF_ROUTEINFO | 596 rt->rt6i_flags = RTF_ROUTEINFO |
588 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); 597 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
589 598
590 if (rt) { 599 if (rt) {
591 if (!addrconf_finite_timeout(lifetime)) { 600 if (!addrconf_finite_timeout(lifetime)) {
592 rt->rt6i_flags &= ~RTF_EXPIRES; 601 rt->rt6i_flags &= ~RTF_EXPIRES;
593 } else { 602 } else {
594 rt->rt6i_expires = jiffies + HZ * lifetime; 603 rt->rt6i_expires = jiffies + HZ * lifetime;
595 rt->rt6i_flags |= RTF_EXPIRES; 604 rt->rt6i_flags |= RTF_EXPIRES;
596 } 605 }
597 dst_release(&rt->dst); 606 dst_release(&rt->dst);
598 } 607 }
599 return 0; 608 return 0;
600 } 609 }
601 #endif 610 #endif
602 611
603 #define BACKTRACK(__net, saddr) \ 612 #define BACKTRACK(__net, saddr) \
604 do { \ 613 do { \
605 if (rt == __net->ipv6.ip6_null_entry) { \ 614 if (rt == __net->ipv6.ip6_null_entry) { \
606 struct fib6_node *pn; \ 615 struct fib6_node *pn; \
607 while (1) { \ 616 while (1) { \
608 if (fn->fn_flags & RTN_TL_ROOT) \ 617 if (fn->fn_flags & RTN_TL_ROOT) \
609 goto out; \ 618 goto out; \
610 pn = fn->parent; \ 619 pn = fn->parent; \
611 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \ 620 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
612 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \ 621 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
613 else \ 622 else \
614 fn = pn; \ 623 fn = pn; \
615 if (fn->fn_flags & RTN_RTINFO) \ 624 if (fn->fn_flags & RTN_RTINFO) \
616 goto restart; \ 625 goto restart; \
617 } \ 626 } \
618 } \ 627 } \
619 } while(0) 628 } while(0)
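
BACKTRACK() is easier to follow with the gotos unrolled: it only runs when the lookup landed on the null entry, and it then climbs toward the tree root, descending into a parent's source subtree when one exists. A sketch of the same control flow as a plain function (hypothetical; the real macro jumps to the caller's restart/out labels instead of returning):

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					const struct in6_addr *saddr)
{
	while (!(fn->fn_flags & RTN_TL_ROOT)) {
		struct fib6_node *pn = fn->parent;

		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;	/* found info: caller restarts here */
	}
	return NULL;			/* reached the tree root: give up */
}
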
620 629
621 static struct rt6_info *ip6_pol_route_lookup(struct net *net, 630 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
622 struct fib6_table *table, 631 struct fib6_table *table,
623 struct flowi6 *fl6, int flags) 632 struct flowi6 *fl6, int flags)
624 { 633 {
625 struct fib6_node *fn; 634 struct fib6_node *fn;
626 struct rt6_info *rt; 635 struct rt6_info *rt;
627 636
628 read_lock_bh(&table->tb6_lock); 637 read_lock_bh(&table->tb6_lock);
629 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 638 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
630 restart: 639 restart:
631 rt = fn->leaf; 640 rt = fn->leaf;
632 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); 641 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
633 BACKTRACK(net, &fl6->saddr); 642 BACKTRACK(net, &fl6->saddr);
634 out: 643 out:
635 dst_use(&rt->dst, jiffies); 644 dst_use(&rt->dst, jiffies);
636 read_unlock_bh(&table->tb6_lock); 645 read_unlock_bh(&table->tb6_lock);
637 return rt; 646 return rt;
638 647
639 } 648 }
640 649
641 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, 650 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
642 const struct in6_addr *saddr, int oif, int strict) 651 const struct in6_addr *saddr, int oif, int strict)
643 { 652 {
644 struct flowi6 fl6 = { 653 struct flowi6 fl6 = {
645 .flowi6_oif = oif, 654 .flowi6_oif = oif,
646 .daddr = *daddr, 655 .daddr = *daddr,
647 }; 656 };
648 struct dst_entry *dst; 657 struct dst_entry *dst;
649 int flags = strict ? RT6_LOOKUP_F_IFACE : 0; 658 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
650 659
651 if (saddr) { 660 if (saddr) {
652 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 661 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
653 flags |= RT6_LOOKUP_F_HAS_SADDR; 662 flags |= RT6_LOOKUP_F_HAS_SADDR;
654 } 663 }
655 664
656 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); 665 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
657 if (dst->error == 0) 666 if (dst->error == 0)
658 return (struct rt6_info *) dst; 667 return (struct rt6_info *) dst;
659 668
660 dst_release(dst); 669 dst_release(dst);
661 670
662 return NULL; 671 return NULL;
663 } 672 }
664 673
665 EXPORT_SYMBOL(rt6_lookup); 674 EXPORT_SYMBOL(rt6_lookup);
666 675
667 /* ip6_ins_rt is called with FREE table->tb6_lock. 676 /* ip6_ins_rt is called with FREE table->tb6_lock.
668 It takes a new route entry; if the addition fails for any reason, 677 It takes a new route entry; if the addition fails for any reason,
669 the route is freed. In any case, if the caller does not hold a 678 the route is freed. In any case, if the caller does not hold a
670 reference, it may be destroyed. 679 reference, it may be destroyed.
671 */ 680 */
672 681
673 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) 682 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
674 { 683 {
675 int err; 684 int err;
676 struct fib6_table *table; 685 struct fib6_table *table;
677 686
678 table = rt->rt6i_table; 687 table = rt->rt6i_table;
679 write_lock_bh(&table->tb6_lock); 688 write_lock_bh(&table->tb6_lock);
680 err = fib6_add(&table->tb6_root, rt, info); 689 err = fib6_add(&table->tb6_root, rt, info);
681 write_unlock_bh(&table->tb6_lock); 690 write_unlock_bh(&table->tb6_lock);
682 691
683 return err; 692 return err;
684 } 693 }
685 694
686 int ip6_ins_rt(struct rt6_info *rt) 695 int ip6_ins_rt(struct rt6_info *rt)
687 { 696 {
688 struct nl_info info = { 697 struct nl_info info = {
689 .nl_net = dev_net(rt->rt6i_dev), 698 .nl_net = dev_net(rt->rt6i_dev),
690 }; 699 };
691 return __ip6_ins_rt(rt, &info); 700 return __ip6_ins_rt(rt, &info);
692 } 701 }
693 702
694 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort, 703 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
695 const struct in6_addr *daddr, 704 const struct in6_addr *daddr,
696 const struct in6_addr *saddr) 705 const struct in6_addr *saddr)
697 { 706 {
698 struct rt6_info *rt; 707 struct rt6_info *rt;
699 708
700 /* 709 /*
701 * Clone the route. 710 * Clone the route.
702 */ 711 */
703 712
704 rt = ip6_rt_copy(ort, daddr); 713 rt = ip6_rt_copy(ort, daddr);
705 714
706 if (rt) { 715 if (rt) {
707 struct neighbour *neigh; 716 struct neighbour *neigh;
708 int attempts = !in_softirq(); 717 int attempts = !in_softirq();
709 718
710 if (!(rt->rt6i_flags&RTF_GATEWAY)) { 719 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
711 if (rt->rt6i_dst.plen != 128 && 720 if (rt->rt6i_dst.plen != 128 &&
712 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) 721 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
713 rt->rt6i_flags |= RTF_ANYCAST; 722 rt->rt6i_flags |= RTF_ANYCAST;
714 ipv6_addr_copy(&rt->rt6i_gateway, daddr); 723 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
715 } 724 }
716 725
717 rt->rt6i_dst.plen = 128; 726 rt->rt6i_dst.plen = 128;
718 rt->rt6i_flags |= RTF_CACHE; 727 rt->rt6i_flags |= RTF_CACHE;
719 rt->dst.flags |= DST_HOST; 728 rt->dst.flags |= DST_HOST;
720 729
721 #ifdef CONFIG_IPV6_SUBTREES 730 #ifdef CONFIG_IPV6_SUBTREES
722 if (rt->rt6i_src.plen && saddr) { 731 if (rt->rt6i_src.plen && saddr) {
723 ipv6_addr_copy(&rt->rt6i_src.addr, saddr); 732 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
724 rt->rt6i_src.plen = 128; 733 rt->rt6i_src.plen = 128;
725 } 734 }
726 #endif 735 #endif
727 736
728 retry: 737 retry:
729 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); 738 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
730 if (IS_ERR(neigh)) { 739 if (IS_ERR(neigh)) {
731 struct net *net = dev_net(rt->rt6i_dev); 740 struct net *net = dev_net(rt->rt6i_dev);
732 int saved_rt_min_interval = 741 int saved_rt_min_interval =
733 net->ipv6.sysctl.ip6_rt_gc_min_interval; 742 net->ipv6.sysctl.ip6_rt_gc_min_interval;
734 int saved_rt_elasticity = 743 int saved_rt_elasticity =
735 net->ipv6.sysctl.ip6_rt_gc_elasticity; 744 net->ipv6.sysctl.ip6_rt_gc_elasticity;
736 745
737 if (attempts-- > 0) { 746 if (attempts-- > 0) {
738 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1; 747 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
739 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0; 748 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
740 749
741 ip6_dst_gc(&net->ipv6.ip6_dst_ops); 750 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
742 751
743 net->ipv6.sysctl.ip6_rt_gc_elasticity = 752 net->ipv6.sysctl.ip6_rt_gc_elasticity =
744 saved_rt_elasticity; 753 saved_rt_elasticity;
745 net->ipv6.sysctl.ip6_rt_gc_min_interval = 754 net->ipv6.sysctl.ip6_rt_gc_min_interval =
746 saved_rt_min_interval; 755 saved_rt_min_interval;
747 goto retry; 756 goto retry;
748 } 757 }
749 758
750 if (net_ratelimit()) 759 if (net_ratelimit())
751 printk(KERN_WARNING 760 printk(KERN_WARNING
752 "ipv6: Neighbour table overflow.\n"); 761 "ipv6: Neighbour table overflow.\n");
753 dst_free(&rt->dst); 762 dst_free(&rt->dst);
754 return NULL; 763 return NULL;
755 } 764 }
756 dst_set_neighbour(&rt->dst, neigh); 765 dst_set_neighbour(&rt->dst, neigh);
757 766
758 } 767 }
759 768
760 return rt; 769 return rt;
761 } 770 }
762 771
763 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, 772 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
764 const struct in6_addr *daddr) 773 const struct in6_addr *daddr)
765 { 774 {
766 struct rt6_info *rt = ip6_rt_copy(ort, daddr); 775 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
767 776
768 if (rt) { 777 if (rt) {
769 rt->rt6i_dst.plen = 128; 778 rt->rt6i_dst.plen = 128;
770 rt->rt6i_flags |= RTF_CACHE; 779 rt->rt6i_flags |= RTF_CACHE;
771 rt->dst.flags |= DST_HOST; 780 rt->dst.flags |= DST_HOST;
772 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour(&ort->dst))); 781 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
773 } 782 }
774 return rt; 783 return rt;
775 } 784 }
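
rt6_alloc_clone() now reads the neighbour through dst_get_neighbour_raw(), and ip6_pol_route() below does the same for its NULL test. The _raw accessor is for call sites like these where no rcu_read_lock() is held but safety comes from elsewhere: the caller holds a reference on the dst, or only the pointer's NULL-ness is examined (neigh_clone() then takes its own reference). For reference, the accessors this commit defines in include/net/dst.h look roughly like this (paraphrased):

static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
{
	return rcu_dereference(dst->_neighbour);	/* checked: wants RCU */
}

static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
{
	return rcu_dereference_raw(dst->_neighbour);	/* caller vouches */
}

static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh)
{
	rcu_assign_pointer(dst->_neighbour, neigh);	/* publish for readers */
}
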
776 785
777 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, 786 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
778 struct flowi6 *fl6, int flags) 787 struct flowi6 *fl6, int flags)
779 { 788 {
780 struct fib6_node *fn; 789 struct fib6_node *fn;
781 struct rt6_info *rt, *nrt; 790 struct rt6_info *rt, *nrt;
782 int strict = 0; 791 int strict = 0;
783 int attempts = 3; 792 int attempts = 3;
784 int err; 793 int err;
785 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; 794 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
786 795
787 strict |= flags & RT6_LOOKUP_F_IFACE; 796 strict |= flags & RT6_LOOKUP_F_IFACE;
788 797
789 relookup: 798 relookup:
790 read_lock_bh(&table->tb6_lock); 799 read_lock_bh(&table->tb6_lock);
791 800
792 restart_2: 801 restart_2:
793 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 802 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
794 803
795 restart: 804 restart:
796 rt = rt6_select(fn, oif, strict | reachable); 805 rt = rt6_select(fn, oif, strict | reachable);
797 806
798 BACKTRACK(net, &fl6->saddr); 807 BACKTRACK(net, &fl6->saddr);
799 if (rt == net->ipv6.ip6_null_entry || 808 if (rt == net->ipv6.ip6_null_entry ||
800 rt->rt6i_flags & RTF_CACHE) 809 rt->rt6i_flags & RTF_CACHE)
801 goto out; 810 goto out;
802 811
803 dst_hold(&rt->dst); 812 dst_hold(&rt->dst);
804 read_unlock_bh(&table->tb6_lock); 813 read_unlock_bh(&table->tb6_lock);
805 814
806 if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) 815 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
807 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); 816 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
808 else if (!(rt->dst.flags & DST_HOST)) 817 else if (!(rt->dst.flags & DST_HOST))
809 nrt = rt6_alloc_clone(rt, &fl6->daddr); 818 nrt = rt6_alloc_clone(rt, &fl6->daddr);
810 else 819 else
811 goto out2; 820 goto out2;
812 821
813 dst_release(&rt->dst); 822 dst_release(&rt->dst);
814 rt = nrt ? : net->ipv6.ip6_null_entry; 823 rt = nrt ? : net->ipv6.ip6_null_entry;
815 824
816 dst_hold(&rt->dst); 825 dst_hold(&rt->dst);
817 if (nrt) { 826 if (nrt) {
818 err = ip6_ins_rt(nrt); 827 err = ip6_ins_rt(nrt);
819 if (!err) 828 if (!err)
820 goto out2; 829 goto out2;
821 } 830 }
822 831
823 if (--attempts <= 0) 832 if (--attempts <= 0)
824 goto out2; 833 goto out2;
825 834
826 /* 835 /*
827 * Race condition! In the gap, when table->tb6_lock was 836 * Race condition! In the gap, when table->tb6_lock was
828 * released someone could insert this route. Relookup. 837 * released someone could insert this route. Relookup.
829 */ 838 */
830 dst_release(&rt->dst); 839 dst_release(&rt->dst);
831 goto relookup; 840 goto relookup;
832 841
833 out: 842 out:
834 if (reachable) { 843 if (reachable) {
835 reachable = 0; 844 reachable = 0;
836 goto restart_2; 845 goto restart_2;
837 } 846 }
838 dst_hold(&rt->dst); 847 dst_hold(&rt->dst);
839 read_unlock_bh(&table->tb6_lock); 848 read_unlock_bh(&table->tb6_lock);
840 out2: 849 out2:
841 rt->dst.lastuse = jiffies; 850 rt->dst.lastuse = jiffies;
842 rt->dst.__use++; 851 rt->dst.__use++;
843 852
844 return rt; 853 return rt;
845 } 854 }
846 855
847 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, 856 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
848 struct flowi6 *fl6, int flags) 857 struct flowi6 *fl6, int flags)
849 { 858 {
850 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); 859 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
851 } 860 }
852 861
853 void ip6_route_input(struct sk_buff *skb) 862 void ip6_route_input(struct sk_buff *skb)
854 { 863 {
855 const struct ipv6hdr *iph = ipv6_hdr(skb); 864 const struct ipv6hdr *iph = ipv6_hdr(skb);
856 struct net *net = dev_net(skb->dev); 865 struct net *net = dev_net(skb->dev);
857 int flags = RT6_LOOKUP_F_HAS_SADDR; 866 int flags = RT6_LOOKUP_F_HAS_SADDR;
858 struct flowi6 fl6 = { 867 struct flowi6 fl6 = {
859 .flowi6_iif = skb->dev->ifindex, 868 .flowi6_iif = skb->dev->ifindex,
860 .daddr = iph->daddr, 869 .daddr = iph->daddr,
861 .saddr = iph->saddr, 870 .saddr = iph->saddr,
862 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK, 871 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
863 .flowi6_mark = skb->mark, 872 .flowi6_mark = skb->mark,
864 .flowi6_proto = iph->nexthdr, 873 .flowi6_proto = iph->nexthdr,
865 }; 874 };
866 875
867 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG) 876 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
868 flags |= RT6_LOOKUP_F_IFACE; 877 flags |= RT6_LOOKUP_F_IFACE;
869 878
870 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input)); 879 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
871 } 880 }
872 881
873 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, 882 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
874 struct flowi6 *fl6, int flags) 883 struct flowi6 *fl6, int flags)
875 { 884 {
876 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); 885 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
877 } 886 }
878 887
879 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, 888 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
880 struct flowi6 *fl6) 889 struct flowi6 *fl6)
881 { 890 {
882 int flags = 0; 891 int flags = 0;
883 892
884 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) 893 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
885 flags |= RT6_LOOKUP_F_IFACE; 894 flags |= RT6_LOOKUP_F_IFACE;
886 895
887 if (!ipv6_addr_any(&fl6->saddr)) 896 if (!ipv6_addr_any(&fl6->saddr))
888 flags |= RT6_LOOKUP_F_HAS_SADDR; 897 flags |= RT6_LOOKUP_F_HAS_SADDR;
889 else if (sk) 898 else if (sk)
890 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 899 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
891 900
892 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); 901 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
893 } 902 }
894 903
895 EXPORT_SYMBOL(ip6_route_output); 904 EXPORT_SYMBOL(ip6_route_output);
896 905
897 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) 906 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
898 { 907 {
899 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; 908 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
900 struct dst_entry *new = NULL; 909 struct dst_entry *new = NULL;
901 910
902 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0); 911 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
903 if (rt) { 912 if (rt) {
904 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); 913 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
905 914
906 new = &rt->dst; 915 new = &rt->dst;
907 916
908 new->__use = 1; 917 new->__use = 1;
909 new->input = dst_discard; 918 new->input = dst_discard;
910 new->output = dst_discard; 919 new->output = dst_discard;
911 920
912 if (dst_metrics_read_only(&ort->dst)) 921 if (dst_metrics_read_only(&ort->dst))
913 new->_metrics = ort->dst._metrics; 922 new->_metrics = ort->dst._metrics;
914 else 923 else
915 dst_copy_metrics(new, &ort->dst); 924 dst_copy_metrics(new, &ort->dst);
916 rt->rt6i_idev = ort->rt6i_idev; 925 rt->rt6i_idev = ort->rt6i_idev;
917 if (rt->rt6i_idev) 926 if (rt->rt6i_idev)
918 in6_dev_hold(rt->rt6i_idev); 927 in6_dev_hold(rt->rt6i_idev);
919 rt->rt6i_expires = 0; 928 rt->rt6i_expires = 0;
920 929
921 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); 930 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
922 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; 931 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
923 rt->rt6i_metric = 0; 932 rt->rt6i_metric = 0;
924 933
925 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); 934 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
926 #ifdef CONFIG_IPV6_SUBTREES 935 #ifdef CONFIG_IPV6_SUBTREES
927 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 936 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
928 #endif 937 #endif
929 938
930 dst_free(new); 939 dst_free(new);
931 } 940 }
932 941
933 dst_release(dst_orig); 942 dst_release(dst_orig);
934 return new ? new : ERR_PTR(-ENOMEM); 943 return new ? new : ERR_PTR(-ENOMEM);
935 } 944 }
936 945
937 /* 946 /*
938 * Destination cache support functions 947 * Destination cache support functions
939 */ 948 */
940 949
941 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) 950 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
942 { 951 {
943 struct rt6_info *rt; 952 struct rt6_info *rt;
944 953
945 rt = (struct rt6_info *) dst; 954 rt = (struct rt6_info *) dst;
946 955
947 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { 956 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
948 if (rt->rt6i_peer_genid != rt6_peer_genid()) { 957 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
949 if (!rt->rt6i_peer) 958 if (!rt->rt6i_peer)
950 rt6_bind_peer(rt, 0); 959 rt6_bind_peer(rt, 0);
951 rt->rt6i_peer_genid = rt6_peer_genid(); 960 rt->rt6i_peer_genid = rt6_peer_genid();
952 } 961 }
953 return dst; 962 return dst;
954 } 963 }
955 return NULL; 964 return NULL;
956 } 965 }
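
ip6_dst_check() only reports a cached dst as still valid when the fib serial number matches the caller's cookie; while doing so it also re-checks the inetpeer binding whenever the global peer generation has moved on, binding a peer if none is attached yet. The generation counter makes invalidation O(1): bumping it forces every cached route to revalidate its peer on next use. A sketch of the write side (rt6_peer_genid_bump() is a hypothetical name, shown only to illustrate the mechanism):

/* Sketch: invalidate all cached rt6 -> inetpeer bindings at once.
 * Readers compare rt->rt6i_peer_genid against this counter in
 * ip6_dst_check() and revalidate when the values differ.
 */
static void rt6_peer_genid_bump(void)
{
	atomic_inc(&__rt6_peer_genid);
}
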
957 966
958 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 967 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
959 { 968 {
960 struct rt6_info *rt = (struct rt6_info *) dst; 969 struct rt6_info *rt = (struct rt6_info *) dst;
961 970
962 if (rt) { 971 if (rt) {
963 if (rt->rt6i_flags & RTF_CACHE) { 972 if (rt->rt6i_flags & RTF_CACHE) {
964 if (rt6_check_expired(rt)) { 973 if (rt6_check_expired(rt)) {
965 ip6_del_rt(rt); 974 ip6_del_rt(rt);
966 dst = NULL; 975 dst = NULL;
967 } 976 }
968 } else { 977 } else {
969 dst_release(dst); 978 dst_release(dst);
970 dst = NULL; 979 dst = NULL;
971 } 980 }
972 } 981 }
973 return dst; 982 return dst;
974 } 983 }
975 984
976 static void ip6_link_failure(struct sk_buff *skb) 985 static void ip6_link_failure(struct sk_buff *skb)
977 { 986 {
978 struct rt6_info *rt; 987 struct rt6_info *rt;
979 988
980 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 989 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
981 990
982 rt = (struct rt6_info *) skb_dst(skb); 991 rt = (struct rt6_info *) skb_dst(skb);
983 if (rt) { 992 if (rt) {
984 if (rt->rt6i_flags&RTF_CACHE) { 993 if (rt->rt6i_flags&RTF_CACHE) {
985 dst_set_expires(&rt->dst, 0); 994 dst_set_expires(&rt->dst, 0);
986 rt->rt6i_flags |= RTF_EXPIRES; 995 rt->rt6i_flags |= RTF_EXPIRES;
987 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) 996 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
988 rt->rt6i_node->fn_sernum = -1; 997 rt->rt6i_node->fn_sernum = -1;
989 } 998 }
990 } 999 }
991 1000
992 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1001 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
993 { 1002 {
994 struct rt6_info *rt6 = (struct rt6_info*)dst; 1003 struct rt6_info *rt6 = (struct rt6_info*)dst;
995 1004
996 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { 1005 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
997 rt6->rt6i_flags |= RTF_MODIFIED; 1006 rt6->rt6i_flags |= RTF_MODIFIED;
998 if (mtu < IPV6_MIN_MTU) { 1007 if (mtu < IPV6_MIN_MTU) {
999 u32 features = dst_metric(dst, RTAX_FEATURES); 1008 u32 features = dst_metric(dst, RTAX_FEATURES);
1000 mtu = IPV6_MIN_MTU; 1009 mtu = IPV6_MIN_MTU;
1001 features |= RTAX_FEATURE_ALLFRAG; 1010 features |= RTAX_FEATURE_ALLFRAG;
1002 dst_metric_set(dst, RTAX_FEATURES, features); 1011 dst_metric_set(dst, RTAX_FEATURES, features);
1003 } 1012 }
1004 dst_metric_set(dst, RTAX_MTU, mtu); 1013 dst_metric_set(dst, RTAX_MTU, mtu);
1005 } 1014 }
1006 } 1015 }
1007 1016
1008 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 1017 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1009 { 1018 {
1010 struct net_device *dev = dst->dev; 1019 struct net_device *dev = dst->dev;
1011 unsigned int mtu = dst_mtu(dst); 1020 unsigned int mtu = dst_mtu(dst);
1012 struct net *net = dev_net(dev); 1021 struct net *net = dev_net(dev);
1013 1022
1014 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 1023 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1015 1024
1016 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 1025 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1017 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 1026 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1018 1027
1019 /* 1028 /*
1020 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 1029 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1021 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 1030 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1022 * IPV6_MAXPLEN is also valid and means: "any MSS, 1031 * IPV6_MAXPLEN is also valid and means: "any MSS,
1023 * rely only on pmtu discovery" 1032 * rely only on pmtu discovery"
1024 */ 1033 */
1025 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 1034 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1026 mtu = IPV6_MAXPLEN; 1035 mtu = IPV6_MAXPLEN;
1027 return mtu; 1036 return mtu;
1028 } 1037 }
1029 1038
1030 static unsigned int ip6_default_mtu(const struct dst_entry *dst) 1039 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1031 { 1040 {
1032 unsigned int mtu = IPV6_MIN_MTU; 1041 unsigned int mtu = IPV6_MIN_MTU;
1033 struct inet6_dev *idev; 1042 struct inet6_dev *idev;
1034 1043
1035 rcu_read_lock(); 1044 rcu_read_lock();
1036 idev = __in6_dev_get(dst->dev); 1045 idev = __in6_dev_get(dst->dev);
1037 if (idev) 1046 if (idev)
1038 mtu = idev->cnf.mtu6; 1047 mtu = idev->cnf.mtu6;
1039 rcu_read_unlock(); 1048 rcu_read_unlock();
1040 1049
1041 return mtu; 1050 return mtu;
1042 } 1051 }
1043 1052
1044 static struct dst_entry *icmp6_dst_gc_list; 1053 static struct dst_entry *icmp6_dst_gc_list;
1045 static DEFINE_SPINLOCK(icmp6_dst_lock); 1054 static DEFINE_SPINLOCK(icmp6_dst_lock);
1046 1055
1047 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 1056 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1048 struct neighbour *neigh, 1057 struct neighbour *neigh,
1049 const struct in6_addr *addr) 1058 const struct in6_addr *addr)
1050 { 1059 {
1051 struct rt6_info *rt; 1060 struct rt6_info *rt;
1052 struct inet6_dev *idev = in6_dev_get(dev); 1061 struct inet6_dev *idev = in6_dev_get(dev);
1053 struct net *net = dev_net(dev); 1062 struct net *net = dev_net(dev);
1054 1063
1055 if (unlikely(idev == NULL)) 1064 if (unlikely(idev == NULL))
1056 return NULL; 1065 return NULL;
1057 1066
1058 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); 1067 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1059 if (unlikely(rt == NULL)) { 1068 if (unlikely(rt == NULL)) {
1060 in6_dev_put(idev); 1069 in6_dev_put(idev);
1061 goto out; 1070 goto out;
1062 } 1071 }
1063 1072
1064 if (neigh) 1073 if (neigh)
1065 neigh_hold(neigh); 1074 neigh_hold(neigh);
1066 else { 1075 else {
1067 neigh = ndisc_get_neigh(dev, addr); 1076 neigh = ndisc_get_neigh(dev, addr);
1068 if (IS_ERR(neigh)) 1077 if (IS_ERR(neigh))
1069 neigh = NULL; 1078 neigh = NULL;
1070 } 1079 }
1071 1080
1072 rt->rt6i_idev = idev; 1081 rt->rt6i_idev = idev;
1073 dst_set_neighbour(&rt->dst, neigh); 1082 dst_set_neighbour(&rt->dst, neigh);
1074 atomic_set(&rt->dst.__refcnt, 1); 1083 atomic_set(&rt->dst.__refcnt, 1);
1075 ipv6_addr_copy(&rt->rt6i_dst.addr, addr); 1084 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1076 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); 1085 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1077 rt->dst.output = ip6_output; 1086 rt->dst.output = ip6_output;
1078 1087
1079 spin_lock_bh(&icmp6_dst_lock); 1088 spin_lock_bh(&icmp6_dst_lock);
1080 rt->dst.next = icmp6_dst_gc_list; 1089 rt->dst.next = icmp6_dst_gc_list;
1081 icmp6_dst_gc_list = &rt->dst; 1090 icmp6_dst_gc_list = &rt->dst;
1082 spin_unlock_bh(&icmp6_dst_lock); 1091 spin_unlock_bh(&icmp6_dst_lock);
1083 1092
1084 fib6_force_start_gc(net); 1093 fib6_force_start_gc(net);
1085 1094
1086 out: 1095 out:
1087 return &rt->dst; 1096 return &rt->dst;
1088 } 1097 }
1089 1098
1090 int icmp6_dst_gc(void) 1099 int icmp6_dst_gc(void)
1091 { 1100 {
1092 struct dst_entry *dst, **pprev; 1101 struct dst_entry *dst, **pprev;
1093 int more = 0; 1102 int more = 0;
1094 1103
1095 spin_lock_bh(&icmp6_dst_lock); 1104 spin_lock_bh(&icmp6_dst_lock);
1096 pprev = &icmp6_dst_gc_list; 1105 pprev = &icmp6_dst_gc_list;
1097 1106
1098 while ((dst = *pprev) != NULL) { 1107 while ((dst = *pprev) != NULL) {
1099 if (!atomic_read(&dst->__refcnt)) { 1108 if (!atomic_read(&dst->__refcnt)) {
1100 *pprev = dst->next; 1109 *pprev = dst->next;
1101 dst_free(dst); 1110 dst_free(dst);
1102 } else { 1111 } else {
1103 pprev = &dst->next; 1112 pprev = &dst->next;
1104 ++more; 1113 ++more;
1105 } 1114 }
1106 } 1115 }
1107 1116
1108 spin_unlock_bh(&icmp6_dst_lock); 1117 spin_unlock_bh(&icmp6_dst_lock);
1109 1118
1110 return more; 1119 return more;
1111 } 1120 }
1112 1121
1113 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), 1122 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1114 void *arg) 1123 void *arg)
1115 { 1124 {
1116 struct dst_entry *dst, **pprev; 1125 struct dst_entry *dst, **pprev;
1117 1126
1118 spin_lock_bh(&icmp6_dst_lock); 1127 spin_lock_bh(&icmp6_dst_lock);
1119 pprev = &icmp6_dst_gc_list; 1128 pprev = &icmp6_dst_gc_list;
1120 while ((dst = *pprev) != NULL) { 1129 while ((dst = *pprev) != NULL) {
1121 struct rt6_info *rt = (struct rt6_info *) dst; 1130 struct rt6_info *rt = (struct rt6_info *) dst;
1122 if (func(rt, arg)) { 1131 if (func(rt, arg)) {
1123 *pprev = dst->next; 1132 *pprev = dst->next;
1124 dst_free(dst); 1133 dst_free(dst);
1125 } else { 1134 } else {
1126 pprev = &dst->next; 1135 pprev = &dst->next;
1127 } 1136 }
1128 } 1137 }
1129 spin_unlock_bh(&icmp6_dst_lock); 1138 spin_unlock_bh(&icmp6_dst_lock);
1130 } 1139 }
1131 1140
1132 static int ip6_dst_gc(struct dst_ops *ops) 1141 static int ip6_dst_gc(struct dst_ops *ops)
1133 { 1142 {
1134 unsigned long now = jiffies; 1143 unsigned long now = jiffies;
1135 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 1144 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1136 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 1145 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1137 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 1146 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1138 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 1147 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1139 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 1148 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1140 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 1149 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1141 int entries; 1150 int entries;
1142 1151
1143 entries = dst_entries_get_fast(ops); 1152 entries = dst_entries_get_fast(ops);
1144 if (time_after(rt_last_gc + rt_min_interval, now) && 1153 if (time_after(rt_last_gc + rt_min_interval, now) &&
1145 entries <= rt_max_size) 1154 entries <= rt_max_size)
1146 goto out; 1155 goto out;
1147 1156
1148 net->ipv6.ip6_rt_gc_expire++; 1157 net->ipv6.ip6_rt_gc_expire++;
1149 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); 1158 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1150 net->ipv6.ip6_rt_last_gc = now; 1159 net->ipv6.ip6_rt_last_gc = now;
1151 entries = dst_entries_get_slow(ops); 1160 entries = dst_entries_get_slow(ops);
1152 if (entries < ops->gc_thresh) 1161 if (entries < ops->gc_thresh)
1153 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 1162 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1154 out: 1163 out:
1155 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 1164 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1156 return entries > rt_max_size; 1165 return entries > rt_max_size;
1157 } 1166 }
1158 1167
1159 /* Clean the host part of a prefix. Not necessary with a radix tree, 1168 /* Clean the host part of a prefix. Not necessary with a radix tree,
1160 but it results in cleaner routing tables. 1169 but it results in cleaner routing tables.
1161 1170
1162 Remove it only once everything is known to work! 1171 Remove it only once everything is known to work!
1163 */ 1172 */
1164 1173
1165 int ip6_dst_hoplimit(struct dst_entry *dst) 1174 int ip6_dst_hoplimit(struct dst_entry *dst)
1166 { 1175 {
1167 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); 1176 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1168 if (hoplimit == 0) { 1177 if (hoplimit == 0) {
1169 struct net_device *dev = dst->dev; 1178 struct net_device *dev = dst->dev;
1170 struct inet6_dev *idev; 1179 struct inet6_dev *idev;
1171 1180
1172 rcu_read_lock(); 1181 rcu_read_lock();
1173 idev = __in6_dev_get(dev); 1182 idev = __in6_dev_get(dev);
1174 if (idev) 1183 if (idev)
1175 hoplimit = idev->cnf.hop_limit; 1184 hoplimit = idev->cnf.hop_limit;
1176 else 1185 else
1177 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; 1186 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1178 rcu_read_unlock(); 1187 rcu_read_unlock();
1179 } 1188 }
1180 return hoplimit; 1189 return hoplimit;
1181 } 1190 }
1182 EXPORT_SYMBOL(ip6_dst_hoplimit); 1191 EXPORT_SYMBOL(ip6_dst_hoplimit);
1183 1192
1184 /* 1193 /*
1185 * 1194 *
1186 */ 1195 */
1187 1196
1188 int ip6_route_add(struct fib6_config *cfg) 1197 int ip6_route_add(struct fib6_config *cfg)
1189 { 1198 {
1190 int err; 1199 int err;
1191 struct net *net = cfg->fc_nlinfo.nl_net; 1200 struct net *net = cfg->fc_nlinfo.nl_net;
1192 struct rt6_info *rt = NULL; 1201 struct rt6_info *rt = NULL;
1193 struct net_device *dev = NULL; 1202 struct net_device *dev = NULL;
1194 struct inet6_dev *idev = NULL; 1203 struct inet6_dev *idev = NULL;
1195 struct fib6_table *table; 1204 struct fib6_table *table;
1196 int addr_type; 1205 int addr_type;
1197 1206
1198 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) 1207 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1199 return -EINVAL; 1208 return -EINVAL;
1200 #ifndef CONFIG_IPV6_SUBTREES 1209 #ifndef CONFIG_IPV6_SUBTREES
1201 if (cfg->fc_src_len) 1210 if (cfg->fc_src_len)
1202 return -EINVAL; 1211 return -EINVAL;
1203 #endif 1212 #endif
1204 if (cfg->fc_ifindex) { 1213 if (cfg->fc_ifindex) {
1205 err = -ENODEV; 1214 err = -ENODEV;
1206 dev = dev_get_by_index(net, cfg->fc_ifindex); 1215 dev = dev_get_by_index(net, cfg->fc_ifindex);
1207 if (!dev) 1216 if (!dev)
1208 goto out; 1217 goto out;
1209 idev = in6_dev_get(dev); 1218 idev = in6_dev_get(dev);
1210 if (!idev) 1219 if (!idev)
1211 goto out; 1220 goto out;
1212 } 1221 }
1213 1222
1214 if (cfg->fc_metric == 0) 1223 if (cfg->fc_metric == 0)
1215 cfg->fc_metric = IP6_RT_PRIO_USER; 1224 cfg->fc_metric = IP6_RT_PRIO_USER;
1216 1225
1217 table = fib6_new_table(net, cfg->fc_table); 1226 table = fib6_new_table(net, cfg->fc_table);
1218 if (table == NULL) { 1227 if (table == NULL) {
1219 err = -ENOBUFS; 1228 err = -ENOBUFS;
1220 goto out; 1229 goto out;
1221 } 1230 }
1222 1231
1223 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT); 1232 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1224 1233
1225 if (rt == NULL) { 1234 if (rt == NULL) {
1226 err = -ENOMEM; 1235 err = -ENOMEM;
1227 goto out; 1236 goto out;
1228 } 1237 }
1229 1238
1230 rt->dst.obsolete = -1; 1239 rt->dst.obsolete = -1;
1231 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ? 1240 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1232 jiffies + clock_t_to_jiffies(cfg->fc_expires) : 1241 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1233 0; 1242 0;
1234 1243
1235 if (cfg->fc_protocol == RTPROT_UNSPEC) 1244 if (cfg->fc_protocol == RTPROT_UNSPEC)
1236 cfg->fc_protocol = RTPROT_BOOT; 1245 cfg->fc_protocol = RTPROT_BOOT;
1237 rt->rt6i_protocol = cfg->fc_protocol; 1246 rt->rt6i_protocol = cfg->fc_protocol;
1238 1247
1239 addr_type = ipv6_addr_type(&cfg->fc_dst); 1248 addr_type = ipv6_addr_type(&cfg->fc_dst);
1240 1249
1241 if (addr_type & IPV6_ADDR_MULTICAST) 1250 if (addr_type & IPV6_ADDR_MULTICAST)
1242 rt->dst.input = ip6_mc_input; 1251 rt->dst.input = ip6_mc_input;
1243 else if (cfg->fc_flags & RTF_LOCAL) 1252 else if (cfg->fc_flags & RTF_LOCAL)
1244 rt->dst.input = ip6_input; 1253 rt->dst.input = ip6_input;
1245 else 1254 else
1246 rt->dst.input = ip6_forward; 1255 rt->dst.input = ip6_forward;
1247 1256
1248 rt->dst.output = ip6_output; 1257 rt->dst.output = ip6_output;
1249 1258
1250 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 1259 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1251 rt->rt6i_dst.plen = cfg->fc_dst_len; 1260 rt->rt6i_dst.plen = cfg->fc_dst_len;
1252 if (rt->rt6i_dst.plen == 128) 1261 if (rt->rt6i_dst.plen == 128)
1253 rt->dst.flags |= DST_HOST; 1262 rt->dst.flags |= DST_HOST;
1254 1263
1255 #ifdef CONFIG_IPV6_SUBTREES 1264 #ifdef CONFIG_IPV6_SUBTREES
1256 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 1265 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1257 rt->rt6i_src.plen = cfg->fc_src_len; 1266 rt->rt6i_src.plen = cfg->fc_src_len;
1258 #endif 1267 #endif
1259 1268
1260 rt->rt6i_metric = cfg->fc_metric; 1269 rt->rt6i_metric = cfg->fc_metric;
1261 1270
1262 /* We cannot add true routes via loopback here, 1271 /* We cannot add true routes via loopback here,
1263 they would result in kernel looping; promote them to reject routes 1272 they would result in kernel looping; promote them to reject routes
1264 */ 1273 */
1265 if ((cfg->fc_flags & RTF_REJECT) || 1274 if ((cfg->fc_flags & RTF_REJECT) ||
1266 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK) 1275 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1267 && !(cfg->fc_flags&RTF_LOCAL))) { 1276 && !(cfg->fc_flags&RTF_LOCAL))) {
1268 /* hold loopback dev/idev if we haven't done so. */ 1277 /* hold loopback dev/idev if we haven't done so. */
1269 if (dev != net->loopback_dev) { 1278 if (dev != net->loopback_dev) {
1270 if (dev) { 1279 if (dev) {
1271 dev_put(dev); 1280 dev_put(dev);
1272 in6_dev_put(idev); 1281 in6_dev_put(idev);
1273 } 1282 }
1274 dev = net->loopback_dev; 1283 dev = net->loopback_dev;
1275 dev_hold(dev); 1284 dev_hold(dev);
1276 idev = in6_dev_get(dev); 1285 idev = in6_dev_get(dev);
1277 if (!idev) { 1286 if (!idev) {
1278 err = -ENODEV; 1287 err = -ENODEV;
1279 goto out; 1288 goto out;
1280 } 1289 }
1281 } 1290 }
1282 rt->dst.output = ip6_pkt_discard_out; 1291 rt->dst.output = ip6_pkt_discard_out;
1283 rt->dst.input = ip6_pkt_discard; 1292 rt->dst.input = ip6_pkt_discard;
1284 rt->dst.error = -ENETUNREACH; 1293 rt->dst.error = -ENETUNREACH;
1285 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 1294 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1286 goto install_route; 1295 goto install_route;
1287 } 1296 }
1288 1297
1289 if (cfg->fc_flags & RTF_GATEWAY) { 1298 if (cfg->fc_flags & RTF_GATEWAY) {
1290 const struct in6_addr *gw_addr; 1299 const struct in6_addr *gw_addr;
1291 int gwa_type; 1300 int gwa_type;
1292 1301
1293 gw_addr = &cfg->fc_gateway; 1302 gw_addr = &cfg->fc_gateway;
1294 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr); 1303 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1295 gwa_type = ipv6_addr_type(gw_addr); 1304 gwa_type = ipv6_addr_type(gw_addr);
1296 1305
1297 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { 1306 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1298 struct rt6_info *grt; 1307 struct rt6_info *grt;
1299 1308
1300 /* IPv6 strictly prohibits using non-link-local 1309 /* IPv6 strictly prohibits using non-link-local
1301 addresses as a nexthop address; otherwise the 1310 addresses as a nexthop address; otherwise the
1302 router would not be able to send redirects. 1311 router would not be able to send redirects.
1303 That is a good rule, but in some (rare!) circumstances 1312 That is a good rule, but in some (rare!) circumstances
1304 (SIT, PtP, NBMA NOARP links) it is handy to allow 1313 (SIT, PtP, NBMA NOARP links) it is handy to allow
1305 some exceptions. --ANK 1314 some exceptions. --ANK
1306 */ 1315 */
1307 err = -EINVAL; 1316 err = -EINVAL;
1308 if (!(gwa_type&IPV6_ADDR_UNICAST)) 1317 if (!(gwa_type&IPV6_ADDR_UNICAST))
1309 goto out; 1318 goto out;
1310 1319
1311 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); 1320 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1312 1321
1313 err = -EHOSTUNREACH; 1322 err = -EHOSTUNREACH;
1314 if (grt == NULL) 1323 if (grt == NULL)
1315 goto out; 1324 goto out;
1316 if (dev) { 1325 if (dev) {
1317 if (dev != grt->rt6i_dev) { 1326 if (dev != grt->rt6i_dev) {
1318 dst_release(&grt->dst); 1327 dst_release(&grt->dst);
1319 goto out; 1328 goto out;
1320 } 1329 }
1321 } else { 1330 } else {
1322 dev = grt->rt6i_dev; 1331 dev = grt->rt6i_dev;
1323 idev = grt->rt6i_idev; 1332 idev = grt->rt6i_idev;
1324 dev_hold(dev); 1333 dev_hold(dev);
1325 in6_dev_hold(grt->rt6i_idev); 1334 in6_dev_hold(grt->rt6i_idev);
1326 } 1335 }
1327 if (!(grt->rt6i_flags&RTF_GATEWAY)) 1336 if (!(grt->rt6i_flags&RTF_GATEWAY))
1328 err = 0; 1337 err = 0;
1329 dst_release(&grt->dst); 1338 dst_release(&grt->dst);
1330 1339
1331 if (err) 1340 if (err)
1332 goto out; 1341 goto out;
1333 } 1342 }
1334 err = -EINVAL; 1343 err = -EINVAL;
1335 if (dev == NULL || (dev->flags&IFF_LOOPBACK)) 1344 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1336 goto out; 1345 goto out;
1337 } 1346 }
1338 1347
1339 err = -ENODEV; 1348 err = -ENODEV;
1340 if (dev == NULL) 1349 if (dev == NULL)
1341 goto out; 1350 goto out;
1342 1351
1343 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 1352 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1344 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 1353 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1345 err = -EINVAL; 1354 err = -EINVAL;
1346 goto out; 1355 goto out;
1347 } 1356 }
1348 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc); 1357 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1349 rt->rt6i_prefsrc.plen = 128; 1358 rt->rt6i_prefsrc.plen = 128;
1350 } else 1359 } else
1351 rt->rt6i_prefsrc.plen = 0; 1360 rt->rt6i_prefsrc.plen = 0;
1352 1361
1353 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { 1362 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1354 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); 1363 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1355 if (IS_ERR(n)) { 1364 if (IS_ERR(n)) {
1356 err = PTR_ERR(n); 1365 err = PTR_ERR(n);
1357 goto out; 1366 goto out;
1358 } 1367 }
1359 dst_set_neighbour(&rt->dst, n); 1368 dst_set_neighbour(&rt->dst, n);
1360 } 1369 }
1361 1370
1362 rt->rt6i_flags = cfg->fc_flags; 1371 rt->rt6i_flags = cfg->fc_flags;
1363 1372
1364 install_route: 1373 install_route:
1365 if (cfg->fc_mx) { 1374 if (cfg->fc_mx) {
1366 struct nlattr *nla; 1375 struct nlattr *nla;
1367 int remaining; 1376 int remaining;
1368 1377
1369 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 1378 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1370 int type = nla_type(nla); 1379 int type = nla_type(nla);
1371 1380
1372 if (type) { 1381 if (type) {
1373 if (type > RTAX_MAX) { 1382 if (type > RTAX_MAX) {
1374 err = -EINVAL; 1383 err = -EINVAL;
1375 goto out; 1384 goto out;
1376 } 1385 }
1377 1386
1378 dst_metric_set(&rt->dst, type, nla_get_u32(nla)); 1387 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1379 } 1388 }
1380 } 1389 }
1381 } 1390 }
1382 1391
1383 rt->dst.dev = dev; 1392 rt->dst.dev = dev;
1384 rt->rt6i_idev = idev; 1393 rt->rt6i_idev = idev;
1385 rt->rt6i_table = table; 1394 rt->rt6i_table = table;
1386 1395
1387 cfg->fc_nlinfo.nl_net = dev_net(dev); 1396 cfg->fc_nlinfo.nl_net = dev_net(dev);
1388 1397
1389 return __ip6_ins_rt(rt, &cfg->fc_nlinfo); 1398 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1390 1399
1391 out: 1400 out:
1392 if (dev) 1401 if (dev)
1393 dev_put(dev); 1402 dev_put(dev);
1394 if (idev) 1403 if (idev)
1395 in6_dev_put(idev); 1404 in6_dev_put(idev);
1396 if (rt) 1405 if (rt)
1397 dst_free(&rt->dst); 1406 dst_free(&rt->dst);
1398 return err; 1407 return err;
1399 } 1408 }
1400 1409
1401 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 1410 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1402 { 1411 {
1403 int err; 1412 int err;
1404 struct fib6_table *table; 1413 struct fib6_table *table;
1405 struct net *net = dev_net(rt->rt6i_dev); 1414 struct net *net = dev_net(rt->rt6i_dev);
1406 1415
1407 if (rt == net->ipv6.ip6_null_entry) 1416 if (rt == net->ipv6.ip6_null_entry)
1408 return -ENOENT; 1417 return -ENOENT;
1409 1418
1410 table = rt->rt6i_table; 1419 table = rt->rt6i_table;
1411 write_lock_bh(&table->tb6_lock); 1420 write_lock_bh(&table->tb6_lock);
1412 1421
1413 err = fib6_del(rt, info); 1422 err = fib6_del(rt, info);
1414 dst_release(&rt->dst); 1423 dst_release(&rt->dst);
1415 1424
1416 write_unlock_bh(&table->tb6_lock); 1425 write_unlock_bh(&table->tb6_lock);
1417 1426
1418 return err; 1427 return err;
1419 } 1428 }
1420 1429
1421 int ip6_del_rt(struct rt6_info *rt) 1430 int ip6_del_rt(struct rt6_info *rt)
1422 { 1431 {
1423 struct nl_info info = { 1432 struct nl_info info = {
1424 .nl_net = dev_net(rt->rt6i_dev), 1433 .nl_net = dev_net(rt->rt6i_dev),
1425 }; 1434 };
1426 return __ip6_del_rt(rt, &info); 1435 return __ip6_del_rt(rt, &info);
1427 } 1436 }
1428 1437
1429 static int ip6_route_del(struct fib6_config *cfg) 1438 static int ip6_route_del(struct fib6_config *cfg)
1430 { 1439 {
1431 struct fib6_table *table; 1440 struct fib6_table *table;
1432 struct fib6_node *fn; 1441 struct fib6_node *fn;
1433 struct rt6_info *rt; 1442 struct rt6_info *rt;
1434 int err = -ESRCH; 1443 int err = -ESRCH;
1435 1444
1436 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 1445 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1437 if (table == NULL) 1446 if (table == NULL)
1438 return err; 1447 return err;
1439 1448
1440 read_lock_bh(&table->tb6_lock); 1449 read_lock_bh(&table->tb6_lock);
1441 1450
1442 fn = fib6_locate(&table->tb6_root, 1451 fn = fib6_locate(&table->tb6_root,
1443 &cfg->fc_dst, cfg->fc_dst_len, 1452 &cfg->fc_dst, cfg->fc_dst_len,
1444 &cfg->fc_src, cfg->fc_src_len); 1453 &cfg->fc_src, cfg->fc_src_len);
1445 1454
1446 if (fn) { 1455 if (fn) {
1447 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 1456 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1448 if (cfg->fc_ifindex && 1457 if (cfg->fc_ifindex &&
1449 (rt->rt6i_dev == NULL || 1458 (rt->rt6i_dev == NULL ||
1450 rt->rt6i_dev->ifindex != cfg->fc_ifindex)) 1459 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1451 continue; 1460 continue;
1452 if (cfg->fc_flags & RTF_GATEWAY && 1461 if (cfg->fc_flags & RTF_GATEWAY &&
1453 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 1462 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1454 continue; 1463 continue;
1455 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 1464 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1456 continue; 1465 continue;
1457 dst_hold(&rt->dst); 1466 dst_hold(&rt->dst);
1458 read_unlock_bh(&table->tb6_lock); 1467 read_unlock_bh(&table->tb6_lock);
1459 1468
1460 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 1469 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1461 } 1470 }
1462 } 1471 }
1463 read_unlock_bh(&table->tb6_lock); 1472 read_unlock_bh(&table->tb6_lock);
1464 1473
1465 return err; 1474 return err;
1466 } 1475 }
1467 1476
1468 /* 1477 /*
1469 * Handle redirects 1478 * Handle redirects
1470 */ 1479 */
1471 struct ip6rd_flowi { 1480 struct ip6rd_flowi {
1472 struct flowi6 fl6; 1481 struct flowi6 fl6;
1473 struct in6_addr gateway; 1482 struct in6_addr gateway;
1474 }; 1483 };
1475 1484
1476 static struct rt6_info *__ip6_route_redirect(struct net *net, 1485 static struct rt6_info *__ip6_route_redirect(struct net *net,
1477 struct fib6_table *table, 1486 struct fib6_table *table,
1478 struct flowi6 *fl6, 1487 struct flowi6 *fl6,
1479 int flags) 1488 int flags)
1480 { 1489 {
1481 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 1490 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1482 struct rt6_info *rt; 1491 struct rt6_info *rt;
1483 struct fib6_node *fn; 1492 struct fib6_node *fn;
1484 1493
1485 /* 1494 /*
1486 * Get the "current" route for this destination and 1495 * Get the "current" route for this destination and
1487 * check if the redirect has come from an appropriate router. 1496 * check if the redirect has come from an appropriate router.
1488 * 1497 *
1489 * RFC 2461 specifies that redirects should only be 1498 * RFC 2461 specifies that redirects should only be
1490 * accepted if they come from the nexthop to the target. 1499 * accepted if they come from the nexthop to the target.
1491 * Due to the way the routes are chosen, this notion 1500 * Due to the way the routes are chosen, this notion
1492 * is a bit fuzzy and one might need to check all possible 1501 * is a bit fuzzy and one might need to check all possible
1493 * routes. 1502 * routes.
1494 */ 1503 */
1495 1504
1496 read_lock_bh(&table->tb6_lock); 1505 read_lock_bh(&table->tb6_lock);
1497 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 1506 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1498 restart: 1507 restart:
1499 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 1508 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1500 /* 1509 /*
1501 * Current route is on-link; redirect is always invalid. 1510 * Current route is on-link; redirect is always invalid.
1502 * 1511 *
1503 * It seems the previous statement is not true: there could 1512 * It seems the previous statement is not true: there could
1504 * be a node which regards us as on-link (e.g. proxy ndisc), 1513 * be a node which regards us as on-link (e.g. proxy ndisc),
1505 * but the router serving it might decide that we should 1514 * but the router serving it might decide that we should
1506 * know the truth 8)8) --ANK (980726). 1515 * know the truth 8)8) --ANK (980726).
1507 */ 1516 */
1508 if (rt6_check_expired(rt)) 1517 if (rt6_check_expired(rt))
1509 continue; 1518 continue;
1510 if (!(rt->rt6i_flags & RTF_GATEWAY)) 1519 if (!(rt->rt6i_flags & RTF_GATEWAY))
1511 continue; 1520 continue;
1512 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex) 1521 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1513 continue; 1522 continue;
1514 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) 1523 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1515 continue; 1524 continue;
1516 break; 1525 break;
1517 } 1526 }
1518 1527
1519 if (!rt) 1528 if (!rt)
1520 rt = net->ipv6.ip6_null_entry; 1529 rt = net->ipv6.ip6_null_entry;
1521 BACKTRACK(net, &fl6->saddr); 1530 BACKTRACK(net, &fl6->saddr);
1522 out: 1531 out:
1523 dst_hold(&rt->dst); 1532 dst_hold(&rt->dst);
1524 1533
1525 read_unlock_bh(&table->tb6_lock); 1534 read_unlock_bh(&table->tb6_lock);
1526 1535
1527 return rt; 1536 return rt;
1528 }; 1537 };
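
The loop above encodes the RFC 2461 acceptance rule: a redirect is honoured only if it arrives from the router currently serving as nexthop toward the destination. A user-space reduction of the four checks, on a hypothetical simplified route record (names here are illustrative, not kernel API):

#include <stdbool.h>
#include <string.h>
#include <netinet/in.h>

struct demo_route {
        struct in6_addr gateway;
        int ifindex;
        bool is_gateway;
        bool expired;
};

static bool redirect_from_current_router(const struct demo_route *rt, size_t n,
                                         const struct in6_addr *src, int ifindex)
{
        for (size_t i = 0; i < n; i++) {
                if (rt[i].expired)              /* rt6_check_expired() */
                        continue;
                if (!rt[i].is_gateway)          /* RTF_GATEWAY */
                        continue;
                if (rt[i].ifindex != ifindex)   /* flowi6_oif match */
                        continue;
                if (!memcmp(&rt[i].gateway, src, sizeof(*src)))
                        return true;            /* redirect source is our nexthop */
        }
        return false;
}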
1529 1538
1530 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest, 1539 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1531 const struct in6_addr *src, 1540 const struct in6_addr *src,
1532 const struct in6_addr *gateway, 1541 const struct in6_addr *gateway,
1533 struct net_device *dev) 1542 struct net_device *dev)
1534 { 1543 {
1535 int flags = RT6_LOOKUP_F_HAS_SADDR; 1544 int flags = RT6_LOOKUP_F_HAS_SADDR;
1536 struct net *net = dev_net(dev); 1545 struct net *net = dev_net(dev);
1537 struct ip6rd_flowi rdfl = { 1546 struct ip6rd_flowi rdfl = {
1538 .fl6 = { 1547 .fl6 = {
1539 .flowi6_oif = dev->ifindex, 1548 .flowi6_oif = dev->ifindex,
1540 .daddr = *dest, 1549 .daddr = *dest,
1541 .saddr = *src, 1550 .saddr = *src,
1542 }, 1551 },
1543 }; 1552 };
1544 1553
1545 ipv6_addr_copy(&rdfl.gateway, gateway); 1554 ipv6_addr_copy(&rdfl.gateway, gateway);
1546 1555
1547 if (rt6_need_strict(dest)) 1556 if (rt6_need_strict(dest))
1548 flags |= RT6_LOOKUP_F_IFACE; 1557 flags |= RT6_LOOKUP_F_IFACE;
1549 1558
1550 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6, 1559 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1551 flags, __ip6_route_redirect); 1560 flags, __ip6_route_redirect);
1552 } 1561 }
1553 1562
1554 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, 1563 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1555 const struct in6_addr *saddr, 1564 const struct in6_addr *saddr,
1556 struct neighbour *neigh, u8 *lladdr, int on_link) 1565 struct neighbour *neigh, u8 *lladdr, int on_link)
1557 { 1566 {
1558 struct rt6_info *rt, *nrt = NULL; 1567 struct rt6_info *rt, *nrt = NULL;
1559 struct netevent_redirect netevent; 1568 struct netevent_redirect netevent;
1560 struct net *net = dev_net(neigh->dev); 1569 struct net *net = dev_net(neigh->dev);
1561 1570
1562 rt = ip6_route_redirect(dest, src, saddr, neigh->dev); 1571 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1563 1572
1564 if (rt == net->ipv6.ip6_null_entry) { 1573 if (rt == net->ipv6.ip6_null_entry) {
1565 if (net_ratelimit()) 1574 if (net_ratelimit())
1566 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " 1575 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1567 "for redirect target\n"); 1576 "for redirect target\n");
1568 goto out; 1577 goto out;
1569 } 1578 }
1570 1579
1571 /* 1580 /*
1572 * We have finally decided to accept it. 1581 * We have finally decided to accept it.
1573 */ 1582 */
1574 1583
1575 neigh_update(neigh, lladdr, NUD_STALE, 1584 neigh_update(neigh, lladdr, NUD_STALE,
1576 NEIGH_UPDATE_F_WEAK_OVERRIDE| 1585 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1577 NEIGH_UPDATE_F_OVERRIDE| 1586 NEIGH_UPDATE_F_OVERRIDE|
1578 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 1587 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1579 NEIGH_UPDATE_F_ISROUTER)) 1588 NEIGH_UPDATE_F_ISROUTER))
1580 ); 1589 );
1581 1590
1582 /* 1591 /*
1583 * Redirect received -> path was valid. 1592 * Redirect received -> path was valid.
1584 * Look, redirects are sent only in response to data packets, 1593 * Look, redirects are sent only in response to data packets,
1585 * so this nexthop is apparently reachable. --ANK 1594 * so this nexthop is apparently reachable. --ANK
1586 */ 1595 */
1587 dst_confirm(&rt->dst); 1596 dst_confirm(&rt->dst);
1588 1597
1589 /* Duplicate redirect: silently ignore. */ 1598 /* Duplicate redirect: silently ignore. */
1590 if (neigh == dst_get_neighbour(&rt->dst)) 1599 if (neigh == dst_get_neighbour_raw(&rt->dst))
1591 goto out; 1600 goto out;
1592 1601
1593 nrt = ip6_rt_copy(rt, dest); 1602 nrt = ip6_rt_copy(rt, dest);
1594 if (nrt == NULL) 1603 if (nrt == NULL)
1595 goto out; 1604 goto out;
1596 1605
1597 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 1606 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1598 if (on_link) 1607 if (on_link)
1599 nrt->rt6i_flags &= ~RTF_GATEWAY; 1608 nrt->rt6i_flags &= ~RTF_GATEWAY;
1600 1609
1601 nrt->rt6i_dst.plen = 128; 1610 nrt->rt6i_dst.plen = 128;
1602 nrt->dst.flags |= DST_HOST; 1611 nrt->dst.flags |= DST_HOST;
1603 1612
1604 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); 1613 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1605 dst_set_neighbour(&nrt->dst, neigh_clone(neigh)); 1614 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1606 1615
1607 if (ip6_ins_rt(nrt)) 1616 if (ip6_ins_rt(nrt))
1608 goto out; 1617 goto out;
1609 1618
1610 netevent.old = &rt->dst; 1619 netevent.old = &rt->dst;
1611 netevent.new = &nrt->dst; 1620 netevent.new = &nrt->dst;
1612 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 1621 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1613 1622
1614 if (rt->rt6i_flags&RTF_CACHE) { 1623 if (rt->rt6i_flags&RTF_CACHE) {
1615 ip6_del_rt(rt); 1624 ip6_del_rt(rt);
1616 return; 1625 return;
1617 } 1626 }
1618 1627
1619 out: 1628 out:
1620 dst_release(&rt->dst); 1629 dst_release(&rt->dst);
1621 } 1630 }
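
Note the switch from dst_get_neighbour() to dst_get_neighbour_raw() in the duplicate-redirect test above: rt is pinned by the reference taken in ip6_route_redirect(), and the test only compares pointer values, so the raw variant (which skips the RCU read-side check) is sufficient. A sketch of the accessor trio this patch relies on; the bodies below are a plausible reconstruction consistent with the __rcu annotation on dst->_neighbour, not quoted from the patch (see net/dst.h in this commit for the authoritative version):

static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
{
        /* Readers: only valid inside rcu_read_lock()/rcu_read_unlock(). */
        return rcu_dereference(dst->_neighbour);
}

static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
{
        /* Lockless/raw variant for callers serialized by other means,
         * e.g. a held dst reference as in rt6_redirect() above. */
        return rcu_dereference_raw(dst->_neighbour);
}

static inline void dst_set_neighbour(struct dst_entry *dst, struct neighbour *neigh)
{
        /* Publish the new neighbour with the required memory barrier. */
        rcu_assign_pointer(dst->_neighbour, neigh);
}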
1622 1631
1623 /* 1632 /*
1624 * Handle ICMP "packet too big" messages 1633 * Handle ICMP "packet too big" messages
1625 * i.e. Path MTU discovery 1634 * i.e. Path MTU discovery
1626 */ 1635 */
1627 1636
1628 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr, 1637 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1629 struct net *net, u32 pmtu, int ifindex) 1638 struct net *net, u32 pmtu, int ifindex)
1630 { 1639 {
1631 struct rt6_info *rt, *nrt; 1640 struct rt6_info *rt, *nrt;
1632 int allfrag = 0; 1641 int allfrag = 0;
1633 again: 1642 again:
1634 rt = rt6_lookup(net, daddr, saddr, ifindex, 0); 1643 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1635 if (rt == NULL) 1644 if (rt == NULL)
1636 return; 1645 return;
1637 1646
1638 if (rt6_check_expired(rt)) { 1647 if (rt6_check_expired(rt)) {
1639 ip6_del_rt(rt); 1648 ip6_del_rt(rt);
1640 goto again; 1649 goto again;
1641 } 1650 }
1642 1651
1643 if (pmtu >= dst_mtu(&rt->dst)) 1652 if (pmtu >= dst_mtu(&rt->dst))
1644 goto out; 1653 goto out;
1645 1654
1646 if (pmtu < IPV6_MIN_MTU) { 1655 if (pmtu < IPV6_MIN_MTU) {
1647 /* 1656 /*
1648 * According to RFC 2460, PMTU is set to the IPv6 Minimum Link 1657 * According to RFC 2460, PMTU is set to the IPv6 Minimum Link
1649 * MTU (1280) and a fragment header should always be included 1658 * MTU (1280) and a fragment header should always be included
1650 * after a node receives a Packet Too Big message reporting a 1659 * after a node receives a Packet Too Big message reporting a
1651 * PMTU less than the IPv6 Minimum Link MTU. 1660 * PMTU less than the IPv6 Minimum Link MTU.
1652 */ 1661 */
1653 pmtu = IPV6_MIN_MTU; 1662 pmtu = IPV6_MIN_MTU;
1654 allfrag = 1; 1663 allfrag = 1;
1655 } 1664 }
1656 1665
1657 /* New mtu received -> path was valid. 1666 /* New mtu received -> path was valid.
1658 They are sent only in response to data packets, 1667 They are sent only in response to data packets,
1659 so this nexthop is apparently reachable. --ANK 1668 so this nexthop is apparently reachable. --ANK
1660 */ 1669 */
1661 dst_confirm(&rt->dst); 1670 dst_confirm(&rt->dst);
1662 1671
1663 /* Host route. If it is static, it would be better 1672 /* Host route. If it is static, it would be better
1664 not to override it but to add a new one, so that 1673 not to override it but to add a new one, so that
1665 when the cache entry expires the old pmtu 1674 when the cache entry expires the old pmtu
1666 is restored automatically. 1675 is restored automatically.
1667 */ 1676 */
1668 if (rt->rt6i_flags & RTF_CACHE) { 1677 if (rt->rt6i_flags & RTF_CACHE) {
1669 dst_metric_set(&rt->dst, RTAX_MTU, pmtu); 1678 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1670 if (allfrag) { 1679 if (allfrag) {
1671 u32 features = dst_metric(&rt->dst, RTAX_FEATURES); 1680 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1672 features |= RTAX_FEATURE_ALLFRAG; 1681 features |= RTAX_FEATURE_ALLFRAG;
1673 dst_metric_set(&rt->dst, RTAX_FEATURES, features); 1682 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1674 } 1683 }
1675 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); 1684 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1676 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; 1685 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1677 goto out; 1686 goto out;
1678 } 1687 }
1679 1688
1680 /* Network route. 1689 /* Network route.
1681 Two cases are possible: 1690 Two cases are possible:
1682 1. It is a connected route. Action: COW. 1691 1. It is a connected route. Action: COW.
1683 2. It is a gatewayed or NONEXTHOP route. Action: clone it. 1692 2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1684 */ 1693 */
1685 if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) 1694 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1686 nrt = rt6_alloc_cow(rt, daddr, saddr); 1695 nrt = rt6_alloc_cow(rt, daddr, saddr);
1687 else 1696 else
1688 nrt = rt6_alloc_clone(rt, daddr); 1697 nrt = rt6_alloc_clone(rt, daddr);
1689 1698
1690 if (nrt) { 1699 if (nrt) {
1691 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu); 1700 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1692 if (allfrag) { 1701 if (allfrag) {
1693 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES); 1702 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1694 features |= RTAX_FEATURE_ALLFRAG; 1703 features |= RTAX_FEATURE_ALLFRAG;
1695 dst_metric_set(&nrt->dst, RTAX_FEATURES, features); 1704 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1696 } 1705 }
1697 1706
1698 /* According to RFC 1981, a PMTU increase shouldn't be detected 1707 /* According to RFC 1981, a PMTU increase shouldn't be detected
1699 * within 5 minutes; the recommended timer is 10 minutes. Here 1708 * within 5 minutes; the recommended timer is 10 minutes. Here
1700 * the route expiration time is set to ip6_rt_mtu_expires, 1709 * the route expiration time is set to ip6_rt_mtu_expires,
1701 * which defaults to 10 minutes. After that the decreased pmtu 1710 * which defaults to 10 minutes. After that the decreased pmtu
1702 * expires and detection of a PMTU increase happens automatically. 1711 * expires and detection of a PMTU increase happens automatically.
1703 */ 1712 */
1704 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); 1713 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1705 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; 1714 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1706 1715
1707 ip6_ins_rt(nrt); 1716 ip6_ins_rt(nrt);
1708 } 1717 }
1709 out: 1718 out:
1710 dst_release(&rt->dst); 1719 dst_release(&rt->dst);
1711 } 1720 }
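
The IPV6_MIN_MTU clamp above implements the RFC 2460 rule: a reported PMTU below 1280 raises the path MTU back to the minimum, but forces a fragment header onto every packet via RTAX_FEATURE_ALLFRAG. A minimal user-space illustration of that clamping (illustrative names only):

#include <stdio.h>

#define IPV6_MIN_MTU 1280

/* Any PMTU below the IPv6 minimum link MTU is raised to 1280, and
 * allfrag is set so a fragment header is always included. */
static unsigned int clamp_pmtu(unsigned int pmtu, int *allfrag)
{
        *allfrag = 0;
        if (pmtu < IPV6_MIN_MTU) {
                pmtu = IPV6_MIN_MTU;
                *allfrag = 1;
        }
        return pmtu;
}

int main(void)
{
        int allfrag;
        unsigned int mtu = clamp_pmtu(1000, &allfrag);

        printf("pmtu=%u allfrag=%d\n", mtu, allfrag); /* pmtu=1280 allfrag=1 */
        return 0;
}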
1712 1721
1713 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr, 1722 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1714 struct net_device *dev, u32 pmtu) 1723 struct net_device *dev, u32 pmtu)
1715 { 1724 {
1716 struct net *net = dev_net(dev); 1725 struct net *net = dev_net(dev);
1717 1726
1718 /* 1727 /*
1719 * RFC 1981 states that a node "MUST reduce the size of the packets it 1728 * RFC 1981 states that a node "MUST reduce the size of the packets it
1720 * is sending along the path" that caused the Packet Too Big message. 1729 * is sending along the path" that caused the Packet Too Big message.
1721 * Since it's not possible in the general case to determine which 1730 * Since it's not possible in the general case to determine which
1722 * interface was used to send the original packet, we update the MTU 1731 * interface was used to send the original packet, we update the MTU
1723 * on the interface that will be used to send future packets. We also 1732 * on the interface that will be used to send future packets. We also
1724 * update the MTU on the interface that received the Packet Too Big in 1733 * update the MTU on the interface that received the Packet Too Big in
1725 * case the original packet was forced out that interface with 1734 * case the original packet was forced out that interface with
1726 * SO_BINDTODEVICE or similar. This is the next best thing to the 1735 * SO_BINDTODEVICE or similar. This is the next best thing to the
1727 * correct behaviour, which would be to update the MTU on all 1736 * correct behaviour, which would be to update the MTU on all
1728 * interfaces. 1737 * interfaces.
1729 */ 1738 */
1730 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0); 1739 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1731 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex); 1740 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1732 } 1741 }
1733 1742
1734 /* 1743 /*
1735 * Misc support functions 1744 * Misc support functions
1736 */ 1745 */
1737 1746
1738 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, 1747 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1739 const struct in6_addr *dest) 1748 const struct in6_addr *dest)
1740 { 1749 {
1741 struct net *net = dev_net(ort->rt6i_dev); 1750 struct net *net = dev_net(ort->rt6i_dev);
1742 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, 1751 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1743 ort->dst.dev, 0); 1752 ort->dst.dev, 0);
1744 1753
1745 if (rt) { 1754 if (rt) {
1746 rt->dst.input = ort->dst.input; 1755 rt->dst.input = ort->dst.input;
1747 rt->dst.output = ort->dst.output; 1756 rt->dst.output = ort->dst.output;
1748 1757
1749 ipv6_addr_copy(&rt->rt6i_dst.addr, dest); 1758 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1750 rt->rt6i_dst.plen = ort->rt6i_dst.plen; 1759 rt->rt6i_dst.plen = ort->rt6i_dst.plen;
1751 dst_copy_metrics(&rt->dst, &ort->dst); 1760 dst_copy_metrics(&rt->dst, &ort->dst);
1752 rt->dst.error = ort->dst.error; 1761 rt->dst.error = ort->dst.error;
1753 rt->rt6i_idev = ort->rt6i_idev; 1762 rt->rt6i_idev = ort->rt6i_idev;
1754 if (rt->rt6i_idev) 1763 if (rt->rt6i_idev)
1755 in6_dev_hold(rt->rt6i_idev); 1764 in6_dev_hold(rt->rt6i_idev);
1756 rt->dst.lastuse = jiffies; 1765 rt->dst.lastuse = jiffies;
1757 rt->rt6i_expires = 0; 1766 rt->rt6i_expires = 0;
1758 1767
1759 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); 1768 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1760 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; 1769 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1761 rt->rt6i_metric = 0; 1770 rt->rt6i_metric = 0;
1762 1771
1763 #ifdef CONFIG_IPV6_SUBTREES 1772 #ifdef CONFIG_IPV6_SUBTREES
1764 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); 1773 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1765 #endif 1774 #endif
1766 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); 1775 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1767 rt->rt6i_table = ort->rt6i_table; 1776 rt->rt6i_table = ort->rt6i_table;
1768 } 1777 }
1769 return rt; 1778 return rt;
1770 } 1779 }
1771 1780
1772 #ifdef CONFIG_IPV6_ROUTE_INFO 1781 #ifdef CONFIG_IPV6_ROUTE_INFO
1773 static struct rt6_info *rt6_get_route_info(struct net *net, 1782 static struct rt6_info *rt6_get_route_info(struct net *net,
1774 const struct in6_addr *prefix, int prefixlen, 1783 const struct in6_addr *prefix, int prefixlen,
1775 const struct in6_addr *gwaddr, int ifindex) 1784 const struct in6_addr *gwaddr, int ifindex)
1776 { 1785 {
1777 struct fib6_node *fn; 1786 struct fib6_node *fn;
1778 struct rt6_info *rt = NULL; 1787 struct rt6_info *rt = NULL;
1779 struct fib6_table *table; 1788 struct fib6_table *table;
1780 1789
1781 table = fib6_get_table(net, RT6_TABLE_INFO); 1790 table = fib6_get_table(net, RT6_TABLE_INFO);
1782 if (table == NULL) 1791 if (table == NULL)
1783 return NULL; 1792 return NULL;
1784 1793
1785 write_lock_bh(&table->tb6_lock); 1794 write_lock_bh(&table->tb6_lock);
1786 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); 1795 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
1787 if (!fn) 1796 if (!fn)
1788 goto out; 1797 goto out;
1789 1798
1790 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 1799 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1791 if (rt->rt6i_dev->ifindex != ifindex) 1800 if (rt->rt6i_dev->ifindex != ifindex)
1792 continue; 1801 continue;
1793 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 1802 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1794 continue; 1803 continue;
1795 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 1804 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1796 continue; 1805 continue;
1797 dst_hold(&rt->dst); 1806 dst_hold(&rt->dst);
1798 break; 1807 break;
1799 } 1808 }
1800 out: 1809 out:
1801 write_unlock_bh(&table->tb6_lock); 1810 write_unlock_bh(&table->tb6_lock);
1802 return rt; 1811 return rt;
1803 } 1812 }
1804 1813
1805 static struct rt6_info *rt6_add_route_info(struct net *net, 1814 static struct rt6_info *rt6_add_route_info(struct net *net,
1806 const struct in6_addr *prefix, int prefixlen, 1815 const struct in6_addr *prefix, int prefixlen,
1807 const struct in6_addr *gwaddr, int ifindex, 1816 const struct in6_addr *gwaddr, int ifindex,
1808 unsigned pref) 1817 unsigned pref)
1809 { 1818 {
1810 struct fib6_config cfg = { 1819 struct fib6_config cfg = {
1811 .fc_table = RT6_TABLE_INFO, 1820 .fc_table = RT6_TABLE_INFO,
1812 .fc_metric = IP6_RT_PRIO_USER, 1821 .fc_metric = IP6_RT_PRIO_USER,
1813 .fc_ifindex = ifindex, 1822 .fc_ifindex = ifindex,
1814 .fc_dst_len = prefixlen, 1823 .fc_dst_len = prefixlen,
1815 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 1824 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1816 RTF_UP | RTF_PREF(pref), 1825 RTF_UP | RTF_PREF(pref),
1817 .fc_nlinfo.pid = 0, 1826 .fc_nlinfo.pid = 0,
1818 .fc_nlinfo.nlh = NULL, 1827 .fc_nlinfo.nlh = NULL,
1819 .fc_nlinfo.nl_net = net, 1828 .fc_nlinfo.nl_net = net,
1820 }; 1829 };
1821 1830
1822 ipv6_addr_copy(&cfg.fc_dst, prefix); 1831 ipv6_addr_copy(&cfg.fc_dst, prefix);
1823 ipv6_addr_copy(&cfg.fc_gateway, gwaddr); 1832 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1824 1833
1825 /* We should treat it as a default route if prefix length is 0. */ 1834 /* We should treat it as a default route if prefix length is 0. */
1826 if (!prefixlen) 1835 if (!prefixlen)
1827 cfg.fc_flags |= RTF_DEFAULT; 1836 cfg.fc_flags |= RTF_DEFAULT;
1828 1837
1829 ip6_route_add(&cfg); 1838 ip6_route_add(&cfg);
1830 1839
1831 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex); 1840 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1832 } 1841 }
1833 #endif 1842 #endif
1834 1843
1835 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 1844 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1836 { 1845 {
1837 struct rt6_info *rt; 1846 struct rt6_info *rt;
1838 struct fib6_table *table; 1847 struct fib6_table *table;
1839 1848
1840 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); 1849 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1841 if (table == NULL) 1850 if (table == NULL)
1842 return NULL; 1851 return NULL;
1843 1852
1844 write_lock_bh(&table->tb6_lock); 1853 write_lock_bh(&table->tb6_lock);
1845 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { 1854 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1846 if (dev == rt->rt6i_dev && 1855 if (dev == rt->rt6i_dev &&
1847 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 1856 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1848 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 1857 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1849 break; 1858 break;
1850 } 1859 }
1851 if (rt) 1860 if (rt)
1852 dst_hold(&rt->dst); 1861 dst_hold(&rt->dst);
1853 write_unlock_bh(&table->tb6_lock); 1862 write_unlock_bh(&table->tb6_lock);
1854 return rt; 1863 return rt;
1855 } 1864 }
1856 1865
1857 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 1866 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1858 struct net_device *dev, 1867 struct net_device *dev,
1859 unsigned int pref) 1868 unsigned int pref)
1860 { 1869 {
1861 struct fib6_config cfg = { 1870 struct fib6_config cfg = {
1862 .fc_table = RT6_TABLE_DFLT, 1871 .fc_table = RT6_TABLE_DFLT,
1863 .fc_metric = IP6_RT_PRIO_USER, 1872 .fc_metric = IP6_RT_PRIO_USER,
1864 .fc_ifindex = dev->ifindex, 1873 .fc_ifindex = dev->ifindex,
1865 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 1874 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1866 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 1875 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1867 .fc_nlinfo.pid = 0, 1876 .fc_nlinfo.pid = 0,
1868 .fc_nlinfo.nlh = NULL, 1877 .fc_nlinfo.nlh = NULL,
1869 .fc_nlinfo.nl_net = dev_net(dev), 1878 .fc_nlinfo.nl_net = dev_net(dev),
1870 }; 1879 };
1871 1880
1872 ipv6_addr_copy(&cfg.fc_gateway, gwaddr); 1881 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1873 1882
1874 ip6_route_add(&cfg); 1883 ip6_route_add(&cfg);
1875 1884
1876 return rt6_get_dflt_router(gwaddr, dev); 1885 return rt6_get_dflt_router(gwaddr, dev);
1877 } 1886 }
1878 1887
1879 void rt6_purge_dflt_routers(struct net *net) 1888 void rt6_purge_dflt_routers(struct net *net)
1880 { 1889 {
1881 struct rt6_info *rt; 1890 struct rt6_info *rt;
1882 struct fib6_table *table; 1891 struct fib6_table *table;
1883 1892
1884 /* NOTE: Keep consistent with rt6_get_dflt_router */ 1893 /* NOTE: Keep consistent with rt6_get_dflt_router */
1885 table = fib6_get_table(net, RT6_TABLE_DFLT); 1894 table = fib6_get_table(net, RT6_TABLE_DFLT);
1886 if (table == NULL) 1895 if (table == NULL)
1887 return; 1896 return;
1888 1897
1889 restart: 1898 restart:
1890 read_lock_bh(&table->tb6_lock); 1899 read_lock_bh(&table->tb6_lock);
1891 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 1900 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1892 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { 1901 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1893 dst_hold(&rt->dst); 1902 dst_hold(&rt->dst);
1894 read_unlock_bh(&table->tb6_lock); 1903 read_unlock_bh(&table->tb6_lock);
1895 ip6_del_rt(rt); 1904 ip6_del_rt(rt);
1896 goto restart; 1905 goto restart;
1897 } 1906 }
1898 } 1907 }
1899 read_unlock_bh(&table->tb6_lock); 1908 read_unlock_bh(&table->tb6_lock);
1900 } 1909 }
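
rt6_purge_dflt_routers() cannot delete an entry while holding tb6_lock for reading, because deletion takes the lock for writing; so it pins the entry with dst_hold(), drops the lock, deletes, and restarts the scan, since the list may have changed meanwhile. A generic user-space sketch of the same drop-lock-and-restart pattern (illustrative names; it assumes a single purging thread, where the kernel instead relies on the dst refcount to keep the entry alive across the unlock):

#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
        int purge_me;
};

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct entry *head;

static void delete_entry(struct entry *e)
{
        struct entry **pp;

        pthread_rwlock_wrlock(&table_lock);     /* deletion needs the write lock */
        for (pp = &head; *pp; pp = &(*pp)->next) {
                if (*pp == e) {
                        *pp = e->next;
                        break;
                }
        }
        pthread_rwlock_unlock(&table_lock);
        free(e);
}

static void purge_matching(void)
{
restart:
        pthread_rwlock_rdlock(&table_lock);
        for (struct entry *e = head; e; e = e->next) {
                if (e->purge_me) {
                        pthread_rwlock_unlock(&table_lock);
                        delete_entry(e);        /* takes the write lock */
                        goto restart;           /* list may have changed */
                }
        }
        pthread_rwlock_unlock(&table_lock);
}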
1901 1910
1902 static void rtmsg_to_fib6_config(struct net *net, 1911 static void rtmsg_to_fib6_config(struct net *net,
1903 struct in6_rtmsg *rtmsg, 1912 struct in6_rtmsg *rtmsg,
1904 struct fib6_config *cfg) 1913 struct fib6_config *cfg)
1905 { 1914 {
1906 memset(cfg, 0, sizeof(*cfg)); 1915 memset(cfg, 0, sizeof(*cfg));
1907 1916
1908 cfg->fc_table = RT6_TABLE_MAIN; 1917 cfg->fc_table = RT6_TABLE_MAIN;
1909 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 1918 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1910 cfg->fc_metric = rtmsg->rtmsg_metric; 1919 cfg->fc_metric = rtmsg->rtmsg_metric;
1911 cfg->fc_expires = rtmsg->rtmsg_info; 1920 cfg->fc_expires = rtmsg->rtmsg_info;
1912 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 1921 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1913 cfg->fc_src_len = rtmsg->rtmsg_src_len; 1922 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1914 cfg->fc_flags = rtmsg->rtmsg_flags; 1923 cfg->fc_flags = rtmsg->rtmsg_flags;
1915 1924
1916 cfg->fc_nlinfo.nl_net = net; 1925 cfg->fc_nlinfo.nl_net = net;
1917 1926
1918 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst); 1927 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1919 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src); 1928 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1920 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway); 1929 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1921 } 1930 }
1922 1931
1923 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 1932 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1924 { 1933 {
1925 struct fib6_config cfg; 1934 struct fib6_config cfg;
1926 struct in6_rtmsg rtmsg; 1935 struct in6_rtmsg rtmsg;
1927 int err; 1936 int err;
1928 1937
1929 switch(cmd) { 1938 switch(cmd) {
1930 case SIOCADDRT: /* Add a route */ 1939 case SIOCADDRT: /* Add a route */
1931 case SIOCDELRT: /* Delete a route */ 1940 case SIOCDELRT: /* Delete a route */
1932 if (!capable(CAP_NET_ADMIN)) 1941 if (!capable(CAP_NET_ADMIN))
1933 return -EPERM; 1942 return -EPERM;
1934 err = copy_from_user(&rtmsg, arg, 1943 err = copy_from_user(&rtmsg, arg,
1935 sizeof(struct in6_rtmsg)); 1944 sizeof(struct in6_rtmsg));
1936 if (err) 1945 if (err)
1937 return -EFAULT; 1946 return -EFAULT;
1938 1947
1939 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 1948 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1940 1949
1941 rtnl_lock(); 1950 rtnl_lock();
1942 switch (cmd) { 1951 switch (cmd) {
1943 case SIOCADDRT: 1952 case SIOCADDRT:
1944 err = ip6_route_add(&cfg); 1953 err = ip6_route_add(&cfg);
1945 break; 1954 break;
1946 case SIOCDELRT: 1955 case SIOCDELRT:
1947 err = ip6_route_del(&cfg); 1956 err = ip6_route_del(&cfg);
1948 break; 1957 break;
1949 default: 1958 default:
1950 err = -EINVAL; 1959 err = -EINVAL;
1951 } 1960 }
1952 rtnl_unlock(); 1961 rtnl_unlock();
1953 1962
1954 return err; 1963 return err;
1955 } 1964 }
1956 1965
1957 return -EINVAL; 1966 return -EINVAL;
1958 } 1967 }
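
A minimal user-space sketch of the ioctl path handled above, assuming glibc's <net/route.h> declaration of struct in6_rtmsg; the interface name and gateway are placeholders, and SIOCADDRT requires CAP_NET_ADMIN:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <net/route.h>          /* struct in6_rtmsg, RTF_UP, RTF_GATEWAY */

int main(void)
{
        struct in6_rtmsg rt;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        memset(&rt, 0, sizeof(rt));
        inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);
        rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;          /* -> cfg->fc_flags */
        rt.rtmsg_metric = 1024;                         /* -> cfg->fc_metric */
        rt.rtmsg_ifindex = if_nametoindex("eth0");      /* -> cfg->fc_ifindex */
        rt.rtmsg_dst_len = 0;                           /* ::/0, a default route */

        if (ioctl(fd, SIOCADDRT, &rt) < 0)              /* needs CAP_NET_ADMIN */
                perror("SIOCADDRT");
        return 0;
}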
1959 1968
1960 /* 1969 /*
1961 * Drop the packet on the floor 1970 * Drop the packet on the floor
1962 */ 1971 */
1963 1972
1964 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 1973 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1965 { 1974 {
1966 int type; 1975 int type;
1967 struct dst_entry *dst = skb_dst(skb); 1976 struct dst_entry *dst = skb_dst(skb);
1968 switch (ipstats_mib_noroutes) { 1977 switch (ipstats_mib_noroutes) {
1969 case IPSTATS_MIB_INNOROUTES: 1978 case IPSTATS_MIB_INNOROUTES:
1970 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 1979 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1971 if (type == IPV6_ADDR_ANY) { 1980 if (type == IPV6_ADDR_ANY) {
1972 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 1981 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1973 IPSTATS_MIB_INADDRERRORS); 1982 IPSTATS_MIB_INADDRERRORS);
1974 break; 1983 break;
1975 } 1984 }
1976 /* FALLTHROUGH */ 1985 /* FALLTHROUGH */
1977 case IPSTATS_MIB_OUTNOROUTES: 1986 case IPSTATS_MIB_OUTNOROUTES:
1978 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 1987 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1979 ipstats_mib_noroutes); 1988 ipstats_mib_noroutes);
1980 break; 1989 break;
1981 } 1990 }
1982 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 1991 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1983 kfree_skb(skb); 1992 kfree_skb(skb);
1984 return 0; 1993 return 0;
1985 } 1994 }
1986 1995
1987 static int ip6_pkt_discard(struct sk_buff *skb) 1996 static int ip6_pkt_discard(struct sk_buff *skb)
1988 { 1997 {
1989 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 1998 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1990 } 1999 }
1991 2000
1992 static int ip6_pkt_discard_out(struct sk_buff *skb) 2001 static int ip6_pkt_discard_out(struct sk_buff *skb)
1993 { 2002 {
1994 skb->dev = skb_dst(skb)->dev; 2003 skb->dev = skb_dst(skb)->dev;
1995 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 2004 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1996 } 2005 }
1997 2006
1998 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2007 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1999 2008
2000 static int ip6_pkt_prohibit(struct sk_buff *skb) 2009 static int ip6_pkt_prohibit(struct sk_buff *skb)
2001 { 2010 {
2002 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 2011 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2003 } 2012 }
2004 2013
2005 static int ip6_pkt_prohibit_out(struct sk_buff *skb) 2014 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2006 { 2015 {
2007 skb->dev = skb_dst(skb)->dev; 2016 skb->dev = skb_dst(skb)->dev;
2008 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 2017 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2009 } 2018 }
2010 2019
2011 #endif 2020 #endif
2012 2021
2013 /* 2022 /*
2014 * Allocate a dst for local (unicast / anycast) address. 2023 * Allocate a dst for local (unicast / anycast) address.
2015 */ 2024 */
2016 2025
2017 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 2026 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2018 const struct in6_addr *addr, 2027 const struct in6_addr *addr,
2019 int anycast) 2028 int anycast)
2020 { 2029 {
2021 struct net *net = dev_net(idev->dev); 2030 struct net *net = dev_net(idev->dev);
2022 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, 2031 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2023 net->loopback_dev, 0); 2032 net->loopback_dev, 0);
2024 struct neighbour *neigh; 2033 struct neighbour *neigh;
2025 2034
2026 if (rt == NULL) { 2035 if (rt == NULL) {
2027 if (net_ratelimit()) 2036 if (net_ratelimit())
2028 pr_warning("IPv6: Maximum number of routes reached," 2037 pr_warning("IPv6: Maximum number of routes reached,"
2029 " consider increasing route/max_size.\n"); 2038 " consider increasing route/max_size.\n");
2030 return ERR_PTR(-ENOMEM); 2039 return ERR_PTR(-ENOMEM);
2031 } 2040 }
2032 2041
2033 in6_dev_hold(idev); 2042 in6_dev_hold(idev);
2034 2043
2035 rt->dst.flags |= DST_HOST; 2044 rt->dst.flags |= DST_HOST;
2036 rt->dst.input = ip6_input; 2045 rt->dst.input = ip6_input;
2037 rt->dst.output = ip6_output; 2046 rt->dst.output = ip6_output;
2038 rt->rt6i_idev = idev; 2047 rt->rt6i_idev = idev;
2039 rt->dst.obsolete = -1; 2048 rt->dst.obsolete = -1;
2040 2049
2041 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 2050 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2042 if (anycast) 2051 if (anycast)
2043 rt->rt6i_flags |= RTF_ANYCAST; 2052 rt->rt6i_flags |= RTF_ANYCAST;
2044 else 2053 else
2045 rt->rt6i_flags |= RTF_LOCAL; 2054 rt->rt6i_flags |= RTF_LOCAL;
2046 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); 2055 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2047 if (IS_ERR(neigh)) { 2056 if (IS_ERR(neigh)) {
2048 dst_free(&rt->dst); 2057 dst_free(&rt->dst);
2049 2058
2050 return ERR_CAST(neigh); 2059 return ERR_CAST(neigh);
2051 } 2060 }
2052 dst_set_neighbour(&rt->dst, neigh); 2061 dst_set_neighbour(&rt->dst, neigh);
2053 2062
2054 ipv6_addr_copy(&rt->rt6i_dst.addr, addr); 2063 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2055 rt->rt6i_dst.plen = 128; 2064 rt->rt6i_dst.plen = 128;
2056 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); 2065 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2057 2066
2058 atomic_set(&rt->dst.__refcnt, 1); 2067 atomic_set(&rt->dst.__refcnt, 1);
2059 2068
2060 return rt; 2069 return rt;
2061 } 2070 }
2062 2071
2063 int ip6_route_get_saddr(struct net *net, 2072 int ip6_route_get_saddr(struct net *net,
2064 struct rt6_info *rt, 2073 struct rt6_info *rt,
2065 const struct in6_addr *daddr, 2074 const struct in6_addr *daddr,
2066 unsigned int prefs, 2075 unsigned int prefs,
2067 struct in6_addr *saddr) 2076 struct in6_addr *saddr)
2068 { 2077 {
2069 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); 2078 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2070 int err = 0; 2079 int err = 0;
2071 if (rt->rt6i_prefsrc.plen) 2080 if (rt->rt6i_prefsrc.plen)
2072 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr); 2081 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2073 else 2082 else
2074 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, 2083 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2075 daddr, prefs, saddr); 2084 daddr, prefs, saddr);
2076 return err; 2085 return err;
2077 } 2086 }
2078 2087
2079 /* remove deleted ip from prefsrc entries */ 2088 /* remove deleted ip from prefsrc entries */
2080 struct arg_dev_net_ip { 2089 struct arg_dev_net_ip {
2081 struct net_device *dev; 2090 struct net_device *dev;
2082 struct net *net; 2091 struct net *net;
2083 struct in6_addr *addr; 2092 struct in6_addr *addr;
2084 }; 2093 };
2085 2094
2086 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 2095 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2087 { 2096 {
2088 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 2097 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2089 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 2098 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2090 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 2099 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2091 2100
2092 if (((void *)rt->rt6i_dev == dev || dev == NULL) && 2101 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2093 rt != net->ipv6.ip6_null_entry && 2102 rt != net->ipv6.ip6_null_entry &&
2094 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 2103 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2095 /* remove prefsrc entry */ 2104 /* remove prefsrc entry */
2096 rt->rt6i_prefsrc.plen = 0; 2105 rt->rt6i_prefsrc.plen = 0;
2097 } 2106 }
2098 return 0; 2107 return 0;
2099 } 2108 }
2100 2109
2101 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 2110 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2102 { 2111 {
2103 struct net *net = dev_net(ifp->idev->dev); 2112 struct net *net = dev_net(ifp->idev->dev);
2104 struct arg_dev_net_ip adni = { 2113 struct arg_dev_net_ip adni = {
2105 .dev = ifp->idev->dev, 2114 .dev = ifp->idev->dev,
2106 .net = net, 2115 .net = net,
2107 .addr = &ifp->addr, 2116 .addr = &ifp->addr,
2108 }; 2117 };
2109 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni); 2118 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2110 } 2119 }
2111 2120
2112 struct arg_dev_net { 2121 struct arg_dev_net {
2113 struct net_device *dev; 2122 struct net_device *dev;
2114 struct net *net; 2123 struct net *net;
2115 }; 2124 };
2116 2125
2117 static int fib6_ifdown(struct rt6_info *rt, void *arg) 2126 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2118 { 2127 {
2119 const struct arg_dev_net *adn = arg; 2128 const struct arg_dev_net *adn = arg;
2120 const struct net_device *dev = adn->dev; 2129 const struct net_device *dev = adn->dev;
2121 2130
2122 if ((rt->rt6i_dev == dev || dev == NULL) && 2131 if ((rt->rt6i_dev == dev || dev == NULL) &&
2123 rt != adn->net->ipv6.ip6_null_entry) { 2132 rt != adn->net->ipv6.ip6_null_entry) {
2124 RT6_TRACE("deleted by ifdown %p\n", rt); 2133 RT6_TRACE("deleted by ifdown %p\n", rt);
2125 return -1; 2134 return -1;
2126 } 2135 }
2127 return 0; 2136 return 0;
2128 } 2137 }
2129 2138
2130 void rt6_ifdown(struct net *net, struct net_device *dev) 2139 void rt6_ifdown(struct net *net, struct net_device *dev)
2131 { 2140 {
2132 struct arg_dev_net adn = { 2141 struct arg_dev_net adn = {
2133 .dev = dev, 2142 .dev = dev,
2134 .net = net, 2143 .net = net,
2135 }; 2144 };
2136 2145
2137 fib6_clean_all(net, fib6_ifdown, 0, &adn); 2146 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2138 icmp6_clean_all(fib6_ifdown, &adn); 2147 icmp6_clean_all(fib6_ifdown, &adn);
2139 } 2148 }
2140 2149
2141 struct rt6_mtu_change_arg 2150 struct rt6_mtu_change_arg
2142 { 2151 {
2143 struct net_device *dev; 2152 struct net_device *dev;
2144 unsigned mtu; 2153 unsigned mtu;
2145 }; 2154 };
2146 2155
2147 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 2156 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2148 { 2157 {
2149 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 2158 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2150 struct inet6_dev *idev; 2159 struct inet6_dev *idev;
2151 2160
2152 /* In IPv6 pmtu discovery is not optional, 2161 /* In IPv6 pmtu discovery is not optional,
2153 so the RTAX_MTU lock cannot disable it. 2162 so the RTAX_MTU lock cannot disable it.
2154 We still use this lock to block changes 2163 We still use this lock to block changes
2155 caused by addrconf/ndisc. 2164 caused by addrconf/ndisc.
2156 */ 2165 */
2157 2166
2158 idev = __in6_dev_get(arg->dev); 2167 idev = __in6_dev_get(arg->dev);
2159 if (idev == NULL) 2168 if (idev == NULL)
2160 return 0; 2169 return 0;
2161 2170
2162 /* For an administrative MTU increase, there is no way to discover 2171 /* For an administrative MTU increase, there is no way to discover
2163 an IPv6 PMTU increase, so the PMTU increase should be updated here. 2172 an IPv6 PMTU increase, so the PMTU increase should be updated here.
2164 Since RFC 1981 doesn't cover administrative MTU increases, 2173 Since RFC 1981 doesn't cover administrative MTU increases,
2165 updating the PMTU on increase is a MUST (i.e. jumbo frames). 2174 updating the PMTU on increase is a MUST (i.e. jumbo frames).
2166 */ 2175 */
2167 /* 2176 /*
2168 If the new MTU is less than the route PMTU, this new MTU will be 2177 If the new MTU is less than the route PMTU, this new MTU will be
2169 the lowest MTU in the path; update the route PMTU to reflect the 2178 the lowest MTU in the path; update the route PMTU to reflect the
2170 decrease. If the new MTU is greater than the route PMTU, and the 2179 decrease. If the new MTU is greater than the route PMTU, and the
2171 old MTU was the lowest MTU in the path, update the route PMTU 2180 old MTU was the lowest MTU in the path, update the route PMTU
2172 to reflect the increase. In that case, if another node's MTU is 2181 to reflect the increase. In that case, if another node's MTU is
2173 also the lowest in the path, a Packet Too Big message will lead 2182 also the lowest in the path, a Packet Too Big message will lead
2174 to PMTU discovery. 2183 to PMTU discovery.
2175 */ 2184 */
2176 if (rt->rt6i_dev == arg->dev && 2185 if (rt->rt6i_dev == arg->dev &&
2177 !dst_metric_locked(&rt->dst, RTAX_MTU) && 2186 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2178 (dst_mtu(&rt->dst) >= arg->mtu || 2187 (dst_mtu(&rt->dst) >= arg->mtu ||
2179 (dst_mtu(&rt->dst) < arg->mtu && 2188 (dst_mtu(&rt->dst) < arg->mtu &&
2180 dst_mtu(&rt->dst) == idev->cnf.mtu6))) { 2189 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2181 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 2190 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2182 } 2191 }
2183 return 0; 2192 return 0;
2184 } 2193 }
2185 2194
2186 void rt6_mtu_change(struct net_device *dev, unsigned mtu) 2195 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2187 { 2196 {
2188 struct rt6_mtu_change_arg arg = { 2197 struct rt6_mtu_change_arg arg = {
2189 .dev = dev, 2198 .dev = dev,
2190 .mtu = mtu, 2199 .mtu = mtu,
2191 }; 2200 };
2192 2201
2193 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); 2202 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2194 } 2203 }
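
The update condition in rt6_mtu_change_route() reduces to: never touch a locked RTAX_MTU metric; always track a decrease; propagate an increase only when the route's PMTU equalled the device MTU, i.e. this link was the bottleneck. A small user-space reduction of that predicate (illustrative names):

#include <stdio.h>

static int should_update_pmtu(unsigned int route_mtu, unsigned int new_mtu,
                              unsigned int link_mtu6, int mtu_locked)
{
        if (mtu_locked)                 /* dst_metric_locked(dst, RTAX_MTU) */
                return 0;
        return route_mtu >= new_mtu ||  /* decrease: always track */
               (route_mtu < new_mtu &&
                route_mtu == link_mtu6);/* increase: only if we were the bottleneck */
}

int main(void)
{
        printf("%d\n", should_update_pmtu(1500, 1400, 1500, 0)); /* 1: decrease */
        printf("%d\n", should_update_pmtu(1400, 1500, 1400, 0)); /* 1: this link was lowest */
        printf("%d\n", should_update_pmtu(1280, 1500, 1400, 0)); /* 0: bottleneck elsewhere */
        return 0;
}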
2195 2204
2196 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 2205 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2197 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 2206 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2198 [RTA_OIF] = { .type = NLA_U32 }, 2207 [RTA_OIF] = { .type = NLA_U32 },
2199 [RTA_IIF] = { .type = NLA_U32 }, 2208 [RTA_IIF] = { .type = NLA_U32 },
2200 [RTA_PRIORITY] = { .type = NLA_U32 }, 2209 [RTA_PRIORITY] = { .type = NLA_U32 },
2201 [RTA_METRICS] = { .type = NLA_NESTED }, 2210 [RTA_METRICS] = { .type = NLA_NESTED },
2202 }; 2211 };
2203 2212
2204 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2213 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2205 struct fib6_config *cfg) 2214 struct fib6_config *cfg)
2206 { 2215 {
2207 struct rtmsg *rtm; 2216 struct rtmsg *rtm;
2208 struct nlattr *tb[RTA_MAX+1]; 2217 struct nlattr *tb[RTA_MAX+1];
2209 int err; 2218 int err;
2210 2219
2211 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); 2220 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2212 if (err < 0) 2221 if (err < 0)
2213 goto errout; 2222 goto errout;
2214 2223
2215 err = -EINVAL; 2224 err = -EINVAL;
2216 rtm = nlmsg_data(nlh); 2225 rtm = nlmsg_data(nlh);
2217 memset(cfg, 0, sizeof(*cfg)); 2226 memset(cfg, 0, sizeof(*cfg));
2218 2227
2219 cfg->fc_table = rtm->rtm_table; 2228 cfg->fc_table = rtm->rtm_table;
2220 cfg->fc_dst_len = rtm->rtm_dst_len; 2229 cfg->fc_dst_len = rtm->rtm_dst_len;
2221 cfg->fc_src_len = rtm->rtm_src_len; 2230 cfg->fc_src_len = rtm->rtm_src_len;
2222 cfg->fc_flags = RTF_UP; 2231 cfg->fc_flags = RTF_UP;
2223 cfg->fc_protocol = rtm->rtm_protocol; 2232 cfg->fc_protocol = rtm->rtm_protocol;
2224 2233
2225 if (rtm->rtm_type == RTN_UNREACHABLE) 2234 if (rtm->rtm_type == RTN_UNREACHABLE)
2226 cfg->fc_flags |= RTF_REJECT; 2235 cfg->fc_flags |= RTF_REJECT;
2227 2236
2228 if (rtm->rtm_type == RTN_LOCAL) 2237 if (rtm->rtm_type == RTN_LOCAL)
2229 cfg->fc_flags |= RTF_LOCAL; 2238 cfg->fc_flags |= RTF_LOCAL;
2230 2239
2231 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; 2240 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2232 cfg->fc_nlinfo.nlh = nlh; 2241 cfg->fc_nlinfo.nlh = nlh;
2233 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 2242 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2234 2243
2235 if (tb[RTA_GATEWAY]) { 2244 if (tb[RTA_GATEWAY]) {
2236 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); 2245 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2237 cfg->fc_flags |= RTF_GATEWAY; 2246 cfg->fc_flags |= RTF_GATEWAY;
2238 } 2247 }
2239 2248
2240 if (tb[RTA_DST]) { 2249 if (tb[RTA_DST]) {
2241 int plen = (rtm->rtm_dst_len + 7) >> 3; 2250 int plen = (rtm->rtm_dst_len + 7) >> 3;
2242 2251
2243 if (nla_len(tb[RTA_DST]) < plen) 2252 if (nla_len(tb[RTA_DST]) < plen)
2244 goto errout; 2253 goto errout;
2245 2254
2246 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 2255 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2247 } 2256 }
2248 2257
2249 if (tb[RTA_SRC]) { 2258 if (tb[RTA_SRC]) {
2250 int plen = (rtm->rtm_src_len + 7) >> 3; 2259 int plen = (rtm->rtm_src_len + 7) >> 3;
2251 2260
2252 if (nla_len(tb[RTA_SRC]) < plen) 2261 if (nla_len(tb[RTA_SRC]) < plen)
2253 goto errout; 2262 goto errout;
2254 2263
2255 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 2264 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2256 } 2265 }
2257 2266
2258 if (tb[RTA_PREFSRC]) 2267 if (tb[RTA_PREFSRC])
2259 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); 2268 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2260 2269
2261 if (tb[RTA_OIF]) 2270 if (tb[RTA_OIF])
2262 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 2271 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2263 2272
2264 if (tb[RTA_PRIORITY]) 2273 if (tb[RTA_PRIORITY])
2265 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 2274 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2266 2275
2267 if (tb[RTA_METRICS]) { 2276 if (tb[RTA_METRICS]) {
2268 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 2277 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2269 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 2278 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2270 } 2279 }
2271 2280
2272 if (tb[RTA_TABLE]) 2281 if (tb[RTA_TABLE])
2273 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 2282 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2274 2283
2275 err = 0; 2284 err = 0;
2276 errout: 2285 errout:
2277 return err; 2286 return err;
2278 } 2287 }
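
For reference, a hedged user-space sketch of a message rtm_to_fib6_config() would accept: an RTM_NEWROUTE carrying RTA_DST, RTA_OIF and RTA_PRIORITY over a raw rtnetlink socket. Attribute layout only, with no bounds or error handling; the ifindex, metric and prefix are placeholders:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/* Append one attribute to the message; no bounds checks, demo only. */
static void put_attr(struct nlmsghdr *nlh, unsigned short type,
                     const void *data, unsigned short len)
{
        struct rtattr *rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

        rta->rta_type = type;
        rta->rta_len = RTA_LENGTH(len);
        memcpy(RTA_DATA(rta), data, len);
        nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
        char buf[512];
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        struct rtmsg *rtm;
        struct in6_addr dst;
        unsigned int oif = 2, prio = 1024;      /* placeholder ifindex/metric */
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        memset(buf, 0, sizeof(buf));
        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
        nlh->nlmsg_type = RTM_NEWROUTE;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;

        rtm = NLMSG_DATA(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = 64;                  /* -> cfg->fc_dst_len */
        rtm->rtm_table = RT_TABLE_MAIN;         /* -> cfg->fc_table */
        rtm->rtm_protocol = RTPROT_STATIC;
        rtm->rtm_type = RTN_UNICAST;

        inet_pton(AF_INET6, "2001:db8::", &dst);
        put_attr(nlh, RTA_DST, &dst, 16);       /* 16 bytes, as the nla checks expect */
        put_attr(nlh, RTA_OIF, &oif, 4);
        put_attr(nlh, RTA_PRIORITY, &prio, 4);

        sendto(fd, nlh, nlh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
        close(fd);
        return 0;
}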
2279 2288
2280 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 2289 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2281 { 2290 {
2282 struct fib6_config cfg; 2291 struct fib6_config cfg;
2283 int err; 2292 int err;
2284 2293
2285 err = rtm_to_fib6_config(skb, nlh, &cfg); 2294 err = rtm_to_fib6_config(skb, nlh, &cfg);
2286 if (err < 0) 2295 if (err < 0)
2287 return err; 2296 return err;
2288 2297
2289 return ip6_route_del(&cfg); 2298 return ip6_route_del(&cfg);
2290 } 2299 }
2291 2300
2292 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 2301 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2293 { 2302 {
2294 struct fib6_config cfg; 2303 struct fib6_config cfg;
2295 int err; 2304 int err;
2296 2305
2297 err = rtm_to_fib6_config(skb, nlh, &cfg); 2306 err = rtm_to_fib6_config(skb, nlh, &cfg);
2298 if (err < 0) 2307 if (err < 0)
2299 return err; 2308 return err;
2300 2309
2301 return ip6_route_add(&cfg); 2310 return ip6_route_add(&cfg);
2302 } 2311 }
2303 2312
2304 static inline size_t rt6_nlmsg_size(void) 2313 static inline size_t rt6_nlmsg_size(void)
2305 { 2314 {
2306 return NLMSG_ALIGN(sizeof(struct rtmsg)) 2315 return NLMSG_ALIGN(sizeof(struct rtmsg))
2307 + nla_total_size(16) /* RTA_SRC */ 2316 + nla_total_size(16) /* RTA_SRC */
2308 + nla_total_size(16) /* RTA_DST */ 2317 + nla_total_size(16) /* RTA_DST */
2309 + nla_total_size(16) /* RTA_GATEWAY */ 2318 + nla_total_size(16) /* RTA_GATEWAY */
2310 + nla_total_size(16) /* RTA_PREFSRC */ 2319 + nla_total_size(16) /* RTA_PREFSRC */
2311 + nla_total_size(4) /* RTA_TABLE */ 2320 + nla_total_size(4) /* RTA_TABLE */
2312 + nla_total_size(4) /* RTA_IIF */ 2321 + nla_total_size(4) /* RTA_IIF */
2313 + nla_total_size(4) /* RTA_OIF */ 2322 + nla_total_size(4) /* RTA_OIF */
2314 + nla_total_size(4) /* RTA_PRIORITY */ 2323 + nla_total_size(4) /* RTA_PRIORITY */
2315 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 2324 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2316 + nla_total_size(sizeof(struct rta_cacheinfo)); 2325 + nla_total_size(sizeof(struct rta_cacheinfo));
2317 } 2326 }
2318 2327
2319 static int rt6_fill_node(struct net *net, 2328 static int rt6_fill_node(struct net *net,
2320 struct sk_buff *skb, struct rt6_info *rt, 2329 struct sk_buff *skb, struct rt6_info *rt,
2321 struct in6_addr *dst, struct in6_addr *src, 2330 struct in6_addr *dst, struct in6_addr *src,
2322 int iif, int type, u32 pid, u32 seq, 2331 int iif, int type, u32 pid, u32 seq,
2323 int prefix, int nowait, unsigned int flags) 2332 int prefix, int nowait, unsigned int flags)
2324 { 2333 {
2325 struct rtmsg *rtm; 2334 struct rtmsg *rtm;
2326 struct nlmsghdr *nlh; 2335 struct nlmsghdr *nlh;
2327 long expires; 2336 long expires;
2328 u32 table; 2337 u32 table;
2338 struct neighbour *n;
2329 2339
2330 if (prefix) { /* user wants prefix routes only */ 2340 if (prefix) { /* user wants prefix routes only */
2331 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { 2341 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2332 /* success since this is not a prefix route */ 2342 /* success since this is not a prefix route */
2333 return 1; 2343 return 1;
2334 } 2344 }
2335 } 2345 }
2336 2346
2337 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); 2347 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2338 if (nlh == NULL) 2348 if (nlh == NULL)
2339 return -EMSGSIZE; 2349 return -EMSGSIZE;
2340 2350
2341 rtm = nlmsg_data(nlh); 2351 rtm = nlmsg_data(nlh);
2342 rtm->rtm_family = AF_INET6; 2352 rtm->rtm_family = AF_INET6;
2343 rtm->rtm_dst_len = rt->rt6i_dst.plen; 2353 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2344 rtm->rtm_src_len = rt->rt6i_src.plen; 2354 rtm->rtm_src_len = rt->rt6i_src.plen;
2345 rtm->rtm_tos = 0; 2355 rtm->rtm_tos = 0;
2346 if (rt->rt6i_table) 2356 if (rt->rt6i_table)
2347 table = rt->rt6i_table->tb6_id; 2357 table = rt->rt6i_table->tb6_id;
2348 else 2358 else
2349 table = RT6_TABLE_UNSPEC; 2359 table = RT6_TABLE_UNSPEC;
2350 rtm->rtm_table = table; 2360 rtm->rtm_table = table;
2351 NLA_PUT_U32(skb, RTA_TABLE, table); 2361 NLA_PUT_U32(skb, RTA_TABLE, table);
2352 if (rt->rt6i_flags&RTF_REJECT) 2362 if (rt->rt6i_flags&RTF_REJECT)
2353 rtm->rtm_type = RTN_UNREACHABLE; 2363 rtm->rtm_type = RTN_UNREACHABLE;
2354 else if (rt->rt6i_flags&RTF_LOCAL) 2364 else if (rt->rt6i_flags&RTF_LOCAL)
2355 rtm->rtm_type = RTN_LOCAL; 2365 rtm->rtm_type = RTN_LOCAL;
2356 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) 2366 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2357 rtm->rtm_type = RTN_LOCAL; 2367 rtm->rtm_type = RTN_LOCAL;
2358 else 2368 else
2359 rtm->rtm_type = RTN_UNICAST; 2369 rtm->rtm_type = RTN_UNICAST;
2360 rtm->rtm_flags = 0; 2370 rtm->rtm_flags = 0;
2361 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 2371 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2362 rtm->rtm_protocol = rt->rt6i_protocol; 2372 rtm->rtm_protocol = rt->rt6i_protocol;
2363 if (rt->rt6i_flags&RTF_DYNAMIC) 2373 if (rt->rt6i_flags&RTF_DYNAMIC)
2364 rtm->rtm_protocol = RTPROT_REDIRECT; 2374 rtm->rtm_protocol = RTPROT_REDIRECT;
2365 else if (rt->rt6i_flags & RTF_ADDRCONF) 2375 else if (rt->rt6i_flags & RTF_ADDRCONF)
2366 rtm->rtm_protocol = RTPROT_KERNEL; 2376 rtm->rtm_protocol = RTPROT_KERNEL;
2367 else if (rt->rt6i_flags&RTF_DEFAULT) 2377 else if (rt->rt6i_flags&RTF_DEFAULT)
2368 rtm->rtm_protocol = RTPROT_RA; 2378 rtm->rtm_protocol = RTPROT_RA;
2369 2379
2370 if (rt->rt6i_flags&RTF_CACHE) 2380 if (rt->rt6i_flags&RTF_CACHE)
2371 rtm->rtm_flags |= RTM_F_CLONED; 2381 rtm->rtm_flags |= RTM_F_CLONED;
2372 2382
2373 if (dst) { 2383 if (dst) {
2374 NLA_PUT(skb, RTA_DST, 16, dst); 2384 NLA_PUT(skb, RTA_DST, 16, dst);
2375 rtm->rtm_dst_len = 128; 2385 rtm->rtm_dst_len = 128;
2376 } else if (rtm->rtm_dst_len) 2386 } else if (rtm->rtm_dst_len)
2377 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); 2387 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2378 #ifdef CONFIG_IPV6_SUBTREES 2388 #ifdef CONFIG_IPV6_SUBTREES
2379 if (src) { 2389 if (src) {
2380 NLA_PUT(skb, RTA_SRC, 16, src); 2390 NLA_PUT(skb, RTA_SRC, 16, src);
2381 rtm->rtm_src_len = 128; 2391 rtm->rtm_src_len = 128;
2382 } else if (rtm->rtm_src_len) 2392 } else if (rtm->rtm_src_len)
2383 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); 2393 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2384 #endif 2394 #endif
2385 if (iif) { 2395 if (iif) {
2386 #ifdef CONFIG_IPV6_MROUTE 2396 #ifdef CONFIG_IPV6_MROUTE
2387 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 2397 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2388 int err = ip6mr_get_route(net, skb, rtm, nowait); 2398 int err = ip6mr_get_route(net, skb, rtm, nowait);
2389 if (err <= 0) { 2399 if (err <= 0) {
2390 if (!nowait) { 2400 if (!nowait) {
2391 if (err == 0) 2401 if (err == 0)
2392 return 0; 2402 return 0;
2393 goto nla_put_failure; 2403 goto nla_put_failure;
2394 } else { 2404 } else {
2395 if (err == -EMSGSIZE) 2405 if (err == -EMSGSIZE)
2396 goto nla_put_failure; 2406 goto nla_put_failure;
2397 } 2407 }
2398 } 2408 }
2399 } else 2409 } else
2400 #endif 2410 #endif
2401 NLA_PUT_U32(skb, RTA_IIF, iif); 2411 NLA_PUT_U32(skb, RTA_IIF, iif);
2402 } else if (dst) { 2412 } else if (dst) {
2403 struct in6_addr saddr_buf; 2413 struct in6_addr saddr_buf;
2404 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0) 2414 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2405 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); 2415 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2406 } 2416 }
2407 2417
2408 if (rt->rt6i_prefsrc.plen) { 2418 if (rt->rt6i_prefsrc.plen) {
2409 struct in6_addr saddr_buf; 2419 struct in6_addr saddr_buf;
2410 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr); 2420 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2411 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); 2421 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2412 } 2422 }
2413 2423
2414 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2424 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2415 goto nla_put_failure; 2425 goto nla_put_failure;
2416 2426
2417 if (dst_get_neighbour(&rt->dst)) 2427 rcu_read_lock();
2418 NLA_PUT(skb, RTA_GATEWAY, 16, &dst_get_neighbour(&rt->dst)->primary_key); 2428 n = dst_get_neighbour(&rt->dst);
2429 if (n)
2430 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2431 rcu_read_unlock();
2419 2432
2420 if (rt->dst.dev) 2433 if (rt->dst.dev)
2421 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); 2434 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2422 2435
2423 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); 2436 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2424 2437
2425 if (!(rt->rt6i_flags & RTF_EXPIRES)) 2438 if (!(rt->rt6i_flags & RTF_EXPIRES))
2426 expires = 0; 2439 expires = 0;
2427 else if (rt->rt6i_expires - jiffies < INT_MAX) 2440 else if (rt->rt6i_expires - jiffies < INT_MAX)
2428 expires = rt->rt6i_expires - jiffies; 2441 expires = rt->rt6i_expires - jiffies;
2429 else 2442 else
2430 expires = INT_MAX; 2443 expires = INT_MAX;
2431 2444
2432 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, 2445 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2433 expires, rt->dst.error) < 0) 2446 expires, rt->dst.error) < 0)
2434 goto nla_put_failure; 2447 goto nla_put_failure;
2435 2448
2436 return nlmsg_end(skb, nlh); 2449 return nlmsg_end(skb, nlh);
2437 2450
2438 nla_put_failure: 2451 nla_put_failure:
2439 nlmsg_cancel(skb, nlh); 2452 nlmsg_cancel(skb, nlh);
2440 return -EMSGSIZE; 2453 return -EMSGSIZE;
2441 } 2454 }
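
The RTA_GATEWAY hunk above is the heart of the fix in this file: the old code called dst_get_neighbour() twice with no protection, so a concurrent redirect could replace the neighbour between the NULL check and the dereference. The new code fetches the pointer once, inside an RCU read-side critical section. A minimal reader-side sketch of the rule this commit enforces follows; it is illustrative only, consume() is a hypothetical stand-in for any real user of the neighbour, and every exit must stay behind rcu_read_unlock():

	/* Reader-side sketch (not part of the diff): fetch the dst
	 * neighbour exactly once, under rcu_read_lock(), and do not
	 * touch it after the unlock. */
	static void consume(const unsigned char *key);

	static void neigh_reader(struct dst_entry *dst)
	{
		struct neighbour *n;

		rcu_read_lock();
		n = dst_get_neighbour(dst);	/* one rcu_dereference()d fetch */
		if (n)
			consume(n->primary_key);	/* n is guaranteed live here */
		rcu_read_unlock();		/* n must not be used past this */
	}
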
2442 2455
2443 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 2456 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2444 { 2457 {
2445 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 2458 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2446 int prefix; 2459 int prefix;
2447 2460
2448 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 2461 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2449 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 2462 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2450 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; 2463 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2451 } else 2464 } else
2452 prefix = 0; 2465 prefix = 0;
2453 2466
2454 return rt6_fill_node(arg->net, 2467 return rt6_fill_node(arg->net,
2455 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 2468 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2456 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, 2469 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2457 prefix, 0, NLM_F_MULTI); 2470 prefix, 0, NLM_F_MULTI);
2458 } 2471 }
2459 2472
2460 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2473 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2461 { 2474 {
2462 struct net *net = sock_net(in_skb->sk); 2475 struct net *net = sock_net(in_skb->sk);
2463 struct nlattr *tb[RTA_MAX+1]; 2476 struct nlattr *tb[RTA_MAX+1];
2464 struct rt6_info *rt; 2477 struct rt6_info *rt;
2465 struct sk_buff *skb; 2478 struct sk_buff *skb;
2466 struct rtmsg *rtm; 2479 struct rtmsg *rtm;
2467 struct flowi6 fl6; 2480 struct flowi6 fl6;
2468 int err, iif = 0; 2481 int err, iif = 0;
2469 2482
2470 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); 2483 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2471 if (err < 0) 2484 if (err < 0)
2472 goto errout; 2485 goto errout;
2473 2486
2474 err = -EINVAL; 2487 err = -EINVAL;
2475 memset(&fl6, 0, sizeof(fl6)); 2488 memset(&fl6, 0, sizeof(fl6));
2476 2489
2477 if (tb[RTA_SRC]) { 2490 if (tb[RTA_SRC]) {
2478 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 2491 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2479 goto errout; 2492 goto errout;
2480 2493
2481 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC])); 2494 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2482 } 2495 }
2483 2496
2484 if (tb[RTA_DST]) { 2497 if (tb[RTA_DST]) {
2485 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 2498 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2486 goto errout; 2499 goto errout;
2487 2500
2488 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST])); 2501 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2489 } 2502 }
2490 2503
2491 if (tb[RTA_IIF]) 2504 if (tb[RTA_IIF])
2492 iif = nla_get_u32(tb[RTA_IIF]); 2505 iif = nla_get_u32(tb[RTA_IIF]);
2493 2506
2494 if (tb[RTA_OIF]) 2507 if (tb[RTA_OIF])
2495 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]); 2508 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2496 2509
2497 if (iif) { 2510 if (iif) {
2498 struct net_device *dev; 2511 struct net_device *dev;
2499 dev = __dev_get_by_index(net, iif); 2512 dev = __dev_get_by_index(net, iif);
2500 if (!dev) { 2513 if (!dev) {
2501 err = -ENODEV; 2514 err = -ENODEV;
2502 goto errout; 2515 goto errout;
2503 } 2516 }
2504 } 2517 }
2505 2518
2506 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2519 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2507 if (skb == NULL) { 2520 if (skb == NULL) {
2508 err = -ENOBUFS; 2521 err = -ENOBUFS;
2509 goto errout; 2522 goto errout;
2510 } 2523 }
2511 2524
2512 /* Reserve room for dummy headers, this skb can pass 2525 /* Reserve room for dummy headers, this skb can pass
2513 through good chunk of routing engine. 2526 through good chunk of routing engine.
2514 */ 2527 */
2515 skb_reset_mac_header(skb); 2528 skb_reset_mac_header(skb);
2516 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); 2529 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2517 2530
2518 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6); 2531 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2519 skb_dst_set(skb, &rt->dst); 2532 skb_dst_set(skb, &rt->dst);
2520 2533
2521 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 2534 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2522 RTM_NEWROUTE, NETLINK_CB(in_skb).pid, 2535 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2523 nlh->nlmsg_seq, 0, 0, 0); 2536 nlh->nlmsg_seq, 0, 0, 0);
2524 if (err < 0) { 2537 if (err < 0) {
2525 kfree_skb(skb); 2538 kfree_skb(skb);
2526 goto errout; 2539 goto errout;
2527 } 2540 }
2528 2541
2529 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2542 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2530 errout: 2543 errout:
2531 return err; 2544 return err;
2532 } 2545 }
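
For context, inet6_rtm_getroute() above is what answers "ip -6 route get". A userspace sketch that drives it with a raw RTM_GETROUTE request is below; it is illustrative and not part of the commit, and the address and buffer sizes are arbitrary example values:

	#include <arpa/inet.h>
	#include <linux/rtnetlink.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct {
			struct nlmsghdr nlh;
			struct rtmsg rtm;
			char buf[RTA_SPACE(sizeof(struct in6_addr))];
		} req;
		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
		struct rtattr *rta;
		struct in6_addr dst;
		char reply[4096];
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

		if (fd < 0 || inet_pton(AF_INET6, "2001:db8::1", &dst) != 1)
			return 1;

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
		req.nlh.nlmsg_type = RTM_GETROUTE;
		req.nlh.nlmsg_flags = NLM_F_REQUEST;
		req.rtm.rtm_family = AF_INET6;

		/* Append RTA_DST, which lands in tb[RTA_DST] above. */
		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
		rta->rta_type = RTA_DST;
		rta->rta_len = RTA_LENGTH(sizeof(dst));
		memcpy(RTA_DATA(rta), &dst, sizeof(dst));
		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

		if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
			   (struct sockaddr *)&sa, sizeof(sa)) < 0)
			return 1;

		/* The kernel replies with the RTM_NEWROUTE message built by
		 * rt6_fill_node() and delivered via rtnl_unicast(). */
		if (recv(fd, reply, sizeof(reply), 0) > 0)
			printf("reply type %u\n",
			       ((struct nlmsghdr *)reply)->nlmsg_type);
		close(fd);
		return 0;
	}
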
2533 2546
2534 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) 2547 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2535 { 2548 {
2536 struct sk_buff *skb; 2549 struct sk_buff *skb;
2537 struct net *net = info->nl_net; 2550 struct net *net = info->nl_net;
2538 u32 seq; 2551 u32 seq;
2539 int err; 2552 int err;
2540 2553
2541 err = -ENOBUFS; 2554 err = -ENOBUFS;
2542 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0; 2555 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2543 2556
2544 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); 2557 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2545 if (skb == NULL) 2558 if (skb == NULL)
2546 goto errout; 2559 goto errout;
2547 2560
2548 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 2561 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2549 event, info->pid, seq, 0, 0, 0); 2562 event, info->pid, seq, 0, 0, 0);
2550 if (err < 0) { 2563 if (err < 0) {
2551 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 2564 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2552 WARN_ON(err == -EMSGSIZE); 2565 WARN_ON(err == -EMSGSIZE);
2553 kfree_skb(skb); 2566 kfree_skb(skb);
2554 goto errout; 2567 goto errout;
2555 } 2568 }
2556 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, 2569 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2557 info->nlh, gfp_any()); 2570 info->nlh, gfp_any());
2558 return; 2571 return;
2559 errout: 2572 errout:
2560 if (err < 0) 2573 if (err < 0)
2561 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 2574 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2562 } 2575 }
2563 2576
2564 static int ip6_route_dev_notify(struct notifier_block *this, 2577 static int ip6_route_dev_notify(struct notifier_block *this,
2565 unsigned long event, void *data) 2578 unsigned long event, void *data)
2566 { 2579 {
2567 struct net_device *dev = (struct net_device *)data; 2580 struct net_device *dev = (struct net_device *)data;
2568 struct net *net = dev_net(dev); 2581 struct net *net = dev_net(dev);
2569 2582
2570 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { 2583 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2571 net->ipv6.ip6_null_entry->dst.dev = dev; 2584 net->ipv6.ip6_null_entry->dst.dev = dev;
2572 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 2585 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2573 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2586 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2574 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 2587 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2575 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 2588 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2576 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 2589 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2577 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 2590 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2578 #endif 2591 #endif
2579 } 2592 }
2580 2593
2581 return NOTIFY_OK; 2594 return NOTIFY_OK;
2582 } 2595 }
2583 2596
2584 /* 2597 /*
2585 * /proc 2598 * /proc
2586 */ 2599 */
2587 2600
2588 #ifdef CONFIG_PROC_FS 2601 #ifdef CONFIG_PROC_FS
2589 2602
2590 struct rt6_proc_arg 2603 struct rt6_proc_arg
2591 { 2604 {
2592 char *buffer; 2605 char *buffer;
2593 int offset; 2606 int offset;
2594 int length; 2607 int length;
2595 int skip; 2608 int skip;
2596 int len; 2609 int len;
2597 }; 2610 };
2598 2611
2599 static int rt6_info_route(struct rt6_info *rt, void *p_arg) 2612 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2600 { 2613 {
2601 struct seq_file *m = p_arg; 2614 struct seq_file *m = p_arg;
2602 struct neighbour *n; 2615 struct neighbour *n;
2603 2616
2604 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); 2617 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2605 2618
2606 #ifdef CONFIG_IPV6_SUBTREES 2619 #ifdef CONFIG_IPV6_SUBTREES
2607 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); 2620 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2608 #else 2621 #else
2609 seq_puts(m, "00000000000000000000000000000000 00 "); 2622 seq_puts(m, "00000000000000000000000000000000 00 ");
2610 #endif 2623 #endif
2624 rcu_read_lock();
2611 n = dst_get_neighbour(&rt->dst); 2625 n = dst_get_neighbour(&rt->dst);
2612 if (n) { 2626 if (n) {
2613 seq_printf(m, "%pi6", n->primary_key); 2627 seq_printf(m, "%pi6", n->primary_key);
2614 } else { 2628 } else {
2615 seq_puts(m, "00000000000000000000000000000000"); 2629 seq_puts(m, "00000000000000000000000000000000");
2616 } 2630 }
2631 rcu_read_unlock();
2617 seq_printf(m, " %08x %08x %08x %08x %8s\n", 2632 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2618 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), 2633 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2619 rt->dst.__use, rt->rt6i_flags, 2634 rt->dst.__use, rt->rt6i_flags,
2620 rt->rt6i_dev ? rt->rt6i_dev->name : ""); 2635 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2621 return 0; 2636 return 0;
2622 } 2637 }
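
The /proc dump above takes the same read-side precautions as rt6_fill_node(), since a redirect can swap the neighbour underneath it. For reference, a sketch of the accessor pair that underpins the pattern, abridged from my reading of the net/dst.h changes earlier in this commit:

	static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
	{
		/* Reader side: callers must hold rcu_read_lock(). */
		return rcu_dereference(dst->_neighbour);
	}

	static inline void dst_set_neighbour(struct dst_entry *dst,
					     struct neighbour *neigh)
	{
		/* Writer side: publish the pointer so readers only ever see
		 * a fully initialised neighbour; pairs with the
		 * rcu_dereference() above. */
		rcu_assign_pointer(dst->_neighbour, neigh);
	}
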
2623 2638
2624 static int ipv6_route_show(struct seq_file *m, void *v) 2639 static int ipv6_route_show(struct seq_file *m, void *v)
2625 { 2640 {
2626 struct net *net = (struct net *)m->private; 2641 struct net *net = (struct net *)m->private;
2627 fib6_clean_all(net, rt6_info_route, 0, m); 2642 fib6_clean_all(net, rt6_info_route, 0, m);
2628 return 0; 2643 return 0;
2629 } 2644 }
2630 2645
2631 static int ipv6_route_open(struct inode *inode, struct file *file) 2646 static int ipv6_route_open(struct inode *inode, struct file *file)
2632 { 2647 {
2633 return single_open_net(inode, file, ipv6_route_show); 2648 return single_open_net(inode, file, ipv6_route_show);
2634 } 2649 }
2635 2650
2636 static const struct file_operations ipv6_route_proc_fops = { 2651 static const struct file_operations ipv6_route_proc_fops = {
2637 .owner = THIS_MODULE, 2652 .owner = THIS_MODULE,
2638 .open = ipv6_route_open, 2653 .open = ipv6_route_open,
2639 .read = seq_read, 2654 .read = seq_read,
2640 .llseek = seq_lseek, 2655 .llseek = seq_lseek,
2641 .release = single_release_net, 2656 .release = single_release_net,
2642 }; 2657 };
2643 2658
2644 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 2659 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2645 { 2660 {
2646 struct net *net = (struct net *)seq->private; 2661 struct net *net = (struct net *)seq->private;
2647 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 2662 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2648 net->ipv6.rt6_stats->fib_nodes, 2663 net->ipv6.rt6_stats->fib_nodes,
2649 net->ipv6.rt6_stats->fib_route_nodes, 2664 net->ipv6.rt6_stats->fib_route_nodes,
2650 net->ipv6.rt6_stats->fib_rt_alloc, 2665 net->ipv6.rt6_stats->fib_rt_alloc,
2651 net->ipv6.rt6_stats->fib_rt_entries, 2666 net->ipv6.rt6_stats->fib_rt_entries,
2652 net->ipv6.rt6_stats->fib_rt_cache, 2667 net->ipv6.rt6_stats->fib_rt_cache,
2653 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 2668 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2654 net->ipv6.rt6_stats->fib_discarded_routes); 2669 net->ipv6.rt6_stats->fib_discarded_routes);
2655 2670
2656 return 0; 2671 return 0;
2657 } 2672 }
2658 2673
2659 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 2674 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2660 { 2675 {
2661 return single_open_net(inode, file, rt6_stats_seq_show); 2676 return single_open_net(inode, file, rt6_stats_seq_show);
2662 } 2677 }
2663 2678
2664 static const struct file_operations rt6_stats_seq_fops = { 2679 static const struct file_operations rt6_stats_seq_fops = {
2665 .owner = THIS_MODULE, 2680 .owner = THIS_MODULE,
2666 .open = rt6_stats_seq_open, 2681 .open = rt6_stats_seq_open,
2667 .read = seq_read, 2682 .read = seq_read,
2668 .llseek = seq_lseek, 2683 .llseek = seq_lseek,
2669 .release = single_release_net, 2684 .release = single_release_net,
2670 }; 2685 };
2671 #endif /* CONFIG_PROC_FS */ 2686 #endif /* CONFIG_PROC_FS */
2672 2687
2673 #ifdef CONFIG_SYSCTL 2688 #ifdef CONFIG_SYSCTL
2674 2689
2675 static 2690 static
2676 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, 2691 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2677 void __user *buffer, size_t *lenp, loff_t *ppos) 2692 void __user *buffer, size_t *lenp, loff_t *ppos)
2678 { 2693 {
2679 struct net *net; 2694 struct net *net;
2680 int delay; 2695 int delay;
2681 if (!write) 2696 if (!write)
2682 return -EINVAL; 2697 return -EINVAL;
2683 2698
2684 net = (struct net *)ctl->extra1; 2699 net = (struct net *)ctl->extra1;
2685 delay = net->ipv6.sysctl.flush_delay; 2700 delay = net->ipv6.sysctl.flush_delay;
2686 proc_dointvec(ctl, write, buffer, lenp, ppos); 2701 proc_dointvec(ctl, write, buffer, lenp, ppos);
2687 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); 2702 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2688 return 0; 2703 return 0;
2689 } 2704 }
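
A userspace sketch that pokes the "flush" handler above (illustrative, not part of the commit; the path assumes the usual procfs sysctl mount). Note that, as written, ipv6_sysctl_rtcache_flush() reads flush_delay into `delay` before proc_dointvec() stores the newly written value, so fib6_run_gc() runs with the previously stored delay, and a delay <= 0 passes ~0UL:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Any write triggers the handler and kicks fib6_run_gc(). */
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}
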
2690 2705
2691 ctl_table ipv6_route_table_template[] = { 2706 ctl_table ipv6_route_table_template[] = {
2692 { 2707 {
2693 .procname = "flush", 2708 .procname = "flush",
2694 .data = &init_net.ipv6.sysctl.flush_delay, 2709 .data = &init_net.ipv6.sysctl.flush_delay,
2695 .maxlen = sizeof(int), 2710 .maxlen = sizeof(int),
2696 .mode = 0200, 2711 .mode = 0200,
2697 .proc_handler = ipv6_sysctl_rtcache_flush 2712 .proc_handler = ipv6_sysctl_rtcache_flush
2698 }, 2713 },
2699 { 2714 {
2700 .procname = "gc_thresh", 2715 .procname = "gc_thresh",
2701 .data = &ip6_dst_ops_template.gc_thresh, 2716 .data = &ip6_dst_ops_template.gc_thresh,
2702 .maxlen = sizeof(int), 2717 .maxlen = sizeof(int),
2703 .mode = 0644, 2718 .mode = 0644,
2704 .proc_handler = proc_dointvec, 2719 .proc_handler = proc_dointvec,
2705 }, 2720 },
2706 { 2721 {
2707 .procname = "max_size", 2722 .procname = "max_size",
2708 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 2723 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2709 .maxlen = sizeof(int), 2724 .maxlen = sizeof(int),
2710 .mode = 0644, 2725 .mode = 0644,
2711 .proc_handler = proc_dointvec, 2726 .proc_handler = proc_dointvec,
2712 }, 2727 },
2713 { 2728 {
2714 .procname = "gc_min_interval", 2729 .procname = "gc_min_interval",
2715 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 2730 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2716 .maxlen = sizeof(int), 2731 .maxlen = sizeof(int),
2717 .mode = 0644, 2732 .mode = 0644,
2718 .proc_handler = proc_dointvec_jiffies, 2733 .proc_handler = proc_dointvec_jiffies,
2719 }, 2734 },
2720 { 2735 {
2721 .procname = "gc_timeout", 2736 .procname = "gc_timeout",
2722 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 2737 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2723 .maxlen = sizeof(int), 2738 .maxlen = sizeof(int),
2724 .mode = 0644, 2739 .mode = 0644,
2725 .proc_handler = proc_dointvec_jiffies, 2740 .proc_handler = proc_dointvec_jiffies,
2726 }, 2741 },
2727 { 2742 {
2728 .procname = "gc_interval", 2743 .procname = "gc_interval",
2729 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 2744 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2730 .maxlen = sizeof(int), 2745 .maxlen = sizeof(int),
2731 .mode = 0644, 2746 .mode = 0644,
2732 .proc_handler = proc_dointvec_jiffies, 2747 .proc_handler = proc_dointvec_jiffies,
2733 }, 2748 },
2734 { 2749 {
2735 .procname = "gc_elasticity", 2750 .procname = "gc_elasticity",
2736 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 2751 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2737 .maxlen = sizeof(int), 2752 .maxlen = sizeof(int),
2738 .mode = 0644, 2753 .mode = 0644,
2739 .proc_handler = proc_dointvec, 2754 .proc_handler = proc_dointvec,
2740 }, 2755 },
2741 { 2756 {
2742 .procname = "mtu_expires", 2757 .procname = "mtu_expires",
2743 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 2758 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2744 .maxlen = sizeof(int), 2759 .maxlen = sizeof(int),
2745 .mode = 0644, 2760 .mode = 0644,
2746 .proc_handler = proc_dointvec_jiffies, 2761 .proc_handler = proc_dointvec_jiffies,
2747 }, 2762 },
2748 { 2763 {
2749 .procname = "min_adv_mss", 2764 .procname = "min_adv_mss",
2750 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 2765 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2751 .maxlen = sizeof(int), 2766 .maxlen = sizeof(int),
2752 .mode = 0644, 2767 .mode = 0644,
2753 .proc_handler = proc_dointvec, 2768 .proc_handler = proc_dointvec,
2754 }, 2769 },
2755 { 2770 {
2756 .procname = "gc_min_interval_ms", 2771 .procname = "gc_min_interval_ms",
2757 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 2772 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758 .maxlen = sizeof(int), 2773 .maxlen = sizeof(int),
2759 .mode = 0644, 2774 .mode = 0644,
2760 .proc_handler = proc_dointvec_ms_jiffies, 2775 .proc_handler = proc_dointvec_ms_jiffies,
2761 }, 2776 },
2762 { } 2777 { }
2763 }; 2778 };
2764 2779
2765 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 2780 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2766 { 2781 {
2767 struct ctl_table *table; 2782 struct ctl_table *table;
2768 2783
2769 table = kmemdup(ipv6_route_table_template, 2784 table = kmemdup(ipv6_route_table_template,
2770 sizeof(ipv6_route_table_template), 2785 sizeof(ipv6_route_table_template),
2771 GFP_KERNEL); 2786 GFP_KERNEL);
2772 2787
2773 if (table) { 2788 if (table) {
2774 table[0].data = &net->ipv6.sysctl.flush_delay; 2789 table[0].data = &net->ipv6.sysctl.flush_delay;
2775 table[0].extra1 = net; 2790 table[0].extra1 = net;
2776 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 2791 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2777 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 2792 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2778 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 2793 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2779 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 2794 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2780 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 2795 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2781 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 2796 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2782 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 2797 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2783 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 2798 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2784 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 2799 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2785 } 2800 }
2786 2801
2787 return table; 2802 return table;
2788 } 2803 }
2789 #endif 2804 #endif
2790 2805
2791 static int __net_init ip6_route_net_init(struct net *net) 2806 static int __net_init ip6_route_net_init(struct net *net)
2792 { 2807 {
2793 int ret = -ENOMEM; 2808 int ret = -ENOMEM;
2794 2809
2795 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 2810 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2796 sizeof(net->ipv6.ip6_dst_ops)); 2811 sizeof(net->ipv6.ip6_dst_ops));
2797 2812
2798 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 2813 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2799 goto out_ip6_dst_ops; 2814 goto out_ip6_dst_ops;
2800 2815
2801 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 2816 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2802 sizeof(*net->ipv6.ip6_null_entry), 2817 sizeof(*net->ipv6.ip6_null_entry),
2803 GFP_KERNEL); 2818 GFP_KERNEL);
2804 if (!net->ipv6.ip6_null_entry) 2819 if (!net->ipv6.ip6_null_entry)
2805 goto out_ip6_dst_entries; 2820 goto out_ip6_dst_entries;
2806 net->ipv6.ip6_null_entry->dst.path = 2821 net->ipv6.ip6_null_entry->dst.path =
2807 (struct dst_entry *)net->ipv6.ip6_null_entry; 2822 (struct dst_entry *)net->ipv6.ip6_null_entry;
2808 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2823 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2809 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 2824 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2810 ip6_template_metrics, true); 2825 ip6_template_metrics, true);
2811 2826
2812 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2827 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2813 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 2828 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2814 sizeof(*net->ipv6.ip6_prohibit_entry), 2829 sizeof(*net->ipv6.ip6_prohibit_entry),
2815 GFP_KERNEL); 2830 GFP_KERNEL);
2816 if (!net->ipv6.ip6_prohibit_entry) 2831 if (!net->ipv6.ip6_prohibit_entry)
2817 goto out_ip6_null_entry; 2832 goto out_ip6_null_entry;
2818 net->ipv6.ip6_prohibit_entry->dst.path = 2833 net->ipv6.ip6_prohibit_entry->dst.path =
2819 (struct dst_entry *)net->ipv6.ip6_prohibit_entry; 2834 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2820 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2835 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2821 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 2836 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2822 ip6_template_metrics, true); 2837 ip6_template_metrics, true);
2823 2838
2824 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 2839 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2825 sizeof(*net->ipv6.ip6_blk_hole_entry), 2840 sizeof(*net->ipv6.ip6_blk_hole_entry),
2826 GFP_KERNEL); 2841 GFP_KERNEL);
2827 if (!net->ipv6.ip6_blk_hole_entry) 2842 if (!net->ipv6.ip6_blk_hole_entry)
2828 goto out_ip6_prohibit_entry; 2843 goto out_ip6_prohibit_entry;
2829 net->ipv6.ip6_blk_hole_entry->dst.path = 2844 net->ipv6.ip6_blk_hole_entry->dst.path =
2830 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; 2845 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2831 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2846 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2832 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 2847 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2833 ip6_template_metrics, true); 2848 ip6_template_metrics, true);
2834 #endif 2849 #endif
2835 2850
2836 net->ipv6.sysctl.flush_delay = 0; 2851 net->ipv6.sysctl.flush_delay = 0;
2837 net->ipv6.sysctl.ip6_rt_max_size = 4096; 2852 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2838 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 2853 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2839 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 2854 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2840 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 2855 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2841 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 2856 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2842 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 2857 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2843 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 2858 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2844 2859
2845 #ifdef CONFIG_PROC_FS 2860 #ifdef CONFIG_PROC_FS
2846 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); 2861 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2847 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); 2862 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2848 #endif 2863 #endif
2849 net->ipv6.ip6_rt_gc_expire = 30*HZ; 2864 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2850 2865
2851 ret = 0; 2866 ret = 0;
2852 out: 2867 out:
2853 return ret; 2868 return ret;
2854 2869
2855 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2870 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2856 out_ip6_prohibit_entry: 2871 out_ip6_prohibit_entry:
2857 kfree(net->ipv6.ip6_prohibit_entry); 2872 kfree(net->ipv6.ip6_prohibit_entry);
2858 out_ip6_null_entry: 2873 out_ip6_null_entry:
2859 kfree(net->ipv6.ip6_null_entry); 2874 kfree(net->ipv6.ip6_null_entry);
2860 #endif 2875 #endif
2861 out_ip6_dst_entries: 2876 out_ip6_dst_entries:
2862 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 2877 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2863 out_ip6_dst_ops: 2878 out_ip6_dst_ops:
2864 goto out; 2879 goto out;
2865 } 2880 }
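
ip6_route_net_init() above, like ip6_route_init() below, follows the kernel's standard goto-unwind idiom: the success path falls through to `out`, and each failure label releases exactly what was set up before the failing step, in reverse order. A stripped-down, hypothetical two-step sketch of the shape (step_a/step_b/undo_step_a are placeholders, not kernel functions):

	static int step_a(void);
	static int step_b(void);
	static void undo_step_a(void);

	static int example_init(void)
	{
		int ret;

		ret = step_a();
		if (ret)
			goto out;		/* nothing to undo yet */

		ret = step_b();
		if (ret)
			goto out_undo_a;	/* undo only what succeeded */

		return 0;

	out_undo_a:
		undo_step_a();
	out:
		return ret;
	}
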
2866 2881
2867 static void __net_exit ip6_route_net_exit(struct net *net) 2882 static void __net_exit ip6_route_net_exit(struct net *net)
2868 { 2883 {
2869 #ifdef CONFIG_PROC_FS 2884 #ifdef CONFIG_PROC_FS
2870 proc_net_remove(net, "ipv6_route"); 2885 proc_net_remove(net, "ipv6_route");
2871 proc_net_remove(net, "rt6_stats"); 2886 proc_net_remove(net, "rt6_stats");
2872 #endif 2887 #endif
2873 kfree(net->ipv6.ip6_null_entry); 2888 kfree(net->ipv6.ip6_null_entry);
2874 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2889 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2875 kfree(net->ipv6.ip6_prohibit_entry); 2890 kfree(net->ipv6.ip6_prohibit_entry);
2876 kfree(net->ipv6.ip6_blk_hole_entry); 2891 kfree(net->ipv6.ip6_blk_hole_entry);
2877 #endif 2892 #endif
2878 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 2893 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2879 } 2894 }
2880 2895
2881 static struct pernet_operations ip6_route_net_ops = { 2896 static struct pernet_operations ip6_route_net_ops = {
2882 .init = ip6_route_net_init, 2897 .init = ip6_route_net_init,
2883 .exit = ip6_route_net_exit, 2898 .exit = ip6_route_net_exit,
2884 }; 2899 };
2885 2900
2886 static struct notifier_block ip6_route_dev_notifier = { 2901 static struct notifier_block ip6_route_dev_notifier = {
2887 .notifier_call = ip6_route_dev_notify, 2902 .notifier_call = ip6_route_dev_notify,
2888 .priority = 0, 2903 .priority = 0,
2889 }; 2904 };
2890 2905
2891 int __init ip6_route_init(void) 2906 int __init ip6_route_init(void)
2892 { 2907 {
2893 int ret; 2908 int ret;
2894 2909
2895 ret = -ENOMEM; 2910 ret = -ENOMEM;
2896 ip6_dst_ops_template.kmem_cachep = 2911 ip6_dst_ops_template.kmem_cachep =
2897 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 2912 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2898 SLAB_HWCACHE_ALIGN, NULL); 2913 SLAB_HWCACHE_ALIGN, NULL);
2899 if (!ip6_dst_ops_template.kmem_cachep) 2914 if (!ip6_dst_ops_template.kmem_cachep)
2900 goto out; 2915 goto out;
2901 2916
2902 ret = dst_entries_init(&ip6_dst_blackhole_ops); 2917 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2903 if (ret) 2918 if (ret)
2904 goto out_kmem_cache; 2919 goto out_kmem_cache;
2905 2920
2906 ret = register_pernet_subsys(&ip6_route_net_ops); 2921 ret = register_pernet_subsys(&ip6_route_net_ops);
2907 if (ret) 2922 if (ret)
2908 goto out_dst_entries; 2923 goto out_dst_entries;
2909 2924
2910 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 2925 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2911 2926
2912 /* Registering of the loopback is done before this portion of code, 2927 /* Registering of the loopback is done before this portion of code,
2913 * the loopback reference in rt6_info will not be taken, do it 2928 * the loopback reference in rt6_info will not be taken, do it
2914 * manually for init_net */ 2929 * manually for init_net */
2915 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 2930 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2916 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 2931 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2917 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2932 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2918 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 2933 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2919 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 2934 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2920 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 2935 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2921 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 2936 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2922 #endif 2937 #endif
2923 ret = fib6_init(); 2938 ret = fib6_init();
2924 if (ret) 2939 if (ret)
2925 goto out_register_subsys; 2940 goto out_register_subsys;
2926 2941
2927 ret = xfrm6_init(); 2942 ret = xfrm6_init();
2928 if (ret) 2943 if (ret)
2929 goto out_fib6_init; 2944 goto out_fib6_init;
2930 2945
2931 ret = fib6_rules_init(); 2946 ret = fib6_rules_init();
2932 if (ret) 2947 if (ret)
2933 goto xfrm6_init; 2948 goto xfrm6_init;
2934 2949
2935 ret = -ENOBUFS; 2950 ret = -ENOBUFS;
2936 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || 2951 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2937 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || 2952 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2938 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) 2953 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2939 goto fib6_rules_init; 2954 goto fib6_rules_init;
2940 2955
2941 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 2956 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2942 if (ret) 2957 if (ret)
2943 goto fib6_rules_init; 2958 goto fib6_rules_init;
2944 2959
2945 out: 2960 out:
2946 return ret; 2961 return ret;
2947 2962
2948 fib6_rules_init: 2963 fib6_rules_init:
2949 fib6_rules_cleanup(); 2964 fib6_rules_cleanup();
2950 xfrm6_init: 2965 xfrm6_init:
2951 xfrm6_fini(); 2966 xfrm6_fini();
2952 out_fib6_init: 2967 out_fib6_init:
2953 fib6_gc_cleanup(); 2968 fib6_gc_cleanup();
2954 out_register_subsys: 2969 out_register_subsys:
2955 unregister_pernet_subsys(&ip6_route_net_ops); 2970 unregister_pernet_subsys(&ip6_route_net_ops);
2956 out_dst_entries: 2971 out_dst_entries:
2957 dst_entries_destroy(&ip6_dst_blackhole_ops); 2972 dst_entries_destroy(&ip6_dst_blackhole_ops);
2958 out_kmem_cache: 2973 out_kmem_cache:
2959 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 2974 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2960 goto out; 2975 goto out;
2961 } 2976 }
2962 2977
2963 void ip6_route_cleanup(void) 2978 void ip6_route_cleanup(void)
2964 { 2979 {
2965 unregister_netdevice_notifier(&ip6_route_dev_notifier); 2980 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2966 fib6_rules_cleanup(); 2981 fib6_rules_cleanup();
2967 xfrm6_fini(); 2982 xfrm6_fini();
2968 fib6_gc_cleanup(); 2983 fib6_gc_cleanup();
2969 unregister_pernet_subsys(&ip6_route_net_ops); 2984 unregister_pernet_subsys(&ip6_route_net_ops);
2970 dst_entries_destroy(&ip6_dst_blackhole_ops); 2985 dst_entries_destroy(&ip6_dst_blackhole_ops);
2971 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 2986 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2972 } 2987 }
2973 2988