net dst: use a percpu_counter to track entries
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  The tracing helpers below expand to
 * printk() calls only when RT6_DEBUG is 3 or higher; at lower levels
 * they compile to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Compile-time switch tested by ip6_pol_route(): when non-zero, routes that
 * already have a nexthop are cloned through rt6_alloc_clone(); when zero,
 * the matched route is returned as is.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       cpu_to_be16(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124 };
125
126 static struct rt6_info ip6_null_entry_template = {
127         .dst = {
128                 .__refcnt       = ATOMIC_INIT(1),
129                 .__use          = 1,
130                 .obsolete       = -1,
131                 .error          = -ENETUNREACH,
132                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
133                 .input          = ip6_pkt_discard,
134                 .output         = ip6_pkt_discard_out,
135         },
136         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
137         .rt6i_protocol  = RTPROT_KERNEL,
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" entry: a reject route carrying -EACCES
 * whose handlers are ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" entry: a reject route carrying -EINVAL
 * that hands packets to dst_discard() in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif /* CONFIG_IPV6_MULTIPLE_TABLES */
180
/*
 * Allocate a dst entry through dst_alloc() using @ops and hand it back
 * typed as an rt6_info.  Returns whatever dst_alloc() returned.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	void *entry = dst_alloc(ops);

	return (struct rt6_info *)entry;
}
186
187 static void ip6_dst_destroy(struct dst_entry *dst)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190         struct inet6_dev *idev = rt->rt6i_idev;
191
192         if (idev != NULL) {
193                 rt->rt6i_idev = NULL;
194                 in6_dev_put(idev);
195         }
196 }
197
198 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
199                            int how)
200 {
201         struct rt6_info *rt = (struct rt6_info *)dst;
202         struct inet6_dev *idev = rt->rt6i_idev;
203         struct net_device *loopback_dev =
204                 dev_net(dev)->loopback_dev;
205
206         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
207                 struct inet6_dev *loopback_idev =
208                         in6_dev_get(loopback_dev);
209                 if (loopback_idev != NULL) {
210                         rt->rt6i_idev = loopback_idev;
211                         in6_dev_put(idev);
212                 }
213         }
214 }
215
216 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
217 {
218         return (rt->rt6i_flags & RTF_EXPIRES) &&
219                 time_after(jiffies, rt->rt6i_expires);
220 }
221
222 static inline int rt6_need_strict(struct in6_addr *daddr)
223 {
224         return ipv6_addr_type(daddr) &
225                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
226 }
227
228 /*
229  *      Route lookup. Any table->tb6_lock is implied.
230  */
231
232 static inline struct rt6_info *rt6_device_match(struct net *net,
233                                                     struct rt6_info *rt,
234                                                     struct in6_addr *saddr,
235                                                     int oif,
236                                                     int flags)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (!oif && ipv6_addr_any(saddr))
242                 goto out;
243
244         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
245                 struct net_device *dev = sprt->rt6i_dev;
246
247                 if (oif) {
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
254                                                 continue;
255                                         if (local && (!oif ||
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 } else {
262                         if (ipv6_chk_addr(net, saddr, dev,
263                                           flags & RT6_LOOKUP_F_IFACE))
264                                 return sprt;
265                 }
266         }
267
268         if (oif) {
269                 if (local)
270                         return local;
271
272                 if (flags & RT6_LOOKUP_F_IFACE)
273                         return net->ipv6.ip6_null_entry;
274         }
275 out:
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for @rt's nexthop.
 *
 * Sends a Neighbour Solicitation to the solicited-node multicast address
 * of the nexthop when its neighbour entry is not in a NUD_VALID state and
 * more than rtr_probe_interval jiffies have passed since neigh->updated.
 * Does nothing when @rt is NULL or has no nexthop neighbour.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is modified below, so the write side of the lock
	 * is required: with only the read side held, two CPUs could both
	 * pass the rate-limit test and both send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* The solicitation itself is sent without holding the lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
313
314 /*
315  * Default Router Selection (RFC 2461 6.3.6)
316  */
317 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 {
319         struct net_device *dev = rt->rt6i_dev;
320         if (!oif || dev->ifindex == oif)
321                 return 2;
322         if ((dev->flags & IFF_LOOPBACK) &&
323             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
324                 return 1;
325         return 0;
326 }
327
328 static inline int rt6_check_neigh(struct rt6_info *rt)
329 {
330         struct neighbour *neigh = rt->rt6i_nexthop;
331         int m;
332         if (rt->rt6i_flags & RTF_NONEXTHOP ||
333             !(rt->rt6i_flags & RTF_GATEWAY))
334                 m = 1;
335         else if (neigh) {
336                 read_lock_bh(&neigh->lock);
337                 if (neigh->nud_state & NUD_VALID)
338                         m = 2;
339 #ifdef CONFIG_IPV6_ROUTER_PREF
340                 else if (neigh->nud_state & NUD_FAILED)
341                         m = 0;
342 #endif
343                 else
344                         m = 1;
345                 read_unlock_bh(&neigh->lock);
346         } else
347                 m = 0;
348         return m;
349 }
350
351 static int rt6_score_route(struct rt6_info *rt, int oif,
352                            int strict)
353 {
354         int m, n;
355
356         m = rt6_check_dev(rt, oif);
357         if (!m && (strict & RT6_LOOKUP_F_IFACE))
358                 return -1;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
361 #endif
362         n = rt6_check_neigh(rt);
363         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
364                 return -1;
365         return m;
366 }
367
368 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
369                                    int *mpri, struct rt6_info *match)
370 {
371         int m;
372
373         if (rt6_check_expired(rt))
374                 goto out;
375
376         m = rt6_score_route(rt, oif, strict);
377         if (m < 0)
378                 goto out;
379
380         if (m > *mpri) {
381                 if (strict & RT6_LOOKUP_F_REACHABLE)
382                         rt6_probe(match);
383                 *mpri = m;
384                 match = rt;
385         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
386                 rt6_probe(rt);
387         }
388
389 out:
390         return match;
391 }
392
393 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
394                                      struct rt6_info *rr_head,
395                                      u32 metric, int oif, int strict)
396 {
397         struct rt6_info *rt, *match;
398         int mpri = -1;
399
400         match = NULL;
401         for (rt = rr_head; rt && rt->rt6i_metric == metric;
402              rt = rt->dst.rt6_next)
403                 match = find_match(rt, oif, strict, &mpri, match);
404         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
405              rt = rt->dst.rt6_next)
406                 match = find_match(rt, oif, strict, &mpri, match);
407
408         return match;
409 }
410
411 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 {
413         struct rt6_info *match, *rt0;
414         struct net *net;
415
416         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417                   __func__, fn->leaf, oif);
418
419         rt0 = fn->rr_ptr;
420         if (!rt0)
421                 fn->rr_ptr = rt0 = fn->leaf;
422
423         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424
425         if (!match &&
426             (strict & RT6_LOOKUP_F_REACHABLE)) {
427                 struct rt6_info *next = rt0->dst.rt6_next;
428
429                 /* no entries matched; do round-robin */
430                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
431                         next = fn->leaf;
432
433                 if (next != rt0)
434                         fn->rr_ptr = next;
435         }
436
437         RT6_TRACE("%s() => %p\n",
438                   __func__, match);
439
440         net = dev_net(rt0->rt6i_dev);
441         return match ? match : net->ipv6.ip6_null_entry;
442 }
443
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from router @gwaddr.
 * @opt: start of the option, interpreted as a struct route_info
 * @len: number of bytes available at @opt
 *
 * Validates the option, then looks up the matching route-info route:
 * a zero lifetime deletes an existing route, a non-zero lifetime adds a
 * missing one, and an existing one gets its preference flags refreshed.
 * The route's expiry is then set from the lifetime (or cleared when the
 * lifetime is not finite).
 *
 * Returns 0 on success or -EINVAL when the option is malformed or carries
 * the invalid preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	} else {
		/* A length of 3 means the option carries a full address. */
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime: withdraw the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
517
/*
 * BACKTRACK - climb the fib6 tree after a lookup produced the null entry.
 *
 * Relies on the expanding function to provide the variables "rt" and "fn"
 * and the labels "out" and "restart".  When rt is __net's ip6_null_entry,
 * the macro moves fn towards the root, descending into a parent's subtree
 * (looked up by saddr) when the parent has one that is not fn itself.
 * It jumps to "restart" at the first node flagged RTN_RTINFO and to "out"
 * when fn is flagged RTN_TL_ROOT.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *__pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			__pn = fn->parent;				\
			if (FIB6_SUBTREE(__pn) &&			\
			    FIB6_SUBTREE(__pn) != fn)			\
				fn = fib6_lookup(FIB6_SUBTREE(__pn),	\
						 NULL, saddr);		\
			else						\
				fn = __pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
535
536 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
537                                              struct fib6_table *table,
538                                              struct flowi *fl, int flags)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt;
542
543         read_lock_bh(&table->tb6_lock);
544         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
545 restart:
546         rt = fn->leaf;
547         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
548         BACKTRACK(net, &fl->fl6_src);
549 out:
550         dst_use(&rt->dst, jiffies);
551         read_unlock_bh(&table->tb6_lock);
552         return rt;
553
554 }
555
556 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
557                             const struct in6_addr *saddr, int oif, int strict)
558 {
559         struct flowi fl = {
560                 .oif = oif,
561                 .nl_u = {
562                         .ip6_u = {
563                                 .daddr = *daddr,
564                         },
565                 },
566         };
567         struct dst_entry *dst;
568         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
569
570         if (saddr) {
571                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
572                 flags |= RT6_LOOKUP_F_HAS_SADDR;
573         }
574
575         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
576         if (dst->error == 0)
577                 return (struct rt6_info *) dst;
578
579         dst_release(dst);
580
581         return NULL;
582 }
583
584 EXPORT_SYMBOL(rt6_lookup);
585
586 /* ip6_ins_rt is called with FREE table->tb6_lock.
587    It takes new route entry, the addition fails by any reason the
588    route is freed. In any case, if caller does not hold it, it may
589    be destroyed.
590  */
591
592 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
593 {
594         int err;
595         struct fib6_table *table;
596
597         table = rt->rt6i_table;
598         write_lock_bh(&table->tb6_lock);
599         err = fib6_add(&table->tb6_root, rt, info);
600         write_unlock_bh(&table->tb6_lock);
601
602         return err;
603 }
604
605 int ip6_ins_rt(struct rt6_info *rt)
606 {
607         struct nl_info info = {
608                 .nl_net = dev_net(rt->rt6i_dev),
609         };
610         return __ip6_ins_rt(rt, &info);
611 }
612
613 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
614                                       struct in6_addr *saddr)
615 {
616         struct rt6_info *rt;
617
618         /*
619          *      Clone the route.
620          */
621
622         rt = ip6_rt_copy(ort);
623
624         if (rt) {
625                 struct neighbour *neigh;
626                 int attempts = !in_softirq();
627
628                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
629                         if (rt->rt6i_dst.plen != 128 &&
630                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
631                                 rt->rt6i_flags |= RTF_ANYCAST;
632                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
633                 }
634
635                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
636                 rt->rt6i_dst.plen = 128;
637                 rt->rt6i_flags |= RTF_CACHE;
638                 rt->dst.flags |= DST_HOST;
639
640 #ifdef CONFIG_IPV6_SUBTREES
641                 if (rt->rt6i_src.plen && saddr) {
642                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
643                         rt->rt6i_src.plen = 128;
644                 }
645 #endif
646
647         retry:
648                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
649                 if (IS_ERR(neigh)) {
650                         struct net *net = dev_net(rt->rt6i_dev);
651                         int saved_rt_min_interval =
652                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
653                         int saved_rt_elasticity =
654                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
655
656                         if (attempts-- > 0) {
657                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
658                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
659
660                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
661
662                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
663                                         saved_rt_elasticity;
664                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
665                                         saved_rt_min_interval;
666                                 goto retry;
667                         }
668
669                         if (net_ratelimit())
670                                 printk(KERN_WARNING
671                                        "ipv6: Neighbour table overflow.\n");
672                         dst_free(&rt->dst);
673                         return NULL;
674                 }
675                 rt->rt6i_nexthop = neigh;
676
677         }
678
679         return rt;
680 }
681
682 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
683 {
684         struct rt6_info *rt = ip6_rt_copy(ort);
685         if (rt) {
686                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
687                 rt->rt6i_dst.plen = 128;
688                 rt->rt6i_flags |= RTF_CACHE;
689                 rt->dst.flags |= DST_HOST;
690                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
691         }
692         return rt;
693 }
694
695 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
696                                       struct flowi *fl, int flags)
697 {
698         struct fib6_node *fn;
699         struct rt6_info *rt, *nrt;
700         int strict = 0;
701         int attempts = 3;
702         int err;
703         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
704
705         strict |= flags & RT6_LOOKUP_F_IFACE;
706
707 relookup:
708         read_lock_bh(&table->tb6_lock);
709
710 restart_2:
711         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
712
713 restart:
714         rt = rt6_select(fn, oif, strict | reachable);
715
716         BACKTRACK(net, &fl->fl6_src);
717         if (rt == net->ipv6.ip6_null_entry ||
718             rt->rt6i_flags & RTF_CACHE)
719                 goto out;
720
721         dst_hold(&rt->dst);
722         read_unlock_bh(&table->tb6_lock);
723
724         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
725                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
726         else {
727 #if CLONE_OFFLINK_ROUTE
728                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
729 #else
730                 goto out2;
731 #endif
732         }
733
734         dst_release(&rt->dst);
735         rt = nrt ? : net->ipv6.ip6_null_entry;
736
737         dst_hold(&rt->dst);
738         if (nrt) {
739                 err = ip6_ins_rt(nrt);
740                 if (!err)
741                         goto out2;
742         }
743
744         if (--attempts <= 0)
745                 goto out2;
746
747         /*
748          * Race condition! In the gap, when table->tb6_lock was
749          * released someone could insert this route.  Relookup.
750          */
751         dst_release(&rt->dst);
752         goto relookup;
753
754 out:
755         if (reachable) {
756                 reachable = 0;
757                 goto restart_2;
758         }
759         dst_hold(&rt->dst);
760         read_unlock_bh(&table->tb6_lock);
761 out2:
762         rt->dst.lastuse = jiffies;
763         rt->dst.__use++;
764
765         return rt;
766 }
767
768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
769                                             struct flowi *fl, int flags)
770 {
771         return ip6_pol_route(net, table, fl->iif, fl, flags);
772 }
773
774 void ip6_route_input(struct sk_buff *skb)
775 {
776         struct ipv6hdr *iph = ipv6_hdr(skb);
777         struct net *net = dev_net(skb->dev);
778         int flags = RT6_LOOKUP_F_HAS_SADDR;
779         struct flowi fl = {
780                 .iif = skb->dev->ifindex,
781                 .nl_u = {
782                         .ip6_u = {
783                                 .daddr = iph->daddr,
784                                 .saddr = iph->saddr,
785                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
786                         },
787                 },
788                 .mark = skb->mark,
789                 .proto = iph->nexthdr,
790         };
791
792         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
793                 flags |= RT6_LOOKUP_F_IFACE;
794
795         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
796 }
797
798 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
799                                              struct flowi *fl, int flags)
800 {
801         return ip6_pol_route(net, table, fl->oif, fl, flags);
802 }
803
804 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
805                                     struct flowi *fl)
806 {
807         int flags = 0;
808
809         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
810                 flags |= RT6_LOOKUP_F_IFACE;
811
812         if (!ipv6_addr_any(&fl->fl6_src))
813                 flags |= RT6_LOOKUP_F_HAS_SADDR;
814         else if (sk)
815                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
816
817         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
818 }
819
820 EXPORT_SYMBOL(ip6_route_output);
821
822 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
823 {
824         struct rt6_info *ort = (struct rt6_info *) *dstp;
825         struct rt6_info *rt = (struct rt6_info *)
826                 dst_alloc(&ip6_dst_blackhole_ops);
827         struct dst_entry *new = NULL;
828
829         if (rt) {
830                 new = &rt->dst;
831
832                 atomic_set(&new->__refcnt, 1);
833                 new->__use = 1;
834                 new->input = dst_discard;
835                 new->output = dst_discard;
836
837                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
838                 new->dev = ort->dst.dev;
839                 if (new->dev)
840                         dev_hold(new->dev);
841                 rt->rt6i_idev = ort->rt6i_idev;
842                 if (rt->rt6i_idev)
843                         in6_dev_hold(rt->rt6i_idev);
844                 rt->rt6i_expires = 0;
845
846                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
847                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
848                 rt->rt6i_metric = 0;
849
850                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
851 #ifdef CONFIG_IPV6_SUBTREES
852                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
853 #endif
854
855                 dst_free(new);
856         }
857
858         dst_release(*dstp);
859         *dstp = new;
860         return new ? 0 : -ENOMEM;
861 }
862 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
863
864 /*
865  *      Destination cache support functions
866  */
867
868 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
869 {
870         struct rt6_info *rt;
871
872         rt = (struct rt6_info *) dst;
873
874         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
875                 return dst;
876
877         return NULL;
878 }
879
880 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
881 {
882         struct rt6_info *rt = (struct rt6_info *) dst;
883
884         if (rt) {
885                 if (rt->rt6i_flags & RTF_CACHE) {
886                         if (rt6_check_expired(rt)) {
887                                 ip6_del_rt(rt);
888                                 dst = NULL;
889                         }
890                 } else {
891                         dst_release(dst);
892                         dst = NULL;
893                 }
894         }
895         return dst;
896 }
897
898 static void ip6_link_failure(struct sk_buff *skb)
899 {
900         struct rt6_info *rt;
901
902         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
903
904         rt = (struct rt6_info *) skb_dst(skb);
905         if (rt) {
906                 if (rt->rt6i_flags&RTF_CACHE) {
907                         dst_set_expires(&rt->dst, 0);
908                         rt->rt6i_flags |= RTF_EXPIRES;
909                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
910                         rt->rt6i_node->fn_sernum = -1;
911         }
912 }
913
914 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
915 {
916         struct rt6_info *rt6 = (struct rt6_info*)dst;
917
918         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
919                 rt6->rt6i_flags |= RTF_MODIFIED;
920                 if (mtu < IPV6_MIN_MTU) {
921                         mtu = IPV6_MIN_MTU;
922                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
923                 }
924                 dst->metrics[RTAX_MTU-1] = mtu;
925                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
926         }
927 }
928
929 static int ipv6_get_mtu(struct net_device *dev);
930
931 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
932 {
933         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
934
935         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
936                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
937
938         /*
939          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
940          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
941          * IPV6_MAXPLEN is also valid and means: "any MSS,
942          * rely only on pmtu discovery"
943          */
944         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
945                 mtu = IPV6_MAXPLEN;
946         return mtu;
947 }
948
949 static struct dst_entry *icmp6_dst_gc_list;
950 static DEFINE_SPINLOCK(icmp6_dst_lock);
951
952 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
953                                   struct neighbour *neigh,
954                                   const struct in6_addr *addr)
955 {
956         struct rt6_info *rt;
957         struct inet6_dev *idev = in6_dev_get(dev);
958         struct net *net = dev_net(dev);
959
960         if (unlikely(idev == NULL))
961                 return NULL;
962
963         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
964         if (unlikely(rt == NULL)) {
965                 in6_dev_put(idev);
966                 goto out;
967         }
968
969         dev_hold(dev);
970         if (neigh)
971                 neigh_hold(neigh);
972         else {
973                 neigh = ndisc_get_neigh(dev, addr);
974                 if (IS_ERR(neigh))
975                         neigh = NULL;
976         }
977
978         rt->rt6i_dev      = dev;
979         rt->rt6i_idev     = idev;
980         rt->rt6i_nexthop  = neigh;
981         atomic_set(&rt->dst.__refcnt, 1);
982         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
983         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
984         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
985         rt->dst.output  = ip6_output;
986
987 #if 0   /* there's no chance to use these for ndisc */
988         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
989                                 ? DST_HOST
990                                 : 0;
991         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
992         rt->rt6i_dst.plen = 128;
993 #endif
994
995         spin_lock_bh(&icmp6_dst_lock);
996         rt->dst.next = icmp6_dst_gc_list;
997         icmp6_dst_gc_list = &rt->dst;
998         spin_unlock_bh(&icmp6_dst_lock);
999
1000         fib6_force_start_gc(net);
1001
1002 out:
1003         return &rt->dst;
1004 }
1005
1006 int icmp6_dst_gc(void)
1007 {
1008         struct dst_entry *dst, *next, **pprev;
1009         int more = 0;
1010
1011         next = NULL;
1012
1013         spin_lock_bh(&icmp6_dst_lock);
1014         pprev = &icmp6_dst_gc_list;
1015
1016         while ((dst = *pprev) != NULL) {
1017                 if (!atomic_read(&dst->__refcnt)) {
1018                         *pprev = dst->next;
1019                         dst_free(dst);
1020                 } else {
1021                         pprev = &dst->next;
1022                         ++more;
1023                 }
1024         }
1025
1026         spin_unlock_bh(&icmp6_dst_lock);
1027
1028         return more;
1029 }
1030
1031 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1032                             void *arg)
1033 {
1034         struct dst_entry *dst, **pprev;
1035
1036         spin_lock_bh(&icmp6_dst_lock);
1037         pprev = &icmp6_dst_gc_list;
1038         while ((dst = *pprev) != NULL) {
1039                 struct rt6_info *rt = (struct rt6_info *) dst;
1040                 if (func(rt, arg)) {
1041                         *pprev = dst->next;
1042                         dst_free(dst);
1043                 } else {
1044                         pprev = &dst->next;
1045                 }
1046         }
1047         spin_unlock_bh(&icmp6_dst_lock);
1048 }
1049
1050 static int ip6_dst_gc(struct dst_ops *ops)
1051 {
1052         unsigned long now = jiffies;
1053         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1054         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1055         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1056         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1057         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1058         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1059         int entries;
1060
1061         entries = dst_entries_get_fast(ops);
1062         if (time_after(rt_last_gc + rt_min_interval, now) &&
1063             entries <= rt_max_size)
1064                 goto out;
1065
1066         net->ipv6.ip6_rt_gc_expire++;
1067         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068         net->ipv6.ip6_rt_last_gc = now;
1069         entries = dst_entries_get_slow(ops);
1070         if (entries < ops->gc_thresh)
1071                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1072 out:
1073         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1074         return entries > rt_max_size;
1075 }
1076
1077 /* Clean host part of a prefix. Not necessary in radix tree,
1078    but results in cleaner routing tables.
1079
1080    Remove it only when all the things will work!
1081  */
1082
1083 static int ipv6_get_mtu(struct net_device *dev)
1084 {
1085         int mtu = IPV6_MIN_MTU;
1086         struct inet6_dev *idev;
1087
1088         rcu_read_lock();
1089         idev = __in6_dev_get(dev);
1090         if (idev)
1091                 mtu = idev->cnf.mtu6;
1092         rcu_read_unlock();
1093         return mtu;
1094 }
1095
1096 int ip6_dst_hoplimit(struct dst_entry *dst)
1097 {
1098         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1099         if (hoplimit < 0) {
1100                 struct net_device *dev = dst->dev;
1101                 struct inet6_dev *idev;
1102
1103                 rcu_read_lock();
1104                 idev = __in6_dev_get(dev);
1105                 if (idev)
1106                         hoplimit = idev->cnf.hop_limit;
1107                 else
1108                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1109                 rcu_read_unlock();
1110         }
1111         return hoplimit;
1112 }
1113
1114 /*
1115  *
1116  */
1117
/*
 * ip6_route_add - build a route described by @cfg and insert it.
 *
 * Outline of the visible steps:
 *   1. Reject prefix lengths above 128 (and any source prefix when
 *      CONFIG_IPV6_SUBTREES is off).
 *   2. If fc_ifindex is set, take a reference on that device and on its
 *      inet6_dev.
 *   3. Default fc_metric to IP6_RT_PRIO_USER, obtain the table through
 *      fib6_new_table() and allocate the rt6_info.
 *   4. Fill in expiry, protocol, input/output handlers, destination (and
 *      source) keys and metric.
 *   5. Reject routes, and non-local routes through a loopback device to a
 *      non-loopback destination, are re-homed onto the namespace loopback
 *      device and installed as RTF_REJECT|RTF_NONEXTHOP.
 *   6. For RTF_GATEWAY routes the gateway is validated; a gateway that is
 *      not link-local unicast must resolve through rt6_lookup() to a
 *      non-gateway route on the same device.
 *   7. Metrics from the fc_mx netlink attributes are applied, missing
 *      hoplimit/MTU/advmss metrics get defaults, and the route is passed
 *      to __ip6_ins_rt().
 *
 * Side effects on @cfg: fc_metric and fc_protocol may be defaulted and
 * fc_nlinfo.nl_net is overwritten with dev_net(dev) before insertion.
 *
 * Returns the result of __ip6_ins_rt() on the insertion path.  Otherwise
 * a negative errno is returned (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM,
 * -EHOSTUNREACH, or the error decoded from __neigh_lookup_errno()); the
 * two early prefix-length checks return directly, and every later failure
 * goes through "out", which drops any device / inet6_dev references held
 * and frees the route with dst_free().
 */
1118 int ip6_route_add(struct fib6_config *cfg)
1119 {
1120         int err;
1121         struct net *net = cfg->fc_nlinfo.nl_net;
1122         struct rt6_info *rt = NULL;
1123         struct net_device *dev = NULL;
1124         struct inet6_dev *idev = NULL;
1125         struct fib6_table *table;
1126         int addr_type;
1127
1128         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1129                 return -EINVAL;
1130 #ifndef CONFIG_IPV6_SUBTREES
1131         if (cfg->fc_src_len)
1132                 return -EINVAL;
1133 #endif
1134         if (cfg->fc_ifindex) {
1135                 err = -ENODEV;
1136                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1137                 if (!dev)
1138                         goto out;
1139                 idev = in6_dev_get(dev);
1140                 if (!idev)
1141                         goto out;
1142         }
1143
1144         if (cfg->fc_metric == 0)
1145                 cfg->fc_metric = IP6_RT_PRIO_USER;
1146
1147         table = fib6_new_table(net, cfg->fc_table);
1148         if (table == NULL) {
1149                 err = -ENOBUFS;
1150                 goto out;
1151         }
1152
1153         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1154
1155         if (rt == NULL) {
1156                 err = -ENOMEM;
1157                 goto out;
1158         }
1159
1160         rt->dst.obsolete = -1;
1161         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1162                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1163                                 0;
1164
1165         if (cfg->fc_protocol == RTPROT_UNSPEC)
1166                 cfg->fc_protocol = RTPROT_BOOT;
1167         rt->rt6i_protocol = cfg->fc_protocol;
1168
1169         addr_type = ipv6_addr_type(&cfg->fc_dst);
1170
/* Input handler: multicast destination, local delivery, or forwarding. */
1171         if (addr_type & IPV6_ADDR_MULTICAST)
1172                 rt->dst.input = ip6_mc_input;
1173         else if (cfg->fc_flags & RTF_LOCAL)
1174                 rt->dst.input = ip6_input;
1175         else
1176                 rt->dst.input = ip6_forward;
1177
1178         rt->dst.output = ip6_output;
1179
1180         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1181         rt->rt6i_dst.plen = cfg->fc_dst_len;
1182         if (rt->rt6i_dst.plen == 128)
1183                rt->dst.flags = DST_HOST;
1184
1185 #ifdef CONFIG_IPV6_SUBTREES
1186         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1187         rt->rt6i_src.plen = cfg->fc_src_len;
1188 #endif
1189
1190         rt->rt6i_metric = cfg->fc_metric;
1191
1192         /* We cannot add true routes via loopback here,
1193            they would result in kernel looping; promote them to reject routes
1194          */
1195         if ((cfg->fc_flags & RTF_REJECT) ||
1196             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1197                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1198                 /* hold loopback dev/idev if we haven't done so. */
1199                 if (dev != net->loopback_dev) {
1200                         if (dev) {
1201                                 dev_put(dev);
1202                                 in6_dev_put(idev);
1203                         }
1204                         dev = net->loopback_dev;
1205                         dev_hold(dev);
1206                         idev = in6_dev_get(dev);
1207                         if (!idev) {
1208                                 err = -ENODEV;
1209                                 goto out;
1210                         }
1211                 }
1212                 rt->dst.output = ip6_pkt_discard_out;
1213                 rt->dst.input = ip6_pkt_discard;
1214                 rt->dst.error = -ENETUNREACH;
1215                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1216                 goto install_route;
1217         }
1218
1219         if (cfg->fc_flags & RTF_GATEWAY) {
1220                 struct in6_addr *gw_addr;
1221                 int gwa_type;
1222
1223                 gw_addr = &cfg->fc_gateway;
1224                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1225                 gwa_type = ipv6_addr_type(gw_addr);
1226
1227                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1228                         struct rt6_info *grt;
1229
1230                         /* IPv6 strictly inhibits using not link-local
1231                            addresses as nexthop address.
1232                            Otherwise, router will not able to send redirects.
1233                            It is very good, but in some (rare!) circumstances
1234                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1235                            some exceptions. --ANK
1236                          */
1237                         err = -EINVAL;
1238                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1239                                 goto out;
1240
1241                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1242
1243                         err = -EHOSTUNREACH;
1244                         if (grt == NULL)
1245                                 goto out;
1246                         if (dev) {
1247                                 if (dev != grt->rt6i_dev) {
1248                                         dst_release(&grt->dst);
1249                                         goto out;
1250                                 }
1251                         } else {
/* No device given by the caller: adopt the one the gateway route uses. */
1252                                 dev = grt->rt6i_dev;
1253                                 idev = grt->rt6i_idev;
1254                                 dev_hold(dev);
1255                                 in6_dev_hold(grt->rt6i_idev);
1256                         }
/* err stays -EHOSTUNREACH when the gateway is itself reached via a gateway. */
1257                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1258                                 err = 0;
1259                         dst_release(&grt->dst);
1260
1261                         if (err)
1262                                 goto out;
1263                 }
1264                 err = -EINVAL;
1265                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1266                         goto out;
1267         }
1268
1269         err = -ENODEV;
1270         if (dev == NULL)
1271                 goto out;
1272
1273         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1274                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1275                 if (IS_ERR(rt->rt6i_nexthop)) {
1276                         err = PTR_ERR(rt->rt6i_nexthop);
1277                         rt->rt6i_nexthop = NULL;
1278                         goto out;
1279                 }
1280         }
1281
1282         rt->rt6i_flags = cfg->fc_flags;
1283
1284 install_route:
/* Copy caller-supplied metrics; attribute type N lands in metrics[N - 1]. */
1285         if (cfg->fc_mx) {
1286                 struct nlattr *nla;
1287                 int remaining;
1288
1289                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1290                         int type = nla_type(nla);
1291
1292                         if (type) {
1293                                 if (type > RTAX_MAX) {
1294                                         err = -EINVAL;
1295                                         goto out;
1296                                 }
1297
1298                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1299                         }
1300                 }
1301         }
1302
/* Fill in defaults for metrics the caller left at zero. */
1303         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1304                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1305         if (!dst_mtu(&rt->dst))
1306                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1307         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1308                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
/* The dev/idev references taken above are stored in the route here. */
1309         rt->dst.dev = dev;
1310         rt->rt6i_idev = idev;
1311         rt->rt6i_table = table;
1312
1313         cfg->fc_nlinfo.nl_net = dev_net(dev);
1314
1315         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1316
1317 out:
1318         if (dev)
1319                 dev_put(dev);
1320         if (idev)
1321                 in6_dev_put(idev);
1322         if (rt)
1323                 dst_free(&rt->dst);
1324         return err;
1325 }
1326
1327 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1328 {
1329         int err;
1330         struct fib6_table *table;
1331         struct net *net = dev_net(rt->rt6i_dev);
1332
1333         if (rt == net->ipv6.ip6_null_entry)
1334                 return -ENOENT;
1335
1336         table = rt->rt6i_table;
1337         write_lock_bh(&table->tb6_lock);
1338
1339         err = fib6_del(rt, info);
1340         dst_release(&rt->dst);
1341
1342         write_unlock_bh(&table->tb6_lock);
1343
1344         return err;
1345 }
1346
1347 int ip6_del_rt(struct rt6_info *rt)
1348 {
1349         struct nl_info info = {
1350                 .nl_net = dev_net(rt->rt6i_dev),
1351         };
1352         return __ip6_del_rt(rt, &info);
1353 }
1354
1355 static int ip6_route_del(struct fib6_config *cfg)
1356 {
1357         struct fib6_table *table;
1358         struct fib6_node *fn;
1359         struct rt6_info *rt;
1360         int err = -ESRCH;
1361
1362         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1363         if (table == NULL)
1364                 return err;
1365
1366         read_lock_bh(&table->tb6_lock);
1367
1368         fn = fib6_locate(&table->tb6_root,
1369                          &cfg->fc_dst, cfg->fc_dst_len,
1370                          &cfg->fc_src, cfg->fc_src_len);
1371
1372         if (fn) {
1373                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1374                         if (cfg->fc_ifindex &&
1375                             (rt->rt6i_dev == NULL ||
1376                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1377                                 continue;
1378                         if (cfg->fc_flags & RTF_GATEWAY &&
1379                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1380                                 continue;
1381                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1382                                 continue;
1383                         dst_hold(&rt->dst);
1384                         read_unlock_bh(&table->tb6_lock);
1385
1386                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1387                 }
1388         }
1389         read_unlock_bh(&table->tb6_lock);
1390
1391         return err;
1392 }
1393
1394 /*
1395  *      Handle redirects
1396  */
/*
 * Flow key for redirect lookups: a plain flowi followed by the address of
 * the router that sent the redirect.  ip6_route_redirect() passes a
 * pointer to this struct cast to struct flowi *, and
 * __ip6_route_redirect() casts it back, so "fl" has to stay the first
 * member.
 */
1397 struct ip6rd_flowi {
1398         struct flowi fl;
1399         struct in6_addr gateway;
1400 };
1401
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_redirect().
 *
 * @fl actually points at a struct ip6rd_flowi, which additionally carries
 * the gateway that sent the redirect.  Under the table read lock, the
 * routes of the node found by fib6_lookup() are scanned for one that is
 * not expired, has RTF_GATEWAY set, leaves through fl->oif and whose
 * gateway equals the redirect's sender.  If none matches, rt is set to
 * the namespace's ip6_null_entry before BACKTRACK() runs.
 *
 * NOTE(review): BACKTRACK() is a macro defined outside this chunk;
 * presumably it is what uses the "restart" and "out" labels (and the
 * local names fn/rt) to retry on a parent node -- confirm before
 * renaming anything here.  The "flags" parameter is not referenced in
 * the visible body.
 *
 * Returns the selected route (possibly ip6_null_entry) with a reference
 * taken via dst_hold().
 */
1402 static struct rt6_info *__ip6_route_redirect(struct net *net,
1403                                              struct fib6_table *table,
1404                                              struct flowi *fl,
1405                                              int flags)
1406 {
1407         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1408         struct rt6_info *rt;
1409         struct fib6_node *fn;
1410
1411         /*
1412          * Get the "current" route for this destination and
1413          * check if the redirect has come from appropriate router.
1414          *
1415          * RFC 2461 specifies that redirects should only be
1416          * accepted if they come from the nexthop to the target.
1417          * Due to the way the routes are chosen, this notion
1418          * is a bit fuzzy and one might need to check all possible
1419          * routes.
1420          */
1421
1422         read_lock_bh(&table->tb6_lock);
1423         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1424 restart:
1425         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1426                 /*
1427                  * Current route is on-link; redirect is always invalid.
1428                  *
1429                  * Seems, previous statement is not true. It could
1430                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1431                  * But then router serving it might decide, that we should
1432                  * know truth 8)8) --ANK (980726).
1433                  */
1434                 if (rt6_check_expired(rt))
1435                         continue;
1436                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1437                         continue;
1438                 if (fl->oif != rt->rt6i_dev->ifindex)
1439                         continue;
1440                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1441                         continue;
1442                 break;
1443         }
1444
1445         if (!rt)
1446                 rt = net->ipv6.ip6_null_entry;
1447         BACKTRACK(net, &fl->fl6_src);
1448 out:
1449         dst_hold(&rt->dst);
1450
1451         read_unlock_bh(&table->tb6_lock);
1452
1453         return rt;
1454 };
1455
1456 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1457                                            struct in6_addr *src,
1458                                            struct in6_addr *gateway,
1459                                            struct net_device *dev)
1460 {
1461         int flags = RT6_LOOKUP_F_HAS_SADDR;
1462         struct net *net = dev_net(dev);
1463         struct ip6rd_flowi rdfl = {
1464                 .fl = {
1465                         .oif = dev->ifindex,
1466                         .nl_u = {
1467                                 .ip6_u = {
1468                                         .daddr = *dest,
1469                                         .saddr = *src,
1470                                 },
1471                         },
1472                 },
1473         };
1474
1475         ipv6_addr_copy(&rdfl.gateway, gateway);
1476
1477         if (rt6_need_strict(dest))
1478                 flags |= RT6_LOOKUP_F_IFACE;
1479
1480         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481                                                    flags, __ip6_route_redirect);
1482 }
1483
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485                   struct in6_addr *saddr,
1486                   struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488         struct rt6_info *rt, *nrt = NULL;
1489         struct netevent_redirect netevent;
1490         struct net *net = dev_net(neigh->dev);
1491
1492         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493
1494         if (rt == net->ipv6.ip6_null_entry) {
1495                 if (net_ratelimit())
1496                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497                                "for redirect target\n");
1498                 goto out;
1499         }
1500
1501         /*
1502          *      We have finally decided to accept it.
1503          */
1504
1505         neigh_update(neigh, lladdr, NUD_STALE,
1506                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507                      NEIGH_UPDATE_F_OVERRIDE|
1508                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509                                      NEIGH_UPDATE_F_ISROUTER))
1510                      );
1511
1512         /*
1513          * Redirect received -> path was valid.
1514          * Look, redirects are sent only in response to data packets,
1515          * so that this nexthop apparently is reachable. --ANK
1516          */
1517         dst_confirm(&rt->dst);
1518
1519         /* Duplicate redirect: silently ignore. */
1520         if (neigh == rt->dst.neighbour)
1521                 goto out;
1522
1523         nrt = ip6_rt_copy(rt);
1524         if (nrt == NULL)
1525                 goto out;
1526
1527         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528         if (on_link)
1529                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1530
1531         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532         nrt->rt6i_dst.plen = 128;
1533         nrt->dst.flags |= DST_HOST;
1534
1535         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536         nrt->rt6i_nexthop = neigh_clone(neigh);
1537         /* Reset pmtu, it may be better */
1538         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1539         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1540                                                         dst_mtu(&nrt->dst));
1541
1542         if (ip6_ins_rt(nrt))
1543                 goto out;
1544
1545         netevent.old = &rt->dst;
1546         netevent.new = &nrt->dst;
1547         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548
1549         if (rt->rt6i_flags&RTF_CACHE) {
1550                 ip6_del_rt(rt);
1551                 return;
1552         }
1553
1554 out:
1555         dst_release(&rt->dst);
1556 }
1557
1558 /*
1559  *      Handle ICMP "packet too big" messages
1560  *      i.e. Path MTU discovery
1561  */
1562
1563 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1564                              struct net *net, u32 pmtu, int ifindex)
1565 {
1566         struct rt6_info *rt, *nrt;
1567         int allfrag = 0;
1568
1569         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1570         if (rt == NULL)
1571                 return;
1572
1573         if (pmtu >= dst_mtu(&rt->dst))
1574                 goto out;
1575
1576         if (pmtu < IPV6_MIN_MTU) {
1577                 /*
1578                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1579                  * MTU (1280) and a fragment header should always be included
1580                  * after a node receiving Too Big message reporting PMTU is
1581                  * less than the IPv6 Minimum Link MTU.
1582                  */
1583                 pmtu = IPV6_MIN_MTU;
1584                 allfrag = 1;
1585         }
1586
1587         /* New mtu received -> path was valid.
1588            They are sent only in response to data packets,
1589            so that this nexthop apparently is reachable. --ANK
1590          */
1591         dst_confirm(&rt->dst);
1592
1593         /* Host route. If it is static, it would be better
1594            not to override it, but add new one, so that
1595            when cache entry will expire old pmtu
1596            would return automatically.
1597          */
1598         if (rt->rt6i_flags & RTF_CACHE) {
1599                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1600                 if (allfrag)
1601                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1602                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1603                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1604                 goto out;
1605         }
1606
1607         /* Network route.
1608            Two cases are possible:
1609            1. It is connected route. Action: COW
1610            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1611          */
1612         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1613                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1614         else
1615                 nrt = rt6_alloc_clone(rt, daddr);
1616
1617         if (nrt) {
1618                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1619                 if (allfrag)
1620                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1621
1622                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1623                  * happened within 5 mins, the recommended timer is 10 mins.
1624                  * Here this route expiration time is set to ip6_rt_mtu_expires
1625                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1626                  * and detecting PMTU increase will be automatically happened.
1627                  */
1628                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1629                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1630
1631                 ip6_ins_rt(nrt);
1632         }
1633 out:
1634         dst_release(&rt->dst);
1635 }
1636
1637 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1638                         struct net_device *dev, u32 pmtu)
1639 {
1640         struct net *net = dev_net(dev);
1641
1642         /*
1643          * RFC 1981 states that a node "MUST reduce the size of the packets it
1644          * is sending along the path" that caused the Packet Too Big message.
1645          * Since it's not possible in the general case to determine which
1646          * interface was used to send the original packet, we update the MTU
1647          * on the interface that will be used to send future packets. We also
1648          * update the MTU on the interface that received the Packet Too Big in
1649          * case the original packet was forced out that interface with
1650          * SO_BINDTODEVICE or similar. This is the next best thing to the
1651          * correct behaviour, which would be to update the MTU on all
1652          * interfaces.
1653          */
1654         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1655         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1656 }
1657
1658 /*
1659  *      Misc support functions
1660  */
1661
1662 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1663 {
1664         struct net *net = dev_net(ort->rt6i_dev);
1665         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1666
1667         if (rt) {
1668                 rt->dst.input = ort->dst.input;
1669                 rt->dst.output = ort->dst.output;
1670
1671                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1672                 rt->dst.error = ort->dst.error;
1673                 rt->dst.dev = ort->dst.dev;
1674                 if (rt->dst.dev)
1675                         dev_hold(rt->dst.dev);
1676                 rt->rt6i_idev = ort->rt6i_idev;
1677                 if (rt->rt6i_idev)
1678                         in6_dev_hold(rt->rt6i_idev);
1679                 rt->dst.lastuse = jiffies;
1680                 rt->rt6i_expires = 0;
1681
1682                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1683                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1684                 rt->rt6i_metric = 0;
1685
1686                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1687 #ifdef CONFIG_IPV6_SUBTREES
1688                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1689 #endif
1690                 rt->rt6i_table = ort->rt6i_table;
1691         }
1692         return rt;
1693 }
1694
1695 #ifdef CONFIG_IPV6_ROUTE_INFO
1696 static struct rt6_info *rt6_get_route_info(struct net *net,
1697                                            struct in6_addr *prefix, int prefixlen,
1698                                            struct in6_addr *gwaddr, int ifindex)
1699 {
1700         struct fib6_node *fn;
1701         struct rt6_info *rt = NULL;
1702         struct fib6_table *table;
1703
1704         table = fib6_get_table(net, RT6_TABLE_INFO);
1705         if (table == NULL)
1706                 return NULL;
1707
1708         write_lock_bh(&table->tb6_lock);
1709         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1710         if (!fn)
1711                 goto out;
1712
1713         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1714                 if (rt->rt6i_dev->ifindex != ifindex)
1715                         continue;
1716                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1717                         continue;
1718                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1719                         continue;
1720                 dst_hold(&rt->dst);
1721                 break;
1722         }
1723 out:
1724         write_unlock_bh(&table->tb6_lock);
1725         return rt;
1726 }
1727
1728 static struct rt6_info *rt6_add_route_info(struct net *net,
1729                                            struct in6_addr *prefix, int prefixlen,
1730                                            struct in6_addr *gwaddr, int ifindex,
1731                                            unsigned pref)
1732 {
1733         struct fib6_config cfg = {
1734                 .fc_table       = RT6_TABLE_INFO,
1735                 .fc_metric      = IP6_RT_PRIO_USER,
1736                 .fc_ifindex     = ifindex,
1737                 .fc_dst_len     = prefixlen,
1738                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1739                                   RTF_UP | RTF_PREF(pref),
1740                 .fc_nlinfo.pid = 0,
1741                 .fc_nlinfo.nlh = NULL,
1742                 .fc_nlinfo.nl_net = net,
1743         };
1744
1745         ipv6_addr_copy(&cfg.fc_dst, prefix);
1746         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1747
1748         /* We should treat it as a default route if prefix length is 0. */
1749         if (!prefixlen)
1750                 cfg.fc_flags |= RTF_DEFAULT;
1751
1752         ip6_route_add(&cfg);
1753
1754         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1755 }
1756 #endif
1757
1758 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1759 {
1760         struct rt6_info *rt;
1761         struct fib6_table *table;
1762
1763         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1764         if (table == NULL)
1765                 return NULL;
1766
1767         write_lock_bh(&table->tb6_lock);
1768         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1769                 if (dev == rt->rt6i_dev &&
1770                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1771                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1772                         break;
1773         }
1774         if (rt)
1775                 dst_hold(&rt->dst);
1776         write_unlock_bh(&table->tb6_lock);
1777         return rt;
1778 }
1779
1780 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1781                                      struct net_device *dev,
1782                                      unsigned int pref)
1783 {
1784         struct fib6_config cfg = {
1785                 .fc_table       = RT6_TABLE_DFLT,
1786                 .fc_metric      = IP6_RT_PRIO_USER,
1787                 .fc_ifindex     = dev->ifindex,
1788                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1789                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1790                 .fc_nlinfo.pid = 0,
1791                 .fc_nlinfo.nlh = NULL,
1792                 .fc_nlinfo.nl_net = dev_net(dev),
1793         };
1794
1795         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1796
1797         ip6_route_add(&cfg);
1798
1799         return rt6_get_dflt_router(gwaddr, dev);
1800 }
1801
1802 void rt6_purge_dflt_routers(struct net *net)
1803 {
1804         struct rt6_info *rt;
1805         struct fib6_table *table;
1806
1807         /* NOTE: Keep consistent with rt6_get_dflt_router */
1808         table = fib6_get_table(net, RT6_TABLE_DFLT);
1809         if (table == NULL)
1810                 return;
1811
1812 restart:
1813         read_lock_bh(&table->tb6_lock);
1814         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1815                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1816                         dst_hold(&rt->dst);
1817                         read_unlock_bh(&table->tb6_lock);
1818                         ip6_del_rt(rt);
1819                         goto restart;
1820                 }
1821         }
1822         read_unlock_bh(&table->tb6_lock);
1823 }
1824
1825 static void rtmsg_to_fib6_config(struct net *net,
1826                                  struct in6_rtmsg *rtmsg,
1827                                  struct fib6_config *cfg)
1828 {
1829         memset(cfg, 0, sizeof(*cfg));
1830
1831         cfg->fc_table = RT6_TABLE_MAIN;
1832         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1833         cfg->fc_metric = rtmsg->rtmsg_metric;
1834         cfg->fc_expires = rtmsg->rtmsg_info;
1835         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1836         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1837         cfg->fc_flags = rtmsg->rtmsg_flags;
1838
1839         cfg->fc_nlinfo.nl_net = net;
1840
1841         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1842         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1843         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1844 }
1845
1846 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1847 {
1848         struct fib6_config cfg;
1849         struct in6_rtmsg rtmsg;
1850         int err;
1851
1852         switch(cmd) {
1853         case SIOCADDRT:         /* Add a route */
1854         case SIOCDELRT:         /* Delete a route */
1855                 if (!capable(CAP_NET_ADMIN))
1856                         return -EPERM;
1857                 err = copy_from_user(&rtmsg, arg,
1858                                      sizeof(struct in6_rtmsg));
1859                 if (err)
1860                         return -EFAULT;
1861
1862                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1863
1864                 rtnl_lock();
1865                 switch (cmd) {
1866                 case SIOCADDRT:
1867                         err = ip6_route_add(&cfg);
1868                         break;
1869                 case SIOCDELRT:
1870                         err = ip6_route_del(&cfg);
1871                         break;
1872                 default:
1873                         err = -EINVAL;
1874                 }
1875                 rtnl_unlock();
1876
1877                 return err;
1878         }
1879
1880         return -EINVAL;
1881 }
1882
1883 /*
1884  *      Drop the packet on the floor
1885  */
1886
1887 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1888 {
1889         int type;
1890         struct dst_entry *dst = skb_dst(skb);
1891         switch (ipstats_mib_noroutes) {
1892         case IPSTATS_MIB_INNOROUTES:
1893                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1894                 if (type == IPV6_ADDR_ANY) {
1895                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1896                                       IPSTATS_MIB_INADDRERRORS);
1897                         break;
1898                 }
1899                 /* FALLTHROUGH */
1900         case IPSTATS_MIB_OUTNOROUTES:
1901                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1902                               ipstats_mib_noroutes);
1903                 break;
1904         }
1905         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1906         kfree_skb(skb);
1907         return 0;
1908 }
1909
1910 static int ip6_pkt_discard(struct sk_buff *skb)
1911 {
1912         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1913 }
1914
1915 static int ip6_pkt_discard_out(struct sk_buff *skb)
1916 {
1917         skb->dev = skb_dst(skb)->dev;
1918         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1919 }
1920
1921 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1922
1923 static int ip6_pkt_prohibit(struct sk_buff *skb)
1924 {
1925         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1926 }
1927
1928 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1929 {
1930         skb->dev = skb_dst(skb)->dev;
1931         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1932 }
1933
1934 #endif
1935
1936 /*
1937  *      Allocate a dst for local (unicast / anycast) address.
1938  */
1939
1940 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1941                                     const struct in6_addr *addr,
1942                                     int anycast)
1943 {
1944         struct net *net = dev_net(idev->dev);
1945         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1946         struct neighbour *neigh;
1947
1948         if (rt == NULL)
1949                 return ERR_PTR(-ENOMEM);
1950
1951         dev_hold(net->loopback_dev);
1952         in6_dev_hold(idev);
1953
1954         rt->dst.flags = DST_HOST;
1955         rt->dst.input = ip6_input;
1956         rt->dst.output = ip6_output;
1957         rt->rt6i_dev = net->loopback_dev;
1958         rt->rt6i_idev = idev;
1959         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1960         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1961         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1962         rt->dst.obsolete = -1;
1963
1964         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1965         if (anycast)
1966                 rt->rt6i_flags |= RTF_ANYCAST;
1967         else
1968                 rt->rt6i_flags |= RTF_LOCAL;
1969         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1970         if (IS_ERR(neigh)) {
1971                 dst_free(&rt->dst);
1972
1973                 /* We are casting this because that is the return
1974                  * value type.  But an errno encoded pointer is the
1975                  * same regardless of the underlying pointer type,
1976                  * and that's what we are returning.  So this is OK.
1977                  */
1978                 return (struct rt6_info *) neigh;
1979         }
1980         rt->rt6i_nexthop = neigh;
1981
1982         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1983         rt->rt6i_dst.plen = 128;
1984         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1985
1986         atomic_set(&rt->dst.__refcnt, 1);
1987
1988         return rt;
1989 }
1990
1991 struct arg_dev_net {
1992         struct net_device *dev;
1993         struct net *net;
1994 };
1995
1996 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1997 {
1998         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1999         struct net *net = ((struct arg_dev_net *)arg)->net;
2000
2001         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2002             rt != net->ipv6.ip6_null_entry) {
2003                 RT6_TRACE("deleted by ifdown %p\n", rt);
2004                 return -1;
2005         }
2006         return 0;
2007 }
2008
2009 void rt6_ifdown(struct net *net, struct net_device *dev)
2010 {
2011         struct arg_dev_net adn = {
2012                 .dev = dev,
2013                 .net = net,
2014         };
2015
2016         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2017         icmp6_clean_all(fib6_ifdown, &adn);
2018 }
2019
2020 struct rt6_mtu_change_arg
2021 {
2022         struct net_device *dev;
2023         unsigned mtu;
2024 };
2025
2026 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2027 {
2028         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2029         struct inet6_dev *idev;
2030         struct net *net = dev_net(arg->dev);
2031
2032         /* In IPv6 pmtu discovery is not optional,
2033            so that RTAX_MTU lock cannot disable it.
2034            We still use this lock to block changes
2035            caused by addrconf/ndisc.
2036         */
2037
2038         idev = __in6_dev_get(arg->dev);
2039         if (idev == NULL)
2040                 return 0;
2041
2042         /* For administrative MTU increase, there is no way to discover
2043            IPv6 PMTU increase, so PMTU increase should be updated here.
2044            Since RFC 1981 doesn't include administrative MTU increase
2045            update PMTU increase is a MUST. (i.e. jumbo frame)
2046          */
2047         /*
2048            If new MTU is less than route PMTU, this new MTU will be the
2049            lowest MTU in the path, update the route PMTU to reflect PMTU
2050            decreases; if new MTU is greater than route PMTU, and the
2051            old MTU is the lowest MTU in the path, update the route PMTU
2052            to reflect the increase. In this case if the other nodes' MTU
2053            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2054            PMTU discouvery.
2055          */
2056         if (rt->rt6i_dev == arg->dev &&
2057             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2058             (dst_mtu(&rt->dst) >= arg->mtu ||
2059              (dst_mtu(&rt->dst) < arg->mtu &&
2060               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2061                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2062                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2063         }
2064         return 0;
2065 }
2066
2067 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2068 {
2069         struct rt6_mtu_change_arg arg = {
2070                 .dev = dev,
2071                 .mtu = mtu,
2072         };
2073
2074         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2075 }
2076
2077 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2078         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2079         [RTA_OIF]               = { .type = NLA_U32 },
2080         [RTA_IIF]               = { .type = NLA_U32 },
2081         [RTA_PRIORITY]          = { .type = NLA_U32 },
2082         [RTA_METRICS]           = { .type = NLA_NESTED },
2083 };
2084
2085 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2086                               struct fib6_config *cfg)
2087 {
2088         struct rtmsg *rtm;
2089         struct nlattr *tb[RTA_MAX+1];
2090         int err;
2091
2092         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2093         if (err < 0)
2094                 goto errout;
2095
2096         err = -EINVAL;
2097         rtm = nlmsg_data(nlh);
2098         memset(cfg, 0, sizeof(*cfg));
2099
2100         cfg->fc_table = rtm->rtm_table;
2101         cfg->fc_dst_len = rtm->rtm_dst_len;
2102         cfg->fc_src_len = rtm->rtm_src_len;
2103         cfg->fc_flags = RTF_UP;
2104         cfg->fc_protocol = rtm->rtm_protocol;
2105
2106         if (rtm->rtm_type == RTN_UNREACHABLE)
2107                 cfg->fc_flags |= RTF_REJECT;
2108
2109         if (rtm->rtm_type == RTN_LOCAL)
2110                 cfg->fc_flags |= RTF_LOCAL;
2111
2112         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2113         cfg->fc_nlinfo.nlh = nlh;
2114         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2115
2116         if (tb[RTA_GATEWAY]) {
2117                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2118                 cfg->fc_flags |= RTF_GATEWAY;
2119         }
2120
2121         if (tb[RTA_DST]) {
2122                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2123
2124                 if (nla_len(tb[RTA_DST]) < plen)
2125                         goto errout;
2126
2127                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2128         }
2129
2130         if (tb[RTA_SRC]) {
2131                 int plen = (rtm->rtm_src_len + 7) >> 3;
2132
2133                 if (nla_len(tb[RTA_SRC]) < plen)
2134                         goto errout;
2135
2136                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2137         }
2138
2139         if (tb[RTA_OIF])
2140                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2141
2142         if (tb[RTA_PRIORITY])
2143                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2144
2145         if (tb[RTA_METRICS]) {
2146                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2147                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2148         }
2149
2150         if (tb[RTA_TABLE])
2151                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2152
2153         err = 0;
2154 errout:
2155         return err;
2156 }
2157
2158 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2159 {
2160         struct fib6_config cfg;
2161         int err;
2162
2163         err = rtm_to_fib6_config(skb, nlh, &cfg);
2164         if (err < 0)
2165                 return err;
2166
2167         return ip6_route_del(&cfg);
2168 }
2169
2170 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2171 {
2172         struct fib6_config cfg;
2173         int err;
2174
2175         err = rtm_to_fib6_config(skb, nlh, &cfg);
2176         if (err < 0)
2177                 return err;
2178
2179         return ip6_route_add(&cfg);
2180 }
2181
2182 static inline size_t rt6_nlmsg_size(void)
2183 {
2184         return NLMSG_ALIGN(sizeof(struct rtmsg))
2185                + nla_total_size(16) /* RTA_SRC */
2186                + nla_total_size(16) /* RTA_DST */
2187                + nla_total_size(16) /* RTA_GATEWAY */
2188                + nla_total_size(16) /* RTA_PREFSRC */
2189                + nla_total_size(4) /* RTA_TABLE */
2190                + nla_total_size(4) /* RTA_IIF */
2191                + nla_total_size(4) /* RTA_OIF */
2192                + nla_total_size(4) /* RTA_PRIORITY */
2193                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2194                + nla_total_size(sizeof(struct rta_cacheinfo));
2195 }
2196
2197 static int rt6_fill_node(struct net *net,
2198                          struct sk_buff *skb, struct rt6_info *rt,
2199                          struct in6_addr *dst, struct in6_addr *src,
2200                          int iif, int type, u32 pid, u32 seq,
2201                          int prefix, int nowait, unsigned int flags)
2202 {
2203         struct rtmsg *rtm;
2204         struct nlmsghdr *nlh;
2205         long expires;
2206         u32 table;
2207
2208         if (prefix) {   /* user wants prefix routes only */
2209                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2210                         /* success since this is not a prefix route */
2211                         return 1;
2212                 }
2213         }
2214
2215         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2216         if (nlh == NULL)
2217                 return -EMSGSIZE;
2218
2219         rtm = nlmsg_data(nlh);
2220         rtm->rtm_family = AF_INET6;
2221         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2222         rtm->rtm_src_len = rt->rt6i_src.plen;
2223         rtm->rtm_tos = 0;
2224         if (rt->rt6i_table)
2225                 table = rt->rt6i_table->tb6_id;
2226         else
2227                 table = RT6_TABLE_UNSPEC;
2228         rtm->rtm_table = table;
2229         NLA_PUT_U32(skb, RTA_TABLE, table);
2230         if (rt->rt6i_flags&RTF_REJECT)
2231                 rtm->rtm_type = RTN_UNREACHABLE;
2232         else if (rt->rt6i_flags&RTF_LOCAL)
2233                 rtm->rtm_type = RTN_LOCAL;
2234         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2235                 rtm->rtm_type = RTN_LOCAL;
2236         else
2237                 rtm->rtm_type = RTN_UNICAST;
2238         rtm->rtm_flags = 0;
2239         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2240         rtm->rtm_protocol = rt->rt6i_protocol;
2241         if (rt->rt6i_flags&RTF_DYNAMIC)
2242                 rtm->rtm_protocol = RTPROT_REDIRECT;
2243         else if (rt->rt6i_flags & RTF_ADDRCONF)
2244                 rtm->rtm_protocol = RTPROT_KERNEL;
2245         else if (rt->rt6i_flags&RTF_DEFAULT)
2246                 rtm->rtm_protocol = RTPROT_RA;
2247
2248         if (rt->rt6i_flags&RTF_CACHE)
2249                 rtm->rtm_flags |= RTM_F_CLONED;
2250
2251         if (dst) {
2252                 NLA_PUT(skb, RTA_DST, 16, dst);
2253                 rtm->rtm_dst_len = 128;
2254         } else if (rtm->rtm_dst_len)
2255                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2256 #ifdef CONFIG_IPV6_SUBTREES
2257         if (src) {
2258                 NLA_PUT(skb, RTA_SRC, 16, src);
2259                 rtm->rtm_src_len = 128;
2260         } else if (rtm->rtm_src_len)
2261                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2262 #endif
2263         if (iif) {
2264 #ifdef CONFIG_IPV6_MROUTE
2265                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2266                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2267                         if (err <= 0) {
2268                                 if (!nowait) {
2269                                         if (err == 0)
2270                                                 return 0;
2271                                         goto nla_put_failure;
2272                                 } else {
2273                                         if (err == -EMSGSIZE)
2274                                                 goto nla_put_failure;
2275                                 }
2276                         }
2277                 } else
2278 #endif
2279                         NLA_PUT_U32(skb, RTA_IIF, iif);
2280         } else if (dst) {
2281                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2282                 struct in6_addr saddr_buf;
2283                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2284                                        dst, 0, &saddr_buf) == 0)
2285                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2286         }
2287
2288         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2289                 goto nla_put_failure;
2290
2291         if (rt->dst.neighbour)
2292                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2293
2294         if (rt->dst.dev)
2295                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2296
2297         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2298
2299         if (!(rt->rt6i_flags & RTF_EXPIRES))
2300                 expires = 0;
2301         else if (rt->rt6i_expires - jiffies < INT_MAX)
2302                 expires = rt->rt6i_expires - jiffies;
2303         else
2304                 expires = INT_MAX;
2305
2306         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2307                                expires, rt->dst.error) < 0)
2308                 goto nla_put_failure;
2309
2310         return nlmsg_end(skb, nlh);
2311
2312 nla_put_failure:
2313         nlmsg_cancel(skb, nlh);
2314         return -EMSGSIZE;
2315 }
2316
2317 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2318 {
2319         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2320         int prefix;
2321
2322         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2323                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2324                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2325         } else
2326                 prefix = 0;
2327
2328         return rt6_fill_node(arg->net,
2329                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2330                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2331                      prefix, 0, NLM_F_MULTI);
2332 }
2333
2334 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2335 {
2336         struct net *net = sock_net(in_skb->sk);
2337         struct nlattr *tb[RTA_MAX+1];
2338         struct rt6_info *rt;
2339         struct sk_buff *skb;
2340         struct rtmsg *rtm;
2341         struct flowi fl;
2342         int err, iif = 0;
2343
2344         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2345         if (err < 0)
2346                 goto errout;
2347
2348         err = -EINVAL;
2349         memset(&fl, 0, sizeof(fl));
2350
2351         if (tb[RTA_SRC]) {
2352                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2353                         goto errout;
2354
2355                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2356         }
2357
2358         if (tb[RTA_DST]) {
2359                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2360                         goto errout;
2361
2362                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2363         }
2364
2365         if (tb[RTA_IIF])
2366                 iif = nla_get_u32(tb[RTA_IIF]);
2367
2368         if (tb[RTA_OIF])
2369                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2370
2371         if (iif) {
2372                 struct net_device *dev;
2373                 dev = __dev_get_by_index(net, iif);
2374                 if (!dev) {
2375                         err = -ENODEV;
2376                         goto errout;
2377                 }
2378         }
2379
2380         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2381         if (skb == NULL) {
2382                 err = -ENOBUFS;
2383                 goto errout;
2384         }
2385
2386         /* Reserve room for dummy headers, this skb can pass
2387            through good chunk of routing engine.
2388          */
2389         skb_reset_mac_header(skb);
2390         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2391
2392         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2393         skb_dst_set(skb, &rt->dst);
2394
2395         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2396                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2397                             nlh->nlmsg_seq, 0, 0, 0);
2398         if (err < 0) {
2399                 kfree_skb(skb);
2400                 goto errout;
2401         }
2402
2403         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2404 errout:
2405         return err;
2406 }
2407
2408 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2409 {
2410         struct sk_buff *skb;
2411         struct net *net = info->nl_net;
2412         u32 seq;
2413         int err;
2414
2415         err = -ENOBUFS;
2416         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2417
2418         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2419         if (skb == NULL)
2420                 goto errout;
2421
2422         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2423                                 event, info->pid, seq, 0, 0, 0);
2424         if (err < 0) {
2425                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2426                 WARN_ON(err == -EMSGSIZE);
2427                 kfree_skb(skb);
2428                 goto errout;
2429         }
2430         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2431                     info->nlh, gfp_any());
2432         return;
2433 errout:
2434         if (err < 0)
2435                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2436 }
2437
2438 static int ip6_route_dev_notify(struct notifier_block *this,
2439                                 unsigned long event, void *data)
2440 {
2441         struct net_device *dev = (struct net_device *)data;
2442         struct net *net = dev_net(dev);
2443
2444         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2445                 net->ipv6.ip6_null_entry->dst.dev = dev;
2446                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2447 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2448                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2449                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2450                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2451                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2452 #endif
2453         }
2454
2455         return NOTIFY_OK;
2456 }
2457
2458 /*
2459  *      /proc
2460  */
2461
2462 #ifdef CONFIG_PROC_FS
2463
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is referenced
 * by the /proc code visible in this part of the file, which goes through
 * seq_file instead - confirm whether they are still needed.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2474
2475 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2476 {
2477         struct seq_file *m = p_arg;
2478
2479         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2480
2481 #ifdef CONFIG_IPV6_SUBTREES
2482         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2483 #else
2484         seq_puts(m, "00000000000000000000000000000000 00 ");
2485 #endif
2486
2487         if (rt->rt6i_nexthop) {
2488                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2489         } else {
2490                 seq_puts(m, "00000000000000000000000000000000");
2491         }
2492         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2493                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2494                    rt->dst.__use, rt->rt6i_flags,
2495                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2496         return 0;
2497 }
2498
2499 static int ipv6_route_show(struct seq_file *m, void *v)
2500 {
2501         struct net *net = (struct net *)m->private;
2502         fib6_clean_all(net, rt6_info_route, 0, m);
2503         return 0;
2504 }
2505
2506 static int ipv6_route_open(struct inode *inode, struct file *file)
2507 {
2508         return single_open_net(inode, file, ipv6_route_show);
2509 }
2510
2511 static const struct file_operations ipv6_route_proc_fops = {
2512         .owner          = THIS_MODULE,
2513         .open           = ipv6_route_open,
2514         .read           = seq_read,
2515         .llseek         = seq_lseek,
2516         .release        = single_release_net,
2517 };
2518
2519 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2520 {
2521         struct net *net = (struct net *)seq->private;
2522         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2523                    net->ipv6.rt6_stats->fib_nodes,
2524                    net->ipv6.rt6_stats->fib_route_nodes,
2525                    net->ipv6.rt6_stats->fib_rt_alloc,
2526                    net->ipv6.rt6_stats->fib_rt_entries,
2527                    net->ipv6.rt6_stats->fib_rt_cache,
2528                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2529                    net->ipv6.rt6_stats->fib_discarded_routes);
2530
2531         return 0;
2532 }
2533
2534 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2535 {
2536         return single_open_net(inode, file, rt6_stats_seq_show);
2537 }
2538
2539 static const struct file_operations rt6_stats_seq_fops = {
2540         .owner   = THIS_MODULE,
2541         .open    = rt6_stats_seq_open,
2542         .read    = seq_read,
2543         .llseek  = seq_lseek,
2544         .release = single_release_net,
2545 };
2546 #endif  /* CONFIG_PROC_FS */
2547
2548 #ifdef CONFIG_SYSCTL
2549
2550 static
2551 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2552                               void __user *buffer, size_t *lenp, loff_t *ppos)
2553 {
2554         struct net *net = current->nsproxy->net_ns;
2555         int delay = net->ipv6.sysctl.flush_delay;
2556         if (write) {
2557                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2558                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2559                 return 0;
2560         } else
2561                 return -EINVAL;
2562 }
2563
2564 ctl_table ipv6_route_table_template[] = {
2565         {
2566                 .procname       =       "flush",
2567                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2568                 .maxlen         =       sizeof(int),
2569                 .mode           =       0200,
2570                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2571         },
2572         {
2573                 .procname       =       "gc_thresh",
2574                 .data           =       &ip6_dst_ops_template.gc_thresh,
2575                 .maxlen         =       sizeof(int),
2576                 .mode           =       0644,
2577                 .proc_handler   =       proc_dointvec,
2578         },
2579         {
2580                 .procname       =       "max_size",
2581                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2582                 .maxlen         =       sizeof(int),
2583                 .mode           =       0644,
2584                 .proc_handler   =       proc_dointvec,
2585         },
2586         {
2587                 .procname       =       "gc_min_interval",
2588                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2589                 .maxlen         =       sizeof(int),
2590                 .mode           =       0644,
2591                 .proc_handler   =       proc_dointvec_jiffies,
2592         },
2593         {
2594                 .procname       =       "gc_timeout",
2595                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2596                 .maxlen         =       sizeof(int),
2597                 .mode           =       0644,
2598                 .proc_handler   =       proc_dointvec_jiffies,
2599         },
2600         {
2601                 .procname       =       "gc_interval",
2602                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2603                 .maxlen         =       sizeof(int),
2604                 .mode           =       0644,
2605                 .proc_handler   =       proc_dointvec_jiffies,
2606         },
2607         {
2608                 .procname       =       "gc_elasticity",
2609                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2610                 .maxlen         =       sizeof(int),
2611                 .mode           =       0644,
2612                 .proc_handler   =       proc_dointvec,
2613         },
2614         {
2615                 .procname       =       "mtu_expires",
2616                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2617                 .maxlen         =       sizeof(int),
2618                 .mode           =       0644,
2619                 .proc_handler   =       proc_dointvec_jiffies,
2620         },
2621         {
2622                 .procname       =       "min_adv_mss",
2623                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2624                 .maxlen         =       sizeof(int),
2625                 .mode           =       0644,
2626                 .proc_handler   =       proc_dointvec,
2627         },
2628         {
2629                 .procname       =       "gc_min_interval_ms",
2630                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2631                 .maxlen         =       sizeof(int),
2632                 .mode           =       0644,
2633                 .proc_handler   =       proc_dointvec_ms_jiffies,
2634         },
2635         { }
2636 };
2637
2638 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2639 {
2640         struct ctl_table *table;
2641
2642         table = kmemdup(ipv6_route_table_template,
2643                         sizeof(ipv6_route_table_template),
2644                         GFP_KERNEL);
2645
2646         if (table) {
2647                 table[0].data = &net->ipv6.sysctl.flush_delay;
2648                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2649                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2650                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2651                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2652                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2653                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2654                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2655                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2656                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2657         }
2658
2659         return table;
2660 }
2661 #endif
2662
2663 static int __net_init ip6_route_net_init(struct net *net)
2664 {
2665         int ret = -ENOMEM;
2666
2667         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2668                sizeof(net->ipv6.ip6_dst_ops));
2669
2670         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2671                 goto out_ip6_dst_ops;
2672
2673         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2674                                            sizeof(*net->ipv6.ip6_null_entry),
2675                                            GFP_KERNEL);
2676         if (!net->ipv6.ip6_null_entry)
2677                 goto out_ip6_dst_entries;
2678         net->ipv6.ip6_null_entry->dst.path =
2679                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2680         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2681
2682 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2683         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2684                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2685                                                GFP_KERNEL);
2686         if (!net->ipv6.ip6_prohibit_entry)
2687                 goto out_ip6_null_entry;
2688         net->ipv6.ip6_prohibit_entry->dst.path =
2689                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2690         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2691
2692         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2693                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2694                                                GFP_KERNEL);
2695         if (!net->ipv6.ip6_blk_hole_entry)
2696                 goto out_ip6_prohibit_entry;
2697         net->ipv6.ip6_blk_hole_entry->dst.path =
2698                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2699         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2700 #endif
2701
2702         net->ipv6.sysctl.flush_delay = 0;
2703         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2704         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2705         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2706         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2707         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2708         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2709         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2710
2711 #ifdef CONFIG_PROC_FS
2712         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2713         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2714 #endif
2715         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2716
2717         ret = 0;
2718 out:
2719         return ret;
2720
2721 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2722 out_ip6_prohibit_entry:
2723         kfree(net->ipv6.ip6_prohibit_entry);
2724 out_ip6_null_entry:
2725         kfree(net->ipv6.ip6_null_entry);
2726 #endif
2727 out_ip6_dst_entries:
2728         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2729 out_ip6_dst_ops:
2730         goto out;
2731 }
2732
2733 static void __net_exit ip6_route_net_exit(struct net *net)
2734 {
2735 #ifdef CONFIG_PROC_FS
2736         proc_net_remove(net, "ipv6_route");
2737         proc_net_remove(net, "rt6_stats");
2738 #endif
2739         kfree(net->ipv6.ip6_null_entry);
2740 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2741         kfree(net->ipv6.ip6_prohibit_entry);
2742         kfree(net->ipv6.ip6_blk_hole_entry);
2743 #endif
2744 }
2745
2746 static struct pernet_operations ip6_route_net_ops = {
2747         .init = ip6_route_net_init,
2748         .exit = ip6_route_net_exit,
2749 };
2750
2751 static struct notifier_block ip6_route_dev_notifier = {
2752         .notifier_call = ip6_route_dev_notify,
2753         .priority = 0,
2754 };
2755
2756 int __init ip6_route_init(void)
2757 {
2758         int ret;
2759
2760         ret = -ENOMEM;
2761         ip6_dst_ops_template.kmem_cachep =
2762                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2763                                   SLAB_HWCACHE_ALIGN, NULL);
2764         if (!ip6_dst_ops_template.kmem_cachep)
2765                 goto out;
2766
2767         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2768         if (ret)
2769                 goto out_kmem_cache;
2770
2771         ret = register_pernet_subsys(&ip6_route_net_ops);
2772         if (ret)
2773                 goto out_dst_entries;
2774
2775         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2776
2777         /* Registering of the loopback is done before this portion of code,
2778          * the loopback reference in rt6_info will not be taken, do it
2779          * manually for init_net */
2780         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2781         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2782   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2783         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2784         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2785         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2786         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2787   #endif
2788         ret = fib6_init();
2789         if (ret)
2790                 goto out_register_subsys;
2791
2792         ret = xfrm6_init();
2793         if (ret)
2794                 goto out_fib6_init;
2795
2796         ret = fib6_rules_init();
2797         if (ret)
2798                 goto xfrm6_init;
2799
2800         ret = -ENOBUFS;
2801         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2802             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2803             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2804                 goto fib6_rules_init;
2805
2806         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2807         if (ret)
2808                 goto fib6_rules_init;
2809
2810 out:
2811         return ret;
2812
2813 fib6_rules_init:
2814         fib6_rules_cleanup();
2815 xfrm6_init:
2816         xfrm6_fini();
2817 out_fib6_init:
2818         fib6_gc_cleanup();
2819 out_register_subsys:
2820         unregister_pernet_subsys(&ip6_route_net_ops);
2821 out_dst_entries:
2822         dst_entries_destroy(&ip6_dst_blackhole_ops);
2823 out_kmem_cache:
2824         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2825         goto out;
2826 }
2827
2828 void ip6_route_cleanup(void)
2829 {
2830         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2831         fib6_rules_cleanup();
2832         xfrm6_fini();
2833         fib6_gc_cleanup();
2834         unregister_pernet_subsys(&ip6_route_net_ops);
2835         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2836 }