Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug tracing level.  With RT6_DEBUG at 3 or above, RDBG() and
 * RT6_TRACE() expand to printk() calls; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Forward declarations of the route-information helpers used by rt6_route_rcv(). */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex);
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned pref);
#endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/* default_mtu callback of the blackhole dst_ops: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int blackhole_mtu = 0;

	return blackhole_mtu;
}
151
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157                                          unsigned long old)
158 {
159         return NULL;
160 }
161
162 static struct dst_ops ip6_dst_blackhole_ops = {
163         .family                 =       AF_INET6,
164         .protocol               =       cpu_to_be16(ETH_P_IPV6),
165         .destroy                =       ip6_dst_destroy,
166         .check                  =       ip6_dst_check,
167         .default_mtu            =       ip6_blackhole_default_mtu,
168         .default_advmss         =       ip6_default_advmss,
169         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
170         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
171 };
172
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174         [RTAX_HOPLIMIT - 1] = 255,
175 };
176
177 static struct rt6_info ip6_null_entry_template = {
178         .dst = {
179                 .__refcnt       = ATOMIC_INIT(1),
180                 .__use          = 1,
181                 .obsolete       = -1,
182                 .error          = -ENETUNREACH,
183                 .input          = ip6_pkt_discard,
184                 .output         = ip6_pkt_discard_out,
185         },
186         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
187         .rt6i_protocol  = RTPROT_KERNEL,
188         .rt6i_metric    = ~(u32) 0,
189         .rt6i_ref       = ATOMIC_INIT(1),
190 };
191
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers for the prohibit route. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Template for the "prohibit" reject route: dst error is -EACCES. */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_ref = ATOMIC_INIT(1),
	.rt6i_metric = ~(u32) 0,
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.output = ip6_pkt_prohibit_out,
		.input = ip6_pkt_prohibit,
		.error = -EACCES,
		.obsolete = -1,
		.__use = 1,
		.__refcnt = ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" reject route: dst error is -EINVAL and
 * both directions go straight to dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_ref = ATOMIC_INIT(1),
	.rt6i_metric = ~(u32) 0,
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.output = dst_discard,
		.input = dst_discard,
		.error = -EINVAL,
		.obsolete = -1,
		.__use = 1,
		.__refcnt = ATOMIC_INIT(1),
	},
};

#endif
228
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231                                              struct net_device *dev,
232                                              int flags)
233 {
234         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
235
236         memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
237
238         return rt;
239 }
240
241 static void ip6_dst_destroy(struct dst_entry *dst)
242 {
243         struct rt6_info *rt = (struct rt6_info *)dst;
244         struct inet6_dev *idev = rt->rt6i_idev;
245         struct inet_peer *peer = rt->rt6i_peer;
246
247         if (idev != NULL) {
248                 rt->rt6i_idev = NULL;
249                 in6_dev_put(idev);
250         }
251         if (peer) {
252                 rt->rt6i_peer = NULL;
253                 inet_putpeer(peer);
254         }
255 }
256
257 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
258
259 static u32 rt6_peer_genid(void)
260 {
261         return atomic_read(&__rt6_peer_genid);
262 }
263
264 void rt6_bind_peer(struct rt6_info *rt, int create)
265 {
266         struct inet_peer *peer;
267
268         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
269         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
270                 inet_putpeer(peer);
271         else
272                 rt->rt6i_peer_genid = rt6_peer_genid();
273 }
274
275 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
276                            int how)
277 {
278         struct rt6_info *rt = (struct rt6_info *)dst;
279         struct inet6_dev *idev = rt->rt6i_idev;
280         struct net_device *loopback_dev =
281                 dev_net(dev)->loopback_dev;
282
283         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
284                 struct inet6_dev *loopback_idev =
285                         in6_dev_get(loopback_dev);
286                 if (loopback_idev != NULL) {
287                         rt->rt6i_idev = loopback_idev;
288                         in6_dev_put(idev);
289                 }
290         }
291 }
292
293 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
294 {
295         return (rt->rt6i_flags & RTF_EXPIRES) &&
296                 time_after(jiffies, rt->rt6i_expires);
297 }
298
299 static inline int rt6_need_strict(const struct in6_addr *daddr)
300 {
301         return ipv6_addr_type(daddr) &
302                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
303 }
304
305 /*
306  *      Route lookup. Any table->tb6_lock is implied.
307  */
308
309 static inline struct rt6_info *rt6_device_match(struct net *net,
310                                                     struct rt6_info *rt,
311                                                     const struct in6_addr *saddr,
312                                                     int oif,
313                                                     int flags)
314 {
315         struct rt6_info *local = NULL;
316         struct rt6_info *sprt;
317
318         if (!oif && ipv6_addr_any(saddr))
319                 goto out;
320
321         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
322                 struct net_device *dev = sprt->rt6i_dev;
323
324                 if (oif) {
325                         if (dev->ifindex == oif)
326                                 return sprt;
327                         if (dev->flags & IFF_LOOPBACK) {
328                                 if (sprt->rt6i_idev == NULL ||
329                                     sprt->rt6i_idev->dev->ifindex != oif) {
330                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
331                                                 continue;
332                                         if (local && (!oif ||
333                                                       local->rt6i_idev->dev->ifindex == oif))
334                                                 continue;
335                                 }
336                                 local = sprt;
337                         }
338                 } else {
339                         if (ipv6_chk_addr(net, saddr, dev,
340                                           flags & RT6_LOOKUP_F_IFACE))
341                                 return sprt;
342                 }
343         }
344
345         if (oif) {
346                 if (local)
347                         return local;
348
349                 if (flags & RT6_LOOKUP_F_IFACE)
350                         return net->ipv6.ip6_null_entry;
351         }
352 out:
353         return rt;
354 }
355
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the next hop of @rt is not in a NUD_VALID state and more than
 * cnf.rtr_probe_interval jiffies have passed since neigh->updated, stamp
 * the neighbour and send a neighbour solicitation for it to the
 * solicited-node multicast address.  A NULL @rt or a route without a
 * next hop is ignored.
 *
 * The original comment notes that probing MUST be rate-limited to no
 * more than one per minute; the interval check below implements the
 * rate limit.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr *target;
	struct in6_addr mcaddr;
	int due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies,
			 neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing does nothing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
390
391 /*
392  * Default Router Selection (RFC 2461 6.3.6)
393  */
394 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
395 {
396         struct net_device *dev = rt->rt6i_dev;
397         if (!oif || dev->ifindex == oif)
398                 return 2;
399         if ((dev->flags & IFF_LOOPBACK) &&
400             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
401                 return 1;
402         return 0;
403 }
404
405 static inline int rt6_check_neigh(struct rt6_info *rt)
406 {
407         struct neighbour *neigh = rt->rt6i_nexthop;
408         int m;
409         if (rt->rt6i_flags & RTF_NONEXTHOP ||
410             !(rt->rt6i_flags & RTF_GATEWAY))
411                 m = 1;
412         else if (neigh) {
413                 read_lock_bh(&neigh->lock);
414                 if (neigh->nud_state & NUD_VALID)
415                         m = 2;
416 #ifdef CONFIG_IPV6_ROUTER_PREF
417                 else if (neigh->nud_state & NUD_FAILED)
418                         m = 0;
419 #endif
420                 else
421                         m = 1;
422                 read_unlock_bh(&neigh->lock);
423         } else
424                 m = 0;
425         return m;
426 }
427
428 static int rt6_score_route(struct rt6_info *rt, int oif,
429                            int strict)
430 {
431         int m, n;
432
433         m = rt6_check_dev(rt, oif);
434         if (!m && (strict & RT6_LOOKUP_F_IFACE))
435                 return -1;
436 #ifdef CONFIG_IPV6_ROUTER_PREF
437         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
438 #endif
439         n = rt6_check_neigh(rt);
440         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
441                 return -1;
442         return m;
443 }
444
445 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
446                                    int *mpri, struct rt6_info *match)
447 {
448         int m;
449
450         if (rt6_check_expired(rt))
451                 goto out;
452
453         m = rt6_score_route(rt, oif, strict);
454         if (m < 0)
455                 goto out;
456
457         if (m > *mpri) {
458                 if (strict & RT6_LOOKUP_F_REACHABLE)
459                         rt6_probe(match);
460                 *mpri = m;
461                 match = rt;
462         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
463                 rt6_probe(rt);
464         }
465
466 out:
467         return match;
468 }
469
470 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
471                                      struct rt6_info *rr_head,
472                                      u32 metric, int oif, int strict)
473 {
474         struct rt6_info *rt, *match;
475         int mpri = -1;
476
477         match = NULL;
478         for (rt = rr_head; rt && rt->rt6i_metric == metric;
479              rt = rt->dst.rt6_next)
480                 match = find_match(rt, oif, strict, &mpri, match);
481         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
482              rt = rt->dst.rt6_next)
483                 match = find_match(rt, oif, strict, &mpri, match);
484
485         return match;
486 }
487
488 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
489 {
490         struct rt6_info *match, *rt0;
491         struct net *net;
492
493         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
494                   __func__, fn->leaf, oif);
495
496         rt0 = fn->rr_ptr;
497         if (!rt0)
498                 fn->rr_ptr = rt0 = fn->leaf;
499
500         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
501
502         if (!match &&
503             (strict & RT6_LOOKUP_F_REACHABLE)) {
504                 struct rt6_info *next = rt0->dst.rt6_next;
505
506                 /* no entries matched; do round-robin */
507                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
508                         next = fn->leaf;
509
510                 if (next != rt0)
511                         fn->rr_ptr = next;
512         }
513
514         RT6_TRACE("%s() => %p\n",
515                   __func__, match);
516
517         net = dev_net(rt0->rt6i_dev);
518         return match ? match : net->ipv6.ip6_null_entry;
519 }
520
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev from @gwaddr.
 *
 * @opt/@len: the option, interpreted as a struct route_info
 *
 * After validating the option, the matching route-info route is looked
 * up.  A zero lifetime deletes an existing route; a non-zero lifetime
 * creates the route if it is missing, or otherwise refreshes its
 * preference bits.  The route's expiry is then set from the lifetime
 * (or cleared when the lifetime is not finite).
 *
 * Returns 0 on success, -EINVAL if the option is malformed or carries
 * the invalid router preference.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned long lifetime;
	struct rt6_info *rt;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	prefix = (struct in6_addr *)rinfo->prefix;
	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix, prefix, rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
594
/*
 * BACKTRACK - climb the fib6 tree after a lookup produced the null entry.
 *
 * Not hygienic by design: the macro reads the local "rt" and reads and
 * writes the local "fn" of the function that uses it, and it jumps to that
 * function's "out" and "restart" labels.
 *
 * While rt is the null entry of @__net:
 *   - jump to "out" once fn is the top-level root (RTN_TL_ROOT);
 *   - otherwise move fn to its parent, or, if the parent has a subtree
 *     that fn is not part of, to the result of looking @saddr up in that
 *     subtree;
 *   - jump to "restart" as soon as fn carries route info (RTN_RTINFO).
 *
 * The macro arguments are parenthesized so that any pointer-valued
 * expression can be passed safely.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		while (1) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
612
613 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
614                                              struct fib6_table *table,
615                                              struct flowi6 *fl6, int flags)
616 {
617         struct fib6_node *fn;
618         struct rt6_info *rt;
619
620         read_lock_bh(&table->tb6_lock);
621         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
622 restart:
623         rt = fn->leaf;
624         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
625         BACKTRACK(net, &fl6->saddr);
626 out:
627         dst_use(&rt->dst, jiffies);
628         read_unlock_bh(&table->tb6_lock);
629         return rt;
630
631 }
632
633 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
634                             const struct in6_addr *saddr, int oif, int strict)
635 {
636         struct flowi6 fl6 = {
637                 .flowi6_oif = oif,
638                 .daddr = *daddr,
639         };
640         struct dst_entry *dst;
641         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
642
643         if (saddr) {
644                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
645                 flags |= RT6_LOOKUP_F_HAS_SADDR;
646         }
647
648         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
649         if (dst->error == 0)
650                 return (struct rt6_info *) dst;
651
652         dst_release(dst);
653
654         return NULL;
655 }
656
657 EXPORT_SYMBOL(rt6_lookup);
658
659 /* ip6_ins_rt is called with FREE table->tb6_lock.
660    It takes new route entry, the addition fails by any reason the
661    route is freed. In any case, if caller does not hold it, it may
662    be destroyed.
663  */
664
665 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
666 {
667         int err;
668         struct fib6_table *table;
669
670         table = rt->rt6i_table;
671         write_lock_bh(&table->tb6_lock);
672         err = fib6_add(&table->tb6_root, rt, info);
673         write_unlock_bh(&table->tb6_lock);
674
675         return err;
676 }
677
678 int ip6_ins_rt(struct rt6_info *rt)
679 {
680         struct nl_info info = {
681                 .nl_net = dev_net(rt->rt6i_dev),
682         };
683         return __ip6_ins_rt(rt, &info);
684 }
685
686 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
687                                       const struct in6_addr *saddr)
688 {
689         struct rt6_info *rt;
690
691         /*
692          *      Clone the route.
693          */
694
695         rt = ip6_rt_copy(ort);
696
697         if (rt) {
698                 struct neighbour *neigh;
699                 int attempts = !in_softirq();
700
701                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
702                         if (rt->rt6i_dst.plen != 128 &&
703                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
704                                 rt->rt6i_flags |= RTF_ANYCAST;
705                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
706                 }
707
708                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
709                 rt->rt6i_dst.plen = 128;
710                 rt->rt6i_flags |= RTF_CACHE;
711                 rt->dst.flags |= DST_HOST;
712
713 #ifdef CONFIG_IPV6_SUBTREES
714                 if (rt->rt6i_src.plen && saddr) {
715                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
716                         rt->rt6i_src.plen = 128;
717                 }
718 #endif
719
720         retry:
721                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
722                 if (IS_ERR(neigh)) {
723                         struct net *net = dev_net(rt->rt6i_dev);
724                         int saved_rt_min_interval =
725                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
726                         int saved_rt_elasticity =
727                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
728
729                         if (attempts-- > 0) {
730                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
731                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
732
733                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
734
735                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
736                                         saved_rt_elasticity;
737                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
738                                         saved_rt_min_interval;
739                                 goto retry;
740                         }
741
742                         if (net_ratelimit())
743                                 printk(KERN_WARNING
744                                        "ipv6: Neighbour table overflow.\n");
745                         dst_free(&rt->dst);
746                         return NULL;
747                 }
748                 rt->rt6i_nexthop = neigh;
749
750         }
751
752         return rt;
753 }
754
755 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
756 {
757         struct rt6_info *rt = ip6_rt_copy(ort);
758         if (rt) {
759                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
760                 rt->rt6i_dst.plen = 128;
761                 rt->rt6i_flags |= RTF_CACHE;
762                 rt->dst.flags |= DST_HOST;
763                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
764         }
765         return rt;
766 }
767
768 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
769                                       struct flowi6 *fl6, int flags)
770 {
771         struct fib6_node *fn;
772         struct rt6_info *rt, *nrt;
773         int strict = 0;
774         int attempts = 3;
775         int err;
776         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
777
778         strict |= flags & RT6_LOOKUP_F_IFACE;
779
780 relookup:
781         read_lock_bh(&table->tb6_lock);
782
783 restart_2:
784         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
785
786 restart:
787         rt = rt6_select(fn, oif, strict | reachable);
788
789         BACKTRACK(net, &fl6->saddr);
790         if (rt == net->ipv6.ip6_null_entry ||
791             rt->rt6i_flags & RTF_CACHE)
792                 goto out;
793
794         dst_hold(&rt->dst);
795         read_unlock_bh(&table->tb6_lock);
796
797         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
798                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
799         else if (!(rt->dst.flags & DST_HOST))
800                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
801         else
802                 goto out2;
803
804         dst_release(&rt->dst);
805         rt = nrt ? : net->ipv6.ip6_null_entry;
806
807         dst_hold(&rt->dst);
808         if (nrt) {
809                 err = ip6_ins_rt(nrt);
810                 if (!err)
811                         goto out2;
812         }
813
814         if (--attempts <= 0)
815                 goto out2;
816
817         /*
818          * Race condition! In the gap, when table->tb6_lock was
819          * released someone could insert this route.  Relookup.
820          */
821         dst_release(&rt->dst);
822         goto relookup;
823
824 out:
825         if (reachable) {
826                 reachable = 0;
827                 goto restart_2;
828         }
829         dst_hold(&rt->dst);
830         read_unlock_bh(&table->tb6_lock);
831 out2:
832         rt->dst.lastuse = jiffies;
833         rt->dst.__use++;
834
835         return rt;
836 }
837
838 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
839                                             struct flowi6 *fl6, int flags)
840 {
841         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
842 }
843
844 void ip6_route_input(struct sk_buff *skb)
845 {
846         const struct ipv6hdr *iph = ipv6_hdr(skb);
847         struct net *net = dev_net(skb->dev);
848         int flags = RT6_LOOKUP_F_HAS_SADDR;
849         struct flowi6 fl6 = {
850                 .flowi6_iif = skb->dev->ifindex,
851                 .daddr = iph->daddr,
852                 .saddr = iph->saddr,
853                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
854                 .flowi6_mark = skb->mark,
855                 .flowi6_proto = iph->nexthdr,
856         };
857
858         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
859                 flags |= RT6_LOOKUP_F_IFACE;
860
861         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
862 }
863
864 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
865                                              struct flowi6 *fl6, int flags)
866 {
867         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
868 }
869
870 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
871                                     struct flowi6 *fl6)
872 {
873         int flags = 0;
874
875         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
876                 flags |= RT6_LOOKUP_F_IFACE;
877
878         if (!ipv6_addr_any(&fl6->saddr))
879                 flags |= RT6_LOOKUP_F_HAS_SADDR;
880         else if (sk)
881                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
882
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
884 }
885
886 EXPORT_SYMBOL(ip6_route_output);
887
888 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
889 {
890         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
891         struct dst_entry *new = NULL;
892
893         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
894         if (rt) {
895                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
896
897                 new = &rt->dst;
898
899                 new->__use = 1;
900                 new->input = dst_discard;
901                 new->output = dst_discard;
902
903                 dst_copy_metrics(new, &ort->dst);
904                 rt->rt6i_idev = ort->rt6i_idev;
905                 if (rt->rt6i_idev)
906                         in6_dev_hold(rt->rt6i_idev);
907                 rt->rt6i_expires = 0;
908
909                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
910                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
911                 rt->rt6i_metric = 0;
912
913                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
914 #ifdef CONFIG_IPV6_SUBTREES
915                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
916 #endif
917
918                 dst_free(new);
919         }
920
921         dst_release(dst_orig);
922         return new ? new : ERR_PTR(-ENOMEM);
923 }
924
925 /*
926  *      Destination cache support functions
927  */
928
929 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
930 {
931         struct rt6_info *rt;
932
933         rt = (struct rt6_info *) dst;
934
935         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
936                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
937                         if (!rt->rt6i_peer)
938                                 rt6_bind_peer(rt, 0);
939                         rt->rt6i_peer_genid = rt6_peer_genid();
940                 }
941                 return dst;
942         }
943         return NULL;
944 }
945
946 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
947 {
948         struct rt6_info *rt = (struct rt6_info *) dst;
949
950         if (rt) {
951                 if (rt->rt6i_flags & RTF_CACHE) {
952                         if (rt6_check_expired(rt)) {
953                                 ip6_del_rt(rt);
954                                 dst = NULL;
955                         }
956                 } else {
957                         dst_release(dst);
958                         dst = NULL;
959                 }
960         }
961         return dst;
962 }
963
964 static void ip6_link_failure(struct sk_buff *skb)
965 {
966         struct rt6_info *rt;
967
968         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
969
970         rt = (struct rt6_info *) skb_dst(skb);
971         if (rt) {
972                 if (rt->rt6i_flags&RTF_CACHE) {
973                         dst_set_expires(&rt->dst, 0);
974                         rt->rt6i_flags |= RTF_EXPIRES;
975                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
976                         rt->rt6i_node->fn_sernum = -1;
977         }
978 }
979
980 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
981 {
982         struct rt6_info *rt6 = (struct rt6_info*)dst;
983
984         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
985                 rt6->rt6i_flags |= RTF_MODIFIED;
986                 if (mtu < IPV6_MIN_MTU) {
987                         u32 features = dst_metric(dst, RTAX_FEATURES);
988                         mtu = IPV6_MIN_MTU;
989                         features |= RTAX_FEATURE_ALLFRAG;
990                         dst_metric_set(dst, RTAX_FEATURES, features);
991                 }
992                 dst_metric_set(dst, RTAX_MTU, mtu);
993         }
994 }
995
996 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
997 {
998         struct net_device *dev = dst->dev;
999         unsigned int mtu = dst_mtu(dst);
1000         struct net *net = dev_net(dev);
1001
1002         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1003
1004         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1005                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1006
1007         /*
1008          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1009          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1010          * IPV6_MAXPLEN is also valid and means: "any MSS,
1011          * rely only on pmtu discovery"
1012          */
1013         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1014                 mtu = IPV6_MAXPLEN;
1015         return mtu;
1016 }
1017
1018 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1019 {
1020         unsigned int mtu = IPV6_MIN_MTU;
1021         struct inet6_dev *idev;
1022
1023         rcu_read_lock();
1024         idev = __in6_dev_get(dst->dev);
1025         if (idev)
1026                 mtu = idev->cnf.mtu6;
1027         rcu_read_unlock();
1028
1029         return mtu;
1030 }
1031
1032 static struct dst_entry *icmp6_dst_gc_list;
1033 static DEFINE_SPINLOCK(icmp6_dst_lock);
1034
1035 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1036                                   struct neighbour *neigh,
1037                                   const struct in6_addr *addr)
1038 {
1039         struct rt6_info *rt;
1040         struct inet6_dev *idev = in6_dev_get(dev);
1041         struct net *net = dev_net(dev);
1042
1043         if (unlikely(idev == NULL))
1044                 return NULL;
1045
1046         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1047         if (unlikely(rt == NULL)) {
1048                 in6_dev_put(idev);
1049                 goto out;
1050         }
1051
1052         if (neigh)
1053                 neigh_hold(neigh);
1054         else {
1055                 neigh = ndisc_get_neigh(dev, addr);
1056                 if (IS_ERR(neigh))
1057                         neigh = NULL;
1058         }
1059
1060         rt->rt6i_idev     = idev;
1061         rt->rt6i_nexthop  = neigh;
1062         atomic_set(&rt->dst.__refcnt, 1);
1063         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1064         rt->dst.output  = ip6_output;
1065
1066         spin_lock_bh(&icmp6_dst_lock);
1067         rt->dst.next = icmp6_dst_gc_list;
1068         icmp6_dst_gc_list = &rt->dst;
1069         spin_unlock_bh(&icmp6_dst_lock);
1070
1071         fib6_force_start_gc(net);
1072
1073 out:
1074         return &rt->dst;
1075 }
1076
1077 int icmp6_dst_gc(void)
1078 {
1079         struct dst_entry *dst, **pprev;
1080         int more = 0;
1081
1082         spin_lock_bh(&icmp6_dst_lock);
1083         pprev = &icmp6_dst_gc_list;
1084
1085         while ((dst = *pprev) != NULL) {
1086                 if (!atomic_read(&dst->__refcnt)) {
1087                         *pprev = dst->next;
1088                         dst_free(dst);
1089                 } else {
1090                         pprev = &dst->next;
1091                         ++more;
1092                 }
1093         }
1094
1095         spin_unlock_bh(&icmp6_dst_lock);
1096
1097         return more;
1098 }
1099
1100 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1101                             void *arg)
1102 {
1103         struct dst_entry *dst, **pprev;
1104
1105         spin_lock_bh(&icmp6_dst_lock);
1106         pprev = &icmp6_dst_gc_list;
1107         while ((dst = *pprev) != NULL) {
1108                 struct rt6_info *rt = (struct rt6_info *) dst;
1109                 if (func(rt, arg)) {
1110                         *pprev = dst->next;
1111                         dst_free(dst);
1112                 } else {
1113                         pprev = &dst->next;
1114                 }
1115         }
1116         spin_unlock_bh(&icmp6_dst_lock);
1117 }
1118
1119 static int ip6_dst_gc(struct dst_ops *ops)
1120 {
1121         unsigned long now = jiffies;
1122         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1123         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1124         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1125         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1126         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1127         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1128         int entries;
1129
1130         entries = dst_entries_get_fast(ops);
1131         if (time_after(rt_last_gc + rt_min_interval, now) &&
1132             entries <= rt_max_size)
1133                 goto out;
1134
1135         net->ipv6.ip6_rt_gc_expire++;
1136         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1137         net->ipv6.ip6_rt_last_gc = now;
1138         entries = dst_entries_get_slow(ops);
1139         if (entries < ops->gc_thresh)
1140                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1141 out:
1142         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1143         return entries > rt_max_size;
1144 }
1145
1146 /* Clean host part of a prefix. Not necessary in radix tree,
1147    but results in cleaner routing tables.
1148
1149    Remove it only when all the things will work!
1150  */
1151
1152 int ip6_dst_hoplimit(struct dst_entry *dst)
1153 {
1154         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1155         if (hoplimit == 0) {
1156                 struct net_device *dev = dst->dev;
1157                 struct inet6_dev *idev;
1158
1159                 rcu_read_lock();
1160                 idev = __in6_dev_get(dev);
1161                 if (idev)
1162                         hoplimit = idev->cnf.hop_limit;
1163                 else
1164                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1165                 rcu_read_unlock();
1166         }
1167         return hoplimit;
1168 }
1169 EXPORT_SYMBOL(ip6_dst_hoplimit);
1170
1171 /*
1172  *
1173  */
1174
1175 int ip6_route_add(struct fib6_config *cfg)
1176 {
1177         int err;
1178         struct net *net = cfg->fc_nlinfo.nl_net;
1179         struct rt6_info *rt = NULL;
1180         struct net_device *dev = NULL;
1181         struct inet6_dev *idev = NULL;
1182         struct fib6_table *table;
1183         int addr_type;
1184
1185         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1186                 return -EINVAL;
1187 #ifndef CONFIG_IPV6_SUBTREES
1188         if (cfg->fc_src_len)
1189                 return -EINVAL;
1190 #endif
1191         if (cfg->fc_ifindex) {
1192                 err = -ENODEV;
1193                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1194                 if (!dev)
1195                         goto out;
1196                 idev = in6_dev_get(dev);
1197                 if (!idev)
1198                         goto out;
1199         }
1200
1201         if (cfg->fc_metric == 0)
1202                 cfg->fc_metric = IP6_RT_PRIO_USER;
1203
1204         table = fib6_new_table(net, cfg->fc_table);
1205         if (table == NULL) {
1206                 err = -ENOBUFS;
1207                 goto out;
1208         }
1209
1210         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1211
1212         if (rt == NULL) {
1213                 err = -ENOMEM;
1214                 goto out;
1215         }
1216
1217         rt->dst.obsolete = -1;
1218         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1219                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1220                                 0;
1221
1222         if (cfg->fc_protocol == RTPROT_UNSPEC)
1223                 cfg->fc_protocol = RTPROT_BOOT;
1224         rt->rt6i_protocol = cfg->fc_protocol;
1225
1226         addr_type = ipv6_addr_type(&cfg->fc_dst);
1227
1228         if (addr_type & IPV6_ADDR_MULTICAST)
1229                 rt->dst.input = ip6_mc_input;
1230         else if (cfg->fc_flags & RTF_LOCAL)
1231                 rt->dst.input = ip6_input;
1232         else
1233                 rt->dst.input = ip6_forward;
1234
1235         rt->dst.output = ip6_output;
1236
1237         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1238         rt->rt6i_dst.plen = cfg->fc_dst_len;
1239         if (rt->rt6i_dst.plen == 128)
1240                rt->dst.flags |= DST_HOST;
1241
1242 #ifdef CONFIG_IPV6_SUBTREES
1243         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1244         rt->rt6i_src.plen = cfg->fc_src_len;
1245 #endif
1246
1247         rt->rt6i_metric = cfg->fc_metric;
1248
1249         /* We cannot add true routes via loopback here,
1250            they would result in kernel looping; promote them to reject routes
1251          */
1252         if ((cfg->fc_flags & RTF_REJECT) ||
1253             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1254                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1255                 /* hold loopback dev/idev if we haven't done so. */
1256                 if (dev != net->loopback_dev) {
1257                         if (dev) {
1258                                 dev_put(dev);
1259                                 in6_dev_put(idev);
1260                         }
1261                         dev = net->loopback_dev;
1262                         dev_hold(dev);
1263                         idev = in6_dev_get(dev);
1264                         if (!idev) {
1265                                 err = -ENODEV;
1266                                 goto out;
1267                         }
1268                 }
1269                 rt->dst.output = ip6_pkt_discard_out;
1270                 rt->dst.input = ip6_pkt_discard;
1271                 rt->dst.error = -ENETUNREACH;
1272                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1273                 goto install_route;
1274         }
1275
1276         if (cfg->fc_flags & RTF_GATEWAY) {
1277                 const struct in6_addr *gw_addr;
1278                 int gwa_type;
1279
1280                 gw_addr = &cfg->fc_gateway;
1281                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1282                 gwa_type = ipv6_addr_type(gw_addr);
1283
1284                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1285                         struct rt6_info *grt;
1286
1287                         /* IPv6 strictly inhibits using not link-local
1288                            addresses as nexthop address.
1289                            Otherwise, router will not able to send redirects.
1290                            It is very good, but in some (rare!) circumstances
1291                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1292                            some exceptions. --ANK
1293                          */
1294                         err = -EINVAL;
1295                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1296                                 goto out;
1297
1298                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1299
1300                         err = -EHOSTUNREACH;
1301                         if (grt == NULL)
1302                                 goto out;
1303                         if (dev) {
1304                                 if (dev != grt->rt6i_dev) {
1305                                         dst_release(&grt->dst);
1306                                         goto out;
1307                                 }
1308                         } else {
1309                                 dev = grt->rt6i_dev;
1310                                 idev = grt->rt6i_idev;
1311                                 dev_hold(dev);
1312                                 in6_dev_hold(grt->rt6i_idev);
1313                         }
1314                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1315                                 err = 0;
1316                         dst_release(&grt->dst);
1317
1318                         if (err)
1319                                 goto out;
1320                 }
1321                 err = -EINVAL;
1322                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1323                         goto out;
1324         }
1325
1326         err = -ENODEV;
1327         if (dev == NULL)
1328                 goto out;
1329
1330         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1331                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1332                         err = -EINVAL;
1333                         goto out;
1334                 }
1335                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1336                 rt->rt6i_prefsrc.plen = 128;
1337         } else
1338                 rt->rt6i_prefsrc.plen = 0;
1339
1340         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1341                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1342                 if (IS_ERR(rt->rt6i_nexthop)) {
1343                         err = PTR_ERR(rt->rt6i_nexthop);
1344                         rt->rt6i_nexthop = NULL;
1345                         goto out;
1346                 }
1347         }
1348
1349         rt->rt6i_flags = cfg->fc_flags;
1350
1351 install_route:
1352         if (cfg->fc_mx) {
1353                 struct nlattr *nla;
1354                 int remaining;
1355
1356                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1357                         int type = nla_type(nla);
1358
1359                         if (type) {
1360                                 if (type > RTAX_MAX) {
1361                                         err = -EINVAL;
1362                                         goto out;
1363                                 }
1364
1365                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1366                         }
1367                 }
1368         }
1369
1370         rt->dst.dev = dev;
1371         rt->rt6i_idev = idev;
1372         rt->rt6i_table = table;
1373
1374         cfg->fc_nlinfo.nl_net = dev_net(dev);
1375
1376         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1377
1378 out:
1379         if (dev)
1380                 dev_put(dev);
1381         if (idev)
1382                 in6_dev_put(idev);
1383         if (rt)
1384                 dst_free(&rt->dst);
1385         return err;
1386 }
1387
1388 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1389 {
1390         int err;
1391         struct fib6_table *table;
1392         struct net *net = dev_net(rt->rt6i_dev);
1393
1394         if (rt == net->ipv6.ip6_null_entry)
1395                 return -ENOENT;
1396
1397         table = rt->rt6i_table;
1398         write_lock_bh(&table->tb6_lock);
1399
1400         err = fib6_del(rt, info);
1401         dst_release(&rt->dst);
1402
1403         write_unlock_bh(&table->tb6_lock);
1404
1405         return err;
1406 }
1407
1408 int ip6_del_rt(struct rt6_info *rt)
1409 {
1410         struct nl_info info = {
1411                 .nl_net = dev_net(rt->rt6i_dev),
1412         };
1413         return __ip6_del_rt(rt, &info);
1414 }
1415
1416 static int ip6_route_del(struct fib6_config *cfg)
1417 {
1418         struct fib6_table *table;
1419         struct fib6_node *fn;
1420         struct rt6_info *rt;
1421         int err = -ESRCH;
1422
1423         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1424         if (table == NULL)
1425                 return err;
1426
1427         read_lock_bh(&table->tb6_lock);
1428
1429         fn = fib6_locate(&table->tb6_root,
1430                          &cfg->fc_dst, cfg->fc_dst_len,
1431                          &cfg->fc_src, cfg->fc_src_len);
1432
1433         if (fn) {
1434                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1435                         if (cfg->fc_ifindex &&
1436                             (rt->rt6i_dev == NULL ||
1437                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1438                                 continue;
1439                         if (cfg->fc_flags & RTF_GATEWAY &&
1440                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1441                                 continue;
1442                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1443                                 continue;
1444                         dst_hold(&rt->dst);
1445                         read_unlock_bh(&table->tb6_lock);
1446
1447                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1448                 }
1449         }
1450         read_unlock_bh(&table->tb6_lock);
1451
1452         return err;
1453 }
1454
1455 /*
1456  *      Handle redirects
1457  */
1458 struct ip6rd_flowi {
1459         struct flowi6 fl6;
1460         struct in6_addr gateway;
1461 };
1462
1463 static struct rt6_info *__ip6_route_redirect(struct net *net,
1464                                              struct fib6_table *table,
1465                                              struct flowi6 *fl6,
1466                                              int flags)
1467 {
1468         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1469         struct rt6_info *rt;
1470         struct fib6_node *fn;
1471
1472         /*
1473          * Get the "current" route for this destination and
1474          * check if the redirect has come from approriate router.
1475          *
1476          * RFC 2461 specifies that redirects should only be
1477          * accepted if they come from the nexthop to the target.
1478          * Due to the way the routes are chosen, this notion
1479          * is a bit fuzzy and one might need to check all possible
1480          * routes.
1481          */
1482
1483         read_lock_bh(&table->tb6_lock);
1484         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1485 restart:
1486         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1487                 /*
1488                  * Current route is on-link; redirect is always invalid.
1489                  *
1490                  * Seems, previous statement is not true. It could
1491                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1492                  * But then router serving it might decide, that we should
1493                  * know truth 8)8) --ANK (980726).
1494                  */
1495                 if (rt6_check_expired(rt))
1496                         continue;
1497                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1498                         continue;
1499                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1500                         continue;
1501                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1502                         continue;
1503                 break;
1504         }
1505
1506         if (!rt)
1507                 rt = net->ipv6.ip6_null_entry;
1508         BACKTRACK(net, &fl6->saddr);
1509 out:
1510         dst_hold(&rt->dst);
1511
1512         read_unlock_bh(&table->tb6_lock);
1513
1514         return rt;
1515 };
1516
1517 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1518                                            const struct in6_addr *src,
1519                                            const struct in6_addr *gateway,
1520                                            struct net_device *dev)
1521 {
1522         int flags = RT6_LOOKUP_F_HAS_SADDR;
1523         struct net *net = dev_net(dev);
1524         struct ip6rd_flowi rdfl = {
1525                 .fl6 = {
1526                         .flowi6_oif = dev->ifindex,
1527                         .daddr = *dest,
1528                         .saddr = *src,
1529                 },
1530         };
1531
1532         ipv6_addr_copy(&rdfl.gateway, gateway);
1533
1534         if (rt6_need_strict(dest))
1535                 flags |= RT6_LOOKUP_F_IFACE;
1536
1537         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1538                                                    flags, __ip6_route_redirect);
1539 }
1540
1541 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1542                   const struct in6_addr *saddr,
1543                   struct neighbour *neigh, u8 *lladdr, int on_link)
1544 {
1545         struct rt6_info *rt, *nrt = NULL;
1546         struct netevent_redirect netevent;
1547         struct net *net = dev_net(neigh->dev);
1548
1549         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1550
1551         if (rt == net->ipv6.ip6_null_entry) {
1552                 if (net_ratelimit())
1553                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1554                                "for redirect target\n");
1555                 goto out;
1556         }
1557
1558         /*
1559          *      We have finally decided to accept it.
1560          */
1561
1562         neigh_update(neigh, lladdr, NUD_STALE,
1563                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1564                      NEIGH_UPDATE_F_OVERRIDE|
1565                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1566                                      NEIGH_UPDATE_F_ISROUTER))
1567                      );
1568
1569         /*
1570          * Redirect received -> path was valid.
1571          * Look, redirects are sent only in response to data packets,
1572          * so that this nexthop apparently is reachable. --ANK
1573          */
1574         dst_confirm(&rt->dst);
1575
1576         /* Duplicate redirect: silently ignore. */
1577         if (neigh == rt->dst.neighbour)
1578                 goto out;
1579
1580         nrt = ip6_rt_copy(rt);
1581         if (nrt == NULL)
1582                 goto out;
1583
1584         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1585         if (on_link)
1586                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1587
1588         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1589         nrt->rt6i_dst.plen = 128;
1590         nrt->dst.flags |= DST_HOST;
1591
1592         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1593         nrt->rt6i_nexthop = neigh_clone(neigh);
1594
1595         if (ip6_ins_rt(nrt))
1596                 goto out;
1597
1598         netevent.old = &rt->dst;
1599         netevent.new = &nrt->dst;
1600         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1601
1602         if (rt->rt6i_flags&RTF_CACHE) {
1603                 ip6_del_rt(rt);
1604                 return;
1605         }
1606
1607 out:
1608         dst_release(&rt->dst);
1609 }
1610
1611 /*
1612  *      Handle ICMP "packet too big" messages
1613  *      i.e. Path MTU discovery
1614  */
1615
1616 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1617                              struct net *net, u32 pmtu, int ifindex)
1618 {
1619         struct rt6_info *rt, *nrt;
1620         int allfrag = 0;
1621 again:
1622         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1623         if (rt == NULL)
1624                 return;
1625
1626         if (rt6_check_expired(rt)) {
1627                 ip6_del_rt(rt);
1628                 goto again;
1629         }
1630
1631         if (pmtu >= dst_mtu(&rt->dst))
1632                 goto out;
1633
1634         if (pmtu < IPV6_MIN_MTU) {
1635                 /*
1636                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1637                  * MTU (1280) and a fragment header should always be included
1638                  * after a node receiving Too Big message reporting PMTU is
1639                  * less than the IPv6 Minimum Link MTU.
1640                  */
1641                 pmtu = IPV6_MIN_MTU;
1642                 allfrag = 1;
1643         }
1644
1645         /* New mtu received -> path was valid.
1646            They are sent only in response to data packets,
1647            so that this nexthop apparently is reachable. --ANK
1648          */
1649         dst_confirm(&rt->dst);
1650
1651         /* Host route. If it is static, it would be better
1652            not to override it, but add new one, so that
1653            when cache entry will expire old pmtu
1654            would return automatically.
1655          */
1656         if (rt->rt6i_flags & RTF_CACHE) {
1657                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1658                 if (allfrag) {
1659                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1660                         features |= RTAX_FEATURE_ALLFRAG;
1661                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1662                 }
1663                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1664                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1665                 goto out;
1666         }
1667
1668         /* Network route.
1669            Two cases are possible:
1670            1. It is connected route. Action: COW
1671            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1672          */
1673         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1674                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1675         else
1676                 nrt = rt6_alloc_clone(rt, daddr);
1677
1678         if (nrt) {
1679                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1680                 if (allfrag) {
1681                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1682                         features |= RTAX_FEATURE_ALLFRAG;
1683                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1684                 }
1685
1686                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1687                  * happened within 5 mins, the recommended timer is 10 mins.
1688                  * Here this route expiration time is set to ip6_rt_mtu_expires
1689                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1690                  * and detecting PMTU increase will be automatically happened.
1691                  */
1692                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1693                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1694
1695                 ip6_ins_rt(nrt);
1696         }
1697 out:
1698         dst_release(&rt->dst);
1699 }
1700
1701 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1702                         struct net_device *dev, u32 pmtu)
1703 {
1704         struct net *net = dev_net(dev);
1705
1706         /*
1707          * RFC 1981 states that a node "MUST reduce the size of the packets it
1708          * is sending along the path" that caused the Packet Too Big message.
1709          * Since it's not possible in the general case to determine which
1710          * interface was used to send the original packet, we update the MTU
1711          * on the interface that will be used to send future packets. We also
1712          * update the MTU on the interface that received the Packet Too Big in
1713          * case the original packet was forced out that interface with
1714          * SO_BINDTODEVICE or similar. This is the next best thing to the
1715          * correct behaviour, which would be to update the MTU on all
1716          * interfaces.
1717          */
1718         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1719         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1720 }
1721
1722 /*
1723  *      Misc support functions
1724  */
1725
1726 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1727 {
1728         struct net *net = dev_net(ort->rt6i_dev);
1729         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1730                                             ort->dst.dev, 0);
1731
1732         if (rt) {
1733                 rt->dst.input = ort->dst.input;
1734                 rt->dst.output = ort->dst.output;
1735
1736                 dst_copy_metrics(&rt->dst, &ort->dst);
1737                 rt->dst.error = ort->dst.error;
1738                 rt->rt6i_idev = ort->rt6i_idev;
1739                 if (rt->rt6i_idev)
1740                         in6_dev_hold(rt->rt6i_idev);
1741                 rt->dst.lastuse = jiffies;
1742                 rt->rt6i_expires = 0;
1743
1744                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1745                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1746                 rt->rt6i_metric = 0;
1747
1748                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1749 #ifdef CONFIG_IPV6_SUBTREES
1750                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1751 #endif
1752                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1753                 rt->rt6i_table = ort->rt6i_table;
1754         }
1755         return rt;
1756 }
1757
1758 #ifdef CONFIG_IPV6_ROUTE_INFO
1759 static struct rt6_info *rt6_get_route_info(struct net *net,
1760                                            const struct in6_addr *prefix, int prefixlen,
1761                                            const struct in6_addr *gwaddr, int ifindex)
1762 {
1763         struct fib6_node *fn;
1764         struct rt6_info *rt = NULL;
1765         struct fib6_table *table;
1766
1767         table = fib6_get_table(net, RT6_TABLE_INFO);
1768         if (table == NULL)
1769                 return NULL;
1770
1771         write_lock_bh(&table->tb6_lock);
1772         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1773         if (!fn)
1774                 goto out;
1775
1776         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1777                 if (rt->rt6i_dev->ifindex != ifindex)
1778                         continue;
1779                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1780                         continue;
1781                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1782                         continue;
1783                 dst_hold(&rt->dst);
1784                 break;
1785         }
1786 out:
1787         write_unlock_bh(&table->tb6_lock);
1788         return rt;
1789 }
1790
1791 static struct rt6_info *rt6_add_route_info(struct net *net,
1792                                            const struct in6_addr *prefix, int prefixlen,
1793                                            const struct in6_addr *gwaddr, int ifindex,
1794                                            unsigned pref)
1795 {
1796         struct fib6_config cfg = {
1797                 .fc_table       = RT6_TABLE_INFO,
1798                 .fc_metric      = IP6_RT_PRIO_USER,
1799                 .fc_ifindex     = ifindex,
1800                 .fc_dst_len     = prefixlen,
1801                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1802                                   RTF_UP | RTF_PREF(pref),
1803                 .fc_nlinfo.pid = 0,
1804                 .fc_nlinfo.nlh = NULL,
1805                 .fc_nlinfo.nl_net = net,
1806         };
1807
1808         ipv6_addr_copy(&cfg.fc_dst, prefix);
1809         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1810
1811         /* We should treat it as a default route if prefix length is 0. */
1812         if (!prefixlen)
1813                 cfg.fc_flags |= RTF_DEFAULT;
1814
1815         ip6_route_add(&cfg);
1816
1817         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1818 }
1819 #endif
1820
1821 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1822 {
1823         struct rt6_info *rt;
1824         struct fib6_table *table;
1825
1826         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1827         if (table == NULL)
1828                 return NULL;
1829
1830         write_lock_bh(&table->tb6_lock);
1831         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1832                 if (dev == rt->rt6i_dev &&
1833                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1834                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1835                         break;
1836         }
1837         if (rt)
1838                 dst_hold(&rt->dst);
1839         write_unlock_bh(&table->tb6_lock);
1840         return rt;
1841 }
1842
1843 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1844                                      struct net_device *dev,
1845                                      unsigned int pref)
1846 {
1847         struct fib6_config cfg = {
1848                 .fc_table       = RT6_TABLE_DFLT,
1849                 .fc_metric      = IP6_RT_PRIO_USER,
1850                 .fc_ifindex     = dev->ifindex,
1851                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1852                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1853                 .fc_nlinfo.pid = 0,
1854                 .fc_nlinfo.nlh = NULL,
1855                 .fc_nlinfo.nl_net = dev_net(dev),
1856         };
1857
1858         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1859
1860         ip6_route_add(&cfg);
1861
1862         return rt6_get_dflt_router(gwaddr, dev);
1863 }
1864
1865 void rt6_purge_dflt_routers(struct net *net)
1866 {
1867         struct rt6_info *rt;
1868         struct fib6_table *table;
1869
1870         /* NOTE: Keep consistent with rt6_get_dflt_router */
1871         table = fib6_get_table(net, RT6_TABLE_DFLT);
1872         if (table == NULL)
1873                 return;
1874
1875 restart:
1876         read_lock_bh(&table->tb6_lock);
1877         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1878                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1879                         dst_hold(&rt->dst);
1880                         read_unlock_bh(&table->tb6_lock);
1881                         ip6_del_rt(rt);
1882                         goto restart;
1883                 }
1884         }
1885         read_unlock_bh(&table->tb6_lock);
1886 }
1887
1888 static void rtmsg_to_fib6_config(struct net *net,
1889                                  struct in6_rtmsg *rtmsg,
1890                                  struct fib6_config *cfg)
1891 {
1892         memset(cfg, 0, sizeof(*cfg));
1893
1894         cfg->fc_table = RT6_TABLE_MAIN;
1895         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1896         cfg->fc_metric = rtmsg->rtmsg_metric;
1897         cfg->fc_expires = rtmsg->rtmsg_info;
1898         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1899         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1900         cfg->fc_flags = rtmsg->rtmsg_flags;
1901
1902         cfg->fc_nlinfo.nl_net = net;
1903
1904         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1905         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1906         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1907 }
1908
1909 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1910 {
1911         struct fib6_config cfg;
1912         struct in6_rtmsg rtmsg;
1913         int err;
1914
1915         switch(cmd) {
1916         case SIOCADDRT:         /* Add a route */
1917         case SIOCDELRT:         /* Delete a route */
1918                 if (!capable(CAP_NET_ADMIN))
1919                         return -EPERM;
1920                 err = copy_from_user(&rtmsg, arg,
1921                                      sizeof(struct in6_rtmsg));
1922                 if (err)
1923                         return -EFAULT;
1924
1925                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1926
1927                 rtnl_lock();
1928                 switch (cmd) {
1929                 case SIOCADDRT:
1930                         err = ip6_route_add(&cfg);
1931                         break;
1932                 case SIOCDELRT:
1933                         err = ip6_route_del(&cfg);
1934                         break;
1935                 default:
1936                         err = -EINVAL;
1937                 }
1938                 rtnl_unlock();
1939
1940                 return err;
1941         }
1942
1943         return -EINVAL;
1944 }
1945
1946 /*
1947  *      Drop the packet on the floor
1948  */
1949
1950 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1951 {
1952         int type;
1953         struct dst_entry *dst = skb_dst(skb);
1954         switch (ipstats_mib_noroutes) {
1955         case IPSTATS_MIB_INNOROUTES:
1956                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1957                 if (type == IPV6_ADDR_ANY) {
1958                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1959                                       IPSTATS_MIB_INADDRERRORS);
1960                         break;
1961                 }
1962                 /* FALLTHROUGH */
1963         case IPSTATS_MIB_OUTNOROUTES:
1964                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1965                               ipstats_mib_noroutes);
1966                 break;
1967         }
1968         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1969         kfree_skb(skb);
1970         return 0;
1971 }
1972
1973 static int ip6_pkt_discard(struct sk_buff *skb)
1974 {
1975         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1976 }
1977
1978 static int ip6_pkt_discard_out(struct sk_buff *skb)
1979 {
1980         skb->dev = skb_dst(skb)->dev;
1981         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1982 }
1983
1984 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1985
1986 static int ip6_pkt_prohibit(struct sk_buff *skb)
1987 {
1988         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1989 }
1990
1991 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1992 {
1993         skb->dev = skb_dst(skb)->dev;
1994         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1995 }
1996
1997 #endif
1998
1999 /*
2000  *      Allocate a dst for local (unicast / anycast) address.
2001  */
2002
2003 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2004                                     const struct in6_addr *addr,
2005                                     int anycast)
2006 {
2007         struct net *net = dev_net(idev->dev);
2008         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2009                                             net->loopback_dev, 0);
2010         struct neighbour *neigh;
2011
2012         if (rt == NULL) {
2013                 if (net_ratelimit())
2014                         pr_warning("IPv6:  Maximum number of routes reached,"
2015                                    " consider increasing route/max_size.\n");
2016                 return ERR_PTR(-ENOMEM);
2017         }
2018
2019         in6_dev_hold(idev);
2020
2021         rt->dst.flags |= DST_HOST;
2022         rt->dst.input = ip6_input;
2023         rt->dst.output = ip6_output;
2024         rt->rt6i_idev = idev;
2025         rt->dst.obsolete = -1;
2026
2027         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2028         if (anycast)
2029                 rt->rt6i_flags |= RTF_ANYCAST;
2030         else
2031                 rt->rt6i_flags |= RTF_LOCAL;
2032         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2033         if (IS_ERR(neigh)) {
2034                 dst_free(&rt->dst);
2035
2036                 return ERR_CAST(neigh);
2037         }
2038         rt->rt6i_nexthop = neigh;
2039
2040         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2041         rt->rt6i_dst.plen = 128;
2042         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2043
2044         atomic_set(&rt->dst.__refcnt, 1);
2045
2046         return rt;
2047 }
2048
2049 int ip6_route_get_saddr(struct net *net,
2050                         struct rt6_info *rt,
2051                         const struct in6_addr *daddr,
2052                         unsigned int prefs,
2053                         struct in6_addr *saddr)
2054 {
2055         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2056         int err = 0;
2057         if (rt->rt6i_prefsrc.plen)
2058                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2059         else
2060                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2061                                          daddr, prefs, saddr);
2062         return err;
2063 }
2064
2065 /* Walker argument for fib6_remove_prefsrc(): describes the deleted address whose prefsrc entries are removed */
2066 struct arg_dev_net_ip {
2067         struct net_device *dev;        /* only routes on this device are touched; NULL matches any device */
2068         struct net *net;               /* namespace; its ip6_null_entry is never modified */
2069         struct in6_addr *addr;         /* address compared against each route's rt6i_prefsrc.addr */
2070 };
2071
2072 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2073 {
2074         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2075         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2076         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2077
2078         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2079             rt != net->ipv6.ip6_null_entry &&
2080             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2081                 /* remove prefsrc entry */
2082                 rt->rt6i_prefsrc.plen = 0;
2083         }
2084         return 0;
2085 }
2086
2087 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2088 {
2089         struct net *net = dev_net(ifp->idev->dev);
2090         struct arg_dev_net_ip adni = {
2091                 .dev = ifp->idev->dev,
2092                 .net = net,
2093                 .addr = &ifp->addr,
2094         };
2095         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2096 }
2097
/* Walker argument for fib6_ifdown(). */
2098 struct arg_dev_net {
2099         struct net_device *dev;        /* device whose routes are selected; NULL selects every device */
2100         struct net *net;               /* namespace; its ip6_null_entry is never selected */
2101 };
2102
2103 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2104 {
2105         const struct arg_dev_net *adn = arg;
2106         const struct net_device *dev = adn->dev;
2107
2108         if ((rt->rt6i_dev == dev || dev == NULL) &&
2109             rt != adn->net->ipv6.ip6_null_entry) {
2110                 RT6_TRACE("deleted by ifdown %p\n", rt);
2111                 return -1;
2112         }
2113         return 0;
2114 }
2115
2116 void rt6_ifdown(struct net *net, struct net_device *dev)
2117 {
2118         struct arg_dev_net adn = {
2119                 .dev = dev,
2120                 .net = net,
2121         };
2122
2123         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2124         icmp6_clean_all(fib6_ifdown, &adn);
2125 }
2126
/* Walker argument for rt6_mtu_change_route(). */
2127 struct rt6_mtu_change_arg
2128 {
2129         struct net_device *dev;        /* device whose MTU changed */
2130         unsigned mtu;                  /* the new MTU value */
2131 };
2132
2133 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2134 {
2135         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2136         struct inet6_dev *idev;
2137
2138         /* In IPv6 pmtu discovery is not optional,
2139            so that RTAX_MTU lock cannot disable it.
2140            We still use this lock to block changes
2141            caused by addrconf/ndisc.
2142         */
2143
2144         idev = __in6_dev_get(arg->dev);
2145         if (idev == NULL)
2146                 return 0;
2147
2148         /* For administrative MTU increase, there is no way to discover
2149            IPv6 PMTU increase, so PMTU increase should be updated here.
2150            Since RFC 1981 doesn't include administrative MTU increase
2151            update PMTU increase is a MUST. (i.e. jumbo frame)
2152          */
2153         /*
2154            If new MTU is less than route PMTU, this new MTU will be the
2155            lowest MTU in the path, update the route PMTU to reflect PMTU
2156            decreases; if new MTU is greater than route PMTU, and the
2157            old MTU is the lowest MTU in the path, update the route PMTU
2158            to reflect the increase. In this case if the other nodes' MTU
2159            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2160            PMTU discouvery.
2161          */
2162         if (rt->rt6i_dev == arg->dev &&
2163             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2164             (dst_mtu(&rt->dst) >= arg->mtu ||
2165              (dst_mtu(&rt->dst) < arg->mtu &&
2166               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2167                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2168         }
2169         return 0;
2170 }
2171
2172 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2173 {
2174         struct rt6_mtu_change_arg arg = {
2175                 .dev = dev,
2176                 .mtu = mtu,
2177         };
2178
2179         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2180 }
2181
2182 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2183         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2184         [RTA_OIF]               = { .type = NLA_U32 },
2185         [RTA_IIF]               = { .type = NLA_U32 },
2186         [RTA_PRIORITY]          = { .type = NLA_U32 },
2187         [RTA_METRICS]           = { .type = NLA_NESTED },
2188 };
2189
2190 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2191                               struct fib6_config *cfg)
2192 {
2193         struct rtmsg *rtm;
2194         struct nlattr *tb[RTA_MAX+1];
2195         int err;
2196
2197         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2198         if (err < 0)
2199                 goto errout;
2200
2201         err = -EINVAL;
2202         rtm = nlmsg_data(nlh);
2203         memset(cfg, 0, sizeof(*cfg));
2204
2205         cfg->fc_table = rtm->rtm_table;
2206         cfg->fc_dst_len = rtm->rtm_dst_len;
2207         cfg->fc_src_len = rtm->rtm_src_len;
2208         cfg->fc_flags = RTF_UP;
2209         cfg->fc_protocol = rtm->rtm_protocol;
2210
2211         if (rtm->rtm_type == RTN_UNREACHABLE)
2212                 cfg->fc_flags |= RTF_REJECT;
2213
2214         if (rtm->rtm_type == RTN_LOCAL)
2215                 cfg->fc_flags |= RTF_LOCAL;
2216
2217         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2218         cfg->fc_nlinfo.nlh = nlh;
2219         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2220
2221         if (tb[RTA_GATEWAY]) {
2222                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2223                 cfg->fc_flags |= RTF_GATEWAY;
2224         }
2225
2226         if (tb[RTA_DST]) {
2227                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2228
2229                 if (nla_len(tb[RTA_DST]) < plen)
2230                         goto errout;
2231
2232                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2233         }
2234
2235         if (tb[RTA_SRC]) {
2236                 int plen = (rtm->rtm_src_len + 7) >> 3;
2237
2238                 if (nla_len(tb[RTA_SRC]) < plen)
2239                         goto errout;
2240
2241                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2242         }
2243
2244         if (tb[RTA_PREFSRC])
2245                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2246
2247         if (tb[RTA_OIF])
2248                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2249
2250         if (tb[RTA_PRIORITY])
2251                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2252
2253         if (tb[RTA_METRICS]) {
2254                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2255                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2256         }
2257
2258         if (tb[RTA_TABLE])
2259                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2260
2261         err = 0;
2262 errout:
2263         return err;
2264 }
2265
2266 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2267 {
2268         struct fib6_config cfg;
2269         int err;
2270
2271         err = rtm_to_fib6_config(skb, nlh, &cfg);
2272         if (err < 0)
2273                 return err;
2274
2275         return ip6_route_del(&cfg);
2276 }
2277
2278 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2279 {
2280         struct fib6_config cfg;
2281         int err;
2282
2283         err = rtm_to_fib6_config(skb, nlh, &cfg);
2284         if (err < 0)
2285                 return err;
2286
2287         return ip6_route_add(&cfg);
2288 }
2289
2290 static inline size_t rt6_nlmsg_size(void)
2291 {
2292         return NLMSG_ALIGN(sizeof(struct rtmsg))
2293                + nla_total_size(16) /* RTA_SRC */
2294                + nla_total_size(16) /* RTA_DST */
2295                + nla_total_size(16) /* RTA_GATEWAY */
2296                + nla_total_size(16) /* RTA_PREFSRC */
2297                + nla_total_size(4) /* RTA_TABLE */
2298                + nla_total_size(4) /* RTA_IIF */
2299                + nla_total_size(4) /* RTA_OIF */
2300                + nla_total_size(4) /* RTA_PRIORITY */
2301                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2302                + nla_total_size(sizeof(struct rta_cacheinfo));
2303 }
2304
/*
 *      Append one route message describing rt to skb.
 *
 *      dst / src:  when non-NULL they are reported as /128 RTA_DST / RTA_SRC
 *                  instead of the route's own keys (src only with
 *                  CONFIG_IPV6_SUBTREES).
 *      iif:        non-zero adds RTA_IIF (or, with CONFIG_IPV6_MROUTE and a
 *                  multicast destination, defers to ip6mr_get_route()).
 *      prefix:     non-zero restricts output to RTF_PREFIX_RT routes.
 *      type, pid, seq, flags: passed to nlmsg_put() for the message header.
 *
 *      Returns 1 without writing anything when prefix is set and rt is not
 *      a prefix route, 0 when ip6mr_get_route() returned 0 with !nowait,
 *      -EMSGSIZE when the header or an attribute could not be added (the
 *      partial message is cancelled unless nlmsg_put() itself failed),
 *      otherwise the result of nlmsg_end().
 *
 *      The NLA_PUT* macros are used with the nla_put_failure label below.
 */
2305 static int rt6_fill_node(struct net *net,
2306                          struct sk_buff *skb, struct rt6_info *rt,
2307                          struct in6_addr *dst, struct in6_addr *src,
2308                          int iif, int type, u32 pid, u32 seq,
2309                          int prefix, int nowait, unsigned int flags)
2310 {
2311         struct rtmsg *rtm;
2312         struct nlmsghdr *nlh;
2313         long expires;
2314         u32 table;
2315
2316         if (prefix) {   /* user wants prefix routes only */
2317                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2318                         /* success since this is not a prefix route */
2319                         return 1;
2320                 }
2321         }
2322
2323         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2324         if (nlh == NULL)
2325                 return -EMSGSIZE;
2326
2327         rtm = nlmsg_data(nlh);
2328         rtm->rtm_family = AF_INET6;
2329         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2330         rtm->rtm_src_len = rt->rt6i_src.plen;
2331         rtm->rtm_tos = 0;
2332         if (rt->rt6i_table)
2333                 table = rt->rt6i_table->tb6_id;
2334         else
2335                 table = RT6_TABLE_UNSPEC;
        /* the table id goes into both the header field and RTA_TABLE */
2336         rtm->rtm_table = table;
2337         NLA_PUT_U32(skb, RTA_TABLE, table);
        /* route type: reject flag first, then local flag or loopback device */
2338         if (rt->rt6i_flags&RTF_REJECT)
2339                 rtm->rtm_type = RTN_UNREACHABLE;
2340         else if (rt->rt6i_flags&RTF_LOCAL)
2341                 rtm->rtm_type = RTN_LOCAL;
2342         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2343                 rtm->rtm_type = RTN_LOCAL;
2344         else
2345                 rtm->rtm_type = RTN_UNICAST;
2346         rtm->rtm_flags = 0;
2347         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* the stored protocol is overridden when one of these flags is set */
2348         rtm->rtm_protocol = rt->rt6i_protocol;
2349         if (rt->rt6i_flags&RTF_DYNAMIC)
2350                 rtm->rtm_protocol = RTPROT_REDIRECT;
2351         else if (rt->rt6i_flags & RTF_ADDRCONF)
2352                 rtm->rtm_protocol = RTPROT_KERNEL;
2353         else if (rt->rt6i_flags&RTF_DEFAULT)
2354                 rtm->rtm_protocol = RTPROT_RA;
2355
2356         if (rt->rt6i_flags&RTF_CACHE)
2357                 rtm->rtm_flags |= RTM_F_CLONED;
2358
2359         if (dst) {
2360                 NLA_PUT(skb, RTA_DST, 16, dst);
2361                 rtm->rtm_dst_len = 128;
2362         } else if (rtm->rtm_dst_len)
2363                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2364 #ifdef CONFIG_IPV6_SUBTREES
2365         if (src) {
2366                 NLA_PUT(skb, RTA_SRC, 16, src);
2367                 rtm->rtm_src_len = 128;
2368         } else if (rtm->rtm_src_len)
2369                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2370 #endif
2371         if (iif) {
2372 #ifdef CONFIG_IPV6_MROUTE
2373                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2374                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2375                         if (err <= 0) {
2376                                 if (!nowait) {
2377                                         if (err == 0)
2378                                                 return 0;
2379                                         goto nla_put_failure;
2380                                 } else {
2381                                         if (err == -EMSGSIZE)
2382                                                 goto nla_put_failure;
2383                                 }
2384                         }
2385                 } else
2386 #endif
2387                         NLA_PUT_U32(skb, RTA_IIF, iif);
2388         } else if (dst) {
2389                 struct in6_addr saddr_buf;
2390                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2391                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2392         }
2393
        /*
         * NOTE(review): when dst is set, iif is 0 and rt6i_prefsrc.plen is
         * non-zero, the branch above already emitted RTA_PREFSRC, so the
         * attribute appears to be added twice - confirm whether intended.
         */
2394         if (rt->rt6i_prefsrc.plen) {
2395                 struct in6_addr saddr_buf;
2396                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2397                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2398         }
2399
2400         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2401                 goto nla_put_failure;
2402
2403         if (rt->dst.neighbour)
2404                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2405
2406         if (rt->dst.dev)
2407                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2408
2409         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2410
        /* remaining lifetime in jiffies: 0 if not expiring, capped at INT_MAX */
2411         if (!(rt->rt6i_flags & RTF_EXPIRES))
2412                 expires = 0;
2413         else if (rt->rt6i_expires - jiffies < INT_MAX)
2414                 expires = rt->rt6i_expires - jiffies;
2415         else
2416                 expires = INT_MAX;
2417
2418         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2419                                expires, rt->dst.error) < 0)
2420                 goto nla_put_failure;
2421
2422         return nlmsg_end(skb, nlh);
2423
2424 nla_put_failure:
2425         nlmsg_cancel(skb, nlh);
2426         return -EMSGSIZE;
2427 }
2428
2429 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2430 {
2431         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2432         int prefix;
2433
2434         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2435                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2436                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2437         } else
2438                 prefix = 0;
2439
2440         return rt6_fill_node(arg->net,
2441                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2442                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2443                      prefix, 0, NLM_F_MULTI);
2444 }
2445
2446 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2447 {
2448         struct net *net = sock_net(in_skb->sk);
2449         struct nlattr *tb[RTA_MAX+1];
2450         struct rt6_info *rt;
2451         struct sk_buff *skb;
2452         struct rtmsg *rtm;
2453         struct flowi6 fl6;
2454         int err, iif = 0;
2455
2456         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2457         if (err < 0)
2458                 goto errout;
2459
2460         err = -EINVAL;
2461         memset(&fl6, 0, sizeof(fl6));
2462
2463         if (tb[RTA_SRC]) {
2464                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2465                         goto errout;
2466
2467                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2468         }
2469
2470         if (tb[RTA_DST]) {
2471                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2472                         goto errout;
2473
2474                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2475         }
2476
2477         if (tb[RTA_IIF])
2478                 iif = nla_get_u32(tb[RTA_IIF]);
2479
2480         if (tb[RTA_OIF])
2481                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2482
2483         if (iif) {
2484                 struct net_device *dev;
2485                 dev = __dev_get_by_index(net, iif);
2486                 if (!dev) {
2487                         err = -ENODEV;
2488                         goto errout;
2489                 }
2490         }
2491
2492         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2493         if (skb == NULL) {
2494                 err = -ENOBUFS;
2495                 goto errout;
2496         }
2497
2498         /* Reserve room for dummy headers, this skb can pass
2499            through good chunk of routing engine.
2500          */
2501         skb_reset_mac_header(skb);
2502         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2503
2504         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2505         skb_dst_set(skb, &rt->dst);
2506
2507         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2508                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2509                             nlh->nlmsg_seq, 0, 0, 0);
2510         if (err < 0) {
2511                 kfree_skb(skb);
2512                 goto errout;
2513         }
2514
2515         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2516 errout:
2517         return err;
2518 }
2519
2520 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2521 {
2522         struct sk_buff *skb;
2523         struct net *net = info->nl_net;
2524         u32 seq;
2525         int err;
2526
2527         err = -ENOBUFS;
2528         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2529
2530         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2531         if (skb == NULL)
2532                 goto errout;
2533
2534         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2535                                 event, info->pid, seq, 0, 0, 0);
2536         if (err < 0) {
2537                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2538                 WARN_ON(err == -EMSGSIZE);
2539                 kfree_skb(skb);
2540                 goto errout;
2541         }
2542         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2543                     info->nlh, gfp_any());
2544         return;
2545 errout:
2546         if (err < 0)
2547                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2548 }
2549
2550 static int ip6_route_dev_notify(struct notifier_block *this,
2551                                 unsigned long event, void *data)
2552 {
2553         struct net_device *dev = (struct net_device *)data;
2554         struct net *net = dev_net(dev);
2555
2556         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2557                 net->ipv6.ip6_null_entry->dst.dev = dev;
2558                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2559 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2560                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2561                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2562                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2563                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2564 #endif
2565         }
2566
2567         return NOTIFY_OK;
2568 }
2569
2570 /*
2571  *      /proc
2572  */
2573
2574 #ifdef CONFIG_PROC_FS
2575
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the /proc code below uses seq_file) - confirm whether it is
 * still needed. Field meanings are not established by the visible code.
 */
2576 struct rt6_proc_arg
2577 {
2578         char *buffer;
2579         int offset;
2580         int length;
2581         int skip;
2582         int len;
2583 };
2584
2585 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2586 {
2587         struct seq_file *m = p_arg;
2588
2589         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2590
2591 #ifdef CONFIG_IPV6_SUBTREES
2592         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2593 #else
2594         seq_puts(m, "00000000000000000000000000000000 00 ");
2595 #endif
2596
2597         if (rt->rt6i_nexthop) {
2598                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2599         } else {
2600                 seq_puts(m, "00000000000000000000000000000000");
2601         }
2602         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2603                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2604                    rt->dst.__use, rt->rt6i_flags,
2605                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2606         return 0;
2607 }
2608
2609 static int ipv6_route_show(struct seq_file *m, void *v)
2610 {
2611         struct net *net = (struct net *)m->private;
2612         fib6_clean_all(net, rt6_info_route, 0, m);
2613         return 0;
2614 }
2615
/*
 * open() handler for the "ipv6_route" proc entry: binds ipv6_route_show()
 * as the show routine through single_open_net() and returns its result.
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, ipv6_route_show);
}
2620
/*
 * File operations for the "ipv6_route" proc entry (registered in
 * ip6_route_net_init()); opened with ipv6_route_open() and released with
 * single_release_net() to pair with single_open_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release_net,
};
2628
2629 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2630 {
2631         struct net *net = (struct net *)seq->private;
2632         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2633                    net->ipv6.rt6_stats->fib_nodes,
2634                    net->ipv6.rt6_stats->fib_route_nodes,
2635                    net->ipv6.rt6_stats->fib_rt_alloc,
2636                    net->ipv6.rt6_stats->fib_rt_entries,
2637                    net->ipv6.rt6_stats->fib_rt_cache,
2638                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2639                    net->ipv6.rt6_stats->fib_discarded_routes);
2640
2641         return 0;
2642 }
2643
/*
 * open() handler for the "rt6_stats" proc entry: binds rt6_stats_seq_show()
 * as the show routine through single_open_net() and returns its result.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
2648
/*
 * File operations for the "rt6_stats" proc entry (registered in
 * ip6_route_net_init()); opened with rt6_stats_seq_open() and released with
 * single_release_net() to pair with single_open_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
2656 #endif  /* CONFIG_PROC_FS */
2657
2658 #ifdef CONFIG_SYSCTL
2659
2660 static
2661 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2662                               void __user *buffer, size_t *lenp, loff_t *ppos)
2663 {
2664         struct net *net;
2665         int delay;
2666         if (!write)
2667                 return -EINVAL;
2668
2669         net = (struct net *)ctl->extra1;
2670         delay = net->ipv6.sysctl.flush_delay;
2671         proc_dointvec(ctl, write, buffer, lenp, ppos);
2672         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2673         return 0;
2674 }
2675
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and then re-points the
 * .data member of each entry at the corresponding per-namespace field BY
 * INDEX (0..9), so the order of the entries here must stay in sync with
 * the assignments in that function.  The .data values below refer to
 * init_net / the dst_ops template and are what the copy starts out with.
 */
ctl_table ipv6_route_table_template[] = {
        {
                /* [0] write-only (0200); handled by ipv6_sysctl_rtcache_flush() */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                /* [1] */
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [2] */
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [3] same variable as [9], handled by proc_dointvec_jiffies */
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [4] */
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [5] */
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [6] */
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [7] */
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [8] */
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [9] same variable as [3], handled by proc_dointvec_ms_jiffies */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }     /* terminating empty entry */
};
2749
2750 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2751 {
2752         struct ctl_table *table;
2753
2754         table = kmemdup(ipv6_route_table_template,
2755                         sizeof(ipv6_route_table_template),
2756                         GFP_KERNEL);
2757
2758         if (table) {
2759                 table[0].data = &net->ipv6.sysctl.flush_delay;
2760                 table[0].extra1 = net;
2761                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2762                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2763                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2764                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2765                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2766                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2767                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2768                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2769                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2770         }
2771
2772         return table;
2773 }
2774 #endif
2775
2776 static int __net_init ip6_route_net_init(struct net *net)
2777 {
2778         int ret = -ENOMEM;
2779
2780         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2781                sizeof(net->ipv6.ip6_dst_ops));
2782
2783         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2784                 goto out_ip6_dst_ops;
2785
2786         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2787                                            sizeof(*net->ipv6.ip6_null_entry),
2788                                            GFP_KERNEL);
2789         if (!net->ipv6.ip6_null_entry)
2790                 goto out_ip6_dst_entries;
2791         net->ipv6.ip6_null_entry->dst.path =
2792                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2793         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2794         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2795                          ip6_template_metrics, true);
2796
2797 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2798         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2799                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2800                                                GFP_KERNEL);
2801         if (!net->ipv6.ip6_prohibit_entry)
2802                 goto out_ip6_null_entry;
2803         net->ipv6.ip6_prohibit_entry->dst.path =
2804                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2805         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2806         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2807                          ip6_template_metrics, true);
2808
2809         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2810                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2811                                                GFP_KERNEL);
2812         if (!net->ipv6.ip6_blk_hole_entry)
2813                 goto out_ip6_prohibit_entry;
2814         net->ipv6.ip6_blk_hole_entry->dst.path =
2815                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2816         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2817         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2818                          ip6_template_metrics, true);
2819 #endif
2820
2821         net->ipv6.sysctl.flush_delay = 0;
2822         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2823         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2824         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2825         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2826         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2827         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2828         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2829
2830 #ifdef CONFIG_PROC_FS
2831         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2832         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2833 #endif
2834         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2835
2836         ret = 0;
2837 out:
2838         return ret;
2839
2840 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841 out_ip6_prohibit_entry:
2842         kfree(net->ipv6.ip6_prohibit_entry);
2843 out_ip6_null_entry:
2844         kfree(net->ipv6.ip6_null_entry);
2845 #endif
2846 out_ip6_dst_entries:
2847         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2848 out_ip6_dst_ops:
2849         goto out;
2850 }
2851
2852 static void __net_exit ip6_route_net_exit(struct net *net)
2853 {
2854 #ifdef CONFIG_PROC_FS
2855         proc_net_remove(net, "ipv6_route");
2856         proc_net_remove(net, "rt6_stats");
2857 #endif
2858         kfree(net->ipv6.ip6_null_entry);
2859 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2860         kfree(net->ipv6.ip6_prohibit_entry);
2861         kfree(net->ipv6.ip6_blk_hole_entry);
2862 #endif
2863         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2864 }
2865
/*
 * Per-network-namespace hooks for the IPv6 routing state; registered with
 * register_pernet_subsys() in ip6_route_init().
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
2870
/*
 * Netdevice notifier block that routes device events to
 * ip6_route_dev_notify(); registered in ip6_route_init() and unregistered
 * in ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = 0,
};
2875
2876 int __init ip6_route_init(void)
2877 {
2878         int ret;
2879
2880         ret = -ENOMEM;
2881         ip6_dst_ops_template.kmem_cachep =
2882                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2883                                   SLAB_HWCACHE_ALIGN, NULL);
2884         if (!ip6_dst_ops_template.kmem_cachep)
2885                 goto out;
2886
2887         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2888         if (ret)
2889                 goto out_kmem_cache;
2890
2891         ret = register_pernet_subsys(&ip6_route_net_ops);
2892         if (ret)
2893                 goto out_dst_entries;
2894
2895         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2896
2897         /* Registering of the loopback is done before this portion of code,
2898          * the loopback reference in rt6_info will not be taken, do it
2899          * manually for init_net */
2900         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2901         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2902   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2903         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2904         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2905         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2906         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2907   #endif
2908         ret = fib6_init();
2909         if (ret)
2910                 goto out_register_subsys;
2911
2912         ret = xfrm6_init();
2913         if (ret)
2914                 goto out_fib6_init;
2915
2916         ret = fib6_rules_init();
2917         if (ret)
2918                 goto xfrm6_init;
2919
2920         ret = -ENOBUFS;
2921         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2922             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2923             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2924                 goto fib6_rules_init;
2925
2926         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2927         if (ret)
2928                 goto fib6_rules_init;
2929
2930 out:
2931         return ret;
2932
2933 fib6_rules_init:
2934         fib6_rules_cleanup();
2935 xfrm6_init:
2936         xfrm6_fini();
2937 out_fib6_init:
2938         fib6_gc_cleanup();
2939 out_register_subsys:
2940         unregister_pernet_subsys(&ip6_route_net_ops);
2941 out_dst_entries:
2942         dst_entries_destroy(&ip6_dst_blackhole_ops);
2943 out_kmem_cache:
2944         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2945         goto out;
2946 }
2947
/*
 * Tear down the IPv6 routing subsystem.  The calls mirror the error
 * unwinding of ip6_route_init(): the netdevice notifier is unregistered
 * first, then fib6 rules, xfrm6, fib6 and the per-namespace operations are
 * shut down, and finally the blackhole dst entry counter and the dst slab
 * cache are destroyed.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}