sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug level for this file. RDBG() and RT6_TRACE() only produce output
 * (through printk) when RT6_DEBUG is 3 or more; below that they expand
 * to nothing / an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
/*
 * Forward declarations for helpers and callbacks that are referenced
 * before their definitions (for instance by the dst_ops tables and the
 * route templates further down in this file).
 */
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
                                    const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

/* Packet handlers installed as .input/.output of the null route template. */
static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route-information helpers; only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif
100
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 {
103         struct rt6_info *rt = (struct rt6_info *) dst;
104         struct inet_peer *peer;
105         u32 *p = NULL;
106
107         if (!(rt->dst.flags & DST_HOST))
108                 return NULL;
109
110         if (!rt->rt6i_peer)
111                 rt6_bind_peer(rt, 1);
112
113         peer = rt->rt6i_peer;
114         if (peer) {
115                 u32 *old_p = __DST_METRICS_PTR(old);
116                 unsigned long prev, new;
117
118                 p = peer->metrics;
119                 if (inet_metrics_new(peer))
120                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
121
122                 new = (unsigned long) p;
123                 prev = cmpxchg(&dst->_metrics, old, new);
124
125                 if (prev != old) {
126                         p = __DST_METRICS_PTR(prev);
127                         if (prev & DST_METRICS_READ_ONLY)
128                                 p = NULL;
129                 }
130         }
131         return p;
132 }
133
134 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
135 {
136         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
137 }
138
139 static struct dst_ops ip6_dst_ops_template = {
140         .family                 =       AF_INET6,
141         .protocol               =       cpu_to_be16(ETH_P_IPV6),
142         .gc                     =       ip6_dst_gc,
143         .gc_thresh              =       1024,
144         .check                  =       ip6_dst_check,
145         .default_advmss         =       ip6_default_advmss,
146         .default_mtu            =       ip6_default_mtu,
147         .cow_metrics            =       ipv6_cow_metrics,
148         .destroy                =       ip6_dst_destroy,
149         .ifdown                 =       ip6_dst_ifdown,
150         .negative_advice        =       ip6_negative_advice,
151         .link_failure           =       ip6_link_failure,
152         .update_pmtu            =       ip6_rt_update_pmtu,
153         .local_out              =       __ip6_local_out,
154         .neigh_lookup           =       ip6_neigh_lookup,
155 };
156
/* default_mtu callback of ip6_dst_blackhole_ops: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        const unsigned int blackhole_mtu = 0;

        return blackhole_mtu;
}
161
162 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
163 {
164 }
165
166 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
167                                          unsigned long old)
168 {
169         return NULL;
170 }
171
172 static struct dst_ops ip6_dst_blackhole_ops = {
173         .family                 =       AF_INET6,
174         .protocol               =       cpu_to_be16(ETH_P_IPV6),
175         .destroy                =       ip6_dst_destroy,
176         .check                  =       ip6_dst_check,
177         .default_mtu            =       ip6_blackhole_default_mtu,
178         .default_advmss         =       ip6_default_advmss,
179         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
180         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
181         .neigh_lookup           =       ip6_neigh_lookup,
182 };
183
184 static const u32 ip6_template_metrics[RTAX_MAX] = {
185         [RTAX_HOPLIMIT - 1] = 255,
186 };
187
188 static struct rt6_info ip6_null_entry_template = {
189         .dst = {
190                 .__refcnt       = ATOMIC_INIT(1),
191                 .__use          = 1,
192                 .obsolete       = -1,
193                 .error          = -ENETUNREACH,
194                 .input          = ip6_pkt_discard,
195                 .output         = ip6_pkt_discard_out,
196         },
197         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
198         .rt6i_protocol  = RTPROT_KERNEL,
199         .rt6i_metric    = ~(u32) 0,
200         .rt6i_ref       = ATOMIC_INIT(1),
201 };
202
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers for the prohibit template; declared here ahead of their definitions. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject route whose dst carries
 * -EACCES and uses the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
        .dst = {
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
                .error          = -EACCES,
                .obsolete       = -1,
                .__use          = 1,
                .__refcnt       = ATOMIC_INIT(1),
        },
};

/*
 * Template for the "blackhole" route: a reject route whose dst carries
 * -EINVAL and hands packets to dst_discard() in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
        .dst = {
                .input          = dst_discard,
                .output         = dst_discard,
                .error          = -EINVAL,
                .obsolete       = -1,
                .__use          = 1,
                .__refcnt       = ATOMIC_INIT(1),
        },
};

#endif
239
240 /* allocate dst with ip6_dst_ops */
241 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
242                                              struct net_device *dev,
243                                              int flags)
244 {
245         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
246
247         if (rt != NULL)
248                 memset(&rt->rt6i_table, 0,
249                         sizeof(*rt) - sizeof(struct dst_entry));
250
251         return rt;
252 }
253
254 static void ip6_dst_destroy(struct dst_entry *dst)
255 {
256         struct rt6_info *rt = (struct rt6_info *)dst;
257         struct inet6_dev *idev = rt->rt6i_idev;
258         struct inet_peer *peer = rt->rt6i_peer;
259
260         if (!(rt->dst.flags & DST_HOST))
261                 dst_destroy_metrics_generic(dst);
262
263         if (idev != NULL) {
264                 rt->rt6i_idev = NULL;
265                 in6_dev_put(idev);
266         }
267         if (peer) {
268                 rt->rt6i_peer = NULL;
269                 inet_putpeer(peer);
270         }
271 }
272
273 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
274
275 static u32 rt6_peer_genid(void)
276 {
277         return atomic_read(&__rt6_peer_genid);
278 }
279
280 void rt6_bind_peer(struct rt6_info *rt, int create)
281 {
282         struct inet_peer *peer;
283
284         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
285         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
286                 inet_putpeer(peer);
287         else
288                 rt->rt6i_peer_genid = rt6_peer_genid();
289 }
290
291 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
292                            int how)
293 {
294         struct rt6_info *rt = (struct rt6_info *)dst;
295         struct inet6_dev *idev = rt->rt6i_idev;
296         struct net_device *loopback_dev =
297                 dev_net(dev)->loopback_dev;
298
299         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
300                 struct inet6_dev *loopback_idev =
301                         in6_dev_get(loopback_dev);
302                 if (loopback_idev != NULL) {
303                         rt->rt6i_idev = loopback_idev;
304                         in6_dev_put(idev);
305                 }
306         }
307 }
308
309 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
310 {
311         return (rt->rt6i_flags & RTF_EXPIRES) &&
312                 time_after(jiffies, rt->rt6i_expires);
313 }
314
315 static inline int rt6_need_strict(const struct in6_addr *daddr)
316 {
317         return ipv6_addr_type(daddr) &
318                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
319 }
320
321 /*
322  *      Route lookup. Any table->tb6_lock is implied.
323  */
324
325 static inline struct rt6_info *rt6_device_match(struct net *net,
326                                                     struct rt6_info *rt,
327                                                     const struct in6_addr *saddr,
328                                                     int oif,
329                                                     int flags)
330 {
331         struct rt6_info *local = NULL;
332         struct rt6_info *sprt;
333
334         if (!oif && ipv6_addr_any(saddr))
335                 goto out;
336
337         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
338                 struct net_device *dev = sprt->rt6i_dev;
339
340                 if (oif) {
341                         if (dev->ifindex == oif)
342                                 return sprt;
343                         if (dev->flags & IFF_LOOPBACK) {
344                                 if (sprt->rt6i_idev == NULL ||
345                                     sprt->rt6i_idev->dev->ifindex != oif) {
346                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
347                                                 continue;
348                                         if (local && (!oif ||
349                                                       local->rt6i_idev->dev->ifindex == oif))
350                                                 continue;
351                                 }
352                                 local = sprt;
353                         }
354                 } else {
355                         if (ipv6_chk_addr(net, saddr, dev,
356                                           flags & RT6_LOOKUP_F_IFACE))
357                                 return sprt;
358                 }
359         }
360
361         if (oif) {
362                 if (local)
363                         return local;
364
365                 if (flags & RT6_LOOKUP_F_IFACE)
366                         return net->ipv6.ip6_null_entry;
367         }
368 out:
369         return rt;
370 }
371
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Send a neighbour solicitation towards the route's neighbour when that
 * neighbour is not in a NUD_VALID state and more than the device's
 * rtr_probe_interval has passed since neigh->updated.
 *
 * @rt may be NULL, in which case nothing happens.  Without
 * CONFIG_IPV6_ROUTER_PREF this function is an empty stub.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        rcu_read_lock();
        neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
        /* Unlocked fast path: no neighbour, or already valid. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;
        read_lock_bh(&neigh->lock);
        /* Re-check the state under the lock and apply the rate limit. */
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                /*
                 * NOTE(review): neigh->updated is written while only the
                 * read side of neigh->lock is held - confirm this is
                 * intended.
                 */
                neigh->updated = jiffies;
                read_unlock_bh(&neigh->lock);

                /* The lock is dropped before the solicitation is sent. */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else {
                read_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
/* Router probing is compiled out: nothing to do. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
411
412 /*
413  * Default Router Selection (RFC 2461 6.3.6)
414  */
415 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
416 {
417         struct net_device *dev = rt->rt6i_dev;
418         if (!oif || dev->ifindex == oif)
419                 return 2;
420         if ((dev->flags & IFF_LOOPBACK) &&
421             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
422                 return 1;
423         return 0;
424 }
425
426 static inline int rt6_check_neigh(struct rt6_info *rt)
427 {
428         struct neighbour *neigh;
429         int m;
430
431         rcu_read_lock();
432         neigh = dst_get_neighbour(&rt->dst);
433         if (rt->rt6i_flags & RTF_NONEXTHOP ||
434             !(rt->rt6i_flags & RTF_GATEWAY))
435                 m = 1;
436         else if (neigh) {
437                 read_lock_bh(&neigh->lock);
438                 if (neigh->nud_state & NUD_VALID)
439                         m = 2;
440 #ifdef CONFIG_IPV6_ROUTER_PREF
441                 else if (neigh->nud_state & NUD_FAILED)
442                         m = 0;
443 #endif
444                 else
445                         m = 1;
446                 read_unlock_bh(&neigh->lock);
447         } else
448                 m = 0;
449         rcu_read_unlock();
450         return m;
451 }
452
453 static int rt6_score_route(struct rt6_info *rt, int oif,
454                            int strict)
455 {
456         int m, n;
457
458         m = rt6_check_dev(rt, oif);
459         if (!m && (strict & RT6_LOOKUP_F_IFACE))
460                 return -1;
461 #ifdef CONFIG_IPV6_ROUTER_PREF
462         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
463 #endif
464         n = rt6_check_neigh(rt);
465         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
466                 return -1;
467         return m;
468 }
469
470 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
471                                    int *mpri, struct rt6_info *match)
472 {
473         int m;
474
475         if (rt6_check_expired(rt))
476                 goto out;
477
478         m = rt6_score_route(rt, oif, strict);
479         if (m < 0)
480                 goto out;
481
482         if (m > *mpri) {
483                 if (strict & RT6_LOOKUP_F_REACHABLE)
484                         rt6_probe(match);
485                 *mpri = m;
486                 match = rt;
487         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
488                 rt6_probe(rt);
489         }
490
491 out:
492         return match;
493 }
494
495 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
496                                      struct rt6_info *rr_head,
497                                      u32 metric, int oif, int strict)
498 {
499         struct rt6_info *rt, *match;
500         int mpri = -1;
501
502         match = NULL;
503         for (rt = rr_head; rt && rt->rt6i_metric == metric;
504              rt = rt->dst.rt6_next)
505                 match = find_match(rt, oif, strict, &mpri, match);
506         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509
510         return match;
511 }
512
513 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
514 {
515         struct rt6_info *match, *rt0;
516         struct net *net;
517
518         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
519                   __func__, fn->leaf, oif);
520
521         rt0 = fn->rr_ptr;
522         if (!rt0)
523                 fn->rr_ptr = rt0 = fn->leaf;
524
525         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
526
527         if (!match &&
528             (strict & RT6_LOOKUP_F_REACHABLE)) {
529                 struct rt6_info *next = rt0->dst.rt6_next;
530
531                 /* no entries matched; do round-robin */
532                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
533                         next = fn->leaf;
534
535                 if (next != rt0)
536                         fn->rr_ptr = next;
537         }
538
539         RT6_TRACE("%s() => %p\n",
540                   __func__, match);
541
542         net = dev_net(rt0->rt6i_dev);
543         return match ? match : net->ipv6.ip6_null_entry;
544 }
545
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt points at the option (interpreted as struct route_info), @len is
 * its length in bytes and @gwaddr is the gateway the route goes through.
 *
 * A matching existing route is deleted when the advertised lifetime is
 * zero, updated (preference flags) when it is non-zero, and a new route
 * is added when none exists and the lifetime is non-zero.  The expiry
 * of the resulting route follows the lifetime: RTF_EXPIRES is cleared
 * when addrconf_finite_timeout() says the lifetime is not finite,
 * otherwise rt6i_expires is set.
 *
 * Returns 0 on success, or -EINVAL for a malformed option (too short,
 * inconsistent length/prefix_len, or invalid preference).
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct net *net = dev_net(dev);
        struct in6_addr prefix_buf;
        struct in6_addr *prefix;
        unsigned long lifetime;
        unsigned int pref;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        else if (lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->dst);

        return 0;
}
#endif
619
/*
 * BACKTRACK(__net, saddr) - climb the fib6 tree after a failed match.
 *
 * Must be expanded inside a function that has locals named 'rt' and
 * 'fn' and labels named 'out' and 'restart'.
 *
 * Does nothing unless 'rt' is __net's ip6_null_entry.  Otherwise it
 * loops: if 'fn' is flagged RTN_TL_ROOT it jumps to 'out'; else it
 * moves 'fn' to its parent - or, when the parent has a subtree that is
 * not 'fn' itself, to the result of looking 'saddr' up in that
 * subtree - and jumps to 'restart' as soon as the new 'fn' is flagged
 * RTN_RTINFO.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
637
/*
 * Lookup callback used by rt6_lookup() through fib6_rule_lookup().
 *
 * Under table->tb6_lock (read side), finds the fib6 node for the flow's
 * daddr/saddr and lets rt6_device_match() choose among the node's
 * routes, backtracking up the tree while the result is the null entry.
 * The returned route has been passed to dst_use() before the lock is
 * dropped.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        /* 'restart' and 'out' are the jump targets used by BACKTRACK(). */
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        BACKTRACK(net, &fl6->saddr);
out:
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
657
658 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
659                             const struct in6_addr *saddr, int oif, int strict)
660 {
661         struct flowi6 fl6 = {
662                 .flowi6_oif = oif,
663                 .daddr = *daddr,
664         };
665         struct dst_entry *dst;
666         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
667
668         if (saddr) {
669                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
670                 flags |= RT6_LOOKUP_F_HAS_SADDR;
671         }
672
673         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
674         if (dst->error == 0)
675                 return (struct rt6_info *) dst;
676
677         dst_release(dst);
678
679         return NULL;
680 }
681
682 EXPORT_SYMBOL(rt6_lookup);
683
/* ip6_ins_rt is called with table->tb6_lock FREE (not held).
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold the route,
   it may be destroyed.
 */
689
690 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
691 {
692         int err;
693         struct fib6_table *table;
694
695         table = rt->rt6i_table;
696         write_lock_bh(&table->tb6_lock);
697         err = fib6_add(&table->tb6_root, rt, info);
698         write_unlock_bh(&table->tb6_lock);
699
700         return err;
701 }
702
703 int ip6_ins_rt(struct rt6_info *rt)
704 {
705         struct nl_info info = {
706                 .nl_net = dev_net(rt->rt6i_dev),
707         };
708         return __ip6_ins_rt(rt, &info);
709 }
710
/*
 * Create an RTF_CACHE copy of @ort for destination @daddr and attach a
 * neighbour entry to it.
 *
 * For non-gateway routes the copy's gateway is set to @daddr itself
 * (and RTF_ANYCAST is set when the copy's prefix length is not 128 and
 * @daddr equals the original route's prefix address).  With
 * CONFIG_IPV6_SUBTREES, a source-specific route gets @saddr as a /128
 * source key.
 *
 * Returns the new route, or NULL when ip6_rt_copy() fails or no
 * neighbour entry could be obtained (the copy is freed in that case).
 */
static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
                                      const struct in6_addr *daddr,
                                      const struct in6_addr *saddr)
{
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        rt = ip6_rt_copy(ort, daddr);

        if (rt) {
                struct neighbour *neigh;
                /* One GC-and-retry attempt, and only outside softirq. */
                int attempts = !in_softirq();

                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
                        if (rt->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                                rt->rt6i_flags |= RTF_ANYCAST;
                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
                }

                rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
                        rt->rt6i_src.plen = 128;
                }
#endif

        retry:
                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
                if (IS_ERR(neigh)) {
                        struct net *net = dev_net(rt->rt6i_dev);
                        int saved_rt_min_interval =
                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
                        int saved_rt_elasticity =
                                net->ipv6.sysctl.ip6_rt_gc_elasticity;

                        if (attempts-- > 0) {
                                /*
                                 * Run the dst garbage collector with
                                 * elasticity 1 and min_interval 0, then
                                 * restore the saved sysctl values and try
                                 * the neighbour lookup again.
                                 * NOTE(review): the sysctl values are
                                 * changed in place with no locking visible
                                 * here - confirm that is acceptable.
                                 */
                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);

                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
                                        saved_rt_elasticity;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
                                        saved_rt_min_interval;
                                goto retry;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ipv6: Neighbour table overflow.\n");
                        dst_free(&rt->dst);
                        return NULL;
                }
                dst_set_neighbour(&rt->dst, neigh);

        }

        return rt;
}
777
778 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
779                                         const struct in6_addr *daddr)
780 {
781         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
782
783         if (rt) {
784                 rt->rt6i_flags |= RTF_CACHE;
785                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
786         }
787         return rt;
788 }
789
/*
 * Common policy-routing lookup behind ip6_pol_route_input() and
 * ip6_pol_route_output().
 *
 * @oif is the interface index to match (the callers pass the flow's
 * iif or oif).  The lookup first requires a reachable router
 * (RT6_LOOKUP_F_REACHABLE) unless forwarding is enabled in devconf_all;
 * when that pass ends at 'out', it is repeated once without that
 * requirement.
 *
 * When the selected route is neither the null entry nor an RTF_CACHE
 * route, a cache entry is created for the flow (rt6_alloc_cow() or
 * rt6_alloc_clone()) and inserted; if insertion fails the whole lookup
 * is retried, up to 3 attempts in total.
 *
 * Returns a route with a reference held (dst_hold()), with lastuse and
 * __use updated; this may be the namespace's ip6_null_entry.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        /* Only the IFACE bit of the caller's flags is carried over. */
        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

        /* BACKTRACK() jumps here ('restart') or to 'out'. */
restart:
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route, then drop the table lock before allocating. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        /* Switch from the original route to the copy (or the null entry). */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /* Reached with table->tb6_lock still held for reading. */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
859
860 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
861                                             struct flowi6 *fl6, int flags)
862 {
863         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
864 }
865
866 void ip6_route_input(struct sk_buff *skb)
867 {
868         const struct ipv6hdr *iph = ipv6_hdr(skb);
869         struct net *net = dev_net(skb->dev);
870         int flags = RT6_LOOKUP_F_HAS_SADDR;
871         struct flowi6 fl6 = {
872                 .flowi6_iif = skb->dev->ifindex,
873                 .daddr = iph->daddr,
874                 .saddr = iph->saddr,
875                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
876                 .flowi6_mark = skb->mark,
877                 .flowi6_proto = iph->nexthdr,
878         };
879
880         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
881                 flags |= RT6_LOOKUP_F_IFACE;
882
883         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
884 }
885
886 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
887                                              struct flowi6 *fl6, int flags)
888 {
889         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
890 }
891
892 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
893                                     struct flowi6 *fl6)
894 {
895         int flags = 0;
896
897         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
898                 flags |= RT6_LOOKUP_F_IFACE;
899
900         if (!ipv6_addr_any(&fl6->saddr))
901                 flags |= RT6_LOOKUP_F_HAS_SADDR;
902         else if (sk)
903                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
904
905         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
906 }
907
908 EXPORT_SYMBOL(ip6_route_output);
909
910 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
911 {
912         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
913         struct dst_entry *new = NULL;
914
915         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
916         if (rt) {
917                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
918
919                 new = &rt->dst;
920
921                 new->__use = 1;
922                 new->input = dst_discard;
923                 new->output = dst_discard;
924
925                 if (dst_metrics_read_only(&ort->dst))
926                         new->_metrics = ort->dst._metrics;
927                 else
928                         dst_copy_metrics(new, &ort->dst);
929                 rt->rt6i_idev = ort->rt6i_idev;
930                 if (rt->rt6i_idev)
931                         in6_dev_hold(rt->rt6i_idev);
932                 rt->rt6i_expires = 0;
933
934                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
935                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
936                 rt->rt6i_metric = 0;
937
938                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
939 #ifdef CONFIG_IPV6_SUBTREES
940                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
941 #endif
942
943                 dst_free(new);
944         }
945
946         dst_release(dst_orig);
947         return new ? new : ERR_PTR(-ENOMEM);
948 }
949
950 /*
951  *      Destination cache support functions
952  */
953
954 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
955 {
956         struct rt6_info *rt;
957
958         rt = (struct rt6_info *) dst;
959
960         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
961                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
962                         if (!rt->rt6i_peer)
963                                 rt6_bind_peer(rt, 0);
964                         rt->rt6i_peer_genid = rt6_peer_genid();
965                 }
966                 return dst;
967         }
968         return NULL;
969 }
970
971 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
972 {
973         struct rt6_info *rt = (struct rt6_info *) dst;
974
975         if (rt) {
976                 if (rt->rt6i_flags & RTF_CACHE) {
977                         if (rt6_check_expired(rt)) {
978                                 ip6_del_rt(rt);
979                                 dst = NULL;
980                         }
981                 } else {
982                         dst_release(dst);
983                         dst = NULL;
984                 }
985         }
986         return dst;
987 }
988
989 static void ip6_link_failure(struct sk_buff *skb)
990 {
991         struct rt6_info *rt;
992
993         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
994
995         rt = (struct rt6_info *) skb_dst(skb);
996         if (rt) {
997                 if (rt->rt6i_flags&RTF_CACHE) {
998                         dst_set_expires(&rt->dst, 0);
999                         rt->rt6i_flags |= RTF_EXPIRES;
1000                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1001                         rt->rt6i_node->fn_sernum = -1;
1002         }
1003 }
1004
1005 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1006 {
1007         struct rt6_info *rt6 = (struct rt6_info*)dst;
1008
1009         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1010                 rt6->rt6i_flags |= RTF_MODIFIED;
1011                 if (mtu < IPV6_MIN_MTU) {
1012                         u32 features = dst_metric(dst, RTAX_FEATURES);
1013                         mtu = IPV6_MIN_MTU;
1014                         features |= RTAX_FEATURE_ALLFRAG;
1015                         dst_metric_set(dst, RTAX_FEATURES, features);
1016                 }
1017                 dst_metric_set(dst, RTAX_MTU, mtu);
1018         }
1019 }
1020
1021 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1022 {
1023         struct net_device *dev = dst->dev;
1024         unsigned int mtu = dst_mtu(dst);
1025         struct net *net = dev_net(dev);
1026
1027         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1028
1029         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1030                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1031
1032         /*
1033          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1034          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1035          * IPV6_MAXPLEN is also valid and means: "any MSS,
1036          * rely only on pmtu discovery"
1037          */
1038         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1039                 mtu = IPV6_MAXPLEN;
1040         return mtu;
1041 }
1042
1043 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1044 {
1045         unsigned int mtu = IPV6_MIN_MTU;
1046         struct inet6_dev *idev;
1047
1048         rcu_read_lock();
1049         idev = __in6_dev_get(dst->dev);
1050         if (idev)
1051                 mtu = idev->cnf.mtu6;
1052         rcu_read_unlock();
1053
1054         return mtu;
1055 }
1056
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() unlink and
 * free entries from it.
 */
1057 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (with BHs disabled) around every access to icmp6_dst_gc_list. */
1058 static DEFINE_SPINLOCK(icmp6_dst_lock);
1059
1060 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1061                                   struct neighbour *neigh,
1062                                   const struct in6_addr *addr)
1063 {
1064         struct rt6_info *rt;
1065         struct inet6_dev *idev = in6_dev_get(dev);
1066         struct net *net = dev_net(dev);
1067
1068         if (unlikely(idev == NULL))
1069                 return NULL;
1070
1071         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1072         if (unlikely(rt == NULL)) {
1073                 in6_dev_put(idev);
1074                 goto out;
1075         }
1076
1077         if (neigh)
1078                 neigh_hold(neigh);
1079         else {
1080                 neigh = ndisc_get_neigh(dev, addr);
1081                 if (IS_ERR(neigh))
1082                         neigh = NULL;
1083         }
1084
1085         rt->dst.flags |= DST_HOST;
1086         rt->dst.output  = ip6_output;
1087         dst_set_neighbour(&rt->dst, neigh);
1088         atomic_set(&rt->dst.__refcnt, 1);
1089         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1090         rt->rt6i_dst.plen = 128;
1091         rt->rt6i_idev     = idev;
1092         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1093
1094         spin_lock_bh(&icmp6_dst_lock);
1095         rt->dst.next = icmp6_dst_gc_list;
1096         icmp6_dst_gc_list = &rt->dst;
1097         spin_unlock_bh(&icmp6_dst_lock);
1098
1099         fib6_force_start_gc(net);
1100
1101 out:
1102         return &rt->dst;
1103 }
1104
1105 int icmp6_dst_gc(void)
1106 {
1107         struct dst_entry *dst, **pprev;
1108         int more = 0;
1109
1110         spin_lock_bh(&icmp6_dst_lock);
1111         pprev = &icmp6_dst_gc_list;
1112
1113         while ((dst = *pprev) != NULL) {
1114                 if (!atomic_read(&dst->__refcnt)) {
1115                         *pprev = dst->next;
1116                         dst_free(dst);
1117                 } else {
1118                         pprev = &dst->next;
1119                         ++more;
1120                 }
1121         }
1122
1123         spin_unlock_bh(&icmp6_dst_lock);
1124
1125         return more;
1126 }
1127
1128 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1129                             void *arg)
1130 {
1131         struct dst_entry *dst, **pprev;
1132
1133         spin_lock_bh(&icmp6_dst_lock);
1134         pprev = &icmp6_dst_gc_list;
1135         while ((dst = *pprev) != NULL) {
1136                 struct rt6_info *rt = (struct rt6_info *) dst;
1137                 if (func(rt, arg)) {
1138                         *pprev = dst->next;
1139                         dst_free(dst);
1140                 } else {
1141                         pprev = &dst->next;
1142                 }
1143         }
1144         spin_unlock_bh(&icmp6_dst_lock);
1145 }
1146
1147 static int ip6_dst_gc(struct dst_ops *ops)
1148 {
1149         unsigned long now = jiffies;
1150         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1151         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1152         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1153         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1154         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1155         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1156         int entries;
1157
1158         entries = dst_entries_get_fast(ops);
1159         if (time_after(rt_last_gc + rt_min_interval, now) &&
1160             entries <= rt_max_size)
1161                 goto out;
1162
1163         net->ipv6.ip6_rt_gc_expire++;
1164         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1165         net->ipv6.ip6_rt_last_gc = now;
1166         entries = dst_entries_get_slow(ops);
1167         if (entries < ops->gc_thresh)
1168                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1169 out:
1170         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1171         return entries > rt_max_size;
1172 }
1173
1174 /* Clean host part of a prefix. Not necessary in radix tree,
1175    but results in cleaner routing tables.
1176
1177    Remove it only when all the things will work!
1178  */
1179
1180 int ip6_dst_hoplimit(struct dst_entry *dst)
1181 {
1182         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1183         if (hoplimit == 0) {
1184                 struct net_device *dev = dst->dev;
1185                 struct inet6_dev *idev;
1186
1187                 rcu_read_lock();
1188                 idev = __in6_dev_get(dev);
1189                 if (idev)
1190                         hoplimit = idev->cnf.hop_limit;
1191                 else
1192                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1193                 rcu_read_unlock();
1194         }
1195         return hoplimit;
1196 }
1197 EXPORT_SYMBOL(ip6_dst_hoplimit);
1198
1199 /*
1200  *
1201  */
1202
/*
 *      ip6_route_add - create the route described by @cfg and insert it.
 *
 *      Validates the prefix lengths, resolves the output device (from
 *      cfg->fc_ifindex, from the route towards the gateway, or the
 *      namespace's loopback device for reject routes), fills in a freshly
 *      allocated rt6_info and hands it to __ip6_ins_rt().
 *
 *      Side effects on @cfg: a zero fc_metric is replaced by
 *      IP6_RT_PRIO_USER, an RTPROT_UNSPEC fc_protocol by RTPROT_BOOT, and
 *      fc_nlinfo.nl_net is set to the namespace of the chosen device.
 *
 *      Returns the result of __ip6_ins_rt() on the install path.  On any
 *      earlier failure, returns a negative errno (-EINVAL, -ENODEV,
 *      -ENOBUFS, -ENOMEM, -EHOSTUNREACH, or the neighbour-lookup error)
 *      after dropping the dev/idev references and freeing the route.
 */
1203 int ip6_route_add(struct fib6_config *cfg)
1204 {
1205         int err;
1206         struct net *net = cfg->fc_nlinfo.nl_net;
1207         struct rt6_info *rt = NULL;
1208         struct net_device *dev = NULL;
1209         struct inet6_dev *idev = NULL;
1210         struct fib6_table *table;
1211         int addr_type;
1212
             /* Prefix lengths cannot exceed the 128-bit address size. */
1213         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1214                 return -EINVAL;
1215 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-specific routes are rejected without subtree support. */
1216         if (cfg->fc_src_len)
1217                 return -EINVAL;
1218 #endif
             /*
              * An explicit interface was named: take a reference on the
              * device and on its inet6_dev.  Both are dropped at "out" if
              * anything fails from here on.
              */
1219         if (cfg->fc_ifindex) {
1220                 err = -ENODEV;
1221                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1222                 if (!dev)
1223                         goto out;
1224                 idev = in6_dev_get(dev);
1225                 if (!idev)
1226                         goto out;
1227         }
1228
1229         if (cfg->fc_metric == 0)
1230                 cfg->fc_metric = IP6_RT_PRIO_USER;
1231
1232         table = fib6_new_table(net, cfg->fc_table);
1233         if (table == NULL) {
1234                 err = -ENOBUFS;
1235                 goto out;
1236         }
1237
1238         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1239
1240         if (rt == NULL) {
1241                 err = -ENOMEM;
1242                 goto out;
1243         }
1244
1245         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is not set. */
1246         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1247                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1248                                 0;
1249
1250         if (cfg->fc_protocol == RTPROT_UNSPEC)
1251                 cfg->fc_protocol = RTPROT_BOOT;
1252         rt->rt6i_protocol = cfg->fc_protocol;
1253
1254         addr_type = ipv6_addr_type(&cfg->fc_dst);
1255
             /* Pick the input handler from the destination type and RTF_LOCAL. */
1256         if (addr_type & IPV6_ADDR_MULTICAST)
1257                 rt->dst.input = ip6_mc_input;
1258         else if (cfg->fc_flags & RTF_LOCAL)
1259                 rt->dst.input = ip6_input;
1260         else
1261                 rt->dst.input = ip6_forward;
1262
1263         rt->dst.output = ip6_output;
1264
1265         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1266         rt->rt6i_dst.plen = cfg->fc_dst_len;
1267         if (rt->rt6i_dst.plen == 128)
1268                rt->dst.flags |= DST_HOST;
1269
             /*
              * Non-host routes that come with metric attributes get their
              * own zeroed metrics array.
              */
1270         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1271                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1272                 if (!metrics) {
1273                         err = -ENOMEM;
1274                         goto out;
1275                 }
1276                 dst_init_metrics(&rt->dst, metrics, 0);
1277         }
1278 #ifdef CONFIG_IPV6_SUBTREES
1279         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1280         rt->rt6i_src.plen = cfg->fc_src_len;
1281 #endif
1282
1283         rt->rt6i_metric = cfg->fc_metric;
1284
1285         /* We cannot add true routes via loopback here,
1286            they would result in kernel looping; promote them to reject routes
1287          */
1288         if ((cfg->fc_flags & RTF_REJECT) ||
1289             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1290                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1291                 /* hold loopback dev/idev if we haven't done so. */
1292                 if (dev != net->loopback_dev) {
1293                         if (dev) {
1294                                 dev_put(dev);
1295                                 in6_dev_put(idev);
1296                         }
1297                         dev = net->loopback_dev;
1298                         dev_hold(dev);
1299                         idev = in6_dev_get(dev);
1300                         if (!idev) {
1301                                 err = -ENODEV;
1302                                 goto out;
1303                         }
1304                 }
                     /* Reject routes discard traffic in both directions. */
1305                 rt->dst.output = ip6_pkt_discard_out;
1306                 rt->dst.input = ip6_pkt_discard;
1307                 rt->dst.error = -ENETUNREACH;
1308                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1309                 goto install_route;
1310         }
1311
1312         if (cfg->fc_flags & RTF_GATEWAY) {
1313                 const struct in6_addr *gw_addr;
1314                 int gwa_type;
1315
1316                 gw_addr = &cfg->fc_gateway;
1317                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1318                 gwa_type = ipv6_addr_type(gw_addr);
1319
1320                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1321                         struct rt6_info *grt;
1322
1323                         /* IPv6 strictly inhibits using not link-local
1324                            addresses as nexthop address.
1325                            Otherwise, router will not able to send redirects.
1326                            It is very good, but in some (rare!) circumstances
1327                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1328                            some exceptions. --ANK
1329                          */
1330                         err = -EINVAL;
1331                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1332                                 goto out;
1333
1334                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1335
1336                         err = -EHOSTUNREACH;
1337                         if (grt == NULL)
1338                                 goto out;
1339                         if (dev) {
1340                                 if (dev != grt->rt6i_dev) {
1341                                         dst_release(&grt->dst);
1342                                         goto out;
1343                                 }
1344                         } else {
                                     /* No device given: adopt the one the gateway is reached through. */
1345                                 dev = grt->rt6i_dev;
1346                                 idev = grt->rt6i_idev;
1347                                 dev_hold(dev);
1348                                 in6_dev_hold(grt->rt6i_idev);
1349                         }
                             /*
                              * err is still -EHOSTUNREACH here; it is cleared
                              * only when the route to the gateway is not itself
                              * a gateway route.
                              */
1350                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1351                                 err = 0;
1352                         dst_release(&grt->dst);
1353
1354                         if (err)
1355                                 goto out;
1356                 }
1357                 err = -EINVAL;
1358                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1359                         goto out;
1360         }
1361
1362         err = -ENODEV;
1363         if (dev == NULL)
1364                 goto out;
1365
             /* A preferred source address must be valid on the output device. */
1366         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1367                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1368                         err = -EINVAL;
1369                         goto out;
1370                 }
1371                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1372                 rt->rt6i_prefsrc.plen = 128;
1373         } else
1374                 rt->rt6i_prefsrc.plen = 0;
1375
1376         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1377                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1378                 if (IS_ERR(n)) {
1379                         err = PTR_ERR(n);
1380                         goto out;
1381                 }
1382                 dst_set_neighbour(&rt->dst, n);
1383         }
1384
1385         rt->rt6i_flags = cfg->fc_flags;
1386
1387 install_route:
             /* Apply user-supplied metrics; attribute types above RTAX_MAX are rejected. */
1388         if (cfg->fc_mx) {
1389                 struct nlattr *nla;
1390                 int remaining;
1391
1392                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1393                         int type = nla_type(nla);
1394
1395                         if (type) {
1396                                 if (type > RTAX_MAX) {
1397                                         err = -EINVAL;
1398                                         goto out;
1399                                 }
1400
1401                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1402                         }
1403                 }
1404         }
1405
             /* From here the dev/idev references are stored in the route. */
1406         rt->dst.dev = dev;
1407         rt->rt6i_idev = idev;
1408         rt->rt6i_table = table;
1409
1410         cfg->fc_nlinfo.nl_net = dev_net(dev);
1411
1412         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1413
1414 out:
             /* Error path: undo whatever was acquired so far. */
1415         if (dev)
1416                 dev_put(dev);
1417         if (idev)
1418                 in6_dev_put(idev);
1419         if (rt)
1420                 dst_free(&rt->dst);
1421         return err;
1422 }
1423
1424 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1425 {
1426         int err;
1427         struct fib6_table *table;
1428         struct net *net = dev_net(rt->rt6i_dev);
1429
1430         if (rt == net->ipv6.ip6_null_entry)
1431                 return -ENOENT;
1432
1433         table = rt->rt6i_table;
1434         write_lock_bh(&table->tb6_lock);
1435
1436         err = fib6_del(rt, info);
1437         dst_release(&rt->dst);
1438
1439         write_unlock_bh(&table->tb6_lock);
1440
1441         return err;
1442 }
1443
1444 int ip6_del_rt(struct rt6_info *rt)
1445 {
1446         struct nl_info info = {
1447                 .nl_net = dev_net(rt->rt6i_dev),
1448         };
1449         return __ip6_del_rt(rt, &info);
1450 }
1451
1452 static int ip6_route_del(struct fib6_config *cfg)
1453 {
1454         struct fib6_table *table;
1455         struct fib6_node *fn;
1456         struct rt6_info *rt;
1457         int err = -ESRCH;
1458
1459         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1460         if (table == NULL)
1461                 return err;
1462
1463         read_lock_bh(&table->tb6_lock);
1464
1465         fn = fib6_locate(&table->tb6_root,
1466                          &cfg->fc_dst, cfg->fc_dst_len,
1467                          &cfg->fc_src, cfg->fc_src_len);
1468
1469         if (fn) {
1470                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1471                         if (cfg->fc_ifindex &&
1472                             (rt->rt6i_dev == NULL ||
1473                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1474                                 continue;
1475                         if (cfg->fc_flags & RTF_GATEWAY &&
1476                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1477                                 continue;
1478                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1479                                 continue;
1480                         dst_hold(&rt->dst);
1481                         read_unlock_bh(&table->tb6_lock);
1482
1483                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1484                 }
1485         }
1486         read_unlock_bh(&table->tb6_lock);
1487
1488         return err;
1489 }
1490
1491 /*
1492  *      Handle redirects
1493  */
/*
 * Flow key for redirect validation: an ordinary flowi6 followed by the
 * gateway address to match routes against.  __ip6_route_redirect()
 * receives a pointer to fl6 and casts it back to this structure, so fl6
 * has to remain the first member.
 */
1494 struct ip6rd_flowi {
1495         struct flowi6 fl6;
1496         struct in6_addr gateway;
1497 };
1498
1499 static struct rt6_info *__ip6_route_redirect(struct net *net,
1500                                              struct fib6_table *table,
1501                                              struct flowi6 *fl6,
1502                                              int flags)
1503 {
1504         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1505         struct rt6_info *rt;
1506         struct fib6_node *fn;
1507
1508         /*
1509          * Get the "current" route for this destination and
1510          * check if the redirect has come from approriate router.
1511          *
1512          * RFC 2461 specifies that redirects should only be
1513          * accepted if they come from the nexthop to the target.
1514          * Due to the way the routes are chosen, this notion
1515          * is a bit fuzzy and one might need to check all possible
1516          * routes.
1517          */
1518
1519         read_lock_bh(&table->tb6_lock);
1520         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1521 restart:
1522         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1523                 /*
1524                  * Current route is on-link; redirect is always invalid.
1525                  *
1526                  * Seems, previous statement is not true. It could
1527                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1528                  * But then router serving it might decide, that we should
1529                  * know truth 8)8) --ANK (980726).
1530                  */
1531                 if (rt6_check_expired(rt))
1532                         continue;
1533                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1534                         continue;
1535                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1536                         continue;
1537                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1538                         continue;
1539                 break;
1540         }
1541
1542         if (!rt)
1543                 rt = net->ipv6.ip6_null_entry;
1544         BACKTRACK(net, &fl6->saddr);
1545 out:
1546         dst_hold(&rt->dst);
1547
1548         read_unlock_bh(&table->tb6_lock);
1549
1550         return rt;
1551 };
1552
1553 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1554                                            const struct in6_addr *src,
1555                                            const struct in6_addr *gateway,
1556                                            struct net_device *dev)
1557 {
1558         int flags = RT6_LOOKUP_F_HAS_SADDR;
1559         struct net *net = dev_net(dev);
1560         struct ip6rd_flowi rdfl = {
1561                 .fl6 = {
1562                         .flowi6_oif = dev->ifindex,
1563                         .daddr = *dest,
1564                         .saddr = *src,
1565                 },
1566         };
1567
1568         ipv6_addr_copy(&rdfl.gateway, gateway);
1569
1570         if (rt6_need_strict(dest))
1571                 flags |= RT6_LOOKUP_F_IFACE;
1572
1573         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1574                                                    flags, __ip6_route_redirect);
1575 }
1576
1577 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1578                   const struct in6_addr *saddr,
1579                   struct neighbour *neigh, u8 *lladdr, int on_link)
1580 {
1581         struct rt6_info *rt, *nrt = NULL;
1582         struct netevent_redirect netevent;
1583         struct net *net = dev_net(neigh->dev);
1584
1585         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1586
1587         if (rt == net->ipv6.ip6_null_entry) {
1588                 if (net_ratelimit())
1589                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1590                                "for redirect target\n");
1591                 goto out;
1592         }
1593
1594         /*
1595          *      We have finally decided to accept it.
1596          */
1597
1598         neigh_update(neigh, lladdr, NUD_STALE,
1599                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1600                      NEIGH_UPDATE_F_OVERRIDE|
1601                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1602                                      NEIGH_UPDATE_F_ISROUTER))
1603                      );
1604
1605         /*
1606          * Redirect received -> path was valid.
1607          * Look, redirects are sent only in response to data packets,
1608          * so that this nexthop apparently is reachable. --ANK
1609          */
1610         dst_confirm(&rt->dst);
1611
1612         /* Duplicate redirect: silently ignore. */
1613         if (neigh == dst_get_neighbour_raw(&rt->dst))
1614                 goto out;
1615
1616         nrt = ip6_rt_copy(rt, dest);
1617         if (nrt == NULL)
1618                 goto out;
1619
1620         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1621         if (on_link)
1622                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1623
1624         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1625         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1626
1627         if (ip6_ins_rt(nrt))
1628                 goto out;
1629
1630         netevent.old = &rt->dst;
1631         netevent.new = &nrt->dst;
1632         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1633
1634         if (rt->rt6i_flags&RTF_CACHE) {
1635                 ip6_del_rt(rt);
1636                 return;
1637         }
1638
1639 out:
1640         dst_release(&rt->dst);
1641 }
1642
1643 /*
1644  *      Handle ICMP "packet too big" messages
1645  *      i.e. Path MTU discovery
1646  */
1647
1648 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1649                              struct net *net, u32 pmtu, int ifindex)
1650 {
1651         struct rt6_info *rt, *nrt;
1652         int allfrag = 0;
1653 again:
1654         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1655         if (rt == NULL)
1656                 return;
1657
1658         if (rt6_check_expired(rt)) {
1659                 ip6_del_rt(rt);
1660                 goto again;
1661         }
1662
1663         if (pmtu >= dst_mtu(&rt->dst))
1664                 goto out;
1665
1666         if (pmtu < IPV6_MIN_MTU) {
1667                 /*
1668                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1669                  * MTU (1280) and a fragment header should always be included
1670                  * after a node receiving Too Big message reporting PMTU is
1671                  * less than the IPv6 Minimum Link MTU.
1672                  */
1673                 pmtu = IPV6_MIN_MTU;
1674                 allfrag = 1;
1675         }
1676
1677         /* New mtu received -> path was valid.
1678            They are sent only in response to data packets,
1679            so that this nexthop apparently is reachable. --ANK
1680          */
1681         dst_confirm(&rt->dst);
1682
1683         /* Host route. If it is static, it would be better
1684            not to override it, but add new one, so that
1685            when cache entry will expire old pmtu
1686            would return automatically.
1687          */
1688         if (rt->rt6i_flags & RTF_CACHE) {
1689                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1690                 if (allfrag) {
1691                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1692                         features |= RTAX_FEATURE_ALLFRAG;
1693                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1694                 }
1695                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1696                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1697                 goto out;
1698         }
1699
1700         /* Network route.
1701            Two cases are possible:
1702            1. It is connected route. Action: COW
1703            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1704          */
1705         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1706                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1707         else
1708                 nrt = rt6_alloc_clone(rt, daddr);
1709
1710         if (nrt) {
1711                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1712                 if (allfrag) {
1713                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1714                         features |= RTAX_FEATURE_ALLFRAG;
1715                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1716                 }
1717
1718                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1719                  * happened within 5 mins, the recommended timer is 10 mins.
1720                  * Here this route expiration time is set to ip6_rt_mtu_expires
1721                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1722                  * and detecting PMTU increase will be automatically happened.
1723                  */
1724                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1725                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1726
1727                 ip6_ins_rt(nrt);
1728         }
1729 out:
1730         dst_release(&rt->dst);
1731 }
1732
1733 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1734                         struct net_device *dev, u32 pmtu)
1735 {
1736         struct net *net = dev_net(dev);
1737
1738         /*
1739          * RFC 1981 states that a node "MUST reduce the size of the packets it
1740          * is sending along the path" that caused the Packet Too Big message.
1741          * Since it's not possible in the general case to determine which
1742          * interface was used to send the original packet, we update the MTU
1743          * on the interface that will be used to send future packets. We also
1744          * update the MTU on the interface that received the Packet Too Big in
1745          * case the original packet was forced out that interface with
1746          * SO_BINDTODEVICE or similar. This is the next best thing to the
1747          * correct behaviour, which would be to update the MTU on all
1748          * interfaces.
1749          */
1750         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1751         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1752 }
1753
1754 /*
1755  *      Misc support functions
1756  */
1757
1758 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1759                                     const struct in6_addr *dest)
1760 {
1761         struct net *net = dev_net(ort->rt6i_dev);
1762         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1763                                             ort->dst.dev, 0);
1764
1765         if (rt) {
1766                 rt->dst.input = ort->dst.input;
1767                 rt->dst.output = ort->dst.output;
1768                 rt->dst.flags |= DST_HOST;
1769
1770                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1771                 rt->rt6i_dst.plen = 128;
1772                 dst_copy_metrics(&rt->dst, &ort->dst);
1773                 rt->dst.error = ort->dst.error;
1774                 rt->rt6i_idev = ort->rt6i_idev;
1775                 if (rt->rt6i_idev)
1776                         in6_dev_hold(rt->rt6i_idev);
1777                 rt->dst.lastuse = jiffies;
1778                 rt->rt6i_expires = 0;
1779
1780                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1781                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1782                 rt->rt6i_metric = 0;
1783
1784 #ifdef CONFIG_IPV6_SUBTREES
1785                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1786 #endif
1787                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1788                 rt->rt6i_table = ort->rt6i_table;
1789         }
1790         return rt;
1791 }
1792
1793 #ifdef CONFIG_IPV6_ROUTE_INFO
1794 static struct rt6_info *rt6_get_route_info(struct net *net,
1795                                            const struct in6_addr *prefix, int prefixlen,
1796                                            const struct in6_addr *gwaddr, int ifindex)
1797 {
1798         struct fib6_node *fn;
1799         struct rt6_info *rt = NULL;
1800         struct fib6_table *table;
1801
1802         table = fib6_get_table(net, RT6_TABLE_INFO);
1803         if (table == NULL)
1804                 return NULL;
1805
1806         write_lock_bh(&table->tb6_lock);
1807         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1808         if (!fn)
1809                 goto out;
1810
1811         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1812                 if (rt->rt6i_dev->ifindex != ifindex)
1813                         continue;
1814                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1815                         continue;
1816                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1817                         continue;
1818                 dst_hold(&rt->dst);
1819                 break;
1820         }
1821 out:
1822         write_unlock_bh(&table->tb6_lock);
1823         return rt;
1824 }
1825
1826 static struct rt6_info *rt6_add_route_info(struct net *net,
1827                                            const struct in6_addr *prefix, int prefixlen,
1828                                            const struct in6_addr *gwaddr, int ifindex,
1829                                            unsigned pref)
1830 {
1831         struct fib6_config cfg = {
1832                 .fc_table       = RT6_TABLE_INFO,
1833                 .fc_metric      = IP6_RT_PRIO_USER,
1834                 .fc_ifindex     = ifindex,
1835                 .fc_dst_len     = prefixlen,
1836                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1837                                   RTF_UP | RTF_PREF(pref),
1838                 .fc_nlinfo.pid = 0,
1839                 .fc_nlinfo.nlh = NULL,
1840                 .fc_nlinfo.nl_net = net,
1841         };
1842
1843         ipv6_addr_copy(&cfg.fc_dst, prefix);
1844         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1845
1846         /* We should treat it as a default route if prefix length is 0. */
1847         if (!prefixlen)
1848                 cfg.fc_flags |= RTF_DEFAULT;
1849
1850         ip6_route_add(&cfg);
1851
1852         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1853 }
1854 #endif
1855
1856 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1857 {
1858         struct rt6_info *rt;
1859         struct fib6_table *table;
1860
1861         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1862         if (table == NULL)
1863                 return NULL;
1864
1865         write_lock_bh(&table->tb6_lock);
1866         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1867                 if (dev == rt->rt6i_dev &&
1868                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1869                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1870                         break;
1871         }
1872         if (rt)
1873                 dst_hold(&rt->dst);
1874         write_unlock_bh(&table->tb6_lock);
1875         return rt;
1876 }
1877
1878 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1879                                      struct net_device *dev,
1880                                      unsigned int pref)
1881 {
1882         struct fib6_config cfg = {
1883                 .fc_table       = RT6_TABLE_DFLT,
1884                 .fc_metric      = IP6_RT_PRIO_USER,
1885                 .fc_ifindex     = dev->ifindex,
1886                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1887                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1888                 .fc_nlinfo.pid = 0,
1889                 .fc_nlinfo.nlh = NULL,
1890                 .fc_nlinfo.nl_net = dev_net(dev),
1891         };
1892
1893         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1894
1895         ip6_route_add(&cfg);
1896
1897         return rt6_get_dflt_router(gwaddr, dev);
1898 }
1899
1900 void rt6_purge_dflt_routers(struct net *net)
1901 {
1902         struct rt6_info *rt;
1903         struct fib6_table *table;
1904
1905         /* NOTE: Keep consistent with rt6_get_dflt_router */
1906         table = fib6_get_table(net, RT6_TABLE_DFLT);
1907         if (table == NULL)
1908                 return;
1909
1910 restart:
1911         read_lock_bh(&table->tb6_lock);
1912         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1913                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1914                         dst_hold(&rt->dst);
1915                         read_unlock_bh(&table->tb6_lock);
1916                         ip6_del_rt(rt);
1917                         goto restart;
1918                 }
1919         }
1920         read_unlock_bh(&table->tb6_lock);
1921 }
1922
1923 static void rtmsg_to_fib6_config(struct net *net,
1924                                  struct in6_rtmsg *rtmsg,
1925                                  struct fib6_config *cfg)
1926 {
1927         memset(cfg, 0, sizeof(*cfg));
1928
1929         cfg->fc_table = RT6_TABLE_MAIN;
1930         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1931         cfg->fc_metric = rtmsg->rtmsg_metric;
1932         cfg->fc_expires = rtmsg->rtmsg_info;
1933         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1934         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1935         cfg->fc_flags = rtmsg->rtmsg_flags;
1936
1937         cfg->fc_nlinfo.nl_net = net;
1938
1939         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1940         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1941         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1942 }
1943
1944 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1945 {
1946         struct fib6_config cfg;
1947         struct in6_rtmsg rtmsg;
1948         int err;
1949
1950         switch(cmd) {
1951         case SIOCADDRT:         /* Add a route */
1952         case SIOCDELRT:         /* Delete a route */
1953                 if (!capable(CAP_NET_ADMIN))
1954                         return -EPERM;
1955                 err = copy_from_user(&rtmsg, arg,
1956                                      sizeof(struct in6_rtmsg));
1957                 if (err)
1958                         return -EFAULT;
1959
1960                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1961
1962                 rtnl_lock();
1963                 switch (cmd) {
1964                 case SIOCADDRT:
1965                         err = ip6_route_add(&cfg);
1966                         break;
1967                 case SIOCDELRT:
1968                         err = ip6_route_del(&cfg);
1969                         break;
1970                 default:
1971                         err = -EINVAL;
1972                 }
1973                 rtnl_unlock();
1974
1975                 return err;
1976         }
1977
1978         return -EINVAL;
1979 }
1980
1981 /*
1982  *      Drop the packet on the floor
1983  */
1984
1985 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1986 {
1987         int type;
1988         struct dst_entry *dst = skb_dst(skb);
1989         switch (ipstats_mib_noroutes) {
1990         case IPSTATS_MIB_INNOROUTES:
1991                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1992                 if (type == IPV6_ADDR_ANY) {
1993                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1994                                       IPSTATS_MIB_INADDRERRORS);
1995                         break;
1996                 }
1997                 /* FALLTHROUGH */
1998         case IPSTATS_MIB_OUTNOROUTES:
1999                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2000                               ipstats_mib_noroutes);
2001                 break;
2002         }
2003         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2004         kfree_skb(skb);
2005         return 0;
2006 }
2007
2008 static int ip6_pkt_discard(struct sk_buff *skb)
2009 {
2010         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2011 }
2012
2013 static int ip6_pkt_discard_out(struct sk_buff *skb)
2014 {
2015         skb->dev = skb_dst(skb)->dev;
2016         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2017 }
2018
2019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2020
2021 static int ip6_pkt_prohibit(struct sk_buff *skb)
2022 {
2023         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2024 }
2025
2026 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2027 {
2028         skb->dev = skb_dst(skb)->dev;
2029         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2030 }
2031
2032 #endif
2033
2034 /*
2035  *      Allocate a dst for local (unicast / anycast) address.
2036  */
2037
2038 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2039                                     const struct in6_addr *addr,
2040                                     int anycast)
2041 {
2042         struct net *net = dev_net(idev->dev);
2043         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2044                                             net->loopback_dev, 0);
2045         struct neighbour *neigh;
2046
2047         if (rt == NULL) {
2048                 if (net_ratelimit())
2049                         pr_warning("IPv6:  Maximum number of routes reached,"
2050                                    " consider increasing route/max_size.\n");
2051                 return ERR_PTR(-ENOMEM);
2052         }
2053
2054         in6_dev_hold(idev);
2055
2056         rt->dst.flags |= DST_HOST;
2057         rt->dst.input = ip6_input;
2058         rt->dst.output = ip6_output;
2059         rt->rt6i_idev = idev;
2060         rt->dst.obsolete = -1;
2061
2062         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2063         if (anycast)
2064                 rt->rt6i_flags |= RTF_ANYCAST;
2065         else
2066                 rt->rt6i_flags |= RTF_LOCAL;
2067         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2068         if (IS_ERR(neigh)) {
2069                 dst_free(&rt->dst);
2070
2071                 return ERR_CAST(neigh);
2072         }
2073         dst_set_neighbour(&rt->dst, neigh);
2074
2075         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2076         rt->rt6i_dst.plen = 128;
2077         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2078
2079         atomic_set(&rt->dst.__refcnt, 1);
2080
2081         return rt;
2082 }
2083
2084 int ip6_route_get_saddr(struct net *net,
2085                         struct rt6_info *rt,
2086                         const struct in6_addr *daddr,
2087                         unsigned int prefs,
2088                         struct in6_addr *saddr)
2089 {
2090         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2091         int err = 0;
2092         if (rt->rt6i_prefsrc.plen)
2093                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2094         else
2095                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2096                                          daddr, prefs, saddr);
2097         return err;
2098 }
2099
2100 /* remove deleted ip from prefsrc entries */
2101 struct arg_dev_net_ip {
2102         struct net_device *dev;
2103         struct net *net;
2104         struct in6_addr *addr;
2105 };
2106
2107 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2108 {
2109         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2110         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2111         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2112
2113         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2114             rt != net->ipv6.ip6_null_entry &&
2115             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2116                 /* remove prefsrc entry */
2117                 rt->rt6i_prefsrc.plen = 0;
2118         }
2119         return 0;
2120 }
2121
2122 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2123 {
2124         struct net *net = dev_net(ifp->idev->dev);
2125         struct arg_dev_net_ip adni = {
2126                 .dev = ifp->idev->dev,
2127                 .net = net,
2128                 .addr = &ifp->addr,
2129         };
2130         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2131 }
2132
2133 struct arg_dev_net {
2134         struct net_device *dev;
2135         struct net *net;
2136 };
2137
2138 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2139 {
2140         const struct arg_dev_net *adn = arg;
2141         const struct net_device *dev = adn->dev;
2142
2143         if ((rt->rt6i_dev == dev || dev == NULL) &&
2144             rt != adn->net->ipv6.ip6_null_entry) {
2145                 RT6_TRACE("deleted by ifdown %p\n", rt);
2146                 return -1;
2147         }
2148         return 0;
2149 }
2150
2151 void rt6_ifdown(struct net *net, struct net_device *dev)
2152 {
2153         struct arg_dev_net adn = {
2154                 .dev = dev,
2155                 .net = net,
2156         };
2157
2158         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2159         icmp6_clean_all(fib6_ifdown, &adn);
2160 }
2161
2162 struct rt6_mtu_change_arg
2163 {
2164         struct net_device *dev;
2165         unsigned mtu;
2166 };
2167
2168 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2169 {
2170         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2171         struct inet6_dev *idev;
2172
2173         /* In IPv6 pmtu discovery is not optional,
2174            so that RTAX_MTU lock cannot disable it.
2175            We still use this lock to block changes
2176            caused by addrconf/ndisc.
2177         */
2178
2179         idev = __in6_dev_get(arg->dev);
2180         if (idev == NULL)
2181                 return 0;
2182
2183         /* For administrative MTU increase, there is no way to discover
2184            IPv6 PMTU increase, so PMTU increase should be updated here.
2185            Since RFC 1981 doesn't include administrative MTU increase
2186            update PMTU increase is a MUST. (i.e. jumbo frame)
2187          */
2188         /*
2189            If new MTU is less than route PMTU, this new MTU will be the
2190            lowest MTU in the path, update the route PMTU to reflect PMTU
2191            decreases; if new MTU is greater than route PMTU, and the
2192            old MTU is the lowest MTU in the path, update the route PMTU
2193            to reflect the increase. In this case if the other nodes' MTU
2194            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2195            PMTU discouvery.
2196          */
2197         if (rt->rt6i_dev == arg->dev &&
2198             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2199             (dst_mtu(&rt->dst) >= arg->mtu ||
2200              (dst_mtu(&rt->dst) < arg->mtu &&
2201               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2202                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2203         }
2204         return 0;
2205 }
2206
2207 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2208 {
2209         struct rt6_mtu_change_arg arg = {
2210                 .dev = dev,
2211                 .mtu = mtu,
2212         };
2213
2214         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2215 }
2216
2217 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2218         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2219         [RTA_OIF]               = { .type = NLA_U32 },
2220         [RTA_IIF]               = { .type = NLA_U32 },
2221         [RTA_PRIORITY]          = { .type = NLA_U32 },
2222         [RTA_METRICS]           = { .type = NLA_NESTED },
2223 };
2224
2225 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2226                               struct fib6_config *cfg)
2227 {
2228         struct rtmsg *rtm;
2229         struct nlattr *tb[RTA_MAX+1];
2230         int err;
2231
2232         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2233         if (err < 0)
2234                 goto errout;
2235
2236         err = -EINVAL;
2237         rtm = nlmsg_data(nlh);
2238         memset(cfg, 0, sizeof(*cfg));
2239
2240         cfg->fc_table = rtm->rtm_table;
2241         cfg->fc_dst_len = rtm->rtm_dst_len;
2242         cfg->fc_src_len = rtm->rtm_src_len;
2243         cfg->fc_flags = RTF_UP;
2244         cfg->fc_protocol = rtm->rtm_protocol;
2245
2246         if (rtm->rtm_type == RTN_UNREACHABLE)
2247                 cfg->fc_flags |= RTF_REJECT;
2248
2249         if (rtm->rtm_type == RTN_LOCAL)
2250                 cfg->fc_flags |= RTF_LOCAL;
2251
2252         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2253         cfg->fc_nlinfo.nlh = nlh;
2254         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2255
2256         if (tb[RTA_GATEWAY]) {
2257                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2258                 cfg->fc_flags |= RTF_GATEWAY;
2259         }
2260
2261         if (tb[RTA_DST]) {
2262                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2263
2264                 if (nla_len(tb[RTA_DST]) < plen)
2265                         goto errout;
2266
2267                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2268         }
2269
2270         if (tb[RTA_SRC]) {
2271                 int plen = (rtm->rtm_src_len + 7) >> 3;
2272
2273                 if (nla_len(tb[RTA_SRC]) < plen)
2274                         goto errout;
2275
2276                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2277         }
2278
2279         if (tb[RTA_PREFSRC])
2280                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2281
2282         if (tb[RTA_OIF])
2283                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2284
2285         if (tb[RTA_PRIORITY])
2286                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2287
2288         if (tb[RTA_METRICS]) {
2289                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2290                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2291         }
2292
2293         if (tb[RTA_TABLE])
2294                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2295
2296         err = 0;
2297 errout:
2298         return err;
2299 }
2300
2301 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2302 {
2303         struct fib6_config cfg;
2304         int err;
2305
2306         err = rtm_to_fib6_config(skb, nlh, &cfg);
2307         if (err < 0)
2308                 return err;
2309
2310         return ip6_route_del(&cfg);
2311 }
2312
2313 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2314 {
2315         struct fib6_config cfg;
2316         int err;
2317
2318         err = rtm_to_fib6_config(skb, nlh, &cfg);
2319         if (err < 0)
2320                 return err;
2321
2322         return ip6_route_add(&cfg);
2323 }
2324
2325 static inline size_t rt6_nlmsg_size(void)
2326 {
2327         return NLMSG_ALIGN(sizeof(struct rtmsg))
2328                + nla_total_size(16) /* RTA_SRC */
2329                + nla_total_size(16) /* RTA_DST */
2330                + nla_total_size(16) /* RTA_GATEWAY */
2331                + nla_total_size(16) /* RTA_PREFSRC */
2332                + nla_total_size(4) /* RTA_TABLE */
2333                + nla_total_size(4) /* RTA_IIF */
2334                + nla_total_size(4) /* RTA_OIF */
2335                + nla_total_size(4) /* RTA_PRIORITY */
2336                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2337                + nla_total_size(sizeof(struct rta_cacheinfo));
2338 }
2339
2340 static int rt6_fill_node(struct net *net,
2341                          struct sk_buff *skb, struct rt6_info *rt,
2342                          struct in6_addr *dst, struct in6_addr *src,
2343                          int iif, int type, u32 pid, u32 seq,
2344                          int prefix, int nowait, unsigned int flags)
2345 {
2346         struct rtmsg *rtm;
2347         struct nlmsghdr *nlh;
2348         long expires;
2349         u32 table;
2350         struct neighbour *n;
2351
2352         if (prefix) {   /* user wants prefix routes only */
2353                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2354                         /* success since this is not a prefix route */
2355                         return 1;
2356                 }
2357         }
2358
2359         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2360         if (nlh == NULL)
2361                 return -EMSGSIZE;
2362
2363         rtm = nlmsg_data(nlh);
2364         rtm->rtm_family = AF_INET6;
2365         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2366         rtm->rtm_src_len = rt->rt6i_src.plen;
2367         rtm->rtm_tos = 0;
2368         if (rt->rt6i_table)
2369                 table = rt->rt6i_table->tb6_id;
2370         else
2371                 table = RT6_TABLE_UNSPEC;
2372         rtm->rtm_table = table;
2373         NLA_PUT_U32(skb, RTA_TABLE, table);
2374         if (rt->rt6i_flags&RTF_REJECT)
2375                 rtm->rtm_type = RTN_UNREACHABLE;
2376         else if (rt->rt6i_flags&RTF_LOCAL)
2377                 rtm->rtm_type = RTN_LOCAL;
2378         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2379                 rtm->rtm_type = RTN_LOCAL;
2380         else
2381                 rtm->rtm_type = RTN_UNICAST;
2382         rtm->rtm_flags = 0;
2383         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2384         rtm->rtm_protocol = rt->rt6i_protocol;
2385         if (rt->rt6i_flags&RTF_DYNAMIC)
2386                 rtm->rtm_protocol = RTPROT_REDIRECT;
2387         else if (rt->rt6i_flags & RTF_ADDRCONF)
2388                 rtm->rtm_protocol = RTPROT_KERNEL;
2389         else if (rt->rt6i_flags&RTF_DEFAULT)
2390                 rtm->rtm_protocol = RTPROT_RA;
2391
2392         if (rt->rt6i_flags&RTF_CACHE)
2393                 rtm->rtm_flags |= RTM_F_CLONED;
2394
2395         if (dst) {
2396                 NLA_PUT(skb, RTA_DST, 16, dst);
2397                 rtm->rtm_dst_len = 128;
2398         } else if (rtm->rtm_dst_len)
2399                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2400 #ifdef CONFIG_IPV6_SUBTREES
2401         if (src) {
2402                 NLA_PUT(skb, RTA_SRC, 16, src);
2403                 rtm->rtm_src_len = 128;
2404         } else if (rtm->rtm_src_len)
2405                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2406 #endif
2407         if (iif) {
2408 #ifdef CONFIG_IPV6_MROUTE
2409                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2410                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2411                         if (err <= 0) {
2412                                 if (!nowait) {
2413                                         if (err == 0)
2414                                                 return 0;
2415                                         goto nla_put_failure;
2416                                 } else {
2417                                         if (err == -EMSGSIZE)
2418                                                 goto nla_put_failure;
2419                                 }
2420                         }
2421                 } else
2422 #endif
2423                         NLA_PUT_U32(skb, RTA_IIF, iif);
2424         } else if (dst) {
2425                 struct in6_addr saddr_buf;
2426                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2427                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2428         }
2429
2430         if (rt->rt6i_prefsrc.plen) {
2431                 struct in6_addr saddr_buf;
2432                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2433                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2434         }
2435
2436         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2437                 goto nla_put_failure;
2438
2439         rcu_read_lock();
2440         n = dst_get_neighbour(&rt->dst);
2441         if (n)
2442                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2443         rcu_read_unlock();
2444
2445         if (rt->dst.dev)
2446                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2447
2448         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2449
2450         if (!(rt->rt6i_flags & RTF_EXPIRES))
2451                 expires = 0;
2452         else if (rt->rt6i_expires - jiffies < INT_MAX)
2453                 expires = rt->rt6i_expires - jiffies;
2454         else
2455                 expires = INT_MAX;
2456
2457         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2458                                expires, rt->dst.error) < 0)
2459                 goto nla_put_failure;
2460
2461         return nlmsg_end(skb, nlh);
2462
2463 nla_put_failure:
2464         nlmsg_cancel(skb, nlh);
2465         return -EMSGSIZE;
2466 }
2467
2468 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2469 {
2470         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2471         int prefix;
2472
2473         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2474                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2475                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2476         } else
2477                 prefix = 0;
2478
2479         return rt6_fill_node(arg->net,
2480                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2481                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2482                      prefix, 0, NLM_F_MULTI);
2483 }
2484
2485 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2486 {
2487         struct net *net = sock_net(in_skb->sk);
2488         struct nlattr *tb[RTA_MAX+1];
2489         struct rt6_info *rt;
2490         struct sk_buff *skb;
2491         struct rtmsg *rtm;
2492         struct flowi6 fl6;
2493         int err, iif = 0;
2494
2495         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2496         if (err < 0)
2497                 goto errout;
2498
2499         err = -EINVAL;
2500         memset(&fl6, 0, sizeof(fl6));
2501
2502         if (tb[RTA_SRC]) {
2503                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2504                         goto errout;
2505
2506                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2507         }
2508
2509         if (tb[RTA_DST]) {
2510                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2511                         goto errout;
2512
2513                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2514         }
2515
2516         if (tb[RTA_IIF])
2517                 iif = nla_get_u32(tb[RTA_IIF]);
2518
2519         if (tb[RTA_OIF])
2520                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2521
2522         if (iif) {
2523                 struct net_device *dev;
2524                 dev = __dev_get_by_index(net, iif);
2525                 if (!dev) {
2526                         err = -ENODEV;
2527                         goto errout;
2528                 }
2529         }
2530
2531         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2532         if (skb == NULL) {
2533                 err = -ENOBUFS;
2534                 goto errout;
2535         }
2536
2537         /* Reserve room for dummy headers, this skb can pass
2538            through good chunk of routing engine.
2539          */
2540         skb_reset_mac_header(skb);
2541         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2542
2543         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2544         skb_dst_set(skb, &rt->dst);
2545
2546         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2547                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2548                             nlh->nlmsg_seq, 0, 0, 0);
2549         if (err < 0) {
2550                 kfree_skb(skb);
2551                 goto errout;
2552         }
2553
2554         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2555 errout:
2556         return err;
2557 }
2558
2559 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2560 {
2561         struct sk_buff *skb;
2562         struct net *net = info->nl_net;
2563         u32 seq;
2564         int err;
2565
2566         err = -ENOBUFS;
2567         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2568
2569         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2570         if (skb == NULL)
2571                 goto errout;
2572
2573         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2574                                 event, info->pid, seq, 0, 0, 0);
2575         if (err < 0) {
2576                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2577                 WARN_ON(err == -EMSGSIZE);
2578                 kfree_skb(skb);
2579                 goto errout;
2580         }
2581         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2582                     info->nlh, gfp_any());
2583         return;
2584 errout:
2585         if (err < 0)
2586                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2587 }
2588
2589 static int ip6_route_dev_notify(struct notifier_block *this,
2590                                 unsigned long event, void *data)
2591 {
2592         struct net_device *dev = (struct net_device *)data;
2593         struct net *net = dev_net(dev);
2594
2595         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2596                 net->ipv6.ip6_null_entry->dst.dev = dev;
2597                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2598 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2599                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2600                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2601                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2602                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2603 #endif
2604         }
2605
2606         return NOTIFY_OK;
2607 }
2608
2609 /*
2610  *      /proc
2611  */
2612
2613 #ifdef CONFIG_PROC_FS
2614
/*
 * Bookkeeping for a buffer-based /proc dump.
 *
 * NOTE(review): nothing in the visible remainder of this file refers to this
 * structure (the /proc handlers below use seq_file); it looks like a
 * leftover - confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2623
2624 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2625 {
2626         struct seq_file *m = p_arg;
2627         struct neighbour *n;
2628
2629         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2630
2631 #ifdef CONFIG_IPV6_SUBTREES
2632         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2633 #else
2634         seq_puts(m, "00000000000000000000000000000000 00 ");
2635 #endif
2636         rcu_read_lock();
2637         n = dst_get_neighbour(&rt->dst);
2638         if (n) {
2639                 seq_printf(m, "%pi6", n->primary_key);
2640         } else {
2641                 seq_puts(m, "00000000000000000000000000000000");
2642         }
2643         rcu_read_unlock();
2644         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2645                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2646                    rt->dst.__use, rt->rt6i_flags,
2647                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2648         return 0;
2649 }
2650
2651 static int ipv6_route_show(struct seq_file *m, void *v)
2652 {
2653         struct net *net = (struct net *)m->private;
2654         fib6_clean_all(net, rt6_info_route, 0, m);
2655         return 0;
2656 }
2657
2658 static int ipv6_route_open(struct inode *inode, struct file *file)
2659 {
2660         return single_open_net(inode, file, ipv6_route_show);
2661 }
2662
2663 static const struct file_operations ipv6_route_proc_fops = {
2664         .owner          = THIS_MODULE,
2665         .open           = ipv6_route_open,
2666         .read           = seq_read,
2667         .llseek         = seq_lseek,
2668         .release        = single_release_net,
2669 };
2670
2671 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2672 {
2673         struct net *net = (struct net *)seq->private;
2674         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2675                    net->ipv6.rt6_stats->fib_nodes,
2676                    net->ipv6.rt6_stats->fib_route_nodes,
2677                    net->ipv6.rt6_stats->fib_rt_alloc,
2678                    net->ipv6.rt6_stats->fib_rt_entries,
2679                    net->ipv6.rt6_stats->fib_rt_cache,
2680                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2681                    net->ipv6.rt6_stats->fib_discarded_routes);
2682
2683         return 0;
2684 }
2685
2686 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2687 {
2688         return single_open_net(inode, file, rt6_stats_seq_show);
2689 }
2690
2691 static const struct file_operations rt6_stats_seq_fops = {
2692         .owner   = THIS_MODULE,
2693         .open    = rt6_stats_seq_open,
2694         .read    = seq_read,
2695         .llseek  = seq_lseek,
2696         .release = single_release_net,
2697 };
2698 #endif  /* CONFIG_PROC_FS */
2699
2700 #ifdef CONFIG_SYSCTL
2701
2702 static
2703 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2704                               void __user *buffer, size_t *lenp, loff_t *ppos)
2705 {
2706         struct net *net;
2707         int delay;
2708         if (!write)
2709                 return -EINVAL;
2710
2711         net = (struct net *)ctl->extra1;
2712         delay = net->ipv6.sysctl.flush_delay;
2713         proc_dointvec(ctl, write, buffer, lenp, ppos);
2714         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2715         return 0;
2716 }
2717
2718 ctl_table ipv6_route_table_template[] = {
2719         {
2720                 .procname       =       "flush",
2721                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2722                 .maxlen         =       sizeof(int),
2723                 .mode           =       0200,
2724                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2725         },
2726         {
2727                 .procname       =       "gc_thresh",
2728                 .data           =       &ip6_dst_ops_template.gc_thresh,
2729                 .maxlen         =       sizeof(int),
2730                 .mode           =       0644,
2731                 .proc_handler   =       proc_dointvec,
2732         },
2733         {
2734                 .procname       =       "max_size",
2735                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2736                 .maxlen         =       sizeof(int),
2737                 .mode           =       0644,
2738                 .proc_handler   =       proc_dointvec,
2739         },
2740         {
2741                 .procname       =       "gc_min_interval",
2742                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2743                 .maxlen         =       sizeof(int),
2744                 .mode           =       0644,
2745                 .proc_handler   =       proc_dointvec_jiffies,
2746         },
2747         {
2748                 .procname       =       "gc_timeout",
2749                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2750                 .maxlen         =       sizeof(int),
2751                 .mode           =       0644,
2752                 .proc_handler   =       proc_dointvec_jiffies,
2753         },
2754         {
2755                 .procname       =       "gc_interval",
2756                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2757                 .maxlen         =       sizeof(int),
2758                 .mode           =       0644,
2759                 .proc_handler   =       proc_dointvec_jiffies,
2760         },
2761         {
2762                 .procname       =       "gc_elasticity",
2763                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2764                 .maxlen         =       sizeof(int),
2765                 .mode           =       0644,
2766                 .proc_handler   =       proc_dointvec,
2767         },
2768         {
2769                 .procname       =       "mtu_expires",
2770                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2771                 .maxlen         =       sizeof(int),
2772                 .mode           =       0644,
2773                 .proc_handler   =       proc_dointvec_jiffies,
2774         },
2775         {
2776                 .procname       =       "min_adv_mss",
2777                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2778                 .maxlen         =       sizeof(int),
2779                 .mode           =       0644,
2780                 .proc_handler   =       proc_dointvec,
2781         },
2782         {
2783                 .procname       =       "gc_min_interval_ms",
2784                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2785                 .maxlen         =       sizeof(int),
2786                 .mode           =       0644,
2787                 .proc_handler   =       proc_dointvec_ms_jiffies,
2788         },
2789         { }
2790 };
2791
2792 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2793 {
2794         struct ctl_table *table;
2795
2796         table = kmemdup(ipv6_route_table_template,
2797                         sizeof(ipv6_route_table_template),
2798                         GFP_KERNEL);
2799
2800         if (table) {
2801                 table[0].data = &net->ipv6.sysctl.flush_delay;
2802                 table[0].extra1 = net;
2803                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2804                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2805                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2806                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2807                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2808                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2809                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2810                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2811                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2812         }
2813
2814         return table;
2815 }
2816 #endif
2817
2818 static int __net_init ip6_route_net_init(struct net *net)
2819 {
2820         int ret = -ENOMEM;
2821
2822         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2823                sizeof(net->ipv6.ip6_dst_ops));
2824
2825         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2826                 goto out_ip6_dst_ops;
2827
2828         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2829                                            sizeof(*net->ipv6.ip6_null_entry),
2830                                            GFP_KERNEL);
2831         if (!net->ipv6.ip6_null_entry)
2832                 goto out_ip6_dst_entries;
2833         net->ipv6.ip6_null_entry->dst.path =
2834                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2835         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2836         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2837                          ip6_template_metrics, true);
2838
2839 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2840         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2841                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2842                                                GFP_KERNEL);
2843         if (!net->ipv6.ip6_prohibit_entry)
2844                 goto out_ip6_null_entry;
2845         net->ipv6.ip6_prohibit_entry->dst.path =
2846                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2847         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2848         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2849                          ip6_template_metrics, true);
2850
2851         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2852                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2853                                                GFP_KERNEL);
2854         if (!net->ipv6.ip6_blk_hole_entry)
2855                 goto out_ip6_prohibit_entry;
2856         net->ipv6.ip6_blk_hole_entry->dst.path =
2857                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2858         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2859         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2860                          ip6_template_metrics, true);
2861 #endif
2862
2863         net->ipv6.sysctl.flush_delay = 0;
2864         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2865         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2866         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2867         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2868         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2869         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2870         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2871
2872 #ifdef CONFIG_PROC_FS
2873         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2874         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2875 #endif
2876         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2877
2878         ret = 0;
2879 out:
2880         return ret;
2881
2882 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2883 out_ip6_prohibit_entry:
2884         kfree(net->ipv6.ip6_prohibit_entry);
2885 out_ip6_null_entry:
2886         kfree(net->ipv6.ip6_null_entry);
2887 #endif
2888 out_ip6_dst_entries:
2889         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2890 out_ip6_dst_ops:
2891         goto out;
2892 }
2893
2894 static void __net_exit ip6_route_net_exit(struct net *net)
2895 {
2896 #ifdef CONFIG_PROC_FS
2897         proc_net_remove(net, "ipv6_route");
2898         proc_net_remove(net, "rt6_stats");
2899 #endif
2900         kfree(net->ipv6.ip6_null_entry);
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902         kfree(net->ipv6.ip6_prohibit_entry);
2903         kfree(net->ipv6.ip6_blk_hole_entry);
2904 #endif
2905         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2906 }
2907
2908 static struct pernet_operations ip6_route_net_ops = {
2909         .init = ip6_route_net_init,
2910         .exit = ip6_route_net_exit,
2911 };
2912
2913 static struct notifier_block ip6_route_dev_notifier = {
2914         .notifier_call = ip6_route_dev_notify,
2915         .priority = 0,
2916 };
2917
2918 int __init ip6_route_init(void)
2919 {
2920         int ret;
2921
2922         ret = -ENOMEM;
2923         ip6_dst_ops_template.kmem_cachep =
2924                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2925                                   SLAB_HWCACHE_ALIGN, NULL);
2926         if (!ip6_dst_ops_template.kmem_cachep)
2927                 goto out;
2928
2929         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2930         if (ret)
2931                 goto out_kmem_cache;
2932
2933         ret = register_pernet_subsys(&ip6_route_net_ops);
2934         if (ret)
2935                 goto out_dst_entries;
2936
2937         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2938
2939         /* Registering of the loopback is done before this portion of code,
2940          * the loopback reference in rt6_info will not be taken, do it
2941          * manually for init_net */
2942         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2943         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2944   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2945         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2946         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2947         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2948         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2949   #endif
2950         ret = fib6_init();
2951         if (ret)
2952                 goto out_register_subsys;
2953
2954         ret = xfrm6_init();
2955         if (ret)
2956                 goto out_fib6_init;
2957
2958         ret = fib6_rules_init();
2959         if (ret)
2960                 goto xfrm6_init;
2961
2962         ret = -ENOBUFS;
2963         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2964             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2965             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2966                 goto fib6_rules_init;
2967
2968         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2969         if (ret)
2970                 goto fib6_rules_init;
2971
2972 out:
2973         return ret;
2974
2975 fib6_rules_init:
2976         fib6_rules_cleanup();
2977 xfrm6_init:
2978         xfrm6_fini();
2979 out_fib6_init:
2980         fib6_gc_cleanup();
2981 out_register_subsys:
2982         unregister_pernet_subsys(&ip6_route_net_ops);
2983 out_dst_entries:
2984         dst_entries_destroy(&ip6_dst_blackhole_ops);
2985 out_kmem_cache:
2986         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2987         goto out;
2988 }
2989
2990 void ip6_route_cleanup(void)
2991 {
2992         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2993         fib6_rules_cleanup();
2994         xfrm6_fini();
2995         fib6_gc_cleanup();
2996         unregister_pernet_subsys(&ip6_route_net_ops);
2997         dst_entries_destroy(&ip6_dst_blackhole_ops);
2998         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2999 }