net: Loopback ifindex is constant now
[linux-3.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
82                                            struct sk_buff *skb, u32 mtu);
83 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
84                                         struct sk_buff *skb);
85
86 #ifdef CONFIG_IPV6_ROUTE_INFO
87 static struct rt6_info *rt6_add_route_info(struct net *net,
88                                            const struct in6_addr *prefix, int prefixlen,
89                                            const struct in6_addr *gwaddr, int ifindex,
90                                            unsigned int pref);
91 static struct rt6_info *rt6_get_route_info(struct net *net,
92                                            const struct in6_addr *prefix, int prefixlen,
93                                            const struct in6_addr *gwaddr, int ifindex);
94 #endif
95
96 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
97 {
98         struct rt6_info *rt = (struct rt6_info *) dst;
99         struct inet_peer *peer;
100         u32 *p = NULL;
101
102         if (!(rt->dst.flags & DST_HOST))
103                 return NULL;
104
105         peer = rt6_get_peer_create(rt);
106         if (peer) {
107                 u32 *old_p = __DST_METRICS_PTR(old);
108                 unsigned long prev, new;
109
110                 p = peer->metrics;
111                 if (inet_metrics_new(peer))
112                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
113
114                 new = (unsigned long) p;
115                 prev = cmpxchg(&dst->_metrics, old, new);
116
117                 if (prev != old) {
118                         p = __DST_METRICS_PTR(prev);
119                         if (prev & DST_METRICS_READ_ONLY)
120                                 p = NULL;
121                 }
122         }
123         return p;
124 }
125
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
127                                              struct sk_buff *skb,
128                                              const void *daddr)
129 {
130         struct in6_addr *p = &rt->rt6i_gateway;
131
132         if (!ipv6_addr_any(p))
133                 return (const void *) p;
134         else if (skb)
135                 return &ipv6_hdr(skb)->daddr;
136         return daddr;
137 }
138
139 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
140                                           struct sk_buff *skb,
141                                           const void *daddr)
142 {
143         struct rt6_info *rt = (struct rt6_info *) dst;
144         struct neighbour *n;
145
146         daddr = choose_neigh_daddr(rt, skb, daddr);
147         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
148         if (n)
149                 return n;
150         return neigh_create(&nd_tbl, daddr, dst->dev);
151 }
152
153 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
154 {
155         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
156         if (!n) {
157                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
158                 if (IS_ERR(n))
159                         return PTR_ERR(n);
160         }
161         rt->n = n;
162
163         return 0;
164 }
165
166 static struct dst_ops ip6_dst_ops_template = {
167         .family                 =       AF_INET6,
168         .protocol               =       cpu_to_be16(ETH_P_IPV6),
169         .gc                     =       ip6_dst_gc,
170         .gc_thresh              =       1024,
171         .check                  =       ip6_dst_check,
172         .default_advmss         =       ip6_default_advmss,
173         .mtu                    =       ip6_mtu,
174         .cow_metrics            =       ipv6_cow_metrics,
175         .destroy                =       ip6_dst_destroy,
176         .ifdown                 =       ip6_dst_ifdown,
177         .negative_advice        =       ip6_negative_advice,
178         .link_failure           =       ip6_link_failure,
179         .update_pmtu            =       ip6_rt_update_pmtu,
180         .redirect               =       rt6_do_redirect,
181         .local_out              =       __ip6_local_out,
182         .neigh_lookup           =       ip6_neigh_lookup,
183 };
184
185 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
186 {
187         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
188
189         return mtu ? : dst->dev->mtu;
190 }
191
192 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
193                                          struct sk_buff *skb, u32 mtu)
194 {
195 }
196
/* .redirect callback for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* nothing to do */
}
201
202 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
203                                          unsigned long old)
204 {
205         return NULL;
206 }
207
208 static struct dst_ops ip6_dst_blackhole_ops = {
209         .family                 =       AF_INET6,
210         .protocol               =       cpu_to_be16(ETH_P_IPV6),
211         .destroy                =       ip6_dst_destroy,
212         .check                  =       ip6_dst_check,
213         .mtu                    =       ip6_blackhole_mtu,
214         .default_advmss         =       ip6_default_advmss,
215         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
216         .redirect               =       ip6_rt_blackhole_redirect,
217         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
218         .neigh_lookup           =       ip6_neigh_lookup,
219 };
220
221 static const u32 ip6_template_metrics[RTAX_MAX] = {
222         [RTAX_HOPLIMIT - 1] = 255,
223 };
224
225 static struct rt6_info ip6_null_entry_template = {
226         .dst = {
227                 .__refcnt       = ATOMIC_INIT(1),
228                 .__use          = 1,
229                 .obsolete       = -1,
230                 .error          = -ENETUNREACH,
231                 .input          = ip6_pkt_discard,
232                 .output         = ip6_pkt_discard_out,
233         },
234         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
235         .rt6i_protocol  = RTPROT_KERNEL,
236         .rt6i_metric    = ~(u32) 0,
237         .rt6i_ref       = ATOMIC_INIT(1),
238 };
239
240 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
241
242 static int ip6_pkt_prohibit(struct sk_buff *skb);
243 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
244
245 static struct rt6_info ip6_prohibit_entry_template = {
246         .dst = {
247                 .__refcnt       = ATOMIC_INIT(1),
248                 .__use          = 1,
249                 .obsolete       = -1,
250                 .error          = -EACCES,
251                 .input          = ip6_pkt_prohibit,
252                 .output         = ip6_pkt_prohibit_out,
253         },
254         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
255         .rt6i_protocol  = RTPROT_KERNEL,
256         .rt6i_metric    = ~(u32) 0,
257         .rt6i_ref       = ATOMIC_INIT(1),
258 };
259
260 static struct rt6_info ip6_blk_hole_entry_template = {
261         .dst = {
262                 .__refcnt       = ATOMIC_INIT(1),
263                 .__use          = 1,
264                 .obsolete       = -1,
265                 .error          = -EINVAL,
266                 .input          = dst_discard,
267                 .output         = dst_discard,
268         },
269         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
270         .rt6i_protocol  = RTPROT_KERNEL,
271         .rt6i_metric    = ~(u32) 0,
272         .rt6i_ref       = ATOMIC_INIT(1),
273 };
274
275 #endif
276
277 /* allocate dst with ip6_dst_ops */
278 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
279                                              struct net_device *dev,
280                                              int flags,
281                                              struct fib6_table *table)
282 {
283         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
284                                         0, DST_OBSOLETE_NONE, flags);
285
286         if (rt) {
287                 struct dst_entry *dst = &rt->dst;
288
289                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
290                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
291         }
292         return rt;
293 }
294
295 static void ip6_dst_destroy(struct dst_entry *dst)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299
300         if (rt->n)
301                 neigh_release(rt->n);
302
303         if (!(rt->dst.flags & DST_HOST))
304                 dst_destroy_metrics_generic(dst);
305
306         if (idev) {
307                 rt->rt6i_idev = NULL;
308                 in6_dev_put(idev);
309         }
310
311         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
312                 dst_release(dst->from);
313
314         if (rt6_has_peer(rt)) {
315                 struct inet_peer *peer = rt6_peer_ptr(rt);
316                 inet_putpeer(peer);
317         }
318 }
319
320 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
321
322 static u32 rt6_peer_genid(void)
323 {
324         return atomic_read(&__rt6_peer_genid);
325 }
326
327 void rt6_bind_peer(struct rt6_info *rt, int create)
328 {
329         struct inet_peer_base *base;
330         struct inet_peer *peer;
331
332         base = inetpeer_base_ptr(rt->_rt6i_peer);
333         if (!base)
334                 return;
335
336         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
337         if (peer) {
338                 if (!rt6_set_peer(rt, peer))
339                         inet_putpeer(peer);
340                 else
341                         rt->rt6i_peer_genid = rt6_peer_genid();
342         }
343 }
344
345 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
346                            int how)
347 {
348         struct rt6_info *rt = (struct rt6_info *)dst;
349         struct inet6_dev *idev = rt->rt6i_idev;
350         struct net_device *loopback_dev =
351                 dev_net(dev)->loopback_dev;
352
353         if (dev != loopback_dev) {
354                 if (idev && idev->dev == dev) {
355                         struct inet6_dev *loopback_idev =
356                                 in6_dev_get(loopback_dev);
357                         if (loopback_idev) {
358                                 rt->rt6i_idev = loopback_idev;
359                                 in6_dev_put(idev);
360                         }
361                 }
362                 if (rt->n && rt->n->dev == dev) {
363                         rt->n->dev = loopback_dev;
364                         dev_hold(loopback_dev);
365                         dev_put(dev);
366                 }
367         }
368 }
369
370 static bool rt6_check_expired(const struct rt6_info *rt)
371 {
372         struct rt6_info *ort = NULL;
373
374         if (rt->rt6i_flags & RTF_EXPIRES) {
375                 if (time_after(jiffies, rt->dst.expires))
376                         return true;
377         } else if (rt->dst.from) {
378                 ort = (struct rt6_info *) rt->dst.from;
379                 return (ort->rt6i_flags & RTF_EXPIRES) &&
380                         time_after(jiffies, ort->dst.expires);
381         }
382         return false;
383 }
384
385 static bool rt6_need_strict(const struct in6_addr *daddr)
386 {
387         return ipv6_addr_type(daddr) &
388                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
389 }
390
391 /*
392  *      Route lookup. Any table->tb6_lock is implied.
393  */
394
395 static inline struct rt6_info *rt6_device_match(struct net *net,
396                                                     struct rt6_info *rt,
397                                                     const struct in6_addr *saddr,
398                                                     int oif,
399                                                     int flags)
400 {
401         struct rt6_info *local = NULL;
402         struct rt6_info *sprt;
403
404         if (!oif && ipv6_addr_any(saddr))
405                 goto out;
406
407         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
408                 struct net_device *dev = sprt->dst.dev;
409
410                 if (oif) {
411                         if (dev->ifindex == oif)
412                                 return sprt;
413                         if (dev->flags & IFF_LOOPBACK) {
414                                 if (!sprt->rt6i_idev ||
415                                     sprt->rt6i_idev->dev->ifindex != oif) {
416                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
417                                                 continue;
418                                         if (local && (!oif ||
419                                                       local->rt6i_idev->dev->ifindex == oif))
420                                                 continue;
421                                 }
422                                 local = sprt;
423                         }
424                 } else {
425                         if (ipv6_chk_addr(net, saddr, dev,
426                                           flags & RT6_LOOKUP_F_IFACE))
427                                 return sprt;
428                 }
429         }
430
431         if (oif) {
432                 if (local)
433                         return local;
434
435                 if (flags & RT6_LOOKUP_F_IFACE)
436                         return net->ipv6.ip6_null_entry;
437         }
438 out:
439         return rt;
440 }
441
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the route's neighbour entry is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the neighbour's
 * solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; neigh->updated is used as the rate-limit timestamp.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? rt->n : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified below, so the write side of the lock
	 * is required: with only the read side held, concurrent callers
	 * could both pass the interval check and both send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* send the solicitation outside the lock */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
481
482 /*
483  * Default Router Selection (RFC 2461 6.3.6)
484  */
485 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
486 {
487         struct net_device *dev = rt->dst.dev;
488         if (!oif || dev->ifindex == oif)
489                 return 2;
490         if ((dev->flags & IFF_LOOPBACK) &&
491             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
492                 return 1;
493         return 0;
494 }
495
496 static inline int rt6_check_neigh(struct rt6_info *rt)
497 {
498         struct neighbour *neigh;
499         int m;
500
501         rcu_read_lock();
502         neigh = rt->n;
503         if (rt->rt6i_flags & RTF_NONEXTHOP ||
504             !(rt->rt6i_flags & RTF_GATEWAY))
505                 m = 1;
506         else if (neigh) {
507                 read_lock_bh(&neigh->lock);
508                 if (neigh->nud_state & NUD_VALID)
509                         m = 2;
510 #ifdef CONFIG_IPV6_ROUTER_PREF
511                 else if (neigh->nud_state & NUD_FAILED)
512                         m = 0;
513 #endif
514                 else
515                         m = 1;
516                 read_unlock_bh(&neigh->lock);
517         } else
518                 m = 0;
519         rcu_read_unlock();
520         return m;
521 }
522
523 static int rt6_score_route(struct rt6_info *rt, int oif,
524                            int strict)
525 {
526         int m, n;
527
528         m = rt6_check_dev(rt, oif);
529         if (!m && (strict & RT6_LOOKUP_F_IFACE))
530                 return -1;
531 #ifdef CONFIG_IPV6_ROUTER_PREF
532         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
533 #endif
534         n = rt6_check_neigh(rt);
535         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
536                 return -1;
537         return m;
538 }
539
540 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
541                                    int *mpri, struct rt6_info *match)
542 {
543         int m;
544
545         if (rt6_check_expired(rt))
546                 goto out;
547
548         m = rt6_score_route(rt, oif, strict);
549         if (m < 0)
550                 goto out;
551
552         if (m > *mpri) {
553                 if (strict & RT6_LOOKUP_F_REACHABLE)
554                         rt6_probe(match);
555                 *mpri = m;
556                 match = rt;
557         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
558                 rt6_probe(rt);
559         }
560
561 out:
562         return match;
563 }
564
565 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
566                                      struct rt6_info *rr_head,
567                                      u32 metric, int oif, int strict)
568 {
569         struct rt6_info *rt, *match;
570         int mpri = -1;
571
572         match = NULL;
573         for (rt = rr_head; rt && rt->rt6i_metric == metric;
574              rt = rt->dst.rt6_next)
575                 match = find_match(rt, oif, strict, &mpri, match);
576         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
577              rt = rt->dst.rt6_next)
578                 match = find_match(rt, oif, strict, &mpri, match);
579
580         return match;
581 }
582
583 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
584 {
585         struct rt6_info *match, *rt0;
586         struct net *net;
587
588         rt0 = fn->rr_ptr;
589         if (!rt0)
590                 fn->rr_ptr = rt0 = fn->leaf;
591
592         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
593
594         if (!match &&
595             (strict & RT6_LOOKUP_F_REACHABLE)) {
596                 struct rt6_info *next = rt0->dst.rt6_next;
597
598                 /* no entries matched; do round-robin */
599                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
600                         next = fn->leaf;
601
602                 if (next != rt0)
603                         fn->rr_ptr = next;
604         }
605
606         net = dev_net(rt0->dst.dev);
607         return match ? match : net->ipv6.ip6_null_entry;
608 }
609
610 #ifdef CONFIG_IPV6_ROUTE_INFO
611 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
612                   const struct in6_addr *gwaddr)
613 {
614         struct net *net = dev_net(dev);
615         struct route_info *rinfo = (struct route_info *) opt;
616         struct in6_addr prefix_buf, *prefix;
617         unsigned int pref;
618         unsigned long lifetime;
619         struct rt6_info *rt;
620
621         if (len < sizeof(struct route_info)) {
622                 return -EINVAL;
623         }
624
625         /* Sanity check for prefix_len and length */
626         if (rinfo->length > 3) {
627                 return -EINVAL;
628         } else if (rinfo->prefix_len > 128) {
629                 return -EINVAL;
630         } else if (rinfo->prefix_len > 64) {
631                 if (rinfo->length < 2) {
632                         return -EINVAL;
633                 }
634         } else if (rinfo->prefix_len > 0) {
635                 if (rinfo->length < 1) {
636                         return -EINVAL;
637                 }
638         }
639
640         pref = rinfo->route_pref;
641         if (pref == ICMPV6_ROUTER_PREF_INVALID)
642                 return -EINVAL;
643
644         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
645
646         if (rinfo->length == 3)
647                 prefix = (struct in6_addr *)rinfo->prefix;
648         else {
649                 /* this function is safe */
650                 ipv6_addr_prefix(&prefix_buf,
651                                  (struct in6_addr *)rinfo->prefix,
652                                  rinfo->prefix_len);
653                 prefix = &prefix_buf;
654         }
655
656         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
657                                 dev->ifindex);
658
659         if (rt && !lifetime) {
660                 ip6_del_rt(rt);
661                 rt = NULL;
662         }
663
664         if (!rt && lifetime)
665                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
666                                         pref);
667         else if (rt)
668                 rt->rt6i_flags = RTF_ROUTEINFO |
669                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
670
671         if (rt) {
672                 if (!addrconf_finite_timeout(lifetime))
673                         rt6_clean_expires(rt);
674                 else
675                         rt6_set_expires(rt, jiffies + HZ * lifetime);
676
677                 dst_release(&rt->dst);
678         }
679         return 0;
680 }
681 #endif
682
/*
 * If the lookup ended on the null entry, climb from the current node
 * towards the root until a node carrying RTN_RTINFO is found, then jump
 * to the caller's `restart` label.  When a parent has a subtree other
 * than the current node, that subtree is looked up with @saddr instead
 * of stepping to the parent.  Reaching a node flagged RTN_TL_ROOT jumps
 * to the caller's `out` label.
 *
 * Expands in place: relies on the caller's `rt` and `fn` variables and on
 * its `restart` and `out` labels.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn),	\
						 NULL, saddr);		\
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
700
701 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
702                                              struct fib6_table *table,
703                                              struct flowi6 *fl6, int flags)
704 {
705         struct fib6_node *fn;
706         struct rt6_info *rt;
707
708         read_lock_bh(&table->tb6_lock);
709         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
710 restart:
711         rt = fn->leaf;
712         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
713         BACKTRACK(net, &fl6->saddr);
714 out:
715         dst_use(&rt->dst, jiffies);
716         read_unlock_bh(&table->tb6_lock);
717         return rt;
718
719 }
720
721 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
722                                     int flags)
723 {
724         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
725 }
726 EXPORT_SYMBOL_GPL(ip6_route_lookup);
727
728 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
729                             const struct in6_addr *saddr, int oif, int strict)
730 {
731         struct flowi6 fl6 = {
732                 .flowi6_oif = oif,
733                 .daddr = *daddr,
734         };
735         struct dst_entry *dst;
736         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
737
738         if (saddr) {
739                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
740                 flags |= RT6_LOOKUP_F_HAS_SADDR;
741         }
742
743         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
744         if (dst->error == 0)
745                 return (struct rt6_info *) dst;
746
747         dst_release(dst);
748
749         return NULL;
750 }
751
752 EXPORT_SYMBOL(rt6_lookup);
753
/* ip6_ins_rt is called with table->tb6_lock NOT held (the lock is taken
   internally). It takes a new route entry; if the addition fails for any
   reason, the route is freed. In any case, if the caller does not hold a
   reference to the route, it may be destroyed.
 */
759
760 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
761 {
762         int err;
763         struct fib6_table *table;
764
765         table = rt->rt6i_table;
766         write_lock_bh(&table->tb6_lock);
767         err = fib6_add(&table->tb6_root, rt, info);
768         write_unlock_bh(&table->tb6_lock);
769
770         return err;
771 }
772
773 int ip6_ins_rt(struct rt6_info *rt)
774 {
775         struct nl_info info = {
776                 .nl_net = dev_net(rt->dst.dev),
777         };
778         return __ip6_ins_rt(rt, &info);
779 }
780
781 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
782                                       const struct in6_addr *daddr,
783                                       const struct in6_addr *saddr)
784 {
785         struct rt6_info *rt;
786
787         /*
788          *      Clone the route.
789          */
790
791         rt = ip6_rt_copy(ort, daddr);
792
793         if (rt) {
794                 int attempts = !in_softirq();
795
796                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
797                         if (ort->rt6i_dst.plen != 128 &&
798                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
799                                 rt->rt6i_flags |= RTF_ANYCAST;
800                         rt->rt6i_gateway = *daddr;
801                 }
802
803                 rt->rt6i_flags |= RTF_CACHE;
804
805 #ifdef CONFIG_IPV6_SUBTREES
806                 if (rt->rt6i_src.plen && saddr) {
807                         rt->rt6i_src.addr = *saddr;
808                         rt->rt6i_src.plen = 128;
809                 }
810 #endif
811
812         retry:
813                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
814                         struct net *net = dev_net(rt->dst.dev);
815                         int saved_rt_min_interval =
816                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
817                         int saved_rt_elasticity =
818                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
819
820                         if (attempts-- > 0) {
821                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
822                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
823
824                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
825
826                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
827                                         saved_rt_elasticity;
828                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
829                                         saved_rt_min_interval;
830                                 goto retry;
831                         }
832
833                         net_warn_ratelimited("Neighbour table overflow\n");
834                         dst_free(&rt->dst);
835                         return NULL;
836                 }
837         }
838
839         return rt;
840 }
841
842 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
843                                         const struct in6_addr *daddr)
844 {
845         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
846
847         if (rt) {
848                 rt->rt6i_flags |= RTF_CACHE;
849                 rt->n = neigh_clone(ort->n);
850         }
851         return rt;
852 }
853
/*
 * ip6_pol_route - route lookup shared by the input and output paths.
 *
 * Looks up fl6->daddr / fl6->saddr in @table and picks a route with
 * rt6_select().  Unless the pick is the namespace's ip6_null_entry or is
 * already an RTF_CACHE route, a per-destination copy is made with
 * rt6_alloc_cow() or rt6_alloc_clone() and inserted with ip6_ins_rt().
 *
 * @net:   namespace supplying ip6_null_entry and the forwarding setting
 * @table: FIB table to search; tb6_lock is read-locked around the lookup
 * @oif:   interface index handed to rt6_select()
 * @fl6:   flow supplying destination and source addresses
 * @flags: lookup flags; only RT6_LOOKUP_F_IFACE is consumed here
 *
 * Every return path goes through out2 after a dst_hold() on the returned
 * route, and out2 refreshes dst.lastuse and bumps dst.__use.
 *
 * NOTE(review): BACKTRACK() is a macro defined outside this view.  The
 * "restart" label has no visible goto in this function, so the macro
 * presumably jumps to it (and possibly to "out") and uses the locals
 * fn/rt by name -- confirm before renaming anything here.
 */
854 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
855                                       struct flowi6 *fl6, int flags)
856 {
857         struct fib6_node *fn;
858         struct rt6_info *rt, *nrt;
859         int strict = 0;
            /* Upper bound on relookups after a failed clone/insert. */
860         int attempts = 3;
861         int err;
            /* Ask rt6_select() for reachable routers only when forwarding is off. */
862         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
863
864         strict |= flags & RT6_LOOKUP_F_IFACE;
865
866 relookup:
867         read_lock_bh(&table->tb6_lock);
868
869 restart_2:
870         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
871
872 restart:
873         rt = rt6_select(fn, oif, strict | reachable);
874
875         BACKTRACK(net, &fl6->saddr);
            /* Null entry and existing cache entries are returned as they are. */
876         if (rt == net->ipv6.ip6_null_entry ||
877             rt->rt6i_flags & RTF_CACHE)
878                 goto out;
879
            /* Pin the route before the table lock is dropped. */
880         dst_hold(&rt->dst);
881         read_unlock_bh(&table->tb6_lock);
882
883         if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
884                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
885         else if (!(rt->dst.flags & DST_HOST))
886                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
887         else
888                 goto out2;
889
            /* Switch from the original route to the copy (or the null entry if the copy failed). */
890         dst_release(&rt->dst);
891         rt = nrt ? : net->ipv6.ip6_null_entry;
892
893         dst_hold(&rt->dst);
894         if (nrt) {
895                 err = ip6_ins_rt(nrt);
896                 if (!err)
897                         goto out2;
898         }
899
900         if (--attempts <= 0)
901                 goto out2;
902
903         /*
904          * Race condition! In the gap, when table->tb6_lock was
905          * released someone could insert this route.  Relookup.
906          */
907         dst_release(&rt->dst);
908         goto relookup;
909
910 out:
            /*
             * First pass asked for reachable routers only; repeat the lookup
             * once without that restriction (the read lock is still held).
             */
911         if (reachable) {
912                 reachable = 0;
913                 goto restart_2;
914         }
915         dst_hold(&rt->dst);
916         read_unlock_bh(&table->tb6_lock);
917 out2:
918         rt->dst.lastuse = jiffies;
919         rt->dst.__use++;
920
921         return rt;
922 }
923
924 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
925                                             struct flowi6 *fl6, int flags)
926 {
927         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
928 }
929
930 static struct dst_entry *ip6_route_input_lookup(struct net *net,
931                                                 struct net_device *dev,
932                                                 struct flowi6 *fl6, int flags)
933 {
934         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
935                 flags |= RT6_LOOKUP_F_IFACE;
936
937         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
938 }
939
940 void ip6_route_input(struct sk_buff *skb)
941 {
942         const struct ipv6hdr *iph = ipv6_hdr(skb);
943         struct net *net = dev_net(skb->dev);
944         int flags = RT6_LOOKUP_F_HAS_SADDR;
945         struct flowi6 fl6 = {
946                 .flowi6_iif = skb->dev->ifindex,
947                 .daddr = iph->daddr,
948                 .saddr = iph->saddr,
949                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
950                 .flowi6_mark = skb->mark,
951                 .flowi6_proto = iph->nexthdr,
952         };
953
954         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
955 }
956
957 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
958                                              struct flowi6 *fl6, int flags)
959 {
960         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
961 }
962
963 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
964                                     struct flowi6 *fl6)
965 {
966         int flags = 0;
967
968         fl6->flowi6_iif = LOOPBACK_IFINDEX;
969
970         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
971                 flags |= RT6_LOOKUP_F_IFACE;
972
973         if (!ipv6_addr_any(&fl6->saddr))
974                 flags |= RT6_LOOKUP_F_HAS_SADDR;
975         else if (sk)
976                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
977
978         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
979 }
980
981 EXPORT_SYMBOL(ip6_route_output);
982
983 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
984 {
985         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
986         struct dst_entry *new = NULL;
987
988         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
989         if (rt) {
990                 new = &rt->dst;
991
992                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
993                 rt6_init_peer(rt, net->ipv6.peers);
994
995                 new->__use = 1;
996                 new->input = dst_discard;
997                 new->output = dst_discard;
998
999                 if (dst_metrics_read_only(&ort->dst))
1000                         new->_metrics = ort->dst._metrics;
1001                 else
1002                         dst_copy_metrics(new, &ort->dst);
1003                 rt->rt6i_idev = ort->rt6i_idev;
1004                 if (rt->rt6i_idev)
1005                         in6_dev_hold(rt->rt6i_idev);
1006
1007                 rt->rt6i_gateway = ort->rt6i_gateway;
1008                 rt->rt6i_flags = ort->rt6i_flags;
1009                 rt6_clean_expires(rt);
1010                 rt->rt6i_metric = 0;
1011
1012                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1013 #ifdef CONFIG_IPV6_SUBTREES
1014                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1015 #endif
1016
1017                 dst_free(new);
1018         }
1019
1020         dst_release(dst_orig);
1021         return new ? new : ERR_PTR(-ENOMEM);
1022 }
1023
1024 /*
1025  *      Destination cache support functions
1026  */
1027
1028 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1029 {
1030         struct rt6_info *rt;
1031
1032         rt = (struct rt6_info *) dst;
1033
1034         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1035                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1036                         if (!rt6_has_peer(rt))
1037                                 rt6_bind_peer(rt, 0);
1038                         rt->rt6i_peer_genid = rt6_peer_genid();
1039                 }
1040                 return dst;
1041         }
1042         return NULL;
1043 }
1044
1045 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1046 {
1047         struct rt6_info *rt = (struct rt6_info *) dst;
1048
1049         if (rt) {
1050                 if (rt->rt6i_flags & RTF_CACHE) {
1051                         if (rt6_check_expired(rt)) {
1052                                 ip6_del_rt(rt);
1053                                 dst = NULL;
1054                         }
1055                 } else {
1056                         dst_release(dst);
1057                         dst = NULL;
1058                 }
1059         }
1060         return dst;
1061 }
1062
1063 static void ip6_link_failure(struct sk_buff *skb)
1064 {
1065         struct rt6_info *rt;
1066
1067         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1068
1069         rt = (struct rt6_info *) skb_dst(skb);
1070         if (rt) {
1071                 if (rt->rt6i_flags & RTF_CACHE)
1072                         rt6_update_expires(rt, 0);
1073                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1074                         rt->rt6i_node->fn_sernum = -1;
1075         }
1076 }
1077
1078 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1079                                struct sk_buff *skb, u32 mtu)
1080 {
1081         struct rt6_info *rt6 = (struct rt6_info*)dst;
1082
1083         dst_confirm(dst);
1084         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1085                 struct net *net = dev_net(dst->dev);
1086
1087                 rt6->rt6i_flags |= RTF_MODIFIED;
1088                 if (mtu < IPV6_MIN_MTU) {
1089                         u32 features = dst_metric(dst, RTAX_FEATURES);
1090                         mtu = IPV6_MIN_MTU;
1091                         features |= RTAX_FEATURE_ALLFRAG;
1092                         dst_metric_set(dst, RTAX_FEATURES, features);
1093                 }
1094                 dst_metric_set(dst, RTAX_MTU, mtu);
1095                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1096         }
1097 }
1098
1099 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1100                      int oif, u32 mark)
1101 {
1102         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1103         struct dst_entry *dst;
1104         struct flowi6 fl6;
1105
1106         memset(&fl6, 0, sizeof(fl6));
1107         fl6.flowi6_oif = oif;
1108         fl6.flowi6_mark = mark;
1109         fl6.flowi6_flags = 0;
1110         fl6.daddr = iph->daddr;
1111         fl6.saddr = iph->saddr;
1112         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1113
1114         dst = ip6_route_output(net, NULL, &fl6);
1115         if (!dst->error)
1116                 ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
1117         dst_release(dst);
1118 }
1119 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1120
1121 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1122 {
1123         ip6_update_pmtu(skb, sock_net(sk), mtu,
1124                         sk->sk_bound_dev_if, sk->sk_mark);
1125 }
1126 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1127
1128 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1129 {
1130         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1131         struct dst_entry *dst;
1132         struct flowi6 fl6;
1133
1134         memset(&fl6, 0, sizeof(fl6));
1135         fl6.flowi6_oif = oif;
1136         fl6.flowi6_mark = mark;
1137         fl6.flowi6_flags = 0;
1138         fl6.daddr = iph->daddr;
1139         fl6.saddr = iph->saddr;
1140         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1141
1142         dst = ip6_route_output(net, NULL, &fl6);
1143         if (!dst->error)
1144                 rt6_do_redirect(dst, NULL, skb);
1145         dst_release(dst);
1146 }
1147 EXPORT_SYMBOL_GPL(ip6_redirect);
1148
1149 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1150 {
1151         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1152 }
1153 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1154
1155 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1156 {
1157         struct net_device *dev = dst->dev;
1158         unsigned int mtu = dst_mtu(dst);
1159         struct net *net = dev_net(dev);
1160
1161         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1162
1163         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1164                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1165
1166         /*
1167          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1168          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1169          * IPV6_MAXPLEN is also valid and means: "any MSS,
1170          * rely only on pmtu discovery"
1171          */
1172         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1173                 mtu = IPV6_MAXPLEN;
1174         return mtu;
1175 }
1176
1177 static unsigned int ip6_mtu(const struct dst_entry *dst)
1178 {
1179         struct inet6_dev *idev;
1180         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1181
1182         if (mtu)
1183                 return mtu;
1184
1185         mtu = IPV6_MIN_MTU;
1186
1187         rcu_read_lock();
1188         idev = __in6_dev_get(dst->dev);
1189         if (idev)
1190                 mtu = idev->cnf.mtu6;
1191         rcu_read_unlock();
1192
1193         return mtu;
1194 }
1195
/*
 * Head of a singly linked list (chained through dst->next) holding the dst
 * entries created by icmp6_dst_alloc(); walked by icmp6_dst_gc() and
 * icmp6_clean_all().
 */
1196 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (with BHs disabled) around every access to icmp6_dst_gc_list in this file. */
1197 static DEFINE_SPINLOCK(icmp6_dst_lock);
1198
1199 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1200                                   struct neighbour *neigh,
1201                                   struct flowi6 *fl6)
1202 {
1203         struct dst_entry *dst;
1204         struct rt6_info *rt;
1205         struct inet6_dev *idev = in6_dev_get(dev);
1206         struct net *net = dev_net(dev);
1207
1208         if (unlikely(!idev))
1209                 return ERR_PTR(-ENODEV);
1210
1211         rt = ip6_dst_alloc(net, dev, 0, NULL);
1212         if (unlikely(!rt)) {
1213                 in6_dev_put(idev);
1214                 dst = ERR_PTR(-ENOMEM);
1215                 goto out;
1216         }
1217
1218         if (neigh)
1219                 neigh_hold(neigh);
1220         else {
1221                 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1222                 if (IS_ERR(neigh)) {
1223                         in6_dev_put(idev);
1224                         dst_free(&rt->dst);
1225                         return ERR_CAST(neigh);
1226                 }
1227         }
1228
1229         rt->dst.flags |= DST_HOST;
1230         rt->dst.output  = ip6_output;
1231         rt->n = neigh;
1232         atomic_set(&rt->dst.__refcnt, 1);
1233         rt->rt6i_dst.addr = fl6->daddr;
1234         rt->rt6i_dst.plen = 128;
1235         rt->rt6i_idev     = idev;
1236         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1237
1238         spin_lock_bh(&icmp6_dst_lock);
1239         rt->dst.next = icmp6_dst_gc_list;
1240         icmp6_dst_gc_list = &rt->dst;
1241         spin_unlock_bh(&icmp6_dst_lock);
1242
1243         fib6_force_start_gc(net);
1244
1245         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1246
1247 out:
1248         return dst;
1249 }
1250
1251 int icmp6_dst_gc(void)
1252 {
1253         struct dst_entry *dst, **pprev;
1254         int more = 0;
1255
1256         spin_lock_bh(&icmp6_dst_lock);
1257         pprev = &icmp6_dst_gc_list;
1258
1259         while ((dst = *pprev) != NULL) {
1260                 if (!atomic_read(&dst->__refcnt)) {
1261                         *pprev = dst->next;
1262                         dst_free(dst);
1263                 } else {
1264                         pprev = &dst->next;
1265                         ++more;
1266                 }
1267         }
1268
1269         spin_unlock_bh(&icmp6_dst_lock);
1270
1271         return more;
1272 }
1273
1274 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1275                             void *arg)
1276 {
1277         struct dst_entry *dst, **pprev;
1278
1279         spin_lock_bh(&icmp6_dst_lock);
1280         pprev = &icmp6_dst_gc_list;
1281         while ((dst = *pprev) != NULL) {
1282                 struct rt6_info *rt = (struct rt6_info *) dst;
1283                 if (func(rt, arg)) {
1284                         *pprev = dst->next;
1285                         dst_free(dst);
1286                 } else {
1287                         pprev = &dst->next;
1288                 }
1289         }
1290         spin_unlock_bh(&icmp6_dst_lock);
1291 }
1292
1293 static int ip6_dst_gc(struct dst_ops *ops)
1294 {
1295         unsigned long now = jiffies;
1296         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1297         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1298         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1299         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1300         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1301         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1302         int entries;
1303
1304         entries = dst_entries_get_fast(ops);
1305         if (time_after(rt_last_gc + rt_min_interval, now) &&
1306             entries <= rt_max_size)
1307                 goto out;
1308
1309         net->ipv6.ip6_rt_gc_expire++;
1310         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1311         net->ipv6.ip6_rt_last_gc = now;
1312         entries = dst_entries_get_slow(ops);
1313         if (entries < ops->gc_thresh)
1314                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1315 out:
1316         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1317         return entries > rt_max_size;
1318 }
1319
1320 /* Clean host part of a prefix. Not necessary in radix tree,
1321    but results in cleaner routing tables.
1322
1323    Remove it only when all the things will work!
1324  */
1325
1326 int ip6_dst_hoplimit(struct dst_entry *dst)
1327 {
1328         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1329         if (hoplimit == 0) {
1330                 struct net_device *dev = dst->dev;
1331                 struct inet6_dev *idev;
1332
1333                 rcu_read_lock();
1334                 idev = __in6_dev_get(dev);
1335                 if (idev)
1336                         hoplimit = idev->cnf.hop_limit;
1337                 else
1338                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1339                 rcu_read_unlock();
1340         }
1341         return hoplimit;
1342 }
1343 EXPORT_SYMBOL(ip6_dst_hoplimit);
1344
1345 /*
1346  *
1347  */
1348
/*
 * ip6_route_add - build a route from @cfg and insert it into its FIB table.
 *
 * Validates the prefix lengths, resolves cfg->fc_ifindex to a device
 * (taking a device and an inet6_dev reference), obtains the table,
 * allocates the rt6_info, fills it in from @cfg and finally hands it to
 * __ip6_ins_rt().
 *
 * @cfg is modified: fc_metric defaults to IP6_RT_PRIO_USER when 0,
 * fc_protocol defaults to RTPROT_BOOT when RTPROT_UNSPEC, and
 * fc_nlinfo.nl_net is overwritten with dev_net(dev) before insertion.
 *
 * Returns the result of __ip6_ins_rt() when the route was fully built,
 * otherwise a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM,
 * -EHOSTUNREACH, or the error from rt6_bind_neighbour()).  Every failure
 * after the initial length checks leaves through "out", which drops the
 * dev/idev references held at that point and frees the partially built
 * route.
 */
1349 int ip6_route_add(struct fib6_config *cfg)
1350 {
1351         int err;
1352         struct net *net = cfg->fc_nlinfo.nl_net;
1353         struct rt6_info *rt = NULL;
1354         struct net_device *dev = NULL;
1355         struct inet6_dev *idev = NULL;
1356         struct fib6_table *table;
1357         int addr_type;
1358
1359         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1360                 return -EINVAL;
             /* Source-prefix routes need subtree support. */
1361 #ifndef CONFIG_IPV6_SUBTREES
1362         if (cfg->fc_src_len)
1363                 return -EINVAL;
1364 #endif
             /* Resolve the requested interface; both references are dropped at "out" on failure. */
1365         if (cfg->fc_ifindex) {
1366                 err = -ENODEV;
1367                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1368                 if (!dev)
1369                         goto out;
1370                 idev = in6_dev_get(dev);
1371                 if (!idev)
1372                         goto out;
1373         }
1374
1375         if (cfg->fc_metric == 0)
1376                 cfg->fc_metric = IP6_RT_PRIO_USER;
1377
             /*
              * A netlink request without NLM_F_CREATE first tries the existing
              * table; if there is none, a warning is logged and the table is
              * created anyway.  -ENOBUFS is returned when no table is obtained.
              */
1378         err = -ENOBUFS;
1379         if (cfg->fc_nlinfo.nlh &&
1380             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1381                 table = fib6_get_table(net, cfg->fc_table);
1382                 if (!table) {
1383                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1384                         table = fib6_new_table(net, cfg->fc_table);
1385                 }
1386         } else {
1387                 table = fib6_new_table(net, cfg->fc_table);
1388         }
1389
1390         if (!table)
1391                 goto out;
1392
1393         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1394
1395         if (!rt) {
1396                 err = -ENOMEM;
1397                 goto out;
1398         }
1399
1400         rt->dst.obsolete = -1;
1401
1402         if (cfg->fc_flags & RTF_EXPIRES)
1403                 rt6_set_expires(rt, jiffies +
1404                                 clock_t_to_jiffies(cfg->fc_expires));
1405         else
1406                 rt6_clean_expires(rt);
1407
1408         if (cfg->fc_protocol == RTPROT_UNSPEC)
1409                 cfg->fc_protocol = RTPROT_BOOT;
1410         rt->rt6i_protocol = cfg->fc_protocol;
1411
1412         addr_type = ipv6_addr_type(&cfg->fc_dst);
1413
             /* Input handler: multicast, local delivery, or forwarding. */
1414         if (addr_type & IPV6_ADDR_MULTICAST)
1415                 rt->dst.input = ip6_mc_input;
1416         else if (cfg->fc_flags & RTF_LOCAL)
1417                 rt->dst.input = ip6_input;
1418         else
1419                 rt->dst.input = ip6_forward;
1420
1421         rt->dst.output = ip6_output;
1422
1423         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1424         rt->rt6i_dst.plen = cfg->fc_dst_len;
1425         if (rt->rt6i_dst.plen == 128)
1426                rt->dst.flags |= DST_HOST;
1427
             /* Non-host routes carrying metric attributes get their own zeroed metrics array. */
1428         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1429                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1430                 if (!metrics) {
1431                         err = -ENOMEM;
1432                         goto out;
1433                 }
1434                 dst_init_metrics(&rt->dst, metrics, 0);
1435         }
1436 #ifdef CONFIG_IPV6_SUBTREES
1437         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1438         rt->rt6i_src.plen = cfg->fc_src_len;
1439 #endif
1440
1441         rt->rt6i_metric = cfg->fc_metric;
1442
1443         /* We cannot add true routes via loopback here,
1444            they would result in kernel looping; promote them to reject routes
1445          */
1446         if ((cfg->fc_flags & RTF_REJECT) ||
1447             (dev && (dev->flags & IFF_LOOPBACK) &&
1448              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1449              !(cfg->fc_flags & RTF_LOCAL))) {
1450                 /* hold loopback dev/idev if we haven't done so. */
1451                 if (dev != net->loopback_dev) {
1452                         if (dev) {
1453                                 dev_put(dev);
1454                                 in6_dev_put(idev);
1455                         }
1456                         dev = net->loopback_dev;
1457                         dev_hold(dev);
1458                         idev = in6_dev_get(dev);
1459                         if (!idev) {
1460                                 err = -ENODEV;
1461                                 goto out;
1462                         }
1463                 }
                     /* Reject routes discard traffic in both directions with -ENETUNREACH. */
1464                 rt->dst.output = ip6_pkt_discard_out;
1465                 rt->dst.input = ip6_pkt_discard;
1466                 rt->dst.error = -ENETUNREACH;
1467                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1468                 goto install_route;
1469         }
1470
1471         if (cfg->fc_flags & RTF_GATEWAY) {
1472                 const struct in6_addr *gw_addr;
1473                 int gwa_type;
1474
1475                 gw_addr = &cfg->fc_gateway;
1476                 rt->rt6i_gateway = *gw_addr;
1477                 gwa_type = ipv6_addr_type(gw_addr);
1478
1479                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1480                         struct rt6_info *grt;
1481
1482                         /* IPv6 strictly inhibits using not link-local
1483                            addresses as nexthop address.
1484                            Otherwise, router will not able to send redirects.
1485                            It is very good, but in some (rare!) circumstances
1486                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1487                            some exceptions. --ANK
1488                          */
1489                         err = -EINVAL;
1490                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1491                                 goto out;
1492
1493                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1494
1495                         err = -EHOSTUNREACH;
1496                         if (!grt)
1497                                 goto out;
                             /*
                              * The gateway's own route must use the requested device;
                              * when no device was requested, adopt the one from that
                              * route (taking dev and idev references).
                              */
1498                         if (dev) {
1499                                 if (dev != grt->dst.dev) {
1500                                         dst_release(&grt->dst);
1501                                         goto out;
1502                                 }
1503                         } else {
1504                                 dev = grt->dst.dev;
1505                                 idev = grt->rt6i_idev;
1506                                 dev_hold(dev);
1507                                 in6_dev_hold(grt->rt6i_idev);
1508                         }
                             /* Accept only if the gateway is not itself reached via another gateway. */
1509                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1510                                 err = 0;
1511                         dst_release(&grt->dst);
1512
1513                         if (err)
1514                                 goto out;
1515                 }
1516                 err = -EINVAL;
1517                 if (!dev || (dev->flags & IFF_LOOPBACK))
1518                         goto out;
1519         }
1520
1521         err = -ENODEV;
1522         if (!dev)
1523                 goto out;
1524
             /* A preferred source address must pass ipv6_chk_addr() for this device. */
1525         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1526                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1527                         err = -EINVAL;
1528                         goto out;
1529                 }
1530                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1531                 rt->rt6i_prefsrc.plen = 128;
1532         } else
1533                 rt->rt6i_prefsrc.plen = 0;
1534
1535         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1536                 err = rt6_bind_neighbour(rt, dev);
1537                 if (err)
1538                         goto out;
1539         }
1540
1541         rt->rt6i_flags = cfg->fc_flags;
1542
1543 install_route:
             /* Apply each metric attribute; type 0 is skipped, types above RTAX_MAX are rejected. */
1544         if (cfg->fc_mx) {
1545                 struct nlattr *nla;
1546                 int remaining;
1547
1548                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1549                         int type = nla_type(nla);
1550
1551                         if (type) {
1552                                 if (type > RTAX_MAX) {
1553                                         err = -EINVAL;
1554                                         goto out;
1555                                 }
1556
1557                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1558                         }
1559                 }
1560         }
1561
             /* The dev/idev references taken above are stored in the route here. */
1562         rt->dst.dev = dev;
1563         rt->rt6i_idev = idev;
1564         rt->rt6i_table = table;
1565
1566         cfg->fc_nlinfo.nl_net = dev_net(dev);
1567
1568         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1569
1570 out:
1571         if (dev)
1572                 dev_put(dev);
1573         if (idev)
1574                 in6_dev_put(idev);
1575         if (rt)
1576                 dst_free(&rt->dst);
1577         return err;
1578 }
1579
1580 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1581 {
1582         int err;
1583         struct fib6_table *table;
1584         struct net *net = dev_net(rt->dst.dev);
1585
1586         if (rt == net->ipv6.ip6_null_entry)
1587                 return -ENOENT;
1588
1589         table = rt->rt6i_table;
1590         write_lock_bh(&table->tb6_lock);
1591
1592         err = fib6_del(rt, info);
1593         dst_release(&rt->dst);
1594
1595         write_unlock_bh(&table->tb6_lock);
1596
1597         return err;
1598 }
1599
1600 int ip6_del_rt(struct rt6_info *rt)
1601 {
1602         struct nl_info info = {
1603                 .nl_net = dev_net(rt->dst.dev),
1604         };
1605         return __ip6_del_rt(rt, &info);
1606 }
1607
1608 static int ip6_route_del(struct fib6_config *cfg)
1609 {
1610         struct fib6_table *table;
1611         struct fib6_node *fn;
1612         struct rt6_info *rt;
1613         int err = -ESRCH;
1614
1615         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1616         if (!table)
1617                 return err;
1618
1619         read_lock_bh(&table->tb6_lock);
1620
1621         fn = fib6_locate(&table->tb6_root,
1622                          &cfg->fc_dst, cfg->fc_dst_len,
1623                          &cfg->fc_src, cfg->fc_src_len);
1624
1625         if (fn) {
1626                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1627                         if (cfg->fc_ifindex &&
1628                             (!rt->dst.dev ||
1629                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1630                                 continue;
1631                         if (cfg->fc_flags & RTF_GATEWAY &&
1632                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1633                                 continue;
1634                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1635                                 continue;
1636                         dst_hold(&rt->dst);
1637                         read_unlock_bh(&table->tb6_lock);
1638
1639                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1640                 }
1641         }
1642         read_unlock_bh(&table->tb6_lock);
1643
1644         return err;
1645 }
1646
1647 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1648 {
1649         struct net *net = dev_net(skb->dev);
1650         struct netevent_redirect netevent;
1651         struct rt6_info *rt, *nrt = NULL;
1652         const struct in6_addr *target;
1653         struct ndisc_options ndopts;
1654         const struct in6_addr *dest;
1655         struct neighbour *old_neigh;
1656         struct inet6_dev *in6_dev;
1657         struct neighbour *neigh;
1658         struct icmp6hdr *icmph;
1659         int optlen, on_link;
1660         u8 *lladdr;
1661
1662         optlen = skb->tail - skb->transport_header;
1663         optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
1664
1665         if (optlen < 0) {
1666                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1667                 return;
1668         }
1669
1670         icmph = icmp6_hdr(skb);
1671         target = (const struct in6_addr *) (icmph + 1);
1672         dest = target + 1;
1673
1674         if (ipv6_addr_is_multicast(dest)) {
1675                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
1676                 return;
1677         }
1678
1679         on_link = 0;
1680         if (ipv6_addr_equal(dest, target)) {
1681                 on_link = 1;
1682         } else if (ipv6_addr_type(target) !=
1683                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1684                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1685                 return;
1686         }
1687
1688         in6_dev = __in6_dev_get(skb->dev);
1689         if (!in6_dev)
1690                 return;
1691         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
1692                 return;
1693
1694         /* RFC2461 8.1:
1695          *      The IP source address of the Redirect MUST be the same as the current
1696          *      first-hop router for the specified ICMP Destination Address.
1697          */
1698
1699         if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
1700                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
1701                 return;
1702         }
1703
1704         lladdr = NULL;
1705         if (ndopts.nd_opts_tgt_lladdr) {
1706                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
1707                                              skb->dev);
1708                 if (!lladdr) {
1709                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
1710                         return;
1711                 }
1712         }
1713
1714         rt = (struct rt6_info *) dst;
1715         if (rt == net->ipv6.ip6_null_entry) {
1716                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1717                 return;
1718         }
1719
1720         /* Redirect received -> path was valid.
1721          * Look, redirects are sent only in response to data packets,
1722          * so that this nexthop apparently is reachable. --ANK
1723          */
1724         dst_confirm(&rt->dst);
1725
1726         neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
1727         if (!neigh)
1728                 return;
1729
1730         /* Duplicate redirect: silently ignore. */
1731         old_neigh = rt->n;
1732         if (neigh == old_neigh)
1733                 goto out;
1734
1735         /*
1736          *      We have finally decided to accept it.
1737          */
1738
1739         neigh_update(neigh, lladdr, NUD_STALE,
1740                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1741                      NEIGH_UPDATE_F_OVERRIDE|
1742                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1743                                      NEIGH_UPDATE_F_ISROUTER))
1744                      );
1745
1746         nrt = ip6_rt_copy(rt, dest);
1747         if (!nrt)
1748                 goto out;
1749
1750         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1751         if (on_link)
1752                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1753
1754         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1755         nrt->n = neigh_clone(neigh);
1756
1757         if (ip6_ins_rt(nrt))
1758                 goto out;
1759
1760         netevent.old = &rt->dst;
1761         netevent.old_neigh = old_neigh;
1762         netevent.new = &nrt->dst;
1763         netevent.new_neigh = neigh;
1764         netevent.daddr = dest;
1765         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1766
1767         if (rt->rt6i_flags & RTF_CACHE) {
1768                 rt = (struct rt6_info *) dst_clone(&rt->dst);
1769                 ip6_del_rt(rt);
1770         }
1771
1772 out:
1773         neigh_release(neigh);
1774 }
1775
1776 /*
1777  *      Misc support functions
1778  */
1779
1780 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1781                                     const struct in6_addr *dest)
1782 {
1783         struct net *net = dev_net(ort->dst.dev);
1784         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1785                                             ort->rt6i_table);
1786
1787         if (rt) {
1788                 rt->dst.input = ort->dst.input;
1789                 rt->dst.output = ort->dst.output;
1790                 rt->dst.flags |= DST_HOST;
1791
1792                 rt->rt6i_dst.addr = *dest;
1793                 rt->rt6i_dst.plen = 128;
1794                 dst_copy_metrics(&rt->dst, &ort->dst);
1795                 rt->dst.error = ort->dst.error;
1796                 rt->rt6i_idev = ort->rt6i_idev;
1797                 if (rt->rt6i_idev)
1798                         in6_dev_hold(rt->rt6i_idev);
1799                 rt->dst.lastuse = jiffies;
1800
1801                 rt->rt6i_gateway = ort->rt6i_gateway;
1802                 rt->rt6i_flags = ort->rt6i_flags;
1803                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1804                     (RTF_DEFAULT | RTF_ADDRCONF))
1805                         rt6_set_from(rt, ort);
1806                 else
1807                         rt6_clean_expires(rt);
1808                 rt->rt6i_metric = 0;
1809
1810 #ifdef CONFIG_IPV6_SUBTREES
1811                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1812 #endif
1813                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1814                 rt->rt6i_table = ort->rt6i_table;
1815         }
1816         return rt;
1817 }
1818
1819 #ifdef CONFIG_IPV6_ROUTE_INFO
1820 static struct rt6_info *rt6_get_route_info(struct net *net,
1821                                            const struct in6_addr *prefix, int prefixlen,
1822                                            const struct in6_addr *gwaddr, int ifindex)
1823 {
1824         struct fib6_node *fn;
1825         struct rt6_info *rt = NULL;
1826         struct fib6_table *table;
1827
1828         table = fib6_get_table(net, RT6_TABLE_INFO);
1829         if (!table)
1830                 return NULL;
1831
1832         write_lock_bh(&table->tb6_lock);
1833         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1834         if (!fn)
1835                 goto out;
1836
1837         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1838                 if (rt->dst.dev->ifindex != ifindex)
1839                         continue;
1840                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1841                         continue;
1842                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1843                         continue;
1844                 dst_hold(&rt->dst);
1845                 break;
1846         }
1847 out:
1848         write_unlock_bh(&table->tb6_lock);
1849         return rt;
1850 }
1851
1852 static struct rt6_info *rt6_add_route_info(struct net *net,
1853                                            const struct in6_addr *prefix, int prefixlen,
1854                                            const struct in6_addr *gwaddr, int ifindex,
1855                                            unsigned int pref)
1856 {
1857         struct fib6_config cfg = {
1858                 .fc_table       = RT6_TABLE_INFO,
1859                 .fc_metric      = IP6_RT_PRIO_USER,
1860                 .fc_ifindex     = ifindex,
1861                 .fc_dst_len     = prefixlen,
1862                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1863                                   RTF_UP | RTF_PREF(pref),
1864                 .fc_nlinfo.pid = 0,
1865                 .fc_nlinfo.nlh = NULL,
1866                 .fc_nlinfo.nl_net = net,
1867         };
1868
1869         cfg.fc_dst = *prefix;
1870         cfg.fc_gateway = *gwaddr;
1871
1872         /* We should treat it as a default route if prefix length is 0. */
1873         if (!prefixlen)
1874                 cfg.fc_flags |= RTF_DEFAULT;
1875
1876         ip6_route_add(&cfg);
1877
1878         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1879 }
1880 #endif
1881
1882 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1883 {
1884         struct rt6_info *rt;
1885         struct fib6_table *table;
1886
1887         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1888         if (!table)
1889                 return NULL;
1890
1891         write_lock_bh(&table->tb6_lock);
1892         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1893                 if (dev == rt->dst.dev &&
1894                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1895                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1896                         break;
1897         }
1898         if (rt)
1899                 dst_hold(&rt->dst);
1900         write_unlock_bh(&table->tb6_lock);
1901         return rt;
1902 }
1903
1904 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1905                                      struct net_device *dev,
1906                                      unsigned int pref)
1907 {
1908         struct fib6_config cfg = {
1909                 .fc_table       = RT6_TABLE_DFLT,
1910                 .fc_metric      = IP6_RT_PRIO_USER,
1911                 .fc_ifindex     = dev->ifindex,
1912                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1913                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1914                 .fc_nlinfo.pid = 0,
1915                 .fc_nlinfo.nlh = NULL,
1916                 .fc_nlinfo.nl_net = dev_net(dev),
1917         };
1918
1919         cfg.fc_gateway = *gwaddr;
1920
1921         ip6_route_add(&cfg);
1922
1923         return rt6_get_dflt_router(gwaddr, dev);
1924 }
1925
1926 void rt6_purge_dflt_routers(struct net *net)
1927 {
1928         struct rt6_info *rt;
1929         struct fib6_table *table;
1930
1931         /* NOTE: Keep consistent with rt6_get_dflt_router */
1932         table = fib6_get_table(net, RT6_TABLE_DFLT);
1933         if (!table)
1934                 return;
1935
1936 restart:
1937         read_lock_bh(&table->tb6_lock);
1938         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1939                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1940                         dst_hold(&rt->dst);
1941                         read_unlock_bh(&table->tb6_lock);
1942                         ip6_del_rt(rt);
1943                         goto restart;
1944                 }
1945         }
1946         read_unlock_bh(&table->tb6_lock);
1947 }
1948
1949 static void rtmsg_to_fib6_config(struct net *net,
1950                                  struct in6_rtmsg *rtmsg,
1951                                  struct fib6_config *cfg)
1952 {
1953         memset(cfg, 0, sizeof(*cfg));
1954
1955         cfg->fc_table = RT6_TABLE_MAIN;
1956         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1957         cfg->fc_metric = rtmsg->rtmsg_metric;
1958         cfg->fc_expires = rtmsg->rtmsg_info;
1959         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1960         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1961         cfg->fc_flags = rtmsg->rtmsg_flags;
1962
1963         cfg->fc_nlinfo.nl_net = net;
1964
1965         cfg->fc_dst = rtmsg->rtmsg_dst;
1966         cfg->fc_src = rtmsg->rtmsg_src;
1967         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1968 }
1969
1970 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1971 {
1972         struct fib6_config cfg;
1973         struct in6_rtmsg rtmsg;
1974         int err;
1975
1976         switch(cmd) {
1977         case SIOCADDRT:         /* Add a route */
1978         case SIOCDELRT:         /* Delete a route */
1979                 if (!capable(CAP_NET_ADMIN))
1980                         return -EPERM;
1981                 err = copy_from_user(&rtmsg, arg,
1982                                      sizeof(struct in6_rtmsg));
1983                 if (err)
1984                         return -EFAULT;
1985
1986                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1987
1988                 rtnl_lock();
1989                 switch (cmd) {
1990                 case SIOCADDRT:
1991                         err = ip6_route_add(&cfg);
1992                         break;
1993                 case SIOCDELRT:
1994                         err = ip6_route_del(&cfg);
1995                         break;
1996                 default:
1997                         err = -EINVAL;
1998                 }
1999                 rtnl_unlock();
2000
2001                 return err;
2002         }
2003
2004         return -EINVAL;
2005 }
2006
2007 /*
2008  *      Drop the packet on the floor
2009  */
2010
2011 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2012 {
2013         int type;
2014         struct dst_entry *dst = skb_dst(skb);
2015         switch (ipstats_mib_noroutes) {
2016         case IPSTATS_MIB_INNOROUTES:
2017                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2018                 if (type == IPV6_ADDR_ANY) {
2019                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2020                                       IPSTATS_MIB_INADDRERRORS);
2021                         break;
2022                 }
2023                 /* FALLTHROUGH */
2024         case IPSTATS_MIB_OUTNOROUTES:
2025                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2026                               ipstats_mib_noroutes);
2027                 break;
2028         }
2029         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2030         kfree_skb(skb);
2031         return 0;
2032 }
2033
2034 static int ip6_pkt_discard(struct sk_buff *skb)
2035 {
2036         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2037 }
2038
2039 static int ip6_pkt_discard_out(struct sk_buff *skb)
2040 {
2041         skb->dev = skb_dst(skb)->dev;
2042         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2043 }
2044
2045 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2046
2047 static int ip6_pkt_prohibit(struct sk_buff *skb)
2048 {
2049         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2050 }
2051
2052 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2053 {
2054         skb->dev = skb_dst(skb)->dev;
2055         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2056 }
2057
2058 #endif
2059
2060 /*
2061  *      Allocate a dst for local (unicast / anycast) address.
2062  */
2063
2064 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2065                                     const struct in6_addr *addr,
2066                                     bool anycast)
2067 {
2068         struct net *net = dev_net(idev->dev);
2069         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2070         int err;
2071
2072         if (!rt) {
2073                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2074                 return ERR_PTR(-ENOMEM);
2075         }
2076
2077         in6_dev_hold(idev);
2078
2079         rt->dst.flags |= DST_HOST;
2080         rt->dst.input = ip6_input;
2081         rt->dst.output = ip6_output;
2082         rt->rt6i_idev = idev;
2083         rt->dst.obsolete = -1;
2084
2085         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2086         if (anycast)
2087                 rt->rt6i_flags |= RTF_ANYCAST;
2088         else
2089                 rt->rt6i_flags |= RTF_LOCAL;
2090         err = rt6_bind_neighbour(rt, rt->dst.dev);
2091         if (err) {
2092                 dst_free(&rt->dst);
2093                 return ERR_PTR(err);
2094         }
2095
2096         rt->rt6i_dst.addr = *addr;
2097         rt->rt6i_dst.plen = 128;
2098         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2099
2100         atomic_set(&rt->dst.__refcnt, 1);
2101
2102         return rt;
2103 }
2104
2105 int ip6_route_get_saddr(struct net *net,
2106                         struct rt6_info *rt,
2107                         const struct in6_addr *daddr,
2108                         unsigned int prefs,
2109                         struct in6_addr *saddr)
2110 {
2111         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2112         int err = 0;
2113         if (rt->rt6i_prefsrc.plen)
2114                 *saddr = rt->rt6i_prefsrc.addr;
2115         else
2116                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2117                                          daddr, prefs, saddr);
2118         return err;
2119 }
2120
2121 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle that rt6_remove_prefsrc() passes through
 * fib6_clean_all() to fib6_remove_prefsrc().
 */
2122 struct arg_dev_net_ip {
             /* device a route must use to match; NULL matches any device */
2123         struct net_device *dev;
             /* namespace whose ip6_null_entry is excluded from matching */
2124         struct net *net;
             /* address compared against each route's rt6i_prefsrc.addr */
2125         struct in6_addr *addr;
2126 };
2127
2128 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2129 {
2130         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2131         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2132         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2133
2134         if (((void *)rt->dst.dev == dev || !dev) &&
2135             rt != net->ipv6.ip6_null_entry &&
2136             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2137                 /* remove prefsrc entry */
2138                 rt->rt6i_prefsrc.plen = 0;
2139         }
2140         return 0;
2141 }
2142
2143 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2144 {
2145         struct net *net = dev_net(ifp->idev->dev);
2146         struct arg_dev_net_ip adni = {
2147                 .dev = ifp->idev->dev,
2148                 .net = net,
2149                 .addr = &ifp->addr,
2150         };
2151         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2152 }
2153
/* Argument bundle that rt6_ifdown() passes to the fib6_ifdown() callback. */
2154 struct arg_dev_net {
             /* device a route must use to match; NULL matches any device */
2155         struct net_device *dev;
             /* namespace whose ip6_null_entry is excluded from matching */
2156         struct net *net;
2157 };
2158
2159 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2160 {
2161         const struct arg_dev_net *adn = arg;
2162         const struct net_device *dev = adn->dev;
2163
2164         if ((rt->dst.dev == dev || !dev) &&
2165             rt != adn->net->ipv6.ip6_null_entry)
2166                 return -1;
2167
2168         return 0;
2169 }
2170
2171 void rt6_ifdown(struct net *net, struct net_device *dev)
2172 {
2173         struct arg_dev_net adn = {
2174                 .dev = dev,
2175                 .net = net,
2176         };
2177
2178         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2179         icmp6_clean_all(fib6_ifdown, &adn);
2180 }
2181
/* Argument bundle that rt6_mtu_change() passes to rt6_mtu_change_route(). */
2182 struct rt6_mtu_change_arg {
             /* device whose routes are considered */
2183         struct net_device *dev;
             /* MTU value written to RTAX_MTU on the routes that qualify */
2184         unsigned int mtu;
2185 };
2186
2187 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2188 {
2189         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2190         struct inet6_dev *idev;
2191
2192         /* In IPv6 pmtu discovery is not optional,
2193            so that RTAX_MTU lock cannot disable it.
2194            We still use this lock to block changes
2195            caused by addrconf/ndisc.
2196         */
2197
2198         idev = __in6_dev_get(arg->dev);
2199         if (!idev)
2200                 return 0;
2201
2202         /* For administrative MTU increase, there is no way to discover
2203            IPv6 PMTU increase, so PMTU increase should be updated here.
2204            Since RFC 1981 doesn't include administrative MTU increase
2205            update PMTU increase is a MUST. (i.e. jumbo frame)
2206          */
2207         /*
2208            If new MTU is less than route PMTU, this new MTU will be the
2209            lowest MTU in the path, update the route PMTU to reflect PMTU
2210            decreases; if new MTU is greater than route PMTU, and the
2211            old MTU is the lowest MTU in the path, update the route PMTU
2212            to reflect the increase. In this case if the other nodes' MTU
2213            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2214            PMTU discouvery.
2215          */
2216         if (rt->dst.dev == arg->dev &&
2217             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2218             (dst_mtu(&rt->dst) >= arg->mtu ||
2219              (dst_mtu(&rt->dst) < arg->mtu &&
2220               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2221                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2222         }
2223         return 0;
2224 }
2225
2226 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2227 {
2228         struct rt6_mtu_change_arg arg = {
2229                 .dev = dev,
2230                 .mtu = mtu,
2231         };
2232
2233         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2234 }
2235
2236 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2237         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2238         [RTA_OIF]               = { .type = NLA_U32 },
2239         [RTA_IIF]               = { .type = NLA_U32 },
2240         [RTA_PRIORITY]          = { .type = NLA_U32 },
2241         [RTA_METRICS]           = { .type = NLA_NESTED },
2242 };
2243
2244 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2245                               struct fib6_config *cfg)
2246 {
2247         struct rtmsg *rtm;
2248         struct nlattr *tb[RTA_MAX+1];
2249         int err;
2250
2251         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2252         if (err < 0)
2253                 goto errout;
2254
2255         err = -EINVAL;
2256         rtm = nlmsg_data(nlh);
2257         memset(cfg, 0, sizeof(*cfg));
2258
2259         cfg->fc_table = rtm->rtm_table;
2260         cfg->fc_dst_len = rtm->rtm_dst_len;
2261         cfg->fc_src_len = rtm->rtm_src_len;
2262         cfg->fc_flags = RTF_UP;
2263         cfg->fc_protocol = rtm->rtm_protocol;
2264
2265         if (rtm->rtm_type == RTN_UNREACHABLE)
2266                 cfg->fc_flags |= RTF_REJECT;
2267
2268         if (rtm->rtm_type == RTN_LOCAL)
2269                 cfg->fc_flags |= RTF_LOCAL;
2270
2271         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2272         cfg->fc_nlinfo.nlh = nlh;
2273         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2274
2275         if (tb[RTA_GATEWAY]) {
2276                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2277                 cfg->fc_flags |= RTF_GATEWAY;
2278         }
2279
2280         if (tb[RTA_DST]) {
2281                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2282
2283                 if (nla_len(tb[RTA_DST]) < plen)
2284                         goto errout;
2285
2286                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2287         }
2288
2289         if (tb[RTA_SRC]) {
2290                 int plen = (rtm->rtm_src_len + 7) >> 3;
2291
2292                 if (nla_len(tb[RTA_SRC]) < plen)
2293                         goto errout;
2294
2295                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2296         }
2297
2298         if (tb[RTA_PREFSRC])
2299                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2300
2301         if (tb[RTA_OIF])
2302                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2303
2304         if (tb[RTA_PRIORITY])
2305                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2306
2307         if (tb[RTA_METRICS]) {
2308                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2309                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2310         }
2311
2312         if (tb[RTA_TABLE])
2313                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2314
2315         err = 0;
2316 errout:
2317         return err;
2318 }
2319
2320 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2321 {
2322         struct fib6_config cfg;
2323         int err;
2324
2325         err = rtm_to_fib6_config(skb, nlh, &cfg);
2326         if (err < 0)
2327                 return err;
2328
2329         return ip6_route_del(&cfg);
2330 }
2331
2332 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2333 {
2334         struct fib6_config cfg;
2335         int err;
2336
2337         err = rtm_to_fib6_config(skb, nlh, &cfg);
2338         if (err < 0)
2339                 return err;
2340
2341         return ip6_route_add(&cfg);
2342 }
2343
2344 static inline size_t rt6_nlmsg_size(void)
2345 {
2346         return NLMSG_ALIGN(sizeof(struct rtmsg))
2347                + nla_total_size(16) /* RTA_SRC */
2348                + nla_total_size(16) /* RTA_DST */
2349                + nla_total_size(16) /* RTA_GATEWAY */
2350                + nla_total_size(16) /* RTA_PREFSRC */
2351                + nla_total_size(4) /* RTA_TABLE */
2352                + nla_total_size(4) /* RTA_IIF */
2353                + nla_total_size(4) /* RTA_OIF */
2354                + nla_total_size(4) /* RTA_PRIORITY */
2355                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2356                + nla_total_size(sizeof(struct rta_cacheinfo));
2357 }
2358
/*
 * Append one netlink route message describing @rt to @skb.
 *
 * @dst, @src: when non-NULL they are emitted as RTA_DST / RTA_SRC with
 *             the corresponding rtm length forced to 128 (RTA_SRC only
 *             with CONFIG_IPV6_SUBTREES); otherwise the route's own
 *             keys are emitted when their prefix length is non-zero.
 * @iif:       when non-zero it is emitted as RTA_IIF, except that with
 *             CONFIG_IPV6_MROUTE and a multicast route destination
 *             ip6mr_get_route() is called instead.
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the header.
 * @prefix:    when non-zero, routes without RTF_PREFIX_RT are skipped.
 * @nowait:    passed to ip6mr_get_route() and decides how its result
 *             is handled.
 *
 * Returns 1 when the route was skipped because of @prefix, the result
 * of nlmsg_end() on success, 0 when ip6mr_get_route() returned 0 and
 * @nowait is not set, and -EMSGSIZE when the header or an attribute
 * could not be added (the partial message is cancelled on the
 * nla_put_failure path).
 */
2359 static int rt6_fill_node(struct net *net,
2360                          struct sk_buff *skb, struct rt6_info *rt,
2361                          struct in6_addr *dst, struct in6_addr *src,
2362                          int iif, int type, u32 pid, u32 seq,
2363                          int prefix, int nowait, unsigned int flags)
2364 {
2365         struct rtmsg *rtm;
2366         struct nlmsghdr *nlh;
2367         long expires;
2368         u32 table;
2369         struct neighbour *n;
2370
2371         if (prefix) {   /* user wants prefix routes only */
2372                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2373                         /* success since this is not a prefix route */
2374                         return 1;
2375                 }
2376         }
2377
2378         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2379         if (!nlh)
2380                 return -EMSGSIZE;
2381
2382         rtm = nlmsg_data(nlh);
2383         rtm->rtm_family = AF_INET6;
2384         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2385         rtm->rtm_src_len = rt->rt6i_src.plen;
2386         rtm->rtm_tos = 0;
             /* The table id goes both into the header and into RTA_TABLE. */
2387         if (rt->rt6i_table)
2388                 table = rt->rt6i_table->tb6_id;
2389         else
2390                 table = RT6_TABLE_UNSPEC;
2391         rtm->rtm_table = table;
2392         if (nla_put_u32(skb, RTA_TABLE, table))
2393                 goto nla_put_failure;
             /* Derive rtm_type from the route flags, then from the device. */
2394         if (rt->rt6i_flags & RTF_REJECT)
2395                 rtm->rtm_type = RTN_UNREACHABLE;
2396         else if (rt->rt6i_flags & RTF_LOCAL)
2397                 rtm->rtm_type = RTN_LOCAL;
2398         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2399                 rtm->rtm_type = RTN_LOCAL;
2400         else
2401                 rtm->rtm_type = RTN_UNICAST;
2402         rtm->rtm_flags = 0;
2403         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
             /*
              * Start from the stored protocol; RTF_DYNAMIC routes are
              * reported as redirects, RTF_ADDRCONF routes as RA (default or
              * route-info) or kernel.
              */
2404         rtm->rtm_protocol = rt->rt6i_protocol;
2405         if (rt->rt6i_flags & RTF_DYNAMIC)
2406                 rtm->rtm_protocol = RTPROT_REDIRECT;
2407         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2408                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2409                         rtm->rtm_protocol = RTPROT_RA;
2410                 else
2411                         rtm->rtm_protocol = RTPROT_KERNEL;
2412         }
2413
2414         if (rt->rt6i_flags & RTF_CACHE)
2415                 rtm->rtm_flags |= RTM_F_CLONED;
2416
2417         if (dst) {
2418                 if (nla_put(skb, RTA_DST, 16, dst))
2419                         goto nla_put_failure;
2420                 rtm->rtm_dst_len = 128;
2421         } else if (rtm->rtm_dst_len)
2422                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2423                         goto nla_put_failure;
2424 #ifdef CONFIG_IPV6_SUBTREES
2425         if (src) {
2426                 if (nla_put(skb, RTA_SRC, 16, src))
2427                         goto nla_put_failure;
2428                 rtm->rtm_src_len = 128;
2429         } else if (rtm->rtm_src_len &&
2430                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2431                 goto nla_put_failure;
2432 #endif
2433         if (iif) {
2434 #ifdef CONFIG_IPV6_MROUTE
2435                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2436                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2437                         if (err <= 0) {
2438                                 if (!nowait) {
2439                                         if (err == 0)
2440                                                 return 0;
2441                                         goto nla_put_failure;
2442                                 } else {
2443                                         if (err == -EMSGSIZE)
2444                                                 goto nla_put_failure;
2445                                 }
2446                         }
2447                 } else
2448 #endif
                             /*
                              * With CONFIG_IPV6_MROUTE this statement is the
                              * body of the "else" just above the #endif.
                              */
2449                         if (nla_put_u32(skb, RTA_IIF, iif))
2450                                 goto nla_put_failure;
2451         } else if (dst) {
2452                 struct in6_addr saddr_buf;
2453                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2454                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2455                         goto nla_put_failure;
2456         }
2457
             /*
              * NOTE(review): when dst is set and iif is 0 an RTA_PREFSRC may
              * already have been added just above, so this block can add a
              * second one - confirm that is intended.
              */
2458         if (rt->rt6i_prefsrc.plen) {
2459                 struct in6_addr saddr_buf;
2460                 saddr_buf = rt->rt6i_prefsrc.addr;
2461                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2462                         goto nla_put_failure;
2463         }
2464
2465         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2466                 goto nla_put_failure;
2467
             /* The gateway is reported from the route's neighbour entry, if any. */
2468         rcu_read_lock();
2469         n = rt->n;
2470         if (n) {
2471                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2472                         rcu_read_unlock();
2473                         goto nla_put_failure;
2474                 }
2475         }
2476         rcu_read_unlock();
2477
2478         if (rt->dst.dev &&
2479             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2480                 goto nla_put_failure;
2481         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2482                 goto nla_put_failure;
2483
             /* Time left relative to jiffies for RTF_EXPIRES routes, else 0. */
2484         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2485
2486         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2487                 goto nla_put_failure;
2488
2489         return nlmsg_end(skb, nlh);
2490
2491 nla_put_failure:
2492         nlmsg_cancel(skb, nlh);
2493         return -EMSGSIZE;
2494 }
2495
2496 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2497 {
2498         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2499         int prefix;
2500
2501         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2502                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2503                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2504         } else
2505                 prefix = 0;
2506
2507         return rt6_fill_node(arg->net,
2508                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2509                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2510                      prefix, 0, NLM_F_MULTI);
2511 }
2512
2513 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2514 {
2515         struct net *net = sock_net(in_skb->sk);
2516         struct nlattr *tb[RTA_MAX+1];
2517         struct rt6_info *rt;
2518         struct sk_buff *skb;
2519         struct rtmsg *rtm;
2520         struct flowi6 fl6;
2521         int err, iif = 0, oif = 0;
2522
2523         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2524         if (err < 0)
2525                 goto errout;
2526
2527         err = -EINVAL;
2528         memset(&fl6, 0, sizeof(fl6));
2529
2530         if (tb[RTA_SRC]) {
2531                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2532                         goto errout;
2533
2534                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2535         }
2536
2537         if (tb[RTA_DST]) {
2538                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2539                         goto errout;
2540
2541                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2542         }
2543
2544         if (tb[RTA_IIF])
2545                 iif = nla_get_u32(tb[RTA_IIF]);
2546
2547         if (tb[RTA_OIF])
2548                 oif = nla_get_u32(tb[RTA_OIF]);
2549
2550         if (iif) {
2551                 struct net_device *dev;
2552                 int flags = 0;
2553
2554                 dev = __dev_get_by_index(net, iif);
2555                 if (!dev) {
2556                         err = -ENODEV;
2557                         goto errout;
2558                 }
2559
2560                 fl6.flowi6_iif = iif;
2561
2562                 if (!ipv6_addr_any(&fl6.saddr))
2563                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2564
2565                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2566                                                                flags);
2567         } else {
2568                 fl6.flowi6_oif = oif;
2569
2570                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2571         }
2572
2573         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2574         if (!skb) {
2575                 dst_release(&rt->dst);
2576                 err = -ENOBUFS;
2577                 goto errout;
2578         }
2579
2580         /* Reserve room for dummy headers, this skb can pass
2581            through good chunk of routing engine.
2582          */
2583         skb_reset_mac_header(skb);
2584         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2585
2586         skb_dst_set(skb, &rt->dst);
2587
2588         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2589                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2590                             nlh->nlmsg_seq, 0, 0, 0);
2591         if (err < 0) {
2592                 kfree_skb(skb);
2593                 goto errout;
2594         }
2595
2596         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2597 errout:
2598         return err;
2599 }
2600
2601 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2602 {
2603         struct sk_buff *skb;
2604         struct net *net = info->nl_net;
2605         u32 seq;
2606         int err;
2607
2608         err = -ENOBUFS;
2609         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2610
2611         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2612         if (!skb)
2613                 goto errout;
2614
2615         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2616                                 event, info->pid, seq, 0, 0, 0);
2617         if (err < 0) {
2618                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2619                 WARN_ON(err == -EMSGSIZE);
2620                 kfree_skb(skb);
2621                 goto errout;
2622         }
2623         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2624                     info->nlh, gfp_any());
2625         return;
2626 errout:
2627         if (err < 0)
2628                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2629 }
2630
2631 static int ip6_route_dev_notify(struct notifier_block *this,
2632                                 unsigned long event, void *data)
2633 {
2634         struct net_device *dev = (struct net_device *)data;
2635         struct net *net = dev_net(dev);
2636
2637         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2638                 net->ipv6.ip6_null_entry->dst.dev = dev;
2639                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2640 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2641                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2642                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2643                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2644                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2645 #endif
2646         }
2647
2648         return NOTIFY_OK;
2649 }
2650
2651 /*
2652  *      /proc
2653  */
2654
2655 #ifdef CONFIG_PROC_FS
2656
/* Buffer-style /proc argument block: an output buffer plus int counters. */
struct rt6_proc_arg {
	char	*buffer;
	int	offset;
	int	length;
	int	skip;
	int	len;
};
2665
2666 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2667 {
2668         struct seq_file *m = p_arg;
2669         struct neighbour *n;
2670
2671         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2672
2673 #ifdef CONFIG_IPV6_SUBTREES
2674         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2675 #else
2676         seq_puts(m, "00000000000000000000000000000000 00 ");
2677 #endif
2678         rcu_read_lock();
2679         n = rt->n;
2680         if (n) {
2681                 seq_printf(m, "%pi6", n->primary_key);
2682         } else {
2683                 seq_puts(m, "00000000000000000000000000000000");
2684         }
2685         rcu_read_unlock();
2686         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2687                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2688                    rt->dst.__use, rt->rt6i_flags,
2689                    rt->dst.dev ? rt->dst.dev->name : "");
2690         return 0;
2691 }
2692
2693 static int ipv6_route_show(struct seq_file *m, void *v)
2694 {
2695         struct net *net = (struct net *)m->private;
2696         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2697         return 0;
2698 }
2699
2700 static int ipv6_route_open(struct inode *inode, struct file *file)
2701 {
2702         return single_open_net(inode, file, ipv6_route_show);
2703 }
2704
2705 static const struct file_operations ipv6_route_proc_fops = {
2706         .owner          = THIS_MODULE,
2707         .open           = ipv6_route_open,
2708         .read           = seq_read,
2709         .llseek         = seq_lseek,
2710         .release        = single_release_net,
2711 };
2712
2713 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2714 {
2715         struct net *net = (struct net *)seq->private;
2716         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2717                    net->ipv6.rt6_stats->fib_nodes,
2718                    net->ipv6.rt6_stats->fib_route_nodes,
2719                    net->ipv6.rt6_stats->fib_rt_alloc,
2720                    net->ipv6.rt6_stats->fib_rt_entries,
2721                    net->ipv6.rt6_stats->fib_rt_cache,
2722                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2723                    net->ipv6.rt6_stats->fib_discarded_routes);
2724
2725         return 0;
2726 }
2727
2728 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2729 {
2730         return single_open_net(inode, file, rt6_stats_seq_show);
2731 }
2732
2733 static const struct file_operations rt6_stats_seq_fops = {
2734         .owner   = THIS_MODULE,
2735         .open    = rt6_stats_seq_open,
2736         .read    = seq_read,
2737         .llseek  = seq_lseek,
2738         .release = single_release_net,
2739 };
2740 #endif  /* CONFIG_PROC_FS */
2741
2742 #ifdef CONFIG_SYSCTL
2743
2744 static
2745 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2746                               void __user *buffer, size_t *lenp, loff_t *ppos)
2747 {
2748         struct net *net;
2749         int delay;
2750         if (!write)
2751                 return -EINVAL;
2752
2753         net = (struct net *)ctl->extra1;
2754         delay = net->ipv6.sysctl.flush_delay;
2755         proc_dointvec(ctl, write, buffer, lenp, ppos);
2756         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2757         return 0;
2758 }
2759
2760 ctl_table ipv6_route_table_template[] = {
2761         {
2762                 .procname       =       "flush",
2763                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2764                 .maxlen         =       sizeof(int),
2765                 .mode           =       0200,
2766                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2767         },
2768         {
2769                 .procname       =       "gc_thresh",
2770                 .data           =       &ip6_dst_ops_template.gc_thresh,
2771                 .maxlen         =       sizeof(int),
2772                 .mode           =       0644,
2773                 .proc_handler   =       proc_dointvec,
2774         },
2775         {
2776                 .procname       =       "max_size",
2777                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2778                 .maxlen         =       sizeof(int),
2779                 .mode           =       0644,
2780                 .proc_handler   =       proc_dointvec,
2781         },
2782         {
2783                 .procname       =       "gc_min_interval",
2784                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2785                 .maxlen         =       sizeof(int),
2786                 .mode           =       0644,
2787                 .proc_handler   =       proc_dointvec_jiffies,
2788         },
2789         {
2790                 .procname       =       "gc_timeout",
2791                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2792                 .maxlen         =       sizeof(int),
2793                 .mode           =       0644,
2794                 .proc_handler   =       proc_dointvec_jiffies,
2795         },
2796         {
2797                 .procname       =       "gc_interval",
2798                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2799                 .maxlen         =       sizeof(int),
2800                 .mode           =       0644,
2801                 .proc_handler   =       proc_dointvec_jiffies,
2802         },
2803         {
2804                 .procname       =       "gc_elasticity",
2805                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2806                 .maxlen         =       sizeof(int),
2807                 .mode           =       0644,
2808                 .proc_handler   =       proc_dointvec,
2809         },
2810         {
2811                 .procname       =       "mtu_expires",
2812                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2813                 .maxlen         =       sizeof(int),
2814                 .mode           =       0644,
2815                 .proc_handler   =       proc_dointvec_jiffies,
2816         },
2817         {
2818                 .procname       =       "min_adv_mss",
2819                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2820                 .maxlen         =       sizeof(int),
2821                 .mode           =       0644,
2822                 .proc_handler   =       proc_dointvec,
2823         },
2824         {
2825                 .procname       =       "gc_min_interval_ms",
2826                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2827                 .maxlen         =       sizeof(int),
2828                 .mode           =       0644,
2829                 .proc_handler   =       proc_dointvec_ms_jiffies,
2830         },
2831         { }
2832 };
2833
2834 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2835 {
2836         struct ctl_table *table;
2837
2838         table = kmemdup(ipv6_route_table_template,
2839                         sizeof(ipv6_route_table_template),
2840                         GFP_KERNEL);
2841
2842         if (table) {
2843                 table[0].data = &net->ipv6.sysctl.flush_delay;
2844                 table[0].extra1 = net;
2845                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2846                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2847                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2848                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2849                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2850                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2851                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2852                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2853                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2854         }
2855
2856         return table;
2857 }
2858 #endif
2859
2860 static int __net_init ip6_route_net_init(struct net *net)
2861 {
2862         int ret = -ENOMEM;
2863
2864         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2865                sizeof(net->ipv6.ip6_dst_ops));
2866
2867         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2868                 goto out_ip6_dst_ops;
2869
2870         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2871                                            sizeof(*net->ipv6.ip6_null_entry),
2872                                            GFP_KERNEL);
2873         if (!net->ipv6.ip6_null_entry)
2874                 goto out_ip6_dst_entries;
2875         net->ipv6.ip6_null_entry->dst.path =
2876                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2877         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2879                          ip6_template_metrics, true);
2880
2881 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2882         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2883                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2884                                                GFP_KERNEL);
2885         if (!net->ipv6.ip6_prohibit_entry)
2886                 goto out_ip6_null_entry;
2887         net->ipv6.ip6_prohibit_entry->dst.path =
2888                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2889         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2890         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2891                          ip6_template_metrics, true);
2892
2893         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2894                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2895                                                GFP_KERNEL);
2896         if (!net->ipv6.ip6_blk_hole_entry)
2897                 goto out_ip6_prohibit_entry;
2898         net->ipv6.ip6_blk_hole_entry->dst.path =
2899                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2900         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2901         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2902                          ip6_template_metrics, true);
2903 #endif
2904
2905         net->ipv6.sysctl.flush_delay = 0;
2906         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2907         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2908         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2909         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2910         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2911         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2912         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2913
2914         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2915
2916         ret = 0;
2917 out:
2918         return ret;
2919
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921 out_ip6_prohibit_entry:
2922         kfree(net->ipv6.ip6_prohibit_entry);
2923 out_ip6_null_entry:
2924         kfree(net->ipv6.ip6_null_entry);
2925 #endif
2926 out_ip6_dst_entries:
2927         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2928 out_ip6_dst_ops:
2929         goto out;
2930 }
2931
2932 static void __net_exit ip6_route_net_exit(struct net *net)
2933 {
2934         kfree(net->ipv6.ip6_null_entry);
2935 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2936         kfree(net->ipv6.ip6_prohibit_entry);
2937         kfree(net->ipv6.ip6_blk_hole_entry);
2938 #endif
2939         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2940 }
2941
2942 static int __net_init ip6_route_net_init_late(struct net *net)
2943 {
2944 #ifdef CONFIG_PROC_FS
2945         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2946         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2947 #endif
2948         return 0;
2949 }
2950
2951 static void __net_exit ip6_route_net_exit_late(struct net *net)
2952 {
2953 #ifdef CONFIG_PROC_FS
2954         proc_net_remove(net, "ipv6_route");
2955         proc_net_remove(net, "rt6_stats");
2956 #endif
2957 }
2958
2959 static struct pernet_operations ip6_route_net_ops = {
2960         .init = ip6_route_net_init,
2961         .exit = ip6_route_net_exit,
2962 };
2963
2964 static int __net_init ipv6_inetpeer_init(struct net *net)
2965 {
2966         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2967
2968         if (!bp)
2969                 return -ENOMEM;
2970         inet_peer_base_init(bp);
2971         net->ipv6.peers = bp;
2972         return 0;
2973 }
2974
2975 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2976 {
2977         struct inet_peer_base *bp = net->ipv6.peers;
2978
2979         net->ipv6.peers = NULL;
2980         inetpeer_invalidate_tree(bp);
2981         kfree(bp);
2982 }
2983
2984 static struct pernet_operations ipv6_inetpeer_ops = {
2985         .init   =       ipv6_inetpeer_init,
2986         .exit   =       ipv6_inetpeer_exit,
2987 };
2988
2989 static struct pernet_operations ip6_route_net_late_ops = {
2990         .init = ip6_route_net_init_late,
2991         .exit = ip6_route_net_exit_late,
2992 };
2993
2994 static struct notifier_block ip6_route_dev_notifier = {
2995         .notifier_call = ip6_route_dev_notify,
2996         .priority = 0,
2997 };
2998
2999 int __init ip6_route_init(void)
3000 {
3001         int ret;
3002
3003         ret = -ENOMEM;
3004         ip6_dst_ops_template.kmem_cachep =
3005                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3006                                   SLAB_HWCACHE_ALIGN, NULL);
3007         if (!ip6_dst_ops_template.kmem_cachep)
3008                 goto out;
3009
3010         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3011         if (ret)
3012                 goto out_kmem_cache;
3013
3014         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3015         if (ret)
3016                 goto out_dst_entries;
3017
3018         ret = register_pernet_subsys(&ip6_route_net_ops);
3019         if (ret)
3020                 goto out_register_inetpeer;
3021
3022         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3023
3024         /* Registering of the loopback is done before this portion of code,
3025          * the loopback reference in rt6_info will not be taken, do it
3026          * manually for init_net */
3027         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3028         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3029   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3030         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3031         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3032         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3033         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3034   #endif
3035         ret = fib6_init();
3036         if (ret)
3037                 goto out_register_subsys;
3038
3039         ret = xfrm6_init();
3040         if (ret)
3041                 goto out_fib6_init;
3042
3043         ret = fib6_rules_init();
3044         if (ret)
3045                 goto xfrm6_init;
3046
3047         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3048         if (ret)
3049                 goto fib6_rules_init;
3050
3051         ret = -ENOBUFS;
3052         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3053             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3054             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3055                 goto out_register_late_subsys;
3056
3057         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3058         if (ret)
3059                 goto out_register_late_subsys;
3060
3061 out:
3062         return ret;
3063
3064 out_register_late_subsys:
3065         unregister_pernet_subsys(&ip6_route_net_late_ops);
3066 fib6_rules_init:
3067         fib6_rules_cleanup();
3068 xfrm6_init:
3069         xfrm6_fini();
3070 out_fib6_init:
3071         fib6_gc_cleanup();
3072 out_register_subsys:
3073         unregister_pernet_subsys(&ip6_route_net_ops);
3074 out_register_inetpeer:
3075         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3076 out_dst_entries:
3077         dst_entries_destroy(&ip6_dst_blackhole_ops);
3078 out_kmem_cache:
3079         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3080         goto out;
3081 }
3082
3083 void ip6_route_cleanup(void)
3084 {
3085         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3086         unregister_pernet_subsys(&ip6_route_net_late_ops);
3087         fib6_rules_cleanup();
3088         xfrm6_fini();
3089         fib6_gc_cleanup();
3090         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3091         unregister_pernet_subsys(&ip6_route_net_ops);
3092         dst_entries_destroy(&ip6_dst_blackhole_ops);
3093         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3094 }