inetpeer: add parameter net for inet_getpeer_v4,v6
[linux-3.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
/*
 * Forward declarations of static functions that are referenced before
 * their definitions, e.g. by the dst_ops tables and the route entry
 * templates further down.
 */
static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
                                    const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

/* Handlers installed as .input/.output/.link_failure/.update_pmtu below. */
static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         if (!rt->rt6i_peer)
103                 rt6_bind_peer(rt, 1);
104
105         peer = rt->rt6i_peer;
106         if (peer) {
107                 u32 *old_p = __DST_METRICS_PTR(old);
108                 unsigned long prev, new;
109
110                 p = peer->metrics;
111                 if (inet_metrics_new(peer))
112                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
113
114                 new = (unsigned long) p;
115                 prev = cmpxchg(&dst->_metrics, old, new);
116
117                 if (prev != old) {
118                         p = __DST_METRICS_PTR(prev);
119                         if (prev & DST_METRICS_READ_ONLY)
120                                 p = NULL;
121                 }
122         }
123         return p;
124 }
125
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
127 {
128         struct in6_addr *p = &rt->rt6i_gateway;
129
130         if (!ipv6_addr_any(p))
131                 return (const void *) p;
132         return daddr;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         struct rt6_info *rt = (struct rt6_info *) dst;
138         struct neighbour *n;
139
140         daddr = choose_neigh_daddr(rt, daddr);
141         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
142         if (n)
143                 return n;
144         return neigh_create(&nd_tbl, daddr, dst->dev);
145 }
146
147 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
148 {
149         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
150         if (!n) {
151                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152                 if (IS_ERR(n))
153                         return PTR_ERR(n);
154         }
155         dst_set_neighbour(&rt->dst, n);
156
157         return 0;
158 }
159
/*
 * dst_ops for ordinary IPv6 routes: hooks the generic dst callbacks up to
 * the ip6_* handlers of this file.
 *
 * NOTE(review): named a template, and other code here uses
 * net->ipv6.ip6_dst_ops, so this is presumably copied per network
 * namespace -- confirm at the init site.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .mtu                    =       ip6_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_neigh_lookup,
};
177
178 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
179 {
180         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
181
182         return mtu ? : dst->dev->mtu;
183 }
184
/*
 * .update_pmtu hook of ip6_dst_blackhole_ops: intentionally a no-op, so
 * path MTU updates are ignored for blackhole dsts.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
188
/*
 * .cow_metrics hook of ip6_dst_blackhole_ops: always returns NULL, i.e.
 * blackhole dsts never hand out a writable metrics array.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}
194
/*
 * dst_ops for blackhole IPv6 dsts. Shares destroy/check/default_advmss/
 * neigh_lookup with the regular ops but uses the blackhole variants for
 * mtu, update_pmtu (no-op) and cow_metrics (always NULL). No gc, ifdown,
 * negative_advice, link_failure or local_out hooks are set.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ip6_neigh_lookup,
};
206
/*
 * Read-only metrics array with every slot zero except the hop limit,
 * which is 255. Slots are indexed by the RTAX_* constant minus one.
 * NOTE(review): its users are outside this part of the file.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
210
/*
 * Template for the "null" route entry: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) whose dst reports -ENETUNREACH and hands packets to the
 * ip6_pkt_discard handlers. The metric is the largest u32 value.
 *
 * NOTE(review): lookups in this file return net->ipv6.ip6_null_entry when
 * nothing matches; that per-namespace entry is presumably built from this
 * template -- confirm at the init site.
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
225
/*
 * Additional reject-route templates, only built when multiple routing
 * tables are configured.
 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * "Prohibit" entry: same shape as the null entry template, but the dst
 * reports -EACCES and packets go to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

/*
 * "Blackhole" entry: the dst reports -EINVAL and both input and output
 * are handled by dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

#endif
262
263 /* allocate dst with ip6_dst_ops */
264 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
265                                              struct net_device *dev,
266                                              int flags)
267 {
268         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
269
270         if (rt)
271                 memset(&rt->rt6i_table, 0,
272                        sizeof(*rt) - sizeof(struct dst_entry));
273
274         return rt;
275 }
276
277 static void ip6_dst_destroy(struct dst_entry *dst)
278 {
279         struct rt6_info *rt = (struct rt6_info *)dst;
280         struct inet6_dev *idev = rt->rt6i_idev;
281         struct inet_peer *peer = rt->rt6i_peer;
282
283         if (!(rt->dst.flags & DST_HOST))
284                 dst_destroy_metrics_generic(dst);
285
286         if (idev) {
287                 rt->rt6i_idev = NULL;
288                 in6_dev_put(idev);
289         }
290
291         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
292                 dst_release(dst->from);
293
294         if (peer) {
295                 rt->rt6i_peer = NULL;
296                 inet_putpeer(peer);
297         }
298 }
299
/*
 * Generation counter recorded in rt6i_peer_genid by rt6_bind_peer().
 * NOTE(review): nothing in this part of the file increments it.
 */
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

/* Return the current value of the peer generation counter. */
static u32 rt6_peer_genid(void)
{
        return atomic_read(&__rt6_peer_genid);
}
306
307 void rt6_bind_peer(struct rt6_info *rt, int create)
308 {
309         struct net *net = dev_net(rt->dst.dev);
310         struct inet_peer *peer;
311
312         peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create);
313         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
314                 inet_putpeer(peer);
315         else
316                 rt->rt6i_peer_genid = rt6_peer_genid();
317 }
318
319 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
320                            int how)
321 {
322         struct rt6_info *rt = (struct rt6_info *)dst;
323         struct inet6_dev *idev = rt->rt6i_idev;
324         struct net_device *loopback_dev =
325                 dev_net(dev)->loopback_dev;
326
327         if (dev != loopback_dev && idev && idev->dev == dev) {
328                 struct inet6_dev *loopback_idev =
329                         in6_dev_get(loopback_dev);
330                 if (loopback_idev) {
331                         rt->rt6i_idev = loopback_idev;
332                         in6_dev_put(idev);
333                 }
334         }
335 }
336
337 static bool rt6_check_expired(const struct rt6_info *rt)
338 {
339         struct rt6_info *ort = NULL;
340
341         if (rt->rt6i_flags & RTF_EXPIRES) {
342                 if (time_after(jiffies, rt->dst.expires))
343                         return true;
344         } else if (rt->dst.from) {
345                 ort = (struct rt6_info *) rt->dst.from;
346                 return (ort->rt6i_flags & RTF_EXPIRES) &&
347                         time_after(jiffies, ort->dst.expires);
348         }
349         return false;
350 }
351
352 static bool rt6_need_strict(const struct in6_addr *daddr)
353 {
354         return ipv6_addr_type(daddr) &
355                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
356 }
357
358 /*
359  *      Route lookup. Any table->tb6_lock is implied.
360  */
361
362 static inline struct rt6_info *rt6_device_match(struct net *net,
363                                                     struct rt6_info *rt,
364                                                     const struct in6_addr *saddr,
365                                                     int oif,
366                                                     int flags)
367 {
368         struct rt6_info *local = NULL;
369         struct rt6_info *sprt;
370
371         if (!oif && ipv6_addr_any(saddr))
372                 goto out;
373
374         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
375                 struct net_device *dev = sprt->dst.dev;
376
377                 if (oif) {
378                         if (dev->ifindex == oif)
379                                 return sprt;
380                         if (dev->flags & IFF_LOOPBACK) {
381                                 if (!sprt->rt6i_idev ||
382                                     sprt->rt6i_idev->dev->ifindex != oif) {
383                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
384                                                 continue;
385                                         if (local && (!oif ||
386                                                       local->rt6i_idev->dev->ifindex == oif))
387                                                 continue;
388                                 }
389                                 local = sprt;
390                         }
391                 } else {
392                         if (ipv6_chk_addr(net, saddr, dev,
393                                           flags & RT6_LOOKUP_F_IFACE))
394                                 return sprt;
395                 }
396         }
397
398         if (oif) {
399                 if (local)
400                         return local;
401
402                 if (flags & RT6_LOOKUP_F_IFACE)
403                         return net->ipv6.ip6_null_entry;
404         }
405 out:
406         return rt;
407 }
408
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the neighbour entry behind @rt is not
 * in a NUD_VALID state and was last updated more than rtr_probe_interval
 * ago, send a neighbour solicitation to its solicited-node multicast
 * address.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * neigh->updated is modified here, so neigh->lock is taken for writing;
 * a read lock would let concurrent probers race on the timestamp and
 * send more than one solicitation per interval.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        /* Unlocked fast path; the state is re-checked under the lock. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;
        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                /* Stamp first so that the next caller is rate-limited. */
                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                /* The solicitation is sent with the lock dropped. */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
        } else {
                write_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
/* Without router preference support, probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
448
449 /*
450  * Default Router Selection (RFC 2461 6.3.6)
451  */
452 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
453 {
454         struct net_device *dev = rt->dst.dev;
455         if (!oif || dev->ifindex == oif)
456                 return 2;
457         if ((dev->flags & IFF_LOOPBACK) &&
458             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
459                 return 1;
460         return 0;
461 }
462
463 static inline int rt6_check_neigh(struct rt6_info *rt)
464 {
465         struct neighbour *neigh;
466         int m;
467
468         rcu_read_lock();
469         neigh = dst_get_neighbour_noref(&rt->dst);
470         if (rt->rt6i_flags & RTF_NONEXTHOP ||
471             !(rt->rt6i_flags & RTF_GATEWAY))
472                 m = 1;
473         else if (neigh) {
474                 read_lock_bh(&neigh->lock);
475                 if (neigh->nud_state & NUD_VALID)
476                         m = 2;
477 #ifdef CONFIG_IPV6_ROUTER_PREF
478                 else if (neigh->nud_state & NUD_FAILED)
479                         m = 0;
480 #endif
481                 else
482                         m = 1;
483                 read_unlock_bh(&neigh->lock);
484         } else
485                 m = 0;
486         rcu_read_unlock();
487         return m;
488 }
489
490 static int rt6_score_route(struct rt6_info *rt, int oif,
491                            int strict)
492 {
493         int m, n;
494
495         m = rt6_check_dev(rt, oif);
496         if (!m && (strict & RT6_LOOKUP_F_IFACE))
497                 return -1;
498 #ifdef CONFIG_IPV6_ROUTER_PREF
499         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
500 #endif
501         n = rt6_check_neigh(rt);
502         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
503                 return -1;
504         return m;
505 }
506
507 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
508                                    int *mpri, struct rt6_info *match)
509 {
510         int m;
511
512         if (rt6_check_expired(rt))
513                 goto out;
514
515         m = rt6_score_route(rt, oif, strict);
516         if (m < 0)
517                 goto out;
518
519         if (m > *mpri) {
520                 if (strict & RT6_LOOKUP_F_REACHABLE)
521                         rt6_probe(match);
522                 *mpri = m;
523                 match = rt;
524         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
525                 rt6_probe(rt);
526         }
527
528 out:
529         return match;
530 }
531
532 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
533                                      struct rt6_info *rr_head,
534                                      u32 metric, int oif, int strict)
535 {
536         struct rt6_info *rt, *match;
537         int mpri = -1;
538
539         match = NULL;
540         for (rt = rr_head; rt && rt->rt6i_metric == metric;
541              rt = rt->dst.rt6_next)
542                 match = find_match(rt, oif, strict, &mpri, match);
543         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
544              rt = rt->dst.rt6_next)
545                 match = find_match(rt, oif, strict, &mpri, match);
546
547         return match;
548 }
549
550 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
551 {
552         struct rt6_info *match, *rt0;
553         struct net *net;
554
555         rt0 = fn->rr_ptr;
556         if (!rt0)
557                 fn->rr_ptr = rt0 = fn->leaf;
558
559         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
560
561         if (!match &&
562             (strict & RT6_LOOKUP_F_REACHABLE)) {
563                 struct rt6_info *next = rt0->dst.rt6_next;
564
565                 /* no entries matched; do round-robin */
566                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
567                         next = fn->leaf;
568
569                 if (next != rt0)
570                         fn->rr_ptr = next;
571         }
572
573         net = dev_net(rt0->dst.dev);
574         return match ? match : net->ipv6.ip6_null_entry;
575 }
576
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route_info option of @len bytes at @opt, received on @dev
 * from gateway @gwaddr.
 *
 * After validating the option, an existing route for the prefix is
 * deleted when the lifetime is zero, updated with the new preference
 * otherwise, or created when none exists and the lifetime is non-zero.
 * The route's expiry is then cleared (non-finite lifetime) or set to
 * now + lifetime seconds.
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev->ifindex, pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime))
                rt6_set_expires(rt, jiffies + HZ * lifetime);
        else
                rt6_clean_expires(rt);

        dst_release(&rt->dst);
        return 0;
}
#endif
649
/*
 * BACKTRACK - climb the fib6 tree when a lookup produced the null entry.
 *
 * This macro is not self-contained: it reads and writes the caller's
 * local variables `rt` and `fn`, and jumps to the caller's labels `out`
 * and `restart`.
 *
 * If `rt` is the null entry of @__net, it walks upwards from `fn`:
 *  - a node flagged RTN_TL_ROOT ends the walk with `goto out`;
 *  - otherwise `fn` becomes its parent, or, when the parent has a subtree
 *    that is not `fn` itself, the result of a fib6_lookup() for @saddr in
 *    that subtree;
 *  - as soon as the new `fn` carries RTN_RTINFO, control jumps to
 *    `restart` so the caller can retry the selection there.
 * If `rt` is anything else the macro does nothing.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
667
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_lookup() and rt6_lookup().
 *
 * Under table->tb6_lock (read side) it finds the node for the flow's
 * destination/source, filters the node's route list through
 * rt6_device_match(), and backtracks towards the root while the result
 * is the null entry. dst_use() is applied to the result before the lock
 * is released.
 *
 * Returns the selected route (possibly the null entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        /* BACKTRACK jumps back here after moving fn up the tree. */
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        BACKTRACK(net, &fl6->saddr);
out:
        /* Also reached from BACKTRACK when the walk hits a table root. */
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
687
/*
 * Exported (GPL-only) route lookup: runs fib6_rule_lookup() for flow @fl6
 * in namespace @net with ip6_pol_route_lookup() as the per-table lookup
 * function, and returns whatever dst entry that yields.
 */
struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                    int flags)
{
        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
694
695 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
696                             const struct in6_addr *saddr, int oif, int strict)
697 {
698         struct flowi6 fl6 = {
699                 .flowi6_oif = oif,
700                 .daddr = *daddr,
701         };
702         struct dst_entry *dst;
703         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
704
705         if (saddr) {
706                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
707                 flags |= RT6_LOOKUP_F_HAS_SADDR;
708         }
709
710         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
711         if (dst->error == 0)
712                 return (struct rt6_info *) dst;
713
714         dst_release(dst);
715
716         return NULL;
717 }
718
719 EXPORT_SYMBOL(rt6_lookup);
720
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold it,
   it may be destroyed.
 */
726
727 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
728 {
729         int err;
730         struct fib6_table *table;
731
732         table = rt->rt6i_table;
733         write_lock_bh(&table->tb6_lock);
734         err = fib6_add(&table->tb6_root, rt, info);
735         write_unlock_bh(&table->tb6_lock);
736
737         return err;
738 }
739
740 int ip6_ins_rt(struct rt6_info *rt)
741 {
742         struct nl_info info = {
743                 .nl_net = dev_net(rt->dst.dev),
744         };
745         return __ip6_ins_rt(rt, &info);
746 }
747
/*
 * Create a cached (RTF_CACHE) copy of @ort for destination @daddr and
 * bind a neighbour entry to it.
 *
 * @saddr is only used with CONFIG_IPV6_SUBTREES, to turn a source-prefix
 * route into a /128 source entry.
 *
 * Returns the new route, or NULL when the copy could not be allocated or
 * no neighbour could be bound (the copy is freed in that case).
 */
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
                                      const struct in6_addr *daddr,
                                      const struct in6_addr *saddr)
{
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        rt = ip6_rt_copy(ort, daddr);

        if (rt) {
                /* One GC-and-retry round is allowed, but not from softirq. */
                int attempts = !in_softirq();

                if (!(rt->rt6i_flags & RTF_GATEWAY)) {
                        /*
                         * Not a gateway route: the destination itself is
                         * stored as gateway. It is flagged anycast when it
                         * equals the address of a non-/128 origin prefix.
                         */
                        if (ort->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                                rt->rt6i_flags |= RTF_ANYCAST;
                        rt->rt6i_gateway = *daddr;
                }

                rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
                        rt->rt6i_src.plen = 128;
                }
#endif

        retry:
                if (rt6_bind_neighbour(rt, rt->dst.dev)) {
                        struct net *net = dev_net(rt->dst.dev);
                        int saved_rt_min_interval =
                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
                        int saved_rt_elasticity =
                                net->ipv6.sysctl.ip6_rt_gc_elasticity;

                        if (attempts-- > 0) {
                                /*
                                 * Binding failed: run the dst garbage
                                 * collector once with elasticity 1 and a
                                 * zero minimum interval, restore the saved
                                 * settings, and try again.
                                 * NOTE(review): the per-namespace sysctl
                                 * values are changed in place with no
                                 * locking visible here -- confirm that
                                 * concurrent readers tolerate this.
                                 */
                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);

                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
                                        saved_rt_elasticity;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
                                        saved_rt_min_interval;
                                goto retry;
                        }

                        net_warn_ratelimited("Neighbour table overflow\n");
                        dst_free(&rt->dst);
                        return NULL;
                }
        }

        return rt;
}
808
809 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
810                                         const struct in6_addr *daddr)
811 {
812         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
813
814         if (rt) {
815                 rt->rt6i_flags |= RTF_CACHE;
816                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
817         }
818         return rt;
819 }
820
/*
 * Core per-table route resolution shared by the input and output paths.
 *
 * @oif is the interface index to match (the callers pass the flow's iif
 * or oif); @flags contributes RT6_LOOKUP_F_IFACE.
 *
 * Unless forwarding is enabled in devconf_all, the first pass only
 * accepts routes whose next hop passes the reachability check; when that
 * pass ends at `out`, the lookup is repeated once without the
 * restriction.
 *
 * A selected route that is neither the null entry nor already RTF_CACHE
 * is turned into a cached copy (cow or clone) outside the table lock and
 * inserted; if that fails the whole lookup is retried, up to three
 * attempts in total.
 *
 * Returns a held route (possibly the null entry) with lastuse and __use
 * updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        /* BACKTRACK jumps back here after moving fn up the tree. */
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Keep rt alive while the table lock is dropped for allocation. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                /* Host route with a neighbour (or no next hop): use as is. */
                goto out2;

        dst_release(&rt->dst);
        /* Allocation failure falls back to the null entry. */
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /*
         * Reached with the table lock held, for the null entry or an
         * RTF_CACHE route, and from BACKTRACK at a table root. If the
         * reachability restriction is still on, drop it and look up again.
         */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
890
/*
 * Input-path lookup callback for fib6_rule_lookup(): resolves the route
 * with ip6_pol_route(), matching on the flow's incoming interface.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
                                            struct flowi6 *fl6, int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
896
897 static struct dst_entry *ip6_route_input_lookup(struct net *net,
898                                                 struct net_device *dev,
899                                                 struct flowi6 *fl6, int flags)
900 {
901         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
902                 flags |= RT6_LOOKUP_F_IFACE;
903
904         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
905 }
906
907 void ip6_route_input(struct sk_buff *skb)
908 {
909         const struct ipv6hdr *iph = ipv6_hdr(skb);
910         struct net *net = dev_net(skb->dev);
911         int flags = RT6_LOOKUP_F_HAS_SADDR;
912         struct flowi6 fl6 = {
913                 .flowi6_iif = skb->dev->ifindex,
914                 .daddr = iph->daddr,
915                 .saddr = iph->saddr,
916                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
917                 .flowi6_mark = skb->mark,
918                 .flowi6_proto = iph->nexthdr,
919         };
920
921         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
922 }
923
924 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
925                                              struct flowi6 *fl6, int flags)
926 {
927         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
928 }
929
930 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
931                                     struct flowi6 *fl6)
932 {
933         int flags = 0;
934
935         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
936                 flags |= RT6_LOOKUP_F_IFACE;
937
938         if (!ipv6_addr_any(&fl6->saddr))
939                 flags |= RT6_LOOKUP_F_HAS_SADDR;
940         else if (sk)
941                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
942
943         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
944 }
945
946 EXPORT_SYMBOL(ip6_route_output);
947
948 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
949 {
950         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
951         struct dst_entry *new = NULL;
952
953         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
954         if (rt) {
955                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
956
957                 new = &rt->dst;
958
959                 new->__use = 1;
960                 new->input = dst_discard;
961                 new->output = dst_discard;
962
963                 if (dst_metrics_read_only(&ort->dst))
964                         new->_metrics = ort->dst._metrics;
965                 else
966                         dst_copy_metrics(new, &ort->dst);
967                 rt->rt6i_idev = ort->rt6i_idev;
968                 if (rt->rt6i_idev)
969                         in6_dev_hold(rt->rt6i_idev);
970
971                 rt->rt6i_gateway = ort->rt6i_gateway;
972                 rt->rt6i_flags = ort->rt6i_flags;
973                 rt6_clean_expires(rt);
974                 rt->rt6i_metric = 0;
975
976                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
977 #ifdef CONFIG_IPV6_SUBTREES
978                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
979 #endif
980
981                 dst_free(new);
982         }
983
984         dst_release(dst_orig);
985         return new ? new : ERR_PTR(-ENOMEM);
986 }
987
988 /*
989  *      Destination cache support functions
990  */
991
992 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
993 {
994         struct rt6_info *rt;
995
996         rt = (struct rt6_info *) dst;
997
998         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
999                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1000                         if (!rt->rt6i_peer)
1001                                 rt6_bind_peer(rt, 0);
1002                         rt->rt6i_peer_genid = rt6_peer_genid();
1003                 }
1004                 return dst;
1005         }
1006         return NULL;
1007 }
1008
1009 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1010 {
1011         struct rt6_info *rt = (struct rt6_info *) dst;
1012
1013         if (rt) {
1014                 if (rt->rt6i_flags & RTF_CACHE) {
1015                         if (rt6_check_expired(rt)) {
1016                                 ip6_del_rt(rt);
1017                                 dst = NULL;
1018                         }
1019                 } else {
1020                         dst_release(dst);
1021                         dst = NULL;
1022                 }
1023         }
1024         return dst;
1025 }
1026
1027 static void ip6_link_failure(struct sk_buff *skb)
1028 {
1029         struct rt6_info *rt;
1030
1031         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1032
1033         rt = (struct rt6_info *) skb_dst(skb);
1034         if (rt) {
1035                 if (rt->rt6i_flags & RTF_CACHE)
1036                         rt6_update_expires(rt, 0);
1037                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1038                         rt->rt6i_node->fn_sernum = -1;
1039         }
1040 }
1041
1042 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1043 {
1044         struct rt6_info *rt6 = (struct rt6_info*)dst;
1045
1046         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1047                 rt6->rt6i_flags |= RTF_MODIFIED;
1048                 if (mtu < IPV6_MIN_MTU) {
1049                         u32 features = dst_metric(dst, RTAX_FEATURES);
1050                         mtu = IPV6_MIN_MTU;
1051                         features |= RTAX_FEATURE_ALLFRAG;
1052                         dst_metric_set(dst, RTAX_FEATURES, features);
1053                 }
1054                 dst_metric_set(dst, RTAX_MTU, mtu);
1055         }
1056 }
1057
1058 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1059 {
1060         struct net_device *dev = dst->dev;
1061         unsigned int mtu = dst_mtu(dst);
1062         struct net *net = dev_net(dev);
1063
1064         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1065
1066         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1067                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1068
1069         /*
1070          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1071          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1072          * IPV6_MAXPLEN is also valid and means: "any MSS,
1073          * rely only on pmtu discovery"
1074          */
1075         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1076                 mtu = IPV6_MAXPLEN;
1077         return mtu;
1078 }
1079
1080 static unsigned int ip6_mtu(const struct dst_entry *dst)
1081 {
1082         struct inet6_dev *idev;
1083         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1084
1085         if (mtu)
1086                 return mtu;
1087
1088         mtu = IPV6_MIN_MTU;
1089
1090         rcu_read_lock();
1091         idev = __in6_dev_get(dst->dev);
1092         if (idev)
1093                 mtu = idev->cnf.mtu6;
1094         rcu_read_unlock();
1095
1096         return mtu;
1097 }
1098
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all()
 * unlink and free entries from it.  All accesses are made with
 * icmp6_dst_lock held.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1101
1102 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1103                                   struct neighbour *neigh,
1104                                   struct flowi6 *fl6)
1105 {
1106         struct dst_entry *dst;
1107         struct rt6_info *rt;
1108         struct inet6_dev *idev = in6_dev_get(dev);
1109         struct net *net = dev_net(dev);
1110
1111         if (unlikely(!idev))
1112                 return ERR_PTR(-ENODEV);
1113
1114         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1115         if (unlikely(!rt)) {
1116                 in6_dev_put(idev);
1117                 dst = ERR_PTR(-ENOMEM);
1118                 goto out;
1119         }
1120
1121         if (neigh)
1122                 neigh_hold(neigh);
1123         else {
1124                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1125                 if (IS_ERR(neigh)) {
1126                         in6_dev_put(idev);
1127                         dst_free(&rt->dst);
1128                         return ERR_CAST(neigh);
1129                 }
1130         }
1131
1132         rt->dst.flags |= DST_HOST;
1133         rt->dst.output  = ip6_output;
1134         dst_set_neighbour(&rt->dst, neigh);
1135         atomic_set(&rt->dst.__refcnt, 1);
1136         rt->rt6i_dst.addr = fl6->daddr;
1137         rt->rt6i_dst.plen = 128;
1138         rt->rt6i_idev     = idev;
1139         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         rt->dst.next = icmp6_dst_gc_list;
1143         icmp6_dst_gc_list = &rt->dst;
1144         spin_unlock_bh(&icmp6_dst_lock);
1145
1146         fib6_force_start_gc(net);
1147
1148         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1149
1150 out:
1151         return dst;
1152 }
1153
1154 int icmp6_dst_gc(void)
1155 {
1156         struct dst_entry *dst, **pprev;
1157         int more = 0;
1158
1159         spin_lock_bh(&icmp6_dst_lock);
1160         pprev = &icmp6_dst_gc_list;
1161
1162         while ((dst = *pprev) != NULL) {
1163                 if (!atomic_read(&dst->__refcnt)) {
1164                         *pprev = dst->next;
1165                         dst_free(dst);
1166                 } else {
1167                         pprev = &dst->next;
1168                         ++more;
1169                 }
1170         }
1171
1172         spin_unlock_bh(&icmp6_dst_lock);
1173
1174         return more;
1175 }
1176
1177 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1178                             void *arg)
1179 {
1180         struct dst_entry *dst, **pprev;
1181
1182         spin_lock_bh(&icmp6_dst_lock);
1183         pprev = &icmp6_dst_gc_list;
1184         while ((dst = *pprev) != NULL) {
1185                 struct rt6_info *rt = (struct rt6_info *) dst;
1186                 if (func(rt, arg)) {
1187                         *pprev = dst->next;
1188                         dst_free(dst);
1189                 } else {
1190                         pprev = &dst->next;
1191                 }
1192         }
1193         spin_unlock_bh(&icmp6_dst_lock);
1194 }
1195
1196 static int ip6_dst_gc(struct dst_ops *ops)
1197 {
1198         unsigned long now = jiffies;
1199         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1200         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1201         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1202         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1203         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1204         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1205         int entries;
1206
1207         entries = dst_entries_get_fast(ops);
1208         if (time_after(rt_last_gc + rt_min_interval, now) &&
1209             entries <= rt_max_size)
1210                 goto out;
1211
1212         net->ipv6.ip6_rt_gc_expire++;
1213         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1214         net->ipv6.ip6_rt_last_gc = now;
1215         entries = dst_entries_get_slow(ops);
1216         if (entries < ops->gc_thresh)
1217                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1218 out:
1219         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1220         return entries > rt_max_size;
1221 }
1222
1223 /* Clean host part of a prefix. Not necessary in radix tree,
1224    but results in cleaner routing tables.
1225
1226    Remove it only when all the things will work!
1227  */
1228
1229 int ip6_dst_hoplimit(struct dst_entry *dst)
1230 {
1231         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1232         if (hoplimit == 0) {
1233                 struct net_device *dev = dst->dev;
1234                 struct inet6_dev *idev;
1235
1236                 rcu_read_lock();
1237                 idev = __in6_dev_get(dev);
1238                 if (idev)
1239                         hoplimit = idev->cnf.hop_limit;
1240                 else
1241                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1242                 rcu_read_unlock();
1243         }
1244         return hoplimit;
1245 }
1246 EXPORT_SYMBOL(ip6_dst_hoplimit);
1247
1248 /*
1249  *
1250  */
1251
/*
 * Build a route from the configuration in @cfg and insert it into the
 * FIB table selected by cfg->fc_table.
 *
 * Side effects on @cfg: fc_metric defaults to IP6_RT_PRIO_USER when 0,
 * fc_protocol defaults to RTPROT_BOOT when RTPROT_UNSPEC, and
 * fc_nlinfo.nl_net is overwritten with the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() once the route is fully built, or
 * a negative errno if validation or allocation fails.  On those early
 * failures the device and inet6_dev references taken here are dropped
 * and the partially built route is freed (see the "out" label).
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source-specific routes need subtree support. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* An explicit interface must exist and have IPv6 state. */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        err = -ENOBUFS;
        /*
         * A netlink request without NLM_F_CREATE first tries the existing
         * table; if there is none, warn but still create it.
         */
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

        if (!rt) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;

        if (cfg->fc_flags & RTF_EXPIRES)
                rt6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                rt6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type and flags. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags |= DST_HOST;

        /* Non-host routes that carry metrics get a private metrics array. */
        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                rt->rt6i_gateway = *gw_addr;
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;

                        /* Find the route currently used to reach the gateway. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (!grt)
                                goto out;
                        if (dev) {
                                /* The gateway must be reached via the requested device. */
                                if (dev != grt->dst.dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: inherit it from the gateway's route. */
                                dev = grt->dst.dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* err stays -EHOSTUNREACH if the gateway's own route is gatewayed. */
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (!dev || (dev->flags & IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        /* A preferred source address must be configured on the device. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                err = rt6_bind_neighbour(rt, dev);
                if (err)
                        goto out;
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Apply the user-supplied metrics (netlink attributes), if any. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev references taken above now belong to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* Error exit: undo every reference and allocation made so far. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1482
1483 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1484 {
1485         int err;
1486         struct fib6_table *table;
1487         struct net *net = dev_net(rt->dst.dev);
1488
1489         if (rt == net->ipv6.ip6_null_entry)
1490                 return -ENOENT;
1491
1492         table = rt->rt6i_table;
1493         write_lock_bh(&table->tb6_lock);
1494
1495         err = fib6_del(rt, info);
1496         dst_release(&rt->dst);
1497
1498         write_unlock_bh(&table->tb6_lock);
1499
1500         return err;
1501 }
1502
1503 int ip6_del_rt(struct rt6_info *rt)
1504 {
1505         struct nl_info info = {
1506                 .nl_net = dev_net(rt->dst.dev),
1507         };
1508         return __ip6_del_rt(rt, &info);
1509 }
1510
1511 static int ip6_route_del(struct fib6_config *cfg)
1512 {
1513         struct fib6_table *table;
1514         struct fib6_node *fn;
1515         struct rt6_info *rt;
1516         int err = -ESRCH;
1517
1518         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1519         if (!table)
1520                 return err;
1521
1522         read_lock_bh(&table->tb6_lock);
1523
1524         fn = fib6_locate(&table->tb6_root,
1525                          &cfg->fc_dst, cfg->fc_dst_len,
1526                          &cfg->fc_src, cfg->fc_src_len);
1527
1528         if (fn) {
1529                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1530                         if (cfg->fc_ifindex &&
1531                             (!rt->dst.dev ||
1532                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1533                                 continue;
1534                         if (cfg->fc_flags & RTF_GATEWAY &&
1535                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1536                                 continue;
1537                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1538                                 continue;
1539                         dst_hold(&rt->dst);
1540                         read_unlock_bh(&table->tb6_lock);
1541
1542                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1543                 }
1544         }
1545         read_unlock_bh(&table->tb6_lock);
1546
1547         return err;
1548 }
1549
1550 /*
1551  *      Handle redirects
1552  */
/*
 * Flow key extended with the redirecting gateway's address.
 *
 * fl6 must remain the first member: __ip6_route_redirect() receives a
 * pointer to fl6 and casts it back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;              /* ordinary flow key used for the lookup */
        struct in6_addr gateway;        /* router the redirect claims to come from */
};
1557
1558 static struct rt6_info *__ip6_route_redirect(struct net *net,
1559                                              struct fib6_table *table,
1560                                              struct flowi6 *fl6,
1561                                              int flags)
1562 {
1563         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1564         struct rt6_info *rt;
1565         struct fib6_node *fn;
1566
1567         /*
1568          * Get the "current" route for this destination and
1569          * check if the redirect has come from approriate router.
1570          *
1571          * RFC 2461 specifies that redirects should only be
1572          * accepted if they come from the nexthop to the target.
1573          * Due to the way the routes are chosen, this notion
1574          * is a bit fuzzy and one might need to check all possible
1575          * routes.
1576          */
1577
1578         read_lock_bh(&table->tb6_lock);
1579         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1580 restart:
1581         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1582                 /*
1583                  * Current route is on-link; redirect is always invalid.
1584                  *
1585                  * Seems, previous statement is not true. It could
1586                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1587                  * But then router serving it might decide, that we should
1588                  * know truth 8)8) --ANK (980726).
1589                  */
1590                 if (rt6_check_expired(rt))
1591                         continue;
1592                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1593                         continue;
1594                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1595                         continue;
1596                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1597                         continue;
1598                 break;
1599         }
1600
1601         if (!rt)
1602                 rt = net->ipv6.ip6_null_entry;
1603         BACKTRACK(net, &fl6->saddr);
1604 out:
1605         dst_hold(&rt->dst);
1606
1607         read_unlock_bh(&table->tb6_lock);
1608
1609         return rt;
1610 };
1611
1612 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1613                                            const struct in6_addr *src,
1614                                            const struct in6_addr *gateway,
1615                                            struct net_device *dev)
1616 {
1617         int flags = RT6_LOOKUP_F_HAS_SADDR;
1618         struct net *net = dev_net(dev);
1619         struct ip6rd_flowi rdfl = {
1620                 .fl6 = {
1621                         .flowi6_oif = dev->ifindex,
1622                         .daddr = *dest,
1623                         .saddr = *src,
1624                 },
1625         };
1626
1627         rdfl.gateway = *gateway;
1628
1629         if (rt6_need_strict(dest))
1630                 flags |= RT6_LOOKUP_F_IFACE;
1631
1632         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1633                                                    flags, __ip6_route_redirect);
1634 }
1635
/*
 * Process an accepted-in-principle redirect.
 *
 * @dest:    destination being redirected
 * @src:     source address used for the route lookup
 * @saddr:   address of the router that sent the redirect (passed to
 *           ip6_route_redirect() as the gateway to match)
 * @neigh:   neighbour entry of the new next hop
 * @lladdr:  link-layer address to record for @neigh
 * @on_link: non-zero when the destination is itself the new next hop
 *
 * If the redirect came from a valid next hop, the neighbour is updated and
 * a cache route to @dest via @neigh is inserted, replacing an existing
 * cache route if there was one.
 */
void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
                  const struct in6_addr *saddr,
                  struct neighbour *neigh, u8 *lladdr, int on_link)
{
        struct rt6_info *rt, *nrt = NULL;
        struct netevent_redirect netevent;
        struct net *net = dev_net(neigh->dev);

        /* Returns with a reference held on rt (released at "out"). */
        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

        if (rt == net->ipv6.ip6_null_entry) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                goto out;
        }

        /*
         *      We have finally decided to accept it.
         */

        neigh_update(neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER))
                     );

        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Duplicate redirect: silently ignore. */
        if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
                goto out;

        nrt = ip6_rt_copy(rt, dest);
        if (!nrt)
                goto out;

        /* The new entry is a dynamic cache route; on-link targets need no gateway. */
        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));

        if (ip6_ins_rt(nrt))
                goto out;

        /* Tell netevent listeners that rt has been superseded by nrt. */
        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

        /*
         * An old cache entry is removed outright.  ip6_del_rt() drops our
         * reference on rt, so return without passing through "out".
         */
        if (rt->rt6i_flags & RTF_CACHE) {
                ip6_del_rt(rt);
                return;
        }

out:
        dst_release(&rt->dst);
}
1699
1700 /*
1701  *      Handle ICMP "packet too big" messages
1702  *      i.e. Path MTU discovery
1703  */
1704
/*
 * Apply a "packet too big" report for the path @saddr -> @daddr.
 *
 * @pmtu is the reported path MTU; @ifindex restricts the route lookup to
 * one interface (it is passed straight to rt6_lookup()).
 *
 * A report that does not lower the current MTU is ignored.  Otherwise the
 * new MTU is stored on the matching cache route, or on a freshly created
 * host route when the match is not a cache route.  Either way the entry
 * expires after the ip6_rt_mtu_expires sysctl interval.
 */
static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
                             struct net *net, u32 pmtu, int ifindex)
{
        struct rt6_info *rt, *nrt;
        int allfrag = 0;
again:
        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
        if (!rt)
                return;

        /* Expired entry: delete it (this drops our reference) and look again. */
        if (rt6_check_expired(rt)) {
                ip6_del_rt(rt);
                goto again;
        }

        if (pmtu >= dst_mtu(&rt->dst))
                goto out;

        if (pmtu < IPV6_MIN_MTU) {
                /*
                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
                 * MTU (1280) and a fragment header should always be included
                 * after a node receiving Too Big message reporting PMTU is
                 * less than the IPv6 Minimum Link MTU.
                 */
                pmtu = IPV6_MIN_MTU;
                allfrag = 1;
        }

        /* New mtu received -> path was valid.
           They are sent only in response to data packets,
           so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Host route. If it is static, it would be better
           not to override it, but add new one, so that
           when cache entry will expire old pmtu
           would return automatically.
         */
        if (rt->rt6i_flags & RTF_CACHE) {
                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
                if (allfrag) {
                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
                        features |= RTAX_FEATURE_ALLFRAG;
                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
                }
                rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
                rt->rt6i_flags |= RTF_MODIFIED;
                goto out;
        }

        /* Network route.
           Two cases are possible:
           1. It is connected route. Action: COW
           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
         */
        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, daddr, saddr);
        else
                nrt = rt6_alloc_clone(rt, daddr);

        if (nrt) {
                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
                if (allfrag) {
                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
                        features |= RTAX_FEATURE_ALLFRAG;
                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
                }

                /* According to RFC 1981, a PMTU increase must not be probed
                 * for less than 5 minutes after a Packet Too Big message;
                 * the recommended timer is 10 minutes.  The route expiry is
                 * therefore set to ip6_rt_mtu_expires, which is 10 minutes.
                 * Once it expires the reduced PMTU is dropped and a larger
                 * PMTU is detected again automatically.
                 */
                rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
                nrt->rt6i_flags |= RTF_DYNAMIC;
                ip6_ins_rt(nrt);
        }
out:
        dst_release(&rt->dst);
}
1788
1789 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1790                         struct net_device *dev, u32 pmtu)
1791 {
1792         struct net *net = dev_net(dev);
1793
1794         /*
1795          * RFC 1981 states that a node "MUST reduce the size of the packets it
1796          * is sending along the path" that caused the Packet Too Big message.
1797          * Since it's not possible in the general case to determine which
1798          * interface was used to send the original packet, we update the MTU
1799          * on the interface that will be used to send future packets. We also
1800          * update the MTU on the interface that received the Packet Too Big in
1801          * case the original packet was forced out that interface with
1802          * SO_BINDTODEVICE or similar. This is the next best thing to the
1803          * correct behaviour, which would be to update the MTU on all
1804          * interfaces.
1805          */
1806         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1807         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1808 }
1809
1810 /*
1811  *      Misc support functions
1812  */
1813
1814 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1815                                     const struct in6_addr *dest)
1816 {
1817         struct net *net = dev_net(ort->dst.dev);
1818         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1819                                             ort->dst.dev, 0);
1820
1821         if (rt) {
1822                 rt->dst.input = ort->dst.input;
1823                 rt->dst.output = ort->dst.output;
1824                 rt->dst.flags |= DST_HOST;
1825
1826                 rt->rt6i_dst.addr = *dest;
1827                 rt->rt6i_dst.plen = 128;
1828                 dst_copy_metrics(&rt->dst, &ort->dst);
1829                 rt->dst.error = ort->dst.error;
1830                 rt->rt6i_idev = ort->rt6i_idev;
1831                 if (rt->rt6i_idev)
1832                         in6_dev_hold(rt->rt6i_idev);
1833                 rt->dst.lastuse = jiffies;
1834
1835                 rt->rt6i_gateway = ort->rt6i_gateway;
1836                 rt->rt6i_flags = ort->rt6i_flags;
1837                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1838                     (RTF_DEFAULT | RTF_ADDRCONF))
1839                         rt6_set_from(rt, ort);
1840                 else
1841                         rt6_clean_expires(rt);
1842                 rt->rt6i_metric = 0;
1843
1844 #ifdef CONFIG_IPV6_SUBTREES
1845                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1846 #endif
1847                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1848                 rt->rt6i_table = ort->rt6i_table;
1849         }
1850         return rt;
1851 }
1852
1853 #ifdef CONFIG_IPV6_ROUTE_INFO
1854 static struct rt6_info *rt6_get_route_info(struct net *net,
1855                                            const struct in6_addr *prefix, int prefixlen,
1856                                            const struct in6_addr *gwaddr, int ifindex)
1857 {
1858         struct fib6_node *fn;
1859         struct rt6_info *rt = NULL;
1860         struct fib6_table *table;
1861
1862         table = fib6_get_table(net, RT6_TABLE_INFO);
1863         if (!table)
1864                 return NULL;
1865
1866         write_lock_bh(&table->tb6_lock);
1867         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1868         if (!fn)
1869                 goto out;
1870
1871         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1872                 if (rt->dst.dev->ifindex != ifindex)
1873                         continue;
1874                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1875                         continue;
1876                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1877                         continue;
1878                 dst_hold(&rt->dst);
1879                 break;
1880         }
1881 out:
1882         write_unlock_bh(&table->tb6_lock);
1883         return rt;
1884 }
1885
1886 static struct rt6_info *rt6_add_route_info(struct net *net,
1887                                            const struct in6_addr *prefix, int prefixlen,
1888                                            const struct in6_addr *gwaddr, int ifindex,
1889                                            unsigned int pref)
1890 {
1891         struct fib6_config cfg = {
1892                 .fc_table       = RT6_TABLE_INFO,
1893                 .fc_metric      = IP6_RT_PRIO_USER,
1894                 .fc_ifindex     = ifindex,
1895                 .fc_dst_len     = prefixlen,
1896                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1897                                   RTF_UP | RTF_PREF(pref),
1898                 .fc_nlinfo.pid = 0,
1899                 .fc_nlinfo.nlh = NULL,
1900                 .fc_nlinfo.nl_net = net,
1901         };
1902
1903         cfg.fc_dst = *prefix;
1904         cfg.fc_gateway = *gwaddr;
1905
1906         /* We should treat it as a default route if prefix length is 0. */
1907         if (!prefixlen)
1908                 cfg.fc_flags |= RTF_DEFAULT;
1909
1910         ip6_route_add(&cfg);
1911
1912         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1913 }
1914 #endif
1915
1916 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1917 {
1918         struct rt6_info *rt;
1919         struct fib6_table *table;
1920
1921         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1922         if (!table)
1923                 return NULL;
1924
1925         write_lock_bh(&table->tb6_lock);
1926         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1927                 if (dev == rt->dst.dev &&
1928                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1929                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1930                         break;
1931         }
1932         if (rt)
1933                 dst_hold(&rt->dst);
1934         write_unlock_bh(&table->tb6_lock);
1935         return rt;
1936 }
1937
1938 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1939                                      struct net_device *dev,
1940                                      unsigned int pref)
1941 {
1942         struct fib6_config cfg = {
1943                 .fc_table       = RT6_TABLE_DFLT,
1944                 .fc_metric      = IP6_RT_PRIO_USER,
1945                 .fc_ifindex     = dev->ifindex,
1946                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1947                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1948                 .fc_nlinfo.pid = 0,
1949                 .fc_nlinfo.nlh = NULL,
1950                 .fc_nlinfo.nl_net = dev_net(dev),
1951         };
1952
1953         cfg.fc_gateway = *gwaddr;
1954
1955         ip6_route_add(&cfg);
1956
1957         return rt6_get_dflt_router(gwaddr, dev);
1958 }
1959
1960 void rt6_purge_dflt_routers(struct net *net)
1961 {
1962         struct rt6_info *rt;
1963         struct fib6_table *table;
1964
1965         /* NOTE: Keep consistent with rt6_get_dflt_router */
1966         table = fib6_get_table(net, RT6_TABLE_DFLT);
1967         if (!table)
1968                 return;
1969
1970 restart:
1971         read_lock_bh(&table->tb6_lock);
1972         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1973                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1974                         dst_hold(&rt->dst);
1975                         read_unlock_bh(&table->tb6_lock);
1976                         ip6_del_rt(rt);
1977                         goto restart;
1978                 }
1979         }
1980         read_unlock_bh(&table->tb6_lock);
1981 }
1982
1983 static void rtmsg_to_fib6_config(struct net *net,
1984                                  struct in6_rtmsg *rtmsg,
1985                                  struct fib6_config *cfg)
1986 {
1987         memset(cfg, 0, sizeof(*cfg));
1988
1989         cfg->fc_table = RT6_TABLE_MAIN;
1990         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1991         cfg->fc_metric = rtmsg->rtmsg_metric;
1992         cfg->fc_expires = rtmsg->rtmsg_info;
1993         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1994         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1995         cfg->fc_flags = rtmsg->rtmsg_flags;
1996
1997         cfg->fc_nlinfo.nl_net = net;
1998
1999         cfg->fc_dst = rtmsg->rtmsg_dst;
2000         cfg->fc_src = rtmsg->rtmsg_src;
2001         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2002 }
2003
2004 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2005 {
2006         struct fib6_config cfg;
2007         struct in6_rtmsg rtmsg;
2008         int err;
2009
2010         switch(cmd) {
2011         case SIOCADDRT:         /* Add a route */
2012         case SIOCDELRT:         /* Delete a route */
2013                 if (!capable(CAP_NET_ADMIN))
2014                         return -EPERM;
2015                 err = copy_from_user(&rtmsg, arg,
2016                                      sizeof(struct in6_rtmsg));
2017                 if (err)
2018                         return -EFAULT;
2019
2020                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2021
2022                 rtnl_lock();
2023                 switch (cmd) {
2024                 case SIOCADDRT:
2025                         err = ip6_route_add(&cfg);
2026                         break;
2027                 case SIOCDELRT:
2028                         err = ip6_route_del(&cfg);
2029                         break;
2030                 default:
2031                         err = -EINVAL;
2032                 }
2033                 rtnl_unlock();
2034
2035                 return err;
2036         }
2037
2038         return -EINVAL;
2039 }
2040
2041 /*
2042  *      Drop the packet on the floor
2043  */
2044
2045 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2046 {
2047         int type;
2048         struct dst_entry *dst = skb_dst(skb);
2049         switch (ipstats_mib_noroutes) {
2050         case IPSTATS_MIB_INNOROUTES:
2051                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2052                 if (type == IPV6_ADDR_ANY) {
2053                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2054                                       IPSTATS_MIB_INADDRERRORS);
2055                         break;
2056                 }
2057                 /* FALLTHROUGH */
2058         case IPSTATS_MIB_OUTNOROUTES:
2059                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2060                               ipstats_mib_noroutes);
2061                 break;
2062         }
2063         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2064         kfree_skb(skb);
2065         return 0;
2066 }
2067
2068 static int ip6_pkt_discard(struct sk_buff *skb)
2069 {
2070         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2071 }
2072
2073 static int ip6_pkt_discard_out(struct sk_buff *skb)
2074 {
2075         skb->dev = skb_dst(skb)->dev;
2076         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2077 }
2078
2079 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2080
2081 static int ip6_pkt_prohibit(struct sk_buff *skb)
2082 {
2083         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2084 }
2085
2086 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2087 {
2088         skb->dev = skb_dst(skb)->dev;
2089         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2090 }
2091
2092 #endif
2093
2094 /*
2095  *      Allocate a dst for local (unicast / anycast) address.
2096  */
2097
2098 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2099                                     const struct in6_addr *addr,
2100                                     bool anycast)
2101 {
2102         struct net *net = dev_net(idev->dev);
2103         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2104                                             net->loopback_dev, 0);
2105         int err;
2106
2107         if (!rt) {
2108                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2109                 return ERR_PTR(-ENOMEM);
2110         }
2111
2112         in6_dev_hold(idev);
2113
2114         rt->dst.flags |= DST_HOST;
2115         rt->dst.input = ip6_input;
2116         rt->dst.output = ip6_output;
2117         rt->rt6i_idev = idev;
2118         rt->dst.obsolete = -1;
2119
2120         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2121         if (anycast)
2122                 rt->rt6i_flags |= RTF_ANYCAST;
2123         else
2124                 rt->rt6i_flags |= RTF_LOCAL;
2125         err = rt6_bind_neighbour(rt, rt->dst.dev);
2126         if (err) {
2127                 dst_free(&rt->dst);
2128                 return ERR_PTR(err);
2129         }
2130
2131         rt->rt6i_dst.addr = *addr;
2132         rt->rt6i_dst.plen = 128;
2133         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2134
2135         atomic_set(&rt->dst.__refcnt, 1);
2136
2137         return rt;
2138 }
2139
2140 int ip6_route_get_saddr(struct net *net,
2141                         struct rt6_info *rt,
2142                         const struct in6_addr *daddr,
2143                         unsigned int prefs,
2144                         struct in6_addr *saddr)
2145 {
2146         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2147         int err = 0;
2148         if (rt->rt6i_prefsrc.plen)
2149                 *saddr = rt->rt6i_prefsrc.addr;
2150         else
2151                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2152                                          daddr, prefs, saddr);
2153         return err;
2154 }
2155
/* remove deleted ip from prefsrc entries */

/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace; its ip6_null_entry is never touched */
	struct in6_addr *addr;	/* the address being removed */
};
2162
2163 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2164 {
2165         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2166         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2167         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2168
2169         if (((void *)rt->dst.dev == dev || !dev) &&
2170             rt != net->ipv6.ip6_null_entry &&
2171             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2172                 /* remove prefsrc entry */
2173                 rt->rt6i_prefsrc.plen = 0;
2174         }
2175         return 0;
2176 }
2177
2178 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2179 {
2180         struct net *net = dev_net(ifp->idev->dev);
2181         struct arg_dev_net_ip adni = {
2182                 .dev = ifp->idev->dev,
2183                 .net = net,
2184                 .addr = &ifp->addr,
2185         };
2186         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2187 }
2188
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every device */
	struct net *net;	/* namespace; its ip6_null_entry is never selected */
};
2193
2194 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2195 {
2196         const struct arg_dev_net *adn = arg;
2197         const struct net_device *dev = adn->dev;
2198
2199         if ((rt->dst.dev == dev || !dev) &&
2200             rt != adn->net->ipv6.ip6_null_entry)
2201                 return -1;
2202
2203         return 0;
2204 }
2205
2206 void rt6_ifdown(struct net *net, struct net_device *dev)
2207 {
2208         struct arg_dev_net adn = {
2209                 .dev = dev,
2210                 .net = net,
2211         };
2212
2213         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2214         icmp6_clean_all(fib6_ifdown, &adn);
2215 }
2216
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU value */
};
2221
2222 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2223 {
2224         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2225         struct inet6_dev *idev;
2226
2227         /* In IPv6 pmtu discovery is not optional,
2228            so that RTAX_MTU lock cannot disable it.
2229            We still use this lock to block changes
2230            caused by addrconf/ndisc.
2231         */
2232
2233         idev = __in6_dev_get(arg->dev);
2234         if (!idev)
2235                 return 0;
2236
2237         /* For administrative MTU increase, there is no way to discover
2238            IPv6 PMTU increase, so PMTU increase should be updated here.
2239            Since RFC 1981 doesn't include administrative MTU increase
2240            update PMTU increase is a MUST. (i.e. jumbo frame)
2241          */
2242         /*
2243            If new MTU is less than route PMTU, this new MTU will be the
2244            lowest MTU in the path, update the route PMTU to reflect PMTU
2245            decreases; if new MTU is greater than route PMTU, and the
2246            old MTU is the lowest MTU in the path, update the route PMTU
2247            to reflect the increase. In this case if the other nodes' MTU
2248            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2249            PMTU discouvery.
2250          */
2251         if (rt->dst.dev == arg->dev &&
2252             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2253             (dst_mtu(&rt->dst) >= arg->mtu ||
2254              (dst_mtu(&rt->dst) < arg->mtu &&
2255               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2256                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2257         }
2258         return 0;
2259 }
2260
2261 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2262 {
2263         struct rt6_mtu_change_arg arg = {
2264                 .dev = dev,
2265                 .mtu = mtu,
2266         };
2267
2268         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2269 }
2270
2271 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2272         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2273         [RTA_OIF]               = { .type = NLA_U32 },
2274         [RTA_IIF]               = { .type = NLA_U32 },
2275         [RTA_PRIORITY]          = { .type = NLA_U32 },
2276         [RTA_METRICS]           = { .type = NLA_NESTED },
2277 };
2278
2279 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2280                               struct fib6_config *cfg)
2281 {
2282         struct rtmsg *rtm;
2283         struct nlattr *tb[RTA_MAX+1];
2284         int err;
2285
2286         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2287         if (err < 0)
2288                 goto errout;
2289
2290         err = -EINVAL;
2291         rtm = nlmsg_data(nlh);
2292         memset(cfg, 0, sizeof(*cfg));
2293
2294         cfg->fc_table = rtm->rtm_table;
2295         cfg->fc_dst_len = rtm->rtm_dst_len;
2296         cfg->fc_src_len = rtm->rtm_src_len;
2297         cfg->fc_flags = RTF_UP;
2298         cfg->fc_protocol = rtm->rtm_protocol;
2299
2300         if (rtm->rtm_type == RTN_UNREACHABLE)
2301                 cfg->fc_flags |= RTF_REJECT;
2302
2303         if (rtm->rtm_type == RTN_LOCAL)
2304                 cfg->fc_flags |= RTF_LOCAL;
2305
2306         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2307         cfg->fc_nlinfo.nlh = nlh;
2308         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2309
2310         if (tb[RTA_GATEWAY]) {
2311                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2312                 cfg->fc_flags |= RTF_GATEWAY;
2313         }
2314
2315         if (tb[RTA_DST]) {
2316                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2317
2318                 if (nla_len(tb[RTA_DST]) < plen)
2319                         goto errout;
2320
2321                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2322         }
2323
2324         if (tb[RTA_SRC]) {
2325                 int plen = (rtm->rtm_src_len + 7) >> 3;
2326
2327                 if (nla_len(tb[RTA_SRC]) < plen)
2328                         goto errout;
2329
2330                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2331         }
2332
2333         if (tb[RTA_PREFSRC])
2334                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2335
2336         if (tb[RTA_OIF])
2337                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2338
2339         if (tb[RTA_PRIORITY])
2340                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2341
2342         if (tb[RTA_METRICS]) {
2343                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2344                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2345         }
2346
2347         if (tb[RTA_TABLE])
2348                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2349
2350         err = 0;
2351 errout:
2352         return err;
2353 }
2354
2355 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2356 {
2357         struct fib6_config cfg;
2358         int err;
2359
2360         err = rtm_to_fib6_config(skb, nlh, &cfg);
2361         if (err < 0)
2362                 return err;
2363
2364         return ip6_route_del(&cfg);
2365 }
2366
2367 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2368 {
2369         struct fib6_config cfg;
2370         int err;
2371
2372         err = rtm_to_fib6_config(skb, nlh, &cfg);
2373         if (err < 0)
2374                 return err;
2375
2376         return ip6_route_add(&cfg);
2377 }
2378
2379 static inline size_t rt6_nlmsg_size(void)
2380 {
2381         return NLMSG_ALIGN(sizeof(struct rtmsg))
2382                + nla_total_size(16) /* RTA_SRC */
2383                + nla_total_size(16) /* RTA_DST */
2384                + nla_total_size(16) /* RTA_GATEWAY */
2385                + nla_total_size(16) /* RTA_PREFSRC */
2386                + nla_total_size(4) /* RTA_TABLE */
2387                + nla_total_size(4) /* RTA_IIF */
2388                + nla_total_size(4) /* RTA_OIF */
2389                + nla_total_size(4) /* RTA_PRIORITY */
2390                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2391                + nla_total_size(sizeof(struct rta_cacheinfo));
2392 }
2393
2394 static int rt6_fill_node(struct net *net,
2395                          struct sk_buff *skb, struct rt6_info *rt,
2396                          struct in6_addr *dst, struct in6_addr *src,
2397                          int iif, int type, u32 pid, u32 seq,
2398                          int prefix, int nowait, unsigned int flags)
2399 {
2400         const struct inet_peer *peer;
2401         struct rtmsg *rtm;
2402         struct nlmsghdr *nlh;
2403         long expires;
2404         u32 table;
2405         struct neighbour *n;
2406         u32 ts, tsage;
2407
2408         if (prefix) {   /* user wants prefix routes only */
2409                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2410                         /* success since this is not a prefix route */
2411                         return 1;
2412                 }
2413         }
2414
2415         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2416         if (!nlh)
2417                 return -EMSGSIZE;
2418
2419         rtm = nlmsg_data(nlh);
2420         rtm->rtm_family = AF_INET6;
2421         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2422         rtm->rtm_src_len = rt->rt6i_src.plen;
2423         rtm->rtm_tos = 0;
2424         if (rt->rt6i_table)
2425                 table = rt->rt6i_table->tb6_id;
2426         else
2427                 table = RT6_TABLE_UNSPEC;
2428         rtm->rtm_table = table;
2429         if (nla_put_u32(skb, RTA_TABLE, table))
2430                 goto nla_put_failure;
2431         if (rt->rt6i_flags & RTF_REJECT)
2432                 rtm->rtm_type = RTN_UNREACHABLE;
2433         else if (rt->rt6i_flags & RTF_LOCAL)
2434                 rtm->rtm_type = RTN_LOCAL;
2435         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2436                 rtm->rtm_type = RTN_LOCAL;
2437         else
2438                 rtm->rtm_type = RTN_UNICAST;
2439         rtm->rtm_flags = 0;
2440         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2441         rtm->rtm_protocol = rt->rt6i_protocol;
2442         if (rt->rt6i_flags & RTF_DYNAMIC)
2443                 rtm->rtm_protocol = RTPROT_REDIRECT;
2444         else if (rt->rt6i_flags & RTF_ADDRCONF)
2445                 rtm->rtm_protocol = RTPROT_KERNEL;
2446         else if (rt->rt6i_flags & RTF_DEFAULT)
2447                 rtm->rtm_protocol = RTPROT_RA;
2448
2449         if (rt->rt6i_flags & RTF_CACHE)
2450                 rtm->rtm_flags |= RTM_F_CLONED;
2451
2452         if (dst) {
2453                 if (nla_put(skb, RTA_DST, 16, dst))
2454                         goto nla_put_failure;
2455                 rtm->rtm_dst_len = 128;
2456         } else if (rtm->rtm_dst_len)
2457                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2458                         goto nla_put_failure;
2459 #ifdef CONFIG_IPV6_SUBTREES
2460         if (src) {
2461                 if (nla_put(skb, RTA_SRC, 16, src))
2462                         goto nla_put_failure;
2463                 rtm->rtm_src_len = 128;
2464         } else if (rtm->rtm_src_len &&
2465                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2466                 goto nla_put_failure;
2467 #endif
2468         if (iif) {
2469 #ifdef CONFIG_IPV6_MROUTE
2470                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2471                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2472                         if (err <= 0) {
2473                                 if (!nowait) {
2474                                         if (err == 0)
2475                                                 return 0;
2476                                         goto nla_put_failure;
2477                                 } else {
2478                                         if (err == -EMSGSIZE)
2479                                                 goto nla_put_failure;
2480                                 }
2481                         }
2482                 } else
2483 #endif
2484                         if (nla_put_u32(skb, RTA_IIF, iif))
2485                                 goto nla_put_failure;
2486         } else if (dst) {
2487                 struct in6_addr saddr_buf;
2488                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2489                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2490                         goto nla_put_failure;
2491         }
2492
2493         if (rt->rt6i_prefsrc.plen) {
2494                 struct in6_addr saddr_buf;
2495                 saddr_buf = rt->rt6i_prefsrc.addr;
2496                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2497                         goto nla_put_failure;
2498         }
2499
2500         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2501                 goto nla_put_failure;
2502
2503         rcu_read_lock();
2504         n = dst_get_neighbour_noref(&rt->dst);
2505         if (n) {
2506                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2507                         rcu_read_unlock();
2508                         goto nla_put_failure;
2509                 }
2510         }
2511         rcu_read_unlock();
2512
2513         if (rt->dst.dev &&
2514             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2515                 goto nla_put_failure;
2516         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2517                 goto nla_put_failure;
2518         if (!(rt->rt6i_flags & RTF_EXPIRES))
2519                 expires = 0;
2520         else if (rt->dst.expires - jiffies < INT_MAX)
2521                 expires = rt->dst.expires - jiffies;
2522         else
2523                 expires = INT_MAX;
2524
2525         peer = rt->rt6i_peer;
2526         ts = tsage = 0;
2527         if (peer && peer->tcp_ts_stamp) {
2528                 ts = peer->tcp_ts;
2529                 tsage = get_seconds() - peer->tcp_ts_stamp;
2530         }
2531
2532         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2533                                expires, rt->dst.error) < 0)
2534                 goto nla_put_failure;
2535
2536         return nlmsg_end(skb, nlh);
2537
2538 nla_put_failure:
2539         nlmsg_cancel(skb, nlh);
2540         return -EMSGSIZE;
2541 }
2542
2543 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2544 {
2545         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2546         int prefix;
2547
2548         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2549                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2550                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2551         } else
2552                 prefix = 0;
2553
2554         return rt6_fill_node(arg->net,
2555                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2556                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2557                      prefix, 0, NLM_F_MULTI);
2558 }
2559
2560 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2561 {
2562         struct net *net = sock_net(in_skb->sk);
2563         struct nlattr *tb[RTA_MAX+1];
2564         struct rt6_info *rt;
2565         struct sk_buff *skb;
2566         struct rtmsg *rtm;
2567         struct flowi6 fl6;
2568         int err, iif = 0, oif = 0;
2569
2570         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2571         if (err < 0)
2572                 goto errout;
2573
2574         err = -EINVAL;
2575         memset(&fl6, 0, sizeof(fl6));
2576
2577         if (tb[RTA_SRC]) {
2578                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2579                         goto errout;
2580
2581                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2582         }
2583
2584         if (tb[RTA_DST]) {
2585                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2586                         goto errout;
2587
2588                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2589         }
2590
2591         if (tb[RTA_IIF])
2592                 iif = nla_get_u32(tb[RTA_IIF]);
2593
2594         if (tb[RTA_OIF])
2595                 oif = nla_get_u32(tb[RTA_OIF]);
2596
2597         if (iif) {
2598                 struct net_device *dev;
2599                 int flags = 0;
2600
2601                 dev = __dev_get_by_index(net, iif);
2602                 if (!dev) {
2603                         err = -ENODEV;
2604                         goto errout;
2605                 }
2606
2607                 fl6.flowi6_iif = iif;
2608
2609                 if (!ipv6_addr_any(&fl6.saddr))
2610                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2611
2612                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2613                                                                flags);
2614         } else {
2615                 fl6.flowi6_oif = oif;
2616
2617                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2618         }
2619
2620         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2621         if (!skb) {
2622                 dst_release(&rt->dst);
2623                 err = -ENOBUFS;
2624                 goto errout;
2625         }
2626
2627         /* Reserve room for dummy headers, this skb can pass
2628            through good chunk of routing engine.
2629          */
2630         skb_reset_mac_header(skb);
2631         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2632
2633         skb_dst_set(skb, &rt->dst);
2634
2635         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2636                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2637                             nlh->nlmsg_seq, 0, 0, 0);
2638         if (err < 0) {
2639                 kfree_skb(skb);
2640                 goto errout;
2641         }
2642
2643         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2644 errout:
2645         return err;
2646 }
2647
2648 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2649 {
2650         struct sk_buff *skb;
2651         struct net *net = info->nl_net;
2652         u32 seq;
2653         int err;
2654
2655         err = -ENOBUFS;
2656         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2657
2658         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2659         if (!skb)
2660                 goto errout;
2661
2662         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2663                                 event, info->pid, seq, 0, 0, 0);
2664         if (err < 0) {
2665                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2666                 WARN_ON(err == -EMSGSIZE);
2667                 kfree_skb(skb);
2668                 goto errout;
2669         }
2670         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2671                     info->nlh, gfp_any());
2672         return;
2673 errout:
2674         if (err < 0)
2675                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2676 }
2677
2678 static int ip6_route_dev_notify(struct notifier_block *this,
2679                                 unsigned long event, void *data)
2680 {
2681         struct net_device *dev = (struct net_device *)data;
2682         struct net *net = dev_net(dev);
2683
2684         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2685                 net->ipv6.ip6_null_entry->dst.dev = dev;
2686                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2687 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2688                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2689                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2690                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2691                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2692 #endif
2693         }
2694
2695         return NOTIFY_OK;
2696 }
2697
2698 /*
2699  *      /proc
2700  */
2701
2702 #ifdef CONFIG_PROC_FS
2703
/*
 * Bookkeeping for a buffer-based /proc read.
 * NOTE(review): nothing in the nearby seq_file based /proc code refers
 * to this structure - confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2712
2713 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2714 {
2715         struct seq_file *m = p_arg;
2716         struct neighbour *n;
2717
2718         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2719
2720 #ifdef CONFIG_IPV6_SUBTREES
2721         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2722 #else
2723         seq_puts(m, "00000000000000000000000000000000 00 ");
2724 #endif
2725         rcu_read_lock();
2726         n = dst_get_neighbour_noref(&rt->dst);
2727         if (n) {
2728                 seq_printf(m, "%pi6", n->primary_key);
2729         } else {
2730                 seq_puts(m, "00000000000000000000000000000000");
2731         }
2732         rcu_read_unlock();
2733         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2734                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2735                    rt->dst.__use, rt->rt6i_flags,
2736                    rt->dst.dev ? rt->dst.dev->name : "");
2737         return 0;
2738 }
2739
2740 static int ipv6_route_show(struct seq_file *m, void *v)
2741 {
2742         struct net *net = (struct net *)m->private;
2743         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2744         return 0;
2745 }
2746
2747 static int ipv6_route_open(struct inode *inode, struct file *file)
2748 {
2749         return single_open_net(inode, file, ipv6_route_show);
2750 }
2751
2752 static const struct file_operations ipv6_route_proc_fops = {
2753         .owner          = THIS_MODULE,
2754         .open           = ipv6_route_open,
2755         .read           = seq_read,
2756         .llseek         = seq_lseek,
2757         .release        = single_release_net,
2758 };
2759
2760 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2761 {
2762         struct net *net = (struct net *)seq->private;
2763         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2764                    net->ipv6.rt6_stats->fib_nodes,
2765                    net->ipv6.rt6_stats->fib_route_nodes,
2766                    net->ipv6.rt6_stats->fib_rt_alloc,
2767                    net->ipv6.rt6_stats->fib_rt_entries,
2768                    net->ipv6.rt6_stats->fib_rt_cache,
2769                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2770                    net->ipv6.rt6_stats->fib_discarded_routes);
2771
2772         return 0;
2773 }
2774
2775 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2776 {
2777         return single_open_net(inode, file, rt6_stats_seq_show);
2778 }
2779
2780 static const struct file_operations rt6_stats_seq_fops = {
2781         .owner   = THIS_MODULE,
2782         .open    = rt6_stats_seq_open,
2783         .read    = seq_read,
2784         .llseek  = seq_lseek,
2785         .release = single_release_net,
2786 };
2787 #endif  /* CONFIG_PROC_FS */
2788
2789 #ifdef CONFIG_SYSCTL
2790
2791 static
2792 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2793                               void __user *buffer, size_t *lenp, loff_t *ppos)
2794 {
2795         struct net *net;
2796         int delay;
2797         if (!write)
2798                 return -EINVAL;
2799
2800         net = (struct net *)ctl->extra1;
2801         delay = net->ipv6.sysctl.flush_delay;
2802         proc_dointvec(ctl, write, buffer, lenp, ppos);
2803         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2804         return 0;
2805 }
2806
2807 ctl_table ipv6_route_table_template[] = {
2808         {
2809                 .procname       =       "flush",
2810                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2811                 .maxlen         =       sizeof(int),
2812                 .mode           =       0200,
2813                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2814         },
2815         {
2816                 .procname       =       "gc_thresh",
2817                 .data           =       &ip6_dst_ops_template.gc_thresh,
2818                 .maxlen         =       sizeof(int),
2819                 .mode           =       0644,
2820                 .proc_handler   =       proc_dointvec,
2821         },
2822         {
2823                 .procname       =       "max_size",
2824                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2825                 .maxlen         =       sizeof(int),
2826                 .mode           =       0644,
2827                 .proc_handler   =       proc_dointvec,
2828         },
2829         {
2830                 .procname       =       "gc_min_interval",
2831                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2832                 .maxlen         =       sizeof(int),
2833                 .mode           =       0644,
2834                 .proc_handler   =       proc_dointvec_jiffies,
2835         },
2836         {
2837                 .procname       =       "gc_timeout",
2838                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2839                 .maxlen         =       sizeof(int),
2840                 .mode           =       0644,
2841                 .proc_handler   =       proc_dointvec_jiffies,
2842         },
2843         {
2844                 .procname       =       "gc_interval",
2845                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2846                 .maxlen         =       sizeof(int),
2847                 .mode           =       0644,
2848                 .proc_handler   =       proc_dointvec_jiffies,
2849         },
2850         {
2851                 .procname       =       "gc_elasticity",
2852                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2853                 .maxlen         =       sizeof(int),
2854                 .mode           =       0644,
2855                 .proc_handler   =       proc_dointvec,
2856         },
2857         {
2858                 .procname       =       "mtu_expires",
2859                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2860                 .maxlen         =       sizeof(int),
2861                 .mode           =       0644,
2862                 .proc_handler   =       proc_dointvec_jiffies,
2863         },
2864         {
2865                 .procname       =       "min_adv_mss",
2866                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2867                 .maxlen         =       sizeof(int),
2868                 .mode           =       0644,
2869                 .proc_handler   =       proc_dointvec,
2870         },
2871         {
2872                 .procname       =       "gc_min_interval_ms",
2873                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2874                 .maxlen         =       sizeof(int),
2875                 .mode           =       0644,
2876                 .proc_handler   =       proc_dointvec_ms_jiffies,
2877         },
2878         { }
2879 };
2880
2881 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2882 {
2883         struct ctl_table *table;
2884
2885         table = kmemdup(ipv6_route_table_template,
2886                         sizeof(ipv6_route_table_template),
2887                         GFP_KERNEL);
2888
2889         if (table) {
2890                 table[0].data = &net->ipv6.sysctl.flush_delay;
2891                 table[0].extra1 = net;
2892                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2893                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2894                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2895                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2896                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2897                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2898                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2899                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2900                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2901         }
2902
2903         return table;
2904 }
2905 #endif
2906
2907 static int __net_init ip6_route_net_init(struct net *net)
2908 {
2909         int ret = -ENOMEM;
2910
2911         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2912                sizeof(net->ipv6.ip6_dst_ops));
2913
2914         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2915                 goto out_ip6_dst_ops;
2916
2917         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2918                                            sizeof(*net->ipv6.ip6_null_entry),
2919                                            GFP_KERNEL);
2920         if (!net->ipv6.ip6_null_entry)
2921                 goto out_ip6_dst_entries;
2922         net->ipv6.ip6_null_entry->dst.path =
2923                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2924         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2925         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2926                          ip6_template_metrics, true);
2927
2928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2929         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2930                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2931                                                GFP_KERNEL);
2932         if (!net->ipv6.ip6_prohibit_entry)
2933                 goto out_ip6_null_entry;
2934         net->ipv6.ip6_prohibit_entry->dst.path =
2935                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2936         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2937         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2938                          ip6_template_metrics, true);
2939
2940         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2941                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2942                                                GFP_KERNEL);
2943         if (!net->ipv6.ip6_blk_hole_entry)
2944                 goto out_ip6_prohibit_entry;
2945         net->ipv6.ip6_blk_hole_entry->dst.path =
2946                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2947         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2948         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2949                          ip6_template_metrics, true);
2950 #endif
2951
2952         net->ipv6.sysctl.flush_delay = 0;
2953         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2954         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2955         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2956         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2957         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2958         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2959         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2960
2961 #ifdef CONFIG_PROC_FS
2962         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2963         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2964 #endif
2965         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2966
2967         ret = 0;
2968 out:
2969         return ret;
2970
2971 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2972 out_ip6_prohibit_entry:
2973         kfree(net->ipv6.ip6_prohibit_entry);
2974 out_ip6_null_entry:
2975         kfree(net->ipv6.ip6_null_entry);
2976 #endif
2977 out_ip6_dst_entries:
2978         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2979 out_ip6_dst_ops:
2980         goto out;
2981 }
2982
2983 static void __net_exit ip6_route_net_exit(struct net *net)
2984 {
2985 #ifdef CONFIG_PROC_FS
2986         proc_net_remove(net, "ipv6_route");
2987         proc_net_remove(net, "rt6_stats");
2988 #endif
2989         kfree(net->ipv6.ip6_null_entry);
2990 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2991         kfree(net->ipv6.ip6_prohibit_entry);
2992         kfree(net->ipv6.ip6_blk_hole_entry);
2993 #endif
2994         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2995 }
2996
2997 static struct pernet_operations ip6_route_net_ops = {
2998         .init = ip6_route_net_init,
2999         .exit = ip6_route_net_exit,
3000 };
3001
3002 static struct notifier_block ip6_route_dev_notifier = {
3003         .notifier_call = ip6_route_dev_notify,
3004         .priority = 0,
3005 };
3006
3007 int __init ip6_route_init(void)
3008 {
3009         int ret;
3010
3011         ret = -ENOMEM;
3012         ip6_dst_ops_template.kmem_cachep =
3013                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3014                                   SLAB_HWCACHE_ALIGN, NULL);
3015         if (!ip6_dst_ops_template.kmem_cachep)
3016                 goto out;
3017
3018         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3019         if (ret)
3020                 goto out_kmem_cache;
3021
3022         ret = register_pernet_subsys(&ip6_route_net_ops);
3023         if (ret)
3024                 goto out_dst_entries;
3025
3026         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3027
3028         /* Registering of the loopback is done before this portion of code,
3029          * the loopback reference in rt6_info will not be taken, do it
3030          * manually for init_net */
3031         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3032         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3033   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3034         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3035         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3036         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3037         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3038   #endif
3039         ret = fib6_init();
3040         if (ret)
3041                 goto out_register_subsys;
3042
3043         ret = xfrm6_init();
3044         if (ret)
3045                 goto out_fib6_init;
3046
3047         ret = fib6_rules_init();
3048         if (ret)
3049                 goto xfrm6_init;
3050
3051         ret = -ENOBUFS;
3052         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3053             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3054             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3055                 goto fib6_rules_init;
3056
3057         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3058         if (ret)
3059                 goto fib6_rules_init;
3060
3061 out:
3062         return ret;
3063
3064 fib6_rules_init:
3065         fib6_rules_cleanup();
3066 xfrm6_init:
3067         xfrm6_fini();
3068 out_fib6_init:
3069         fib6_gc_cleanup();
3070 out_register_subsys:
3071         unregister_pernet_subsys(&ip6_route_net_ops);
3072 out_dst_entries:
3073         dst_entries_destroy(&ip6_dst_blackhole_ops);
3074 out_kmem_cache:
3075         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3076         goto out;
3077 }
3078
3079 void ip6_route_cleanup(void)
3080 {
3081         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3082         fib6_rules_cleanup();
3083         xfrm6_fini();
3084         fib6_gc_cleanup();
3085         unregister_pernet_subsys(&ip6_route_net_ops);
3086         dst_entries_destroy(&ip6_dst_blackhole_ops);
3087         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3088 }