net/ipv6/route.c: packets originating on device match lo
[linux-3.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         peer = rt6_get_peer_create(rt);
103         if (peer) {
104                 u32 *old_p = __DST_METRICS_PTR(old);
105                 unsigned long prev, new;
106
107                 p = peer->metrics;
108                 if (inet_metrics_new(peer))
109                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
110
111                 new = (unsigned long) p;
112                 prev = cmpxchg(&dst->_metrics, old, new);
113
114                 if (prev != old) {
115                         p = __DST_METRICS_PTR(prev);
116                         if (prev & DST_METRICS_READ_ONLY)
117                                 p = NULL;
118                 }
119         }
120         return p;
121 }
122
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
124 {
125         struct in6_addr *p = &rt->rt6i_gateway;
126
127         if (!ipv6_addr_any(p))
128                 return (const void *) p;
129         return daddr;
130 }
131
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
133 {
134         struct rt6_info *rt = (struct rt6_info *) dst;
135         struct neighbour *n;
136
137         daddr = choose_neigh_daddr(rt, daddr);
138         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
139         if (n)
140                 return n;
141         return neigh_create(&nd_tbl, daddr, dst->dev);
142 }
143
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
145 {
146         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
147         if (!n) {
148                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
149                 if (IS_ERR(n))
150                         return PTR_ERR(n);
151         }
152         dst_set_neighbour(&rt->dst, n);
153
154         return 0;
155 }
156
157 static struct dst_ops ip6_dst_ops_template = {
158         .family                 =       AF_INET6,
159         .protocol               =       cpu_to_be16(ETH_P_IPV6),
160         .gc                     =       ip6_dst_gc,
161         .gc_thresh              =       1024,
162         .check                  =       ip6_dst_check,
163         .default_advmss         =       ip6_default_advmss,
164         .mtu                    =       ip6_mtu,
165         .cow_metrics            =       ipv6_cow_metrics,
166         .destroy                =       ip6_dst_destroy,
167         .ifdown                 =       ip6_dst_ifdown,
168         .negative_advice        =       ip6_negative_advice,
169         .link_failure           =       ip6_link_failure,
170         .update_pmtu            =       ip6_rt_update_pmtu,
171         .local_out              =       __ip6_local_out,
172         .neigh_lookup           =       ip6_neigh_lookup,
173 };
174
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
176 {
177         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
178
179         return mtu ? : dst->dev->mtu;
180 }
181
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
183 {
184 }
185
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
187                                          unsigned long old)
188 {
189         return NULL;
190 }
191
192 static struct dst_ops ip6_dst_blackhole_ops = {
193         .family                 =       AF_INET6,
194         .protocol               =       cpu_to_be16(ETH_P_IPV6),
195         .destroy                =       ip6_dst_destroy,
196         .check                  =       ip6_dst_check,
197         .mtu                    =       ip6_blackhole_mtu,
198         .default_advmss         =       ip6_default_advmss,
199         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
200         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
201         .neigh_lookup           =       ip6_neigh_lookup,
202 };
203
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205         [RTAX_HOPLIMIT - 1] = 255,
206 };
207
208 static struct rt6_info ip6_null_entry_template = {
209         .dst = {
210                 .__refcnt       = ATOMIC_INIT(1),
211                 .__use          = 1,
212                 .obsolete       = -1,
213                 .error          = -ENETUNREACH,
214                 .input          = ip6_pkt_discard,
215                 .output         = ip6_pkt_discard_out,
216         },
217         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
218         .rt6i_protocol  = RTPROT_KERNEL,
219         .rt6i_metric    = ~(u32) 0,
220         .rt6i_ref       = ATOMIC_INIT(1),
221 };
222
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst
 * reports -EACCES and hands packets to ip6_pkt_prohibit{,_out}.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use = 1,
                .obsolete = -1,
                .error = -EACCES,
                .input = ip6_pkt_prohibit,
                .output = ip6_pkt_prohibit_out,
        },
        .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric = ~(u32) 0,
        .rt6i_ref = ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst
 * reports -EINVAL and passes packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use = 1,
                .obsolete = -1,
                .error = -EINVAL,
                .input = dst_discard,
                .output = dst_discard,
        },
        .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric = ~(u32) 0,
        .rt6i_ref = ATOMIC_INIT(1),
};

#endif /* CONFIG_IPV6_MULTIPLE_TABLES */
259
260 /* allocate dst with ip6_dst_ops */
261 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
262                                              struct net_device *dev,
263                                              int flags,
264                                              struct fib6_table *table)
265 {
266         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
267                                         0, 0, flags);
268
269         if (rt) {
270                 memset(&rt->rt6i_table, 0,
271                        sizeof(*rt) - sizeof(struct dst_entry));
272                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
273         }
274         return rt;
275 }
276
277 static void ip6_dst_destroy(struct dst_entry *dst)
278 {
279         struct rt6_info *rt = (struct rt6_info *)dst;
280         struct inet6_dev *idev = rt->rt6i_idev;
281
282         if (!(rt->dst.flags & DST_HOST))
283                 dst_destroy_metrics_generic(dst);
284
285         if (idev) {
286                 rt->rt6i_idev = NULL;
287                 in6_dev_put(idev);
288         }
289
290         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
291                 dst_release(dst->from);
292
293         if (rt6_has_peer(rt)) {
294                 struct inet_peer *peer = rt6_peer_ptr(rt);
295                 inet_putpeer(peer);
296         }
297 }
298
299 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
300
301 static u32 rt6_peer_genid(void)
302 {
303         return atomic_read(&__rt6_peer_genid);
304 }
305
306 void rt6_bind_peer(struct rt6_info *rt, int create)
307 {
308         struct inet_peer_base *base;
309         struct inet_peer *peer;
310
311         base = inetpeer_base_ptr(rt->_rt6i_peer);
312         if (!base)
313                 return;
314
315         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
316         if (peer) {
317                 if (!rt6_set_peer(rt, peer))
318                         inet_putpeer(peer);
319                 else
320                         rt->rt6i_peer_genid = rt6_peer_genid();
321         }
322 }
323
324 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
325                            int how)
326 {
327         struct rt6_info *rt = (struct rt6_info *)dst;
328         struct inet6_dev *idev = rt->rt6i_idev;
329         struct net_device *loopback_dev =
330                 dev_net(dev)->loopback_dev;
331
332         if (dev != loopback_dev && idev && idev->dev == dev) {
333                 struct inet6_dev *loopback_idev =
334                         in6_dev_get(loopback_dev);
335                 if (loopback_idev) {
336                         rt->rt6i_idev = loopback_idev;
337                         in6_dev_put(idev);
338                 }
339         }
340 }
341
342 static bool rt6_check_expired(const struct rt6_info *rt)
343 {
344         struct rt6_info *ort = NULL;
345
346         if (rt->rt6i_flags & RTF_EXPIRES) {
347                 if (time_after(jiffies, rt->dst.expires))
348                         return true;
349         } else if (rt->dst.from) {
350                 ort = (struct rt6_info *) rt->dst.from;
351                 return (ort->rt6i_flags & RTF_EXPIRES) &&
352                         time_after(jiffies, ort->dst.expires);
353         }
354         return false;
355 }
356
357 static bool rt6_need_strict(const struct in6_addr *daddr)
358 {
359         return ipv6_addr_type(daddr) &
360                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
361 }
362
363 /*
364  *      Route lookup. Any table->tb6_lock is implied.
365  */
366
367 static inline struct rt6_info *rt6_device_match(struct net *net,
368                                                     struct rt6_info *rt,
369                                                     const struct in6_addr *saddr,
370                                                     int oif,
371                                                     int flags)
372 {
373         struct rt6_info *local = NULL;
374         struct rt6_info *sprt;
375
376         if (!oif && ipv6_addr_any(saddr))
377                 goto out;
378
379         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
380                 struct net_device *dev = sprt->dst.dev;
381
382                 if (oif) {
383                         if (dev->ifindex == oif)
384                                 return sprt;
385                         if (dev->flags & IFF_LOOPBACK) {
386                                 if (!sprt->rt6i_idev ||
387                                     sprt->rt6i_idev->dev->ifindex != oif) {
388                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
389                                                 continue;
390                                         if (local && (!oif ||
391                                                       local->rt6i_idev->dev->ifindex == oif))
392                                                 continue;
393                                 }
394                                 local = sprt;
395                         }
396                 } else {
397                         if (ipv6_chk_addr(net, saddr, dev,
398                                           flags & RT6_LOOKUP_F_IFACE))
399                                 return sprt;
400                 }
401         }
402
403         if (oif) {
404                 if (local)
405                         return local;
406
407                 if (flags & RT6_LOOKUP_F_IFACE)
408                         return net->ipv6.ip6_null_entry;
409         }
410 out:
411         return rt;
412 }
413
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the neighbour bound to @rt is not in a NUD_VALID state and its
 * "updated" stamp is older than the interface's rtr_probe_interval,
 * refresh the stamp and send a neighbour solicitation for it to the
 * solicited-node multicast address.  The stamp check is what rate-limits
 * the probes (Router Reachability Probe MUST be rate-limited to no more
 * than one per minute).
 *
 * A NULL @rt, or a route without neighbour, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;

        read_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                /* became valid meanwhile, or probed too recently */
                read_unlock_bh(&neigh->lock);
                goto out;
        }

        neigh->updated = jiffies;
        read_unlock_bh(&neigh->lock);

        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
453
454 /*
455  * Default Router Selection (RFC 2461 6.3.6)
456  */
457 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
458 {
459         struct net_device *dev = rt->dst.dev;
460         if (!oif || dev->ifindex == oif)
461                 return 2;
462         if ((dev->flags & IFF_LOOPBACK) &&
463             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
464                 return 1;
465         return 0;
466 }
467
468 static inline int rt6_check_neigh(struct rt6_info *rt)
469 {
470         struct neighbour *neigh;
471         int m;
472
473         rcu_read_lock();
474         neigh = dst_get_neighbour_noref(&rt->dst);
475         if (rt->rt6i_flags & RTF_NONEXTHOP ||
476             !(rt->rt6i_flags & RTF_GATEWAY))
477                 m = 1;
478         else if (neigh) {
479                 read_lock_bh(&neigh->lock);
480                 if (neigh->nud_state & NUD_VALID)
481                         m = 2;
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483                 else if (neigh->nud_state & NUD_FAILED)
484                         m = 0;
485 #endif
486                 else
487                         m = 1;
488                 read_unlock_bh(&neigh->lock);
489         } else
490                 m = 0;
491         rcu_read_unlock();
492         return m;
493 }
494
495 static int rt6_score_route(struct rt6_info *rt, int oif,
496                            int strict)
497 {
498         int m, n;
499
500         m = rt6_check_dev(rt, oif);
501         if (!m && (strict & RT6_LOOKUP_F_IFACE))
502                 return -1;
503 #ifdef CONFIG_IPV6_ROUTER_PREF
504         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
505 #endif
506         n = rt6_check_neigh(rt);
507         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
508                 return -1;
509         return m;
510 }
511
512 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
513                                    int *mpri, struct rt6_info *match)
514 {
515         int m;
516
517         if (rt6_check_expired(rt))
518                 goto out;
519
520         m = rt6_score_route(rt, oif, strict);
521         if (m < 0)
522                 goto out;
523
524         if (m > *mpri) {
525                 if (strict & RT6_LOOKUP_F_REACHABLE)
526                         rt6_probe(match);
527                 *mpri = m;
528                 match = rt;
529         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
530                 rt6_probe(rt);
531         }
532
533 out:
534         return match;
535 }
536
537 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
538                                      struct rt6_info *rr_head,
539                                      u32 metric, int oif, int strict)
540 {
541         struct rt6_info *rt, *match;
542         int mpri = -1;
543
544         match = NULL;
545         for (rt = rr_head; rt && rt->rt6i_metric == metric;
546              rt = rt->dst.rt6_next)
547                 match = find_match(rt, oif, strict, &mpri, match);
548         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
549              rt = rt->dst.rt6_next)
550                 match = find_match(rt, oif, strict, &mpri, match);
551
552         return match;
553 }
554
555 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
556 {
557         struct rt6_info *match, *rt0;
558         struct net *net;
559
560         rt0 = fn->rr_ptr;
561         if (!rt0)
562                 fn->rr_ptr = rt0 = fn->leaf;
563
564         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
565
566         if (!match &&
567             (strict & RT6_LOOKUP_F_REACHABLE)) {
568                 struct rt6_info *next = rt0->dst.rt6_next;
569
570                 /* no entries matched; do round-robin */
571                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
572                         next = fn->leaf;
573
574                 if (next != rt0)
575                         fn->rr_ptr = next;
576         }
577
578         net = dev_net(rt0->dst.dev);
579         return match ? match : net->ipv6.ip6_null_entry;
580 }
581
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then adds, updates or deletes the matching
 * route-info route: a zero lifetime deletes an existing route, a
 * non-zero lifetime creates a missing one or refreshes the preference of
 * an existing one.  The expiry of the resulting route is set from the
 * lifetime (cleared when the lifetime is not finite).
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* shorter option: build the prefix in a local buffer */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        if (rt) {
                if (!lifetime) {
                        ip6_del_rt(rt);
                        rt = NULL;
                } else {
                        rt->rt6i_flags = RTF_ROUTEINFO |
                                         (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
                }
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime))
                rt6_set_expires(rt, jiffies + HZ * lifetime);
        else
                rt6_clean_expires(rt);

        dst_release(&rt->dst);
        return 0;
}
#endif
654
/*
 * Backtrack in the FIB tree when the lookup produced the null entry.
 *
 * Expects the expanding function to provide the variables "rt" and "fn"
 * and the labels "out" and "restart".  Starting from fn, climb towards
 * the root (descending into a parent's source subtree via fib6_lookup()
 * when that subtree is not where we came from) until a node flagged
 * RTN_RTINFO is found, then jump to "restart".  Jumps to "out" when the
 * top-level root is reached.
 */
#define BACKTRACK(__net, saddr) \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *__parent; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        __parent = fn->parent; \
                        if (FIB6_SUBTREE(__parent) && \
                            FIB6_SUBTREE(__parent) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(__parent), \
                                                 NULL, saddr); \
                        else \
                                fn = __parent; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
672
673 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
674                                              struct fib6_table *table,
675                                              struct flowi6 *fl6, int flags)
676 {
677         struct fib6_node *fn;
678         struct rt6_info *rt;
679
680         read_lock_bh(&table->tb6_lock);
681         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
682 restart:
683         rt = fn->leaf;
684         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
685         BACKTRACK(net, &fl6->saddr);
686 out:
687         dst_use(&rt->dst, jiffies);
688         read_unlock_bh(&table->tb6_lock);
689         return rt;
690
691 }
692
693 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
694                                     int flags)
695 {
696         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
697 }
698 EXPORT_SYMBOL_GPL(ip6_route_lookup);
699
700 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
701                             const struct in6_addr *saddr, int oif, int strict)
702 {
703         struct flowi6 fl6 = {
704                 .flowi6_oif = oif,
705                 .daddr = *daddr,
706         };
707         struct dst_entry *dst;
708         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
709
710         if (saddr) {
711                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
712                 flags |= RT6_LOOKUP_F_HAS_SADDR;
713         }
714
715         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
716         if (dst->error == 0)
717                 return (struct rt6_info *) dst;
718
719         dst_release(dst);
720
721         return NULL;
722 }
723
724 EXPORT_SYMBOL(rt6_lookup);
725
726 /* ip6_ins_rt is called with FREE table->tb6_lock.
727    It takes new route entry, the addition fails by any reason the
728    route is freed. In any case, if caller does not hold it, it may
729    be destroyed.
730  */
731
732 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
733 {
734         int err;
735         struct fib6_table *table;
736
737         table = rt->rt6i_table;
738         write_lock_bh(&table->tb6_lock);
739         err = fib6_add(&table->tb6_root, rt, info);
740         write_unlock_bh(&table->tb6_lock);
741
742         return err;
743 }
744
745 int ip6_ins_rt(struct rt6_info *rt)
746 {
747         struct nl_info info = {
748                 .nl_net = dev_net(rt->dst.dev),
749         };
750         return __ip6_ins_rt(rt, &info);
751 }
752
753 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
754                                       const struct in6_addr *daddr,
755                                       const struct in6_addr *saddr)
756 {
757         struct rt6_info *rt;
758
759         /*
760          *      Clone the route.
761          */
762
763         rt = ip6_rt_copy(ort, daddr);
764
765         if (rt) {
766                 int attempts = !in_softirq();
767
768                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
769                         if (ort->rt6i_dst.plen != 128 &&
770                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
771                                 rt->rt6i_flags |= RTF_ANYCAST;
772                         rt->rt6i_gateway = *daddr;
773                 }
774
775                 rt->rt6i_flags |= RTF_CACHE;
776
777 #ifdef CONFIG_IPV6_SUBTREES
778                 if (rt->rt6i_src.plen && saddr) {
779                         rt->rt6i_src.addr = *saddr;
780                         rt->rt6i_src.plen = 128;
781                 }
782 #endif
783
784         retry:
785                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
786                         struct net *net = dev_net(rt->dst.dev);
787                         int saved_rt_min_interval =
788                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
789                         int saved_rt_elasticity =
790                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
791
792                         if (attempts-- > 0) {
793                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
794                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
795
796                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
797
798                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
799                                         saved_rt_elasticity;
800                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
801                                         saved_rt_min_interval;
802                                 goto retry;
803                         }
804
805                         net_warn_ratelimited("Neighbour table overflow\n");
806                         dst_free(&rt->dst);
807                         return NULL;
808                 }
809         }
810
811         return rt;
812 }
813
814 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
815                                         const struct in6_addr *daddr)
816 {
817         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
818
819         if (rt) {
820                 rt->rt6i_flags |= RTF_CACHE;
821                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
822         }
823         return rt;
824 }
825
826 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
827                                       struct flowi6 *fl6, int flags)
828 {
829         struct fib6_node *fn;
830         struct rt6_info *rt, *nrt;
831         int strict = 0;
832         int attempts = 3;
833         int err;
834         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
835
836         strict |= flags & RT6_LOOKUP_F_IFACE;
837
838 relookup:
839         read_lock_bh(&table->tb6_lock);
840
841 restart_2:
842         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
843
844 restart:
845         rt = rt6_select(fn, oif, strict | reachable);
846
847         BACKTRACK(net, &fl6->saddr);
848         if (rt == net->ipv6.ip6_null_entry ||
849             rt->rt6i_flags & RTF_CACHE)
850                 goto out;
851
852         dst_hold(&rt->dst);
853         read_unlock_bh(&table->tb6_lock);
854
855         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
856                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
857         else if (!(rt->dst.flags & DST_HOST))
858                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
859         else
860                 goto out2;
861
862         dst_release(&rt->dst);
863         rt = nrt ? : net->ipv6.ip6_null_entry;
864
865         dst_hold(&rt->dst);
866         if (nrt) {
867                 err = ip6_ins_rt(nrt);
868                 if (!err)
869                         goto out2;
870         }
871
872         if (--attempts <= 0)
873                 goto out2;
874
875         /*
876          * Race condition! In the gap, when table->tb6_lock was
877          * released someone could insert this route.  Relookup.
878          */
879         dst_release(&rt->dst);
880         goto relookup;
881
882 out:
883         if (reachable) {
884                 reachable = 0;
885                 goto restart_2;
886         }
887         dst_hold(&rt->dst);
888         read_unlock_bh(&table->tb6_lock);
889 out2:
890         rt->dst.lastuse = jiffies;
891         rt->dst.__use++;
892
893         return rt;
894 }
895
896 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
897                                             struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
900 }
901
902 static struct dst_entry *ip6_route_input_lookup(struct net *net,
903                                                 struct net_device *dev,
904                                                 struct flowi6 *fl6, int flags)
905 {
906         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
907                 flags |= RT6_LOOKUP_F_IFACE;
908
909         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
910 }
911
912 void ip6_route_input(struct sk_buff *skb)
913 {
914         const struct ipv6hdr *iph = ipv6_hdr(skb);
915         struct net *net = dev_net(skb->dev);
916         int flags = RT6_LOOKUP_F_HAS_SADDR;
917         struct flowi6 fl6 = {
918                 .flowi6_iif = skb->dev->ifindex,
919                 .daddr = iph->daddr,
920                 .saddr = iph->saddr,
921                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
922                 .flowi6_mark = skb->mark,
923                 .flowi6_proto = iph->nexthdr,
924         };
925
926         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
927 }
928
929 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
930                                              struct flowi6 *fl6, int flags)
931 {
932         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
933 }
934
935 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
936                                     struct flowi6 *fl6)
937 {
938         int flags = 0;
939
940         fl6->flowi6_iif = net->loopback_dev->ifindex;
941
942         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
943                 flags |= RT6_LOOKUP_F_IFACE;
944
945         if (!ipv6_addr_any(&fl6->saddr))
946                 flags |= RT6_LOOKUP_F_HAS_SADDR;
947         else if (sk)
948                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
949
950         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
951 }
952
953 EXPORT_SYMBOL(ip6_route_output);
954
955 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
956 {
957         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
958         struct dst_entry *new = NULL;
959
960         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
961         if (rt) {
962                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
963                 rt6_init_peer(rt, net->ipv6.peers);
964
965                 new = &rt->dst;
966
967                 new->__use = 1;
968                 new->input = dst_discard;
969                 new->output = dst_discard;
970
971                 if (dst_metrics_read_only(&ort->dst))
972                         new->_metrics = ort->dst._metrics;
973                 else
974                         dst_copy_metrics(new, &ort->dst);
975                 rt->rt6i_idev = ort->rt6i_idev;
976                 if (rt->rt6i_idev)
977                         in6_dev_hold(rt->rt6i_idev);
978
979                 rt->rt6i_gateway = ort->rt6i_gateway;
980                 rt->rt6i_flags = ort->rt6i_flags;
981                 rt6_clean_expires(rt);
982                 rt->rt6i_metric = 0;
983
984                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
985 #ifdef CONFIG_IPV6_SUBTREES
986                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
987 #endif
988
989                 dst_free(new);
990         }
991
992         dst_release(dst_orig);
993         return new ? new : ERR_PTR(-ENOMEM);
994 }
995
996 /*
997  *      Destination cache support functions
998  */
999
1000 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1001 {
1002         struct rt6_info *rt;
1003
1004         rt = (struct rt6_info *) dst;
1005
1006         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1007                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1008                         if (!rt6_has_peer(rt))
1009                                 rt6_bind_peer(rt, 0);
1010                         rt->rt6i_peer_genid = rt6_peer_genid();
1011                 }
1012                 return dst;
1013         }
1014         return NULL;
1015 }
1016
1017 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1018 {
1019         struct rt6_info *rt = (struct rt6_info *) dst;
1020
1021         if (rt) {
1022                 if (rt->rt6i_flags & RTF_CACHE) {
1023                         if (rt6_check_expired(rt)) {
1024                                 ip6_del_rt(rt);
1025                                 dst = NULL;
1026                         }
1027                 } else {
1028                         dst_release(dst);
1029                         dst = NULL;
1030                 }
1031         }
1032         return dst;
1033 }
1034
1035 static void ip6_link_failure(struct sk_buff *skb)
1036 {
1037         struct rt6_info *rt;
1038
1039         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1040
1041         rt = (struct rt6_info *) skb_dst(skb);
1042         if (rt) {
1043                 if (rt->rt6i_flags & RTF_CACHE)
1044                         rt6_update_expires(rt, 0);
1045                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1046                         rt->rt6i_node->fn_sernum = -1;
1047         }
1048 }
1049
1050 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1051 {
1052         struct rt6_info *rt6 = (struct rt6_info*)dst;
1053
1054         dst_confirm(dst);
1055         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1056                 struct net *net = dev_net(dst->dev);
1057
1058                 rt6->rt6i_flags |= RTF_MODIFIED;
1059                 if (mtu < IPV6_MIN_MTU) {
1060                         u32 features = dst_metric(dst, RTAX_FEATURES);
1061                         mtu = IPV6_MIN_MTU;
1062                         features |= RTAX_FEATURE_ALLFRAG;
1063                         dst_metric_set(dst, RTAX_FEATURES, features);
1064                 }
1065                 dst_metric_set(dst, RTAX_MTU, mtu);
1066                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1067         }
1068 }
1069
1070 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1071                      int oif, u32 mark)
1072 {
1073         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1074         struct dst_entry *dst;
1075         struct flowi6 fl6;
1076
1077         memset(&fl6, 0, sizeof(fl6));
1078         fl6.flowi6_oif = oif;
1079         fl6.flowi6_mark = mark;
1080         fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1081         fl6.daddr = iph->daddr;
1082         fl6.saddr = iph->saddr;
1083         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1084
1085         dst = ip6_route_output(net, NULL, &fl6);
1086         if (!dst->error)
1087                 ip6_rt_update_pmtu(dst, ntohl(mtu));
1088         dst_release(dst);
1089 }
1090 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1091
1092 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1093 {
1094         ip6_update_pmtu(skb, sock_net(sk), mtu,
1095                         sk->sk_bound_dev_if, sk->sk_mark);
1096 }
1097 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1098
1099 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1100 {
1101         struct net_device *dev = dst->dev;
1102         unsigned int mtu = dst_mtu(dst);
1103         struct net *net = dev_net(dev);
1104
1105         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1106
1107         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1108                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1109
1110         /*
1111          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1112          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1113          * IPV6_MAXPLEN is also valid and means: "any MSS,
1114          * rely only on pmtu discovery"
1115          */
1116         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1117                 mtu = IPV6_MAXPLEN;
1118         return mtu;
1119 }
1120
1121 static unsigned int ip6_mtu(const struct dst_entry *dst)
1122 {
1123         struct inet6_dev *idev;
1124         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1125
1126         if (mtu)
1127                 return mtu;
1128
1129         mtu = IPV6_MIN_MTU;
1130
1131         rcu_read_lock();
1132         idev = __in6_dev_get(dst->dev);
1133         if (idev)
1134                 mtu = idev->cnf.mtu6;
1135         rcu_read_unlock();
1136
1137         return mtu;
1138 }
1139
1140 static struct dst_entry *icmp6_dst_gc_list;
1141 static DEFINE_SPINLOCK(icmp6_dst_lock);
1142
1143 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1144                                   struct neighbour *neigh,
1145                                   struct flowi6 *fl6)
1146 {
1147         struct dst_entry *dst;
1148         struct rt6_info *rt;
1149         struct inet6_dev *idev = in6_dev_get(dev);
1150         struct net *net = dev_net(dev);
1151
1152         if (unlikely(!idev))
1153                 return ERR_PTR(-ENODEV);
1154
1155         rt = ip6_dst_alloc(net, dev, 0, NULL);
1156         if (unlikely(!rt)) {
1157                 in6_dev_put(idev);
1158                 dst = ERR_PTR(-ENOMEM);
1159                 goto out;
1160         }
1161
1162         if (neigh)
1163                 neigh_hold(neigh);
1164         else {
1165                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1166                 if (IS_ERR(neigh)) {
1167                         in6_dev_put(idev);
1168                         dst_free(&rt->dst);
1169                         return ERR_CAST(neigh);
1170                 }
1171         }
1172
1173         rt->dst.flags |= DST_HOST;
1174         rt->dst.output  = ip6_output;
1175         dst_set_neighbour(&rt->dst, neigh);
1176         atomic_set(&rt->dst.__refcnt, 1);
1177         rt->rt6i_dst.addr = fl6->daddr;
1178         rt->rt6i_dst.plen = 128;
1179         rt->rt6i_idev     = idev;
1180         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1181
1182         spin_lock_bh(&icmp6_dst_lock);
1183         rt->dst.next = icmp6_dst_gc_list;
1184         icmp6_dst_gc_list = &rt->dst;
1185         spin_unlock_bh(&icmp6_dst_lock);
1186
1187         fib6_force_start_gc(net);
1188
1189         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1190
1191 out:
1192         return dst;
1193 }
1194
1195 int icmp6_dst_gc(void)
1196 {
1197         struct dst_entry *dst, **pprev;
1198         int more = 0;
1199
1200         spin_lock_bh(&icmp6_dst_lock);
1201         pprev = &icmp6_dst_gc_list;
1202
1203         while ((dst = *pprev) != NULL) {
1204                 if (!atomic_read(&dst->__refcnt)) {
1205                         *pprev = dst->next;
1206                         dst_free(dst);
1207                 } else {
1208                         pprev = &dst->next;
1209                         ++more;
1210                 }
1211         }
1212
1213         spin_unlock_bh(&icmp6_dst_lock);
1214
1215         return more;
1216 }
1217
1218 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1219                             void *arg)
1220 {
1221         struct dst_entry *dst, **pprev;
1222
1223         spin_lock_bh(&icmp6_dst_lock);
1224         pprev = &icmp6_dst_gc_list;
1225         while ((dst = *pprev) != NULL) {
1226                 struct rt6_info *rt = (struct rt6_info *) dst;
1227                 if (func(rt, arg)) {
1228                         *pprev = dst->next;
1229                         dst_free(dst);
1230                 } else {
1231                         pprev = &dst->next;
1232                 }
1233         }
1234         spin_unlock_bh(&icmp6_dst_lock);
1235 }
1236
1237 static int ip6_dst_gc(struct dst_ops *ops)
1238 {
1239         unsigned long now = jiffies;
1240         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1241         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1242         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1243         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1244         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1245         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1246         int entries;
1247
1248         entries = dst_entries_get_fast(ops);
1249         if (time_after(rt_last_gc + rt_min_interval, now) &&
1250             entries <= rt_max_size)
1251                 goto out;
1252
1253         net->ipv6.ip6_rt_gc_expire++;
1254         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1255         net->ipv6.ip6_rt_last_gc = now;
1256         entries = dst_entries_get_slow(ops);
1257         if (entries < ops->gc_thresh)
1258                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1259 out:
1260         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1261         return entries > rt_max_size;
1262 }
1263
1264 /* Clean host part of a prefix. Not necessary in radix tree,
1265    but results in cleaner routing tables.
1266
1267    Remove it only when all the things will work!
1268  */
1269
1270 int ip6_dst_hoplimit(struct dst_entry *dst)
1271 {
1272         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1273         if (hoplimit == 0) {
1274                 struct net_device *dev = dst->dev;
1275                 struct inet6_dev *idev;
1276
1277                 rcu_read_lock();
1278                 idev = __in6_dev_get(dev);
1279                 if (idev)
1280                         hoplimit = idev->cnf.hop_limit;
1281                 else
1282                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1283                 rcu_read_unlock();
1284         }
1285         return hoplimit;
1286 }
1287 EXPORT_SYMBOL(ip6_dst_hoplimit);
1288
1289 /*
1290  *
1291  */
1292
1293 int ip6_route_add(struct fib6_config *cfg)
1294 {
1295         int err;
1296         struct net *net = cfg->fc_nlinfo.nl_net;
1297         struct rt6_info *rt = NULL;
1298         struct net_device *dev = NULL;
1299         struct inet6_dev *idev = NULL;
1300         struct fib6_table *table;
1301         int addr_type;
1302
1303         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1304                 return -EINVAL;
1305 #ifndef CONFIG_IPV6_SUBTREES
1306         if (cfg->fc_src_len)
1307                 return -EINVAL;
1308 #endif
1309         if (cfg->fc_ifindex) {
1310                 err = -ENODEV;
1311                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1312                 if (!dev)
1313                         goto out;
1314                 idev = in6_dev_get(dev);
1315                 if (!idev)
1316                         goto out;
1317         }
1318
1319         if (cfg->fc_metric == 0)
1320                 cfg->fc_metric = IP6_RT_PRIO_USER;
1321
1322         err = -ENOBUFS;
1323         if (cfg->fc_nlinfo.nlh &&
1324             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1325                 table = fib6_get_table(net, cfg->fc_table);
1326                 if (!table) {
1327                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1328                         table = fib6_new_table(net, cfg->fc_table);
1329                 }
1330         } else {
1331                 table = fib6_new_table(net, cfg->fc_table);
1332         }
1333
1334         if (!table)
1335                 goto out;
1336
1337         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1338
1339         if (!rt) {
1340                 err = -ENOMEM;
1341                 goto out;
1342         }
1343
1344         rt->dst.obsolete = -1;
1345
1346         if (cfg->fc_flags & RTF_EXPIRES)
1347                 rt6_set_expires(rt, jiffies +
1348                                 clock_t_to_jiffies(cfg->fc_expires));
1349         else
1350                 rt6_clean_expires(rt);
1351
1352         if (cfg->fc_protocol == RTPROT_UNSPEC)
1353                 cfg->fc_protocol = RTPROT_BOOT;
1354         rt->rt6i_protocol = cfg->fc_protocol;
1355
1356         addr_type = ipv6_addr_type(&cfg->fc_dst);
1357
1358         if (addr_type & IPV6_ADDR_MULTICAST)
1359                 rt->dst.input = ip6_mc_input;
1360         else if (cfg->fc_flags & RTF_LOCAL)
1361                 rt->dst.input = ip6_input;
1362         else
1363                 rt->dst.input = ip6_forward;
1364
1365         rt->dst.output = ip6_output;
1366
1367         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1368         rt->rt6i_dst.plen = cfg->fc_dst_len;
1369         if (rt->rt6i_dst.plen == 128)
1370                rt->dst.flags |= DST_HOST;
1371
1372         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1373                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1374                 if (!metrics) {
1375                         err = -ENOMEM;
1376                         goto out;
1377                 }
1378                 dst_init_metrics(&rt->dst, metrics, 0);
1379         }
1380 #ifdef CONFIG_IPV6_SUBTREES
1381         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1382         rt->rt6i_src.plen = cfg->fc_src_len;
1383 #endif
1384
1385         rt->rt6i_metric = cfg->fc_metric;
1386
1387         /* We cannot add true routes via loopback here,
1388            they would result in kernel looping; promote them to reject routes
1389          */
1390         if ((cfg->fc_flags & RTF_REJECT) ||
1391             (dev && (dev->flags & IFF_LOOPBACK) &&
1392              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1393              !(cfg->fc_flags & RTF_LOCAL))) {
1394                 /* hold loopback dev/idev if we haven't done so. */
1395                 if (dev != net->loopback_dev) {
1396                         if (dev) {
1397                                 dev_put(dev);
1398                                 in6_dev_put(idev);
1399                         }
1400                         dev = net->loopback_dev;
1401                         dev_hold(dev);
1402                         idev = in6_dev_get(dev);
1403                         if (!idev) {
1404                                 err = -ENODEV;
1405                                 goto out;
1406                         }
1407                 }
1408                 rt->dst.output = ip6_pkt_discard_out;
1409                 rt->dst.input = ip6_pkt_discard;
1410                 rt->dst.error = -ENETUNREACH;
1411                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1412                 goto install_route;
1413         }
1414
1415         if (cfg->fc_flags & RTF_GATEWAY) {
1416                 const struct in6_addr *gw_addr;
1417                 int gwa_type;
1418
1419                 gw_addr = &cfg->fc_gateway;
1420                 rt->rt6i_gateway = *gw_addr;
1421                 gwa_type = ipv6_addr_type(gw_addr);
1422
1423                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1424                         struct rt6_info *grt;
1425
1426                         /* IPv6 strictly inhibits using not link-local
1427                            addresses as nexthop address.
1428                            Otherwise, router will not able to send redirects.
1429                            It is very good, but in some (rare!) circumstances
1430                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1431                            some exceptions. --ANK
1432                          */
1433                         err = -EINVAL;
1434                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1435                                 goto out;
1436
1437                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1438
1439                         err = -EHOSTUNREACH;
1440                         if (!grt)
1441                                 goto out;
1442                         if (dev) {
1443                                 if (dev != grt->dst.dev) {
1444                                         dst_release(&grt->dst);
1445                                         goto out;
1446                                 }
1447                         } else {
1448                                 dev = grt->dst.dev;
1449                                 idev = grt->rt6i_idev;
1450                                 dev_hold(dev);
1451                                 in6_dev_hold(grt->rt6i_idev);
1452                         }
1453                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1454                                 err = 0;
1455                         dst_release(&grt->dst);
1456
1457                         if (err)
1458                                 goto out;
1459                 }
1460                 err = -EINVAL;
1461                 if (!dev || (dev->flags & IFF_LOOPBACK))
1462                         goto out;
1463         }
1464
1465         err = -ENODEV;
1466         if (!dev)
1467                 goto out;
1468
1469         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1470                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1471                         err = -EINVAL;
1472                         goto out;
1473                 }
1474                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1475                 rt->rt6i_prefsrc.plen = 128;
1476         } else
1477                 rt->rt6i_prefsrc.plen = 0;
1478
1479         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1480                 err = rt6_bind_neighbour(rt, dev);
1481                 if (err)
1482                         goto out;
1483         }
1484
1485         rt->rt6i_flags = cfg->fc_flags;
1486
1487 install_route:
1488         if (cfg->fc_mx) {
1489                 struct nlattr *nla;
1490                 int remaining;
1491
1492                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1493                         int type = nla_type(nla);
1494
1495                         if (type) {
1496                                 if (type > RTAX_MAX) {
1497                                         err = -EINVAL;
1498                                         goto out;
1499                                 }
1500
1501                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1502                         }
1503                 }
1504         }
1505
1506         rt->dst.dev = dev;
1507         rt->rt6i_idev = idev;
1508         rt->rt6i_table = table;
1509
1510         cfg->fc_nlinfo.nl_net = dev_net(dev);
1511
1512         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1513
1514 out:
1515         if (dev)
1516                 dev_put(dev);
1517         if (idev)
1518                 in6_dev_put(idev);
1519         if (rt)
1520                 dst_free(&rt->dst);
1521         return err;
1522 }
1523
1524 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1525 {
1526         int err;
1527         struct fib6_table *table;
1528         struct net *net = dev_net(rt->dst.dev);
1529
1530         if (rt == net->ipv6.ip6_null_entry)
1531                 return -ENOENT;
1532
1533         table = rt->rt6i_table;
1534         write_lock_bh(&table->tb6_lock);
1535
1536         err = fib6_del(rt, info);
1537         dst_release(&rt->dst);
1538
1539         write_unlock_bh(&table->tb6_lock);
1540
1541         return err;
1542 }
1543
1544 int ip6_del_rt(struct rt6_info *rt)
1545 {
1546         struct nl_info info = {
1547                 .nl_net = dev_net(rt->dst.dev),
1548         };
1549         return __ip6_del_rt(rt, &info);
1550 }
1551
1552 static int ip6_route_del(struct fib6_config *cfg)
1553 {
1554         struct fib6_table *table;
1555         struct fib6_node *fn;
1556         struct rt6_info *rt;
1557         int err = -ESRCH;
1558
1559         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1560         if (!table)
1561                 return err;
1562
1563         read_lock_bh(&table->tb6_lock);
1564
1565         fn = fib6_locate(&table->tb6_root,
1566                          &cfg->fc_dst, cfg->fc_dst_len,
1567                          &cfg->fc_src, cfg->fc_src_len);
1568
1569         if (fn) {
1570                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1571                         if (cfg->fc_ifindex &&
1572                             (!rt->dst.dev ||
1573                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1574                                 continue;
1575                         if (cfg->fc_flags & RTF_GATEWAY &&
1576                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1577                                 continue;
1578                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1579                                 continue;
1580                         dst_hold(&rt->dst);
1581                         read_unlock_bh(&table->tb6_lock);
1582
1583                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1584                 }
1585         }
1586         read_unlock_bh(&table->tb6_lock);
1587
1588         return err;
1589 }
1590
1591 /*
1592  *      Handle redirects
1593  */
1594 struct ip6rd_flowi {
1595         struct flowi6 fl6;
1596         struct in6_addr gateway;
1597 };
1598
1599 static struct rt6_info *__ip6_route_redirect(struct net *net,
1600                                              struct fib6_table *table,
1601                                              struct flowi6 *fl6,
1602                                              int flags)
1603 {
1604         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1605         struct rt6_info *rt;
1606         struct fib6_node *fn;
1607
1608         /*
1609          * Get the "current" route for this destination and
1610          * check if the redirect has come from approriate router.
1611          *
1612          * RFC 2461 specifies that redirects should only be
1613          * accepted if they come from the nexthop to the target.
1614          * Due to the way the routes are chosen, this notion
1615          * is a bit fuzzy and one might need to check all possible
1616          * routes.
1617          */
1618
1619         read_lock_bh(&table->tb6_lock);
1620         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1621 restart:
1622         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1623                 /*
1624                  * Current route is on-link; redirect is always invalid.
1625                  *
1626                  * Seems, previous statement is not true. It could
1627                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1628                  * But then router serving it might decide, that we should
1629                  * know truth 8)8) --ANK (980726).
1630                  */
1631                 if (rt6_check_expired(rt))
1632                         continue;
1633                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1634                         continue;
1635                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1636                         continue;
1637                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1638                         continue;
1639                 break;
1640         }
1641
1642         if (!rt)
1643                 rt = net->ipv6.ip6_null_entry;
1644         BACKTRACK(net, &fl6->saddr);
1645 out:
1646         dst_hold(&rt->dst);
1647
1648         read_unlock_bh(&table->tb6_lock);
1649
1650         return rt;
1651 };
1652
1653 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1654                                            const struct in6_addr *src,
1655                                            const struct in6_addr *gateway,
1656                                            struct net_device *dev)
1657 {
1658         int flags = RT6_LOOKUP_F_HAS_SADDR;
1659         struct net *net = dev_net(dev);
1660         struct ip6rd_flowi rdfl = {
1661                 .fl6 = {
1662                         .flowi6_oif = dev->ifindex,
1663                         .daddr = *dest,
1664                         .saddr = *src,
1665                 },
1666         };
1667
1668         rdfl.gateway = *gateway;
1669
1670         if (rt6_need_strict(dest))
1671                 flags |= RT6_LOOKUP_F_IFACE;
1672
1673         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1674                                                    flags, __ip6_route_redirect);
1675 }
1676
1677 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1678                   const struct in6_addr *saddr,
1679                   struct neighbour *neigh, u8 *lladdr, int on_link)
1680 {
1681         struct rt6_info *rt, *nrt = NULL;
1682         struct netevent_redirect netevent;
1683         struct net *net = dev_net(neigh->dev);
1684
1685         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1686
1687         if (rt == net->ipv6.ip6_null_entry) {
1688                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1689                 goto out;
1690         }
1691
1692         /*
1693          *      We have finally decided to accept it.
1694          */
1695
1696         neigh_update(neigh, lladdr, NUD_STALE,
1697                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1698                      NEIGH_UPDATE_F_OVERRIDE|
1699                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1700                                      NEIGH_UPDATE_F_ISROUTER))
1701                      );
1702
1703         /*
1704          * Redirect received -> path was valid.
1705          * Look, redirects are sent only in response to data packets,
1706          * so that this nexthop apparently is reachable. --ANK
1707          */
1708         dst_confirm(&rt->dst);
1709
1710         /* Duplicate redirect: silently ignore. */
1711         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1712                 goto out;
1713
1714         nrt = ip6_rt_copy(rt, dest);
1715         if (!nrt)
1716                 goto out;
1717
1718         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1719         if (on_link)
1720                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1721
1722         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1723         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1724
1725         if (ip6_ins_rt(nrt))
1726                 goto out;
1727
1728         netevent.old = &rt->dst;
1729         netevent.new = &nrt->dst;
1730         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1731
1732         if (rt->rt6i_flags & RTF_CACHE) {
1733                 ip6_del_rt(rt);
1734                 return;
1735         }
1736
1737 out:
1738         dst_release(&rt->dst);
1739 }
1740
1741 /*
1742  *      Misc support functions
1743  */
1744
1745 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1746                                     const struct in6_addr *dest)
1747 {
1748         struct net *net = dev_net(ort->dst.dev);
1749         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1750                                             ort->rt6i_table);
1751
1752         if (rt) {
1753                 rt->dst.input = ort->dst.input;
1754                 rt->dst.output = ort->dst.output;
1755                 rt->dst.flags |= DST_HOST;
1756
1757                 rt->rt6i_dst.addr = *dest;
1758                 rt->rt6i_dst.plen = 128;
1759                 dst_copy_metrics(&rt->dst, &ort->dst);
1760                 rt->dst.error = ort->dst.error;
1761                 rt->rt6i_idev = ort->rt6i_idev;
1762                 if (rt->rt6i_idev)
1763                         in6_dev_hold(rt->rt6i_idev);
1764                 rt->dst.lastuse = jiffies;
1765
1766                 rt->rt6i_gateway = ort->rt6i_gateway;
1767                 rt->rt6i_flags = ort->rt6i_flags;
1768                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1769                     (RTF_DEFAULT | RTF_ADDRCONF))
1770                         rt6_set_from(rt, ort);
1771                 else
1772                         rt6_clean_expires(rt);
1773                 rt->rt6i_metric = 0;
1774
1775 #ifdef CONFIG_IPV6_SUBTREES
1776                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1777 #endif
1778                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1779                 rt->rt6i_table = ort->rt6i_table;
1780         }
1781         return rt;
1782 }
1783
1784 #ifdef CONFIG_IPV6_ROUTE_INFO
1785 static struct rt6_info *rt6_get_route_info(struct net *net,
1786                                            const struct in6_addr *prefix, int prefixlen,
1787                                            const struct in6_addr *gwaddr, int ifindex)
1788 {
1789         struct fib6_node *fn;
1790         struct rt6_info *rt = NULL;
1791         struct fib6_table *table;
1792
1793         table = fib6_get_table(net, RT6_TABLE_INFO);
1794         if (!table)
1795                 return NULL;
1796
1797         write_lock_bh(&table->tb6_lock);
1798         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1799         if (!fn)
1800                 goto out;
1801
1802         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1803                 if (rt->dst.dev->ifindex != ifindex)
1804                         continue;
1805                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1806                         continue;
1807                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1808                         continue;
1809                 dst_hold(&rt->dst);
1810                 break;
1811         }
1812 out:
1813         write_unlock_bh(&table->tb6_lock);
1814         return rt;
1815 }
1816
1817 static struct rt6_info *rt6_add_route_info(struct net *net,
1818                                            const struct in6_addr *prefix, int prefixlen,
1819                                            const struct in6_addr *gwaddr, int ifindex,
1820                                            unsigned int pref)
1821 {
1822         struct fib6_config cfg = {
1823                 .fc_table       = RT6_TABLE_INFO,
1824                 .fc_metric      = IP6_RT_PRIO_USER,
1825                 .fc_ifindex     = ifindex,
1826                 .fc_dst_len     = prefixlen,
1827                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1828                                   RTF_UP | RTF_PREF(pref),
1829                 .fc_nlinfo.pid = 0,
1830                 .fc_nlinfo.nlh = NULL,
1831                 .fc_nlinfo.nl_net = net,
1832         };
1833
1834         cfg.fc_dst = *prefix;
1835         cfg.fc_gateway = *gwaddr;
1836
1837         /* We should treat it as a default route if prefix length is 0. */
1838         if (!prefixlen)
1839                 cfg.fc_flags |= RTF_DEFAULT;
1840
1841         ip6_route_add(&cfg);
1842
1843         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1844 }
1845 #endif
1846
1847 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1848 {
1849         struct rt6_info *rt;
1850         struct fib6_table *table;
1851
1852         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1853         if (!table)
1854                 return NULL;
1855
1856         write_lock_bh(&table->tb6_lock);
1857         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1858                 if (dev == rt->dst.dev &&
1859                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1860                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1861                         break;
1862         }
1863         if (rt)
1864                 dst_hold(&rt->dst);
1865         write_unlock_bh(&table->tb6_lock);
1866         return rt;
1867 }
1868
1869 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1870                                      struct net_device *dev,
1871                                      unsigned int pref)
1872 {
1873         struct fib6_config cfg = {
1874                 .fc_table       = RT6_TABLE_DFLT,
1875                 .fc_metric      = IP6_RT_PRIO_USER,
1876                 .fc_ifindex     = dev->ifindex,
1877                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1878                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1879                 .fc_nlinfo.pid = 0,
1880                 .fc_nlinfo.nlh = NULL,
1881                 .fc_nlinfo.nl_net = dev_net(dev),
1882         };
1883
1884         cfg.fc_gateway = *gwaddr;
1885
1886         ip6_route_add(&cfg);
1887
1888         return rt6_get_dflt_router(gwaddr, dev);
1889 }
1890
1891 void rt6_purge_dflt_routers(struct net *net)
1892 {
1893         struct rt6_info *rt;
1894         struct fib6_table *table;
1895
1896         /* NOTE: Keep consistent with rt6_get_dflt_router */
1897         table = fib6_get_table(net, RT6_TABLE_DFLT);
1898         if (!table)
1899                 return;
1900
1901 restart:
1902         read_lock_bh(&table->tb6_lock);
1903         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1904                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1905                         dst_hold(&rt->dst);
1906                         read_unlock_bh(&table->tb6_lock);
1907                         ip6_del_rt(rt);
1908                         goto restart;
1909                 }
1910         }
1911         read_unlock_bh(&table->tb6_lock);
1912 }
1913
1914 static void rtmsg_to_fib6_config(struct net *net,
1915                                  struct in6_rtmsg *rtmsg,
1916                                  struct fib6_config *cfg)
1917 {
1918         memset(cfg, 0, sizeof(*cfg));
1919
1920         cfg->fc_table = RT6_TABLE_MAIN;
1921         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1922         cfg->fc_metric = rtmsg->rtmsg_metric;
1923         cfg->fc_expires = rtmsg->rtmsg_info;
1924         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1925         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1926         cfg->fc_flags = rtmsg->rtmsg_flags;
1927
1928         cfg->fc_nlinfo.nl_net = net;
1929
1930         cfg->fc_dst = rtmsg->rtmsg_dst;
1931         cfg->fc_src = rtmsg->rtmsg_src;
1932         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1933 }
1934
1935 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1936 {
1937         struct fib6_config cfg;
1938         struct in6_rtmsg rtmsg;
1939         int err;
1940
1941         switch(cmd) {
1942         case SIOCADDRT:         /* Add a route */
1943         case SIOCDELRT:         /* Delete a route */
1944                 if (!capable(CAP_NET_ADMIN))
1945                         return -EPERM;
1946                 err = copy_from_user(&rtmsg, arg,
1947                                      sizeof(struct in6_rtmsg));
1948                 if (err)
1949                         return -EFAULT;
1950
1951                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1952
1953                 rtnl_lock();
1954                 switch (cmd) {
1955                 case SIOCADDRT:
1956                         err = ip6_route_add(&cfg);
1957                         break;
1958                 case SIOCDELRT:
1959                         err = ip6_route_del(&cfg);
1960                         break;
1961                 default:
1962                         err = -EINVAL;
1963                 }
1964                 rtnl_unlock();
1965
1966                 return err;
1967         }
1968
1969         return -EINVAL;
1970 }
1971
1972 /*
1973  *      Drop the packet on the floor
1974  */
1975
1976 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1977 {
1978         int type;
1979         struct dst_entry *dst = skb_dst(skb);
1980         switch (ipstats_mib_noroutes) {
1981         case IPSTATS_MIB_INNOROUTES:
1982                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1983                 if (type == IPV6_ADDR_ANY) {
1984                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1985                                       IPSTATS_MIB_INADDRERRORS);
1986                         break;
1987                 }
1988                 /* FALLTHROUGH */
1989         case IPSTATS_MIB_OUTNOROUTES:
1990                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1991                               ipstats_mib_noroutes);
1992                 break;
1993         }
1994         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1995         kfree_skb(skb);
1996         return 0;
1997 }
1998
1999 static int ip6_pkt_discard(struct sk_buff *skb)
2000 {
2001         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2002 }
2003
2004 static int ip6_pkt_discard_out(struct sk_buff *skb)
2005 {
2006         skb->dev = skb_dst(skb)->dev;
2007         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2008 }
2009
2010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2011
2012 static int ip6_pkt_prohibit(struct sk_buff *skb)
2013 {
2014         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2015 }
2016
2017 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2018 {
2019         skb->dev = skb_dst(skb)->dev;
2020         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2021 }
2022
2023 #endif
2024
2025 /*
2026  *      Allocate a dst for local (unicast / anycast) address.
2027  */
2028
2029 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2030                                     const struct in6_addr *addr,
2031                                     bool anycast)
2032 {
2033         struct net *net = dev_net(idev->dev);
2034         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2035         int err;
2036
2037         if (!rt) {
2038                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2039                 return ERR_PTR(-ENOMEM);
2040         }
2041
2042         in6_dev_hold(idev);
2043
2044         rt->dst.flags |= DST_HOST;
2045         rt->dst.input = ip6_input;
2046         rt->dst.output = ip6_output;
2047         rt->rt6i_idev = idev;
2048         rt->dst.obsolete = -1;
2049
2050         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2051         if (anycast)
2052                 rt->rt6i_flags |= RTF_ANYCAST;
2053         else
2054                 rt->rt6i_flags |= RTF_LOCAL;
2055         err = rt6_bind_neighbour(rt, rt->dst.dev);
2056         if (err) {
2057                 dst_free(&rt->dst);
2058                 return ERR_PTR(err);
2059         }
2060
2061         rt->rt6i_dst.addr = *addr;
2062         rt->rt6i_dst.plen = 128;
2063         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2064
2065         atomic_set(&rt->dst.__refcnt, 1);
2066
2067         return rt;
2068 }
2069
2070 int ip6_route_get_saddr(struct net *net,
2071                         struct rt6_info *rt,
2072                         const struct in6_addr *daddr,
2073                         unsigned int prefs,
2074                         struct in6_addr *saddr)
2075 {
2076         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2077         int err = 0;
2078         if (rt->rt6i_prefsrc.plen)
2079                 *saddr = rt->rt6i_prefsrc.addr;
2080         else
2081                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2082                                          daddr, prefs, saddr);
2083         return err;
2084 }
2085
/*
 * Remove a deleted IP address from the prefsrc entries of routes.
 *
 * Argument bundle handed to fib6_remove_prefsrc() through the opaque
 * pointer of fib6_clean_all().
 */
struct arg_dev_net_ip {
        struct net_device *dev;         /* device to match; NULL matches any device */
        struct net *net;                /* namespace whose null entry is skipped */
        struct in6_addr *addr;          /* address compared against rt6i_prefsrc.addr */
};
2092
2093 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2094 {
2095         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2096         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2097         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2098
2099         if (((void *)rt->dst.dev == dev || !dev) &&
2100             rt != net->ipv6.ip6_null_entry &&
2101             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2102                 /* remove prefsrc entry */
2103                 rt->rt6i_prefsrc.plen = 0;
2104         }
2105         return 0;
2106 }
2107
2108 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2109 {
2110         struct net *net = dev_net(ifp->idev->dev);
2111         struct arg_dev_net_ip adni = {
2112                 .dev = ifp->idev->dev,
2113                 .net = net,
2114                 .addr = &ifp->addr,
2115         };
2116         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2117 }
2118
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device to match; NULL matches any device */
        struct net *net;                /* namespace whose null entry is skipped */
};
2123
2124 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2125 {
2126         const struct arg_dev_net *adn = arg;
2127         const struct net_device *dev = adn->dev;
2128
2129         if ((rt->dst.dev == dev || !dev) &&
2130             rt != adn->net->ipv6.ip6_null_entry)
2131                 return -1;
2132
2133         return 0;
2134 }
2135
2136 void rt6_ifdown(struct net *net, struct net_device *dev)
2137 {
2138         struct arg_dev_net adn = {
2139                 .dev = dev,
2140                 .net = net,
2141         };
2142
2143         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2144         icmp6_clean_all(fib6_ifdown, &adn);
2145 }
2146
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose routes are examined */
        unsigned int mtu;               /* MTU value written into matching routes */
};
2151
2152 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2153 {
2154         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2155         struct inet6_dev *idev;
2156
2157         /* In IPv6 pmtu discovery is not optional,
2158            so that RTAX_MTU lock cannot disable it.
2159            We still use this lock to block changes
2160            caused by addrconf/ndisc.
2161         */
2162
2163         idev = __in6_dev_get(arg->dev);
2164         if (!idev)
2165                 return 0;
2166
2167         /* For administrative MTU increase, there is no way to discover
2168            IPv6 PMTU increase, so PMTU increase should be updated here.
2169            Since RFC 1981 doesn't include administrative MTU increase
2170            update PMTU increase is a MUST. (i.e. jumbo frame)
2171          */
2172         /*
2173            If new MTU is less than route PMTU, this new MTU will be the
2174            lowest MTU in the path, update the route PMTU to reflect PMTU
2175            decreases; if new MTU is greater than route PMTU, and the
2176            old MTU is the lowest MTU in the path, update the route PMTU
2177            to reflect the increase. In this case if the other nodes' MTU
2178            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2179            PMTU discouvery.
2180          */
2181         if (rt->dst.dev == arg->dev &&
2182             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2183             (dst_mtu(&rt->dst) >= arg->mtu ||
2184              (dst_mtu(&rt->dst) < arg->mtu &&
2185               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2186                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2187         }
2188         return 0;
2189 }
2190
2191 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2192 {
2193         struct rt6_mtu_change_arg arg = {
2194                 .dev = dev,
2195                 .mtu = mtu,
2196         };
2197
2198         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2199 }
2200
2201 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2202         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2203         [RTA_OIF]               = { .type = NLA_U32 },
2204         [RTA_IIF]               = { .type = NLA_U32 },
2205         [RTA_PRIORITY]          = { .type = NLA_U32 },
2206         [RTA_METRICS]           = { .type = NLA_NESTED },
2207 };
2208
2209 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2210                               struct fib6_config *cfg)
2211 {
2212         struct rtmsg *rtm;
2213         struct nlattr *tb[RTA_MAX+1];
2214         int err;
2215
2216         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2217         if (err < 0)
2218                 goto errout;
2219
2220         err = -EINVAL;
2221         rtm = nlmsg_data(nlh);
2222         memset(cfg, 0, sizeof(*cfg));
2223
2224         cfg->fc_table = rtm->rtm_table;
2225         cfg->fc_dst_len = rtm->rtm_dst_len;
2226         cfg->fc_src_len = rtm->rtm_src_len;
2227         cfg->fc_flags = RTF_UP;
2228         cfg->fc_protocol = rtm->rtm_protocol;
2229
2230         if (rtm->rtm_type == RTN_UNREACHABLE)
2231                 cfg->fc_flags |= RTF_REJECT;
2232
2233         if (rtm->rtm_type == RTN_LOCAL)
2234                 cfg->fc_flags |= RTF_LOCAL;
2235
2236         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2237         cfg->fc_nlinfo.nlh = nlh;
2238         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2239
2240         if (tb[RTA_GATEWAY]) {
2241                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2242                 cfg->fc_flags |= RTF_GATEWAY;
2243         }
2244
2245         if (tb[RTA_DST]) {
2246                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2247
2248                 if (nla_len(tb[RTA_DST]) < plen)
2249                         goto errout;
2250
2251                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2252         }
2253
2254         if (tb[RTA_SRC]) {
2255                 int plen = (rtm->rtm_src_len + 7) >> 3;
2256
2257                 if (nla_len(tb[RTA_SRC]) < plen)
2258                         goto errout;
2259
2260                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2261         }
2262
2263         if (tb[RTA_PREFSRC])
2264                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2265
2266         if (tb[RTA_OIF])
2267                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2268
2269         if (tb[RTA_PRIORITY])
2270                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2271
2272         if (tb[RTA_METRICS]) {
2273                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2274                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2275         }
2276
2277         if (tb[RTA_TABLE])
2278                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2279
2280         err = 0;
2281 errout:
2282         return err;
2283 }
2284
2285 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2286 {
2287         struct fib6_config cfg;
2288         int err;
2289
2290         err = rtm_to_fib6_config(skb, nlh, &cfg);
2291         if (err < 0)
2292                 return err;
2293
2294         return ip6_route_del(&cfg);
2295 }
2296
2297 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2298 {
2299         struct fib6_config cfg;
2300         int err;
2301
2302         err = rtm_to_fib6_config(skb, nlh, &cfg);
2303         if (err < 0)
2304                 return err;
2305
2306         return ip6_route_add(&cfg);
2307 }
2308
2309 static inline size_t rt6_nlmsg_size(void)
2310 {
2311         return NLMSG_ALIGN(sizeof(struct rtmsg))
2312                + nla_total_size(16) /* RTA_SRC */
2313                + nla_total_size(16) /* RTA_DST */
2314                + nla_total_size(16) /* RTA_GATEWAY */
2315                + nla_total_size(16) /* RTA_PREFSRC */
2316                + nla_total_size(4) /* RTA_TABLE */
2317                + nla_total_size(4) /* RTA_IIF */
2318                + nla_total_size(4) /* RTA_OIF */
2319                + nla_total_size(4) /* RTA_PRIORITY */
2320                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2321                + nla_total_size(sizeof(struct rta_cacheinfo));
2322 }
2323
/*
 * Append one route message of netlink type @type describing @rt to @skb.
 *
 * @dst / @src: when non-NULL they are emitted as RTA_DST / RTA_SRC with a
 *              prefix length of 128 instead of the route's own keys
 *              (RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif:        when non-zero it is emitted as RTA_IIF, except for multicast
 *              destinations with CONFIG_IPV6_MROUTE, which are handed to
 *              ip6mr_get_route().
 * @pid, @seq, @flags: passed through to nlmsg_put().
 * @prefix:     when non-zero, only RTF_PREFIX_RT routes are emitted.
 * @nowait:     passed to ip6mr_get_route() and steers how its errors are
 *              handled here.
 *
 * Returns 1 when @prefix filtering skips the route, 0 when
 * ip6mr_get_route() returned 0 with @nowait unset, -EMSGSIZE when the
 * message could not be built (a partially built message is cancelled),
 * and otherwise the result of nlmsg_end().
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
{
        const struct inet_peer *peer;
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;
        struct neighbour *n;
        u32 ts, tsage;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Fixed rtmsg header. */
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id goes into the header and also into RTA_TABLE. */
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* Route type: reject, then local (by flag or loopback device), else unicast. */
        if (rt->rt6i_flags & RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* The stored protocol is overridden when one of these flags is set. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags & RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* An explicit destination is reported as a full /128. */
        if (dst) {
                if (nla_put(skb, RTA_DST, 16, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        /* Same treatment for the source key. */
        if (src) {
                if (nla_put(skb, RTA_SRC, 16, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                }
                        }
                } else
#endif
                        /*
                         * With CONFIG_IPV6_MROUTE this statement is the
                         * body of the "else" above.
                         */
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                /* No input interface: report the source address selected for @dst. */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
                        goto nla_put_failure;
        }

        /* A preferred source configured on the route is always reported. */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;

        /* RTA_GATEWAY is the key of the attached neighbour entry, if any. */
        rcu_read_lock();
        n = dst_get_neighbour_noref(&rt->dst);
        if (n) {
                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
                        rcu_read_unlock();
                        goto nla_put_failure;
                }
        }
        rcu_read_unlock();

        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;
        /* Remaining lifetime in jiffies, capped at INT_MAX; 0 without RTF_EXPIRES. */
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                expires = 0;
        else if (rt->dst.expires - jiffies < INT_MAX)
                expires = rt->dst.expires - jiffies;
        else
                expires = INT_MAX;

        /* TCP timestamp data comes from the inet_peer, when the route has one. */
        peer = NULL;
        if (rt6_has_peer(rt))
                peer = rt6_peer_ptr(rt);
        ts = tsage = 0;
        if (peer && peer->tcp_ts_stamp) {
                ts = peer->tcp_ts;
                tsage = get_seconds() - peer->tcp_ts_stamp;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
                               expires, rt->dst.error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Undo the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2474
2475 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2476 {
2477         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2478         int prefix;
2479
2480         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2481                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2482                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2483         } else
2484                 prefix = 0;
2485
2486         return rt6_fill_node(arg->net,
2487                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2488                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2489                      prefix, 0, NLM_F_MULTI);
2490 }
2491
2492 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2493 {
2494         struct net *net = sock_net(in_skb->sk);
2495         struct nlattr *tb[RTA_MAX+1];
2496         struct rt6_info *rt;
2497         struct sk_buff *skb;
2498         struct rtmsg *rtm;
2499         struct flowi6 fl6;
2500         int err, iif = 0, oif = 0;
2501
2502         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2503         if (err < 0)
2504                 goto errout;
2505
2506         err = -EINVAL;
2507         memset(&fl6, 0, sizeof(fl6));
2508
2509         if (tb[RTA_SRC]) {
2510                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2511                         goto errout;
2512
2513                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2514         }
2515
2516         if (tb[RTA_DST]) {
2517                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2518                         goto errout;
2519
2520                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2521         }
2522
2523         if (tb[RTA_IIF])
2524                 iif = nla_get_u32(tb[RTA_IIF]);
2525
2526         if (tb[RTA_OIF])
2527                 oif = nla_get_u32(tb[RTA_OIF]);
2528
2529         if (iif) {
2530                 struct net_device *dev;
2531                 int flags = 0;
2532
2533                 dev = __dev_get_by_index(net, iif);
2534                 if (!dev) {
2535                         err = -ENODEV;
2536                         goto errout;
2537                 }
2538
2539                 fl6.flowi6_iif = iif;
2540
2541                 if (!ipv6_addr_any(&fl6.saddr))
2542                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2543
2544                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2545                                                                flags);
2546         } else {
2547                 fl6.flowi6_oif = oif;
2548
2549                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2550         }
2551
2552         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2553         if (!skb) {
2554                 dst_release(&rt->dst);
2555                 err = -ENOBUFS;
2556                 goto errout;
2557         }
2558
2559         /* Reserve room for dummy headers, this skb can pass
2560            through good chunk of routing engine.
2561          */
2562         skb_reset_mac_header(skb);
2563         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2564
2565         skb_dst_set(skb, &rt->dst);
2566
2567         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2568                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2569                             nlh->nlmsg_seq, 0, 0, 0);
2570         if (err < 0) {
2571                 kfree_skb(skb);
2572                 goto errout;
2573         }
2574
2575         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2576 errout:
2577         return err;
2578 }
2579
2580 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2581 {
2582         struct sk_buff *skb;
2583         struct net *net = info->nl_net;
2584         u32 seq;
2585         int err;
2586
2587         err = -ENOBUFS;
2588         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2589
2590         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2591         if (!skb)
2592                 goto errout;
2593
2594         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2595                                 event, info->pid, seq, 0, 0, 0);
2596         if (err < 0) {
2597                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2598                 WARN_ON(err == -EMSGSIZE);
2599                 kfree_skb(skb);
2600                 goto errout;
2601         }
2602         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2603                     info->nlh, gfp_any());
2604         return;
2605 errout:
2606         if (err < 0)
2607                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2608 }
2609
2610 static int ip6_route_dev_notify(struct notifier_block *this,
2611                                 unsigned long event, void *data)
2612 {
2613         struct net_device *dev = (struct net_device *)data;
2614         struct net *net = dev_net(dev);
2615
2616         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2617                 net->ipv6.ip6_null_entry->dst.dev = dev;
2618                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2619 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2620                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2621                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2622                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2623                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2624 #endif
2625         }
2626
2627         return NOTIFY_OK;
2628 }
2629
2630 /*
2631  *      /proc
2632  */
2633
2634 #ifdef CONFIG_PROC_FS
2635
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file references this
 * structure any more -- confirm whether it can be dropped.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2644
2645 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2646 {
2647         struct seq_file *m = p_arg;
2648         struct neighbour *n;
2649
2650         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2651
2652 #ifdef CONFIG_IPV6_SUBTREES
2653         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2654 #else
2655         seq_puts(m, "00000000000000000000000000000000 00 ");
2656 #endif
2657         rcu_read_lock();
2658         n = dst_get_neighbour_noref(&rt->dst);
2659         if (n) {
2660                 seq_printf(m, "%pi6", n->primary_key);
2661         } else {
2662                 seq_puts(m, "00000000000000000000000000000000");
2663         }
2664         rcu_read_unlock();
2665         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2666                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2667                    rt->dst.__use, rt->rt6i_flags,
2668                    rt->dst.dev ? rt->dst.dev->name : "");
2669         return 0;
2670 }
2671
2672 static int ipv6_route_show(struct seq_file *m, void *v)
2673 {
2674         struct net *net = (struct net *)m->private;
2675         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2676         return 0;
2677 }
2678
2679 static int ipv6_route_open(struct inode *inode, struct file *file)
2680 {
2681         return single_open_net(inode, file, ipv6_route_show);
2682 }
2683
2684 static const struct file_operations ipv6_route_proc_fops = {
2685         .owner          = THIS_MODULE,
2686         .open           = ipv6_route_open,
2687         .read           = seq_read,
2688         .llseek         = seq_lseek,
2689         .release        = single_release_net,
2690 };
2691
2692 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2693 {
2694         struct net *net = (struct net *)seq->private;
2695         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2696                    net->ipv6.rt6_stats->fib_nodes,
2697                    net->ipv6.rt6_stats->fib_route_nodes,
2698                    net->ipv6.rt6_stats->fib_rt_alloc,
2699                    net->ipv6.rt6_stats->fib_rt_entries,
2700                    net->ipv6.rt6_stats->fib_rt_cache,
2701                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2702                    net->ipv6.rt6_stats->fib_discarded_routes);
2703
2704         return 0;
2705 }
2706
2707 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2708 {
2709         return single_open_net(inode, file, rt6_stats_seq_show);
2710 }
2711
2712 static const struct file_operations rt6_stats_seq_fops = {
2713         .owner   = THIS_MODULE,
2714         .open    = rt6_stats_seq_open,
2715         .read    = seq_read,
2716         .llseek  = seq_lseek,
2717         .release = single_release_net,
2718 };
2719 #endif  /* CONFIG_PROC_FS */
2720
2721 #ifdef CONFIG_SYSCTL
2722
2723 static
2724 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2725                               void __user *buffer, size_t *lenp, loff_t *ppos)
2726 {
2727         struct net *net;
2728         int delay;
2729         if (!write)
2730                 return -EINVAL;
2731
2732         net = (struct net *)ctl->extra1;
2733         delay = net->ipv6.sysctl.flush_delay;
2734         proc_dointvec(ctl, write, buffer, lenp, ppos);
2735         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2736         return 0;
2737 }
2738
2739 ctl_table ipv6_route_table_template[] = {
2740         {
2741                 .procname       =       "flush",
2742                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2743                 .maxlen         =       sizeof(int),
2744                 .mode           =       0200,
2745                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2746         },
2747         {
2748                 .procname       =       "gc_thresh",
2749                 .data           =       &ip6_dst_ops_template.gc_thresh,
2750                 .maxlen         =       sizeof(int),
2751                 .mode           =       0644,
2752                 .proc_handler   =       proc_dointvec,
2753         },
2754         {
2755                 .procname       =       "max_size",
2756                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2757                 .maxlen         =       sizeof(int),
2758                 .mode           =       0644,
2759                 .proc_handler   =       proc_dointvec,
2760         },
2761         {
2762                 .procname       =       "gc_min_interval",
2763                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2764                 .maxlen         =       sizeof(int),
2765                 .mode           =       0644,
2766                 .proc_handler   =       proc_dointvec_jiffies,
2767         },
2768         {
2769                 .procname       =       "gc_timeout",
2770                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2771                 .maxlen         =       sizeof(int),
2772                 .mode           =       0644,
2773                 .proc_handler   =       proc_dointvec_jiffies,
2774         },
2775         {
2776                 .procname       =       "gc_interval",
2777                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2778                 .maxlen         =       sizeof(int),
2779                 .mode           =       0644,
2780                 .proc_handler   =       proc_dointvec_jiffies,
2781         },
2782         {
2783                 .procname       =       "gc_elasticity",
2784                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2785                 .maxlen         =       sizeof(int),
2786                 .mode           =       0644,
2787                 .proc_handler   =       proc_dointvec,
2788         },
2789         {
2790                 .procname       =       "mtu_expires",
2791                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2792                 .maxlen         =       sizeof(int),
2793                 .mode           =       0644,
2794                 .proc_handler   =       proc_dointvec_jiffies,
2795         },
2796         {
2797                 .procname       =       "min_adv_mss",
2798                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2799                 .maxlen         =       sizeof(int),
2800                 .mode           =       0644,
2801                 .proc_handler   =       proc_dointvec,
2802         },
2803         {
2804                 .procname       =       "gc_min_interval_ms",
2805                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2806                 .maxlen         =       sizeof(int),
2807                 .mode           =       0644,
2808                 .proc_handler   =       proc_dointvec_ms_jiffies,
2809         },
2810         { }
2811 };
2812
2813 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2814 {
2815         struct ctl_table *table;
2816
2817         table = kmemdup(ipv6_route_table_template,
2818                         sizeof(ipv6_route_table_template),
2819                         GFP_KERNEL);
2820
2821         if (table) {
2822                 table[0].data = &net->ipv6.sysctl.flush_delay;
2823                 table[0].extra1 = net;
2824                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2825                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2826                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2827                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2828                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2829                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2830                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2831                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2832                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2833         }
2834
2835         return table;
2836 }
2837 #endif
2838
2839 static int __net_init ip6_route_net_init(struct net *net)
2840 {
2841         int ret = -ENOMEM;
2842
2843         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2844                sizeof(net->ipv6.ip6_dst_ops));
2845
2846         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2847                 goto out_ip6_dst_ops;
2848
2849         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2850                                            sizeof(*net->ipv6.ip6_null_entry),
2851                                            GFP_KERNEL);
2852         if (!net->ipv6.ip6_null_entry)
2853                 goto out_ip6_dst_entries;
2854         net->ipv6.ip6_null_entry->dst.path =
2855                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2856         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2857         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2858                          ip6_template_metrics, true);
2859
2860 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2861         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2862                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2863                                                GFP_KERNEL);
2864         if (!net->ipv6.ip6_prohibit_entry)
2865                 goto out_ip6_null_entry;
2866         net->ipv6.ip6_prohibit_entry->dst.path =
2867                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2868         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2869         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2870                          ip6_template_metrics, true);
2871
2872         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2873                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2874                                                GFP_KERNEL);
2875         if (!net->ipv6.ip6_blk_hole_entry)
2876                 goto out_ip6_prohibit_entry;
2877         net->ipv6.ip6_blk_hole_entry->dst.path =
2878                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2879         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2880         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2881                          ip6_template_metrics, true);
2882 #endif
2883
2884         net->ipv6.sysctl.flush_delay = 0;
2885         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2886         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2887         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2888         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2889         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2890         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2891         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2892
2893         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2894
2895         ret = 0;
2896 out:
2897         return ret;
2898
2899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2900 out_ip6_prohibit_entry:
2901         kfree(net->ipv6.ip6_prohibit_entry);
2902 out_ip6_null_entry:
2903         kfree(net->ipv6.ip6_null_entry);
2904 #endif
2905 out_ip6_dst_entries:
2906         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907 out_ip6_dst_ops:
2908         goto out;
2909 }
2910
2911 static void __net_exit ip6_route_net_exit(struct net *net)
2912 {
2913         kfree(net->ipv6.ip6_null_entry);
2914 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2915         kfree(net->ipv6.ip6_prohibit_entry);
2916         kfree(net->ipv6.ip6_blk_hole_entry);
2917 #endif
2918         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2919 }
2920
2921 static int __net_init ip6_route_net_init_late(struct net *net)
2922 {
2923 #ifdef CONFIG_PROC_FS
2924         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2925         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2926 #endif
2927         return 0;
2928 }
2929
2930 static void __net_exit ip6_route_net_exit_late(struct net *net)
2931 {
2932 #ifdef CONFIG_PROC_FS
2933         proc_net_remove(net, "ipv6_route");
2934         proc_net_remove(net, "rt6_stats");
2935 #endif
2936 }
2937
2938 static struct pernet_operations ip6_route_net_ops = {
2939         .init = ip6_route_net_init,
2940         .exit = ip6_route_net_exit,
2941 };
2942
2943 static int __net_init ipv6_inetpeer_init(struct net *net)
2944 {
2945         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2946
2947         if (!bp)
2948                 return -ENOMEM;
2949         inet_peer_base_init(bp);
2950         net->ipv6.peers = bp;
2951         return 0;
2952 }
2953
2954 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2955 {
2956         struct inet_peer_base *bp = net->ipv6.peers;
2957
2958         net->ipv6.peers = NULL;
2959         inetpeer_invalidate_tree(bp);
2960         kfree(bp);
2961 }
2962
2963 static struct pernet_operations ipv6_inetpeer_ops = {
2964         .init   =       ipv6_inetpeer_init,
2965         .exit   =       ipv6_inetpeer_exit,
2966 };
2967
2968 static struct pernet_operations ip6_route_net_late_ops = {
2969         .init = ip6_route_net_init_late,
2970         .exit = ip6_route_net_exit_late,
2971 };
2972
2973 static struct notifier_block ip6_route_dev_notifier = {
2974         .notifier_call = ip6_route_dev_notify,
2975         .priority = 0,
2976 };
2977
2978 int __init ip6_route_init(void)
2979 {
2980         int ret;
2981
2982         ret = -ENOMEM;
2983         ip6_dst_ops_template.kmem_cachep =
2984                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2985                                   SLAB_HWCACHE_ALIGN, NULL);
2986         if (!ip6_dst_ops_template.kmem_cachep)
2987                 goto out;
2988
2989         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2990         if (ret)
2991                 goto out_kmem_cache;
2992
2993         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
2994         if (ret)
2995                 goto out_dst_entries;
2996
2997         ret = register_pernet_subsys(&ip6_route_net_ops);
2998         if (ret)
2999                 goto out_register_inetpeer;
3000
3001         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3002
3003         /* Registering of the loopback is done before this portion of code,
3004          * the loopback reference in rt6_info will not be taken, do it
3005          * manually for init_net */
3006         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3007         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3008   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3009         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3010         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3011         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3012         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3013   #endif
3014         ret = fib6_init();
3015         if (ret)
3016                 goto out_register_subsys;
3017
3018         ret = xfrm6_init();
3019         if (ret)
3020                 goto out_fib6_init;
3021
3022         ret = fib6_rules_init();
3023         if (ret)
3024                 goto xfrm6_init;
3025
3026         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3027         if (ret)
3028                 goto fib6_rules_init;
3029
3030         ret = -ENOBUFS;
3031         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3032             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3033             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3034                 goto out_register_late_subsys;
3035
3036         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3037         if (ret)
3038                 goto out_register_late_subsys;
3039
3040 out:
3041         return ret;
3042
3043 out_register_late_subsys:
3044         unregister_pernet_subsys(&ip6_route_net_late_ops);
3045 fib6_rules_init:
3046         fib6_rules_cleanup();
3047 xfrm6_init:
3048         xfrm6_fini();
3049 out_fib6_init:
3050         fib6_gc_cleanup();
3051 out_register_subsys:
3052         unregister_pernet_subsys(&ip6_route_net_ops);
3053 out_register_inetpeer:
3054         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3055 out_dst_entries:
3056         dst_entries_destroy(&ip6_dst_blackhole_ops);
3057 out_kmem_cache:
3058         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3059         goto out;
3060 }
3061
3062 void ip6_route_cleanup(void)
3063 {
3064         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3065         unregister_pernet_subsys(&ip6_route_net_late_ops);
3066         fib6_rules_cleanup();
3067         xfrm6_fini();
3068         fib6_gc_cleanup();
3069         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3070         unregister_pernet_subsys(&ip6_route_net_ops);
3071         dst_entries_destroy(&ip6_dst_blackhole_ops);
3072         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3073 }