ipv6: Handle PMTU in ICMP error handlers.
[linux-3.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         peer = rt6_get_peer_create(rt);
103         if (peer) {
104                 u32 *old_p = __DST_METRICS_PTR(old);
105                 unsigned long prev, new;
106
107                 p = peer->metrics;
108                 if (inet_metrics_new(peer))
109                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
110
111                 new = (unsigned long) p;
112                 prev = cmpxchg(&dst->_metrics, old, new);
113
114                 if (prev != old) {
115                         p = __DST_METRICS_PTR(prev);
116                         if (prev & DST_METRICS_READ_ONLY)
117                                 p = NULL;
118                 }
119         }
120         return p;
121 }
122
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
124 {
125         struct in6_addr *p = &rt->rt6i_gateway;
126
127         if (!ipv6_addr_any(p))
128                 return (const void *) p;
129         return daddr;
130 }
131
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
133 {
134         struct rt6_info *rt = (struct rt6_info *) dst;
135         struct neighbour *n;
136
137         daddr = choose_neigh_daddr(rt, daddr);
138         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
139         if (n)
140                 return n;
141         return neigh_create(&nd_tbl, daddr, dst->dev);
142 }
143
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
145 {
146         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
147         if (!n) {
148                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
149                 if (IS_ERR(n))
150                         return PTR_ERR(n);
151         }
152         dst_set_neighbour(&rt->dst, n);
153
154         return 0;
155 }
156
157 static struct dst_ops ip6_dst_ops_template = {
158         .family                 =       AF_INET6,
159         .protocol               =       cpu_to_be16(ETH_P_IPV6),
160         .gc                     =       ip6_dst_gc,
161         .gc_thresh              =       1024,
162         .check                  =       ip6_dst_check,
163         .default_advmss         =       ip6_default_advmss,
164         .mtu                    =       ip6_mtu,
165         .cow_metrics            =       ipv6_cow_metrics,
166         .destroy                =       ip6_dst_destroy,
167         .ifdown                 =       ip6_dst_ifdown,
168         .negative_advice        =       ip6_negative_advice,
169         .link_failure           =       ip6_link_failure,
170         .update_pmtu            =       ip6_rt_update_pmtu,
171         .local_out              =       __ip6_local_out,
172         .neigh_lookup           =       ip6_neigh_lookup,
173 };
174
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
176 {
177         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
178
179         return mtu ? : dst->dev->mtu;
180 }
181
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
183 {
184 }
185
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
187                                          unsigned long old)
188 {
189         return NULL;
190 }
191
192 static struct dst_ops ip6_dst_blackhole_ops = {
193         .family                 =       AF_INET6,
194         .protocol               =       cpu_to_be16(ETH_P_IPV6),
195         .destroy                =       ip6_dst_destroy,
196         .check                  =       ip6_dst_check,
197         .mtu                    =       ip6_blackhole_mtu,
198         .default_advmss         =       ip6_default_advmss,
199         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
200         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
201         .neigh_lookup           =       ip6_neigh_lookup,
202 };
203
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205         [RTAX_HOPLIMIT - 1] = 255,
206 };
207
208 static struct rt6_info ip6_null_entry_template = {
209         .dst = {
210                 .__refcnt       = ATOMIC_INIT(1),
211                 .__use          = 1,
212                 .obsolete       = -1,
213                 .error          = -ENETUNREACH,
214                 .input          = ip6_pkt_discard,
215                 .output         = ip6_pkt_discard_out,
216         },
217         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
218         .rt6i_protocol  = RTPROT_KERNEL,
219         .rt6i_metric    = ~(u32) 0,
220         .rt6i_ref       = ATOMIC_INIT(1),
221 };
222
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject route whose error is -EACCES
 * and whose handlers are ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
};

/*
 * Template for the "blackhole" route: a reject route whose error is
 * -EINVAL and whose handlers are both dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
};

#endif
259
260 /* allocate dst with ip6_dst_ops */
261 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
262                                              struct net_device *dev,
263                                              int flags,
264                                              struct fib6_table *table)
265 {
266         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
267                                         0, 0, flags);
268
269         if (rt) {
270                 memset(&rt->rt6i_table, 0,
271                        sizeof(*rt) - sizeof(struct dst_entry));
272                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
273         }
274         return rt;
275 }
276
277 static void ip6_dst_destroy(struct dst_entry *dst)
278 {
279         struct rt6_info *rt = (struct rt6_info *)dst;
280         struct inet6_dev *idev = rt->rt6i_idev;
281
282         if (!(rt->dst.flags & DST_HOST))
283                 dst_destroy_metrics_generic(dst);
284
285         if (idev) {
286                 rt->rt6i_idev = NULL;
287                 in6_dev_put(idev);
288         }
289
290         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
291                 dst_release(dst->from);
292
293         if (rt6_has_peer(rt)) {
294                 struct inet_peer *peer = rt6_peer_ptr(rt);
295                 inet_putpeer(peer);
296         }
297 }
298
299 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
300
301 static u32 rt6_peer_genid(void)
302 {
303         return atomic_read(&__rt6_peer_genid);
304 }
305
306 void rt6_bind_peer(struct rt6_info *rt, int create)
307 {
308         struct inet_peer_base *base;
309         struct inet_peer *peer;
310
311         base = inetpeer_base_ptr(rt->_rt6i_peer);
312         if (!base)
313                 return;
314
315         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
316         if (peer) {
317                 if (!rt6_set_peer(rt, peer))
318                         inet_putpeer(peer);
319                 else
320                         rt->rt6i_peer_genid = rt6_peer_genid();
321         }
322 }
323
324 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
325                            int how)
326 {
327         struct rt6_info *rt = (struct rt6_info *)dst;
328         struct inet6_dev *idev = rt->rt6i_idev;
329         struct net_device *loopback_dev =
330                 dev_net(dev)->loopback_dev;
331
332         if (dev != loopback_dev && idev && idev->dev == dev) {
333                 struct inet6_dev *loopback_idev =
334                         in6_dev_get(loopback_dev);
335                 if (loopback_idev) {
336                         rt->rt6i_idev = loopback_idev;
337                         in6_dev_put(idev);
338                 }
339         }
340 }
341
342 static bool rt6_check_expired(const struct rt6_info *rt)
343 {
344         struct rt6_info *ort = NULL;
345
346         if (rt->rt6i_flags & RTF_EXPIRES) {
347                 if (time_after(jiffies, rt->dst.expires))
348                         return true;
349         } else if (rt->dst.from) {
350                 ort = (struct rt6_info *) rt->dst.from;
351                 return (ort->rt6i_flags & RTF_EXPIRES) &&
352                         time_after(jiffies, ort->dst.expires);
353         }
354         return false;
355 }
356
357 static bool rt6_need_strict(const struct in6_addr *daddr)
358 {
359         return ipv6_addr_type(daddr) &
360                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
361 }
362
363 /*
364  *      Route lookup. Any table->tb6_lock is implied.
365  */
366
367 static inline struct rt6_info *rt6_device_match(struct net *net,
368                                                     struct rt6_info *rt,
369                                                     const struct in6_addr *saddr,
370                                                     int oif,
371                                                     int flags)
372 {
373         struct rt6_info *local = NULL;
374         struct rt6_info *sprt;
375
376         if (!oif && ipv6_addr_any(saddr))
377                 goto out;
378
379         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
380                 struct net_device *dev = sprt->dst.dev;
381
382                 if (oif) {
383                         if (dev->ifindex == oif)
384                                 return sprt;
385                         if (dev->flags & IFF_LOOPBACK) {
386                                 if (!sprt->rt6i_idev ||
387                                     sprt->rt6i_idev->dev->ifindex != oif) {
388                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
389                                                 continue;
390                                         if (local && (!oif ||
391                                                       local->rt6i_idev->dev->ifindex == oif))
392                                                 continue;
393                                 }
394                                 local = sprt;
395                         }
396                 } else {
397                         if (ipv6_chk_addr(net, saddr, dev,
398                                           flags & RT6_LOOKUP_F_IFACE))
399                                 return sprt;
400                 }
401         }
402
403         if (oif) {
404                 if (local)
405                         return local;
406
407                 if (flags & RT6_LOOKUP_F_IFACE)
408                         return net->ipv6.ip6_null_entry;
409         }
410 out:
411         return rt;
412 }
413
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * @rt: route whose next hop should be probed; may be NULL, in which case
 *      nothing happens.
 *
 * If the neighbour bound to @rt is not in a NUD_VALID state and at least
 * the interface's rtr_probe_interval has passed since it was last
 * updated, send a Neighbour Solicitation to the neighbour's
 * solicited-node multicast address.  Probes MUST be rate-limited, which
 * is done by stamping neigh->updated before sending.
 *
 * The timestamp is checked and rewritten under the neighbour's *write*
 * lock.  Holding only the read lock would let several CPUs pass the
 * interval check at the same time and each send a probe, defeating the
 * rate limit, and would modify the entry without exclusive access.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	bool send_probe = false;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	/* Cheap unlocked pre-check; re-validated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = true;
	}
	write_unlock_bh(&neigh->lock);

	/* Send outside the lock, as the original code did. */
	if (send_probe) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is performed. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
453
454 /*
455  * Default Router Selection (RFC 2461 6.3.6)
456  */
457 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
458 {
459         struct net_device *dev = rt->dst.dev;
460         if (!oif || dev->ifindex == oif)
461                 return 2;
462         if ((dev->flags & IFF_LOOPBACK) &&
463             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
464                 return 1;
465         return 0;
466 }
467
468 static inline int rt6_check_neigh(struct rt6_info *rt)
469 {
470         struct neighbour *neigh;
471         int m;
472
473         rcu_read_lock();
474         neigh = dst_get_neighbour_noref(&rt->dst);
475         if (rt->rt6i_flags & RTF_NONEXTHOP ||
476             !(rt->rt6i_flags & RTF_GATEWAY))
477                 m = 1;
478         else if (neigh) {
479                 read_lock_bh(&neigh->lock);
480                 if (neigh->nud_state & NUD_VALID)
481                         m = 2;
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483                 else if (neigh->nud_state & NUD_FAILED)
484                         m = 0;
485 #endif
486                 else
487                         m = 1;
488                 read_unlock_bh(&neigh->lock);
489         } else
490                 m = 0;
491         rcu_read_unlock();
492         return m;
493 }
494
495 static int rt6_score_route(struct rt6_info *rt, int oif,
496                            int strict)
497 {
498         int m, n;
499
500         m = rt6_check_dev(rt, oif);
501         if (!m && (strict & RT6_LOOKUP_F_IFACE))
502                 return -1;
503 #ifdef CONFIG_IPV6_ROUTER_PREF
504         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
505 #endif
506         n = rt6_check_neigh(rt);
507         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
508                 return -1;
509         return m;
510 }
511
512 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
513                                    int *mpri, struct rt6_info *match)
514 {
515         int m;
516
517         if (rt6_check_expired(rt))
518                 goto out;
519
520         m = rt6_score_route(rt, oif, strict);
521         if (m < 0)
522                 goto out;
523
524         if (m > *mpri) {
525                 if (strict & RT6_LOOKUP_F_REACHABLE)
526                         rt6_probe(match);
527                 *mpri = m;
528                 match = rt;
529         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
530                 rt6_probe(rt);
531         }
532
533 out:
534         return match;
535 }
536
537 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
538                                      struct rt6_info *rr_head,
539                                      u32 metric, int oif, int strict)
540 {
541         struct rt6_info *rt, *match;
542         int mpri = -1;
543
544         match = NULL;
545         for (rt = rr_head; rt && rt->rt6i_metric == metric;
546              rt = rt->dst.rt6_next)
547                 match = find_match(rt, oif, strict, &mpri, match);
548         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
549              rt = rt->dst.rt6_next)
550                 match = find_match(rt, oif, strict, &mpri, match);
551
552         return match;
553 }
554
555 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
556 {
557         struct rt6_info *match, *rt0;
558         struct net *net;
559
560         rt0 = fn->rr_ptr;
561         if (!rt0)
562                 fn->rr_ptr = rt0 = fn->leaf;
563
564         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
565
566         if (!match &&
567             (strict & RT6_LOOKUP_F_REACHABLE)) {
568                 struct rt6_info *next = rt0->dst.rt6_next;
569
570                 /* no entries matched; do round-robin */
571                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
572                         next = fn->leaf;
573
574                 if (next != rt0)
575                         fn->rr_ptr = next;
576         }
577
578         net = dev_net(rt0->dst.dev);
579         return match ? match : net->ipv6.ip6_null_entry;
580 }
581
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then adds, updates or deletes the corresponding
 * route-info route and sets or clears its expiry from the advertised
 * lifetime.  Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* The option carries a full 128-bit prefix. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	dst_release(&rt->dst);
	return 0;
}
#endif
654
/*
 * BACKTRACK - climb the fib6 tree after a lookup yielded the null entry.
 *
 * Must be expanded in a function that has local variables `rt` and `fn`
 * and labels `out` and `restart`.  When `rt` is the namespace's null
 * entry, `fn` is moved towards the root (descending into a parent's
 * source subtree via fib6_lookup() where one exists and `fn` is not that
 * subtree itself).  The macro jumps to `restart` as soon as a node with
 * RTN_RTINFO is reached, or to `out` when `fn` is a tree root.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
672
673 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
674                                              struct fib6_table *table,
675                                              struct flowi6 *fl6, int flags)
676 {
677         struct fib6_node *fn;
678         struct rt6_info *rt;
679
680         read_lock_bh(&table->tb6_lock);
681         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
682 restart:
683         rt = fn->leaf;
684         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
685         BACKTRACK(net, &fl6->saddr);
686 out:
687         dst_use(&rt->dst, jiffies);
688         read_unlock_bh(&table->tb6_lock);
689         return rt;
690
691 }
692
693 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
694                                     int flags)
695 {
696         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
697 }
698 EXPORT_SYMBOL_GPL(ip6_route_lookup);
699
700 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
701                             const struct in6_addr *saddr, int oif, int strict)
702 {
703         struct flowi6 fl6 = {
704                 .flowi6_oif = oif,
705                 .daddr = *daddr,
706         };
707         struct dst_entry *dst;
708         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
709
710         if (saddr) {
711                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
712                 flags |= RT6_LOOKUP_F_HAS_SADDR;
713         }
714
715         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
716         if (dst->error == 0)
717                 return (struct rt6_info *) dst;
718
719         dst_release(dst);
720
721         return NULL;
722 }
723
724 EXPORT_SYMBOL(rt6_lookup);
725
726 /* ip6_ins_rt is called with FREE table->tb6_lock.
727    It takes new route entry, the addition fails by any reason the
728    route is freed. In any case, if caller does not hold it, it may
729    be destroyed.
730  */
731
732 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
733 {
734         int err;
735         struct fib6_table *table;
736
737         table = rt->rt6i_table;
738         write_lock_bh(&table->tb6_lock);
739         err = fib6_add(&table->tb6_root, rt, info);
740         write_unlock_bh(&table->tb6_lock);
741
742         return err;
743 }
744
745 int ip6_ins_rt(struct rt6_info *rt)
746 {
747         struct nl_info info = {
748                 .nl_net = dev_net(rt->dst.dev),
749         };
750         return __ip6_ins_rt(rt, &info);
751 }
752
753 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
754                                       const struct in6_addr *daddr,
755                                       const struct in6_addr *saddr)
756 {
757         struct rt6_info *rt;
758
759         /*
760          *      Clone the route.
761          */
762
763         rt = ip6_rt_copy(ort, daddr);
764
765         if (rt) {
766                 int attempts = !in_softirq();
767
768                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
769                         if (ort->rt6i_dst.plen != 128 &&
770                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
771                                 rt->rt6i_flags |= RTF_ANYCAST;
772                         rt->rt6i_gateway = *daddr;
773                 }
774
775                 rt->rt6i_flags |= RTF_CACHE;
776
777 #ifdef CONFIG_IPV6_SUBTREES
778                 if (rt->rt6i_src.plen && saddr) {
779                         rt->rt6i_src.addr = *saddr;
780                         rt->rt6i_src.plen = 128;
781                 }
782 #endif
783
784         retry:
785                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
786                         struct net *net = dev_net(rt->dst.dev);
787                         int saved_rt_min_interval =
788                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
789                         int saved_rt_elasticity =
790                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
791
792                         if (attempts-- > 0) {
793                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
794                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
795
796                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
797
798                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
799                                         saved_rt_elasticity;
800                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
801                                         saved_rt_min_interval;
802                                 goto retry;
803                         }
804
805                         net_warn_ratelimited("Neighbour table overflow\n");
806                         dst_free(&rt->dst);
807                         return NULL;
808                 }
809         }
810
811         return rt;
812 }
813
814 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
815                                         const struct in6_addr *daddr)
816 {
817         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
818
819         if (rt) {
820                 rt->rt6i_flags |= RTF_CACHE;
821                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
822         }
823         return rt;
824 }
825
826 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
827                                       struct flowi6 *fl6, int flags)
828 {
829         struct fib6_node *fn;
830         struct rt6_info *rt, *nrt;
831         int strict = 0;
832         int attempts = 3;
833         int err;
834         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
835
836         strict |= flags & RT6_LOOKUP_F_IFACE;
837
838 relookup:
839         read_lock_bh(&table->tb6_lock);
840
841 restart_2:
842         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
843
844 restart:
845         rt = rt6_select(fn, oif, strict | reachable);
846
847         BACKTRACK(net, &fl6->saddr);
848         if (rt == net->ipv6.ip6_null_entry ||
849             rt->rt6i_flags & RTF_CACHE)
850                 goto out;
851
852         dst_hold(&rt->dst);
853         read_unlock_bh(&table->tb6_lock);
854
855         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
856                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
857         else if (!(rt->dst.flags & DST_HOST))
858                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
859         else
860                 goto out2;
861
862         dst_release(&rt->dst);
863         rt = nrt ? : net->ipv6.ip6_null_entry;
864
865         dst_hold(&rt->dst);
866         if (nrt) {
867                 err = ip6_ins_rt(nrt);
868                 if (!err)
869                         goto out2;
870         }
871
872         if (--attempts <= 0)
873                 goto out2;
874
875         /*
876          * Race condition! In the gap, when table->tb6_lock was
877          * released someone could insert this route.  Relookup.
878          */
879         dst_release(&rt->dst);
880         goto relookup;
881
882 out:
883         if (reachable) {
884                 reachable = 0;
885                 goto restart_2;
886         }
887         dst_hold(&rt->dst);
888         read_unlock_bh(&table->tb6_lock);
889 out2:
890         rt->dst.lastuse = jiffies;
891         rt->dst.__use++;
892
893         return rt;
894 }
895
896 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
897                                             struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
900 }
901
902 static struct dst_entry *ip6_route_input_lookup(struct net *net,
903                                                 struct net_device *dev,
904                                                 struct flowi6 *fl6, int flags)
905 {
906         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
907                 flags |= RT6_LOOKUP_F_IFACE;
908
909         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
910 }
911
912 void ip6_route_input(struct sk_buff *skb)
913 {
914         const struct ipv6hdr *iph = ipv6_hdr(skb);
915         struct net *net = dev_net(skb->dev);
916         int flags = RT6_LOOKUP_F_HAS_SADDR;
917         struct flowi6 fl6 = {
918                 .flowi6_iif = skb->dev->ifindex,
919                 .daddr = iph->daddr,
920                 .saddr = iph->saddr,
921                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
922                 .flowi6_mark = skb->mark,
923                 .flowi6_proto = iph->nexthdr,
924         };
925
926         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
927 }
928
929 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
930                                              struct flowi6 *fl6, int flags)
931 {
932         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
933 }
934
935 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
936                                     struct flowi6 *fl6)
937 {
938         int flags = 0;
939
940         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
941                 flags |= RT6_LOOKUP_F_IFACE;
942
943         if (!ipv6_addr_any(&fl6->saddr))
944                 flags |= RT6_LOOKUP_F_HAS_SADDR;
945         else if (sk)
946                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
947
948         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
949 }
950
951 EXPORT_SYMBOL(ip6_route_output);
952
953 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
954 {
955         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
956         struct dst_entry *new = NULL;
957
958         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
959         if (rt) {
960                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
961                 rt6_init_peer(rt, net->ipv6.peers);
962
963                 new = &rt->dst;
964
965                 new->__use = 1;
966                 new->input = dst_discard;
967                 new->output = dst_discard;
968
969                 if (dst_metrics_read_only(&ort->dst))
970                         new->_metrics = ort->dst._metrics;
971                 else
972                         dst_copy_metrics(new, &ort->dst);
973                 rt->rt6i_idev = ort->rt6i_idev;
974                 if (rt->rt6i_idev)
975                         in6_dev_hold(rt->rt6i_idev);
976
977                 rt->rt6i_gateway = ort->rt6i_gateway;
978                 rt->rt6i_flags = ort->rt6i_flags;
979                 rt6_clean_expires(rt);
980                 rt->rt6i_metric = 0;
981
982                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
983 #ifdef CONFIG_IPV6_SUBTREES
984                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
985 #endif
986
987                 dst_free(new);
988         }
989
990         dst_release(dst_orig);
991         return new ? new : ERR_PTR(-ENOMEM);
992 }
993
994 /*
995  *      Destination cache support functions
996  */
997
998 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
999 {
1000         struct rt6_info *rt;
1001
1002         rt = (struct rt6_info *) dst;
1003
1004         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1005                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1006                         if (!rt6_has_peer(rt))
1007                                 rt6_bind_peer(rt, 0);
1008                         rt->rt6i_peer_genid = rt6_peer_genid();
1009                 }
1010                 return dst;
1011         }
1012         return NULL;
1013 }
1014
1015 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1016 {
1017         struct rt6_info *rt = (struct rt6_info *) dst;
1018
1019         if (rt) {
1020                 if (rt->rt6i_flags & RTF_CACHE) {
1021                         if (rt6_check_expired(rt)) {
1022                                 ip6_del_rt(rt);
1023                                 dst = NULL;
1024                         }
1025                 } else {
1026                         dst_release(dst);
1027                         dst = NULL;
1028                 }
1029         }
1030         return dst;
1031 }
1032
1033 static void ip6_link_failure(struct sk_buff *skb)
1034 {
1035         struct rt6_info *rt;
1036
1037         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1038
1039         rt = (struct rt6_info *) skb_dst(skb);
1040         if (rt) {
1041                 if (rt->rt6i_flags & RTF_CACHE)
1042                         rt6_update_expires(rt, 0);
1043                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1044                         rt->rt6i_node->fn_sernum = -1;
1045         }
1046 }
1047
1048 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1049 {
1050         struct rt6_info *rt6 = (struct rt6_info*)dst;
1051
1052         dst_confirm(dst);
1053         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1054                 struct net *net = dev_net(dst->dev);
1055
1056                 rt6->rt6i_flags |= RTF_MODIFIED;
1057                 if (mtu < IPV6_MIN_MTU) {
1058                         u32 features = dst_metric(dst, RTAX_FEATURES);
1059                         mtu = IPV6_MIN_MTU;
1060                         features |= RTAX_FEATURE_ALLFRAG;
1061                         dst_metric_set(dst, RTAX_FEATURES, features);
1062                 }
1063                 dst_metric_set(dst, RTAX_MTU, mtu);
1064                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1065         }
1066 }
1067
1068 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1069                      int oif, __be32 mark)
1070 {
1071         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1072         struct dst_entry *dst;
1073         struct flowi6 fl6;
1074
1075         memset(&fl6, 0, sizeof(fl6));
1076         fl6.flowi6_oif = oif;
1077         fl6.flowi6_mark = mark;
1078         fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1079         fl6.daddr = iph->daddr;
1080         fl6.saddr = iph->saddr;
1081         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1082
1083         dst = ip6_route_output(net, NULL, &fl6);
1084         if (!dst->error)
1085                 ip6_rt_update_pmtu(dst, ntohl(mtu));
1086         dst_release(dst);
1087 }
1088 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1089
1090 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1091 {
1092         ip6_update_pmtu(skb, sock_net(sk), mtu,
1093                         sk->sk_bound_dev_if, sk->sk_mark);
1094 }
1095 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1096
1097 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1098 {
1099         struct net_device *dev = dst->dev;
1100         unsigned int mtu = dst_mtu(dst);
1101         struct net *net = dev_net(dev);
1102
1103         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1104
1105         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1106                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1107
1108         /*
1109          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1110          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1111          * IPV6_MAXPLEN is also valid and means: "any MSS,
1112          * rely only on pmtu discovery"
1113          */
1114         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1115                 mtu = IPV6_MAXPLEN;
1116         return mtu;
1117 }
1118
1119 static unsigned int ip6_mtu(const struct dst_entry *dst)
1120 {
1121         struct inet6_dev *idev;
1122         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1123
1124         if (mtu)
1125                 return mtu;
1126
1127         mtu = IPV6_MIN_MTU;
1128
1129         rcu_read_lock();
1130         idev = __in6_dev_get(dst->dev);
1131         if (idev)
1132                 mtu = idev->cnf.mtu6;
1133         rcu_read_unlock();
1134
1135         return mtu;
1136 }
1137
1138 static struct dst_entry *icmp6_dst_gc_list;
1139 static DEFINE_SPINLOCK(icmp6_dst_lock);
1140
1141 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1142                                   struct neighbour *neigh,
1143                                   struct flowi6 *fl6)
1144 {
1145         struct dst_entry *dst;
1146         struct rt6_info *rt;
1147         struct inet6_dev *idev = in6_dev_get(dev);
1148         struct net *net = dev_net(dev);
1149
1150         if (unlikely(!idev))
1151                 return ERR_PTR(-ENODEV);
1152
1153         rt = ip6_dst_alloc(net, dev, 0, NULL);
1154         if (unlikely(!rt)) {
1155                 in6_dev_put(idev);
1156                 dst = ERR_PTR(-ENOMEM);
1157                 goto out;
1158         }
1159
1160         if (neigh)
1161                 neigh_hold(neigh);
1162         else {
1163                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1164                 if (IS_ERR(neigh)) {
1165                         in6_dev_put(idev);
1166                         dst_free(&rt->dst);
1167                         return ERR_CAST(neigh);
1168                 }
1169         }
1170
1171         rt->dst.flags |= DST_HOST;
1172         rt->dst.output  = ip6_output;
1173         dst_set_neighbour(&rt->dst, neigh);
1174         atomic_set(&rt->dst.__refcnt, 1);
1175         rt->rt6i_dst.addr = fl6->daddr;
1176         rt->rt6i_dst.plen = 128;
1177         rt->rt6i_idev     = idev;
1178         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1179
1180         spin_lock_bh(&icmp6_dst_lock);
1181         rt->dst.next = icmp6_dst_gc_list;
1182         icmp6_dst_gc_list = &rt->dst;
1183         spin_unlock_bh(&icmp6_dst_lock);
1184
1185         fib6_force_start_gc(net);
1186
1187         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1188
1189 out:
1190         return dst;
1191 }
1192
1193 int icmp6_dst_gc(void)
1194 {
1195         struct dst_entry *dst, **pprev;
1196         int more = 0;
1197
1198         spin_lock_bh(&icmp6_dst_lock);
1199         pprev = &icmp6_dst_gc_list;
1200
1201         while ((dst = *pprev) != NULL) {
1202                 if (!atomic_read(&dst->__refcnt)) {
1203                         *pprev = dst->next;
1204                         dst_free(dst);
1205                 } else {
1206                         pprev = &dst->next;
1207                         ++more;
1208                 }
1209         }
1210
1211         spin_unlock_bh(&icmp6_dst_lock);
1212
1213         return more;
1214 }
1215
1216 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1217                             void *arg)
1218 {
1219         struct dst_entry *dst, **pprev;
1220
1221         spin_lock_bh(&icmp6_dst_lock);
1222         pprev = &icmp6_dst_gc_list;
1223         while ((dst = *pprev) != NULL) {
1224                 struct rt6_info *rt = (struct rt6_info *) dst;
1225                 if (func(rt, arg)) {
1226                         *pprev = dst->next;
1227                         dst_free(dst);
1228                 } else {
1229                         pprev = &dst->next;
1230                 }
1231         }
1232         spin_unlock_bh(&icmp6_dst_lock);
1233 }
1234
1235 static int ip6_dst_gc(struct dst_ops *ops)
1236 {
1237         unsigned long now = jiffies;
1238         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1239         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1240         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1241         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1242         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1243         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1244         int entries;
1245
1246         entries = dst_entries_get_fast(ops);
1247         if (time_after(rt_last_gc + rt_min_interval, now) &&
1248             entries <= rt_max_size)
1249                 goto out;
1250
1251         net->ipv6.ip6_rt_gc_expire++;
1252         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1253         net->ipv6.ip6_rt_last_gc = now;
1254         entries = dst_entries_get_slow(ops);
1255         if (entries < ops->gc_thresh)
1256                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1257 out:
1258         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1259         return entries > rt_max_size;
1260 }
1261
1262 /* Clean host part of a prefix. Not necessary in radix tree,
1263    but results in cleaner routing tables.
1264
1265    Remove it only when all the things will work!
1266  */
1267
1268 int ip6_dst_hoplimit(struct dst_entry *dst)
1269 {
1270         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1271         if (hoplimit == 0) {
1272                 struct net_device *dev = dst->dev;
1273                 struct inet6_dev *idev;
1274
1275                 rcu_read_lock();
1276                 idev = __in6_dev_get(dev);
1277                 if (idev)
1278                         hoplimit = idev->cnf.hop_limit;
1279                 else
1280                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1281                 rcu_read_unlock();
1282         }
1283         return hoplimit;
1284 }
1285 EXPORT_SYMBOL(ip6_dst_hoplimit);
1286
1287 /*
1288  *
1289  */
1290
1291 int ip6_route_add(struct fib6_config *cfg)
1292 {
1293         int err;
1294         struct net *net = cfg->fc_nlinfo.nl_net;
1295         struct rt6_info *rt = NULL;
1296         struct net_device *dev = NULL;
1297         struct inet6_dev *idev = NULL;
1298         struct fib6_table *table;
1299         int addr_type;
1300
1301         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1302                 return -EINVAL;
1303 #ifndef CONFIG_IPV6_SUBTREES
1304         if (cfg->fc_src_len)
1305                 return -EINVAL;
1306 #endif
1307         if (cfg->fc_ifindex) {
1308                 err = -ENODEV;
1309                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1310                 if (!dev)
1311                         goto out;
1312                 idev = in6_dev_get(dev);
1313                 if (!idev)
1314                         goto out;
1315         }
1316
1317         if (cfg->fc_metric == 0)
1318                 cfg->fc_metric = IP6_RT_PRIO_USER;
1319
1320         err = -ENOBUFS;
1321         if (cfg->fc_nlinfo.nlh &&
1322             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1323                 table = fib6_get_table(net, cfg->fc_table);
1324                 if (!table) {
1325                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1326                         table = fib6_new_table(net, cfg->fc_table);
1327                 }
1328         } else {
1329                 table = fib6_new_table(net, cfg->fc_table);
1330         }
1331
1332         if (!table)
1333                 goto out;
1334
1335         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1336
1337         if (!rt) {
1338                 err = -ENOMEM;
1339                 goto out;
1340         }
1341
1342         rt->dst.obsolete = -1;
1343
1344         if (cfg->fc_flags & RTF_EXPIRES)
1345                 rt6_set_expires(rt, jiffies +
1346                                 clock_t_to_jiffies(cfg->fc_expires));
1347         else
1348                 rt6_clean_expires(rt);
1349
1350         if (cfg->fc_protocol == RTPROT_UNSPEC)
1351                 cfg->fc_protocol = RTPROT_BOOT;
1352         rt->rt6i_protocol = cfg->fc_protocol;
1353
1354         addr_type = ipv6_addr_type(&cfg->fc_dst);
1355
1356         if (addr_type & IPV6_ADDR_MULTICAST)
1357                 rt->dst.input = ip6_mc_input;
1358         else if (cfg->fc_flags & RTF_LOCAL)
1359                 rt->dst.input = ip6_input;
1360         else
1361                 rt->dst.input = ip6_forward;
1362
1363         rt->dst.output = ip6_output;
1364
1365         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1366         rt->rt6i_dst.plen = cfg->fc_dst_len;
1367         if (rt->rt6i_dst.plen == 128)
1368                rt->dst.flags |= DST_HOST;
1369
1370         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1371                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1372                 if (!metrics) {
1373                         err = -ENOMEM;
1374                         goto out;
1375                 }
1376                 dst_init_metrics(&rt->dst, metrics, 0);
1377         }
1378 #ifdef CONFIG_IPV6_SUBTREES
1379         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1380         rt->rt6i_src.plen = cfg->fc_src_len;
1381 #endif
1382
1383         rt->rt6i_metric = cfg->fc_metric;
1384
1385         /* We cannot add true routes via loopback here,
1386            they would result in kernel looping; promote them to reject routes
1387          */
1388         if ((cfg->fc_flags & RTF_REJECT) ||
1389             (dev && (dev->flags & IFF_LOOPBACK) &&
1390              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1391              !(cfg->fc_flags & RTF_LOCAL))) {
1392                 /* hold loopback dev/idev if we haven't done so. */
1393                 if (dev != net->loopback_dev) {
1394                         if (dev) {
1395                                 dev_put(dev);
1396                                 in6_dev_put(idev);
1397                         }
1398                         dev = net->loopback_dev;
1399                         dev_hold(dev);
1400                         idev = in6_dev_get(dev);
1401                         if (!idev) {
1402                                 err = -ENODEV;
1403                                 goto out;
1404                         }
1405                 }
1406                 rt->dst.output = ip6_pkt_discard_out;
1407                 rt->dst.input = ip6_pkt_discard;
1408                 rt->dst.error = -ENETUNREACH;
1409                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1410                 goto install_route;
1411         }
1412
1413         if (cfg->fc_flags & RTF_GATEWAY) {
1414                 const struct in6_addr *gw_addr;
1415                 int gwa_type;
1416
1417                 gw_addr = &cfg->fc_gateway;
1418                 rt->rt6i_gateway = *gw_addr;
1419                 gwa_type = ipv6_addr_type(gw_addr);
1420
1421                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1422                         struct rt6_info *grt;
1423
1424                         /* IPv6 strictly inhibits using not link-local
1425                            addresses as nexthop address.
1426                            Otherwise, router will not able to send redirects.
1427                            It is very good, but in some (rare!) circumstances
1428                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1429                            some exceptions. --ANK
1430                          */
1431                         err = -EINVAL;
1432                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1433                                 goto out;
1434
1435                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1436
1437                         err = -EHOSTUNREACH;
1438                         if (!grt)
1439                                 goto out;
1440                         if (dev) {
1441                                 if (dev != grt->dst.dev) {
1442                                         dst_release(&grt->dst);
1443                                         goto out;
1444                                 }
1445                         } else {
1446                                 dev = grt->dst.dev;
1447                                 idev = grt->rt6i_idev;
1448                                 dev_hold(dev);
1449                                 in6_dev_hold(grt->rt6i_idev);
1450                         }
1451                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1452                                 err = 0;
1453                         dst_release(&grt->dst);
1454
1455                         if (err)
1456                                 goto out;
1457                 }
1458                 err = -EINVAL;
1459                 if (!dev || (dev->flags & IFF_LOOPBACK))
1460                         goto out;
1461         }
1462
1463         err = -ENODEV;
1464         if (!dev)
1465                 goto out;
1466
1467         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1468                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1469                         err = -EINVAL;
1470                         goto out;
1471                 }
1472                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1473                 rt->rt6i_prefsrc.plen = 128;
1474         } else
1475                 rt->rt6i_prefsrc.plen = 0;
1476
1477         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1478                 err = rt6_bind_neighbour(rt, dev);
1479                 if (err)
1480                         goto out;
1481         }
1482
1483         rt->rt6i_flags = cfg->fc_flags;
1484
1485 install_route:
1486         if (cfg->fc_mx) {
1487                 struct nlattr *nla;
1488                 int remaining;
1489
1490                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1491                         int type = nla_type(nla);
1492
1493                         if (type) {
1494                                 if (type > RTAX_MAX) {
1495                                         err = -EINVAL;
1496                                         goto out;
1497                                 }
1498
1499                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1500                         }
1501                 }
1502         }
1503
1504         rt->dst.dev = dev;
1505         rt->rt6i_idev = idev;
1506         rt->rt6i_table = table;
1507
1508         cfg->fc_nlinfo.nl_net = dev_net(dev);
1509
1510         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1511
1512 out:
1513         if (dev)
1514                 dev_put(dev);
1515         if (idev)
1516                 in6_dev_put(idev);
1517         if (rt)
1518                 dst_free(&rt->dst);
1519         return err;
1520 }
1521
1522 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1523 {
1524         int err;
1525         struct fib6_table *table;
1526         struct net *net = dev_net(rt->dst.dev);
1527
1528         if (rt == net->ipv6.ip6_null_entry)
1529                 return -ENOENT;
1530
1531         table = rt->rt6i_table;
1532         write_lock_bh(&table->tb6_lock);
1533
1534         err = fib6_del(rt, info);
1535         dst_release(&rt->dst);
1536
1537         write_unlock_bh(&table->tb6_lock);
1538
1539         return err;
1540 }
1541
1542 int ip6_del_rt(struct rt6_info *rt)
1543 {
1544         struct nl_info info = {
1545                 .nl_net = dev_net(rt->dst.dev),
1546         };
1547         return __ip6_del_rt(rt, &info);
1548 }
1549
1550 static int ip6_route_del(struct fib6_config *cfg)
1551 {
1552         struct fib6_table *table;
1553         struct fib6_node *fn;
1554         struct rt6_info *rt;
1555         int err = -ESRCH;
1556
1557         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1558         if (!table)
1559                 return err;
1560
1561         read_lock_bh(&table->tb6_lock);
1562
1563         fn = fib6_locate(&table->tb6_root,
1564                          &cfg->fc_dst, cfg->fc_dst_len,
1565                          &cfg->fc_src, cfg->fc_src_len);
1566
1567         if (fn) {
1568                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1569                         if (cfg->fc_ifindex &&
1570                             (!rt->dst.dev ||
1571                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1572                                 continue;
1573                         if (cfg->fc_flags & RTF_GATEWAY &&
1574                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1575                                 continue;
1576                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1577                                 continue;
1578                         dst_hold(&rt->dst);
1579                         read_unlock_bh(&table->tb6_lock);
1580
1581                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1582                 }
1583         }
1584         read_unlock_bh(&table->tb6_lock);
1585
1586         return err;
1587 }
1588
1589 /*
1590  *      Handle redirects
1591  */
1592 struct ip6rd_flowi {
1593         struct flowi6 fl6;
1594         struct in6_addr gateway;
1595 };
1596
1597 static struct rt6_info *__ip6_route_redirect(struct net *net,
1598                                              struct fib6_table *table,
1599                                              struct flowi6 *fl6,
1600                                              int flags)
1601 {
1602         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1603         struct rt6_info *rt;
1604         struct fib6_node *fn;
1605
1606         /*
1607          * Get the "current" route for this destination and
1608          * check if the redirect has come from approriate router.
1609          *
1610          * RFC 2461 specifies that redirects should only be
1611          * accepted if they come from the nexthop to the target.
1612          * Due to the way the routes are chosen, this notion
1613          * is a bit fuzzy and one might need to check all possible
1614          * routes.
1615          */
1616
1617         read_lock_bh(&table->tb6_lock);
1618         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1619 restart:
1620         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1621                 /*
1622                  * Current route is on-link; redirect is always invalid.
1623                  *
1624                  * Seems, previous statement is not true. It could
1625                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1626                  * But then router serving it might decide, that we should
1627                  * know truth 8)8) --ANK (980726).
1628                  */
1629                 if (rt6_check_expired(rt))
1630                         continue;
1631                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1632                         continue;
1633                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1634                         continue;
1635                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1636                         continue;
1637                 break;
1638         }
1639
1640         if (!rt)
1641                 rt = net->ipv6.ip6_null_entry;
1642         BACKTRACK(net, &fl6->saddr);
1643 out:
1644         dst_hold(&rt->dst);
1645
1646         read_unlock_bh(&table->tb6_lock);
1647
1648         return rt;
1649 };
1650
1651 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1652                                            const struct in6_addr *src,
1653                                            const struct in6_addr *gateway,
1654                                            struct net_device *dev)
1655 {
1656         int flags = RT6_LOOKUP_F_HAS_SADDR;
1657         struct net *net = dev_net(dev);
1658         struct ip6rd_flowi rdfl = {
1659                 .fl6 = {
1660                         .flowi6_oif = dev->ifindex,
1661                         .daddr = *dest,
1662                         .saddr = *src,
1663                 },
1664         };
1665
1666         rdfl.gateway = *gateway;
1667
1668         if (rt6_need_strict(dest))
1669                 flags |= RT6_LOOKUP_F_IFACE;
1670
1671         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1672                                                    flags, __ip6_route_redirect);
1673 }
1674
1675 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1676                   const struct in6_addr *saddr,
1677                   struct neighbour *neigh, u8 *lladdr, int on_link)
1678 {
1679         struct rt6_info *rt, *nrt = NULL;
1680         struct netevent_redirect netevent;
1681         struct net *net = dev_net(neigh->dev);
1682
1683         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1684
1685         if (rt == net->ipv6.ip6_null_entry) {
1686                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1687                 goto out;
1688         }
1689
1690         /*
1691          *      We have finally decided to accept it.
1692          */
1693
1694         neigh_update(neigh, lladdr, NUD_STALE,
1695                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1696                      NEIGH_UPDATE_F_OVERRIDE|
1697                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1698                                      NEIGH_UPDATE_F_ISROUTER))
1699                      );
1700
1701         /*
1702          * Redirect received -> path was valid.
1703          * Look, redirects are sent only in response to data packets,
1704          * so that this nexthop apparently is reachable. --ANK
1705          */
1706         dst_confirm(&rt->dst);
1707
1708         /* Duplicate redirect: silently ignore. */
1709         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1710                 goto out;
1711
1712         nrt = ip6_rt_copy(rt, dest);
1713         if (!nrt)
1714                 goto out;
1715
1716         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1717         if (on_link)
1718                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1719
1720         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1721         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1722
1723         if (ip6_ins_rt(nrt))
1724                 goto out;
1725
1726         netevent.old = &rt->dst;
1727         netevent.new = &nrt->dst;
1728         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1729
1730         if (rt->rt6i_flags & RTF_CACHE) {
1731                 ip6_del_rt(rt);
1732                 return;
1733         }
1734
1735 out:
1736         dst_release(&rt->dst);
1737 }
1738
1739 /*
1740  *      Misc support functions
1741  */
1742
1743 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1744                                     const struct in6_addr *dest)
1745 {
1746         struct net *net = dev_net(ort->dst.dev);
1747         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1748                                             ort->rt6i_table);
1749
1750         if (rt) {
1751                 rt->dst.input = ort->dst.input;
1752                 rt->dst.output = ort->dst.output;
1753                 rt->dst.flags |= DST_HOST;
1754
1755                 rt->rt6i_dst.addr = *dest;
1756                 rt->rt6i_dst.plen = 128;
1757                 dst_copy_metrics(&rt->dst, &ort->dst);
1758                 rt->dst.error = ort->dst.error;
1759                 rt->rt6i_idev = ort->rt6i_idev;
1760                 if (rt->rt6i_idev)
1761                         in6_dev_hold(rt->rt6i_idev);
1762                 rt->dst.lastuse = jiffies;
1763
1764                 rt->rt6i_gateway = ort->rt6i_gateway;
1765                 rt->rt6i_flags = ort->rt6i_flags;
1766                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1767                     (RTF_DEFAULT | RTF_ADDRCONF))
1768                         rt6_set_from(rt, ort);
1769                 else
1770                         rt6_clean_expires(rt);
1771                 rt->rt6i_metric = 0;
1772
1773 #ifdef CONFIG_IPV6_SUBTREES
1774                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1775 #endif
1776                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1777                 rt->rt6i_table = ort->rt6i_table;
1778         }
1779         return rt;
1780 }
1781
1782 #ifdef CONFIG_IPV6_ROUTE_INFO
1783 static struct rt6_info *rt6_get_route_info(struct net *net,
1784                                            const struct in6_addr *prefix, int prefixlen,
1785                                            const struct in6_addr *gwaddr, int ifindex)
1786 {
1787         struct fib6_node *fn;
1788         struct rt6_info *rt = NULL;
1789         struct fib6_table *table;
1790
1791         table = fib6_get_table(net, RT6_TABLE_INFO);
1792         if (!table)
1793                 return NULL;
1794
1795         write_lock_bh(&table->tb6_lock);
1796         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1797         if (!fn)
1798                 goto out;
1799
1800         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1801                 if (rt->dst.dev->ifindex != ifindex)
1802                         continue;
1803                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1804                         continue;
1805                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1806                         continue;
1807                 dst_hold(&rt->dst);
1808                 break;
1809         }
1810 out:
1811         write_unlock_bh(&table->tb6_lock);
1812         return rt;
1813 }
1814
1815 static struct rt6_info *rt6_add_route_info(struct net *net,
1816                                            const struct in6_addr *prefix, int prefixlen,
1817                                            const struct in6_addr *gwaddr, int ifindex,
1818                                            unsigned int pref)
1819 {
1820         struct fib6_config cfg = {
1821                 .fc_table       = RT6_TABLE_INFO,
1822                 .fc_metric      = IP6_RT_PRIO_USER,
1823                 .fc_ifindex     = ifindex,
1824                 .fc_dst_len     = prefixlen,
1825                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1826                                   RTF_UP | RTF_PREF(pref),
1827                 .fc_nlinfo.pid = 0,
1828                 .fc_nlinfo.nlh = NULL,
1829                 .fc_nlinfo.nl_net = net,
1830         };
1831
1832         cfg.fc_dst = *prefix;
1833         cfg.fc_gateway = *gwaddr;
1834
1835         /* We should treat it as a default route if prefix length is 0. */
1836         if (!prefixlen)
1837                 cfg.fc_flags |= RTF_DEFAULT;
1838
1839         ip6_route_add(&cfg);
1840
1841         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1842 }
1843 #endif
1844
1845 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1846 {
1847         struct rt6_info *rt;
1848         struct fib6_table *table;
1849
1850         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1851         if (!table)
1852                 return NULL;
1853
1854         write_lock_bh(&table->tb6_lock);
1855         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1856                 if (dev == rt->dst.dev &&
1857                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1858                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1859                         break;
1860         }
1861         if (rt)
1862                 dst_hold(&rt->dst);
1863         write_unlock_bh(&table->tb6_lock);
1864         return rt;
1865 }
1866
1867 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1868                                      struct net_device *dev,
1869                                      unsigned int pref)
1870 {
1871         struct fib6_config cfg = {
1872                 .fc_table       = RT6_TABLE_DFLT,
1873                 .fc_metric      = IP6_RT_PRIO_USER,
1874                 .fc_ifindex     = dev->ifindex,
1875                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1876                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1877                 .fc_nlinfo.pid = 0,
1878                 .fc_nlinfo.nlh = NULL,
1879                 .fc_nlinfo.nl_net = dev_net(dev),
1880         };
1881
1882         cfg.fc_gateway = *gwaddr;
1883
1884         ip6_route_add(&cfg);
1885
1886         return rt6_get_dflt_router(gwaddr, dev);
1887 }
1888
/*
 * Delete routes flagged RTF_DEFAULT and/or RTF_ADDRCONF from the leaf
 * list of the root node of RT6_TABLE_DFLT in @net.
 *
 * Does nothing if the table does not exist.
 */
void rt6_purge_dflt_routers(struct net *net)
{
        struct rt6_info *rt;
        struct fib6_table *table;

        /* NOTE: Keep consistent with rt6_get_dflt_router */
        table = fib6_get_table(net, RT6_TABLE_DFLT);
        if (!table)
                return;

restart:
        read_lock_bh(&table->tb6_lock);
        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * NOTE(review): this matches when EITHER flag is set, while
                 * rt6_get_dflt_router() requires BOTH - confirm the
                 * difference is intended.
                 */
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
                        /*
                         * Take a reference, then drop the read lock before
                         * deleting.  The list may have changed meanwhile,
                         * so the scan restarts from the head.
                         * Presumably ip6_del_rt() consumes the reference
                         * taken here - TODO confirm.
                         */
                        dst_hold(&rt->dst);
                        read_unlock_bh(&table->tb6_lock);
                        ip6_del_rt(rt);
                        goto restart;
                }
        }
        read_unlock_bh(&table->tb6_lock);
}
1911
1912 static void rtmsg_to_fib6_config(struct net *net,
1913                                  struct in6_rtmsg *rtmsg,
1914                                  struct fib6_config *cfg)
1915 {
1916         memset(cfg, 0, sizeof(*cfg));
1917
1918         cfg->fc_table = RT6_TABLE_MAIN;
1919         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1920         cfg->fc_metric = rtmsg->rtmsg_metric;
1921         cfg->fc_expires = rtmsg->rtmsg_info;
1922         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1923         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1924         cfg->fc_flags = rtmsg->rtmsg_flags;
1925
1926         cfg->fc_nlinfo.nl_net = net;
1927
1928         cfg->fc_dst = rtmsg->rtmsg_dst;
1929         cfg->fc_src = rtmsg->rtmsg_src;
1930         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1931 }
1932
1933 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1934 {
1935         struct fib6_config cfg;
1936         struct in6_rtmsg rtmsg;
1937         int err;
1938
1939         switch(cmd) {
1940         case SIOCADDRT:         /* Add a route */
1941         case SIOCDELRT:         /* Delete a route */
1942                 if (!capable(CAP_NET_ADMIN))
1943                         return -EPERM;
1944                 err = copy_from_user(&rtmsg, arg,
1945                                      sizeof(struct in6_rtmsg));
1946                 if (err)
1947                         return -EFAULT;
1948
1949                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1950
1951                 rtnl_lock();
1952                 switch (cmd) {
1953                 case SIOCADDRT:
1954                         err = ip6_route_add(&cfg);
1955                         break;
1956                 case SIOCDELRT:
1957                         err = ip6_route_del(&cfg);
1958                         break;
1959                 default:
1960                         err = -EINVAL;
1961                 }
1962                 rtnl_unlock();
1963
1964                 return err;
1965         }
1966
1967         return -EINVAL;
1968 }
1969
1970 /*
1971  *      Drop the packet on the floor
1972  */
1973
1974 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1975 {
1976         int type;
1977         struct dst_entry *dst = skb_dst(skb);
1978         switch (ipstats_mib_noroutes) {
1979         case IPSTATS_MIB_INNOROUTES:
1980                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1981                 if (type == IPV6_ADDR_ANY) {
1982                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1983                                       IPSTATS_MIB_INADDRERRORS);
1984                         break;
1985                 }
1986                 /* FALLTHROUGH */
1987         case IPSTATS_MIB_OUTNOROUTES:
1988                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1989                               ipstats_mib_noroutes);
1990                 break;
1991         }
1992         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1993         kfree_skb(skb);
1994         return 0;
1995 }
1996
1997 static int ip6_pkt_discard(struct sk_buff *skb)
1998 {
1999         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2000 }
2001
2002 static int ip6_pkt_discard_out(struct sk_buff *skb)
2003 {
2004         skb->dev = skb_dst(skb)->dev;
2005         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2006 }
2007
2008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2009
2010 static int ip6_pkt_prohibit(struct sk_buff *skb)
2011 {
2012         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2013 }
2014
2015 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2016 {
2017         skb->dev = skb_dst(skb)->dev;
2018         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2019 }
2020
2021 #endif
2022
2023 /*
2024  *      Allocate a dst for local (unicast / anycast) address.
2025  */
2026
/*
 * Build the /128 host route for a local address @addr owned by @idev.
 *
 * The route is allocated on the namespace's loopback device, delivers via
 * ip6_input/ip6_output, and is flagged RTF_ANYCAST when @anycast is true
 * or RTF_LOCAL otherwise.  It is associated with RT6_TABLE_LOCAL but is
 * not inserted into the table here.
 *
 * Returns the new route with its reference count set to 1, or an ERR_PTR:
 * -ENOMEM when allocation fails, or the error from rt6_bind_neighbour().
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
                                    const struct in6_addr *addr,
                                    bool anycast)
{
        struct net *net = dev_net(idev->dev);
        struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
        int err;

        if (!rt) {
                net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
                return ERR_PTR(-ENOMEM);
        }

        /* Reference for the rt->rt6i_idev pointer stored below. */
        in6_dev_hold(idev);

        rt->dst.flags |= DST_HOST;
        rt->dst.input = ip6_input;
        rt->dst.output = ip6_output;
        rt->rt6i_idev = idev;
        rt->dst.obsolete = -1;

        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
        if (anycast)
                rt->rt6i_flags |= RTF_ANYCAST;
        else
                rt->rt6i_flags |= RTF_LOCAL;
        err = rt6_bind_neighbour(rt, rt->dst.dev);
        if (err) {
                /*
                 * Presumably dst_free() also drops the idev reference held
                 * through rt->rt6i_idev - TODO confirm in the dst destroy op.
                 */
                dst_free(&rt->dst);
                return ERR_PTR(err);
        }

        rt->rt6i_dst.addr = *addr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

        /* Hand the caller exactly one reference. */
        atomic_set(&rt->dst.__refcnt, 1);

        return rt;
}
2067
2068 int ip6_route_get_saddr(struct net *net,
2069                         struct rt6_info *rt,
2070                         const struct in6_addr *daddr,
2071                         unsigned int prefs,
2072                         struct in6_addr *saddr)
2073 {
2074         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2075         int err = 0;
2076         if (rt->rt6i_prefsrc.plen)
2077                 *saddr = rt->rt6i_prefsrc.addr;
2078         else
2079                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2080                                          daddr, prefs, saddr);
2081         return err;
2082 }
2083
2084 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
        struct net_device *dev; /* only routes on this device; NULL matches any */
        struct net *net;        /* namespace whose ip6_null_entry is skipped */
        struct in6_addr *addr;  /* address compared against each route's prefsrc */
};
2090
2091 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2092 {
2093         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2094         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2095         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2096
2097         if (((void *)rt->dst.dev == dev || !dev) &&
2098             rt != net->ipv6.ip6_null_entry &&
2099             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2100                 /* remove prefsrc entry */
2101                 rt->rt6i_prefsrc.plen = 0;
2102         }
2103         return 0;
2104 }
2105
2106 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2107 {
2108         struct net *net = dev_net(ifp->idev->dev);
2109         struct arg_dev_net_ip adni = {
2110                 .dev = ifp->idev->dev,
2111                 .net = net,
2112                 .addr = &ifp->addr,
2113         };
2114         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2115 }
2116
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev; /* only routes on this device; NULL matches any */
        struct net *net;        /* namespace whose ip6_null_entry is skipped */
};
2121
2122 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2123 {
2124         const struct arg_dev_net *adn = arg;
2125         const struct net_device *dev = adn->dev;
2126
2127         if ((rt->dst.dev == dev || !dev) &&
2128             rt != adn->net->ipv6.ip6_null_entry)
2129                 return -1;
2130
2131         return 0;
2132 }
2133
2134 void rt6_ifdown(struct net *net, struct net_device *dev)
2135 {
2136         struct arg_dev_net adn = {
2137                 .dev = dev,
2138                 .net = net,
2139         };
2140
2141         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2142         icmp6_clean_all(fib6_ifdown, &adn);
2143 }
2144
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU value */
};
2149
2150 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2151 {
2152         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2153         struct inet6_dev *idev;
2154
2155         /* In IPv6 pmtu discovery is not optional,
2156            so that RTAX_MTU lock cannot disable it.
2157            We still use this lock to block changes
2158            caused by addrconf/ndisc.
2159         */
2160
2161         idev = __in6_dev_get(arg->dev);
2162         if (!idev)
2163                 return 0;
2164
2165         /* For administrative MTU increase, there is no way to discover
2166            IPv6 PMTU increase, so PMTU increase should be updated here.
2167            Since RFC 1981 doesn't include administrative MTU increase
2168            update PMTU increase is a MUST. (i.e. jumbo frame)
2169          */
2170         /*
2171            If new MTU is less than route PMTU, this new MTU will be the
2172            lowest MTU in the path, update the route PMTU to reflect PMTU
2173            decreases; if new MTU is greater than route PMTU, and the
2174            old MTU is the lowest MTU in the path, update the route PMTU
2175            to reflect the increase. In this case if the other nodes' MTU
2176            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2177            PMTU discouvery.
2178          */
2179         if (rt->dst.dev == arg->dev &&
2180             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2181             (dst_mtu(&rt->dst) >= arg->mtu ||
2182              (dst_mtu(&rt->dst) < arg->mtu &&
2183               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2184                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2185         }
2186         return 0;
2187 }
2188
2189 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2190 {
2191         struct rt6_mtu_change_arg arg = {
2192                 .dev = dev,
2193                 .mtu = mtu,
2194         };
2195
2196         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2197 }
2198
2199 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2200         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2201         [RTA_OIF]               = { .type = NLA_U32 },
2202         [RTA_IIF]               = { .type = NLA_U32 },
2203         [RTA_PRIORITY]          = { .type = NLA_U32 },
2204         [RTA_METRICS]           = { .type = NLA_NESTED },
2205 };
2206
2207 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2208                               struct fib6_config *cfg)
2209 {
2210         struct rtmsg *rtm;
2211         struct nlattr *tb[RTA_MAX+1];
2212         int err;
2213
2214         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2215         if (err < 0)
2216                 goto errout;
2217
2218         err = -EINVAL;
2219         rtm = nlmsg_data(nlh);
2220         memset(cfg, 0, sizeof(*cfg));
2221
2222         cfg->fc_table = rtm->rtm_table;
2223         cfg->fc_dst_len = rtm->rtm_dst_len;
2224         cfg->fc_src_len = rtm->rtm_src_len;
2225         cfg->fc_flags = RTF_UP;
2226         cfg->fc_protocol = rtm->rtm_protocol;
2227
2228         if (rtm->rtm_type == RTN_UNREACHABLE)
2229                 cfg->fc_flags |= RTF_REJECT;
2230
2231         if (rtm->rtm_type == RTN_LOCAL)
2232                 cfg->fc_flags |= RTF_LOCAL;
2233
2234         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2235         cfg->fc_nlinfo.nlh = nlh;
2236         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2237
2238         if (tb[RTA_GATEWAY]) {
2239                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2240                 cfg->fc_flags |= RTF_GATEWAY;
2241         }
2242
2243         if (tb[RTA_DST]) {
2244                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2245
2246                 if (nla_len(tb[RTA_DST]) < plen)
2247                         goto errout;
2248
2249                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2250         }
2251
2252         if (tb[RTA_SRC]) {
2253                 int plen = (rtm->rtm_src_len + 7) >> 3;
2254
2255                 if (nla_len(tb[RTA_SRC]) < plen)
2256                         goto errout;
2257
2258                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2259         }
2260
2261         if (tb[RTA_PREFSRC])
2262                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2263
2264         if (tb[RTA_OIF])
2265                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2266
2267         if (tb[RTA_PRIORITY])
2268                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2269
2270         if (tb[RTA_METRICS]) {
2271                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2272                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2273         }
2274
2275         if (tb[RTA_TABLE])
2276                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2277
2278         err = 0;
2279 errout:
2280         return err;
2281 }
2282
2283 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2284 {
2285         struct fib6_config cfg;
2286         int err;
2287
2288         err = rtm_to_fib6_config(skb, nlh, &cfg);
2289         if (err < 0)
2290                 return err;
2291
2292         return ip6_route_del(&cfg);
2293 }
2294
2295 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2296 {
2297         struct fib6_config cfg;
2298         int err;
2299
2300         err = rtm_to_fib6_config(skb, nlh, &cfg);
2301         if (err < 0)
2302                 return err;
2303
2304         return ip6_route_add(&cfg);
2305 }
2306
2307 static inline size_t rt6_nlmsg_size(void)
2308 {
2309         return NLMSG_ALIGN(sizeof(struct rtmsg))
2310                + nla_total_size(16) /* RTA_SRC */
2311                + nla_total_size(16) /* RTA_DST */
2312                + nla_total_size(16) /* RTA_GATEWAY */
2313                + nla_total_size(16) /* RTA_PREFSRC */
2314                + nla_total_size(4) /* RTA_TABLE */
2315                + nla_total_size(4) /* RTA_IIF */
2316                + nla_total_size(4) /* RTA_OIF */
2317                + nla_total_size(4) /* RTA_PRIORITY */
2318                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2319                + nla_total_size(sizeof(struct rta_cacheinfo));
2320 }
2321
/*
 * Serialise route @rt into @skb as one rtnetlink message of type @type.
 *
 * @dst / @src: when non-NULL they are reported as RTA_DST / RTA_SRC with
 *              a length of 128 instead of the route's own keys (RTA_SRC
 *              only with CONFIG_IPV6_SUBTREES).
 * @iif:        input interface of the query; 0 for an output query.
 * @pid, @seq, @flags: netlink header values passed to nlmsg_put().
 * @prefix:     when non-zero only RTF_PREFIX_RT routes are emitted.
 * @nowait:     passed to ip6mr_get_route() for multicast destinations.
 *
 * Returns 1 if the route was skipped by the @prefix filter, 0 if
 * ip6mr_get_route() returned 0 with @nowait clear, -EMSGSIZE if the
 * message or an attribute did not fit (a started message is cancelled),
 * otherwise the result of nlmsg_end().
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
{
        const struct inet_peer *peer;
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;
        struct neighbour *n;
        u32 ts, tsage;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id goes both in the header and in a RTA_TABLE attribute. */
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;
        /* Route type: reject, then local (by flag or loopback device), else unicast. */
        if (rt->rt6i_flags & RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* The stored protocol is overridden when one of these flags is set. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags & RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        if (dst) {
                if (nla_put(skb, RTA_DST, 16, dst))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put(skb, RTA_SRC, 16, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        /* With nowait set, only -EMSGSIZE aborts the message. */
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                }
                        }
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dst) {
                /* Output query: report the source address chosen for @dst. */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
                    nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
                        goto nla_put_failure;
        }

        /*
         * NOTE(review): for an output query (iif == 0, dst != NULL) on a
         * route with a configured prefsrc, RTA_PREFSRC looks like it is
         * emitted twice (above and here) - confirm this is intended.
         */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;

        /* The gateway is reported from the route's neighbour entry, if any. */
        rcu_read_lock();
        n = dst_get_neighbour_noref(&rt->dst);
        if (n) {
                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
                        rcu_read_unlock();
                        goto nla_put_failure;
                }
        }
        rcu_read_unlock();

        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
                goto nla_put_failure;
        /* Remaining lifetime in jiffies: 0 when not expiring, capped at INT_MAX. */
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                expires = 0;
        else if (rt->dst.expires - jiffies < INT_MAX)
                expires = rt->dst.expires - jiffies;
        else
                expires = INT_MAX;

        /* TCP timestamp data comes from the route's inet_peer, if it has one. */
        peer = NULL;
        if (rt6_has_peer(rt))
                peer = rt6_peer_ptr(rt);
        ts = tsage = 0;
        if (peer && peer->tcp_ts_stamp) {
                ts = peer->tcp_ts;
                tsage = get_seconds() - peer->tcp_ts_stamp;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
                               expires, rt->dst.error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Roll back the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2472
2473 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2474 {
2475         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2476         int prefix;
2477
2478         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2479                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2480                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2481         } else
2482                 prefix = 0;
2483
2484         return rt6_fill_node(arg->net,
2485                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2486                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2487                      prefix, 0, NLM_F_MULTI);
2488 }
2489
2490 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2491 {
2492         struct net *net = sock_net(in_skb->sk);
2493         struct nlattr *tb[RTA_MAX+1];
2494         struct rt6_info *rt;
2495         struct sk_buff *skb;
2496         struct rtmsg *rtm;
2497         struct flowi6 fl6;
2498         int err, iif = 0, oif = 0;
2499
2500         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2501         if (err < 0)
2502                 goto errout;
2503
2504         err = -EINVAL;
2505         memset(&fl6, 0, sizeof(fl6));
2506
2507         if (tb[RTA_SRC]) {
2508                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2509                         goto errout;
2510
2511                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2512         }
2513
2514         if (tb[RTA_DST]) {
2515                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2516                         goto errout;
2517
2518                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2519         }
2520
2521         if (tb[RTA_IIF])
2522                 iif = nla_get_u32(tb[RTA_IIF]);
2523
2524         if (tb[RTA_OIF])
2525                 oif = nla_get_u32(tb[RTA_OIF]);
2526
2527         if (iif) {
2528                 struct net_device *dev;
2529                 int flags = 0;
2530
2531                 dev = __dev_get_by_index(net, iif);
2532                 if (!dev) {
2533                         err = -ENODEV;
2534                         goto errout;
2535                 }
2536
2537                 fl6.flowi6_iif = iif;
2538
2539                 if (!ipv6_addr_any(&fl6.saddr))
2540                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2541
2542                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2543                                                                flags);
2544         } else {
2545                 fl6.flowi6_oif = oif;
2546
2547                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2548         }
2549
2550         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2551         if (!skb) {
2552                 dst_release(&rt->dst);
2553                 err = -ENOBUFS;
2554                 goto errout;
2555         }
2556
2557         /* Reserve room for dummy headers, this skb can pass
2558            through good chunk of routing engine.
2559          */
2560         skb_reset_mac_header(skb);
2561         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2562
2563         skb_dst_set(skb, &rt->dst);
2564
2565         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2566                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2567                             nlh->nlmsg_seq, 0, 0, 0);
2568         if (err < 0) {
2569                 kfree_skb(skb);
2570                 goto errout;
2571         }
2572
2573         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2574 errout:
2575         return err;
2576 }
2577
2578 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2579 {
2580         struct sk_buff *skb;
2581         struct net *net = info->nl_net;
2582         u32 seq;
2583         int err;
2584
2585         err = -ENOBUFS;
2586         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2587
2588         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2589         if (!skb)
2590                 goto errout;
2591
2592         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2593                                 event, info->pid, seq, 0, 0, 0);
2594         if (err < 0) {
2595                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2596                 WARN_ON(err == -EMSGSIZE);
2597                 kfree_skb(skb);
2598                 goto errout;
2599         }
2600         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2601                     info->nlh, gfp_any());
2602         return;
2603 errout:
2604         if (err < 0)
2605                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2606 }
2607
2608 static int ip6_route_dev_notify(struct notifier_block *this,
2609                                 unsigned long event, void *data)
2610 {
2611         struct net_device *dev = (struct net_device *)data;
2612         struct net *net = dev_net(dev);
2613
2614         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2615                 net->ipv6.ip6_null_entry->dst.dev = dev;
2616                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2617 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2618                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2619                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2620                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2621                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2622 #endif
2623         }
2624
2625         return NOTIFY_OK;
2626 }
2627
2628 /*
2629  *      /proc
2630  */
2631
2632 #ifdef CONFIG_PROC_FS
2633
/*
 * Buffer bookkeeping record.
 *
 * NOTE(review): nothing in the /proc code visible next to this definition
 * refers to it; presumably a leftover from an older non-seq_file reader.
 * Confirm against the rest of the tree before removing.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2642
2643 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2644 {
2645         struct seq_file *m = p_arg;
2646         struct neighbour *n;
2647
2648         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2649
2650 #ifdef CONFIG_IPV6_SUBTREES
2651         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2652 #else
2653         seq_puts(m, "00000000000000000000000000000000 00 ");
2654 #endif
2655         rcu_read_lock();
2656         n = dst_get_neighbour_noref(&rt->dst);
2657         if (n) {
2658                 seq_printf(m, "%pi6", n->primary_key);
2659         } else {
2660                 seq_puts(m, "00000000000000000000000000000000");
2661         }
2662         rcu_read_unlock();
2663         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2664                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2665                    rt->dst.__use, rt->rt6i_flags,
2666                    rt->dst.dev ? rt->dst.dev->name : "");
2667         return 0;
2668 }
2669
2670 static int ipv6_route_show(struct seq_file *m, void *v)
2671 {
2672         struct net *net = (struct net *)m->private;
2673         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2674         return 0;
2675 }
2676
2677 static int ipv6_route_open(struct inode *inode, struct file *file)
2678 {
2679         return single_open_net(inode, file, ipv6_route_show);
2680 }
2681
2682 static const struct file_operations ipv6_route_proc_fops = {
2683         .owner          = THIS_MODULE,
2684         .open           = ipv6_route_open,
2685         .read           = seq_read,
2686         .llseek         = seq_lseek,
2687         .release        = single_release_net,
2688 };
2689
2690 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2691 {
2692         struct net *net = (struct net *)seq->private;
2693         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2694                    net->ipv6.rt6_stats->fib_nodes,
2695                    net->ipv6.rt6_stats->fib_route_nodes,
2696                    net->ipv6.rt6_stats->fib_rt_alloc,
2697                    net->ipv6.rt6_stats->fib_rt_entries,
2698                    net->ipv6.rt6_stats->fib_rt_cache,
2699                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2700                    net->ipv6.rt6_stats->fib_discarded_routes);
2701
2702         return 0;
2703 }
2704
2705 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2706 {
2707         return single_open_net(inode, file, rt6_stats_seq_show);
2708 }
2709
2710 static const struct file_operations rt6_stats_seq_fops = {
2711         .owner   = THIS_MODULE,
2712         .open    = rt6_stats_seq_open,
2713         .read    = seq_read,
2714         .llseek  = seq_lseek,
2715         .release = single_release_net,
2716 };
2717 #endif  /* CONFIG_PROC_FS */
2718
2719 #ifdef CONFIG_SYSCTL
2720
2721 static
2722 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2723                               void __user *buffer, size_t *lenp, loff_t *ppos)
2724 {
2725         struct net *net;
2726         int delay;
2727         if (!write)
2728                 return -EINVAL;
2729
2730         net = (struct net *)ctl->extra1;
2731         delay = net->ipv6.sysctl.flush_delay;
2732         proc_dointvec(ctl, write, buffer, lenp, ppos);
2733         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2734         return 0;
2735 }
2736
2737 ctl_table ipv6_route_table_template[] = {
2738         {
2739                 .procname       =       "flush",
2740                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2741                 .maxlen         =       sizeof(int),
2742                 .mode           =       0200,
2743                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2744         },
2745         {
2746                 .procname       =       "gc_thresh",
2747                 .data           =       &ip6_dst_ops_template.gc_thresh,
2748                 .maxlen         =       sizeof(int),
2749                 .mode           =       0644,
2750                 .proc_handler   =       proc_dointvec,
2751         },
2752         {
2753                 .procname       =       "max_size",
2754                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2755                 .maxlen         =       sizeof(int),
2756                 .mode           =       0644,
2757                 .proc_handler   =       proc_dointvec,
2758         },
2759         {
2760                 .procname       =       "gc_min_interval",
2761                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2762                 .maxlen         =       sizeof(int),
2763                 .mode           =       0644,
2764                 .proc_handler   =       proc_dointvec_jiffies,
2765         },
2766         {
2767                 .procname       =       "gc_timeout",
2768                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2769                 .maxlen         =       sizeof(int),
2770                 .mode           =       0644,
2771                 .proc_handler   =       proc_dointvec_jiffies,
2772         },
2773         {
2774                 .procname       =       "gc_interval",
2775                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2776                 .maxlen         =       sizeof(int),
2777                 .mode           =       0644,
2778                 .proc_handler   =       proc_dointvec_jiffies,
2779         },
2780         {
2781                 .procname       =       "gc_elasticity",
2782                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2783                 .maxlen         =       sizeof(int),
2784                 .mode           =       0644,
2785                 .proc_handler   =       proc_dointvec,
2786         },
2787         {
2788                 .procname       =       "mtu_expires",
2789                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2790                 .maxlen         =       sizeof(int),
2791                 .mode           =       0644,
2792                 .proc_handler   =       proc_dointvec_jiffies,
2793         },
2794         {
2795                 .procname       =       "min_adv_mss",
2796                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2797                 .maxlen         =       sizeof(int),
2798                 .mode           =       0644,
2799                 .proc_handler   =       proc_dointvec,
2800         },
2801         {
2802                 .procname       =       "gc_min_interval_ms",
2803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2804                 .maxlen         =       sizeof(int),
2805                 .mode           =       0644,
2806                 .proc_handler   =       proc_dointvec_ms_jiffies,
2807         },
2808         { }
2809 };
2810
2811 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2812 {
2813         struct ctl_table *table;
2814
2815         table = kmemdup(ipv6_route_table_template,
2816                         sizeof(ipv6_route_table_template),
2817                         GFP_KERNEL);
2818
2819         if (table) {
2820                 table[0].data = &net->ipv6.sysctl.flush_delay;
2821                 table[0].extra1 = net;
2822                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2823                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2824                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2825                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2826                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2827                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2828                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2829                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2830                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831         }
2832
2833         return table;
2834 }
2835 #endif
2836
2837 static int __net_init ip6_route_net_init(struct net *net)
2838 {
2839         int ret = -ENOMEM;
2840
2841         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2842                sizeof(net->ipv6.ip6_dst_ops));
2843
2844         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2845                 goto out_ip6_dst_ops;
2846
2847         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2848                                            sizeof(*net->ipv6.ip6_null_entry),
2849                                            GFP_KERNEL);
2850         if (!net->ipv6.ip6_null_entry)
2851                 goto out_ip6_dst_entries;
2852         net->ipv6.ip6_null_entry->dst.path =
2853                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2854         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2855         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2856                          ip6_template_metrics, true);
2857
2858 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2859         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2860                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2861                                                GFP_KERNEL);
2862         if (!net->ipv6.ip6_prohibit_entry)
2863                 goto out_ip6_null_entry;
2864         net->ipv6.ip6_prohibit_entry->dst.path =
2865                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2866         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2867         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2868                          ip6_template_metrics, true);
2869
2870         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2871                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2872                                                GFP_KERNEL);
2873         if (!net->ipv6.ip6_blk_hole_entry)
2874                 goto out_ip6_prohibit_entry;
2875         net->ipv6.ip6_blk_hole_entry->dst.path =
2876                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2877         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2879                          ip6_template_metrics, true);
2880 #endif
2881
2882         net->ipv6.sysctl.flush_delay = 0;
2883         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2884         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2885         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2886         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2887         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2888         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2889         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2890
2891 #ifdef CONFIG_PROC_FS
2892         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2893         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2894 #endif
2895         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2896
2897         ret = 0;
2898 out:
2899         return ret;
2900
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902 out_ip6_prohibit_entry:
2903         kfree(net->ipv6.ip6_prohibit_entry);
2904 out_ip6_null_entry:
2905         kfree(net->ipv6.ip6_null_entry);
2906 #endif
2907 out_ip6_dst_entries:
2908         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2909 out_ip6_dst_ops:
2910         goto out;
2911 }
2912
2913 static void __net_exit ip6_route_net_exit(struct net *net)
2914 {
2915 #ifdef CONFIG_PROC_FS
2916         proc_net_remove(net, "ipv6_route");
2917         proc_net_remove(net, "rt6_stats");
2918 #endif
2919         kfree(net->ipv6.ip6_null_entry);
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921         kfree(net->ipv6.ip6_prohibit_entry);
2922         kfree(net->ipv6.ip6_blk_hole_entry);
2923 #endif
2924         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2925 }
2926
2927 static struct pernet_operations ip6_route_net_ops = {
2928         .init = ip6_route_net_init,
2929         .exit = ip6_route_net_exit,
2930 };
2931
2932 static int __net_init ipv6_inetpeer_init(struct net *net)
2933 {
2934         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2935
2936         if (!bp)
2937                 return -ENOMEM;
2938         inet_peer_base_init(bp);
2939         net->ipv6.peers = bp;
2940         return 0;
2941 }
2942
2943 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2944 {
2945         struct inet_peer_base *bp = net->ipv6.peers;
2946
2947         net->ipv6.peers = NULL;
2948         inetpeer_invalidate_tree(bp);
2949         kfree(bp);
2950 }
2951
2952 static struct pernet_operations ipv6_inetpeer_ops = {
2953         .init   =       ipv6_inetpeer_init,
2954         .exit   =       ipv6_inetpeer_exit,
2955 };
2956
2957 static struct notifier_block ip6_route_dev_notifier = {
2958         .notifier_call = ip6_route_dev_notify,
2959         .priority = 0,
2960 };
2961
2962 int __init ip6_route_init(void)
2963 {
2964         int ret;
2965
2966         ret = -ENOMEM;
2967         ip6_dst_ops_template.kmem_cachep =
2968                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2969                                   SLAB_HWCACHE_ALIGN, NULL);
2970         if (!ip6_dst_ops_template.kmem_cachep)
2971                 goto out;
2972
2973         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2974         if (ret)
2975                 goto out_kmem_cache;
2976
2977         ret = register_pernet_subsys(&ip6_route_net_ops);
2978         if (ret)
2979                 goto out_dst_entries;
2980
2981         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
2982         if (ret)
2983                 goto out_register_subsys;
2984
2985         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2986
2987         /* Registering of the loopback is done before this portion of code,
2988          * the loopback reference in rt6_info will not be taken, do it
2989          * manually for init_net */
2990         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2991         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2992   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2993         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2994         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2995         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2996         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2997   #endif
2998         ret = fib6_init();
2999         if (ret)
3000                 goto out_register_inetpeer;
3001
3002         ret = xfrm6_init();
3003         if (ret)
3004                 goto out_fib6_init;
3005
3006         ret = fib6_rules_init();
3007         if (ret)
3008                 goto xfrm6_init;
3009
3010         ret = -ENOBUFS;
3011         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3012             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3013             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3014                 goto fib6_rules_init;
3015
3016         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3017         if (ret)
3018                 goto fib6_rules_init;
3019
3020 out:
3021         return ret;
3022
3023 fib6_rules_init:
3024         fib6_rules_cleanup();
3025 xfrm6_init:
3026         xfrm6_fini();
3027 out_fib6_init:
3028         fib6_gc_cleanup();
3029 out_register_inetpeer:
3030         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3031 out_register_subsys:
3032         unregister_pernet_subsys(&ip6_route_net_ops);
3033 out_dst_entries:
3034         dst_entries_destroy(&ip6_dst_blackhole_ops);
3035 out_kmem_cache:
3036         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3037         goto out;
3038 }
3039
3040 void ip6_route_cleanup(void)
3041 {
3042         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3043         fib6_rules_cleanup();
3044         xfrm6_fini();
3045         fib6_gc_cleanup();
3046         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3047         unregister_pernet_subsys(&ip6_route_net_ops);
3048         dst_entries_destroy(&ip6_dst_blackhole_ops);
3049         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3050 }