ipv4/ipv6: Prepare for new route gateway semantics.
[linux-3.10.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Forward declarations for functions that are referenced before their
 * definitions appear in this file: the dst_ops callbacks plugged into the
 * ops tables below, the packet discard handlers used by the route templates,
 * and the route-copy helper used when cloning routes.
 */
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
/* Route Information option helpers, used by rt6_route_rcv(). */
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (p->s6_addr32[0] | p->s6_addr32[1] |
129             p->s6_addr32[2] | p->s6_addr32[3])
130                 return (const void *) p;
131         return daddr;
132 }
133
134 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
135 {
136         struct rt6_info *rt = (struct rt6_info *) dst;
137         struct neighbour *n;
138
139         daddr = choose_neigh_daddr(rt, daddr);
140         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
141         if (n)
142                 return n;
143         return neigh_create(&nd_tbl, daddr, dst->dev);
144 }
145
146 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
147 {
148         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
149         if (!n) {
150                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
151                 if (IS_ERR(n))
152                         return PTR_ERR(n);
153         }
154         dst_set_neighbour(&rt->dst, n);
155
156         return 0;
157 }
158
159 static struct dst_ops ip6_dst_ops_template = {
160         .family                 =       AF_INET6,
161         .protocol               =       cpu_to_be16(ETH_P_IPV6),
162         .gc                     =       ip6_dst_gc,
163         .gc_thresh              =       1024,
164         .check                  =       ip6_dst_check,
165         .default_advmss         =       ip6_default_advmss,
166         .mtu                    =       ip6_mtu,
167         .cow_metrics            =       ipv6_cow_metrics,
168         .destroy                =       ip6_dst_destroy,
169         .ifdown                 =       ip6_dst_ifdown,
170         .negative_advice        =       ip6_negative_advice,
171         .link_failure           =       ip6_link_failure,
172         .update_pmtu            =       ip6_rt_update_pmtu,
173         .local_out              =       __ip6_local_out,
174         .neigh_lookup           =       ip6_neigh_lookup,
175 };
176
177 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
178 {
179         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
180
181         return mtu ? : dst->dev->mtu;
182 }
183
184 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
185 {
186 }
187
188 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
189                                          unsigned long old)
190 {
191         return NULL;
192 }
193
194 static struct dst_ops ip6_dst_blackhole_ops = {
195         .family                 =       AF_INET6,
196         .protocol               =       cpu_to_be16(ETH_P_IPV6),
197         .destroy                =       ip6_dst_destroy,
198         .check                  =       ip6_dst_check,
199         .mtu                    =       ip6_blackhole_mtu,
200         .default_advmss         =       ip6_default_advmss,
201         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
202         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
203         .neigh_lookup           =       ip6_neigh_lookup,
204 };
205
206 static const u32 ip6_template_metrics[RTAX_MAX] = {
207         [RTAX_HOPLIMIT - 1] = 255,
208 };
209
210 static struct rt6_info ip6_null_entry_template = {
211         .dst = {
212                 .__refcnt       = ATOMIC_INIT(1),
213                 .__use          = 1,
214                 .obsolete       = -1,
215                 .error          = -ENETUNREACH,
216                 .input          = ip6_pkt_discard,
217                 .output         = ip6_pkt_discard_out,
218         },
219         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
220         .rt6i_protocol  = RTPROT_KERNEL,
221         .rt6i_metric    = ~(u32) 0,
222         .rt6i_ref       = ATOMIC_INIT(1),
223 };
224
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" entry: a reject route handled by
 * ip6_pkt_prohibit{,_out} with dst error -EACCES.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" entry: a reject route whose input and
 * output handlers are both dst_discard, with dst error -EINVAL.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

#endif
261
262 /* allocate dst with ip6_dst_ops */
263 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
264                                              struct net_device *dev,
265                                              int flags)
266 {
267         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
268
269         if (rt)
270                 memset(&rt->rt6i_table, 0,
271                        sizeof(*rt) - sizeof(struct dst_entry));
272
273         return rt;
274 }
275
276 static void ip6_dst_destroy(struct dst_entry *dst)
277 {
278         struct rt6_info *rt = (struct rt6_info *)dst;
279         struct inet6_dev *idev = rt->rt6i_idev;
280         struct inet_peer *peer = rt->rt6i_peer;
281
282         if (!(rt->dst.flags & DST_HOST))
283                 dst_destroy_metrics_generic(dst);
284
285         if (idev) {
286                 rt->rt6i_idev = NULL;
287                 in6_dev_put(idev);
288         }
289         if (peer) {
290                 rt->rt6i_peer = NULL;
291                 inet_putpeer(peer);
292         }
293 }
294
295 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
296
297 static u32 rt6_peer_genid(void)
298 {
299         return atomic_read(&__rt6_peer_genid);
300 }
301
302 void rt6_bind_peer(struct rt6_info *rt, int create)
303 {
304         struct inet_peer *peer;
305
306         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
307         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
308                 inet_putpeer(peer);
309         else
310                 rt->rt6i_peer_genid = rt6_peer_genid();
311 }
312
313 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
314                            int how)
315 {
316         struct rt6_info *rt = (struct rt6_info *)dst;
317         struct inet6_dev *idev = rt->rt6i_idev;
318         struct net_device *loopback_dev =
319                 dev_net(dev)->loopback_dev;
320
321         if (dev != loopback_dev && idev && idev->dev == dev) {
322                 struct inet6_dev *loopback_idev =
323                         in6_dev_get(loopback_dev);
324                 if (loopback_idev) {
325                         rt->rt6i_idev = loopback_idev;
326                         in6_dev_put(idev);
327                 }
328         }
329 }
330
331 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
332 {
333         return (rt->rt6i_flags & RTF_EXPIRES) &&
334                 time_after(jiffies, rt->dst.expires);
335 }
336
337 static inline int rt6_need_strict(const struct in6_addr *daddr)
338 {
339         return ipv6_addr_type(daddr) &
340                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
341 }
342
343 /*
344  *      Route lookup. Any table->tb6_lock is implied.
345  */
346
/*
 * rt6_device_match - choose, among @rt and its siblings, the route that
 * fits the requested output interface or source address.
 *
 * @net:   namespace; supplies ip6_null_entry and is handed to ipv6_chk_addr()
 * @rt:    first candidate; siblings are reached through dst.rt6_next
 * @saddr: source address, consulted only when @oif is 0
 * @oif:   requested output ifindex, 0 meaning "none"
 * @flags: RT6_LOOKUP_F_* bits; only RT6_LOOKUP_F_IFACE is examined here
 *
 * With @oif set, a route whose device has that ifindex is returned at once;
 * routes on a loopback device may be remembered in 'local' as a fallback.
 * If the walk finds no direct match, 'local' is returned when set, else
 * ip6_null_entry when RT6_LOOKUP_F_IFACE is requested, else @rt.
 *
 * With @oif == 0 and a specific @saddr, the first route whose device passes
 * ipv6_chk_addr() is returned; if none does, @rt is returned.
 */
347 static inline struct rt6_info *rt6_device_match(struct net *net,
348                                                     struct rt6_info *rt,
349                                                     const struct in6_addr *saddr,
350                                                     int oif,
351                                                     int flags)
352 {
353         struct rt6_info *local = NULL;
354         struct rt6_info *sprt;
355
            /* Neither an interface nor a specific source address to filter on. */
356         if (!oif && ipv6_addr_any(saddr))
357                 goto out;
358
359         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
360                 struct net_device *dev = sprt->dst.dev;
361
362                 if (oif) {
363                         if (dev->ifindex == oif)
364                                 return sprt;
                            /*
                             * Loopback-device route: skipped when its idev does
                             * not belong to @oif and either strict interface
                             * matching is on or 'local' already points at an
                             * @oif route; otherwise it becomes the fallback.
                             */
365                         if (dev->flags & IFF_LOOPBACK) {
366                                 if (!sprt->rt6i_idev ||
367                                     sprt->rt6i_idev->dev->ifindex != oif) {
368                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
369                                                 continue;
370                                         if (local && (!oif ||
371                                                       local->rt6i_idev->dev->ifindex == oif))
372                                                 continue;
373                                 }
374                                 local = sprt;
375                         }
376                 } else {
377                         if (ipv6_chk_addr(net, saddr, dev,
378                                           flags & RT6_LOOKUP_F_IFACE))
379                                 return sprt;
380                 }
381         }
382
383         if (oif) {
384                 if (local)
385                         return local;
386
387                 if (flags & RT6_LOOKUP_F_IFACE)
388                         return net->ipv6.ip6_null_entry;
389         }
390 out:
391         return rt;
392 }
393
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for the next hop of @rt.
 *
 * If the neighbour bound to @rt is not in a NUD_VALID state and more than
 * rtr_probe_interval has elapsed since neigh->updated, send a Neighbour
 * Solicitation to the neighbour's solicited-node multicast address.
 * A NULL @rt, or a route without a bound neighbour, is a no-op.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.  The rate limit is enforced by stamping neigh->updated, so the
 * test-and-stamp is done under the neighbour's write lock: with only a read
 * lock, concurrent callers could all pass the time check and each send a
 * probe, and the store to neigh->updated would race.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	/* Cheap unlocked pre-check; re-tested under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* Drop the lock before transmitting. */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
433
434 /*
435  * Default Router Selection (RFC 2461 6.3.6)
436  */
437 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
438 {
439         struct net_device *dev = rt->dst.dev;
440         if (!oif || dev->ifindex == oif)
441                 return 2;
442         if ((dev->flags & IFF_LOOPBACK) &&
443             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
444                 return 1;
445         return 0;
446 }
447
448 static inline int rt6_check_neigh(struct rt6_info *rt)
449 {
450         struct neighbour *neigh;
451         int m;
452
453         rcu_read_lock();
454         neigh = dst_get_neighbour_noref(&rt->dst);
455         if (rt->rt6i_flags & RTF_NONEXTHOP ||
456             !(rt->rt6i_flags & RTF_GATEWAY))
457                 m = 1;
458         else if (neigh) {
459                 read_lock_bh(&neigh->lock);
460                 if (neigh->nud_state & NUD_VALID)
461                         m = 2;
462 #ifdef CONFIG_IPV6_ROUTER_PREF
463                 else if (neigh->nud_state & NUD_FAILED)
464                         m = 0;
465 #endif
466                 else
467                         m = 1;
468                 read_unlock_bh(&neigh->lock);
469         } else
470                 m = 0;
471         rcu_read_unlock();
472         return m;
473 }
474
475 static int rt6_score_route(struct rt6_info *rt, int oif,
476                            int strict)
477 {
478         int m, n;
479
480         m = rt6_check_dev(rt, oif);
481         if (!m && (strict & RT6_LOOKUP_F_IFACE))
482                 return -1;
483 #ifdef CONFIG_IPV6_ROUTER_PREF
484         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
485 #endif
486         n = rt6_check_neigh(rt);
487         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
488                 return -1;
489         return m;
490 }
491
492 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
493                                    int *mpri, struct rt6_info *match)
494 {
495         int m;
496
497         if (rt6_check_expired(rt))
498                 goto out;
499
500         m = rt6_score_route(rt, oif, strict);
501         if (m < 0)
502                 goto out;
503
504         if (m > *mpri) {
505                 if (strict & RT6_LOOKUP_F_REACHABLE)
506                         rt6_probe(match);
507                 *mpri = m;
508                 match = rt;
509         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
510                 rt6_probe(rt);
511         }
512
513 out:
514         return match;
515 }
516
517 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
518                                      struct rt6_info *rr_head,
519                                      u32 metric, int oif, int strict)
520 {
521         struct rt6_info *rt, *match;
522         int mpri = -1;
523
524         match = NULL;
525         for (rt = rr_head; rt && rt->rt6i_metric == metric;
526              rt = rt->dst.rt6_next)
527                 match = find_match(rt, oif, strict, &mpri, match);
528         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
529              rt = rt->dst.rt6_next)
530                 match = find_match(rt, oif, strict, &mpri, match);
531
532         return match;
533 }
534
535 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
536 {
537         struct rt6_info *match, *rt0;
538         struct net *net;
539
540         rt0 = fn->rr_ptr;
541         if (!rt0)
542                 fn->rr_ptr = rt0 = fn->leaf;
543
544         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
545
546         if (!match &&
547             (strict & RT6_LOOKUP_F_REACHABLE)) {
548                 struct rt6_info *next = rt0->dst.rt6_next;
549
550                 /* no entries matched; do round-robin */
551                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
552                         next = fn->leaf;
553
554                 if (next != rt0)
555                         fn->rr_ptr = next;
556         }
557
558         net = dev_net(rt0->dst.dev);
559         return match ? match : net->ipv6.ip6_null_entry;
560 }
561
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a Route Information option received on @dev.
 *
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the route's gateway
 *
 * Validates the option, then deletes the matching route when the lifetime
 * is zero, adds one when none exists, or updates the preference bits of an
 * existing one.  The expiry flag/time of the resulting route is set from
 * the lifetime.
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	struct rt6_info *rt;
	unsigned long lifetime;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	} else {
		/* A length of 3 carries the full prefix; use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->dst.expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}

	dst_release(&rt->dst);
	return 0;
}
#endif
635
/*
 * BACKTRACK - climb the fib6 tree when the current node yielded no route.
 *
 * Not self-contained: it uses the caller's local variables 'rt' and 'fn'
 * and jumps to the caller's labels 'out' and 'restart'.
 *
 * When 'rt' is @__net's ip6_null_entry, walk upwards from 'fn':
 *   - at a node flagged RTN_TL_ROOT, give up and goto out;
 *   - otherwise step to the parent, or, if the parent has a subtree other
 *     than the node we came from, to the result of looking up @saddr in
 *     that subtree;
 *   - at the first node flagged RTN_RTINFO, goto restart so the caller
 *     retries route selection there.
 * When 'rt' is anything else the macro does nothing.
 */
636 #define BACKTRACK(__net, saddr)                 \
637 do { \
638         if (rt == __net->ipv6.ip6_null_entry) { \
639                 struct fib6_node *pn; \
640                 while (1) { \
641                         if (fn->fn_flags & RTN_TL_ROOT) \
642                                 goto out; \
643                         pn = fn->parent; \
644                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
645                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
646                         else \
647                                 fn = pn; \
648                         if (fn->fn_flags & RTN_RTINFO) \
649                                 goto restart; \
650                 } \
651         } \
652 } while (0)
653
/*
 * ip6_pol_route_lookup - per-table lookup callback handed to
 * fib6_rule_lookup() by ip6_route_lookup() and rt6_lookup().
 *
 * Under table->tb6_lock (read side), finds the fib6 node for the flow's
 * destination/source, picks a route from its leaf list with
 * rt6_device_match(), and backtracks up the tree while the result is the
 * null entry.  The chosen route is passed to dst_use() before the lock is
 * dropped and is then returned.
 */
654 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
655                                              struct fib6_table *table,
656                                              struct flowi6 *fl6, int flags)
657 {
658         struct fib6_node *fn;
659         struct rt6_info *rt;
660
661         read_lock_bh(&table->tb6_lock);
662         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
            /* 'restart' and 'out' are the jump targets used by BACKTRACK(). */
663 restart:
664         rt = fn->leaf;
665         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
666         BACKTRACK(net, &fl6->saddr);
667 out:
668         dst_use(&rt->dst, jiffies);
669         read_unlock_bh(&table->tb6_lock);
670         return rt;
671
672 }
673
674 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
675                                     int flags)
676 {
677         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
678 }
679 EXPORT_SYMBOL_GPL(ip6_route_lookup);
680
681 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
682                             const struct in6_addr *saddr, int oif, int strict)
683 {
684         struct flowi6 fl6 = {
685                 .flowi6_oif = oif,
686                 .daddr = *daddr,
687         };
688         struct dst_entry *dst;
689         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
690
691         if (saddr) {
692                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
693                 flags |= RT6_LOOKUP_F_HAS_SADDR;
694         }
695
696         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
697         if (dst->error == 0)
698                 return (struct rt6_info *) dst;
699
700         dst_release(dst);
701
702         return NULL;
703 }
704
705 EXPORT_SYMBOL(rt6_lookup);
706
707 /* ip6_ins_rt is called with FREE table->tb6_lock.
708    It takes new route entry, the addition fails by any reason the
709    route is freed. In any case, if caller does not hold it, it may
710    be destroyed.
711  */
712
713 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
714 {
715         int err;
716         struct fib6_table *table;
717
718         table = rt->rt6i_table;
719         write_lock_bh(&table->tb6_lock);
720         err = fib6_add(&table->tb6_root, rt, info);
721         write_unlock_bh(&table->tb6_lock);
722
723         return err;
724 }
725
726 int ip6_ins_rt(struct rt6_info *rt)
727 {
728         struct nl_info info = {
729                 .nl_net = dev_net(rt->dst.dev),
730         };
731         return __ip6_ins_rt(rt, &info);
732 }
733
/*
 * rt6_alloc_cow - make an RTF_CACHE copy of @ort for @daddr and bind a
 * neighbour entry to it.
 *
 * @ort:   route to copy
 * @daddr: destination the copy is made for
 * @saddr: source address; with CONFIG_IPV6_SUBTREES it replaces the copy's
 *         source prefix (as a /128) when the copy has a source prefix
 *
 * For routes without RTF_GATEWAY the copy's gateway is set to @daddr, and
 * the copy is flagged RTF_ANYCAST when @daddr equals the address of a
 * non-/128 original prefix.
 *
 * If binding the neighbour fails, one forced dst garbage collection is
 * attempted (only when not in softirq context) before giving up, in which
 * case the copy is freed.
 *
 * Returns the new route, or NULL when the copy or the neighbour binding
 * failed.
 */
734 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
735                                       const struct in6_addr *daddr,
736                                       const struct in6_addr *saddr)
737 {
738         struct rt6_info *rt;
739
740         /*
741          *      Clone the route.
742          */
743
744         rt = ip6_rt_copy(ort, daddr);
745
746         if (rt) {
                    /* One GC-and-retry is allowed outside softirq, none inside. */
747                 int attempts = !in_softirq();
748
749                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
750                         if (ort->rt6i_dst.plen != 128 &&
751                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
752                                 rt->rt6i_flags |= RTF_ANYCAST;
753                         rt->rt6i_gateway = *daddr;
754                 }
755
756                 rt->rt6i_flags |= RTF_CACHE;
757
758 #ifdef CONFIG_IPV6_SUBTREES
759                 if (rt->rt6i_src.plen && saddr) {
760                         rt->rt6i_src.addr = *saddr;
761                         rt->rt6i_src.plen = 128;
762                 }
763 #endif
764
765         retry:
766                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
767                         struct net *net = dev_net(rt->dst.dev);
768                         int saved_rt_min_interval =
769                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
770                         int saved_rt_elasticity =
771                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
772
773                         if (attempts-- > 0) {
                                    /*
                                     * Temporarily override the GC sysctls for
                                     * this one ip6_dst_gc() call, then restore
                                     * the saved values and retry the binding.
                                     */
774                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
775                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
776
777                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
778
779                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
780                                         saved_rt_elasticity;
781                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
782                                         saved_rt_min_interval;
783                                 goto retry;
784                         }
785
786                         if (net_ratelimit())
787                                 printk(KERN_WARNING
788                                        "ipv6: Neighbour table overflow.\n");
789                         dst_free(&rt->dst);
790                         return NULL;
791                 }
792         }
793
794         return rt;
795 }
796
797 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
798                                         const struct in6_addr *daddr)
799 {
800         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
801
802         if (rt) {
803                 rt->rt6i_flags |= RTF_CACHE;
804                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
805         }
806         return rt;
807 }
808
/*
 * ip6_pol_route - per-table route resolution shared by the input and
 * output paths.
 *
 * @net:   network namespace
 * @table: FIB table to search
 * @oif:   interface index to match (callers pass the flow's iif or oif)
 * @fl6:   flow being routed
 * @flags: RT6_LOOKUP_F_* bits; RT6_LOOKUP_F_IFACE is carried into 'strict'
 *
 * Flow:
 *  1. Under the table read lock, select a route with rt6_select(),
 *     backtracking up the tree while the result is the null entry.  The
 *     first pass demands reachability unless forwarding is enabled in
 *     devconf_all; if that pass ends at the null entry or a cached route,
 *     the lookup is redone once without the reachability requirement.
 *  2. A null entry or an RTF_CACHE route is returned as is (with a
 *     reference taken).
 *  3. Otherwise the lock is dropped and a cache entry is created:
 *     rt6_alloc_cow() when the route has no neighbour bound and is not
 *     RTF_NONEXTHOP, rt6_alloc_clone() when it is not a DST_HOST route;
 *     any other route is returned directly.
 *  4. The new entry is inserted with ip6_ins_rt().  If allocation or
 *     insertion fails the whole lookup is repeated, up to 'attempts' times
 *     in total.
 *
 * The returned route always has a reference held for the caller and has its
 * lastuse/__use counters updated.
 */
809 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
810                                       struct flowi6 *fl6, int flags)
811 {
812         struct fib6_node *fn;
813         struct rt6_info *rt, *nrt;
814         int strict = 0;
815         int attempts = 3;
816         int err;
817         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
818
819         strict |= flags & RT6_LOOKUP_F_IFACE;
820
821 relookup:
822         read_lock_bh(&table->tb6_lock);
823
824 restart_2:
825         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
826
            /* 'restart' and 'out' are also the jump targets of BACKTRACK(). */
827 restart:
828         rt = rt6_select(fn, oif, strict | reachable);
829
830         BACKTRACK(net, &fl6->saddr);
831         if (rt == net->ipv6.ip6_null_entry ||
832             rt->rt6i_flags & RTF_CACHE)
833                 goto out;
834
            /* Pin the route before dropping the lock to build a cache entry. */
835         dst_hold(&rt->dst);
836         read_unlock_bh(&table->tb6_lock);
837
838         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
839                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
840         else if (!(rt->dst.flags & DST_HOST))
841                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
842         else
843                 goto out2;
844
            /* Swap the pinned original for the new entry (or the null entry). */
845         dst_release(&rt->dst);
846         rt = nrt ? : net->ipv6.ip6_null_entry;
847
848         dst_hold(&rt->dst);
849         if (nrt) {
850                 err = ip6_ins_rt(nrt);
851                 if (!err)
852                         goto out2;
853         }
854
855         if (--attempts <= 0)
856                 goto out2;
857
858         /*
859          * Race condition! In the gap, when table->tb6_lock was
860          * released someone could insert this route.  Relookup.
861          */
862         dst_release(&rt->dst);
863         goto relookup;
864
865 out:
            /* Reached with the table read lock still held. */
866         if (reachable) {
867                 reachable = 0;
868                 goto restart_2;
869         }
870         dst_hold(&rt->dst);
871         read_unlock_bh(&table->tb6_lock);
872 out2:
873         rt->dst.lastuse = jiffies;
874         rt->dst.__use++;
875
876         return rt;
877 }
878
879 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
880                                             struct flowi6 *fl6, int flags)
881 {
882         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
883 }
884
885 void ip6_route_input(struct sk_buff *skb)
886 {
887         const struct ipv6hdr *iph = ipv6_hdr(skb);
888         struct net *net = dev_net(skb->dev);
889         int flags = RT6_LOOKUP_F_HAS_SADDR;
890         struct flowi6 fl6 = {
891                 .flowi6_iif = skb->dev->ifindex,
892                 .daddr = iph->daddr,
893                 .saddr = iph->saddr,
894                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
895                 .flowi6_mark = skb->mark,
896                 .flowi6_proto = iph->nexthdr,
897         };
898
899         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
900                 flags |= RT6_LOOKUP_F_IFACE;
901
902         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
903 }
904
905 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
906                                              struct flowi6 *fl6, int flags)
907 {
908         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
909 }
910
911 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
912                                     struct flowi6 *fl6)
913 {
914         int flags = 0;
915
916         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
917                 flags |= RT6_LOOKUP_F_IFACE;
918
919         if (!ipv6_addr_any(&fl6->saddr))
920                 flags |= RT6_LOOKUP_F_HAS_SADDR;
921         else if (sk)
922                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
923
924         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
925 }
926
927 EXPORT_SYMBOL(ip6_route_output);
928
929 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
930 {
931         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
932         struct dst_entry *new = NULL;
933
934         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
935         if (rt) {
936                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
937
938                 new = &rt->dst;
939
940                 new->__use = 1;
941                 new->input = dst_discard;
942                 new->output = dst_discard;
943
944                 if (dst_metrics_read_only(&ort->dst))
945                         new->_metrics = ort->dst._metrics;
946                 else
947                         dst_copy_metrics(new, &ort->dst);
948                 rt->rt6i_idev = ort->rt6i_idev;
949                 if (rt->rt6i_idev)
950                         in6_dev_hold(rt->rt6i_idev);
951                 rt->dst.expires = 0;
952
953                 rt->rt6i_gateway = ort->rt6i_gateway;
954                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
955                 rt->rt6i_metric = 0;
956
957                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
958 #ifdef CONFIG_IPV6_SUBTREES
959                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
960 #endif
961
962                 dst_free(new);
963         }
964
965         dst_release(dst_orig);
966         return new ? new : ERR_PTR(-ENOMEM);
967 }
968
969 /*
970  *      Destination cache support functions
971  */
972
973 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
974 {
975         struct rt6_info *rt;
976
977         rt = (struct rt6_info *) dst;
978
979         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
980                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
981                         if (!rt->rt6i_peer)
982                                 rt6_bind_peer(rt, 0);
983                         rt->rt6i_peer_genid = rt6_peer_genid();
984                 }
985                 return dst;
986         }
987         return NULL;
988 }
989
990 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
991 {
992         struct rt6_info *rt = (struct rt6_info *) dst;
993
994         if (rt) {
995                 if (rt->rt6i_flags & RTF_CACHE) {
996                         if (rt6_check_expired(rt)) {
997                                 ip6_del_rt(rt);
998                                 dst = NULL;
999                         }
1000                 } else {
1001                         dst_release(dst);
1002                         dst = NULL;
1003                 }
1004         }
1005         return dst;
1006 }
1007
1008 static void ip6_link_failure(struct sk_buff *skb)
1009 {
1010         struct rt6_info *rt;
1011
1012         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1013
1014         rt = (struct rt6_info *) skb_dst(skb);
1015         if (rt) {
1016                 if (rt->rt6i_flags & RTF_CACHE) {
1017                         dst_set_expires(&rt->dst, 0);
1018                         rt->rt6i_flags |= RTF_EXPIRES;
1019                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1020                         rt->rt6i_node->fn_sernum = -1;
1021         }
1022 }
1023
1024 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1025 {
1026         struct rt6_info *rt6 = (struct rt6_info*)dst;
1027
1028         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1029                 rt6->rt6i_flags |= RTF_MODIFIED;
1030                 if (mtu < IPV6_MIN_MTU) {
1031                         u32 features = dst_metric(dst, RTAX_FEATURES);
1032                         mtu = IPV6_MIN_MTU;
1033                         features |= RTAX_FEATURE_ALLFRAG;
1034                         dst_metric_set(dst, RTAX_FEATURES, features);
1035                 }
1036                 dst_metric_set(dst, RTAX_MTU, mtu);
1037         }
1038 }
1039
1040 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1041 {
1042         struct net_device *dev = dst->dev;
1043         unsigned int mtu = dst_mtu(dst);
1044         struct net *net = dev_net(dev);
1045
1046         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1047
1048         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1049                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1050
1051         /*
1052          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1053          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1054          * IPV6_MAXPLEN is also valid and means: "any MSS,
1055          * rely only on pmtu discovery"
1056          */
1057         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1058                 mtu = IPV6_MAXPLEN;
1059         return mtu;
1060 }
1061
1062 static unsigned int ip6_mtu(const struct dst_entry *dst)
1063 {
1064         struct inet6_dev *idev;
1065         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1066
1067         if (mtu)
1068                 return mtu;
1069
1070         mtu = IPV6_MIN_MTU;
1071
1072         rcu_read_lock();
1073         idev = __in6_dev_get(dst->dev);
1074         if (idev)
1075                 mtu = idev->cnf.mtu6;
1076         rcu_read_unlock();
1077
1078         return mtu;
1079 }
1080
/*
 * Head of a singly linked list, chained through dst->next, of the dst
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() walk it and free entries from it.
 */
1081 static struct dst_entry *icmp6_dst_gc_list;
/* Held (with bottom halves disabled) around the list accesses in the functions above. */
1082 static DEFINE_SPINLOCK(icmp6_dst_lock);
1083
1084 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1085                                   struct neighbour *neigh,
1086                                   struct flowi6 *fl6)
1087 {
1088         struct dst_entry *dst;
1089         struct rt6_info *rt;
1090         struct inet6_dev *idev = in6_dev_get(dev);
1091         struct net *net = dev_net(dev);
1092
1093         if (unlikely(!idev))
1094                 return NULL;
1095
1096         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1097         if (unlikely(!rt)) {
1098                 in6_dev_put(idev);
1099                 dst = ERR_PTR(-ENOMEM);
1100                 goto out;
1101         }
1102
1103         if (neigh)
1104                 neigh_hold(neigh);
1105         else {
1106                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1107                 if (IS_ERR(neigh)) {
1108                         in6_dev_put(idev);
1109                         dst_free(&rt->dst);
1110                         return ERR_CAST(neigh);
1111                 }
1112         }
1113
1114         rt->dst.flags |= DST_HOST;
1115         rt->dst.output  = ip6_output;
1116         dst_set_neighbour(&rt->dst, neigh);
1117         atomic_set(&rt->dst.__refcnt, 1);
1118         rt->rt6i_dst.addr = fl6->daddr;
1119         rt->rt6i_dst.plen = 128;
1120         rt->rt6i_idev     = idev;
1121         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1122
1123         spin_lock_bh(&icmp6_dst_lock);
1124         rt->dst.next = icmp6_dst_gc_list;
1125         icmp6_dst_gc_list = &rt->dst;
1126         spin_unlock_bh(&icmp6_dst_lock);
1127
1128         fib6_force_start_gc(net);
1129
1130         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1131
1132 out:
1133         return dst;
1134 }
1135
1136 int icmp6_dst_gc(void)
1137 {
1138         struct dst_entry *dst, **pprev;
1139         int more = 0;
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         pprev = &icmp6_dst_gc_list;
1143
1144         while ((dst = *pprev) != NULL) {
1145                 if (!atomic_read(&dst->__refcnt)) {
1146                         *pprev = dst->next;
1147                         dst_free(dst);
1148                 } else {
1149                         pprev = &dst->next;
1150                         ++more;
1151                 }
1152         }
1153
1154         spin_unlock_bh(&icmp6_dst_lock);
1155
1156         return more;
1157 }
1158
1159 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1160                             void *arg)
1161 {
1162         struct dst_entry *dst, **pprev;
1163
1164         spin_lock_bh(&icmp6_dst_lock);
1165         pprev = &icmp6_dst_gc_list;
1166         while ((dst = *pprev) != NULL) {
1167                 struct rt6_info *rt = (struct rt6_info *) dst;
1168                 if (func(rt, arg)) {
1169                         *pprev = dst->next;
1170                         dst_free(dst);
1171                 } else {
1172                         pprev = &dst->next;
1173                 }
1174         }
1175         spin_unlock_bh(&icmp6_dst_lock);
1176 }
1177
1178 static int ip6_dst_gc(struct dst_ops *ops)
1179 {
1180         unsigned long now = jiffies;
1181         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1182         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1183         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1184         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1185         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1186         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1187         int entries;
1188
1189         entries = dst_entries_get_fast(ops);
1190         if (time_after(rt_last_gc + rt_min_interval, now) &&
1191             entries <= rt_max_size)
1192                 goto out;
1193
1194         net->ipv6.ip6_rt_gc_expire++;
1195         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1196         net->ipv6.ip6_rt_last_gc = now;
1197         entries = dst_entries_get_slow(ops);
1198         if (entries < ops->gc_thresh)
1199                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1200 out:
1201         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1202         return entries > rt_max_size;
1203 }
1204
1205 /* Clean host part of a prefix. Not necessary in radix tree,
1206    but results in cleaner routing tables.
1207
1208    Remove it only when all the things will work!
1209  */
1210
1211 int ip6_dst_hoplimit(struct dst_entry *dst)
1212 {
1213         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1214         if (hoplimit == 0) {
1215                 struct net_device *dev = dst->dev;
1216                 struct inet6_dev *idev;
1217
1218                 rcu_read_lock();
1219                 idev = __in6_dev_get(dev);
1220                 if (idev)
1221                         hoplimit = idev->cnf.hop_limit;
1222                 else
1223                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1224                 rcu_read_unlock();
1225         }
1226         return hoplimit;
1227 }
1228 EXPORT_SYMBOL(ip6_dst_hoplimit);
1229
1230 /*
1231  *
1232  */
1233
/*
 * ip6_route_add - build a route from @cfg and insert it into its FIB table.
 * @cfg: route description (prefix, gateway, interface, flags, metrics,
 *       table id and netlink info).  fc_metric, fc_protocol and
 *       fc_nlinfo.nl_net may be rewritten by this function.
 *
 * Steps, in order:
 *   1. validate prefix lengths and resolve fc_ifindex to dev/idev;
 *   2. pick (or create) the FIB table;
 *   3. allocate the rt6_info and fill in expiry, protocol, handlers,
 *      destination/source keys and metric;
 *   4. turn reject routes, and non-local routes via a loopback device,
 *      into reject routes bound to the namespace's loopback device;
 *   5. validate the gateway, the preferred source and bind a neighbour;
 *   6. apply the netlink metrics and insert through __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() once the route is fully built,
 * or a negative errno from one of the checks above.  On those early
 * error paths the dev/idev references taken here are dropped and the
 * route is freed.
 */
1234 int ip6_route_add(struct fib6_config *cfg)
1235 {
1236         int err;
1237         struct net *net = cfg->fc_nlinfo.nl_net;
1238         struct rt6_info *rt = NULL;
1239         struct net_device *dev = NULL;
1240         struct inet6_dev *idev = NULL;
1241         struct fib6_table *table;
1242         int addr_type;
1243
1244         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1245                 return -EINVAL;
             /* Source-specific routes need subtree support. */
1246 #ifndef CONFIG_IPV6_SUBTREES
1247         if (cfg->fc_src_len)
1248                 return -EINVAL;
1249 #endif
             /* An explicit interface takes a reference on both dev and its inet6_dev. */
1250         if (cfg->fc_ifindex) {
1251                 err = -ENODEV;
1252                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1253                 if (!dev)
1254                         goto out;
1255                 idev = in6_dev_get(dev);
1256                 if (!idev)
1257                         goto out;
1258         }
1259
1260         if (cfg->fc_metric == 0)
1261                 cfg->fc_metric = IP6_RT_PRIO_USER;
1262
             /*
              * A netlink request without NLM_F_CREATE first tries an existing
              * table; if there is none, a warning is logged and the table is
              * created anyway.  All other requests create the table on demand.
              */
1263         err = -ENOBUFS;
1264         if (cfg->fc_nlinfo.nlh &&
1265             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1266                 table = fib6_get_table(net, cfg->fc_table);
1267                 if (!table) {
1268                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1269                         table = fib6_new_table(net, cfg->fc_table);
1270                 }
1271         } else {
1272                 table = fib6_new_table(net, cfg->fc_table);
1273         }
1274
1275         if (!table)
1276                 goto out;
1277
1278         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1279
1280         if (!rt) {
1281                 err = -ENOMEM;
1282                 goto out;
1283         }
1284
1285         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units; 0 means the route never expires. */
1286         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1287                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1288                                 0;
1289
1290         if (cfg->fc_protocol == RTPROT_UNSPEC)
1291                 cfg->fc_protocol = RTPROT_BOOT;
1292         rt->rt6i_protocol = cfg->fc_protocol;
1293
1294         addr_type = ipv6_addr_type(&cfg->fc_dst);
1295
             /* Input handler: multicast, local delivery, or forwarding. */
1296         if (addr_type & IPV6_ADDR_MULTICAST)
1297                 rt->dst.input = ip6_mc_input;
1298         else if (cfg->fc_flags & RTF_LOCAL)
1299                 rt->dst.input = ip6_input;
1300         else
1301                 rt->dst.input = ip6_forward;
1302
1303         rt->dst.output = ip6_output;
1304
             /* Store only the prefix bits of the destination; a /128 is a host route. */
1305         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1306         rt->rt6i_dst.plen = cfg->fc_dst_len;
1307         if (rt->rt6i_dst.plen == 128)
1308                rt->dst.flags |= DST_HOST;
1309
             /*
              * Non-host routes that carry netlink metrics get a private,
              * zeroed metrics array; it is filled in at install_route.
              */
1310         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1311                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1312                 if (!metrics) {
1313                         err = -ENOMEM;
1314                         goto out;
1315                 }
1316                 dst_init_metrics(&rt->dst, metrics, 0);
1317         }
1318 #ifdef CONFIG_IPV6_SUBTREES
1319         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1320         rt->rt6i_src.plen = cfg->fc_src_len;
1321 #endif
1322
1323         rt->rt6i_metric = cfg->fc_metric;
1324
1325         /* We cannot add true routes via loopback here,
1326            they would result in kernel looping; promote them to reject routes
1327          */
1328         if ((cfg->fc_flags & RTF_REJECT) ||
1329             (dev && (dev->flags & IFF_LOOPBACK) &&
1330              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1331              !(cfg->fc_flags & RTF_LOCAL))) {
1332                 /* hold loopback dev/idev if we haven't done so. */
1333                 if (dev != net->loopback_dev) {
                             /* Swap any references held on the requested device for the loopback device. */
1334                         if (dev) {
1335                                 dev_put(dev);
1336                                 in6_dev_put(idev);
1337                         }
1338                         dev = net->loopback_dev;
1339                         dev_hold(dev);
1340                         idev = in6_dev_get(dev);
1341                         if (!idev) {
1342                                 err = -ENODEV;
1343                                 goto out;
1344                         }
1345                 }
1346                 rt->dst.output = ip6_pkt_discard_out;
1347                 rt->dst.input = ip6_pkt_discard;
1348                 rt->dst.error = -ENETUNREACH;
1349                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1350                 goto install_route;
1351         }
1352
1353         if (cfg->fc_flags & RTF_GATEWAY) {
1354                 const struct in6_addr *gw_addr;
1355                 int gwa_type;
1356
1357                 gw_addr = &cfg->fc_gateway;
1358                 rt->rt6i_gateway = *gw_addr;
1359                 gwa_type = ipv6_addr_type(gw_addr);
1360
1361                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1362                         struct rt6_info *grt;
1363
1364                         /* IPv6 strictly inhibits using not link-local
1365                            addresses as nexthop address.
1366                            Otherwise, router will not able to send redirects.
1367                            It is very good, but in some (rare!) circumstances
1368                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1369                            some exceptions. --ANK
1370                          */
1371                         err = -EINVAL;
1372                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1373                                 goto out;
1374
                             /* The non-link-local gateway must itself be reachable via an existing route. */
1375                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1376
1377                         err = -EHOSTUNREACH;
1378                         if (!grt)
1379                                 goto out;
1380                         if (dev) {
                                     /* A requested device must match the device of the route to the gateway. */
1381                                 if (dev != grt->dst.dev) {
1382                                         dst_release(&grt->dst);
1383                                         goto out;
1384                                 }
1385                         } else {
                                     /* No device requested: adopt (and hold) the one used to reach the gateway. */
1386                                 dev = grt->dst.dev;
1387                                 idev = grt->rt6i_idev;
1388                                 dev_hold(dev);
1389                                 in6_dev_hold(grt->rt6i_idev);
1390                         }
                             /* Accept only if the route to the gateway is not itself a gateway route. */
1391                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1392                                 err = 0;
1393                         dst_release(&grt->dst);
1394
1395                         if (err)
1396                                 goto out;
1397                 }
1398                 err = -EINVAL;
1399                 if (!dev || (dev->flags & IFF_LOOPBACK))
1400                         goto out;
1401         }
1402
1403         err = -ENODEV;
1404         if (!dev)
1405                 goto out;
1406
             /* A preferred source address must pass ipv6_chk_addr() for the output device. */
1407         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1408                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1409                         err = -EINVAL;
1410                         goto out;
1411                 }
1412                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1413                 rt->rt6i_prefsrc.plen = 128;
1414         } else
1415                 rt->rt6i_prefsrc.plen = 0;
1416
1417         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1418                 err = rt6_bind_neighbour(rt, dev);
1419                 if (err)
1420                         goto out;
1421         }
1422
1423         rt->rt6i_flags = cfg->fc_flags;
1424
1425 install_route:
             /* Copy the netlink metric attributes; type 0 is skipped, types above RTAX_MAX are rejected. */
1426         if (cfg->fc_mx) {
1427                 struct nlattr *nla;
1428                 int remaining;
1429
1430                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1431                         int type = nla_type(nla);
1432
1433                         if (type) {
1434                                 if (type > RTAX_MAX) {
1435                                         err = -EINVAL;
1436                                         goto out;
1437                                 }
1438
1439                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1440                         }
1441                 }
1442         }
1443
             /*
              * The dev/idev references held above are not dropped on this
              * path; the pointers are stored in the route instead.
              */
1444         rt->dst.dev = dev;
1445         rt->rt6i_idev = idev;
1446         rt->rt6i_table = table;
1447
1448         cfg->fc_nlinfo.nl_net = dev_net(dev);
1449
1450         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1451
             /* Error exit: undo whatever was acquired so far. */
1452 out:
1453         if (dev)
1454                 dev_put(dev);
1455         if (idev)
1456                 in6_dev_put(idev);
1457         if (rt)
1458                 dst_free(&rt->dst);
1459         return err;
1460 }
1461
1462 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1463 {
1464         int err;
1465         struct fib6_table *table;
1466         struct net *net = dev_net(rt->dst.dev);
1467
1468         if (rt == net->ipv6.ip6_null_entry)
1469                 return -ENOENT;
1470
1471         table = rt->rt6i_table;
1472         write_lock_bh(&table->tb6_lock);
1473
1474         err = fib6_del(rt, info);
1475         dst_release(&rt->dst);
1476
1477         write_unlock_bh(&table->tb6_lock);
1478
1479         return err;
1480 }
1481
1482 int ip6_del_rt(struct rt6_info *rt)
1483 {
1484         struct nl_info info = {
1485                 .nl_net = dev_net(rt->dst.dev),
1486         };
1487         return __ip6_del_rt(rt, &info);
1488 }
1489
1490 static int ip6_route_del(struct fib6_config *cfg)
1491 {
1492         struct fib6_table *table;
1493         struct fib6_node *fn;
1494         struct rt6_info *rt;
1495         int err = -ESRCH;
1496
1497         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1498         if (!table)
1499                 return err;
1500
1501         read_lock_bh(&table->tb6_lock);
1502
1503         fn = fib6_locate(&table->tb6_root,
1504                          &cfg->fc_dst, cfg->fc_dst_len,
1505                          &cfg->fc_src, cfg->fc_src_len);
1506
1507         if (fn) {
1508                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1509                         if (cfg->fc_ifindex &&
1510                             (!rt->dst.dev ||
1511                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1512                                 continue;
1513                         if (cfg->fc_flags & RTF_GATEWAY &&
1514                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1515                                 continue;
1516                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1517                                 continue;
1518                         dst_hold(&rt->dst);
1519                         read_unlock_bh(&table->tb6_lock);
1520
1521                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1522                 }
1523         }
1524         read_unlock_bh(&table->tb6_lock);
1525
1526         return err;
1527 }
1528
1529 /*
1530  *      Handle redirects
1531  */
/*
 * Flow key extended with the address of the router that sent a redirect.
 * ip6_route_redirect() passes &fl6 to fib6_rule_lookup() and
 * __ip6_route_redirect() casts that pointer back to this structure,
 * so fl6 has to remain the first member.
 */
1532 struct ip6rd_flowi {
1533         struct flowi6 fl6;
             /* Compared against rt6i_gateway of each candidate route. */
1534         struct in6_addr gateway;
1535 };
1536
1537 static struct rt6_info *__ip6_route_redirect(struct net *net,
1538                                              struct fib6_table *table,
1539                                              struct flowi6 *fl6,
1540                                              int flags)
1541 {
1542         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1543         struct rt6_info *rt;
1544         struct fib6_node *fn;
1545
1546         /*
1547          * Get the "current" route for this destination and
1548          * check if the redirect has come from approriate router.
1549          *
1550          * RFC 2461 specifies that redirects should only be
1551          * accepted if they come from the nexthop to the target.
1552          * Due to the way the routes are chosen, this notion
1553          * is a bit fuzzy and one might need to check all possible
1554          * routes.
1555          */
1556
1557         read_lock_bh(&table->tb6_lock);
1558         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1559 restart:
1560         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1561                 /*
1562                  * Current route is on-link; redirect is always invalid.
1563                  *
1564                  * Seems, previous statement is not true. It could
1565                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1566                  * But then router serving it might decide, that we should
1567                  * know truth 8)8) --ANK (980726).
1568                  */
1569                 if (rt6_check_expired(rt))
1570                         continue;
1571                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1572                         continue;
1573                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1574                         continue;
1575                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1576                         continue;
1577                 break;
1578         }
1579
1580         if (!rt)
1581                 rt = net->ipv6.ip6_null_entry;
1582         BACKTRACK(net, &fl6->saddr);
1583 out:
1584         dst_hold(&rt->dst);
1585
1586         read_unlock_bh(&table->tb6_lock);
1587
1588         return rt;
1589 };
1590
1591 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1592                                            const struct in6_addr *src,
1593                                            const struct in6_addr *gateway,
1594                                            struct net_device *dev)
1595 {
1596         int flags = RT6_LOOKUP_F_HAS_SADDR;
1597         struct net *net = dev_net(dev);
1598         struct ip6rd_flowi rdfl = {
1599                 .fl6 = {
1600                         .flowi6_oif = dev->ifindex,
1601                         .daddr = *dest,
1602                         .saddr = *src,
1603                 },
1604         };
1605
1606         rdfl.gateway = *gateway;
1607
1608         if (rt6_need_strict(dest))
1609                 flags |= RT6_LOOKUP_F_IFACE;
1610
1611         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1612                                                    flags, __ip6_route_redirect);
1613 }
1614
/*
 * rt6_redirect - process an accepted-looking redirect for @dest.
 * @dest:    destination the redirect is about
 * @src:     source address used for the route lookup
 * @saddr:   address of the router that sent the redirect; it is passed
 *           as the gateway to match in ip6_route_redirect()
 * @neigh:   neighbour entry of the new next hop
 * @lladdr:  link-layer address to record for @neigh
 * @on_link: non-zero if the target itself is the new next hop, i.e. the
 *           new route must not be flagged RTF_GATEWAY
 *
 * If the current route for @dest does go through @saddr, the neighbour
 * entry is updated and a new RTF_CACHE|RTF_DYNAMIC host route through
 * @neigh is inserted, listeners are told via NETEVENT_REDIRECT, and an
 * old cache route is deleted.  Otherwise the redirect is ignored.
 */
1615 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1616                   const struct in6_addr *saddr,
1617                   struct neighbour *neigh, u8 *lladdr, int on_link)
1618 {
1619         struct rt6_info *rt, *nrt = NULL;
1620         struct netevent_redirect netevent;
1621         struct net *net = dev_net(neigh->dev);
1622
             /* Returns a held route; the null entry means no route goes through the sender. */
1623         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1624
1625         if (rt == net->ipv6.ip6_null_entry) {
1626                 if (net_ratelimit())
1627                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1628                                "for redirect target\n");
1629                 goto out;
1630         }
1631
1632         /*
1633          *      We have finally decided to accept it.
1634          */
1635
             /* The router flags are only forced when the new next hop is a router (!on_link). */
1636         neigh_update(neigh, lladdr, NUD_STALE,
1637                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1638                      NEIGH_UPDATE_F_OVERRIDE|
1639                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1640                                      NEIGH_UPDATE_F_ISROUTER))
1641                      );
1642
1643         /*
1644          * Redirect received -> path was valid.
1645          * Look, redirects are sent only in response to data packets,
1646          * so that this nexthop apparently is reachable. --ANK
1647          */
1648         dst_confirm(&rt->dst);
1649
1650         /* Duplicate redirect: silently ignore. */
1651         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1652                 goto out;
1653
1654         nrt = ip6_rt_copy(rt, dest);
1655         if (!nrt)
1656                 goto out;
1657
1658         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1659         if (on_link)
1660                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1661
             /* The new next hop is the neighbour itself; the route takes its own neighbour reference. */
1662         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1663         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1664
1665         if (ip6_ins_rt(nrt))
1666                 goto out;
1667
1668         netevent.old = &rt->dst;
1669         netevent.new = &nrt->dst;
1670         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1671
             /*
              * ip6_del_rt() drops a reference on rt itself (see __ip6_del_rt()),
              * so this path returns without the dst_release() at out.
              */
1672         if (rt->rt6i_flags & RTF_CACHE) {
1673                 ip6_del_rt(rt);
1674                 return;
1675         }
1676
1677 out:
1678         dst_release(&rt->dst);
1679 }
1680
1681 /*
1682  *      Handle ICMP "packet too big" messages
1683  *      i.e. Path MTU discovery
1684  */
1685
/*
 * rt6_do_pmtu_disc - record a smaller path MTU for a destination.
 * @daddr:   destination the Packet Too Big message refers to
 * @saddr:   source address used for the route lookup
 * @net:     network namespace
 * @pmtu:    MTU reported by the message
 * @ifindex: interface index for the lookup (0 for any)
 *
 * Nothing happens when no route is found or @pmtu is not below the
 * route's current MTU.  Values below IPV6_MIN_MTU are raised to
 * IPV6_MIN_MTU and the route is marked RTAX_FEATURE_ALLFRAG.  An
 * existing cache route is updated in place; otherwise a new host route
 * carrying the MTU is created and inserted.  In both cases the entry
 * expires after the ip6_rt_mtu_expires sysctl interval.
 */
1686 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1687                              struct net *net, u32 pmtu, int ifindex)
1688 {
1689         struct rt6_info *rt, *nrt;
1690         int allfrag = 0;
1691 again:
1692         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1693         if (!rt)
1694                 return;
1695
             /* An expired route is deleted and the lookup is retried. */
1696         if (rt6_check_expired(rt)) {
1697                 ip6_del_rt(rt);
1698                 goto again;
1699         }
1700
             /* Only ever lower the MTU. */
1701         if (pmtu >= dst_mtu(&rt->dst))
1702                 goto out;
1703
1704         if (pmtu < IPV6_MIN_MTU) {
1705                 /*
1706                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1707                  * MTU (1280) and a fragment header should always be included
1708                  * after a node receiving Too Big message reporting PMTU is
1709                  * less than the IPv6 Minimum Link MTU.
1710                  */
1711                 pmtu = IPV6_MIN_MTU;
1712                 allfrag = 1;
1713         }
1714
1715         /* New mtu received -> path was valid.
1716            They are sent only in response to data packets,
1717            so that this nexthop apparently is reachable. --ANK
1718          */
1719         dst_confirm(&rt->dst);
1720
1721         /* Host route. If it is static, it would be better
1722            not to override it, but add new one, so that
1723            when cache entry will expire old pmtu
1724            would return automatically.
1725          */
             /* Cache entries are updated in place. */
1726         if (rt->rt6i_flags & RTF_CACHE) {
1727                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1728                 if (allfrag) {
1729                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1730                         features |= RTAX_FEATURE_ALLFRAG;
1731                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1732                 }
1733                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1734                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1735                 goto out;
1736         }
1737
1738         /* Network route.
1739            Two cases are possible:
1740            1. It is connected route. Action: COW
1741            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1742          */
             /* No neighbour bound and not NONEXTHOP selects the COW case. */
1743         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1744                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1745         else
1746                 nrt = rt6_alloc_clone(rt, daddr);
1747
             /* If the allocation failed, the update is silently dropped. */
1748         if (nrt) {
1749                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1750                 if (allfrag) {
1751                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1752                         features |= RTAX_FEATURE_ALLFRAG;
1753                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1754                 }
1755
1756                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1757                  * happened within 5 mins, the recommended timer is 10 mins.
1758                  * Here this route expiration time is set to ip6_rt_mtu_expires
1759                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1760                  * and detecting PMTU increase will be automatically happened.
1761                  */
1762                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1763                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1764
1765                 ip6_ins_rt(nrt);
1766         }
             /* Drop the reference on the route returned by rt6_lookup(). */
1767 out:
1768         dst_release(&rt->dst);
1769 }
1770
1771 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1772                         struct net_device *dev, u32 pmtu)
1773 {
1774         struct net *net = dev_net(dev);
1775
1776         /*
1777          * RFC 1981 states that a node "MUST reduce the size of the packets it
1778          * is sending along the path" that caused the Packet Too Big message.
1779          * Since it's not possible in the general case to determine which
1780          * interface was used to send the original packet, we update the MTU
1781          * on the interface that will be used to send future packets. We also
1782          * update the MTU on the interface that received the Packet Too Big in
1783          * case the original packet was forced out that interface with
1784          * SO_BINDTODEVICE or similar. This is the next best thing to the
1785          * correct behaviour, which would be to update the MTU on all
1786          * interfaces.
1787          */
1788         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1789         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1790 }
1791
1792 /*
1793  *      Misc support functions
1794  */
1795
1796 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1797                                     const struct in6_addr *dest)
1798 {
1799         struct net *net = dev_net(ort->dst.dev);
1800         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1801                                             ort->dst.dev, 0);
1802
1803         if (rt) {
1804                 rt->dst.input = ort->dst.input;
1805                 rt->dst.output = ort->dst.output;
1806                 rt->dst.flags |= DST_HOST;
1807
1808                 rt->rt6i_dst.addr = *dest;
1809                 rt->rt6i_dst.plen = 128;
1810                 dst_copy_metrics(&rt->dst, &ort->dst);
1811                 rt->dst.error = ort->dst.error;
1812                 rt->rt6i_idev = ort->rt6i_idev;
1813                 if (rt->rt6i_idev)
1814                         in6_dev_hold(rt->rt6i_idev);
1815                 rt->dst.lastuse = jiffies;
1816                 rt->dst.expires = 0;
1817
1818                 rt->rt6i_gateway = ort->rt6i_gateway;
1819                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1820                 rt->rt6i_metric = 0;
1821
1822 #ifdef CONFIG_IPV6_SUBTREES
1823                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1824 #endif
1825                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1826                 rt->rt6i_table = ort->rt6i_table;
1827         }
1828         return rt;
1829 }
1830
1831 #ifdef CONFIG_IPV6_ROUTE_INFO
1832 static struct rt6_info *rt6_get_route_info(struct net *net,
1833                                            const struct in6_addr *prefix, int prefixlen,
1834                                            const struct in6_addr *gwaddr, int ifindex)
1835 {
1836         struct fib6_node *fn;
1837         struct rt6_info *rt = NULL;
1838         struct fib6_table *table;
1839
1840         table = fib6_get_table(net, RT6_TABLE_INFO);
1841         if (!table)
1842                 return NULL;
1843
1844         write_lock_bh(&table->tb6_lock);
1845         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1846         if (!fn)
1847                 goto out;
1848
1849         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1850                 if (rt->dst.dev->ifindex != ifindex)
1851                         continue;
1852                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1853                         continue;
1854                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1855                         continue;
1856                 dst_hold(&rt->dst);
1857                 break;
1858         }
1859 out:
1860         write_unlock_bh(&table->tb6_lock);
1861         return rt;
1862 }
1863
1864 static struct rt6_info *rt6_add_route_info(struct net *net,
1865                                            const struct in6_addr *prefix, int prefixlen,
1866                                            const struct in6_addr *gwaddr, int ifindex,
1867                                            unsigned pref)
1868 {
1869         struct fib6_config cfg = {
1870                 .fc_table       = RT6_TABLE_INFO,
1871                 .fc_metric      = IP6_RT_PRIO_USER,
1872                 .fc_ifindex     = ifindex,
1873                 .fc_dst_len     = prefixlen,
1874                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1875                                   RTF_UP | RTF_PREF(pref),
1876                 .fc_nlinfo.pid = 0,
1877                 .fc_nlinfo.nlh = NULL,
1878                 .fc_nlinfo.nl_net = net,
1879         };
1880
1881         cfg.fc_dst = *prefix;
1882         cfg.fc_gateway = *gwaddr;
1883
1884         /* We should treat it as a default route if prefix length is 0. */
1885         if (!prefixlen)
1886                 cfg.fc_flags |= RTF_DEFAULT;
1887
1888         ip6_route_add(&cfg);
1889
1890         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1891 }
1892 #endif
1893
1894 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1895 {
1896         struct rt6_info *rt;
1897         struct fib6_table *table;
1898
1899         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1900         if (!table)
1901                 return NULL;
1902
1903         write_lock_bh(&table->tb6_lock);
1904         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1905                 if (dev == rt->dst.dev &&
1906                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1907                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1908                         break;
1909         }
1910         if (rt)
1911                 dst_hold(&rt->dst);
1912         write_unlock_bh(&table->tb6_lock);
1913         return rt;
1914 }
1915
1916 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1917                                      struct net_device *dev,
1918                                      unsigned int pref)
1919 {
1920         struct fib6_config cfg = {
1921                 .fc_table       = RT6_TABLE_DFLT,
1922                 .fc_metric      = IP6_RT_PRIO_USER,
1923                 .fc_ifindex     = dev->ifindex,
1924                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1925                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1926                 .fc_nlinfo.pid = 0,
1927                 .fc_nlinfo.nlh = NULL,
1928                 .fc_nlinfo.nl_net = dev_net(dev),
1929         };
1930
1931         cfg.fc_gateway = *gwaddr;
1932
1933         ip6_route_add(&cfg);
1934
1935         return rt6_get_dflt_router(gwaddr, dev);
1936 }
1937
1938 void rt6_purge_dflt_routers(struct net *net)
1939 {
1940         struct rt6_info *rt;
1941         struct fib6_table *table;
1942
1943         /* NOTE: Keep consistent with rt6_get_dflt_router */
1944         table = fib6_get_table(net, RT6_TABLE_DFLT);
1945         if (!table)
1946                 return;
1947
1948 restart:
1949         read_lock_bh(&table->tb6_lock);
1950         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1951                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1952                         dst_hold(&rt->dst);
1953                         read_unlock_bh(&table->tb6_lock);
1954                         ip6_del_rt(rt);
1955                         goto restart;
1956                 }
1957         }
1958         read_unlock_bh(&table->tb6_lock);
1959 }
1960
1961 static void rtmsg_to_fib6_config(struct net *net,
1962                                  struct in6_rtmsg *rtmsg,
1963                                  struct fib6_config *cfg)
1964 {
1965         memset(cfg, 0, sizeof(*cfg));
1966
1967         cfg->fc_table = RT6_TABLE_MAIN;
1968         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1969         cfg->fc_metric = rtmsg->rtmsg_metric;
1970         cfg->fc_expires = rtmsg->rtmsg_info;
1971         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1972         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1973         cfg->fc_flags = rtmsg->rtmsg_flags;
1974
1975         cfg->fc_nlinfo.nl_net = net;
1976
1977         cfg->fc_dst = rtmsg->rtmsg_dst;
1978         cfg->fc_src = rtmsg->rtmsg_src;
1979         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1980 }
1981
1982 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1983 {
1984         struct fib6_config cfg;
1985         struct in6_rtmsg rtmsg;
1986         int err;
1987
1988         switch(cmd) {
1989         case SIOCADDRT:         /* Add a route */
1990         case SIOCDELRT:         /* Delete a route */
1991                 if (!capable(CAP_NET_ADMIN))
1992                         return -EPERM;
1993                 err = copy_from_user(&rtmsg, arg,
1994                                      sizeof(struct in6_rtmsg));
1995                 if (err)
1996                         return -EFAULT;
1997
1998                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1999
2000                 rtnl_lock();
2001                 switch (cmd) {
2002                 case SIOCADDRT:
2003                         err = ip6_route_add(&cfg);
2004                         break;
2005                 case SIOCDELRT:
2006                         err = ip6_route_del(&cfg);
2007                         break;
2008                 default:
2009                         err = -EINVAL;
2010                 }
2011                 rtnl_unlock();
2012
2013                 return err;
2014         }
2015
2016         return -EINVAL;
2017 }
2018
2019 /*
2020  *      Drop the packet on the floor
2021  */
2022
2023 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2024 {
2025         int type;
2026         struct dst_entry *dst = skb_dst(skb);
2027         switch (ipstats_mib_noroutes) {
2028         case IPSTATS_MIB_INNOROUTES:
2029                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2030                 if (type == IPV6_ADDR_ANY) {
2031                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2032                                       IPSTATS_MIB_INADDRERRORS);
2033                         break;
2034                 }
2035                 /* FALLTHROUGH */
2036         case IPSTATS_MIB_OUTNOROUTES:
2037                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2038                               ipstats_mib_noroutes);
2039                 break;
2040         }
2041         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2042         kfree_skb(skb);
2043         return 0;
2044 }
2045
2046 static int ip6_pkt_discard(struct sk_buff *skb)
2047 {
2048         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2049 }
2050
2051 static int ip6_pkt_discard_out(struct sk_buff *skb)
2052 {
2053         skb->dev = skb_dst(skb)->dev;
2054         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2055 }
2056
2057 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2058
2059 static int ip6_pkt_prohibit(struct sk_buff *skb)
2060 {
2061         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2062 }
2063
2064 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2065 {
2066         skb->dev = skb_dst(skb)->dev;
2067         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2068 }
2069
2070 #endif
2071
2072 /*
2073  *      Allocate a dst for local (unicast / anycast) address.
2074  */
2075
2076 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2077                                     const struct in6_addr *addr,
2078                                     bool anycast)
2079 {
2080         struct net *net = dev_net(idev->dev);
2081         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2082                                             net->loopback_dev, 0);
2083         int err;
2084
2085         if (!rt) {
2086                 if (net_ratelimit())
2087                         pr_warning("IPv6:  Maximum number of routes reached,"
2088                                    " consider increasing route/max_size.\n");
2089                 return ERR_PTR(-ENOMEM);
2090         }
2091
2092         in6_dev_hold(idev);
2093
2094         rt->dst.flags |= DST_HOST;
2095         rt->dst.input = ip6_input;
2096         rt->dst.output = ip6_output;
2097         rt->rt6i_idev = idev;
2098         rt->dst.obsolete = -1;
2099
2100         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2101         if (anycast)
2102                 rt->rt6i_flags |= RTF_ANYCAST;
2103         else
2104                 rt->rt6i_flags |= RTF_LOCAL;
2105         err = rt6_bind_neighbour(rt, rt->dst.dev);
2106         if (err) {
2107                 dst_free(&rt->dst);
2108                 return ERR_PTR(err);
2109         }
2110
2111         rt->rt6i_dst.addr = *addr;
2112         rt->rt6i_dst.plen = 128;
2113         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2114
2115         atomic_set(&rt->dst.__refcnt, 1);
2116
2117         return rt;
2118 }
2119
2120 int ip6_route_get_saddr(struct net *net,
2121                         struct rt6_info *rt,
2122                         const struct in6_addr *daddr,
2123                         unsigned int prefs,
2124                         struct in6_addr *saddr)
2125 {
2126         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2127         int err = 0;
2128         if (rt->rt6i_prefsrc.plen)
2129                 *saddr = rt->rt6i_prefsrc.addr;
2130         else
2131                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2132                                          daddr, prefs, saddr);
2133         return err;
2134 }
2135
2136 /* remove deleted ip from prefsrc entries */
2137 struct arg_dev_net_ip {
2138         struct net_device *dev;
2139         struct net *net;
2140         struct in6_addr *addr;
2141 };
2142
2143 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2144 {
2145         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2146         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2147         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2148
2149         if (((void *)rt->dst.dev == dev || !dev) &&
2150             rt != net->ipv6.ip6_null_entry &&
2151             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2152                 /* remove prefsrc entry */
2153                 rt->rt6i_prefsrc.plen = 0;
2154         }
2155         return 0;
2156 }
2157
2158 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2159 {
2160         struct net *net = dev_net(ifp->idev->dev);
2161         struct arg_dev_net_ip adni = {
2162                 .dev = ifp->idev->dev,
2163                 .net = net,
2164                 .addr = &ifp->addr,
2165         };
2166         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2167 }
2168
2169 struct arg_dev_net {
2170         struct net_device *dev;
2171         struct net *net;
2172 };
2173
2174 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2175 {
2176         const struct arg_dev_net *adn = arg;
2177         const struct net_device *dev = adn->dev;
2178
2179         if ((rt->dst.dev == dev || !dev) &&
2180             rt != adn->net->ipv6.ip6_null_entry)
2181                 return -1;
2182
2183         return 0;
2184 }
2185
2186 void rt6_ifdown(struct net *net, struct net_device *dev)
2187 {
2188         struct arg_dev_net adn = {
2189                 .dev = dev,
2190                 .net = net,
2191         };
2192
2193         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2194         icmp6_clean_all(fib6_ifdown, &adn);
2195 }
2196
2197 struct rt6_mtu_change_arg
2198 {
2199         struct net_device *dev;
2200         unsigned mtu;
2201 };
2202
2203 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2204 {
2205         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2206         struct inet6_dev *idev;
2207
2208         /* In IPv6 pmtu discovery is not optional,
2209            so that RTAX_MTU lock cannot disable it.
2210            We still use this lock to block changes
2211            caused by addrconf/ndisc.
2212         */
2213
2214         idev = __in6_dev_get(arg->dev);
2215         if (!idev)
2216                 return 0;
2217
2218         /* For administrative MTU increase, there is no way to discover
2219            IPv6 PMTU increase, so PMTU increase should be updated here.
2220            Since RFC 1981 doesn't include administrative MTU increase
2221            update PMTU increase is a MUST. (i.e. jumbo frame)
2222          */
2223         /*
2224            If new MTU is less than route PMTU, this new MTU will be the
2225            lowest MTU in the path, update the route PMTU to reflect PMTU
2226            decreases; if new MTU is greater than route PMTU, and the
2227            old MTU is the lowest MTU in the path, update the route PMTU
2228            to reflect the increase. In this case if the other nodes' MTU
2229            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2230            PMTU discouvery.
2231          */
2232         if (rt->dst.dev == arg->dev &&
2233             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2234             (dst_mtu(&rt->dst) >= arg->mtu ||
2235              (dst_mtu(&rt->dst) < arg->mtu &&
2236               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2237                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2238         }
2239         return 0;
2240 }
2241
2242 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2243 {
2244         struct rt6_mtu_change_arg arg = {
2245                 .dev = dev,
2246                 .mtu = mtu,
2247         };
2248
2249         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2250 }
2251
2252 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2253         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2254         [RTA_OIF]               = { .type = NLA_U32 },
2255         [RTA_IIF]               = { .type = NLA_U32 },
2256         [RTA_PRIORITY]          = { .type = NLA_U32 },
2257         [RTA_METRICS]           = { .type = NLA_NESTED },
2258 };
2259
2260 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2261                               struct fib6_config *cfg)
2262 {
2263         struct rtmsg *rtm;
2264         struct nlattr *tb[RTA_MAX+1];
2265         int err;
2266
2267         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2268         if (err < 0)
2269                 goto errout;
2270
2271         err = -EINVAL;
2272         rtm = nlmsg_data(nlh);
2273         memset(cfg, 0, sizeof(*cfg));
2274
2275         cfg->fc_table = rtm->rtm_table;
2276         cfg->fc_dst_len = rtm->rtm_dst_len;
2277         cfg->fc_src_len = rtm->rtm_src_len;
2278         cfg->fc_flags = RTF_UP;
2279         cfg->fc_protocol = rtm->rtm_protocol;
2280
2281         if (rtm->rtm_type == RTN_UNREACHABLE)
2282                 cfg->fc_flags |= RTF_REJECT;
2283
2284         if (rtm->rtm_type == RTN_LOCAL)
2285                 cfg->fc_flags |= RTF_LOCAL;
2286
2287         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2288         cfg->fc_nlinfo.nlh = nlh;
2289         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2290
2291         if (tb[RTA_GATEWAY]) {
2292                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2293                 cfg->fc_flags |= RTF_GATEWAY;
2294         }
2295
2296         if (tb[RTA_DST]) {
2297                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2298
2299                 if (nla_len(tb[RTA_DST]) < plen)
2300                         goto errout;
2301
2302                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2303         }
2304
2305         if (tb[RTA_SRC]) {
2306                 int plen = (rtm->rtm_src_len + 7) >> 3;
2307
2308                 if (nla_len(tb[RTA_SRC]) < plen)
2309                         goto errout;
2310
2311                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2312         }
2313
2314         if (tb[RTA_PREFSRC])
2315                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2316
2317         if (tb[RTA_OIF])
2318                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2319
2320         if (tb[RTA_PRIORITY])
2321                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2322
2323         if (tb[RTA_METRICS]) {
2324                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2325                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2326         }
2327
2328         if (tb[RTA_TABLE])
2329                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2330
2331         err = 0;
2332 errout:
2333         return err;
2334 }
2335
2336 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2337 {
2338         struct fib6_config cfg;
2339         int err;
2340
2341         err = rtm_to_fib6_config(skb, nlh, &cfg);
2342         if (err < 0)
2343                 return err;
2344
2345         return ip6_route_del(&cfg);
2346 }
2347
2348 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2349 {
2350         struct fib6_config cfg;
2351         int err;
2352
2353         err = rtm_to_fib6_config(skb, nlh, &cfg);
2354         if (err < 0)
2355                 return err;
2356
2357         return ip6_route_add(&cfg);
2358 }
2359
2360 static inline size_t rt6_nlmsg_size(void)
2361 {
2362         return NLMSG_ALIGN(sizeof(struct rtmsg))
2363                + nla_total_size(16) /* RTA_SRC */
2364                + nla_total_size(16) /* RTA_DST */
2365                + nla_total_size(16) /* RTA_GATEWAY */
2366                + nla_total_size(16) /* RTA_PREFSRC */
2367                + nla_total_size(4) /* RTA_TABLE */
2368                + nla_total_size(4) /* RTA_IIF */
2369                + nla_total_size(4) /* RTA_OIF */
2370                + nla_total_size(4) /* RTA_PRIORITY */
2371                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2372                + nla_total_size(sizeof(struct rta_cacheinfo));
2373 }
2374
2375 static int rt6_fill_node(struct net *net,
2376                          struct sk_buff *skb, struct rt6_info *rt,
2377                          struct in6_addr *dst, struct in6_addr *src,
2378                          int iif, int type, u32 pid, u32 seq,
2379                          int prefix, int nowait, unsigned int flags)
2380 {
2381         const struct inet_peer *peer;
2382         struct rtmsg *rtm;
2383         struct nlmsghdr *nlh;
2384         long expires;
2385         u32 table;
2386         struct neighbour *n;
2387         u32 ts, tsage;
2388
2389         if (prefix) {   /* user wants prefix routes only */
2390                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2391                         /* success since this is not a prefix route */
2392                         return 1;
2393                 }
2394         }
2395
2396         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2397         if (!nlh)
2398                 return -EMSGSIZE;
2399
2400         rtm = nlmsg_data(nlh);
2401         rtm->rtm_family = AF_INET6;
2402         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2403         rtm->rtm_src_len = rt->rt6i_src.plen;
2404         rtm->rtm_tos = 0;
2405         if (rt->rt6i_table)
2406                 table = rt->rt6i_table->tb6_id;
2407         else
2408                 table = RT6_TABLE_UNSPEC;
2409         rtm->rtm_table = table;
2410         NLA_PUT_U32(skb, RTA_TABLE, table);
2411         if (rt->rt6i_flags & RTF_REJECT)
2412                 rtm->rtm_type = RTN_UNREACHABLE;
2413         else if (rt->rt6i_flags & RTF_LOCAL)
2414                 rtm->rtm_type = RTN_LOCAL;
2415         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2416                 rtm->rtm_type = RTN_LOCAL;
2417         else
2418                 rtm->rtm_type = RTN_UNICAST;
2419         rtm->rtm_flags = 0;
2420         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2421         rtm->rtm_protocol = rt->rt6i_protocol;
2422         if (rt->rt6i_flags & RTF_DYNAMIC)
2423                 rtm->rtm_protocol = RTPROT_REDIRECT;
2424         else if (rt->rt6i_flags & RTF_ADDRCONF)
2425                 rtm->rtm_protocol = RTPROT_KERNEL;
2426         else if (rt->rt6i_flags & RTF_DEFAULT)
2427                 rtm->rtm_protocol = RTPROT_RA;
2428
2429         if (rt->rt6i_flags & RTF_CACHE)
2430                 rtm->rtm_flags |= RTM_F_CLONED;
2431
2432         if (dst) {
2433                 NLA_PUT(skb, RTA_DST, 16, dst);
2434                 rtm->rtm_dst_len = 128;
2435         } else if (rtm->rtm_dst_len)
2436                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2437 #ifdef CONFIG_IPV6_SUBTREES
2438         if (src) {
2439                 NLA_PUT(skb, RTA_SRC, 16, src);
2440                 rtm->rtm_src_len = 128;
2441         } else if (rtm->rtm_src_len)
2442                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2443 #endif
2444         if (iif) {
2445 #ifdef CONFIG_IPV6_MROUTE
2446                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2447                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2448                         if (err <= 0) {
2449                                 if (!nowait) {
2450                                         if (err == 0)
2451                                                 return 0;
2452                                         goto nla_put_failure;
2453                                 } else {
2454                                         if (err == -EMSGSIZE)
2455                                                 goto nla_put_failure;
2456                                 }
2457                         }
2458                 } else
2459 #endif
2460                         NLA_PUT_U32(skb, RTA_IIF, iif);
2461         } else if (dst) {
2462                 struct in6_addr saddr_buf;
2463                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2464                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2465         }
2466
2467         if (rt->rt6i_prefsrc.plen) {
2468                 struct in6_addr saddr_buf;
2469                 saddr_buf = rt->rt6i_prefsrc.addr;
2470                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2471         }
2472
2473         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2474                 goto nla_put_failure;
2475
2476         rcu_read_lock();
2477         n = dst_get_neighbour_noref(&rt->dst);
2478         if (n)
2479                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2480         rcu_read_unlock();
2481
2482         if (rt->dst.dev)
2483                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2484
2485         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2486
2487         if (!(rt->rt6i_flags & RTF_EXPIRES))
2488                 expires = 0;
2489         else if (rt->dst.expires - jiffies < INT_MAX)
2490                 expires = rt->dst.expires - jiffies;
2491         else
2492                 expires = INT_MAX;
2493
2494         peer = rt->rt6i_peer;
2495         ts = tsage = 0;
2496         if (peer && peer->tcp_ts_stamp) {
2497                 ts = peer->tcp_ts;
2498                 tsage = get_seconds() - peer->tcp_ts_stamp;
2499         }
2500
2501         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2502                                expires, rt->dst.error) < 0)
2503                 goto nla_put_failure;
2504
2505         return nlmsg_end(skb, nlh);
2506
2507 nla_put_failure:
2508         nlmsg_cancel(skb, nlh);
2509         return -EMSGSIZE;
2510 }
2511
2512 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2513 {
2514         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2515         int prefix;
2516
2517         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2518                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2519                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2520         } else
2521                 prefix = 0;
2522
2523         return rt6_fill_node(arg->net,
2524                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2525                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2526                      prefix, 0, NLM_F_MULTI);
2527 }
2528
2529 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2530 {
2531         struct net *net = sock_net(in_skb->sk);
2532         struct nlattr *tb[RTA_MAX+1];
2533         struct rt6_info *rt;
2534         struct sk_buff *skb;
2535         struct rtmsg *rtm;
2536         struct flowi6 fl6;
2537         int err, iif = 0;
2538
2539         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2540         if (err < 0)
2541                 goto errout;
2542
2543         err = -EINVAL;
2544         memset(&fl6, 0, sizeof(fl6));
2545
2546         if (tb[RTA_SRC]) {
2547                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2548                         goto errout;
2549
2550                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2551         }
2552
2553         if (tb[RTA_DST]) {
2554                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2555                         goto errout;
2556
2557                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2558         }
2559
2560         if (tb[RTA_IIF])
2561                 iif = nla_get_u32(tb[RTA_IIF]);
2562
2563         if (tb[RTA_OIF])
2564                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2565
2566         if (iif) {
2567                 struct net_device *dev;
2568                 dev = __dev_get_by_index(net, iif);
2569                 if (!dev) {
2570                         err = -ENODEV;
2571                         goto errout;
2572                 }
2573         }
2574
2575         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2576         if (!skb) {
2577                 err = -ENOBUFS;
2578                 goto errout;
2579         }
2580
2581         /* Reserve room for dummy headers, this skb can pass
2582            through good chunk of routing engine.
2583          */
2584         skb_reset_mac_header(skb);
2585         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2586
2587         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2588         skb_dst_set(skb, &rt->dst);
2589
2590         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2591                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2592                             nlh->nlmsg_seq, 0, 0, 0);
2593         if (err < 0) {
2594                 kfree_skb(skb);
2595                 goto errout;
2596         }
2597
2598         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2599 errout:
2600         return err;
2601 }
2602
2603 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2604 {
2605         struct sk_buff *skb;
2606         struct net *net = info->nl_net;
2607         u32 seq;
2608         int err;
2609
2610         err = -ENOBUFS;
2611         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2612
2613         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2614         if (!skb)
2615                 goto errout;
2616
2617         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2618                                 event, info->pid, seq, 0, 0, 0);
2619         if (err < 0) {
2620                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2621                 WARN_ON(err == -EMSGSIZE);
2622                 kfree_skb(skb);
2623                 goto errout;
2624         }
2625         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2626                     info->nlh, gfp_any());
2627         return;
2628 errout:
2629         if (err < 0)
2630                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2631 }
2632
2633 static int ip6_route_dev_notify(struct notifier_block *this,
2634                                 unsigned long event, void *data)
2635 {
2636         struct net_device *dev = (struct net_device *)data;
2637         struct net *net = dev_net(dev);
2638
2639         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2640                 net->ipv6.ip6_null_entry->dst.dev = dev;
2641                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2642 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2643                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2644                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2645                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2646                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2647 #endif
2648         }
2649
2650         return NOTIFY_OK;
2651 }
2652
2653 /*
2654  *      /proc
2655  */
2656
2657 #ifdef CONFIG_PROC_FS
2658
/*
 * Buffer bookkeeping record.
 * NOTE(review): no user of this struct is visible among the seq_file
 * based /proc handlers below - confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2667
2668 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2669 {
2670         struct seq_file *m = p_arg;
2671         struct neighbour *n;
2672
2673         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2674
2675 #ifdef CONFIG_IPV6_SUBTREES
2676         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2677 #else
2678         seq_puts(m, "00000000000000000000000000000000 00 ");
2679 #endif
2680         rcu_read_lock();
2681         n = dst_get_neighbour_noref(&rt->dst);
2682         if (n) {
2683                 seq_printf(m, "%pi6", n->primary_key);
2684         } else {
2685                 seq_puts(m, "00000000000000000000000000000000");
2686         }
2687         rcu_read_unlock();
2688         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2689                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2690                    rt->dst.__use, rt->rt6i_flags,
2691                    rt->dst.dev ? rt->dst.dev->name : "");
2692         return 0;
2693 }
2694
2695 static int ipv6_route_show(struct seq_file *m, void *v)
2696 {
2697         struct net *net = (struct net *)m->private;
2698         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2699         return 0;
2700 }
2701
2702 static int ipv6_route_open(struct inode *inode, struct file *file)
2703 {
2704         return single_open_net(inode, file, ipv6_route_show);
2705 }
2706
2707 static const struct file_operations ipv6_route_proc_fops = {
2708         .owner          = THIS_MODULE,
2709         .open           = ipv6_route_open,
2710         .read           = seq_read,
2711         .llseek         = seq_lseek,
2712         .release        = single_release_net,
2713 };
2714
2715 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2716 {
2717         struct net *net = (struct net *)seq->private;
2718         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2719                    net->ipv6.rt6_stats->fib_nodes,
2720                    net->ipv6.rt6_stats->fib_route_nodes,
2721                    net->ipv6.rt6_stats->fib_rt_alloc,
2722                    net->ipv6.rt6_stats->fib_rt_entries,
2723                    net->ipv6.rt6_stats->fib_rt_cache,
2724                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2725                    net->ipv6.rt6_stats->fib_discarded_routes);
2726
2727         return 0;
2728 }
2729
2730 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2731 {
2732         return single_open_net(inode, file, rt6_stats_seq_show);
2733 }
2734
2735 static const struct file_operations rt6_stats_seq_fops = {
2736         .owner   = THIS_MODULE,
2737         .open    = rt6_stats_seq_open,
2738         .read    = seq_read,
2739         .llseek  = seq_lseek,
2740         .release = single_release_net,
2741 };
2742 #endif  /* CONFIG_PROC_FS */
2743
2744 #ifdef CONFIG_SYSCTL
2745
2746 static
2747 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2748                               void __user *buffer, size_t *lenp, loff_t *ppos)
2749 {
2750         struct net *net;
2751         int delay;
2752         if (!write)
2753                 return -EINVAL;
2754
2755         net = (struct net *)ctl->extra1;
2756         delay = net->ipv6.sysctl.flush_delay;
2757         proc_dointvec(ctl, write, buffer, lenp, ppos);
2758         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2759         return 0;
2760 }
2761
2762 ctl_table ipv6_route_table_template[] = {
2763         {
2764                 .procname       =       "flush",
2765                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2766                 .maxlen         =       sizeof(int),
2767                 .mode           =       0200,
2768                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2769         },
2770         {
2771                 .procname       =       "gc_thresh",
2772                 .data           =       &ip6_dst_ops_template.gc_thresh,
2773                 .maxlen         =       sizeof(int),
2774                 .mode           =       0644,
2775                 .proc_handler   =       proc_dointvec,
2776         },
2777         {
2778                 .procname       =       "max_size",
2779                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2780                 .maxlen         =       sizeof(int),
2781                 .mode           =       0644,
2782                 .proc_handler   =       proc_dointvec,
2783         },
2784         {
2785                 .procname       =       "gc_min_interval",
2786                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2787                 .maxlen         =       sizeof(int),
2788                 .mode           =       0644,
2789                 .proc_handler   =       proc_dointvec_jiffies,
2790         },
2791         {
2792                 .procname       =       "gc_timeout",
2793                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2794                 .maxlen         =       sizeof(int),
2795                 .mode           =       0644,
2796                 .proc_handler   =       proc_dointvec_jiffies,
2797         },
2798         {
2799                 .procname       =       "gc_interval",
2800                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2801                 .maxlen         =       sizeof(int),
2802                 .mode           =       0644,
2803                 .proc_handler   =       proc_dointvec_jiffies,
2804         },
2805         {
2806                 .procname       =       "gc_elasticity",
2807                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2808                 .maxlen         =       sizeof(int),
2809                 .mode           =       0644,
2810                 .proc_handler   =       proc_dointvec,
2811         },
2812         {
2813                 .procname       =       "mtu_expires",
2814                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2815                 .maxlen         =       sizeof(int),
2816                 .mode           =       0644,
2817                 .proc_handler   =       proc_dointvec_jiffies,
2818         },
2819         {
2820                 .procname       =       "min_adv_mss",
2821                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2822                 .maxlen         =       sizeof(int),
2823                 .mode           =       0644,
2824                 .proc_handler   =       proc_dointvec,
2825         },
2826         {
2827                 .procname       =       "gc_min_interval_ms",
2828                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2829                 .maxlen         =       sizeof(int),
2830                 .mode           =       0644,
2831                 .proc_handler   =       proc_dointvec_ms_jiffies,
2832         },
2833         { }
2834 };
2835
2836 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2837 {
2838         struct ctl_table *table;
2839
2840         table = kmemdup(ipv6_route_table_template,
2841                         sizeof(ipv6_route_table_template),
2842                         GFP_KERNEL);
2843
2844         if (table) {
2845                 table[0].data = &net->ipv6.sysctl.flush_delay;
2846                 table[0].extra1 = net;
2847                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2848                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2849                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2850                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2851                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2852                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2853                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2854                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2855                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2856         }
2857
2858         return table;
2859 }
2860 #endif
2861
2862 static int __net_init ip6_route_net_init(struct net *net)
2863 {
2864         int ret = -ENOMEM;
2865
2866         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2867                sizeof(net->ipv6.ip6_dst_ops));
2868
2869         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2870                 goto out_ip6_dst_ops;
2871
2872         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2873                                            sizeof(*net->ipv6.ip6_null_entry),
2874                                            GFP_KERNEL);
2875         if (!net->ipv6.ip6_null_entry)
2876                 goto out_ip6_dst_entries;
2877         net->ipv6.ip6_null_entry->dst.path =
2878                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2879         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2880         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2881                          ip6_template_metrics, true);
2882
2883 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2884         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2885                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2886                                                GFP_KERNEL);
2887         if (!net->ipv6.ip6_prohibit_entry)
2888                 goto out_ip6_null_entry;
2889         net->ipv6.ip6_prohibit_entry->dst.path =
2890                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2891         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2892         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2893                          ip6_template_metrics, true);
2894
2895         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2896                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2897                                                GFP_KERNEL);
2898         if (!net->ipv6.ip6_blk_hole_entry)
2899                 goto out_ip6_prohibit_entry;
2900         net->ipv6.ip6_blk_hole_entry->dst.path =
2901                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2902         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2903         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2904                          ip6_template_metrics, true);
2905 #endif
2906
2907         net->ipv6.sysctl.flush_delay = 0;
2908         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2909         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2910         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2911         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2912         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2913         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2914         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2915
2916 #ifdef CONFIG_PROC_FS
2917         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2918         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2919 #endif
2920         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2921
2922         ret = 0;
2923 out:
2924         return ret;
2925
2926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2927 out_ip6_prohibit_entry:
2928         kfree(net->ipv6.ip6_prohibit_entry);
2929 out_ip6_null_entry:
2930         kfree(net->ipv6.ip6_null_entry);
2931 #endif
2932 out_ip6_dst_entries:
2933         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2934 out_ip6_dst_ops:
2935         goto out;
2936 }
2937
2938 static void __net_exit ip6_route_net_exit(struct net *net)
2939 {
2940 #ifdef CONFIG_PROC_FS
2941         proc_net_remove(net, "ipv6_route");
2942         proc_net_remove(net, "rt6_stats");
2943 #endif
2944         kfree(net->ipv6.ip6_null_entry);
2945 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2946         kfree(net->ipv6.ip6_prohibit_entry);
2947         kfree(net->ipv6.ip6_blk_hole_entry);
2948 #endif
2949         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2950 }
2951
2952 static struct pernet_operations ip6_route_net_ops = {
2953         .init = ip6_route_net_init,
2954         .exit = ip6_route_net_exit,
2955 };
2956
2957 static struct notifier_block ip6_route_dev_notifier = {
2958         .notifier_call = ip6_route_dev_notify,
2959         .priority = 0,
2960 };
2961
2962 int __init ip6_route_init(void)
2963 {
2964         int ret;
2965
2966         ret = -ENOMEM;
2967         ip6_dst_ops_template.kmem_cachep =
2968                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2969                                   SLAB_HWCACHE_ALIGN, NULL);
2970         if (!ip6_dst_ops_template.kmem_cachep)
2971                 goto out;
2972
2973         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2974         if (ret)
2975                 goto out_kmem_cache;
2976
2977         ret = register_pernet_subsys(&ip6_route_net_ops);
2978         if (ret)
2979                 goto out_dst_entries;
2980
2981         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2982
2983         /* Registering of the loopback is done before this portion of code,
2984          * the loopback reference in rt6_info will not be taken, do it
2985          * manually for init_net */
2986         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2987         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2988   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2989         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2990         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2991         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2992         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2993   #endif
2994         ret = fib6_init();
2995         if (ret)
2996                 goto out_register_subsys;
2997
2998         ret = xfrm6_init();
2999         if (ret)
3000                 goto out_fib6_init;
3001
3002         ret = fib6_rules_init();
3003         if (ret)
3004                 goto xfrm6_init;
3005
3006         ret = -ENOBUFS;
3007         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3008             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3009             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3010                 goto fib6_rules_init;
3011
3012         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3013         if (ret)
3014                 goto fib6_rules_init;
3015
3016 out:
3017         return ret;
3018
3019 fib6_rules_init:
3020         fib6_rules_cleanup();
3021 xfrm6_init:
3022         xfrm6_fini();
3023 out_fib6_init:
3024         fib6_gc_cleanup();
3025 out_register_subsys:
3026         unregister_pernet_subsys(&ip6_route_net_ops);
3027 out_dst_entries:
3028         dst_entries_destroy(&ip6_dst_blackhole_ops);
3029 out_kmem_cache:
3030         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3031         goto out;
3032 }
3033
3034 void ip6_route_cleanup(void)
3035 {
3036         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3037         fib6_rules_cleanup();
3038         xfrm6_fini();
3039         fib6_gc_cleanup();
3040         unregister_pernet_subsys(&ip6_route_net_ops);
3041         dst_entries_destroy(&ip6_dst_blackhole_ops);
3042         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3043 }