ipv4: Don't create nh exception when the device mtu is smaller than the reported pmtu
[linux-3.10.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
/*
 * Tunables for this file.  Of those referenced in the code visible here:
 *   - ip_rt_redirect_number / _load / _silence pace ICMP redirects in
 *     ip_rt_send_redirect();
 *   - ip_rt_error_cost / _burst drive the per-peer token bucket in ip_error();
 *   - ip_rt_min_pmtu is the floor and ip_rt_mtu_expires the lifetime applied
 *     in __ip_rt_update_pmtu().
 * The time-based values above are compared against jiffies, hence the HZ
 * multiples.
 */
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/*
 * dst_ops ->ifdown hook for IPv4 (installed in ipv4_dst_ops).
 * The body is empty: nothing is done for @dst, @dev or @how.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst,
                            struct net_device *dev, int how)
{
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160                                            struct sk_buff *skb,
161                                            const void *daddr);
162
/*
 * Destination-cache operations for IPv4 routes.  Each hook points at the
 * IPv4 implementation declared earlier in this file; note that the
 * .cow_metrics hook (ipv4_cow_metrics) only warns and returns NULL, and the
 * .ifdown hook (ipv4_dst_ifdown) is empty.
 */
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
/*
 * Exported table of 16 TC_PRIO_* values.  ECN_OR_COST(class) expands to
 * TC_PRIO_<class>, so every odd slot holds the same priority as the even
 * slot just before it.
 * NOTE(review): presumably indexed by bits of the IP TOS field; the lookup
 * code is not part of this section of the file - confirm against callers.
 */
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
/*
 * Per-CPU rt_cache_stat counters.  RT_CACHE_STAT_INC(field) increments one
 * field of the current CPU's copy through __this_cpu_inc().
 */
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208         if (*pos)
209                 return NULL;
210         return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215         ++*pos;
216         return NULL;
217 }
218
/* seq_file ->stop for the "rt_cache" listing; there is nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225         if (v == SEQ_START_TOKEN)
226                 seq_printf(seq, "%-127s\n",
227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229                            "HHUptod\tSpecDst");
230         return 0;
231 }
232
/*
 * seq_file iterator behind rt_cache_seq_fops.  With the callbacks above it
 * yields a single record (the header token) and then stops.
 */
233 static const struct seq_operations rt_cache_seq_ops = {
234         .start  = rt_cache_seq_start,
235         .next   = rt_cache_seq_next,
236         .stop   = rt_cache_seq_stop,
237         .show   = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242         return seq_open(file, &rt_cache_seq_ops);
243 }
244
/*
 * File operations for the "rt_cache" entry that ip_rt_do_proc_init()
 * creates with proc_net_fops_create(); reads go through seq_file.
 */
245 static const struct file_operations rt_cache_seq_fops = {
246         .owner   = THIS_MODULE,
247         .open    = rt_cache_seq_open,
248         .read    = seq_read,
249         .llseek  = seq_lseek,
250         .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256         int cpu;
257
258         if (*pos == 0)
259                 return SEQ_START_TOKEN;
260
261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262                 if (!cpu_possible(cpu))
263                         continue;
264                 *pos = cpu+1;
265                 return &per_cpu(rt_cache_stat, cpu);
266         }
267         return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272         int cpu;
273
274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275                 if (!cpu_possible(cpu))
276                         continue;
277                 *pos = cpu+1;
278                 return &per_cpu(rt_cache_stat, cpu);
279         }
280         return NULL;
281
282 }
283
/* seq_file ->stop for the per-CPU statistics listing; nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291         struct rt_cache_stat *st = v;
292
293         if (v == SEQ_START_TOKEN) {
294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295                 return 0;
296         }
297
298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300                    dst_entries_get_slow(&ipv4_dst_ops),
301                    st->in_hit,
302                    st->in_slow_tot,
303                    st->in_slow_mc,
304                    st->in_no_route,
305                    st->in_brd,
306                    st->in_martian_dst,
307                    st->in_martian_src,
308
309                    st->out_hit,
310                    st->out_slow_tot,
311                    st->out_slow_mc,
312
313                    st->gc_total,
314                    st->gc_ignored,
315                    st->gc_goal_miss,
316                    st->gc_dst_overflow,
317                    st->in_hlist_search,
318                    st->out_hlist_search
319                 );
320         return 0;
321 }
322
/*
 * seq_file iterator behind rt_cpu_seq_fops: a header record followed by
 * one record per possible CPU.
 */
323 static const struct seq_operations rt_cpu_seq_ops = {
324         .start  = rt_cpu_seq_start,
325         .next   = rt_cpu_seq_next,
326         .stop   = rt_cpu_seq_stop,
327         .show   = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333         return seq_open(file, &rt_cpu_seq_ops);
334 }
335
/*
 * File operations for the "rt_cache" entry that ip_rt_do_proc_init()
 * creates under net->proc_net_stat; reads go through seq_file.
 */
336 static const struct file_operations rt_cpu_seq_fops = {
337         .owner   = THIS_MODULE,
338         .open    = rt_cpu_seq_open,
339         .read    = seq_read,
340         .llseek  = seq_lseek,
341         .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347         struct ip_rt_acct *dst, *src;
348         unsigned int i, j;
349
350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351         if (!dst)
352                 return -ENOMEM;
353
354         for_each_possible_cpu(i) {
355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356                 for (j = 0; j < 256; j++) {
357                         dst[j].o_bytes   += src[j].o_bytes;
358                         dst[j].o_packets += src[j].o_packets;
359                         dst[j].i_bytes   += src[j].i_bytes;
360                         dst[j].i_packets += src[j].i_packets;
361                 }
362         }
363
364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365         kfree(dst);
366         return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371         return single_open(file, rt_acct_proc_show, NULL);
372 }
373
/*
 * File operations for the "rt_acct" entry; opened with single_open(), so
 * it is released with single_release().
 */
374 static const struct file_operations rt_acct_proc_fops = {
375         .owner          = THIS_MODULE,
376         .open           = rt_acct_proc_open,
377         .read           = seq_read,
378         .llseek         = seq_lseek,
379         .release        = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385         struct proc_dir_entry *pde;
386
387         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388                         &rt_cache_seq_fops);
389         if (!pde)
390                 goto err1;
391
392         pde = proc_create("rt_cache", S_IRUGO,
393                           net->proc_net_stat, &rt_cpu_seq_fops);
394         if (!pde)
395                 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399         if (!pde)
400                 goto err3;
401 #endif
402         return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406         remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409         remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411         return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416         remove_proc_entry("rt_cache", net->proc_net_stat);
417         remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419         remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
/* Per-network-namespace hooks that create and remove the proc entries. */
423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
424         .init = ip_rt_do_proc_init,
425         .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430         return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
        const int ok = 0;

        return ok;
}
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444
/*
 * Flush routes for @net by bumping the namespace's route generation id;
 * rt_is_expired() then reports routes stamped with the old id as expired.
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451                                            struct sk_buff *skb,
452                                            const void *daddr)
453 {
454         struct net_device *dev = dst->dev;
455         const __be32 *pkey = daddr;
456         const struct rtable *rt;
457         struct neighbour *n;
458
459         rt = (const struct rtable *) dst;
460         if (rt->rt_gateway)
461                 pkey = (const __be32 *) &rt->rt_gateway;
462         else if (skb)
463                 pkey = &ip_hdr(skb)->daddr;
464
465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466         if (n)
467                 return n;
468         return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However
473  * we still can generate some output.
474  * Random ID selection looks a bit dangerous because we have no chances to
475  * select ID being unique in a reasonable period of time.
476  * But broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480         static DEFINE_SPINLOCK(ip_fb_id_lock);
481         static u32 ip_fallback_id;
482         u32 salt;
483
484         spin_lock_bh(&ip_fb_id_lock);
485         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486         iph->id = htons(salt & 0xFFFF);
487         ip_fallback_id = salt;
488         spin_unlock_bh(&ip_fb_id_lock);
489 }
490
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493         struct net *net = dev_net(dst->dev);
494         struct inet_peer *peer;
495
496         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497         if (peer) {
498                 iph->id = htons(inet_getid(peer, more));
499                 inet_putpeer(peer);
500                 return;
501         }
502
503         ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508                              const struct iphdr *iph,
509                              int oif, u8 tos,
510                              u8 prot, u32 mark, int flow_flags)
511 {
512         if (sk) {
513                 const struct inet_sock *inet = inet_sk(sk);
514
515                 oif = sk->sk_bound_dev_if;
516                 mark = sk->sk_mark;
517                 tos = RT_CONN_FLAGS(sk);
518                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519         }
520         flowi4_init_output(fl4, oif, mark, tos,
521                            RT_SCOPE_UNIVERSE, prot,
522                            flow_flags,
523                            iph->daddr, iph->saddr, 0, 0);
524 }
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527                                const struct sock *sk)
528 {
529         const struct iphdr *iph = ip_hdr(skb);
530         int oif = skb->dev->ifindex;
531         u8 tos = RT_TOS(iph->tos);
532         u8 prot = iph->protocol;
533         u32 mark = skb->mark;
534
535         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540         const struct inet_sock *inet = inet_sk(sk);
541         const struct ip_options_rcu *inet_opt;
542         __be32 daddr = inet->inet_daddr;
543
544         rcu_read_lock();
545         inet_opt = rcu_dereference(inet->inet_opt);
546         if (inet_opt && inet_opt->opt.srr)
547                 daddr = inet_opt->opt.faddr;
548         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551                            inet_sk_flowi_flags(sk),
552                            daddr, inet->inet_saddr, 0, 0);
553         rcu_read_unlock();
554 }
555
/*
 * Build a flow key from @skb when one is supplied, otherwise from @sk
 * alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }
        build_skb_flow_key(fl4, skb, sk);
}
564
565 static inline void rt_free(struct rtable *rt)
566 {
567         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569
/* Taken (with BHs disabled) by update_or_create_fnhe() around all changes to next-hop exception tables. */
570 static DEFINE_SPINLOCK(fnhe_lock);
571
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574         struct fib_nh_exception *fnhe, *oldest;
575         struct rtable *orig;
576
577         oldest = rcu_dereference(hash->chain);
578         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579              fnhe = rcu_dereference(fnhe->fnhe_next)) {
580                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581                         oldest = fnhe;
582         }
583         orig = rcu_dereference(oldest->fnhe_rth);
584         if (orig) {
585                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586                 rt_free(orig);
587         }
588         return oldest;
589 }
590
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593         u32 hval;
594
595         hval = (__force u32) daddr;
596         hval ^= (hval >> 11) ^ (hval >> 22);
597
598         return hval & (FNHE_HASH_SIZE - 1);
599 }
600
/*
 * Record a next-hop exception for @daddr on @nh, under fnhe_lock.
 *
 * @nh:      next hop whose exception table is updated; the table is
 *           allocated on first use
 * @daddr:   destination the exception applies to
 * @gw:      gateway to store; 0 leaves an existing entry's gateway alone
 * @pmtu:    path MTU to store; 0 leaves an existing entry's pmtu/expiry alone
 * @expires: expiry stored together with @pmtu
 *
 * An existing entry for @daddr is updated in place.  Otherwise a new entry
 * is allocated and linked at the head of the bucket, or, when the bucket
 * already holds more than FNHE_RECLAIM_DEPTH entries, the oldest entry is
 * recycled.  The entry's stamp is set to jiffies.  Allocation failures are
 * silent: the function returns without recording anything.
 */
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602                                   u32 pmtu, unsigned long expires)
603 {
604         struct fnhe_hash_bucket *hash;
605         struct fib_nh_exception *fnhe;
606         int depth;
607         u32 hval = fnhe_hashfun(daddr);
608
609         spin_lock_bh(&fnhe_lock);
610
611         hash = nh->nh_exceptions;
612         if (!hash) {
            /* first exception on this next hop: create the bucket array */
613                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614                 if (!hash)
615                         goto out_unlock;
616                 nh->nh_exceptions = hash;
617         }
618
619         hash += hval;
620
            /* search the bucket for @daddr, counting entries walked past */
621         depth = 0;
622         for (fnhe = rcu_dereference(hash->chain); fnhe;
623              fnhe = rcu_dereference(fnhe->fnhe_next)) {
624                 if (fnhe->fnhe_daddr == daddr)
625                         break;
626                 depth++;
627         }
628
629         if (fnhe) {
            /* existing entry: only overwrite what the caller supplied */
630                 if (gw)
631                         fnhe->fnhe_gw = gw;
632                 if (pmtu) {
633                         fnhe->fnhe_pmtu = pmtu;
634                         fnhe->fnhe_expires = expires;
635                 }
636         } else {
637                 if (depth > FNHE_RECLAIM_DEPTH)
638                         fnhe = fnhe_oldest(hash);
639                 else {
640                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641                         if (!fnhe)
642                                 goto out_unlock;
643
644                         fnhe->fnhe_next = hash->chain;
645                         rcu_assign_pointer(hash->chain, fnhe);
646                 }
            /* NOTE(review): for a new entry these fields are written after it was linked into the chain above */
647                 fnhe->fnhe_daddr = daddr;
648                 fnhe->fnhe_gw = gw;
649                 fnhe->fnhe_pmtu = pmtu;
650                 fnhe->fnhe_expires = expires;
651         }
652
653         fnhe->fnhe_stamp = jiffies;
654
655 out_unlock:
656         spin_unlock_bh(&fnhe_lock);
657         return;
658 }
659
/*
 * Process an ICMP redirect carried in @skb for route @rt.
 *
 * @rt:         route the redirect refers to
 * @skb:        the ICMP redirect; the advised gateway is read from its ICMP
 *              header and the sender from its IP header
 * @fl4:        flow used for the FIB lookup and as the exception destination
 * @kill_route: when true, mark @rt's dst DST_OBSOLETE_KILL once the
 *              redirect has been accepted
 *
 * The redirect is ignored silently if its code is not one of the four
 * ICMP_REDIR_* values, if it was not sent by the route's current gateway,
 * or if the receiving device has no in_device.  It is rejected (and, with
 * CONFIG_IP_ROUTE_VERBOSE and martian logging, reported) if the new
 * gateway fails the address and per-device policy checks below.
 *
 * Otherwise the neighbour entry for the new gateway is looked up.  If it
 * is not yet NUD_VALID, resolution is kicked off and nothing else happens;
 * if it is valid, a next-hop exception with the new gateway is recorded
 * for fl4->daddr and a NETEVENT_NEIGH_UPDATE notification is sent.
 */
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661                              bool kill_route)
662 {
663         __be32 new_gw = icmp_hdr(skb)->un.gateway;
664         __be32 old_gw = ip_hdr(skb)->saddr;
665         struct net_device *dev = skb->dev;
666         struct in_device *in_dev;
667         struct fib_result res;
668         struct neighbour *n;
669         struct net *net;
670
671         switch (icmp_hdr(skb)->code & 7) {
672         case ICMP_REDIR_NET:
673         case ICMP_REDIR_NETTOS:
674         case ICMP_REDIR_HOST:
675         case ICMP_REDIR_HOSTTOS:
676                 break;
677
678         default:
679                 return;
680         }
681
            /* only the gateway currently in use may redirect us */
682         if (rt->rt_gateway != old_gw)
683                 return;
684
685         in_dev = __in_dev_get_rcu(dev);
686         if (!in_dev)
687                 return;
688
689         net = dev_net(dev);
690         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692             ipv4_is_zeronet(new_gw))
693                 goto reject_redirect;
694
695         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697                         goto reject_redirect;
698                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699                         goto reject_redirect;
700         } else {
701                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702                         goto reject_redirect;
703         }
704
705         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706         if (n) {
707                 if (!(n->nud_state & NUD_VALID)) {
                    /* new gateway not resolved yet: trigger resolution only */
708                         neigh_event_send(n, NULL);
709                 } else {
710                         if (fib_lookup(net, fl4, &res) == 0) {
711                                 struct fib_nh *nh = &FIB_RES_NH(res);
712
                            /* pmtu 0 / expires 0: only the gateway is recorded */
713                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714                                                       0, 0);
715                         }
716                         if (kill_route)
717                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
718                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719                 }
720                 neigh_release(n);
721         }
722         return;
723
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726         if (IN_DEV_LOG_MARTIANS(in_dev)) {
727                 const struct iphdr *iph = (const struct iphdr *) skb->data;
728                 __be32 daddr = iph->daddr;
729                 __be32 saddr = iph->saddr;
730
731                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732                                      "  Advised path = %pI4 -> %pI4\n",
733                                      &old_gw, dev->name, &new_gw,
734                                      &saddr, &daddr);
735         }
736 #endif
            /* empty statement so the label is followed by a statement when the block above is compiled out */
737         ;
738 }
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742         struct rtable *rt;
743         struct flowi4 fl4;
744
745         rt = (struct rtable *) dst;
746
747         ip_rt_build_flow_key(&fl4, sk, skb);
748         __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753         struct rtable *rt = (struct rtable *)dst;
754         struct dst_entry *ret = dst;
755
756         if (rt) {
757                 if (dst->obsolete > 0) {
758                         ip_rt_put(rt);
759                         ret = NULL;
760                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761                            rt->dst.expires) {
762                         ip_rt_put(rt);
763                         ret = NULL;
764                 }
765         }
766         return ret;
767 }
768
769 /*
770  * Algorithm:
771  *      1. The first ip_rt_redirect_number redirects are sent
772  *         with exponential backoff, then we stop sending them at all,
773  *         assuming that the host ignores our redirects.
774  *      2. If we did not see packets requiring redirects
775  *         during ip_rt_redirect_silence, we assume that the host
776  *         forgot redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784
/*
 * Send an ICMP host redirect for @skb, advertising the gateway of the
 * route attached to it, subject to the per-peer backoff described above.
 *
 * Nothing is sent when the route's device has no in_device or has sending
 * of redirects disabled.  If no inet_peer can be obtained for the packet's
 * source address, the redirect is sent without any rate limiting.
 */
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787         struct rtable *rt = skb_rtable(skb);
788         struct in_device *in_dev;
789         struct inet_peer *peer;
790         struct net *net;
791         int log_martians;
792
793         rcu_read_lock();
794         in_dev = __in_dev_get_rcu(rt->dst.dev);
795         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796                 rcu_read_unlock();
797                 return;
798         }
            /* sample the setting now; in_dev is not used after the unlock */
799         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800         rcu_read_unlock();
801
802         net = dev_net(rt->dst.dev);
803         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804         if (!peer) {
805                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
806                 return;
807         }
808
809         /* No redirected packets during ip_rt_redirect_silence;
810          * reset the algorithm.
811          */
812         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
813                 peer->rate_tokens = 0;
814
815         /* Too many ignored redirects; do not send anything
816          * set peer->rate_last to the last seen redirected packet.
817          */
818         if (peer->rate_tokens >= ip_rt_redirect_number) {
819                 peer->rate_last = jiffies;
820                 goto out_put_peer;
821         }
822
823         /* Check for load limit; set rate_last to the latest sent
824          * redirect.  The required gap doubles with every redirect
825          * already sent (ip_rt_redirect_load << rate_tokens).
826          */
826         if (peer->rate_tokens == 0 ||
827             time_after(jiffies,
828                        (peer->rate_last +
829                         (ip_rt_redirect_load << peer->rate_tokens)))) {
830                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
831                 peer->rate_last = jiffies;
832                 ++peer->rate_tokens;
833 #ifdef CONFIG_IP_ROUTE_VERBOSE
834                 if (log_martians &&
835                     peer->rate_tokens == ip_rt_redirect_number)
836                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
837                                              &ip_hdr(skb)->saddr, inet_iif(skb),
838                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
839 #endif
840         }
841 out_put_peer:
842         inet_putpeer(peer);
843 }
844
845 static int ip_error(struct sk_buff *skb)
846 {
847         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
848         struct rtable *rt = skb_rtable(skb);
849         struct inet_peer *peer;
850         unsigned long now;
851         struct net *net;
852         bool send;
853         int code;
854
855         net = dev_net(rt->dst.dev);
856         if (!IN_DEV_FORWARD(in_dev)) {
857                 switch (rt->dst.error) {
858                 case EHOSTUNREACH:
859                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
860                         break;
861
862                 case ENETUNREACH:
863                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
864                         break;
865                 }
866                 goto out;
867         }
868
869         switch (rt->dst.error) {
870         case EINVAL:
871         default:
872                 goto out;
873         case EHOSTUNREACH:
874                 code = ICMP_HOST_UNREACH;
875                 break;
876         case ENETUNREACH:
877                 code = ICMP_NET_UNREACH;
878                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
879                 break;
880         case EACCES:
881                 code = ICMP_PKT_FILTERED;
882                 break;
883         }
884
885         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
886
887         send = true;
888         if (peer) {
889                 now = jiffies;
890                 peer->rate_tokens += now - peer->rate_last;
891                 if (peer->rate_tokens > ip_rt_error_burst)
892                         peer->rate_tokens = ip_rt_error_burst;
893                 peer->rate_last = now;
894                 if (peer->rate_tokens >= ip_rt_error_cost)
895                         peer->rate_tokens -= ip_rt_error_cost;
896                 else
897                         send = false;
898                 inet_putpeer(peer);
899         }
900         if (send)
901                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
902
903 out:    kfree_skb(skb);
904         return 0;
905 }
906
907 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
908 {
909         struct dst_entry *dst = &rt->dst;
910         struct fib_result res;
911
912         if (dst->dev->mtu < mtu)
913                 return;
914
915         if (mtu < ip_rt_min_pmtu)
916                 mtu = ip_rt_min_pmtu;
917
918         if (!rt->rt_pmtu) {
919                 dst->obsolete = DST_OBSOLETE_KILL;
920         } else {
921                 rt->rt_pmtu = mtu;
922                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
923         }
924
925         rcu_read_lock();
926         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
927                 struct fib_nh *nh = &FIB_RES_NH(res);
928
929                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
930                                       jiffies + ip_rt_mtu_expires);
931         }
932         rcu_read_unlock();
933 }
934
935 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
936                               struct sk_buff *skb, u32 mtu)
937 {
938         struct rtable *rt = (struct rtable *) dst;
939         struct flowi4 fl4;
940
941         ip_rt_build_flow_key(&fl4, sk, skb);
942         __ip_rt_update_pmtu(rt, &fl4, mtu);
943 }
944
945 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
946                       int oif, u32 mark, u8 protocol, int flow_flags)
947 {
948         const struct iphdr *iph = (const struct iphdr *) skb->data;
949         struct flowi4 fl4;
950         struct rtable *rt;
951
952         __build_flow_key(&fl4, NULL, iph, oif,
953                          RT_TOS(iph->tos), protocol, mark, flow_flags);
954         rt = __ip_route_output_key(net, &fl4);
955         if (!IS_ERR(rt)) {
956                 __ip_rt_update_pmtu(rt, &fl4, mtu);
957                 ip_rt_put(rt);
958         }
959 }
960 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
961
962 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
963 {
964         const struct iphdr *iph = (const struct iphdr *) skb->data;
965         struct flowi4 fl4;
966         struct rtable *rt;
967
968         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
969         rt = __ip_route_output_key(sock_net(sk), &fl4);
970         if (!IS_ERR(rt)) {
971                 __ip_rt_update_pmtu(rt, &fl4, mtu);
972                 ip_rt_put(rt);
973         }
974 }
975 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
976
977 void ipv4_redirect(struct sk_buff *skb, struct net *net,
978                    int oif, u32 mark, u8 protocol, int flow_flags)
979 {
980         const struct iphdr *iph = (const struct iphdr *) skb->data;
981         struct flowi4 fl4;
982         struct rtable *rt;
983
984         __build_flow_key(&fl4, NULL, iph, oif,
985                          RT_TOS(iph->tos), protocol, mark, flow_flags);
986         rt = __ip_route_output_key(net, &fl4);
987         if (!IS_ERR(rt)) {
988                 __ip_do_redirect(rt, skb, &fl4, false);
989                 ip_rt_put(rt);
990         }
991 }
992 EXPORT_SYMBOL_GPL(ipv4_redirect);
993
994 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
995 {
996         const struct iphdr *iph = (const struct iphdr *) skb->data;
997         struct flowi4 fl4;
998         struct rtable *rt;
999
1000         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1001         rt = __ip_route_output_key(sock_net(sk), &fl4);
1002         if (!IS_ERR(rt)) {
1003                 __ip_do_redirect(rt, skb, &fl4, false);
1004                 ip_rt_put(rt);
1005         }
1006 }
1007 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1008
1009 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1010 {
1011         struct rtable *rt = (struct rtable *) dst;
1012
1013         /* All IPV4 dsts are created with ->obsolete set to the value
1014          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1015          * into this function always.
1016          *
1017          * When a PMTU/redirect information update invalidates a
1018          * route, this is indicated by setting obsolete to
1019          * DST_OBSOLETE_KILL.
1020          */
1021         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1022                 return NULL;
1023         return dst;
1024 }
1025
1026 static void ipv4_link_failure(struct sk_buff *skb)
1027 {
1028         struct rtable *rt;
1029
1030         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1031
1032         rt = skb_rtable(skb);
1033         if (rt)
1034                 dst_set_expires(&rt->dst, 0);
1035 }
1036
1037 static int ip_rt_bug(struct sk_buff *skb)
1038 {
1039         pr_debug("%s: %pI4 -> %pI4, %s\n",
1040                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1041                  skb->dev ? skb->dev->name : "?");
1042         kfree_skb(skb);
1043         WARN_ON(1);
1044         return 0;
1045 }
1046
1047 /*
1048    We do not cache source address of outgoing interface,
1049    because it is used only by IP RR, TS and SRR options,
1050    so that it out of fast path.
1051
1052    BTW remember: "addr" is allowed to be not aligned
1053    in IP options!
1054  */
1055
1056 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1057 {
1058         __be32 src;
1059
1060         if (rt_is_output_route(rt))
1061                 src = ip_hdr(skb)->saddr;
1062         else {
1063                 struct fib_result res;
1064                 struct flowi4 fl4;
1065                 struct iphdr *iph;
1066
1067                 iph = ip_hdr(skb);
1068
1069                 memset(&fl4, 0, sizeof(fl4));
1070                 fl4.daddr = iph->daddr;
1071                 fl4.saddr = iph->saddr;
1072                 fl4.flowi4_tos = RT_TOS(iph->tos);
1073                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1074                 fl4.flowi4_iif = skb->dev->ifindex;
1075                 fl4.flowi4_mark = skb->mark;
1076
1077                 rcu_read_lock();
1078                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1079                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1080                 else
1081                         src = inet_select_addr(rt->dst.dev,
1082                                                rt_nexthop(rt, iph->daddr),
1083                                                RT_SCOPE_UNIVERSE);
1084                 rcu_read_unlock();
1085         }
1086         memcpy(addr, &src, 4);
1087 }
1088
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently, and a half is only filled in when
 * it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        u32 *classid = &rt->dst.tclassid;

        if ((*classid & 0xFFFF) == 0)
                *classid |= tag & 0xFFFF;
        if ((*classid & 0xFFFF0000) == 0)
                *classid |= tag & 0xFFFF0000;
}
#endif
1098
1099 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1100 {
1101         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1102
1103         if (advmss == 0) {
1104                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1105                                ip_rt_min_advmss);
1106                 if (advmss > 65535 - 40)
1107                         advmss = 65535 - 40;
1108         }
1109         return advmss;
1110 }
1111
1112 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1113 {
1114         const struct rtable *rt = (const struct rtable *) dst;
1115         unsigned int mtu = rt->rt_pmtu;
1116
1117         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1118                 mtu = dst_metric_raw(dst, RTAX_MTU);
1119
1120         if (mtu && rt_is_output_route(rt))
1121                 return mtu;
1122
1123         mtu = dst->dev->mtu;
1124
1125         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1126                 if (rt->rt_gateway && mtu > 576)
1127                         mtu = 576;
1128         }
1129
1130         if (mtu > IP_MAX_MTU)
1131                 mtu = IP_MAX_MTU;
1132
1133         return mtu;
1134 }
1135
1136 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1137 {
1138         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1139         struct fib_nh_exception *fnhe;
1140         u32 hval;
1141
1142         if (!hash)
1143                 return NULL;
1144
1145         hval = fnhe_hashfun(daddr);
1146
1147         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1148              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1149                 if (fnhe->fnhe_daddr == daddr)
1150                         return fnhe;
1151         }
1152         return NULL;
1153 }
1154
1155 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1156                               __be32 daddr)
1157 {
1158         bool ret = false;
1159
1160         spin_lock_bh(&fnhe_lock);
1161
1162         if (daddr == fnhe->fnhe_daddr) {
1163                 struct rtable *orig;
1164
1165                 if (fnhe->fnhe_pmtu) {
1166                         unsigned long expires = fnhe->fnhe_expires;
1167                         unsigned long diff = expires - jiffies;
1168
1169                         if (time_before(jiffies, expires)) {
1170                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1171                                 dst_set_expires(&rt->dst, diff);
1172                         }
1173                 }
1174                 if (fnhe->fnhe_gw) {
1175                         rt->rt_flags |= RTCF_REDIRECTED;
1176                         rt->rt_gateway = fnhe->fnhe_gw;
1177                 }
1178
1179                 orig = rcu_dereference(fnhe->fnhe_rth);
1180                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1181                 if (orig)
1182                         rt_free(orig);
1183
1184                 fnhe->fnhe_stamp = jiffies;
1185                 ret = true;
1186         } else {
1187                 /* Routes we intend to cache in nexthop exception have
1188                  * the DST_NOCACHE bit clear.  However, if we are
1189                  * unsuccessful at storing this route into the cache
1190                  * we really need to set it.
1191                  */
1192                 rt->dst.flags |= DST_NOCACHE;
1193         }
1194         spin_unlock_bh(&fnhe_lock);
1195
1196         return ret;
1197 }
1198
1199 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1200 {
1201         struct rtable *orig, *prev, **p;
1202         bool ret = true;
1203
1204         if (rt_is_input_route(rt)) {
1205                 p = (struct rtable **)&nh->nh_rth_input;
1206         } else {
1207                 if (!nh->nh_pcpu_rth_output)
1208                         goto nocache;
1209                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1210         }
1211         orig = *p;
1212
1213         prev = cmpxchg(p, orig, rt);
1214         if (prev == orig) {
1215                 if (orig)
1216                         rt_free(orig);
1217         } else {
1218                 /* Routes we intend to cache in the FIB nexthop have
1219                  * the DST_NOCACHE bit clear.  However, if we are
1220                  * unsuccessful at storing this route into the cache
1221                  * we really need to set it.
1222                  */
1223 nocache:
1224                 rt->dst.flags |= DST_NOCACHE;
1225                 ret = false;
1226         }
1227
1228         return ret;
1229 }
1230
1231 static DEFINE_SPINLOCK(rt_uncached_lock);
1232 static LIST_HEAD(rt_uncached_list);
1233
1234 static void rt_add_uncached_list(struct rtable *rt)
1235 {
1236         spin_lock_bh(&rt_uncached_lock);
1237         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1238         spin_unlock_bh(&rt_uncached_lock);
1239 }
1240
1241 static void ipv4_dst_destroy(struct dst_entry *dst)
1242 {
1243         struct rtable *rt = (struct rtable *) dst;
1244
1245         if (!list_empty(&rt->rt_uncached)) {
1246                 spin_lock_bh(&rt_uncached_lock);
1247                 list_del(&rt->rt_uncached);
1248                 spin_unlock_bh(&rt_uncached_lock);
1249         }
1250 }
1251
1252 void rt_flush_dev(struct net_device *dev)
1253 {
1254         if (!list_empty(&rt_uncached_list)) {
1255                 struct net *net = dev_net(dev);
1256                 struct rtable *rt;
1257
1258                 spin_lock_bh(&rt_uncached_lock);
1259                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1260                         if (rt->dst.dev != dev)
1261                                 continue;
1262                         rt->dst.dev = net->loopback_dev;
1263                         dev_hold(rt->dst.dev);
1264                         dev_put(dev);
1265                 }
1266                 spin_unlock_bh(&rt_uncached_lock);
1267         }
1268 }
1269
1270 static bool rt_cache_valid(const struct rtable *rt)
1271 {
1272         return  rt &&
1273                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1274                 !rt_is_expired(rt);
1275 }
1276
1277 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1278                            const struct fib_result *res,
1279                            struct fib_nh_exception *fnhe,
1280                            struct fib_info *fi, u16 type, u32 itag)
1281 {
1282         bool cached = false;
1283
1284         if (fi) {
1285                 struct fib_nh *nh = &FIB_RES_NH(*res);
1286
1287                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1288                         rt->rt_gateway = nh->nh_gw;
1289                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291                 rt->dst.tclassid = nh->nh_tclassid;
1292 #endif
1293                 if (unlikely(fnhe))
1294                         cached = rt_bind_exception(rt, fnhe, daddr);
1295                 else if (!(rt->dst.flags & DST_NOCACHE))
1296                         cached = rt_cache_route(nh, rt);
1297         }
1298         if (unlikely(!cached))
1299                 rt_add_uncached_list(rt);
1300
1301 #ifdef CONFIG_IP_ROUTE_CLASSID
1302 #ifdef CONFIG_IP_MULTIPLE_TABLES
1303         set_class_tag(rt, res->tclassid);
1304 #endif
1305         set_class_tag(rt, itag);
1306 #endif
1307 }
1308
1309 static struct rtable *rt_dst_alloc(struct net_device *dev,
1310                                    bool nopolicy, bool noxfrm, bool will_cache)
1311 {
1312         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1313                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1314                          (nopolicy ? DST_NOPOLICY : 0) |
1315                          (noxfrm ? DST_NOXFRM : 0));
1316 }
1317
1318 /* called in rcu_read_lock() section */
1319 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1320                                 u8 tos, struct net_device *dev, int our)
1321 {
1322         struct rtable *rth;
1323         struct in_device *in_dev = __in_dev_get_rcu(dev);
1324         u32 itag = 0;
1325         int err;
1326
1327         /* Primary sanity checks. */
1328
1329         if (in_dev == NULL)
1330                 return -EINVAL;
1331
1332         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1333             skb->protocol != htons(ETH_P_IP))
1334                 goto e_inval;
1335
1336         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1337                 if (ipv4_is_loopback(saddr))
1338                         goto e_inval;
1339
1340         if (ipv4_is_zeronet(saddr)) {
1341                 if (!ipv4_is_local_multicast(daddr))
1342                         goto e_inval;
1343         } else {
1344                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1345                                           in_dev, &itag);
1346                 if (err < 0)
1347                         goto e_err;
1348         }
1349         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1350                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1351         if (!rth)
1352                 goto e_nobufs;
1353
1354 #ifdef CONFIG_IP_ROUTE_CLASSID
1355         rth->dst.tclassid = itag;
1356 #endif
1357         rth->dst.output = ip_rt_bug;
1358
1359         rth->rt_genid   = rt_genid(dev_net(dev));
1360         rth->rt_flags   = RTCF_MULTICAST;
1361         rth->rt_type    = RTN_MULTICAST;
1362         rth->rt_is_input= 1;
1363         rth->rt_iif     = 0;
1364         rth->rt_pmtu    = 0;
1365         rth->rt_gateway = 0;
1366         INIT_LIST_HEAD(&rth->rt_uncached);
1367         if (our) {
1368                 rth->dst.input= ip_local_deliver;
1369                 rth->rt_flags |= RTCF_LOCAL;
1370         }
1371
1372 #ifdef CONFIG_IP_MROUTE
1373         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1374                 rth->dst.input = ip_mr_input;
1375 #endif
1376         RT_CACHE_STAT_INC(in_slow_mc);
1377
1378         skb_dst_set(skb, &rth->dst);
1379         return 0;
1380
1381 e_nobufs:
1382         return -ENOBUFS;
1383 e_inval:
1384         return -EINVAL;
1385 e_err:
1386         return err;
1387 }
1388
1389
1390 static void ip_handle_martian_source(struct net_device *dev,
1391                                      struct in_device *in_dev,
1392                                      struct sk_buff *skb,
1393                                      __be32 daddr,
1394                                      __be32 saddr)
1395 {
1396         RT_CACHE_STAT_INC(in_martian_src);
1397 #ifdef CONFIG_IP_ROUTE_VERBOSE
1398         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1399                 /*
1400                  *      RFC1812 recommendation, if source is martian,
1401                  *      the only hint is MAC header.
1402                  */
1403                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1404                         &daddr, &saddr, dev->name);
1405                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1406                         print_hex_dump(KERN_WARNING, "ll header: ",
1407                                        DUMP_PREFIX_OFFSET, 16, 1,
1408                                        skb_mac_header(skb),
1409                                        dev->hard_header_len, true);
1410                 }
1411         }
1412 #endif
1413 }
1414
1415 /* called in rcu_read_lock() section */
1416 static int __mkroute_input(struct sk_buff *skb,
1417                            const struct fib_result *res,
1418                            struct in_device *in_dev,
1419                            __be32 daddr, __be32 saddr, u32 tos)
1420 {
1421         struct rtable *rth;
1422         int err;
1423         struct in_device *out_dev;
1424         unsigned int flags = 0;
1425         bool do_cache;
1426         u32 itag;
1427
1428         /* get a working reference to the output device */
1429         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1430         if (out_dev == NULL) {
1431                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1432                 return -EINVAL;
1433         }
1434
1435
1436         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1437                                   in_dev->dev, in_dev, &itag);
1438         if (err < 0) {
1439                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1440                                          saddr);
1441
1442                 goto cleanup;
1443         }
1444
1445         if (out_dev == in_dev && err &&
1446             (IN_DEV_SHARED_MEDIA(out_dev) ||
1447              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1448                 flags |= RTCF_DOREDIRECT;
1449
1450         if (skb->protocol != htons(ETH_P_IP)) {
1451                 /* Not IP (i.e. ARP). Do not create route, if it is
1452                  * invalid for proxy arp. DNAT routes are always valid.
1453                  *
1454                  * Proxy arp feature have been extended to allow, ARP
1455                  * replies back to the same interface, to support
1456                  * Private VLAN switch technologies. See arp.c.
1457                  */
1458                 if (out_dev == in_dev &&
1459                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1460                         err = -EINVAL;
1461                         goto cleanup;
1462                 }
1463         }
1464
1465         do_cache = false;
1466         if (res->fi) {
1467                 if (!itag) {
1468                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1469                         if (rt_cache_valid(rth)) {
1470                                 skb_dst_set_noref(skb, &rth->dst);
1471                                 goto out;
1472                         }
1473                         do_cache = true;
1474                 }
1475         }
1476
1477         rth = rt_dst_alloc(out_dev->dev,
1478                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1479                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1480         if (!rth) {
1481                 err = -ENOBUFS;
1482                 goto cleanup;
1483         }
1484
1485         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1486         rth->rt_flags = flags;
1487         rth->rt_type = res->type;
1488         rth->rt_is_input = 1;
1489         rth->rt_iif     = 0;
1490         rth->rt_pmtu    = 0;
1491         rth->rt_gateway = 0;
1492         INIT_LIST_HEAD(&rth->rt_uncached);
1493
1494         rth->dst.input = ip_forward;
1495         rth->dst.output = ip_output;
1496
1497         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1498         skb_dst_set(skb, &rth->dst);
1499 out:
1500         err = 0;
1501  cleanup:
1502         return err;
1503 }
1504
1505 static int ip_mkroute_input(struct sk_buff *skb,
1506                             struct fib_result *res,
1507                             const struct flowi4 *fl4,
1508                             struct in_device *in_dev,
1509                             __be32 daddr, __be32 saddr, u32 tos)
1510 {
1511 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1512         if (res->fi && res->fi->fib_nhs > 1)
1513                 fib_select_multipath(res);
1514 #endif
1515
1516         /* create a routing cache entry */
1517         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1518 }
1519
1520 /*
      *      Resolve the input route for a non-multicast packet received
      *      on @dev and attach it to @skb.
      *
      *      Returns 0 on success; -EINVAL when IP is disabled on @dev or
      *      an address is martian; -ENOBUFS when no route can be
      *      allocated; otherwise the negative error produced by
      *      fib_validate_source() or ip_mkroute_input().
      *
1521  *      NOTE. We drop all packets that have a local source
1522  *      address, because every properly looped-back packet must
1523  *      already have the correct destination attached by the output routine.
1524  *
1525  *      This approach solves two big problems:
1526  *      1. Non-simplex devices are handled properly.
1527  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1528  *      Called with rcu_read_lock() held.
1529  */
1530
1531 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1532                                u8 tos, struct net_device *dev)
1533 {
1534         struct fib_result res;
1535         struct in_device *in_dev = __in_dev_get_rcu(dev);
1536         struct flowi4   fl4;
1537         unsigned int    flags = 0;
1538         u32             itag = 0;
1539         struct rtable   *rth;
1540         int             err = -EINVAL;
1541         struct net    *net = dev_net(dev);
1542         bool do_cache;
1543
1544         /* IP on this device is disabled. */
1545
1546         if (!in_dev)
1547                 goto out;
1548
1549         /* Check for the weirdest martians, which cannot be detected
1550            by fib_lookup.
1551          */
1552
1553         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1554                 goto martian_source;
1555
             /* res.fi is cleared up front because brd_input/local_input can
              * be reached without fib_lookup() having filled in res.
              */
1556         res.fi = NULL;
1557         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1558                 goto brd_input;
1559
1560         /* Accept zero addresses only for limited broadcast;
1561          * I do not even know whether to fix it or not. Waiting for complaints :-)
1562          */
1563         if (ipv4_is_zeronet(saddr))
1564                 goto martian_source;
1565
1566         if (ipv4_is_zeronet(daddr))
1567                 goto martian_destination;
1568
1569         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1570          * and calls it at most once if daddr and/or saddr are loopback addresses.
1571          */
1572         if (ipv4_is_loopback(daddr)) {
1573                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574                         goto martian_destination;
1575         } else if (ipv4_is_loopback(saddr)) {
1576                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1577                         goto martian_source;
1578         }
1579
1580         /*
1581          *      Now we are ready to route the packet.
1582          */
1583         fl4.flowi4_oif = 0;
1584         fl4.flowi4_iif = dev->ifindex;
1585         fl4.flowi4_mark = skb->mark;
1586         fl4.flowi4_tos = tos;
1587         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1588         fl4.daddr = daddr;
1589         fl4.saddr = saddr;
1590         err = fib_lookup(net, &fl4, &res);
1591         if (err != 0)
1592                 goto no_route;
1593
1594         RT_CACHE_STAT_INC(in_slow_tot);
1595
1596         if (res.type == RTN_BROADCAST)
1597                 goto brd_input;
1598
1599         if (res.type == RTN_LOCAL) {
1600                 err = fib_validate_source(skb, saddr, daddr, tos,
1601                                           LOOPBACK_IFINDEX,
1602                                           dev, in_dev, &itag);
1603                 if (err < 0)
1604                         goto martian_source_keep_err;
1605                 goto local_input;
1606         }
1607
             /* Anything else would have to be forwarded. */
1608         if (!IN_DEV_FORWARD(in_dev))
1609                 goto no_route;
1610         if (res.type != RTN_UNICAST)
1611                 goto martian_destination;
1612
1613         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1614 out:    return err;
1615
1616 brd_input:
1617         if (skb->protocol != htons(ETH_P_IP))
1618                 goto e_inval;
1619
1620         if (!ipv4_is_zeronet(saddr)) {
1621                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1622                                           in_dev, &itag);
1623                 if (err < 0)
1624                         goto martian_source_keep_err;
1625         }
1626         flags |= RTCF_BROADCAST;
1627         res.type = RTN_BROADCAST;
1628         RT_CACHE_STAT_INC(in_brd);
1629
1630 local_input:
             /* Reuse the input route cached on the nexthop when there is a
              * fib_info, no class tag applies and the cached entry is still
              * valid; otherwise remember to cache the route built below.
              */
1631         do_cache = false;
1632         if (res.fi) {
1633                 if (!itag) {
1634                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1635                         if (rt_cache_valid(rth)) {
1636                                 skb_dst_set_noref(skb, &rth->dst);
1637                                 err = 0;
1638                                 goto out;
1639                         }
1640                         do_cache = true;
1641                 }
1642         }
1643
1644         rth = rt_dst_alloc(net->loopback_dev,
1645                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1646         if (!rth)
1647                 goto e_nobufs;
1648
1649         rth->dst.input= ip_local_deliver;
1650         rth->dst.output= ip_rt_bug;
1651 #ifdef CONFIG_IP_ROUTE_CLASSID
1652         rth->dst.tclassid = itag;
1653 #endif
1654
1655         rth->rt_genid = rt_genid(net);
1656         rth->rt_flags   = flags|RTCF_LOCAL;
1657         rth->rt_type    = res.type;
1658         rth->rt_is_input = 1;
1659         rth->rt_iif     = 0;
1660         rth->rt_pmtu    = 0;
1661         rth->rt_gateway = 0;
1662         INIT_LIST_HEAD(&rth->rt_uncached);
             /* Reached via no_route: the route hands packets to ip_error()
              * and records the negated err as dst.error.
              */
1663         if (res.type == RTN_UNREACHABLE) {
1664                 rth->dst.input= ip_error;
1665                 rth->dst.error= -err;
1666                 rth->rt_flags   &= ~RTCF_LOCAL;
1667         }
1668         if (do_cache)
1669                 rt_cache_route(&FIB_RES_NH(res), rth);
1670         skb_dst_set(skb, &rth->dst);
1671         err = 0;
1672         goto out;
1673
1674 no_route:
             /* Build an unreachable route through local_input; -ESRCH from
              * the lookup is reported as -ENETUNREACH.
              */
1675         RT_CACHE_STAT_INC(in_no_route);
1676         res.type = RTN_UNREACHABLE;
1677         if (err == -ESRCH)
1678                 err = -ENETUNREACH;
1679         goto local_input;
1680
1681         /*
1682          *      Do not cache martian addresses: they should be logged (RFC1812)
1683          */
1684 martian_destination:
1685         RT_CACHE_STAT_INC(in_martian_dst);
1686 #ifdef CONFIG_IP_ROUTE_VERBOSE
1687         if (IN_DEV_LOG_MARTIANS(in_dev))
1688                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1689                                      &daddr, &saddr, dev->name);
1690 #endif
1691
1692 e_inval:
1693         err = -EINVAL;
1694         goto out;
1695
1696 e_nobufs:
1697         err = -ENOBUFS;
1698         goto out;
1699
1700 martian_source:
1701         err = -EINVAL;
1702 martian_source_keep_err:
1703         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1704         goto out;
1705 }
1706
1707 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1708                          u8 tos, struct net_device *dev)
1709 {
1710         int res;
1711
1712         rcu_read_lock();
1713
1714         /* Multicast recognition logic is moved from route cache to here.
1715            The problem was that too many Ethernet cards have broken/missing
1716            hardware multicast filters :-( As result the host on multicasting
1717            network acquires a lot of useless route cache entries, sort of
1718            SDR messages from all the world. Now we try to get rid of them.
1719            Really, provided software IP multicast filter is organized
1720            reasonably (at least, hashed), it does not result in a slowdown
1721            comparing with route cache reject entries.
1722            Note, that multicast routers are not affected, because
1723            route cache entry is created eventually.
1724          */
1725         if (ipv4_is_multicast(daddr)) {
1726                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1727
1728                 if (in_dev) {
1729                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1730                                                   ip_hdr(skb)->protocol);
1731                         if (our
1732 #ifdef CONFIG_IP_MROUTE
1733                                 ||
1734                             (!ipv4_is_local_multicast(daddr) &&
1735                              IN_DEV_MFORWARD(in_dev))
1736 #endif
1737                            ) {
1738                                 int res = ip_route_input_mc(skb, daddr, saddr,
1739                                                             tos, dev, our);
1740                                 rcu_read_unlock();
1741                                 return res;
1742                         }
1743                 }
1744                 rcu_read_unlock();
1745                 return -EINVAL;
1746         }
1747         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1748         rcu_read_unlock();
1749         return res;
1750 }
1751 EXPORT_SYMBOL(ip_route_input_noref);
1752
1753 /* called with rcu_read_lock() */
1754 static struct rtable *__mkroute_output(const struct fib_result *res,
1755                                        const struct flowi4 *fl4, int orig_oif,
1756                                        struct net_device *dev_out,
1757                                        unsigned int flags)
1758 {
1759         struct fib_info *fi = res->fi;
1760         struct fib_nh_exception *fnhe;
1761         struct in_device *in_dev;
1762         u16 type = res->type;
1763         struct rtable *rth;
1764
1765         in_dev = __in_dev_get_rcu(dev_out);
1766         if (!in_dev)
1767                 return ERR_PTR(-EINVAL);
1768
1769         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1770                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1771                         return ERR_PTR(-EINVAL);
1772
1773         if (ipv4_is_lbcast(fl4->daddr))
1774                 type = RTN_BROADCAST;
1775         else if (ipv4_is_multicast(fl4->daddr))
1776                 type = RTN_MULTICAST;
1777         else if (ipv4_is_zeronet(fl4->daddr))
1778                 return ERR_PTR(-EINVAL);
1779
1780         if (dev_out->flags & IFF_LOOPBACK)
1781                 flags |= RTCF_LOCAL;
1782
1783         if (type == RTN_BROADCAST) {
1784                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1785                 fi = NULL;
1786         } else if (type == RTN_MULTICAST) {
1787                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1788                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1789                                      fl4->flowi4_proto))
1790                         flags &= ~RTCF_LOCAL;
1791                 /* If multicast route do not exist use
1792                  * default one, but do not gateway in this case.
1793                  * Yes, it is hack.
1794                  */
1795                 if (fi && res->prefixlen < 4)
1796                         fi = NULL;
1797         }
1798
1799         fnhe = NULL;
1800         if (fi) {
1801                 struct rtable __rcu **prth;
1802
1803                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1804                 if (fnhe)
1805                         prth = &fnhe->fnhe_rth;
1806                 else
1807                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1808                 rth = rcu_dereference(*prth);
1809                 if (rt_cache_valid(rth)) {
1810                         dst_hold(&rth->dst);
1811                         return rth;
1812                 }
1813         }
1814         rth = rt_dst_alloc(dev_out,
1815                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1816                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1817                            fi);
1818         if (!rth)
1819                 return ERR_PTR(-ENOBUFS);
1820
1821         rth->dst.output = ip_output;
1822
1823         rth->rt_genid = rt_genid(dev_net(dev_out));
1824         rth->rt_flags   = flags;
1825         rth->rt_type    = type;
1826         rth->rt_is_input = 0;
1827         rth->rt_iif     = orig_oif ? : 0;
1828         rth->rt_pmtu    = 0;
1829         rth->rt_gateway = 0;
1830         INIT_LIST_HEAD(&rth->rt_uncached);
1831
1832         RT_CACHE_STAT_INC(out_slow_tot);
1833
1834         if (flags & RTCF_LOCAL)
1835                 rth->dst.input = ip_local_deliver;
1836         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1837                 if (flags & RTCF_LOCAL &&
1838                     !(dev_out->flags & IFF_LOOPBACK)) {
1839                         rth->dst.output = ip_mc_output;
1840                         RT_CACHE_STAT_INC(out_slow_mc);
1841                 }
1842 #ifdef CONFIG_IP_MROUTE
1843                 if (type == RTN_MULTICAST) {
1844                         if (IN_DEV_MFORWARD(in_dev) &&
1845                             !ipv4_is_local_multicast(fl4->daddr)) {
1846                                 rth->dst.input = ip_mr_input;
1847                                 rth->dst.output = ip_mc_output;
1848                         }
1849                 }
1850 #endif
1851         }
1852
1853         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1854
1855         return rth;
1856 }
1857
1858 /*
1859  * Major route resolver routine.
1860  */
1861
1862 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1863 {
1864         struct net_device *dev_out = NULL;
1865         __u8 tos = RT_FL_TOS(fl4);
1866         unsigned int flags = 0;
1867         struct fib_result res;
1868         struct rtable *rth;
1869         int orig_oif;
1870
1871         res.tclassid    = 0;
1872         res.fi          = NULL;
1873         res.table       = NULL;
1874
1875         orig_oif = fl4->flowi4_oif;
1876
1877         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1878         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1879         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1880                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1881
1882         rcu_read_lock();
1883         if (fl4->saddr) {
1884                 rth = ERR_PTR(-EINVAL);
1885                 if (ipv4_is_multicast(fl4->saddr) ||
1886                     ipv4_is_lbcast(fl4->saddr) ||
1887                     ipv4_is_zeronet(fl4->saddr))
1888                         goto out;
1889
1890                 /* I removed check for oif == dev_out->oif here.
1891                    It was wrong for two reasons:
1892                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1893                       is assigned to multiple interfaces.
1894                    2. Moreover, we are allowed to send packets with saddr
1895                       of another iface. --ANK
1896                  */
1897
1898                 if (fl4->flowi4_oif == 0 &&
1899                     (ipv4_is_multicast(fl4->daddr) ||
1900                      ipv4_is_lbcast(fl4->daddr))) {
1901                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1902                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1903                         if (dev_out == NULL)
1904                                 goto out;
1905
1906                         /* Special hack: user can direct multicasts
1907                            and limited broadcast via necessary interface
1908                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1909                            This hack is not just for fun, it allows
1910                            vic,vat and friends to work.
1911                            They bind socket to loopback, set ttl to zero
1912                            and expect that it will work.
1913                            From the viewpoint of routing cache they are broken,
1914                            because we are not allowed to build multicast path
1915                            with loopback source addr (look, routing cache
1916                            cannot know, that ttl is zero, so that packet
1917                            will not leave this host and route is valid).
1918                            Luckily, this hack is good workaround.
1919                          */
1920
1921                         fl4->flowi4_oif = dev_out->ifindex;
1922                         goto make_route;
1923                 }
1924
1925                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1926                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1927                         if (!__ip_dev_find(net, fl4->saddr, false))
1928                                 goto out;
1929                 }
1930         }
1931
1932
1933         if (fl4->flowi4_oif) {
1934                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1935                 rth = ERR_PTR(-ENODEV);
1936                 if (dev_out == NULL)
1937                         goto out;
1938
1939                 /* RACE: Check return value of inet_select_addr instead. */
1940                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1941                         rth = ERR_PTR(-ENETUNREACH);
1942                         goto out;
1943                 }
1944                 if (ipv4_is_local_multicast(fl4->daddr) ||
1945                     ipv4_is_lbcast(fl4->daddr)) {
1946                         if (!fl4->saddr)
1947                                 fl4->saddr = inet_select_addr(dev_out, 0,
1948                                                               RT_SCOPE_LINK);
1949                         goto make_route;
1950                 }
1951                 if (fl4->saddr) {
1952                         if (ipv4_is_multicast(fl4->daddr))
1953                                 fl4->saddr = inet_select_addr(dev_out, 0,
1954                                                               fl4->flowi4_scope);
1955                         else if (!fl4->daddr)
1956                                 fl4->saddr = inet_select_addr(dev_out, 0,
1957                                                               RT_SCOPE_HOST);
1958                 }
1959         }
1960
1961         if (!fl4->daddr) {
1962                 fl4->daddr = fl4->saddr;
1963                 if (!fl4->daddr)
1964                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1965                 dev_out = net->loopback_dev;
1966                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1967                 res.type = RTN_LOCAL;
1968                 flags |= RTCF_LOCAL;
1969                 goto make_route;
1970         }
1971
1972         if (fib_lookup(net, fl4, &res)) {
1973                 res.fi = NULL;
1974                 res.table = NULL;
1975                 if (fl4->flowi4_oif) {
1976                         /* Apparently, routing tables are wrong. Assume,
1977                            that the destination is on link.
1978
1979                            WHY? DW.
1980                            Because we are allowed to send to iface
1981                            even if it has NO routes and NO assigned
1982                            addresses. When oif is specified, routing
1983                            tables are looked up with only one purpose:
1984                            to catch if destination is gatewayed, rather than
1985                            direct. Moreover, if MSG_DONTROUTE is set,
1986                            we send packet, ignoring both routing tables
1987                            and ifaddr state. --ANK
1988
1989
1990                            We could make it even if oif is unknown,
1991                            likely IPv6, but we do not.
1992                          */
1993
1994                         if (fl4->saddr == 0)
1995                                 fl4->saddr = inet_select_addr(dev_out, 0,
1996                                                               RT_SCOPE_LINK);
1997                         res.type = RTN_UNICAST;
1998                         goto make_route;
1999                 }
2000                 rth = ERR_PTR(-ENETUNREACH);
2001                 goto out;
2002         }
2003
2004         if (res.type == RTN_LOCAL) {
2005                 if (!fl4->saddr) {
2006                         if (res.fi->fib_prefsrc)
2007                                 fl4->saddr = res.fi->fib_prefsrc;
2008                         else
2009                                 fl4->saddr = fl4->daddr;
2010                 }
2011                 dev_out = net->loopback_dev;
2012                 fl4->flowi4_oif = dev_out->ifindex;
2013                 flags |= RTCF_LOCAL;
2014                 goto make_route;
2015         }
2016
2017 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2018         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2019                 fib_select_multipath(&res);
2020         else
2021 #endif
2022         if (!res.prefixlen &&
2023             res.table->tb_num_default > 1 &&
2024             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2025                 fib_select_default(&res);
2026
2027         if (!fl4->saddr)
2028                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2029
2030         dev_out = FIB_RES_DEV(res);
2031         fl4->flowi4_oif = dev_out->ifindex;
2032
2033
2034 make_route:
2035         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2036
2037 out:
2038         rcu_read_unlock();
2039         return rth;
2040 }
2041 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2042
2043 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2044 {
2045         return NULL;
2046 }
2047
2048 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2049 {
2050         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2051
2052         return mtu ? : dst->dev->mtu;
2053 }
2054
2055 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2056                                           struct sk_buff *skb, u32 mtu)
2057 {
2058 }
2059
/* ->redirect() for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
                                       struct sock *sk,
                                       struct sk_buff *skb)
{
        /* redirects are ignored */
}
2064
2065 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2066                                           unsigned long old)
2067 {
2068         return NULL;
2069 }
2070
2071 static struct dst_ops ipv4_dst_blackhole_ops = {
2072         .family                 =       AF_INET,
2073         .protocol               =       cpu_to_be16(ETH_P_IP),
2074         .check                  =       ipv4_blackhole_dst_check,
2075         .mtu                    =       ipv4_blackhole_mtu,
2076         .default_advmss         =       ipv4_default_advmss,
2077         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2078         .redirect               =       ipv4_rt_blackhole_redirect,
2079         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2080         .neigh_lookup           =       ipv4_neigh_lookup,
2081 };
2082
2083 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2084 {
2085         struct rtable *ort = (struct rtable *) dst_orig;
2086         struct rtable *rt;
2087
2088         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2089         if (rt) {
2090                 struct dst_entry *new = &rt->dst;
2091
2092                 new->__use = 1;
2093                 new->input = dst_discard;
2094                 new->output = dst_discard;
2095
2096                 new->dev = ort->dst.dev;
2097                 if (new->dev)
2098                         dev_hold(new->dev);
2099
2100                 rt->rt_is_input = ort->rt_is_input;
2101                 rt->rt_iif = ort->rt_iif;
2102                 rt->rt_pmtu = ort->rt_pmtu;
2103
2104                 rt->rt_genid = rt_genid(net);
2105                 rt->rt_flags = ort->rt_flags;
2106                 rt->rt_type = ort->rt_type;
2107                 rt->rt_gateway = ort->rt_gateway;
2108
2109                 INIT_LIST_HEAD(&rt->rt_uncached);
2110
2111                 dst_free(new);
2112         }
2113
2114         dst_release(dst_orig);
2115
2116         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2117 }
2118
2119 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2120                                     struct sock *sk)
2121 {
2122         struct rtable *rt = __ip_route_output_key(net, flp4);
2123
2124         if (IS_ERR(rt))
2125                 return rt;
2126
2127         if (flp4->flowi4_proto)
2128                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2129                                                    flowi4_to_flowi(flp4),
2130                                                    sk, 0);
2131
2132         return rt;
2133 }
2134 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2135
2136 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2137                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2138                         u32 seq, int event, int nowait, unsigned int flags)
2139 {
2140         struct rtable *rt = skb_rtable(skb);
2141         struct rtmsg *r;
2142         struct nlmsghdr *nlh;
2143         unsigned long expires = 0;
2144         u32 error;
2145         u32 metrics[RTAX_MAX];
2146
2147         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2148         if (nlh == NULL)
2149                 return -EMSGSIZE;
2150
2151         r = nlmsg_data(nlh);
2152         r->rtm_family    = AF_INET;
2153         r->rtm_dst_len  = 32;
2154         r->rtm_src_len  = 0;
2155         r->rtm_tos      = fl4->flowi4_tos;
2156         r->rtm_table    = RT_TABLE_MAIN;
2157         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2158                 goto nla_put_failure;
2159         r->rtm_type     = rt->rt_type;
2160         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2161         r->rtm_protocol = RTPROT_UNSPEC;
2162         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2163         if (rt->rt_flags & RTCF_NOTIFY)
2164                 r->rtm_flags |= RTM_F_NOTIFY;
2165
2166         if (nla_put_be32(skb, RTA_DST, dst))
2167                 goto nla_put_failure;
2168         if (src) {
2169                 r->rtm_src_len = 32;
2170                 if (nla_put_be32(skb, RTA_SRC, src))
2171                         goto nla_put_failure;
2172         }
2173         if (rt->dst.dev &&
2174             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2175                 goto nla_put_failure;
2176 #ifdef CONFIG_IP_ROUTE_CLASSID
2177         if (rt->dst.tclassid &&
2178             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2179                 goto nla_put_failure;
2180 #endif
2181         if (!rt_is_input_route(rt) &&
2182             fl4->saddr != src) {
2183                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2184                         goto nla_put_failure;
2185         }
2186         if (rt->rt_gateway &&
2187             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2188                 goto nla_put_failure;
2189
2190         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2191         if (rt->rt_pmtu)
2192                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2193         if (rtnetlink_put_metrics(skb, metrics) < 0)
2194                 goto nla_put_failure;
2195
2196         if (fl4->flowi4_mark &&
2197             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2198                 goto nla_put_failure;
2199
2200         error = rt->dst.error;
2201         expires = rt->dst.expires;
2202         if (expires) {
2203                 if (time_before(jiffies, expires))
2204                         expires -= jiffies;
2205                 else
2206                         expires = 0;
2207         }
2208
2209         if (rt_is_input_route(rt)) {
2210                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2211                         goto nla_put_failure;
2212         }
2213
2214         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2215                 goto nla_put_failure;
2216
2217         return nlmsg_end(skb, nlh);
2218
2219 nla_put_failure:
2220         nlmsg_cancel(skb, nlh);
2221         return -EMSGSIZE;
2222 }
2223
2224 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2225 {
2226         struct net *net = sock_net(in_skb->sk);
2227         struct rtmsg *rtm;
2228         struct nlattr *tb[RTA_MAX+1];
2229         struct rtable *rt = NULL;
2230         struct flowi4 fl4;
2231         __be32 dst = 0;
2232         __be32 src = 0;
2233         u32 iif;
2234         int err;
2235         int mark;
2236         struct sk_buff *skb;
2237
2238         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2239         if (err < 0)
2240                 goto errout;
2241
2242         rtm = nlmsg_data(nlh);
2243
2244         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2245         if (skb == NULL) {
2246                 err = -ENOBUFS;
2247                 goto errout;
2248         }
2249
2250         /* Reserve room for dummy headers, this skb can pass
2251            through good chunk of routing engine.
2252          */
2253         skb_reset_mac_header(skb);
2254         skb_reset_network_header(skb);
2255
2256         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2257         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2258         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2259
2260         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2261         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2262         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2263         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2264
2265         memset(&fl4, 0, sizeof(fl4));
2266         fl4.daddr = dst;
2267         fl4.saddr = src;
2268         fl4.flowi4_tos = rtm->rtm_tos;
2269         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2270         fl4.flowi4_mark = mark;
2271
2272         if (iif) {
2273                 struct net_device *dev;
2274
2275                 dev = __dev_get_by_index(net, iif);
2276                 if (dev == NULL) {
2277                         err = -ENODEV;
2278                         goto errout_free;
2279                 }
2280
2281                 skb->protocol   = htons(ETH_P_IP);
2282                 skb->dev        = dev;
2283                 skb->mark       = mark;
2284                 local_bh_disable();
2285                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2286                 local_bh_enable();
2287
2288                 rt = skb_rtable(skb);
2289                 if (err == 0 && rt->dst.error)
2290                         err = -rt->dst.error;
2291         } else {
2292                 rt = ip_route_output_key(net, &fl4);
2293
2294                 err = 0;
2295                 if (IS_ERR(rt))
2296                         err = PTR_ERR(rt);
2297         }
2298
2299         if (err)
2300                 goto errout_free;
2301
2302         skb_dst_set(skb, &rt->dst);
2303         if (rtm->rtm_flags & RTM_F_NOTIFY)
2304                 rt->rt_flags |= RTCF_NOTIFY;
2305
2306         err = rt_fill_info(net, dst, src, &fl4, skb,
2307                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2308                            RTM_NEWROUTE, 0, 0);
2309         if (err <= 0)
2310                 goto errout_free;
2311
2312         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2313 errout:
2314         return err;
2315
2316 errout_free:
2317         kfree_skb(skb);
2318         goto errout;
2319 }
2320
2321 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2322 {
2323         return skb->len;
2324 }
2325
2326 void ip_rt_multicast_event(struct in_device *in_dev)
2327 {
2328         rt_cache_flush(dev_net(in_dev->dev));
2329 }
2330
2331 #ifdef CONFIG_SYSCTL
2332 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2333                                         void __user *buffer,
2334                                         size_t *lenp, loff_t *ppos)
2335 {
2336         if (write) {
2337                 rt_cache_flush((struct net *)__ctl->extra1);
2338                 return 0;
2339         }
2340
2341         return -EINVAL;
2342 }
2343
2344 static ctl_table ipv4_route_table[] = {
2345         {
2346                 .procname       = "gc_thresh",
2347                 .data           = &ipv4_dst_ops.gc_thresh,
2348                 .maxlen         = sizeof(int),
2349                 .mode           = 0644,
2350                 .proc_handler   = proc_dointvec,
2351         },
2352         {
2353                 .procname       = "max_size",
2354                 .data           = &ip_rt_max_size,
2355                 .maxlen         = sizeof(int),
2356                 .mode           = 0644,
2357                 .proc_handler   = proc_dointvec,
2358         },
2359         {
2360                 /*  Deprecated. Use gc_min_interval_ms */
2361
2362                 .procname       = "gc_min_interval",
2363                 .data           = &ip_rt_gc_min_interval,
2364                 .maxlen         = sizeof(int),
2365                 .mode           = 0644,
2366                 .proc_handler   = proc_dointvec_jiffies,
2367         },
2368         {
2369                 .procname       = "gc_min_interval_ms",
2370                 .data           = &ip_rt_gc_min_interval,
2371                 .maxlen         = sizeof(int),
2372                 .mode           = 0644,
2373                 .proc_handler   = proc_dointvec_ms_jiffies,
2374         },
2375         {
2376                 .procname       = "gc_timeout",
2377                 .data           = &ip_rt_gc_timeout,
2378                 .maxlen         = sizeof(int),
2379                 .mode           = 0644,
2380                 .proc_handler   = proc_dointvec_jiffies,
2381         },
2382         {
2383                 .procname       = "gc_interval",
2384                 .data           = &ip_rt_gc_interval,
2385                 .maxlen         = sizeof(int),
2386                 .mode           = 0644,
2387                 .proc_handler   = proc_dointvec_jiffies,
2388         },
2389         {
2390                 .procname       = "redirect_load",
2391                 .data           = &ip_rt_redirect_load,
2392                 .maxlen         = sizeof(int),
2393                 .mode           = 0644,
2394                 .proc_handler   = proc_dointvec,
2395         },
2396         {
2397                 .procname       = "redirect_number",
2398                 .data           = &ip_rt_redirect_number,
2399                 .maxlen         = sizeof(int),
2400                 .mode           = 0644,
2401                 .proc_handler   = proc_dointvec,
2402         },
2403         {
2404                 .procname       = "redirect_silence",
2405                 .data           = &ip_rt_redirect_silence,
2406                 .maxlen         = sizeof(int),
2407                 .mode           = 0644,
2408                 .proc_handler   = proc_dointvec,
2409         },
2410         {
2411                 .procname       = "error_cost",
2412                 .data           = &ip_rt_error_cost,
2413                 .maxlen         = sizeof(int),
2414                 .mode           = 0644,
2415                 .proc_handler   = proc_dointvec,
2416         },
2417         {
2418                 .procname       = "error_burst",
2419                 .data           = &ip_rt_error_burst,
2420                 .maxlen         = sizeof(int),
2421                 .mode           = 0644,
2422                 .proc_handler   = proc_dointvec,
2423         },
2424         {
2425                 .procname       = "gc_elasticity",
2426                 .data           = &ip_rt_gc_elasticity,
2427                 .maxlen         = sizeof(int),
2428                 .mode           = 0644,
2429                 .proc_handler   = proc_dointvec,
2430         },
2431         {
2432                 .procname       = "mtu_expires",
2433                 .data           = &ip_rt_mtu_expires,
2434                 .maxlen         = sizeof(int),
2435                 .mode           = 0644,
2436                 .proc_handler   = proc_dointvec_jiffies,
2437         },
2438         {
2439                 .procname       = "min_pmtu",
2440                 .data           = &ip_rt_min_pmtu,
2441                 .maxlen         = sizeof(int),
2442                 .mode           = 0644,
2443                 .proc_handler   = proc_dointvec,
2444         },
2445         {
2446                 .procname       = "min_adv_mss",
2447                 .data           = &ip_rt_min_advmss,
2448                 .maxlen         = sizeof(int),
2449                 .mode           = 0644,
2450                 .proc_handler   = proc_dointvec,
2451         },
2452         { }
2453 };
2454
2455 static struct ctl_table ipv4_route_flush_table[] = {
2456         {
2457                 .procname       = "flush",
2458                 .maxlen         = sizeof(int),
2459                 .mode           = 0200,
2460                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2461         },
2462         { },
2463 };
2464
2465 static __net_init int sysctl_route_net_init(struct net *net)
2466 {
2467         struct ctl_table *tbl;
2468
2469         tbl = ipv4_route_flush_table;
2470         if (!net_eq(net, &init_net)) {
2471                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2472                 if (tbl == NULL)
2473                         goto err_dup;
2474         }
2475         tbl[0].extra1 = net;
2476
2477         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2478         if (net->ipv4.route_hdr == NULL)
2479                 goto err_reg;
2480         return 0;
2481
2482 err_reg:
2483         if (tbl != ipv4_route_flush_table)
2484                 kfree(tbl);
2485 err_dup:
2486         return -ENOMEM;
2487 }
2488
2489 static __net_exit void sysctl_route_net_exit(struct net *net)
2490 {
2491         struct ctl_table *tbl;
2492
2493         tbl = net->ipv4.route_hdr->ctl_table_arg;
2494         unregister_net_sysctl_table(net->ipv4.route_hdr);
2495         BUG_ON(tbl == ipv4_route_flush_table);
2496         kfree(tbl);
2497 }
2498
2499 static __net_initdata struct pernet_operations sysctl_route_ops = {
2500         .init = sysctl_route_net_init,
2501         .exit = sysctl_route_net_exit,
2502 };
2503 #endif
2504
2505 static __net_init int rt_genid_init(struct net *net)
2506 {
2507         atomic_set(&net->rt_genid, 0);
2508         get_random_bytes(&net->ipv4.dev_addr_genid,
2509                          sizeof(net->ipv4.dev_addr_genid));
2510         return 0;
2511 }
2512
2513 static __net_initdata struct pernet_operations rt_genid_ops = {
2514         .init = rt_genid_init,
2515 };
2516
2517 static int __net_init ipv4_inetpeer_init(struct net *net)
2518 {
2519         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2520
2521         if (!bp)
2522                 return -ENOMEM;
2523         inet_peer_base_init(bp);
2524         net->ipv4.peers = bp;
2525         return 0;
2526 }
2527
2528 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2529 {
2530         struct inet_peer_base *bp = net->ipv4.peers;
2531
2532         net->ipv4.peers = NULL;
2533         inetpeer_invalidate_tree(bp);
2534         kfree(bp);
2535 }
2536
2537 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2538         .init   =       ipv4_inetpeer_init,
2539         .exit   =       ipv4_inetpeer_exit,
2540 };
2541
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting array, allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2545
2546 int __init ip_rt_init(void)
2547 {
2548         int rc = 0;
2549
2550 #ifdef CONFIG_IP_ROUTE_CLASSID
2551         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2552         if (!ip_rt_acct)
2553                 panic("IP: failed to allocate ip_rt_acct\n");
2554 #endif
2555
2556         ipv4_dst_ops.kmem_cachep =
2557                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2558                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2559
2560         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2561
2562         if (dst_entries_init(&ipv4_dst_ops) < 0)
2563                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2564
2565         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2566                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2567
2568         ipv4_dst_ops.gc_thresh = ~0;
2569         ip_rt_max_size = INT_MAX;
2570
2571         devinet_init();
2572         ip_fib_init();
2573
2574         if (ip_rt_proc_init())
2575                 pr_err("Unable to create route proc files\n");
2576 #ifdef CONFIG_XFRM
2577         xfrm_init();
2578         xfrm4_init(ip_rt_max_size);
2579 #endif
2580         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2581
2582 #ifdef CONFIG_SYSCTL
2583         register_pernet_subsys(&sysctl_route_ops);
2584 #endif
2585         register_pernet_subsys(&rt_genid_ops);
2586         register_pernet_subsys(&ipv4_inetpeer_ops);
2587         return rc;
2588 }
2589
#ifdef CONFIG_SYSCTL
/*
 * Register the static route tunables for init_net.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route",
                            ipv4_route_table);
}
#endif