ipv4: Always invalidate or update the route on pmtu events
[linux-3.10.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/* dst_ops->ifdown handler: intentionally does nothing. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* nothing to do */
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160                                            struct sk_buff *skb,
161                                            const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208         if (*pos)
209                 return NULL;
210         return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215         ++*pos;
216         return NULL;
217 }
218
/* seq_file ->stop: nothing was acquired in ->start, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* empty on purpose */
}
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225         if (v == SEQ_START_TOKEN)
226                 seq_printf(seq, "%-127s\n",
227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229                            "HHUptod\tSpecDst");
230         return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234         .start  = rt_cache_seq_start,
235         .next   = rt_cache_seq_next,
236         .stop   = rt_cache_seq_stop,
237         .show   = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242         return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246         .owner   = THIS_MODULE,
247         .open    = rt_cache_seq_open,
248         .read    = seq_read,
249         .llseek  = seq_lseek,
250         .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256         int cpu;
257
258         if (*pos == 0)
259                 return SEQ_START_TOKEN;
260
261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262                 if (!cpu_possible(cpu))
263                         continue;
264                 *pos = cpu+1;
265                 return &per_cpu(rt_cache_stat, cpu);
266         }
267         return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272         int cpu;
273
274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275                 if (!cpu_possible(cpu))
276                         continue;
277                 *pos = cpu+1;
278                 return &per_cpu(rt_cache_stat, cpu);
279         }
280         return NULL;
281
282 }
283
/* seq_file ->stop: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* empty on purpose */
}
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291         struct rt_cache_stat *st = v;
292
293         if (v == SEQ_START_TOKEN) {
294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295                 return 0;
296         }
297
298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300                    dst_entries_get_slow(&ipv4_dst_ops),
301                    st->in_hit,
302                    st->in_slow_tot,
303                    st->in_slow_mc,
304                    st->in_no_route,
305                    st->in_brd,
306                    st->in_martian_dst,
307                    st->in_martian_src,
308
309                    st->out_hit,
310                    st->out_slow_tot,
311                    st->out_slow_mc,
312
313                    st->gc_total,
314                    st->gc_ignored,
315                    st->gc_goal_miss,
316                    st->gc_dst_overflow,
317                    st->in_hlist_search,
318                    st->out_hlist_search
319                 );
320         return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324         .start  = rt_cpu_seq_start,
325         .next   = rt_cpu_seq_next,
326         .stop   = rt_cpu_seq_stop,
327         .show   = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333         return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337         .owner   = THIS_MODULE,
338         .open    = rt_cpu_seq_open,
339         .read    = seq_read,
340         .llseek  = seq_lseek,
341         .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347         struct ip_rt_acct *dst, *src;
348         unsigned int i, j;
349
350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351         if (!dst)
352                 return -ENOMEM;
353
354         for_each_possible_cpu(i) {
355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356                 for (j = 0; j < 256; j++) {
357                         dst[j].o_bytes   += src[j].o_bytes;
358                         dst[j].o_packets += src[j].o_packets;
359                         dst[j].i_bytes   += src[j].i_bytes;
360                         dst[j].i_packets += src[j].i_packets;
361                 }
362         }
363
364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365         kfree(dst);
366         return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371         return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375         .owner          = THIS_MODULE,
376         .open           = rt_acct_proc_open,
377         .read           = seq_read,
378         .llseek         = seq_lseek,
379         .release        = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385         struct proc_dir_entry *pde;
386
387         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388                         &rt_cache_seq_fops);
389         if (!pde)
390                 goto err1;
391
392         pde = proc_create("rt_cache", S_IRUGO,
393                           net->proc_net_stat, &rt_cpu_seq_fops);
394         if (!pde)
395                 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399         if (!pde)
400                 goto err3;
401 #endif
402         return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406         remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409         remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411         return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416         remove_proc_entry("rt_cache", net->proc_net_stat);
417         remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419         remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
424         .init = ip_rt_do_proc_init,
425         .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430         return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444
/*
 * Bump the namespace's route generation id; routes carrying an older id
 * are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451                                            struct sk_buff *skb,
452                                            const void *daddr)
453 {
454         struct net_device *dev = dst->dev;
455         const __be32 *pkey = daddr;
456         const struct rtable *rt;
457         struct neighbour *n;
458
459         rt = (const struct rtable *) dst;
460         if (rt->rt_gateway)
461                 pkey = (const __be32 *) &rt->rt_gateway;
462         else if (skb)
463                 pkey = &ip_hdr(skb)->daddr;
464
465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466         if (n)
467                 return n;
468         return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However
473  * we still can generate some output.
474  * Random ID selection looks a bit dangerous because we have no chances to
475  * select ID being unique in a reasonable period of time.
476  * But broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480         static DEFINE_SPINLOCK(ip_fb_id_lock);
481         static u32 ip_fallback_id;
482         u32 salt;
483
484         spin_lock_bh(&ip_fb_id_lock);
485         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486         iph->id = htons(salt & 0xFFFF);
487         ip_fallback_id = salt;
488         spin_unlock_bh(&ip_fb_id_lock);
489 }
490
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493         struct net *net = dev_net(dst->dev);
494         struct inet_peer *peer;
495
496         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497         if (peer) {
498                 iph->id = htons(inet_getid(peer, more));
499                 inet_putpeer(peer);
500                 return;
501         }
502
503         ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508                              const struct iphdr *iph,
509                              int oif, u8 tos,
510                              u8 prot, u32 mark, int flow_flags)
511 {
512         if (sk) {
513                 const struct inet_sock *inet = inet_sk(sk);
514
515                 oif = sk->sk_bound_dev_if;
516                 mark = sk->sk_mark;
517                 tos = RT_CONN_FLAGS(sk);
518                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519         }
520         flowi4_init_output(fl4, oif, mark, tos,
521                            RT_SCOPE_UNIVERSE, prot,
522                            flow_flags,
523                            iph->daddr, iph->saddr, 0, 0);
524 }
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527                                const struct sock *sk)
528 {
529         const struct iphdr *iph = ip_hdr(skb);
530         int oif = skb->dev->ifindex;
531         u8 tos = RT_TOS(iph->tos);
532         u8 prot = iph->protocol;
533         u32 mark = skb->mark;
534
535         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540         const struct inet_sock *inet = inet_sk(sk);
541         const struct ip_options_rcu *inet_opt;
542         __be32 daddr = inet->inet_daddr;
543
544         rcu_read_lock();
545         inet_opt = rcu_dereference(inet->inet_opt);
546         if (inet_opt && inet_opt->opt.srr)
547                 daddr = inet_opt->opt.faddr;
548         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551                            inet_sk_flowi_flags(sk),
552                            daddr, inet->inet_saddr, 0, 0);
553         rcu_read_unlock();
554 }
555
/*
 * Build a flow key from the packet when one is available, otherwise
 * from the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
564
565 static inline void rt_free(struct rtable *rt)
566 {
567         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569
570 static DEFINE_SPINLOCK(fnhe_lock);
571
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574         struct fib_nh_exception *fnhe, *oldest;
575         struct rtable *orig;
576
577         oldest = rcu_dereference(hash->chain);
578         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579              fnhe = rcu_dereference(fnhe->fnhe_next)) {
580                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581                         oldest = fnhe;
582         }
583         orig = rcu_dereference(oldest->fnhe_rth);
584         if (orig) {
585                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586                 rt_free(orig);
587         }
588         return oldest;
589 }
590
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593         u32 hval;
594
595         hval = (__force u32) daddr;
596         hval ^= (hval >> 11) ^ (hval >> 22);
597
598         return hval & (FNHE_HASH_SIZE - 1);
599 }
600
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602                                   u32 pmtu, unsigned long expires)
603 {
604         struct fnhe_hash_bucket *hash;
605         struct fib_nh_exception *fnhe;
606         int depth;
607         u32 hval = fnhe_hashfun(daddr);
608
609         spin_lock_bh(&fnhe_lock);
610
611         hash = nh->nh_exceptions;
612         if (!hash) {
613                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614                 if (!hash)
615                         goto out_unlock;
616                 nh->nh_exceptions = hash;
617         }
618
619         hash += hval;
620
621         depth = 0;
622         for (fnhe = rcu_dereference(hash->chain); fnhe;
623              fnhe = rcu_dereference(fnhe->fnhe_next)) {
624                 if (fnhe->fnhe_daddr == daddr)
625                         break;
626                 depth++;
627         }
628
629         if (fnhe) {
630                 if (gw)
631                         fnhe->fnhe_gw = gw;
632                 if (pmtu) {
633                         fnhe->fnhe_pmtu = pmtu;
634                         fnhe->fnhe_expires = expires;
635                 }
636         } else {
637                 if (depth > FNHE_RECLAIM_DEPTH)
638                         fnhe = fnhe_oldest(hash);
639                 else {
640                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641                         if (!fnhe)
642                                 goto out_unlock;
643
644                         fnhe->fnhe_next = hash->chain;
645                         rcu_assign_pointer(hash->chain, fnhe);
646                 }
647                 fnhe->fnhe_daddr = daddr;
648                 fnhe->fnhe_gw = gw;
649                 fnhe->fnhe_pmtu = pmtu;
650                 fnhe->fnhe_expires = expires;
651         }
652
653         fnhe->fnhe_stamp = jiffies;
654
655 out_unlock:
656         spin_unlock_bh(&fnhe_lock);
657         return;
658 }
659
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661                              bool kill_route)
662 {
663         __be32 new_gw = icmp_hdr(skb)->un.gateway;
664         __be32 old_gw = ip_hdr(skb)->saddr;
665         struct net_device *dev = skb->dev;
666         struct in_device *in_dev;
667         struct fib_result res;
668         struct neighbour *n;
669         struct net *net;
670
671         switch (icmp_hdr(skb)->code & 7) {
672         case ICMP_REDIR_NET:
673         case ICMP_REDIR_NETTOS:
674         case ICMP_REDIR_HOST:
675         case ICMP_REDIR_HOSTTOS:
676                 break;
677
678         default:
679                 return;
680         }
681
682         if (rt->rt_gateway != old_gw)
683                 return;
684
685         in_dev = __in_dev_get_rcu(dev);
686         if (!in_dev)
687                 return;
688
689         net = dev_net(dev);
690         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692             ipv4_is_zeronet(new_gw))
693                 goto reject_redirect;
694
695         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697                         goto reject_redirect;
698                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699                         goto reject_redirect;
700         } else {
701                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702                         goto reject_redirect;
703         }
704
705         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706         if (n) {
707                 if (!(n->nud_state & NUD_VALID)) {
708                         neigh_event_send(n, NULL);
709                 } else {
710                         if (fib_lookup(net, fl4, &res) == 0) {
711                                 struct fib_nh *nh = &FIB_RES_NH(res);
712
713                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714                                                       0, 0);
715                         }
716                         if (kill_route)
717                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
718                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719                 }
720                 neigh_release(n);
721         }
722         return;
723
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726         if (IN_DEV_LOG_MARTIANS(in_dev)) {
727                 const struct iphdr *iph = (const struct iphdr *) skb->data;
728                 __be32 daddr = iph->daddr;
729                 __be32 saddr = iph->saddr;
730
731                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732                                      "  Advised path = %pI4 -> %pI4\n",
733                                      &old_gw, dev->name, &new_gw,
734                                      &saddr, &daddr);
735         }
736 #endif
737         ;
738 }
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742         struct rtable *rt;
743         struct flowi4 fl4;
744
745         rt = (struct rtable *) dst;
746
747         ip_rt_build_flow_key(&fl4, sk, skb);
748         __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753         struct rtable *rt = (struct rtable *)dst;
754         struct dst_entry *ret = dst;
755
756         if (rt) {
757                 if (dst->obsolete > 0) {
758                         ip_rt_put(rt);
759                         ret = NULL;
760                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761                            rt->dst.expires) {
762                         ip_rt_put(rt);
763                         ret = NULL;
764                 }
765         }
766         return ret;
767 }
768
769 /*
770  * Algorithm:
771  *      1. The first ip_rt_redirect_number redirects are sent
772  *         with exponential backoff, then we stop sending them at all,
773  *         assuming that the host ignores our redirects.
774  *      2. If we did not see packets requiring redirects
775  *         during ip_rt_redirect_silence, we assume that the host
776  *         forgot redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787         struct rtable *rt = skb_rtable(skb);
788         struct in_device *in_dev;
789         struct inet_peer *peer;
790         struct net *net;
791         int log_martians;
792
793         rcu_read_lock();
794         in_dev = __in_dev_get_rcu(rt->dst.dev);
795         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796                 rcu_read_unlock();
797                 return;
798         }
799         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800         rcu_read_unlock();
801
802         net = dev_net(rt->dst.dev);
803         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804         if (!peer) {
805                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
806                 return;
807         }
808
809         /* No redirected packets during ip_rt_redirect_silence;
810          * reset the algorithm.
811          */
812         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
813                 peer->rate_tokens = 0;
814
815         /* Too many ignored redirects; do not send anything
816          * set dst.rate_last to the last seen redirected packet.
817          */
818         if (peer->rate_tokens >= ip_rt_redirect_number) {
819                 peer->rate_last = jiffies;
820                 goto out_put_peer;
821         }
822
823         /* Check for load limit; set rate_last to the latest sent
824          * redirect.
825          */
826         if (peer->rate_tokens == 0 ||
827             time_after(jiffies,
828                        (peer->rate_last +
829                         (ip_rt_redirect_load << peer->rate_tokens)))) {
830                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
831                 peer->rate_last = jiffies;
832                 ++peer->rate_tokens;
833 #ifdef CONFIG_IP_ROUTE_VERBOSE
834                 if (log_martians &&
835                     peer->rate_tokens == ip_rt_redirect_number)
836                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
837                                              &ip_hdr(skb)->saddr, inet_iif(skb),
838                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
839 #endif
840         }
841 out_put_peer:
842         inet_putpeer(peer);
843 }
844
845 static int ip_error(struct sk_buff *skb)
846 {
847         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
848         struct rtable *rt = skb_rtable(skb);
849         struct inet_peer *peer;
850         unsigned long now;
851         struct net *net;
852         bool send;
853         int code;
854
855         net = dev_net(rt->dst.dev);
856         if (!IN_DEV_FORWARD(in_dev)) {
857                 switch (rt->dst.error) {
858                 case EHOSTUNREACH:
859                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
860                         break;
861
862                 case ENETUNREACH:
863                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
864                         break;
865                 }
866                 goto out;
867         }
868
869         switch (rt->dst.error) {
870         case EINVAL:
871         default:
872                 goto out;
873         case EHOSTUNREACH:
874                 code = ICMP_HOST_UNREACH;
875                 break;
876         case ENETUNREACH:
877                 code = ICMP_NET_UNREACH;
878                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
879                 break;
880         case EACCES:
881                 code = ICMP_PKT_FILTERED;
882                 break;
883         }
884
885         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
886
887         send = true;
888         if (peer) {
889                 now = jiffies;
890                 peer->rate_tokens += now - peer->rate_last;
891                 if (peer->rate_tokens > ip_rt_error_burst)
892                         peer->rate_tokens = ip_rt_error_burst;
893                 peer->rate_last = now;
894                 if (peer->rate_tokens >= ip_rt_error_cost)
895                         peer->rate_tokens -= ip_rt_error_cost;
896                 else
897                         send = false;
898                 inet_putpeer(peer);
899         }
900         if (send)
901                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
902
903 out:    kfree_skb(skb);
904         return 0;
905 }
906
907 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
908 {
909         struct dst_entry *dst = &rt->dst;
910         struct fib_result res;
911
912         if (mtu < ip_rt_min_pmtu)
913                 mtu = ip_rt_min_pmtu;
914
915         if (!rt->rt_pmtu) {
916                 dst->obsolete = DST_OBSOLETE_KILL;
917         } else {
918                 rt->rt_pmtu = mtu;
919                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
920         }
921
922         rcu_read_lock();
923         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
924                 struct fib_nh *nh = &FIB_RES_NH(res);
925
926                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
927                                       jiffies + ip_rt_mtu_expires);
928         }
929         rcu_read_unlock();
930 }
931
932 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
933                               struct sk_buff *skb, u32 mtu)
934 {
935         struct rtable *rt = (struct rtable *) dst;
936         struct flowi4 fl4;
937
938         ip_rt_build_flow_key(&fl4, sk, skb);
939         __ip_rt_update_pmtu(rt, &fl4, mtu);
940 }
941
942 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
943                       int oif, u32 mark, u8 protocol, int flow_flags)
944 {
945         const struct iphdr *iph = (const struct iphdr *) skb->data;
946         struct flowi4 fl4;
947         struct rtable *rt;
948
949         __build_flow_key(&fl4, NULL, iph, oif,
950                          RT_TOS(iph->tos), protocol, mark, flow_flags);
951         rt = __ip_route_output_key(net, &fl4);
952         if (!IS_ERR(rt)) {
953                 __ip_rt_update_pmtu(rt, &fl4, mtu);
954                 ip_rt_put(rt);
955         }
956 }
957 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
958
959 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
960 {
961         const struct iphdr *iph = (const struct iphdr *) skb->data;
962         struct flowi4 fl4;
963         struct rtable *rt;
964
965         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
966         rt = __ip_route_output_key(sock_net(sk), &fl4);
967         if (!IS_ERR(rt)) {
968                 __ip_rt_update_pmtu(rt, &fl4, mtu);
969                 ip_rt_put(rt);
970         }
971 }
972 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
973
974 void ipv4_redirect(struct sk_buff *skb, struct net *net,
975                    int oif, u32 mark, u8 protocol, int flow_flags)
976 {
977         const struct iphdr *iph = (const struct iphdr *) skb->data;
978         struct flowi4 fl4;
979         struct rtable *rt;
980
981         __build_flow_key(&fl4, NULL, iph, oif,
982                          RT_TOS(iph->tos), protocol, mark, flow_flags);
983         rt = __ip_route_output_key(net, &fl4);
984         if (!IS_ERR(rt)) {
985                 __ip_do_redirect(rt, skb, &fl4, false);
986                 ip_rt_put(rt);
987         }
988 }
989 EXPORT_SYMBOL_GPL(ipv4_redirect);
990
991 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
992 {
993         const struct iphdr *iph = (const struct iphdr *) skb->data;
994         struct flowi4 fl4;
995         struct rtable *rt;
996
997         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
998         rt = __ip_route_output_key(sock_net(sk), &fl4);
999         if (!IS_ERR(rt)) {
1000                 __ip_do_redirect(rt, skb, &fl4, false);
1001                 ip_rt_put(rt);
1002         }
1003 }
1004 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1005
1006 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1007 {
1008         struct rtable *rt = (struct rtable *) dst;
1009
1010         /* All IPV4 dsts are created with ->obsolete set to the value
1011          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1012          * into this function always.
1013          *
1014          * When a PMTU/redirect information update invalidates a
1015          * route, this is indicated by setting obsolete to
1016          * DST_OBSOLETE_KILL.
1017          */
1018         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1019                 return NULL;
1020         return dst;
1021 }
1022
1023 static void ipv4_link_failure(struct sk_buff *skb)
1024 {
1025         struct rtable *rt;
1026
1027         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1028
1029         rt = skb_rtable(skb);
1030         if (rt)
1031                 dst_set_expires(&rt->dst, 0);
1032 }
1033
1034 static int ip_rt_bug(struct sk_buff *skb)
1035 {
1036         pr_debug("%s: %pI4 -> %pI4, %s\n",
1037                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1038                  skb->dev ? skb->dev->name : "?");
1039         kfree_skb(skb);
1040         WARN_ON(1);
1041         return 0;
1042 }
1043
1044 /*
1045    We do not cache source address of outgoing interface,
1046    because it is used only by IP RR, TS and SRR options,
1047    so that it out of fast path.
1048
1049    BTW remember: "addr" is allowed to be not aligned
1050    in IP options!
1051  */
1052
1053 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1054 {
1055         __be32 src;
1056
1057         if (rt_is_output_route(rt))
1058                 src = ip_hdr(skb)->saddr;
1059         else {
1060                 struct fib_result res;
1061                 struct flowi4 fl4;
1062                 struct iphdr *iph;
1063
1064                 iph = ip_hdr(skb);
1065
1066                 memset(&fl4, 0, sizeof(fl4));
1067                 fl4.daddr = iph->daddr;
1068                 fl4.saddr = iph->saddr;
1069                 fl4.flowi4_tos = RT_TOS(iph->tos);
1070                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1071                 fl4.flowi4_iif = skb->dev->ifindex;
1072                 fl4.flowi4_mark = skb->mark;
1073
1074                 rcu_read_lock();
1075                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1076                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1077                 else
1078                         src = inet_select_addr(rt->dst.dev,
1079                                                rt_nexthop(rt, iph->daddr),
1080                                                RT_SCOPE_UNIVERSE);
1081                 rcu_read_unlock();
1082         }
1083         memcpy(addr, &src, 4);
1084 }
1085
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's classid without overriding what is already
 * there: the low and the high 16-bit halves are filled in independently,
 * and only when the corresponding half of dst.tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1095
1096 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1097 {
1098         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1099
1100         if (advmss == 0) {
1101                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1102                                ip_rt_min_advmss);
1103                 if (advmss > 65535 - 40)
1104                         advmss = 65535 - 40;
1105         }
1106         return advmss;
1107 }
1108
1109 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1110 {
1111         const struct rtable *rt = (const struct rtable *) dst;
1112         unsigned int mtu = rt->rt_pmtu;
1113
1114         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1115                 mtu = dst_metric_raw(dst, RTAX_MTU);
1116
1117         if (mtu && rt_is_output_route(rt))
1118                 return mtu;
1119
1120         mtu = dst->dev->mtu;
1121
1122         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1123                 if (rt->rt_gateway && mtu > 576)
1124                         mtu = 576;
1125         }
1126
1127         if (mtu > IP_MAX_MTU)
1128                 mtu = IP_MAX_MTU;
1129
1130         return mtu;
1131 }
1132
1133 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1134 {
1135         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1136         struct fib_nh_exception *fnhe;
1137         u32 hval;
1138
1139         if (!hash)
1140                 return NULL;
1141
1142         hval = fnhe_hashfun(daddr);
1143
1144         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1145              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1146                 if (fnhe->fnhe_daddr == daddr)
1147                         return fnhe;
1148         }
1149         return NULL;
1150 }
1151
1152 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1153                               __be32 daddr)
1154 {
1155         bool ret = false;
1156
1157         spin_lock_bh(&fnhe_lock);
1158
1159         if (daddr == fnhe->fnhe_daddr) {
1160                 struct rtable *orig;
1161
1162                 if (fnhe->fnhe_pmtu) {
1163                         unsigned long expires = fnhe->fnhe_expires;
1164                         unsigned long diff = expires - jiffies;
1165
1166                         if (time_before(jiffies, expires)) {
1167                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1168                                 dst_set_expires(&rt->dst, diff);
1169                         }
1170                 }
1171                 if (fnhe->fnhe_gw) {
1172                         rt->rt_flags |= RTCF_REDIRECTED;
1173                         rt->rt_gateway = fnhe->fnhe_gw;
1174                 }
1175
1176                 orig = rcu_dereference(fnhe->fnhe_rth);
1177                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1178                 if (orig)
1179                         rt_free(orig);
1180
1181                 fnhe->fnhe_stamp = jiffies;
1182                 ret = true;
1183         } else {
1184                 /* Routes we intend to cache in nexthop exception have
1185                  * the DST_NOCACHE bit clear.  However, if we are
1186                  * unsuccessful at storing this route into the cache
1187                  * we really need to set it.
1188                  */
1189                 rt->dst.flags |= DST_NOCACHE;
1190         }
1191         spin_unlock_bh(&fnhe_lock);
1192
1193         return ret;
1194 }
1195
1196 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1197 {
1198         struct rtable *orig, *prev, **p;
1199         bool ret = true;
1200
1201         if (rt_is_input_route(rt)) {
1202                 p = (struct rtable **)&nh->nh_rth_input;
1203         } else {
1204                 if (!nh->nh_pcpu_rth_output)
1205                         goto nocache;
1206                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1207         }
1208         orig = *p;
1209
1210         prev = cmpxchg(p, orig, rt);
1211         if (prev == orig) {
1212                 if (orig)
1213                         rt_free(orig);
1214         } else {
1215                 /* Routes we intend to cache in the FIB nexthop have
1216                  * the DST_NOCACHE bit clear.  However, if we are
1217                  * unsuccessful at storing this route into the cache
1218                  * we really need to set it.
1219                  */
1220 nocache:
1221                 rt->dst.flags |= DST_NOCACHE;
1222                 ret = false;
1223         }
1224
1225         return ret;
1226 }
1227
1228 static DEFINE_SPINLOCK(rt_uncached_lock);
1229 static LIST_HEAD(rt_uncached_list);
1230
1231 static void rt_add_uncached_list(struct rtable *rt)
1232 {
1233         spin_lock_bh(&rt_uncached_lock);
1234         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1235         spin_unlock_bh(&rt_uncached_lock);
1236 }
1237
1238 static void ipv4_dst_destroy(struct dst_entry *dst)
1239 {
1240         struct rtable *rt = (struct rtable *) dst;
1241
1242         if (!list_empty(&rt->rt_uncached)) {
1243                 spin_lock_bh(&rt_uncached_lock);
1244                 list_del(&rt->rt_uncached);
1245                 spin_unlock_bh(&rt_uncached_lock);
1246         }
1247 }
1248
1249 void rt_flush_dev(struct net_device *dev)
1250 {
1251         if (!list_empty(&rt_uncached_list)) {
1252                 struct net *net = dev_net(dev);
1253                 struct rtable *rt;
1254
1255                 spin_lock_bh(&rt_uncached_lock);
1256                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1257                         if (rt->dst.dev != dev)
1258                                 continue;
1259                         rt->dst.dev = net->loopback_dev;
1260                         dev_hold(rt->dst.dev);
1261                         dev_put(dev);
1262                 }
1263                 spin_unlock_bh(&rt_uncached_lock);
1264         }
1265 }
1266
1267 static bool rt_cache_valid(const struct rtable *rt)
1268 {
1269         return  rt &&
1270                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1271                 !rt_is_expired(rt);
1272 }
1273
1274 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1275                            const struct fib_result *res,
1276                            struct fib_nh_exception *fnhe,
1277                            struct fib_info *fi, u16 type, u32 itag)
1278 {
1279         bool cached = false;
1280
1281         if (fi) {
1282                 struct fib_nh *nh = &FIB_RES_NH(*res);
1283
1284                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1285                         rt->rt_gateway = nh->nh_gw;
1286                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1287 #ifdef CONFIG_IP_ROUTE_CLASSID
1288                 rt->dst.tclassid = nh->nh_tclassid;
1289 #endif
1290                 if (unlikely(fnhe))
1291                         cached = rt_bind_exception(rt, fnhe, daddr);
1292                 else if (!(rt->dst.flags & DST_NOCACHE))
1293                         cached = rt_cache_route(nh, rt);
1294         }
1295         if (unlikely(!cached))
1296                 rt_add_uncached_list(rt);
1297
1298 #ifdef CONFIG_IP_ROUTE_CLASSID
1299 #ifdef CONFIG_IP_MULTIPLE_TABLES
1300         set_class_tag(rt, res->tclassid);
1301 #endif
1302         set_class_tag(rt, itag);
1303 #endif
1304 }
1305
1306 static struct rtable *rt_dst_alloc(struct net_device *dev,
1307                                    bool nopolicy, bool noxfrm, bool will_cache)
1308 {
1309         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1310                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1311                          (nopolicy ? DST_NOPOLICY : 0) |
1312                          (noxfrm ? DST_NOXFRM : 0));
1313 }
1314
1315 /* called in rcu_read_lock() section */
1316 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1317                                 u8 tos, struct net_device *dev, int our)
1318 {
1319         struct rtable *rth;
1320         struct in_device *in_dev = __in_dev_get_rcu(dev);
1321         u32 itag = 0;
1322         int err;
1323
1324         /* Primary sanity checks. */
1325
1326         if (in_dev == NULL)
1327                 return -EINVAL;
1328
1329         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1330             skb->protocol != htons(ETH_P_IP))
1331                 goto e_inval;
1332
1333         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1334                 if (ipv4_is_loopback(saddr))
1335                         goto e_inval;
1336
1337         if (ipv4_is_zeronet(saddr)) {
1338                 if (!ipv4_is_local_multicast(daddr))
1339                         goto e_inval;
1340         } else {
1341                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1342                                           in_dev, &itag);
1343                 if (err < 0)
1344                         goto e_err;
1345         }
1346         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1347                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1348         if (!rth)
1349                 goto e_nobufs;
1350
1351 #ifdef CONFIG_IP_ROUTE_CLASSID
1352         rth->dst.tclassid = itag;
1353 #endif
1354         rth->dst.output = ip_rt_bug;
1355
1356         rth->rt_genid   = rt_genid(dev_net(dev));
1357         rth->rt_flags   = RTCF_MULTICAST;
1358         rth->rt_type    = RTN_MULTICAST;
1359         rth->rt_is_input= 1;
1360         rth->rt_iif     = 0;
1361         rth->rt_pmtu    = 0;
1362         rth->rt_gateway = 0;
1363         INIT_LIST_HEAD(&rth->rt_uncached);
1364         if (our) {
1365                 rth->dst.input= ip_local_deliver;
1366                 rth->rt_flags |= RTCF_LOCAL;
1367         }
1368
1369 #ifdef CONFIG_IP_MROUTE
1370         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1371                 rth->dst.input = ip_mr_input;
1372 #endif
1373         RT_CACHE_STAT_INC(in_slow_mc);
1374
1375         skb_dst_set(skb, &rth->dst);
1376         return 0;
1377
1378 e_nobufs:
1379         return -ENOBUFS;
1380 e_inval:
1381         return -EINVAL;
1382 e_err:
1383         return err;
1384 }
1385
1386
1387 static void ip_handle_martian_source(struct net_device *dev,
1388                                      struct in_device *in_dev,
1389                                      struct sk_buff *skb,
1390                                      __be32 daddr,
1391                                      __be32 saddr)
1392 {
1393         RT_CACHE_STAT_INC(in_martian_src);
1394 #ifdef CONFIG_IP_ROUTE_VERBOSE
1395         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1396                 /*
1397                  *      RFC1812 recommendation, if source is martian,
1398                  *      the only hint is MAC header.
1399                  */
1400                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1401                         &daddr, &saddr, dev->name);
1402                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1403                         print_hex_dump(KERN_WARNING, "ll header: ",
1404                                        DUMP_PREFIX_OFFSET, 16, 1,
1405                                        skb_mac_header(skb),
1406                                        dev->hard_header_len, true);
1407                 }
1408         }
1409 #endif
1410 }
1411
1412 /* called in rcu_read_lock() section */
1413 static int __mkroute_input(struct sk_buff *skb,
1414                            const struct fib_result *res,
1415                            struct in_device *in_dev,
1416                            __be32 daddr, __be32 saddr, u32 tos)
1417 {
1418         struct rtable *rth;
1419         int err;
1420         struct in_device *out_dev;
1421         unsigned int flags = 0;
1422         bool do_cache;
1423         u32 itag;
1424
1425         /* get a working reference to the output device */
1426         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1427         if (out_dev == NULL) {
1428                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1429                 return -EINVAL;
1430         }
1431
1432
1433         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1434                                   in_dev->dev, in_dev, &itag);
1435         if (err < 0) {
1436                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1437                                          saddr);
1438
1439                 goto cleanup;
1440         }
1441
1442         if (out_dev == in_dev && err &&
1443             (IN_DEV_SHARED_MEDIA(out_dev) ||
1444              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1445                 flags |= RTCF_DOREDIRECT;
1446
1447         if (skb->protocol != htons(ETH_P_IP)) {
1448                 /* Not IP (i.e. ARP). Do not create route, if it is
1449                  * invalid for proxy arp. DNAT routes are always valid.
1450                  *
1451                  * Proxy arp feature have been extended to allow, ARP
1452                  * replies back to the same interface, to support
1453                  * Private VLAN switch technologies. See arp.c.
1454                  */
1455                 if (out_dev == in_dev &&
1456                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1457                         err = -EINVAL;
1458                         goto cleanup;
1459                 }
1460         }
1461
1462         do_cache = false;
1463         if (res->fi) {
1464                 if (!itag) {
1465                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1466                         if (rt_cache_valid(rth)) {
1467                                 skb_dst_set_noref(skb, &rth->dst);
1468                                 goto out;
1469                         }
1470                         do_cache = true;
1471                 }
1472         }
1473
1474         rth = rt_dst_alloc(out_dev->dev,
1475                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1476                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1477         if (!rth) {
1478                 err = -ENOBUFS;
1479                 goto cleanup;
1480         }
1481
1482         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1483         rth->rt_flags = flags;
1484         rth->rt_type = res->type;
1485         rth->rt_is_input = 1;
1486         rth->rt_iif     = 0;
1487         rth->rt_pmtu    = 0;
1488         rth->rt_gateway = 0;
1489         INIT_LIST_HEAD(&rth->rt_uncached);
1490
1491         rth->dst.input = ip_forward;
1492         rth->dst.output = ip_output;
1493
1494         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1495         skb_dst_set(skb, &rth->dst);
1496 out:
1497         err = 0;
1498  cleanup:
1499         return err;
1500 }
1501
1502 static int ip_mkroute_input(struct sk_buff *skb,
1503                             struct fib_result *res,
1504                             const struct flowi4 *fl4,
1505                             struct in_device *in_dev,
1506                             __be32 daddr, __be32 saddr, u32 tos)
1507 {
1508 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1509         if (res->fi && res->fi->fib_nhs > 1)
1510                 fib_select_multipath(res);
1511 #endif
1512
1513         /* create a routing cache entry */
1514         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1515 }
1516
1517 /*
1518  *      NOTE. We drop all the packets that has local source
1519  *      addresses, because every properly looped back packet
1520  *      must have correct destination already attached by output routine.
1521  *
1522  *      Such approach solves two big problems:
1523  *      1. Not simplex devices are handled properly.
1524  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1525  *      called with rcu_read_lock()
1526  */
1527
1528 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1529                                u8 tos, struct net_device *dev)
1530 {
1531         struct fib_result res;
1532         struct in_device *in_dev = __in_dev_get_rcu(dev);
1533         struct flowi4   fl4;
1534         unsigned int    flags = 0;
1535         u32             itag = 0;
1536         struct rtable   *rth;
1537         int             err = -EINVAL;
1538         struct net    *net = dev_net(dev);
1539         bool do_cache;
1540
1541         /* IP on this device is disabled. */
1542
1543         if (!in_dev)
1544                 goto out;
1545
1546         /* Check for the most weird martians, which can be not detected
1547            by fib_lookup.
1548          */
1549
1550         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1551                 goto martian_source;
1552
1553         res.fi = NULL;
1554         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1555                 goto brd_input;
1556
1557         /* Accept zero addresses only to limited broadcast;
1558          * I even do not know to fix it or not. Waiting for complains :-)
1559          */
1560         if (ipv4_is_zeronet(saddr))
1561                 goto martian_source;
1562
1563         if (ipv4_is_zeronet(daddr))
1564                 goto martian_destination;
1565
1566         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1567          * and call it once if daddr or/and saddr are loopback addresses
1568          */
1569         if (ipv4_is_loopback(daddr)) {
1570                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1571                         goto martian_destination;
1572         } else if (ipv4_is_loopback(saddr)) {
1573                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574                         goto martian_source;
1575         }
1576
1577         /*
1578          *      Now we are ready to route packet.
1579          */
1580         fl4.flowi4_oif = 0;
1581         fl4.flowi4_iif = dev->ifindex;
1582         fl4.flowi4_mark = skb->mark;
1583         fl4.flowi4_tos = tos;
1584         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1585         fl4.daddr = daddr;
1586         fl4.saddr = saddr;
1587         err = fib_lookup(net, &fl4, &res);
1588         if (err != 0)
1589                 goto no_route;
1590
1591         RT_CACHE_STAT_INC(in_slow_tot);
1592
1593         if (res.type == RTN_BROADCAST)
1594                 goto brd_input;
1595
1596         if (res.type == RTN_LOCAL) {
1597                 err = fib_validate_source(skb, saddr, daddr, tos,
1598                                           LOOPBACK_IFINDEX,
1599                                           dev, in_dev, &itag);
1600                 if (err < 0)
1601                         goto martian_source_keep_err;
1602                 goto local_input;
1603         }
1604
1605         if (!IN_DEV_FORWARD(in_dev))
1606                 goto no_route;
1607         if (res.type != RTN_UNICAST)
1608                 goto martian_destination;
1609
1610         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1611 out:    return err;
1612
1613 brd_input:
1614         if (skb->protocol != htons(ETH_P_IP))
1615                 goto e_inval;
1616
1617         if (!ipv4_is_zeronet(saddr)) {
1618                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1619                                           in_dev, &itag);
1620                 if (err < 0)
1621                         goto martian_source_keep_err;
1622         }
1623         flags |= RTCF_BROADCAST;
1624         res.type = RTN_BROADCAST;
1625         RT_CACHE_STAT_INC(in_brd);
1626
1627 local_input:
1628         do_cache = false;
1629         if (res.fi) {
1630                 if (!itag) {
1631                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1632                         if (rt_cache_valid(rth)) {
1633                                 skb_dst_set_noref(skb, &rth->dst);
1634                                 err = 0;
1635                                 goto out;
1636                         }
1637                         do_cache = true;
1638                 }
1639         }
1640
1641         rth = rt_dst_alloc(net->loopback_dev,
1642                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1643         if (!rth)
1644                 goto e_nobufs;
1645
1646         rth->dst.input= ip_local_deliver;
1647         rth->dst.output= ip_rt_bug;
1648 #ifdef CONFIG_IP_ROUTE_CLASSID
1649         rth->dst.tclassid = itag;
1650 #endif
1651
1652         rth->rt_genid = rt_genid(net);
1653         rth->rt_flags   = flags|RTCF_LOCAL;
1654         rth->rt_type    = res.type;
1655         rth->rt_is_input = 1;
1656         rth->rt_iif     = 0;
1657         rth->rt_pmtu    = 0;
1658         rth->rt_gateway = 0;
1659         INIT_LIST_HEAD(&rth->rt_uncached);
1660         if (res.type == RTN_UNREACHABLE) {
1661                 rth->dst.input= ip_error;
1662                 rth->dst.error= -err;
1663                 rth->rt_flags   &= ~RTCF_LOCAL;
1664         }
1665         if (do_cache)
1666                 rt_cache_route(&FIB_RES_NH(res), rth);
1667         skb_dst_set(skb, &rth->dst);
1668         err = 0;
1669         goto out;
1670
1671 no_route:
1672         RT_CACHE_STAT_INC(in_no_route);
1673         res.type = RTN_UNREACHABLE;
1674         if (err == -ESRCH)
1675                 err = -ENETUNREACH;
1676         goto local_input;
1677
1678         /*
1679          *      Do not cache martian addresses: they should be logged (RFC1812)
1680          */
1681 martian_destination:
1682         RT_CACHE_STAT_INC(in_martian_dst);
1683 #ifdef CONFIG_IP_ROUTE_VERBOSE
1684         if (IN_DEV_LOG_MARTIANS(in_dev))
1685                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1686                                      &daddr, &saddr, dev->name);
1687 #endif
1688
1689 e_inval:
1690         err = -EINVAL;
1691         goto out;
1692
1693 e_nobufs:
1694         err = -ENOBUFS;
1695         goto out;
1696
1697 martian_source:
1698         err = -EINVAL;
1699 martian_source_keep_err:
1700         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1701         goto out;
1702 }
1703
1704 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1705                          u8 tos, struct net_device *dev)
1706 {
1707         int res;
1708
1709         rcu_read_lock();
1710
1711         /* Multicast recognition logic is moved from route cache to here.
1712            The problem was that too many Ethernet cards have broken/missing
1713            hardware multicast filters :-( As result the host on multicasting
1714            network acquires a lot of useless route cache entries, sort of
1715            SDR messages from all the world. Now we try to get rid of them.
1716            Really, provided software IP multicast filter is organized
1717            reasonably (at least, hashed), it does not result in a slowdown
1718            comparing with route cache reject entries.
1719            Note, that multicast routers are not affected, because
1720            route cache entry is created eventually.
1721          */
1722         if (ipv4_is_multicast(daddr)) {
1723                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1724
1725                 if (in_dev) {
1726                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1727                                                   ip_hdr(skb)->protocol);
1728                         if (our
1729 #ifdef CONFIG_IP_MROUTE
1730                                 ||
1731                             (!ipv4_is_local_multicast(daddr) &&
1732                              IN_DEV_MFORWARD(in_dev))
1733 #endif
1734                            ) {
1735                                 int res = ip_route_input_mc(skb, daddr, saddr,
1736                                                             tos, dev, our);
1737                                 rcu_read_unlock();
1738                                 return res;
1739                         }
1740                 }
1741                 rcu_read_unlock();
1742                 return -EINVAL;
1743         }
1744         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1745         rcu_read_unlock();
1746         return res;
1747 }
1748 EXPORT_SYMBOL(ip_route_input_noref);
1749
1750 /* called with rcu_read_lock() */
1751 static struct rtable *__mkroute_output(const struct fib_result *res,
1752                                        const struct flowi4 *fl4, int orig_oif,
1753                                        struct net_device *dev_out,
1754                                        unsigned int flags)
1755 {
1756         struct fib_info *fi = res->fi;
1757         struct fib_nh_exception *fnhe;
1758         struct in_device *in_dev;
1759         u16 type = res->type;
1760         struct rtable *rth;
1761
1762         in_dev = __in_dev_get_rcu(dev_out);
1763         if (!in_dev)
1764                 return ERR_PTR(-EINVAL);
1765
1766         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1767                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1768                         return ERR_PTR(-EINVAL);
1769
1770         if (ipv4_is_lbcast(fl4->daddr))
1771                 type = RTN_BROADCAST;
1772         else if (ipv4_is_multicast(fl4->daddr))
1773                 type = RTN_MULTICAST;
1774         else if (ipv4_is_zeronet(fl4->daddr))
1775                 return ERR_PTR(-EINVAL);
1776
1777         if (dev_out->flags & IFF_LOOPBACK)
1778                 flags |= RTCF_LOCAL;
1779
1780         if (type == RTN_BROADCAST) {
1781                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1782                 fi = NULL;
1783         } else if (type == RTN_MULTICAST) {
1784                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1785                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1786                                      fl4->flowi4_proto))
1787                         flags &= ~RTCF_LOCAL;
1788                 /* If multicast route do not exist use
1789                  * default one, but do not gateway in this case.
1790                  * Yes, it is hack.
1791                  */
1792                 if (fi && res->prefixlen < 4)
1793                         fi = NULL;
1794         }
1795
1796         fnhe = NULL;
1797         if (fi) {
1798                 struct rtable __rcu **prth;
1799
1800                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1801                 if (fnhe)
1802                         prth = &fnhe->fnhe_rth;
1803                 else
1804                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1805                 rth = rcu_dereference(*prth);
1806                 if (rt_cache_valid(rth)) {
1807                         dst_hold(&rth->dst);
1808                         return rth;
1809                 }
1810         }
1811         rth = rt_dst_alloc(dev_out,
1812                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1813                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1814                            fi);
1815         if (!rth)
1816                 return ERR_PTR(-ENOBUFS);
1817
1818         rth->dst.output = ip_output;
1819
1820         rth->rt_genid = rt_genid(dev_net(dev_out));
1821         rth->rt_flags   = flags;
1822         rth->rt_type    = type;
1823         rth->rt_is_input = 0;
1824         rth->rt_iif     = orig_oif ? : 0;
1825         rth->rt_pmtu    = 0;
1826         rth->rt_gateway = 0;
1827         INIT_LIST_HEAD(&rth->rt_uncached);
1828
1829         RT_CACHE_STAT_INC(out_slow_tot);
1830
1831         if (flags & RTCF_LOCAL)
1832                 rth->dst.input = ip_local_deliver;
1833         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1834                 if (flags & RTCF_LOCAL &&
1835                     !(dev_out->flags & IFF_LOOPBACK)) {
1836                         rth->dst.output = ip_mc_output;
1837                         RT_CACHE_STAT_INC(out_slow_mc);
1838                 }
1839 #ifdef CONFIG_IP_MROUTE
1840                 if (type == RTN_MULTICAST) {
1841                         if (IN_DEV_MFORWARD(in_dev) &&
1842                             !ipv4_is_local_multicast(fl4->daddr)) {
1843                                 rth->dst.input = ip_mr_input;
1844                                 rth->dst.output = ip_mc_output;
1845                         }
1846                 }
1847 #endif
1848         }
1849
1850         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1851
1852         return rth;
1853 }
1854
1855 /*
1856  * Major route resolver routine.
1857  */
1858
1859 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1860 {
1861         struct net_device *dev_out = NULL;
1862         __u8 tos = RT_FL_TOS(fl4);
1863         unsigned int flags = 0;
1864         struct fib_result res;
1865         struct rtable *rth;
1866         int orig_oif;
1867
1868         res.tclassid    = 0;
1869         res.fi          = NULL;
1870         res.table       = NULL;
1871
1872         orig_oif = fl4->flowi4_oif;
1873
1874         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1875         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1878
1879         rcu_read_lock();
1880         if (fl4->saddr) {
1881                 rth = ERR_PTR(-EINVAL);
1882                 if (ipv4_is_multicast(fl4->saddr) ||
1883                     ipv4_is_lbcast(fl4->saddr) ||
1884                     ipv4_is_zeronet(fl4->saddr))
1885                         goto out;
1886
1887                 /* I removed check for oif == dev_out->oif here.
1888                    It was wrong for two reasons:
1889                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1890                       is assigned to multiple interfaces.
1891                    2. Moreover, we are allowed to send packets with saddr
1892                       of another iface. --ANK
1893                  */
1894
1895                 if (fl4->flowi4_oif == 0 &&
1896                     (ipv4_is_multicast(fl4->daddr) ||
1897                      ipv4_is_lbcast(fl4->daddr))) {
1898                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1899                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1900                         if (dev_out == NULL)
1901                                 goto out;
1902
1903                         /* Special hack: user can direct multicasts
1904                            and limited broadcast via necessary interface
1905                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1906                            This hack is not just for fun, it allows
1907                            vic,vat and friends to work.
1908                            They bind socket to loopback, set ttl to zero
1909                            and expect that it will work.
1910                            From the viewpoint of routing cache they are broken,
1911                            because we are not allowed to build multicast path
1912                            with loopback source addr (look, routing cache
1913                            cannot know, that ttl is zero, so that packet
1914                            will not leave this host and route is valid).
1915                            Luckily, this hack is good workaround.
1916                          */
1917
1918                         fl4->flowi4_oif = dev_out->ifindex;
1919                         goto make_route;
1920                 }
1921
1922                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1923                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924                         if (!__ip_dev_find(net, fl4->saddr, false))
1925                                 goto out;
1926                 }
1927         }
1928
1929
1930         if (fl4->flowi4_oif) {
1931                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1932                 rth = ERR_PTR(-ENODEV);
1933                 if (dev_out == NULL)
1934                         goto out;
1935
1936                 /* RACE: Check return value of inet_select_addr instead. */
1937                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1938                         rth = ERR_PTR(-ENETUNREACH);
1939                         goto out;
1940                 }
1941                 if (ipv4_is_local_multicast(fl4->daddr) ||
1942                     ipv4_is_lbcast(fl4->daddr)) {
1943                         if (!fl4->saddr)
1944                                 fl4->saddr = inet_select_addr(dev_out, 0,
1945                                                               RT_SCOPE_LINK);
1946                         goto make_route;
1947                 }
1948                 if (fl4->saddr) {
1949                         if (ipv4_is_multicast(fl4->daddr))
1950                                 fl4->saddr = inet_select_addr(dev_out, 0,
1951                                                               fl4->flowi4_scope);
1952                         else if (!fl4->daddr)
1953                                 fl4->saddr = inet_select_addr(dev_out, 0,
1954                                                               RT_SCOPE_HOST);
1955                 }
1956         }
1957
1958         if (!fl4->daddr) {
1959                 fl4->daddr = fl4->saddr;
1960                 if (!fl4->daddr)
1961                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962                 dev_out = net->loopback_dev;
1963                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1964                 res.type = RTN_LOCAL;
1965                 flags |= RTCF_LOCAL;
1966                 goto make_route;
1967         }
1968
1969         if (fib_lookup(net, fl4, &res)) {
1970                 res.fi = NULL;
1971                 res.table = NULL;
1972                 if (fl4->flowi4_oif) {
1973                         /* Apparently, routing tables are wrong. Assume,
1974                            that the destination is on link.
1975
1976                            WHY? DW.
1977                            Because we are allowed to send to iface
1978                            even if it has NO routes and NO assigned
1979                            addresses. When oif is specified, routing
1980                            tables are looked up with only one purpose:
1981                            to catch if destination is gatewayed, rather than
1982                            direct. Moreover, if MSG_DONTROUTE is set,
1983                            we send packet, ignoring both routing tables
1984                            and ifaddr state. --ANK
1985
1986
1987                            We could make it even if oif is unknown,
1988                            likely IPv6, but we do not.
1989                          */
1990
1991                         if (fl4->saddr == 0)
1992                                 fl4->saddr = inet_select_addr(dev_out, 0,
1993                                                               RT_SCOPE_LINK);
1994                         res.type = RTN_UNICAST;
1995                         goto make_route;
1996                 }
1997                 rth = ERR_PTR(-ENETUNREACH);
1998                 goto out;
1999         }
2000
2001         if (res.type == RTN_LOCAL) {
2002                 if (!fl4->saddr) {
2003                         if (res.fi->fib_prefsrc)
2004                                 fl4->saddr = res.fi->fib_prefsrc;
2005                         else
2006                                 fl4->saddr = fl4->daddr;
2007                 }
2008                 dev_out = net->loopback_dev;
2009                 fl4->flowi4_oif = dev_out->ifindex;
2010                 flags |= RTCF_LOCAL;
2011                 goto make_route;
2012         }
2013
2014 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2015         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2016                 fib_select_multipath(&res);
2017         else
2018 #endif
2019         if (!res.prefixlen &&
2020             res.table->tb_num_default > 1 &&
2021             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2022                 fib_select_default(&res);
2023
2024         if (!fl4->saddr)
2025                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2026
2027         dev_out = FIB_RES_DEV(res);
2028         fl4->flowi4_oif = dev_out->ifindex;
2029
2030
2031 make_route:
2032         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2033
2034 out:
2035         rcu_read_unlock();
2036         return rth;
2037 }
2038 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2039
2040 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2041 {
2042         return NULL;
2043 }
2044
2045 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2046 {
2047         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2048
2049         return mtu ? : dst->dev->mtu;
2050 }
2051
2052 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2053                                           struct sk_buff *skb, u32 mtu)
2054 {
2055 }
2056
/* dst_ops->redirect for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2061
2062 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2063                                           unsigned long old)
2064 {
2065         return NULL;
2066 }
2067
2068 static struct dst_ops ipv4_dst_blackhole_ops = {
2069         .family                 =       AF_INET,
2070         .protocol               =       cpu_to_be16(ETH_P_IP),
2071         .check                  =       ipv4_blackhole_dst_check,
2072         .mtu                    =       ipv4_blackhole_mtu,
2073         .default_advmss         =       ipv4_default_advmss,
2074         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2075         .redirect               =       ipv4_rt_blackhole_redirect,
2076         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2077         .neigh_lookup           =       ipv4_neigh_lookup,
2078 };
2079
2080 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2081 {
2082         struct rtable *ort = (struct rtable *) dst_orig;
2083         struct rtable *rt;
2084
2085         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2086         if (rt) {
2087                 struct dst_entry *new = &rt->dst;
2088
2089                 new->__use = 1;
2090                 new->input = dst_discard;
2091                 new->output = dst_discard;
2092
2093                 new->dev = ort->dst.dev;
2094                 if (new->dev)
2095                         dev_hold(new->dev);
2096
2097                 rt->rt_is_input = ort->rt_is_input;
2098                 rt->rt_iif = ort->rt_iif;
2099                 rt->rt_pmtu = ort->rt_pmtu;
2100
2101                 rt->rt_genid = rt_genid(net);
2102                 rt->rt_flags = ort->rt_flags;
2103                 rt->rt_type = ort->rt_type;
2104                 rt->rt_gateway = ort->rt_gateway;
2105
2106                 INIT_LIST_HEAD(&rt->rt_uncached);
2107
2108                 dst_free(new);
2109         }
2110
2111         dst_release(dst_orig);
2112
2113         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2114 }
2115
2116 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2117                                     struct sock *sk)
2118 {
2119         struct rtable *rt = __ip_route_output_key(net, flp4);
2120
2121         if (IS_ERR(rt))
2122                 return rt;
2123
2124         if (flp4->flowi4_proto)
2125                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2126                                                    flowi4_to_flowi(flp4),
2127                                                    sk, 0);
2128
2129         return rt;
2130 }
2131 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2132
2133 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2134                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2135                         u32 seq, int event, int nowait, unsigned int flags)
2136 {
2137         struct rtable *rt = skb_rtable(skb);
2138         struct rtmsg *r;
2139         struct nlmsghdr *nlh;
2140         unsigned long expires = 0;
2141         u32 error;
2142         u32 metrics[RTAX_MAX];
2143
2144         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2145         if (nlh == NULL)
2146                 return -EMSGSIZE;
2147
2148         r = nlmsg_data(nlh);
2149         r->rtm_family    = AF_INET;
2150         r->rtm_dst_len  = 32;
2151         r->rtm_src_len  = 0;
2152         r->rtm_tos      = fl4->flowi4_tos;
2153         r->rtm_table    = RT_TABLE_MAIN;
2154         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2155                 goto nla_put_failure;
2156         r->rtm_type     = rt->rt_type;
2157         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2158         r->rtm_protocol = RTPROT_UNSPEC;
2159         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2160         if (rt->rt_flags & RTCF_NOTIFY)
2161                 r->rtm_flags |= RTM_F_NOTIFY;
2162
2163         if (nla_put_be32(skb, RTA_DST, dst))
2164                 goto nla_put_failure;
2165         if (src) {
2166                 r->rtm_src_len = 32;
2167                 if (nla_put_be32(skb, RTA_SRC, src))
2168                         goto nla_put_failure;
2169         }
2170         if (rt->dst.dev &&
2171             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2172                 goto nla_put_failure;
2173 #ifdef CONFIG_IP_ROUTE_CLASSID
2174         if (rt->dst.tclassid &&
2175             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2176                 goto nla_put_failure;
2177 #endif
2178         if (!rt_is_input_route(rt) &&
2179             fl4->saddr != src) {
2180                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2181                         goto nla_put_failure;
2182         }
2183         if (rt->rt_gateway &&
2184             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2185                 goto nla_put_failure;
2186
2187         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2188         if (rt->rt_pmtu)
2189                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2190         if (rtnetlink_put_metrics(skb, metrics) < 0)
2191                 goto nla_put_failure;
2192
2193         if (fl4->flowi4_mark &&
2194             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2195                 goto nla_put_failure;
2196
2197         error = rt->dst.error;
2198         expires = rt->dst.expires;
2199         if (expires) {
2200                 if (time_before(jiffies, expires))
2201                         expires -= jiffies;
2202                 else
2203                         expires = 0;
2204         }
2205
2206         if (rt_is_input_route(rt)) {
2207                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2208                         goto nla_put_failure;
2209         }
2210
2211         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2212                 goto nla_put_failure;
2213
2214         return nlmsg_end(skb, nlh);
2215
2216 nla_put_failure:
2217         nlmsg_cancel(skb, nlh);
2218         return -EMSGSIZE;
2219 }
2220
2221 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2222 {
2223         struct net *net = sock_net(in_skb->sk);
2224         struct rtmsg *rtm;
2225         struct nlattr *tb[RTA_MAX+1];
2226         struct rtable *rt = NULL;
2227         struct flowi4 fl4;
2228         __be32 dst = 0;
2229         __be32 src = 0;
2230         u32 iif;
2231         int err;
2232         int mark;
2233         struct sk_buff *skb;
2234
2235         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2236         if (err < 0)
2237                 goto errout;
2238
2239         rtm = nlmsg_data(nlh);
2240
2241         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2242         if (skb == NULL) {
2243                 err = -ENOBUFS;
2244                 goto errout;
2245         }
2246
2247         /* Reserve room for dummy headers, this skb can pass
2248            through good chunk of routing engine.
2249          */
2250         skb_reset_mac_header(skb);
2251         skb_reset_network_header(skb);
2252
2253         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2254         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2255         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2256
2257         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2258         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2259         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2260         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2261
2262         memset(&fl4, 0, sizeof(fl4));
2263         fl4.daddr = dst;
2264         fl4.saddr = src;
2265         fl4.flowi4_tos = rtm->rtm_tos;
2266         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2267         fl4.flowi4_mark = mark;
2268
2269         if (iif) {
2270                 struct net_device *dev;
2271
2272                 dev = __dev_get_by_index(net, iif);
2273                 if (dev == NULL) {
2274                         err = -ENODEV;
2275                         goto errout_free;
2276                 }
2277
2278                 skb->protocol   = htons(ETH_P_IP);
2279                 skb->dev        = dev;
2280                 skb->mark       = mark;
2281                 local_bh_disable();
2282                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2283                 local_bh_enable();
2284
2285                 rt = skb_rtable(skb);
2286                 if (err == 0 && rt->dst.error)
2287                         err = -rt->dst.error;
2288         } else {
2289                 rt = ip_route_output_key(net, &fl4);
2290
2291                 err = 0;
2292                 if (IS_ERR(rt))
2293                         err = PTR_ERR(rt);
2294         }
2295
2296         if (err)
2297                 goto errout_free;
2298
2299         skb_dst_set(skb, &rt->dst);
2300         if (rtm->rtm_flags & RTM_F_NOTIFY)
2301                 rt->rt_flags |= RTCF_NOTIFY;
2302
2303         err = rt_fill_info(net, dst, src, &fl4, skb,
2304                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2305                            RTM_NEWROUTE, 0, 0);
2306         if (err <= 0)
2307                 goto errout_free;
2308
2309         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2310 errout:
2311         return err;
2312
2313 errout_free:
2314         kfree_skb(skb);
2315         goto errout;
2316 }
2317
2318 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2319 {
2320         return skb->len;
2321 }
2322
2323 void ip_rt_multicast_event(struct in_device *in_dev)
2324 {
2325         rt_cache_flush(dev_net(in_dev->dev));
2326 }
2327
2328 #ifdef CONFIG_SYSCTL
2329 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2330                                         void __user *buffer,
2331                                         size_t *lenp, loff_t *ppos)
2332 {
2333         if (write) {
2334                 rt_cache_flush((struct net *)__ctl->extra1);
2335                 return 0;
2336         }
2337
2338         return -EINVAL;
2339 }
2340
2341 static ctl_table ipv4_route_table[] = {
2342         {
2343                 .procname       = "gc_thresh",
2344                 .data           = &ipv4_dst_ops.gc_thresh,
2345                 .maxlen         = sizeof(int),
2346                 .mode           = 0644,
2347                 .proc_handler   = proc_dointvec,
2348         },
2349         {
2350                 .procname       = "max_size",
2351                 .data           = &ip_rt_max_size,
2352                 .maxlen         = sizeof(int),
2353                 .mode           = 0644,
2354                 .proc_handler   = proc_dointvec,
2355         },
2356         {
2357                 /*  Deprecated. Use gc_min_interval_ms */
2358
2359                 .procname       = "gc_min_interval",
2360                 .data           = &ip_rt_gc_min_interval,
2361                 .maxlen         = sizeof(int),
2362                 .mode           = 0644,
2363                 .proc_handler   = proc_dointvec_jiffies,
2364         },
2365         {
2366                 .procname       = "gc_min_interval_ms",
2367                 .data           = &ip_rt_gc_min_interval,
2368                 .maxlen         = sizeof(int),
2369                 .mode           = 0644,
2370                 .proc_handler   = proc_dointvec_ms_jiffies,
2371         },
2372         {
2373                 .procname       = "gc_timeout",
2374                 .data           = &ip_rt_gc_timeout,
2375                 .maxlen         = sizeof(int),
2376                 .mode           = 0644,
2377                 .proc_handler   = proc_dointvec_jiffies,
2378         },
2379         {
2380                 .procname       = "gc_interval",
2381                 .data           = &ip_rt_gc_interval,
2382                 .maxlen         = sizeof(int),
2383                 .mode           = 0644,
2384                 .proc_handler   = proc_dointvec_jiffies,
2385         },
2386         {
2387                 .procname       = "redirect_load",
2388                 .data           = &ip_rt_redirect_load,
2389                 .maxlen         = sizeof(int),
2390                 .mode           = 0644,
2391                 .proc_handler   = proc_dointvec,
2392         },
2393         {
2394                 .procname       = "redirect_number",
2395                 .data           = &ip_rt_redirect_number,
2396                 .maxlen         = sizeof(int),
2397                 .mode           = 0644,
2398                 .proc_handler   = proc_dointvec,
2399         },
2400         {
2401                 .procname       = "redirect_silence",
2402                 .data           = &ip_rt_redirect_silence,
2403                 .maxlen         = sizeof(int),
2404                 .mode           = 0644,
2405                 .proc_handler   = proc_dointvec,
2406         },
2407         {
2408                 .procname       = "error_cost",
2409                 .data           = &ip_rt_error_cost,
2410                 .maxlen         = sizeof(int),
2411                 .mode           = 0644,
2412                 .proc_handler   = proc_dointvec,
2413         },
2414         {
2415                 .procname       = "error_burst",
2416                 .data           = &ip_rt_error_burst,
2417                 .maxlen         = sizeof(int),
2418                 .mode           = 0644,
2419                 .proc_handler   = proc_dointvec,
2420         },
2421         {
2422                 .procname       = "gc_elasticity",
2423                 .data           = &ip_rt_gc_elasticity,
2424                 .maxlen         = sizeof(int),
2425                 .mode           = 0644,
2426                 .proc_handler   = proc_dointvec,
2427         },
2428         {
2429                 .procname       = "mtu_expires",
2430                 .data           = &ip_rt_mtu_expires,
2431                 .maxlen         = sizeof(int),
2432                 .mode           = 0644,
2433                 .proc_handler   = proc_dointvec_jiffies,
2434         },
2435         {
2436                 .procname       = "min_pmtu",
2437                 .data           = &ip_rt_min_pmtu,
2438                 .maxlen         = sizeof(int),
2439                 .mode           = 0644,
2440                 .proc_handler   = proc_dointvec,
2441         },
2442         {
2443                 .procname       = "min_adv_mss",
2444                 .data           = &ip_rt_min_advmss,
2445                 .maxlen         = sizeof(int),
2446                 .mode           = 0644,
2447                 .proc_handler   = proc_dointvec,
2448         },
2449         { }
2450 };
2451
2452 static struct ctl_table ipv4_route_flush_table[] = {
2453         {
2454                 .procname       = "flush",
2455                 .maxlen         = sizeof(int),
2456                 .mode           = 0200,
2457                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2458         },
2459         { },
2460 };
2461
2462 static __net_init int sysctl_route_net_init(struct net *net)
2463 {
2464         struct ctl_table *tbl;
2465
2466         tbl = ipv4_route_flush_table;
2467         if (!net_eq(net, &init_net)) {
2468                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2469                 if (tbl == NULL)
2470                         goto err_dup;
2471         }
2472         tbl[0].extra1 = net;
2473
2474         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2475         if (net->ipv4.route_hdr == NULL)
2476                 goto err_reg;
2477         return 0;
2478
2479 err_reg:
2480         if (tbl != ipv4_route_flush_table)
2481                 kfree(tbl);
2482 err_dup:
2483         return -ENOMEM;
2484 }
2485
2486 static __net_exit void sysctl_route_net_exit(struct net *net)
2487 {
2488         struct ctl_table *tbl;
2489
2490         tbl = net->ipv4.route_hdr->ctl_table_arg;
2491         unregister_net_sysctl_table(net->ipv4.route_hdr);
2492         BUG_ON(tbl == ipv4_route_flush_table);
2493         kfree(tbl);
2494 }
2495
2496 static __net_initdata struct pernet_operations sysctl_route_ops = {
2497         .init = sysctl_route_net_init,
2498         .exit = sysctl_route_net_exit,
2499 };
2500 #endif
2501
2502 static __net_init int rt_genid_init(struct net *net)
2503 {
2504         atomic_set(&net->rt_genid, 0);
2505         get_random_bytes(&net->ipv4.dev_addr_genid,
2506                          sizeof(net->ipv4.dev_addr_genid));
2507         return 0;
2508 }
2509
2510 static __net_initdata struct pernet_operations rt_genid_ops = {
2511         .init = rt_genid_init,
2512 };
2513
2514 static int __net_init ipv4_inetpeer_init(struct net *net)
2515 {
2516         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2517
2518         if (!bp)
2519                 return -ENOMEM;
2520         inet_peer_base_init(bp);
2521         net->ipv4.peers = bp;
2522         return 0;
2523 }
2524
2525 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2526 {
2527         struct inet_peer_base *bp = net->ipv4.peers;
2528
2529         net->ipv4.peers = NULL;
2530         inetpeer_invalidate_tree(bp);
2531         kfree(bp);
2532 }
2533
2534 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2535         .init   =       ipv4_inetpeer_init,
2536         .exit   =       ipv4_inetpeer_exit,
2537 };
2538
2539 #ifdef CONFIG_IP_ROUTE_CLASSID
2540 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2541 #endif /* CONFIG_IP_ROUTE_CLASSID */
2542
2543 int __init ip_rt_init(void)
2544 {
2545         int rc = 0;
2546
2547 #ifdef CONFIG_IP_ROUTE_CLASSID
2548         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2549         if (!ip_rt_acct)
2550                 panic("IP: failed to allocate ip_rt_acct\n");
2551 #endif
2552
2553         ipv4_dst_ops.kmem_cachep =
2554                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2555                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2556
2557         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2558
2559         if (dst_entries_init(&ipv4_dst_ops) < 0)
2560                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2561
2562         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2563                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2564
2565         ipv4_dst_ops.gc_thresh = ~0;
2566         ip_rt_max_size = INT_MAX;
2567
2568         devinet_init();
2569         ip_fib_init();
2570
2571         if (ip_rt_proc_init())
2572                 pr_err("Unable to create route proc files\n");
2573 #ifdef CONFIG_XFRM
2574         xfrm_init();
2575         xfrm4_init(ip_rt_max_size);
2576 #endif
2577         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2578
2579 #ifdef CONFIG_SYSCTL
2580         register_pernet_subsys(&sysctl_route_ops);
2581 #endif
2582         register_pernet_subsys(&rt_genid_ops);
2583         register_pernet_subsys(&ipv4_inetpeer_ops);
2584         return rc;
2585 }
2586
2587 #ifdef CONFIG_SYSCTL
2588 /*
2589  * We really need to sanitize the damn ipv4 init order, then all
2590  * this nonsense will go away.
2591  */
2592 void __init ip_static_sysctl_init(void)
2593 {
2594         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2595 }
2596 #endif