24fd4c596643117faeca700748956ebe27541652
[linux-3.10.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/*
 * dst_ops->ifdown hook for IPv4 (see ipv4_dst_ops).  The body is empty:
 * no per-route work is done here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        /* nothing to do */
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160                                            struct sk_buff *skb,
161                                            const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 static inline int rt_genid(struct net *net)
206 {
207         return atomic_read(&net->ipv4.rt_genid);
208 }
209
210 #ifdef CONFIG_PROC_FS
211 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
212 {
213         if (*pos)
214                 return NULL;
215         return SEQ_START_TOKEN;
216 }
217
218 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
219 {
220         ++*pos;
221         return NULL;
222 }
223
/* seq_file ->stop: nothing was acquired in ->start, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        /* nothing to do */
}
227
228 static int rt_cache_seq_show(struct seq_file *seq, void *v)
229 {
230         if (v == SEQ_START_TOKEN)
231                 seq_printf(seq, "%-127s\n",
232                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
233                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
234                            "HHUptod\tSpecDst");
235         return 0;
236 }
237
238 static const struct seq_operations rt_cache_seq_ops = {
239         .start  = rt_cache_seq_start,
240         .next   = rt_cache_seq_next,
241         .stop   = rt_cache_seq_stop,
242         .show   = rt_cache_seq_show,
243 };
244
245 static int rt_cache_seq_open(struct inode *inode, struct file *file)
246 {
247         return seq_open(file, &rt_cache_seq_ops);
248 }
249
250 static const struct file_operations rt_cache_seq_fops = {
251         .owner   = THIS_MODULE,
252         .open    = rt_cache_seq_open,
253         .read    = seq_read,
254         .llseek  = seq_lseek,
255         .release = seq_release,
256 };
257
258
259 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
260 {
261         int cpu;
262
263         if (*pos == 0)
264                 return SEQ_START_TOKEN;
265
266         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
267                 if (!cpu_possible(cpu))
268                         continue;
269                 *pos = cpu+1;
270                 return &per_cpu(rt_cache_stat, cpu);
271         }
272         return NULL;
273 }
274
275 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 {
277         int cpu;
278
279         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
280                 if (!cpu_possible(cpu))
281                         continue;
282                 *pos = cpu+1;
283                 return &per_cpu(rt_cache_stat, cpu);
284         }
285         return NULL;
286
287 }
288
/* seq_file ->stop: the iterator holds no resources, so this is empty. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
        /* nothing to do */
}
293
294 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
295 {
296         struct rt_cache_stat *st = v;
297
298         if (v == SEQ_START_TOKEN) {
299                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
300                 return 0;
301         }
302
303         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
304                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
305                    dst_entries_get_slow(&ipv4_dst_ops),
306                    st->in_hit,
307                    st->in_slow_tot,
308                    st->in_slow_mc,
309                    st->in_no_route,
310                    st->in_brd,
311                    st->in_martian_dst,
312                    st->in_martian_src,
313
314                    st->out_hit,
315                    st->out_slow_tot,
316                    st->out_slow_mc,
317
318                    st->gc_total,
319                    st->gc_ignored,
320                    st->gc_goal_miss,
321                    st->gc_dst_overflow,
322                    st->in_hlist_search,
323                    st->out_hlist_search
324                 );
325         return 0;
326 }
327
328 static const struct seq_operations rt_cpu_seq_ops = {
329         .start  = rt_cpu_seq_start,
330         .next   = rt_cpu_seq_next,
331         .stop   = rt_cpu_seq_stop,
332         .show   = rt_cpu_seq_show,
333 };
334
335
336 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
337 {
338         return seq_open(file, &rt_cpu_seq_ops);
339 }
340
341 static const struct file_operations rt_cpu_seq_fops = {
342         .owner   = THIS_MODULE,
343         .open    = rt_cpu_seq_open,
344         .read    = seq_read,
345         .llseek  = seq_lseek,
346         .release = seq_release,
347 };
348
349 #ifdef CONFIG_IP_ROUTE_CLASSID
350 static int rt_acct_proc_show(struct seq_file *m, void *v)
351 {
352         struct ip_rt_acct *dst, *src;
353         unsigned int i, j;
354
355         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
356         if (!dst)
357                 return -ENOMEM;
358
359         for_each_possible_cpu(i) {
360                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
361                 for (j = 0; j < 256; j++) {
362                         dst[j].o_bytes   += src[j].o_bytes;
363                         dst[j].o_packets += src[j].o_packets;
364                         dst[j].i_bytes   += src[j].i_bytes;
365                         dst[j].i_packets += src[j].i_packets;
366                 }
367         }
368
369         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
370         kfree(dst);
371         return 0;
372 }
373
374 static int rt_acct_proc_open(struct inode *inode, struct file *file)
375 {
376         return single_open(file, rt_acct_proc_show, NULL);
377 }
378
379 static const struct file_operations rt_acct_proc_fops = {
380         .owner          = THIS_MODULE,
381         .open           = rt_acct_proc_open,
382         .read           = seq_read,
383         .llseek         = seq_lseek,
384         .release        = single_release,
385 };
386 #endif
387
388 static int __net_init ip_rt_do_proc_init(struct net *net)
389 {
390         struct proc_dir_entry *pde;
391
392         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
393                         &rt_cache_seq_fops);
394         if (!pde)
395                 goto err1;
396
397         pde = proc_create("rt_cache", S_IRUGO,
398                           net->proc_net_stat, &rt_cpu_seq_fops);
399         if (!pde)
400                 goto err2;
401
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
404         if (!pde)
405                 goto err3;
406 #endif
407         return 0;
408
409 #ifdef CONFIG_IP_ROUTE_CLASSID
410 err3:
411         remove_proc_entry("rt_cache", net->proc_net_stat);
412 #endif
413 err2:
414         remove_proc_entry("rt_cache", net->proc_net);
415 err1:
416         return -ENOMEM;
417 }
418
419 static void __net_exit ip_rt_do_proc_exit(struct net *net)
420 {
421         remove_proc_entry("rt_cache", net->proc_net_stat);
422         remove_proc_entry("rt_cache", net->proc_net);
423 #ifdef CONFIG_IP_ROUTE_CLASSID
424         remove_proc_entry("rt_acct", net->proc_net);
425 #endif
426 }
427
428 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
429         .init = ip_rt_do_proc_init,
430         .exit = ip_rt_do_proc_exit,
431 };
432
433 static int __init ip_rt_proc_init(void)
434 {
435         return register_pernet_subsys(&ip_rt_proc_ops);
436 }
437
438 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
        const int ok = 0;

        return ok;
}
443 #endif /* CONFIG_PROC_FS */
444
445 static inline bool rt_is_expired(const struct rtable *rth)
446 {
447         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
448 }
449
450 /*
451  * Perturbation of rt_genid by a small quantity [1..256]
452  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
453  * many times (2^24) without giving recent rt_genid.
454  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
455  */
456 static void rt_cache_invalidate(struct net *net)
457 {
458         unsigned char shuffle;
459
460         get_random_bytes(&shuffle, sizeof(shuffle));
461         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
462 }
463
/*
 * Invalidate the routing cache of @net by bumping its generation id.
 *
 * @delay is not used by this implementation: whatever its value, the
 * only action taken is rt_cache_invalidate().
 */
void rt_cache_flush(struct net *net, int delay)
{
        /* @delay intentionally not consulted, see above. */
        rt_cache_invalidate(net);
}
472
473 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
474                                            struct sk_buff *skb,
475                                            const void *daddr)
476 {
477         struct net_device *dev = dst->dev;
478         const __be32 *pkey = daddr;
479         const struct rtable *rt;
480         struct neighbour *n;
481
482         rt = (const struct rtable *) dst;
483         if (rt->rt_gateway)
484                 pkey = (const __be32 *) &rt->rt_gateway;
485         else if (skb)
486                 pkey = &ip_hdr(skb)->daddr;
487
488         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
489         if (n)
490                 return n;
491         return neigh_create(&arp_tbl, pkey, dev);
492 }
493
494 /*
495  * Peer allocation may fail only in serious out-of-memory conditions.  However
496  * we still can generate some output.
497  * Random ID selection looks a bit dangerous because we have no chances to
498  * select ID being unique in a reasonable period of time.
499  * But broken packet identifier may be better than no packet at all.
500  */
501 static void ip_select_fb_ident(struct iphdr *iph)
502 {
503         static DEFINE_SPINLOCK(ip_fb_id_lock);
504         static u32 ip_fallback_id;
505         u32 salt;
506
507         spin_lock_bh(&ip_fb_id_lock);
508         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
509         iph->id = htons(salt & 0xFFFF);
510         ip_fallback_id = salt;
511         spin_unlock_bh(&ip_fb_id_lock);
512 }
513
514 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
515 {
516         struct net *net = dev_net(dst->dev);
517         struct inet_peer *peer;
518
519         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
520         if (peer) {
521                 iph->id = htons(inet_getid(peer, more));
522                 inet_putpeer(peer);
523                 return;
524         }
525
526         ip_select_fb_ident(iph);
527 }
528 EXPORT_SYMBOL(__ip_select_ident);
529
530 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
531                              const struct iphdr *iph,
532                              int oif, u8 tos,
533                              u8 prot, u32 mark, int flow_flags)
534 {
535         if (sk) {
536                 const struct inet_sock *inet = inet_sk(sk);
537
538                 oif = sk->sk_bound_dev_if;
539                 mark = sk->sk_mark;
540                 tos = RT_CONN_FLAGS(sk);
541                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542         }
543         flowi4_init_output(fl4, oif, mark, tos,
544                            RT_SCOPE_UNIVERSE, prot,
545                            flow_flags,
546                            iph->daddr, iph->saddr, 0, 0);
547 }
548
549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
550                                const struct sock *sk)
551 {
552         const struct iphdr *iph = ip_hdr(skb);
553         int oif = skb->dev->ifindex;
554         u8 tos = RT_TOS(iph->tos);
555         u8 prot = iph->protocol;
556         u32 mark = skb->mark;
557
558         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
559 }
560
561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 {
563         const struct inet_sock *inet = inet_sk(sk);
564         const struct ip_options_rcu *inet_opt;
565         __be32 daddr = inet->inet_daddr;
566
567         rcu_read_lock();
568         inet_opt = rcu_dereference(inet->inet_opt);
569         if (inet_opt && inet_opt->opt.srr)
570                 daddr = inet_opt->opt.faddr;
571         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
572                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
573                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574                            inet_sk_flowi_flags(sk),
575                            daddr, inet->inet_saddr, 0, 0);
576         rcu_read_unlock();
577 }
578
/*
 * Build a flow key from @skb when one is supplied, otherwise from @sk
 * alone.  With no @skb, @sk is dereferenced and so must be non-NULL.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }
        build_skb_flow_key(fl4, skb, sk);
}
587
588 static inline void rt_free(struct rtable *rt)
589 {
590         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
591 }
592
593 static DEFINE_SPINLOCK(fnhe_lock);
594
595 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
596 {
597         struct fib_nh_exception *fnhe, *oldest;
598         struct rtable *orig;
599
600         oldest = rcu_dereference(hash->chain);
601         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
602              fnhe = rcu_dereference(fnhe->fnhe_next)) {
603                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
604                         oldest = fnhe;
605         }
606         orig = rcu_dereference(oldest->fnhe_rth);
607         if (orig) {
608                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
609                 rt_free(orig);
610         }
611         return oldest;
612 }
613
614 static inline u32 fnhe_hashfun(__be32 daddr)
615 {
616         u32 hval;
617
618         hval = (__force u32) daddr;
619         hval ^= (hval >> 11) ^ (hval >> 22);
620
621         return hval & (FNHE_HASH_SIZE - 1);
622 }
623
624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
625                                   u32 pmtu, unsigned long expires)
626 {
627         struct fnhe_hash_bucket *hash;
628         struct fib_nh_exception *fnhe;
629         int depth;
630         u32 hval = fnhe_hashfun(daddr);
631
632         spin_lock_bh(&fnhe_lock);
633
634         hash = nh->nh_exceptions;
635         if (!hash) {
636                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
637                 if (!hash)
638                         goto out_unlock;
639                 nh->nh_exceptions = hash;
640         }
641
642         hash += hval;
643
644         depth = 0;
645         for (fnhe = rcu_dereference(hash->chain); fnhe;
646              fnhe = rcu_dereference(fnhe->fnhe_next)) {
647                 if (fnhe->fnhe_daddr == daddr)
648                         break;
649                 depth++;
650         }
651
652         if (fnhe) {
653                 if (gw)
654                         fnhe->fnhe_gw = gw;
655                 if (pmtu) {
656                         fnhe->fnhe_pmtu = pmtu;
657                         fnhe->fnhe_expires = expires;
658                 }
659         } else {
660                 if (depth > FNHE_RECLAIM_DEPTH)
661                         fnhe = fnhe_oldest(hash);
662                 else {
663                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
664                         if (!fnhe)
665                                 goto out_unlock;
666
667                         fnhe->fnhe_next = hash->chain;
668                         rcu_assign_pointer(hash->chain, fnhe);
669                 }
670                 fnhe->fnhe_daddr = daddr;
671                 fnhe->fnhe_gw = gw;
672                 fnhe->fnhe_pmtu = pmtu;
673                 fnhe->fnhe_expires = expires;
674         }
675
676         fnhe->fnhe_stamp = jiffies;
677
678 out_unlock:
679         spin_unlock_bh(&fnhe_lock);
680         return;
681 }
682
683 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
684                              bool kill_route)
685 {
686         __be32 new_gw = icmp_hdr(skb)->un.gateway;
687         __be32 old_gw = ip_hdr(skb)->saddr;
688         struct net_device *dev = skb->dev;
689         struct in_device *in_dev;
690         struct fib_result res;
691         struct neighbour *n;
692         struct net *net;
693
694         switch (icmp_hdr(skb)->code & 7) {
695         case ICMP_REDIR_NET:
696         case ICMP_REDIR_NETTOS:
697         case ICMP_REDIR_HOST:
698         case ICMP_REDIR_HOSTTOS:
699                 break;
700
701         default:
702                 return;
703         }
704
705         if (rt->rt_gateway != old_gw)
706                 return;
707
708         in_dev = __in_dev_get_rcu(dev);
709         if (!in_dev)
710                 return;
711
712         net = dev_net(dev);
713         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
714             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
715             ipv4_is_zeronet(new_gw))
716                 goto reject_redirect;
717
718         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
719                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
720                         goto reject_redirect;
721                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
722                         goto reject_redirect;
723         } else {
724                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
725                         goto reject_redirect;
726         }
727
728         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
729         if (n) {
730                 if (!(n->nud_state & NUD_VALID)) {
731                         neigh_event_send(n, NULL);
732                 } else {
733                         if (fib_lookup(net, fl4, &res) == 0) {
734                                 struct fib_nh *nh = &FIB_RES_NH(res);
735
736                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
737                                                       0, 0);
738                         }
739                         if (kill_route)
740                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
741                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
742                 }
743                 neigh_release(n);
744         }
745         return;
746
747 reject_redirect:
748 #ifdef CONFIG_IP_ROUTE_VERBOSE
749         if (IN_DEV_LOG_MARTIANS(in_dev)) {
750                 const struct iphdr *iph = (const struct iphdr *) skb->data;
751                 __be32 daddr = iph->daddr;
752                 __be32 saddr = iph->saddr;
753
754                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
755                                      "  Advised path = %pI4 -> %pI4\n",
756                                      &old_gw, dev->name, &new_gw,
757                                      &saddr, &daddr);
758         }
759 #endif
760         ;
761 }
762
763 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
764 {
765         struct rtable *rt;
766         struct flowi4 fl4;
767
768         rt = (struct rtable *) dst;
769
770         ip_rt_build_flow_key(&fl4, sk, skb);
771         __ip_do_redirect(rt, skb, &fl4, true);
772 }
773
774 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
775 {
776         struct rtable *rt = (struct rtable *)dst;
777         struct dst_entry *ret = dst;
778
779         if (rt) {
780                 if (dst->obsolete > 0) {
781                         ip_rt_put(rt);
782                         ret = NULL;
783                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
784                            rt->dst.expires) {
785                         ip_rt_put(rt);
786                         ret = NULL;
787                 }
788         }
789         return ret;
790 }
791
792 /*
793  * Algorithm:
794  *      1. The first ip_rt_redirect_number redirects are sent
795  *         with exponential backoff, then we stop sending them at all,
796  *         assuming that the host ignores our redirects.
797  *      2. If we did not see packets requiring redirects
798  *         during ip_rt_redirect_silence, we assume that the host
799  *         forgot redirected route and start to send redirects again.
800  *
801  * This algorithm is much cheaper and more intelligent than dumb load limiting
802  * in icmp.c.
803  *
804  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
805  * and "frag. need" (breaks PMTU discovery) in icmp.c.
806  */
807
808 void ip_rt_send_redirect(struct sk_buff *skb)
809 {
810         struct rtable *rt = skb_rtable(skb);
811         struct in_device *in_dev;
812         struct inet_peer *peer;
813         struct net *net;
814         int log_martians;
815
816         rcu_read_lock();
817         in_dev = __in_dev_get_rcu(rt->dst.dev);
818         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
819                 rcu_read_unlock();
820                 return;
821         }
822         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
823         rcu_read_unlock();
824
825         net = dev_net(rt->dst.dev);
826         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
827         if (!peer) {
828                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
829                 return;
830         }
831
832         /* No redirected packets during ip_rt_redirect_silence;
833          * reset the algorithm.
834          */
835         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
836                 peer->rate_tokens = 0;
837
838         /* Too many ignored redirects; do not send anything
839          * set dst.rate_last to the last seen redirected packet.
840          */
841         if (peer->rate_tokens >= ip_rt_redirect_number) {
842                 peer->rate_last = jiffies;
843                 goto out_put_peer;
844         }
845
846         /* Check for load limit; set rate_last to the latest sent
847          * redirect.
848          */
849         if (peer->rate_tokens == 0 ||
850             time_after(jiffies,
851                        (peer->rate_last +
852                         (ip_rt_redirect_load << peer->rate_tokens)))) {
853                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
854                 peer->rate_last = jiffies;
855                 ++peer->rate_tokens;
856 #ifdef CONFIG_IP_ROUTE_VERBOSE
857                 if (log_martians &&
858                     peer->rate_tokens == ip_rt_redirect_number)
859                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
860                                              &ip_hdr(skb)->saddr, inet_iif(skb),
861                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
862 #endif
863         }
864 out_put_peer:
865         inet_putpeer(peer);
866 }
867
868 static int ip_error(struct sk_buff *skb)
869 {
870         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
871         struct rtable *rt = skb_rtable(skb);
872         struct inet_peer *peer;
873         unsigned long now;
874         struct net *net;
875         bool send;
876         int code;
877
878         net = dev_net(rt->dst.dev);
879         if (!IN_DEV_FORWARD(in_dev)) {
880                 switch (rt->dst.error) {
881                 case EHOSTUNREACH:
882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
883                         break;
884
885                 case ENETUNREACH:
886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
887                         break;
888                 }
889                 goto out;
890         }
891
892         switch (rt->dst.error) {
893         case EINVAL:
894         default:
895                 goto out;
896         case EHOSTUNREACH:
897                 code = ICMP_HOST_UNREACH;
898                 break;
899         case ENETUNREACH:
900                 code = ICMP_NET_UNREACH;
901                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
902                 break;
903         case EACCES:
904                 code = ICMP_PKT_FILTERED;
905                 break;
906         }
907
908         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
909
910         send = true;
911         if (peer) {
912                 now = jiffies;
913                 peer->rate_tokens += now - peer->rate_last;
914                 if (peer->rate_tokens > ip_rt_error_burst)
915                         peer->rate_tokens = ip_rt_error_burst;
916                 peer->rate_last = now;
917                 if (peer->rate_tokens >= ip_rt_error_cost)
918                         peer->rate_tokens -= ip_rt_error_cost;
919                 else
920                         send = false;
921                 inet_putpeer(peer);
922         }
923         if (send)
924                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
925
926 out:    kfree_skb(skb);
927         return 0;
928 }
929
930 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
931 {
932         struct fib_result res;
933
934         if (mtu < ip_rt_min_pmtu)
935                 mtu = ip_rt_min_pmtu;
936
937         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
938                 struct fib_nh *nh = &FIB_RES_NH(res);
939
940                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
941                                       jiffies + ip_rt_mtu_expires);
942         }
943         return mtu;
944 }
945
946 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
947                               struct sk_buff *skb, u32 mtu)
948 {
949         struct rtable *rt = (struct rtable *) dst;
950         struct flowi4 fl4;
951
952         ip_rt_build_flow_key(&fl4, sk, skb);
953         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
954
955         if (!rt->rt_pmtu) {
956                 dst->obsolete = DST_OBSOLETE_KILL;
957         } else {
958                 rt->rt_pmtu = mtu;
959                 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
960         }
961 }
962
963 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
964                       int oif, u32 mark, u8 protocol, int flow_flags)
965 {
966         const struct iphdr *iph = (const struct iphdr *) skb->data;
967         struct flowi4 fl4;
968         struct rtable *rt;
969
970         __build_flow_key(&fl4, NULL, iph, oif,
971                          RT_TOS(iph->tos), protocol, mark, flow_flags);
972         rt = __ip_route_output_key(net, &fl4);
973         if (!IS_ERR(rt)) {
974                 __ip_rt_update_pmtu(rt, &fl4, mtu);
975                 ip_rt_put(rt);
976         }
977 }
978 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
979
980 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
981 {
982         const struct iphdr *iph = (const struct iphdr *) skb->data;
983         struct flowi4 fl4;
984         struct rtable *rt;
985
986         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
987         rt = __ip_route_output_key(sock_net(sk), &fl4);
988         if (!IS_ERR(rt)) {
989                 __ip_rt_update_pmtu(rt, &fl4, mtu);
990                 ip_rt_put(rt);
991         }
992 }
993 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
994
995 void ipv4_redirect(struct sk_buff *skb, struct net *net,
996                    int oif, u32 mark, u8 protocol, int flow_flags)
997 {
998         const struct iphdr *iph = (const struct iphdr *) skb->data;
999         struct flowi4 fl4;
1000         struct rtable *rt;
1001
1002         __build_flow_key(&fl4, NULL, iph, oif,
1003                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1004         rt = __ip_route_output_key(net, &fl4);
1005         if (!IS_ERR(rt)) {
1006                 __ip_do_redirect(rt, skb, &fl4, false);
1007                 ip_rt_put(rt);
1008         }
1009 }
1010 EXPORT_SYMBOL_GPL(ipv4_redirect);
1011
1012 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1013 {
1014         const struct iphdr *iph = (const struct iphdr *) skb->data;
1015         struct flowi4 fl4;
1016         struct rtable *rt;
1017
1018         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1019         rt = __ip_route_output_key(sock_net(sk), &fl4);
1020         if (!IS_ERR(rt)) {
1021                 __ip_do_redirect(rt, skb, &fl4, false);
1022                 ip_rt_put(rt);
1023         }
1024 }
1025 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1026
1027 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1028 {
1029         struct rtable *rt = (struct rtable *) dst;
1030
1031         /* All IPV4 dsts are created with ->obsolete set to the value
1032          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1033          * into this function always.
1034          *
1035          * When a PMTU/redirect information update invalidates a
1036          * route, this is indicated by setting obsolete to
1037          * DST_OBSOLETE_KILL.
1038          */
1039         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1040                 return NULL;
1041         return dst;
1042 }
1043
1044 static void ipv4_link_failure(struct sk_buff *skb)
1045 {
1046         struct rtable *rt;
1047
1048         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1049
1050         rt = skb_rtable(skb);
1051         if (rt)
1052                 dst_set_expires(&rt->dst, 0);
1053 }
1054
1055 static int ip_rt_bug(struct sk_buff *skb)
1056 {
1057         pr_debug("%s: %pI4 -> %pI4, %s\n",
1058                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1059                  skb->dev ? skb->dev->name : "?");
1060         kfree_skb(skb);
1061         WARN_ON(1);
1062         return 0;
1063 }
1064
1065 /*
1066    We do not cache source address of outgoing interface,
1067    because it is used only by IP RR, TS and SRR options,
1068    so that it out of fast path.
1069
1070    BTW remember: "addr" is allowed to be not aligned
1071    in IP options!
1072  */
1073
1074 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1075 {
1076         __be32 src;
1077
1078         if (rt_is_output_route(rt))
1079                 src = ip_hdr(skb)->saddr;
1080         else {
1081                 struct fib_result res;
1082                 struct flowi4 fl4;
1083                 struct iphdr *iph;
1084
1085                 iph = ip_hdr(skb);
1086
1087                 memset(&fl4, 0, sizeof(fl4));
1088                 fl4.daddr = iph->daddr;
1089                 fl4.saddr = iph->saddr;
1090                 fl4.flowi4_tos = RT_TOS(iph->tos);
1091                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1092                 fl4.flowi4_iif = skb->dev->ifindex;
1093                 fl4.flowi4_mark = skb->mark;
1094
1095                 rcu_read_lock();
1096                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1097                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1098                 else
1099                         src = inet_select_addr(rt->dst.dev,
1100                                                rt_nexthop(rt, iph->daddr),
1101                                                RT_SCOPE_UNIVERSE);
1102                 rcu_read_unlock();
1103         }
1104         memcpy(addr, &src, 4);
1105 }
1106
1107 #ifdef CONFIG_IP_ROUTE_CLASSID
1108 static void set_class_tag(struct rtable *rt, u32 tag)
1109 {
1110         if (!(rt->dst.tclassid & 0xFFFF))
1111                 rt->dst.tclassid |= tag & 0xFFFF;
1112         if (!(rt->dst.tclassid & 0xFFFF0000))
1113                 rt->dst.tclassid |= tag & 0xFFFF0000;
1114 }
1115 #endif
1116
1117 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1118 {
1119         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1120
1121         if (advmss == 0) {
1122                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1123                                ip_rt_min_advmss);
1124                 if (advmss > 65535 - 40)
1125                         advmss = 65535 - 40;
1126         }
1127         return advmss;
1128 }
1129
1130 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1131 {
1132         const struct rtable *rt = (const struct rtable *) dst;
1133         unsigned int mtu = rt->rt_pmtu;
1134
1135         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1136                 mtu = 0;
1137
1138         if (!mtu)
1139                 mtu = dst_metric_raw(dst, RTAX_MTU);
1140
1141         if (mtu && rt_is_output_route(rt))
1142                 return mtu;
1143
1144         mtu = dst->dev->mtu;
1145
1146         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1147                 if (rt->rt_gateway && mtu > 576)
1148                         mtu = 576;
1149         }
1150
1151         if (mtu > IP_MAX_MTU)
1152                 mtu = IP_MAX_MTU;
1153
1154         return mtu;
1155 }
1156
1157 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1158 {
1159         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1160         struct fib_nh_exception *fnhe;
1161         u32 hval;
1162
1163         if (!hash)
1164                 return NULL;
1165
1166         hval = fnhe_hashfun(daddr);
1167
1168         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1169              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1170                 if (fnhe->fnhe_daddr == daddr)
1171                         return fnhe;
1172         }
1173         return NULL;
1174 }
1175
1176 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1177                               __be32 daddr)
1178 {
1179         bool ret = false;
1180
1181         spin_lock_bh(&fnhe_lock);
1182
1183         if (daddr == fnhe->fnhe_daddr) {
1184                 struct rtable *orig;
1185
1186                 if (fnhe->fnhe_pmtu) {
1187                         unsigned long expires = fnhe->fnhe_expires;
1188                         unsigned long diff = expires - jiffies;
1189
1190                         if (time_before(jiffies, expires)) {
1191                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1192                                 dst_set_expires(&rt->dst, diff);
1193                         }
1194                 }
1195                 if (fnhe->fnhe_gw) {
1196                         rt->rt_flags |= RTCF_REDIRECTED;
1197                         rt->rt_gateway = fnhe->fnhe_gw;
1198                 }
1199
1200                 orig = rcu_dereference(fnhe->fnhe_rth);
1201                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1202                 if (orig)
1203                         rt_free(orig);
1204
1205                 fnhe->fnhe_stamp = jiffies;
1206                 ret = true;
1207         } else {
1208                 /* Routes we intend to cache in nexthop exception have
1209                  * the DST_NOCACHE bit clear.  However, if we are
1210                  * unsuccessful at storing this route into the cache
1211                  * we really need to set it.
1212                  */
1213                 rt->dst.flags |= DST_NOCACHE;
1214         }
1215         spin_unlock_bh(&fnhe_lock);
1216
1217         return ret;
1218 }
1219
1220 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1221 {
1222         struct rtable *orig, *prev, **p;
1223         bool ret = true;
1224
1225         if (rt_is_input_route(rt)) {
1226                 p = (struct rtable **)&nh->nh_rth_input;
1227         } else {
1228                 if (!nh->nh_pcpu_rth_output)
1229                         goto nocache;
1230                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1231         }
1232         orig = *p;
1233
1234         prev = cmpxchg(p, orig, rt);
1235         if (prev == orig) {
1236                 if (orig)
1237                         rt_free(orig);
1238         } else {
1239                 /* Routes we intend to cache in the FIB nexthop have
1240                  * the DST_NOCACHE bit clear.  However, if we are
1241                  * unsuccessful at storing this route into the cache
1242                  * we really need to set it.
1243                  */
1244 nocache:
1245                 rt->dst.flags |= DST_NOCACHE;
1246                 ret = false;
1247         }
1248
1249         return ret;
1250 }
1251
1252 static DEFINE_SPINLOCK(rt_uncached_lock);
1253 static LIST_HEAD(rt_uncached_list);
1254
1255 static void rt_add_uncached_list(struct rtable *rt)
1256 {
1257         spin_lock_bh(&rt_uncached_lock);
1258         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1259         spin_unlock_bh(&rt_uncached_lock);
1260 }
1261
1262 static void ipv4_dst_destroy(struct dst_entry *dst)
1263 {
1264         struct rtable *rt = (struct rtable *) dst;
1265
1266         if (!list_empty(&rt->rt_uncached)) {
1267                 spin_lock_bh(&rt_uncached_lock);
1268                 list_del(&rt->rt_uncached);
1269                 spin_unlock_bh(&rt_uncached_lock);
1270         }
1271 }
1272
1273 void rt_flush_dev(struct net_device *dev)
1274 {
1275         if (!list_empty(&rt_uncached_list)) {
1276                 struct net *net = dev_net(dev);
1277                 struct rtable *rt;
1278
1279                 spin_lock_bh(&rt_uncached_lock);
1280                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1281                         if (rt->dst.dev != dev)
1282                                 continue;
1283                         rt->dst.dev = net->loopback_dev;
1284                         dev_hold(rt->dst.dev);
1285                         dev_put(dev);
1286                 }
1287                 spin_unlock_bh(&rt_uncached_lock);
1288         }
1289 }
1290
1291 static bool rt_cache_valid(const struct rtable *rt)
1292 {
1293         return  rt &&
1294                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1295                 !rt_is_expired(rt);
1296 }
1297
1298 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1299                            const struct fib_result *res,
1300                            struct fib_nh_exception *fnhe,
1301                            struct fib_info *fi, u16 type, u32 itag)
1302 {
1303         bool cached = false;
1304
1305         if (fi) {
1306                 struct fib_nh *nh = &FIB_RES_NH(*res);
1307
1308                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1309                         rt->rt_gateway = nh->nh_gw;
1310                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1311 #ifdef CONFIG_IP_ROUTE_CLASSID
1312                 rt->dst.tclassid = nh->nh_tclassid;
1313 #endif
1314                 if (unlikely(fnhe))
1315                         cached = rt_bind_exception(rt, fnhe, daddr);
1316                 else if (!(rt->dst.flags & DST_NOCACHE))
1317                         cached = rt_cache_route(nh, rt);
1318         }
1319         if (unlikely(!cached))
1320                 rt_add_uncached_list(rt);
1321
1322 #ifdef CONFIG_IP_ROUTE_CLASSID
1323 #ifdef CONFIG_IP_MULTIPLE_TABLES
1324         set_class_tag(rt, res->tclassid);
1325 #endif
1326         set_class_tag(rt, itag);
1327 #endif
1328 }
1329
1330 static struct rtable *rt_dst_alloc(struct net_device *dev,
1331                                    bool nopolicy, bool noxfrm, bool will_cache)
1332 {
1333         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1334                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1335                          (nopolicy ? DST_NOPOLICY : 0) |
1336                          (noxfrm ? DST_NOXFRM : 0));
1337 }
1338
1339 /* called in rcu_read_lock() section */
1340 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1341                                 u8 tos, struct net_device *dev, int our)
1342 {
1343         struct rtable *rth;
1344         struct in_device *in_dev = __in_dev_get_rcu(dev);
1345         u32 itag = 0;
1346         int err;
1347
1348         /* Primary sanity checks. */
1349
1350         if (in_dev == NULL)
1351                 return -EINVAL;
1352
1353         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1354             skb->protocol != htons(ETH_P_IP))
1355                 goto e_inval;
1356
1357         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1358                 if (ipv4_is_loopback(saddr))
1359                         goto e_inval;
1360
1361         if (ipv4_is_zeronet(saddr)) {
1362                 if (!ipv4_is_local_multicast(daddr))
1363                         goto e_inval;
1364         } else {
1365                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1366                                           in_dev, &itag);
1367                 if (err < 0)
1368                         goto e_err;
1369         }
1370         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1371                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1372         if (!rth)
1373                 goto e_nobufs;
1374
1375 #ifdef CONFIG_IP_ROUTE_CLASSID
1376         rth->dst.tclassid = itag;
1377 #endif
1378         rth->dst.output = ip_rt_bug;
1379
1380         rth->rt_genid   = rt_genid(dev_net(dev));
1381         rth->rt_flags   = RTCF_MULTICAST;
1382         rth->rt_type    = RTN_MULTICAST;
1383         rth->rt_is_input= 1;
1384         rth->rt_iif     = 0;
1385         rth->rt_pmtu    = 0;
1386         rth->rt_gateway = 0;
1387         INIT_LIST_HEAD(&rth->rt_uncached);
1388         if (our) {
1389                 rth->dst.input= ip_local_deliver;
1390                 rth->rt_flags |= RTCF_LOCAL;
1391         }
1392
1393 #ifdef CONFIG_IP_MROUTE
1394         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1395                 rth->dst.input = ip_mr_input;
1396 #endif
1397         RT_CACHE_STAT_INC(in_slow_mc);
1398
1399         skb_dst_set(skb, &rth->dst);
1400         return 0;
1401
1402 e_nobufs:
1403         return -ENOBUFS;
1404 e_inval:
1405         return -EINVAL;
1406 e_err:
1407         return err;
1408 }
1409
1410
1411 static void ip_handle_martian_source(struct net_device *dev,
1412                                      struct in_device *in_dev,
1413                                      struct sk_buff *skb,
1414                                      __be32 daddr,
1415                                      __be32 saddr)
1416 {
1417         RT_CACHE_STAT_INC(in_martian_src);
1418 #ifdef CONFIG_IP_ROUTE_VERBOSE
1419         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1420                 /*
1421                  *      RFC1812 recommendation, if source is martian,
1422                  *      the only hint is MAC header.
1423                  */
1424                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1425                         &daddr, &saddr, dev->name);
1426                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1427                         print_hex_dump(KERN_WARNING, "ll header: ",
1428                                        DUMP_PREFIX_OFFSET, 16, 1,
1429                                        skb_mac_header(skb),
1430                                        dev->hard_header_len, true);
1431                 }
1432         }
1433 #endif
1434 }
1435
1436 /* called in rcu_read_lock() section */
1437 static int __mkroute_input(struct sk_buff *skb,
1438                            const struct fib_result *res,
1439                            struct in_device *in_dev,
1440                            __be32 daddr, __be32 saddr, u32 tos)
1441 {
1442         struct rtable *rth;
1443         int err;
1444         struct in_device *out_dev;
1445         unsigned int flags = 0;
1446         bool do_cache;
1447         u32 itag;
1448
1449         /* get a working reference to the output device */
1450         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1451         if (out_dev == NULL) {
1452                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1453                 return -EINVAL;
1454         }
1455
1456
1457         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1458                                   in_dev->dev, in_dev, &itag);
1459         if (err < 0) {
1460                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1461                                          saddr);
1462
1463                 goto cleanup;
1464         }
1465
1466         if (out_dev == in_dev && err &&
1467             (IN_DEV_SHARED_MEDIA(out_dev) ||
1468              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1469                 flags |= RTCF_DOREDIRECT;
1470
1471         if (skb->protocol != htons(ETH_P_IP)) {
1472                 /* Not IP (i.e. ARP). Do not create route, if it is
1473                  * invalid for proxy arp. DNAT routes are always valid.
1474                  *
1475                  * Proxy arp feature have been extended to allow, ARP
1476                  * replies back to the same interface, to support
1477                  * Private VLAN switch technologies. See arp.c.
1478                  */
1479                 if (out_dev == in_dev &&
1480                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1481                         err = -EINVAL;
1482                         goto cleanup;
1483                 }
1484         }
1485
1486         do_cache = false;
1487         if (res->fi) {
1488                 if (!itag) {
1489                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1490                         if (rt_cache_valid(rth)) {
1491                                 skb_dst_set_noref(skb, &rth->dst);
1492                                 goto out;
1493                         }
1494                         do_cache = true;
1495                 }
1496         }
1497
1498         rth = rt_dst_alloc(out_dev->dev,
1499                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1500                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1501         if (!rth) {
1502                 err = -ENOBUFS;
1503                 goto cleanup;
1504         }
1505
1506         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1507         rth->rt_flags = flags;
1508         rth->rt_type = res->type;
1509         rth->rt_is_input = 1;
1510         rth->rt_iif     = 0;
1511         rth->rt_pmtu    = 0;
1512         rth->rt_gateway = 0;
1513         INIT_LIST_HEAD(&rth->rt_uncached);
1514
1515         rth->dst.input = ip_forward;
1516         rth->dst.output = ip_output;
1517
1518         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1519         skb_dst_set(skb, &rth->dst);
1520 out:
1521         err = 0;
1522  cleanup:
1523         return err;
1524 }
1525
1526 static int ip_mkroute_input(struct sk_buff *skb,
1527                             struct fib_result *res,
1528                             const struct flowi4 *fl4,
1529                             struct in_device *in_dev,
1530                             __be32 daddr, __be32 saddr, u32 tos)
1531 {
1532 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1533         if (res->fi && res->fi->fib_nhs > 1)
1534                 fib_select_multipath(res);
1535 #endif
1536
1537         /* create a routing cache entry */
1538         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1539 }
1540
1541 /*
1542  *      NOTE. We drop all the packets that has local source
1543  *      addresses, because every properly looped back packet
1544  *      must have correct destination already attached by output routine.
1545  *
1546  *      Such approach solves two big problems:
1547  *      1. Not simplex devices are handled properly.
1548  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1549  *      called with rcu_read_lock()
1550  */
1551
/*
 * Resolve the input route for a received packet; ip_route_input_noref()
 * calls this for every destination that is not multicast.
 *
 * Rejects martian source/destination addresses, performs a fib_lookup()
 * keyed on the incoming device, mark, tos and both addresses, and then
 * either builds a forwarding route through ip_mkroute_input() or attaches
 * a locally built dst (local delivery, broadcast, or an ip_error route
 * when no route exists).
 *
 * Returns 0 on success, otherwise a negative errno: -EINVAL, -ENOBUFS, or
 * the error handed back by fib_validate_source()/ip_mkroute_input().
 */
1552 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1553                                u8 tos, struct net_device *dev)
1554 {
1555         struct fib_result res;
1556         struct in_device *in_dev = __in_dev_get_rcu(dev);
1557         struct flowi4   fl4;
1558         unsigned int    flags = 0;
1559         u32             itag = 0;
1560         struct rtable   *rth;
1561         int             err = -EINVAL;
1562         struct net    *net = dev_net(dev);
1563         bool do_cache;
1564
1565         /* IP on this device is disabled. */
1566
1567         if (!in_dev)
1568                 goto out;
1569
1570         /* Check for the most weird martians, which can be not detected
1571            by fib_lookup.
1572          */
1573
1574         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1575                 goto martian_source;
1576
             /* brd_input falls into local_input, which tests res.fi; clear it
              * before the jump below that skips fib_lookup().
              */
1577         res.fi = NULL;
1578         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1579                 goto brd_input;
1580
1581         /* Accept zero addresses only to limited broadcast;
1582          * I even do not know to fix it or not. Waiting for complains :-)
1583          */
1584         if (ipv4_is_zeronet(saddr))
1585                 goto martian_source;
1586
1587         if (ipv4_is_zeronet(daddr))
1588                 goto martian_destination;
1589
1590         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1591                 if (ipv4_is_loopback(daddr))
1592                         goto martian_destination;
1593
1594                 if (ipv4_is_loopback(saddr))
1595                         goto martian_source;
1596         }
1597
1598         /*
1599          *      Now we are ready to route packet.
1600          */
1601         fl4.flowi4_oif = 0;
1602         fl4.flowi4_iif = dev->ifindex;
1603         fl4.flowi4_mark = skb->mark;
1604         fl4.flowi4_tos = tos;
1605         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1606         fl4.daddr = daddr;
1607         fl4.saddr = saddr;
1608         err = fib_lookup(net, &fl4, &res);
1609         if (err != 0)
1610                 goto no_route;
1611
1612         RT_CACHE_STAT_INC(in_slow_tot);
1613
1614         if (res.type == RTN_BROADCAST)
1615                 goto brd_input;
1616
1617         if (res.type == RTN_LOCAL) {
1618                 err = fib_validate_source(skb, saddr, daddr, tos,
1619                                           net->loopback_dev->ifindex,
1620                                           dev, in_dev, &itag);
1621                 if (err < 0)
1622                         goto martian_source_keep_err;
1623                 goto local_input;
1624         }
1625
             /* Everything below is the forwarding case. */
1626         if (!IN_DEV_FORWARD(in_dev))
1627                 goto no_route;
1628         if (res.type != RTN_UNICAST)
1629                 goto martian_destination;
1630
1631         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
     /* Single exit point: every path below jumps back here with err set. */
1632 out:    return err;
1633
     /* Broadcast destination; only IP packets are accepted. */
1634 brd_input:
1635         if (skb->protocol != htons(ETH_P_IP))
1636                 goto e_inval;
1637
1638         if (!ipv4_is_zeronet(saddr)) {
1639                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1640                                           in_dev, &itag);
1641                 if (err < 0)
1642                         goto martian_source_keep_err;
1643         }
1644         flags |= RTCF_BROADCAST;
1645         res.type = RTN_BROADCAST;
1646         RT_CACHE_STAT_INC(in_brd);
1647
     /* Reached by fall-through from brd_input and by jumps from the
      * RTN_LOCAL and no_route paths.
      */
1648 local_input:
1649         do_cache = false;
1650         if (res.fi) {
                     /* Only routes without a class tag use the nexthop's
                      * cached input route.
                      */
1651                 if (!itag) {
1652                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1653                         if (rt_cache_valid(rth)) {
1654                                 skb_dst_set_noref(skb, &rth->dst);
1655                                 err = 0;
1656                                 goto out;
1657                         }
1658                         do_cache = true;
1659                 }
1660         }
1661
1662         rth = rt_dst_alloc(net->loopback_dev,
1663                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1664         if (!rth)
1665                 goto e_nobufs;
1666
1667         rth->dst.input= ip_local_deliver;
1668         rth->dst.output= ip_rt_bug;
1669 #ifdef CONFIG_IP_ROUTE_CLASSID
1670         rth->dst.tclassid = itag;
1671 #endif
1672
1673         rth->rt_genid = rt_genid(net);
1674         rth->rt_flags   = flags|RTCF_LOCAL;
1675         rth->rt_type    = res.type;
1676         rth->rt_is_input = 1;
1677         rth->rt_iif     = 0;
1678         rth->rt_pmtu    = 0;
1679         rth->rt_gateway = 0;
1680         INIT_LIST_HEAD(&rth->rt_uncached);
             /* no_route arrives here with res.type == RTN_UNREACHABLE: turn
              * the entry into an error route handled by ip_error(), storing
              * the positive errno taken from err.
              */
1681         if (res.type == RTN_UNREACHABLE) {
1682                 rth->dst.input= ip_error;
1683                 rth->dst.error= -err;
1684                 rth->rt_flags   &= ~RTCF_LOCAL;
1685         }
1686         if (do_cache)
1687                 rt_cache_route(&FIB_RES_NH(res), rth);
1688         skb_dst_set(skb, &rth->dst);
1689         err = 0;
1690         goto out;
1691
     /* Lookup failed or forwarding is disabled: build an unreachable entry
      * through local_input, mapping -ESRCH to -ENETUNREACH first.
      */
1692 no_route:
1693         RT_CACHE_STAT_INC(in_no_route);
1694         res.type = RTN_UNREACHABLE;
1695         if (err == -ESRCH)
1696                 err = -ENETUNREACH;
1697         goto local_input;
1698
1699         /*
1700          *      Do not cache martian addresses: they should be logged (RFC1812)
1701          */
1702 martian_destination:
1703         RT_CACHE_STAT_INC(in_martian_dst);
1704 #ifdef CONFIG_IP_ROUTE_VERBOSE
1705         if (IN_DEV_LOG_MARTIANS(in_dev))
1706                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1707                                      &daddr, &saddr, dev->name);
1708 #endif
1709
     /* martian_destination falls through to here. */
1710 e_inval:
1711         err = -EINVAL;
1712         goto out;
1713
1714 e_nobufs:
1715         err = -ENOBUFS;
1716         goto out;
1717
1718 martian_source:
1719         err = -EINVAL;
     /* Entered directly when err already holds the fib_validate_source()
      * error that should be returned.
      */
1720 martian_source_keep_err:
1721         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1722         goto out;
1723 }
1724
1725 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1726                          u8 tos, struct net_device *dev)
1727 {
1728         int res;
1729
1730         rcu_read_lock();
1731
1732         /* Multicast recognition logic is moved from route cache to here.
1733            The problem was that too many Ethernet cards have broken/missing
1734            hardware multicast filters :-( As result the host on multicasting
1735            network acquires a lot of useless route cache entries, sort of
1736            SDR messages from all the world. Now we try to get rid of them.
1737            Really, provided software IP multicast filter is organized
1738            reasonably (at least, hashed), it does not result in a slowdown
1739            comparing with route cache reject entries.
1740            Note, that multicast routers are not affected, because
1741            route cache entry is created eventually.
1742          */
1743         if (ipv4_is_multicast(daddr)) {
1744                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1745
1746                 if (in_dev) {
1747                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1748                                                   ip_hdr(skb)->protocol);
1749                         if (our
1750 #ifdef CONFIG_IP_MROUTE
1751                                 ||
1752                             (!ipv4_is_local_multicast(daddr) &&
1753                              IN_DEV_MFORWARD(in_dev))
1754 #endif
1755                            ) {
1756                                 int res = ip_route_input_mc(skb, daddr, saddr,
1757                                                             tos, dev, our);
1758                                 rcu_read_unlock();
1759                                 return res;
1760                         }
1761                 }
1762                 rcu_read_unlock();
1763                 return -EINVAL;
1764         }
1765         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1766         rcu_read_unlock();
1767         return res;
1768 }
1769 EXPORT_SYMBOL(ip_route_input_noref);
1770
1771 /* called with rcu_read_lock() */
1772 static struct rtable *__mkroute_output(const struct fib_result *res,
1773                                        const struct flowi4 *fl4, int orig_oif,
1774                                        struct net_device *dev_out,
1775                                        unsigned int flags)
1776 {
1777         struct fib_info *fi = res->fi;
1778         struct fib_nh_exception *fnhe;
1779         struct in_device *in_dev;
1780         u16 type = res->type;
1781         struct rtable *rth;
1782
1783         in_dev = __in_dev_get_rcu(dev_out);
1784         if (!in_dev)
1785                 return ERR_PTR(-EINVAL);
1786
1787         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1788                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1789                         return ERR_PTR(-EINVAL);
1790
1791         if (ipv4_is_lbcast(fl4->daddr))
1792                 type = RTN_BROADCAST;
1793         else if (ipv4_is_multicast(fl4->daddr))
1794                 type = RTN_MULTICAST;
1795         else if (ipv4_is_zeronet(fl4->daddr))
1796                 return ERR_PTR(-EINVAL);
1797
1798         if (dev_out->flags & IFF_LOOPBACK)
1799                 flags |= RTCF_LOCAL;
1800
1801         if (type == RTN_BROADCAST) {
1802                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1803                 fi = NULL;
1804         } else if (type == RTN_MULTICAST) {
1805                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1806                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1807                                      fl4->flowi4_proto))
1808                         flags &= ~RTCF_LOCAL;
1809                 /* If multicast route do not exist use
1810                  * default one, but do not gateway in this case.
1811                  * Yes, it is hack.
1812                  */
1813                 if (fi && res->prefixlen < 4)
1814                         fi = NULL;
1815         }
1816
1817         fnhe = NULL;
1818         if (fi) {
1819                 struct rtable __rcu **prth;
1820
1821                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1822                 if (fnhe)
1823                         prth = &fnhe->fnhe_rth;
1824                 else
1825                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1826                 rth = rcu_dereference(*prth);
1827                 if (rt_cache_valid(rth)) {
1828                         dst_hold(&rth->dst);
1829                         return rth;
1830                 }
1831         }
1832         rth = rt_dst_alloc(dev_out,
1833                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1834                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1835                            fi);
1836         if (!rth)
1837                 return ERR_PTR(-ENOBUFS);
1838
1839         rth->dst.output = ip_output;
1840
1841         rth->rt_genid = rt_genid(dev_net(dev_out));
1842         rth->rt_flags   = flags;
1843         rth->rt_type    = type;
1844         rth->rt_is_input = 0;
1845         rth->rt_iif     = orig_oif ? : 0;
1846         rth->rt_pmtu    = 0;
1847         rth->rt_gateway = 0;
1848         INIT_LIST_HEAD(&rth->rt_uncached);
1849
1850         RT_CACHE_STAT_INC(out_slow_tot);
1851
1852         if (flags & RTCF_LOCAL)
1853                 rth->dst.input = ip_local_deliver;
1854         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1855                 if (flags & RTCF_LOCAL &&
1856                     !(dev_out->flags & IFF_LOOPBACK)) {
1857                         rth->dst.output = ip_mc_output;
1858                         RT_CACHE_STAT_INC(out_slow_mc);
1859                 }
1860 #ifdef CONFIG_IP_MROUTE
1861                 if (type == RTN_MULTICAST) {
1862                         if (IN_DEV_MFORWARD(in_dev) &&
1863                             !ipv4_is_local_multicast(fl4->daddr)) {
1864                                 rth->dst.input = ip_mr_input;
1865                                 rth->dst.output = ip_mc_output;
1866                         }
1867                 }
1868 #endif
1869         }
1870
1871         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1872
1873         return rth;
1874 }
1875
1876 /*
1877  * Major route resolver routine.
1878  */
1879
1880 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1881 {
1882         struct net_device *dev_out = NULL;
1883         __u8 tos = RT_FL_TOS(fl4);
1884         unsigned int flags = 0;
1885         struct fib_result res;
1886         struct rtable *rth;
1887         int orig_oif;
1888
1889         res.tclassid    = 0;
1890         res.fi          = NULL;
1891         res.table       = NULL;
1892
1893         orig_oif = fl4->flowi4_oif;
1894
1895         fl4->flowi4_iif = net->loopback_dev->ifindex;
1896         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1897         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1898                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1899
1900         rcu_read_lock();
1901         if (fl4->saddr) {
1902                 rth = ERR_PTR(-EINVAL);
1903                 if (ipv4_is_multicast(fl4->saddr) ||
1904                     ipv4_is_lbcast(fl4->saddr) ||
1905                     ipv4_is_zeronet(fl4->saddr))
1906                         goto out;
1907
1908                 /* I removed check for oif == dev_out->oif here.
1909                    It was wrong for two reasons:
1910                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1911                       is assigned to multiple interfaces.
1912                    2. Moreover, we are allowed to send packets with saddr
1913                       of another iface. --ANK
1914                  */
1915
1916                 if (fl4->flowi4_oif == 0 &&
1917                     (ipv4_is_multicast(fl4->daddr) ||
1918                      ipv4_is_lbcast(fl4->daddr))) {
1919                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1920                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1921                         if (dev_out == NULL)
1922                                 goto out;
1923
1924                         /* Special hack: user can direct multicasts
1925                            and limited broadcast via necessary interface
1926                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1927                            This hack is not just for fun, it allows
1928                            vic,vat and friends to work.
1929                            They bind socket to loopback, set ttl to zero
1930                            and expect that it will work.
1931                            From the viewpoint of routing cache they are broken,
1932                            because we are not allowed to build multicast path
1933                            with loopback source addr (look, routing cache
1934                            cannot know, that ttl is zero, so that packet
1935                            will not leave this host and route is valid).
1936                            Luckily, this hack is good workaround.
1937                          */
1938
1939                         fl4->flowi4_oif = dev_out->ifindex;
1940                         goto make_route;
1941                 }
1942
1943                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1944                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1945                         if (!__ip_dev_find(net, fl4->saddr, false))
1946                                 goto out;
1947                 }
1948         }
1949
1950
1951         if (fl4->flowi4_oif) {
1952                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1953                 rth = ERR_PTR(-ENODEV);
1954                 if (dev_out == NULL)
1955                         goto out;
1956
1957                 /* RACE: Check return value of inet_select_addr instead. */
1958                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1959                         rth = ERR_PTR(-ENETUNREACH);
1960                         goto out;
1961                 }
1962                 if (ipv4_is_local_multicast(fl4->daddr) ||
1963                     ipv4_is_lbcast(fl4->daddr)) {
1964                         if (!fl4->saddr)
1965                                 fl4->saddr = inet_select_addr(dev_out, 0,
1966                                                               RT_SCOPE_LINK);
1967                         goto make_route;
1968                 }
1969                 if (fl4->saddr) {
1970                         if (ipv4_is_multicast(fl4->daddr))
1971                                 fl4->saddr = inet_select_addr(dev_out, 0,
1972                                                               fl4->flowi4_scope);
1973                         else if (!fl4->daddr)
1974                                 fl4->saddr = inet_select_addr(dev_out, 0,
1975                                                               RT_SCOPE_HOST);
1976                 }
1977         }
1978
1979         if (!fl4->daddr) {
1980                 fl4->daddr = fl4->saddr;
1981                 if (!fl4->daddr)
1982                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1983                 dev_out = net->loopback_dev;
1984                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1985                 res.type = RTN_LOCAL;
1986                 flags |= RTCF_LOCAL;
1987                 goto make_route;
1988         }
1989
1990         if (fib_lookup(net, fl4, &res)) {
1991                 res.fi = NULL;
1992                 res.table = NULL;
1993                 if (fl4->flowi4_oif) {
1994                         /* Apparently, routing tables are wrong. Assume,
1995                            that the destination is on link.
1996
1997                            WHY? DW.
1998                            Because we are allowed to send to iface
1999                            even if it has NO routes and NO assigned
2000                            addresses. When oif is specified, routing
2001                            tables are looked up with only one purpose:
2002                            to catch if destination is gatewayed, rather than
2003                            direct. Moreover, if MSG_DONTROUTE is set,
2004                            we send packet, ignoring both routing tables
2005                            and ifaddr state. --ANK
2006
2007
2008                            We could make it even if oif is unknown,
2009                            likely IPv6, but we do not.
2010                          */
2011
2012                         if (fl4->saddr == 0)
2013                                 fl4->saddr = inet_select_addr(dev_out, 0,
2014                                                               RT_SCOPE_LINK);
2015                         res.type = RTN_UNICAST;
2016                         goto make_route;
2017                 }
2018                 rth = ERR_PTR(-ENETUNREACH);
2019                 goto out;
2020         }
2021
2022         if (res.type == RTN_LOCAL) {
2023                 if (!fl4->saddr) {
2024                         if (res.fi->fib_prefsrc)
2025                                 fl4->saddr = res.fi->fib_prefsrc;
2026                         else
2027                                 fl4->saddr = fl4->daddr;
2028                 }
2029                 dev_out = net->loopback_dev;
2030                 fl4->flowi4_oif = dev_out->ifindex;
2031                 flags |= RTCF_LOCAL;
2032                 goto make_route;
2033         }
2034
2035 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2036         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2037                 fib_select_multipath(&res);
2038         else
2039 #endif
2040         if (!res.prefixlen &&
2041             res.table->tb_num_default > 1 &&
2042             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2043                 fib_select_default(&res);
2044
2045         if (!fl4->saddr)
2046                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2047
2048         dev_out = FIB_RES_DEV(res);
2049         fl4->flowi4_oif = dev_out->ifindex;
2050
2051
2052 make_route:
2053         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2054
2055 out:
2056         rcu_read_unlock();
2057         return rth;
2058 }
2059 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2060
2061 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2062 {
2063         return NULL;
2064 }
2065
2066 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2067 {
2068         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2069
2070         return mtu ? : dst->dev->mtu;
2071 }
2072
2073 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2074                                           struct sk_buff *skb, u32 mtu)
2075 {
2076 }
2077
/* .redirect hook of the blackhole dst_ops: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2082
2083 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2084                                           unsigned long old)
2085 {
2086         return NULL;
2087 }
2088
2089 static struct dst_ops ipv4_dst_blackhole_ops = {
2090         .family                 =       AF_INET,
2091         .protocol               =       cpu_to_be16(ETH_P_IP),
2092         .check                  =       ipv4_blackhole_dst_check,
2093         .mtu                    =       ipv4_blackhole_mtu,
2094         .default_advmss         =       ipv4_default_advmss,
2095         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2096         .redirect               =       ipv4_rt_blackhole_redirect,
2097         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2098         .neigh_lookup           =       ipv4_neigh_lookup,
2099 };
2100
2101 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2102 {
2103         struct rtable *ort = (struct rtable *) dst_orig;
2104         struct rtable *rt;
2105
2106         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2107         if (rt) {
2108                 struct dst_entry *new = &rt->dst;
2109
2110                 new->__use = 1;
2111                 new->input = dst_discard;
2112                 new->output = dst_discard;
2113
2114                 new->dev = ort->dst.dev;
2115                 if (new->dev)
2116                         dev_hold(new->dev);
2117
2118                 rt->rt_is_input = ort->rt_is_input;
2119                 rt->rt_iif = ort->rt_iif;
2120                 rt->rt_pmtu = ort->rt_pmtu;
2121
2122                 rt->rt_genid = rt_genid(net);
2123                 rt->rt_flags = ort->rt_flags;
2124                 rt->rt_type = ort->rt_type;
2125                 rt->rt_gateway = ort->rt_gateway;
2126
2127                 INIT_LIST_HEAD(&rt->rt_uncached);
2128
2129                 dst_free(new);
2130         }
2131
2132         dst_release(dst_orig);
2133
2134         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2135 }
2136
2137 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2138                                     struct sock *sk)
2139 {
2140         struct rtable *rt = __ip_route_output_key(net, flp4);
2141
2142         if (IS_ERR(rt))
2143                 return rt;
2144
2145         if (flp4->flowi4_proto)
2146                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2147                                                    flowi4_to_flowi(flp4),
2148                                                    sk, 0);
2149
2150         return rt;
2151 }
2152 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2153
2154 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2155                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2156                         u32 seq, int event, int nowait, unsigned int flags)
2157 {
2158         struct rtable *rt = skb_rtable(skb);
2159         struct rtmsg *r;
2160         struct nlmsghdr *nlh;
2161         unsigned long expires = 0;
2162         u32 error;
2163         u32 metrics[RTAX_MAX];
2164
2165         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2166         if (nlh == NULL)
2167                 return -EMSGSIZE;
2168
2169         r = nlmsg_data(nlh);
2170         r->rtm_family    = AF_INET;
2171         r->rtm_dst_len  = 32;
2172         r->rtm_src_len  = 0;
2173         r->rtm_tos      = fl4->flowi4_tos;
2174         r->rtm_table    = RT_TABLE_MAIN;
2175         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2176                 goto nla_put_failure;
2177         r->rtm_type     = rt->rt_type;
2178         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2179         r->rtm_protocol = RTPROT_UNSPEC;
2180         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2181         if (rt->rt_flags & RTCF_NOTIFY)
2182                 r->rtm_flags |= RTM_F_NOTIFY;
2183
2184         if (nla_put_be32(skb, RTA_DST, dst))
2185                 goto nla_put_failure;
2186         if (src) {
2187                 r->rtm_src_len = 32;
2188                 if (nla_put_be32(skb, RTA_SRC, src))
2189                         goto nla_put_failure;
2190         }
2191         if (rt->dst.dev &&
2192             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2193                 goto nla_put_failure;
2194 #ifdef CONFIG_IP_ROUTE_CLASSID
2195         if (rt->dst.tclassid &&
2196             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2197                 goto nla_put_failure;
2198 #endif
2199         if (!rt_is_input_route(rt) &&
2200             fl4->saddr != src) {
2201                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2202                         goto nla_put_failure;
2203         }
2204         if (rt->rt_gateway &&
2205             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2206                 goto nla_put_failure;
2207
2208         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2209         if (rt->rt_pmtu)
2210                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2211         if (rtnetlink_put_metrics(skb, metrics) < 0)
2212                 goto nla_put_failure;
2213
2214         if (fl4->flowi4_mark &&
2215             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2216                 goto nla_put_failure;
2217
2218         error = rt->dst.error;
2219         expires = rt->dst.expires;
2220         if (expires) {
2221                 if (time_before(jiffies, expires))
2222                         expires -= jiffies;
2223                 else
2224                         expires = 0;
2225         }
2226
2227         if (rt_is_input_route(rt)) {
2228                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2229                         goto nla_put_failure;
2230         }
2231
2232         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2233                 goto nla_put_failure;
2234
2235         return nlmsg_end(skb, nlh);
2236
2237 nla_put_failure:
2238         nlmsg_cancel(skb, nlh);
2239         return -EMSGSIZE;
2240 }
2241
2242 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2243 {
2244         struct net *net = sock_net(in_skb->sk);
2245         struct rtmsg *rtm;
2246         struct nlattr *tb[RTA_MAX+1];
2247         struct rtable *rt = NULL;
2248         struct flowi4 fl4;
2249         __be32 dst = 0;
2250         __be32 src = 0;
2251         u32 iif;
2252         int err;
2253         int mark;
2254         struct sk_buff *skb;
2255
2256         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2257         if (err < 0)
2258                 goto errout;
2259
2260         rtm = nlmsg_data(nlh);
2261
2262         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2263         if (skb == NULL) {
2264                 err = -ENOBUFS;
2265                 goto errout;
2266         }
2267
2268         /* Reserve room for dummy headers, this skb can pass
2269            through good chunk of routing engine.
2270          */
2271         skb_reset_mac_header(skb);
2272         skb_reset_network_header(skb);
2273
2274         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2275         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2276         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2277
2278         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2279         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2280         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2281         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2282
2283         memset(&fl4, 0, sizeof(fl4));
2284         fl4.daddr = dst;
2285         fl4.saddr = src;
2286         fl4.flowi4_tos = rtm->rtm_tos;
2287         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2288         fl4.flowi4_mark = mark;
2289
2290         if (iif) {
2291                 struct net_device *dev;
2292
2293                 dev = __dev_get_by_index(net, iif);
2294                 if (dev == NULL) {
2295                         err = -ENODEV;
2296                         goto errout_free;
2297                 }
2298
2299                 skb->protocol   = htons(ETH_P_IP);
2300                 skb->dev        = dev;
2301                 skb->mark       = mark;
2302                 local_bh_disable();
2303                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2304                 local_bh_enable();
2305
2306                 rt = skb_rtable(skb);
2307                 if (err == 0 && rt->dst.error)
2308                         err = -rt->dst.error;
2309         } else {
2310                 rt = ip_route_output_key(net, &fl4);
2311
2312                 err = 0;
2313                 if (IS_ERR(rt))
2314                         err = PTR_ERR(rt);
2315         }
2316
2317         if (err)
2318                 goto errout_free;
2319
2320         skb_dst_set(skb, &rt->dst);
2321         if (rtm->rtm_flags & RTM_F_NOTIFY)
2322                 rt->rt_flags |= RTCF_NOTIFY;
2323
2324         err = rt_fill_info(net, dst, src, &fl4, skb,
2325                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2326                            RTM_NEWROUTE, 0, 0);
2327         if (err <= 0)
2328                 goto errout_free;
2329
2330         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2331 errout:
2332         return err;
2333
2334 errout_free:
2335         kfree_skb(skb);
2336         goto errout;
2337 }
2338
2339 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2340 {
2341         return skb->len;
2342 }
2343
2344 void ip_rt_multicast_event(struct in_device *in_dev)
2345 {
2346         rt_cache_flush(dev_net(in_dev->dev), 0);
2347 }
2348
2349 #ifdef CONFIG_SYSCTL
2350 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2351                                         void __user *buffer,
2352                                         size_t *lenp, loff_t *ppos)
2353 {
2354         if (write) {
2355                 int flush_delay;
2356                 ctl_table ctl;
2357                 struct net *net;
2358
2359                 memcpy(&ctl, __ctl, sizeof(ctl));
2360                 ctl.data = &flush_delay;
2361                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2362
2363                 net = (struct net *)__ctl->extra1;
2364                 rt_cache_flush(net, flush_delay);
2365                 return 0;
2366         }
2367
2368         return -EINVAL;
2369 }
2370
2371 static ctl_table ipv4_route_table[] = {
2372         {
2373                 .procname       = "gc_thresh",
2374                 .data           = &ipv4_dst_ops.gc_thresh,
2375                 .maxlen         = sizeof(int),
2376                 .mode           = 0644,
2377                 .proc_handler   = proc_dointvec,
2378         },
2379         {
2380                 .procname       = "max_size",
2381                 .data           = &ip_rt_max_size,
2382                 .maxlen         = sizeof(int),
2383                 .mode           = 0644,
2384                 .proc_handler   = proc_dointvec,
2385         },
2386         {
2387                 /*  Deprecated. Use gc_min_interval_ms */
2388
2389                 .procname       = "gc_min_interval",
2390                 .data           = &ip_rt_gc_min_interval,
2391                 .maxlen         = sizeof(int),
2392                 .mode           = 0644,
2393                 .proc_handler   = proc_dointvec_jiffies,
2394         },
2395         {
2396                 .procname       = "gc_min_interval_ms",
2397                 .data           = &ip_rt_gc_min_interval,
2398                 .maxlen         = sizeof(int),
2399                 .mode           = 0644,
2400                 .proc_handler   = proc_dointvec_ms_jiffies,
2401         },
2402         {
2403                 .procname       = "gc_timeout",
2404                 .data           = &ip_rt_gc_timeout,
2405                 .maxlen         = sizeof(int),
2406                 .mode           = 0644,
2407                 .proc_handler   = proc_dointvec_jiffies,
2408         },
2409         {
2410                 .procname       = "gc_interval",
2411                 .data           = &ip_rt_gc_interval,
2412                 .maxlen         = sizeof(int),
2413                 .mode           = 0644,
2414                 .proc_handler   = proc_dointvec_jiffies,
2415         },
2416         {
2417                 .procname       = "redirect_load",
2418                 .data           = &ip_rt_redirect_load,
2419                 .maxlen         = sizeof(int),
2420                 .mode           = 0644,
2421                 .proc_handler   = proc_dointvec,
2422         },
2423         {
2424                 .procname       = "redirect_number",
2425                 .data           = &ip_rt_redirect_number,
2426                 .maxlen         = sizeof(int),
2427                 .mode           = 0644,
2428                 .proc_handler   = proc_dointvec,
2429         },
2430         {
2431                 .procname       = "redirect_silence",
2432                 .data           = &ip_rt_redirect_silence,
2433                 .maxlen         = sizeof(int),
2434                 .mode           = 0644,
2435                 .proc_handler   = proc_dointvec,
2436         },
2437         {
2438                 .procname       = "error_cost",
2439                 .data           = &ip_rt_error_cost,
2440                 .maxlen         = sizeof(int),
2441                 .mode           = 0644,
2442                 .proc_handler   = proc_dointvec,
2443         },
2444         {
2445                 .procname       = "error_burst",
2446                 .data           = &ip_rt_error_burst,
2447                 .maxlen         = sizeof(int),
2448                 .mode           = 0644,
2449                 .proc_handler   = proc_dointvec,
2450         },
2451         {
2452                 .procname       = "gc_elasticity",
2453                 .data           = &ip_rt_gc_elasticity,
2454                 .maxlen         = sizeof(int),
2455                 .mode           = 0644,
2456                 .proc_handler   = proc_dointvec,
2457         },
2458         {
2459                 .procname       = "mtu_expires",
2460                 .data           = &ip_rt_mtu_expires,
2461                 .maxlen         = sizeof(int),
2462                 .mode           = 0644,
2463                 .proc_handler   = proc_dointvec_jiffies,
2464         },
2465         {
2466                 .procname       = "min_pmtu",
2467                 .data           = &ip_rt_min_pmtu,
2468                 .maxlen         = sizeof(int),
2469                 .mode           = 0644,
2470                 .proc_handler   = proc_dointvec,
2471         },
2472         {
2473                 .procname       = "min_adv_mss",
2474                 .data           = &ip_rt_min_advmss,
2475                 .maxlen         = sizeof(int),
2476                 .mode           = 0644,
2477                 .proc_handler   = proc_dointvec,
2478         },
2479         { }
2480 };
2481
2482 static struct ctl_table ipv4_route_flush_table[] = {
2483         {
2484                 .procname       = "flush",
2485                 .maxlen         = sizeof(int),
2486                 .mode           = 0200,
2487                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2488         },
2489         { },
2490 };
2491
2492 static __net_init int sysctl_route_net_init(struct net *net)
2493 {
2494         struct ctl_table *tbl;
2495
2496         tbl = ipv4_route_flush_table;
2497         if (!net_eq(net, &init_net)) {
2498                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2499                 if (tbl == NULL)
2500                         goto err_dup;
2501         }
2502         tbl[0].extra1 = net;
2503
2504         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2505         if (net->ipv4.route_hdr == NULL)
2506                 goto err_reg;
2507         return 0;
2508
2509 err_reg:
2510         if (tbl != ipv4_route_flush_table)
2511                 kfree(tbl);
2512 err_dup:
2513         return -ENOMEM;
2514 }
2515
2516 static __net_exit void sysctl_route_net_exit(struct net *net)
2517 {
2518         struct ctl_table *tbl;
2519
2520         tbl = net->ipv4.route_hdr->ctl_table_arg;
2521         unregister_net_sysctl_table(net->ipv4.route_hdr);
2522         BUG_ON(tbl == ipv4_route_flush_table);
2523         kfree(tbl);
2524 }
2525
2526 static __net_initdata struct pernet_operations sysctl_route_ops = {
2527         .init = sysctl_route_net_init,
2528         .exit = sysctl_route_net_exit,
2529 };
2530 #endif
2531
2532 static __net_init int rt_genid_init(struct net *net)
2533 {
2534         get_random_bytes(&net->ipv4.rt_genid,
2535                          sizeof(net->ipv4.rt_genid));
2536         get_random_bytes(&net->ipv4.dev_addr_genid,
2537                          sizeof(net->ipv4.dev_addr_genid));
2538         return 0;
2539 }
2540
2541 static __net_initdata struct pernet_operations rt_genid_ops = {
2542         .init = rt_genid_init,
2543 };
2544
2545 static int __net_init ipv4_inetpeer_init(struct net *net)
2546 {
2547         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2548
2549         if (!bp)
2550                 return -ENOMEM;
2551         inet_peer_base_init(bp);
2552         net->ipv4.peers = bp;
2553         return 0;
2554 }
2555
2556 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2557 {
2558         struct inet_peer_base *bp = net->ipv4.peers;
2559
2560         net->ipv4.peers = NULL;
2561         inetpeer_invalidate_tree(bp);
2562         kfree(bp);
2563 }
2564
2565 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2566         .init   =       ipv4_inetpeer_init,
2567         .exit   =       ipv4_inetpeer_exit,
2568 };
2569
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area; ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2573
2574 int __init ip_rt_init(void)
2575 {
2576         int rc = 0;
2577
2578 #ifdef CONFIG_IP_ROUTE_CLASSID
2579         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2580         if (!ip_rt_acct)
2581                 panic("IP: failed to allocate ip_rt_acct\n");
2582 #endif
2583
2584         ipv4_dst_ops.kmem_cachep =
2585                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2586                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2587
2588         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2589
2590         if (dst_entries_init(&ipv4_dst_ops) < 0)
2591                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2592
2593         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2594                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2595
2596         ipv4_dst_ops.gc_thresh = ~0;
2597         ip_rt_max_size = INT_MAX;
2598
2599         devinet_init();
2600         ip_fib_init();
2601
2602         if (ip_rt_proc_init())
2603                 pr_err("Unable to create route proc files\n");
2604 #ifdef CONFIG_XFRM
2605         xfrm_init();
2606         xfrm4_init(ip_rt_max_size);
2607 #endif
2608         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2609
2610 #ifdef CONFIG_SYSCTL
2611         register_pernet_subsys(&sysctl_route_ops);
2612 #endif
2613         register_pernet_subsys(&rt_genid_ops);
2614         register_pernet_subsys(&ipv4_inetpeer_ops);
2615         return rc;
2616 }
2617
#ifdef CONFIG_SYSCTL
/*
 * Register the static "net/ipv4/route" sysctl table for init_net.
 *
 * This exists as a separate init hook only because of the current IPv4
 * initialisation order; once that order is cleaned up, this function
 * should no longer be necessary.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route",
			    ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */