ipv4: Introduce IN_DEV_NET_ROUTE_LOCALNET
[linux-3.10.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
/*
 * Extract the routing-relevant part of a flow's TOS field: the bits in
 * IPTOS_RT_MASK plus the RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound for an IPv4 MTU (65520 bytes). */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout: 300 seconds, expressed in jiffies. */
#define RT_GC_TIMEOUT	(300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/* dst_ops ->ifdown hook; intentionally does nothing for IPv4 routes. */
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how)
{
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
/* Defined further down; declared here so ipv4_dst_ops can reference it. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb, const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 static inline int rt_genid(struct net *net)
206 {
207         return atomic_read(&net->ipv4.rt_genid);
208 }
209
210 #ifdef CONFIG_PROC_FS
211 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
212 {
213         if (*pos)
214                 return NULL;
215         return SEQ_START_TOKEN;
216 }
217
218 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
219 {
220         ++*pos;
221         return NULL;
222 }
223
/* seq_file ->stop: ->start takes no resources, so nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
227
228 static int rt_cache_seq_show(struct seq_file *seq, void *v)
229 {
230         if (v == SEQ_START_TOKEN)
231                 seq_printf(seq, "%-127s\n",
232                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
233                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
234                            "HHUptod\tSpecDst");
235         return 0;
236 }
237
238 static const struct seq_operations rt_cache_seq_ops = {
239         .start  = rt_cache_seq_start,
240         .next   = rt_cache_seq_next,
241         .stop   = rt_cache_seq_stop,
242         .show   = rt_cache_seq_show,
243 };
244
245 static int rt_cache_seq_open(struct inode *inode, struct file *file)
246 {
247         return seq_open(file, &rt_cache_seq_ops);
248 }
249
250 static const struct file_operations rt_cache_seq_fops = {
251         .owner   = THIS_MODULE,
252         .open    = rt_cache_seq_open,
253         .read    = seq_read,
254         .llseek  = seq_lseek,
255         .release = seq_release,
256 };
257
258
259 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
260 {
261         int cpu;
262
263         if (*pos == 0)
264                 return SEQ_START_TOKEN;
265
266         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
267                 if (!cpu_possible(cpu))
268                         continue;
269                 *pos = cpu+1;
270                 return &per_cpu(rt_cache_stat, cpu);
271         }
272         return NULL;
273 }
274
275 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 {
277         int cpu;
278
279         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
280                 if (!cpu_possible(cpu))
281                         continue;
282                 *pos = cpu+1;
283                 return &per_cpu(rt_cache_stat, cpu);
284         }
285         return NULL;
286
287 }
288
/* seq_file ->stop: the iterator holds no locks or references. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
293
294 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
295 {
296         struct rt_cache_stat *st = v;
297
298         if (v == SEQ_START_TOKEN) {
299                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
300                 return 0;
301         }
302
303         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
304                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
305                    dst_entries_get_slow(&ipv4_dst_ops),
306                    st->in_hit,
307                    st->in_slow_tot,
308                    st->in_slow_mc,
309                    st->in_no_route,
310                    st->in_brd,
311                    st->in_martian_dst,
312                    st->in_martian_src,
313
314                    st->out_hit,
315                    st->out_slow_tot,
316                    st->out_slow_mc,
317
318                    st->gc_total,
319                    st->gc_ignored,
320                    st->gc_goal_miss,
321                    st->gc_dst_overflow,
322                    st->in_hlist_search,
323                    st->out_hlist_search
324                 );
325         return 0;
326 }
327
328 static const struct seq_operations rt_cpu_seq_ops = {
329         .start  = rt_cpu_seq_start,
330         .next   = rt_cpu_seq_next,
331         .stop   = rt_cpu_seq_stop,
332         .show   = rt_cpu_seq_show,
333 };
334
335
336 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
337 {
338         return seq_open(file, &rt_cpu_seq_ops);
339 }
340
341 static const struct file_operations rt_cpu_seq_fops = {
342         .owner   = THIS_MODULE,
343         .open    = rt_cpu_seq_open,
344         .read    = seq_read,
345         .llseek  = seq_lseek,
346         .release = seq_release,
347 };
348
349 #ifdef CONFIG_IP_ROUTE_CLASSID
350 static int rt_acct_proc_show(struct seq_file *m, void *v)
351 {
352         struct ip_rt_acct *dst, *src;
353         unsigned int i, j;
354
355         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
356         if (!dst)
357                 return -ENOMEM;
358
359         for_each_possible_cpu(i) {
360                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
361                 for (j = 0; j < 256; j++) {
362                         dst[j].o_bytes   += src[j].o_bytes;
363                         dst[j].o_packets += src[j].o_packets;
364                         dst[j].i_bytes   += src[j].i_bytes;
365                         dst[j].i_packets += src[j].i_packets;
366                 }
367         }
368
369         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
370         kfree(dst);
371         return 0;
372 }
373
374 static int rt_acct_proc_open(struct inode *inode, struct file *file)
375 {
376         return single_open(file, rt_acct_proc_show, NULL);
377 }
378
379 static const struct file_operations rt_acct_proc_fops = {
380         .owner          = THIS_MODULE,
381         .open           = rt_acct_proc_open,
382         .read           = seq_read,
383         .llseek         = seq_lseek,
384         .release        = single_release,
385 };
386 #endif
387
388 static int __net_init ip_rt_do_proc_init(struct net *net)
389 {
390         struct proc_dir_entry *pde;
391
392         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
393                         &rt_cache_seq_fops);
394         if (!pde)
395                 goto err1;
396
397         pde = proc_create("rt_cache", S_IRUGO,
398                           net->proc_net_stat, &rt_cpu_seq_fops);
399         if (!pde)
400                 goto err2;
401
402 #ifdef CONFIG_IP_ROUTE_CLASSID
403         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
404         if (!pde)
405                 goto err3;
406 #endif
407         return 0;
408
409 #ifdef CONFIG_IP_ROUTE_CLASSID
410 err3:
411         remove_proc_entry("rt_cache", net->proc_net_stat);
412 #endif
413 err2:
414         remove_proc_entry("rt_cache", net->proc_net);
415 err1:
416         return -ENOMEM;
417 }
418
419 static void __net_exit ip_rt_do_proc_exit(struct net *net)
420 {
421         remove_proc_entry("rt_cache", net->proc_net_stat);
422         remove_proc_entry("rt_cache", net->proc_net);
423 #ifdef CONFIG_IP_ROUTE_CLASSID
424         remove_proc_entry("rt_acct", net->proc_net);
425 #endif
426 }
427
428 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
429         .init = ip_rt_do_proc_init,
430         .exit = ip_rt_do_proc_exit,
431 };
432
433 static int __init ip_rt_proc_init(void)
434 {
435         return register_pernet_subsys(&ip_rt_proc_ops);
436 }
437
438 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
443 #endif /* CONFIG_PROC_FS */
444
445 static inline bool rt_is_expired(const struct rtable *rth)
446 {
447         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
448 }
449
450 /*
451  * Perturbation of rt_genid by a small quantity [1..256]
452  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
453  * many times (2^24) without giving recent rt_genid.
454  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
455  */
456 static void rt_cache_invalidate(struct net *net)
457 {
458         unsigned char shuffle;
459
460         get_random_bytes(&shuffle, sizeof(shuffle));
461         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
462 }
463
/*
 * Invalidate all routes of @net.
 *
 * @delay is part of the interface (historically: negative means invalidate
 * only, non-negative means invalidate and flush), but this implementation
 * does not look at it: every call simply bumps the generation id.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
}
472
473 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
474                                            struct sk_buff *skb,
475                                            const void *daddr)
476 {
477         struct net_device *dev = dst->dev;
478         const __be32 *pkey = daddr;
479         const struct rtable *rt;
480         struct neighbour *n;
481
482         rt = (const struct rtable *) dst;
483         if (rt->rt_gateway)
484                 pkey = (const __be32 *) &rt->rt_gateway;
485         else if (skb)
486                 pkey = &ip_hdr(skb)->daddr;
487
488         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
489         if (n)
490                 return n;
491         return neigh_create(&arp_tbl, pkey, dev);
492 }
493
494 /*
495  * Peer allocation may fail only in serious out-of-memory conditions.  However
496  * we still can generate some output.
497  * Random ID selection looks a bit dangerous because we have no chances to
498  * select ID being unique in a reasonable period of time.
499  * But broken packet identifier may be better than no packet at all.
500  */
501 static void ip_select_fb_ident(struct iphdr *iph)
502 {
503         static DEFINE_SPINLOCK(ip_fb_id_lock);
504         static u32 ip_fallback_id;
505         u32 salt;
506
507         spin_lock_bh(&ip_fb_id_lock);
508         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
509         iph->id = htons(salt & 0xFFFF);
510         ip_fallback_id = salt;
511         spin_unlock_bh(&ip_fb_id_lock);
512 }
513
514 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
515 {
516         struct net *net = dev_net(dst->dev);
517         struct inet_peer *peer;
518
519         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
520         if (peer) {
521                 iph->id = htons(inet_getid(peer, more));
522                 inet_putpeer(peer);
523                 return;
524         }
525
526         ip_select_fb_ident(iph);
527 }
528 EXPORT_SYMBOL(__ip_select_ident);
529
530 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
531                              const struct iphdr *iph,
532                              int oif, u8 tos,
533                              u8 prot, u32 mark, int flow_flags)
534 {
535         if (sk) {
536                 const struct inet_sock *inet = inet_sk(sk);
537
538                 oif = sk->sk_bound_dev_if;
539                 mark = sk->sk_mark;
540                 tos = RT_CONN_FLAGS(sk);
541                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
542         }
543         flowi4_init_output(fl4, oif, mark, tos,
544                            RT_SCOPE_UNIVERSE, prot,
545                            flow_flags,
546                            iph->daddr, iph->saddr, 0, 0);
547 }
548
549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
550                                const struct sock *sk)
551 {
552         const struct iphdr *iph = ip_hdr(skb);
553         int oif = skb->dev->ifindex;
554         u8 tos = RT_TOS(iph->tos);
555         u8 prot = iph->protocol;
556         u32 mark = skb->mark;
557
558         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
559 }
560
561 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 {
563         const struct inet_sock *inet = inet_sk(sk);
564         const struct ip_options_rcu *inet_opt;
565         __be32 daddr = inet->inet_daddr;
566
567         rcu_read_lock();
568         inet_opt = rcu_dereference(inet->inet_opt);
569         if (inet_opt && inet_opt->opt.srr)
570                 daddr = inet_opt->opt.faddr;
571         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
572                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
573                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
574                            inet_sk_flowi_flags(sk),
575                            daddr, inet->inet_saddr, 0, 0);
576         rcu_read_unlock();
577 }
578
/* Build a flow key from @skb when one is supplied, else from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
587
588 static inline void rt_free(struct rtable *rt)
589 {
590         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
591 }
592
/* Serialises updates to next-hop exception tables (update_or_create_fnhe). */
static DEFINE_SPINLOCK(fnhe_lock);
594
595 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
596 {
597         struct fib_nh_exception *fnhe, *oldest;
598         struct rtable *orig;
599
600         oldest = rcu_dereference(hash->chain);
601         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
602              fnhe = rcu_dereference(fnhe->fnhe_next)) {
603                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
604                         oldest = fnhe;
605         }
606         orig = rcu_dereference(oldest->fnhe_rth);
607         if (orig) {
608                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
609                 rt_free(orig);
610         }
611         return oldest;
612 }
613
614 static inline u32 fnhe_hashfun(__be32 daddr)
615 {
616         u32 hval;
617
618         hval = (__force u32) daddr;
619         hval ^= (hval >> 11) ^ (hval >> 22);
620
621         return hval & (FNHE_HASH_SIZE - 1);
622 }
623
624 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
625                                   u32 pmtu, unsigned long expires)
626 {
627         struct fnhe_hash_bucket *hash;
628         struct fib_nh_exception *fnhe;
629         int depth;
630         u32 hval = fnhe_hashfun(daddr);
631
632         spin_lock_bh(&fnhe_lock);
633
634         hash = nh->nh_exceptions;
635         if (!hash) {
636                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
637                 if (!hash)
638                         goto out_unlock;
639                 nh->nh_exceptions = hash;
640         }
641
642         hash += hval;
643
644         depth = 0;
645         for (fnhe = rcu_dereference(hash->chain); fnhe;
646              fnhe = rcu_dereference(fnhe->fnhe_next)) {
647                 if (fnhe->fnhe_daddr == daddr)
648                         break;
649                 depth++;
650         }
651
652         if (fnhe) {
653                 if (gw)
654                         fnhe->fnhe_gw = gw;
655                 if (pmtu) {
656                         fnhe->fnhe_pmtu = pmtu;
657                         fnhe->fnhe_expires = expires;
658                 }
659         } else {
660                 if (depth > FNHE_RECLAIM_DEPTH)
661                         fnhe = fnhe_oldest(hash);
662                 else {
663                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
664                         if (!fnhe)
665                                 goto out_unlock;
666
667                         fnhe->fnhe_next = hash->chain;
668                         rcu_assign_pointer(hash->chain, fnhe);
669                 }
670                 fnhe->fnhe_daddr = daddr;
671                 fnhe->fnhe_gw = gw;
672                 fnhe->fnhe_pmtu = pmtu;
673                 fnhe->fnhe_expires = expires;
674         }
675
676         fnhe->fnhe_stamp = jiffies;
677
678 out_unlock:
679         spin_unlock_bh(&fnhe_lock);
680         return;
681 }
682
683 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
684                              bool kill_route)
685 {
686         __be32 new_gw = icmp_hdr(skb)->un.gateway;
687         __be32 old_gw = ip_hdr(skb)->saddr;
688         struct net_device *dev = skb->dev;
689         struct in_device *in_dev;
690         struct fib_result res;
691         struct neighbour *n;
692         struct net *net;
693
694         switch (icmp_hdr(skb)->code & 7) {
695         case ICMP_REDIR_NET:
696         case ICMP_REDIR_NETTOS:
697         case ICMP_REDIR_HOST:
698         case ICMP_REDIR_HOSTTOS:
699                 break;
700
701         default:
702                 return;
703         }
704
705         if (rt->rt_gateway != old_gw)
706                 return;
707
708         in_dev = __in_dev_get_rcu(dev);
709         if (!in_dev)
710                 return;
711
712         net = dev_net(dev);
713         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
714             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
715             ipv4_is_zeronet(new_gw))
716                 goto reject_redirect;
717
718         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
719                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
720                         goto reject_redirect;
721                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
722                         goto reject_redirect;
723         } else {
724                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
725                         goto reject_redirect;
726         }
727
728         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
729         if (n) {
730                 if (!(n->nud_state & NUD_VALID)) {
731                         neigh_event_send(n, NULL);
732                 } else {
733                         if (fib_lookup(net, fl4, &res) == 0) {
734                                 struct fib_nh *nh = &FIB_RES_NH(res);
735
736                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
737                                                       0, 0);
738                         }
739                         if (kill_route)
740                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
741                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
742                 }
743                 neigh_release(n);
744         }
745         return;
746
747 reject_redirect:
748 #ifdef CONFIG_IP_ROUTE_VERBOSE
749         if (IN_DEV_LOG_MARTIANS(in_dev)) {
750                 const struct iphdr *iph = (const struct iphdr *) skb->data;
751                 __be32 daddr = iph->daddr;
752                 __be32 saddr = iph->saddr;
753
754                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
755                                      "  Advised path = %pI4 -> %pI4\n",
756                                      &old_gw, dev->name, &new_gw,
757                                      &saddr, &daddr);
758         }
759 #endif
760         ;
761 }
762
763 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
764 {
765         struct rtable *rt;
766         struct flowi4 fl4;
767
768         rt = (struct rtable *) dst;
769
770         ip_rt_build_flow_key(&fl4, sk, skb);
771         __ip_do_redirect(rt, skb, &fl4, true);
772 }
773
774 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
775 {
776         struct rtable *rt = (struct rtable *)dst;
777         struct dst_entry *ret = dst;
778
779         if (rt) {
780                 if (dst->obsolete > 0) {
781                         ip_rt_put(rt);
782                         ret = NULL;
783                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
784                            rt->dst.expires) {
785                         ip_rt_put(rt);
786                         ret = NULL;
787                 }
788         }
789         return ret;
790 }
791
792 /*
793  * Algorithm:
794  *      1. The first ip_rt_redirect_number redirects are sent
795  *         with exponential backoff, then we stop sending them at all,
796  *         assuming that the host ignores our redirects.
797  *      2. If we did not see packets requiring redirects
798  *         during ip_rt_redirect_silence, we assume that the host
799  *         forgot redirected route and start to send redirects again.
800  *
801  * This algorithm is much cheaper and more intelligent than dumb load limiting
802  * in icmp.c.
803  *
804  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
805  * and "frag. need" (breaks PMTU discovery) in icmp.c.
806  */
807
808 void ip_rt_send_redirect(struct sk_buff *skb)
809 {
810         struct rtable *rt = skb_rtable(skb);
811         struct in_device *in_dev;
812         struct inet_peer *peer;
813         struct net *net;
814         int log_martians;
815
816         rcu_read_lock();
817         in_dev = __in_dev_get_rcu(rt->dst.dev);
818         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
819                 rcu_read_unlock();
820                 return;
821         }
822         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
823         rcu_read_unlock();
824
825         net = dev_net(rt->dst.dev);
826         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
827         if (!peer) {
828                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
829                 return;
830         }
831
832         /* No redirected packets during ip_rt_redirect_silence;
833          * reset the algorithm.
834          */
835         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
836                 peer->rate_tokens = 0;
837
838         /* Too many ignored redirects; do not send anything
839          * set dst.rate_last to the last seen redirected packet.
840          */
841         if (peer->rate_tokens >= ip_rt_redirect_number) {
842                 peer->rate_last = jiffies;
843                 goto out_put_peer;
844         }
845
846         /* Check for load limit; set rate_last to the latest sent
847          * redirect.
848          */
849         if (peer->rate_tokens == 0 ||
850             time_after(jiffies,
851                        (peer->rate_last +
852                         (ip_rt_redirect_load << peer->rate_tokens)))) {
853                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
854                 peer->rate_last = jiffies;
855                 ++peer->rate_tokens;
856 #ifdef CONFIG_IP_ROUTE_VERBOSE
857                 if (log_martians &&
858                     peer->rate_tokens == ip_rt_redirect_number)
859                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
860                                              &ip_hdr(skb)->saddr, inet_iif(skb),
861                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
862 #endif
863         }
864 out_put_peer:
865         inet_putpeer(peer);
866 }
867
868 static int ip_error(struct sk_buff *skb)
869 {
870         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
871         struct rtable *rt = skb_rtable(skb);
872         struct inet_peer *peer;
873         unsigned long now;
874         struct net *net;
875         bool send;
876         int code;
877
878         net = dev_net(rt->dst.dev);
879         if (!IN_DEV_FORWARD(in_dev)) {
880                 switch (rt->dst.error) {
881                 case EHOSTUNREACH:
882                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
883                         break;
884
885                 case ENETUNREACH:
886                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
887                         break;
888                 }
889                 goto out;
890         }
891
892         switch (rt->dst.error) {
893         case EINVAL:
894         default:
895                 goto out;
896         case EHOSTUNREACH:
897                 code = ICMP_HOST_UNREACH;
898                 break;
899         case ENETUNREACH:
900                 code = ICMP_NET_UNREACH;
901                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
902                 break;
903         case EACCES:
904                 code = ICMP_PKT_FILTERED;
905                 break;
906         }
907
908         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
909
910         send = true;
911         if (peer) {
912                 now = jiffies;
913                 peer->rate_tokens += now - peer->rate_last;
914                 if (peer->rate_tokens > ip_rt_error_burst)
915                         peer->rate_tokens = ip_rt_error_burst;
916                 peer->rate_last = now;
917                 if (peer->rate_tokens >= ip_rt_error_cost)
918                         peer->rate_tokens -= ip_rt_error_cost;
919                 else
920                         send = false;
921                 inet_putpeer(peer);
922         }
923         if (send)
924                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
925
926 out:    kfree_skb(skb);
927         return 0;
928 }
929
930 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
931 {
932         struct fib_result res;
933
934         if (mtu < ip_rt_min_pmtu)
935                 mtu = ip_rt_min_pmtu;
936
937         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
938                 struct fib_nh *nh = &FIB_RES_NH(res);
939
940                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
941                                       jiffies + ip_rt_mtu_expires);
942         }
943         return mtu;
944 }
945
946 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
947                               struct sk_buff *skb, u32 mtu)
948 {
949         struct rtable *rt = (struct rtable *) dst;
950         struct flowi4 fl4;
951
952         ip_rt_build_flow_key(&fl4, sk, skb);
953         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
954
955         if (!rt->rt_pmtu) {
956                 dst->obsolete = DST_OBSOLETE_KILL;
957         } else {
958                 rt->rt_pmtu = mtu;
959                 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
960         }
961 }
962
963 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
964                       int oif, u32 mark, u8 protocol, int flow_flags)
965 {
966         const struct iphdr *iph = (const struct iphdr *) skb->data;
967         struct flowi4 fl4;
968         struct rtable *rt;
969
970         __build_flow_key(&fl4, NULL, iph, oif,
971                          RT_TOS(iph->tos), protocol, mark, flow_flags);
972         rt = __ip_route_output_key(net, &fl4);
973         if (!IS_ERR(rt)) {
974                 __ip_rt_update_pmtu(rt, &fl4, mtu);
975                 ip_rt_put(rt);
976         }
977 }
978 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
979
980 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
981 {
982         const struct iphdr *iph = (const struct iphdr *) skb->data;
983         struct flowi4 fl4;
984         struct rtable *rt;
985
986         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
987         rt = __ip_route_output_key(sock_net(sk), &fl4);
988         if (!IS_ERR(rt)) {
989                 __ip_rt_update_pmtu(rt, &fl4, mtu);
990                 ip_rt_put(rt);
991         }
992 }
993 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
994
995 void ipv4_redirect(struct sk_buff *skb, struct net *net,
996                    int oif, u32 mark, u8 protocol, int flow_flags)
997 {
998         const struct iphdr *iph = (const struct iphdr *) skb->data;
999         struct flowi4 fl4;
1000         struct rtable *rt;
1001
1002         __build_flow_key(&fl4, NULL, iph, oif,
1003                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1004         rt = __ip_route_output_key(net, &fl4);
1005         if (!IS_ERR(rt)) {
1006                 __ip_do_redirect(rt, skb, &fl4, false);
1007                 ip_rt_put(rt);
1008         }
1009 }
1010 EXPORT_SYMBOL_GPL(ipv4_redirect);
1011
1012 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1013 {
1014         const struct iphdr *iph = (const struct iphdr *) skb->data;
1015         struct flowi4 fl4;
1016         struct rtable *rt;
1017
1018         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1019         rt = __ip_route_output_key(sock_net(sk), &fl4);
1020         if (!IS_ERR(rt)) {
1021                 __ip_do_redirect(rt, skb, &fl4, false);
1022                 ip_rt_put(rt);
1023         }
1024 }
1025 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1026
1027 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1028 {
1029         struct rtable *rt = (struct rtable *) dst;
1030
1031         /* All IPV4 dsts are created with ->obsolete set to the value
1032          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1033          * into this function always.
1034          *
1035          * When a PMTU/redirect information update invalidates a
1036          * route, this is indicated by setting obsolete to
1037          * DST_OBSOLETE_KILL.
1038          */
1039         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1040                 return NULL;
1041         return dst;
1042 }
1043
1044 static void ipv4_link_failure(struct sk_buff *skb)
1045 {
1046         struct rtable *rt;
1047
1048         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1049
1050         rt = skb_rtable(skb);
1051         if (rt)
1052                 dst_set_expires(&rt->dst, 0);
1053 }
1054
1055 static int ip_rt_bug(struct sk_buff *skb)
1056 {
1057         pr_debug("%s: %pI4 -> %pI4, %s\n",
1058                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1059                  skb->dev ? skb->dev->name : "?");
1060         kfree_skb(skb);
1061         WARN_ON(1);
1062         return 0;
1063 }
1064
1065 /*
1066    We do not cache source address of outgoing interface,
1067    because it is used only by IP RR, TS and SRR options,
1068    so that it out of fast path.
1069
1070    BTW remember: "addr" is allowed to be not aligned
1071    in IP options!
1072  */
1073
1074 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1075 {
1076         __be32 src;
1077
1078         if (rt_is_output_route(rt))
1079                 src = ip_hdr(skb)->saddr;
1080         else {
1081                 struct fib_result res;
1082                 struct flowi4 fl4;
1083                 struct iphdr *iph;
1084
1085                 iph = ip_hdr(skb);
1086
1087                 memset(&fl4, 0, sizeof(fl4));
1088                 fl4.daddr = iph->daddr;
1089                 fl4.saddr = iph->saddr;
1090                 fl4.flowi4_tos = RT_TOS(iph->tos);
1091                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1092                 fl4.flowi4_iif = skb->dev->ifindex;
1093                 fl4.flowi4_mark = skb->mark;
1094
1095                 rcu_read_lock();
1096                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1097                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1098                 else
1099                         src = inet_select_addr(rt->dst.dev,
1100                                                rt_nexthop(rt, iph->daddr),
1101                                                RT_SCOPE_UNIVERSE);
1102                 rcu_read_unlock();
1103         }
1104         memcpy(addr, &src, 4);
1105 }
1106
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from @tag only when the
 * route's corresponding half is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1116
1117 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1118 {
1119         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1120
1121         if (advmss == 0) {
1122                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1123                                ip_rt_min_advmss);
1124                 if (advmss > 65535 - 40)
1125                         advmss = 65535 - 40;
1126         }
1127         return advmss;
1128 }
1129
1130 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1131 {
1132         const struct rtable *rt = (const struct rtable *) dst;
1133         unsigned int mtu = rt->rt_pmtu;
1134
1135         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1136                 mtu = 0;
1137
1138         if (!mtu)
1139                 mtu = dst_metric_raw(dst, RTAX_MTU);
1140
1141         if (mtu && rt_is_output_route(rt))
1142                 return mtu;
1143
1144         mtu = dst->dev->mtu;
1145
1146         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1147                 if (rt->rt_gateway && mtu > 576)
1148                         mtu = 576;
1149         }
1150
1151         if (mtu > IP_MAX_MTU)
1152                 mtu = IP_MAX_MTU;
1153
1154         return mtu;
1155 }
1156
1157 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1158 {
1159         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1160         struct fib_nh_exception *fnhe;
1161         u32 hval;
1162
1163         if (!hash)
1164                 return NULL;
1165
1166         hval = fnhe_hashfun(daddr);
1167
1168         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1169              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1170                 if (fnhe->fnhe_daddr == daddr)
1171                         return fnhe;
1172         }
1173         return NULL;
1174 }
1175
1176 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1177                               __be32 daddr)
1178 {
1179         bool ret = false;
1180
1181         spin_lock_bh(&fnhe_lock);
1182
1183         if (daddr == fnhe->fnhe_daddr) {
1184                 struct rtable *orig;
1185
1186                 if (fnhe->fnhe_pmtu) {
1187                         unsigned long expires = fnhe->fnhe_expires;
1188                         unsigned long diff = expires - jiffies;
1189
1190                         if (time_before(jiffies, expires)) {
1191                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1192                                 dst_set_expires(&rt->dst, diff);
1193                         }
1194                 }
1195                 if (fnhe->fnhe_gw) {
1196                         rt->rt_flags |= RTCF_REDIRECTED;
1197                         rt->rt_gateway = fnhe->fnhe_gw;
1198                 }
1199
1200                 orig = rcu_dereference(fnhe->fnhe_rth);
1201                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1202                 if (orig)
1203                         rt_free(orig);
1204
1205                 fnhe->fnhe_stamp = jiffies;
1206                 ret = true;
1207         } else {
1208                 /* Routes we intend to cache in nexthop exception have
1209                  * the DST_NOCACHE bit clear.  However, if we are
1210                  * unsuccessful at storing this route into the cache
1211                  * we really need to set it.
1212                  */
1213                 rt->dst.flags |= DST_NOCACHE;
1214         }
1215         spin_unlock_bh(&fnhe_lock);
1216
1217         return ret;
1218 }
1219
1220 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1221 {
1222         struct rtable *orig, *prev, **p;
1223         bool ret = true;
1224
1225         if (rt_is_input_route(rt)) {
1226                 p = (struct rtable **)&nh->nh_rth_input;
1227         } else {
1228                 if (!nh->nh_pcpu_rth_output)
1229                         goto nocache;
1230                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1231         }
1232         orig = *p;
1233
1234         prev = cmpxchg(p, orig, rt);
1235         if (prev == orig) {
1236                 if (orig)
1237                         rt_free(orig);
1238         } else {
1239                 /* Routes we intend to cache in the FIB nexthop have
1240                  * the DST_NOCACHE bit clear.  However, if we are
1241                  * unsuccessful at storing this route into the cache
1242                  * we really need to set it.
1243                  */
1244 nocache:
1245                 rt->dst.flags |= DST_NOCACHE;
1246                 ret = false;
1247         }
1248
1249         return ret;
1250 }
1251
1252 static DEFINE_SPINLOCK(rt_uncached_lock);
1253 static LIST_HEAD(rt_uncached_list);
1254
1255 static void rt_add_uncached_list(struct rtable *rt)
1256 {
1257         spin_lock_bh(&rt_uncached_lock);
1258         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1259         spin_unlock_bh(&rt_uncached_lock);
1260 }
1261
1262 static void ipv4_dst_destroy(struct dst_entry *dst)
1263 {
1264         struct rtable *rt = (struct rtable *) dst;
1265
1266         if (dst->flags & DST_NOCACHE) {
1267                 spin_lock_bh(&rt_uncached_lock);
1268                 list_del(&rt->rt_uncached);
1269                 spin_unlock_bh(&rt_uncached_lock);
1270         }
1271 }
1272
1273 void rt_flush_dev(struct net_device *dev)
1274 {
1275         if (!list_empty(&rt_uncached_list)) {
1276                 struct net *net = dev_net(dev);
1277                 struct rtable *rt;
1278
1279                 spin_lock_bh(&rt_uncached_lock);
1280                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1281                         if (rt->dst.dev != dev)
1282                                 continue;
1283                         rt->dst.dev = net->loopback_dev;
1284                         dev_hold(rt->dst.dev);
1285                         dev_put(dev);
1286                 }
1287                 spin_unlock_bh(&rt_uncached_lock);
1288         }
1289 }
1290
1291 static bool rt_cache_valid(const struct rtable *rt)
1292 {
1293         return  rt &&
1294                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1295                 !rt_is_expired(rt);
1296 }
1297
1298 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1299                            const struct fib_result *res,
1300                            struct fib_nh_exception *fnhe,
1301                            struct fib_info *fi, u16 type, u32 itag)
1302 {
1303         bool cached = false;
1304
1305         if (fi) {
1306                 struct fib_nh *nh = &FIB_RES_NH(*res);
1307
1308                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1309                         rt->rt_gateway = nh->nh_gw;
1310                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1311 #ifdef CONFIG_IP_ROUTE_CLASSID
1312                 rt->dst.tclassid = nh->nh_tclassid;
1313 #endif
1314                 if (unlikely(fnhe))
1315                         cached = rt_bind_exception(rt, fnhe, daddr);
1316                 else if (!(rt->dst.flags & DST_NOCACHE))
1317                         cached = rt_cache_route(nh, rt);
1318         }
1319         if (unlikely(!cached))
1320                 rt_add_uncached_list(rt);
1321
1322 #ifdef CONFIG_IP_ROUTE_CLASSID
1323 #ifdef CONFIG_IP_MULTIPLE_TABLES
1324         set_class_tag(rt, res->tclassid);
1325 #endif
1326         set_class_tag(rt, itag);
1327 #endif
1328 }
1329
1330 static struct rtable *rt_dst_alloc(struct net_device *dev,
1331                                    bool nopolicy, bool noxfrm, bool will_cache)
1332 {
1333         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1334                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1335                          (nopolicy ? DST_NOPOLICY : 0) |
1336                          (noxfrm ? DST_NOXFRM : 0));
1337 }
1338
1339 /* called in rcu_read_lock() section */
1340 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1341                                 u8 tos, struct net_device *dev, int our)
1342 {
1343         struct rtable *rth;
1344         struct in_device *in_dev = __in_dev_get_rcu(dev);
1345         u32 itag = 0;
1346         int err;
1347
1348         /* Primary sanity checks. */
1349
1350         if (in_dev == NULL)
1351                 return -EINVAL;
1352
1353         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1354             skb->protocol != htons(ETH_P_IP))
1355                 goto e_inval;
1356
1357         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1358                 if (ipv4_is_loopback(saddr))
1359                         goto e_inval;
1360
1361         if (ipv4_is_zeronet(saddr)) {
1362                 if (!ipv4_is_local_multicast(daddr))
1363                         goto e_inval;
1364         } else {
1365                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1366                                           in_dev, &itag);
1367                 if (err < 0)
1368                         goto e_err;
1369         }
1370         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1371                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1372         if (!rth)
1373                 goto e_nobufs;
1374
1375 #ifdef CONFIG_IP_ROUTE_CLASSID
1376         rth->dst.tclassid = itag;
1377 #endif
1378         rth->dst.output = ip_rt_bug;
1379
1380         rth->rt_genid   = rt_genid(dev_net(dev));
1381         rth->rt_flags   = RTCF_MULTICAST;
1382         rth->rt_type    = RTN_MULTICAST;
1383         rth->rt_is_input= 1;
1384         rth->rt_iif     = 0;
1385         rth->rt_pmtu    = 0;
1386         rth->rt_gateway = 0;
1387         INIT_LIST_HEAD(&rth->rt_uncached);
1388         if (our) {
1389                 rth->dst.input= ip_local_deliver;
1390                 rth->rt_flags |= RTCF_LOCAL;
1391         }
1392
1393 #ifdef CONFIG_IP_MROUTE
1394         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1395                 rth->dst.input = ip_mr_input;
1396 #endif
1397         RT_CACHE_STAT_INC(in_slow_mc);
1398
1399         skb_dst_set(skb, &rth->dst);
1400         return 0;
1401
1402 e_nobufs:
1403         return -ENOBUFS;
1404 e_inval:
1405         return -EINVAL;
1406 e_err:
1407         return err;
1408 }
1409
1410
1411 static void ip_handle_martian_source(struct net_device *dev,
1412                                      struct in_device *in_dev,
1413                                      struct sk_buff *skb,
1414                                      __be32 daddr,
1415                                      __be32 saddr)
1416 {
1417         RT_CACHE_STAT_INC(in_martian_src);
1418 #ifdef CONFIG_IP_ROUTE_VERBOSE
1419         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1420                 /*
1421                  *      RFC1812 recommendation, if source is martian,
1422                  *      the only hint is MAC header.
1423                  */
1424                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1425                         &daddr, &saddr, dev->name);
1426                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1427                         print_hex_dump(KERN_WARNING, "ll header: ",
1428                                        DUMP_PREFIX_OFFSET, 16, 1,
1429                                        skb_mac_header(skb),
1430                                        dev->hard_header_len, true);
1431                 }
1432         }
1433 #endif
1434 }
1435
1436 /* called in rcu_read_lock() section */
     /*
      * Attach a forwarding route for an input packet to @skb.
      *
      * @res is the FIB lookup result for the packet, @in_dev the receiving
      * device.  The source address is validated first; a valid input route
      * cached on the nexthop is reused without taking a reference, otherwise
      * a new route with dst.input = ip_forward and dst.output = ip_output is
      * allocated and handed to rt_set_nexthop().
      *
      * Returns 0 on success, -EINVAL when the output device has no in_device
      * or a non-IP packet would leave through the interface it arrived on
      * without proxy-ARP PVLAN enabled, -ENOBUFS when allocation fails, or
      * the negative result of fib_validate_source().
      */
1437 static int __mkroute_input(struct sk_buff *skb,
1438                            const struct fib_result *res,
1439                            struct in_device *in_dev,
1440                            __be32 daddr, __be32 saddr, u32 tos)
1441 {
1442         struct rtable *rth;
1443         int err;
1444         struct in_device *out_dev;
1445         unsigned int flags = 0;
1446         bool do_cache;
1447         u32 itag;
1448
1449         /* get a working reference to the output device */
1450         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1451         if (out_dev == NULL) {
1452                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1453                 return -EINVAL;
1454         }
1455
1456
             /* NOTE(review): itag is not initialised here; this relies on
              * fib_validate_source() writing it before it is read below --
              * confirm against that function.
              */
1457         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1458                                   in_dev->dev, in_dev, &itag);
1459         if (err < 0) {
1460                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1461                                          saddr);
1462
1463                 goto cleanup;
1464         }
1465
             /* Packet would go back out the interface it came in on and the
              * validation result is non-zero: flag the route so a redirect
              * can be sent, provided the medium is shared or the source is
              * on-link with the gateway.
              */
1466         if (out_dev == in_dev && err &&
1467             (IN_DEV_SHARED_MEDIA(out_dev) ||
1468              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1469                 flags |= RTCF_DOREDIRECT;
1470
1471         if (skb->protocol != htons(ETH_P_IP)) {
1472                 /* Not IP (i.e. ARP). Do not create route, if it is
1473                  * invalid for proxy arp. DNAT routes are always valid.
1474                  *
1475                  * Proxy arp feature have been extended to allow, ARP
1476                  * replies back to the same interface, to support
1477                  * Private VLAN switch technologies. See arp.c.
1478                  */
1479                 if (out_dev == in_dev &&
1480                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1481                         err = -EINVAL;
1482                         goto cleanup;
1483                 }
1484         }
1485
             /* The nexthop's cached input route is only consulted (and the
              * new route only cached) when there is a fib_info and no class
              * tag came back from source validation.
              */
1486         do_cache = false;
1487         if (res->fi) {
1488                 if (!itag) {
1489                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1490                         if (rt_cache_valid(rth)) {
1491                                 skb_dst_set_noref(skb, &rth->dst);
1492                                 goto out;
1493                         }
1494                         do_cache = true;
1495                 }
1496         }
1497
1498         rth = rt_dst_alloc(out_dev->dev,
1499                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1500                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1501         if (!rth) {
1502                 err = -ENOBUFS;
1503                 goto cleanup;
1504         }
1505
1506         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1507         rth->rt_flags = flags;
1508         rth->rt_type = res->type;
1509         rth->rt_is_input = 1;
1510         rth->rt_iif     = 0;
1511         rth->rt_pmtu    = 0;
1512         rth->rt_gateway = 0;
1513         INIT_LIST_HEAD(&rth->rt_uncached);
1514
1515         rth->dst.input = ip_forward;
1516         rth->dst.output = ip_output;
1517
1518         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1519         skb_dst_set(skb, &rth->dst);
1520 out:
             /* Success, whether the route was reused or newly built. */
1521         err = 0;
1522  cleanup:
1523         return err;
1524 }
1525
1526 static int ip_mkroute_input(struct sk_buff *skb,
1527                             struct fib_result *res,
1528                             const struct flowi4 *fl4,
1529                             struct in_device *in_dev,
1530                             __be32 daddr, __be32 saddr, u32 tos)
1531 {
1532 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1533         if (res->fi && res->fi->fib_nhs > 1)
1534                 fib_select_multipath(res);
1535 #endif
1536
1537         /* create a routing cache entry */
1538         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1539 }
1540
1541 /*
1542  *      NOTE. We drop all the packets that have local source
1543  *      addresses, because every properly looped back packet
1544  *      must have correct destination already attached by output routine.
1545  *
1546  *      Such approach solves two big problems:
1547  *      1. Not simplex devices are handled properly.
1548  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1549  *      called with rcu_read_lock()
1550  */
1551
     /*
      * Resolve the input route for a non-multicast packet received on @dev
      * and attach it to @skb.
      *
      * Flow: reject martian source/destination addresses, look the flow up
      * in the FIB, then build one of
      *   - a local/broadcast delivery route (local_input),
      *   - an error route handled by ip_error() (no_route -> local_input),
      *   - a forwarding route (ip_mkroute_input()).
      *
      * Returns 0 on success, -EINVAL for a disabled device or martian
      * addresses, -ENOBUFS when no route can be allocated, or a negative
      * error from fib_validate_source() / ip_mkroute_input().
      */
1552 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1553                                u8 tos, struct net_device *dev)
1554 {
1555         struct fib_result res;
1556         struct in_device *in_dev = __in_dev_get_rcu(dev);
1557         struct flowi4   fl4;
1558         unsigned int    flags = 0;
1559         u32             itag = 0;
1560         struct rtable   *rth;
1561         int             err = -EINVAL;
1562         struct net    *net = dev_net(dev);
1563         bool do_cache;
1564
1565         /* IP on this device is disabled. */
1566
1567         if (!in_dev)
1568                 goto out;
1569
1570         /* Check for the most weird martians, which can be not detected
1571            by fib_lookup.
1572          */
1573
1574         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1575                 goto martian_source;
1576
             /* brd_input falls into local_input, which tests res.fi; make
              * sure it is defined before any jump that skips fib_lookup().
              */
1577         res.fi = NULL;
1578         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1579                 goto brd_input;
1580
1581         /* Accept zero addresses only to limited broadcast;
1582          * I even do not know to fix it or not. Waiting for complains :-)
1583          */
1584         if (ipv4_is_zeronet(saddr))
1585                 goto martian_source;
1586
1587         if (ipv4_is_zeronet(daddr))
1588                 goto martian_destination;
1589
1590         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1591          * and call it once if daddr or/and saddr are loopback addresses
1592          */
1593         if (ipv4_is_loopback(daddr)) {
1594                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1595                         goto martian_destination;
1596         } else if (ipv4_is_loopback(saddr)) {
1597                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1598                         goto martian_source;
1599         }
1600
1601         /*
1602          *      Now we are ready to route packet.
1603          */
1604         fl4.flowi4_oif = 0;
1605         fl4.flowi4_iif = dev->ifindex;
1606         fl4.flowi4_mark = skb->mark;
1607         fl4.flowi4_tos = tos;
1608         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1609         fl4.daddr = daddr;
1610         fl4.saddr = saddr;
1611         err = fib_lookup(net, &fl4, &res);
1612         if (err != 0)
1613                 goto no_route;
1614
1615         RT_CACHE_STAT_INC(in_slow_tot);
1616
1617         if (res.type == RTN_BROADCAST)
1618                 goto brd_input;
1619
1620         if (res.type == RTN_LOCAL) {
1621                 err = fib_validate_source(skb, saddr, daddr, tos,
1622                                           net->loopback_dev->ifindex,
1623                                           dev, in_dev, &itag);
1624                 if (err < 0)
1625                         goto martian_source_keep_err;
1626                 goto local_input;
1627         }
1628
             /* Everything below is forwarding: it needs forwarding enabled
              * on the input device and a unicast result.
              */
1629         if (!IN_DEV_FORWARD(in_dev))
1630                 goto no_route;
1631         if (res.type != RTN_UNICAST)
1632                 goto martian_destination;
1633
1634         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1635 out:    return err;
1636
1637 brd_input:
1638         if (skb->protocol != htons(ETH_P_IP))
1639                 goto e_inval;
1640
1641         if (!ipv4_is_zeronet(saddr)) {
1642                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1643                                           in_dev, &itag);
1644                 if (err < 0)
1645                         goto martian_source_keep_err;
1646         }
1647         flags |= RTCF_BROADCAST;
1648         res.type = RTN_BROADCAST;
1649         RT_CACHE_STAT_INC(in_brd);
1650
             /* Reached directly for RTN_LOCAL, by fall-through for broadcast
              * and via no_route for unreachable destinations.
              */
1651 local_input:
             /* Reuse the input route cached on the nexthop when there is a
              * fib_info, no class tag and the cached entry is still valid.
              */
1652         do_cache = false;
1653         if (res.fi) {
1654                 if (!itag) {
1655                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1656                         if (rt_cache_valid(rth)) {
1657                                 skb_dst_set_noref(skb, &rth->dst);
1658                                 err = 0;
1659                                 goto out;
1660                         }
1661                         do_cache = true;
1662                 }
1663         }
1664
1665         rth = rt_dst_alloc(net->loopback_dev,
1666                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1667         if (!rth)
1668                 goto e_nobufs;
1669
1670         rth->dst.input= ip_local_deliver;
1671         rth->dst.output= ip_rt_bug;
1672 #ifdef CONFIG_IP_ROUTE_CLASSID
1673         rth->dst.tclassid = itag;
1674 #endif
1675
1676         rth->rt_genid = rt_genid(net);
1677         rth->rt_flags   = flags|RTCF_LOCAL;
1678         rth->rt_type    = res.type;
1679         rth->rt_is_input = 1;
1680         rth->rt_iif     = 0;
1681         rth->rt_pmtu    = 0;
1682         rth->rt_gateway = 0;
1683         INIT_LIST_HEAD(&rth->rt_uncached);
1684         if (res.type == RTN_UNREACHABLE) {
                     /* Error route: ip_error() switches on dst.error, so
                      * store the negated value of err.
                      */
1685                 rth->dst.input= ip_error;
1686                 rth->dst.error= -err;
1687                 rth->rt_flags   &= ~RTCF_LOCAL;
1688         }
1689         if (do_cache)
1690                 rt_cache_route(&FIB_RES_NH(res), rth);
1691         skb_dst_set(skb, &rth->dst);
1692         err = 0;
1693         goto out;
1694
1695 no_route:
1696         RT_CACHE_STAT_INC(in_no_route);
1697         res.type = RTN_UNREACHABLE;
1698         if (err == -ESRCH)
1699                 err = -ENETUNREACH;
1700         goto local_input;
1701
1702         /*
1703          *      Do not cache martian addresses: they should be logged (RFC1812)
1704          */
1705 martian_destination:
1706         RT_CACHE_STAT_INC(in_martian_dst);
1707 #ifdef CONFIG_IP_ROUTE_VERBOSE
1708         if (IN_DEV_LOG_MARTIANS(in_dev))
1709                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1710                                      &daddr, &saddr, dev->name);
1711 #endif
1712
             /* martian_destination falls through to here. */
1713 e_inval:
1714         err = -EINVAL;
1715         goto out;
1716
1717 e_nobufs:
1718         err = -ENOBUFS;
1719         goto out;
1720
1721 martian_source:
1722         err = -EINVAL;
             /* Entered directly when err already holds the negative result
              * of fib_validate_source().
              */
1723 martian_source_keep_err:
1724         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1725         goto out;
1726 }
1727
1728 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1729                          u8 tos, struct net_device *dev)
1730 {
1731         int res;
1732
1733         rcu_read_lock();
1734
1735         /* Multicast recognition logic is moved from route cache to here.
1736            The problem was that too many Ethernet cards have broken/missing
1737            hardware multicast filters :-( As result the host on multicasting
1738            network acquires a lot of useless route cache entries, sort of
1739            SDR messages from all the world. Now we try to get rid of them.
1740            Really, provided software IP multicast filter is organized
1741            reasonably (at least, hashed), it does not result in a slowdown
1742            comparing with route cache reject entries.
1743            Note, that multicast routers are not affected, because
1744            route cache entry is created eventually.
1745          */
1746         if (ipv4_is_multicast(daddr)) {
1747                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1748
1749                 if (in_dev) {
1750                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1751                                                   ip_hdr(skb)->protocol);
1752                         if (our
1753 #ifdef CONFIG_IP_MROUTE
1754                                 ||
1755                             (!ipv4_is_local_multicast(daddr) &&
1756                              IN_DEV_MFORWARD(in_dev))
1757 #endif
1758                            ) {
1759                                 int res = ip_route_input_mc(skb, daddr, saddr,
1760                                                             tos, dev, our);
1761                                 rcu_read_unlock();
1762                                 return res;
1763                         }
1764                 }
1765                 rcu_read_unlock();
1766                 return -EINVAL;
1767         }
1768         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1769         rcu_read_unlock();
1770         return res;
1771 }
1772 EXPORT_SYMBOL(ip_route_input_noref);
1773
1774 /* called with rcu_read_lock() */
1775 static struct rtable *__mkroute_output(const struct fib_result *res,
1776                                        const struct flowi4 *fl4, int orig_oif,
1777                                        struct net_device *dev_out,
1778                                        unsigned int flags)
1779 {
1780         struct fib_info *fi = res->fi;
1781         struct fib_nh_exception *fnhe;
1782         struct in_device *in_dev;
1783         u16 type = res->type;
1784         struct rtable *rth;
1785
1786         in_dev = __in_dev_get_rcu(dev_out);
1787         if (!in_dev)
1788                 return ERR_PTR(-EINVAL);
1789
1790         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1791                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1792                         return ERR_PTR(-EINVAL);
1793
1794         if (ipv4_is_lbcast(fl4->daddr))
1795                 type = RTN_BROADCAST;
1796         else if (ipv4_is_multicast(fl4->daddr))
1797                 type = RTN_MULTICAST;
1798         else if (ipv4_is_zeronet(fl4->daddr))
1799                 return ERR_PTR(-EINVAL);
1800
1801         if (dev_out->flags & IFF_LOOPBACK)
1802                 flags |= RTCF_LOCAL;
1803
1804         if (type == RTN_BROADCAST) {
1805                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1806                 fi = NULL;
1807         } else if (type == RTN_MULTICAST) {
1808                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1809                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1810                                      fl4->flowi4_proto))
1811                         flags &= ~RTCF_LOCAL;
1812                 /* If multicast route do not exist use
1813                  * default one, but do not gateway in this case.
1814                  * Yes, it is hack.
1815                  */
1816                 if (fi && res->prefixlen < 4)
1817                         fi = NULL;
1818         }
1819
1820         fnhe = NULL;
1821         if (fi) {
1822                 struct rtable __rcu **prth;
1823
1824                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1825                 if (fnhe)
1826                         prth = &fnhe->fnhe_rth;
1827                 else
1828                         prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1829                 rth = rcu_dereference(*prth);
1830                 if (rt_cache_valid(rth)) {
1831                         dst_hold(&rth->dst);
1832                         return rth;
1833                 }
1834         }
1835         rth = rt_dst_alloc(dev_out,
1836                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1837                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1838                            fi);
1839         if (!rth)
1840                 return ERR_PTR(-ENOBUFS);
1841
1842         rth->dst.output = ip_output;
1843
1844         rth->rt_genid = rt_genid(dev_net(dev_out));
1845         rth->rt_flags   = flags;
1846         rth->rt_type    = type;
1847         rth->rt_is_input = 0;
1848         rth->rt_iif     = orig_oif ? : 0;
1849         rth->rt_pmtu    = 0;
1850         rth->rt_gateway = 0;
1851         INIT_LIST_HEAD(&rth->rt_uncached);
1852
1853         RT_CACHE_STAT_INC(out_slow_tot);
1854
1855         if (flags & RTCF_LOCAL)
1856                 rth->dst.input = ip_local_deliver;
1857         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1858                 if (flags & RTCF_LOCAL &&
1859                     !(dev_out->flags & IFF_LOOPBACK)) {
1860                         rth->dst.output = ip_mc_output;
1861                         RT_CACHE_STAT_INC(out_slow_mc);
1862                 }
1863 #ifdef CONFIG_IP_MROUTE
1864                 if (type == RTN_MULTICAST) {
1865                         if (IN_DEV_MFORWARD(in_dev) &&
1866                             !ipv4_is_local_multicast(fl4->daddr)) {
1867                                 rth->dst.input = ip_mr_input;
1868                                 rth->dst.output = ip_mc_output;
1869                         }
1870                 }
1871 #endif
1872         }
1873
1874         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1875
1876         return rth;
1877 }
1878
1879 /*
1880  * Major route resolver routine.
1881  */
1882
1883 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1884 {
1885         struct net_device *dev_out = NULL;
1886         __u8 tos = RT_FL_TOS(fl4);
1887         unsigned int flags = 0;
1888         struct fib_result res;
1889         struct rtable *rth;
1890         int orig_oif;
1891
1892         res.tclassid    = 0;
1893         res.fi          = NULL;
1894         res.table       = NULL;
1895
1896         orig_oif = fl4->flowi4_oif;
1897
1898         fl4->flowi4_iif = net->loopback_dev->ifindex;
1899         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1902
1903         rcu_read_lock();
1904         if (fl4->saddr) {
1905                 rth = ERR_PTR(-EINVAL);
1906                 if (ipv4_is_multicast(fl4->saddr) ||
1907                     ipv4_is_lbcast(fl4->saddr) ||
1908                     ipv4_is_zeronet(fl4->saddr))
1909                         goto out;
1910
1911                 /* I removed check for oif == dev_out->oif here.
1912                    It was wrong for two reasons:
1913                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1914                       is assigned to multiple interfaces.
1915                    2. Moreover, we are allowed to send packets with saddr
1916                       of another iface. --ANK
1917                  */
1918
1919                 if (fl4->flowi4_oif == 0 &&
1920                     (ipv4_is_multicast(fl4->daddr) ||
1921                      ipv4_is_lbcast(fl4->daddr))) {
1922                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1923                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1924                         if (dev_out == NULL)
1925                                 goto out;
1926
1927                         /* Special hack: user can direct multicasts
1928                            and limited broadcast via necessary interface
1929                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1930                            This hack is not just for fun, it allows
1931                            vic,vat and friends to work.
1932                            They bind socket to loopback, set ttl to zero
1933                            and expect that it will work.
1934                            From the viewpoint of routing cache they are broken,
1935                            because we are not allowed to build multicast path
1936                            with loopback source addr (look, routing cache
1937                            cannot know, that ttl is zero, so that packet
1938                            will not leave this host and route is valid).
1939                            Luckily, this hack is good workaround.
1940                          */
1941
1942                         fl4->flowi4_oif = dev_out->ifindex;
1943                         goto make_route;
1944                 }
1945
1946                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1947                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1948                         if (!__ip_dev_find(net, fl4->saddr, false))
1949                                 goto out;
1950                 }
1951         }
1952
1953
1954         if (fl4->flowi4_oif) {
1955                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1956                 rth = ERR_PTR(-ENODEV);
1957                 if (dev_out == NULL)
1958                         goto out;
1959
1960                 /* RACE: Check return value of inet_select_addr instead. */
1961                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1962                         rth = ERR_PTR(-ENETUNREACH);
1963                         goto out;
1964                 }
1965                 if (ipv4_is_local_multicast(fl4->daddr) ||
1966                     ipv4_is_lbcast(fl4->daddr)) {
1967                         if (!fl4->saddr)
1968                                 fl4->saddr = inet_select_addr(dev_out, 0,
1969                                                               RT_SCOPE_LINK);
1970                         goto make_route;
1971                 }
1972                 if (fl4->saddr) {
1973                         if (ipv4_is_multicast(fl4->daddr))
1974                                 fl4->saddr = inet_select_addr(dev_out, 0,
1975                                                               fl4->flowi4_scope);
1976                         else if (!fl4->daddr)
1977                                 fl4->saddr = inet_select_addr(dev_out, 0,
1978                                                               RT_SCOPE_HOST);
1979                 }
1980         }
1981
1982         if (!fl4->daddr) {
1983                 fl4->daddr = fl4->saddr;
1984                 if (!fl4->daddr)
1985                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986                 dev_out = net->loopback_dev;
1987                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1988                 res.type = RTN_LOCAL;
1989                 flags |= RTCF_LOCAL;
1990                 goto make_route;
1991         }
1992
1993         if (fib_lookup(net, fl4, &res)) {
1994                 res.fi = NULL;
1995                 res.table = NULL;
1996                 if (fl4->flowi4_oif) {
1997                         /* Apparently, routing tables are wrong. Assume,
1998                            that the destination is on link.
1999
2000                            WHY? DW.
2001                            Because we are allowed to send to iface
2002                            even if it has NO routes and NO assigned
2003                            addresses. When oif is specified, routing
2004                            tables are looked up with only one purpose:
2005                            to catch if destination is gatewayed, rather than
2006                            direct. Moreover, if MSG_DONTROUTE is set,
2007                            we send packet, ignoring both routing tables
2008                            and ifaddr state. --ANK
2009
2010
2011                            We could make it even if oif is unknown,
2012                            likely IPv6, but we do not.
2013                          */
2014
2015                         if (fl4->saddr == 0)
2016                                 fl4->saddr = inet_select_addr(dev_out, 0,
2017                                                               RT_SCOPE_LINK);
2018                         res.type = RTN_UNICAST;
2019                         goto make_route;
2020                 }
2021                 rth = ERR_PTR(-ENETUNREACH);
2022                 goto out;
2023         }
2024
2025         if (res.type == RTN_LOCAL) {
2026                 if (!fl4->saddr) {
2027                         if (res.fi->fib_prefsrc)
2028                                 fl4->saddr = res.fi->fib_prefsrc;
2029                         else
2030                                 fl4->saddr = fl4->daddr;
2031                 }
2032                 dev_out = net->loopback_dev;
2033                 fl4->flowi4_oif = dev_out->ifindex;
2034                 res.fi = NULL;
2035                 flags |= RTCF_LOCAL;
2036                 goto make_route;
2037         }
2038
2039 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2040         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2041                 fib_select_multipath(&res);
2042         else
2043 #endif
2044         if (!res.prefixlen &&
2045             res.table->tb_num_default > 1 &&
2046             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2047                 fib_select_default(&res);
2048
2049         if (!fl4->saddr)
2050                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2051
2052         dev_out = FIB_RES_DEV(res);
2053         fl4->flowi4_oif = dev_out->ifindex;
2054
2055
2056 make_route:
2057         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2058
2059 out:
2060         rcu_read_unlock();
2061         return rth;
2062 }
2063 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2064
2065 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2066 {
2067         return NULL;
2068 }
2069
2070 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2071 {
2072         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2073
2074         return mtu ? : dst->dev->mtu;
2075 }
2076
2077 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2078                                           struct sk_buff *skb, u32 mtu)
2079 {
2080 }
2081
/* .redirect handler for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
	/* no-op */
}
2086
2087 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2088                                           unsigned long old)
2089 {
2090         return NULL;
2091 }
2092
2093 static struct dst_ops ipv4_dst_blackhole_ops = {
2094         .family                 =       AF_INET,
2095         .protocol               =       cpu_to_be16(ETH_P_IP),
2096         .check                  =       ipv4_blackhole_dst_check,
2097         .mtu                    =       ipv4_blackhole_mtu,
2098         .default_advmss         =       ipv4_default_advmss,
2099         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2100         .redirect               =       ipv4_rt_blackhole_redirect,
2101         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2102         .neigh_lookup           =       ipv4_neigh_lookup,
2103 };
2104
2105 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2106 {
2107         struct rtable *ort = (struct rtable *) dst_orig;
2108         struct rtable *rt;
2109
2110         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2111         if (rt) {
2112                 struct dst_entry *new = &rt->dst;
2113
2114                 new->__use = 1;
2115                 new->input = dst_discard;
2116                 new->output = dst_discard;
2117
2118                 new->dev = ort->dst.dev;
2119                 if (new->dev)
2120                         dev_hold(new->dev);
2121
2122                 rt->rt_is_input = ort->rt_is_input;
2123                 rt->rt_iif = ort->rt_iif;
2124                 rt->rt_pmtu = ort->rt_pmtu;
2125
2126                 rt->rt_genid = rt_genid(net);
2127                 rt->rt_flags = ort->rt_flags;
2128                 rt->rt_type = ort->rt_type;
2129                 rt->rt_gateway = ort->rt_gateway;
2130
2131                 INIT_LIST_HEAD(&rt->rt_uncached);
2132
2133                 dst_free(new);
2134         }
2135
2136         dst_release(dst_orig);
2137
2138         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2139 }
2140
2141 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2142                                     struct sock *sk)
2143 {
2144         struct rtable *rt = __ip_route_output_key(net, flp4);
2145
2146         if (IS_ERR(rt))
2147                 return rt;
2148
2149         if (flp4->flowi4_proto)
2150                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2151                                                    flowi4_to_flowi(flp4),
2152                                                    sk, 0);
2153
2154         return rt;
2155 }
2156 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157
2158 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2159                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2160                         u32 seq, int event, int nowait, unsigned int flags)
2161 {
2162         struct rtable *rt = skb_rtable(skb);
2163         struct rtmsg *r;
2164         struct nlmsghdr *nlh;
2165         unsigned long expires = 0;
2166         u32 error;
2167         u32 metrics[RTAX_MAX];
2168
2169         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2170         if (nlh == NULL)
2171                 return -EMSGSIZE;
2172
2173         r = nlmsg_data(nlh);
2174         r->rtm_family    = AF_INET;
2175         r->rtm_dst_len  = 32;
2176         r->rtm_src_len  = 0;
2177         r->rtm_tos      = fl4->flowi4_tos;
2178         r->rtm_table    = RT_TABLE_MAIN;
2179         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2180                 goto nla_put_failure;
2181         r->rtm_type     = rt->rt_type;
2182         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2183         r->rtm_protocol = RTPROT_UNSPEC;
2184         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2185         if (rt->rt_flags & RTCF_NOTIFY)
2186                 r->rtm_flags |= RTM_F_NOTIFY;
2187
2188         if (nla_put_be32(skb, RTA_DST, dst))
2189                 goto nla_put_failure;
2190         if (src) {
2191                 r->rtm_src_len = 32;
2192                 if (nla_put_be32(skb, RTA_SRC, src))
2193                         goto nla_put_failure;
2194         }
2195         if (rt->dst.dev &&
2196             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2197                 goto nla_put_failure;
2198 #ifdef CONFIG_IP_ROUTE_CLASSID
2199         if (rt->dst.tclassid &&
2200             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2201                 goto nla_put_failure;
2202 #endif
2203         if (!rt_is_input_route(rt) &&
2204             fl4->saddr != src) {
2205                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2206                         goto nla_put_failure;
2207         }
2208         if (rt->rt_gateway &&
2209             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210                 goto nla_put_failure;
2211
2212         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2213         if (rt->rt_pmtu)
2214                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2215         if (rtnetlink_put_metrics(skb, metrics) < 0)
2216                 goto nla_put_failure;
2217
2218         if (fl4->flowi4_mark &&
2219             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2220                 goto nla_put_failure;
2221
2222         error = rt->dst.error;
2223         expires = rt->dst.expires;
2224         if (expires) {
2225                 if (time_before(jiffies, expires))
2226                         expires -= jiffies;
2227                 else
2228                         expires = 0;
2229         }
2230
2231         if (rt_is_input_route(rt)) {
2232                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2233                         goto nla_put_failure;
2234         }
2235
2236         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2237                 goto nla_put_failure;
2238
2239         return nlmsg_end(skb, nlh);
2240
2241 nla_put_failure:
2242         nlmsg_cancel(skb, nlh);
2243         return -EMSGSIZE;
2244 }
2245
2246 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2247 {
2248         struct net *net = sock_net(in_skb->sk);
2249         struct rtmsg *rtm;
2250         struct nlattr *tb[RTA_MAX+1];
2251         struct rtable *rt = NULL;
2252         struct flowi4 fl4;
2253         __be32 dst = 0;
2254         __be32 src = 0;
2255         u32 iif;
2256         int err;
2257         int mark;
2258         struct sk_buff *skb;
2259
2260         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2261         if (err < 0)
2262                 goto errout;
2263
2264         rtm = nlmsg_data(nlh);
2265
2266         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2267         if (skb == NULL) {
2268                 err = -ENOBUFS;
2269                 goto errout;
2270         }
2271
2272         /* Reserve room for dummy headers, this skb can pass
2273            through good chunk of routing engine.
2274          */
2275         skb_reset_mac_header(skb);
2276         skb_reset_network_header(skb);
2277
2278         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2279         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2280         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2281
2282         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2283         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2284         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2285         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2286
2287         memset(&fl4, 0, sizeof(fl4));
2288         fl4.daddr = dst;
2289         fl4.saddr = src;
2290         fl4.flowi4_tos = rtm->rtm_tos;
2291         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2292         fl4.flowi4_mark = mark;
2293
2294         if (iif) {
2295                 struct net_device *dev;
2296
2297                 dev = __dev_get_by_index(net, iif);
2298                 if (dev == NULL) {
2299                         err = -ENODEV;
2300                         goto errout_free;
2301                 }
2302
2303                 skb->protocol   = htons(ETH_P_IP);
2304                 skb->dev        = dev;
2305                 skb->mark       = mark;
2306                 local_bh_disable();
2307                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2308                 local_bh_enable();
2309
2310                 rt = skb_rtable(skb);
2311                 if (err == 0 && rt->dst.error)
2312                         err = -rt->dst.error;
2313         } else {
2314                 rt = ip_route_output_key(net, &fl4);
2315
2316                 err = 0;
2317                 if (IS_ERR(rt))
2318                         err = PTR_ERR(rt);
2319         }
2320
2321         if (err)
2322                 goto errout_free;
2323
2324         skb_dst_set(skb, &rt->dst);
2325         if (rtm->rtm_flags & RTM_F_NOTIFY)
2326                 rt->rt_flags |= RTCF_NOTIFY;
2327
2328         err = rt_fill_info(net, dst, src, &fl4, skb,
2329                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2330                            RTM_NEWROUTE, 0, 0);
2331         if (err <= 0)
2332                 goto errout_free;
2333
2334         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2335 errout:
2336         return err;
2337
2338 errout_free:
2339         kfree_skb(skb);
2340         goto errout;
2341 }
2342
2343 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2344 {
2345         return skb->len;
2346 }
2347
2348 void ip_rt_multicast_event(struct in_device *in_dev)
2349 {
2350         rt_cache_flush(dev_net(in_dev->dev), 0);
2351 }
2352
2353 #ifdef CONFIG_SYSCTL
2354 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2355                                         void __user *buffer,
2356                                         size_t *lenp, loff_t *ppos)
2357 {
2358         if (write) {
2359                 int flush_delay;
2360                 ctl_table ctl;
2361                 struct net *net;
2362
2363                 memcpy(&ctl, __ctl, sizeof(ctl));
2364                 ctl.data = &flush_delay;
2365                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2366
2367                 net = (struct net *)__ctl->extra1;
2368                 rt_cache_flush(net, flush_delay);
2369                 return 0;
2370         }
2371
2372         return -EINVAL;
2373 }
2374
2375 static ctl_table ipv4_route_table[] = {
2376         {
2377                 .procname       = "gc_thresh",
2378                 .data           = &ipv4_dst_ops.gc_thresh,
2379                 .maxlen         = sizeof(int),
2380                 .mode           = 0644,
2381                 .proc_handler   = proc_dointvec,
2382         },
2383         {
2384                 .procname       = "max_size",
2385                 .data           = &ip_rt_max_size,
2386                 .maxlen         = sizeof(int),
2387                 .mode           = 0644,
2388                 .proc_handler   = proc_dointvec,
2389         },
2390         {
2391                 /*  Deprecated. Use gc_min_interval_ms */
2392
2393                 .procname       = "gc_min_interval",
2394                 .data           = &ip_rt_gc_min_interval,
2395                 .maxlen         = sizeof(int),
2396                 .mode           = 0644,
2397                 .proc_handler   = proc_dointvec_jiffies,
2398         },
2399         {
2400                 .procname       = "gc_min_interval_ms",
2401                 .data           = &ip_rt_gc_min_interval,
2402                 .maxlen         = sizeof(int),
2403                 .mode           = 0644,
2404                 .proc_handler   = proc_dointvec_ms_jiffies,
2405         },
2406         {
2407                 .procname       = "gc_timeout",
2408                 .data           = &ip_rt_gc_timeout,
2409                 .maxlen         = sizeof(int),
2410                 .mode           = 0644,
2411                 .proc_handler   = proc_dointvec_jiffies,
2412         },
2413         {
2414                 .procname       = "gc_interval",
2415                 .data           = &ip_rt_gc_interval,
2416                 .maxlen         = sizeof(int),
2417                 .mode           = 0644,
2418                 .proc_handler   = proc_dointvec_jiffies,
2419         },
2420         {
2421                 .procname       = "redirect_load",
2422                 .data           = &ip_rt_redirect_load,
2423                 .maxlen         = sizeof(int),
2424                 .mode           = 0644,
2425                 .proc_handler   = proc_dointvec,
2426         },
2427         {
2428                 .procname       = "redirect_number",
2429                 .data           = &ip_rt_redirect_number,
2430                 .maxlen         = sizeof(int),
2431                 .mode           = 0644,
2432                 .proc_handler   = proc_dointvec,
2433         },
2434         {
2435                 .procname       = "redirect_silence",
2436                 .data           = &ip_rt_redirect_silence,
2437                 .maxlen         = sizeof(int),
2438                 .mode           = 0644,
2439                 .proc_handler   = proc_dointvec,
2440         },
2441         {
2442                 .procname       = "error_cost",
2443                 .data           = &ip_rt_error_cost,
2444                 .maxlen         = sizeof(int),
2445                 .mode           = 0644,
2446                 .proc_handler   = proc_dointvec,
2447         },
2448         {
2449                 .procname       = "error_burst",
2450                 .data           = &ip_rt_error_burst,
2451                 .maxlen         = sizeof(int),
2452                 .mode           = 0644,
2453                 .proc_handler   = proc_dointvec,
2454         },
2455         {
2456                 .procname       = "gc_elasticity",
2457                 .data           = &ip_rt_gc_elasticity,
2458                 .maxlen         = sizeof(int),
2459                 .mode           = 0644,
2460                 .proc_handler   = proc_dointvec,
2461         },
2462         {
2463                 .procname       = "mtu_expires",
2464                 .data           = &ip_rt_mtu_expires,
2465                 .maxlen         = sizeof(int),
2466                 .mode           = 0644,
2467                 .proc_handler   = proc_dointvec_jiffies,
2468         },
2469         {
2470                 .procname       = "min_pmtu",
2471                 .data           = &ip_rt_min_pmtu,
2472                 .maxlen         = sizeof(int),
2473                 .mode           = 0644,
2474                 .proc_handler   = proc_dointvec,
2475         },
2476         {
2477                 .procname       = "min_adv_mss",
2478                 .data           = &ip_rt_min_advmss,
2479                 .maxlen         = sizeof(int),
2480                 .mode           = 0644,
2481                 .proc_handler   = proc_dointvec,
2482         },
2483         { }
2484 };
2485
2486 static struct ctl_table ipv4_route_flush_table[] = {
2487         {
2488                 .procname       = "flush",
2489                 .maxlen         = sizeof(int),
2490                 .mode           = 0200,
2491                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2492         },
2493         { },
2494 };
2495
2496 static __net_init int sysctl_route_net_init(struct net *net)
2497 {
2498         struct ctl_table *tbl;
2499
2500         tbl = ipv4_route_flush_table;
2501         if (!net_eq(net, &init_net)) {
2502                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2503                 if (tbl == NULL)
2504                         goto err_dup;
2505         }
2506         tbl[0].extra1 = net;
2507
2508         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2509         if (net->ipv4.route_hdr == NULL)
2510                 goto err_reg;
2511         return 0;
2512
2513 err_reg:
2514         if (tbl != ipv4_route_flush_table)
2515                 kfree(tbl);
2516 err_dup:
2517         return -ENOMEM;
2518 }
2519
2520 static __net_exit void sysctl_route_net_exit(struct net *net)
2521 {
2522         struct ctl_table *tbl;
2523
2524         tbl = net->ipv4.route_hdr->ctl_table_arg;
2525         unregister_net_sysctl_table(net->ipv4.route_hdr);
2526         BUG_ON(tbl == ipv4_route_flush_table);
2527         kfree(tbl);
2528 }
2529
2530 static __net_initdata struct pernet_operations sysctl_route_ops = {
2531         .init = sysctl_route_net_init,
2532         .exit = sysctl_route_net_exit,
2533 };
2534 #endif
2535
2536 static __net_init int rt_genid_init(struct net *net)
2537 {
2538         get_random_bytes(&net->ipv4.rt_genid,
2539                          sizeof(net->ipv4.rt_genid));
2540         get_random_bytes(&net->ipv4.dev_addr_genid,
2541                          sizeof(net->ipv4.dev_addr_genid));
2542         return 0;
2543 }
2544
2545 static __net_initdata struct pernet_operations rt_genid_ops = {
2546         .init = rt_genid_init,
2547 };
2548
2549 static int __net_init ipv4_inetpeer_init(struct net *net)
2550 {
2551         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2552
2553         if (!bp)
2554                 return -ENOMEM;
2555         inet_peer_base_init(bp);
2556         net->ipv4.peers = bp;
2557         return 0;
2558 }
2559
2560 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2561 {
2562         struct inet_peer_base *bp = net->ipv4.peers;
2563
2564         net->ipv4.peers = NULL;
2565         inetpeer_invalidate_tree(bp);
2566         kfree(bp);
2567 }
2568
2569 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2570         .init   =       ipv4_inetpeer_init,
2571         .exit   =       ipv4_inetpeer_exit,
2572 };
2573
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area, allocated by ip_rt_init() with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2577
2578 int __init ip_rt_init(void)
2579 {
2580         int rc = 0;
2581
2582 #ifdef CONFIG_IP_ROUTE_CLASSID
2583         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2584         if (!ip_rt_acct)
2585                 panic("IP: failed to allocate ip_rt_acct\n");
2586 #endif
2587
2588         ipv4_dst_ops.kmem_cachep =
2589                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2590                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2591
2592         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2593
2594         if (dst_entries_init(&ipv4_dst_ops) < 0)
2595                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2596
2597         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2598                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2599
2600         ipv4_dst_ops.gc_thresh = ~0;
2601         ip_rt_max_size = INT_MAX;
2602
2603         devinet_init();
2604         ip_fib_init();
2605
2606         if (ip_rt_proc_init())
2607                 pr_err("Unable to create route proc files\n");
2608 #ifdef CONFIG_XFRM
2609         xfrm_init();
2610         xfrm4_init(ip_rt_max_size);
2611 #endif
2612         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2613
2614 #ifdef CONFIG_SYSCTL
2615         register_pernet_subsys(&sysctl_route_ops);
2616 #endif
2617         register_pernet_subsys(&rt_genid_ops);
2618         register_pernet_subsys(&ipv4_inetpeer_ops);
2619         return rc;
2620 }
2621
2622 #ifdef CONFIG_SYSCTL
2623 /*
2624  * We really need to sanitize the damn ipv4 init order, then all
2625  * this nonsense will go away.
2626  */
2627 void __init ip_static_sysctl_init(void)
2628 {
2629         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2630 }
2631 #endif