ipv4: Kill rt->rt_oif
[linux-3.10.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
/*
 * Mask a flow key's flowi4_tos down to the IPTOS_RT_MASK bits plus the
 * RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound used for IPv4 MTU values in this file. */
#define IP_MAX_MTU              0xFFF0

/* Default for ip_rt_gc_timeout, expressed in jiffies (300 seconds). */
#define RT_GC_TIMEOUT           (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148                                            struct sk_buff *skb, u32 mtu);
149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150                                         struct sk_buff *skb);
151
/*
 * dst_ops ->ifdown hook for IPv4 routes.  Deliberately empty: no work is
 * done here when the callback fires.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        /* nothing to do */
}
156
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159         WARN_ON(1);
160         return NULL;
161 }
162
163 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
164                                            struct sk_buff *skb,
165                                            const void *daddr);
166
167 static struct dst_ops ipv4_dst_ops = {
168         .family =               AF_INET,
169         .protocol =             cpu_to_be16(ETH_P_IP),
170         .check =                ipv4_dst_check,
171         .default_advmss =       ipv4_default_advmss,
172         .mtu =                  ipv4_mtu,
173         .cow_metrics =          ipv4_cow_metrics,
174         .destroy =              ipv4_dst_destroy,
175         .ifdown =               ipv4_dst_ifdown,
176         .negative_advice =      ipv4_negative_advice,
177         .link_failure =         ipv4_link_failure,
178         .update_pmtu =          ip_rt_update_pmtu,
179         .redirect =             ip_do_redirect,
180         .local_out =            __ip_local_out,
181         .neigh_lookup =         ipv4_neigh_lookup,
182 };
183
184 #define ECN_OR_COST(class)      TC_PRIO_##class
185
186 const __u8 ip_tos2prio[16] = {
187         TC_PRIO_BESTEFFORT,
188         ECN_OR_COST(BESTEFFORT),
189         TC_PRIO_BESTEFFORT,
190         ECN_OR_COST(BESTEFFORT),
191         TC_PRIO_BULK,
192         ECN_OR_COST(BULK),
193         TC_PRIO_BULK,
194         ECN_OR_COST(BULK),
195         TC_PRIO_INTERACTIVE,
196         ECN_OR_COST(INTERACTIVE),
197         TC_PRIO_INTERACTIVE,
198         ECN_OR_COST(INTERACTIVE),
199         TC_PRIO_INTERACTIVE_BULK,
200         ECN_OR_COST(INTERACTIVE_BULK),
201         TC_PRIO_INTERACTIVE_BULK,
202         ECN_OR_COST(INTERACTIVE_BULK)
203 };
204 EXPORT_SYMBOL(ip_tos2prio);
205
206 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
207 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
208
209 static inline int rt_genid(struct net *net)
210 {
211         return atomic_read(&net->ipv4.rt_genid);
212 }
213
214 #ifdef CONFIG_PROC_FS
215 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
216 {
217         if (*pos)
218                 return NULL;
219         return SEQ_START_TOKEN;
220 }
221
222 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223 {
224         ++*pos;
225         return NULL;
226 }
227
/* seq_file ->stop: nothing was acquired in ->start, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}
231
232 static int rt_cache_seq_show(struct seq_file *seq, void *v)
233 {
234         if (v == SEQ_START_TOKEN)
235                 seq_printf(seq, "%-127s\n",
236                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
237                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
238                            "HHUptod\tSpecDst");
239         return 0;
240 }
241
242 static const struct seq_operations rt_cache_seq_ops = {
243         .start  = rt_cache_seq_start,
244         .next   = rt_cache_seq_next,
245         .stop   = rt_cache_seq_stop,
246         .show   = rt_cache_seq_show,
247 };
248
249 static int rt_cache_seq_open(struct inode *inode, struct file *file)
250 {
251         return seq_open(file, &rt_cache_seq_ops);
252 }
253
254 static const struct file_operations rt_cache_seq_fops = {
255         .owner   = THIS_MODULE,
256         .open    = rt_cache_seq_open,
257         .read    = seq_read,
258         .llseek  = seq_lseek,
259         .release = seq_release,
260 };
261
262
263 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
264 {
265         int cpu;
266
267         if (*pos == 0)
268                 return SEQ_START_TOKEN;
269
270         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277 }
278
279 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
280 {
281         int cpu;
282
283         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
284                 if (!cpu_possible(cpu))
285                         continue;
286                 *pos = cpu+1;
287                 return &per_cpu(rt_cache_stat, cpu);
288         }
289         return NULL;
290
291 }
292
/* seq_file ->stop: nothing was acquired in ->start, so nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}
297
298 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
299 {
300         struct rt_cache_stat *st = v;
301
302         if (v == SEQ_START_TOKEN) {
303                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
304                 return 0;
305         }
306
307         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
308                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
309                    dst_entries_get_slow(&ipv4_dst_ops),
310                    st->in_hit,
311                    st->in_slow_tot,
312                    st->in_slow_mc,
313                    st->in_no_route,
314                    st->in_brd,
315                    st->in_martian_dst,
316                    st->in_martian_src,
317
318                    st->out_hit,
319                    st->out_slow_tot,
320                    st->out_slow_mc,
321
322                    st->gc_total,
323                    st->gc_ignored,
324                    st->gc_goal_miss,
325                    st->gc_dst_overflow,
326                    st->in_hlist_search,
327                    st->out_hlist_search
328                 );
329         return 0;
330 }
331
332 static const struct seq_operations rt_cpu_seq_ops = {
333         .start  = rt_cpu_seq_start,
334         .next   = rt_cpu_seq_next,
335         .stop   = rt_cpu_seq_stop,
336         .show   = rt_cpu_seq_show,
337 };
338
339
340 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
341 {
342         return seq_open(file, &rt_cpu_seq_ops);
343 }
344
345 static const struct file_operations rt_cpu_seq_fops = {
346         .owner   = THIS_MODULE,
347         .open    = rt_cpu_seq_open,
348         .read    = seq_read,
349         .llseek  = seq_lseek,
350         .release = seq_release,
351 };
352
353 #ifdef CONFIG_IP_ROUTE_CLASSID
354 static int rt_acct_proc_show(struct seq_file *m, void *v)
355 {
356         struct ip_rt_acct *dst, *src;
357         unsigned int i, j;
358
359         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
360         if (!dst)
361                 return -ENOMEM;
362
363         for_each_possible_cpu(i) {
364                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
365                 for (j = 0; j < 256; j++) {
366                         dst[j].o_bytes   += src[j].o_bytes;
367                         dst[j].o_packets += src[j].o_packets;
368                         dst[j].i_bytes   += src[j].i_bytes;
369                         dst[j].i_packets += src[j].i_packets;
370                 }
371         }
372
373         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
374         kfree(dst);
375         return 0;
376 }
377
378 static int rt_acct_proc_open(struct inode *inode, struct file *file)
379 {
380         return single_open(file, rt_acct_proc_show, NULL);
381 }
382
383 static const struct file_operations rt_acct_proc_fops = {
384         .owner          = THIS_MODULE,
385         .open           = rt_acct_proc_open,
386         .read           = seq_read,
387         .llseek         = seq_lseek,
388         .release        = single_release,
389 };
390 #endif
391
392 static int __net_init ip_rt_do_proc_init(struct net *net)
393 {
394         struct proc_dir_entry *pde;
395
396         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
397                         &rt_cache_seq_fops);
398         if (!pde)
399                 goto err1;
400
401         pde = proc_create("rt_cache", S_IRUGO,
402                           net->proc_net_stat, &rt_cpu_seq_fops);
403         if (!pde)
404                 goto err2;
405
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
408         if (!pde)
409                 goto err3;
410 #endif
411         return 0;
412
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 err3:
415         remove_proc_entry("rt_cache", net->proc_net_stat);
416 #endif
417 err2:
418         remove_proc_entry("rt_cache", net->proc_net);
419 err1:
420         return -ENOMEM;
421 }
422
423 static void __net_exit ip_rt_do_proc_exit(struct net *net)
424 {
425         remove_proc_entry("rt_cache", net->proc_net_stat);
426         remove_proc_entry("rt_cache", net->proc_net);
427 #ifdef CONFIG_IP_ROUTE_CLASSID
428         remove_proc_entry("rt_acct", net->proc_net);
429 #endif
430 }
431
432 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
433         .init = ip_rt_do_proc_init,
434         .exit = ip_rt_do_proc_exit,
435 };
436
437 static int __init ip_rt_proc_init(void)
438 {
439         return register_pernet_subsys(&ip_rt_proc_ops);
440 }
441
442 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
447 #endif /* CONFIG_PROC_FS */
448
449 static inline int rt_is_expired(struct rtable *rth)
450 {
451         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
452 }
453
454 /*
455  * Perturbation of rt_genid by a small quantity [1..256]
456  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
457  * many times (2^24) without giving recent rt_genid.
458  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
459  */
460 static void rt_cache_invalidate(struct net *net)
461 {
462         unsigned char shuffle;
463
464         get_random_bytes(&shuffle, sizeof(shuffle));
465         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
466 }
467
/*
 * Invalidate all cached routes of @net by moving its generation id.
 *
 * @delay is accepted for interface compatibility but is not looked at
 * by this implementation: every call does the same thing.
 */
void rt_cache_flush(struct net *net, int delay)
{
        rt_cache_invalidate(net);
}
476
477 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
478                                            struct sk_buff *skb,
479                                            const void *daddr)
480 {
481         struct net_device *dev = dst->dev;
482         const __be32 *pkey = daddr;
483         const struct rtable *rt;
484         struct neighbour *n;
485
486         rt = (const struct rtable *) dst;
487         if (rt->rt_gateway)
488                 pkey = (const __be32 *) &rt->rt_gateway;
489         else if (skb)
490                 pkey = &ip_hdr(skb)->daddr;
491
492         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
493         if (n)
494                 return n;
495         return neigh_create(&arp_tbl, pkey, dev);
496 }
497
498 /*
499  * Peer allocation may fail only in serious out-of-memory conditions.  However
500  * we still can generate some output.
501  * Random ID selection looks a bit dangerous because we have no chances to
502  * select ID being unique in a reasonable period of time.
503  * But broken packet identifier may be better than no packet at all.
504  */
505 static void ip_select_fb_ident(struct iphdr *iph)
506 {
507         static DEFINE_SPINLOCK(ip_fb_id_lock);
508         static u32 ip_fallback_id;
509         u32 salt;
510
511         spin_lock_bh(&ip_fb_id_lock);
512         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
513         iph->id = htons(salt & 0xFFFF);
514         ip_fallback_id = salt;
515         spin_unlock_bh(&ip_fb_id_lock);
516 }
517
518 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
519 {
520         struct net *net = dev_net(dst->dev);
521         struct inet_peer *peer;
522
523         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
524         if (peer) {
525                 iph->id = htons(inet_getid(peer, more));
526                 inet_putpeer(peer);
527                 return;
528         }
529
530         ip_select_fb_ident(iph);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
535                              const struct iphdr *iph,
536                              int oif, u8 tos,
537                              u8 prot, u32 mark, int flow_flags)
538 {
539         if (sk) {
540                 const struct inet_sock *inet = inet_sk(sk);
541
542                 oif = sk->sk_bound_dev_if;
543                 mark = sk->sk_mark;
544                 tos = RT_CONN_FLAGS(sk);
545                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
546         }
547         flowi4_init_output(fl4, oif, mark, tos,
548                            RT_SCOPE_UNIVERSE, prot,
549                            flow_flags,
550                            iph->daddr, iph->saddr, 0, 0);
551 }
552
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554                                const struct sock *sk)
555 {
556         const struct iphdr *iph = ip_hdr(skb);
557         int oif = skb->dev->ifindex;
558         u8 tos = RT_TOS(iph->tos);
559         u8 prot = iph->protocol;
560         u32 mark = skb->mark;
561
562         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
563 }
564
565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
566 {
567         const struct inet_sock *inet = inet_sk(sk);
568         const struct ip_options_rcu *inet_opt;
569         __be32 daddr = inet->inet_daddr;
570
571         rcu_read_lock();
572         inet_opt = rcu_dereference(inet->inet_opt);
573         if (inet_opt && inet_opt->opt.srr)
574                 daddr = inet_opt->opt.faddr;
575         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578                            inet_sk_flowi_flags(sk),
579                            daddr, inet->inet_saddr, 0, 0);
580         rcu_read_unlock();
581 }
582
/* Build a flow key from @skb when there is one, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }
        build_skb_flow_key(fl4, skb, sk);
}
591
/* Taken for writing by update_or_create_fnhe() around exception updates. */
static DEFINE_SEQLOCK(fnhe_seqlock);
593
594 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
595 {
596         struct fib_nh_exception *fnhe, *oldest;
597
598         oldest = rcu_dereference(hash->chain);
599         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
600              fnhe = rcu_dereference(fnhe->fnhe_next)) {
601                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
602                         oldest = fnhe;
603         }
604         return oldest;
605 }
606
607 static inline u32 fnhe_hashfun(__be32 daddr)
608 {
609         u32 hval;
610
611         hval = (__force u32) daddr;
612         hval ^= (hval >> 11) ^ (hval >> 22);
613
614         return hval & (FNHE_HASH_SIZE - 1);
615 }
616
617 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
618                                   u32 pmtu, unsigned long expires)
619 {
620         struct fnhe_hash_bucket *hash;
621         struct fib_nh_exception *fnhe;
622         int depth;
623         u32 hval = fnhe_hashfun(daddr);
624
625         write_seqlock_bh(&fnhe_seqlock);
626
627         hash = nh->nh_exceptions;
628         if (!hash) {
629                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
630                 if (!hash)
631                         goto out_unlock;
632                 nh->nh_exceptions = hash;
633         }
634
635         hash += hval;
636
637         depth = 0;
638         for (fnhe = rcu_dereference(hash->chain); fnhe;
639              fnhe = rcu_dereference(fnhe->fnhe_next)) {
640                 if (fnhe->fnhe_daddr == daddr)
641                         break;
642                 depth++;
643         }
644
645         if (fnhe) {
646                 if (gw)
647                         fnhe->fnhe_gw = gw;
648                 if (pmtu) {
649                         fnhe->fnhe_pmtu = pmtu;
650                         fnhe->fnhe_expires = expires;
651                 }
652         } else {
653                 if (depth > FNHE_RECLAIM_DEPTH)
654                         fnhe = fnhe_oldest(hash);
655                 else {
656                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
657                         if (!fnhe)
658                                 goto out_unlock;
659
660                         fnhe->fnhe_next = hash->chain;
661                         rcu_assign_pointer(hash->chain, fnhe);
662                 }
663                 fnhe->fnhe_daddr = daddr;
664                 fnhe->fnhe_gw = gw;
665                 fnhe->fnhe_pmtu = pmtu;
666                 fnhe->fnhe_expires = expires;
667         }
668
669         fnhe->fnhe_stamp = jiffies;
670
671 out_unlock:
672         write_sequnlock_bh(&fnhe_seqlock);
673         return;
674 }
675
676 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
677                              bool kill_route)
678 {
679         __be32 new_gw = icmp_hdr(skb)->un.gateway;
680         __be32 old_gw = ip_hdr(skb)->saddr;
681         struct net_device *dev = skb->dev;
682         struct in_device *in_dev;
683         struct fib_result res;
684         struct neighbour *n;
685         struct net *net;
686
687         switch (icmp_hdr(skb)->code & 7) {
688         case ICMP_REDIR_NET:
689         case ICMP_REDIR_NETTOS:
690         case ICMP_REDIR_HOST:
691         case ICMP_REDIR_HOSTTOS:
692                 break;
693
694         default:
695                 return;
696         }
697
698         if (rt->rt_gateway != old_gw)
699                 return;
700
701         in_dev = __in_dev_get_rcu(dev);
702         if (!in_dev)
703                 return;
704
705         net = dev_net(dev);
706         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
707             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
708             ipv4_is_zeronet(new_gw))
709                 goto reject_redirect;
710
711         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
712                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
713                         goto reject_redirect;
714                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
715                         goto reject_redirect;
716         } else {
717                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
718                         goto reject_redirect;
719         }
720
721         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
722         if (n) {
723                 if (!(n->nud_state & NUD_VALID)) {
724                         neigh_event_send(n, NULL);
725                 } else {
726                         if (fib_lookup(net, fl4, &res) == 0) {
727                                 struct fib_nh *nh = &FIB_RES_NH(res);
728
729                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
730                                                       0, 0);
731                         }
732                         if (kill_route)
733                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
734                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
735                 }
736                 neigh_release(n);
737         }
738         return;
739
740 reject_redirect:
741 #ifdef CONFIG_IP_ROUTE_VERBOSE
742         if (IN_DEV_LOG_MARTIANS(in_dev)) {
743                 const struct iphdr *iph = (const struct iphdr *) skb->data;
744                 __be32 daddr = iph->daddr;
745                 __be32 saddr = iph->saddr;
746
747                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
748                                      "  Advised path = %pI4 -> %pI4\n",
749                                      &old_gw, dev->name, &new_gw,
750                                      &saddr, &daddr);
751         }
752 #endif
753         ;
754 }
755
756 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
757 {
758         struct rtable *rt;
759         struct flowi4 fl4;
760
761         rt = (struct rtable *) dst;
762
763         ip_rt_build_flow_key(&fl4, sk, skb);
764         __ip_do_redirect(rt, skb, &fl4, true);
765 }
766
767 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
768 {
769         struct rtable *rt = (struct rtable *)dst;
770         struct dst_entry *ret = dst;
771
772         if (rt) {
773                 if (dst->obsolete > 0) {
774                         ip_rt_put(rt);
775                         ret = NULL;
776                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
777                            rt->dst.expires) {
778                         ip_rt_put(rt);
779                         ret = NULL;
780                 }
781         }
782         return ret;
783 }
784
785 /*
786  * Algorithm:
787  *      1. The first ip_rt_redirect_number redirects are sent
788  *         with exponential backoff, then we stop sending them at all,
789  *         assuming that the host ignores our redirects.
790  *      2. If we did not see packets requiring redirects
791  *         during ip_rt_redirect_silence, we assume that the host
792  *         forgot redirected route and start to send redirects again.
793  *
794  * This algorithm is much cheaper and more intelligent than dumb load limiting
795  * in icmp.c.
796  *
797  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
798  * and "frag. need" (breaks PMTU discovery) in icmp.c.
799  */
800
801 void ip_rt_send_redirect(struct sk_buff *skb)
802 {
803         struct rtable *rt = skb_rtable(skb);
804         struct in_device *in_dev;
805         struct inet_peer *peer;
806         struct net *net;
807         int log_martians;
808
809         rcu_read_lock();
810         in_dev = __in_dev_get_rcu(rt->dst.dev);
811         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
812                 rcu_read_unlock();
813                 return;
814         }
815         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
816         rcu_read_unlock();
817
818         net = dev_net(rt->dst.dev);
819         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
820         if (!peer) {
821                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
822                 return;
823         }
824
825         /* No redirected packets during ip_rt_redirect_silence;
826          * reset the algorithm.
827          */
828         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
829                 peer->rate_tokens = 0;
830
831         /* Too many ignored redirects; do not send anything
832          * set dst.rate_last to the last seen redirected packet.
833          */
834         if (peer->rate_tokens >= ip_rt_redirect_number) {
835                 peer->rate_last = jiffies;
836                 goto out_put_peer;
837         }
838
839         /* Check for load limit; set rate_last to the latest sent
840          * redirect.
841          */
842         if (peer->rate_tokens == 0 ||
843             time_after(jiffies,
844                        (peer->rate_last +
845                         (ip_rt_redirect_load << peer->rate_tokens)))) {
846                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
847                 peer->rate_last = jiffies;
848                 ++peer->rate_tokens;
849 #ifdef CONFIG_IP_ROUTE_VERBOSE
850                 if (log_martians &&
851                     peer->rate_tokens == ip_rt_redirect_number)
852                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
853                                              &ip_hdr(skb)->saddr, rt->rt_iif,
854                                              &ip_hdr(skb)->daddr, &rt->rt_gateway);
855 #endif
856         }
857 out_put_peer:
858         inet_putpeer(peer);
859 }
860
861 static int ip_error(struct sk_buff *skb)
862 {
863         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
864         struct rtable *rt = skb_rtable(skb);
865         struct inet_peer *peer;
866         unsigned long now;
867         struct net *net;
868         bool send;
869         int code;
870
871         net = dev_net(rt->dst.dev);
872         if (!IN_DEV_FORWARD(in_dev)) {
873                 switch (rt->dst.error) {
874                 case EHOSTUNREACH:
875                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
876                         break;
877
878                 case ENETUNREACH:
879                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
880                         break;
881                 }
882                 goto out;
883         }
884
885         switch (rt->dst.error) {
886         case EINVAL:
887         default:
888                 goto out;
889         case EHOSTUNREACH:
890                 code = ICMP_HOST_UNREACH;
891                 break;
892         case ENETUNREACH:
893                 code = ICMP_NET_UNREACH;
894                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
895                 break;
896         case EACCES:
897                 code = ICMP_PKT_FILTERED;
898                 break;
899         }
900
901         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
902
903         send = true;
904         if (peer) {
905                 now = jiffies;
906                 peer->rate_tokens += now - peer->rate_last;
907                 if (peer->rate_tokens > ip_rt_error_burst)
908                         peer->rate_tokens = ip_rt_error_burst;
909                 peer->rate_last = now;
910                 if (peer->rate_tokens >= ip_rt_error_cost)
911                         peer->rate_tokens -= ip_rt_error_cost;
912                 else
913                         send = false;
914                 inet_putpeer(peer);
915         }
916         if (send)
917                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
918
919 out:    kfree_skb(skb);
920         return 0;
921 }
922
923 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
924 {
925         struct fib_result res;
926
927         if (mtu < ip_rt_min_pmtu)
928                 mtu = ip_rt_min_pmtu;
929
930         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
931                 struct fib_nh *nh = &FIB_RES_NH(res);
932
933                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
934                                       jiffies + ip_rt_mtu_expires);
935         }
936         return mtu;
937 }
938
939 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
940                               struct sk_buff *skb, u32 mtu)
941 {
942         struct rtable *rt = (struct rtable *) dst;
943         struct flowi4 fl4;
944
945         ip_rt_build_flow_key(&fl4, sk, skb);
946         mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
947
948         if (!rt->rt_pmtu) {
949                 dst->obsolete = DST_OBSOLETE_KILL;
950         } else {
951                 rt->rt_pmtu = mtu;
952                 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
953         }
954 }
955
956 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
957                       int oif, u32 mark, u8 protocol, int flow_flags)
958 {
959         const struct iphdr *iph = (const struct iphdr *) skb->data;
960         struct flowi4 fl4;
961         struct rtable *rt;
962
963         __build_flow_key(&fl4, NULL, iph, oif,
964                          RT_TOS(iph->tos), protocol, mark, flow_flags);
965         rt = __ip_route_output_key(net, &fl4);
966         if (!IS_ERR(rt)) {
967                 __ip_rt_update_pmtu(rt, &fl4, mtu);
968                 ip_rt_put(rt);
969         }
970 }
971 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
972
973 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
974 {
975         const struct iphdr *iph = (const struct iphdr *) skb->data;
976         struct flowi4 fl4;
977         struct rtable *rt;
978
979         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
980         rt = __ip_route_output_key(sock_net(sk), &fl4);
981         if (!IS_ERR(rt)) {
982                 __ip_rt_update_pmtu(rt, &fl4, mtu);
983                 ip_rt_put(rt);
984         }
985 }
986 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
987
988 void ipv4_redirect(struct sk_buff *skb, struct net *net,
989                    int oif, u32 mark, u8 protocol, int flow_flags)
990 {
991         const struct iphdr *iph = (const struct iphdr *) skb->data;
992         struct flowi4 fl4;
993         struct rtable *rt;
994
995         __build_flow_key(&fl4, NULL, iph, oif,
996                          RT_TOS(iph->tos), protocol, mark, flow_flags);
997         rt = __ip_route_output_key(net, &fl4);
998         if (!IS_ERR(rt)) {
999                 __ip_do_redirect(rt, skb, &fl4, false);
1000                 ip_rt_put(rt);
1001         }
1002 }
1003 EXPORT_SYMBOL_GPL(ipv4_redirect);
1004
1005 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1006 {
1007         const struct iphdr *iph = (const struct iphdr *) skb->data;
1008         struct flowi4 fl4;
1009         struct rtable *rt;
1010
1011         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1012         rt = __ip_route_output_key(sock_net(sk), &fl4);
1013         if (!IS_ERR(rt)) {
1014                 __ip_do_redirect(rt, skb, &fl4, false);
1015                 ip_rt_put(rt);
1016         }
1017 }
1018 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1019
1020 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1021 {
1022         struct rtable *rt = (struct rtable *) dst;
1023
1024         /* All IPV4 dsts are created with ->obsolete set to the value
1025          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1026          * into this function always.
1027          *
1028          * When a PMTU/redirect information update invalidates a
1029          * route, this is indicated by setting obsolete to
1030          * DST_OBSOLETE_KILL.
1031          */
1032         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1033                 return NULL;
1034         return dst;
1035 }
1036
1037 static void ipv4_dst_destroy(struct dst_entry *dst)
1038 {
1039         struct rtable *rt = (struct rtable *) dst;
1040
1041         if (rt->fi) {
1042                 fib_info_put(rt->fi);
1043                 rt->fi = NULL;
1044         }
1045 }
1046
1047
1048 static void ipv4_link_failure(struct sk_buff *skb)
1049 {
1050         struct rtable *rt;
1051
1052         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1053
1054         rt = skb_rtable(skb);
1055         if (rt)
1056                 dst_set_expires(&rt->dst, 0);
1057 }
1058
1059 static int ip_rt_bug(struct sk_buff *skb)
1060 {
1061         pr_debug("%s: %pI4 -> %pI4, %s\n",
1062                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1063                  skb->dev ? skb->dev->name : "?");
1064         kfree_skb(skb);
1065         WARN_ON(1);
1066         return 0;
1067 }
1068
1069 /*
1070    We do not cache source address of outgoing interface,
1071    because it is used only by IP RR, TS and SRR options,
1072    so that it out of fast path.
1073
1074    BTW remember: "addr" is allowed to be not aligned
1075    in IP options!
1076  */
1077
1078 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1079 {
1080         __be32 src;
1081
1082         if (rt_is_output_route(rt))
1083                 src = ip_hdr(skb)->saddr;
1084         else {
1085                 struct fib_result res;
1086                 struct flowi4 fl4;
1087                 struct iphdr *iph;
1088
1089                 iph = ip_hdr(skb);
1090
1091                 memset(&fl4, 0, sizeof(fl4));
1092                 fl4.daddr = iph->daddr;
1093                 fl4.saddr = iph->saddr;
1094                 fl4.flowi4_tos = RT_TOS(iph->tos);
1095                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1096                 fl4.flowi4_iif = skb->dev->ifindex;
1097                 fl4.flowi4_mark = skb->mark;
1098
1099                 rcu_read_lock();
1100                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1101                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1102                 else
1103                         src = inet_select_addr(rt->dst.dev,
1104                                                rt_nexthop(rt, iph->daddr),
1105                                                RT_SCOPE_UNIVERSE);
1106                 rcu_read_unlock();
1107         }
1108         memcpy(addr, &src, 4);
1109 }
1110
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid: each 16-bit half is taken from
 * @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1120
1121 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1122 {
1123         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1124
1125         if (advmss == 0) {
1126                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1127                                ip_rt_min_advmss);
1128                 if (advmss > 65535 - 40)
1129                         advmss = 65535 - 40;
1130         }
1131         return advmss;
1132 }
1133
1134 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1135 {
1136         const struct rtable *rt = (const struct rtable *) dst;
1137         unsigned int mtu = rt->rt_pmtu;
1138
1139         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1140                 mtu = 0;
1141
1142         if (!mtu)
1143                 mtu = dst_metric_raw(dst, RTAX_MTU);
1144
1145         if (mtu && rt_is_output_route(rt))
1146                 return mtu;
1147
1148         mtu = dst->dev->mtu;
1149
1150         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1151                 if (rt->rt_gateway && mtu > 576)
1152                         mtu = 576;
1153         }
1154
1155         if (mtu > IP_MAX_MTU)
1156                 mtu = IP_MAX_MTU;
1157
1158         return mtu;
1159 }
1160
1161 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1162 {
1163         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1164                 rt->fi = fi;
1165                 atomic_inc(&fi->fib_clntref);
1166         }
1167         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1168 }
1169
1170 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1171 {
1172         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1173         struct fib_nh_exception *fnhe;
1174         u32 hval;
1175
1176         if (!hash)
1177                 return NULL;
1178
1179         hval = fnhe_hashfun(daddr);
1180
1181         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1182              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1183                 if (fnhe->fnhe_daddr == daddr)
1184                         return fnhe;
1185         }
1186         return NULL;
1187 }
1188
1189 static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1190                               __be32 daddr)
1191 {
1192         __be32 fnhe_daddr, gw;
1193         unsigned long expires;
1194         unsigned int seq;
1195         u32 pmtu;
1196
1197 restart:
1198         seq = read_seqbegin(&fnhe_seqlock);
1199         fnhe_daddr = fnhe->fnhe_daddr;
1200         gw = fnhe->fnhe_gw;
1201         pmtu = fnhe->fnhe_pmtu;
1202         expires = fnhe->fnhe_expires;
1203         if (read_seqretry(&fnhe_seqlock, seq))
1204                 goto restart;
1205
1206         if (daddr != fnhe_daddr)
1207                 return;
1208
1209         if (pmtu) {
1210                 unsigned long diff = expires - jiffies;
1211
1212                 if (time_before(jiffies, expires)) {
1213                         rt->rt_pmtu = pmtu;
1214                         dst_set_expires(&rt->dst, diff);
1215                 }
1216         }
1217         if (gw) {
1218                 rt->rt_flags |= RTCF_REDIRECTED;
1219                 rt->rt_gateway = gw;
1220         }
1221         fnhe->fnhe_stamp = jiffies;
1222 }
1223
1224 static inline void rt_release_rcu(struct rcu_head *head)
1225 {
1226         struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1227         dst_release(dst);
1228 }
1229
1230 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1231 {
1232         struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1233
1234         if (rt_is_input_route(rt))
1235                 p = &nh->nh_rth_input;
1236
1237         orig = *p;
1238
1239         prev = cmpxchg(p, orig, rt);
1240         if (prev == orig) {
1241                 dst_clone(&rt->dst);
1242                 if (orig)
1243                         call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
1244         }
1245 }
1246
1247 static bool rt_cache_valid(struct rtable *rt)
1248 {
1249         return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
1250 }
1251
1252 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1253                            const struct fib_result *res,
1254                            struct fib_nh_exception *fnhe,
1255                            struct fib_info *fi, u16 type, u32 itag)
1256 {
1257         if (fi) {
1258                 struct fib_nh *nh = &FIB_RES_NH(*res);
1259
1260                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1261                         rt->rt_gateway = nh->nh_gw;
1262                 if (unlikely(fnhe))
1263                         rt_bind_exception(rt, fnhe, daddr);
1264                 rt_init_metrics(rt, fi);
1265 #ifdef CONFIG_IP_ROUTE_CLASSID
1266                 rt->dst.tclassid = nh->nh_tclassid;
1267 #endif
1268                 if (!(rt->dst.flags & DST_HOST))
1269                         rt_cache_route(nh, rt);
1270         }
1271
1272 #ifdef CONFIG_IP_ROUTE_CLASSID
1273 #ifdef CONFIG_IP_MULTIPLE_TABLES
1274         set_class_tag(rt, res->tclassid);
1275 #endif
1276         set_class_tag(rt, itag);
1277 #endif
1278 }
1279
1280 static struct rtable *rt_dst_alloc(struct net_device *dev,
1281                                    bool nopolicy, bool noxfrm, bool will_cache)
1282 {
1283         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1284                          (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
1285                          (nopolicy ? DST_NOPOLICY : 0) |
1286                          (noxfrm ? DST_NOXFRM : 0));
1287 }
1288
1289 /* called in rcu_read_lock() section */
1290 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1291                                 u8 tos, struct net_device *dev, int our)
1292 {
1293         struct rtable *rth;
1294         struct in_device *in_dev = __in_dev_get_rcu(dev);
1295         u32 itag = 0;
1296         int err;
1297
1298         /* Primary sanity checks. */
1299
1300         if (in_dev == NULL)
1301                 return -EINVAL;
1302
1303         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1304             skb->protocol != htons(ETH_P_IP))
1305                 goto e_inval;
1306
1307         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1308                 if (ipv4_is_loopback(saddr))
1309                         goto e_inval;
1310
1311         if (ipv4_is_zeronet(saddr)) {
1312                 if (!ipv4_is_local_multicast(daddr))
1313                         goto e_inval;
1314         } else {
1315                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1316                                           in_dev, &itag);
1317                 if (err < 0)
1318                         goto e_err;
1319         }
1320         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1321                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1322         if (!rth)
1323                 goto e_nobufs;
1324
1325 #ifdef CONFIG_IP_ROUTE_CLASSID
1326         rth->dst.tclassid = itag;
1327 #endif
1328         rth->dst.output = ip_rt_bug;
1329
1330         rth->rt_genid   = rt_genid(dev_net(dev));
1331         rth->rt_flags   = RTCF_MULTICAST;
1332         rth->rt_type    = RTN_MULTICAST;
1333         rth->rt_route_iif = dev->ifindex;
1334         rth->rt_iif     = dev->ifindex;
1335         rth->rt_pmtu    = 0;
1336         rth->rt_gateway = 0;
1337         rth->fi = NULL;
1338         if (our) {
1339                 rth->dst.input= ip_local_deliver;
1340                 rth->rt_flags |= RTCF_LOCAL;
1341         }
1342
1343 #ifdef CONFIG_IP_MROUTE
1344         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1345                 rth->dst.input = ip_mr_input;
1346 #endif
1347         RT_CACHE_STAT_INC(in_slow_mc);
1348
1349         skb_dst_set(skb, &rth->dst);
1350         return 0;
1351
1352 e_nobufs:
1353         return -ENOBUFS;
1354 e_inval:
1355         return -EINVAL;
1356 e_err:
1357         return err;
1358 }
1359
1360
1361 static void ip_handle_martian_source(struct net_device *dev,
1362                                      struct in_device *in_dev,
1363                                      struct sk_buff *skb,
1364                                      __be32 daddr,
1365                                      __be32 saddr)
1366 {
1367         RT_CACHE_STAT_INC(in_martian_src);
1368 #ifdef CONFIG_IP_ROUTE_VERBOSE
1369         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1370                 /*
1371                  *      RFC1812 recommendation, if source is martian,
1372                  *      the only hint is MAC header.
1373                  */
1374                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1375                         &daddr, &saddr, dev->name);
1376                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1377                         print_hex_dump(KERN_WARNING, "ll header: ",
1378                                        DUMP_PREFIX_OFFSET, 16, 1,
1379                                        skb_mac_header(skb),
1380                                        dev->hard_header_len, true);
1381                 }
1382         }
1383 #endif
1384 }
1385
1386 /* called in rcu_read_lock() section */
1387 static int __mkroute_input(struct sk_buff *skb,
1388                            const struct fib_result *res,
1389                            struct in_device *in_dev,
1390                            __be32 daddr, __be32 saddr, u32 tos,
1391                            struct rtable **result)
1392 {
1393         struct rtable *rth;
1394         int err;
1395         struct in_device *out_dev;
1396         unsigned int flags = 0;
1397         bool do_cache;
1398         u32 itag;
1399
1400         /* get a working reference to the output device */
1401         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1402         if (out_dev == NULL) {
1403                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1404                 return -EINVAL;
1405         }
1406
1407
1408         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1409                                   in_dev->dev, in_dev, &itag);
1410         if (err < 0) {
1411                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1412                                          saddr);
1413
1414                 goto cleanup;
1415         }
1416
1417         if (err)
1418                 flags |= RTCF_DIRECTSRC;
1419
1420         if (out_dev == in_dev && err &&
1421             (IN_DEV_SHARED_MEDIA(out_dev) ||
1422              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1423                 flags |= RTCF_DOREDIRECT;
1424
1425         if (skb->protocol != htons(ETH_P_IP)) {
1426                 /* Not IP (i.e. ARP). Do not create route, if it is
1427                  * invalid for proxy arp. DNAT routes are always valid.
1428                  *
1429                  * Proxy arp feature have been extended to allow, ARP
1430                  * replies back to the same interface, to support
1431                  * Private VLAN switch technologies. See arp.c.
1432                  */
1433                 if (out_dev == in_dev &&
1434                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1435                         err = -EINVAL;
1436                         goto cleanup;
1437                 }
1438         }
1439
1440         do_cache = false;
1441         if (res->fi) {
1442                 if (!(flags & RTCF_DIRECTSRC) && !itag) {
1443                         rth = FIB_RES_NH(*res).nh_rth_input;
1444                         if (rt_cache_valid(rth)) {
1445                                 dst_hold(&rth->dst);
1446                                 goto out;
1447                         }
1448                         do_cache = true;
1449                 }
1450         }
1451
1452         rth = rt_dst_alloc(out_dev->dev,
1453                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1454                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1455         if (!rth) {
1456                 err = -ENOBUFS;
1457                 goto cleanup;
1458         }
1459
1460         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1461         rth->rt_flags = flags;
1462         rth->rt_type = res->type;
1463         rth->rt_route_iif = in_dev->dev->ifindex;
1464         rth->rt_iif     = in_dev->dev->ifindex;
1465         rth->rt_pmtu    = 0;
1466         rth->rt_gateway = 0;
1467         rth->fi = NULL;
1468
1469         rth->dst.input = ip_forward;
1470         rth->dst.output = ip_output;
1471
1472         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1473 out:
1474         *result = rth;
1475         err = 0;
1476  cleanup:
1477         return err;
1478 }
1479
1480 static int ip_mkroute_input(struct sk_buff *skb,
1481                             struct fib_result *res,
1482                             const struct flowi4 *fl4,
1483                             struct in_device *in_dev,
1484                             __be32 daddr, __be32 saddr, u32 tos)
1485 {
1486         struct rtable *rth = NULL;
1487         int err;
1488
1489 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1490         if (res->fi && res->fi->fib_nhs > 1)
1491                 fib_select_multipath(res);
1492 #endif
1493
1494         /* create a routing cache entry */
1495         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1496         if (err)
1497                 return err;
1498
1499         skb_dst_set(skb, &rth->dst);
1500         return 0;
1501 }
1502
1503 /*
1504  *      NOTE. We drop all the packets that has local source
1505  *      addresses, because every properly looped back packet
1506  *      must have correct destination already attached by output routine.
1507  *
1508  *      Such approach solves two big problems:
1509  *      1. Not simplex devices are handled properly.
1510  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1511  *      called with rcu_read_lock()
1512  */
1513
1514 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1515                                u8 tos, struct net_device *dev)
1516 {
1517         struct fib_result res;
1518         struct in_device *in_dev = __in_dev_get_rcu(dev);
1519         struct flowi4   fl4;
1520         unsigned int    flags = 0;
1521         u32             itag = 0;
1522         struct rtable   *rth;
1523         int             err = -EINVAL;
1524         struct net    *net = dev_net(dev);
1525         bool do_cache;
1526
1527         /* IP on this device is disabled. */
1528
1529         if (!in_dev)
1530                 goto out;
1531
1532         /* Check for the most weird martians, which can be not detected
1533            by fib_lookup.
1534          */
1535
1536         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1537                 goto martian_source;
1538
1539         res.fi = NULL;
1540         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1541                 goto brd_input;
1542
1543         /* Accept zero addresses only to limited broadcast;
1544          * I even do not know to fix it or not. Waiting for complains :-)
1545          */
1546         if (ipv4_is_zeronet(saddr))
1547                 goto martian_source;
1548
1549         if (ipv4_is_zeronet(daddr))
1550                 goto martian_destination;
1551
1552         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1553                 if (ipv4_is_loopback(daddr))
1554                         goto martian_destination;
1555
1556                 if (ipv4_is_loopback(saddr))
1557                         goto martian_source;
1558         }
1559
1560         /*
1561          *      Now we are ready to route packet.
1562          */
1563         fl4.flowi4_oif = 0;
1564         fl4.flowi4_iif = dev->ifindex;
1565         fl4.flowi4_mark = skb->mark;
1566         fl4.flowi4_tos = tos;
1567         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1568         fl4.daddr = daddr;
1569         fl4.saddr = saddr;
1570         err = fib_lookup(net, &fl4, &res);
1571         if (err != 0)
1572                 goto no_route;
1573
1574         RT_CACHE_STAT_INC(in_slow_tot);
1575
1576         if (res.type == RTN_BROADCAST)
1577                 goto brd_input;
1578
1579         if (res.type == RTN_LOCAL) {
1580                 err = fib_validate_source(skb, saddr, daddr, tos,
1581                                           net->loopback_dev->ifindex,
1582                                           dev, in_dev, &itag);
1583                 if (err < 0)
1584                         goto martian_source_keep_err;
1585                 if (err)
1586                         flags |= RTCF_DIRECTSRC;
1587                 goto local_input;
1588         }
1589
1590         if (!IN_DEV_FORWARD(in_dev))
1591                 goto no_route;
1592         if (res.type != RTN_UNICAST)
1593                 goto martian_destination;
1594
1595         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1596 out:    return err;
1597
1598 brd_input:
1599         if (skb->protocol != htons(ETH_P_IP))
1600                 goto e_inval;
1601
1602         if (!ipv4_is_zeronet(saddr)) {
1603                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1604                                           in_dev, &itag);
1605                 if (err < 0)
1606                         goto martian_source_keep_err;
1607                 if (err)
1608                         flags |= RTCF_DIRECTSRC;
1609         }
1610         flags |= RTCF_BROADCAST;
1611         res.type = RTN_BROADCAST;
1612         RT_CACHE_STAT_INC(in_brd);
1613
1614 local_input:
1615         do_cache = false;
1616         if (res.fi) {
1617                 if (!(flags & RTCF_DIRECTSRC) && !itag) {
1618                         rth = FIB_RES_NH(res).nh_rth_input;
1619                         if (rt_cache_valid(rth)) {
1620                                 dst_hold(&rth->dst);
1621                                 goto set_and_out;
1622                         }
1623                         do_cache = true;
1624                 }
1625         }
1626
1627         rth = rt_dst_alloc(net->loopback_dev,
1628                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1629         if (!rth)
1630                 goto e_nobufs;
1631
1632         rth->dst.input= ip_local_deliver;
1633         rth->dst.output= ip_rt_bug;
1634 #ifdef CONFIG_IP_ROUTE_CLASSID
1635         rth->dst.tclassid = itag;
1636 #endif
1637
1638         rth->rt_genid = rt_genid(net);
1639         rth->rt_flags   = flags|RTCF_LOCAL;
1640         rth->rt_type    = res.type;
1641         rth->rt_route_iif = dev->ifindex;
1642         rth->rt_iif     = dev->ifindex;
1643         rth->rt_pmtu    = 0;
1644         rth->rt_gateway = 0;
1645         rth->fi = NULL;
1646         if (res.type == RTN_UNREACHABLE) {
1647                 rth->dst.input= ip_error;
1648                 rth->dst.error= -err;
1649                 rth->rt_flags   &= ~RTCF_LOCAL;
1650         }
1651         if (do_cache)
1652                 rt_cache_route(&FIB_RES_NH(res), rth);
1653 set_and_out:
1654         skb_dst_set(skb, &rth->dst);
1655         err = 0;
1656         goto out;
1657
1658 no_route:
1659         RT_CACHE_STAT_INC(in_no_route);
1660         res.type = RTN_UNREACHABLE;
1661         if (err == -ESRCH)
1662                 err = -ENETUNREACH;
1663         goto local_input;
1664
1665         /*
1666          *      Do not cache martian addresses: they should be logged (RFC1812)
1667          */
1668 martian_destination:
1669         RT_CACHE_STAT_INC(in_martian_dst);
1670 #ifdef CONFIG_IP_ROUTE_VERBOSE
1671         if (IN_DEV_LOG_MARTIANS(in_dev))
1672                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1673                                      &daddr, &saddr, dev->name);
1674 #endif
1675
1676 e_inval:
1677         err = -EINVAL;
1678         goto out;
1679
1680 e_nobufs:
1681         err = -ENOBUFS;
1682         goto out;
1683
1684 martian_source:
1685         err = -EINVAL;
1686 martian_source_keep_err:
1687         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1688         goto out;
1689 }
1690
1691 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692                    u8 tos, struct net_device *dev)
1693 {
1694         int res;
1695
1696         rcu_read_lock();
1697
1698         /* Multicast recognition logic is moved from route cache to here.
1699            The problem was that too many Ethernet cards have broken/missing
1700            hardware multicast filters :-( As result the host on multicasting
1701            network acquires a lot of useless route cache entries, sort of
1702            SDR messages from all the world. Now we try to get rid of them.
1703            Really, provided software IP multicast filter is organized
1704            reasonably (at least, hashed), it does not result in a slowdown
1705            comparing with route cache reject entries.
1706            Note, that multicast routers are not affected, because
1707            route cache entry is created eventually.
1708          */
1709         if (ipv4_is_multicast(daddr)) {
1710                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1711
1712                 if (in_dev) {
1713                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1714                                                   ip_hdr(skb)->protocol);
1715                         if (our
1716 #ifdef CONFIG_IP_MROUTE
1717                                 ||
1718                             (!ipv4_is_local_multicast(daddr) &&
1719                              IN_DEV_MFORWARD(in_dev))
1720 #endif
1721                            ) {
1722                                 int res = ip_route_input_mc(skb, daddr, saddr,
1723                                                             tos, dev, our);
1724                                 rcu_read_unlock();
1725                                 return res;
1726                         }
1727                 }
1728                 rcu_read_unlock();
1729                 return -EINVAL;
1730         }
1731         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1732         rcu_read_unlock();
1733         return res;
1734 }
1735 EXPORT_SYMBOL(ip_route_input);
1736
1737 /* called with rcu_read_lock() */
/*
 * Create (or reuse) the output route for flow @fl4 leaving via @dev_out.
 *
 * @res:      FIB lookup result; res->fi, res->type and res->prefixlen are read.
 * @fl4:      flow key; saddr, daddr and flowi4_proto are read.
 * @orig_oif: caller's original output ifindex; stored in rt_iif when non-zero,
 *            otherwise dev_out->ifindex is stored instead.
 * @dev_out:  device the route transmits on.
 * @flags:    initial RTCF_* flags; further bits are OR-ed in below.
 *
 * Returns the rtable on success (either the nexthop's cached output route
 * with an extra dst_hold(), or a freshly allocated one), ERR_PTR(-EINVAL)
 * when the device has no in_device or the addresses are rejected below, and
 * ERR_PTR(-ENOBUFS) when rt_dst_alloc() fails.
 */
1738 static struct rtable *__mkroute_output(const struct fib_result *res,
1739                                        const struct flowi4 *fl4, int orig_oif,
1740                                        struct net_device *dev_out,
1741                                        unsigned int flags)
1742 {
1743         struct fib_info *fi = res->fi;
1744         struct fib_nh_exception *fnhe;
1745         struct in_device *in_dev;
1746         u16 type = res->type;
1747         struct rtable *rth;
1748
1749         in_dev = __in_dev_get_rcu(dev_out);
1750         if (!in_dev)
1751                 return ERR_PTR(-EINVAL);
1752
             /* A loopback source address on a non-loopback device is refused
              * unless IN_DEV_ROUTE_LOCALNET() is set for the device.
              */
1753         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1754                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1755                         return ERR_PTR(-EINVAL);
1756
             /* The destination address class overrides the type taken from
              * the FIB result; a zeronet destination is rejected outright.
              */
1757         if (ipv4_is_lbcast(fl4->daddr))
1758                 type = RTN_BROADCAST;
1759         else if (ipv4_is_multicast(fl4->daddr))
1760                 type = RTN_MULTICAST;
1761         else if (ipv4_is_zeronet(fl4->daddr))
1762                 return ERR_PTR(-EINVAL);
1763
1764         if (dev_out->flags & IFF_LOOPBACK)
1765                 flags |= RTCF_LOCAL;
1766
1767         if (type == RTN_BROADCAST) {
1768                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1769                 fi = NULL;
1770         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL is kept only if ip_check_mc_rcu() reports a match. */
1771                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1772                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1773                                      fl4->flowi4_proto))
1774                         flags &= ~RTCF_LOCAL;
1775                 /* If multicast route do not exist use
1776                  * default one, but do not gateway in this case.
1777                  * Yes, it is hack.
1778                  */
1779                 if (fi && res->prefixlen < 4)
1780                         fi = NULL;
1781         }
1782
             /* With a fib_info and no nexthop exception for this daddr, reuse
              * the output route stored on the nexthop when rt_cache_valid()
              * accepts it; the caller receives its own dst reference.
              */
1783         fnhe = NULL;
1784         if (fi) {
1785                 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1786                 if (!fnhe) {
1787                         rth = FIB_RES_NH(*res).nh_rth_output;
1788                         if (rt_cache_valid(rth)) {
1789                                 dst_hold(&rth->dst);
1790                                 return rth;
1791                         }
1792                 }
1793         }
             /* NOTE(review): the last argument is true only when a fib_info is
              * present and no exception matched - presumably it tells
              * rt_dst_alloc() the route may be cached; confirm against its
              * definition.
              */
1794         rth = rt_dst_alloc(dev_out,
1795                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1796                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1797                            fi && !fnhe);
1798         if (!rth)
1799                 return ERR_PTR(-ENOBUFS);
1800
1801         rth->dst.output = ip_output;
1802
1803         rth->rt_genid = rt_genid(dev_net(dev_out));
1804         rth->rt_flags   = flags;
1805         rth->rt_type    = type;
1806         rth->rt_route_iif = 0;
1807         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
1808         rth->rt_pmtu    = 0;
1809         rth->rt_gateway = 0;
1810         rth->fi = NULL;
1811
1812         RT_CACHE_STAT_INC(out_slow_tot);
1813
             /* Default handlers set above/below are overridden for local,
              * broadcast and multicast routes.
              */
1814         if (flags & RTCF_LOCAL)
1815                 rth->dst.input = ip_local_deliver;
1816         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1817                 if (flags & RTCF_LOCAL &&
1818                     !(dev_out->flags & IFF_LOOPBACK)) {
1819                         rth->dst.output = ip_mc_output;
1820                         RT_CACHE_STAT_INC(out_slow_mc);
1821                 }
1822 #ifdef CONFIG_IP_MROUTE
1823                 if (type == RTN_MULTICAST) {
1824                         if (IN_DEV_MFORWARD(in_dev) &&
1825                             !ipv4_is_local_multicast(fl4->daddr)) {
1826                                 rth->dst.input = ip_mr_input;
1827                                 rth->dst.output = ip_mc_output;
1828                         }
1829                 }
1830 #endif
1831         }
1832
1833         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1834
1835         return rth;
1836 }
1837
1838 /*
1839  * Major route resolver routine.
1840  */
1841
1842 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1843 {
1844         struct net_device *dev_out = NULL;
1845         __u8 tos = RT_FL_TOS(fl4);
1846         unsigned int flags = 0;
1847         struct fib_result res;
1848         struct rtable *rth;
1849         int orig_oif;
1850
1851         res.tclassid    = 0;
1852         res.fi          = NULL;
1853         res.table       = NULL;
1854
1855         orig_oif = fl4->flowi4_oif;
1856
1857         fl4->flowi4_iif = net->loopback_dev->ifindex;
1858         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1859         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1860                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1861
1862         rcu_read_lock();
1863         if (fl4->saddr) {
1864                 rth = ERR_PTR(-EINVAL);
1865                 if (ipv4_is_multicast(fl4->saddr) ||
1866                     ipv4_is_lbcast(fl4->saddr) ||
1867                     ipv4_is_zeronet(fl4->saddr))
1868                         goto out;
1869
1870                 /* I removed check for oif == dev_out->oif here.
1871                    It was wrong for two reasons:
1872                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1873                       is assigned to multiple interfaces.
1874                    2. Moreover, we are allowed to send packets with saddr
1875                       of another iface. --ANK
1876                  */
1877
1878                 if (fl4->flowi4_oif == 0 &&
1879                     (ipv4_is_multicast(fl4->daddr) ||
1880                      ipv4_is_lbcast(fl4->daddr))) {
1881                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1882                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1883                         if (dev_out == NULL)
1884                                 goto out;
1885
1886                         /* Special hack: user can direct multicasts
1887                            and limited broadcast via necessary interface
1888                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1889                            This hack is not just for fun, it allows
1890                            vic,vat and friends to work.
1891                            They bind socket to loopback, set ttl to zero
1892                            and expect that it will work.
1893                            From the viewpoint of routing cache they are broken,
1894                            because we are not allowed to build multicast path
1895                            with loopback source addr (look, routing cache
1896                            cannot know, that ttl is zero, so that packet
1897                            will not leave this host and route is valid).
1898                            Luckily, this hack is good workaround.
1899                          */
1900
1901                         fl4->flowi4_oif = dev_out->ifindex;
1902                         goto make_route;
1903                 }
1904
1905                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1906                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1907                         if (!__ip_dev_find(net, fl4->saddr, false))
1908                                 goto out;
1909                 }
1910         }
1911
1912
1913         if (fl4->flowi4_oif) {
1914                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1915                 rth = ERR_PTR(-ENODEV);
1916                 if (dev_out == NULL)
1917                         goto out;
1918
1919                 /* RACE: Check return value of inet_select_addr instead. */
1920                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1921                         rth = ERR_PTR(-ENETUNREACH);
1922                         goto out;
1923                 }
1924                 if (ipv4_is_local_multicast(fl4->daddr) ||
1925                     ipv4_is_lbcast(fl4->daddr)) {
1926                         if (!fl4->saddr)
1927                                 fl4->saddr = inet_select_addr(dev_out, 0,
1928                                                               RT_SCOPE_LINK);
1929                         goto make_route;
1930                 }
1931                 if (fl4->saddr) {
1932                         if (ipv4_is_multicast(fl4->daddr))
1933                                 fl4->saddr = inet_select_addr(dev_out, 0,
1934                                                               fl4->flowi4_scope);
1935                         else if (!fl4->daddr)
1936                                 fl4->saddr = inet_select_addr(dev_out, 0,
1937                                                               RT_SCOPE_HOST);
1938                 }
1939         }
1940
1941         if (!fl4->daddr) {
1942                 fl4->daddr = fl4->saddr;
1943                 if (!fl4->daddr)
1944                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1945                 dev_out = net->loopback_dev;
1946                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1947                 res.type = RTN_LOCAL;
1948                 flags |= RTCF_LOCAL;
1949                 goto make_route;
1950         }
1951
1952         if (fib_lookup(net, fl4, &res)) {
1953                 res.fi = NULL;
1954                 res.table = NULL;
1955                 if (fl4->flowi4_oif) {
1956                         /* Apparently, routing tables are wrong. Assume,
1957                            that the destination is on link.
1958
1959                            WHY? DW.
1960                            Because we are allowed to send to iface
1961                            even if it has NO routes and NO assigned
1962                            addresses. When oif is specified, routing
1963                            tables are looked up with only one purpose:
1964                            to catch if destination is gatewayed, rather than
1965                            direct. Moreover, if MSG_DONTROUTE is set,
1966                            we send packet, ignoring both routing tables
1967                            and ifaddr state. --ANK
1968
1969
1970                            We could make it even if oif is unknown,
1971                            likely IPv6, but we do not.
1972                          */
1973
1974                         if (fl4->saddr == 0)
1975                                 fl4->saddr = inet_select_addr(dev_out, 0,
1976                                                               RT_SCOPE_LINK);
1977                         res.type = RTN_UNICAST;
1978                         goto make_route;
1979                 }
1980                 rth = ERR_PTR(-ENETUNREACH);
1981                 goto out;
1982         }
1983
1984         if (res.type == RTN_LOCAL) {
1985                 if (!fl4->saddr) {
1986                         if (res.fi->fib_prefsrc)
1987                                 fl4->saddr = res.fi->fib_prefsrc;
1988                         else
1989                                 fl4->saddr = fl4->daddr;
1990                 }
1991                 dev_out = net->loopback_dev;
1992                 fl4->flowi4_oif = dev_out->ifindex;
1993                 res.fi = NULL;
1994                 flags |= RTCF_LOCAL;
1995                 goto make_route;
1996         }
1997
1998 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1999         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2000                 fib_select_multipath(&res);
2001         else
2002 #endif
2003         if (!res.prefixlen &&
2004             res.table->tb_num_default > 1 &&
2005             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2006                 fib_select_default(&res);
2007
2008         if (!fl4->saddr)
2009                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2010
2011         dev_out = FIB_RES_DEV(res);
2012         fl4->flowi4_oif = dev_out->ifindex;
2013
2014
2015 make_route:
2016         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2017
2018 out:
2019         rcu_read_unlock();
2020         return rth;
2021 }
2022 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2023
/*
 * .check hook of ipv4_dst_blackhole_ops: ignores both arguments and always
 * returns NULL.
 */
2024 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2025 {
2026         return NULL;
2027 }
2028
/*
 * .mtu hook of ipv4_dst_blackhole_ops: returns the raw RTAX_MTU metric of
 * @dst when it is non-zero, otherwise the MTU of the dst's device.
 */
2029 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2030 {
2031         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2032
2033         return mtu ? : dst->dev->mtu;
2034 }
2035
/*
 * .update_pmtu hook of ipv4_dst_blackhole_ops: intentionally empty, so PMTU
 * updates on a blackhole route are ignored.
 */
2036 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2037                                           struct sk_buff *skb, u32 mtu)
2038 {
2039 }
2040
/*
 * .redirect hook of ipv4_dst_blackhole_ops: intentionally empty, so
 * redirects on a blackhole route are ignored.
 */
2041 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2042                                        struct sk_buff *skb)
2043 {
2044 }
2045
/*
 * .cow_metrics hook of ipv4_dst_blackhole_ops: never provides a writable
 * metrics array; always returns NULL.
 */
2046 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2047                                           unsigned long old)
2048 {
2049         return NULL;
2050 }
2051
/*
 * dst_ops used for routes allocated by ipv4_blackhole_route().  The
 * check/mtu/update_pmtu/redirect/cow_metrics hooks are the blackhole stubs
 * defined above; destroy, default_advmss and neigh_lookup reuse the regular
 * IPv4 handlers.  kmem_cachep is assigned in ip_rt_init().
 */
2052 static struct dst_ops ipv4_dst_blackhole_ops = {
2053         .family                 =       AF_INET,
2054         .protocol               =       cpu_to_be16(ETH_P_IP),
2055         .destroy                =       ipv4_dst_destroy,
2056         .check                  =       ipv4_blackhole_dst_check,
2057         .mtu                    =       ipv4_blackhole_mtu,
2058         .default_advmss         =       ipv4_default_advmss,
2059         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2060         .redirect               =       ipv4_rt_blackhole_redirect,
2061         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2062         .neigh_lookup           =       ipv4_neigh_lookup,
2063 };
2064
/*
 * Build a blackhole counterpart of the route @dst_orig.
 *
 * A new rtable is allocated from ipv4_dst_blackhole_ops with both input and
 * output set to dst_discard.  The device, interface indexes, PMTU, flags,
 * type, gateway and fib_info pointer are copied from the original route; the
 * generation id is taken from @net.  References are taken on the device and
 * on the fib_info when they are present.
 *
 * The reference on @dst_orig is dropped in every case, including allocation
 * failure.
 *
 * Returns the new dst_entry, or ERR_PTR(-ENOMEM) if dst_alloc() fails.
 */
2065 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2066 {
2067         struct rtable *ort = (struct rtable *) dst_orig;
2068         struct rtable *rt;
2069
2070         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2071         if (rt) {
2072                 struct dst_entry *new = &rt->dst;
2073
2074                 new->__use = 1;
2075                 new->input = dst_discard;
2076                 new->output = dst_discard;
2077
2078                 new->dev = ort->dst.dev;
2079                 if (new->dev)
2080                         dev_hold(new->dev);
2081
2082                 rt->rt_route_iif = ort->rt_route_iif;
2083                 rt->rt_iif = ort->rt_iif;
2084                 rt->rt_pmtu = ort->rt_pmtu;
2085
2086                 rt->rt_genid = rt_genid(net);
2087                 rt->rt_flags = ort->rt_flags;
2088                 rt->rt_type = ort->rt_type;
2089                 rt->rt_gateway = ort->rt_gateway;
2090                 rt->fi = ort->fi;
2091                 if (rt->fi)
2092                         atomic_inc(&rt->fi->fib_clntref);
2093
                     /* NOTE(review): dst_free() is called on the entry that is
                      * then returned to the caller - presumably it only marks
                      * the dst for release once its refcount drops; confirm
                      * against dst_free().
                      */
2094                 dst_free(new);
2095         }
2096
2097         dst_release(dst_orig);
2098
2099         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2100 }
2101
/*
 * Resolve an output route for @flp4 in @net and, when the flow carries a
 * protocol (flowi4_proto != 0), pass the result through xfrm_lookup() on
 * behalf of @sk.
 *
 * Returns the ERR_PTR() from __ip_route_output_key() unchanged on lookup
 * failure; otherwise the route, or whatever xfrm_lookup() returns cast to
 * struct rtable *.
 */
2102 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2103                                     struct sock *sk)
2104 {
2105         struct rtable *rt = __ip_route_output_key(net, flp4);
2106
2107         if (IS_ERR(rt))
2108                 return rt;
2109
2110         if (flp4->flowi4_proto)
2111                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2112                                                    flowi4_to_flowi(flp4),
2113                                                    sk, 0);
2114
2115         return rt;
2116 }
2117 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2118
/*
 * Serialise the route attached to @skb (skb_rtable(skb)) into a netlink
 * route message appended to that same @skb.
 *
 * @net:    not referenced in this function body.
 * @dst:    destination address reported as RTA_DST.
 * @src:    source address; reported as RTA_SRC only when non-zero.
 * @fl4:    flow key; supplies rtm_tos, the RTA_PREFSRC candidate and the mark.
 * @skb:    message buffer, which also carries the route being described.
 * @pid, @seq, @event, @flags: passed straight to nlmsg_put().
 * @nowait: not referenced in this function body.
 *
 * Returns the value of nlmsg_end() on success, or -EMSGSIZE when the header
 * or any attribute does not fit (the partial message is cancelled).
 */
2119 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2120                         struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2121                         u32 seq, int event, int nowait, unsigned int flags)
2122 {
2123         struct rtable *rt = skb_rtable(skb);
2124         struct rtmsg *r;
2125         struct nlmsghdr *nlh;
2126         unsigned long expires = 0;
2127         u32 error;
2128         u32 metrics[RTAX_MAX];
2129
2130         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2131         if (nlh == NULL)
2132                 return -EMSGSIZE;
2133
             /* Fixed rtmsg header: always a /32 destination in the main table. */
2134         r = nlmsg_data(nlh);
2135         r->rtm_family    = AF_INET;
2136         r->rtm_dst_len  = 32;
2137         r->rtm_src_len  = 0;
2138         r->rtm_tos      = fl4->flowi4_tos;
2139         r->rtm_table    = RT_TABLE_MAIN;
2140         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2141                 goto nla_put_failure;
2142         r->rtm_type     = rt->rt_type;
2143         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2144         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits above the low 16 of rt_flags are exported. */
2145         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2146         if (rt->rt_flags & RTCF_NOTIFY)
2147                 r->rtm_flags |= RTM_F_NOTIFY;
2148
2149         if (nla_put_be32(skb, RTA_DST, dst))
2150                 goto nla_put_failure;
2151         if (src) {
2152                 r->rtm_src_len = 32;
2153                 if (nla_put_be32(skb, RTA_SRC, src))
2154                         goto nla_put_failure;
2155         }
2156         if (rt->dst.dev &&
2157             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2158                 goto nla_put_failure;
2159 #ifdef CONFIG_IP_ROUTE_CLASSID
2160         if (rt->dst.tclassid &&
2161             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2162                 goto nla_put_failure;
2163 #endif
             /* For output routes, report the flow's source address as the
              * preferred source when it differs from the requested @src.
              */
2164         if (!rt_is_input_route(rt) &&
2165             fl4->saddr != src) {
2166                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2167                         goto nla_put_failure;
2168         }
2169         if (rt->rt_gateway &&
2170             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2171                 goto nla_put_failure;
2172
             /* Work on a copy of the metrics so a non-zero rt_pmtu can replace
              * the MTU slot without touching the route's own metrics.
              */
2173         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2174         if (rt->rt_pmtu)
2175                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2176         if (rtnetlink_put_metrics(skb, metrics) < 0)
2177                 goto nla_put_failure;
2178
2179         if (fl4->flowi4_mark &&
2180             nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2181                 goto nla_put_failure;
2182
             /* Convert the absolute expiry into time remaining (0 if past). */
2183         error = rt->dst.error;
2184         expires = rt->dst.expires;
2185         if (expires) {
2186                 if (time_before(jiffies, expires))
2187                         expires -= jiffies;
2188                 else
2189                         expires = 0;
2190         }
2191
2192         if (rt_is_input_route(rt)) {
2193                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2194                         goto nla_put_failure;
2195         }
2196
2197         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2198                 goto nla_put_failure;
2199
2200         return nlmsg_end(skb, nlh);
2201
2202 nla_put_failure:
2203         nlmsg_cancel(skb, nlh);
2204         return -EMSGSIZE;
2205 }
2206
/*
 * Netlink handler registered for PF_INET / RTM_GETROUTE in ip_rt_init().
 *
 * Parses the request attributes (RTA_SRC, RTA_DST, RTA_IIF, RTA_OIF,
 * RTA_MARK), resolves the route, and replies to the sender with a single
 * RTM_NEWROUTE message built by rt_fill_info().
 *
 * When RTA_IIF is given the route is resolved as an input route through
 * ip_route_input() on a dummy skb; otherwise an output lookup is done with
 * ip_route_output_key().
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the nlmsg_parse() error, -ENOBUFS if the reply skb cannot be allocated,
 * -ENODEV if the input interface does not exist, or the lookup / fill error.
 */
2207 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2208 {
2209         struct net *net = sock_net(in_skb->sk);
2210         struct rtmsg *rtm;
2211         struct nlattr *tb[RTA_MAX+1];
2212         struct rtable *rt = NULL;
2213         struct flowi4 fl4;
2214         __be32 dst = 0;
2215         __be32 src = 0;
2216         u32 iif;
2217         int err;
2218         int mark;
2219         struct sk_buff *skb;
2220
2221         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2222         if (err < 0)
2223                 goto errout;
2224
2225         rtm = nlmsg_data(nlh);
2226
2227         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2228         if (skb == NULL) {
2229                 err = -ENOBUFS;
2230                 goto errout;
2231         }
2232
2233         /* Reserve room for dummy headers, this skb can pass
2234            through good chunk of routing engine.
2235          */
2236         skb_reset_mac_header(skb);
2237         skb_reset_network_header(skb);
2238
2239         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2240         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2241         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2242
             /* Absent attributes default to 0. */
2243         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2244         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2245         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2246         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2247
2248         memset(&fl4, 0, sizeof(fl4));
2249         fl4.daddr = dst;
2250         fl4.saddr = src;
2251         fl4.flowi4_tos = rtm->rtm_tos;
2252         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2253         fl4.flowi4_mark = mark;
2254
2255         if (iif) {
2256                 struct net_device *dev;
2257
2258                 dev = __dev_get_by_index(net, iif);
2259                 if (dev == NULL) {
2260                         err = -ENODEV;
2261                         goto errout_free;
2262                 }
2263
2264                 skb->protocol   = htons(ETH_P_IP);
2265                 skb->dev        = dev;
2266                 skb->mark       = mark;
                     /* The input lookup is run with bottom halves disabled. */
2267                 local_bh_disable();
2268                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2269                 local_bh_enable();
2270
                     /* A successful lookup may still yield an error route;
                      * report its dst.error as a negative errno.
                      */
2271                 rt = skb_rtable(skb);
2272                 if (err == 0 && rt->dst.error)
2273                         err = -rt->dst.error;
2274         } else {
2275                 rt = ip_route_output_key(net, &fl4);
2276
2277                 err = 0;
2278                 if (IS_ERR(rt))
2279                         err = PTR_ERR(rt);
2280         }
2281
2282         if (err)
2283                 goto errout_free;
2284
2285         skb_dst_set(skb, &rt->dst);
2286         if (rtm->rtm_flags & RTM_F_NOTIFY)
2287                 rt->rt_flags |= RTCF_NOTIFY;
2288
2289         err = rt_fill_info(net, dst, src, &fl4, skb,
2290                            NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2291                            RTM_NEWROUTE, 0, 0);
             /* Anything that is not a positive length is treated as failure. */
2292         if (err <= 0)
2293                 goto errout_free;
2294
2295         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2296 errout:
2297         return err;
2298
2299 errout_free:
2300         kfree_skb(skb);
2301         goto errout;
2302 }
2303
/*
 * Netlink dump callback for the route cache: adds nothing to @skb and simply
 * returns its current length.  @cb is unused.
 */
2304 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2305 {
2306         return skb->len;
2307 }
2308
/*
 * Multicast state of @in_dev changed: flush the route cache of the network
 * namespace that owns the device, with a delay argument of 0.
 */
2309 void ip_rt_multicast_event(struct in_device *in_dev)
2310 {
2311         rt_cache_flush(dev_net(in_dev->dev), 0);
2312 }
2313
2314 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the write-only "flush" sysctl (see
 * ipv4_route_flush_table).
 *
 * On write, the integer written by the user is parsed into a local
 * flush_delay through a stack copy of the ctl_table, and the route cache of
 * the namespace stored in __ctl->extra1 is flushed with that delay.
 *
 * Returns 0 on write, -EINVAL on read.
 */
2315 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2316                                         void __user *buffer,
2317                                         size_t *lenp, loff_t *ppos)
2318 {
2319         if (write) {
2320                 int flush_delay;
2321                 ctl_table ctl;
2322                 struct net *net;
2323
                     /* Parse into a local variable via a private copy of the
                      * table entry, so the shared table is not modified.
                      */
2324                 memcpy(&ctl, __ctl, sizeof(ctl));
2325                 ctl.data = &flush_delay;
                     /* NOTE(review): the return value of proc_dointvec() is not
                      * checked; if it stores nothing, flush_delay is passed on
                      * uninitialised - confirm whether rt_cache_flush() cares.
                      */
2326                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2327
2328                 net = (struct net *)__ctl->extra1;
2329                 rt_cache_flush(net, flush_delay);
2330                 return 0;
2331         }
2332
2333         return -EINVAL;
2334 }
2335
/*
 * Route tunables registered under "net/ipv4/route" for init_net by
 * ip_static_sysctl_init().  Every entry is an int with mode 0644; entries
 * using proc_dointvec_jiffies / proc_dointvec_ms_jiffies are stored in
 * jiffies.  gc_min_interval and gc_min_interval_ms are two views of the same
 * variable, ip_rt_gc_min_interval.
 */
2336 static ctl_table ipv4_route_table[] = {
2337         {
2338                 .procname       = "gc_thresh",
2339                 .data           = &ipv4_dst_ops.gc_thresh,
2340                 .maxlen         = sizeof(int),
2341                 .mode           = 0644,
2342                 .proc_handler   = proc_dointvec,
2343         },
2344         {
2345                 .procname       = "max_size",
2346                 .data           = &ip_rt_max_size,
2347                 .maxlen         = sizeof(int),
2348                 .mode           = 0644,
2349                 .proc_handler   = proc_dointvec,
2350         },
2351         {
2352                 /*  Deprecated. Use gc_min_interval_ms */
2353
2354                 .procname       = "gc_min_interval",
2355                 .data           = &ip_rt_gc_min_interval,
2356                 .maxlen         = sizeof(int),
2357                 .mode           = 0644,
2358                 .proc_handler   = proc_dointvec_jiffies,
2359         },
2360         {
2361                 .procname       = "gc_min_interval_ms",
2362                 .data           = &ip_rt_gc_min_interval,
2363                 .maxlen         = sizeof(int),
2364                 .mode           = 0644,
2365                 .proc_handler   = proc_dointvec_ms_jiffies,
2366         },
2367         {
2368                 .procname       = "gc_timeout",
2369                 .data           = &ip_rt_gc_timeout,
2370                 .maxlen         = sizeof(int),
2371                 .mode           = 0644,
2372                 .proc_handler   = proc_dointvec_jiffies,
2373         },
2374         {
2375                 .procname       = "gc_interval",
2376                 .data           = &ip_rt_gc_interval,
2377                 .maxlen         = sizeof(int),
2378                 .mode           = 0644,
2379                 .proc_handler   = proc_dointvec_jiffies,
2380         },
2381         {
2382                 .procname       = "redirect_load",
2383                 .data           = &ip_rt_redirect_load,
2384                 .maxlen         = sizeof(int),
2385                 .mode           = 0644,
2386                 .proc_handler   = proc_dointvec,
2387         },
2388         {
2389                 .procname       = "redirect_number",
2390                 .data           = &ip_rt_redirect_number,
2391                 .maxlen         = sizeof(int),
2392                 .mode           = 0644,
2393                 .proc_handler   = proc_dointvec,
2394         },
2395         {
2396                 .procname       = "redirect_silence",
2397                 .data           = &ip_rt_redirect_silence,
2398                 .maxlen         = sizeof(int),
2399                 .mode           = 0644,
2400                 .proc_handler   = proc_dointvec,
2401         },
2402         {
2403                 .procname       = "error_cost",
2404                 .data           = &ip_rt_error_cost,
2405                 .maxlen         = sizeof(int),
2406                 .mode           = 0644,
2407                 .proc_handler   = proc_dointvec,
2408         },
2409         {
2410                 .procname       = "error_burst",
2411                 .data           = &ip_rt_error_burst,
2412                 .maxlen         = sizeof(int),
2413                 .mode           = 0644,
2414                 .proc_handler   = proc_dointvec,
2415         },
2416         {
2417                 .procname       = "gc_elasticity",
2418                 .data           = &ip_rt_gc_elasticity,
2419                 .maxlen         = sizeof(int),
2420                 .mode           = 0644,
2421                 .proc_handler   = proc_dointvec,
2422         },
2423         {
2424                 .procname       = "mtu_expires",
2425                 .data           = &ip_rt_mtu_expires,
2426                 .maxlen         = sizeof(int),
2427                 .mode           = 0644,
2428                 .proc_handler   = proc_dointvec_jiffies,
2429         },
2430         {
2431                 .procname       = "min_pmtu",
2432                 .data           = &ip_rt_min_pmtu,
2433                 .maxlen         = sizeof(int),
2434                 .mode           = 0644,
2435                 .proc_handler   = proc_dointvec,
2436         },
2437         {
2438                 .procname       = "min_adv_mss",
2439                 .data           = &ip_rt_min_advmss,
2440                 .maxlen         = sizeof(int),
2441                 .mode           = 0644,
2442                 .proc_handler   = proc_dointvec,
2443         },
             /* Empty terminating entry. */
2444         { }
2445 };
2446
/*
 * Template for the per-namespace, write-only (mode 0200) "flush" sysctl.
 * sysctl_route_net_init() registers it under "net/ipv4/route" - directly for
 * init_net, as a kmemdup() copy for other namespaces - after storing the
 * namespace pointer in entry 0's extra1.
 */
2447 static struct ctl_table ipv4_route_flush_table[] = {
2448         {
2449                 .procname       = "flush",
2450                 .maxlen         = sizeof(int),
2451                 .mode           = 0200,
2452                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2453         },
2454         { },
2455 };
2456
/*
 * Per-namespace init: register the "flush" sysctl under "net/ipv4/route".
 *
 * init_net uses ipv4_route_flush_table itself; every other namespace gets a
 * kmemdup() copy.  The namespace pointer is stored in tbl[0].extra1 so the
 * handler knows which cache to flush, and the registration header is kept in
 * net->ipv4.route_hdr.
 *
 * Returns 0 on success, -ENOMEM if the copy or the registration fails (a
 * duplicated table is freed again on registration failure).
 */
2457 static __net_init int sysctl_route_net_init(struct net *net)
2458 {
2459         struct ctl_table *tbl;
2460
2461         tbl = ipv4_route_flush_table;
2462         if (!net_eq(net, &init_net)) {
2463                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2464                 if (tbl == NULL)
2465                         goto err_dup;
2466         }
2467         tbl[0].extra1 = net;
2468
2469         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2470         if (net->ipv4.route_hdr == NULL)
2471                 goto err_reg;
2472         return 0;
2473
2474 err_reg:
             /* Only a duplicated table may be freed, never the static one. */
2475         if (tbl != ipv4_route_flush_table)
2476                 kfree(tbl);
2477 err_dup:
2478         return -ENOMEM;
2479 }
2480
/*
 * Per-namespace exit: unregister the "flush" sysctl and free its table.
 *
 * The table pointer is read from the header before unregistering.  The
 * BUG_ON() asserts that the table is a duplicated copy and not the static
 * ipv4_route_flush_table (which sysctl_route_net_init() uses for init_net),
 * so this path is presumably never expected to run for init_net - confirm.
 */
2481 static __net_exit void sysctl_route_net_exit(struct net *net)
2482 {
2483         struct ctl_table *tbl;
2484
2485         tbl = net->ipv4.route_hdr->ctl_table_arg;
2486         unregister_net_sysctl_table(net->ipv4.route_hdr);
2487         BUG_ON(tbl == ipv4_route_flush_table);
2488         kfree(tbl);
2489 }
2490
/* Per-namespace hooks for the "flush" sysctl; registered in ip_rt_init(). */
2491 static __net_initdata struct pernet_operations sysctl_route_ops = {
2492         .init = sysctl_route_net_init,
2493         .exit = sysctl_route_net_exit,
2494 };
2495 #endif
2496
/*
 * Per-namespace init: seed net->ipv4.rt_genid and net->ipv4.dev_addr_genid
 * with random bytes.  Always returns 0.
 */
2497 static __net_init int rt_genid_init(struct net *net)
2498 {
2499         get_random_bytes(&net->ipv4.rt_genid,
2500                          sizeof(net->ipv4.rt_genid));
2501         get_random_bytes(&net->ipv4.dev_addr_genid,
2502                          sizeof(net->ipv4.dev_addr_genid));
2503         return 0;
2504 }
2505
/* Per-namespace hook seeding the generation ids; has no exit handler. */
2506 static __net_initdata struct pernet_operations rt_genid_ops = {
2507         .init = rt_genid_init,
2508 };
2509
/*
 * Per-namespace init: allocate and initialise the inet_peer_base and store it
 * in net->ipv4.peers.  Returns 0 on success, -ENOMEM if kmalloc() fails.
 */
2510 static int __net_init ipv4_inetpeer_init(struct net *net)
2511 {
2512         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2513
2514         if (!bp)
2515                 return -ENOMEM;
2516         inet_peer_base_init(bp);
2517         net->ipv4.peers = bp;
2518         return 0;
2519 }
2520
/*
 * Per-namespace exit: detach the inet_peer_base from the namespace first,
 * then invalidate its tree and free the base.
 */
2521 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2522 {
2523         struct inet_peer_base *bp = net->ipv4.peers;
2524
2525         net->ipv4.peers = NULL;
2526         inetpeer_invalidate_tree(bp);
2527         kfree(bp);
2528 }
2529
/* Per-namespace hooks managing net->ipv4.peers; registered in ip_rt_init(). */
2530 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2531         .init   =       ipv4_inetpeer_init,
2532         .exit   =       ipv4_inetpeer_exit,
2533 };
2534
2535 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; ip_rt_init() allocates room for 256 entries. */
2536 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2537 #endif /* CONFIG_IP_ROUTE_CLASSID */
2538
/*
 * Boot-time initialisation of the IPv4 routing layer.
 *
 * Allocates the per-CPU classid accounting area (if configured), creates the
 * "ip_dst_cache" slab shared by the regular and blackhole dst_ops, sets up
 * their entry counters, then initialises devinet, the FIB, the proc files,
 * xfrm (if configured), the RTM_GETROUTE handler and the per-namespace
 * subsystems.
 *
 * Allocation failures of the accounting area or the dst counters panic;
 * failure to create the proc files is only logged.  Always returns 0.
 */
2539 int __init ip_rt_init(void)
2540 {
2541         int rc = 0;
2542
2543 #ifdef CONFIG_IP_ROUTE_CLASSID
2544         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2545         if (!ip_rt_acct)
2546                 panic("IP: failed to allocate ip_rt_acct\n");
2547 #endif
2548
             /* SLAB_PANIC is passed, so no NULL check follows. */
2549         ipv4_dst_ops.kmem_cachep =
2550                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2551                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2552
             /* Blackhole routes are allocated from the same slab cache. */
2553         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2554
2555         if (dst_entries_init(&ipv4_dst_ops) < 0)
2556                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2557
2558         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2559                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2560
             /* Both limits are set to their maximum values. */
2561         ipv4_dst_ops.gc_thresh = ~0;
2562         ip_rt_max_size = INT_MAX;
2563
2564         devinet_init();
2565         ip_fib_init();
2566
2567         if (ip_rt_proc_init())
2568                 pr_err("Unable to create route proc files\n");
2569 #ifdef CONFIG_XFRM
2570         xfrm_init();
2571         xfrm4_init(ip_rt_max_size);
2572 #endif
2573         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2574
2575 #ifdef CONFIG_SYSCTL
2576         register_pernet_subsys(&sysctl_route_ops);
2577 #endif
2578         register_pernet_subsys(&rt_genid_ops);
2579         register_pernet_subsys(&ipv4_inetpeer_ops);
2580         return rc;
2581 }
2582
2583 #ifdef CONFIG_SYSCTL
2584 /*
2585  * We really need to sanitize the damn ipv4 init order, then all
2586  * this nonsense will go away.
      *
      * Registers the static route tunables (ipv4_route_table) under
      * "net/ipv4/route" for init_net.  The return value of
      * register_net_sysctl() is not checked.
2587  */
2588 void __init ip_static_sysctl_init(void)
2589 {
2590         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2591 }
2592 #endif