net: Pass optional SKB and SK arguments to dst_ops->{update_pmtu,redirect}()
net/ipv4/route.c (linux-3.10.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
static int rt_chain_length_max __read_mostly    = 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

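/* Illustrative sketch of the calling convention this change introduces:
 * the update_pmtu and redirect dst_ops now take an optional socket and
 * skb, and a caller that has only one of the two passes NULL for the
 * other.  The helper name below is hypothetical, for illustration only.
 */
static inline void example_dst_event(struct dst_entry *dst, struct sock *sk,
                                     struct sk_buff *skb, u32 mtu)
{
        /* PMTU update: either sk or skb (or both) may be NULL */
        if (dst->ops->update_pmtu)
                dst->ops->update_pmtu(dst, sk, skb, mtu);

        /* ICMP redirect handling follows the same convention */
        if (dst->ops->redirect)
                dst->ops->redirect(dst, sk, skb);
}
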
#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

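/* A minimal usage sketch (hypothetical helper, for illustration only):
 * the table is indexed by the four TOS bits shifted down by one, which
 * mirrors the rt_tos2priority() helper in <net/route.h>.
 */
static inline __u8 example_tos2prio(u8 tos)
{
        /* e.g. IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE */
        return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
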
/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable __rcu     *chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
                        GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket    *rt_hash_table __read_mostly;
static unsigned int             rt_hash_mask __read_mostly;
static unsigned int             rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
                                   int genid)
{
        return jhash_3words((__force u32)daddr, (__force u32)saddr,
                            idx, genid)
                & rt_hash_mask;
}

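/* An illustrative sketch of the locking scheme described above, with
 * hypothetical function names.  A lockless reader walks a bucket chain
 * under rcu_read_lock_bh() and takes a reference before dropping RCU; a
 * writer additionally holds the bucket's spinlock and publishes the
 * unlink with RCU primitives.
 */
static struct rtable *example_chain_lookup(unsigned int hash, int genid)
{
        struct rtable *rth;

        rcu_read_lock_bh();
        for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference_bh(rth->dst.rt_next)) {
                if (rth->rt_genid == genid) {
                        dst_use(&rth->dst, jiffies);    /* ref under RCU */
                        break;
                }
        }
        rcu_read_unlock_bh();
        return rth;
}

static void example_chain_unlink(unsigned int hash, struct rtable *rt)
{
        struct rtable __rcu **rthp = &rt_hash_table[hash].chain;
        struct rtable *aux;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((aux = rcu_dereference_protected(*rthp,
                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                if (aux == rt) {
                        *rthp = aux->dst.rt_next;
                        /* free deferred past a grace period, as the
                         * rt_free() helper below does */
                        call_rcu_bh(&aux->dst.rcu_head, dst_rcu_free);
                        break;
                }
                rthp = &aux->dst.rt_next;
        }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}
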
static inline int rt_genid(struct net *net)
{
        return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        struct seq_net_private p;
        int bucket;
        int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rt_cache_iter_state *st = seq->private;
        struct rtable *r = NULL;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
                        continue;
                rcu_read_lock_bh();
                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
                while (r) {
                        if (dev_net(r->dst.dev) == seq_file_net(seq) &&
                            r->rt_genid == st->genid)
                                return r;
                        r = rcu_dereference_bh(r->dst.rt_next);
                }
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
                                          struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        r = rcu_dereference_bh(r->dst.rt_next);
        while (!r) {
                rcu_read_unlock_bh();
                do {
                        if (--st->bucket < 0)
                                return NULL;
                } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
                rcu_read_lock_bh();
                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
                                        struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;
        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
                if (dev_net(r->dst.dev) != seq_file_net(seq))
                        continue;
                if (r->rt_genid == st->genid)
                        break;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct rt_cache_iter_state *st = seq->private;
        if (*pos)
                return rt_cache_get_idx(seq, *pos - 1);
        st->genid = rt_genid(seq_file_net(seq));
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                int len;

                seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
                           "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
                           r->dst.dev ? r->dst.dev->name : "*",
                           (__force u32)r->rt_dst,
                           (__force u32)r->rt_gateway,
                           r->rt_flags, atomic_read(&r->dst.__refcnt),
                           r->dst.__use, 0, (__force u32)r->rt_src,
                           dst_metric_advmss(&r->dst) + 40,
                           dst_metric(&r->dst, RTAX_WINDOW), 0,
                           r->rt_key_tos,
                           -1, 0, 0, &len);

                seq_printf(seq, "%*s\n", 127 - len, "");
        }
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &rt_cache_seq_ops,
                        sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->dst.__refcnt))
                goto out;

        age = jiffies - rth->dst.lastuse;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (rt_is_output_route(rt) ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}

static inline bool rt_caching(const struct net *net)
{
        return net->ipv4.current_rt_cache_rebuild_count <=
                net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
                                       const struct rtable *rt2)
{
        return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
                (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
        return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
                (rt1->rt_mark ^ rt2->rt_mark) |
                (rt1->rt_key_tos ^ rt2->rt_key_tos) |
                (rt1->rt_route_iif ^ rt2->rt_route_iif) |
                (rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
        return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
        unsigned int i;
        struct rtable *rth, *next;

        for (i = 0; i <= rt_hash_mask; i++) {
                struct rtable __rcu **pprev;
                struct rtable *list;

                if (process_context && need_resched())
                        cond_resched();
                rth = rcu_access_pointer(rt_hash_table[i].chain);
                if (!rth)
                        continue;

                spin_lock_bh(rt_hash_lock_addr(i));

                list = NULL;
                pprev = &rt_hash_table[i].chain;
                rth = rcu_dereference_protected(*pprev,
                        lockdep_is_held(rt_hash_lock_addr(i)));

                while (rth) {
                        next = rcu_dereference_protected(rth->dst.rt_next,
                                lockdep_is_held(rt_hash_lock_addr(i)));

                        if (!net ||
                            net_eq(dev_net(rth->dst.dev), net)) {
                                rcu_assign_pointer(*pprev, next);
                                rcu_assign_pointer(rth->dst.rt_next, list);
                                list = rth;
                        } else {
                                pprev = &rth->dst.rt_next;
                        }
                        rth = next;
                }

                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; list; list = next) {
                        next = rcu_dereference_protected(list->dst.rt_next, 1);
                        rt_free(list);
                }
        }
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

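/* A small illustrative sketch (hypothetical helper): with FRACT_BITS == 3,
 * each counted entry contributes ONE == 8, so sums and averages carry
 * three fractional bits until the final shift converts back to whole
 * entries.
 */
static inline unsigned long example_fixed_point_avg(unsigned long sum,
                                                    unsigned long samples)
{
        unsigned long avg = sum / samples;      /* still in 1/8 units */

        return avg >> FRACT_BITS;               /* back to whole entries */
}
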
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
        const struct rtable *aux = head;

        while (aux != rth) {
                if (compare_hash_inputs(aux, rth))
                        return 0;
                aux = rcu_dereference_protected(aux->dst.rt_next, 1);
        }
        return ONE;
}

static void rt_check_expire(void)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth;
        struct rtable __rcu **rthp;
        unsigned long samples = 0;
        unsigned long sum = 0, sum2 = 0;
        unsigned long delta;
        u64 mult;

        delta = jiffies - expires_ljiffies;
        expires_ljiffies = jiffies;
        mult = ((u64)delta) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;
                unsigned long length;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (need_resched())
                        cond_resched();

                samples++;

                if (rcu_dereference_raw(*rthp) == NULL)
                        continue;
                length = 0;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = rcu_dereference_protected(*rthp,
                                        lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
                        prefetch(rth->dst.rt_next);
                        if (rt_is_expired(rth) ||
                            rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                *rthp = rth->dst.rt_next;
                                rt_free(rth);
                                continue;
                        }

                        /* We only count entries on a chain with equal
                         * hash inputs once, so that entries for
                         * different QOS levels and other non-hash
                         * input attributes don't unfairly skew the
                         * length computation
                         */
                        tmo >>= 1;
                        rthp = &rth->dst.rt_next;
                        length += has_noalias(rt_hash_table[i].chain, rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
                sum += length;
                sum2 += length*length;
        }
        if (samples) {
                unsigned long avg = sum / samples;
                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
                rt_chain_length_max = max_t(unsigned long,
                                        ip_rt_gc_elasticity,
                                        (avg + 4*sd) >> FRACT_BITS);
        }
        rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
        rt_check_expire();
        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
        unsigned char shuffle;

        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
        rt_cache_invalidate(net);
        if (delay >= 0)
                rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
        rt_do_flush(net, !in_softirq());
}

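/* An illustrative sketch of how a caller picks between the two flush
 * modes documented above; the function name and the "urgent" parameter
 * are hypothetical.
 */
static inline void example_flush_usage(struct net *net, bool urgent)
{
        if (urgent)
                rt_cache_flush(net, 0);  /* invalidate and walk the table */
        else
                rt_cache_flush(net, -1); /* bump genid; entries die lazily */
}
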
static void rt_emergency_hash_rebuild(struct net *net)
{
        net_warn_ratelimited("Route hash chain too long!\n");
        rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it shrinks, to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth;
        struct rtable __rcu **rthp;
        unsigned long now = jiffies;
        int goal;
        int entries = dst_entries_get_fast(&ipv4_dst_ops);

        /*
         * Garbage collection is pretty expensive,
         * do not make it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            entries < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        entries = dst_entries_get_slow(&ipv4_dst_ops);
        /* Calculate the number of entries which we want to expire now. */
        goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = entries - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
                        goal = entries - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
                equilibrium = entries - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = rcu_dereference_protected(*rthp,
                                        lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
                                if (!rt_is_expired(rth) &&
                                        !rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->dst.rt_next;
                                        continue;
                                }
                                *rthp = rth->dst.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire was reduced to zero; otherwise, expire is halved.
                   - the table is not full.
                   - we are called from interrupt.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;

                if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
                goto out;
        if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
                goto out;
        net_warn_ratelimited("dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
            dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
out:    return 0;
}

/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
        int length = 0;
        const struct rtable *rth = head;

        while (rth) {
                length += has_noalias(head, rth);
                rth = rcu_dereference_protected(rth->dst.rt_next, 1);
        }
        return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
                                     struct sk_buff *skb, int ifindex)
{
        struct rtable   *rth, *cand;
        struct rtable __rcu **rthp, **candp;
        unsigned long   now;
        u32             min_score;
        int             chain_length;

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
                /*
                 * If we're not caching, just tell the caller we
                 * were successful and don't touch the route.  The
                 * caller holds the sole reference to the cache entry, and
                 * it will be released when the caller is done with it.
                 * If we drop it here, the callers have no way to resolve routes
                 * when we're not caching.  Instead, just point *rp at rt, so
                 * the caller gets a single use out of the route.
                 * Note that we do rt_free on this new route entry, so that
                 * once its refcount hits zero, we are still able to reap it
                 * (Thanks Alexey).
                 * Note: To avoid expensive rcu stuff for this uncached dst,
                 * we set DST_NOCACHE so that dst_release() can free dst without
                 * waiting for a grace period.
                 */

                rt->dst.flags |= DST_NOCACHE;
                goto skip_hashing;
        }

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = rcu_dereference_protected(*rthp,
                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                if (rt_is_expired(rth)) {
                        *rthp = rth->dst.rt_next;
                        rt_free(rth);
                        continue;
                }
                if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
                        /* Put it first */
                        *rthp = rth->dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        dst_use(&rth->dst, now);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        if (skb)
                                skb_dst_set(skb, &rth->dst);
                        return rth;
                }

                if (!atomic_read(&rth->dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->dst.rt_next;
                        rt_free(cand);
                }
        } else {
                if (chain_length > rt_chain_length_max &&
                    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
                        struct net *net = dev_net(rt->dst.dev);
                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
                        if (!rt_caching(net)) {
                                pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
                                        rt->dst.dev->name, num);
                        }
                        rt_emergency_hash_rebuild(net);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
                                        ifindex, rt_genid(net));
                        goto restart;
                }
        }

        rt->dst.rt_next = rt_hash_table[hash].chain;

        /*
         * Since lookup is lockfree, we must make sure
         * previous writes to rt are committed to memory
         * before making rt visible to other CPUS.
         */
        rcu_assign_pointer(rt_hash_table[hash].chain, rt);

        spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
        if (skb)
                skb_dst_set(skb, &rt->dst);
        return rt;
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned int hash, struct rtable *rt)
{
        struct rtable __rcu **rthp;
        struct rtable *aux;

        rthp = &rt_hash_table[hash].chain;
        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        while ((aux = rcu_dereference_protected(*rthp,
                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                if (aux == rt || rt_is_expired(aux)) {
                        *rthp = aux->dst.rt_next;
                        rt_free(aux);
                        continue;
                }
                rthp = &aux->dst.rt_next;
        }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct neighbour *n;
        struct rtable *rt;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        rt = (struct rtable *) dst;
        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        rt->rt_gateway = new_gw;
                        rt->rt_flags |= RTCF_REDIRECTED;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
                                                rt->rt_oif,
                                                rt_genid(dev_net(dst->dev)));
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

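/* A sketch of the backoff rule from the comment above, extracted from the
 * logic in ip_rt_send_redirect() below; the function name is hypothetical.
 * The k-th redirect to a peer is allowed only once
 * ip_rt_redirect_load << k jiffies have passed since the previous one,
 * and after ip_rt_redirect_number redirects we go silent until
 * ip_rt_redirect_silence expires.
 */
static inline bool example_redirect_allowed(const struct inet_peer *peer)
{
        if (peer->rate_tokens >= ip_rt_redirect_number)
                return false;   /* host ignores us; stay silent */

        return peer->rate_tokens == 0 ||
               time_after(jiffies, peer->rate_last +
                                   (ip_rt_redirect_load << peer->rate_tokens));
}
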
1389 void ip_rt_send_redirect(struct sk_buff *skb)
1390 {
1391         struct rtable *rt = skb_rtable(skb);
1392         struct in_device *in_dev;
1393         struct inet_peer *peer;
1394         struct net *net;
1395         int log_martians;
1396
1397         rcu_read_lock();
1398         in_dev = __in_dev_get_rcu(rt->dst.dev);
1399         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1400                 rcu_read_unlock();
1401                 return;
1402         }
1403         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1404         rcu_read_unlock();
1405
1406         net = dev_net(rt->dst.dev);
1407         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1408         if (!peer) {
1409                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1410                 return;
1411         }
1412
1413         /* No redirected packets during ip_rt_redirect_silence;
1414          * reset the algorithm.
1415          */
1416         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1417                 peer->rate_tokens = 0;
1418
1419         /* Too many ignored redirects; do not send anything
1420          * set dst.rate_last to the last seen redirected packet.
1421          */
1422         if (peer->rate_tokens >= ip_rt_redirect_number) {
1423                 peer->rate_last = jiffies;
1424                 goto out_put_peer;
1425         }
1426
1427         /* Check for load limit; set rate_last to the latest sent
1428          * redirect.
1429          */
1430         if (peer->rate_tokens == 0 ||
1431             time_after(jiffies,
1432                        (peer->rate_last +
1433                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1434                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1435                 peer->rate_last = jiffies;
1436                 ++peer->rate_tokens;
1437 #ifdef CONFIG_IP_ROUTE_VERBOSE
1438                 if (log_martians &&
1439                     peer->rate_tokens == ip_rt_redirect_number)
1440                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1441                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1442                                              &rt->rt_dst, &rt->rt_gateway);
1443 #endif
1444         }
1445 out_put_peer:
1446         inet_putpeer(peer);
1447 }
1448
1449 static int ip_error(struct sk_buff *skb)
1450 {
1451         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1452         struct rtable *rt = skb_rtable(skb);
1453         struct inet_peer *peer;
1454         unsigned long now;
1455         struct net *net;
1456         bool send;
1457         int code;
1458
1459         net = dev_net(rt->dst.dev);
1460         if (!IN_DEV_FORWARD(in_dev)) {
1461                 switch (rt->dst.error) {
1462                 case EHOSTUNREACH:
1463                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1464                         break;
1465
1466                 case ENETUNREACH:
1467                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1468                         break;
1469                 }
1470                 goto out;
1471         }
1472
1473         switch (rt->dst.error) {
1474         case EINVAL:
1475         default:
1476                 goto out;
1477         case EHOSTUNREACH:
1478                 code = ICMP_HOST_UNREACH;
1479                 break;
1480         case ENETUNREACH:
1481                 code = ICMP_NET_UNREACH;
1482                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1483                 break;
1484         case EACCES:
1485                 code = ICMP_PKT_FILTERED;
1486                 break;
1487         }
1488
1489         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1490
1491         send = true;
1492         if (peer) {
1493                 now = jiffies;
1494                 peer->rate_tokens += now - peer->rate_last;
1495                 if (peer->rate_tokens > ip_rt_error_burst)
1496                         peer->rate_tokens = ip_rt_error_burst;
1497                 peer->rate_last = now;
1498                 if (peer->rate_tokens >= ip_rt_error_cost)
1499                         peer->rate_tokens -= ip_rt_error_cost;
1500                 else
1501                         send = false;
1502                 inet_putpeer(peer);
1503         }
1504         if (send)
1505                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1506
1507 out:    kfree_skb(skb);
1508         return 0;
1509 }
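
/* The throttling above is a plain token bucket: tokens accrue one per
 * jiffy up to ip_rt_error_burst, and each ICMP error spends
 * ip_rt_error_cost of them. A minimal self-contained sketch of the same
 * scheme (hypothetical helper, not used anywhere in this file):
 *
 *	struct err_bucket { unsigned long tokens, last; };
 *
 *	static bool err_bucket_allow(struct err_bucket *b, unsigned long now,
 *				     unsigned long burst, unsigned long cost)
 *	{
 *		b->tokens += now - b->last;
 *		if (b->tokens > burst)
 *			b->tokens = burst;
 *		b->last = now;
 *		if (b->tokens < cost)
 *			return false;
 *		b->tokens -= cost;
 *		return true;
 *	}
 *
 * With the defaults (ip_rt_error_burst = 5*HZ, ip_rt_error_cost = HZ)
 * this allows a burst of five ICMP errors per peer, then roughly one
 * per second.
 */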
1510
1511 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1512                               struct sk_buff *skb, u32 mtu)
1513 {
1514         struct rtable *rt = (struct rtable *) dst;
1515
1516         dst_confirm(dst);
1517
1518         if (mtu < ip_rt_min_pmtu)
1519                 mtu = ip_rt_min_pmtu;
1520
1521         rt->rt_pmtu = mtu;
1522         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1523 }
1524
1525 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1526                       int oif, u32 mark, u8 protocol, int flow_flags)
1527 {
1528         const struct iphdr *iph = (const struct iphdr *)skb->data;
1529         struct flowi4 fl4;
1530         struct rtable *rt;
1531
1532         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1533                            protocol, flow_flags,
1534                            iph->daddr, iph->saddr, 0, 0);
1535         rt = __ip_route_output_key(net, &fl4);
1536         if (!IS_ERR(rt)) {
1537                 ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
1538                 ip_rt_put(rt);
1539         }
1540 }
1541 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1542
1543 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1544 {
1545         const struct inet_sock *inet = inet_sk(sk);
1546
1547         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1548                                 sk->sk_bound_dev_if, sk->sk_mark,
1549                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1550                                 inet_sk_flowi_flags(sk));
1551 }
1552 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1553
1554 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1555                    int oif, u32 mark, u8 protocol, int flow_flags)
1556 {
1557         const struct iphdr *iph = (const struct iphdr *)skb->data;
1558         struct flowi4 fl4;
1559         struct rtable *rt;
1560
1561         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1562                            protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
1563         rt = __ip_route_output_key(net, &fl4);
1564         if (!IS_ERR(rt)) {
1565                 ip_do_redirect(&rt->dst, NULL, skb);
1566                 ip_rt_put(rt);
1567         }
1568 }
1569 EXPORT_SYMBOL_GPL(ipv4_redirect);
1570
1571 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1572 {
1573         const struct inet_sock *inet = inet_sk(sk);
1574
1575         return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
1576                              sk->sk_mark,
1577                              inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1578                              inet_sk_flowi_flags(sk));
1579 }
1580 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
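
/* These helpers exist for code that receives an ICMP error and has no
 * socket or cached dst at hand, only the embedded packet. The typical
 * callers are tunnel ->err_handler implementations, roughly like this
 * hypothetical, simplified sketch (a real handler also validates the
 * inner header and looks up its own tunnel state first):
 *
 *	static void tnl_err(struct sk_buff *skb, u32 info)
 *	{
 *		switch (icmp_hdr(skb)->type) {
 *		case ICMP_DEST_UNREACH:
 *			if (icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
 *				ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *						 0, 0, IPPROTO_IPIP, 0);
 *			break;
 *		case ICMP_REDIRECT:
 *			ipv4_redirect(skb, dev_net(skb->dev), 0, 0,
 *				      IPPROTO_IPIP, 0);
 *			break;
 *		}
 *	}
 */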
1581
1582 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1583 {
1584         struct rtable *rt = (struct rtable *) dst;
1585
1586         if (rt_is_expired(rt))
1587                 return NULL;
1588         return dst;
1589 }
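
/* The cookie argument is unused for IPv4; validity is keyed entirely
 * off the generation counter: rt_cache_flush() bumps the per-net genid,
 * rt_is_expired() then mismatches, and the NULL return here forces
 * callers holding a cached reference to redo the route lookup.
 */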
1590
1591 static void ipv4_dst_destroy(struct dst_entry *dst)
1592 {
1593         struct rtable *rt = (struct rtable *) dst;
1594
1595         if (rt->fi) {
1596                 fib_info_put(rt->fi);
1597                 rt->fi = NULL;
1598         }
1599 }
1600
1601
1602 static void ipv4_link_failure(struct sk_buff *skb)
1603 {
1604         struct rtable *rt;
1605
1606         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1607
1608         rt = skb_rtable(skb);
1609         if (rt)
1610                 dst_set_expires(&rt->dst, 0);
1611 }
1612
1613 static int ip_rt_bug(struct sk_buff *skb)
1614 {
1615         pr_debug("%s: %pI4 -> %pI4, %s\n",
1616                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1617                  skb->dev ? skb->dev->name : "?");
1618         kfree_skb(skb);
1619         WARN_ON(1);
1620         return 0;
1621 }
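
/* ip_rt_bug() is installed as dst.output on routes that must never be
 * transmitted on, e.g. the multicast and local-input routes built
 * below. Reaching it means a packet accepted via an input route was
 * pushed back out through it, hence the WARN_ON.
 */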
1622
1623 /*
1624    We do not cache the source address of the outgoing interface,
1625    because it is used only by IP RR, TS and SRR options,
1626    so it is out of the fast path.
1627
1628    BTW remember: "addr" is allowed to be unaligned
1629    in IP options!
1630  */
1631
1632 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1633 {
1634         __be32 src;
1635
1636         if (rt_is_output_route(rt))
1637                 src = ip_hdr(skb)->saddr;
1638         else {
1639                 struct fib_result res;
1640                 struct flowi4 fl4;
1641                 struct iphdr *iph;
1642
1643                 iph = ip_hdr(skb);
1644
1645                 memset(&fl4, 0, sizeof(fl4));
1646                 fl4.daddr = iph->daddr;
1647                 fl4.saddr = iph->saddr;
1648                 fl4.flowi4_tos = RT_TOS(iph->tos);
1649                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1650                 fl4.flowi4_iif = skb->dev->ifindex;
1651                 fl4.flowi4_mark = skb->mark;
1652
1653                 rcu_read_lock();
1654                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1655                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1656                 else
1657                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1658                                         RT_SCOPE_UNIVERSE);
1659                 rcu_read_unlock();
1660         }
1661         memcpy(addr, &src, 4);
1662 }
1663
1664 #ifdef CONFIG_IP_ROUTE_CLASSID
1665 static void set_class_tag(struct rtable *rt, u32 tag)
1666 {
1667         if (!(rt->dst.tclassid & 0xFFFF))
1668                 rt->dst.tclassid |= tag & 0xFFFF;
1669         if (!(rt->dst.tclassid & 0xFFFF0000))
1670                 rt->dst.tclassid |= tag & 0xFFFF0000;
1671 }
1672 #endif
1673
1674 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1675 {
1676         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1677
1678         if (advmss == 0) {
1679                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1680                                ip_rt_min_advmss);
1681                 if (advmss > 65535 - 40)
1682                         advmss = 65535 - 40;
1683         }
1684         return advmss;
1685 }
1686
1687 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1688 {
1689         const struct rtable *rt = (const struct rtable *) dst;
1690         unsigned int mtu = rt->rt_pmtu;
1691
1692         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1693                 mtu = 0;
1694
1695         if (!mtu)
1696                 mtu = dst_metric_raw(dst, RTAX_MTU);
1697
1698         if (mtu && rt_is_output_route(rt))
1699                 return mtu;
1700
1701         mtu = dst->dev->mtu;
1702
1703         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1704
1705                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1706                         mtu = 576;
1707         }
1708
1709         if (mtu > IP_MAX_MTU)
1710                 mtu = IP_MAX_MTU;
1711
1712         return mtu;
1713 }
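
/* So the effective precedence is: a still-unexpired learned PMTU, else
 * an explicit RTAX_MTU metric (either is returned as-is only for output
 * routes), else the device MTU, clamped to 576 for gatewayed routes
 * whose MTU metric is locked, and finally capped at IP_MAX_MTU.
 */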
1714
1715 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1716                             struct fib_info *fi)
1717 {
1718         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1719                 rt->fi = fi;
1720                 atomic_inc(&fi->fib_clntref);
1721         }
1722         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1723 }
1724
1725 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1726                            const struct fib_result *res,
1727                            struct fib_info *fi, u16 type, u32 itag)
1728 {
1729         if (fi) {
1730                 if (FIB_RES_GW(*res) &&
1731                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1732                         rt->rt_gateway = FIB_RES_GW(*res);
1733                 rt_init_metrics(rt, fl4, fi);
1734 #ifdef CONFIG_IP_ROUTE_CLASSID
1735                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1736 #endif
1737         }
1738
1739 #ifdef CONFIG_IP_ROUTE_CLASSID
1740 #ifdef CONFIG_IP_MULTIPLE_TABLES
1741         set_class_tag(rt, res->tclassid);
1742 #endif
1743         set_class_tag(rt, itag);
1744 #endif
1745 }
1746
1747 static struct rtable *rt_dst_alloc(struct net_device *dev,
1748                                    bool nopolicy, bool noxfrm)
1749 {
1750         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1751                          DST_HOST |
1752                          (nopolicy ? DST_NOPOLICY : 0) |
1753                          (noxfrm ? DST_NOXFRM : 0));
1754 }
1755
1756 /* called in rcu_read_lock() section */
1757 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1758                                 u8 tos, struct net_device *dev, int our)
1759 {
1760         unsigned int hash;
1761         struct rtable *rth;
1762         struct in_device *in_dev = __in_dev_get_rcu(dev);
1763         u32 itag = 0;
1764         int err;
1765
1766         /* Primary sanity checks. */
1767
1768         if (in_dev == NULL)
1769                 return -EINVAL;
1770
1771         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1772             skb->protocol != htons(ETH_P_IP))
1773                 goto e_inval;
1774
1775         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1776                 if (ipv4_is_loopback(saddr))
1777                         goto e_inval;
1778
1779         if (ipv4_is_zeronet(saddr)) {
1780                 if (!ipv4_is_local_multicast(daddr))
1781                         goto e_inval;
1782         } else {
1783                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1784                                           in_dev, &itag);
1785                 if (err < 0)
1786                         goto e_err;
1787         }
1788         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1789                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1790         if (!rth)
1791                 goto e_nobufs;
1792
1793 #ifdef CONFIG_IP_ROUTE_CLASSID
1794         rth->dst.tclassid = itag;
1795 #endif
1796         rth->dst.output = ip_rt_bug;
1797
1798         rth->rt_key_dst = daddr;
1799         rth->rt_key_src = saddr;
1800         rth->rt_genid   = rt_genid(dev_net(dev));
1801         rth->rt_flags   = RTCF_MULTICAST;
1802         rth->rt_type    = RTN_MULTICAST;
1803         rth->rt_key_tos = tos;
1804         rth->rt_dst     = daddr;
1805         rth->rt_src     = saddr;
1806         rth->rt_route_iif = dev->ifindex;
1807         rth->rt_iif     = dev->ifindex;
1808         rth->rt_oif     = 0;
1809         rth->rt_mark    = skb->mark;
1810         rth->rt_pmtu    = 0;
1811         rth->rt_gateway = daddr;
1812         rth->fi = NULL;
1813         if (our) {
1814                 rth->dst.input = ip_local_deliver;
1815                 rth->rt_flags |= RTCF_LOCAL;
1816         }
1817
1818 #ifdef CONFIG_IP_MROUTE
1819         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1820                 rth->dst.input = ip_mr_input;
1821 #endif
1822         RT_CACHE_STAT_INC(in_slow_mc);
1823
1824         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1825         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1826         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1827
1828 e_nobufs:
1829         return -ENOBUFS;
1830 e_inval:
1831         return -EINVAL;
1832 e_err:
1833         return err;
1834 }
1835
1836
1837 static void ip_handle_martian_source(struct net_device *dev,
1838                                      struct in_device *in_dev,
1839                                      struct sk_buff *skb,
1840                                      __be32 daddr,
1841                                      __be32 saddr)
1842 {
1843         RT_CACHE_STAT_INC(in_martian_src);
1844 #ifdef CONFIG_IP_ROUTE_VERBOSE
1845         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1846                 /*
1847                  *      RFC1812 recommendation: if the source is martian,
1848                  *      the only hint is the MAC header.
1849                  */
1850                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1851                         &daddr, &saddr, dev->name);
1852                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1853                         print_hex_dump(KERN_WARNING, "ll header: ",
1854                                        DUMP_PREFIX_OFFSET, 16, 1,
1855                                        skb_mac_header(skb),
1856                                        dev->hard_header_len, true);
1857                 }
1858         }
1859 #endif
1860 }
1861
1862 /* called in rcu_read_lock() section */
1863 static int __mkroute_input(struct sk_buff *skb,
1864                            const struct fib_result *res,
1865                            struct in_device *in_dev,
1866                            __be32 daddr, __be32 saddr, u32 tos,
1867                            struct rtable **result)
1868 {
1869         struct rtable *rth;
1870         int err;
1871         struct in_device *out_dev;
1872         unsigned int flags = 0;
1873         u32 itag;
1874
1875         /* get a working reference to the output device */
1876         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1877         if (out_dev == NULL) {
1878                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1879                 return -EINVAL;
1880         }
1881
1882
1883         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1884                                   in_dev->dev, in_dev, &itag);
1885         if (err < 0) {
1886                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1887                                          saddr);
1888
1889                 goto cleanup;
1890         }
1891
1892         if (err)
1893                 flags |= RTCF_DIRECTSRC;
1894
1895         if (out_dev == in_dev && err &&
1896             (IN_DEV_SHARED_MEDIA(out_dev) ||
1897              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1898                 flags |= RTCF_DOREDIRECT;
1899
1900         if (skb->protocol != htons(ETH_P_IP)) {
1901                 /* Not IP (i.e. ARP). Do not create a route if it is
1902                  * invalid for proxy arp. DNAT routes are always valid.
1903                  *
1904                  * The proxy arp feature has been extended to allow ARP
1905                  * replies back to the same interface, to support
1906                  * Private VLAN switch technologies. See arp.c.
1907                  */
1908                 if (out_dev == in_dev &&
1909                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1910                         err = -EINVAL;
1911                         goto cleanup;
1912                 }
1913         }
1914
1915         rth = rt_dst_alloc(out_dev->dev,
1916                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1917                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1918         if (!rth) {
1919                 err = -ENOBUFS;
1920                 goto cleanup;
1921         }
1922
1923         rth->rt_key_dst = daddr;
1924         rth->rt_key_src = saddr;
1925         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1926         rth->rt_flags = flags;
1927         rth->rt_type = res->type;
1928         rth->rt_key_tos = tos;
1929         rth->rt_dst     = daddr;
1930         rth->rt_src     = saddr;
1931         rth->rt_route_iif = in_dev->dev->ifindex;
1932         rth->rt_iif     = in_dev->dev->ifindex;
1933         rth->rt_oif     = 0;
1934         rth->rt_mark    = skb->mark;
1935         rth->rt_pmtu    = 0;
1936         rth->rt_gateway = daddr;
1937         rth->fi = NULL;
1938
1939         rth->dst.input = ip_forward;
1940         rth->dst.output = ip_output;
1941
1942         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1943
1944         *result = rth;
1945         err = 0;
1946  cleanup:
1947         return err;
1948 }
1949
1950 static int ip_mkroute_input(struct sk_buff *skb,
1951                             struct fib_result *res,
1952                             const struct flowi4 *fl4,
1953                             struct in_device *in_dev,
1954                             __be32 daddr, __be32 saddr, u32 tos)
1955 {
1956         struct rtable *rth = NULL;
1957         int err;
1958         unsigned int hash;
1959
1960 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1961         if (res->fi && res->fi->fib_nhs > 1)
1962                 fib_select_multipath(res);
1963 #endif
1964
1965         /* create a routing cache entry */
1966         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1967         if (err)
1968                 return err;
1969
1970         /* put it into the cache */
1971         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
1972                        rt_genid(dev_net(rth->dst.dev)));
1973         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
1974         if (IS_ERR(rth))
1975                 return PTR_ERR(rth);
1976         return 0;
1977 }
1978
1979 /*
1980  *      NOTE. We drop all packets that have a local source
1981  *      address, because every properly looped-back packet must
1982  *      already have the correct destination attached by the output routine.
1983  *
1984  *      This approach solves two big problems:
1985  *      1. Non-simplex devices are handled properly.
1986  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1987  *      Called with rcu_read_lock().
1988  */
1989
1990 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1991                                u8 tos, struct net_device *dev)
1992 {
1993         struct fib_result res;
1994         struct in_device *in_dev = __in_dev_get_rcu(dev);
1995         struct flowi4   fl4;
1996         unsigned int    flags = 0;
1997         u32             itag = 0;
1998         struct rtable   *rth;
1999         unsigned int    hash;
2000         int             err = -EINVAL;
2001         struct net    *net = dev_net(dev);
2002
2003         /* IP on this device is disabled. */
2004
2005         if (!in_dev)
2006                 goto out;
2007
2008         /* Check for the most weird martians, which cannot be detected
2009            by fib_lookup.
2010          */
2011
2012         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2013                 goto martian_source;
2014
2015         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2016                 goto brd_input;
2017
2018         /* Accept zero addresses only to limited broadcast;
2019          * I do not even know whether to fix this or not. Waiting for complaints :-)
2020          */
2021         if (ipv4_is_zeronet(saddr))
2022                 goto martian_source;
2023
2024         if (ipv4_is_zeronet(daddr))
2025                 goto martian_destination;
2026
2027         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2028                 if (ipv4_is_loopback(daddr))
2029                         goto martian_destination;
2030
2031                 if (ipv4_is_loopback(saddr))
2032                         goto martian_source;
2033         }
2034
2035         /*
2036          *      Now we are ready to route the packet.
2037          */
2038         fl4.flowi4_oif = 0;
2039         fl4.flowi4_iif = dev->ifindex;
2040         fl4.flowi4_mark = skb->mark;
2041         fl4.flowi4_tos = tos;
2042         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2043         fl4.daddr = daddr;
2044         fl4.saddr = saddr;
2045         err = fib_lookup(net, &fl4, &res);
2046         if (err != 0)
2047                 goto no_route;
2048
2049         RT_CACHE_STAT_INC(in_slow_tot);
2050
2051         if (res.type == RTN_BROADCAST)
2052                 goto brd_input;
2053
2054         if (res.type == RTN_LOCAL) {
2055                 err = fib_validate_source(skb, saddr, daddr, tos,
2056                                           net->loopback_dev->ifindex,
2057                                           dev, in_dev, &itag);
2058                 if (err < 0)
2059                         goto martian_source_keep_err;
2060                 if (err)
2061                         flags |= RTCF_DIRECTSRC;
2062                 goto local_input;
2063         }
2064
2065         if (!IN_DEV_FORWARD(in_dev))
2066                 goto no_route;
2067         if (res.type != RTN_UNICAST)
2068                 goto martian_destination;
2069
2070         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2071 out:    return err;
2072
2073 brd_input:
2074         if (skb->protocol != htons(ETH_P_IP))
2075                 goto e_inval;
2076
2077         if (!ipv4_is_zeronet(saddr)) {
2078                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2079                                           in_dev, &itag);
2080                 if (err < 0)
2081                         goto martian_source_keep_err;
2082                 if (err)
2083                         flags |= RTCF_DIRECTSRC;
2084         }
2085         flags |= RTCF_BROADCAST;
2086         res.type = RTN_BROADCAST;
2087         RT_CACHE_STAT_INC(in_brd);
2088
2089 local_input:
2090         rth = rt_dst_alloc(net->loopback_dev,
2091                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2092         if (!rth)
2093                 goto e_nobufs;
2094
2095         rth->dst.input = ip_local_deliver;
2096         rth->dst.output = ip_rt_bug;
2097 #ifdef CONFIG_IP_ROUTE_CLASSID
2098         rth->dst.tclassid = itag;
2099 #endif
2100
2101         rth->rt_key_dst = daddr;
2102         rth->rt_key_src = saddr;
2103         rth->rt_genid = rt_genid(net);
2104         rth->rt_flags   = flags|RTCF_LOCAL;
2105         rth->rt_type    = res.type;
2106         rth->rt_key_tos = tos;
2107         rth->rt_dst     = daddr;
2108         rth->rt_src     = saddr;
2109         rth->rt_route_iif = dev->ifindex;
2110         rth->rt_iif     = dev->ifindex;
2111         rth->rt_oif     = 0;
2112         rth->rt_mark    = skb->mark;
2113         rth->rt_pmtu    = 0;
2114         rth->rt_gateway = daddr;
2115         rth->fi = NULL;
2116         if (res.type == RTN_UNREACHABLE) {
2117                 rth->dst.input = ip_error;
2118                 rth->dst.error = -err;
2119                 rth->rt_flags   &= ~RTCF_LOCAL;
2120         }
2121         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2122         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2123         err = 0;
2124         if (IS_ERR(rth))
2125                 err = PTR_ERR(rth);
2126         goto out;
2127
2128 no_route:
2129         RT_CACHE_STAT_INC(in_no_route);
2130         res.type = RTN_UNREACHABLE;
2131         if (err == -ESRCH)
2132                 err = -ENETUNREACH;
2133         goto local_input;
2134
2135         /*
2136          *      Do not cache martian addresses: they should be logged (RFC1812)
2137          */
2138 martian_destination:
2139         RT_CACHE_STAT_INC(in_martian_dst);
2140 #ifdef CONFIG_IP_ROUTE_VERBOSE
2141         if (IN_DEV_LOG_MARTIANS(in_dev))
2142                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2143                                      &daddr, &saddr, dev->name);
2144 #endif
2145
2146 e_inval:
2147         err = -EINVAL;
2148         goto out;
2149
2150 e_nobufs:
2151         err = -ENOBUFS;
2152         goto out;
2153
2154 martian_source:
2155         err = -EINVAL;
2156 martian_source_keep_err:
2157         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2158         goto out;
2159 }
2160
2161 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2162                            u8 tos, struct net_device *dev, bool noref)
2163 {
2164         struct rtable   *rth;
2165         unsigned int    hash;
2166         int iif = dev->ifindex;
2167         struct net *net;
2168         int res;
2169
2170         net = dev_net(dev);
2171
2172         rcu_read_lock();
2173
2174         if (!rt_caching(net))
2175                 goto skip_cache;
2176
2177         tos &= IPTOS_RT_MASK;
2178         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2179
2180         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2181              rth = rcu_dereference(rth->dst.rt_next)) {
2182                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2183                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2184                      (rth->rt_route_iif ^ iif) |
2185                      (rth->rt_key_tos ^ tos)) == 0 &&
2186                     rth->rt_mark == skb->mark &&
2187                     net_eq(dev_net(rth->dst.dev), net) &&
2188                     !rt_is_expired(rth)) {
2189                         if (noref) {
2190                                 dst_use_noref(&rth->dst, jiffies);
2191                                 skb_dst_set_noref(skb, &rth->dst);
2192                         } else {
2193                                 dst_use(&rth->dst, jiffies);
2194                                 skb_dst_set(skb, &rth->dst);
2195                         }
2196                         RT_CACHE_STAT_INC(in_hit);
2197                         rcu_read_unlock();
2198                         return 0;
2199                 }
2200                 RT_CACHE_STAT_INC(in_hlist_search);
2201         }
2202
2203 skip_cache:
2204         /* Multicast recognition logic was moved from the route cache to here.
2205            The problem was that too many Ethernet cards have broken/missing
2206            hardware multicast filters :-( As a result, a host on a multicast
2207            network acquires a lot of useless route cache entries, e.g. for
2208            SDR messages from all over the world. Now we try to get rid of them.
2209            Really, provided the software IP multicast filter is organized
2210            reasonably (at least, hashed), it does not result in a slowdown
2211            compared with route cache reject entries.
2212            Note that multicast routers are not affected, because a
2213            route cache entry is created eventually.
2214          */
2215         if (ipv4_is_multicast(daddr)) {
2216                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2217
2218                 if (in_dev) {
2219                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2220                                                   ip_hdr(skb)->protocol);
2221                         if (our
2222 #ifdef CONFIG_IP_MROUTE
2223                                 ||
2224                             (!ipv4_is_local_multicast(daddr) &&
2225                              IN_DEV_MFORWARD(in_dev))
2226 #endif
2227                            ) {
2228                                 int res = ip_route_input_mc(skb, daddr, saddr,
2229                                                             tos, dev, our);
2230                                 rcu_read_unlock();
2231                                 return res;
2232                         }
2233                 }
2234                 rcu_read_unlock();
2235                 return -EINVAL;
2236         }
2237         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2238         rcu_read_unlock();
2239         return res;
2240 }
2241 EXPORT_SYMBOL(ip_route_input_common);
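
/* The ip_route_input()/ip_route_input_noref() wrappers expand to the
 * function above. The canonical caller is the receive path, roughly
 * like this simplified sketch of what ip_rcv_finish() does:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);	(dispatches to rth->dst.input, i.e.
 *				 ip_local_deliver, ip_forward, ...)
 */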
2242
2243 /* called with rcu_read_lock() */
2244 static struct rtable *__mkroute_output(const struct fib_result *res,
2245                                        const struct flowi4 *fl4,
2246                                        __be32 orig_daddr, __be32 orig_saddr,
2247                                        int orig_oif, __u8 orig_rtos,
2248                                        struct net_device *dev_out,
2249                                        unsigned int flags)
2250 {
2251         struct fib_info *fi = res->fi;
2252         struct in_device *in_dev;
2253         u16 type = res->type;
2254         struct rtable *rth;
2255
2256         in_dev = __in_dev_get_rcu(dev_out);
2257         if (!in_dev)
2258                 return ERR_PTR(-EINVAL);
2259
2260         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2261                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2262                         return ERR_PTR(-EINVAL);
2263
2264         if (ipv4_is_lbcast(fl4->daddr))
2265                 type = RTN_BROADCAST;
2266         else if (ipv4_is_multicast(fl4->daddr))
2267                 type = RTN_MULTICAST;
2268         else if (ipv4_is_zeronet(fl4->daddr))
2269                 return ERR_PTR(-EINVAL);
2270
2271         if (dev_out->flags & IFF_LOOPBACK)
2272                 flags |= RTCF_LOCAL;
2273
2274         if (type == RTN_BROADCAST) {
2275                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2276                 fi = NULL;
2277         } else if (type == RTN_MULTICAST) {
2278                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2279                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2280                                      fl4->flowi4_proto))
2281                         flags &= ~RTCF_LOCAL;
2282                 /* If a multicast route does not exist, use
2283                  * the default one, but do not gateway in this case.
2284                  * Yes, it is a hack.
2285                  */
2286                 if (fi && res->prefixlen < 4)
2287                         fi = NULL;
2288         }
2289
2290         rth = rt_dst_alloc(dev_out,
2291                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2292                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2293         if (!rth)
2294                 return ERR_PTR(-ENOBUFS);
2295
2296         rth->dst.output = ip_output;
2297
2298         rth->rt_key_dst = orig_daddr;
2299         rth->rt_key_src = orig_saddr;
2300         rth->rt_genid = rt_genid(dev_net(dev_out));
2301         rth->rt_flags   = flags;
2302         rth->rt_type    = type;
2303         rth->rt_key_tos = orig_rtos;
2304         rth->rt_dst     = fl4->daddr;
2305         rth->rt_src     = fl4->saddr;
2306         rth->rt_route_iif = 0;
2307         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2308         rth->rt_oif     = orig_oif;
2309         rth->rt_mark    = fl4->flowi4_mark;
2310         rth->rt_pmtu    = 0;
2311         rth->rt_gateway = fl4->daddr;
2312         rth->fi = NULL;
2313
2314         RT_CACHE_STAT_INC(out_slow_tot);
2315
2316         if (flags & RTCF_LOCAL)
2317                 rth->dst.input = ip_local_deliver;
2318         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2319                 if (flags & RTCF_LOCAL &&
2320                     !(dev_out->flags & IFF_LOOPBACK)) {
2321                         rth->dst.output = ip_mc_output;
2322                         RT_CACHE_STAT_INC(out_slow_mc);
2323                 }
2324 #ifdef CONFIG_IP_MROUTE
2325                 if (type == RTN_MULTICAST) {
2326                         if (IN_DEV_MFORWARD(in_dev) &&
2327                             !ipv4_is_local_multicast(fl4->daddr)) {
2328                                 rth->dst.input = ip_mr_input;
2329                                 rth->dst.output = ip_mc_output;
2330                         }
2331                 }
2332 #endif
2333         }
2334
2335         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2336
2337         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2338                 rth->dst.flags |= DST_NOCACHE;
2339
2340         return rth;
2341 }
2342
2343 /*
2344  * Major route resolver routine.
2345  * called with rcu_read_lock();
2346  */
2347
2348 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2349 {
2350         struct net_device *dev_out = NULL;
2351         __u8 tos = RT_FL_TOS(fl4);
2352         unsigned int flags = 0;
2353         struct fib_result res;
2354         struct rtable *rth;
2355         __be32 orig_daddr;
2356         __be32 orig_saddr;
2357         int orig_oif;
2358
2359         res.tclassid    = 0;
2360         res.fi          = NULL;
2361         res.table       = NULL;
2362
2363         orig_daddr = fl4->daddr;
2364         orig_saddr = fl4->saddr;
2365         orig_oif = fl4->flowi4_oif;
2366
2367         fl4->flowi4_iif = net->loopback_dev->ifindex;
2368         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2369         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2370                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2371
2372         rcu_read_lock();
2373         if (fl4->saddr) {
2374                 rth = ERR_PTR(-EINVAL);
2375                 if (ipv4_is_multicast(fl4->saddr) ||
2376                     ipv4_is_lbcast(fl4->saddr) ||
2377                     ipv4_is_zeronet(fl4->saddr))
2378                         goto out;
2379
2380                 /* I removed the check for oif == dev_out->oif here.
2381                    It was wrong for two reasons:
2382                    1. ip_dev_find(net, saddr) can return the wrong iface
2383                       if saddr is assigned to multiple interfaces.
2384                    2. Moreover, we are allowed to send packets with the
2385                       saddr of another iface. --ANK
2386                  */
2387
2388                 if (fl4->flowi4_oif == 0 &&
2389                     (ipv4_is_multicast(fl4->daddr) ||
2390                      ipv4_is_lbcast(fl4->daddr))) {
2391                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2392                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2393                         if (dev_out == NULL)
2394                                 goto out;
2395
2396                         /* Special hack: the user can direct multicasts
2397                            and limited broadcast via the necessary interface
2398                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2399                            This hack is not just for fun, it allows
2400                            vic, vat and friends to work.
2401                            They bind the socket to loopback, set ttl to zero
2402                            and expect that it will work.
2403                            From the viewpoint of the routing cache they are
2404                            broken, because we are not allowed to build a
2405                            multicast path with a loopback source addr (look,
2406                            the routing cache cannot know that ttl is zero, so
2407                            the packet will not leave this host and the route
2408                            is valid). Luckily, this hack is a good workaround.
2409                          */
2410
2411                         fl4->flowi4_oif = dev_out->ifindex;
2412                         goto make_route;
2413                 }
2414
2415                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2416                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2417                         if (!__ip_dev_find(net, fl4->saddr, false))
2418                                 goto out;
2419                 }
2420         }
2421
2422
2423         if (fl4->flowi4_oif) {
2424                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2425                 rth = ERR_PTR(-ENODEV);
2426                 if (dev_out == NULL)
2427                         goto out;
2428
2429                 /* RACE: Check return value of inet_select_addr instead. */
2430                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2431                         rth = ERR_PTR(-ENETUNREACH);
2432                         goto out;
2433                 }
2434                 if (ipv4_is_local_multicast(fl4->daddr) ||
2435                     ipv4_is_lbcast(fl4->daddr)) {
2436                         if (!fl4->saddr)
2437                                 fl4->saddr = inet_select_addr(dev_out, 0,
2438                                                               RT_SCOPE_LINK);
2439                         goto make_route;
2440                 }
2441                 if (fl4->saddr) {
2442                         if (ipv4_is_multicast(fl4->daddr))
2443                                 fl4->saddr = inet_select_addr(dev_out, 0,
2444                                                               fl4->flowi4_scope);
2445                         else if (!fl4->daddr)
2446                                 fl4->saddr = inet_select_addr(dev_out, 0,
2447                                                               RT_SCOPE_HOST);
2448                 }
2449         }
2450
2451         if (!fl4->daddr) {
2452                 fl4->daddr = fl4->saddr;
2453                 if (!fl4->daddr)
2454                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2455                 dev_out = net->loopback_dev;
2456                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2457                 res.type = RTN_LOCAL;
2458                 flags |= RTCF_LOCAL;
2459                 goto make_route;
2460         }
2461
2462         if (fib_lookup(net, fl4, &res)) {
2463                 res.fi = NULL;
2464                 res.table = NULL;
2465                 if (fl4->flowi4_oif) {
2466                         /* Apparently, the routing tables are wrong. Assume
2467                            that the destination is on-link.
2468
2469                            WHY? DW.
2470                            Because we are allowed to send to an iface
2471                            even if it has NO routes and NO assigned
2472                            addresses. When oif is specified, the routing
2473                            tables are looked up with only one purpose:
2474                            to catch whether the destination is gatewayed,
2475                            rather than direct. Moreover, if MSG_DONTROUTE is
2476                            set, we send the packet, ignoring both the routing
2477                            tables and the ifaddr state. --ANK
2478
2479
2480                            We could do this even if oif is unknown,
2481                            as IPv6 likely does, but we do not.
2482                          */
2483
2484                         if (fl4->saddr == 0)
2485                                 fl4->saddr = inet_select_addr(dev_out, 0,
2486                                                               RT_SCOPE_LINK);
2487                         res.type = RTN_UNICAST;
2488                         goto make_route;
2489                 }
2490                 rth = ERR_PTR(-ENETUNREACH);
2491                 goto out;
2492         }
2493
2494         if (res.type == RTN_LOCAL) {
2495                 if (!fl4->saddr) {
2496                         if (res.fi->fib_prefsrc)
2497                                 fl4->saddr = res.fi->fib_prefsrc;
2498                         else
2499                                 fl4->saddr = fl4->daddr;
2500                 }
2501                 dev_out = net->loopback_dev;
2502                 fl4->flowi4_oif = dev_out->ifindex;
2503                 res.fi = NULL;
2504                 flags |= RTCF_LOCAL;
2505                 goto make_route;
2506         }
2507
2508 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2509         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2510                 fib_select_multipath(&res);
2511         else
2512 #endif
2513         if (!res.prefixlen &&
2514             res.table->tb_num_default > 1 &&
2515             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2516                 fib_select_default(&res);
2517
2518         if (!fl4->saddr)
2519                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2520
2521         dev_out = FIB_RES_DEV(res);
2522         fl4->flowi4_oif = dev_out->ifindex;
2523
2524
2525 make_route:
2526         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2527                                tos, dev_out, flags);
2528         if (!IS_ERR(rth)) {
2529                 unsigned int hash;
2530
2531                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2532                                rt_genid(dev_net(dev_out)));
2533                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2534         }
2535
2536 out:
2537         rcu_read_unlock();
2538         return rth;
2539 }
2540
2541 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2542 {
2543         struct rtable *rth;
2544         unsigned int hash;
2545
2546         if (!rt_caching(net))
2547                 goto slow_output;
2548
2549         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2550
2551         rcu_read_lock_bh();
2552         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2553                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2554                 if (rth->rt_key_dst == flp4->daddr &&
2555                     rth->rt_key_src == flp4->saddr &&
2556                     rt_is_output_route(rth) &&
2557                     rth->rt_oif == flp4->flowi4_oif &&
2558                     rth->rt_mark == flp4->flowi4_mark &&
2559                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2560                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2561                     net_eq(dev_net(rth->dst.dev), net) &&
2562                     !rt_is_expired(rth)) {
2563                         dst_use(&rth->dst, jiffies);
2564                         RT_CACHE_STAT_INC(out_hit);
2565                         rcu_read_unlock_bh();
2566                         if (!flp4->saddr)
2567                                 flp4->saddr = rth->rt_src;
2568                         if (!flp4->daddr)
2569                                 flp4->daddr = rth->rt_dst;
2570                         return rth;
2571                 }
2572                 RT_CACHE_STAT_INC(out_hlist_search);
2573         }
2574         rcu_read_unlock_bh();
2575
2576 slow_output:
2577         return ip_route_output_slow(net, flp4);
2578 }
2579 EXPORT_SYMBOL_GPL(__ip_route_output_key);
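
/* A typical output lookup, for reference (a sketch mirroring what
 * ipv4_update_pmtu() above does; oif/mark/addresses/ports are the
 * caller's, error handling elided):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, mark, RT_TOS(tos), RT_SCOPE_UNIVERSE,
 *			   IPPROTO_UDP, 0, daddr, saddr, dport, sport);
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	(... use rt, then drop the reference with ip_rt_put(rt))
 */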
2580
2581 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2582 {
2583         return NULL;
2584 }
2585
2586 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2587 {
2588         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2589
2590         return mtu ? : dst->dev->mtu;
2591 }
2592
2593 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2594                                           struct sk_buff *skb, u32 mtu)
2595 {
2596 }
2597
2598 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2599                                        struct sk_buff *skb)
2600 {
2601 }
2602
2603 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2604                                           unsigned long old)
2605 {
2606         return NULL;
2607 }
2608
2609 static struct dst_ops ipv4_dst_blackhole_ops = {
2610         .family                 =       AF_INET,
2611         .protocol               =       cpu_to_be16(ETH_P_IP),
2612         .destroy                =       ipv4_dst_destroy,
2613         .check                  =       ipv4_blackhole_dst_check,
2614         .mtu                    =       ipv4_blackhole_mtu,
2615         .default_advmss         =       ipv4_default_advmss,
2616         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2617         .redirect               =       ipv4_rt_blackhole_redirect,
2618         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2619         .neigh_lookup           =       ipv4_neigh_lookup,
2620 };
2621
2622 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2623 {
2624         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2625         struct rtable *ort = (struct rtable *) dst_orig;
2626
2627         if (rt) {
2628                 struct dst_entry *new = &rt->dst;
2629
2630                 new->__use = 1;
2631                 new->input = dst_discard;
2632                 new->output = dst_discard;
2633
2634                 new->dev = ort->dst.dev;
2635                 if (new->dev)
2636                         dev_hold(new->dev);
2637
2638                 rt->rt_key_dst = ort->rt_key_dst;
2639                 rt->rt_key_src = ort->rt_key_src;
2640                 rt->rt_key_tos = ort->rt_key_tos;
2641                 rt->rt_route_iif = ort->rt_route_iif;
2642                 rt->rt_iif = ort->rt_iif;
2643                 rt->rt_oif = ort->rt_oif;
2644                 rt->rt_mark = ort->rt_mark;
2645                 rt->rt_pmtu = ort->rt_pmtu;
2646
2647                 rt->rt_genid = rt_genid(net);
2648                 rt->rt_flags = ort->rt_flags;
2649                 rt->rt_type = ort->rt_type;
2650                 rt->rt_dst = ort->rt_dst;
2651                 rt->rt_src = ort->rt_src;
2652                 rt->rt_gateway = ort->rt_gateway;
2653                 rt->fi = ort->fi;
2654                 if (rt->fi)
2655                         atomic_inc(&rt->fi->fib_clntref);
2656
2657                 dst_free(new);
2658         }
2659
2660         dst_release(dst_orig);
2661
2662         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2663 }
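
/* ipv4_blackhole_route() serves the xfrm path when a packet needs a dst
 * while key negotiation is still in progress: the copy preserves the
 * original route's addressing, but input and output both discard, and
 * ipv4_dst_blackhole_ops makes update_pmtu/redirect no-ops and
 * dst_check always fail.
 */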
2664
2665 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2666                                     struct sock *sk)
2667 {
2668         struct rtable *rt = __ip_route_output_key(net, flp4);
2669
2670         if (IS_ERR(rt))
2671                 return rt;
2672
2673         if (flp4->flowi4_proto)
2674                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2675                                                    flowi4_to_flowi(flp4),
2676                                                    sk, 0);
2677
2678         return rt;
2679 }
2680 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2681
2682 static int rt_fill_info(struct net *net,
2683                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2684                         int nowait, unsigned int flags)
2685 {
2686         struct rtable *rt = skb_rtable(skb);
2687         struct rtmsg *r;
2688         struct nlmsghdr *nlh;
2689         unsigned long expires = 0;
2690         u32 error;
2691
2692         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2693         if (nlh == NULL)
2694                 return -EMSGSIZE;
2695
2696         r = nlmsg_data(nlh);
2697         r->rtm_family    = AF_INET;
2698         r->rtm_dst_len  = 32;
2699         r->rtm_src_len  = 0;
2700         r->rtm_tos      = rt->rt_key_tos;
2701         r->rtm_table    = RT_TABLE_MAIN;
2702         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2703                 goto nla_put_failure;
2704         r->rtm_type     = rt->rt_type;
2705         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2706         r->rtm_protocol = RTPROT_UNSPEC;
2707         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2708         if (rt->rt_flags & RTCF_NOTIFY)
2709                 r->rtm_flags |= RTM_F_NOTIFY;
2710
2711         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2712                 goto nla_put_failure;
2713         if (rt->rt_key_src) {
2714                 r->rtm_src_len = 32;
2715                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2716                         goto nla_put_failure;
2717         }
2718         if (rt->dst.dev &&
2719             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2720                 goto nla_put_failure;
2721 #ifdef CONFIG_IP_ROUTE_CLASSID
2722         if (rt->dst.tclassid &&
2723             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2724                 goto nla_put_failure;
2725 #endif
2726         if (!rt_is_input_route(rt) &&
2727             rt->rt_src != rt->rt_key_src) {
2728                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2729                         goto nla_put_failure;
2730         }
2731         if (rt->rt_dst != rt->rt_gateway &&
2732             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2733                 goto nla_put_failure;
2734
2735         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2736                 goto nla_put_failure;
2737
2738         if (rt->rt_mark &&
2739             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2740                 goto nla_put_failure;
2741
2742         error = rt->dst.error;
2743         expires = rt->dst.expires;
2744         if (expires) {
2745                 if (time_before(jiffies, expires))
2746                         expires -= jiffies;
2747                 else
2748                         expires = 0;
2749         }
2750
2751         if (rt_is_input_route(rt)) {
2752 #ifdef CONFIG_IP_MROUTE
2753                 __be32 dst = rt->rt_dst;
2754
2755                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2756                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2757                         int err = ipmr_get_route(net, skb,
2758                                                  rt->rt_src, rt->rt_dst,
2759                                                  r, nowait);
2760                         if (err <= 0) {
2761                                 if (!nowait) {
2762                                         if (err == 0)
2763                                                 return 0;
2764                                         goto nla_put_failure;
2765                                 } else {
2766                                         if (err == -EMSGSIZE)
2767                                                 goto nla_put_failure;
2768                                         error = err;
2769                                 }
2770                         }
2771                 } else
2772 #endif
2773                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2774                                 goto nla_put_failure;
2775         }
2776
2777         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2778                 goto nla_put_failure;
2779
2780         return nlmsg_end(skb, nlh);
2781
2782 nla_put_failure:
2783         nlmsg_cancel(skb, nlh);
2784         return -EMSGSIZE;
2785 }
2786
2787 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2788 {
2789         struct net *net = sock_net(in_skb->sk);
2790         struct rtmsg *rtm;
2791         struct nlattr *tb[RTA_MAX+1];
2792         struct rtable *rt = NULL;
2793         __be32 dst = 0;
2794         __be32 src = 0;
2795         u32 iif;
2796         int err;
2797         int mark;
2798         struct sk_buff *skb;
2799
2800         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2801         if (err < 0)
2802                 goto errout;
2803
2804         rtm = nlmsg_data(nlh);
2805
2806         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2807         if (skb == NULL) {
2808                 err = -ENOBUFS;
2809                 goto errout;
2810         }
2811
2812         /* Reserve room for dummy headers; this skb can pass
2813            through a good chunk of the routing engine.
2814          */
2815         skb_reset_mac_header(skb);
2816         skb_reset_network_header(skb);
2817
2818         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2819         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2820         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2821
2822         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2823         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2824         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2825         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2826
2827         if (iif) {
2828                 struct net_device *dev;
2829
2830                 dev = __dev_get_by_index(net, iif);
2831                 if (dev == NULL) {
2832                         err = -ENODEV;
2833                         goto errout_free;
2834                 }
2835
2836                 skb->protocol   = htons(ETH_P_IP);
2837                 skb->dev        = dev;
2838                 skb->mark       = mark;
2839                 local_bh_disable();
2840                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2841                 local_bh_enable();
2842
2843                 rt = skb_rtable(skb);
2844                 if (err == 0 && rt->dst.error)
2845                         err = -rt->dst.error;
2846         } else {
2847                 struct flowi4 fl4 = {
2848                         .daddr = dst,
2849                         .saddr = src,
2850                         .flowi4_tos = rtm->rtm_tos,
2851                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2852                         .flowi4_mark = mark,
2853                 };
2854                 rt = ip_route_output_key(net, &fl4);
2855
2856                 err = 0;
2857                 if (IS_ERR(rt))
2858                         err = PTR_ERR(rt);
2859         }
2860
2861         if (err)
2862                 goto errout_free;
2863
2864         skb_dst_set(skb, &rt->dst);
2865         if (rtm->rtm_flags & RTM_F_NOTIFY)
2866                 rt->rt_flags |= RTCF_NOTIFY;
2867
2868         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2869                            RTM_NEWROUTE, 0, 0);
2870         if (err <= 0)
2871                 goto errout_free;
2872
2873         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2874 errout:
2875         return err;
2876
2877 errout_free:
2878         kfree_skb(skb);
2879         goto errout;
2880 }
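
/* inet_rtm_getroute() is what answers "ip route get". A minimal
 * userspace request is one RTM_GETROUTE message carrying an RTA_DST
 * attribute, roughly like this sketch (error handling elided, dst left
 * to be filled in, network byte order):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;
 *		__be32		dst;
 *	} req = {
 *		.nlh.nlmsg_len	 = sizeof(req),
 *		.nlh.nlmsg_type	 = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family	 = AF_INET,
 *		.rta.rta_type	 = RTA_DST,
 *		.rta.rta_len	 = RTA_LENGTH(sizeof(__be32)),
 *	};
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *	send(fd, &req, sizeof(req), 0);
 *	(the RTM_NEWROUTE reply received next is built by rt_fill_info())
 */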
2881
2882 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2883 {
2884         struct rtable *rt;
2885         int h, s_h;
2886         int idx, s_idx;
2887         struct net *net;
2888
2889         net = sock_net(skb->sk);
2890
2891         s_h = cb->args[0];
2892         if (s_h < 0)
2893                 s_h = 0;
2894         s_idx = idx = cb->args[1];
2895         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2896                 if (!rt_hash_table[h].chain)
2897                         continue;
2898                 rcu_read_lock_bh();
2899                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2900                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2901                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2902                                 continue;
2903                         if (rt_is_expired(rt))
2904                                 continue;
2905                         skb_dst_set_noref(skb, &rt->dst);
2906                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2907                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2908                                          1, NLM_F_MULTI) <= 0) {
2909                                 skb_dst_drop(skb);
2910                                 rcu_read_unlock_bh();
2911                                 goto done;
2912                         }
2913                         skb_dst_drop(skb);
2914                 }
2915                 rcu_read_unlock_bh();
2916         }
2917
2918 done:
2919         cb->args[0] = h;
2920         cb->args[1] = idx;
2921         return skb->len;
2922 }
2923
2924 void ip_rt_multicast_event(struct in_device *in_dev)
2925 {
2926         rt_cache_flush(dev_net(in_dev->dev), 0);
2927 }

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		struct ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
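
/*
 * Usage sketch for the write-only trigger above, via the table registered
 * below as net/ipv4/route:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * proc_dointvec() parses the written value into flush_delay, which is
 * passed to rt_cache_flush() as the delay; reads fail with -EINVAL.
 */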

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
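
/*
 * Each entry above exposes one ip_rt_* tunable under
 * /proc/sys/net/ipv4/route/ (registered for init_net by
 * ip_static_sysctl_init() at the bottom of this file), e.g.:
 *
 *	sysctl -w net.ipv4.route.gc_elasticity=4
 *	sysctl net.ipv4.route.max_size
 *
 * The *_jiffies handlers translate between seconds (milliseconds for
 * gc_min_interval_ms) and the jiffies values stored internally.
 */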

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
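
/*
 * "flush" intentionally has no .data pointer and mode 0200: it is a
 * write-only trigger rather than a stored value.  Its .extra1 is set to
 * the owning struct net in sysctl_route_net_init() below so that
 * ipv4_sysctl_rtcache_flush() knows which namespace's cache to flush.
 */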

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
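
/*
 * The init namespace uses the static ipv4_route_flush_table directly;
 * every other namespace gets its own kmemdup() copy so .extra1 can point
 * at that namespace.  The exit path frees only the copies, and the
 * BUG_ON() guards against ever freeing the static table.
 */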

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
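
/*
 * rt_genid is the per-namespace generation count that rt_is_expired()
 * compares against: bumping it invalidates every cached route in the
 * namespace at once, with no need to walk the hash table.  Seeding it
 * (and dev_addr_genid) from random bytes keeps the initial values
 * unpredictable across boots.
 */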

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
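
/*
 * Each namespace owns a private inet_peer_base, so long-lived per-peer
 * state (such as learned PMTU and redirect information) never crosses
 * namespace boundaries; teardown invalidates the whole tree before the
 * base itself is freed.
 */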

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &rhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
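
/*
 * Boot-time sizing example (illustrative): passing
 *
 *	rhash_entries=65536
 *
 * on the kernel command line pre-sizes the route cache hash; when it is
 * left unset, alloc_large_system_hash() below sizes the table from
 * available memory, capped at 512K entries.
 */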

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

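	/*
	 * Derive the GC thresholds from the table just allocated: GC
	 * pressure starts at one entry per bucket on average, and the
	 * cache is hard-capped at 16 entries per bucket.  For example,
	 * with rt_hash_mask + 1 == 32768 buckets this yields gc_thresh =
	 * 32768 and ip_rt_max_size = 524288.
	 */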
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif