ipv4: Rearrange arguments to ip_rt_redirect()
net/ipv4/route.c (linux-3.10.git)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU      0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136 static int rt_chain_length_max __read_mostly    = 20;
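
/* Note on the redirect tunables above: with these defaults,
 * ip_rt_redirect_silence equals ip_rt_redirect_load << (ip_rt_redirect_number + 1),
 * so the silence window comfortably exceeds the sum of all the exponential
 * backoff intervals used by ip_rt_send_redirect() below.
 */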
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         WARN_ON(1);
162         return NULL;
163 }
164
165 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
166                                            struct sk_buff *skb,
167                                            const void *daddr);
168
169 static struct dst_ops ipv4_dst_ops = {
170         .family =               AF_INET,
171         .protocol =             cpu_to_be16(ETH_P_IP),
172         .gc =                   rt_garbage_collect,
173         .check =                ipv4_dst_check,
174         .default_advmss =       ipv4_default_advmss,
175         .mtu =                  ipv4_mtu,
176         .cow_metrics =          ipv4_cow_metrics,
177         .destroy =              ipv4_dst_destroy,
178         .ifdown =               ipv4_dst_ifdown,
179         .negative_advice =      ipv4_negative_advice,
180         .link_failure =         ipv4_link_failure,
181         .update_pmtu =          ip_rt_update_pmtu,
182         .local_out =            __ip_local_out,
183         .neigh_lookup =         ipv4_neigh_lookup,
184 };
185
186 #define ECN_OR_COST(class)      TC_PRIO_##class
187
188 const __u8 ip_tos2prio[16] = {
189         TC_PRIO_BESTEFFORT,
190         ECN_OR_COST(BESTEFFORT),
191         TC_PRIO_BESTEFFORT,
192         ECN_OR_COST(BESTEFFORT),
193         TC_PRIO_BULK,
194         ECN_OR_COST(BULK),
195         TC_PRIO_BULK,
196         ECN_OR_COST(BULK),
197         TC_PRIO_INTERACTIVE,
198         ECN_OR_COST(INTERACTIVE),
199         TC_PRIO_INTERACTIVE,
200         ECN_OR_COST(INTERACTIVE),
201         TC_PRIO_INTERACTIVE_BULK,
202         ECN_OR_COST(INTERACTIVE_BULK),
203         TC_PRIO_INTERACTIVE_BULK,
204         ECN_OR_COST(INTERACTIVE_BULK)
205 };
206 EXPORT_SYMBOL(ip_tos2prio);
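
/* The table above is indexed by the four IPv4 TOS bits shifted right by
 * one (via rt_tos2priority() in include/net/route.h, at least in kernels
 * of this vintage), so even entries correspond to plain TOS values and
 * the ECN_OR_COST() entries to TOS values with the low "minimize cost" /
 * ECN bit set.
 */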
207
208 /*
209  * Route cache.
210  */
211
212 /* The locking scheme is rather straightforward:
213  *
214  * 1) Read-Copy Update protects the buckets of the central route hash.
215  * 2) Only writers remove entries, and they hold the lock
216  *    as they look at rtable reference counts.
217  * 3) Only readers acquire references to rtable entries,
218  *    they do so with atomic increments and with the
219  *    lock held.
220  */
221
222 struct rt_hash_bucket {
223         struct rtable __rcu     *chain;
224 };
225
226 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
227         defined(CONFIG_PROVE_LOCKING)
228 /*
229  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
230  * The size of this table is a power of two and depends on the number of CPUs.
231  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
232  */
233 #ifdef CONFIG_LOCKDEP
234 # define RT_HASH_LOCK_SZ        256
235 #else
236 # if NR_CPUS >= 32
237 #  define RT_HASH_LOCK_SZ       4096
238 # elif NR_CPUS >= 16
239 #  define RT_HASH_LOCK_SZ       2048
240 # elif NR_CPUS >= 8
241 #  define RT_HASH_LOCK_SZ       1024
242 # elif NR_CPUS >= 4
243 #  define RT_HASH_LOCK_SZ       512
244 # else
245 #  define RT_HASH_LOCK_SZ       256
246 # endif
247 #endif
248
249 static spinlock_t       *rt_hash_locks;
250 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
251
252 static __init void rt_hash_lock_init(void)
253 {
254         int i;
255
256         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
257                         GFP_KERNEL);
258         if (!rt_hash_locks)
259                 panic("IP: failed to allocate rt_hash_locks\n");
260
261         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
262                 spin_lock_init(&rt_hash_locks[i]);
263 }
264 #else
265 # define rt_hash_lock_addr(slot) NULL
266
267 static inline void rt_hash_lock_init(void)
268 {
269 }
270 #endif
271
272 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
273 static unsigned int             rt_hash_mask __read_mostly;
274 static unsigned int             rt_hash_log  __read_mostly;
275
276 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
277 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
278
279 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
280                                    int genid)
281 {
282         return jhash_3words((__force u32)daddr, (__force u32)saddr,
283                             idx, genid)
284                 & rt_hash_mask;
285 }
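
/* Both the flow keys and the per-netns generation count feed the hash,
 * so bumping rt_genid in rt_cache_invalidate() steers new lookups to
 * fresh buckets, while rt_is_expired() filters out the stale entries
 * that remain until they are garbage collected or flushed.
 */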
286
287 static inline int rt_genid(struct net *net)
288 {
289         return atomic_read(&net->ipv4.rt_genid);
290 }
291
292 #ifdef CONFIG_PROC_FS
293 struct rt_cache_iter_state {
294         struct seq_net_private p;
295         int bucket;
296         int genid;
297 };
298
299 static struct rtable *rt_cache_get_first(struct seq_file *seq)
300 {
301         struct rt_cache_iter_state *st = seq->private;
302         struct rtable *r = NULL;
303
304         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
305                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
306                         continue;
307                 rcu_read_lock_bh();
308                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
309                 while (r) {
310                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
311                             r->rt_genid == st->genid)
312                                 return r;
313                         r = rcu_dereference_bh(r->dst.rt_next);
314                 }
315                 rcu_read_unlock_bh();
316         }
317         return r;
318 }
319
320 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
321                                           struct rtable *r)
322 {
323         struct rt_cache_iter_state *st = seq->private;
324
325         r = rcu_dereference_bh(r->dst.rt_next);
326         while (!r) {
327                 rcu_read_unlock_bh();
328                 do {
329                         if (--st->bucket < 0)
330                                 return NULL;
331                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
332                 rcu_read_lock_bh();
333                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
334         }
335         return r;
336 }
337
338 static struct rtable *rt_cache_get_next(struct seq_file *seq,
339                                         struct rtable *r)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
343                 if (dev_net(r->dst.dev) != seq_file_net(seq))
344                         continue;
345                 if (r->rt_genid == st->genid)
346                         break;
347         }
348         return r;
349 }
350
351 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
352 {
353         struct rtable *r = rt_cache_get_first(seq);
354
355         if (r)
356                 while (pos && (r = rt_cache_get_next(seq, r)))
357                         --pos;
358         return pos ? NULL : r;
359 }
360
361 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
362 {
363         struct rt_cache_iter_state *st = seq->private;
364         if (*pos)
365                 return rt_cache_get_idx(seq, *pos - 1);
366         st->genid = rt_genid(seq_file_net(seq));
367         return SEQ_START_TOKEN;
368 }
369
370 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
371 {
372         struct rtable *r;
373
374         if (v == SEQ_START_TOKEN)
375                 r = rt_cache_get_first(seq);
376         else
377                 r = rt_cache_get_next(seq, v);
378         ++*pos;
379         return r;
380 }
381
382 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
383 {
384         if (v && v != SEQ_START_TOKEN)
385                 rcu_read_unlock_bh();
386 }
387
388 static int rt_cache_seq_show(struct seq_file *seq, void *v)
389 {
390         if (v == SEQ_START_TOKEN)
391                 seq_printf(seq, "%-127s\n",
392                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
393                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
394                            "HHUptod\tSpecDst");
395         else {
396                 struct rtable *r = v;
397                 int len;
398
399                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
400                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
401                            r->dst.dev ? r->dst.dev->name : "*",
402                            (__force u32)r->rt_dst,
403                            (__force u32)r->rt_gateway,
404                            r->rt_flags, atomic_read(&r->dst.__refcnt),
405                            r->dst.__use, 0, (__force u32)r->rt_src,
406                            dst_metric_advmss(&r->dst) + 40,
407                            dst_metric(&r->dst, RTAX_WINDOW), 0,
408                            r->rt_key_tos,
409                            -1, 0, 0, &len);
410
411                 seq_printf(seq, "%*s\n", 127 - len, "");
412         }
413         return 0;
414 }
415
416 static const struct seq_operations rt_cache_seq_ops = {
417         .start  = rt_cache_seq_start,
418         .next   = rt_cache_seq_next,
419         .stop   = rt_cache_seq_stop,
420         .show   = rt_cache_seq_show,
421 };
422
423 static int rt_cache_seq_open(struct inode *inode, struct file *file)
424 {
425         return seq_open_net(inode, file, &rt_cache_seq_ops,
426                         sizeof(struct rt_cache_iter_state));
427 }
428
429 static const struct file_operations rt_cache_seq_fops = {
430         .owner   = THIS_MODULE,
431         .open    = rt_cache_seq_open,
432         .read    = seq_read,
433         .llseek  = seq_lseek,
434         .release = seq_release_net,
435 };
436
437
438 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
439 {
440         int cpu;
441
442         if (*pos == 0)
443                 return SEQ_START_TOKEN;
444
445         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
446                 if (!cpu_possible(cpu))
447                         continue;
448                 *pos = cpu+1;
449                 return &per_cpu(rt_cache_stat, cpu);
450         }
451         return NULL;
452 }
453
454 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
455 {
456         int cpu;
457
458         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
459                 if (!cpu_possible(cpu))
460                         continue;
461                 *pos = cpu+1;
462                 return &per_cpu(rt_cache_stat, cpu);
463         }
464         return NULL;
465
466 }
467
468 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
469 {
470
471 }
472
473 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
474 {
475         struct rt_cache_stat *st = v;
476
477         if (v == SEQ_START_TOKEN) {
478                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
479                 return 0;
480         }
481
482         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
483                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
484                    dst_entries_get_slow(&ipv4_dst_ops),
485                    st->in_hit,
486                    st->in_slow_tot,
487                    st->in_slow_mc,
488                    st->in_no_route,
489                    st->in_brd,
490                    st->in_martian_dst,
491                    st->in_martian_src,
492
493                    st->out_hit,
494                    st->out_slow_tot,
495                    st->out_slow_mc,
496
497                    st->gc_total,
498                    st->gc_ignored,
499                    st->gc_goal_miss,
500                    st->gc_dst_overflow,
501                    st->in_hlist_search,
502                    st->out_hlist_search
503                 );
504         return 0;
505 }
506
507 static const struct seq_operations rt_cpu_seq_ops = {
508         .start  = rt_cpu_seq_start,
509         .next   = rt_cpu_seq_next,
510         .stop   = rt_cpu_seq_stop,
511         .show   = rt_cpu_seq_show,
512 };
513
514
515 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
516 {
517         return seq_open(file, &rt_cpu_seq_ops);
518 }
519
520 static const struct file_operations rt_cpu_seq_fops = {
521         .owner   = THIS_MODULE,
522         .open    = rt_cpu_seq_open,
523         .read    = seq_read,
524         .llseek  = seq_lseek,
525         .release = seq_release,
526 };
527
528 #ifdef CONFIG_IP_ROUTE_CLASSID
529 static int rt_acct_proc_show(struct seq_file *m, void *v)
530 {
531         struct ip_rt_acct *dst, *src;
532         unsigned int i, j;
533
534         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
535         if (!dst)
536                 return -ENOMEM;
537
538         for_each_possible_cpu(i) {
539                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
540                 for (j = 0; j < 256; j++) {
541                         dst[j].o_bytes   += src[j].o_bytes;
542                         dst[j].o_packets += src[j].o_packets;
543                         dst[j].i_bytes   += src[j].i_bytes;
544                         dst[j].i_packets += src[j].i_packets;
545                 }
546         }
547
548         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
549         kfree(dst);
550         return 0;
551 }
552
553 static int rt_acct_proc_open(struct inode *inode, struct file *file)
554 {
555         return single_open(file, rt_acct_proc_show, NULL);
556 }
557
558 static const struct file_operations rt_acct_proc_fops = {
559         .owner          = THIS_MODULE,
560         .open           = rt_acct_proc_open,
561         .read           = seq_read,
562         .llseek         = seq_lseek,
563         .release        = single_release,
564 };
565 #endif
566
567 static int __net_init ip_rt_do_proc_init(struct net *net)
568 {
569         struct proc_dir_entry *pde;
570
571         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
572                         &rt_cache_seq_fops);
573         if (!pde)
574                 goto err1;
575
576         pde = proc_create("rt_cache", S_IRUGO,
577                           net->proc_net_stat, &rt_cpu_seq_fops);
578         if (!pde)
579                 goto err2;
580
581 #ifdef CONFIG_IP_ROUTE_CLASSID
582         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
583         if (!pde)
584                 goto err3;
585 #endif
586         return 0;
587
588 #ifdef CONFIG_IP_ROUTE_CLASSID
589 err3:
590         remove_proc_entry("rt_cache", net->proc_net_stat);
591 #endif
592 err2:
593         remove_proc_entry("rt_cache", net->proc_net);
594 err1:
595         return -ENOMEM;
596 }
597
598 static void __net_exit ip_rt_do_proc_exit(struct net *net)
599 {
600         remove_proc_entry("rt_cache", net->proc_net_stat);
601         remove_proc_entry("rt_cache", net->proc_net);
602 #ifdef CONFIG_IP_ROUTE_CLASSID
603         remove_proc_entry("rt_acct", net->proc_net);
604 #endif
605 }
606
607 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
608         .init = ip_rt_do_proc_init,
609         .exit = ip_rt_do_proc_exit,
610 };
611
612 static int __init ip_rt_proc_init(void)
613 {
614         return register_pernet_subsys(&ip_rt_proc_ops);
615 }
616
617 #else
618 static inline int ip_rt_proc_init(void)
619 {
620         return 0;
621 }
622 #endif /* CONFIG_PROC_FS */
623
624 static inline void rt_free(struct rtable *rt)
625 {
626         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
627 }
628
629 static inline void rt_drop(struct rtable *rt)
630 {
631         ip_rt_put(rt);
632         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
633 }
634
635 static inline int rt_fast_clean(struct rtable *rth)
636 {
637         /* Kill broadcast/multicast entries very aggressively, if they
638            collide in the hash table with more useful entries */
639         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
640                 rt_is_input_route(rth) && rth->dst.rt_next;
641 }
642
643 static inline int rt_valuable(struct rtable *rth)
644 {
645         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
646                 rth->dst.expires;
647 }
648
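/* An unreferenced entry may be reclaimed once it is older than tmo1
 * (tmo2 for "valuable" entries); colliding broadcast/multicast entries
 * (rt_fast_clean) are not granted the tmo1 grace at all.  Callers halve
 * tmo1 as they walk a chain, so entries deep in long chains age out
 * sooner.
 */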
649 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
650 {
651         unsigned long age;
652         int ret = 0;
653
654         if (atomic_read(&rth->dst.__refcnt))
655                 goto out;
656
657         age = jiffies - rth->dst.lastuse;
658         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
659             (age <= tmo2 && rt_valuable(rth)))
660                 goto out;
661         ret = 1;
662 out:    return ret;
663 }
664
665 /* Bits of score are:
666  * 31: very valuable
667  * 30: not quite useless
668  * 29..0: usage counter
669  */
670 static inline u32 rt_score(struct rtable *rt)
671 {
672         u32 score = jiffies - rt->dst.lastuse;
673
674         score = ~score & ~(3<<30);
675
676         if (rt_valuable(rt))
677                 score |= (1<<31);
678
679         if (rt_is_output_route(rt) ||
680             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
681                 score |= (1<<30);
682
683         return score;
684 }
685
686 static inline bool rt_caching(const struct net *net)
687 {
688         return net->ipv4.current_rt_cache_rebuild_count <=
689                 net->ipv4.sysctl_rt_cache_rebuild_count;
690 }
691
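/* compare_hash_inputs() and compare_keys() below fold all of their
 * field comparisons into a single branch: each pair of fields is XORed
 * and the results ORed together, so the expression is zero (a match)
 * only when every field is equal.
 */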
692 static inline bool compare_hash_inputs(const struct rtable *rt1,
693                                        const struct rtable *rt2)
694 {
695         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
696                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
697                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
698 }
699
700 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
701 {
702         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
703                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
704                 (rt1->rt_mark ^ rt2->rt_mark) |
705                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
706                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
707                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
708 }
709
710 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
711 {
712         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
713 }
714
715 static inline int rt_is_expired(struct rtable *rth)
716 {
717         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
718 }
719
720 /*
721  * Perform a full scan of the hash table and free all entries.
722  * Can be called by a softirq or a process.
723  * In the latter case, we want to be rescheduled if necessary.
724  */
725 static void rt_do_flush(struct net *net, int process_context)
726 {
727         unsigned int i;
728         struct rtable *rth, *next;
729
730         for (i = 0; i <= rt_hash_mask; i++) {
731                 struct rtable __rcu **pprev;
732                 struct rtable *list;
733
734                 if (process_context && need_resched())
735                         cond_resched();
736                 rth = rcu_access_pointer(rt_hash_table[i].chain);
737                 if (!rth)
738                         continue;
739
740                 spin_lock_bh(rt_hash_lock_addr(i));
741
742                 list = NULL;
743                 pprev = &rt_hash_table[i].chain;
744                 rth = rcu_dereference_protected(*pprev,
745                         lockdep_is_held(rt_hash_lock_addr(i)));
746
747                 while (rth) {
748                         next = rcu_dereference_protected(rth->dst.rt_next,
749                                 lockdep_is_held(rt_hash_lock_addr(i)));
750
751                         if (!net ||
752                             net_eq(dev_net(rth->dst.dev), net)) {
753                                 rcu_assign_pointer(*pprev, next);
754                                 rcu_assign_pointer(rth->dst.rt_next, list);
755                                 list = rth;
756                         } else {
757                                 pprev = &rth->dst.rt_next;
758                         }
759                         rth = next;
760                 }
761
762                 spin_unlock_bh(rt_hash_lock_addr(i));
763
764                 for (; list; list = next) {
765                         next = rcu_dereference_protected(list->dst.rt_next, 1);
766                         rt_free(list);
767                 }
768         }
769 }
770
771 /*
772  * While freeing expired entries, we compute the average chain length
773  * and standard deviation, using fixed-point arithmetic.
774  * This is to have an estimation of rt_chain_length_max:
775  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
776  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
777  */
778
779 #define FRACT_BITS 3
780 #define ONE (1UL << FRACT_BITS)
781
782 /*
783  * Given a hash chain and an item in this hash chain,
784  * find whether a previous entry has the same hash_inputs
785  * (but differs on tos, mark or oif).
786  * Returns 0 if an alias is found.
787  * Returns ONE if rth has no alias before itself.
788  */
789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
790 {
791         const struct rtable *aux = head;
792
793         while (aux != rth) {
794                 if (compare_hash_inputs(aux, rth))
795                         return 0;
796                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
797         }
798         return ONE;
799 }
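
/* Worked example of the fixed-point chain accounting: with FRACT_BITS == 3,
 * ONE == 8, so a bucket holding five entries with distinct hash inputs
 * accumulates a "length" of 40.  rt_check_expire() and slow_chain_length()
 * shift the averaged value back down by FRACT_BITS to recover the plain
 * chain length.
 */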
800
801 static void rt_check_expire(void)
802 {
803         static unsigned int rover;
804         unsigned int i = rover, goal;
805         struct rtable *rth;
806         struct rtable __rcu **rthp;
807         unsigned long samples = 0;
808         unsigned long sum = 0, sum2 = 0;
809         unsigned long delta;
810         u64 mult;
811
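        /* Scan a number of buckets proportional to the time elapsed since
         * the previous run: goal = delta * nr_buckets / ip_rt_gc_timeout,
         * capped at the table size, so the whole table is walked roughly
         * once every ip_rt_gc_timeout jiffies.
         */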
812         delta = jiffies - expires_ljiffies;
813         expires_ljiffies = jiffies;
814         mult = ((u64)delta) << rt_hash_log;
815         if (ip_rt_gc_timeout > 1)
816                 do_div(mult, ip_rt_gc_timeout);
817         goal = (unsigned int)mult;
818         if (goal > rt_hash_mask)
819                 goal = rt_hash_mask + 1;
820         for (; goal > 0; goal--) {
821                 unsigned long tmo = ip_rt_gc_timeout;
822                 unsigned long length;
823
824                 i = (i + 1) & rt_hash_mask;
825                 rthp = &rt_hash_table[i].chain;
826
827                 if (need_resched())
828                         cond_resched();
829
830                 samples++;
831
832                 if (rcu_dereference_raw(*rthp) == NULL)
833                         continue;
834                 length = 0;
835                 spin_lock_bh(rt_hash_lock_addr(i));
836                 while ((rth = rcu_dereference_protected(*rthp,
837                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
838                         prefetch(rth->dst.rt_next);
839                         if (rt_is_expired(rth) ||
840                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
841                                 *rthp = rth->dst.rt_next;
842                                 rt_free(rth);
843                                 continue;
844                         }
845
846                         /* We only count entries on a chain with equal
847                          * hash inputs once, so that entries for
848                          * different QOS levels and other non-hash
849                          * input attributes don't unfairly skew the
850                          * length computation.
851                          */
852                         tmo >>= 1;
853                         rthp = &rth->dst.rt_next;
854                         length += has_noalias(rt_hash_table[i].chain, rth);
855                 }
856                 spin_unlock_bh(rt_hash_lock_addr(i));
857                 sum += length;
858                 sum2 += length*length;
859         }
860         if (samples) {
861                 unsigned long avg = sum / samples;
862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863                 rt_chain_length_max = max_t(unsigned long,
864                                         ip_rt_gc_elasticity,
865                                         (avg + 4*sd) >> FRACT_BITS);
866         }
867         rover = i;
868 }
869
870 /*
871  * rt_worker_func() is run in process context.
872  * We call rt_check_expire() to scan part of the hash table.
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876         rt_check_expire();
877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without reusing a recent rt_genid.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888         unsigned char shuffle;
889
890         get_random_bytes(&shuffle, sizeof(shuffle));
891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
893
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900         rt_cache_invalidate(net);
901         if (delay >= 0)
902                 rt_do_flush(net, !in_softirq());
903 }
904
905 /* Flush previously invalidated entries from the cache */
906 void rt_cache_flush_batch(struct net *net)
907 {
908         rt_do_flush(net, !in_softirq());
909 }
910
911 static void rt_emergency_hash_rebuild(struct net *net)
912 {
913         net_warn_ratelimited("Route hash chain too long!\n");
914         rt_cache_invalidate(net);
915 }
916
917 /*
918    Short description of GC goals.
919
920    We want to build an algorithm which keeps the routing cache
921    at some equilibrium point, where the number of aged-off entries
922    stays approximately equal to the number of newly generated ones.
923
924    The current expiration strength is the variable "expire".
925    We try to adjust it dynamically, so that when the network is idle,
926    expire is large enough to keep enough warm entries, and when load
927    increases it shrinks to limit the cache size.
928  */
929
930 static int rt_garbage_collect(struct dst_ops *ops)
931 {
932         static unsigned long expire = RT_GC_TIMEOUT;
933         static unsigned long last_gc;
934         static int rover;
935         static int equilibrium;
936         struct rtable *rth;
937         struct rtable __rcu **rthp;
938         unsigned long now = jiffies;
939         int goal;
940         int entries = dst_entries_get_fast(&ipv4_dst_ops);
941
942         /*
943          * Garbage collection is pretty expensive,
944          * do not make it too frequently.
945          */
946
947         RT_CACHE_STAT_INC(gc_total);
948
949         if (now - last_gc < ip_rt_gc_min_interval &&
950             entries < ip_rt_max_size) {
951                 RT_CACHE_STAT_INC(gc_ignored);
952                 goto out;
953         }
954
955         entries = dst_entries_get_slow(&ipv4_dst_ops);
956         /* Calculate the number of entries which we want to expire now. */
957         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
958         if (goal <= 0) {
959                 if (equilibrium < ipv4_dst_ops.gc_thresh)
960                         equilibrium = ipv4_dst_ops.gc_thresh;
961                 goal = entries - equilibrium;
962                 if (goal > 0) {
963                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
964                         goal = entries - equilibrium;
965                 }
966         } else {
967                 /* We are in a dangerous area. Try to reduce the cache really
968                  * aggressively.
969                  */
970                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971                 equilibrium = entries - goal;
972         }
973
974         if (now - last_gc >= ip_rt_gc_min_interval)
975                 last_gc = now;
976
977         if (goal <= 0) {
978                 equilibrium += goal;
979                 goto work_done;
980         }
981
982         do {
983                 int i, k;
984
985                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
986                         unsigned long tmo = expire;
987
988                         k = (k + 1) & rt_hash_mask;
989                         rthp = &rt_hash_table[k].chain;
990                         spin_lock_bh(rt_hash_lock_addr(k));
991                         while ((rth = rcu_dereference_protected(*rthp,
992                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
993                                 if (!rt_is_expired(rth) &&
994                                         !rt_may_expire(rth, tmo, expire)) {
995                                         tmo >>= 1;
996                                         rthp = &rth->dst.rt_next;
997                                         continue;
998                                 }
999                                 *rthp = rth->dst.rt_next;
1000                                 rt_free(rth);
1001                                 goal--;
1002                         }
1003                         spin_unlock_bh(rt_hash_lock_addr(k));
1004                         if (goal <= 0)
1005                                 break;
1006                 }
1007                 rover = k;
1008
1009                 if (goal <= 0)
1010                         goto work_done;
1011
1012                 /* Goal is not achieved. We stop the process if:
1013
1014                    - expire is reduced to zero; otherwise, expire is halved.
1015                    - the table is not full.
1016                    - we are called from interrupt context.
1017                    - the jiffies check is just a fallback/debug loop breaker;
1018                      we will not spin here for a long time in any case.
1019                  */
1020
1021                 RT_CACHE_STAT_INC(gc_goal_miss);
1022
1023                 if (expire == 0)
1024                         break;
1025
1026                 expire >>= 1;
1027
1028                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1029                         goto out;
1030         } while (!in_softirq() && time_before_eq(jiffies, now));
1031
1032         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                 goto out;
1034         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1035                 goto out;
1036         net_warn_ratelimited("dst cache overflow\n");
1037         RT_CACHE_STAT_INC(gc_dst_overflow);
1038         return 1;
1039
1040 work_done:
1041         expire += ip_rt_gc_min_interval;
1042         if (expire > ip_rt_gc_timeout ||
1043             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1044             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1045                 expire = ip_rt_gc_timeout;
1046 out:    return 0;
1047 }
1048
1049 /*
1050  * Returns the number of entries in a hash chain that have different hash_inputs.
1051  */
1052 static int slow_chain_length(const struct rtable *head)
1053 {
1054         int length = 0;
1055         const struct rtable *rth = head;
1056
1057         while (rth) {
1058                 length += has_noalias(head, rth);
1059                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1060         }
1061         return length >> FRACT_BITS;
1062 }
1063
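/* For gatewayed routes the neighbour entry is keyed by the gateway
 * address rather than the final destination; direct routes fall back to
 * the packet's destination address, or to the daddr argument when no
 * skb is supplied.
 */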
1064 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1065                                            struct sk_buff *skb,
1066                                            const void *daddr)
1067 {
1068         struct net_device *dev = dst->dev;
1069         const __be32 *pkey = daddr;
1070         const struct rtable *rt;
1071         struct neighbour *n;
1072
1073         rt = (const struct rtable *) dst;
1074         if (rt->rt_gateway)
1075                 pkey = (const __be32 *) &rt->rt_gateway;
1076         else if (skb)
1077                 pkey = &ip_hdr(skb)->daddr;
1078
1079         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1080         if (n)
1081                 return n;
1082         return neigh_create(&arp_tbl, pkey, dev);
1083 }
1084
1085 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1086                                      struct sk_buff *skb, int ifindex)
1087 {
1088         struct rtable   *rth, *cand;
1089         struct rtable __rcu **rthp, **candp;
1090         unsigned long   now;
1091         u32             min_score;
1092         int             chain_length;
1093
1094 restart:
1095         chain_length = 0;
1096         min_score = ~(u32)0;
1097         cand = NULL;
1098         candp = NULL;
1099         now = jiffies;
1100
1101         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1102                 /*
1103                  * If we're not caching, just tell the caller we
1104                  * caller holds the sole reference to the cache entry, and
1105                  * caller hold the sole reference to the cache entry, and
1106                  * it will be released when the caller is done with it.
1107                  * when we're not caching.  Instead, just hand rt back, so
1108                  * the caller gets a single use out of the route.
1109                  * the caller gets a single use out of the route
1110                  * Note that we do rt_free on this new route entry, so that
1111                  * once its refcount hits zero, we are still able to reap it
1112                  * (Thanks Alexey)
1113                  * Note: To avoid expensive rcu stuff for this uncached dst,
1114                  * we set DST_NOCACHE so that dst_release() can free dst without
1115                  * waiting for a grace period.
1116                  */
1117
1118                 rt->dst.flags |= DST_NOCACHE;
1119                 goto skip_hashing;
1120         }
1121
1122         rthp = &rt_hash_table[hash].chain;
1123
1124         spin_lock_bh(rt_hash_lock_addr(hash));
1125         while ((rth = rcu_dereference_protected(*rthp,
1126                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1127                 if (rt_is_expired(rth)) {
1128                         *rthp = rth->dst.rt_next;
1129                         rt_free(rth);
1130                         continue;
1131                 }
1132                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1133                         /* Put it first */
1134                         *rthp = rth->dst.rt_next;
1135                         /*
1136                          * Since lookup is lockfree, the deletion
1137                          * must be visible to another weakly ordered CPU before
1138                          * the insertion at the start of the hash chain.
1139                          */
1140                         rcu_assign_pointer(rth->dst.rt_next,
1141                                            rt_hash_table[hash].chain);
1142                         /*
1143                          * Since lookup is lockfree, the update writes
1144                          * must be ordered for consistency on SMP.
1145                          */
1146                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147
1148                         dst_use(&rth->dst, now);
1149                         spin_unlock_bh(rt_hash_lock_addr(hash));
1150
1151                         rt_drop(rt);
1152                         if (skb)
1153                                 skb_dst_set(skb, &rth->dst);
1154                         return rth;
1155                 }
1156
1157                 if (!atomic_read(&rth->dst.__refcnt)) {
1158                         u32 score = rt_score(rth);
1159
1160                         if (score <= min_score) {
1161                                 cand = rth;
1162                                 candp = rthp;
1163                                 min_score = score;
1164                         }
1165                 }
1166
1167                 chain_length++;
1168
1169                 rthp = &rth->dst.rt_next;
1170         }
1171
1172         if (cand) {
1173                 /* ip_rt_gc_elasticity used to be the average chain
1174                  * length; when it is exceeded, gc becomes really aggressive.
1175                  *
1176                  * The second limit is less certain. At the moment it allows
1177                  * only 2 entries per bucket. We will see.
1178                  */
1179                 if (chain_length > ip_rt_gc_elasticity) {
1180                         *candp = cand->dst.rt_next;
1181                         rt_free(cand);
1182                 }
1183         } else {
1184                 if (chain_length > rt_chain_length_max &&
1185                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1186                         struct net *net = dev_net(rt->dst.dev);
1187                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1188                         if (!rt_caching(net)) {
1189                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1190                                         rt->dst.dev->name, num);
1191                         }
1192                         rt_emergency_hash_rebuild(net);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1196                                         ifindex, rt_genid(net));
1197                         goto restart;
1198                 }
1199         }
1200
1201         rt->dst.rt_next = rt_hash_table[hash].chain;
1202
1203         /*
1204          * Since lookup is lockfree, we must make sure
1205          * previous writes to rt are committed to memory
1206          * before making rt visible to other CPUS.
1207          */
1208         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1209
1210         spin_unlock_bh(rt_hash_lock_addr(hash));
1211
1212 skip_hashing:
1213         if (skb)
1214                 skb_dst_set(skb, &rt->dst);
1215         return rt;
1216 }
1217
1218 /*
1219  * Peer allocation may fail only in serious out-of-memory conditions.  However
1220  * we can still generate some output.
1221  * Random ID selection looks a bit dangerous because we have no chance of
1222  * selecting an ID that is unique over a reasonable period of time.
1223  * But a broken packet identifier may be better than no packet at all.
1224  */
1225 static void ip_select_fb_ident(struct iphdr *iph)
1226 {
1227         static DEFINE_SPINLOCK(ip_fb_id_lock);
1228         static u32 ip_fallback_id;
1229         u32 salt;
1230
1231         spin_lock_bh(&ip_fb_id_lock);
1232         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1233         iph->id = htons(salt & 0xFFFF);
1234         ip_fallback_id = salt;
1235         spin_unlock_bh(&ip_fb_id_lock);
1236 }
1237
1238 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1239 {
1240         struct net *net = dev_net(dst->dev);
1241         struct inet_peer *peer;
1242
1243         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1244         if (peer) {
1245                 iph->id = htons(inet_getid(peer, more));
1246                 inet_putpeer(peer);
1247                 return;
1248         }
1249
1250         ip_select_fb_ident(iph);
1251 }
1252 EXPORT_SYMBOL(__ip_select_ident);
1253
1254 static void rt_del(unsigned int hash, struct rtable *rt)
1255 {
1256         struct rtable __rcu **rthp;
1257         struct rtable *aux;
1258
1259         rthp = &rt_hash_table[hash].chain;
1260         spin_lock_bh(rt_hash_lock_addr(hash));
1261         ip_rt_put(rt);
1262         while ((aux = rcu_dereference_protected(*rthp,
1263                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1264                 if (aux == rt || rt_is_expired(aux)) {
1265                         *rthp = aux->dst.rt_next;
1266                         rt_free(aux);
1267                         continue;
1268                 }
1269                 rthp = &aux->dst.rt_next;
1270         }
1271         spin_unlock_bh(rt_hash_lock_addr(hash));
1272 }
1273
1274 static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
1275 {
1276         struct neighbour *n;
1277
1278         if (rt->rt_gateway != old_gw)
1279                 return;
1280
1281         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1282         if (n) {
1283                 if (!(n->nud_state & NUD_VALID)) {
1284                         neigh_event_send(n, NULL);
1285                 } else {
1286                         rt->rt_gateway = new_gw;
1287                         rt->rt_flags |= RTCF_REDIRECTED;
1288                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1289                 }
1290                 neigh_release(n);
1291         }
1292 }
1293
1294 /* called in rcu_read_lock() section */
1295 void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
1296 {
1297         const struct iphdr *iph = (const struct iphdr *) skb->data;
1298         __be32 old_gw = ip_hdr(skb)->saddr;
1299         __be32 daddr = iph->daddr;
1300         __be32 saddr = iph->saddr;
1301         struct net_device *dev = skb->dev;
1302         struct in_device *in_dev = __in_dev_get_rcu(dev);
1303         int    ikeys[2] = { dev->ifindex, 0 };
1304         __be32 skeys[2] = { saddr, 0 };
1305         struct net *net;
1306         int s, i;
1307
1308         if (!in_dev)
1309                 return;
1310
1311         switch (icmp_hdr(skb)->code & 7) {
1312         case ICMP_REDIR_NET:
1313         case ICMP_REDIR_NETTOS:
1314         case ICMP_REDIR_HOST:
1315         case ICMP_REDIR_HOSTTOS:
1316                 break;
1317
1318         default:
1319                 return;
1320         }
1321
1322         net = dev_net(dev);
1323         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1324             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1325             ipv4_is_zeronet(new_gw))
1326                 goto reject_redirect;
1327
1328         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1329                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1330                         goto reject_redirect;
1331                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1332                         goto reject_redirect;
1333         } else {
1334                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1335                         goto reject_redirect;
1336         }
1337
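        /* Walk the cache for both the specific and the wildcard (zero)
         * source key and output interface, since matching entries may
         * have been hashed under either form of the flow key.
         */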
1338         for (s = 0; s < 2; s++) {
1339                 for (i = 0; i < 2; i++) {
1340                         unsigned int hash;
1341                         struct rtable __rcu **rthp;
1342                         struct rtable *rt;
1343
1344                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1345
1346                         rthp = &rt_hash_table[hash].chain;
1347
1348                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1349                                 rthp = &rt->dst.rt_next;
1350
1351                                 if (rt->rt_key_dst != daddr ||
1352                                     rt->rt_key_src != skeys[s] ||
1353                                     rt->rt_oif != ikeys[i] ||
1354                                     rt_is_input_route(rt) ||
1355                                     rt_is_expired(rt) ||
1356                                     !net_eq(dev_net(rt->dst.dev), net) ||
1357                                     rt->dst.error ||
1358                                     rt->dst.dev != dev)
1359                                         continue;
1360
1361                                 ip_do_redirect(rt, old_gw, new_gw);
1362                         }
1363                 }
1364         }
1365         return;
1366
1367 reject_redirect:
1368 #ifdef CONFIG_IP_ROUTE_VERBOSE
1369         if (IN_DEV_LOG_MARTIANS(in_dev))
1370                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1371                                      "  Advised path = %pI4 -> %pI4\n",
1372                                      &old_gw, dev->name, &new_gw,
1373                                      &saddr, &daddr);
1374 #endif
1375         ;
1376 }
1377
1378 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1379 {
1380         struct rtable *rt = (struct rtable *)dst;
1381         struct dst_entry *ret = dst;
1382
1383         if (rt) {
1384                 if (dst->obsolete > 0) {
1385                         ip_rt_put(rt);
1386                         ret = NULL;
1387                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1388                            rt->dst.expires) {
1389                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1390                                                 rt->rt_oif,
1391                                                 rt_genid(dev_net(dst->dev)));
1392                         rt_del(hash, rt);
1393                         ret = NULL;
1394                 }
1395         }
1396         return ret;
1397 }
1398
1399 /*
1400  * Algorithm:
1401  *      1. The first ip_rt_redirect_number redirects are sent
1402  *         with exponential backoff, then we stop sending them at all,
1403  *         assuming that the host ignores our redirects.
1404  *      2. If we did not see packets requiring redirects
1405  *         during ip_rt_redirect_silence, we assume that the host
1406  *         forgot the redirected route and start sending redirects again.
1407  *
1408  * This algorithm is much cheaper and more intelligent than dumb load limiting
1409  * in icmp.c.
1410  *
1411  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1412  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1413  */
1414
1415 void ip_rt_send_redirect(struct sk_buff *skb)
1416 {
1417         struct rtable *rt = skb_rtable(skb);
1418         struct in_device *in_dev;
1419         struct inet_peer *peer;
1420         struct net *net;
1421         int log_martians;
1422
1423         rcu_read_lock();
1424         in_dev = __in_dev_get_rcu(rt->dst.dev);
1425         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1426                 rcu_read_unlock();
1427                 return;
1428         }
1429         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1430         rcu_read_unlock();
1431
1432         net = dev_net(rt->dst.dev);
1433         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1434         if (!peer) {
1435                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1436                 return;
1437         }
1438
1439         /* No redirected packets during ip_rt_redirect_silence;
1440          * reset the algorithm.
1441          */
1442         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1443                 peer->rate_tokens = 0;
1444
1445         /* Too many ignored redirects; do not send anything.
1446          * Set peer->rate_last to the time of the last seen redirected packet.
1447          */
1448         if (peer->rate_tokens >= ip_rt_redirect_number) {
1449                 peer->rate_last = jiffies;
1450                 goto out_put_peer;
1451         }
1452
1453         /* Check for load limit; set rate_last to the latest sent
1454          * redirect.
1455          */
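        /* Exponential backoff: after k redirects have been sent, the
         * next one is allowed only once ip_rt_redirect_load << k jiffies
         * have elapsed since rate_last.
         */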
1456         if (peer->rate_tokens == 0 ||
1457             time_after(jiffies,
1458                        (peer->rate_last +
1459                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1460                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1461                 peer->rate_last = jiffies;
1462                 ++peer->rate_tokens;
1463 #ifdef CONFIG_IP_ROUTE_VERBOSE
1464                 if (log_martians &&
1465                     peer->rate_tokens == ip_rt_redirect_number)
1466                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1467                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1468                                              &rt->rt_dst, &rt->rt_gateway);
1469 #endif
1470         }
1471 out_put_peer:
1472         inet_putpeer(peer);
1473 }
1474
1475 static int ip_error(struct sk_buff *skb)
1476 {
1477         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1478         struct rtable *rt = skb_rtable(skb);
1479         struct inet_peer *peer;
1480         unsigned long now;
1481         struct net *net;
1482         bool send;
1483         int code;
1484
1485         net = dev_net(rt->dst.dev);
1486         if (!IN_DEV_FORWARD(in_dev)) {
1487                 switch (rt->dst.error) {
1488                 case EHOSTUNREACH:
1489                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1490                         break;
1491
1492                 case ENETUNREACH:
1493                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1494                         break;
1495                 }
1496                 goto out;
1497         }
1498
1499         switch (rt->dst.error) {
1500         case EINVAL:
1501         default:
1502                 goto out;
1503         case EHOSTUNREACH:
1504                 code = ICMP_HOST_UNREACH;
1505                 break;
1506         case ENETUNREACH:
1507                 code = ICMP_NET_UNREACH;
1508                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1509                 break;
1510         case EACCES:
1511                 code = ICMP_PKT_FILTERED;
1512                 break;
1513         }
1514
1515         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1516
1517         send = true;
1518         if (peer) {
1519                 now = jiffies;
1520                 peer->rate_tokens += now - peer->rate_last;
1521                 if (peer->rate_tokens > ip_rt_error_burst)
1522                         peer->rate_tokens = ip_rt_error_burst;
1523                 peer->rate_last = now;
1524                 if (peer->rate_tokens >= ip_rt_error_cost)
1525                         peer->rate_tokens -= ip_rt_error_cost;
1526                 else
1527                         send = false;
1528                 inet_putpeer(peer);
1529         }
1530         if (send)
1531                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1532
1533 out:    kfree_skb(skb);
1534         return 0;
1535 }
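
/*
 * Editor's note (illustration only; never compiled): the rate limiter in
 * ip_error() above is a plain token bucket.  Tokens accrue at one per jiffy
 * up to ip_rt_error_burst, and each ICMP error costs ip_rt_error_cost
 * tokens.  The helper name below is hypothetical.
 */
#if 0
static bool icmp_error_allowed(struct inet_peer *peer, unsigned long now)
{
	peer->rate_tokens += now - peer->rate_last;	/* refill: one token per jiffy */
	if (peer->rate_tokens > ip_rt_error_burst)
		peer->rate_tokens = ip_rt_error_burst;	/* cap the burst size */
	peer->rate_last = now;

	if (peer->rate_tokens < ip_rt_error_cost)
		return false;				/* bucket too low; suppress */

	peer->rate_tokens -= ip_rt_error_cost;		/* pay for this ICMP error */
	return true;
}
#endif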
1536
1537 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1538 {
1539         struct rtable *rt = (struct rtable *) dst;
1540
1541         dst_confirm(dst);
1542
1543         if (mtu < ip_rt_min_pmtu)
1544                 mtu = ip_rt_min_pmtu;
1545
1546         rt->rt_pmtu = mtu;
1547         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1548 }
1549
1550 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1551                       int oif, u32 mark, u8 protocol, int flow_flags)
1552 {
1553         const struct iphdr *iph = (const struct iphdr *)skb->data;
1554         struct flowi4 fl4;
1555         struct rtable *rt;
1556
1557         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1558                            protocol, flow_flags,
1559                            iph->daddr, iph->saddr, 0, 0);
1560         rt = __ip_route_output_key(net, &fl4);
1561         if (!IS_ERR(rt)) {
1562                 ip_rt_update_pmtu(&rt->dst, mtu);
1563                 ip_rt_put(rt);
1564         }
1565 }
1566 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1567
1568 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1569 {
1570         const struct inet_sock *inet = inet_sk(sk);
1571
1572         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1573                                 sk->sk_bound_dev_if, sk->sk_mark,
1574                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1575                                 inet_sk_flowi_flags(sk));
1576 }
1577 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
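
/*
 * Editor's sketch (not in the original file; never compiled): how a caller
 * that has learned a smaller next-hop MTU for a destination - typically
 * from an ICMP "fragmentation needed" message - might report it through the
 * helpers above.  The function name and the IPPROTO_UDP fallback are
 * assumptions for illustration; real callers live in the protocol handlers.
 */
#if 0
static void report_smaller_mtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	if (sk)
		ipv4_sk_update_pmtu(skb, sk, mtu);
	else
		ipv4_update_pmtu(skb, dev_net(skb->dev), mtu,
				 0, 0, IPPROTO_UDP, 0);
}
#endif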
1578
1579 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1580 {
1581         struct rtable *rt = (struct rtable *) dst;
1582
1583         if (rt_is_expired(rt))
1584                 return NULL;
1585         return dst;
1586 }
1587
1588 static void ipv4_dst_destroy(struct dst_entry *dst)
1589 {
1590         struct rtable *rt = (struct rtable *) dst;
1591
1592         if (rt->fi) {
1593                 fib_info_put(rt->fi);
1594                 rt->fi = NULL;
1595         }
1596 }
1597
1598
1599 static void ipv4_link_failure(struct sk_buff *skb)
1600 {
1601         struct rtable *rt;
1602
1603         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1604
1605         rt = skb_rtable(skb);
1606         if (rt)
1607                 dst_set_expires(&rt->dst, 0);
1608 }
1609
1610 static int ip_rt_bug(struct sk_buff *skb)
1611 {
1612         pr_debug("%s: %pI4 -> %pI4, %s\n",
1613                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1614                  skb->dev ? skb->dev->name : "?");
1615         kfree_skb(skb);
1616         WARN_ON(1);
1617         return 0;
1618 }
1619
1620 /*
1621    We do not cache the source address of the outgoing interface,
1622    because it is used only by the IP RR, TS and SRR options,
1623    so it stays out of the fast path.
1624
1625    BTW remember: "addr" may be unaligned when it points
1626    into IP options!
1627  */
1628
1629 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1630 {
1631         __be32 src;
1632
1633         if (rt_is_output_route(rt))
1634                 src = ip_hdr(skb)->saddr;
1635         else {
1636                 struct fib_result res;
1637                 struct flowi4 fl4;
1638                 struct iphdr *iph;
1639
1640                 iph = ip_hdr(skb);
1641
1642                 memset(&fl4, 0, sizeof(fl4));
1643                 fl4.daddr = iph->daddr;
1644                 fl4.saddr = iph->saddr;
1645                 fl4.flowi4_tos = RT_TOS(iph->tos);
1646                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1647                 fl4.flowi4_iif = skb->dev->ifindex;
1648                 fl4.flowi4_mark = skb->mark;
1649
1650                 rcu_read_lock();
1651                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1652                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1653                 else
1654                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1655                                         RT_SCOPE_UNIVERSE);
1656                 rcu_read_unlock();
1657         }
1658         memcpy(addr, &src, 4);
1659 }
1660
1661 #ifdef CONFIG_IP_ROUTE_CLASSID
1662 static void set_class_tag(struct rtable *rt, u32 tag)
1663 {
1664         if (!(rt->dst.tclassid & 0xFFFF))
1665                 rt->dst.tclassid |= tag & 0xFFFF;
1666         if (!(rt->dst.tclassid & 0xFFFF0000))
1667                 rt->dst.tclassid |= tag & 0xFFFF0000;
1668 }
1669 #endif
1670
1671 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1672 {
1673         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1674
1675         if (advmss == 0) {
1676                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1677                                ip_rt_min_advmss);
1678                 if (advmss > 65535 - 40)
1679                         advmss = 65535 - 40;
1680         }
1681         return advmss;
1682 }
1683
1684 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1685 {
1686         const struct rtable *rt = (const struct rtable *) dst;
1687         unsigned int mtu = rt->rt_pmtu;
1688
1689         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1690                 mtu = 0;
1691
1692         if (!mtu)
1693                 mtu = dst_metric_raw(dst, RTAX_MTU);
1694
1695         if (mtu && rt_is_output_route(rt))
1696                 return mtu;
1697
1698         mtu = dst->dev->mtu;
1699
1700         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1701
1702                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1703                         mtu = 576;
1704         }
1705
1706         if (mtu > IP_MAX_MTU)
1707                 mtu = IP_MAX_MTU;
1708
1709         return mtu;
1710 }
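
/*
 * Editor's note (illustration only; never compiled): for output routes,
 * ipv4_mtu() prefers a still-valid learned PMTU, then the RTAX_MTU metric
 * from the routing table, and only then the device MTU; input routes always
 * report the device MTU.  Either way the result is clamped to 576 when the
 * MTU metric is locked on a gatewayed route, and to IP_MAX_MTU overall.
 */
#if 0
static void mtu_precedence_example(const struct rtable *rt)
{
	/* output route, rt_pmtu = 1400 and not expired -> dst_mtu() == 1400 */
	/* output route, rt_pmtu = 0, RTAX_MTU = 1492   -> dst_mtu() == 1492 */
	/* neither set, dst.dev->mtu = 1500             -> dst_mtu() == 1500 */
	pr_debug("route mtu: %u\n", dst_mtu(&rt->dst));
}
#endif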
1711
1712 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1713                             struct fib_info *fi)
1714 {
1715         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1716                 rt->fi = fi;
1717                 atomic_inc(&fi->fib_clntref);
1718         }
1719         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1720 }
1721
1722 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1723                            const struct fib_result *res,
1724                            struct fib_info *fi, u16 type, u32 itag)
1725 {
1726         if (fi) {
1727                 if (FIB_RES_GW(*res) &&
1728                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1729                         rt->rt_gateway = FIB_RES_GW(*res);
1730                 rt_init_metrics(rt, fl4, fi);
1731 #ifdef CONFIG_IP_ROUTE_CLASSID
1732                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1733 #endif
1734         }
1735
1736 #ifdef CONFIG_IP_ROUTE_CLASSID
1737 #ifdef CONFIG_IP_MULTIPLE_TABLES
1738         set_class_tag(rt, fib_rules_tclass(res));
1739 #endif
1740         set_class_tag(rt, itag);
1741 #endif
1742 }
1743
1744 static struct rtable *rt_dst_alloc(struct net_device *dev,
1745                                    bool nopolicy, bool noxfrm)
1746 {
1747         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1748                          DST_HOST |
1749                          (nopolicy ? DST_NOPOLICY : 0) |
1750                          (noxfrm ? DST_NOXFRM : 0));
1751 }
1752
1753 /* called in rcu_read_lock() section */
1754 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1755                                 u8 tos, struct net_device *dev, int our)
1756 {
1757         unsigned int hash;
1758         struct rtable *rth;
1759         struct in_device *in_dev = __in_dev_get_rcu(dev);
1760         u32 itag = 0;
1761         int err;
1762
1763         /* Primary sanity checks. */
1764
1765         if (in_dev == NULL)
1766                 return -EINVAL;
1767
1768         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1769             skb->protocol != htons(ETH_P_IP))
1770                 goto e_inval;
1771
1772         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1773                 if (ipv4_is_loopback(saddr))
1774                         goto e_inval;
1775
1776         if (ipv4_is_zeronet(saddr)) {
1777                 if (!ipv4_is_local_multicast(daddr))
1778                         goto e_inval;
1779         } else {
1780                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1781                                           in_dev, &itag);
1782                 if (err < 0)
1783                         goto e_err;
1784         }
1785         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1786                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1787         if (!rth)
1788                 goto e_nobufs;
1789
1790 #ifdef CONFIG_IP_ROUTE_CLASSID
1791         rth->dst.tclassid = itag;
1792 #endif
1793         rth->dst.output = ip_rt_bug;
1794
1795         rth->rt_key_dst = daddr;
1796         rth->rt_key_src = saddr;
1797         rth->rt_genid   = rt_genid(dev_net(dev));
1798         rth->rt_flags   = RTCF_MULTICAST;
1799         rth->rt_type    = RTN_MULTICAST;
1800         rth->rt_key_tos = tos;
1801         rth->rt_dst     = daddr;
1802         rth->rt_src     = saddr;
1803         rth->rt_route_iif = dev->ifindex;
1804         rth->rt_iif     = dev->ifindex;
1805         rth->rt_oif     = 0;
1806         rth->rt_mark    = skb->mark;
1807         rth->rt_pmtu    = 0;
1808         rth->rt_gateway = daddr;
1809         rth->fi = NULL;
1810         if (our) {
1811                 rth->dst.input = ip_local_deliver;
1812                 rth->rt_flags |= RTCF_LOCAL;
1813         }
1814
1815 #ifdef CONFIG_IP_MROUTE
1816         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1817                 rth->dst.input = ip_mr_input;
1818 #endif
1819         RT_CACHE_STAT_INC(in_slow_mc);
1820
1821         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1822         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1823         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1824
1825 e_nobufs:
1826         return -ENOBUFS;
1827 e_inval:
1828         return -EINVAL;
1829 e_err:
1830         return err;
1831 }
1832
1833
1834 static void ip_handle_martian_source(struct net_device *dev,
1835                                      struct in_device *in_dev,
1836                                      struct sk_buff *skb,
1837                                      __be32 daddr,
1838                                      __be32 saddr)
1839 {
1840         RT_CACHE_STAT_INC(in_martian_src);
1841 #ifdef CONFIG_IP_ROUTE_VERBOSE
1842         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1843                 /*
1844                  *      RFC1812 recommendation: if the source is martian,
1845                  *      the only hint is the MAC header.
1846                  */
1847                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1848                         &daddr, &saddr, dev->name);
1849                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1850                         print_hex_dump(KERN_WARNING, "ll header: ",
1851                                        DUMP_PREFIX_OFFSET, 16, 1,
1852                                        skb_mac_header(skb),
1853                                        dev->hard_header_len, true);
1854                 }
1855         }
1856 #endif
1857 }
1858
1859 /* called in rcu_read_lock() section */
1860 static int __mkroute_input(struct sk_buff *skb,
1861                            const struct fib_result *res,
1862                            struct in_device *in_dev,
1863                            __be32 daddr, __be32 saddr, u32 tos,
1864                            struct rtable **result)
1865 {
1866         struct rtable *rth;
1867         int err;
1868         struct in_device *out_dev;
1869         unsigned int flags = 0;
1870         u32 itag;
1871
1872         /* get a working reference to the output device */
1873         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1874         if (out_dev == NULL) {
1875                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1876                 return -EINVAL;
1877         }
1878
1879
1880         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1881                                   in_dev->dev, in_dev, &itag);
1882         if (err < 0) {
1883                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1884                                          saddr);
1885
1886                 goto cleanup;
1887         }
1888
1889         if (err)
1890                 flags |= RTCF_DIRECTSRC;
1891
1892         if (out_dev == in_dev && err &&
1893             (IN_DEV_SHARED_MEDIA(out_dev) ||
1894              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1895                 flags |= RTCF_DOREDIRECT;
1896
1897         if (skb->protocol != htons(ETH_P_IP)) {
1898                 /* Not IP (i.e. ARP). Do not create a route if it is
1899                  * invalid for proxy arp. DNAT routes are always valid.
1900                  *
1901                  * The proxy arp feature has been extended to allow ARP
1902                  * replies back on the same interface, to support
1903                  * Private VLAN switch technologies. See arp.c.
1904                  */
1905                 if (out_dev == in_dev &&
1906                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1907                         err = -EINVAL;
1908                         goto cleanup;
1909                 }
1910         }
1911
1912         rth = rt_dst_alloc(out_dev->dev,
1913                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1914                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1915         if (!rth) {
1916                 err = -ENOBUFS;
1917                 goto cleanup;
1918         }
1919
1920         rth->rt_key_dst = daddr;
1921         rth->rt_key_src = saddr;
1922         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1923         rth->rt_flags = flags;
1924         rth->rt_type = res->type;
1925         rth->rt_key_tos = tos;
1926         rth->rt_dst     = daddr;
1927         rth->rt_src     = saddr;
1928         rth->rt_route_iif = in_dev->dev->ifindex;
1929         rth->rt_iif     = in_dev->dev->ifindex;
1930         rth->rt_oif     = 0;
1931         rth->rt_mark    = skb->mark;
1932         rth->rt_pmtu    = 0;
1933         rth->rt_gateway = daddr;
1934         rth->fi = NULL;
1935
1936         rth->dst.input = ip_forward;
1937         rth->dst.output = ip_output;
1938
1939         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1940
1941         *result = rth;
1942         err = 0;
1943  cleanup:
1944         return err;
1945 }
1946
1947 static int ip_mkroute_input(struct sk_buff *skb,
1948                             struct fib_result *res,
1949                             const struct flowi4 *fl4,
1950                             struct in_device *in_dev,
1951                             __be32 daddr, __be32 saddr, u32 tos)
1952 {
1953         struct rtable *rth = NULL;
1954         int err;
1955         unsigned int hash;
1956
1957 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1958         if (res->fi && res->fi->fib_nhs > 1)
1959                 fib_select_multipath(res);
1960 #endif
1961
1962         /* create a routing cache entry */
1963         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1964         if (err)
1965                 return err;
1966
1967         /* put it into the cache */
1968         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
1969                        rt_genid(dev_net(rth->dst.dev)));
1970         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
1971         if (IS_ERR(rth))
1972                 return PTR_ERR(rth);
1973         return 0;
1974 }
1975
1976 /*
1977  *      NOTE. We drop all packets that have a local source
1978  *      address, because every properly looped back packet
1979  *      must already have the correct destination attached by the output routine.
1980  *
1981  *      This approach solves two big problems:
1982  *      1. Non-simplex devices are handled properly.
1983  *      2. IP spoofing attempts are filtered with 100% guarantee.
1984  *      Called with rcu_read_lock().
1985  */
1986
1987 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1988                                u8 tos, struct net_device *dev)
1989 {
1990         struct fib_result res;
1991         struct in_device *in_dev = __in_dev_get_rcu(dev);
1992         struct flowi4   fl4;
1993         unsigned int    flags = 0;
1994         u32             itag = 0;
1995         struct rtable   *rth;
1996         unsigned int    hash;
1997         int             err = -EINVAL;
1998         struct net    *net = dev_net(dev);
1999
2000         /* IP on this device is disabled. */
2001
2002         if (!in_dev)
2003                 goto out;
2004
2005         /* Check for the weirdest martians, which cannot be detected
2006            by fib_lookup.
2007          */
2008
2009         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2010                 goto martian_source;
2011
2012         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2013                 goto brd_input;
2014
2015         /* Accept zero addresses only for limited broadcast;
2016          * it is unclear whether this should be fixed. Waiting for complaints :-)
2017          */
2018         if (ipv4_is_zeronet(saddr))
2019                 goto martian_source;
2020
2021         if (ipv4_is_zeronet(daddr))
2022                 goto martian_destination;
2023
2024         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2025                 if (ipv4_is_loopback(daddr))
2026                         goto martian_destination;
2027
2028                 if (ipv4_is_loopback(saddr))
2029                         goto martian_source;
2030         }
2031
2032         /*
2033          *      Now we are ready to route the packet.
2034          */
2035         fl4.flowi4_oif = 0;
2036         fl4.flowi4_iif = dev->ifindex;
2037         fl4.flowi4_mark = skb->mark;
2038         fl4.flowi4_tos = tos;
2039         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2040         fl4.daddr = daddr;
2041         fl4.saddr = saddr;
2042         err = fib_lookup(net, &fl4, &res);
2043         if (err != 0)
2044                 goto no_route;
2045
2046         RT_CACHE_STAT_INC(in_slow_tot);
2047
2048         if (res.type == RTN_BROADCAST)
2049                 goto brd_input;
2050
2051         if (res.type == RTN_LOCAL) {
2052                 err = fib_validate_source(skb, saddr, daddr, tos,
2053                                           net->loopback_dev->ifindex,
2054                                           dev, in_dev, &itag);
2055                 if (err < 0)
2056                         goto martian_source_keep_err;
2057                 if (err)
2058                         flags |= RTCF_DIRECTSRC;
2059                 goto local_input;
2060         }
2061
2062         if (!IN_DEV_FORWARD(in_dev))
2063                 goto no_route;
2064         if (res.type != RTN_UNICAST)
2065                 goto martian_destination;
2066
2067         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2068 out:    return err;
2069
2070 brd_input:
2071         if (skb->protocol != htons(ETH_P_IP))
2072                 goto e_inval;
2073
2074         if (!ipv4_is_zeronet(saddr)) {
2075                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2076                                           in_dev, &itag);
2077                 if (err < 0)
2078                         goto martian_source_keep_err;
2079                 if (err)
2080                         flags |= RTCF_DIRECTSRC;
2081         }
2082         flags |= RTCF_BROADCAST;
2083         res.type = RTN_BROADCAST;
2084         RT_CACHE_STAT_INC(in_brd);
2085
2086 local_input:
2087         rth = rt_dst_alloc(net->loopback_dev,
2088                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2089         if (!rth)
2090                 goto e_nobufs;
2091
2092         rth->dst.input = ip_local_deliver;
2093         rth->dst.output = ip_rt_bug;
2094 #ifdef CONFIG_IP_ROUTE_CLASSID
2095         rth->dst.tclassid = itag;
2096 #endif
2097
2098         rth->rt_key_dst = daddr;
2099         rth->rt_key_src = saddr;
2100         rth->rt_genid = rt_genid(net);
2101         rth->rt_flags   = flags|RTCF_LOCAL;
2102         rth->rt_type    = res.type;
2103         rth->rt_key_tos = tos;
2104         rth->rt_dst     = daddr;
2105         rth->rt_src     = saddr;
2106         rth->rt_route_iif = dev->ifindex;
2107         rth->rt_iif     = dev->ifindex;
2108         rth->rt_oif     = 0;
2109         rth->rt_mark    = skb->mark;
2110         rth->rt_pmtu    = 0;
2111         rth->rt_gateway = daddr;
2112         rth->fi = NULL;
2113         if (res.type == RTN_UNREACHABLE) {
2114                 rth->dst.input = ip_error;
2115                 rth->dst.error = -err;
2116                 rth->rt_flags   &= ~RTCF_LOCAL;
2117         }
2118         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2119         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2120         err = 0;
2121         if (IS_ERR(rth))
2122                 err = PTR_ERR(rth);
2123         goto out;
2124
2125 no_route:
2126         RT_CACHE_STAT_INC(in_no_route);
2127         res.type = RTN_UNREACHABLE;
2128         if (err == -ESRCH)
2129                 err = -ENETUNREACH;
2130         goto local_input;
2131
2132         /*
2133          *      Do not cache martian addresses: they should be logged (RFC1812)
2134          */
2135 martian_destination:
2136         RT_CACHE_STAT_INC(in_martian_dst);
2137 #ifdef CONFIG_IP_ROUTE_VERBOSE
2138         if (IN_DEV_LOG_MARTIANS(in_dev))
2139                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2140                                      &daddr, &saddr, dev->name);
2141 #endif
2142
2143 e_inval:
2144         err = -EINVAL;
2145         goto out;
2146
2147 e_nobufs:
2148         err = -ENOBUFS;
2149         goto out;
2150
2151 martian_source:
2152         err = -EINVAL;
2153 martian_source_keep_err:
2154         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2155         goto out;
2156 }
2157
2158 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2159                            u8 tos, struct net_device *dev, bool noref)
2160 {
2161         struct rtable   *rth;
2162         unsigned int    hash;
2163         int iif = dev->ifindex;
2164         struct net *net;
2165         int res;
2166
2167         net = dev_net(dev);
2168
2169         rcu_read_lock();
2170
2171         if (!rt_caching(net))
2172                 goto skip_cache;
2173
2174         tos &= IPTOS_RT_MASK;
2175         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2176
2177         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2178              rth = rcu_dereference(rth->dst.rt_next)) {
2179                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2180                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2181                      (rth->rt_route_iif ^ iif) |
2182                      (rth->rt_key_tos ^ tos)) == 0 &&
2183                     rth->rt_mark == skb->mark &&
2184                     net_eq(dev_net(rth->dst.dev), net) &&
2185                     !rt_is_expired(rth)) {
2186                         if (noref) {
2187                                 dst_use_noref(&rth->dst, jiffies);
2188                                 skb_dst_set_noref(skb, &rth->dst);
2189                         } else {
2190                                 dst_use(&rth->dst, jiffies);
2191                                 skb_dst_set(skb, &rth->dst);
2192                         }
2193                         RT_CACHE_STAT_INC(in_hit);
2194                         rcu_read_unlock();
2195                         return 0;
2196                 }
2197                 RT_CACHE_STAT_INC(in_hlist_search);
2198         }
2199
2200 skip_cache:
2201         /* Multicast recognition logic has moved from the route cache to here.
2202            The problem was that too many Ethernet cards have broken/missing
2203            hardware multicast filters :-( As a result, a host on a multicast
2204            network acquires a lot of useless route cache entries, e.g. for
2205            SDR messages from all over the world. Now we try to get rid of them.
2206            Really, provided the software IP multicast filter is organized
2207            reasonably (at least, hashed), this does not result in a slowdown
2208            compared with route cache reject entries.
2209            Note that multicast routers are not affected, because a
2210            route cache entry is created eventually.
2211          */
2212         if (ipv4_is_multicast(daddr)) {
2213                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2214
2215                 if (in_dev) {
2216                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2217                                                   ip_hdr(skb)->protocol);
2218                         if (our
2219 #ifdef CONFIG_IP_MROUTE
2220                                 ||
2221                             (!ipv4_is_local_multicast(daddr) &&
2222                              IN_DEV_MFORWARD(in_dev))
2223 #endif
2224                            ) {
2225                                 int res = ip_route_input_mc(skb, daddr, saddr,
2226                                                             tos, dev, our);
2227                                 rcu_read_unlock();
2228                                 return res;
2229                         }
2230                 }
2231                 rcu_read_unlock();
2232                 return -EINVAL;
2233         }
2234         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2235         rcu_read_unlock();
2236         return res;
2237 }
2238 EXPORT_SYMBOL(ip_route_input_common);
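
/*
 * Editor's sketch (not part of the original file; never compiled): the
 * usual caller of ip_route_input_common() is the receive path, normally via
 * the ip_route_input()/ip_route_input_noref() wrappers.  The function below
 * is a reduced stand-in for that caller; error handling is trimmed to the
 * minimum for illustration.
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (!skb_dst(skb)) {
		/* noref == true: the attached dst is borrowed, not refcounted */
		int err = ip_route_input_common(skb, iph->daddr, iph->saddr,
						iph->tos, skb->dev, true);
		if (unlikely(err)) {
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}
	return dst_input(skb);	/* dispatches to rt->dst.input() */
}
#endif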
2239
2240 /* called with rcu_read_lock() */
2241 static struct rtable *__mkroute_output(const struct fib_result *res,
2242                                        const struct flowi4 *fl4,
2243                                        __be32 orig_daddr, __be32 orig_saddr,
2244                                        int orig_oif, __u8 orig_rtos,
2245                                        struct net_device *dev_out,
2246                                        unsigned int flags)
2247 {
2248         struct fib_info *fi = res->fi;
2249         struct in_device *in_dev;
2250         u16 type = res->type;
2251         struct rtable *rth;
2252
2253         in_dev = __in_dev_get_rcu(dev_out);
2254         if (!in_dev)
2255                 return ERR_PTR(-EINVAL);
2256
2257         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2258                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2259                         return ERR_PTR(-EINVAL);
2260
2261         if (ipv4_is_lbcast(fl4->daddr))
2262                 type = RTN_BROADCAST;
2263         else if (ipv4_is_multicast(fl4->daddr))
2264                 type = RTN_MULTICAST;
2265         else if (ipv4_is_zeronet(fl4->daddr))
2266                 return ERR_PTR(-EINVAL);
2267
2268         if (dev_out->flags & IFF_LOOPBACK)
2269                 flags |= RTCF_LOCAL;
2270
2271         if (type == RTN_BROADCAST) {
2272                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2273                 fi = NULL;
2274         } else if (type == RTN_MULTICAST) {
2275                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2276                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2277                                      fl4->flowi4_proto))
2278                         flags &= ~RTCF_LOCAL;
2279                 /* If a multicast route does not exist, use the
2280                  * default one, but do not gateway in this case.
2281                  * Yes, it is a hack.
2282                  */
2283                 if (fi && res->prefixlen < 4)
2284                         fi = NULL;
2285         }
2286
2287         rth = rt_dst_alloc(dev_out,
2288                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2289                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2290         if (!rth)
2291                 return ERR_PTR(-ENOBUFS);
2292
2293         rth->dst.output = ip_output;
2294
2295         rth->rt_key_dst = orig_daddr;
2296         rth->rt_key_src = orig_saddr;
2297         rth->rt_genid = rt_genid(dev_net(dev_out));
2298         rth->rt_flags   = flags;
2299         rth->rt_type    = type;
2300         rth->rt_key_tos = orig_rtos;
2301         rth->rt_dst     = fl4->daddr;
2302         rth->rt_src     = fl4->saddr;
2303         rth->rt_route_iif = 0;
2304         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2305         rth->rt_oif     = orig_oif;
2306         rth->rt_mark    = fl4->flowi4_mark;
2307         rth->rt_pmtu    = 0;
2308         rth->rt_gateway = fl4->daddr;
2309         rth->fi = NULL;
2310
2311         RT_CACHE_STAT_INC(out_slow_tot);
2312
2313         if (flags & RTCF_LOCAL)
2314                 rth->dst.input = ip_local_deliver;
2315         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2316                 if (flags & RTCF_LOCAL &&
2317                     !(dev_out->flags & IFF_LOOPBACK)) {
2318                         rth->dst.output = ip_mc_output;
2319                         RT_CACHE_STAT_INC(out_slow_mc);
2320                 }
2321 #ifdef CONFIG_IP_MROUTE
2322                 if (type == RTN_MULTICAST) {
2323                         if (IN_DEV_MFORWARD(in_dev) &&
2324                             !ipv4_is_local_multicast(fl4->daddr)) {
2325                                 rth->dst.input = ip_mr_input;
2326                                 rth->dst.output = ip_mc_output;
2327                         }
2328                 }
2329 #endif
2330         }
2331
2332         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2333
2334         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2335                 rth->dst.flags |= DST_NOCACHE;
2336
2337         return rth;
2338 }
2339
2340 /*
2341  * Major route resolver routine.
2342  * called with rcu_read_lock();
2343  */
2344
2345 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2346 {
2347         struct net_device *dev_out = NULL;
2348         __u8 tos = RT_FL_TOS(fl4);
2349         unsigned int flags = 0;
2350         struct fib_result res;
2351         struct rtable *rth;
2352         __be32 orig_daddr;
2353         __be32 orig_saddr;
2354         int orig_oif;
2355
2356         res.fi          = NULL;
2357         res.table       = NULL;
2358 #ifdef CONFIG_IP_MULTIPLE_TABLES
2359         res.r           = NULL;
2360 #endif
2361
2362         orig_daddr = fl4->daddr;
2363         orig_saddr = fl4->saddr;
2364         orig_oif = fl4->flowi4_oif;
2365
2366         fl4->flowi4_iif = net->loopback_dev->ifindex;
2367         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2368         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2369                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2370
2371         rcu_read_lock();
2372         if (fl4->saddr) {
2373                 rth = ERR_PTR(-EINVAL);
2374                 if (ipv4_is_multicast(fl4->saddr) ||
2375                     ipv4_is_lbcast(fl4->saddr) ||
2376                     ipv4_is_zeronet(fl4->saddr))
2377                         goto out;
2378
2379                 /* I removed check for oif == dev_out->oif here.
2380                    It was wrong for two reasons:
2381                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2382                       is assigned to multiple interfaces.
2383                    2. Moreover, we are allowed to send packets with the saddr
2384                       of another iface. --ANK
2385                  */
2386
2387                 if (fl4->flowi4_oif == 0 &&
2388                     (ipv4_is_multicast(fl4->daddr) ||
2389                      ipv4_is_lbcast(fl4->daddr))) {
2390                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2391                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2392                         if (dev_out == NULL)
2393                                 goto out;
2394
2395                         /* Special hack: the user can direct multicasts
2396                            and limited broadcast via the desired interface
2397                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2398                            This hack is not just for fun, it allows
2399                            vic, vat and friends to work.
2400                            They bind the socket to loopback, set the ttl to zero
2401                            and expect that it will work.
2402                            From the viewpoint of the routing cache they are broken,
2403                            because we are not allowed to build a multicast path
2404                            with a loopback source addr (the routing cache
2405                            cannot know that the ttl is zero, so the packet
2406                            will never leave this host and the route is valid).
2407                            Luckily, this hack is a good workaround.
2408                          */
2409
2410                         fl4->flowi4_oif = dev_out->ifindex;
2411                         goto make_route;
2412                 }
2413
2414                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2415                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2416                         if (!__ip_dev_find(net, fl4->saddr, false))
2417                                 goto out;
2418                 }
2419         }
2420
2421
2422         if (fl4->flowi4_oif) {
2423                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2424                 rth = ERR_PTR(-ENODEV);
2425                 if (dev_out == NULL)
2426                         goto out;
2427
2428                 /* RACE: Check return value of inet_select_addr instead. */
2429                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2430                         rth = ERR_PTR(-ENETUNREACH);
2431                         goto out;
2432                 }
2433                 if (ipv4_is_local_multicast(fl4->daddr) ||
2434                     ipv4_is_lbcast(fl4->daddr)) {
2435                         if (!fl4->saddr)
2436                                 fl4->saddr = inet_select_addr(dev_out, 0,
2437                                                               RT_SCOPE_LINK);
2438                         goto make_route;
2439                 }
2440                 if (fl4->saddr) {
2441                         if (ipv4_is_multicast(fl4->daddr))
2442                                 fl4->saddr = inet_select_addr(dev_out, 0,
2443                                                               fl4->flowi4_scope);
2444                         else if (!fl4->daddr)
2445                                 fl4->saddr = inet_select_addr(dev_out, 0,
2446                                                               RT_SCOPE_HOST);
2447                 }
2448         }
2449
2450         if (!fl4->daddr) {
2451                 fl4->daddr = fl4->saddr;
2452                 if (!fl4->daddr)
2453                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2454                 dev_out = net->loopback_dev;
2455                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2456                 res.type = RTN_LOCAL;
2457                 flags |= RTCF_LOCAL;
2458                 goto make_route;
2459         }
2460
2461         if (fib_lookup(net, fl4, &res)) {
2462                 res.fi = NULL;
2463                 res.table = NULL;
2464                 if (fl4->flowi4_oif) {
2465                         /* Apparently, the routing tables are wrong. Assume
2466                            that the destination is on-link.
2467
2468                            WHY? DW.
2469                            Because we are allowed to send to an iface
2470                            even if it has NO routes and NO assigned
2471                            addresses. When oif is specified, the routing
2472                            tables are looked up with only one purpose:
2473                            to catch whether the destination is gatewayed rather
2474                            than direct. Moreover, if MSG_DONTROUTE is set,
2475                            we send the packet, ignoring both routing tables
2476                            and ifaddr state. --ANK
2477
2478
2479                            We could do it even when oif is unknown,
2480                            as IPv6 likely does, but we do not.
2481                          */
2482
2483                         if (fl4->saddr == 0)
2484                                 fl4->saddr = inet_select_addr(dev_out, 0,
2485                                                               RT_SCOPE_LINK);
2486                         res.type = RTN_UNICAST;
2487                         goto make_route;
2488                 }
2489                 rth = ERR_PTR(-ENETUNREACH);
2490                 goto out;
2491         }
2492
2493         if (res.type == RTN_LOCAL) {
2494                 if (!fl4->saddr) {
2495                         if (res.fi->fib_prefsrc)
2496                                 fl4->saddr = res.fi->fib_prefsrc;
2497                         else
2498                                 fl4->saddr = fl4->daddr;
2499                 }
2500                 dev_out = net->loopback_dev;
2501                 fl4->flowi4_oif = dev_out->ifindex;
2502                 res.fi = NULL;
2503                 flags |= RTCF_LOCAL;
2504                 goto make_route;
2505         }
2506
2507 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2508         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2509                 fib_select_multipath(&res);
2510         else
2511 #endif
2512         if (!res.prefixlen &&
2513             res.table->tb_num_default > 1 &&
2514             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2515                 fib_select_default(&res);
2516
2517         if (!fl4->saddr)
2518                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2519
2520         dev_out = FIB_RES_DEV(res);
2521         fl4->flowi4_oif = dev_out->ifindex;
2522
2523
2524 make_route:
2525         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2526                                tos, dev_out, flags);
2527         if (!IS_ERR(rth)) {
2528                 unsigned int hash;
2529
2530                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2531                                rt_genid(dev_net(dev_out)));
2532                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2533         }
2534
2535 out:
2536         rcu_read_unlock();
2537         return rth;
2538 }
2539
2540 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2541 {
2542         struct rtable *rth;
2543         unsigned int hash;
2544
2545         if (!rt_caching(net))
2546                 goto slow_output;
2547
2548         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2549
2550         rcu_read_lock_bh();
2551         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2552                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2553                 if (rth->rt_key_dst == flp4->daddr &&
2554                     rth->rt_key_src == flp4->saddr &&
2555                     rt_is_output_route(rth) &&
2556                     rth->rt_oif == flp4->flowi4_oif &&
2557                     rth->rt_mark == flp4->flowi4_mark &&
2558                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2559                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2560                     net_eq(dev_net(rth->dst.dev), net) &&
2561                     !rt_is_expired(rth)) {
2562                         dst_use(&rth->dst, jiffies);
2563                         RT_CACHE_STAT_INC(out_hit);
2564                         rcu_read_unlock_bh();
2565                         if (!flp4->saddr)
2566                                 flp4->saddr = rth->rt_src;
2567                         if (!flp4->daddr)
2568                                 flp4->daddr = rth->rt_dst;
2569                         return rth;
2570                 }
2571                 RT_CACHE_STAT_INC(out_hlist_search);
2572         }
2573         rcu_read_unlock_bh();
2574
2575 slow_output:
2576         return ip_route_output_slow(net, flp4);
2577 }
2578 EXPORT_SYMBOL_GPL(__ip_route_output_key);
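
/*
 * Editor's sketch (not in the original file; never compiled): a minimal
 * output-route lookup, mirroring the pattern already used by
 * ipv4_update_pmtu() above.  The helper name and the IPPROTO_UDP choice are
 * made up; the point is the flowi4_init_output() + __ip_route_output_key()
 * + ip_rt_put() sequence.
 */
#if 0
static unsigned int route_mtu_to(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;
	unsigned int mtu;

	flowi4_init_output(&fl4, 0, 0, 0, RT_SCOPE_UNIVERSE,
			   IPPROTO_UDP, 0, daddr, saddr, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return 0;

	mtu = dst_mtu(&rt->dst);	/* resolved through ipv4_mtu() */
	ip_rt_put(rt);
	return mtu;
}
#endif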
2579
2580 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2581 {
2582         return NULL;
2583 }
2584
2585 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2586 {
2587         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2588
2589         return mtu ? : dst->dev->mtu;
2590 }
2591
2592 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2593 {
2594 }
2595
2596 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2597                                           unsigned long old)
2598 {
2599         return NULL;
2600 }
2601
2602 static struct dst_ops ipv4_dst_blackhole_ops = {
2603         .family                 =       AF_INET,
2604         .protocol               =       cpu_to_be16(ETH_P_IP),
2605         .destroy                =       ipv4_dst_destroy,
2606         .check                  =       ipv4_blackhole_dst_check,
2607         .mtu                    =       ipv4_blackhole_mtu,
2608         .default_advmss         =       ipv4_default_advmss,
2609         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2610         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2611         .neigh_lookup           =       ipv4_neigh_lookup,
2612 };
2613
2614 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2615 {
2616         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2617         struct rtable *ort = (struct rtable *) dst_orig;
2618
2619         if (rt) {
2620                 struct dst_entry *new = &rt->dst;
2621
2622                 new->__use = 1;
2623                 new->input = dst_discard;
2624                 new->output = dst_discard;
2625
2626                 new->dev = ort->dst.dev;
2627                 if (new->dev)
2628                         dev_hold(new->dev);
2629
2630                 rt->rt_key_dst = ort->rt_key_dst;
2631                 rt->rt_key_src = ort->rt_key_src;
2632                 rt->rt_key_tos = ort->rt_key_tos;
2633                 rt->rt_route_iif = ort->rt_route_iif;
2634                 rt->rt_iif = ort->rt_iif;
2635                 rt->rt_oif = ort->rt_oif;
2636                 rt->rt_mark = ort->rt_mark;
2637                 rt->rt_pmtu = ort->rt_pmtu;
2638
2639                 rt->rt_genid = rt_genid(net);
2640                 rt->rt_flags = ort->rt_flags;
2641                 rt->rt_type = ort->rt_type;
2642                 rt->rt_dst = ort->rt_dst;
2643                 rt->rt_src = ort->rt_src;
2644                 rt->rt_gateway = ort->rt_gateway;
2645                 rt->fi = ort->fi;
2646                 if (rt->fi)
2647                         atomic_inc(&rt->fi->fib_clntref);
2648
2649                 dst_free(new);
2650         }
2651
2652         dst_release(dst_orig);
2653
2654         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2655 }
2656
2657 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2658                                     struct sock *sk)
2659 {
2660         struct rtable *rt = __ip_route_output_key(net, flp4);
2661
2662         if (IS_ERR(rt))
2663                 return rt;
2664
2665         if (flp4->flowi4_proto)
2666                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2667                                                    flowi4_to_flowi(flp4),
2668                                                    sk, 0);
2669
2670         return rt;
2671 }
2672 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2673
2674 static int rt_fill_info(struct net *net,
2675                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2676                         int nowait, unsigned int flags)
2677 {
2678         struct rtable *rt = skb_rtable(skb);
2679         struct rtmsg *r;
2680         struct nlmsghdr *nlh;
2681         unsigned long expires = 0;
2682         u32 error;
2683
2684         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2685         if (nlh == NULL)
2686                 return -EMSGSIZE;
2687
2688         r = nlmsg_data(nlh);
2689         r->rtm_family    = AF_INET;
2690         r->rtm_dst_len  = 32;
2691         r->rtm_src_len  = 0;
2692         r->rtm_tos      = rt->rt_key_tos;
2693         r->rtm_table    = RT_TABLE_MAIN;
2694         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2695                 goto nla_put_failure;
2696         r->rtm_type     = rt->rt_type;
2697         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2698         r->rtm_protocol = RTPROT_UNSPEC;
2699         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2700         if (rt->rt_flags & RTCF_NOTIFY)
2701                 r->rtm_flags |= RTM_F_NOTIFY;
2702
2703         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2704                 goto nla_put_failure;
2705         if (rt->rt_key_src) {
2706                 r->rtm_src_len = 32;
2707                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2708                         goto nla_put_failure;
2709         }
2710         if (rt->dst.dev &&
2711             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2712                 goto nla_put_failure;
2713 #ifdef CONFIG_IP_ROUTE_CLASSID
2714         if (rt->dst.tclassid &&
2715             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2716                 goto nla_put_failure;
2717 #endif
2718         if (!rt_is_input_route(rt) &&
2719             rt->rt_src != rt->rt_key_src) {
2720                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2721                         goto nla_put_failure;
2722         }
2723         if (rt->rt_dst != rt->rt_gateway &&
2724             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2725                 goto nla_put_failure;
2726
2727         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2728                 goto nla_put_failure;
2729
2730         if (rt->rt_mark &&
2731             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2732                 goto nla_put_failure;
2733
2734         error = rt->dst.error;
2735         expires = rt->dst.expires;
2736         if (expires) {
2737                 if (time_before(jiffies, expires))
2738                         expires -= jiffies;
2739                 else
2740                         expires = 0;
2741         }
2742
2743         if (rt_is_input_route(rt)) {
2744 #ifdef CONFIG_IP_MROUTE
2745                 __be32 dst = rt->rt_dst;
2746
2747                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2748                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2749                         int err = ipmr_get_route(net, skb,
2750                                                  rt->rt_src, rt->rt_dst,
2751                                                  r, nowait);
2752                         if (err <= 0) {
2753                                 if (!nowait) {
2754                                         if (err == 0)
2755                                                 return 0;
2756                                         goto nla_put_failure;
2757                                 } else {
2758                                         if (err == -EMSGSIZE)
2759                                                 goto nla_put_failure;
2760                                         error = err;
2761                                 }
2762                         }
2763                 } else
2764 #endif
2765                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2766                                 goto nla_put_failure;
2767         }
2768
2769         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2770                 goto nla_put_failure;
2771
2772         return nlmsg_end(skb, nlh);
2773
2774 nla_put_failure:
2775         nlmsg_cancel(skb, nlh);
2776         return -EMSGSIZE;
2777 }
2778
2779 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2780 {
2781         struct net *net = sock_net(in_skb->sk);
2782         struct rtmsg *rtm;
2783         struct nlattr *tb[RTA_MAX+1];
2784         struct rtable *rt = NULL;
2785         __be32 dst = 0;
2786         __be32 src = 0;
2787         u32 iif;
2788         int err;
2789         int mark;
2790         struct sk_buff *skb;
2791
2792         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2793         if (err < 0)
2794                 goto errout;
2795
2796         rtm = nlmsg_data(nlh);
2797
2798         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2799         if (skb == NULL) {
2800                 err = -ENOBUFS;
2801                 goto errout;
2802         }
2803
2804         /* Reserve room for dummy headers; this skb can pass
2805            through a good chunk of the routing engine.
2806          */
2807         skb_reset_mac_header(skb);
2808         skb_reset_network_header(skb);
2809
2810         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2811         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2812         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2813
2814         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2815         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2816         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2817         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2818
2819         if (iif) {
2820                 struct net_device *dev;
2821
2822                 dev = __dev_get_by_index(net, iif);
2823                 if (dev == NULL) {
2824                         err = -ENODEV;
2825                         goto errout_free;
2826                 }
2827
2828                 skb->protocol   = htons(ETH_P_IP);
2829                 skb->dev        = dev;
2830                 skb->mark       = mark;
2831                 local_bh_disable();
2832                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2833                 local_bh_enable();
2834
2835                 rt = skb_rtable(skb);
2836                 if (err == 0 && rt->dst.error)
2837                         err = -rt->dst.error;
2838         } else {
2839                 struct flowi4 fl4 = {
2840                         .daddr = dst,
2841                         .saddr = src,
2842                         .flowi4_tos = rtm->rtm_tos,
2843                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2844                         .flowi4_mark = mark,
2845                 };
2846                 rt = ip_route_output_key(net, &fl4);
2847
2848                 err = 0;
2849                 if (IS_ERR(rt))
2850                         err = PTR_ERR(rt);
2851         }
2852
2853         if (err)
2854                 goto errout_free;
2855
2856         skb_dst_set(skb, &rt->dst);
2857         if (rtm->rtm_flags & RTM_F_NOTIFY)
2858                 rt->rt_flags |= RTCF_NOTIFY;
2859
2860         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2861                            RTM_NEWROUTE, 0, 0);
2862         if (err <= 0)
2863                 goto errout_free;
2864
2865         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2866 errout:
2867         return err;
2868
2869 errout_free:
2870         kfree_skb(skb);
2871         goto errout;
2872 }
2873
2874 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2875 {
2876         struct rtable *rt;
2877         int h, s_h;
2878         int idx, s_idx;
2879         struct net *net;
2880
2881         net = sock_net(skb->sk);
2882
2883         s_h = cb->args[0];
2884         if (s_h < 0)
2885                 s_h = 0;
2886         s_idx = idx = cb->args[1];
2887         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2888                 if (!rt_hash_table[h].chain)
2889                         continue;
2890                 rcu_read_lock_bh();
2891                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2892                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2893                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2894                                 continue;
2895                         if (rt_is_expired(rt))
2896                                 continue;
2897                         skb_dst_set_noref(skb, &rt->dst);
2898                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2899                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2900                                          1, NLM_F_MULTI) <= 0) {
2901                                 skb_dst_drop(skb);
2902                                 rcu_read_unlock_bh();
2903                                 goto done;
2904                         }
2905                         skb_dst_drop(skb);
2906                 }
2907                 rcu_read_unlock_bh();
2908         }
2909
2910 done:
2911         cb->args[0] = h;
2912         cb->args[1] = idx;
2913         return skb->len;
2914 }
2915
2916 void ip_rt_multicast_event(struct in_device *in_dev)
2917 {
2918         rt_cache_flush(dev_net(in_dev->dev), 0);
2919 }
2920
2921 #ifdef CONFIG_SYSCTL
2922 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2923                                         void __user *buffer,
2924                                         size_t *lenp, loff_t *ppos)
2925 {
2926         if (write) {
2927                 int flush_delay;
2928                 ctl_table ctl;
2929                 struct net *net;
2930
2931                 memcpy(&ctl, __ctl, sizeof(ctl));
2932                 ctl.data = &flush_delay;
2933                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2934
2935                 net = (struct net *)__ctl->extra1;
2936                 rt_cache_flush(net, flush_delay);
2937                 return 0;
2938         }
2939
2940         return -EINVAL;
2941 }
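
/*
 * Editor's note (not part of the original file): this handler backs the
 * per-namespace "flush" sysctl, exposed as /proc/sys/net/ipv4/route/flush,
 * so a write such as
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * is parsed as a flush delay by proc_dointvec() above and forwarded to
 * rt_cache_flush() for that network namespace.
 */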
2942
2943 static ctl_table ipv4_route_table[] = {
2944         {
2945                 .procname       = "gc_thresh",
2946                 .data           = &ipv4_dst_ops.gc_thresh,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = proc_dointvec,
2950         },
2951         {
2952                 .procname       = "max_size",
2953                 .data           = &ip_rt_max_size,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = proc_dointvec,
2957         },
2958         {
2959                 /*  Deprecated. Use gc_min_interval_ms */
2960
2961                 .procname       = "gc_min_interval",
2962                 .data           = &ip_rt_gc_min_interval,
2963                 .maxlen         = sizeof(int),
2964                 .mode           = 0644,
2965                 .proc_handler   = proc_dointvec_jiffies,
2966         },
2967         {
2968                 .procname       = "gc_min_interval_ms",
2969                 .data           = &ip_rt_gc_min_interval,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = proc_dointvec_ms_jiffies,
2973         },
2974         {
2975                 .procname       = "gc_timeout",
2976                 .data           = &ip_rt_gc_timeout,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = proc_dointvec_jiffies,
2980         },
2981         {
2982                 .procname       = "gc_interval",
2983                 .data           = &ip_rt_gc_interval,
2984                 .maxlen         = sizeof(int),
2985                 .mode           = 0644,
2986                 .proc_handler   = proc_dointvec_jiffies,
2987         },
2988         {
2989                 .procname       = "redirect_load",
2990                 .data           = &ip_rt_redirect_load,
2991                 .maxlen         = sizeof(int),
2992                 .mode           = 0644,
2993                 .proc_handler   = proc_dointvec,
2994         },
2995         {
2996                 .procname       = "redirect_number",
2997                 .data           = &ip_rt_redirect_number,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = proc_dointvec,
3001         },
3002         {
3003                 .procname       = "redirect_silence",
3004                 .data           = &ip_rt_redirect_silence,
3005                 .maxlen         = sizeof(int),
3006                 .mode           = 0644,
3007                 .proc_handler   = proc_dointvec,
3008         },
3009         {
3010                 .procname       = "error_cost",
3011                 .data           = &ip_rt_error_cost,
3012                 .maxlen         = sizeof(int),
3013                 .mode           = 0644,
3014                 .proc_handler   = proc_dointvec,
3015         },
3016         {
3017                 .procname       = "error_burst",
3018                 .data           = &ip_rt_error_burst,
3019                 .maxlen         = sizeof(int),
3020                 .mode           = 0644,
3021                 .proc_handler   = proc_dointvec,
3022         },
3023         {
3024                 .procname       = "gc_elasticity",
3025                 .data           = &ip_rt_gc_elasticity,
3026                 .maxlen         = sizeof(int),
3027                 .mode           = 0644,
3028                 .proc_handler   = proc_dointvec,
3029         },
3030         {
3031                 .procname       = "mtu_expires",
3032                 .data           = &ip_rt_mtu_expires,
3033                 .maxlen         = sizeof(int),
3034                 .mode           = 0644,
3035                 .proc_handler   = proc_dointvec_jiffies,
3036         },
3037         {
3038                 .procname       = "min_pmtu",
3039                 .data           = &ip_rt_min_pmtu,
3040                 .maxlen         = sizeof(int),
3041                 .mode           = 0644,
3042                 .proc_handler   = proc_dointvec,
3043         },
3044         {
3045                 .procname       = "min_adv_mss",
3046                 .data           = &ip_rt_min_advmss,
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0644,
3049                 .proc_handler   = proc_dointvec,
3050         },
3051         { }
3052 };
3053
3054 static struct ctl_table ipv4_route_flush_table[] = {
3055         {
3056                 .procname       = "flush",
3057                 .maxlen         = sizeof(int),
3058                 .mode           = 0200,
3059                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3060         },
3061         { },
3062 };
3063
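/*
 * Register the per-namespace "flush" sysctl.  The table is duplicated
 * for every namespace other than init_net so that ->extra1 can carry a
 * pointer to the owning struct net before registration under
 * net/ipv4/route.
 */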
3064 static __net_init int sysctl_route_net_init(struct net *net)
3065 {
3066         struct ctl_table *tbl;
3067
3068         tbl = ipv4_route_flush_table;
3069         if (!net_eq(net, &init_net)) {
3070                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3071                 if (tbl == NULL)
3072                         goto err_dup;
3073         }
3074         tbl[0].extra1 = net;
3075
3076         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3077         if (net->ipv4.route_hdr == NULL)
3078                 goto err_reg;
3079         return 0;
3080
3081 err_reg:
3082         if (tbl != ipv4_route_flush_table)
3083                 kfree(tbl);
3084 err_dup:
3085         return -ENOMEM;
3086 }
3087
3088 static __net_exit void sysctl_route_net_exit(struct net *net)
3089 {
3090         struct ctl_table *tbl;
3091
3092         tbl = net->ipv4.route_hdr->ctl_table_arg;
3093         unregister_net_sysctl_table(net->ipv4.route_hdr);
3094         BUG_ON(tbl == ipv4_route_flush_table);
3095         kfree(tbl);
3096 }
3097
3098 static __net_initdata struct pernet_operations sysctl_route_ops = {
3099         .init = sysctl_route_net_init,
3100         .exit = sysctl_route_net_exit,
3101 };
3102 #endif
3103
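/* Start each namespace with random route and device-address generation ids. */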
3104 static __net_init int rt_genid_init(struct net *net)
3105 {
3106         get_random_bytes(&net->ipv4.rt_genid,
3107                          sizeof(net->ipv4.rt_genid));
3108         get_random_bytes(&net->ipv4.dev_addr_genid,
3109                          sizeof(net->ipv4.dev_addr_genid));
3110         return 0;
3111 }
3112
3113 static __net_initdata struct pernet_operations rt_genid_ops = {
3114         .init = rt_genid_init,
3115 };
3116
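/* Allocate and initialise the per-namespace inet_peer_base. */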
3117 static int __net_init ipv4_inetpeer_init(struct net *net)
3118 {
3119         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3120
3121         if (!bp)
3122                 return -ENOMEM;
3123         inet_peer_base_init(bp);
3124         net->ipv4.peers = bp;
3125         return 0;
3126 }
3127
3128 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3129 {
3130         struct inet_peer_base *bp = net->ipv4.peers;
3131
3132         net->ipv4.peers = NULL;
3133         inetpeer_invalidate_tree(bp);
3134         kfree(bp);
3135 }
3136
3137 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3138         .init   =       ipv4_inetpeer_init,
3139         .exit   =       ipv4_inetpeer_exit,
3140 };
3141
3142 #ifdef CONFIG_IP_ROUTE_CLASSID
3143 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3144 #endif /* CONFIG_IP_ROUTE_CLASSID */
3145
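/*
 * "rhash_entries=" boot parameter: optional override for the number of
 * route cache hash buckets sized in ip_rt_init().
 */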
3146 static __initdata unsigned long rhash_entries;
3147 static int __init set_rhash_entries(char *str)
3148 {
3149         ssize_t ret;
3150
3151         if (!str)
3152                 return 0;
3153
3154         ret = kstrtoul(str, 0, &rhash_entries);
3155         if (ret)
3156                 return 0;
3157
3158         return 1;
3159 }
3160 __setup("rhash_entries=", set_rhash_entries);
3161
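/*
 * Boot-time initialisation of the IPv4 routing layer: create the
 * ip_dst_cache slab and dst entry counters, size and allocate the route
 * cache hash table (honouring "rhash_entries="), derive gc_thresh and
 * ip_rt_max_size from the table size, bring up devinet and the FIB,
 * start the deferred cache expiry work, and register the proc files,
 * the RTM_GETROUTE handler and the per-namespace subsystems.
 */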
3162 int __init ip_rt_init(void)
3163 {
3164         int rc = 0;
3165
3166 #ifdef CONFIG_IP_ROUTE_CLASSID
3167         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3168         if (!ip_rt_acct)
3169                 panic("IP: failed to allocate ip_rt_acct\n");
3170 #endif
3171
3172         ipv4_dst_ops.kmem_cachep =
3173                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3174                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3175
3176         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3177
3178         if (dst_entries_init(&ipv4_dst_ops) < 0)
3179                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3180
3181         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3182                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3183
3184         rt_hash_table = (struct rt_hash_bucket *)
3185                 alloc_large_system_hash("IP route cache",
3186                                         sizeof(struct rt_hash_bucket),
3187                                         rhash_entries,
3188                                         (totalram_pages >= 128 * 1024) ?
3189                                         15 : 17,
3190                                         0,
3191                                         &rt_hash_log,
3192                                         &rt_hash_mask,
3193                                         0,
3194                                         rhash_entries ? 0 : 512 * 1024);
3195         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3196         rt_hash_lock_init();
3197
3198         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3199         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3200
3201         devinet_init();
3202         ip_fib_init();
3203
3204         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3205         expires_ljiffies = jiffies;
3206         schedule_delayed_work(&expires_work,
3207                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3208
3209         if (ip_rt_proc_init())
3210                 pr_err("Unable to create route proc files\n");
3211 #ifdef CONFIG_XFRM
3212         xfrm_init();
3213         xfrm4_init(ip_rt_max_size);
3214 #endif
3215         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3216
3217 #ifdef CONFIG_SYSCTL
3218         register_pernet_subsys(&sysctl_route_ops);
3219 #endif
3220         register_pernet_subsys(&rt_genid_ops);
3221         register_pernet_subsys(&ipv4_inetpeer_ops);
3222         return rc;
3223 }
3224
3225 #ifdef CONFIG_SYSCTL
3226 /*
3227  * We really need to sanitize the damn ipv4 init order, then all
3228  * this nonsense will go away.
3229  */
3230 void __init ip_static_sysctl_init(void)
3231 {
3232         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3233 }
3234 #endif