ipv4: Don't miss existing cached metrics in new routes.
[linux-2.6.git] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151                             int how)
152 {
153 }
154
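/*
 * Copy-on-write handler for dst metrics: the writable copy lives in the
 * route's inet_peer.  On the first write the read-only metrics referenced
 * by @old are copied into peer->metrics and dst->_metrics is switched over
 * with cmpxchg().  If another CPU won the race we fall back to its pointer
 * (or NULL if that copy is still read-only); if we won, the route no longer
 * needs the metrics held by rt->fi, so the fib_info reference is dropped.
 */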
155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 {
157         struct rtable *rt = (struct rtable *) dst;
158         struct inet_peer *peer;
159         u32 *p = NULL;
160
161         if (!rt->peer)
162                 rt_bind_peer(rt, 1);
163
164         peer = rt->peer;
165         if (peer) {
166                 u32 *old_p = __DST_METRICS_PTR(old);
167                 unsigned long prev, new;
168
169                 p = peer->metrics;
170                 if (inet_metrics_new(peer))
171                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172
173                 new = (unsigned long) p;
174                 prev = cmpxchg(&dst->_metrics, old, new);
175
176                 if (prev != old) {
177                         p = __DST_METRICS_PTR(prev);
178                         if (prev & DST_METRICS_READ_ONLY)
179                                 p = NULL;
180                 } else {
181                         if (rt->fi) {
182                                 fib_info_put(rt->fi);
183                                 rt->fi = NULL;
184                         }
185                 }
186         }
187         return p;
188 }
189
190 static struct dst_ops ipv4_dst_ops = {
191         .family =               AF_INET,
192         .protocol =             cpu_to_be16(ETH_P_IP),
193         .gc =                   rt_garbage_collect,
194         .check =                ipv4_dst_check,
195         .default_advmss =       ipv4_default_advmss,
196         .default_mtu =          ipv4_default_mtu,
197         .cow_metrics =          ipv4_cow_metrics,
198         .destroy =              ipv4_dst_destroy,
199         .ifdown =               ipv4_dst_ifdown,
200         .negative_advice =      ipv4_negative_advice,
201         .link_failure =         ipv4_link_failure,
202         .update_pmtu =          ip_rt_update_pmtu,
203         .local_out =            __ip_local_out,
204 };
205
206 #define ECN_OR_COST(class)      TC_PRIO_##class
207
208 const __u8 ip_tos2prio[16] = {
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(FILLER),
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK)
225 };
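/*
 * Illustrative note: the table is indexed by the four TOS bits, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (this is how rt_tos2priority() uses it),
 * so for example IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE.
 */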
226
227
228 /*
229  * Route cache.
230  */
231
232 /* The locking scheme is rather straightforward:
233  *
234  * 1) Read-Copy Update protects the buckets of the central route hash.
235  * 2) Only writers remove entries, and they hold the lock
236  *    as they look at rtable reference counts.
237  * 3) Only readers acquire references to rtable entries,
238  *    they do so with atomic increments and with the
239  *    RCU lock held.
240  */
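/*
 * Concretely: readers (e.g. rt_cache_get_first() below) walk a chain under
 * rcu_read_lock_bh() using rcu_dereference_bh(), while writers such as
 * rt_do_flush(), rt_check_expire() and rt_intern_hash() take
 * spin_lock_bh(rt_hash_lock_addr(hash)) before modifying a chain and hand
 * removed entries to rt_free(), which defers freeing via call_rcu_bh().
 */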
241
242 struct rt_hash_bucket {
243         struct rtable __rcu     *chain;
244 };
245
246 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247         defined(CONFIG_PROVE_LOCKING)
248 /*
249  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
250  * The size of this table is a power of two and depends on the number of CPUs.
251  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
252  */
253 #ifdef CONFIG_LOCKDEP
254 # define RT_HASH_LOCK_SZ        256
255 #else
256 # if NR_CPUS >= 32
257 #  define RT_HASH_LOCK_SZ       4096
258 # elif NR_CPUS >= 16
259 #  define RT_HASH_LOCK_SZ       2048
260 # elif NR_CPUS >= 8
261 #  define RT_HASH_LOCK_SZ       1024
262 # elif NR_CPUS >= 4
263 #  define RT_HASH_LOCK_SZ       512
264 # else
265 #  define RT_HASH_LOCK_SZ       256
266 # endif
267 #endif
268
269 static spinlock_t       *rt_hash_locks;
270 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272 static __init void rt_hash_lock_init(void)
273 {
274         int i;
275
276         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277                         GFP_KERNEL);
278         if (!rt_hash_locks)
279                 panic("IP: failed to allocate rt_hash_locks\n");
280
281         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282                 spin_lock_init(&rt_hash_locks[i]);
283 }
284 #else
285 # define rt_hash_lock_addr(slot) NULL
286
287 static inline void rt_hash_lock_init(void)
288 {
289 }
290 #endif
291
292 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
293 static unsigned                 rt_hash_mask __read_mostly;
294 static unsigned int             rt_hash_log  __read_mostly;
295
296 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
299 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300                                    int genid)
301 {
302         return jhash_3words((__force u32)daddr, (__force u32)saddr,
303                             idx, genid)
304                 & rt_hash_mask;
305 }
306
307 static inline int rt_genid(struct net *net)
308 {
309         return atomic_read(&net->ipv4.rt_genid);
310 }
311
312 #ifdef CONFIG_PROC_FS
313 struct rt_cache_iter_state {
314         struct seq_net_private p;
315         int bucket;
316         int genid;
317 };
318
319 static struct rtable *rt_cache_get_first(struct seq_file *seq)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         struct rtable *r = NULL;
323
324         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
326                         continue;
327                 rcu_read_lock_bh();
328                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329                 while (r) {
330                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331                             r->rt_genid == st->genid)
332                                 return r;
333                         r = rcu_dereference_bh(r->dst.rt_next);
334                 }
335                 rcu_read_unlock_bh();
336         }
337         return r;
338 }
339
340 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341                                           struct rtable *r)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344
345         r = rcu_dereference_bh(r->dst.rt_next);
346         while (!r) {
347                 rcu_read_unlock_bh();
348                 do {
349                         if (--st->bucket < 0)
350                                 return NULL;
351                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
352                 rcu_read_lock_bh();
353                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354         }
355         return r;
356 }
357
358 static struct rtable *rt_cache_get_next(struct seq_file *seq,
359                                         struct rtable *r)
360 {
361         struct rt_cache_iter_state *st = seq->private;
362         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363                 if (dev_net(r->dst.dev) != seq_file_net(seq))
364                         continue;
365                 if (r->rt_genid == st->genid)
366                         break;
367         }
368         return r;
369 }
370
371 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372 {
373         struct rtable *r = rt_cache_get_first(seq);
374
375         if (r)
376                 while (pos && (r = rt_cache_get_next(seq, r)))
377                         --pos;
378         return pos ? NULL : r;
379 }
380
381 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382 {
383         struct rt_cache_iter_state *st = seq->private;
384         if (*pos)
385                 return rt_cache_get_idx(seq, *pos - 1);
386         st->genid = rt_genid(seq_file_net(seq));
387         return SEQ_START_TOKEN;
388 }
389
390 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391 {
392         struct rtable *r;
393
394         if (v == SEQ_START_TOKEN)
395                 r = rt_cache_get_first(seq);
396         else
397                 r = rt_cache_get_next(seq, v);
398         ++*pos;
399         return r;
400 }
401
402 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403 {
404         if (v && v != SEQ_START_TOKEN)
405                 rcu_read_unlock_bh();
406 }
407
408 static int rt_cache_seq_show(struct seq_file *seq, void *v)
409 {
410         if (v == SEQ_START_TOKEN)
411                 seq_printf(seq, "%-127s\n",
412                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414                            "HHUptod\tSpecDst");
415         else {
416                 struct rtable *r = v;
417                 int len;
418
419                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
420                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
421                         r->dst.dev ? r->dst.dev->name : "*",
422                         (__force u32)r->rt_dst,
423                         (__force u32)r->rt_gateway,
424                         r->rt_flags, atomic_read(&r->dst.__refcnt),
425                         r->dst.__use, 0, (__force u32)r->rt_src,
426                         dst_metric_advmss(&r->dst) + 40,
427                         dst_metric(&r->dst, RTAX_WINDOW),
428                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
429                               dst_metric(&r->dst, RTAX_RTTVAR)),
430                         r->fl.fl4_tos,
431                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
432                         r->dst.hh ? (r->dst.hh->hh_output ==
433                                        dev_queue_xmit) : 0,
434                         r->rt_spec_dst, &len);
435
436                 seq_printf(seq, "%*s\n", 127 - len, "");
437         }
438         return 0;
439 }
440
441 static const struct seq_operations rt_cache_seq_ops = {
442         .start  = rt_cache_seq_start,
443         .next   = rt_cache_seq_next,
444         .stop   = rt_cache_seq_stop,
445         .show   = rt_cache_seq_show,
446 };
447
448 static int rt_cache_seq_open(struct inode *inode, struct file *file)
449 {
450         return seq_open_net(inode, file, &rt_cache_seq_ops,
451                         sizeof(struct rt_cache_iter_state));
452 }
453
454 static const struct file_operations rt_cache_seq_fops = {
455         .owner   = THIS_MODULE,
456         .open    = rt_cache_seq_open,
457         .read    = seq_read,
458         .llseek  = seq_lseek,
459         .release = seq_release_net,
460 };
461
462
463 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
464 {
465         int cpu;
466
467         if (*pos == 0)
468                 return SEQ_START_TOKEN;
469
470         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
471                 if (!cpu_possible(cpu))
472                         continue;
473                 *pos = cpu+1;
474                 return &per_cpu(rt_cache_stat, cpu);
475         }
476         return NULL;
477 }
478
479 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
480 {
481         int cpu;
482
483         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
484                 if (!cpu_possible(cpu))
485                         continue;
486                 *pos = cpu+1;
487                 return &per_cpu(rt_cache_stat, cpu);
488         }
489         return NULL;
490
491 }
492
493 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
494 {
495
496 }
497
498 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
499 {
500         struct rt_cache_stat *st = v;
501
502         if (v == SEQ_START_TOKEN) {
503                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
504                 return 0;
505         }
506
507         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
508                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
509                    dst_entries_get_slow(&ipv4_dst_ops),
510                    st->in_hit,
511                    st->in_slow_tot,
512                    st->in_slow_mc,
513                    st->in_no_route,
514                    st->in_brd,
515                    st->in_martian_dst,
516                    st->in_martian_src,
517
518                    st->out_hit,
519                    st->out_slow_tot,
520                    st->out_slow_mc,
521
522                    st->gc_total,
523                    st->gc_ignored,
524                    st->gc_goal_miss,
525                    st->gc_dst_overflow,
526                    st->in_hlist_search,
527                    st->out_hlist_search
528                 );
529         return 0;
530 }
531
532 static const struct seq_operations rt_cpu_seq_ops = {
533         .start  = rt_cpu_seq_start,
534         .next   = rt_cpu_seq_next,
535         .stop   = rt_cpu_seq_stop,
536         .show   = rt_cpu_seq_show,
537 };
538
539
540 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
541 {
542         return seq_open(file, &rt_cpu_seq_ops);
543 }
544
545 static const struct file_operations rt_cpu_seq_fops = {
546         .owner   = THIS_MODULE,
547         .open    = rt_cpu_seq_open,
548         .read    = seq_read,
549         .llseek  = seq_lseek,
550         .release = seq_release,
551 };
552
553 #ifdef CONFIG_IP_ROUTE_CLASSID
554 static int rt_acct_proc_show(struct seq_file *m, void *v)
555 {
556         struct ip_rt_acct *dst, *src;
557         unsigned int i, j;
558
559         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
560         if (!dst)
561                 return -ENOMEM;
562
563         for_each_possible_cpu(i) {
564                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
565                 for (j = 0; j < 256; j++) {
566                         dst[j].o_bytes   += src[j].o_bytes;
567                         dst[j].o_packets += src[j].o_packets;
568                         dst[j].i_bytes   += src[j].i_bytes;
569                         dst[j].i_packets += src[j].i_packets;
570                 }
571         }
572
573         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
574         kfree(dst);
575         return 0;
576 }
577
578 static int rt_acct_proc_open(struct inode *inode, struct file *file)
579 {
580         return single_open(file, rt_acct_proc_show, NULL);
581 }
582
583 static const struct file_operations rt_acct_proc_fops = {
584         .owner          = THIS_MODULE,
585         .open           = rt_acct_proc_open,
586         .read           = seq_read,
587         .llseek         = seq_lseek,
588         .release        = single_release,
589 };
590 #endif
591
592 static int __net_init ip_rt_do_proc_init(struct net *net)
593 {
594         struct proc_dir_entry *pde;
595
596         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
597                         &rt_cache_seq_fops);
598         if (!pde)
599                 goto err1;
600
601         pde = proc_create("rt_cache", S_IRUGO,
602                           net->proc_net_stat, &rt_cpu_seq_fops);
603         if (!pde)
604                 goto err2;
605
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
608         if (!pde)
609                 goto err3;
610 #endif
611         return 0;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 err3:
615         remove_proc_entry("rt_cache", net->proc_net_stat);
616 #endif
617 err2:
618         remove_proc_entry("rt_cache", net->proc_net);
619 err1:
620         return -ENOMEM;
621 }
622
623 static void __net_exit ip_rt_do_proc_exit(struct net *net)
624 {
625         remove_proc_entry("rt_cache", net->proc_net_stat);
626         remove_proc_entry("rt_cache", net->proc_net);
627 #ifdef CONFIG_IP_ROUTE_CLASSID
628         remove_proc_entry("rt_acct", net->proc_net);
629 #endif
630 }
631
632 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
633         .init = ip_rt_do_proc_init,
634         .exit = ip_rt_do_proc_exit,
635 };
636
637 static int __init ip_rt_proc_init(void)
638 {
639         return register_pernet_subsys(&ip_rt_proc_ops);
640 }
641
642 #else
643 static inline int ip_rt_proc_init(void)
644 {
645         return 0;
646 }
647 #endif /* CONFIG_PROC_FS */
648
649 static inline void rt_free(struct rtable *rt)
650 {
651         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
652 }
653
654 static inline void rt_drop(struct rtable *rt)
655 {
656         ip_rt_put(rt);
657         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
658 }
659
660 static inline int rt_fast_clean(struct rtable *rth)
661 {
662         /* Kill broadcast/multicast entries very aggressively, if they
663            collide in the hash table with more useful entries */
664         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
665                 rt_is_input_route(rth) && rth->dst.rt_next;
666 }
667
668 static inline int rt_valuable(struct rtable *rth)
669 {
670         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
671                 rth->dst.expires;
672 }
673
674 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
675 {
676         unsigned long age;
677         int ret = 0;
678
679         if (atomic_read(&rth->dst.__refcnt))
680                 goto out;
681
682         ret = 1;
683         if (rth->dst.expires &&
684             time_after_eq(jiffies, rth->dst.expires))
685                 goto out;
686
687         age = jiffies - rth->dst.lastuse;
688         ret = 0;
689         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
690             (age <= tmo2 && rt_valuable(rth)))
691                 goto out;
692         ret = 1;
693 out:    return ret;
694 }
695
696 /* Bits of score are:
697  * 31: very valuable
698  * 30: not quite useless
699  * 29..0: usage counter
700  */
701 static inline u32 rt_score(struct rtable *rt)
702 {
703         u32 score = jiffies - rt->dst.lastuse;
704
705         score = ~score & ~(3<<30);
706
707         if (rt_valuable(rt))
708                 score |= (1<<31);
709
710         if (rt_is_output_route(rt) ||
711             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
712                 score |= (1<<30);
713
714         return score;
715 }
716
717 static inline bool rt_caching(const struct net *net)
718 {
719         return net->ipv4.current_rt_cache_rebuild_count <=
720                 net->ipv4.sysctl_rt_cache_rebuild_count;
721 }
722
723 static inline bool compare_hash_inputs(const struct flowi *fl1,
724                                         const struct flowi *fl2)
725 {
726         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
727                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
728                 (fl1->iif ^ fl2->iif)) == 0);
729 }
730
731 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
732 {
733         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
734                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
735                 (fl1->mark ^ fl2->mark) |
736                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
737                 (fl1->oif ^ fl2->oif) |
738                 (fl1->iif ^ fl2->iif)) == 0;
739 }
740
741 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
742 {
743         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
744 }
745
746 static inline int rt_is_expired(struct rtable *rth)
747 {
748         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
749 }
750
751 /*
752  * Perform a full scan of hash table and free all entries.
753  * Can be called by a softirq or a process.
754  * In the latter case, we want to reschedule if necessary.
755  */
756 static void rt_do_flush(struct net *net, int process_context)
757 {
758         unsigned int i;
759         struct rtable *rth, *next;
760
761         for (i = 0; i <= rt_hash_mask; i++) {
762                 struct rtable __rcu **pprev;
763                 struct rtable *list;
764
765                 if (process_context && need_resched())
766                         cond_resched();
767                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
768                 if (!rth)
769                         continue;
770
771                 spin_lock_bh(rt_hash_lock_addr(i));
772
773                 list = NULL;
774                 pprev = &rt_hash_table[i].chain;
775                 rth = rcu_dereference_protected(*pprev,
776                         lockdep_is_held(rt_hash_lock_addr(i)));
777
778                 while (rth) {
779                         next = rcu_dereference_protected(rth->dst.rt_next,
780                                 lockdep_is_held(rt_hash_lock_addr(i)));
781
782                         if (!net ||
783                             net_eq(dev_net(rth->dst.dev), net)) {
784                                 rcu_assign_pointer(*pprev, next);
785                                 rcu_assign_pointer(rth->dst.rt_next, list);
786                                 list = rth;
787                         } else {
788                                 pprev = &rth->dst.rt_next;
789                         }
790                         rth = next;
791                 }
792
793                 spin_unlock_bh(rt_hash_lock_addr(i));
794
795                 for (; list; list = next) {
796                         next = rcu_dereference_protected(list->dst.rt_next, 1);
797                         rt_free(list);
798                 }
799         }
800 }
801
802 /*
803  * While freeing expired entries, we compute average chain length
804  * and standard deviation, using fixed-point arithmetic.
805  * This is to get an estimate of rt_chain_length_max:
806  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
807  * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
808  */
809
810 #define FRACT_BITS 3
811 #define ONE (1UL << FRACT_BITS)
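/*
 * Worked example of the fixed-point bookkeeping: has_noalias() below
 * contributes ONE (= 8) for each entry whose hash inputs are distinct, so
 * chain lengths are accumulated in 1/8th units.  If the scanned buckets
 * average 2.5 distinct entries, rt_check_expire() sees avg = 20, and the
 * ">> FRACT_BITS" after adding 4*sd converts the result back to whole
 * entries for rt_chain_length_max.
 */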
812
813 /*
814  * Given a hash chain and an item in this hash chain,
815  * find whether a previous entry has the same hash_inputs
816  * (but differs on tos, mark or oif).
817  * Returns 0 if an alias is found.
818  * Returns ONE if rth has no alias before itself.
819  */
820 static int has_noalias(const struct rtable *head, const struct rtable *rth)
821 {
822         const struct rtable *aux = head;
823
824         while (aux != rth) {
825                 if (compare_hash_inputs(&aux->fl, &rth->fl))
826                         return 0;
827                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
828         }
829         return ONE;
830 }
831
832 static void rt_check_expire(void)
833 {
834         static unsigned int rover;
835         unsigned int i = rover, goal;
836         struct rtable *rth;
837         struct rtable __rcu **rthp;
838         unsigned long samples = 0;
839         unsigned long sum = 0, sum2 = 0;
840         unsigned long delta;
841         u64 mult;
842
843         delta = jiffies - expires_ljiffies;
844         expires_ljiffies = jiffies;
845         mult = ((u64)delta) << rt_hash_log;
846         if (ip_rt_gc_timeout > 1)
847                 do_div(mult, ip_rt_gc_timeout);
848         goal = (unsigned int)mult;
849         if (goal > rt_hash_mask)
850                 goal = rt_hash_mask + 1;
851         for (; goal > 0; goal--) {
852                 unsigned long tmo = ip_rt_gc_timeout;
853                 unsigned long length;
854
855                 i = (i + 1) & rt_hash_mask;
856                 rthp = &rt_hash_table[i].chain;
857
858                 if (need_resched())
859                         cond_resched();
860
861                 samples++;
862
863                 if (rcu_dereference_raw(*rthp) == NULL)
864                         continue;
865                 length = 0;
866                 spin_lock_bh(rt_hash_lock_addr(i));
867                 while ((rth = rcu_dereference_protected(*rthp,
868                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
869                         prefetch(rth->dst.rt_next);
870                         if (rt_is_expired(rth)) {
871                                 *rthp = rth->dst.rt_next;
872                                 rt_free(rth);
873                                 continue;
874                         }
875                         if (rth->dst.expires) {
876                                 /* Entry is expired even if it is in use */
877                                 if (time_before_eq(jiffies, rth->dst.expires)) {
878 nofree:
879                                         tmo >>= 1;
880                                         rthp = &rth->dst.rt_next;
881                                         /*
882                                          * We only count entries on
883                                          * a chain with equal hash inputs once
884                                          * so that entries for different QoS
885                                          * levels and other non-hash-input
886                                          * attributes don't unfairly skew
887                                          * the length computation
888                                          */
889                                         length += has_noalias(rt_hash_table[i].chain, rth);
890                                         continue;
891                                 }
892                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
893                                 goto nofree;
894
895                         /* Cleanup aged off entries. */
896                         *rthp = rth->dst.rt_next;
897                         rt_free(rth);
898                 }
899                 spin_unlock_bh(rt_hash_lock_addr(i));
900                 sum += length;
901                 sum2 += length*length;
902         }
903         if (samples) {
904                 unsigned long avg = sum / samples;
905                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
906                 rt_chain_length_max = max_t(unsigned long,
907                                         ip_rt_gc_elasticity,
908                                         (avg + 4*sd) >> FRACT_BITS);
909         }
910         rover = i;
911 }
912
913 /*
914  * rt_worker_func() is run in process context.
915  * We call rt_check_expire() to scan part of the hash table.
916  */
917 static void rt_worker_func(struct work_struct *work)
918 {
919         rt_check_expire();
920         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
921 }
922
923 /*
924  * Perturbation of rt_genid by a small quantity [1..256].
925  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
926  * many times (2^24) without repeating a recent rt_genid.
927  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
928  */
929 static void rt_cache_invalidate(struct net *net)
930 {
931         unsigned char shuffle;
932
933         get_random_bytes(&shuffle, sizeof(shuffle));
934         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
935 }
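/*
 * After the bump, entries created with the old value fail the comparison in
 * rt_is_expired() above; they are then reaped lazily by rt_check_expire()
 * and by writers walking a chain in rt_intern_hash(), or eagerly by
 * rt_do_flush() when rt_cache_flush() is called with delay >= 0.
 */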
936
937 /*
938  * delay < 0  : invalidate cache (fast : entries will be deleted later)
939  * delay >= 0 : invalidate & flush cache (can be long)
940  */
941 void rt_cache_flush(struct net *net, int delay)
942 {
943         rt_cache_invalidate(net);
944         if (delay >= 0)
945                 rt_do_flush(net, !in_softirq());
946 }
947
948 /* Flush previously invalidated entries from the cache */
949 void rt_cache_flush_batch(struct net *net)
950 {
951         rt_do_flush(net, !in_softirq());
952 }
953
954 static void rt_emergency_hash_rebuild(struct net *net)
955 {
956         if (net_ratelimit())
957                 printk(KERN_WARNING "Route hash chain too long!\n");
958         rt_cache_invalidate(net);
959 }
960
961 /*
962    Short description of GC goals.
963
964    We want to build an algorithm which keeps the routing cache
965    at some equilibrium point, where the number of aged-off entries
966    is kept approximately equal to the number of newly generated ones.
967
968    The current expiration strength is the variable "expire".
969    We try to adjust it dynamically, so that when the network is idle,
970    expire is large enough to keep enough warm entries,
971    and when load increases it shrinks to limit the cache size.
972  */
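/*
 * In rt_garbage_collect() below this adaptation shows up as: "expire" is
 * halved each time the goal is missed and the scan continues, and it is
 * raised again by ip_rt_gc_min_interval once the goal is met (reset to the
 * full ip_rt_gc_timeout when it overshoots or when the cache is back below
 * gc_thresh).
 */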
973
974 static int rt_garbage_collect(struct dst_ops *ops)
975 {
976         static unsigned long expire = RT_GC_TIMEOUT;
977         static unsigned long last_gc;
978         static int rover;
979         static int equilibrium;
980         struct rtable *rth;
981         struct rtable __rcu **rthp;
982         unsigned long now = jiffies;
983         int goal;
984         int entries = dst_entries_get_fast(&ipv4_dst_ops);
985
986         /*
987          * Garbage collection is pretty expensive,
988          * do not run it too frequently.
989          */
990
991         RT_CACHE_STAT_INC(gc_total);
992
993         if (now - last_gc < ip_rt_gc_min_interval &&
994             entries < ip_rt_max_size) {
995                 RT_CACHE_STAT_INC(gc_ignored);
996                 goto out;
997         }
998
999         entries = dst_entries_get_slow(&ipv4_dst_ops);
1000         /* Calculate the number of entries which we want to expire now. */
1001         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1002         if (goal <= 0) {
1003                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004                         equilibrium = ipv4_dst_ops.gc_thresh;
1005                 goal = entries - equilibrium;
1006                 if (goal > 0) {
1007                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008                         goal = entries - equilibrium;
1009                 }
1010         } else {
1011                 /* We are in a dangerous area. Try to reduce the cache really
1012                  * aggressively.
1013                  */
1014                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                 equilibrium = entries - goal;
1016         }
1017
1018         if (now - last_gc >= ip_rt_gc_min_interval)
1019                 last_gc = now;
1020
1021         if (goal <= 0) {
1022                 equilibrium += goal;
1023                 goto work_done;
1024         }
1025
1026         do {
1027                 int i, k;
1028
1029                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030                         unsigned long tmo = expire;
1031
1032                         k = (k + 1) & rt_hash_mask;
1033                         rthp = &rt_hash_table[k].chain;
1034                         spin_lock_bh(rt_hash_lock_addr(k));
1035                         while ((rth = rcu_dereference_protected(*rthp,
1036                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1037                                 if (!rt_is_expired(rth) &&
1038                                         !rt_may_expire(rth, tmo, expire)) {
1039                                         tmo >>= 1;
1040                                         rthp = &rth->dst.rt_next;
1041                                         continue;
1042                                 }
1043                                 *rthp = rth->dst.rt_next;
1044                                 rt_free(rth);
1045                                 goal--;
1046                         }
1047                         spin_unlock_bh(rt_hash_lock_addr(k));
1048                         if (goal <= 0)
1049                                 break;
1050                 }
1051                 rover = k;
1052
1053                 if (goal <= 0)
1054                         goto work_done;
1055
1056                 /* Goal is not achieved. We stop the process if:
1057
1058                    - expire is reduced to zero; otherwise expire is halved.
1059                    - the table is not full.
1060                    - we are called from interrupt.
1061                    - the jiffies check is just a fallback/debug loop breaker.
1062                      We will not spin here for a long time in any case.
1063                  */
1064
1065                 RT_CACHE_STAT_INC(gc_goal_miss);
1066
1067                 if (expire == 0)
1068                         break;
1069
1070                 expire >>= 1;
1071 #if RT_CACHE_DEBUG >= 2
1072                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1073                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1074 #endif
1075
1076                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1077                         goto out;
1078         } while (!in_softirq() && time_before_eq(jiffies, now));
1079
1080         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                 goto out;
1082         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (net_ratelimit())
1085                 printk(KERN_WARNING "dst cache overflow\n");
1086         RT_CACHE_STAT_INC(gc_dst_overflow);
1087         return 1;
1088
1089 work_done:
1090         expire += ip_rt_gc_min_interval;
1091         if (expire > ip_rt_gc_timeout ||
1092             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1093             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1094                 expire = ip_rt_gc_timeout;
1095 #if RT_CACHE_DEBUG >= 2
1096         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1097                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1098 #endif
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns the number of entries in a hash chain that have different hash_inputs
1104  */
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
1117 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1118                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1119 {
1120         struct rtable   *rth, *cand;
1121         struct rtable __rcu **rthp, **candp;
1122         unsigned long   now;
1123         u32             min_score;
1124         int             chain_length;
1125         int attempts = !in_softirq();
1126
1127 restart:
1128         chain_length = 0;
1129         min_score = ~(u32)0;
1130         cand = NULL;
1131         candp = NULL;
1132         now = jiffies;
1133
1134         if (!rt_caching(dev_net(rt->dst.dev))) {
1135                 /*
1136                  * If we're not caching, just tell the caller we
1137                  * were successful and don't touch the route.  The
1138                  * caller holds the sole reference to the cache entry, and
1139                  * it will be released when the caller is done with it.
1140                  * If we drop it here, the callers have no way to resolve routes
1141                  * when we're not caching.  Instead, just point *rp at rt, so
1142                  * the caller gets a single use out of the route.
1143                  * Note that we do rt_free on this new route entry, so that
1144                  * once its refcount hits zero, we are still able to reap it
1145                  * (Thanks Alexey)
1146                  * Note: To avoid expensive rcu stuff for this uncached dst,
1147                  * we set DST_NOCACHE so that dst_release() can free dst without
1148                  * waiting a grace period.
1149                  */
1150
1151                 rt->dst.flags |= DST_NOCACHE;
1152                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1153                         int err = arp_bind_neighbour(&rt->dst);
1154                         if (err) {
1155                                 if (net_ratelimit())
1156                                         printk(KERN_WARNING
1157                                             "Neighbour table failure & not caching routes.\n");
1158                                 ip_rt_put(rt);
1159                                 return err;
1160                         }
1161                 }
1162
1163                 goto skip_hashing;
1164         }
1165
1166         rthp = &rt_hash_table[hash].chain;
1167
1168         spin_lock_bh(rt_hash_lock_addr(hash));
1169         while ((rth = rcu_dereference_protected(*rthp,
1170                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1171                 if (rt_is_expired(rth)) {
1172                         *rthp = rth->dst.rt_next;
1173                         rt_free(rth);
1174                         continue;
1175                 }
1176                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1177                         /* Put it first */
1178                         *rthp = rth->dst.rt_next;
1179                         /*
1180                          * Since lookup is lockfree, the deletion
1181                          * must be visible to another weakly ordered CPU before
1182                          * the insertion at the start of the hash chain.
1183                          */
1184                         rcu_assign_pointer(rth->dst.rt_next,
1185                                            rt_hash_table[hash].chain);
1186                         /*
1187                          * Since lookup is lockfree, the update writes
1188                          * must be ordered for consistency on SMP.
1189                          */
1190                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1191
1192                         dst_use(&rth->dst, now);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         rt_drop(rt);
1196                         if (rp)
1197                                 *rp = rth;
1198                         else
1199                                 skb_dst_set(skb, &rth->dst);
1200                         return 0;
1201                 }
1202
1203                 if (!atomic_read(&rth->dst.__refcnt)) {
1204                         u32 score = rt_score(rth);
1205
1206                         if (score <= min_score) {
1207                                 cand = rth;
1208                                 candp = rthp;
1209                                 min_score = score;
1210                         }
1211                 }
1212
1213                 chain_length++;
1214
1215                 rthp = &rth->dst.rt_next;
1216         }
1217
1218         if (cand) {
1219                 /* ip_rt_gc_elasticity used to be the average chain
1220                  * length; when exceeded, gc becomes really aggressive.
1221                  *
1222                  * The second limit is less certain. At the moment it allows
1223                  * only 2 entries per bucket. We will see.
1224                  */
1225                 if (chain_length > ip_rt_gc_elasticity) {
1226                         *candp = cand->dst.rt_next;
1227                         rt_free(cand);
1228                 }
1229         } else {
1230                 if (chain_length > rt_chain_length_max &&
1231                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1232                         struct net *net = dev_net(rt->dst.dev);
1233                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1234                         if (!rt_caching(net)) {
1235                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1236                                         rt->dst.dev->name, num);
1237                         }
1238                         rt_emergency_hash_rebuild(net);
1239                         spin_unlock_bh(rt_hash_lock_addr(hash));
1240
1241                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1242                                         ifindex, rt_genid(net));
1243                         goto restart;
1244                 }
1245         }
1246
1247         /* Try to bind the route to an ARP neighbour only if it is an output
1248            route or on the unicast forwarding path.
1249          */
1250         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1251                 int err = arp_bind_neighbour(&rt->dst);
1252                 if (err) {
1253                         spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255                         if (err != -ENOBUFS) {
1256                                 rt_drop(rt);
1257                                 return err;
1258                         }
1259
1260                         /* Neighbour tables are full and nothing
1261                            can be released. Try to shrink the route cache;
1262                            it most likely holds some neighbour records.
1263                          */
1264                         if (attempts-- > 0) {
1265                                 int saved_elasticity = ip_rt_gc_elasticity;
1266                                 int saved_int = ip_rt_gc_min_interval;
1267                                 ip_rt_gc_elasticity     = 1;
1268                                 ip_rt_gc_min_interval   = 0;
1269                                 rt_garbage_collect(&ipv4_dst_ops);
1270                                 ip_rt_gc_min_interval   = saved_int;
1271                                 ip_rt_gc_elasticity     = saved_elasticity;
1272                                 goto restart;
1273                         }
1274
1275                         if (net_ratelimit())
1276                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1277                         rt_drop(rt);
1278                         return -ENOBUFS;
1279                 }
1280         }
1281
1282         rt->dst.rt_next = rt_hash_table[hash].chain;
1283
1284 #if RT_CACHE_DEBUG >= 2
1285         if (rt->dst.rt_next) {
1286                 struct rtable *trt;
1287                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1288                        hash, &rt->rt_dst);
1289                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1290                         printk(" . %pI4", &trt->rt_dst);
1291                 printk("\n");
1292         }
1293 #endif
1294         /*
1295          * Since lookup is lockfree, we must make sure
1296          * previous writes to rt are committed to memory
1297          * before making rt visible to other CPUs.
1298          */
1299         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1300
1301         spin_unlock_bh(rt_hash_lock_addr(hash));
1302
1303 skip_hashing:
1304         if (rp)
1305                 *rp = rt;
1306         else
1307                 skb_dst_set(skb, &rt->dst);
1308         return 0;
1309 }
1310
1311 void rt_bind_peer(struct rtable *rt, int create)
1312 {
1313         struct inet_peer *peer;
1314
1315         peer = inet_getpeer_v4(rt->rt_dst, create);
1316
1317         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1318                 inet_putpeer(peer);
1319 }
1320
1321 /*
1322  * Peer allocation may fail only in serious out-of-memory conditions.  However
1323  * we can still generate some output.
1324  * Random ID selection looks a bit dangerous because we have no chance of
1325  * selecting an ID that is unique within a reasonable period of time.
1326  * But a broken packet identifier may be better than no packet at all.
1327  */
1328 static void ip_select_fb_ident(struct iphdr *iph)
1329 {
1330         static DEFINE_SPINLOCK(ip_fb_id_lock);
1331         static u32 ip_fallback_id;
1332         u32 salt;
1333
1334         spin_lock_bh(&ip_fb_id_lock);
1335         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1336         iph->id = htons(salt & 0xFFFF);
1337         ip_fallback_id = salt;
1338         spin_unlock_bh(&ip_fb_id_lock);
1339 }
1340
1341 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1342 {
1343         struct rtable *rt = (struct rtable *) dst;
1344
1345         if (rt) {
1346                 if (rt->peer == NULL)
1347                         rt_bind_peer(rt, 1);
1348
1349                 /* If peer is attached to destination, it is never detached,
1350                    so we do not need to grab a lock to dereference it.
1351                  */
1352                 if (rt->peer) {
1353                         iph->id = htons(inet_getid(rt->peer, more));
1354                         return;
1355                 }
1356         } else
1357                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1358                        __builtin_return_address(0));
1359
1360         ip_select_fb_ident(iph);
1361 }
1362 EXPORT_SYMBOL(__ip_select_ident);
1363
1364 static void rt_del(unsigned hash, struct rtable *rt)
1365 {
1366         struct rtable __rcu **rthp;
1367         struct rtable *aux;
1368
1369         rthp = &rt_hash_table[hash].chain;
1370         spin_lock_bh(rt_hash_lock_addr(hash));
1371         ip_rt_put(rt);
1372         while ((aux = rcu_dereference_protected(*rthp,
1373                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1374                 if (aux == rt || rt_is_expired(aux)) {
1375                         *rthp = aux->dst.rt_next;
1376                         rt_free(aux);
1377                         continue;
1378                 }
1379                 rthp = &aux->dst.rt_next;
1380         }
1381         spin_unlock_bh(rt_hash_lock_addr(hash));
1382 }
1383
1384 /* called in rcu_read_lock() section */
1385 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1386                     __be32 saddr, struct net_device *dev)
1387 {
1388         int i, k;
1389         struct in_device *in_dev = __in_dev_get_rcu(dev);
1390         struct rtable *rth;
1391         struct rtable __rcu **rthp;
1392         __be32  skeys[2] = { saddr, 0 };
1393         int  ikeys[2] = { dev->ifindex, 0 };
1394         struct netevent_redirect netevent;
1395         struct net *net;
1396
1397         if (!in_dev)
1398                 return;
1399
1400         net = dev_net(dev);
1401         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1402             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1403             ipv4_is_zeronet(new_gw))
1404                 goto reject_redirect;
1405
1406         if (!rt_caching(net))
1407                 goto reject_redirect;
1408
1409         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1410                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1411                         goto reject_redirect;
1412                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1413                         goto reject_redirect;
1414         } else {
1415                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1416                         goto reject_redirect;
1417         }
1418
1419         for (i = 0; i < 2; i++) {
1420                 for (k = 0; k < 2; k++) {
1421                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1422                                                 rt_genid(net));
1423
1424                         rthp = &rt_hash_table[hash].chain;
1425
1426                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1427                                 struct rtable *rt;
1428
1429                                 if (rth->fl.fl4_dst != daddr ||
1430                                     rth->fl.fl4_src != skeys[i] ||
1431                                     rth->fl.oif != ikeys[k] ||
1432                                     rt_is_input_route(rth) ||
1433                                     rt_is_expired(rth) ||
1434                                     !net_eq(dev_net(rth->dst.dev), net)) {
1435                                         rthp = &rth->dst.rt_next;
1436                                         continue;
1437                                 }
1438
1439                                 if (rth->rt_dst != daddr ||
1440                                     rth->rt_src != saddr ||
1441                                     rth->dst.error ||
1442                                     rth->rt_gateway != old_gw ||
1443                                     rth->dst.dev != dev)
1444                                         break;
1445
1446                                 dst_hold(&rth->dst);
1447
1448                                 rt = dst_alloc(&ipv4_dst_ops);
1449                                 if (rt == NULL) {
1450                                         ip_rt_put(rth);
1451                                         return;
1452                                 }
1453
1454                                 /* Copy all the information. */
1455                                 *rt = *rth;
1456                                 rt->dst.__use           = 1;
1457                                 atomic_set(&rt->dst.__refcnt, 1);
1458                                 rt->dst.child           = NULL;
1459                                 if (rt->dst.dev)
1460                                         dev_hold(rt->dst.dev);
1461                                 rt->dst.obsolete        = -1;
1462                                 rt->dst.lastuse = jiffies;
1463                                 rt->dst.path            = &rt->dst;
1464                                 rt->dst.neighbour       = NULL;
1465                                 rt->dst.hh              = NULL;
1466 #ifdef CONFIG_XFRM
1467                                 rt->dst.xfrm            = NULL;
1468 #endif
1469                                 rt->rt_genid            = rt_genid(net);
1470                                 rt->rt_flags            |= RTCF_REDIRECTED;
1471
1472                                 /* Gateway is different ... */
1473                                 rt->rt_gateway          = new_gw;
1474
1475                                 /* Redirect received -> path was valid */
1476                                 dst_confirm(&rth->dst);
1477
1478                                 if (rt->peer)
1479                                         atomic_inc(&rt->peer->refcnt);
1480                                 if (rt->fi)
1481                                         atomic_inc(&rt->fi->fib_clntref);
1482
1483                                 if (arp_bind_neighbour(&rt->dst) ||
1484                                     !(rt->dst.neighbour->nud_state &
1485                                             NUD_VALID)) {
1486                                         if (rt->dst.neighbour)
1487                                                 neigh_event_send(rt->dst.neighbour, NULL);
1488                                         ip_rt_put(rth);
1489                                         rt_drop(rt);
1490                                         goto do_next;
1491                                 }
1492
1493                                 netevent.old = &rth->dst;
1494                                 netevent.new = &rt->dst;
1495                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1496                                                         &netevent);
1497
1498                                 rt_del(hash, rth);
1499                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1500                                         ip_rt_put(rt);
1501                                 goto do_next;
1502                         }
1503                 do_next:
1504                         ;
1505                 }
1506         }
1507         return;
1508
1509 reject_redirect:
1510 #ifdef CONFIG_IP_ROUTE_VERBOSE
1511         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1512                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1513                         "  Advised path = %pI4 -> %pI4\n",
1514                        &old_gw, dev->name, &new_gw,
1515                        &saddr, &daddr);
1516 #endif
1517         ;
1518 }
1519
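/*
 * Negative advice on a cached route: release it if it is already obsolete,
 * or unhash it if it was created by a redirect or has expired, so that the
 * next lookup goes back through the FIB.
 */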
1520 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1521 {
1522         struct rtable *rt = (struct rtable *)dst;
1523         struct dst_entry *ret = dst;
1524
1525         if (rt) {
1526                 if (dst->obsolete > 0) {
1527                         ip_rt_put(rt);
1528                         ret = NULL;
1529                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1530                            (rt->dst.expires &&
1531                             time_after_eq(jiffies, rt->dst.expires))) {
1532                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1533                                                 rt->fl.oif,
1534                                                 rt_genid(dev_net(dst->dev)));
1535 #if RT_CACHE_DEBUG >= 1
1536                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1537                                 &rt->rt_dst, rt->fl.fl4_tos);
1538 #endif
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 }
1542         }
1543         return ret;
1544 }
1545
1546 /*
1547  * Algorithm:
1548  *      1. The first ip_rt_redirect_number redirects are sent
1549  *         with exponential backoff, then we stop sending them at all,
1550  *         assuming that the host ignores our redirects.
1551  *      2. If we did not see packets requiring redirects
1552  *         during ip_rt_redirect_silence, we assume that the host
1553  *         forgot the redirected route and start to send redirects again.
1554  *
1555  * This algorithm is much cheaper and more intelligent than dumb load limiting
1556  * in icmp.c.
1557  *
1558  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1559  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1560  */
1561
1562 void ip_rt_send_redirect(struct sk_buff *skb)
1563 {
1564         struct rtable *rt = skb_rtable(skb);
1565         struct in_device *in_dev;
1566         int log_martians;
1567
1568         rcu_read_lock();
1569         in_dev = __in_dev_get_rcu(rt->dst.dev);
1570         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1571                 rcu_read_unlock();
1572                 return;
1573         }
1574         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1575         rcu_read_unlock();
1576
1577         /* No redirected packets during ip_rt_redirect_silence;
1578          * reset the algorithm.
1579          */
1580         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1581                 rt->dst.rate_tokens = 0;
1582
1583         /* Too many ignored redirects; do not send anything and just
1584          * set dst.rate_last to the last seen redirected packet.
1585          */
1586         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1587                 rt->dst.rate_last = jiffies;
1588                 return;
1589         }
1590
1591         /* Check for load limit; set rate_last to the latest sent
1592          * redirect.
1593          */
1594         if (rt->dst.rate_tokens == 0 ||
1595             time_after(jiffies,
1596                        (rt->dst.rate_last +
1597                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1598                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1599                 rt->dst.rate_last = jiffies;
1600                 ++rt->dst.rate_tokens;
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602                 if (log_martians &&
1603                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1604                     net_ratelimit())
1605                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1606                                 &rt->rt_src, rt->rt_iif,
1607                                 &rt->rt_dst, &rt->rt_gateway);
1608 #endif
1609         }
1610 }
1611
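/*
 * Input handler for routes carrying an error: translate dst.error into an
 * ICMP destination-unreachable code and send it, rate limited by a token
 * bucket (ip_rt_error_cost / ip_rt_error_burst), then drop the packet.
 */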
1612 static int ip_error(struct sk_buff *skb)
1613 {
1614         struct rtable *rt = skb_rtable(skb);
1615         unsigned long now;
1616         int code;
1617
1618         switch (rt->dst.error) {
1619                 case EINVAL:
1620                 default:
1621                         goto out;
1622                 case EHOSTUNREACH:
1623                         code = ICMP_HOST_UNREACH;
1624                         break;
1625                 case ENETUNREACH:
1626                         code = ICMP_NET_UNREACH;
1627                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1628                                         IPSTATS_MIB_INNOROUTES);
1629                         break;
1630                 case EACCES:
1631                         code = ICMP_PKT_FILTERED;
1632                         break;
1633         }
1634
1635         now = jiffies;
1636         rt->dst.rate_tokens += now - rt->dst.rate_last;
1637         if (rt->dst.rate_tokens > ip_rt_error_burst)
1638                 rt->dst.rate_tokens = ip_rt_error_burst;
1639         rt->dst.rate_last = now;
1640         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1641                 rt->dst.rate_tokens -= ip_rt_error_cost;
1642                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1643         }
1644
1645 out:    kfree_skb(skb);
1646         return 0;
1647 }
1648
1649 /*
1650  *      The last two values are not from the RFC but
1651  *      are needed for AMPRnet AX.25 paths.
1652  */
1653
1654 static const unsigned short mtu_plateau[] =
1655 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1656
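/*
 * Return the next plateau value below old_mtu, falling back to the minimum
 * IPv4 MTU of 68 when old_mtu is at or below the smallest table entry.
 */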
1657 static inline unsigned short guess_mtu(unsigned short old_mtu)
1658 {
1659         int i;
1660
1661         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1662                 if (old_mtu > mtu_plateau[i])
1663                         return mtu_plateau[i];
1664         return 68;
1665 }
1666
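/*
 * Handle an ICMP "fragmentation needed": for every matching cached route,
 * lower the path MTU (guessing from the plateau table when the message
 * carries no usable MTU), and return the resulting estimate, or new_mtu if
 * no cache entry matched.
 */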
1667 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1668                                  unsigned short new_mtu,
1669                                  struct net_device *dev)
1670 {
1671         int i, k;
1672         unsigned short old_mtu = ntohs(iph->tot_len);
1673         struct rtable *rth;
1674         int  ikeys[2] = { dev->ifindex, 0 };
1675         __be32  skeys[2] = { iph->saddr, 0, };
1676         __be32  daddr = iph->daddr;
1677         unsigned short est_mtu = 0;
1678
1679         for (k = 0; k < 2; k++) {
1680                 for (i = 0; i < 2; i++) {
1681                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1682                                                 rt_genid(net));
1683
1684                         rcu_read_lock();
1685                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1686                              rth = rcu_dereference(rth->dst.rt_next)) {
1687                                 unsigned short mtu = new_mtu;
1688
1689                                 if (rth->fl.fl4_dst != daddr ||
1690                                     rth->fl.fl4_src != skeys[i] ||
1691                                     rth->rt_dst != daddr ||
1692                                     rth->rt_src != iph->saddr ||
1693                                     rth->fl.oif != ikeys[k] ||
1694                                     rt_is_input_route(rth) ||
1695                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1696                                     !net_eq(dev_net(rth->dst.dev), net) ||
1697                                     rt_is_expired(rth))
1698                                         continue;
1699
1700                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1701
1702                                         /* BSD 4.2 compatibility hack :-( */
1703                                         if (mtu == 0 &&
1704                                             old_mtu >= dst_mtu(&rth->dst) &&
1705                                             old_mtu >= 68 + (iph->ihl << 2))
1706                                                 old_mtu -= iph->ihl << 2;
1707
1708                                         mtu = guess_mtu(old_mtu);
1709                                 }
1710                                 if (mtu <= dst_mtu(&rth->dst)) {
1711                                         if (mtu < dst_mtu(&rth->dst)) {
1712                                                 dst_confirm(&rth->dst);
1713                                                 if (mtu < ip_rt_min_pmtu) {
1714                                                         u32 lock = dst_metric(&rth->dst,
1715                                                                               RTAX_LOCK);
1716                                                         mtu = ip_rt_min_pmtu;
1717                                                         lock |= (1 << RTAX_MTU);
1718                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1719                                                                        lock);
1720                                                 }
1721                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1722                                                 dst_set_expires(&rth->dst,
1723                                                         ip_rt_mtu_expires);
1724                                         }
1725                                         est_mtu = mtu;
1726                                 }
1727                         }
1728                         rcu_read_unlock();
1729                 }
1730         }
1731         return est_mtu ? : new_mtu;
1732 }
1733
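/*
 * Lower the cached path MTU: clamp to ip_rt_min_pmtu (locking the metric
 * when clamped), arm the expiry timer and notify netevent listeners.
 */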
1734 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1735 {
1736         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1737             !(dst_metric_locked(dst, RTAX_MTU))) {
1738                 if (mtu < ip_rt_min_pmtu) {
1739                         u32 lock = dst_metric(dst, RTAX_LOCK);
1740                         mtu = ip_rt_min_pmtu;
1741                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1742                 }
1743                 dst_metric_set(dst, RTAX_MTU, mtu);
1744                 dst_set_expires(dst, ip_rt_mtu_expires);
1745                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1746         }
1747 }
1748
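/*
 * A cache entry from an old generation is treated as invalid, forcing the
 * caller to perform a fresh route lookup.
 */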
1749 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1750 {
1751         if (rt_is_expired((struct rtable *)dst))
1752                 return NULL;
1753         return dst;
1754 }
1755
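/*
 * Drop the references a dying cache entry holds on its fib_info and
 * inet_peer.
 */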
1756 static void ipv4_dst_destroy(struct dst_entry *dst)
1757 {
1758         struct rtable *rt = (struct rtable *) dst;
1759         struct inet_peer *peer = rt->peer;
1760
1761         if (rt->fi) {
1762                 fib_info_put(rt->fi);
1763                 rt->fi = NULL;
1764         }
1765         if (peer) {
1766                 rt->peer = NULL;
1767                 inet_putpeer(peer);
1768         }
1769 }
1770
1771
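/*
 * On link failure, tell the sender the host is unreachable and expire the
 * cached route immediately.
 */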
1772 static void ipv4_link_failure(struct sk_buff *skb)
1773 {
1774         struct rtable *rt;
1775
1776         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1777
1778         rt = skb_rtable(skb);
1779         if (rt)
1780                 dst_set_expires(&rt->dst, 0);
1781 }
1782
1783 static int ip_rt_bug(struct sk_buff *skb)
1784 {
1785         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1787                 skb->dev ? skb->dev->name : "?");
1788         kfree_skb(skb);
1789         return 0;
1790 }
1791
1792 /*
1793    We do not cache the source address of the outgoing interface,
1794    because it is used only by the IP RR, TS and SRR options,
1795    so it is out of the fast path.
1796
1797    BTW remember: "addr" is allowed to be unaligned
1798    in IP options!
1799  */
1800
1801 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1802 {
1803         __be32 src;
1804         struct fib_result res;
1805
1806         if (rt_is_output_route(rt))
1807                 src = rt->rt_src;
1808         else {
1809                 rcu_read_lock();
1810                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1811                         src = FIB_RES_PREFSRC(res);
1812                 else
1813                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1814                                         RT_SCOPE_UNIVERSE);
1815                 rcu_read_unlock();
1816         }
1817         memcpy(addr, &src, 4);
1818 }
1819
1820 #ifdef CONFIG_IP_ROUTE_CLASSID
1821 static void set_class_tag(struct rtable *rt, u32 tag)
1822 {
1823         if (!(rt->dst.tclassid & 0xFFFF))
1824                 rt->dst.tclassid |= tag & 0xFFFF;
1825         if (!(rt->dst.tclassid & 0xFFFF0000))
1826                 rt->dst.tclassid |= tag & 0xFFFF0000;
1827 }
1828 #endif
1829
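/*
 * Default advertised MSS when no RTAX_ADVMSS metric is set: device MTU
 * minus 40 bytes of IPv4 + TCP headers, bounded below by ip_rt_min_advmss
 * and above by 65535 - 40.
 */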
1830 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1831 {
1832         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1833
1834         if (advmss == 0) {
1835                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1836                                ip_rt_min_advmss);
1837                 if (advmss > 65535 - 40)
1838                         advmss = 65535 - 40;
1839         }
1840         return advmss;
1841 }
1842
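/*
 * Default path MTU when no RTAX_MTU metric is set: the device MTU, reduced
 * to 576 for gatewayed destinations when the MTU metric is locked, and
 * capped at IP_MAX_MTU.
 */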
1843 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1844 {
1845         unsigned int mtu = dst->dev->mtu;
1846
1847         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1848                 const struct rtable *rt = (const struct rtable *) dst;
1849
1850                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1851                         mtu = 576;
1852         }
1853
1854         if (mtu > IP_MAX_MTU)
1855                 mtu = IP_MAX_MTU;
1856
1857         return mtu;
1858 }
1859
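/*
 * Attach metrics to a new cache entry: when an inet_peer exists, use its
 * shared metrics block (seeding it from the FIB metrics if it is still
 * uninitialized); otherwise point the dst at the fib_info's metrics
 * read-only, holding a reference on the fib_info when its metrics are not
 * the defaults.
 */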
1860 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1861 {
1862         struct inet_peer *peer;
1863         int create = 0;
1864
1865         /* If a peer entry exists for this destination, we must hook
1866          * it up in order to get at cached metrics.
1867          */
1868         if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
1869                 create = 1;
1870
1871         rt_bind_peer(rt, create);
1872         peer = rt->peer;
1873         if (peer) {
1874                 if (inet_metrics_new(peer))
1875                         memcpy(peer->metrics, fi->fib_metrics,
1876                                sizeof(u32) * RTAX_MAX);
1877                 dst_init_metrics(&rt->dst, peer->metrics, false);
1878         } else {
1879                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1880                         rt->fi = fi;
1881                         atomic_inc(&fi->fib_clntref);
1882                 }
1883                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1884         }
1885 }
1886
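/*
 * Fill in the nexthop-derived fields of a new cache entry from the FIB
 * result: gateway, metrics, classid tags and route type, clamping the MTU
 * and advmss metrics to sane maximums.
 */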
1887 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1888 {
1889         struct dst_entry *dst = &rt->dst;
1890         struct fib_info *fi = res->fi;
1891
1892         if (fi) {
1893                 if (FIB_RES_GW(*res) &&
1894                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1895                         rt->rt_gateway = FIB_RES_GW(*res);
1896                 rt_init_metrics(rt, fi);
1897 #ifdef CONFIG_IP_ROUTE_CLASSID
1898                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1899 #endif
1900         }
1901
1902         if (dst_mtu(dst) > IP_MAX_MTU)
1903                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1904         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1905                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1906
1907 #ifdef CONFIG_IP_ROUTE_CLASSID
1908 #ifdef CONFIG_IP_MULTIPLE_TABLES
1909         set_class_tag(rt, fib_rules_tclass(res));
1910 #endif
1911         set_class_tag(rt, itag);
1912 #endif
1913         rt->rt_type = res->type;
1914 }
1915
1916 /* called in rcu_read_lock() section */
1917 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1918                                 u8 tos, struct net_device *dev, int our)
1919 {
1920         unsigned int hash;
1921         struct rtable *rth;
1922         __be32 spec_dst;
1923         struct in_device *in_dev = __in_dev_get_rcu(dev);
1924         u32 itag = 0;
1925         int err;
1926
1927         /* Primary sanity checks. */
1928
1929         if (in_dev == NULL)
1930                 return -EINVAL;
1931
1932         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1933             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1934                 goto e_inval;
1935
1936         if (ipv4_is_zeronet(saddr)) {
1937                 if (!ipv4_is_local_multicast(daddr))
1938                         goto e_inval;
1939                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1940         } else {
1941                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1942                                           &itag, 0);
1943                 if (err < 0)
1944                         goto e_err;
1945         }
1946         rth = dst_alloc(&ipv4_dst_ops);
1947         if (!rth)
1948                 goto e_nobufs;
1949
1950         rth->dst.output = ip_rt_bug;
1951         rth->dst.obsolete = -1;
1952
1953         atomic_set(&rth->dst.__refcnt, 1);
1954         rth->dst.flags= DST_HOST;
1955         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1956                 rth->dst.flags |= DST_NOPOLICY;
1957         rth->fl.fl4_dst = daddr;
1958         rth->rt_dst     = daddr;
1959         rth->fl.fl4_tos = tos;
1960         rth->fl.mark    = skb->mark;
1961         rth->fl.fl4_src = saddr;
1962         rth->rt_src     = saddr;
1963 #ifdef CONFIG_IP_ROUTE_CLASSID
1964         rth->dst.tclassid = itag;
1965 #endif
1966         rth->rt_iif     =
1967         rth->fl.iif     = dev->ifindex;
1968         rth->dst.dev    = init_net.loopback_dev;
1969         dev_hold(rth->dst.dev);
1970         rth->fl.oif     = 0;
1971         rth->rt_gateway = daddr;
1972         rth->rt_spec_dst= spec_dst;
1973         rth->rt_genid   = rt_genid(dev_net(dev));
1974         rth->rt_flags   = RTCF_MULTICAST;
1975         rth->rt_type    = RTN_MULTICAST;
1976         if (our) {
1977                 rth->dst.input= ip_local_deliver;
1978                 rth->rt_flags |= RTCF_LOCAL;
1979         }
1980
1981 #ifdef CONFIG_IP_MROUTE
1982         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1983                 rth->dst.input = ip_mr_input;
1984 #endif
1985         RT_CACHE_STAT_INC(in_slow_mc);
1986
1987         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1988         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1989
1990 e_nobufs:
1991         return -ENOBUFS;
1992 e_inval:
1993         return -EINVAL;
1994 e_err:
1995         return err;
1996 }
1997
1998
1999 static void ip_handle_martian_source(struct net_device *dev,
2000                                      struct in_device *in_dev,
2001                                      struct sk_buff *skb,
2002                                      __be32 daddr,
2003                                      __be32 saddr)
2004 {
2005         RT_CACHE_STAT_INC(in_martian_src);
2006 #ifdef CONFIG_IP_ROUTE_VERBOSE
2007         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2008                 /*
2009                  *      RFC1812 recommendation: if the source is martian,
2010                  *      the only hint is the MAC header.
2011                  */
2012                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2013                         &daddr, &saddr, dev->name);
2014                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2015                         int i;
2016                         const unsigned char *p = skb_mac_header(skb);
2017                         printk(KERN_WARNING "ll header: ");
2018                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2019                                 printk("%02x", *p);
2020                                 if (i < (dev->hard_header_len - 1))
2021                                         printk(":");
2022                         }
2023                         printk("\n");
2024                 }
2025         }
2026 #endif
2027 }
2028
2029 /* called in rcu_read_lock() section */
2030 static int __mkroute_input(struct sk_buff *skb,
2031                            struct fib_result *res,
2032                            struct in_device *in_dev,
2033                            __be32 daddr, __be32 saddr, u32 tos,
2034                            struct rtable **result)
2035 {
2036         struct rtable *rth;
2037         int err;
2038         struct in_device *out_dev;
2039         unsigned int flags = 0;
2040         __be32 spec_dst;
2041         u32 itag;
2042
2043         /* get a working reference to the output device */
2044         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2045         if (out_dev == NULL) {
2046                 if (net_ratelimit())
2047                         printk(KERN_CRIT "Bug in ip_route_input" \
2048                                "_slow(). Please, report\n");
2049                 return -EINVAL;
2050         }
2051
2052
2053         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2054                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2055         if (err < 0) {
2056                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2057                                          saddr);
2058
2059                 goto cleanup;
2060         }
2061
2062         if (err)
2063                 flags |= RTCF_DIRECTSRC;
2064
2065         if (out_dev == in_dev && err &&
2066             (IN_DEV_SHARED_MEDIA(out_dev) ||
2067              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2068                 flags |= RTCF_DOREDIRECT;
2069
2070         if (skb->protocol != htons(ETH_P_IP)) {
2071                 /* Not IP (i.e. ARP). Do not create a route if it is
2072                  * invalid for proxy arp. DNAT routes are always valid.
2073                  *
2074                  * The proxy arp feature has been extended to allow ARP
2075                  * replies back to the same interface, to support
2076                  * Private VLAN switch technologies. See arp.c.
2077                  */
2078                 if (out_dev == in_dev &&
2079                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2080                         err = -EINVAL;
2081                         goto cleanup;
2082                 }
2083         }
2084
2085
2086         rth = dst_alloc(&ipv4_dst_ops);
2087         if (!rth) {
2088                 err = -ENOBUFS;
2089                 goto cleanup;
2090         }
2091
2092         atomic_set(&rth->dst.__refcnt, 1);
2093         rth->dst.flags= DST_HOST;
2094         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2095                 rth->dst.flags |= DST_NOPOLICY;
2096         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2097                 rth->dst.flags |= DST_NOXFRM;
2098         rth->fl.fl4_dst = daddr;
2099         rth->rt_dst     = daddr;
2100         rth->fl.fl4_tos = tos;
2101         rth->fl.mark    = skb->mark;
2102         rth->fl.fl4_src = saddr;
2103         rth->rt_src     = saddr;
2104         rth->rt_gateway = daddr;
2105         rth->rt_iif     =
2106                 rth->fl.iif     = in_dev->dev->ifindex;
2107         rth->dst.dev    = (out_dev)->dev;
2108         dev_hold(rth->dst.dev);
2109         rth->fl.oif     = 0;
2110         rth->rt_spec_dst= spec_dst;
2111
2112         rth->dst.obsolete = -1;
2113         rth->dst.input = ip_forward;
2114         rth->dst.output = ip_output;
2115         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2116
2117         rt_set_nexthop(rth, res, itag);
2118
2119         rth->rt_flags = flags;
2120
2121         *result = rth;
2122         err = 0;
2123  cleanup:
2124         return err;
2125 }
2126
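/*
 * Build an input route cache entry: pick a multipath nexthop if needed,
 * construct the entry with __mkroute_input() and insert it into the route
 * cache hash table.
 */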
2127 static int ip_mkroute_input(struct sk_buff *skb,
2128                             struct fib_result *res,
2129                             const struct flowi *fl,
2130                             struct in_device *in_dev,
2131                             __be32 daddr, __be32 saddr, u32 tos)
2132 {
2133         struct rtable* rth = NULL;
2134         int err;
2135         unsigned hash;
2136
2137 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2138         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2139                 fib_select_multipath(fl, res);
2140 #endif
2141
2142         /* create a routing cache entry */
2143         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2144         if (err)
2145                 return err;
2146
2147         /* put it into the cache */
2148         hash = rt_hash(daddr, saddr, fl->iif,
2149                        rt_genid(dev_net(rth->dst.dev)));
2150         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2151 }
2152
2153 /*
2154  *      NOTE. We drop all the packets that have local source
2155  *      addresses, because every properly looped back packet
2156  *      must have the correct destination already attached by the output routine.
2157  *
2158  *      Such an approach solves two big problems:
2159  *      1. Non-simplex devices are handled properly.
2160  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2161  *      Called with rcu_read_lock().
2162  */
2163
2164 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2165                                u8 tos, struct net_device *dev)
2166 {
2167         struct fib_result res;
2168         struct in_device *in_dev = __in_dev_get_rcu(dev);
2169         struct flowi fl = { .fl4_dst    = daddr,
2170                             .fl4_src    = saddr,
2171                             .fl4_tos    = tos,
2172                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2173                             .mark = skb->mark,
2174                             .iif = dev->ifindex };
2175         unsigned        flags = 0;
2176         u32             itag = 0;
2177         struct rtable * rth;
2178         unsigned        hash;
2179         __be32          spec_dst;
2180         int             err = -EINVAL;
2181         struct net    * net = dev_net(dev);
2182
2183         /* IP on this device is disabled. */
2184
2185         if (!in_dev)
2186                 goto out;
2187
2188         /* Check for the most weird martians, which cannot be detected
2189            by fib_lookup.
2190          */
2191
2192         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2193             ipv4_is_loopback(saddr))
2194                 goto martian_source;
2195
2196         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2197                 goto brd_input;
2198
2199         /* Accept zero addresses only to limited broadcast;
2200          * I do not even know whether to fix it or not. Waiting for complaints :-)
2201          */
2202         if (ipv4_is_zeronet(saddr))
2203                 goto martian_source;
2204
2205         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2206                 goto martian_destination;
2207
2208         /*
2209          *      Now we are ready to route the packet.
2210          */
2211         err = fib_lookup(net, &fl, &res);
2212         if (err != 0) {
2213                 if (!IN_DEV_FORWARD(in_dev))
2214                         goto e_hostunreach;
2215                 goto no_route;
2216         }
2217
2218         RT_CACHE_STAT_INC(in_slow_tot);
2219
2220         if (res.type == RTN_BROADCAST)
2221                 goto brd_input;
2222
2223         if (res.type == RTN_LOCAL) {
2224                 err = fib_validate_source(saddr, daddr, tos,
2225                                           net->loopback_dev->ifindex,
2226                                           dev, &spec_dst, &itag, skb->mark);
2227                 if (err < 0)
2228                         goto martian_source_keep_err;
2229                 if (err)
2230                         flags |= RTCF_DIRECTSRC;
2231                 spec_dst = daddr;
2232                 goto local_input;
2233         }
2234
2235         if (!IN_DEV_FORWARD(in_dev))
2236                 goto e_hostunreach;
2237         if (res.type != RTN_UNICAST)
2238                 goto martian_destination;
2239
2240         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2241 out:    return err;
2242
2243 brd_input:
2244         if (skb->protocol != htons(ETH_P_IP))
2245                 goto e_inval;
2246
2247         if (ipv4_is_zeronet(saddr))
2248                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2249         else {
2250                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2251                                           &itag, skb->mark);
2252                 if (err < 0)
2253                         goto martian_source_keep_err;
2254                 if (err)
2255                         flags |= RTCF_DIRECTSRC;
2256         }
2257         flags |= RTCF_BROADCAST;
2258         res.type = RTN_BROADCAST;
2259         RT_CACHE_STAT_INC(in_brd);
2260
2261 local_input:
2262         rth = dst_alloc(&ipv4_dst_ops);
2263         if (!rth)
2264                 goto e_nobufs;
2265
2266         rth->dst.output= ip_rt_bug;
2267         rth->dst.obsolete = -1;
2268         rth->rt_genid = rt_genid(net);
2269
2270         atomic_set(&rth->dst.__refcnt, 1);
2271         rth->dst.flags= DST_HOST;
2272         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2273                 rth->dst.flags |= DST_NOPOLICY;
2274         rth->fl.fl4_dst = daddr;
2275         rth->rt_dst     = daddr;
2276         rth->fl.fl4_tos = tos;
2277         rth->fl.mark    = skb->mark;
2278         rth->fl.fl4_src = saddr;
2279         rth->rt_src     = saddr;
2280 #ifdef CONFIG_IP_ROUTE_CLASSID
2281         rth->dst.tclassid = itag;
2282 #endif
2283         rth->rt_iif     =
2284         rth->fl.iif     = dev->ifindex;
2285         rth->dst.dev    = net->loopback_dev;
2286         dev_hold(rth->dst.dev);
2287         rth->rt_gateway = daddr;
2288         rth->rt_spec_dst= spec_dst;
2289         rth->dst.input= ip_local_deliver;
2290         rth->rt_flags   = flags|RTCF_LOCAL;
2291         if (res.type == RTN_UNREACHABLE) {
2292                 rth->dst.input= ip_error;
2293                 rth->dst.error= -err;
2294                 rth->rt_flags   &= ~RTCF_LOCAL;
2295         }
2296         rth->rt_type    = res.type;
2297         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2298         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2299         goto out;
2300
2301 no_route:
2302         RT_CACHE_STAT_INC(in_no_route);
2303         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2304         res.type = RTN_UNREACHABLE;
2305         if (err == -ESRCH)
2306                 err = -ENETUNREACH;
2307         goto local_input;
2308
2309         /*
2310          *      Do not cache martian addresses: they should be logged (RFC1812)
2311          */
2312 martian_destination:
2313         RT_CACHE_STAT_INC(in_martian_dst);
2314 #ifdef CONFIG_IP_ROUTE_VERBOSE
2315         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2316                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2317                         &daddr, &saddr, dev->name);
2318 #endif
2319
2320 e_hostunreach:
2321         err = -EHOSTUNREACH;
2322         goto out;
2323
2324 e_inval:
2325         err = -EINVAL;
2326         goto out;
2327
2328 e_nobufs:
2329         err = -ENOBUFS;
2330         goto out;
2331
2332 martian_source:
2333         err = -EINVAL;
2334 martian_source_keep_err:
2335         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2336         goto out;
2337 }
2338
2339 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2340                            u8 tos, struct net_device *dev, bool noref)
2341 {
2342         struct rtable * rth;
2343         unsigned        hash;
2344         int iif = dev->ifindex;
2345         struct net *net;
2346         int res;
2347
2348         net = dev_net(dev);
2349
2350         rcu_read_lock();
2351
2352         if (!rt_caching(net))
2353                 goto skip_cache;
2354
2355         tos &= IPTOS_RT_MASK;
2356         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2357
2358         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2359              rth = rcu_dereference(rth->dst.rt_next)) {
2360                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2361                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2362                      (rth->fl.iif ^ iif) |
2363                      rth->fl.oif |
2364                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2365                     rth->fl.mark == skb->mark &&
2366                     net_eq(dev_net(rth->dst.dev), net) &&
2367                     !rt_is_expired(rth)) {
2368                         if (noref) {
2369                                 dst_use_noref(&rth->dst, jiffies);
2370                                 skb_dst_set_noref(skb, &rth->dst);
2371                         } else {
2372                                 dst_use(&rth->dst, jiffies);
2373                                 skb_dst_set(skb, &rth->dst);
2374                         }
2375                         RT_CACHE_STAT_INC(in_hit);
2376                         rcu_read_unlock();
2377                         return 0;
2378                 }
2379                 RT_CACHE_STAT_INC(in_hlist_search);
2380         }
2381
2382 skip_cache:
2383         /* Multicast recognition logic has been moved from the route cache
2384            to here. The problem was that too many Ethernet cards have
2385            broken/missing hardware multicast filters :-( As a result, a host
2386            on a multicast network acquires a lot of useless route cache
2387            entries, e.g. for SDR messages from all over the world. Now we
2388            try to get rid of them. Really, provided the software IP multicast
2389            filter is organized reasonably (at least, hashed), it does not
2390            result in a slowdown compared with route cache reject entries.
2391            Note that multicast routers are not affected, because a route
2392            cache entry is created eventually.
2393          */
2394         if (ipv4_is_multicast(daddr)) {
2395                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2396
2397                 if (in_dev) {
2398                         int our = ip_check_mc(in_dev, daddr, saddr,
2399                                               ip_hdr(skb)->protocol);
2400                         if (our
2401 #ifdef CONFIG_IP_MROUTE
2402                                 ||
2403                             (!ipv4_is_local_multicast(daddr) &&
2404                              IN_DEV_MFORWARD(in_dev))
2405 #endif
2406                            ) {
2407                                 int res = ip_route_input_mc(skb, daddr, saddr,
2408                                                             tos, dev, our);
2409                                 rcu_read_unlock();
2410                                 return res;
2411                         }
2412                 }
2413                 rcu_read_unlock();
2414                 return -EINVAL;
2415         }
2416         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2417         rcu_read_unlock();
2418         return res;
2419 }
2420 EXPORT_SYMBOL(ip_route_input_common);
2421
2422 /* called with rcu_read_lock() */
2423 static int __mkroute_output(struct rtable **result,
2424                             struct fib_result *res,
2425                             const struct flowi *fl,
2426                             const struct flowi *oldflp,
2427                             struct net_device *dev_out,
2428                             unsigned flags)
2429 {
2430         struct rtable *rth;
2431         struct in_device *in_dev;
2432         u32 tos = RT_FL_TOS(oldflp);
2433
2434         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2435                 return -EINVAL;
2436
2437         if (ipv4_is_lbcast(fl->fl4_dst))
2438                 res->type = RTN_BROADCAST;
2439         else if (ipv4_is_multicast(fl->fl4_dst))
2440                 res->type = RTN_MULTICAST;
2441         else if (ipv4_is_zeronet(fl->fl4_dst))
2442                 return -EINVAL;
2443
2444         if (dev_out->flags & IFF_LOOPBACK)
2445                 flags |= RTCF_LOCAL;
2446
2447         in_dev = __in_dev_get_rcu(dev_out);
2448         if (!in_dev)
2449                 return -EINVAL;
2450
2451         if (res->type == RTN_BROADCAST) {
2452                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2453                 res->fi = NULL;
2454         } else if (res->type == RTN_MULTICAST) {
2455                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2456                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2457                                  oldflp->proto))
2458                         flags &= ~RTCF_LOCAL;
2459                 /* If a multicast route does not exist, use the
2460                  * default one, but do not gateway in this case.
2461                  * Yes, it is a hack.
2462                  */
2463                 if (res->fi && res->prefixlen < 4)
2464                         res->fi = NULL;
2465         }
2466
2467
2468         rth = dst_alloc(&ipv4_dst_ops);
2469         if (!rth)
2470                 return -ENOBUFS;
2471
2472         atomic_set(&rth->dst.__refcnt, 1);
2473         rth->dst.flags= DST_HOST;
2474         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2475                 rth->dst.flags |= DST_NOXFRM;
2476         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2477                 rth->dst.flags |= DST_NOPOLICY;
2478
2479         rth->fl.fl4_dst = oldflp->fl4_dst;
2480         rth->fl.fl4_tos = tos;
2481         rth->fl.fl4_src = oldflp->fl4_src;
2482         rth->fl.oif     = oldflp->oif;
2483         rth->fl.mark    = oldflp->mark;
2484         rth->rt_dst     = fl->fl4_dst;
2485         rth->rt_src     = fl->fl4_src;
2486         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2487         /* get references to the devices that are to be held by the routing
2488            cache entry */
2489         rth->dst.dev    = dev_out;
2490         dev_hold(dev_out);
2491         rth->rt_gateway = fl->fl4_dst;
2492         rth->rt_spec_dst= fl->fl4_src;
2493
2494         rth->dst.output=ip_output;
2495         rth->dst.obsolete = -1;
2496         rth->rt_genid = rt_genid(dev_net(dev_out));
2497
2498         RT_CACHE_STAT_INC(out_slow_tot);
2499
2500         if (flags & RTCF_LOCAL) {
2501                 rth->dst.input = ip_local_deliver;
2502                 rth->rt_spec_dst = fl->fl4_dst;
2503         }
2504         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2505                 rth->rt_spec_dst = fl->fl4_src;
2506                 if (flags & RTCF_LOCAL &&
2507                     !(dev_out->flags & IFF_LOOPBACK)) {
2508                         rth->dst.output = ip_mc_output;
2509                         RT_CACHE_STAT_INC(out_slow_mc);
2510                 }
2511 #ifdef CONFIG_IP_MROUTE
2512                 if (res->type == RTN_MULTICAST) {
2513                         if (IN_DEV_MFORWARD(in_dev) &&
2514                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2515                                 rth->dst.input = ip_mr_input;
2516                                 rth->dst.output = ip_mc_output;
2517                         }
2518                 }
2519 #endif
2520         }
2521
2522         rt_set_nexthop(rth, res, 0);
2523
2524         rth->rt_flags = flags;
2525         *result = rth;
2526         return 0;
2527 }
2528
2529 /* called with rcu_read_lock() */
2530 static int ip_mkroute_output(struct rtable **rp,
2531                              struct fib_result *res,
2532                              const struct flowi *fl,
2533                              const struct flowi *oldflp,
2534                              struct net_device *dev_out,
2535                              unsigned flags)
2536 {
2537         struct rtable *rth = NULL;
2538         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2539         unsigned hash;
2540         if (err == 0) {
2541                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2542                                rt_genid(dev_net(dev_out)));
2543                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2544         }
2545
2546         return err;
2547 }
2548
2549 /*
2550  * Major route resolver routine.
2551  * called with rcu_read_lock();
2552  */
2553
2554 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2555                                 const struct flowi *oldflp)
2556 {
2557         u32 tos = RT_FL_TOS(oldflp);
2558         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2559                             .fl4_src = oldflp->fl4_src,
2560                             .fl4_tos = tos & IPTOS_RT_MASK,
2561                             .fl4_scope = ((tos & RTO_ONLINK) ?
2562                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2563                             .mark = oldflp->mark,
2564                             .iif = net->loopback_dev->ifindex,
2565                             .oif = oldflp->oif };
2566         struct fib_result res;
2567         unsigned int flags = 0;
2568         struct net_device *dev_out = NULL;
2569         int err;
2570
2571
2572         res.fi          = NULL;
2573 #ifdef CONFIG_IP_MULTIPLE_TABLES
2574         res.r           = NULL;
2575 #endif
2576
2577         if (oldflp->fl4_src) {
2578                 err = -EINVAL;
2579                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2580                     ipv4_is_lbcast(oldflp->fl4_src) ||
2581                     ipv4_is_zeronet(oldflp->fl4_src))
2582                         goto out;
2583
2584                 /* I removed the check for oif == dev_out->oif here.
2585                    It was wrong for two reasons:
2586                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2587                       is assigned to multiple interfaces.
2588                    2. Moreover, we are allowed to send packets with the saddr
2589                       of another iface. --ANK
2590                  */
2591
2592                 if (oldflp->oif == 0 &&
2593                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2594                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2595                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2596                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2597                         if (dev_out == NULL)
2598                                 goto out;
2599
2600                         /* Special hack: the user can direct multicasts
2601                            and limited broadcast via the necessary interface
2602                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2603                            This hack is not just for fun, it allows
2604                            vic, vat and friends to work.
2605                            They bind the socket to loopback, set ttl to zero
2606                            and expect that it will work.
2607                            From the viewpoint of the routing cache they are broken,
2608                            because we are not allowed to build a multicast path
2609                            with a loopback source addr (look, the routing cache
2610                            cannot know that ttl is zero, so the packet
2611                            will not leave this host and the route is valid).
2612                            Luckily, this hack is a good workaround.
2613                          */
2614
2615                         fl.oif = dev_out->ifindex;
2616                         goto make_route;
2617                 }
2618
2619                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2620                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2621                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2622                                 goto out;
2623                 }
2624         }
2625
2626
2627         if (oldflp->oif) {
2628                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2629                 err = -ENODEV;
2630                 if (dev_out == NULL)
2631                         goto out;
2632
2633                 /* RACE: Check return value of inet_select_addr instead. */
2634                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2635                         err = -ENETUNREACH;
2636                         goto out;
2637                 }
2638                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2639                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2640                         if (!fl.fl4_src)
2641                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2642                                                               RT_SCOPE_LINK);
2643                         goto make_route;
2644                 }
2645                 if (!fl.fl4_src) {
2646                         if (ipv4_is_multicast(oldflp->fl4_dst))
2647                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2648                                                               fl.fl4_scope);
2649                         else if (!oldflp->fl4_dst)
2650                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2651                                                               RT_SCOPE_HOST);
2652                 }
2653         }
2654
2655         if (!fl.fl4_dst) {
2656                 fl.fl4_dst = fl.fl4_src;
2657                 if (!fl.fl4_dst)
2658                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2659                 dev_out = net->loopback_dev;
2660                 fl.oif = net->loopback_dev->ifindex;
2661                 res.type = RTN_LOCAL;
2662                 flags |= RTCF_LOCAL;
2663                 goto make_route;
2664         }
2665
2666         if (fib_lookup(net, &fl, &res)) {
2667                 res.fi = NULL;
2668                 if (oldflp->oif) {
2669                         /* Apparently, the routing tables are wrong. Assume
2670                            that the destination is on link.
2671
2672                            WHY? DW.
2673                            Because we are allowed to send to an iface
2674                            even if it has NO routes and NO assigned
2675                            addresses. When oif is specified, the routing
2676                            tables are looked up with only one purpose:
2677                            to catch whether the destination is gatewayed,
2678                            rather than direct. Moreover, if MSG_DONTROUTE is set,
2679                            we send the packet, ignoring both routing tables
2680                            and ifaddr state. --ANK
2681
2682
2683                            We could do this even if oif is unknown,
2684                            likely IPv6, but we do not.
2685                          */
2686
2687                         if (fl.fl4_src == 0)
2688                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2689                                                               RT_SCOPE_LINK);
2690                         res.type = RTN_UNICAST;
2691                         goto make_route;
2692                 }
2693                 err = -ENETUNREACH;
2694                 goto out;
2695         }
2696
2697         if (res.type == RTN_LOCAL) {
2698                 if (!fl.fl4_src) {
2699                         if (res.fi->fib_prefsrc)
2700                                 fl.fl4_src = res.fi->fib_prefsrc;
2701                         else
2702                                 fl.fl4_src = fl.fl4_dst;
2703                 }
2704                 dev_out = net->loopback_dev;
2705                 fl.oif = dev_out->ifindex;
2706                 res.fi = NULL;
2707                 flags |= RTCF_LOCAL;
2708                 goto make_route;
2709         }
2710
2711 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2712         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2713                 fib_select_multipath(&fl, &res);
2714         else
2715 #endif
2716         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2717                 fib_select_default(&res);
2718
2719         if (!fl.fl4_src)
2720                 fl.fl4_src = FIB_RES_PREFSRC(res);
2721
2722         dev_out = FIB_RES_DEV(res);
2723         fl.oif = dev_out->ifindex;
2724
2725
2726 make_route:
2727         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2728
2729 out:    return err;
2730 }
2731
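/*
 * Output route lookup: scan the route cache hash chain first and fall back
 * to ip_route_output_slow() on a miss or when caching is disabled.
 */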
2732 int __ip_route_output_key(struct net *net, struct rtable **rp,
2733                           const struct flowi *flp)
2734 {
2735         unsigned int hash;
2736         int res;
2737         struct rtable *rth;
2738
2739         if (!rt_caching(net))
2740                 goto slow_output;
2741
2742         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2743
2744         rcu_read_lock_bh();
2745         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2746                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2747                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2748                     rth->fl.fl4_src == flp->fl4_src &&
2749                     rt_is_output_route(rth) &&
2750                     rth->fl.oif == flp->oif &&
2751                     rth->fl.mark == flp->mark &&
2752                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2753                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2754                     net_eq(dev_net(rth->dst.dev), net) &&
2755                     !rt_is_expired(rth)) {
2756                         dst_use(&rth->dst, jiffies);
2757                         RT_CACHE_STAT_INC(out_hit);
2758                         rcu_read_unlock_bh();
2759                         *rp = rth;
2760                         return 0;
2761                 }
2762                 RT_CACHE_STAT_INC(out_hlist_search);
2763         }
2764         rcu_read_unlock_bh();
2765
2766 slow_output:
2767         rcu_read_lock();
2768         res = ip_route_output_slow(net, rp, flp);
2769         rcu_read_unlock();
2770         return res;
2771 }
2772 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2773
2774 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2775 {
2776         return NULL;
2777 }
2778
2779 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2780 {
2781         return 0;
2782 }
2783
2784 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2785 {
2786 }
2787
2788 static struct dst_ops ipv4_dst_blackhole_ops = {
2789         .family                 =       AF_INET,
2790         .protocol               =       cpu_to_be16(ETH_P_IP),
2791         .destroy                =       ipv4_dst_destroy,
2792         .check                  =       ipv4_blackhole_dst_check,
2793         .default_mtu            =       ipv4_blackhole_default_mtu,
2794         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2795 };
2796
2797
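/*
 * Replace *rp with a "blackhole" copy of the route whose input and output
 * handlers simply discard packets; used below by ip_route_output_flow()
 * when __xfrm_lookup() returns -EREMOTE.
 */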
2798 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2799 {
2800         struct rtable *ort = *rp;
2801         struct rtable *rt = (struct rtable *)
2802                 dst_alloc(&ipv4_dst_blackhole_ops);
2803
2804         if (rt) {
2805                 struct dst_entry *new = &rt->dst;
2806
2807                 atomic_set(&new->__refcnt, 1);
2808                 new->__use = 1;
2809                 new->input = dst_discard;
2810                 new->output = dst_discard;
2811                 dst_copy_metrics(new, &ort->dst);
2812
2813                 new->dev = ort->dst.dev;
2814                 if (new->dev)
2815                         dev_hold(new->dev);
2816
2817                 rt->fl = ort->fl;
2818
2819                 rt->rt_genid = rt_genid(net);
2820                 rt->rt_flags = ort->rt_flags;
2821                 rt->rt_type = ort->rt_type;
2822                 rt->rt_dst = ort->rt_dst;
2823                 rt->rt_src = ort->rt_src;
2824                 rt->rt_iif = ort->rt_iif;
2825                 rt->rt_gateway = ort->rt_gateway;
2826                 rt->rt_spec_dst = ort->rt_spec_dst;
2827                 rt->peer = ort->peer;
2828                 if (rt->peer)
2829                         atomic_inc(&rt->peer->refcnt);
2830                 rt->fi = ort->fi;
2831                 if (rt->fi)
2832                         atomic_inc(&rt->fi->fib_clntref);
2833
2834                 dst_free(new);
2835         }
2836
2837         dst_release(&(*rp)->dst);
2838         *rp = rt;
2839         return rt ? 0 : -ENOMEM;
2840 }
2841
2842 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2843                          struct sock *sk, int flags)
2844 {
2845         int err;
2846
2847         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2848                 return err;
2849
2850         if (flp->proto) {
2851                 if (!flp->fl4_src)
2852                         flp->fl4_src = (*rp)->rt_src;
2853                 if (!flp->fl4_dst)
2854                         flp->fl4_dst = (*rp)->rt_dst;
2855                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2856                                     flags ? XFRM_LOOKUP_WAIT : 0);
2857                 if (err == -EREMOTE)
2858                         err = ipv4_dst_blackhole(net, rp, flp);
2859
2860                 return err;
2861         }
2862
2863         return 0;
2864 }
2865 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2866
2867 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2868 {
2869         return ip_route_output_flow(net, rp, flp, NULL, 0);
2870 }
2871 EXPORT_SYMBOL(ip_route_output_key);
2872
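/*
 * Fill one netlink route message (normally RTM_NEWROUTE) describing a
 * cached route: addresses, output device, classid, preferred source,
 * gateway, metrics, fwmark and cache info.  Multicast input routes are
 * resolved through ipmr_get_route().  Fails with -EMSGSIZE when the
 * skb runs out of room.
 */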
2873 static int rt_fill_info(struct net *net,
2874                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2875                         int nowait, unsigned int flags)
2876 {
2877         struct rtable *rt = skb_rtable(skb);
2878         struct rtmsg *r;
2879         struct nlmsghdr *nlh;
2880         long expires;
2881         u32 id = 0, ts = 0, tsage = 0, error;
2882
2883         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2884         if (nlh == NULL)
2885                 return -EMSGSIZE;
2886
2887         r = nlmsg_data(nlh);
2888         r->rtm_family    = AF_INET;
2889         r->rtm_dst_len  = 32;
2890         r->rtm_src_len  = 0;
2891         r->rtm_tos      = rt->fl.fl4_tos;
2892         r->rtm_table    = RT_TABLE_MAIN;
2893         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2894         r->rtm_type     = rt->rt_type;
2895         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2896         r->rtm_protocol = RTPROT_UNSPEC;
2897         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2898         if (rt->rt_flags & RTCF_NOTIFY)
2899                 r->rtm_flags |= RTM_F_NOTIFY;
2900
2901         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2902
2903         if (rt->fl.fl4_src) {
2904                 r->rtm_src_len = 32;
2905                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2906         }
2907         if (rt->dst.dev)
2908                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2909 #ifdef CONFIG_IP_ROUTE_CLASSID
2910         if (rt->dst.tclassid)
2911                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2912 #endif
2913         if (rt_is_input_route(rt))
2914                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2915         else if (rt->rt_src != rt->fl.fl4_src)
2916                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2917
2918         if (rt->rt_dst != rt->rt_gateway)
2919                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2920
2921         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2922                 goto nla_put_failure;
2923
2924         if (rt->fl.mark)
2925                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2926
2927         error = rt->dst.error;
2928         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2929         if (rt->peer) {
2930                 inet_peer_refcheck(rt->peer);
2931                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2932                 if (rt->peer->tcp_ts_stamp) {
2933                         ts = rt->peer->tcp_ts;
2934                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2935                 }
2936         }
2937
2938         if (rt_is_input_route(rt)) {
2939 #ifdef CONFIG_IP_MROUTE
2940                 __be32 dst = rt->rt_dst;
2941
2942                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2943                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2944                         int err = ipmr_get_route(net, skb, r, nowait);
2945                         if (err <= 0) {
2946                                 if (!nowait) {
2947                                         if (err == 0)
2948                                                 return 0;
2949                                         goto nla_put_failure;
2950                                 } else {
2951                                         if (err == -EMSGSIZE)
2952                                                 goto nla_put_failure;
2953                                         error = err;
2954                                 }
2955                         }
2956                 } else
2957 #endif
2958                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2959         }
2960
2961         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2962                                expires, error) < 0)
2963                 goto nla_put_failure;
2964
2965         return nlmsg_end(skb, nlh);
2966
2967 nla_put_failure:
2968         nlmsg_cancel(skb, nlh);
2969         return -EMSGSIZE;
2970 }
2971
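/*
 * RTM_GETROUTE handler: parse the request attributes, build a dummy
 * skb and resolve the route, via ip_route_input() when an input
 * interface is given and via an output key lookup otherwise, then
 * answer the requester with rt_fill_info() through rtnl_unicast().
 */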
2972 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2973 {
2974         struct net *net = sock_net(in_skb->sk);
2975         struct rtmsg *rtm;
2976         struct nlattr *tb[RTA_MAX+1];
2977         struct rtable *rt = NULL;
2978         __be32 dst = 0;
2979         __be32 src = 0;
2980         u32 iif;
2981         int err;
2982         int mark;
2983         struct sk_buff *skb;
2984
2985         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2986         if (err < 0)
2987                 goto errout;
2988
2989         rtm = nlmsg_data(nlh);
2990
2991         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2992         if (skb == NULL) {
2993                 err = -ENOBUFS;
2994                 goto errout;
2995         }
2996
2997         /* Reserve room for dummy headers; this skb can pass
2998            through a good chunk of the routing engine.
2999          */
3000         skb_reset_mac_header(skb);
3001         skb_reset_network_header(skb);
3002
3003         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3004         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3005         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3006
3007         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3008         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3009         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3010         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3011
3012         if (iif) {
3013                 struct net_device *dev;
3014
3015                 dev = __dev_get_by_index(net, iif);
3016                 if (dev == NULL) {
3017                         err = -ENODEV;
3018                         goto errout_free;
3019                 }
3020
3021                 skb->protocol   = htons(ETH_P_IP);
3022                 skb->dev        = dev;
3023                 skb->mark       = mark;
3024                 local_bh_disable();
3025                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3026                 local_bh_enable();
3027
3028                 rt = skb_rtable(skb);
3029                 if (err == 0 && rt->dst.error)
3030                         err = -rt->dst.error;
3031         } else {
3032                 struct flowi fl = {
3033                         .fl4_dst = dst,
3034                         .fl4_src = src,
3035                         .fl4_tos = rtm->rtm_tos,
3036                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3037                         .mark = mark,
3038                 };
3039                 err = ip_route_output_key(net, &rt, &fl);
3040         }
3041
3042         if (err)
3043                 goto errout_free;
3044
3045         skb_dst_set(skb, &rt->dst);
3046         if (rtm->rtm_flags & RTM_F_NOTIFY)
3047                 rt->rt_flags |= RTCF_NOTIFY;
3048
3049         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3050                            RTM_NEWROUTE, 0, 0);
3051         if (err <= 0)
3052                 goto errout_free;
3053
3054         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3055 errout:
3056         return err;
3057
3058 errout_free:
3059         kfree_skb(skb);
3060         goto errout;
3061 }
3062
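/*
 * Netlink dump of the route cache: walk every hash chain under
 * rcu_read_lock_bh(), skip entries belonging to other namespaces or
 * already expired, and emit one message per route.  cb->args[0]/[1]
 * record the bucket and index so an interrupted dump can resume where
 * it stopped.
 */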
3063 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3064 {
3065         struct rtable *rt;
3066         int h, s_h;
3067         int idx, s_idx;
3068         struct net *net;
3069
3070         net = sock_net(skb->sk);
3071
3072         s_h = cb->args[0];
3073         if (s_h < 0)
3074                 s_h = 0;
3075         s_idx = idx = cb->args[1];
3076         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3077                 if (!rt_hash_table[h].chain)
3078                         continue;
3079                 rcu_read_lock_bh();
3080                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3081                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3082                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3083                                 continue;
3084                         if (rt_is_expired(rt))
3085                                 continue;
3086                         skb_dst_set_noref(skb, &rt->dst);
3087                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3088                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3089                                          1, NLM_F_MULTI) <= 0) {
3090                                 skb_dst_drop(skb);
3091                                 rcu_read_unlock_bh();
3092                                 goto done;
3093                         }
3094                         skb_dst_drop(skb);
3095                 }
3096                 rcu_read_unlock_bh();
3097         }
3098
3099 done:
3100         cb->args[0] = h;
3101         cb->args[1] = idx;
3102         return skb->len;
3103 }
3104
3105 void ip_rt_multicast_event(struct in_device *in_dev)
3106 {
3107         rt_cache_flush(dev_net(in_dev->dev), 0);
3108 }
3109
3110 #ifdef CONFIG_SYSCTL
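/*
 * Write-only handler for /proc/sys/net/ipv4/route/flush: parse the
 * requested delay into a local copy of the table and flush the route
 * cache of the namespace stashed in ->extra1.  Reads return -EINVAL.
 */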
3111 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3112                                         void __user *buffer,
3113                                         size_t *lenp, loff_t *ppos)
3114 {
3115         if (write) {
3116                 int flush_delay;
3117                 ctl_table ctl;
3118                 struct net *net;
3119
3120                 memcpy(&ctl, __ctl, sizeof(ctl));
3121                 ctl.data = &flush_delay;
3122                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3123
3124                 net = (struct net *)__ctl->extra1;
3125                 rt_cache_flush(net, flush_delay);
3126                 return 0;
3127         }
3128
3129         return -EINVAL;
3130 }
3131
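/* Route cache tunables exported under net.ipv4.route */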
3132 static ctl_table ipv4_route_table[] = {
3133         {
3134                 .procname       = "gc_thresh",
3135                 .data           = &ipv4_dst_ops.gc_thresh,
3136                 .maxlen         = sizeof(int),
3137                 .mode           = 0644,
3138                 .proc_handler   = proc_dointvec,
3139         },
3140         {
3141                 .procname       = "max_size",
3142                 .data           = &ip_rt_max_size,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec,
3146         },
3147         {
3148                 /*  Deprecated. Use gc_min_interval_ms */
3149
3150                 .procname       = "gc_min_interval",
3151                 .data           = &ip_rt_gc_min_interval,
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0644,
3154                 .proc_handler   = proc_dointvec_jiffies,
3155         },
3156         {
3157                 .procname       = "gc_min_interval_ms",
3158                 .data           = &ip_rt_gc_min_interval,
3159                 .maxlen         = sizeof(int),
3160                 .mode           = 0644,
3161                 .proc_handler   = proc_dointvec_ms_jiffies,
3162         },
3163         {
3164                 .procname       = "gc_timeout",
3165                 .data           = &ip_rt_gc_timeout,
3166                 .maxlen         = sizeof(int),
3167                 .mode           = 0644,
3168                 .proc_handler   = proc_dointvec_jiffies,
3169         },
3170         {
3171                 .procname       = "gc_interval",
3172                 .data           = &ip_rt_gc_interval,
3173                 .maxlen         = sizeof(int),
3174                 .mode           = 0644,
3175                 .proc_handler   = proc_dointvec_jiffies,
3176         },
3177         {
3178                 .procname       = "redirect_load",
3179                 .data           = &ip_rt_redirect_load,
3180                 .maxlen         = sizeof(int),
3181                 .mode           = 0644,
3182                 .proc_handler   = proc_dointvec,
3183         },
3184         {
3185                 .procname       = "redirect_number",
3186                 .data           = &ip_rt_redirect_number,
3187                 .maxlen         = sizeof(int),
3188                 .mode           = 0644,
3189                 .proc_handler   = proc_dointvec,
3190         },
3191         {
3192                 .procname       = "redirect_silence",
3193                 .data           = &ip_rt_redirect_silence,
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0644,
3196                 .proc_handler   = proc_dointvec,
3197         },
3198         {
3199                 .procname       = "error_cost",
3200                 .data           = &ip_rt_error_cost,
3201                 .maxlen         = sizeof(int),
3202                 .mode           = 0644,
3203                 .proc_handler   = proc_dointvec,
3204         },
3205         {
3206                 .procname       = "error_burst",
3207                 .data           = &ip_rt_error_burst,
3208                 .maxlen         = sizeof(int),
3209                 .mode           = 0644,
3210                 .proc_handler   = proc_dointvec,
3211         },
3212         {
3213                 .procname       = "gc_elasticity",
3214                 .data           = &ip_rt_gc_elasticity,
3215                 .maxlen         = sizeof(int),
3216                 .mode           = 0644,
3217                 .proc_handler   = proc_dointvec,
3218         },
3219         {
3220                 .procname       = "mtu_expires",
3221                 .data           = &ip_rt_mtu_expires,
3222                 .maxlen         = sizeof(int),
3223                 .mode           = 0644,
3224                 .proc_handler   = proc_dointvec_jiffies,
3225         },
3226         {
3227                 .procname       = "min_pmtu",
3228                 .data           = &ip_rt_min_pmtu,
3229                 .maxlen         = sizeof(int),
3230                 .mode           = 0644,
3231                 .proc_handler   = proc_dointvec,
3232         },
3233         {
3234                 .procname       = "min_adv_mss",
3235                 .data           = &ip_rt_min_advmss,
3236                 .maxlen         = sizeof(int),
3237                 .mode           = 0644,
3238                 .proc_handler   = proc_dointvec,
3239         },
3240         { }
3241 };
3242
3243 static struct ctl_table empty[1];
3244
3245 static struct ctl_table ipv4_skeleton[] =
3246 {
3247         { .procname = "route", 
3248           .mode = 0555, .child = ipv4_route_table},
3249         { .procname = "neigh", 
3250           .mode = 0555, .child = empty},
3251         { }
3252 };
3253
3254 static __net_initdata struct ctl_path ipv4_path[] = {
3255         { .procname = "net", },
3256         { .procname = "ipv4", },
3257         { },
3258 };
3259
3260 static struct ctl_table ipv4_route_flush_table[] = {
3261         {
3262                 .procname       = "flush",
3263                 .maxlen         = sizeof(int),
3264                 .mode           = 0200,
3265                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3266         },
3267         { },
3268 };
3269
3270 static __net_initdata struct ctl_path ipv4_route_path[] = {
3271         { .procname = "net", },
3272         { .procname = "ipv4", },
3273         { .procname = "route", },
3274         { },
3275 };
3276
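/*
 * Register the per-namespace "flush" sysctl; namespaces other than
 * init_net get their own copy of the table so ->extra1 can point at
 * the right struct net.
 */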
3277 static __net_init int sysctl_route_net_init(struct net *net)
3278 {
3279         struct ctl_table *tbl;
3280
3281         tbl = ipv4_route_flush_table;
3282         if (!net_eq(net, &init_net)) {
3283                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3284                 if (tbl == NULL)
3285                         goto err_dup;
3286         }
3287         tbl[0].extra1 = net;
3288
3289         net->ipv4.route_hdr =
3290                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3291         if (net->ipv4.route_hdr == NULL)
3292                 goto err_reg;
3293         return 0;
3294
3295 err_reg:
3296         if (tbl != ipv4_route_flush_table)
3297                 kfree(tbl);
3298 err_dup:
3299         return -ENOMEM;
3300 }
3301
3302 static __net_exit void sysctl_route_net_exit(struct net *net)
3303 {
3304         struct ctl_table *tbl;
3305
3306         tbl = net->ipv4.route_hdr->ctl_table_arg;
3307         unregister_net_sysctl_table(net->ipv4.route_hdr);
3308         BUG_ON(tbl == ipv4_route_flush_table);
3309         kfree(tbl);
3310 }
3311
3312 static __net_initdata struct pernet_operations sysctl_route_ops = {
3313         .init = sysctl_route_net_init,
3314         .exit = sysctl_route_net_exit,
3315 };
3316 #endif
3317
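/*
 * Seed the per-namespace route cache generation id; entries whose
 * genid no longer matches are treated as expired (see rt_is_expired()).
 */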
3318 static __net_init int rt_genid_init(struct net *net)
3319 {
3320         get_random_bytes(&net->ipv4.rt_genid,
3321                          sizeof(net->ipv4.rt_genid));
3322         return 0;
3323 }
3324
3325 static __net_initdata struct pernet_operations rt_genid_ops = {
3326         .init = rt_genid_init,
3327 };
3328
3329
3330 #ifdef CONFIG_IP_ROUTE_CLASSID
3331 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3332 #endif /* CONFIG_IP_ROUTE_CLASSID */
3333
3334 static __initdata unsigned long rhash_entries;
3335 static int __init set_rhash_entries(char *str)
3336 {
3337         if (!str)
3338                 return 0;
3339         rhash_entries = simple_strtoul(str, &str, 0);
3340         return 1;
3341 }
3342 __setup("rhash_entries=", set_rhash_entries);
3343
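/*
 * Module init: allocate the per-cpu classid accounting area and the
 * dst slab caches, size the route cache hash table (tunable via the
 * rhash_entries= boot parameter), derive gc_thresh and ip_rt_max_size
 * from the table size, initialise devinet and the FIB, start the
 * deferred garbage-collection worker at a randomised offset, create
 * the /proc files, hook up XFRM and the RTM_GETROUTE handler, and
 * register the sysctl and genid pernet operations.
 */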
3344 int __init ip_rt_init(void)
3345 {
3346         int rc = 0;
3347
3348 #ifdef CONFIG_IP_ROUTE_CLASSID
3349         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3350         if (!ip_rt_acct)
3351                 panic("IP: failed to allocate ip_rt_acct\n");
3352 #endif
3353
3354         ipv4_dst_ops.kmem_cachep =
3355                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3356                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3357
3358         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3359
3360         if (dst_entries_init(&ipv4_dst_ops) < 0)
3361                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3362
3363         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3364                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3365
3366         rt_hash_table = (struct rt_hash_bucket *)
3367                 alloc_large_system_hash("IP route cache",
3368                                         sizeof(struct rt_hash_bucket),
3369                                         rhash_entries,
3370                                         (totalram_pages >= 128 * 1024) ?
3371                                         15 : 17,
3372                                         0,
3373                                         &rt_hash_log,
3374                                         &rt_hash_mask,
3375                                         rhash_entries ? 0 : 512 * 1024);
3376         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3377         rt_hash_lock_init();
3378
3379         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3380         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3381
3382         devinet_init();
3383         ip_fib_init();
3384
3385         /* All the timers started at system startup tend
3386            to synchronize. Perturb it a bit.
3387          */
3388         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3389         expires_ljiffies = jiffies;
3390         schedule_delayed_work(&expires_work,
3391                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3392
3393         if (ip_rt_proc_init())
3394                 printk(KERN_ERR "Unable to create route proc files\n");
3395 #ifdef CONFIG_XFRM
3396         xfrm_init();
3397         xfrm4_init(ip_rt_max_size);
3398 #endif
3399         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3400
3401 #ifdef CONFIG_SYSCTL
3402         register_pernet_subsys(&sysctl_route_ops);
3403 #endif
3404         register_pernet_subsys(&rt_genid_ops);
3405         return rc;
3406 }
3407
3408 #ifdef CONFIG_SYSCTL
3409 /*
3410  * We really need to sanitize the damn ipv4 init order; then all
3411  * this nonsense will go away.
3412  */
3413 void __init ip_static_sysctl_init(void)
3414 {
3415         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3416 }
3417 #endif