net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151                             int how)
152 {
153 }
154
155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 {
157         struct rtable *rt = (struct rtable *) dst;
158         struct inet_peer *peer;
159         u32 *p = NULL;
160
161         if (!rt->peer)
162                 rt_bind_peer(rt, 1);
163
164         peer = rt->peer;
165         if (peer) {
166                 u32 *old_p = __DST_METRICS_PTR(old);
167                 unsigned long prev, new;
168
169                 p = peer->metrics;
170                 if (inet_metrics_new(peer))
171                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172
173                 new = (unsigned long) p;
174                 prev = cmpxchg(&dst->_metrics, old, new);
175
176                 if (prev != old) {
177                         p = __DST_METRICS_PTR(prev);
178                         if (prev & DST_METRICS_READ_ONLY)
179                                 p = NULL;
180                 } else {
181                         if (rt->fi) {
182                                 fib_info_put(rt->fi);
183                                 rt->fi = NULL;
184                         }
185                 }
186         }
187         return p;
188 }
189
190 static struct dst_ops ipv4_dst_ops = {
191         .family =               AF_INET,
192         .protocol =             cpu_to_be16(ETH_P_IP),
193         .gc =                   rt_garbage_collect,
194         .check =                ipv4_dst_check,
195         .default_advmss =       ipv4_default_advmss,
196         .default_mtu =          ipv4_default_mtu,
197         .cow_metrics =          ipv4_cow_metrics,
198         .destroy =              ipv4_dst_destroy,
199         .ifdown =               ipv4_dst_ifdown,
200         .negative_advice =      ipv4_negative_advice,
201         .link_failure =         ipv4_link_failure,
202         .update_pmtu =          ip_rt_update_pmtu,
203         .local_out =            __ip_local_out,
204 };
205
206 #define ECN_OR_COST(class)      TC_PRIO_##class
207
208 const __u8 ip_tos2prio[16] = {
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(FILLER),
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK)
225 };
226
227
228 /*
229  * Route cache.
230  */
231
232 /* The locking scheme is rather straightforward:
233  *
234  * 1) Read-Copy Update protects the buckets of the central route hash.
235  * 2) Only writers remove entries, and they hold the lock
236  *    as they look at rtable reference counts.
237  * 3) Only readers acquire references to rtable entries,
238  *    they do so with atomic increments and with the
239  *    lock held.
240  */
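/*
 * A minimal sketch of how that scheme is used further down, with readers
 * such as rt_cache_get_first() and writers such as rt_intern_hash()
 * defined later in this file:
 *
 *	reader:
 *		rcu_read_lock_bh();
 *		r = rcu_dereference_bh(rt_hash_table[hash].chain);
 *		... walk the r->dst.rt_next chain ...
 *		rcu_read_unlock_bh();
 *
 *	writer:
 *		spin_lock_bh(rt_hash_lock_addr(hash));
 *		rcu_assign_pointer(rt->dst.rt_next, rt_hash_table[hash].chain);
 *		rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 *		spin_unlock_bh(rt_hash_lock_addr(hash));
 */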
241
242 struct rt_hash_bucket {
243         struct rtable __rcu     *chain;
244 };
245
246 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247         defined(CONFIG_PROVE_LOCKING)
248 /*
249  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
250  * The size of this table is a power of two and depends on the number of CPUs.
251  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
252  */
253 #ifdef CONFIG_LOCKDEP
254 # define RT_HASH_LOCK_SZ        256
255 #else
256 # if NR_CPUS >= 32
257 #  define RT_HASH_LOCK_SZ       4096
258 # elif NR_CPUS >= 16
259 #  define RT_HASH_LOCK_SZ       2048
260 # elif NR_CPUS >= 8
261 #  define RT_HASH_LOCK_SZ       1024
262 # elif NR_CPUS >= 4
263 #  define RT_HASH_LOCK_SZ       512
264 # else
265 #  define RT_HASH_LOCK_SZ       256
266 # endif
267 #endif
268
269 static spinlock_t       *rt_hash_locks;
270 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272 static __init void rt_hash_lock_init(void)
273 {
274         int i;
275
276         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277                         GFP_KERNEL);
278         if (!rt_hash_locks)
279                 panic("IP: failed to allocate rt_hash_locks\n");
280
281         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282                 spin_lock_init(&rt_hash_locks[i]);
283 }
284 #else
285 # define rt_hash_lock_addr(slot) NULL
286
287 static inline void rt_hash_lock_init(void)
288 {
289 }
290 #endif
291
292 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
293 static unsigned                 rt_hash_mask __read_mostly;
294 static unsigned int             rt_hash_log  __read_mostly;
295
296 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
299 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300                                    int genid)
301 {
302         return jhash_3words((__force u32)daddr, (__force u32)saddr,
303                             idx, genid)
304                 & rt_hash_mask;
305 }
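/*
 * rt_hash_mask is one less than the (power-of-two) number of buckets, so
 * the jhash value is reduced to a bucket index with a single AND: for a
 * hypothetical table of 65536 buckets, rt_hash_mask == 0xffff and
 * rt_hash_log == 16.  Because the per-namespace genid is one of the hash
 * inputs, bumping it in rt_cache_invalidate() both redistributes future
 * lookups and lets rt_is_expired() reject every pre-existing entry.
 */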
306
307 static inline int rt_genid(struct net *net)
308 {
309         return atomic_read(&net->ipv4.rt_genid);
310 }
311
312 #ifdef CONFIG_PROC_FS
313 struct rt_cache_iter_state {
314         struct seq_net_private p;
315         int bucket;
316         int genid;
317 };
318
319 static struct rtable *rt_cache_get_first(struct seq_file *seq)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         struct rtable *r = NULL;
323
324         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
326                         continue;
327                 rcu_read_lock_bh();
328                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329                 while (r) {
330                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331                             r->rt_genid == st->genid)
332                                 return r;
333                         r = rcu_dereference_bh(r->dst.rt_next);
334                 }
335                 rcu_read_unlock_bh();
336         }
337         return r;
338 }
339
340 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341                                           struct rtable *r)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344
345         r = rcu_dereference_bh(r->dst.rt_next);
346         while (!r) {
347                 rcu_read_unlock_bh();
348                 do {
349                         if (--st->bucket < 0)
350                                 return NULL;
351                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
352                 rcu_read_lock_bh();
353                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354         }
355         return r;
356 }
357
358 static struct rtable *rt_cache_get_next(struct seq_file *seq,
359                                         struct rtable *r)
360 {
361         struct rt_cache_iter_state *st = seq->private;
362         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363                 if (dev_net(r->dst.dev) != seq_file_net(seq))
364                         continue;
365                 if (r->rt_genid == st->genid)
366                         break;
367         }
368         return r;
369 }
370
371 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372 {
373         struct rtable *r = rt_cache_get_first(seq);
374
375         if (r)
376                 while (pos && (r = rt_cache_get_next(seq, r)))
377                         --pos;
378         return pos ? NULL : r;
379 }
380
381 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382 {
383         struct rt_cache_iter_state *st = seq->private;
384         if (*pos)
385                 return rt_cache_get_idx(seq, *pos - 1);
386         st->genid = rt_genid(seq_file_net(seq));
387         return SEQ_START_TOKEN;
388 }
389
390 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391 {
392         struct rtable *r;
393
394         if (v == SEQ_START_TOKEN)
395                 r = rt_cache_get_first(seq);
396         else
397                 r = rt_cache_get_next(seq, v);
398         ++*pos;
399         return r;
400 }
401
402 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403 {
404         if (v && v != SEQ_START_TOKEN)
405                 rcu_read_unlock_bh();
406 }
407
408 static int rt_cache_seq_show(struct seq_file *seq, void *v)
409 {
410         if (v == SEQ_START_TOKEN)
411                 seq_printf(seq, "%-127s\n",
412                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414                            "HHUptod\tSpecDst");
415         else {
416                 struct rtable *r = v;
417                 int len;
418
419                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
420                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
421                         r->dst.dev ? r->dst.dev->name : "*",
422                         (__force u32)r->rt_dst,
423                         (__force u32)r->rt_gateway,
424                         r->rt_flags, atomic_read(&r->dst.__refcnt),
425                         r->dst.__use, 0, (__force u32)r->rt_src,
426                         dst_metric_advmss(&r->dst) + 40,
427                         dst_metric(&r->dst, RTAX_WINDOW),
428                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
429                               dst_metric(&r->dst, RTAX_RTTVAR)),
430                         r->fl.fl4_tos,
431                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
432                         r->dst.hh ? (r->dst.hh->hh_output ==
433                                        dev_queue_xmit) : 0,
434                         r->rt_spec_dst, &len);
435
436                 seq_printf(seq, "%*s\n", 127 - len, "");
437         }
438         return 0;
439 }
440
441 static const struct seq_operations rt_cache_seq_ops = {
442         .start  = rt_cache_seq_start,
443         .next   = rt_cache_seq_next,
444         .stop   = rt_cache_seq_stop,
445         .show   = rt_cache_seq_show,
446 };
447
448 static int rt_cache_seq_open(struct inode *inode, struct file *file)
449 {
450         return seq_open_net(inode, file, &rt_cache_seq_ops,
451                         sizeof(struct rt_cache_iter_state));
452 }
453
454 static const struct file_operations rt_cache_seq_fops = {
455         .owner   = THIS_MODULE,
456         .open    = rt_cache_seq_open,
457         .read    = seq_read,
458         .llseek  = seq_lseek,
459         .release = seq_release_net,
460 };
461
462
463 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
464 {
465         int cpu;
466
467         if (*pos == 0)
468                 return SEQ_START_TOKEN;
469
470         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
471                 if (!cpu_possible(cpu))
472                         continue;
473                 *pos = cpu+1;
474                 return &per_cpu(rt_cache_stat, cpu);
475         }
476         return NULL;
477 }
478
479 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
480 {
481         int cpu;
482
483         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
484                 if (!cpu_possible(cpu))
485                         continue;
486                 *pos = cpu+1;
487                 return &per_cpu(rt_cache_stat, cpu);
488         }
489         return NULL;
490
491 }
492
493 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
494 {
495
496 }
497
498 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
499 {
500         struct rt_cache_stat *st = v;
501
502         if (v == SEQ_START_TOKEN) {
503                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
504                 return 0;
505         }
506
507         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
508                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
509                    dst_entries_get_slow(&ipv4_dst_ops),
510                    st->in_hit,
511                    st->in_slow_tot,
512                    st->in_slow_mc,
513                    st->in_no_route,
514                    st->in_brd,
515                    st->in_martian_dst,
516                    st->in_martian_src,
517
518                    st->out_hit,
519                    st->out_slow_tot,
520                    st->out_slow_mc,
521
522                    st->gc_total,
523                    st->gc_ignored,
524                    st->gc_goal_miss,
525                    st->gc_dst_overflow,
526                    st->in_hlist_search,
527                    st->out_hlist_search
528                 );
529         return 0;
530 }
531
532 static const struct seq_operations rt_cpu_seq_ops = {
533         .start  = rt_cpu_seq_start,
534         .next   = rt_cpu_seq_next,
535         .stop   = rt_cpu_seq_stop,
536         .show   = rt_cpu_seq_show,
537 };
538
539
540 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
541 {
542         return seq_open(file, &rt_cpu_seq_ops);
543 }
544
545 static const struct file_operations rt_cpu_seq_fops = {
546         .owner   = THIS_MODULE,
547         .open    = rt_cpu_seq_open,
548         .read    = seq_read,
549         .llseek  = seq_lseek,
550         .release = seq_release,
551 };
552
553 #ifdef CONFIG_IP_ROUTE_CLASSID
554 static int rt_acct_proc_show(struct seq_file *m, void *v)
555 {
556         struct ip_rt_acct *dst, *src;
557         unsigned int i, j;
558
559         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
560         if (!dst)
561                 return -ENOMEM;
562
563         for_each_possible_cpu(i) {
564                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
565                 for (j = 0; j < 256; j++) {
566                         dst[j].o_bytes   += src[j].o_bytes;
567                         dst[j].o_packets += src[j].o_packets;
568                         dst[j].i_bytes   += src[j].i_bytes;
569                         dst[j].i_packets += src[j].i_packets;
570                 }
571         }
572
573         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
574         kfree(dst);
575         return 0;
576 }
577
578 static int rt_acct_proc_open(struct inode *inode, struct file *file)
579 {
580         return single_open(file, rt_acct_proc_show, NULL);
581 }
582
583 static const struct file_operations rt_acct_proc_fops = {
584         .owner          = THIS_MODULE,
585         .open           = rt_acct_proc_open,
586         .read           = seq_read,
587         .llseek         = seq_lseek,
588         .release        = single_release,
589 };
590 #endif
591
592 static int __net_init ip_rt_do_proc_init(struct net *net)
593 {
594         struct proc_dir_entry *pde;
595
596         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
597                         &rt_cache_seq_fops);
598         if (!pde)
599                 goto err1;
600
601         pde = proc_create("rt_cache", S_IRUGO,
602                           net->proc_net_stat, &rt_cpu_seq_fops);
603         if (!pde)
604                 goto err2;
605
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
608         if (!pde)
609                 goto err3;
610 #endif
611         return 0;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 err3:
615         remove_proc_entry("rt_cache", net->proc_net_stat);
616 #endif
617 err2:
618         remove_proc_entry("rt_cache", net->proc_net);
619 err1:
620         return -ENOMEM;
621 }
622
623 static void __net_exit ip_rt_do_proc_exit(struct net *net)
624 {
625         remove_proc_entry("rt_cache", net->proc_net_stat);
626         remove_proc_entry("rt_cache", net->proc_net);
627 #ifdef CONFIG_IP_ROUTE_CLASSID
628         remove_proc_entry("rt_acct", net->proc_net);
629 #endif
630 }
631
632 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
633         .init = ip_rt_do_proc_init,
634         .exit = ip_rt_do_proc_exit,
635 };
636
637 static int __init ip_rt_proc_init(void)
638 {
639         return register_pernet_subsys(&ip_rt_proc_ops);
640 }
641
642 #else
643 static inline int ip_rt_proc_init(void)
644 {
645         return 0;
646 }
647 #endif /* CONFIG_PROC_FS */
648
649 static inline void rt_free(struct rtable *rt)
650 {
651         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
652 }
653
654 static inline void rt_drop(struct rtable *rt)
655 {
656         ip_rt_put(rt);
657         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
658 }
659
660 static inline int rt_fast_clean(struct rtable *rth)
661 {
662         /* Kill broadcast/multicast entries very aggressively if they
663            collide in the hash table with more useful entries */
664         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
665                 rt_is_input_route(rth) && rth->dst.rt_next;
666 }
667
668 static inline int rt_valuable(struct rtable *rth)
669 {
670         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
671                 rth->dst.expires;
672 }
673
674 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
675 {
676         unsigned long age;
677         int ret = 0;
678
679         if (atomic_read(&rth->dst.__refcnt))
680                 goto out;
681
682         ret = 1;
683         if (rth->dst.expires &&
684             time_after_eq(jiffies, rth->dst.expires))
685                 goto out;
686
687         age = jiffies - rth->dst.lastuse;
688         ret = 0;
689         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
690             (age <= tmo2 && rt_valuable(rth)))
691                 goto out;
692         ret = 1;
693 out:    return ret;
694 }
695
696 /* Bits of score are:
697  * 31: very valuable
698  * 30: not quite useless
699  * 29..0: usage counter
700  */
701 static inline u32 rt_score(struct rtable *rt)
702 {
703         u32 score = jiffies - rt->dst.lastuse;
704
705         score = ~score & ~(3<<30);
706
707         if (rt_valuable(rt))
708                 score |= (1<<31);
709
710         if (rt_is_output_route(rt) ||
711             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
712                 score |= (1<<30);
713
714         return score;
715 }
716
717 static inline bool rt_caching(const struct net *net)
718 {
719         return net->ipv4.current_rt_cache_rebuild_count <=
720                 net->ipv4.sysctl_rt_cache_rebuild_count;
721 }
722
723 static inline bool compare_hash_inputs(const struct flowi *fl1,
724                                         const struct flowi *fl2)
725 {
726         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
727                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
728                 (fl1->iif ^ fl2->iif)) == 0);
729 }
730
731 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
732 {
733         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
734                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
735                 (fl1->mark ^ fl2->mark) |
736                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
737                 (fl1->oif ^ fl2->oif) |
738                 (fl1->iif ^ fl2->iif)) == 0;
739 }
740
741 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
742 {
743         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
744 }
745
746 static inline int rt_is_expired(struct rtable *rth)
747 {
748         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
749 }
750
751 /*
752  * Perform a full scan of the hash table and free all entries.
753  * Can be called from a softirq or from process context.
754  * In the latter case, we want to reschedule if necessary.
755  */
756 static void rt_do_flush(struct net *net, int process_context)
757 {
758         unsigned int i;
759         struct rtable *rth, *next;
760
761         for (i = 0; i <= rt_hash_mask; i++) {
762                 struct rtable __rcu **pprev;
763                 struct rtable *list;
764
765                 if (process_context && need_resched())
766                         cond_resched();
767                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
768                 if (!rth)
769                         continue;
770
771                 spin_lock_bh(rt_hash_lock_addr(i));
772
773                 list = NULL;
774                 pprev = &rt_hash_table[i].chain;
775                 rth = rcu_dereference_protected(*pprev,
776                         lockdep_is_held(rt_hash_lock_addr(i)));
777
778                 while (rth) {
779                         next = rcu_dereference_protected(rth->dst.rt_next,
780                                 lockdep_is_held(rt_hash_lock_addr(i)));
781
782                         if (!net ||
783                             net_eq(dev_net(rth->dst.dev), net)) {
784                                 rcu_assign_pointer(*pprev, next);
785                                 rcu_assign_pointer(rth->dst.rt_next, list);
786                                 list = rth;
787                         } else {
788                                 pprev = &rth->dst.rt_next;
789                         }
790                         rth = next;
791                 }
792
793                 spin_unlock_bh(rt_hash_lock_addr(i));
794
795                 for (; list; list = next) {
796                         next = rcu_dereference_protected(list->dst.rt_next, 1);
797                         rt_free(list);
798                 }
799         }
800 }
801
802 /*
803  * While freeing expired entries, we compute average chain length
804  * and standard deviation, using fixed-point arithmetic.
805  * This gives an estimate of rt_chain_length_max:
806  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
807  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
808  */
809
810 #define FRACT_BITS 3
811 #define ONE (1UL << FRACT_BITS)
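/*
 * Worked example of the fixed-point bookkeeping used by rt_check_expire():
 * has_noalias() contributes ONE (== 8) per distinct-hash-input entry, so a
 * measured average chain length of 2 entries accumulates as avg == 16 and a
 * standard deviation of 1 entry as sd == 8.  (avg + 4*sd) >> FRACT_BITS is
 * then (16 + 32) >> 3 == 6, and rt_chain_length_max becomes
 * max(ip_rt_gc_elasticity, 6) == 8 with the default elasticity of 8.
 */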
812
813 /*
814  * Given a hash chain and an item in this hash chain,
815  * find if a previous entry has the same hash_inputs
816  * (but differs on tos, mark or oif)
817  * Returns 0 if an alias is found.
818  * Returns ONE if rth has no alias before itself.
819  */
820 static int has_noalias(const struct rtable *head, const struct rtable *rth)
821 {
822         const struct rtable *aux = head;
823
824         while (aux != rth) {
825                 if (compare_hash_inputs(&aux->fl, &rth->fl))
826                         return 0;
827                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
828         }
829         return ONE;
830 }
831
832 static void rt_check_expire(void)
833 {
834         static unsigned int rover;
835         unsigned int i = rover, goal;
836         struct rtable *rth;
837         struct rtable __rcu **rthp;
838         unsigned long samples = 0;
839         unsigned long sum = 0, sum2 = 0;
840         unsigned long delta;
841         u64 mult;
842
843         delta = jiffies - expires_ljiffies;
844         expires_ljiffies = jiffies;
845         mult = ((u64)delta) << rt_hash_log;
846         if (ip_rt_gc_timeout > 1)
847                 do_div(mult, ip_rt_gc_timeout);
848         goal = (unsigned int)mult;
849         if (goal > rt_hash_mask)
850                 goal = rt_hash_mask + 1;
851         for (; goal > 0; goal--) {
852                 unsigned long tmo = ip_rt_gc_timeout;
853                 unsigned long length;
854
855                 i = (i + 1) & rt_hash_mask;
856                 rthp = &rt_hash_table[i].chain;
857
858                 if (need_resched())
859                         cond_resched();
860
861                 samples++;
862
863                 if (rcu_dereference_raw(*rthp) == NULL)
864                         continue;
865                 length = 0;
866                 spin_lock_bh(rt_hash_lock_addr(i));
867                 while ((rth = rcu_dereference_protected(*rthp,
868                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
869                         prefetch(rth->dst.rt_next);
870                         if (rt_is_expired(rth)) {
871                                 *rthp = rth->dst.rt_next;
872                                 rt_free(rth);
873                                 continue;
874                         }
875                         if (rth->dst.expires) {
876                                 /* Entry is expired even if it is in use */
877                                 if (time_before_eq(jiffies, rth->dst.expires)) {
878 nofree:
879                                         tmo >>= 1;
880                                         rthp = &rth->dst.rt_next;
881                                         /*
882                                          * We only count entries on
883                                          * a chain with equal hash inputs once
884                                          * so that entries for different QOS
885                                          * levels, and other non-hash input
886                                          * attributes don't unfairly skew
887                                          * the length computation
888                                          */
889                                         length += has_noalias(rt_hash_table[i].chain, rth);
890                                         continue;
891                                 }
892                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
893                                 goto nofree;
894
895                         /* Cleanup aged off entries. */
896                         *rthp = rth->dst.rt_next;
897                         rt_free(rth);
898                 }
899                 spin_unlock_bh(rt_hash_lock_addr(i));
900                 sum += length;
901                 sum2 += length*length;
902         }
903         if (samples) {
904                 unsigned long avg = sum / samples;
905                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
906                 rt_chain_length_max = max_t(unsigned long,
907                                         ip_rt_gc_elasticity,
908                                         (avg + 4*sd) >> FRACT_BITS);
909         }
910         rover = i;
911 }
912
913 /*
914  * rt_worker_func() is run in process context.
915  * We call rt_check_expire() to scan part of the hash table.
916  */
917 static void rt_worker_func(struct work_struct *work)
918 {
919         rt_check_expire();
920         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
921 }
922
923 /*
924  * Perturbation of rt_genid by a small quantity [1..256].
925  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
926  * many times (2^24) without reusing a recent rt_genid.
927  * Jenkins hash is strong enough that little changes of rt_genid are OK.
928  */
929 static void rt_cache_invalidate(struct net *net)
930 {
931         unsigned char shuffle;
932
933         get_random_bytes(&shuffle, sizeof(shuffle));
934         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
935 }
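/*
 * Arithmetic behind the 2^24 figure in the comment above
 * rt_cache_invalidate(): each call adds a value in [1, 256] to the 32-bit
 * genid, so even in the worst case it takes at least 2^32 / 256 == 2^24
 * invalidations before a recently used genid value can recur.
 */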
936
937 /*
938  * delay < 0  : invalidate cache (fast : entries will be deleted later)
939  * delay >= 0 : invalidate & flush cache (can be long)
940  */
941 void rt_cache_flush(struct net *net, int delay)
942 {
943         rt_cache_invalidate(net);
944         if (delay >= 0)
945                 rt_do_flush(net, !in_softirq());
946 }
947
948 /* Flush previously invalidated entries from the cache */
949 void rt_cache_flush_batch(struct net *net)
950 {
951         rt_do_flush(net, !in_softirq());
952 }
953
954 static void rt_emergency_hash_rebuild(struct net *net)
955 {
956         if (net_ratelimit())
957                 printk(KERN_WARNING "Route hash chain too long!\n");
958         rt_cache_invalidate(net);
959 }
960
961 /*
962    Short description of GC goals.
963
964    We want to build an algorithm which keeps the routing cache
965    at some equilibrium point, where the number of aged-off entries
966    stays approximately equal to the number of newly generated ones.
967
968    The current expiration strength is the variable "expire".
969    We try to adjust it dynamically, so that when networking
970    is idle, expire is large enough to keep enough warm entries,
971    and when load increases it shrinks to limit the cache size.
972  */
973
974 static int rt_garbage_collect(struct dst_ops *ops)
975 {
976         static unsigned long expire = RT_GC_TIMEOUT;
977         static unsigned long last_gc;
978         static int rover;
979         static int equilibrium;
980         struct rtable *rth;
981         struct rtable __rcu **rthp;
982         unsigned long now = jiffies;
983         int goal;
984         int entries = dst_entries_get_fast(&ipv4_dst_ops);
985
986         /*
987          * Garbage collection is pretty expensive,
988          * do not run it too frequently.
989          */
990
991         RT_CACHE_STAT_INC(gc_total);
992
993         if (now - last_gc < ip_rt_gc_min_interval &&
994             entries < ip_rt_max_size) {
995                 RT_CACHE_STAT_INC(gc_ignored);
996                 goto out;
997         }
998
999         entries = dst_entries_get_slow(&ipv4_dst_ops);
1000         /* Calculate the number of entries we want to expire now. */
1001         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1002         if (goal <= 0) {
1003                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004                         equilibrium = ipv4_dst_ops.gc_thresh;
1005                 goal = entries - equilibrium;
1006                 if (goal > 0) {
1007                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008                         goal = entries - equilibrium;
1009                 }
1010         } else {
1011                 /* We are in a dangerous area. Try to reduce the cache really
1012                  * aggressively.
1013                  */
1014                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                 equilibrium = entries - goal;
1016         }
1017
1018         if (now - last_gc >= ip_rt_gc_min_interval)
1019                 last_gc = now;
1020
1021         if (goal <= 0) {
1022                 equilibrium += goal;
1023                 goto work_done;
1024         }
1025
1026         do {
1027                 int i, k;
1028
1029                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030                         unsigned long tmo = expire;
1031
1032                         k = (k + 1) & rt_hash_mask;
1033                         rthp = &rt_hash_table[k].chain;
1034                         spin_lock_bh(rt_hash_lock_addr(k));
1035                         while ((rth = rcu_dereference_protected(*rthp,
1036                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1037                                 if (!rt_is_expired(rth) &&
1038                                         !rt_may_expire(rth, tmo, expire)) {
1039                                         tmo >>= 1;
1040                                         rthp = &rth->dst.rt_next;
1041                                         continue;
1042                                 }
1043                                 *rthp = rth->dst.rt_next;
1044                                 rt_free(rth);
1045                                 goal--;
1046                         }
1047                         spin_unlock_bh(rt_hash_lock_addr(k));
1048                         if (goal <= 0)
1049                                 break;
1050                 }
1051                 rover = k;
1052
1053                 if (goal <= 0)
1054                         goto work_done;
1055
1056                 /* Goal is not achieved. We stop the process if:
1057
1058                    - expire has been reduced to zero (otherwise expire is halved),
1059                    - the table is not full,
1060                    - we are called from interrupt context.
1061                    The jiffies check is just a fallback/debug loop breaker;
1062                    we will not spin here for a long time in any case.
1063                  */
1064
1065                 RT_CACHE_STAT_INC(gc_goal_miss);
1066
1067                 if (expire == 0)
1068                         break;
1069
1070                 expire >>= 1;
1071 #if RT_CACHE_DEBUG >= 2
1072                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1073                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1074 #endif
1075
1076                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1077                         goto out;
1078         } while (!in_softirq() && time_before_eq(jiffies, now));
1079
1080         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                 goto out;
1082         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (net_ratelimit())
1085                 printk(KERN_WARNING "dst cache overflow\n");
1086         RT_CACHE_STAT_INC(gc_dst_overflow);
1087         return 1;
1088
1089 work_done:
1090         expire += ip_rt_gc_min_interval;
1091         if (expire > ip_rt_gc_timeout ||
1092             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1093             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1094                 expire = ip_rt_gc_timeout;
1095 #if RT_CACHE_DEBUG >= 2
1096         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1097                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1098 #endif
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns the number of entries in a hash chain that have different hash_inputs
1104  */
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
1117 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1118                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1119 {
1120         struct rtable   *rth, *cand;
1121         struct rtable __rcu **rthp, **candp;
1122         unsigned long   now;
1123         u32             min_score;
1124         int             chain_length;
1125         int attempts = !in_softirq();
1126
1127 restart:
1128         chain_length = 0;
1129         min_score = ~(u32)0;
1130         cand = NULL;
1131         candp = NULL;
1132         now = jiffies;
1133
1134         if (!rt_caching(dev_net(rt->dst.dev))) {
1135                 /*
1136                  * If we're not caching, just tell the caller we
1137                  * were successful and don't touch the route.  The
1138                  * caller holds the sole reference to the cache entry, and
1139                  * it will be released when the caller is done with it.
1140                  * If we drop it here, the callers have no way to resolve routes
1141                  * when we're not caching.  Instead, just point *rp at rt, so
1142                  * the caller gets a single use out of the route
1143                  * Note that we do rt_free on this new route entry, so that
1144                  * once its refcount hits zero, we are still able to reap it
1145                  * (Thanks Alexey)
1146                  * Note: To avoid expensive rcu stuff for this uncached dst,
1147                  * we set DST_NOCACHE so that dst_release() can free dst without
1148                  * waiting a grace period.
1149                  */
1150
1151                 rt->dst.flags |= DST_NOCACHE;
1152                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1153                         int err = arp_bind_neighbour(&rt->dst);
1154                         if (err) {
1155                                 if (net_ratelimit())
1156                                         printk(KERN_WARNING
1157                                             "Neighbour table failure & not caching routes.\n");
1158                                 ip_rt_put(rt);
1159                                 return err;
1160                         }
1161                 }
1162
1163                 goto skip_hashing;
1164         }
1165
1166         rthp = &rt_hash_table[hash].chain;
1167
1168         spin_lock_bh(rt_hash_lock_addr(hash));
1169         while ((rth = rcu_dereference_protected(*rthp,
1170                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1171                 if (rt_is_expired(rth)) {
1172                         *rthp = rth->dst.rt_next;
1173                         rt_free(rth);
1174                         continue;
1175                 }
1176                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1177                         /* Put it first */
1178                         *rthp = rth->dst.rt_next;
1179                         /*
1180                          * Since lookup is lockfree, the deletion
1181                          * must be visible to another weakly ordered CPU before
1182                          * the insertion at the start of the hash chain.
1183                          */
1184                         rcu_assign_pointer(rth->dst.rt_next,
1185                                            rt_hash_table[hash].chain);
1186                         /*
1187                          * Since lookup is lockfree, the update writes
1188                          * must be ordered for consistency on SMP.
1189                          */
1190                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1191
1192                         dst_use(&rth->dst, now);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         rt_drop(rt);
1196                         if (rp)
1197                                 *rp = rth;
1198                         else
1199                                 skb_dst_set(skb, &rth->dst);
1200                         return 0;
1201                 }
1202
1203                 if (!atomic_read(&rth->dst.__refcnt)) {
1204                         u32 score = rt_score(rth);
1205
1206                         if (score <= min_score) {
1207                                 cand = rth;
1208                                 candp = rthp;
1209                                 min_score = score;
1210                         }
1211                 }
1212
1213                 chain_length++;
1214
1215                 rthp = &rth->dst.rt_next;
1216         }
1217
1218         if (cand) {
1219                 /* ip_rt_gc_elasticity used to be the average chain length;
1220                  * when exceeded, gc becomes really aggressive.
1221                  *
1222                  * The second limit is less certain. At the moment it allows
1223                  * only 2 entries per bucket. We will see.
1224                  */
1225                 if (chain_length > ip_rt_gc_elasticity) {
1226                         *candp = cand->dst.rt_next;
1227                         rt_free(cand);
1228                 }
1229         } else {
1230                 if (chain_length > rt_chain_length_max &&
1231                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1232                         struct net *net = dev_net(rt->dst.dev);
1233                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1234                         if (!rt_caching(net)) {
1235                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1236                                         rt->dst.dev->name, num);
1237                         }
1238                         rt_emergency_hash_rebuild(net);
1239                         spin_unlock_bh(rt_hash_lock_addr(hash));
1240
1241                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1242                                         ifindex, rt_genid(net));
1243                         goto restart;
1244                 }
1245         }
1246
1247         /* Try to bind the route to arp only if it is an output
1248            route or a unicast forwarding path.
1249          */
1250         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1251                 int err = arp_bind_neighbour(&rt->dst);
1252                 if (err) {
1253                         spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255                         if (err != -ENOBUFS) {
1256                                 rt_drop(rt);
1257                                 return err;
1258                         }
1259
1260                         /* Neighbour tables are full and nothing
1261                            can be released. Try to shrink the route cache;
1262                            it most likely holds some neighbour records.
1263                          */
1264                         if (attempts-- > 0) {
1265                                 int saved_elasticity = ip_rt_gc_elasticity;
1266                                 int saved_int = ip_rt_gc_min_interval;
1267                                 ip_rt_gc_elasticity     = 1;
1268                                 ip_rt_gc_min_interval   = 0;
1269                                 rt_garbage_collect(&ipv4_dst_ops);
1270                                 ip_rt_gc_min_interval   = saved_int;
1271                                 ip_rt_gc_elasticity     = saved_elasticity;
1272                                 goto restart;
1273                         }
1274
1275                         if (net_ratelimit())
1276                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1277                         rt_drop(rt);
1278                         return -ENOBUFS;
1279                 }
1280         }
1281
1282         rt->dst.rt_next = rt_hash_table[hash].chain;
1283
1284 #if RT_CACHE_DEBUG >= 2
1285         if (rt->dst.rt_next) {
1286                 struct rtable *trt;
1287                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1288                        hash, &rt->rt_dst);
1289                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1290                         printk(" . %pI4", &trt->rt_dst);
1291                 printk("\n");
1292         }
1293 #endif
1294         /*
1295          * Since lookup is lockfree, we must make sure
1296          * previous writes to rt are committed to memory
1297          * before making rt visible to other CPUS.
1298          */
1299         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1300
1301         spin_unlock_bh(rt_hash_lock_addr(hash));
1302
1303 skip_hashing:
1304         if (rp)
1305                 *rp = rt;
1306         else
1307                 skb_dst_set(skb, &rt->dst);
1308         return 0;
1309 }
1310
1311 void rt_bind_peer(struct rtable *rt, int create)
1312 {
1313         struct inet_peer *peer;
1314
1315         peer = inet_getpeer_v4(rt->rt_dst, create);
1316
1317         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1318                 inet_putpeer(peer);
1319 }
1320
1321 /*
1322  * Peer allocation may fail only in serious out-of-memory conditions.  However
1323  * we can still generate some output.
1324  * Random ID selection looks a bit dangerous because we have no chance of
1325  * selecting an ID that is unique for a reasonable period of time.
1326  * But a broken packet identifier may be better than no packet at all.
1327  */
1328 static void ip_select_fb_ident(struct iphdr *iph)
1329 {
1330         static DEFINE_SPINLOCK(ip_fb_id_lock);
1331         static u32 ip_fallback_id;
1332         u32 salt;
1333
1334         spin_lock_bh(&ip_fb_id_lock);
1335         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1336         iph->id = htons(salt & 0xFFFF);
1337         ip_fallback_id = salt;
1338         spin_unlock_bh(&ip_fb_id_lock);
1339 }
1340
1341 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1342 {
1343         struct rtable *rt = (struct rtable *) dst;
1344
1345         if (rt) {
1346                 if (rt->peer == NULL)
1347                         rt_bind_peer(rt, 1);
1348
1349                 /* If peer is attached to destination, it is never detached,
1350                    so we need not grab a lock to dereference it.
1351                  */
1352                 if (rt->peer) {
1353                         iph->id = htons(inet_getid(rt->peer, more));
1354                         return;
1355                 }
1356         } else
1357                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1358                        __builtin_return_address(0));
1359
1360         ip_select_fb_ident(iph);
1361 }
1362 EXPORT_SYMBOL(__ip_select_ident);
1363
1364 static void rt_del(unsigned hash, struct rtable *rt)
1365 {
1366         struct rtable __rcu **rthp;
1367         struct rtable *aux;
1368
1369         rthp = &rt_hash_table[hash].chain;
1370         spin_lock_bh(rt_hash_lock_addr(hash));
1371         ip_rt_put(rt);
1372         while ((aux = rcu_dereference_protected(*rthp,
1373                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1374                 if (aux == rt || rt_is_expired(aux)) {
1375                         *rthp = aux->dst.rt_next;
1376                         rt_free(aux);
1377                         continue;
1378                 }
1379                 rthp = &aux->dst.rt_next;
1380         }
1381         spin_unlock_bh(rt_hash_lock_addr(hash));
1382 }
1383
1384 /* called in rcu_read_lock() section */
1385 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1386                     __be32 saddr, struct net_device *dev)
1387 {
1388         int i, k;
1389         struct in_device *in_dev = __in_dev_get_rcu(dev);
1390         struct rtable *rth;
1391         struct rtable __rcu **rthp;
1392         __be32  skeys[2] = { saddr, 0 };
1393         int  ikeys[2] = { dev->ifindex, 0 };
1394         struct netevent_redirect netevent;
1395         struct net *net;
1396
1397         if (!in_dev)
1398                 return;
1399
1400         net = dev_net(dev);
1401         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1402             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1403             ipv4_is_zeronet(new_gw))
1404                 goto reject_redirect;
1405
1406         if (!rt_caching(net))
1407                 goto reject_redirect;
1408
1409         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1410                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1411                         goto reject_redirect;
1412                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1413                         goto reject_redirect;
1414         } else {
1415                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1416                         goto reject_redirect;
1417         }
1418
1419         for (i = 0; i < 2; i++) {
1420                 for (k = 0; k < 2; k++) {
1421                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1422                                                 rt_genid(net));
1423
1424                         rthp = &rt_hash_table[hash].chain;
1425
1426                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1427                                 struct rtable *rt;
1428
1429                                 if (rth->fl.fl4_dst != daddr ||
1430                                     rth->fl.fl4_src != skeys[i] ||
1431                                     rth->fl.oif != ikeys[k] ||
1432                                     rt_is_input_route(rth) ||
1433                                     rt_is_expired(rth) ||
1434                                     !net_eq(dev_net(rth->dst.dev), net)) {
1435                                         rthp = &rth->dst.rt_next;
1436                                         continue;
1437                                 }
1438
1439                                 if (rth->rt_dst != daddr ||
1440                                     rth->rt_src != saddr ||
1441                                     rth->dst.error ||
1442                                     rth->rt_gateway != old_gw ||
1443                                     rth->dst.dev != dev)
1444                                         break;
1445
1446                                 dst_hold(&rth->dst);
1447
1448                                 rt = dst_alloc(&ipv4_dst_ops);
1449                                 if (rt == NULL) {
1450                                         ip_rt_put(rth);
1451                                         return;
1452                                 }
1453
1454                                 /* Copy all the information. */
1455                                 *rt = *rth;
1456                                 rt->dst.__use           = 1;
1457                                 atomic_set(&rt->dst.__refcnt, 1);
1458                                 rt->dst.child           = NULL;
1459                                 if (rt->dst.dev)
1460                                         dev_hold(rt->dst.dev);
1461                                 rt->dst.obsolete        = -1;
1462                                 rt->dst.lastuse = jiffies;
1463                                 rt->dst.path            = &rt->dst;
1464                                 rt->dst.neighbour       = NULL;
1465                                 rt->dst.hh              = NULL;
1466 #ifdef CONFIG_XFRM
1467                                 rt->dst.xfrm            = NULL;
1468 #endif
1469                                 rt->rt_genid            = rt_genid(net);
1470                                 rt->rt_flags            |= RTCF_REDIRECTED;
1471
1472                                 /* Gateway is different ... */
1473                                 rt->rt_gateway          = new_gw;
1474
1475                                 /* Redirect received -> path was valid */
1476                                 dst_confirm(&rth->dst);
1477
1478                                 if (rt->peer)
1479                                         atomic_inc(&rt->peer->refcnt);
1480                                 if (rt->fi)
1481                                         atomic_inc(&rt->fi->fib_clntref);
1482
1483                                 if (arp_bind_neighbour(&rt->dst) ||
1484                                     !(rt->dst.neighbour->nud_state &
1485                                             NUD_VALID)) {
1486                                         if (rt->dst.neighbour)
1487                                                 neigh_event_send(rt->dst.neighbour, NULL);
1488                                         ip_rt_put(rth);
1489                                         rt_drop(rt);
1490                                         goto do_next;
1491                                 }
1492
1493                                 netevent.old = &rth->dst;
1494                                 netevent.new = &rt->dst;
1495                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1496                                                         &netevent);
1497
1498                                 rt_del(hash, rth);
1499                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1500                                         ip_rt_put(rt);
1501                                 goto do_next;
1502                         }
1503                 do_next:
1504                         ;
1505                 }
1506         }
1507         return;
1508
1509 reject_redirect:
1510 #ifdef CONFIG_IP_ROUTE_VERBOSE
1511         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1512                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1513                         "  Advised path = %pI4 -> %pI4\n",
1514                        &old_gw, dev->name, &new_gw,
1515                        &saddr, &daddr);
1516 #endif
1517         ;
1518 }
1519
1520 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1521 {
1522         struct rtable *rt = (struct rtable *)dst;
1523         struct dst_entry *ret = dst;
1524
1525         if (rt) {
1526                 if (dst->obsolete > 0) {
1527                         ip_rt_put(rt);
1528                         ret = NULL;
1529                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1530                            (rt->dst.expires &&
1531                             time_after_eq(jiffies, rt->dst.expires))) {
1532                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1533                                                 rt->fl.oif,
1534                                                 rt_genid(dev_net(dst->dev)));
1535 #if RT_CACHE_DEBUG >= 1
1536                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1537                                 &rt->rt_dst, rt->fl.fl4_tos);
1538 #endif
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 }
1542         }
1543         return ret;
1544 }
1545
1546 /*
1547  * Algorithm:
1548  *      1. The first ip_rt_redirect_number redirects are sent
1549  *         with exponential backoff, then we stop sending them altogether,
1550  *         assuming that the host ignores our redirects.
1551  *      2. If we did not see packets requiring redirects
1552  *         during ip_rt_redirect_silence, we assume that the host has
1553  *         forgotten the redirected route and start sending redirects again.
1554  *
1555  * This algorithm is much cheaper and more intelligent than dumb load limiting
1556  * in icmp.c.
1557  *
1558  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1559  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1560  */
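/*
 * A minimal sketch of the backoff schedule implemented by
 * ip_rt_send_redirect() below; next_redirect_time() is a hypothetical
 * helper for illustration only and is not part of this file.  The k-th
 * redirect (k == dst.rate_tokens) is only sent once
 *
 *      time_after(jiffies, rate_last + (ip_rt_redirect_load << k))
 *
 * holds, so the minimum gap between consecutive redirects doubles each
 * time until ip_rt_redirect_number redirects have gone out; a quiet
 * period of ip_rt_redirect_silence resets rate_tokens to zero.
 *
 *      static unsigned long next_redirect_time(unsigned long rate_last,
 *                                              unsigned long rate_tokens)
 *      {
 *              return rate_last + (ip_rt_redirect_load << rate_tokens);
 *      }
 */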
1561
1562 void ip_rt_send_redirect(struct sk_buff *skb)
1563 {
1564         struct rtable *rt = skb_rtable(skb);
1565         struct in_device *in_dev;
1566         int log_martians;
1567
1568         rcu_read_lock();
1569         in_dev = __in_dev_get_rcu(rt->dst.dev);
1570         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1571                 rcu_read_unlock();
1572                 return;
1573         }
1574         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1575         rcu_read_unlock();
1576
1577         /* No redirected packets during ip_rt_redirect_silence;
1578          * reset the algorithm.
1579          */
1580         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1581                 rt->dst.rate_tokens = 0;
1582
1583         /* Too many ignored redirects; do not send anything.
1584          * Set dst.rate_last to the time of the last seen redirected packet.
1585          */
1586         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1587                 rt->dst.rate_last = jiffies;
1588                 return;
1589         }
1590
1591         /* Check for load limit; set rate_last to the latest sent
1592          * redirect.
1593          */
1594         if (rt->dst.rate_tokens == 0 ||
1595             time_after(jiffies,
1596                        (rt->dst.rate_last +
1597                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1598                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1599                 rt->dst.rate_last = jiffies;
1600                 ++rt->dst.rate_tokens;
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602                 if (log_martians &&
1603                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1604                     net_ratelimit())
1605                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1606                                 &rt->rt_src, rt->rt_iif,
1607                                 &rt->rt_dst, &rt->rt_gateway);
1608 #endif
1609         }
1610 }
1611
1612 static int ip_error(struct sk_buff *skb)
1613 {
1614         struct rtable *rt = skb_rtable(skb);
1615         unsigned long now;
1616         int code;
1617
1618         switch (rt->dst.error) {
1619                 case EINVAL:
1620                 default:
1621                         goto out;
1622                 case EHOSTUNREACH:
1623                         code = ICMP_HOST_UNREACH;
1624                         break;
1625                 case ENETUNREACH:
1626                         code = ICMP_NET_UNREACH;
1627                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1628                                         IPSTATS_MIB_INNOROUTES);
1629                         break;
1630                 case EACCES:
1631                         code = ICMP_PKT_FILTERED;
1632                         break;
1633         }
1634
1635         now = jiffies;
1636         rt->dst.rate_tokens += now - rt->dst.rate_last;
1637         if (rt->dst.rate_tokens > ip_rt_error_burst)
1638                 rt->dst.rate_tokens = ip_rt_error_burst;
1639         rt->dst.rate_last = now;
1640         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1641                 rt->dst.rate_tokens -= ip_rt_error_cost;
1642                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1643         }
1644
1645 out:    kfree_skb(skb);
1646         return 0;
1647 }
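/*
 * A minimal sketch of the token bucket used by ip_error() above:
 * rate_tokens accrues one token per elapsed jiffy, is capped at
 * ip_rt_error_burst, and each ICMP_DEST_UNREACH costs ip_rt_error_cost
 * tokens, so a cached route emits at most one error per
 * ip_rt_error_cost jiffies in steady state, with bursts of up to
 * ip_rt_error_burst / ip_rt_error_cost messages.  The helper below is
 * hypothetical and for illustration only:
 *
 *      static int can_send_icmp_error(struct dst_entry *dst)
 *      {
 *              unsigned long now = jiffies;
 *
 *              dst->rate_tokens += now - dst->rate_last;
 *              if (dst->rate_tokens > ip_rt_error_burst)
 *                      dst->rate_tokens = ip_rt_error_burst;
 *              dst->rate_last = now;
 *              if (dst->rate_tokens < ip_rt_error_cost)
 *                      return 0;
 *              dst->rate_tokens -= ip_rt_error_cost;
 *              return 1;
 *      }
 */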
1648
1649 /*
1650  *      The last two values are not from the RFC but
1651  *      are needed for AMPRnet AX.25 paths.
1652  */
1653
1654 static const unsigned short mtu_plateau[] =
1655 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1656
1657 static inline unsigned short guess_mtu(unsigned short old_mtu)
1658 {
1659         int i;
1660
1661         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1662                 if (old_mtu > mtu_plateau[i])
1663                         return mtu_plateau[i];
1664         return 68;
1665 }
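/*
 * guess_mtu() returns the largest plateau value strictly below the old
 * MTU, falling back to the 68-byte IPv4 minimum.  A few worked examples
 * (derived directly from the table above):
 *
 *      guess_mtu(1500) == 1492
 *      guess_mtu(576)  == 296
 *      guess_mtu(100)  == 68
 */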
1666
1667 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1668                                  unsigned short new_mtu,
1669                                  struct net_device *dev)
1670 {
1671         int i, k;
1672         unsigned short old_mtu = ntohs(iph->tot_len);
1673         struct rtable *rth;
1674         int  ikeys[2] = { dev->ifindex, 0 };
1675         __be32  skeys[2] = { iph->saddr, 0, };
1676         __be32  daddr = iph->daddr;
1677         unsigned short est_mtu = 0;
1678
1679         for (k = 0; k < 2; k++) {
1680                 for (i = 0; i < 2; i++) {
1681                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1682                                                 rt_genid(net));
1683
1684                         rcu_read_lock();
1685                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1686                              rth = rcu_dereference(rth->dst.rt_next)) {
1687                                 unsigned short mtu = new_mtu;
1688
1689                                 if (rth->fl.fl4_dst != daddr ||
1690                                     rth->fl.fl4_src != skeys[i] ||
1691                                     rth->rt_dst != daddr ||
1692                                     rth->rt_src != iph->saddr ||
1693                                     rth->fl.oif != ikeys[k] ||
1694                                     rt_is_input_route(rth) ||
1695                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1696                                     !net_eq(dev_net(rth->dst.dev), net) ||
1697                                     rt_is_expired(rth))
1698                                         continue;
1699
1700                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1701
1702                                         /* BSD 4.2 compatibility hack :-( */
1703                                         if (mtu == 0 &&
1704                                             old_mtu >= dst_mtu(&rth->dst) &&
1705                                             old_mtu >= 68 + (iph->ihl << 2))
1706                                                 old_mtu -= iph->ihl << 2;
1707
1708                                         mtu = guess_mtu(old_mtu);
1709                                 }
1710                                 if (mtu <= dst_mtu(&rth->dst)) {
1711                                         if (mtu < dst_mtu(&rth->dst)) {
1712                                                 dst_confirm(&rth->dst);
1713                                                 if (mtu < ip_rt_min_pmtu) {
1714                                                         u32 lock = dst_metric(&rth->dst,
1715                                                                               RTAX_LOCK);
1716                                                         mtu = ip_rt_min_pmtu;
1717                                                         lock |= (1 << RTAX_MTU);
1718                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1719                                                                        lock);
1720                                                 }
1721                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1722                                                 dst_set_expires(&rth->dst,
1723                                                         ip_rt_mtu_expires);
1724                                         }
1725                                         est_mtu = mtu;
1726                                 }
1727                         }
1728                         rcu_read_unlock();
1729                 }
1730         }
1731         return est_mtu ? : new_mtu;
1732 }
1733
1734 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1735 {
1736         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1737             !(dst_metric_locked(dst, RTAX_MTU))) {
1738                 if (mtu < ip_rt_min_pmtu) {
1739                         u32 lock = dst_metric(dst, RTAX_LOCK);
1740                         mtu = ip_rt_min_pmtu;
1741                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1742                 }
1743                 dst_metric_set(dst, RTAX_MTU, mtu);
1744                 dst_set_expires(dst, ip_rt_mtu_expires);
1745                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1746         }
1747 }
1748
1749 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1750 {
1751         if (rt_is_expired((struct rtable *)dst))
1752                 return NULL;
1753         return dst;
1754 }
1755
1756 static void ipv4_dst_destroy(struct dst_entry *dst)
1757 {
1758         struct rtable *rt = (struct rtable *) dst;
1759         struct inet_peer *peer = rt->peer;
1760
1761         if (rt->fi) {
1762                 fib_info_put(rt->fi);
1763                 rt->fi = NULL;
1764         }
1765         if (peer) {
1766                 rt->peer = NULL;
1767                 inet_putpeer(peer);
1768         }
1769 }
1770
1771
1772 static void ipv4_link_failure(struct sk_buff *skb)
1773 {
1774         struct rtable *rt;
1775
1776         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1777
1778         rt = skb_rtable(skb);
1779         if (rt)
1780                 dst_set_expires(&rt->dst, 0);
1781 }
1782
1783 static int ip_rt_bug(struct sk_buff *skb)
1784 {
1785         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1787                 skb->dev ? skb->dev->name : "?");
1788         kfree_skb(skb);
1789         return 0;
1790 }
1791
1792 /*
1793    We do not cache the source address of the outgoing interface,
1794    because it is used only by the IP RR, TS and SRR options,
1795    so it is out of the fast path.
1796
1797    BTW remember: "addr" is allowed to be unaligned
1798    in IP options!
1799  */
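/*
 * That is why ip_rt_get_source() below copies the address out with
 * memcpy(); a direct "*(__be32 *)addr = src;" (shown here only for
 * contrast) would be an unaligned 32-bit store when addr points into
 * the middle of an IP options block.
 */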
1800
1801 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1802 {
1803         __be32 src;
1804         struct fib_result res;
1805
1806         if (rt_is_output_route(rt))
1807                 src = rt->rt_src;
1808         else {
1809                 rcu_read_lock();
1810                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1811                         src = FIB_RES_PREFSRC(res);
1812                 else
1813                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1814                                         RT_SCOPE_UNIVERSE);
1815                 rcu_read_unlock();
1816         }
1817         memcpy(addr, &src, 4);
1818 }
1819
1820 #ifdef CONFIG_IP_ROUTE_CLASSID
1821 static void set_class_tag(struct rtable *rt, u32 tag)
1822 {
1823         if (!(rt->dst.tclassid & 0xFFFF))
1824                 rt->dst.tclassid |= tag & 0xFFFF;
1825         if (!(rt->dst.tclassid & 0xFFFF0000))
1826                 rt->dst.tclassid |= tag & 0xFFFF0000;
1827 }
1828 #endif
1829
1830 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1831 {
1832         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1833
1834         if (advmss == 0) {
1835                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1836                                ip_rt_min_advmss);
1837                 if (advmss > 65535 - 40)
1838                         advmss = 65535 - 40;
1839         }
1840         return advmss;
1841 }
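/*
 * Worked example for ipv4_default_advmss() above: with no RTAX_ADVMSS
 * metric set, a 1500-byte MTU device advertises an MSS of
 * 1500 - 40 = 1460 (room for the 20-byte IPv4 and 20-byte TCP headers),
 * bounded below by ip_rt_min_advmss and above by 65535 - 40.
 */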
1842
1843 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1844 {
1845         unsigned int mtu = dst->dev->mtu;
1846
1847         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1848                 const struct rtable *rt = (const struct rtable *) dst;
1849
1850                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1851                         mtu = 576;
1852         }
1853
1854         if (mtu > IP_MAX_MTU)
1855                 mtu = IP_MAX_MTU;
1856
1857         return mtu;
1858 }
1859
1860 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1861 {
1862         if (!(rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)) {
1863         no_cow:
1864                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1865                         rt->fi = fi;
1866                         atomic_inc(&fi->fib_clntref);
1867                 }
1868                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1869         } else {
1870                 struct inet_peer *peer;
1871
1872                 if (!rt->peer)
1873                         rt_bind_peer(rt, 1);
1874                 peer = rt->peer;
1875                 if (!peer)
1876                         goto no_cow;
1877                 if (inet_metrics_new(peer))
1878                         memcpy(peer->metrics, fi->fib_metrics,
1879                                sizeof(u32) * RTAX_MAX);
1880                 dst_init_metrics(&rt->dst, peer->metrics, false);
1881         }
1882 }
1883
1884 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1885 {
1886         struct dst_entry *dst = &rt->dst;
1887         struct fib_info *fi = res->fi;
1888
1889         if (fi) {
1890                 if (FIB_RES_GW(*res) &&
1891                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1892                         rt->rt_gateway = FIB_RES_GW(*res);
1893                 rt_init_metrics(rt, fi);
1894 #ifdef CONFIG_IP_ROUTE_CLASSID
1895                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1896 #endif
1897         }
1898
1899         if (dst_mtu(dst) > IP_MAX_MTU)
1900                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1901         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1902                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1903
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905 #ifdef CONFIG_IP_MULTIPLE_TABLES
1906         set_class_tag(rt, fib_rules_tclass(res));
1907 #endif
1908         set_class_tag(rt, itag);
1909 #endif
1910         rt->rt_type = res->type;
1911 }
1912
1913 /* called in rcu_read_lock() section */
1914 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1915                                 u8 tos, struct net_device *dev, int our)
1916 {
1917         unsigned int hash;
1918         struct rtable *rth;
1919         __be32 spec_dst;
1920         struct in_device *in_dev = __in_dev_get_rcu(dev);
1921         u32 itag = 0;
1922         int err;
1923
1924         /* Primary sanity checks. */
1925
1926         if (in_dev == NULL)
1927                 return -EINVAL;
1928
1929         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1930             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1931                 goto e_inval;
1932
1933         if (ipv4_is_zeronet(saddr)) {
1934                 if (!ipv4_is_local_multicast(daddr))
1935                         goto e_inval;
1936                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1937         } else {
1938                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1939                                           &itag, 0);
1940                 if (err < 0)
1941                         goto e_err;
1942         }
1943         rth = dst_alloc(&ipv4_dst_ops);
1944         if (!rth)
1945                 goto e_nobufs;
1946
1947         rth->dst.output = ip_rt_bug;
1948         rth->dst.obsolete = -1;
1949
1950         atomic_set(&rth->dst.__refcnt, 1);
1951         rth->dst.flags= DST_HOST;
1952         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1953                 rth->dst.flags |= DST_NOPOLICY;
1954         rth->fl.fl4_dst = daddr;
1955         rth->rt_dst     = daddr;
1956         rth->fl.fl4_tos = tos;
1957         rth->fl.mark    = skb->mark;
1958         rth->fl.fl4_src = saddr;
1959         rth->rt_src     = saddr;
1960 #ifdef CONFIG_IP_ROUTE_CLASSID
1961         rth->dst.tclassid = itag;
1962 #endif
1963         rth->rt_iif     =
1964         rth->fl.iif     = dev->ifindex;
1965         rth->dst.dev    = init_net.loopback_dev;
1966         dev_hold(rth->dst.dev);
1967         rth->fl.oif     = 0;
1968         rth->rt_gateway = daddr;
1969         rth->rt_spec_dst= spec_dst;
1970         rth->rt_genid   = rt_genid(dev_net(dev));
1971         rth->rt_flags   = RTCF_MULTICAST;
1972         rth->rt_type    = RTN_MULTICAST;
1973         if (our) {
1974                 rth->dst.input= ip_local_deliver;
1975                 rth->rt_flags |= RTCF_LOCAL;
1976         }
1977
1978 #ifdef CONFIG_IP_MROUTE
1979         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1980                 rth->dst.input = ip_mr_input;
1981 #endif
1982         RT_CACHE_STAT_INC(in_slow_mc);
1983
1984         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1985         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1986
1987 e_nobufs:
1988         return -ENOBUFS;
1989 e_inval:
1990         return -EINVAL;
1991 e_err:
1992         return err;
1993 }
1994
1995
1996 static void ip_handle_martian_source(struct net_device *dev,
1997                                      struct in_device *in_dev,
1998                                      struct sk_buff *skb,
1999                                      __be32 daddr,
2000                                      __be32 saddr)
2001 {
2002         RT_CACHE_STAT_INC(in_martian_src);
2003 #ifdef CONFIG_IP_ROUTE_VERBOSE
2004         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2005                 /*
2006                  *      RFC1812 recommendation: if the source is martian,
2007                  *      the only hint is the MAC header.
2008                  */
2009                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2010                         &daddr, &saddr, dev->name);
2011                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2012                         int i;
2013                         const unsigned char *p = skb_mac_header(skb);
2014                         printk(KERN_WARNING "ll header: ");
2015                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2016                                 printk("%02x", *p);
2017                                 if (i < (dev->hard_header_len - 1))
2018                                         printk(":");
2019                         }
2020                         printk("\n");
2021                 }
2022         }
2023 #endif
2024 }
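/*
 * Sample of the rate-limited diagnostic emitted above for a 14-byte
 * Ethernet header (the addresses and bytes are made-up examples):
 *
 *      martian source 192.0.2.1 from 198.51.100.7, on dev eth0
 *      ll header: ff:ff:ff:ff:ff:ff:00:11:22:33:44:55:08:00
 */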
2025
2026 /* called in rcu_read_lock() section */
2027 static int __mkroute_input(struct sk_buff *skb,
2028                            struct fib_result *res,
2029                            struct in_device *in_dev,
2030                            __be32 daddr, __be32 saddr, u32 tos,
2031                            struct rtable **result)
2032 {
2033         struct rtable *rth;
2034         int err;
2035         struct in_device *out_dev;
2036         unsigned int flags = 0;
2037         __be32 spec_dst;
2038         u32 itag;
2039
2040         /* get a working reference to the output device */
2041         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2042         if (out_dev == NULL) {
2043                 if (net_ratelimit())
2044                         printk(KERN_CRIT "Bug in ip_route_input" \
2045                                "_slow(). Please, report\n");
2046                 return -EINVAL;
2047         }
2048
2049
2050         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2051                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2052         if (err < 0) {
2053                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2054                                          saddr);
2055
2056                 goto cleanup;
2057         }
2058
2059         if (err)
2060                 flags |= RTCF_DIRECTSRC;
2061
2062         if (out_dev == in_dev && err &&
2063             (IN_DEV_SHARED_MEDIA(out_dev) ||
2064              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2065                 flags |= RTCF_DOREDIRECT;
2066
2067         if (skb->protocol != htons(ETH_P_IP)) {
2068                 /* Not IP (i.e. ARP). Do not create a route if it is
2069                  * invalid for proxy arp. DNAT routes are always valid.
2070                  *
2071                  * The proxy arp feature has been extended to allow ARP
2072                  * replies back out the same interface, to support
2073                  * Private VLAN switch technologies. See arp.c.
2074                  */
2075                 if (out_dev == in_dev &&
2076                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2077                         err = -EINVAL;
2078                         goto cleanup;
2079                 }
2080         }
2081
2082
2083         rth = dst_alloc(&ipv4_dst_ops);
2084         if (!rth) {
2085                 err = -ENOBUFS;
2086                 goto cleanup;
2087         }
2088
2089         atomic_set(&rth->dst.__refcnt, 1);
2090         rth->dst.flags= DST_HOST;
2091         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2092                 rth->dst.flags |= DST_NOPOLICY;
2093         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2094                 rth->dst.flags |= DST_NOXFRM;
2095         rth->fl.fl4_dst = daddr;
2096         rth->rt_dst     = daddr;
2097         rth->fl.fl4_tos = tos;
2098         rth->fl.mark    = skb->mark;
2099         rth->fl.fl4_src = saddr;
2100         rth->rt_src     = saddr;
2101         rth->rt_gateway = daddr;
2102         rth->rt_iif     =
2103                 rth->fl.iif     = in_dev->dev->ifindex;
2104         rth->dst.dev    = (out_dev)->dev;
2105         dev_hold(rth->dst.dev);
2106         rth->fl.oif     = 0;
2107         rth->rt_spec_dst= spec_dst;
2108
2109         rth->dst.obsolete = -1;
2110         rth->dst.input = ip_forward;
2111         rth->dst.output = ip_output;
2112         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2113
2114         rt_set_nexthop(rth, res, itag);
2115
2116         rth->rt_flags = flags;
2117
2118         *result = rth;
2119         err = 0;
2120  cleanup:
2121         return err;
2122 }
2123
2124 static int ip_mkroute_input(struct sk_buff *skb,
2125                             struct fib_result *res,
2126                             const struct flowi *fl,
2127                             struct in_device *in_dev,
2128                             __be32 daddr, __be32 saddr, u32 tos)
2129 {
2130         struct rtable* rth = NULL;
2131         int err;
2132         unsigned hash;
2133
2134 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2135         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2136                 fib_select_multipath(fl, res);
2137 #endif
2138
2139         /* create a routing cache entry */
2140         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2141         if (err)
2142                 return err;
2143
2144         /* put it into the cache */
2145         hash = rt_hash(daddr, saddr, fl->iif,
2146                        rt_genid(dev_net(rth->dst.dev)));
2147         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2148 }
2149
2150 /*
2151  *      NOTE. We drop all packets that have local source
2152  *      addresses, because every properly looped back packet
2153  *      must already have the correct destination attached by the output routine.
2154  *
2155  *      Such an approach solves two big problems:
2156  *      1. Non-simplex devices are handled properly.
2157  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2158  *      called with rcu_read_lock()
2159  */
2160
2161 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2162                                u8 tos, struct net_device *dev)
2163 {
2164         struct fib_result res;
2165         struct in_device *in_dev = __in_dev_get_rcu(dev);
2166         struct flowi fl = { .fl4_dst    = daddr,
2167                             .fl4_src    = saddr,
2168                             .fl4_tos    = tos,
2169                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2170                             .mark = skb->mark,
2171                             .iif = dev->ifindex };
2172         unsigned        flags = 0;
2173         u32             itag = 0;
2174         struct rtable * rth;
2175         unsigned        hash;
2176         __be32          spec_dst;
2177         int             err = -EINVAL;
2178         struct net    * net = dev_net(dev);
2179
2180         /* IP on this device is disabled. */
2181
2182         if (!in_dev)
2183                 goto out;
2184
2185         /* Check for the most bizarre martians, which may not be detected
2186            by fib_lookup.
2187          */
2188
2189         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2190             ipv4_is_loopback(saddr))
2191                 goto martian_source;
2192
2193         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2194                 goto brd_input;
2195
2196         /* Accept zero addresses only for the limited broadcast;
2197          * I do not even know whether to fix this or not. Waiting for complaints :-)
2198          */
2199         if (ipv4_is_zeronet(saddr))
2200                 goto martian_source;
2201
2202         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2203                 goto martian_destination;
2204
2205         /*
2206          *      Now we are ready to route packet.
2207          */
2208         err = fib_lookup(net, &fl, &res);
2209         if (err != 0) {
2210                 if (!IN_DEV_FORWARD(in_dev))
2211                         goto e_hostunreach;
2212                 goto no_route;
2213         }
2214
2215         RT_CACHE_STAT_INC(in_slow_tot);
2216
2217         if (res.type == RTN_BROADCAST)
2218                 goto brd_input;
2219
2220         if (res.type == RTN_LOCAL) {
2221                 err = fib_validate_source(saddr, daddr, tos,
2222                                           net->loopback_dev->ifindex,
2223                                           dev, &spec_dst, &itag, skb->mark);
2224                 if (err < 0)
2225                         goto martian_source_keep_err;
2226                 if (err)
2227                         flags |= RTCF_DIRECTSRC;
2228                 spec_dst = daddr;
2229                 goto local_input;
2230         }
2231
2232         if (!IN_DEV_FORWARD(in_dev))
2233                 goto e_hostunreach;
2234         if (res.type != RTN_UNICAST)
2235                 goto martian_destination;
2236
2237         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2238 out:    return err;
2239
2240 brd_input:
2241         if (skb->protocol != htons(ETH_P_IP))
2242                 goto e_inval;
2243
2244         if (ipv4_is_zeronet(saddr))
2245                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2246         else {
2247                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2248                                           &itag, skb->mark);
2249                 if (err < 0)
2250                         goto martian_source_keep_err;
2251                 if (err)
2252                         flags |= RTCF_DIRECTSRC;
2253         }
2254         flags |= RTCF_BROADCAST;
2255         res.type = RTN_BROADCAST;
2256         RT_CACHE_STAT_INC(in_brd);
2257
2258 local_input:
2259         rth = dst_alloc(&ipv4_dst_ops);
2260         if (!rth)
2261                 goto e_nobufs;
2262
2263         rth->dst.output= ip_rt_bug;
2264         rth->dst.obsolete = -1;
2265         rth->rt_genid = rt_genid(net);
2266
2267         atomic_set(&rth->dst.__refcnt, 1);
2268         rth->dst.flags= DST_HOST;
2269         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2270                 rth->dst.flags |= DST_NOPOLICY;
2271         rth->fl.fl4_dst = daddr;
2272         rth->rt_dst     = daddr;
2273         rth->fl.fl4_tos = tos;
2274         rth->fl.mark    = skb->mark;
2275         rth->fl.fl4_src = saddr;
2276         rth->rt_src     = saddr;
2277 #ifdef CONFIG_IP_ROUTE_CLASSID
2278         rth->dst.tclassid = itag;
2279 #endif
2280         rth->rt_iif     =
2281         rth->fl.iif     = dev->ifindex;
2282         rth->dst.dev    = net->loopback_dev;
2283         dev_hold(rth->dst.dev);
2284         rth->rt_gateway = daddr;
2285         rth->rt_spec_dst= spec_dst;
2286         rth->dst.input= ip_local_deliver;
2287         rth->rt_flags   = flags|RTCF_LOCAL;
2288         if (res.type == RTN_UNREACHABLE) {
2289                 rth->dst.input= ip_error;
2290                 rth->dst.error= -err;
2291                 rth->rt_flags   &= ~RTCF_LOCAL;
2292         }
2293         rth->rt_type    = res.type;
2294         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2295         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2296         goto out;
2297
2298 no_route:
2299         RT_CACHE_STAT_INC(in_no_route);
2300         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2301         res.type = RTN_UNREACHABLE;
2302         if (err == -ESRCH)
2303                 err = -ENETUNREACH;
2304         goto local_input;
2305
2306         /*
2307          *      Do not cache martian addresses: they should be logged (RFC1812)
2308          */
2309 martian_destination:
2310         RT_CACHE_STAT_INC(in_martian_dst);
2311 #ifdef CONFIG_IP_ROUTE_VERBOSE
2312         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2313                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2314                         &daddr, &saddr, dev->name);
2315 #endif
2316
2317 e_hostunreach:
2318         err = -EHOSTUNREACH;
2319         goto out;
2320
2321 e_inval:
2322         err = -EINVAL;
2323         goto out;
2324
2325 e_nobufs:
2326         err = -ENOBUFS;
2327         goto out;
2328
2329 martian_source:
2330         err = -EINVAL;
2331 martian_source_keep_err:
2332         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2333         goto out;
2334 }
2335
2336 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2337                            u8 tos, struct net_device *dev, bool noref)
2338 {
2339         struct rtable * rth;
2340         unsigned        hash;
2341         int iif = dev->ifindex;
2342         struct net *net;
2343         int res;
2344
2345         net = dev_net(dev);
2346
2347         rcu_read_lock();
2348
2349         if (!rt_caching(net))
2350                 goto skip_cache;
2351
2352         tos &= IPTOS_RT_MASK;
2353         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2354
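        /*
         * The comparison below ORs together the XORs of the cache key
         * fields, so one test against zero matches daddr, saddr, iif and
         * tos at once; keeping rth->fl.oif in the OR additionally
         * requires oif == 0, i.e. an input route.
         */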
2355         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2356              rth = rcu_dereference(rth->dst.rt_next)) {
2357                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2358                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2359                      (rth->fl.iif ^ iif) |
2360                      rth->fl.oif |
2361                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2362                     rth->fl.mark == skb->mark &&
2363                     net_eq(dev_net(rth->dst.dev), net) &&
2364                     !rt_is_expired(rth)) {
2365                         if (noref) {
2366                                 dst_use_noref(&rth->dst, jiffies);
2367                                 skb_dst_set_noref(skb, &rth->dst);
2368                         } else {
2369                                 dst_use(&rth->dst, jiffies);
2370                                 skb_dst_set(skb, &rth->dst);
2371                         }
2372                         RT_CACHE_STAT_INC(in_hit);
2373                         rcu_read_unlock();
2374                         return 0;
2375                 }
2376                 RT_CACHE_STAT_INC(in_hlist_search);
2377         }
2378
2379 skip_cache:
2380         /* Multicast recognition logic has been moved from the route cache to here.
2381            The problem was that too many Ethernet cards have broken/missing
2382            hardware multicast filters :-( As a result, a host on a multicast
2383            network acquires a lot of useless route cache entries, e.g. for
2384            SDR messages from all over the world. Now we try to get rid of them.
2385            Really, provided the software IP multicast filter is organized
2386            reasonably (at least, hashed), it does not result in a slowdown
2387            compared with route cache reject entries.
2388            Note that multicast routers are not affected, because
2389            a route cache entry is created eventually.
2390          */
2391         if (ipv4_is_multicast(daddr)) {
2392                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2393
2394                 if (in_dev) {
2395                         int our = ip_check_mc(in_dev, daddr, saddr,
2396                                               ip_hdr(skb)->protocol);
2397                         if (our
2398 #ifdef CONFIG_IP_MROUTE
2399                                 ||
2400                             (!ipv4_is_local_multicast(daddr) &&
2401                              IN_DEV_MFORWARD(in_dev))
2402 #endif
2403                            ) {
2404                                 int res = ip_route_input_mc(skb, daddr, saddr,
2405                                                             tos, dev, our);
2406                                 rcu_read_unlock();
2407                                 return res;
2408                         }
2409                 }
2410                 rcu_read_unlock();
2411                 return -EINVAL;
2412         }
2413         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2414         rcu_read_unlock();
2415         return res;
2416 }
2417 EXPORT_SYMBOL(ip_route_input_common);
2418
2419 /* called with rcu_read_lock() */
2420 static int __mkroute_output(struct rtable **result,
2421                             struct fib_result *res,
2422                             const struct flowi *fl,
2423                             const struct flowi *oldflp,
2424                             struct net_device *dev_out,
2425                             unsigned flags)
2426 {
2427         struct rtable *rth;
2428         struct in_device *in_dev;
2429         u32 tos = RT_FL_TOS(oldflp);
2430
2431         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2432                 return -EINVAL;
2433
2434         if (ipv4_is_lbcast(fl->fl4_dst))
2435                 res->type = RTN_BROADCAST;
2436         else if (ipv4_is_multicast(fl->fl4_dst))
2437                 res->type = RTN_MULTICAST;
2438         else if (ipv4_is_zeronet(fl->fl4_dst))
2439                 return -EINVAL;
2440
2441         if (dev_out->flags & IFF_LOOPBACK)
2442                 flags |= RTCF_LOCAL;
2443
2444         in_dev = __in_dev_get_rcu(dev_out);
2445         if (!in_dev)
2446                 return -EINVAL;
2447
2448         if (res->type == RTN_BROADCAST) {
2449                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2450                 res->fi = NULL;
2451         } else if (res->type == RTN_MULTICAST) {
2452                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2453                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2454                                  oldflp->proto))
2455                         flags &= ~RTCF_LOCAL;
2456                 /* If a multicast route does not exist, use the
2457                  * default one, but do not gateway in this case.
2458                  * Yes, it is a hack.
2459                  */
2460                 if (res->fi && res->prefixlen < 4)
2461                         res->fi = NULL;
2462         }
2463
2464
2465         rth = dst_alloc(&ipv4_dst_ops);
2466         if (!rth)
2467                 return -ENOBUFS;
2468
2469         atomic_set(&rth->dst.__refcnt, 1);
2470         rth->dst.flags= DST_HOST;
2471         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2472                 rth->dst.flags |= DST_NOXFRM;
2473         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2474                 rth->dst.flags |= DST_NOPOLICY;
2475
2476         rth->fl.fl4_dst = oldflp->fl4_dst;
2477         rth->fl.fl4_tos = tos;
2478         rth->fl.fl4_src = oldflp->fl4_src;
2479         rth->fl.oif     = oldflp->oif;
2480         rth->fl.mark    = oldflp->mark;
2481         rth->rt_dst     = fl->fl4_dst;
2482         rth->rt_src     = fl->fl4_src;
2483         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2484         /* get references to the devices that are to be held by the routing
2485            cache entry */
2486         rth->dst.dev    = dev_out;
2487         dev_hold(dev_out);
2488         rth->rt_gateway = fl->fl4_dst;
2489         rth->rt_spec_dst= fl->fl4_src;
2490
2491         rth->dst.output=ip_output;
2492         rth->dst.obsolete = -1;
2493         rth->rt_genid = rt_genid(dev_net(dev_out));
2494
2495         RT_CACHE_STAT_INC(out_slow_tot);
2496
2497         if (flags & RTCF_LOCAL) {
2498                 rth->dst.input = ip_local_deliver;
2499                 rth->rt_spec_dst = fl->fl4_dst;
2500         }
2501         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2502                 rth->rt_spec_dst = fl->fl4_src;
2503                 if (flags & RTCF_LOCAL &&
2504                     !(dev_out->flags & IFF_LOOPBACK)) {
2505                         rth->dst.output = ip_mc_output;
2506                         RT_CACHE_STAT_INC(out_slow_mc);
2507                 }
2508 #ifdef CONFIG_IP_MROUTE
2509                 if (res->type == RTN_MULTICAST) {
2510                         if (IN_DEV_MFORWARD(in_dev) &&
2511                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2512                                 rth->dst.input = ip_mr_input;
2513                                 rth->dst.output = ip_mc_output;
2514                         }
2515                 }
2516 #endif
2517         }
2518
2519         rt_set_nexthop(rth, res, 0);
2520
2521         rth->rt_flags = flags;
2522         *result = rth;
2523         return 0;
2524 }
2525
2526 /* called with rcu_read_lock() */
2527 static int ip_mkroute_output(struct rtable **rp,
2528                              struct fib_result *res,
2529                              const struct flowi *fl,
2530                              const struct flowi *oldflp,
2531                              struct net_device *dev_out,
2532                              unsigned flags)
2533 {
2534         struct rtable *rth = NULL;
2535         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2536         unsigned hash;
2537         if (err == 0) {
2538                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2539                                rt_genid(dev_net(dev_out)));
2540                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2541         }
2542
2543         return err;
2544 }
2545
2546 /*
2547  * Major route resolver routine.
2548  * called with rcu_read_lock();
2549  */
2550
2551 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2552                                 const struct flowi *oldflp)
2553 {
2554         u32 tos = RT_FL_TOS(oldflp);
2555         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2556                             .fl4_src = oldflp->fl4_src,
2557                             .fl4_tos = tos & IPTOS_RT_MASK,
2558                             .fl4_scope = ((tos & RTO_ONLINK) ?
2559                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2560                             .mark = oldflp->mark,
2561                             .iif = net->loopback_dev->ifindex,
2562                             .oif = oldflp->oif };
2563         struct fib_result res;
2564         unsigned int flags = 0;
2565         struct net_device *dev_out = NULL;
2566         int err;
2567
2568
2569         res.fi          = NULL;
2570 #ifdef CONFIG_IP_MULTIPLE_TABLES
2571         res.r           = NULL;
2572 #endif
2573
2574         if (oldflp->fl4_src) {
2575                 err = -EINVAL;
2576                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2577                     ipv4_is_lbcast(oldflp->fl4_src) ||
2578                     ipv4_is_zeronet(oldflp->fl4_src))
2579                         goto out;
2580
2581                 /* I removed the check for oif == dev_out->oif here.
2582                    It was wrong for two reasons:
2583                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2584                       is assigned to multiple interfaces.
2585                    2. Moreover, we are allowed to send packets with the saddr
2586                       of another iface. --ANK
2587                  */
2588
2589                 if (oldflp->oif == 0 &&
2590                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2591                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2592                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2593                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2594                         if (dev_out == NULL)
2595                                 goto out;
2596
2597                         /* Special hack: the user can direct multicasts
2598                            and limited broadcast via the necessary interface
2599                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2600                            This hack is not just for fun, it allows
2601                            vic, vat and friends to work.
2602                            They bind the socket to loopback, set the ttl to zero
2603                            and expect that it will work.
2604                            From the viewpoint of the routing cache they are broken,
2605                            because we are not allowed to build a multicast path
2606                            with a loopback source addr (look, the routing cache
2607                            cannot know that the ttl is zero, so the packet
2608                            will not leave this host and the route is valid).
2609                            Luckily, this hack is a good workaround.
2610                          */
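                        /*
                         * A minimal sketch of the kind of sender this path
                         * accommodates (hypothetical userspace code, not part
                         * of this file): the socket is bound to an address of
                         * the desired interface and sends to a multicast group
                         * without setting IP_MULTICAST_IF, so the source
                         * address alone selects dev_out here:
                         *
                         *      bind(fd, (struct sockaddr *)&src, sizeof(src));
                         *      sendto(fd, buf, len, 0, (struct sockaddr *)&grp, sizeof(grp));
                         */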
2611
2612                         fl.oif = dev_out->ifindex;
2613                         goto make_route;
2614                 }
2615
2616                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2617                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2618                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2619                                 goto out;
2620                 }
2621         }
2622
2623
2624         if (oldflp->oif) {
2625                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2626                 err = -ENODEV;
2627                 if (dev_out == NULL)
2628                         goto out;
2629
2630                 /* RACE: Check return value of inet_select_addr instead. */
2631                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2632                         err = -ENETUNREACH;
2633                         goto out;
2634                 }
2635                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2636                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2637                         if (!fl.fl4_src)
2638                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2639                                                               RT_SCOPE_LINK);
2640                         goto make_route;
2641                 }
2642                 if (!fl.fl4_src) {
2643                         if (ipv4_is_multicast(oldflp->fl4_dst))
2644                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2645                                                               fl.fl4_scope);
2646                         else if (!oldflp->fl4_dst)
2647                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2648                                                               RT_SCOPE_HOST);
2649                 }
2650         }
2651
2652         if (!fl.fl4_dst) {
2653                 fl.fl4_dst = fl.fl4_src;
2654                 if (!fl.fl4_dst)
2655                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2656                 dev_out = net->loopback_dev;
2657                 fl.oif = net->loopback_dev->ifindex;
2658                 res.type = RTN_LOCAL;
2659                 flags |= RTCF_LOCAL;
2660                 goto make_route;
2661         }
2662
2663         if (fib_lookup(net, &fl, &res)) {
2664                 res.fi = NULL;
2665                 if (oldflp->oif) {
2666                         /* Apparently, the routing tables are wrong. Assume
2667                            that the destination is on-link.
2668
2669                            WHY? DW.
2670                            Because we are allowed to send to an iface
2671                            even if it has NO routes and NO assigned
2672                            addresses. When oif is specified, the routing
2673                            tables are looked up with only one purpose:
2674                            to catch whether the destination is gatewayed rather
2675                            than direct. Moreover, if MSG_DONTROUTE is set,
2676                            we send the packet, ignoring both routing tables
2677                            and ifaddr state. --ANK
2678
2679
2680                            We could do this even if oif is unknown
2681                            (likely IPv6), but we do not.
2682                          */
2683
2684                         if (fl.fl4_src == 0)
2685                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2686                                                               RT_SCOPE_LINK);
2687                         res.type = RTN_UNICAST;
2688                         goto make_route;
2689                 }
2690                 err = -ENETUNREACH;
2691                 goto out;
2692         }
2693
2694         if (res.type == RTN_LOCAL) {
2695                 if (!fl.fl4_src) {
2696                         if (res.fi->fib_prefsrc)
2697                                 fl.fl4_src = res.fi->fib_prefsrc;
2698                         else
2699                                 fl.fl4_src = fl.fl4_dst;
2700                 }
2701                 dev_out = net->loopback_dev;
2702                 fl.oif = dev_out->ifindex;
2703                 res.fi = NULL;
2704                 flags |= RTCF_LOCAL;
2705                 goto make_route;
2706         }
2707
2708 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2709         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2710                 fib_select_multipath(&fl, &res);
2711         else
2712 #endif
2713         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2714                 fib_select_default(net, &fl, &res);
2715
2716         if (!fl.fl4_src)
2717                 fl.fl4_src = FIB_RES_PREFSRC(res);
2718
2719         dev_out = FIB_RES_DEV(res);
2720         fl.oif = dev_out->ifindex;
2721
2722
2723 make_route:
2724         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2725
2726 out:    return err;
2727 }
2728
2729 int __ip_route_output_key(struct net *net, struct rtable **rp,
2730                           const struct flowi *flp)
2731 {
2732         unsigned int hash;
2733         int res;
2734         struct rtable *rth;
2735
2736         if (!rt_caching(net))
2737                 goto slow_output;
2738
2739         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2740
2741         rcu_read_lock_bh();
2742         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2743                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2744                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2745                     rth->fl.fl4_src == flp->fl4_src &&
2746                     rt_is_output_route(rth) &&
2747                     rth->fl.oif == flp->oif &&
2748                     rth->fl.mark == flp->mark &&
2749                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2750                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2751                     net_eq(dev_net(rth->dst.dev), net) &&
2752                     !rt_is_expired(rth)) {
2753                         dst_use(&rth->dst, jiffies);
2754                         RT_CACHE_STAT_INC(out_hit);
2755                         rcu_read_unlock_bh();
2756                         *rp = rth;
2757                         return 0;
2758                 }
2759                 RT_CACHE_STAT_INC(out_hlist_search);
2760         }
2761         rcu_read_unlock_bh();
2762
2763 slow_output:
2764         rcu_read_lock();
2765         res = ip_route_output_slow(net, rp, flp);
2766         rcu_read_unlock();
2767         return res;
2768 }
2769 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2770
2771 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2772 {
2773         return NULL;
2774 }
2775
2776 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2777 {
2778 }
2779
2780 static struct dst_ops ipv4_dst_blackhole_ops = {
2781         .family                 =       AF_INET,
2782         .protocol               =       cpu_to_be16(ETH_P_IP),
2783         .destroy                =       ipv4_dst_destroy,
2784         .check                  =       ipv4_blackhole_dst_check,
2785         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2786 };
2787
2788
2789 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2790 {
2791         struct rtable *ort = *rp;
2792         struct rtable *rt = (struct rtable *)
2793                 dst_alloc(&ipv4_dst_blackhole_ops);
2794
2795         if (rt) {
2796                 struct dst_entry *new = &rt->dst;
2797
2798                 atomic_set(&new->__refcnt, 1);
2799                 new->__use = 1;
2800                 new->input = dst_discard;
2801                 new->output = dst_discard;
2802                 dst_copy_metrics(new, &ort->dst);
2803
2804                 new->dev = ort->dst.dev;
2805                 if (new->dev)
2806                         dev_hold(new->dev);
2807
2808                 rt->fl = ort->fl;
2809
2810                 rt->rt_genid = rt_genid(net);
2811                 rt->rt_flags = ort->rt_flags;
2812                 rt->rt_type = ort->rt_type;
2813                 rt->rt_dst = ort->rt_dst;
2814                 rt->rt_src = ort->rt_src;
2815                 rt->rt_iif = ort->rt_iif;
2816                 rt->rt_gateway = ort->rt_gateway;
2817                 rt->rt_spec_dst = ort->rt_spec_dst;
2818                 rt->peer = ort->peer;
2819                 if (rt->peer)
2820                         atomic_inc(&rt->peer->refcnt);
2821                 rt->fi = ort->fi;
2822                 if (rt->fi)
2823                         atomic_inc(&rt->fi->fib_clntref);
2824
2825                 dst_free(new);
2826         }
2827
2828         dst_release(&(*rp)->dst);
2829         *rp = rt;
2830         return rt ? 0 : -ENOMEM;
2831 }
2832
2833 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2834                          struct sock *sk, int flags)
2835 {
2836         int err;
2837
2838         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2839                 return err;
2840
2841         if (flp->proto) {
2842                 if (!flp->fl4_src)
2843                         flp->fl4_src = (*rp)->rt_src;
2844                 if (!flp->fl4_dst)
2845                         flp->fl4_dst = (*rp)->rt_dst;
2846                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2847                                     flags ? XFRM_LOOKUP_WAIT : 0);
2848                 if (err == -EREMOTE)
2849                         err = ipv4_dst_blackhole(net, rp, flp);
2850
2851                 return err;
2852         }
2853
2854         return 0;
2855 }
2856 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2857
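/*
 * Thin wrapper for callers with no socket context and no need to wait on
 * xfrm resolution.  A minimal, hypothetical usage sketch (caller, variable
 * names and error handling are illustrative only):
 *
 *	struct flowi fl = { .fl4_dst = daddr, .oif = ifindex };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(net, &rt, &fl);
 *
 *	if (!err) {
 *		... transmit using rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */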
2858 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2859 {
2860         return ip_route_output_flow(net, rp, flp, NULL, 0);
2861 }
2862 EXPORT_SYMBOL(ip_route_output_key);
2863
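/*
 * Encode one routing cache entry as an RTM_NEWROUTE netlink message:
 * rtmsg header, RTA_* attributes (addresses, oif, metrics, mark, ...) and
 * the cache info (id, timestamps, expiry, error).  For forwarded multicast
 * input routes the answer may instead come from ipmr_get_route().
 */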
2864 static int rt_fill_info(struct net *net,
2865                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2866                         int nowait, unsigned int flags)
2867 {
2868         struct rtable *rt = skb_rtable(skb);
2869         struct rtmsg *r;
2870         struct nlmsghdr *nlh;
2871         long expires;
2872         u32 id = 0, ts = 0, tsage = 0, error;
2873
2874         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2875         if (nlh == NULL)
2876                 return -EMSGSIZE;
2877
2878         r = nlmsg_data(nlh);
2879         r->rtm_family    = AF_INET;
2880         r->rtm_dst_len  = 32;
2881         r->rtm_src_len  = 0;
2882         r->rtm_tos      = rt->fl.fl4_tos;
2883         r->rtm_table    = RT_TABLE_MAIN;
2884         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2885         r->rtm_type     = rt->rt_type;
2886         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2887         r->rtm_protocol = RTPROT_UNSPEC;
2888         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2889         if (rt->rt_flags & RTCF_NOTIFY)
2890                 r->rtm_flags |= RTM_F_NOTIFY;
2891
2892         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2893
2894         if (rt->fl.fl4_src) {
2895                 r->rtm_src_len = 32;
2896                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2897         }
2898         if (rt->dst.dev)
2899                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2900 #ifdef CONFIG_IP_ROUTE_CLASSID
2901         if (rt->dst.tclassid)
2902                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2903 #endif
2904         if (rt_is_input_route(rt))
2905                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2906         else if (rt->rt_src != rt->fl.fl4_src)
2907                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2908
2909         if (rt->rt_dst != rt->rt_gateway)
2910                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2911
2912         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2913                 goto nla_put_failure;
2914
2915         if (rt->fl.mark)
2916                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2917
2918         error = rt->dst.error;
2919         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2920         if (rt->peer) {
2921                 inet_peer_refcheck(rt->peer);
2922                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2923                 if (rt->peer->tcp_ts_stamp) {
2924                         ts = rt->peer->tcp_ts;
2925                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2926                 }
2927         }
2928
2929         if (rt_is_input_route(rt)) {
2930 #ifdef CONFIG_IP_MROUTE
2931                 __be32 dst = rt->rt_dst;
2932
2933                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2934                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2935                         int err = ipmr_get_route(net, skb, r, nowait);
2936                         if (err <= 0) {
2937                                 if (!nowait) {
2938                                         if (err == 0)
2939                                                 return 0;
2940                                         goto nla_put_failure;
2941                                 } else {
2942                                         if (err == -EMSGSIZE)
2943                                                 goto nla_put_failure;
2944                                         error = err;
2945                                 }
2946                         }
2947                 } else
2948 #endif
2949                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2950         }
2951
2952         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2953                                expires, error) < 0)
2954                 goto nla_put_failure;
2955
2956         return nlmsg_end(skb, nlh);
2957
2958 nla_put_failure:
2959         nlmsg_cancel(skb, nlh);
2960         return -EMSGSIZE;
2961 }
2962
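/*
 * RTM_GETROUTE handler: parse the request, then either simulate an input
 * lookup on a dummy skb (when RTA_IIF is supplied) or perform a normal
 * output lookup, and unicast the resulting route back to the requester
 * via rt_fill_info().
 */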
2963 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2964 {
2965         struct net *net = sock_net(in_skb->sk);
2966         struct rtmsg *rtm;
2967         struct nlattr *tb[RTA_MAX+1];
2968         struct rtable *rt = NULL;
2969         __be32 dst = 0;
2970         __be32 src = 0;
2971         u32 iif;
2972         int err;
2973         int mark;
2974         struct sk_buff *skb;
2975
2976         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2977         if (err < 0)
2978                 goto errout;
2979
2980         rtm = nlmsg_data(nlh);
2981
2982         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2983         if (skb == NULL) {
2984                 err = -ENOBUFS;
2985                 goto errout;
2986         }
2987
2988         /* Reserve room for dummy headers; this skb can pass
2989            through a good chunk of the routing engine.
2990          */
2991         skb_reset_mac_header(skb);
2992         skb_reset_network_header(skb);
2993
2994         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2995         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2996         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2997
2998         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2999         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3000         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3001         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3002
3003         if (iif) {
3004                 struct net_device *dev;
3005
3006                 dev = __dev_get_by_index(net, iif);
3007                 if (dev == NULL) {
3008                         err = -ENODEV;
3009                         goto errout_free;
3010                 }
3011
3012                 skb->protocol   = htons(ETH_P_IP);
3013                 skb->dev        = dev;
3014                 skb->mark       = mark;
3015                 local_bh_disable();
3016                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3017                 local_bh_enable();
3018
3019                 rt = skb_rtable(skb);
3020                 if (err == 0 && rt->dst.error)
3021                         err = -rt->dst.error;
3022         } else {
3023                 struct flowi fl = {
3024                         .fl4_dst = dst,
3025                         .fl4_src = src,
3026                         .fl4_tos = rtm->rtm_tos,
3027                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3028                         .mark = mark,
3029                 };
3030                 err = ip_route_output_key(net, &rt, &fl);
3031         }
3032
3033         if (err)
3034                 goto errout_free;
3035
3036         skb_dst_set(skb, &rt->dst);
3037         if (rtm->rtm_flags & RTM_F_NOTIFY)
3038                 rt->rt_flags |= RTCF_NOTIFY;
3039
3040         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3041                            RTM_NEWROUTE, 0, 0);
3042         if (err <= 0)
3043                 goto errout_free;
3044
3045         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3046 errout:
3047         return err;
3048
3049 errout_free:
3050         kfree_skb(skb);
3051         goto errout;
3052 }
3053
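/*
 * Netlink dump callback: walk every chain of the route cache hash table
 * under rcu_read_lock_bh(), emitting one RTM_NEWROUTE per live entry and
 * using cb->args[] to resume where the previous dump left off.
 */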
3054 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3055 {
3056         struct rtable *rt;
3057         int h, s_h;
3058         int idx, s_idx;
3059         struct net *net;
3060
3061         net = sock_net(skb->sk);
3062
3063         s_h = cb->args[0];
3064         if (s_h < 0)
3065                 s_h = 0;
3066         s_idx = idx = cb->args[1];
3067         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3068                 if (!rt_hash_table[h].chain)
3069                         continue;
3070                 rcu_read_lock_bh();
3071                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3072                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3073                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3074                                 continue;
3075                         if (rt_is_expired(rt))
3076                                 continue;
3077                         skb_dst_set_noref(skb, &rt->dst);
3078                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3079                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3080                                          1, NLM_F_MULTI) <= 0) {
3081                                 skb_dst_drop(skb);
3082                                 rcu_read_unlock_bh();
3083                                 goto done;
3084                         }
3085                         skb_dst_drop(skb);
3086                 }
3087                 rcu_read_unlock_bh();
3088         }
3089
3090 done:
3091         cb->args[0] = h;
3092         cb->args[1] = idx;
3093         return skb->len;
3094 }
3095
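/* Multicast configuration changed on a device: flush the namespace's cache. */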
3096 void ip_rt_multicast_event(struct in_device *in_dev)
3097 {
3098         rt_cache_flush(dev_net(in_dev->dev), 0);
3099 }
3100
3101 #ifdef CONFIG_SYSCTL
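/*
 * Write-only handler behind /proc/sys/net/ipv4/route/flush: the integer
 * written is interpreted as a flush delay and handed to rt_cache_flush()
 * for the namespace stashed in ->extra1.  Reads are rejected with -EINVAL.
 */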
3102 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3103                                         void __user *buffer,
3104                                         size_t *lenp, loff_t *ppos)
3105 {
3106         if (write) {
3107                 int flush_delay;
3108                 ctl_table ctl;
3109                 struct net *net;
3110
3111                 memcpy(&ctl, __ctl, sizeof(ctl));
3112                 ctl.data = &flush_delay;
3113                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3114
3115                 net = (struct net *)__ctl->extra1;
3116                 rt_cache_flush(net, flush_delay);
3117                 return 0;
3118         }
3119
3120         return -EINVAL;
3121 }
3122
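/*
 * Global tunables exported under /proc/sys/net/ipv4/route: route cache
 * garbage collection, ICMP redirect/error rate limiting and PMTU bounds.
 */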
3123 static ctl_table ipv4_route_table[] = {
3124         {
3125                 .procname       = "gc_thresh",
3126                 .data           = &ipv4_dst_ops.gc_thresh,
3127                 .maxlen         = sizeof(int),
3128                 .mode           = 0644,
3129                 .proc_handler   = proc_dointvec,
3130         },
3131         {
3132                 .procname       = "max_size",
3133                 .data           = &ip_rt_max_size,
3134                 .maxlen         = sizeof(int),
3135                 .mode           = 0644,
3136                 .proc_handler   = proc_dointvec,
3137         },
3138         {
3139                 /*  Deprecated. Use gc_min_interval_ms */
3140
3141                 .procname       = "gc_min_interval",
3142                 .data           = &ip_rt_gc_min_interval,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec_jiffies,
3146         },
3147         {
3148                 .procname       = "gc_min_interval_ms",
3149                 .data           = &ip_rt_gc_min_interval,
3150                 .maxlen         = sizeof(int),
3151                 .mode           = 0644,
3152                 .proc_handler   = proc_dointvec_ms_jiffies,
3153         },
3154         {
3155                 .procname       = "gc_timeout",
3156                 .data           = &ip_rt_gc_timeout,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = proc_dointvec_jiffies,
3160         },
3161         {
3162                 .procname       = "gc_interval",
3163                 .data           = &ip_rt_gc_interval,
3164                 .maxlen         = sizeof(int),
3165                 .mode           = 0644,
3166                 .proc_handler   = proc_dointvec_jiffies,
3167         },
3168         {
3169                 .procname       = "redirect_load",
3170                 .data           = &ip_rt_redirect_load,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec,
3174         },
3175         {
3176                 .procname       = "redirect_number",
3177                 .data           = &ip_rt_redirect_number,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec,
3181         },
3182         {
3183                 .procname       = "redirect_silence",
3184                 .data           = &ip_rt_redirect_silence,
3185                 .maxlen         = sizeof(int),
3186                 .mode           = 0644,
3187                 .proc_handler   = proc_dointvec,
3188         },
3189         {
3190                 .procname       = "error_cost",
3191                 .data           = &ip_rt_error_cost,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec,
3195         },
3196         {
3197                 .procname       = "error_burst",
3198                 .data           = &ip_rt_error_burst,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec,
3202         },
3203         {
3204                 .procname       = "gc_elasticity",
3205                 .data           = &ip_rt_gc_elasticity,
3206                 .maxlen         = sizeof(int),
3207                 .mode           = 0644,
3208                 .proc_handler   = proc_dointvec,
3209         },
3210         {
3211                 .procname       = "mtu_expires",
3212                 .data           = &ip_rt_mtu_expires,
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0644,
3215                 .proc_handler   = proc_dointvec_jiffies,
3216         },
3217         {
3218                 .procname       = "min_pmtu",
3219                 .data           = &ip_rt_min_pmtu,
3220                 .maxlen         = sizeof(int),
3221                 .mode           = 0644,
3222                 .proc_handler   = proc_dointvec,
3223         },
3224         {
3225                 .procname       = "min_adv_mss",
3226                 .data           = &ip_rt_min_advmss,
3227                 .maxlen         = sizeof(int),
3228                 .mode           = 0644,
3229                 .proc_handler   = proc_dointvec,
3230         },
3231         { }
3232 };
3233
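/*
 * Skeleton registered early by ip_static_sysctl_init() (see the bottom of
 * this file): "route" attaches the table above, while "neigh" is an empty
 * placeholder directory so that later registrations have a parent to hang
 * under.
 */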
3234 static struct ctl_table empty[1];
3235
3236 static struct ctl_table ipv4_skeleton[] =
3237 {
3238         { .procname = "route", 
3239           .mode = 0555, .child = ipv4_route_table},
3240         { .procname = "neigh", 
3241           .mode = 0555, .child = empty},
3242         { }
3243 };
3244
3245 static __net_initdata struct ctl_path ipv4_path[] = {
3246         { .procname = "net", },
3247         { .procname = "ipv4", },
3248         { },
3249 };
3250
3251 static struct ctl_table ipv4_route_flush_table[] = {
3252         {
3253                 .procname       = "flush",
3254                 .maxlen         = sizeof(int),
3255                 .mode           = 0200,
3256                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3257         },
3258         { },
3259 };
3260
3261 static __net_initdata struct ctl_path ipv4_route_path[] = {
3262         { .procname = "net", },
3263         { .procname = "ipv4", },
3264         { .procname = "route", },
3265         { },
3266 };
3267
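/*
 * Per-namespace registration of the "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table so that ->extra1 can point at
 * the right struct net.
 */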
3268 static __net_init int sysctl_route_net_init(struct net *net)
3269 {
3270         struct ctl_table *tbl;
3271
3272         tbl = ipv4_route_flush_table;
3273         if (!net_eq(net, &init_net)) {
3274                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3275                 if (tbl == NULL)
3276                         goto err_dup;
3277         }
3278         tbl[0].extra1 = net;
3279
3280         net->ipv4.route_hdr =
3281                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3282         if (net->ipv4.route_hdr == NULL)
3283                 goto err_reg;
3284         return 0;
3285
3286 err_reg:
3287         if (tbl != ipv4_route_flush_table)
3288                 kfree(tbl);
3289 err_dup:
3290         return -ENOMEM;
3291 }
3292
3293 static __net_exit void sysctl_route_net_exit(struct net *net)
3294 {
3295         struct ctl_table *tbl;
3296
3297         tbl = net->ipv4.route_hdr->ctl_table_arg;
3298         unregister_net_sysctl_table(net->ipv4.route_hdr);
3299         BUG_ON(tbl == ipv4_route_flush_table);
3300         kfree(tbl);
3301 }
3302
3303 static __net_initdata struct pernet_operations sysctl_route_ops = {
3304         .init = sysctl_route_net_init,
3305         .exit = sysctl_route_net_exit,
3306 };
3307 #endif
3308
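/*
 * Each namespace starts with a random route-cache generation id; entries
 * whose rt_genid no longer matches are treated as expired, so bumping the
 * id invalidates the cache wholesale.
 */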
3309 static __net_init int rt_genid_init(struct net *net)
3310 {
3311         get_random_bytes(&net->ipv4.rt_genid,
3312                          sizeof(net->ipv4.rt_genid));
3313         return 0;
3314 }
3315
3316 static __net_initdata struct pernet_operations rt_genid_ops = {
3317         .init = rt_genid_init,
3318 };
3319
3320
3321 #ifdef CONFIG_IP_ROUTE_CLASSID
3322 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3323 #endif /* CONFIG_IP_ROUTE_CLASSID */
3324
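/*
 * "rhash_entries=" on the kernel command line overrides the automatically
 * sized route cache hash table set up in ip_rt_init().
 */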
3325 static __initdata unsigned long rhash_entries;
3326 static int __init set_rhash_entries(char *str)
3327 {
3328         if (!str)
3329                 return 0;
3330         rhash_entries = simple_strtoul(str, &str, 0);
3331         return 1;
3332 }
3333 __setup("rhash_entries=", set_rhash_entries);
3334
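/*
 * Late-boot initialisation: slab cache and dst counters, the route cache
 * hash table (sized from available memory unless rhash_entries= was given),
 * proc files, the deferrable cache-expiry worker, the RTM_GETROUTE handler
 * and the per-namespace sysctl/genid subsystems.
 */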
3335 int __init ip_rt_init(void)
3336 {
3337         int rc = 0;
3338
3339 #ifdef CONFIG_IP_ROUTE_CLASSID
3340         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3341         if (!ip_rt_acct)
3342                 panic("IP: failed to allocate ip_rt_acct\n");
3343 #endif
3344
3345         ipv4_dst_ops.kmem_cachep =
3346                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3347                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3348
3349         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3350
3351         if (dst_entries_init(&ipv4_dst_ops) < 0)
3352                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3353
3354         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3355                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3356
3357         rt_hash_table = (struct rt_hash_bucket *)
3358                 alloc_large_system_hash("IP route cache",
3359                                         sizeof(struct rt_hash_bucket),
3360                                         rhash_entries,
3361                                         (totalram_pages >= 128 * 1024) ?
3362                                         15 : 17,
3363                                         0,
3364                                         &rt_hash_log,
3365                                         &rt_hash_mask,
3366                                         rhash_entries ? 0 : 512 * 1024);
3367         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3368         rt_hash_lock_init();
3369
3370         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3371         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3372
3373         devinet_init();
3374         ip_fib_init();
3375
3376         /* All the timers started at system startup tend to
3377            synchronize. Perturb the initial run a bit.
3378          */
3379         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3380         expires_ljiffies = jiffies;
3381         schedule_delayed_work(&expires_work,
3382                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3383
3384         if (ip_rt_proc_init())
3385                 printk(KERN_ERR "Unable to create route proc files\n");
3386 #ifdef CONFIG_XFRM
3387         xfrm_init();
3388         xfrm4_init(ip_rt_max_size);
3389 #endif
3390         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3391
3392 #ifdef CONFIG_SYSCTL
3393         register_pernet_subsys(&sysctl_route_ops);
3394 #endif
3395         register_pernet_subsys(&rt_genid_ops);
3396         return rc;
3397 }
3398
3399 #ifdef CONFIG_SYSCTL
3400 /*
3401  * We really need to sanitize the damn ipv4 init order, then all
3402  * this nonsense will go away.
3403  */
3404 void __init ip_static_sysctl_init(void)
3405 {
3406         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3407 }
3408 #endif