ipv4: Inline neigh binding.
[linux-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112
113 #define RT_FL_TOS(oldflp4) \
114     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops);
147
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149                             int how)
150 {
151 }
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         struct rtable *rt = (struct rtable *) dst;
156         struct inet_peer *peer;
157         u32 *p = NULL;
158
159         if (!rt->peer)
160                 rt_bind_peer(rt, rt->rt_dst, 1);
161
162         peer = rt->peer;
163         if (peer) {
164                 u32 *old_p = __DST_METRICS_PTR(old);
165                 unsigned long prev, new;
166
167                 p = peer->metrics;
168                 if (inet_metrics_new(peer))
169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170
171                 new = (unsigned long) p;
172                 prev = cmpxchg(&dst->_metrics, old, new);
173
174                 if (prev != old) {
175                         p = __DST_METRICS_PTR(prev);
176                         if (prev & DST_METRICS_READ_ONLY)
177                                 p = NULL;
178                 } else {
179                         if (rt->fi) {
180                                 fib_info_put(rt->fi);
181                                 rt->fi = NULL;
182                         }
183                 }
184         }
185         return p;
186 }
187
188 static struct dst_ops ipv4_dst_ops = {
189         .family =               AF_INET,
190         .protocol =             cpu_to_be16(ETH_P_IP),
191         .gc =                   rt_garbage_collect,
192         .check =                ipv4_dst_check,
193         .default_advmss =       ipv4_default_advmss,
194         .default_mtu =          ipv4_default_mtu,
195         .cow_metrics =          ipv4_cow_metrics,
196         .destroy =              ipv4_dst_destroy,
197         .ifdown =               ipv4_dst_ifdown,
198         .negative_advice =      ipv4_negative_advice,
199         .link_failure =         ipv4_link_failure,
200         .update_pmtu =          ip_rt_update_pmtu,
201         .local_out =            __ip_local_out,
202 };
203
204 #define ECN_OR_COST(class)      TC_PRIO_##class
205
206 const __u8 ip_tos2prio[16] = {
207         TC_PRIO_BESTEFFORT,
208         ECN_OR_COST(BESTEFFORT),
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(BESTEFFORT),
211         TC_PRIO_BULK,
212         ECN_OR_COST(BULK),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_INTERACTIVE,
216         ECN_OR_COST(INTERACTIVE),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE_BULK,
220         ECN_OR_COST(INTERACTIVE_BULK),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK)
223 };
224
225
226 /*
227  * Route cache.
228  */
229
230 /* The locking scheme is rather straight forward:
231  *
232  * 1) Read-Copy Update protects the buckets of the central route hash.
233  * 2) Only writers remove entries, and they hold the lock
234  *    as they look at rtable reference counts.
235  * 3) Only readers acquire references to rtable entries,
236  *    they do so with atomic increments and with the
237  *    lock held.
238  */
239
240 struct rt_hash_bucket {
241         struct rtable __rcu     *chain;
242 };
243
244 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245         defined(CONFIG_PROVE_LOCKING)
246 /*
247  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
248  * The size of this table is a power of two and depends on the number of CPUS.
249  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
250  */
251 #ifdef CONFIG_LOCKDEP
252 # define RT_HASH_LOCK_SZ        256
253 #else
254 # if NR_CPUS >= 32
255 #  define RT_HASH_LOCK_SZ       4096
256 # elif NR_CPUS >= 16
257 #  define RT_HASH_LOCK_SZ       2048
258 # elif NR_CPUS >= 8
259 #  define RT_HASH_LOCK_SZ       1024
260 # elif NR_CPUS >= 4
261 #  define RT_HASH_LOCK_SZ       512
262 # else
263 #  define RT_HASH_LOCK_SZ       256
264 # endif
265 #endif
266
267 static spinlock_t       *rt_hash_locks;
268 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
269
270 static __init void rt_hash_lock_init(void)
271 {
272         int i;
273
274         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275                         GFP_KERNEL);
276         if (!rt_hash_locks)
277                 panic("IP: failed to allocate rt_hash_locks\n");
278
279         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280                 spin_lock_init(&rt_hash_locks[i]);
281 }
282 #else
283 # define rt_hash_lock_addr(slot) NULL
284
285 static inline void rt_hash_lock_init(void)
286 {
287 }
288 #endif
289
290 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
291 static unsigned                 rt_hash_mask __read_mostly;
292 static unsigned int             rt_hash_log  __read_mostly;
293
294 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
295 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
296
297 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
298                                    int genid)
299 {
300         return jhash_3words((__force u32)daddr, (__force u32)saddr,
301                             idx, genid)
302                 & rt_hash_mask;
303 }
304
305 static inline int rt_genid(struct net *net)
306 {
307         return atomic_read(&net->ipv4.rt_genid);
308 }
309
310 #ifdef CONFIG_PROC_FS
311 struct rt_cache_iter_state {
312         struct seq_net_private p;
313         int bucket;
314         int genid;
315 };
316
317 static struct rtable *rt_cache_get_first(struct seq_file *seq)
318 {
319         struct rt_cache_iter_state *st = seq->private;
320         struct rtable *r = NULL;
321
322         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
323                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
324                         continue;
325                 rcu_read_lock_bh();
326                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
327                 while (r) {
328                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
329                             r->rt_genid == st->genid)
330                                 return r;
331                         r = rcu_dereference_bh(r->dst.rt_next);
332                 }
333                 rcu_read_unlock_bh();
334         }
335         return r;
336 }
337
338 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
339                                           struct rtable *r)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342
343         r = rcu_dereference_bh(r->dst.rt_next);
344         while (!r) {
345                 rcu_read_unlock_bh();
346                 do {
347                         if (--st->bucket < 0)
348                                 return NULL;
349                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
350                 rcu_read_lock_bh();
351                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
352         }
353         return r;
354 }
355
356 static struct rtable *rt_cache_get_next(struct seq_file *seq,
357                                         struct rtable *r)
358 {
359         struct rt_cache_iter_state *st = seq->private;
360         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
361                 if (dev_net(r->dst.dev) != seq_file_net(seq))
362                         continue;
363                 if (r->rt_genid == st->genid)
364                         break;
365         }
366         return r;
367 }
368
369 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
370 {
371         struct rtable *r = rt_cache_get_first(seq);
372
373         if (r)
374                 while (pos && (r = rt_cache_get_next(seq, r)))
375                         --pos;
376         return pos ? NULL : r;
377 }
378
379 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380 {
381         struct rt_cache_iter_state *st = seq->private;
382         if (*pos)
383                 return rt_cache_get_idx(seq, *pos - 1);
384         st->genid = rt_genid(seq_file_net(seq));
385         return SEQ_START_TOKEN;
386 }
387
388 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389 {
390         struct rtable *r;
391
392         if (v == SEQ_START_TOKEN)
393                 r = rt_cache_get_first(seq);
394         else
395                 r = rt_cache_get_next(seq, v);
396         ++*pos;
397         return r;
398 }
399
400 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401 {
402         if (v && v != SEQ_START_TOKEN)
403                 rcu_read_unlock_bh();
404 }
405
406 static int rt_cache_seq_show(struct seq_file *seq, void *v)
407 {
408         if (v == SEQ_START_TOKEN)
409                 seq_printf(seq, "%-127s\n",
410                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412                            "HHUptod\tSpecDst");
413         else {
414                 struct rtable *r = v;
415                 int len;
416
417                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
419                         r->dst.dev ? r->dst.dev->name : "*",
420                         (__force u32)r->rt_dst,
421                         (__force u32)r->rt_gateway,
422                         r->rt_flags, atomic_read(&r->dst.__refcnt),
423                         r->dst.__use, 0, (__force u32)r->rt_src,
424                         dst_metric_advmss(&r->dst) + 40,
425                         dst_metric(&r->dst, RTAX_WINDOW),
426                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427                               dst_metric(&r->dst, RTAX_RTTVAR)),
428                         r->rt_key_tos,
429                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430                         r->dst.hh ? (r->dst.hh->hh_output ==
431                                        dev_queue_xmit) : 0,
432                         r->rt_spec_dst, &len);
433
434                 seq_printf(seq, "%*s\n", 127 - len, "");
435         }
436         return 0;
437 }
438
439 static const struct seq_operations rt_cache_seq_ops = {
440         .start  = rt_cache_seq_start,
441         .next   = rt_cache_seq_next,
442         .stop   = rt_cache_seq_stop,
443         .show   = rt_cache_seq_show,
444 };
445
446 static int rt_cache_seq_open(struct inode *inode, struct file *file)
447 {
448         return seq_open_net(inode, file, &rt_cache_seq_ops,
449                         sizeof(struct rt_cache_iter_state));
450 }
451
452 static const struct file_operations rt_cache_seq_fops = {
453         .owner   = THIS_MODULE,
454         .open    = rt_cache_seq_open,
455         .read    = seq_read,
456         .llseek  = seq_lseek,
457         .release = seq_release_net,
458 };
459
460
461 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462 {
463         int cpu;
464
465         if (*pos == 0)
466                 return SEQ_START_TOKEN;
467
468         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
469                 if (!cpu_possible(cpu))
470                         continue;
471                 *pos = cpu+1;
472                 return &per_cpu(rt_cache_stat, cpu);
473         }
474         return NULL;
475 }
476
477 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478 {
479         int cpu;
480
481         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488
489 }
490
491 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492 {
493
494 }
495
496 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497 {
498         struct rt_cache_stat *st = v;
499
500         if (v == SEQ_START_TOKEN) {
501                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
502                 return 0;
503         }
504
505         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
506                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
507                    dst_entries_get_slow(&ipv4_dst_ops),
508                    st->in_hit,
509                    st->in_slow_tot,
510                    st->in_slow_mc,
511                    st->in_no_route,
512                    st->in_brd,
513                    st->in_martian_dst,
514                    st->in_martian_src,
515
516                    st->out_hit,
517                    st->out_slow_tot,
518                    st->out_slow_mc,
519
520                    st->gc_total,
521                    st->gc_ignored,
522                    st->gc_goal_miss,
523                    st->gc_dst_overflow,
524                    st->in_hlist_search,
525                    st->out_hlist_search
526                 );
527         return 0;
528 }
529
530 static const struct seq_operations rt_cpu_seq_ops = {
531         .start  = rt_cpu_seq_start,
532         .next   = rt_cpu_seq_next,
533         .stop   = rt_cpu_seq_stop,
534         .show   = rt_cpu_seq_show,
535 };
536
537
538 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539 {
540         return seq_open(file, &rt_cpu_seq_ops);
541 }
542
543 static const struct file_operations rt_cpu_seq_fops = {
544         .owner   = THIS_MODULE,
545         .open    = rt_cpu_seq_open,
546         .read    = seq_read,
547         .llseek  = seq_lseek,
548         .release = seq_release,
549 };
550
551 #ifdef CONFIG_IP_ROUTE_CLASSID
552 static int rt_acct_proc_show(struct seq_file *m, void *v)
553 {
554         struct ip_rt_acct *dst, *src;
555         unsigned int i, j;
556
557         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558         if (!dst)
559                 return -ENOMEM;
560
561         for_each_possible_cpu(i) {
562                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563                 for (j = 0; j < 256; j++) {
564                         dst[j].o_bytes   += src[j].o_bytes;
565                         dst[j].o_packets += src[j].o_packets;
566                         dst[j].i_bytes   += src[j].i_bytes;
567                         dst[j].i_packets += src[j].i_packets;
568                 }
569         }
570
571         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572         kfree(dst);
573         return 0;
574 }
575
576 static int rt_acct_proc_open(struct inode *inode, struct file *file)
577 {
578         return single_open(file, rt_acct_proc_show, NULL);
579 }
580
581 static const struct file_operations rt_acct_proc_fops = {
582         .owner          = THIS_MODULE,
583         .open           = rt_acct_proc_open,
584         .read           = seq_read,
585         .llseek         = seq_lseek,
586         .release        = single_release,
587 };
588 #endif
589
590 static int __net_init ip_rt_do_proc_init(struct net *net)
591 {
592         struct proc_dir_entry *pde;
593
594         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595                         &rt_cache_seq_fops);
596         if (!pde)
597                 goto err1;
598
599         pde = proc_create("rt_cache", S_IRUGO,
600                           net->proc_net_stat, &rt_cpu_seq_fops);
601         if (!pde)
602                 goto err2;
603
604 #ifdef CONFIG_IP_ROUTE_CLASSID
605         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
606         if (!pde)
607                 goto err3;
608 #endif
609         return 0;
610
611 #ifdef CONFIG_IP_ROUTE_CLASSID
612 err3:
613         remove_proc_entry("rt_cache", net->proc_net_stat);
614 #endif
615 err2:
616         remove_proc_entry("rt_cache", net->proc_net);
617 err1:
618         return -ENOMEM;
619 }
620
621 static void __net_exit ip_rt_do_proc_exit(struct net *net)
622 {
623         remove_proc_entry("rt_cache", net->proc_net_stat);
624         remove_proc_entry("rt_cache", net->proc_net);
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626         remove_proc_entry("rt_acct", net->proc_net);
627 #endif
628 }
629
630 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
631         .init = ip_rt_do_proc_init,
632         .exit = ip_rt_do_proc_exit,
633 };
634
635 static int __init ip_rt_proc_init(void)
636 {
637         return register_pernet_subsys(&ip_rt_proc_ops);
638 }
639
640 #else
641 static inline int ip_rt_proc_init(void)
642 {
643         return 0;
644 }
645 #endif /* CONFIG_PROC_FS */
646
647 static inline void rt_free(struct rtable *rt)
648 {
649         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
650 }
651
652 static inline void rt_drop(struct rtable *rt)
653 {
654         ip_rt_put(rt);
655         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 }
657
658 static inline int rt_fast_clean(struct rtable *rth)
659 {
660         /* Kill broadcast/multicast entries very aggresively, if they
661            collide in hash table with more useful entries */
662         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
663                 rt_is_input_route(rth) && rth->dst.rt_next;
664 }
665
666 static inline int rt_valuable(struct rtable *rth)
667 {
668         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
669                 (rth->peer && rth->peer->pmtu_expires);
670 }
671
672 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673 {
674         unsigned long age;
675         int ret = 0;
676
677         if (atomic_read(&rth->dst.__refcnt))
678                 goto out;
679
680         age = jiffies - rth->dst.lastuse;
681         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682             (age <= tmo2 && rt_valuable(rth)))
683                 goto out;
684         ret = 1;
685 out:    return ret;
686 }
687
688 /* Bits of score are:
689  * 31: very valuable
690  * 30: not quite useless
691  * 29..0: usage counter
692  */
693 static inline u32 rt_score(struct rtable *rt)
694 {
695         u32 score = jiffies - rt->dst.lastuse;
696
697         score = ~score & ~(3<<30);
698
699         if (rt_valuable(rt))
700                 score |= (1<<31);
701
702         if (rt_is_output_route(rt) ||
703             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704                 score |= (1<<30);
705
706         return score;
707 }
708
709 static inline bool rt_caching(const struct net *net)
710 {
711         return net->ipv4.current_rt_cache_rebuild_count <=
712                 net->ipv4.sysctl_rt_cache_rebuild_count;
713 }
714
715 static inline bool compare_hash_inputs(const struct rtable *rt1,
716                                        const struct rtable *rt2)
717 {
718         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720                 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
721 }
722
723 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
724 {
725         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727                 (rt1->rt_mark ^ rt2->rt_mark) |
728                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
729                 (rt1->rt_oif ^ rt2->rt_oif) |
730                 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
731 }
732
733 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734 {
735         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
736 }
737
738 static inline int rt_is_expired(struct rtable *rth)
739 {
740         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
741 }
742
743 /*
744  * Perform a full scan of hash table and free all entries.
745  * Can be called by a softirq or a process.
746  * In the later case, we want to be reschedule if necessary
747  */
748 static void rt_do_flush(struct net *net, int process_context)
749 {
750         unsigned int i;
751         struct rtable *rth, *next;
752
753         for (i = 0; i <= rt_hash_mask; i++) {
754                 struct rtable __rcu **pprev;
755                 struct rtable *list;
756
757                 if (process_context && need_resched())
758                         cond_resched();
759                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
760                 if (!rth)
761                         continue;
762
763                 spin_lock_bh(rt_hash_lock_addr(i));
764
765                 list = NULL;
766                 pprev = &rt_hash_table[i].chain;
767                 rth = rcu_dereference_protected(*pprev,
768                         lockdep_is_held(rt_hash_lock_addr(i)));
769
770                 while (rth) {
771                         next = rcu_dereference_protected(rth->dst.rt_next,
772                                 lockdep_is_held(rt_hash_lock_addr(i)));
773
774                         if (!net ||
775                             net_eq(dev_net(rth->dst.dev), net)) {
776                                 rcu_assign_pointer(*pprev, next);
777                                 rcu_assign_pointer(rth->dst.rt_next, list);
778                                 list = rth;
779                         } else {
780                                 pprev = &rth->dst.rt_next;
781                         }
782                         rth = next;
783                 }
784
785                 spin_unlock_bh(rt_hash_lock_addr(i));
786
787                 for (; list; list = next) {
788                         next = rcu_dereference_protected(list->dst.rt_next, 1);
789                         rt_free(list);
790                 }
791         }
792 }
793
794 /*
795  * While freeing expired entries, we compute average chain length
796  * and standard deviation, using fixed-point arithmetic.
797  * This to have an estimation of rt_chain_length_max
798  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
799  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
800  */
801
802 #define FRACT_BITS 3
803 #define ONE (1UL << FRACT_BITS)
804
805 /*
806  * Given a hash chain and an item in this hash chain,
807  * find if a previous entry has the same hash_inputs
808  * (but differs on tos, mark or oif)
809  * Returns 0 if an alias is found.
810  * Returns ONE if rth has no alias before itself.
811  */
812 static int has_noalias(const struct rtable *head, const struct rtable *rth)
813 {
814         const struct rtable *aux = head;
815
816         while (aux != rth) {
817                 if (compare_hash_inputs(aux, rth))
818                         return 0;
819                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
820         }
821         return ONE;
822 }
823
824 /*
825  * Perturbation of rt_genid by a small quantity [1..256]
826  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
827  * many times (2^24) without giving recent rt_genid.
828  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
829  */
830 static void rt_cache_invalidate(struct net *net)
831 {
832         unsigned char shuffle;
833
834         get_random_bytes(&shuffle, sizeof(shuffle));
835         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
836 }
837
838 /*
839  * delay < 0  : invalidate cache (fast : entries will be deleted later)
840  * delay >= 0 : invalidate & flush cache (can be long)
841  */
842 void rt_cache_flush(struct net *net, int delay)
843 {
844         rt_cache_invalidate(net);
845         if (delay >= 0)
846                 rt_do_flush(net, !in_softirq());
847 }
848
849 /* Flush previous cache invalidated entries from the cache */
850 void rt_cache_flush_batch(struct net *net)
851 {
852         rt_do_flush(net, !in_softirq());
853 }
854
855 static void rt_emergency_hash_rebuild(struct net *net)
856 {
857         if (net_ratelimit())
858                 printk(KERN_WARNING "Route hash chain too long!\n");
859         rt_cache_invalidate(net);
860 }
861
862 /*
863    Short description of GC goals.
864
865    We want to build algorithm, which will keep routing cache
866    at some equilibrium point, when number of aged off entries
867    is kept approximately equal to newly generated ones.
868
869    Current expiration strength is variable "expire".
870    We try to adjust it dynamically, so that if networking
871    is idle expires is large enough to keep enough of warm entries,
872    and when load increases it reduces to limit cache size.
873  */
874
875 static int rt_garbage_collect(struct dst_ops *ops)
876 {
877         static unsigned long expire = RT_GC_TIMEOUT;
878         static unsigned long last_gc;
879         static int rover;
880         static int equilibrium;
881         struct rtable *rth;
882         struct rtable __rcu **rthp;
883         unsigned long now = jiffies;
884         int goal;
885         int entries = dst_entries_get_fast(&ipv4_dst_ops);
886
887         /*
888          * Garbage collection is pretty expensive,
889          * do not make it too frequently.
890          */
891
892         RT_CACHE_STAT_INC(gc_total);
893
894         if (now - last_gc < ip_rt_gc_min_interval &&
895             entries < ip_rt_max_size) {
896                 RT_CACHE_STAT_INC(gc_ignored);
897                 goto out;
898         }
899
900         entries = dst_entries_get_slow(&ipv4_dst_ops);
901         /* Calculate number of entries, which we want to expire now. */
902         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
903         if (goal <= 0) {
904                 if (equilibrium < ipv4_dst_ops.gc_thresh)
905                         equilibrium = ipv4_dst_ops.gc_thresh;
906                 goal = entries - equilibrium;
907                 if (goal > 0) {
908                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
909                         goal = entries - equilibrium;
910                 }
911         } else {
912                 /* We are in dangerous area. Try to reduce cache really
913                  * aggressively.
914                  */
915                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
916                 equilibrium = entries - goal;
917         }
918
919         if (now - last_gc >= ip_rt_gc_min_interval)
920                 last_gc = now;
921
922         if (goal <= 0) {
923                 equilibrium += goal;
924                 goto work_done;
925         }
926
927         do {
928                 int i, k;
929
930                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
931                         unsigned long tmo = expire;
932
933                         k = (k + 1) & rt_hash_mask;
934                         rthp = &rt_hash_table[k].chain;
935                         spin_lock_bh(rt_hash_lock_addr(k));
936                         while ((rth = rcu_dereference_protected(*rthp,
937                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
938                                 if (!rt_is_expired(rth) &&
939                                         !rt_may_expire(rth, tmo, expire)) {
940                                         tmo >>= 1;
941                                         rthp = &rth->dst.rt_next;
942                                         continue;
943                                 }
944                                 *rthp = rth->dst.rt_next;
945                                 rt_free(rth);
946                                 goal--;
947                         }
948                         spin_unlock_bh(rt_hash_lock_addr(k));
949                         if (goal <= 0)
950                                 break;
951                 }
952                 rover = k;
953
954                 if (goal <= 0)
955                         goto work_done;
956
957                 /* Goal is not achieved. We stop process if:
958
959                    - if expire reduced to zero. Otherwise, expire is halfed.
960                    - if table is not full.
961                    - if we are called from interrupt.
962                    - jiffies check is just fallback/debug loop breaker.
963                      We will not spin here for long time in any case.
964                  */
965
966                 RT_CACHE_STAT_INC(gc_goal_miss);
967
968                 if (expire == 0)
969                         break;
970
971                 expire >>= 1;
972
973                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
974                         goto out;
975         } while (!in_softirq() && time_before_eq(jiffies, now));
976
977         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978                 goto out;
979         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
980                 goto out;
981         if (net_ratelimit())
982                 printk(KERN_WARNING "dst cache overflow\n");
983         RT_CACHE_STAT_INC(gc_dst_overflow);
984         return 1;
985
986 work_done:
987         expire += ip_rt_gc_min_interval;
988         if (expire > ip_rt_gc_timeout ||
989             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
990             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
991                 expire = ip_rt_gc_timeout;
992 out:    return 0;
993 }
994
995 /*
996  * Returns number of entries in a hash chain that have different hash_inputs
997  */
998 static int slow_chain_length(const struct rtable *head)
999 {
1000         int length = 0;
1001         const struct rtable *rth = head;
1002
1003         while (rth) {
1004                 length += has_noalias(head, rth);
1005                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1006         }
1007         return length >> FRACT_BITS;
1008 }
1009
1010 static int rt_bind_neighbour(struct rtable *rt)
1011 {
1012         static const __be32 inaddr_any = 0;
1013         struct net_device *dev = rt->dst.dev;
1014         struct neigh_table *tbl = &arp_tbl;
1015         const __be32 *nexthop;
1016         struct neighbour *n;
1017
1018 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1019         if (dev->type == ARPHRD_ATM)
1020                 tbl = clip_tbl_hook;
1021 #endif
1022         nexthop = &rt->rt_gateway;
1023         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1024                 nexthop = &inaddr_any;
1025         n = ipv4_neigh_lookup(tbl, dev, nexthop);
1026         if (IS_ERR(n))
1027                 return PTR_ERR(n);
1028         rt->dst.neighbour = n;
1029
1030         return 0;
1031 }
1032
1033 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1034                                      struct sk_buff *skb, int ifindex)
1035 {
1036         struct rtable   *rth, *cand;
1037         struct rtable __rcu **rthp, **candp;
1038         unsigned long   now;
1039         u32             min_score;
1040         int             chain_length;
1041         int attempts = !in_softirq();
1042
1043 restart:
1044         chain_length = 0;
1045         min_score = ~(u32)0;
1046         cand = NULL;
1047         candp = NULL;
1048         now = jiffies;
1049
1050         if (!rt_caching(dev_net(rt->dst.dev))) {
1051                 /*
1052                  * If we're not caching, just tell the caller we
1053                  * were successful and don't touch the route.  The
1054                  * caller hold the sole reference to the cache entry, and
1055                  * it will be released when the caller is done with it.
1056                  * If we drop it here, the callers have no way to resolve routes
1057                  * when we're not caching.  Instead, just point *rp at rt, so
1058                  * the caller gets a single use out of the route
1059                  * Note that we do rt_free on this new route entry, so that
1060                  * once its refcount hits zero, we are still able to reap it
1061                  * (Thanks Alexey)
1062                  * Note: To avoid expensive rcu stuff for this uncached dst,
1063                  * we set DST_NOCACHE so that dst_release() can free dst without
1064                  * waiting a grace period.
1065                  */
1066
1067                 rt->dst.flags |= DST_NOCACHE;
1068                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1069                         int err = rt_bind_neighbour(rt);
1070                         if (err) {
1071                                 if (net_ratelimit())
1072                                         printk(KERN_WARNING
1073                                             "Neighbour table failure & not caching routes.\n");
1074                                 ip_rt_put(rt);
1075                                 return ERR_PTR(err);
1076                         }
1077                 }
1078
1079                 goto skip_hashing;
1080         }
1081
1082         rthp = &rt_hash_table[hash].chain;
1083
1084         spin_lock_bh(rt_hash_lock_addr(hash));
1085         while ((rth = rcu_dereference_protected(*rthp,
1086                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1087                 if (rt_is_expired(rth)) {
1088                         *rthp = rth->dst.rt_next;
1089                         rt_free(rth);
1090                         continue;
1091                 }
1092                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1093                         /* Put it first */
1094                         *rthp = rth->dst.rt_next;
1095                         /*
1096                          * Since lookup is lockfree, the deletion
1097                          * must be visible to another weakly ordered CPU before
1098                          * the insertion at the start of the hash chain.
1099                          */
1100                         rcu_assign_pointer(rth->dst.rt_next,
1101                                            rt_hash_table[hash].chain);
1102                         /*
1103                          * Since lookup is lockfree, the update writes
1104                          * must be ordered for consistency on SMP.
1105                          */
1106                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1107
1108                         dst_use(&rth->dst, now);
1109                         spin_unlock_bh(rt_hash_lock_addr(hash));
1110
1111                         rt_drop(rt);
1112                         if (skb)
1113                                 skb_dst_set(skb, &rth->dst);
1114                         return rth;
1115                 }
1116
1117                 if (!atomic_read(&rth->dst.__refcnt)) {
1118                         u32 score = rt_score(rth);
1119
1120                         if (score <= min_score) {
1121                                 cand = rth;
1122                                 candp = rthp;
1123                                 min_score = score;
1124                         }
1125                 }
1126
1127                 chain_length++;
1128
1129                 rthp = &rth->dst.rt_next;
1130         }
1131
1132         if (cand) {
1133                 /* ip_rt_gc_elasticity used to be average length of chain
1134                  * length, when exceeded gc becomes really aggressive.
1135                  *
1136                  * The second limit is less certain. At the moment it allows
1137                  * only 2 entries per bucket. We will see.
1138                  */
1139                 if (chain_length > ip_rt_gc_elasticity) {
1140                         *candp = cand->dst.rt_next;
1141                         rt_free(cand);
1142                 }
1143         } else {
1144                 if (chain_length > rt_chain_length_max &&
1145                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1146                         struct net *net = dev_net(rt->dst.dev);
1147                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1148                         if (!rt_caching(net)) {
1149                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1150                                         rt->dst.dev->name, num);
1151                         }
1152                         rt_emergency_hash_rebuild(net);
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1156                                         ifindex, rt_genid(net));
1157                         goto restart;
1158                 }
1159         }
1160
1161         /* Try to bind route to arp only if it is output
1162            route or unicast forwarding path.
1163          */
1164         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1165                 int err = rt_bind_neighbour(rt);
1166                 if (err) {
1167                         spin_unlock_bh(rt_hash_lock_addr(hash));
1168
1169                         if (err != -ENOBUFS) {
1170                                 rt_drop(rt);
1171                                 return ERR_PTR(err);
1172                         }
1173
1174                         /* Neighbour tables are full and nothing
1175                            can be released. Try to shrink route cache,
1176                            it is most likely it holds some neighbour records.
1177                          */
1178                         if (attempts-- > 0) {
1179                                 int saved_elasticity = ip_rt_gc_elasticity;
1180                                 int saved_int = ip_rt_gc_min_interval;
1181                                 ip_rt_gc_elasticity     = 1;
1182                                 ip_rt_gc_min_interval   = 0;
1183                                 rt_garbage_collect(&ipv4_dst_ops);
1184                                 ip_rt_gc_min_interval   = saved_int;
1185                                 ip_rt_gc_elasticity     = saved_elasticity;
1186                                 goto restart;
1187                         }
1188
1189                         if (net_ratelimit())
1190                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1191                         rt_drop(rt);
1192                         return ERR_PTR(-ENOBUFS);
1193                 }
1194         }
1195
1196         rt->dst.rt_next = rt_hash_table[hash].chain;
1197
1198         /*
1199          * Since lookup is lockfree, we must make sure
1200          * previous writes to rt are committed to memory
1201          * before making rt visible to other CPUS.
1202          */
1203         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1204
1205         spin_unlock_bh(rt_hash_lock_addr(hash));
1206
1207 skip_hashing:
1208         if (skb)
1209                 skb_dst_set(skb, &rt->dst);
1210         return rt;
1211 }
1212
1213 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1214
1215 static u32 rt_peer_genid(void)
1216 {
1217         return atomic_read(&__rt_peer_genid);
1218 }
1219
1220 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1221 {
1222         struct inet_peer *peer;
1223
1224         peer = inet_getpeer_v4(daddr, create);
1225
1226         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1227                 inet_putpeer(peer);
1228         else
1229                 rt->rt_peer_genid = rt_peer_genid();
1230 }
1231
1232 /*
1233  * Peer allocation may fail only in serious out-of-memory conditions.  However
1234  * we still can generate some output.
1235  * Random ID selection looks a bit dangerous because we have no chances to
1236  * select ID being unique in a reasonable period of time.
1237  * But broken packet identifier may be better than no packet at all.
1238  */
1239 static void ip_select_fb_ident(struct iphdr *iph)
1240 {
1241         static DEFINE_SPINLOCK(ip_fb_id_lock);
1242         static u32 ip_fallback_id;
1243         u32 salt;
1244
1245         spin_lock_bh(&ip_fb_id_lock);
1246         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1247         iph->id = htons(salt & 0xFFFF);
1248         ip_fallback_id = salt;
1249         spin_unlock_bh(&ip_fb_id_lock);
1250 }
1251
1252 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1253 {
1254         struct rtable *rt = (struct rtable *) dst;
1255
1256         if (rt) {
1257                 if (rt->peer == NULL)
1258                         rt_bind_peer(rt, rt->rt_dst, 1);
1259
1260                 /* If peer is attached to destination, it is never detached,
1261                    so that we need not to grab a lock to dereference it.
1262                  */
1263                 if (rt->peer) {
1264                         iph->id = htons(inet_getid(rt->peer, more));
1265                         return;
1266                 }
1267         } else
1268                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1269                        __builtin_return_address(0));
1270
1271         ip_select_fb_ident(iph);
1272 }
1273 EXPORT_SYMBOL(__ip_select_ident);
1274
1275 static void rt_del(unsigned hash, struct rtable *rt)
1276 {
1277         struct rtable __rcu **rthp;
1278         struct rtable *aux;
1279
1280         rthp = &rt_hash_table[hash].chain;
1281         spin_lock_bh(rt_hash_lock_addr(hash));
1282         ip_rt_put(rt);
1283         while ((aux = rcu_dereference_protected(*rthp,
1284                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1285                 if (aux == rt || rt_is_expired(aux)) {
1286                         *rthp = aux->dst.rt_next;
1287                         rt_free(aux);
1288                         continue;
1289                 }
1290                 rthp = &aux->dst.rt_next;
1291         }
1292         spin_unlock_bh(rt_hash_lock_addr(hash));
1293 }
1294
1295 /* called in rcu_read_lock() section */
1296 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1297                     __be32 saddr, struct net_device *dev)
1298 {
1299         struct in_device *in_dev = __in_dev_get_rcu(dev);
1300         struct inet_peer *peer;
1301         struct net *net;
1302
1303         if (!in_dev)
1304                 return;
1305
1306         net = dev_net(dev);
1307         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1308             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1309             ipv4_is_zeronet(new_gw))
1310                 goto reject_redirect;
1311
1312         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1313                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1314                         goto reject_redirect;
1315                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1316                         goto reject_redirect;
1317         } else {
1318                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1319                         goto reject_redirect;
1320         }
1321
1322         peer = inet_getpeer_v4(daddr, 1);
1323         if (peer) {
1324                 peer->redirect_learned.a4 = new_gw;
1325
1326                 inet_putpeer(peer);
1327
1328                 atomic_inc(&__rt_peer_genid);
1329         }
1330         return;
1331
1332 reject_redirect:
1333 #ifdef CONFIG_IP_ROUTE_VERBOSE
1334         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1335                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1336                         "  Advised path = %pI4 -> %pI4\n",
1337                        &old_gw, dev->name, &new_gw,
1338                        &saddr, &daddr);
1339 #endif
1340         ;
1341 }
1342
1343 static bool peer_pmtu_expired(struct inet_peer *peer)
1344 {
1345         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1346
1347         return orig &&
1348                time_after_eq(jiffies, orig) &&
1349                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1350 }
1351
1352 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1353 {
1354         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1355
1356         return orig &&
1357                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1358 }
1359
1360 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1361 {
1362         struct rtable *rt = (struct rtable *)dst;
1363         struct dst_entry *ret = dst;
1364
1365         if (rt) {
1366                 if (dst->obsolete > 0) {
1367                         ip_rt_put(rt);
1368                         ret = NULL;
1369                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1370                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1371                                                 rt->rt_oif,
1372                                                 rt_genid(dev_net(dst->dev)));
1373                         rt_del(hash, rt);
1374                         ret = NULL;
1375                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1376                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1377                 }
1378         }
1379         return ret;
1380 }
1381
1382 /*
1383  * Algorithm:
1384  *      1. The first ip_rt_redirect_number redirects are sent
1385  *         with exponential backoff, then we stop sending them at all,
1386  *         assuming that the host ignores our redirects.
1387  *      2. If we did not see packets requiring redirects
1388  *         during ip_rt_redirect_silence, we assume that the host
1389  *         forgot redirected route and start to send redirects again.
1390  *
1391  * This algorithm is much cheaper and more intelligent than dumb load limiting
1392  * in icmp.c.
1393  *
1394  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1395  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1396  */
1397
1398 void ip_rt_send_redirect(struct sk_buff *skb)
1399 {
1400         struct rtable *rt = skb_rtable(skb);
1401         struct in_device *in_dev;
1402         struct inet_peer *peer;
1403         int log_martians;
1404
1405         rcu_read_lock();
1406         in_dev = __in_dev_get_rcu(rt->dst.dev);
1407         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1408                 rcu_read_unlock();
1409                 return;
1410         }
1411         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1412         rcu_read_unlock();
1413
1414         if (!rt->peer)
1415                 rt_bind_peer(rt, rt->rt_dst, 1);
1416         peer = rt->peer;
1417         if (!peer) {
1418                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1419                 return;
1420         }
1421
1422         /* No redirected packets during ip_rt_redirect_silence;
1423          * reset the algorithm.
1424          */
1425         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1426                 peer->rate_tokens = 0;
1427
1428         /* Too many ignored redirects; do not send anything
1429          * set dst.rate_last to the last seen redirected packet.
1430          */
1431         if (peer->rate_tokens >= ip_rt_redirect_number) {
1432                 peer->rate_last = jiffies;
1433                 return;
1434         }
1435
1436         /* Check for load limit; set rate_last to the latest sent
1437          * redirect.
1438          */
1439         if (peer->rate_tokens == 0 ||
1440             time_after(jiffies,
1441                        (peer->rate_last +
1442                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1443                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1444                 peer->rate_last = jiffies;
1445                 ++peer->rate_tokens;
1446 #ifdef CONFIG_IP_ROUTE_VERBOSE
1447                 if (log_martians &&
1448                     peer->rate_tokens == ip_rt_redirect_number &&
1449                     net_ratelimit())
1450                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1451                                &ip_hdr(skb)->saddr, rt->rt_iif,
1452                                 &rt->rt_dst, &rt->rt_gateway);
1453 #endif
1454         }
1455 }
1456
1457 static int ip_error(struct sk_buff *skb)
1458 {
1459         struct rtable *rt = skb_rtable(skb);
1460         struct inet_peer *peer;
1461         unsigned long now;
1462         bool send;
1463         int code;
1464
1465         switch (rt->dst.error) {
1466         case EINVAL:
1467         default:
1468                 goto out;
1469         case EHOSTUNREACH:
1470                 code = ICMP_HOST_UNREACH;
1471                 break;
1472         case ENETUNREACH:
1473                 code = ICMP_NET_UNREACH;
1474                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1475                                 IPSTATS_MIB_INNOROUTES);
1476                 break;
1477         case EACCES:
1478                 code = ICMP_PKT_FILTERED;
1479                 break;
1480         }
1481
1482         if (!rt->peer)
1483                 rt_bind_peer(rt, rt->rt_dst, 1);
1484         peer = rt->peer;
1485
1486         send = true;
1487         if (peer) {
1488                 now = jiffies;
1489                 peer->rate_tokens += now - peer->rate_last;
1490                 if (peer->rate_tokens > ip_rt_error_burst)
1491                         peer->rate_tokens = ip_rt_error_burst;
1492                 peer->rate_last = now;
1493                 if (peer->rate_tokens >= ip_rt_error_cost)
1494                         peer->rate_tokens -= ip_rt_error_cost;
1495                 else
1496                         send = false;
1497         }
1498         if (send)
1499                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1500
1501 out:    kfree_skb(skb);
1502         return 0;
1503 }
1504
1505 /*
1506  *      The last two values are not from the RFC but
1507  *      are needed for AMPRnet AX.25 paths.
1508  */
1509
1510 static const unsigned short mtu_plateau[] =
1511 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1512
1513 static inline unsigned short guess_mtu(unsigned short old_mtu)
1514 {
1515         int i;
1516
1517         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1518                 if (old_mtu > mtu_plateau[i])
1519                         return mtu_plateau[i];
1520         return 68;
1521 }
1522
1523 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1524                                  unsigned short new_mtu,
1525                                  struct net_device *dev)
1526 {
1527         unsigned short old_mtu = ntohs(iph->tot_len);
1528         unsigned short est_mtu = 0;
1529         struct inet_peer *peer;
1530
1531         peer = inet_getpeer_v4(iph->daddr, 1);
1532         if (peer) {
1533                 unsigned short mtu = new_mtu;
1534
1535                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1536                         /* BSD 4.2 derived systems incorrectly adjust
1537                          * tot_len by the IP header length, and report
1538                          * a zero MTU in the ICMP message.
1539                          */
1540                         if (mtu == 0 &&
1541                             old_mtu >= 68 + (iph->ihl << 2))
1542                                 old_mtu -= iph->ihl << 2;
1543                         mtu = guess_mtu(old_mtu);
1544                 }
1545
1546                 if (mtu < ip_rt_min_pmtu)
1547                         mtu = ip_rt_min_pmtu;
1548                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1549                         unsigned long pmtu_expires;
1550
1551                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1552                         if (!pmtu_expires)
1553                                 pmtu_expires = 1UL;
1554
1555                         est_mtu = mtu;
1556                         peer->pmtu_learned = mtu;
1557                         peer->pmtu_expires = pmtu_expires;
1558                 }
1559
1560                 inet_putpeer(peer);
1561
1562                 atomic_inc(&__rt_peer_genid);
1563         }
1564         return est_mtu ? : new_mtu;
1565 }
1566
1567 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1568 {
1569         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1570
1571         if (!expires)
1572                 return;
1573         if (time_before(jiffies, expires)) {
1574                 u32 orig_dst_mtu = dst_mtu(dst);
1575                 if (peer->pmtu_learned < orig_dst_mtu) {
1576                         if (!peer->pmtu_orig)
1577                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1578                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1579                 }
1580         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1581                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1582 }
1583
1584 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1585 {
1586         struct rtable *rt = (struct rtable *) dst;
1587         struct inet_peer *peer;
1588
1589         dst_confirm(dst);
1590
1591         if (!rt->peer)
1592                 rt_bind_peer(rt, rt->rt_dst, 1);
1593         peer = rt->peer;
1594         if (peer) {
1595                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1596
1597                 if (mtu < ip_rt_min_pmtu)
1598                         mtu = ip_rt_min_pmtu;
1599                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1600
1601                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1602                         if (!pmtu_expires)
1603                                 pmtu_expires = 1UL;
1604
1605                         peer->pmtu_learned = mtu;
1606                         peer->pmtu_expires = pmtu_expires;
1607
1608                         atomic_inc(&__rt_peer_genid);
1609                         rt->rt_peer_genid = rt_peer_genid();
1610                 }
1611                 check_peer_pmtu(dst, peer);
1612         }
1613 }
1614
1615 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1616 {
1617         struct rtable *rt = (struct rtable *) dst;
1618         __be32 orig_gw = rt->rt_gateway;
1619
1620         dst_confirm(&rt->dst);
1621
1622         neigh_release(rt->dst.neighbour);
1623         rt->dst.neighbour = NULL;
1624
1625         rt->rt_gateway = peer->redirect_learned.a4;
1626         if (rt_bind_neighbour(rt) ||
1627             !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1628                 if (rt->dst.neighbour)
1629                         neigh_event_send(rt->dst.neighbour, NULL);
1630                 rt->rt_gateway = orig_gw;
1631                 return -EAGAIN;
1632         } else {
1633                 rt->rt_flags |= RTCF_REDIRECTED;
1634                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1635                                         rt->dst.neighbour);
1636         }
1637         return 0;
1638 }
1639
1640 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1641 {
1642         struct rtable *rt = (struct rtable *) dst;
1643
1644         if (rt_is_expired(rt))
1645                 return NULL;
1646         if (rt->rt_peer_genid != rt_peer_genid()) {
1647                 struct inet_peer *peer;
1648
1649                 if (!rt->peer)
1650                         rt_bind_peer(rt, rt->rt_dst, 0);
1651
1652                 peer = rt->peer;
1653                 if (peer) {
1654                         check_peer_pmtu(dst, peer);
1655
1656                         if (peer->redirect_learned.a4 &&
1657                             peer->redirect_learned.a4 != rt->rt_gateway) {
1658                                 if (check_peer_redir(dst, peer))
1659                                         return NULL;
1660                         }
1661                 }
1662
1663                 rt->rt_peer_genid = rt_peer_genid();
1664         }
1665         return dst;
1666 }
1667
1668 static void ipv4_dst_destroy(struct dst_entry *dst)
1669 {
1670         struct rtable *rt = (struct rtable *) dst;
1671         struct inet_peer *peer = rt->peer;
1672
1673         if (rt->fi) {
1674                 fib_info_put(rt->fi);
1675                 rt->fi = NULL;
1676         }
1677         if (peer) {
1678                 rt->peer = NULL;
1679                 inet_putpeer(peer);
1680         }
1681 }
1682
1683
1684 static void ipv4_link_failure(struct sk_buff *skb)
1685 {
1686         struct rtable *rt;
1687
1688         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1689
1690         rt = skb_rtable(skb);
1691         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1692                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1693 }
1694
1695 static int ip_rt_bug(struct sk_buff *skb)
1696 {
1697         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1698                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1699                 skb->dev ? skb->dev->name : "?");
1700         kfree_skb(skb);
1701         WARN_ON(1);
1702         return 0;
1703 }
1704
1705 /*
1706    We do not cache source address of outgoing interface,
1707    because it is used only by IP RR, TS and SRR options,
1708    so that it out of fast path.
1709
1710    BTW remember: "addr" is allowed to be not aligned
1711    in IP options!
1712  */
1713
1714 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1715 {
1716         __be32 src;
1717
1718         if (rt_is_output_route(rt))
1719                 src = ip_hdr(skb)->saddr;
1720         else {
1721                 struct fib_result res;
1722                 struct flowi4 fl4;
1723                 struct iphdr *iph;
1724
1725                 iph = ip_hdr(skb);
1726
1727                 memset(&fl4, 0, sizeof(fl4));
1728                 fl4.daddr = iph->daddr;
1729                 fl4.saddr = iph->saddr;
1730                 fl4.flowi4_tos = iph->tos;
1731                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1732                 fl4.flowi4_iif = skb->dev->ifindex;
1733                 fl4.flowi4_mark = skb->mark;
1734
1735                 rcu_read_lock();
1736                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1737                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1738                 else
1739                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1740                                         RT_SCOPE_UNIVERSE);
1741                 rcu_read_unlock();
1742         }
1743         memcpy(addr, &src, 4);
1744 }
1745
1746 #ifdef CONFIG_IP_ROUTE_CLASSID
1747 static void set_class_tag(struct rtable *rt, u32 tag)
1748 {
1749         if (!(rt->dst.tclassid & 0xFFFF))
1750                 rt->dst.tclassid |= tag & 0xFFFF;
1751         if (!(rt->dst.tclassid & 0xFFFF0000))
1752                 rt->dst.tclassid |= tag & 0xFFFF0000;
1753 }
1754 #endif
1755
1756 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1757 {
1758         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1759
1760         if (advmss == 0) {
1761                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1762                                ip_rt_min_advmss);
1763                 if (advmss > 65535 - 40)
1764                         advmss = 65535 - 40;
1765         }
1766         return advmss;
1767 }
1768
1769 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1770 {
1771         unsigned int mtu = dst->dev->mtu;
1772
1773         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1774                 const struct rtable *rt = (const struct rtable *) dst;
1775
1776                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1777                         mtu = 576;
1778         }
1779
1780         if (mtu > IP_MAX_MTU)
1781                 mtu = IP_MAX_MTU;
1782
1783         return mtu;
1784 }
1785
1786 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1787                             struct fib_info *fi)
1788 {
1789         struct inet_peer *peer;
1790         int create = 0;
1791
1792         /* If a peer entry exists for this destination, we must hook
1793          * it up in order to get at cached metrics.
1794          */
1795         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1796                 create = 1;
1797
1798         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1799         if (peer) {
1800                 rt->rt_peer_genid = rt_peer_genid();
1801                 if (inet_metrics_new(peer))
1802                         memcpy(peer->metrics, fi->fib_metrics,
1803                                sizeof(u32) * RTAX_MAX);
1804                 dst_init_metrics(&rt->dst, peer->metrics, false);
1805
1806                 check_peer_pmtu(&rt->dst, peer);
1807                 if (peer->redirect_learned.a4 &&
1808                     peer->redirect_learned.a4 != rt->rt_gateway) {
1809                         rt->rt_gateway = peer->redirect_learned.a4;
1810                         rt->rt_flags |= RTCF_REDIRECTED;
1811                 }
1812         } else {
1813                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1814                         rt->fi = fi;
1815                         atomic_inc(&fi->fib_clntref);
1816                 }
1817                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1818         }
1819 }
1820
1821 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1822                            const struct fib_result *res,
1823                            struct fib_info *fi, u16 type, u32 itag)
1824 {
1825         struct dst_entry *dst = &rt->dst;
1826
1827         if (fi) {
1828                 if (FIB_RES_GW(*res) &&
1829                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1830                         rt->rt_gateway = FIB_RES_GW(*res);
1831                 rt_init_metrics(rt, fl4, fi);
1832 #ifdef CONFIG_IP_ROUTE_CLASSID
1833                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1834 #endif
1835         }
1836
1837         if (dst_mtu(dst) > IP_MAX_MTU)
1838                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1839         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1840                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1841
1842 #ifdef CONFIG_IP_ROUTE_CLASSID
1843 #ifdef CONFIG_IP_MULTIPLE_TABLES
1844         set_class_tag(rt, fib_rules_tclass(res));
1845 #endif
1846         set_class_tag(rt, itag);
1847 #endif
1848 }
1849
1850 static struct rtable *rt_dst_alloc(struct net_device *dev,
1851                                    bool nopolicy, bool noxfrm)
1852 {
1853         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1854                          DST_HOST |
1855                          (nopolicy ? DST_NOPOLICY : 0) |
1856                          (noxfrm ? DST_NOXFRM : 0));
1857 }
1858
1859 /* called in rcu_read_lock() section */
1860 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1861                                 u8 tos, struct net_device *dev, int our)
1862 {
1863         unsigned int hash;
1864         struct rtable *rth;
1865         __be32 spec_dst;
1866         struct in_device *in_dev = __in_dev_get_rcu(dev);
1867         u32 itag = 0;
1868         int err;
1869
1870         /* Primary sanity checks. */
1871
1872         if (in_dev == NULL)
1873                 return -EINVAL;
1874
1875         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1876             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1877                 goto e_inval;
1878
1879         if (ipv4_is_zeronet(saddr)) {
1880                 if (!ipv4_is_local_multicast(daddr))
1881                         goto e_inval;
1882                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1883         } else {
1884                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1885                                           &itag);
1886                 if (err < 0)
1887                         goto e_err;
1888         }
1889         rth = rt_dst_alloc(init_net.loopback_dev,
1890                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1891         if (!rth)
1892                 goto e_nobufs;
1893
1894 #ifdef CONFIG_IP_ROUTE_CLASSID
1895         rth->dst.tclassid = itag;
1896 #endif
1897         rth->dst.output = ip_rt_bug;
1898
1899         rth->rt_key_dst = daddr;
1900         rth->rt_key_src = saddr;
1901         rth->rt_genid   = rt_genid(dev_net(dev));
1902         rth->rt_flags   = RTCF_MULTICAST;
1903         rth->rt_type    = RTN_MULTICAST;
1904         rth->rt_key_tos = tos;
1905         rth->rt_dst     = daddr;
1906         rth->rt_src     = saddr;
1907         rth->rt_route_iif = dev->ifindex;
1908         rth->rt_iif     = dev->ifindex;
1909         rth->rt_oif     = 0;
1910         rth->rt_mark    = skb->mark;
1911         rth->rt_gateway = daddr;
1912         rth->rt_spec_dst= spec_dst;
1913         rth->rt_peer_genid = 0;
1914         rth->peer = NULL;
1915         rth->fi = NULL;
1916         if (our) {
1917                 rth->dst.input= ip_local_deliver;
1918                 rth->rt_flags |= RTCF_LOCAL;
1919         }
1920
1921 #ifdef CONFIG_IP_MROUTE
1922         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1923                 rth->dst.input = ip_mr_input;
1924 #endif
1925         RT_CACHE_STAT_INC(in_slow_mc);
1926
1927         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1928         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1929         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1930
1931 e_nobufs:
1932         return -ENOBUFS;
1933 e_inval:
1934         return -EINVAL;
1935 e_err:
1936         return err;
1937 }
1938
1939
1940 static void ip_handle_martian_source(struct net_device *dev,
1941                                      struct in_device *in_dev,
1942                                      struct sk_buff *skb,
1943                                      __be32 daddr,
1944                                      __be32 saddr)
1945 {
1946         RT_CACHE_STAT_INC(in_martian_src);
1947 #ifdef CONFIG_IP_ROUTE_VERBOSE
1948         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1949                 /*
1950                  *      RFC1812 recommendation, if source is martian,
1951                  *      the only hint is MAC header.
1952                  */
1953                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1954                         &daddr, &saddr, dev->name);
1955                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1956                         int i;
1957                         const unsigned char *p = skb_mac_header(skb);
1958                         printk(KERN_WARNING "ll header: ");
1959                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1960                                 printk("%02x", *p);
1961                                 if (i < (dev->hard_header_len - 1))
1962                                         printk(":");
1963                         }
1964                         printk("\n");
1965                 }
1966         }
1967 #endif
1968 }
1969
1970 /* called in rcu_read_lock() section */
1971 static int __mkroute_input(struct sk_buff *skb,
1972                            const struct fib_result *res,
1973                            struct in_device *in_dev,
1974                            __be32 daddr, __be32 saddr, u32 tos,
1975                            struct rtable **result)
1976 {
1977         struct rtable *rth;
1978         int err;
1979         struct in_device *out_dev;
1980         unsigned int flags = 0;
1981         __be32 spec_dst;
1982         u32 itag;
1983
1984         /* get a working reference to the output device */
1985         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1986         if (out_dev == NULL) {
1987                 if (net_ratelimit())
1988                         printk(KERN_CRIT "Bug in ip_route_input" \
1989                                "_slow(). Please, report\n");
1990                 return -EINVAL;
1991         }
1992
1993
1994         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1995                                   in_dev->dev, &spec_dst, &itag);
1996         if (err < 0) {
1997                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1998                                          saddr);
1999
2000                 goto cleanup;
2001         }
2002
2003         if (err)
2004                 flags |= RTCF_DIRECTSRC;
2005
2006         if (out_dev == in_dev && err &&
2007             (IN_DEV_SHARED_MEDIA(out_dev) ||
2008              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2009                 flags |= RTCF_DOREDIRECT;
2010
2011         if (skb->protocol != htons(ETH_P_IP)) {
2012                 /* Not IP (i.e. ARP). Do not create route, if it is
2013                  * invalid for proxy arp. DNAT routes are always valid.
2014                  *
2015                  * Proxy arp feature have been extended to allow, ARP
2016                  * replies back to the same interface, to support
2017                  * Private VLAN switch technologies. See arp.c.
2018                  */
2019                 if (out_dev == in_dev &&
2020                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2021                         err = -EINVAL;
2022                         goto cleanup;
2023                 }
2024         }
2025
2026         rth = rt_dst_alloc(out_dev->dev,
2027                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2028                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2029         if (!rth) {
2030                 err = -ENOBUFS;
2031                 goto cleanup;
2032         }
2033
2034         rth->rt_key_dst = daddr;
2035         rth->rt_key_src = saddr;
2036         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2037         rth->rt_flags = flags;
2038         rth->rt_type = res->type;
2039         rth->rt_key_tos = tos;
2040         rth->rt_dst     = daddr;
2041         rth->rt_src     = saddr;
2042         rth->rt_route_iif = in_dev->dev->ifindex;
2043         rth->rt_iif     = in_dev->dev->ifindex;
2044         rth->rt_oif     = 0;
2045         rth->rt_mark    = skb->mark;
2046         rth->rt_gateway = daddr;
2047         rth->rt_spec_dst= spec_dst;
2048         rth->rt_peer_genid = 0;
2049         rth->peer = NULL;
2050         rth->fi = NULL;
2051
2052         rth->dst.input = ip_forward;
2053         rth->dst.output = ip_output;
2054
2055         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2056
2057         *result = rth;
2058         err = 0;
2059  cleanup:
2060         return err;
2061 }
2062
2063 static int ip_mkroute_input(struct sk_buff *skb,
2064                             struct fib_result *res,
2065                             const struct flowi4 *fl4,
2066                             struct in_device *in_dev,
2067                             __be32 daddr, __be32 saddr, u32 tos)
2068 {
2069         struct rtable* rth = NULL;
2070         int err;
2071         unsigned hash;
2072
2073 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2074         if (res->fi && res->fi->fib_nhs > 1)
2075                 fib_select_multipath(res);
2076 #endif
2077
2078         /* create a routing cache entry */
2079         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2080         if (err)
2081                 return err;
2082
2083         /* put it into the cache */
2084         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2085                        rt_genid(dev_net(rth->dst.dev)));
2086         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2087         if (IS_ERR(rth))
2088                 return PTR_ERR(rth);
2089         return 0;
2090 }
2091
2092 /*
2093  *      NOTE. We drop all the packets that has local source
2094  *      addresses, because every properly looped back packet
2095  *      must have correct destination already attached by output routine.
2096  *
2097  *      Such approach solves two big problems:
2098  *      1. Not simplex devices are handled properly.
2099  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2100  *      called with rcu_read_lock()
2101  */
2102
2103 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2104                                u8 tos, struct net_device *dev)
2105 {
2106         struct fib_result res;
2107         struct in_device *in_dev = __in_dev_get_rcu(dev);
2108         struct flowi4   fl4;
2109         unsigned        flags = 0;
2110         u32             itag = 0;
2111         struct rtable * rth;
2112         unsigned        hash;
2113         __be32          spec_dst;
2114         int             err = -EINVAL;
2115         struct net    * net = dev_net(dev);
2116
2117         /* IP on this device is disabled. */
2118
2119         if (!in_dev)
2120                 goto out;
2121
2122         /* Check for the most weird martians, which can be not detected
2123            by fib_lookup.
2124          */
2125
2126         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2127             ipv4_is_loopback(saddr))
2128                 goto martian_source;
2129
2130         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2131                 goto brd_input;
2132
2133         /* Accept zero addresses only to limited broadcast;
2134          * I even do not know to fix it or not. Waiting for complains :-)
2135          */
2136         if (ipv4_is_zeronet(saddr))
2137                 goto martian_source;
2138
2139         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2140                 goto martian_destination;
2141
2142         /*
2143          *      Now we are ready to route packet.
2144          */
2145         fl4.flowi4_oif = 0;
2146         fl4.flowi4_iif = dev->ifindex;
2147         fl4.flowi4_mark = skb->mark;
2148         fl4.flowi4_tos = tos;
2149         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2150         fl4.daddr = daddr;
2151         fl4.saddr = saddr;
2152         err = fib_lookup(net, &fl4, &res);
2153         if (err != 0) {
2154                 if (!IN_DEV_FORWARD(in_dev))
2155                         goto e_hostunreach;
2156                 goto no_route;
2157         }
2158
2159         RT_CACHE_STAT_INC(in_slow_tot);
2160
2161         if (res.type == RTN_BROADCAST)
2162                 goto brd_input;
2163
2164         if (res.type == RTN_LOCAL) {
2165                 err = fib_validate_source(skb, saddr, daddr, tos,
2166                                           net->loopback_dev->ifindex,
2167                                           dev, &spec_dst, &itag);
2168                 if (err < 0)
2169                         goto martian_source_keep_err;
2170                 if (err)
2171                         flags |= RTCF_DIRECTSRC;
2172                 spec_dst = daddr;
2173                 goto local_input;
2174         }
2175
2176         if (!IN_DEV_FORWARD(in_dev))
2177                 goto e_hostunreach;
2178         if (res.type != RTN_UNICAST)
2179                 goto martian_destination;
2180
2181         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2182 out:    return err;
2183
2184 brd_input:
2185         if (skb->protocol != htons(ETH_P_IP))
2186                 goto e_inval;
2187
2188         if (ipv4_is_zeronet(saddr))
2189                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2190         else {
2191                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2192                                           &itag);
2193                 if (err < 0)
2194                         goto martian_source_keep_err;
2195                 if (err)
2196                         flags |= RTCF_DIRECTSRC;
2197         }
2198         flags |= RTCF_BROADCAST;
2199         res.type = RTN_BROADCAST;
2200         RT_CACHE_STAT_INC(in_brd);
2201
2202 local_input:
2203         rth = rt_dst_alloc(net->loopback_dev,
2204                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2205         if (!rth)
2206                 goto e_nobufs;
2207
2208         rth->dst.input= ip_local_deliver;
2209         rth->dst.output= ip_rt_bug;
2210 #ifdef CONFIG_IP_ROUTE_CLASSID
2211         rth->dst.tclassid = itag;
2212 #endif
2213
2214         rth->rt_key_dst = daddr;
2215         rth->rt_key_src = saddr;
2216         rth->rt_genid = rt_genid(net);
2217         rth->rt_flags   = flags|RTCF_LOCAL;
2218         rth->rt_type    = res.type;
2219         rth->rt_key_tos = tos;
2220         rth->rt_dst     = daddr;
2221         rth->rt_src     = saddr;
2222 #ifdef CONFIG_IP_ROUTE_CLASSID
2223         rth->dst.tclassid = itag;
2224 #endif
2225         rth->rt_route_iif = dev->ifindex;
2226         rth->rt_iif     = dev->ifindex;
2227         rth->rt_oif     = 0;
2228         rth->rt_mark    = skb->mark;
2229         rth->rt_gateway = daddr;
2230         rth->rt_spec_dst= spec_dst;
2231         rth->rt_peer_genid = 0;
2232         rth->peer = NULL;
2233         rth->fi = NULL;
2234         if (res.type == RTN_UNREACHABLE) {
2235                 rth->dst.input= ip_error;
2236                 rth->dst.error= -err;
2237                 rth->rt_flags   &= ~RTCF_LOCAL;
2238         }
2239         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2240         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2241         err = 0;
2242         if (IS_ERR(rth))
2243                 err = PTR_ERR(rth);
2244         goto out;
2245
2246 no_route:
2247         RT_CACHE_STAT_INC(in_no_route);
2248         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2249         res.type = RTN_UNREACHABLE;
2250         if (err == -ESRCH)
2251                 err = -ENETUNREACH;
2252         goto local_input;
2253
2254         /*
2255          *      Do not cache martian addresses: they should be logged (RFC1812)
2256          */
2257 martian_destination:
2258         RT_CACHE_STAT_INC(in_martian_dst);
2259 #ifdef CONFIG_IP_ROUTE_VERBOSE
2260         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2261                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2262                         &daddr, &saddr, dev->name);
2263 #endif
2264
2265 e_hostunreach:
2266         err = -EHOSTUNREACH;
2267         goto out;
2268
2269 e_inval:
2270         err = -EINVAL;
2271         goto out;
2272
2273 e_nobufs:
2274         err = -ENOBUFS;
2275         goto out;
2276
2277 martian_source:
2278         err = -EINVAL;
2279 martian_source_keep_err:
2280         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2281         goto out;
2282 }
2283
2284 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2285                            u8 tos, struct net_device *dev, bool noref)
2286 {
2287         struct rtable * rth;
2288         unsigned        hash;
2289         int iif = dev->ifindex;
2290         struct net *net;
2291         int res;
2292
2293         net = dev_net(dev);
2294
2295         rcu_read_lock();
2296
2297         if (!rt_caching(net))
2298                 goto skip_cache;
2299
2300         tos &= IPTOS_RT_MASK;
2301         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2302
2303         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2304              rth = rcu_dereference(rth->dst.rt_next)) {
2305                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2306                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2307                      (rth->rt_iif ^ iif) |
2308                      rth->rt_oif |
2309                      (rth->rt_key_tos ^ tos)) == 0 &&
2310                     rth->rt_mark == skb->mark &&
2311                     net_eq(dev_net(rth->dst.dev), net) &&
2312                     !rt_is_expired(rth)) {
2313                         if (noref) {
2314                                 dst_use_noref(&rth->dst, jiffies);
2315                                 skb_dst_set_noref(skb, &rth->dst);
2316                         } else {
2317                                 dst_use(&rth->dst, jiffies);
2318                                 skb_dst_set(skb, &rth->dst);
2319                         }
2320                         RT_CACHE_STAT_INC(in_hit);
2321                         rcu_read_unlock();
2322                         return 0;
2323                 }
2324                 RT_CACHE_STAT_INC(in_hlist_search);
2325         }
2326
2327 skip_cache:
2328         /* Multicast recognition logic is moved from route cache to here.
2329            The problem was that too many Ethernet cards have broken/missing
2330            hardware multicast filters :-( As result the host on multicasting
2331            network acquires a lot of useless route cache entries, sort of
2332            SDR messages from all the world. Now we try to get rid of them.
2333            Really, provided software IP multicast filter is organized
2334            reasonably (at least, hashed), it does not result in a slowdown
2335            comparing with route cache reject entries.
2336            Note, that multicast routers are not affected, because
2337            route cache entry is created eventually.
2338          */
2339         if (ipv4_is_multicast(daddr)) {
2340                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2341
2342                 if (in_dev) {
2343                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2344                                                   ip_hdr(skb)->protocol);
2345                         if (our
2346 #ifdef CONFIG_IP_MROUTE
2347                                 ||
2348                             (!ipv4_is_local_multicast(daddr) &&
2349                              IN_DEV_MFORWARD(in_dev))
2350 #endif
2351                            ) {
2352                                 int res = ip_route_input_mc(skb, daddr, saddr,
2353                                                             tos, dev, our);
2354                                 rcu_read_unlock();
2355                                 return res;
2356                         }
2357                 }
2358                 rcu_read_unlock();
2359                 return -EINVAL;
2360         }
2361         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2362         rcu_read_unlock();
2363         return res;
2364 }
2365 EXPORT_SYMBOL(ip_route_input_common);
2366
2367 /* called with rcu_read_lock() */
2368 static struct rtable *__mkroute_output(const struct fib_result *res,
2369                                        const struct flowi4 *fl4,
2370                                        __be32 orig_daddr, __be32 orig_saddr,
2371                                        int orig_oif, struct net_device *dev_out,
2372                                        unsigned int flags)
2373 {
2374         struct fib_info *fi = res->fi;
2375         u32 tos = RT_FL_TOS(fl4);
2376         struct in_device *in_dev;
2377         u16 type = res->type;
2378         struct rtable *rth;
2379
2380         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2381                 return ERR_PTR(-EINVAL);
2382
2383         if (ipv4_is_lbcast(fl4->daddr))
2384                 type = RTN_BROADCAST;
2385         else if (ipv4_is_multicast(fl4->daddr))
2386                 type = RTN_MULTICAST;
2387         else if (ipv4_is_zeronet(fl4->daddr))
2388                 return ERR_PTR(-EINVAL);
2389
2390         if (dev_out->flags & IFF_LOOPBACK)
2391                 flags |= RTCF_LOCAL;
2392
2393         in_dev = __in_dev_get_rcu(dev_out);
2394         if (!in_dev)
2395                 return ERR_PTR(-EINVAL);
2396
2397         if (type == RTN_BROADCAST) {
2398                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2399                 fi = NULL;
2400         } else if (type == RTN_MULTICAST) {
2401                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2402                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2403                                      fl4->flowi4_proto))
2404                         flags &= ~RTCF_LOCAL;
2405                 /* If multicast route do not exist use
2406                  * default one, but do not gateway in this case.
2407                  * Yes, it is hack.
2408                  */
2409                 if (fi && res->prefixlen < 4)
2410                         fi = NULL;
2411         }
2412
2413         rth = rt_dst_alloc(dev_out,
2414                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2415                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2416         if (!rth)
2417                 return ERR_PTR(-ENOBUFS);
2418
2419         rth->dst.output = ip_output;
2420
2421         rth->rt_key_dst = orig_daddr;
2422         rth->rt_key_src = orig_saddr;
2423         rth->rt_genid = rt_genid(dev_net(dev_out));
2424         rth->rt_flags   = flags;
2425         rth->rt_type    = type;
2426         rth->rt_key_tos = tos;
2427         rth->rt_dst     = fl4->daddr;
2428         rth->rt_src     = fl4->saddr;
2429         rth->rt_route_iif = 0;
2430         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2431         rth->rt_oif     = orig_oif;
2432         rth->rt_mark    = fl4->flowi4_mark;
2433         rth->rt_gateway = fl4->daddr;
2434         rth->rt_spec_dst= fl4->saddr;
2435         rth->rt_peer_genid = 0;
2436         rth->peer = NULL;
2437         rth->fi = NULL;
2438
2439         RT_CACHE_STAT_INC(out_slow_tot);
2440
2441         if (flags & RTCF_LOCAL) {
2442                 rth->dst.input = ip_local_deliver;
2443                 rth->rt_spec_dst = fl4->daddr;
2444         }
2445         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2446                 rth->rt_spec_dst = fl4->saddr;
2447                 if (flags & RTCF_LOCAL &&
2448                     !(dev_out->flags & IFF_LOOPBACK)) {
2449                         rth->dst.output = ip_mc_output;
2450                         RT_CACHE_STAT_INC(out_slow_mc);
2451                 }
2452 #ifdef CONFIG_IP_MROUTE
2453                 if (type == RTN_MULTICAST) {
2454                         if (IN_DEV_MFORWARD(in_dev) &&
2455                             !ipv4_is_local_multicast(fl4->daddr)) {
2456                                 rth->dst.input = ip_mr_input;
2457                                 rth->dst.output = ip_mc_output;
2458                         }
2459                 }
2460 #endif
2461         }
2462
2463         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2464
2465         return rth;
2466 }
2467
2468 /*
2469  * Major route resolver routine.
2470  * called with rcu_read_lock();
2471  */
2472
2473 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2474 {
2475         struct net_device *dev_out = NULL;
2476         u32 tos = RT_FL_TOS(fl4);
2477         unsigned int flags = 0;
2478         struct fib_result res;
2479         struct rtable *rth;
2480         __be32 orig_daddr;
2481         __be32 orig_saddr;
2482         int orig_oif;
2483
2484         res.fi          = NULL;
2485 #ifdef CONFIG_IP_MULTIPLE_TABLES
2486         res.r           = NULL;
2487 #endif
2488
2489         orig_daddr = fl4->daddr;
2490         orig_saddr = fl4->saddr;
2491         orig_oif = fl4->flowi4_oif;
2492
2493         fl4->flowi4_iif = net->loopback_dev->ifindex;
2494         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2495         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2496                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2497
2498         rcu_read_lock();
2499         if (fl4->saddr) {
2500                 rth = ERR_PTR(-EINVAL);
2501                 if (ipv4_is_multicast(fl4->saddr) ||
2502                     ipv4_is_lbcast(fl4->saddr) ||
2503                     ipv4_is_zeronet(fl4->saddr))
2504                         goto out;
2505
2506                 /* I removed check for oif == dev_out->oif here.
2507                    It was wrong for two reasons:
2508                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2509                       is assigned to multiple interfaces.
2510                    2. Moreover, we are allowed to send packets with saddr
2511                       of another iface. --ANK
2512                  */
2513
2514                 if (fl4->flowi4_oif == 0 &&
2515                     (ipv4_is_multicast(fl4->daddr) ||
2516                      ipv4_is_lbcast(fl4->daddr))) {
2517                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2518                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2519                         if (dev_out == NULL)
2520                                 goto out;
2521
2522                         /* Special hack: user can direct multicasts
2523                            and limited broadcast via necessary interface
2524                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2525                            This hack is not just for fun, it allows
2526                            vic,vat and friends to work.
2527                            They bind socket to loopback, set ttl to zero
2528                            and expect that it will work.
2529                            From the viewpoint of routing cache they are broken,
2530                            because we are not allowed to build multicast path
2531                            with loopback source addr (look, routing cache
2532                            cannot know, that ttl is zero, so that packet
2533                            will not leave this host and route is valid).
2534                            Luckily, this hack is good workaround.
2535                          */
2536
2537                         fl4->flowi4_oif = dev_out->ifindex;
2538                         goto make_route;
2539                 }
2540
2541                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2542                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2543                         if (!__ip_dev_find(net, fl4->saddr, false))
2544                                 goto out;
2545                 }
2546         }
2547
2548
2549         if (fl4->flowi4_oif) {
2550                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2551                 rth = ERR_PTR(-ENODEV);
2552                 if (dev_out == NULL)
2553                         goto out;
2554
2555                 /* RACE: Check return value of inet_select_addr instead. */
2556                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2557                         rth = ERR_PTR(-ENETUNREACH);
2558                         goto out;
2559                 }
2560                 if (ipv4_is_local_multicast(fl4->daddr) ||
2561                     ipv4_is_lbcast(fl4->daddr)) {
2562                         if (!fl4->saddr)
2563                                 fl4->saddr = inet_select_addr(dev_out, 0,
2564                                                               RT_SCOPE_LINK);
2565                         goto make_route;
2566                 }
2567                 if (fl4->saddr) {
2568                         if (ipv4_is_multicast(fl4->daddr))
2569                                 fl4->saddr = inet_select_addr(dev_out, 0,
2570                                                               fl4->flowi4_scope);
2571                         else if (!fl4->daddr)
2572                                 fl4->saddr = inet_select_addr(dev_out, 0,
2573                                                               RT_SCOPE_HOST);
2574                 }
2575         }
2576
2577         if (!fl4->daddr) {
2578                 fl4->daddr = fl4->saddr;
2579                 if (!fl4->daddr)
2580                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2581                 dev_out = net->loopback_dev;
2582                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2583                 res.type = RTN_LOCAL;
2584                 flags |= RTCF_LOCAL;
2585                 goto make_route;
2586         }
2587
2588         if (fib_lookup(net, fl4, &res)) {
2589                 res.fi = NULL;
2590                 if (fl4->flowi4_oif) {
2591                         /* Apparently, routing tables are wrong. Assume,
2592                            that the destination is on link.
2593
2594                            WHY? DW.
2595                            Because we are allowed to send to iface
2596                            even if it has NO routes and NO assigned
2597                            addresses. When oif is specified, routing
2598                            tables are looked up with only one purpose:
2599                            to catch if destination is gatewayed, rather than
2600                            direct. Moreover, if MSG_DONTROUTE is set,
2601                            we send packet, ignoring both routing tables
2602                            and ifaddr state. --ANK
2603
2604
2605                            We could make it even if oif is unknown,
2606                            likely IPv6, but we do not.
2607                          */
2608
2609                         if (fl4->saddr == 0)
2610                                 fl4->saddr = inet_select_addr(dev_out, 0,
2611                                                               RT_SCOPE_LINK);
2612                         res.type = RTN_UNICAST;
2613                         goto make_route;
2614                 }
2615                 rth = ERR_PTR(-ENETUNREACH);
2616                 goto out;
2617         }
2618
2619         if (res.type == RTN_LOCAL) {
2620                 if (!fl4->saddr) {
2621                         if (res.fi->fib_prefsrc)
2622                                 fl4->saddr = res.fi->fib_prefsrc;
2623                         else
2624                                 fl4->saddr = fl4->daddr;
2625                 }
2626                 dev_out = net->loopback_dev;
2627                 fl4->flowi4_oif = dev_out->ifindex;
2628                 res.fi = NULL;
2629                 flags |= RTCF_LOCAL;
2630                 goto make_route;
2631         }
2632
2633 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2634         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2635                 fib_select_multipath(&res);
2636         else
2637 #endif
2638         if (!res.prefixlen &&
2639             res.table->tb_num_default > 1 &&
2640             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2641                 fib_select_default(&res);
2642
2643         if (!fl4->saddr)
2644                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2645
2646         dev_out = FIB_RES_DEV(res);
2647         fl4->flowi4_oif = dev_out->ifindex;
2648
2649
2650 make_route:
2651         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2652                                dev_out, flags);
2653         if (!IS_ERR(rth)) {
2654                 unsigned int hash;
2655
2656                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2657                                rt_genid(dev_net(dev_out)));
2658                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2659         }
2660
2661 out:
2662         rcu_read_unlock();
2663         return rth;
2664 }
2665
2666 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2667 {
2668         struct rtable *rth;
2669         unsigned int hash;
2670
2671         if (!rt_caching(net))
2672                 goto slow_output;
2673
2674         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2675
2676         rcu_read_lock_bh();
2677         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2678                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2679                 if (rth->rt_key_dst == flp4->daddr &&
2680                     rth->rt_key_src == flp4->saddr &&
2681                     rt_is_output_route(rth) &&
2682                     rth->rt_oif == flp4->flowi4_oif &&
2683                     rth->rt_mark == flp4->flowi4_mark &&
2684                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2685                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2686                     net_eq(dev_net(rth->dst.dev), net) &&
2687                     !rt_is_expired(rth)) {
2688                         dst_use(&rth->dst, jiffies);
2689                         RT_CACHE_STAT_INC(out_hit);
2690                         rcu_read_unlock_bh();
2691                         if (!flp4->saddr)
2692                                 flp4->saddr = rth->rt_src;
2693                         if (!flp4->daddr)
2694                                 flp4->daddr = rth->rt_dst;
2695                         return rth;
2696                 }
2697                 RT_CACHE_STAT_INC(out_hlist_search);
2698         }
2699         rcu_read_unlock_bh();
2700
2701 slow_output:
2702         return ip_route_output_slow(net, flp4);
2703 }
2704 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705
2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2707 {
2708         return NULL;
2709 }
2710
2711 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2712 {
2713         return 0;
2714 }
2715
2716 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2717 {
2718 }
2719
2720 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2721                                           unsigned long old)
2722 {
2723         return NULL;
2724 }
2725
2726 static struct dst_ops ipv4_dst_blackhole_ops = {
2727         .family                 =       AF_INET,
2728         .protocol               =       cpu_to_be16(ETH_P_IP),
2729         .destroy                =       ipv4_dst_destroy,
2730         .check                  =       ipv4_blackhole_dst_check,
2731         .default_mtu            =       ipv4_blackhole_default_mtu,
2732         .default_advmss         =       ipv4_default_advmss,
2733         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2734         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2735 };
2736
2737 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2738 {
2739         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2740         struct rtable *ort = (struct rtable *) dst_orig;
2741
2742         if (rt) {
2743                 struct dst_entry *new = &rt->dst;
2744
2745                 new->__use = 1;
2746                 new->input = dst_discard;
2747                 new->output = dst_discard;
2748                 dst_copy_metrics(new, &ort->dst);
2749
2750                 new->dev = ort->dst.dev;
2751                 if (new->dev)
2752                         dev_hold(new->dev);
2753
2754                 rt->rt_key_dst = ort->rt_key_dst;
2755                 rt->rt_key_src = ort->rt_key_src;
2756                 rt->rt_key_tos = ort->rt_key_tos;
2757                 rt->rt_route_iif = ort->rt_route_iif;
2758                 rt->rt_iif = ort->rt_iif;
2759                 rt->rt_oif = ort->rt_oif;
2760                 rt->rt_mark = ort->rt_mark;
2761
2762                 rt->rt_genid = rt_genid(net);
2763                 rt->rt_flags = ort->rt_flags;
2764                 rt->rt_type = ort->rt_type;
2765                 rt->rt_dst = ort->rt_dst;
2766                 rt->rt_src = ort->rt_src;
2767                 rt->rt_gateway = ort->rt_gateway;
2768                 rt->rt_spec_dst = ort->rt_spec_dst;
2769                 rt->peer = ort->peer;
2770                 if (rt->peer)
2771                         atomic_inc(&rt->peer->refcnt);
2772                 rt->fi = ort->fi;
2773                 if (rt->fi)
2774                         atomic_inc(&rt->fi->fib_clntref);
2775
2776                 dst_free(new);
2777         }
2778
2779         dst_release(dst_orig);
2780
2781         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2782 }
2783
2784 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2785                                     struct sock *sk)
2786 {
2787         struct rtable *rt = __ip_route_output_key(net, flp4);
2788
2789         if (IS_ERR(rt))
2790                 return rt;
2791
2792         if (flp4->flowi4_proto)
2793                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2794                                                    flowi4_to_flowi(flp4),
2795                                                    sk, 0);
2796
2797         return rt;
2798 }
2799 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800
2801 static int rt_fill_info(struct net *net,
2802                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2803                         int nowait, unsigned int flags)
2804 {
2805         struct rtable *rt = skb_rtable(skb);
2806         struct rtmsg *r;
2807         struct nlmsghdr *nlh;
2808         long expires = 0;
2809         const struct inet_peer *peer = rt->peer;
2810         u32 id = 0, ts = 0, tsage = 0, error;
2811
2812         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2813         if (nlh == NULL)
2814                 return -EMSGSIZE;
2815
2816         r = nlmsg_data(nlh);
2817         r->rtm_family    = AF_INET;
2818         r->rtm_dst_len  = 32;
2819         r->rtm_src_len  = 0;
2820         r->rtm_tos      = rt->rt_key_tos;
2821         r->rtm_table    = RT_TABLE_MAIN;
2822         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823         r->rtm_type     = rt->rt_type;
2824         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2825         r->rtm_protocol = RTPROT_UNSPEC;
2826         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2827         if (rt->rt_flags & RTCF_NOTIFY)
2828                 r->rtm_flags |= RTM_F_NOTIFY;
2829
2830         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831
2832         if (rt->rt_key_src) {
2833                 r->rtm_src_len = 32;
2834                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2835         }
2836         if (rt->dst.dev)
2837                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838 #ifdef CONFIG_IP_ROUTE_CLASSID
2839         if (rt->dst.tclassid)
2840                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841 #endif
2842         if (rt_is_input_route(rt))
2843                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844         else if (rt->rt_src != rt->rt_key_src)
2845                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846
2847         if (rt->rt_dst != rt->rt_gateway)
2848                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2849
2850         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2851                 goto nla_put_failure;
2852
2853         if (rt->rt_mark)
2854                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2855
2856         error = rt->dst.error;
2857         if (peer) {
2858                 inet_peer_refcheck(rt->peer);
2859                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2860                 if (peer->tcp_ts_stamp) {
2861                         ts = peer->tcp_ts;
2862                         tsage = get_seconds() - peer->tcp_ts_stamp;
2863                 }
2864                 expires = ACCESS_ONCE(peer->pmtu_expires);
2865                 if (expires)
2866                         expires -= jiffies;
2867         }
2868
2869         if (rt_is_input_route(rt)) {
2870 #ifdef CONFIG_IP_MROUTE
2871                 __be32 dst = rt->rt_dst;
2872
2873                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2874                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2875                         int err = ipmr_get_route(net, skb,
2876                                                  rt->rt_src, rt->rt_dst,
2877                                                  r, nowait);
2878                         if (err <= 0) {
2879                                 if (!nowait) {
2880                                         if (err == 0)
2881                                                 return 0;
2882                                         goto nla_put_failure;
2883                                 } else {
2884                                         if (err == -EMSGSIZE)
2885                                                 goto nla_put_failure;
2886                                         error = err;
2887                                 }
2888                         }
2889                 } else
2890 #endif
2891                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2892         }
2893
2894         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2895                                expires, error) < 0)
2896                 goto nla_put_failure;
2897
2898         return nlmsg_end(skb, nlh);
2899
2900 nla_put_failure:
2901         nlmsg_cancel(skb, nlh);
2902         return -EMSGSIZE;
2903 }
2904
2905 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2906 {
2907         struct net *net = sock_net(in_skb->sk);
2908         struct rtmsg *rtm;
2909         struct nlattr *tb[RTA_MAX+1];
2910         struct rtable *rt = NULL;
2911         __be32 dst = 0;
2912         __be32 src = 0;
2913         u32 iif;
2914         int err;
2915         int mark;
2916         struct sk_buff *skb;
2917
2918         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2919         if (err < 0)
2920                 goto errout;
2921
2922         rtm = nlmsg_data(nlh);
2923
2924         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2925         if (skb == NULL) {
2926                 err = -ENOBUFS;
2927                 goto errout;
2928         }
2929
2930         /* Reserve room for dummy headers, this skb can pass
2931            through good chunk of routing engine.
2932          */
2933         skb_reset_mac_header(skb);
2934         skb_reset_network_header(skb);
2935
2936         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2937         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2938         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2939
2940         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2941         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2942         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2943         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2944
2945         if (iif) {
2946                 struct net_device *dev;
2947
2948                 dev = __dev_get_by_index(net, iif);
2949                 if (dev == NULL) {
2950                         err = -ENODEV;
2951                         goto errout_free;
2952                 }
2953
2954                 skb->protocol   = htons(ETH_P_IP);
2955                 skb->dev        = dev;
2956                 skb->mark       = mark;
2957                 local_bh_disable();
2958                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2959                 local_bh_enable();
2960
2961                 rt = skb_rtable(skb);
2962                 if (err == 0 && rt->dst.error)
2963                         err = -rt->dst.error;
2964         } else {
2965                 struct flowi4 fl4 = {
2966                         .daddr = dst,
2967                         .saddr = src,
2968                         .flowi4_tos = rtm->rtm_tos,
2969                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2970                         .flowi4_mark = mark,
2971                 };
2972                 rt = ip_route_output_key(net, &fl4);
2973
2974                 err = 0;
2975                 if (IS_ERR(rt))
2976                         err = PTR_ERR(rt);
2977         }
2978
2979         if (err)
2980                 goto errout_free;
2981
2982         skb_dst_set(skb, &rt->dst);
2983         if (rtm->rtm_flags & RTM_F_NOTIFY)
2984                 rt->rt_flags |= RTCF_NOTIFY;
2985
2986         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2987                            RTM_NEWROUTE, 0, 0);
2988         if (err <= 0)
2989                 goto errout_free;
2990
2991         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2992 errout:
2993         return err;
2994
2995 errout_free:
2996         kfree_skb(skb);
2997         goto errout;
2998 }
2999
3000 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3001 {
3002         struct rtable *rt;
3003         int h, s_h;
3004         int idx, s_idx;
3005         struct net *net;
3006
3007         net = sock_net(skb->sk);
3008
3009         s_h = cb->args[0];
3010         if (s_h < 0)
3011                 s_h = 0;
3012         s_idx = idx = cb->args[1];
3013         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3014                 if (!rt_hash_table[h].chain)
3015                         continue;
3016                 rcu_read_lock_bh();
3017                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3018                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3019                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3020                                 continue;
3021                         if (rt_is_expired(rt))
3022                                 continue;
3023                         skb_dst_set_noref(skb, &rt->dst);
3024                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3025                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3026                                          1, NLM_F_MULTI) <= 0) {
3027                                 skb_dst_drop(skb);
3028                                 rcu_read_unlock_bh();
3029                                 goto done;
3030                         }
3031                         skb_dst_drop(skb);
3032                 }
3033                 rcu_read_unlock_bh();
3034         }
3035
3036 done:
3037         cb->args[0] = h;
3038         cb->args[1] = idx;
3039         return skb->len;
3040 }
3041
3042 void ip_rt_multicast_event(struct in_device *in_dev)
3043 {
3044         rt_cache_flush(dev_net(in_dev->dev), 0);
3045 }
3046
3047 #ifdef CONFIG_SYSCTL
3048 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3049                                         void __user *buffer,
3050                                         size_t *lenp, loff_t *ppos)
3051 {
3052         if (write) {
3053                 int flush_delay;
3054                 ctl_table ctl;
3055                 struct net *net;
3056
3057                 memcpy(&ctl, __ctl, sizeof(ctl));
3058                 ctl.data = &flush_delay;
3059                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3060
3061                 net = (struct net *)__ctl->extra1;
3062                 rt_cache_flush(net, flush_delay);
3063                 return 0;
3064         }
3065
3066         return -EINVAL;
3067 }
3068
3069 static ctl_table ipv4_route_table[] = {
3070         {
3071                 .procname       = "gc_thresh",
3072                 .data           = &ipv4_dst_ops.gc_thresh,
3073                 .maxlen         = sizeof(int),
3074                 .mode           = 0644,
3075                 .proc_handler   = proc_dointvec,
3076         },
3077         {
3078                 .procname       = "max_size",
3079                 .data           = &ip_rt_max_size,
3080                 .maxlen         = sizeof(int),
3081                 .mode           = 0644,
3082                 .proc_handler   = proc_dointvec,
3083         },
3084         {
3085                 /*  Deprecated. Use gc_min_interval_ms */
3086
3087                 .procname       = "gc_min_interval",
3088                 .data           = &ip_rt_gc_min_interval,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec_jiffies,
3092         },
3093         {
3094                 .procname       = "gc_min_interval_ms",
3095                 .data           = &ip_rt_gc_min_interval,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec_ms_jiffies,
3099         },
3100         {
3101                 .procname       = "gc_timeout",
3102                 .data           = &ip_rt_gc_timeout,
3103                 .maxlen         = sizeof(int),
3104                 .mode           = 0644,
3105                 .proc_handler   = proc_dointvec_jiffies,
3106         },
3107         {
3108                 .procname       = "gc_interval",
3109                 .data           = &ip_rt_gc_interval,
3110                 .maxlen         = sizeof(int),
3111                 .mode           = 0644,
3112                 .proc_handler   = proc_dointvec_jiffies,
3113         },
3114         {
3115                 .procname       = "redirect_load",
3116                 .data           = &ip_rt_redirect_load,
3117                 .maxlen         = sizeof(int),
3118                 .mode           = 0644,
3119                 .proc_handler   = proc_dointvec,
3120         },
3121         {
3122                 .procname       = "redirect_number",
3123                 .data           = &ip_rt_redirect_number,
3124                 .maxlen         = sizeof(int),
3125                 .mode           = 0644,
3126                 .proc_handler   = proc_dointvec,
3127         },
3128         {
3129                 .procname       = "redirect_silence",
3130                 .data           = &ip_rt_redirect_silence,
3131                 .maxlen         = sizeof(int),
3132                 .mode           = 0644,
3133                 .proc_handler   = proc_dointvec,
3134         },
3135         {
3136                 .procname       = "error_cost",
3137                 .data           = &ip_rt_error_cost,
3138                 .maxlen         = sizeof(int),
3139                 .mode           = 0644,
3140                 .proc_handler   = proc_dointvec,
3141         },
3142         {
3143                 .procname       = "error_burst",
3144                 .data           = &ip_rt_error_burst,
3145                 .maxlen         = sizeof(int),
3146                 .mode           = 0644,
3147                 .proc_handler   = proc_dointvec,
3148         },
3149         {
3150                 .procname       = "gc_elasticity",
3151                 .data           = &ip_rt_gc_elasticity,
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0644,
3154                 .proc_handler   = proc_dointvec,
3155         },
3156         {
3157                 .procname       = "mtu_expires",
3158                 .data           = &ip_rt_mtu_expires,
3159                 .maxlen         = sizeof(int),
3160                 .mode           = 0644,
3161                 .proc_handler   = proc_dointvec_jiffies,
3162         },
3163         {
3164                 .procname       = "min_pmtu",
3165                 .data           = &ip_rt_min_pmtu,
3166                 .maxlen         = sizeof(int),
3167                 .mode           = 0644,
3168                 .proc_handler   = proc_dointvec,
3169         },
3170         {
3171                 .procname       = "min_adv_mss",
3172                 .data           = &ip_rt_min_advmss,
3173                 .maxlen         = sizeof(int),
3174                 .mode           = 0644,
3175                 .proc_handler   = proc_dointvec,
3176         },
3177         { }
3178 };
3179
3180 static struct ctl_table empty[1];
3181
3182 static struct ctl_table ipv4_skeleton[] =
3183 {
3184         { .procname = "route", 
3185           .mode = 0555, .child = ipv4_route_table},
3186         { .procname = "neigh", 
3187           .mode = 0555, .child = empty},
3188         { }
3189 };
3190
3191 static __net_initdata struct ctl_path ipv4_path[] = {
3192         { .procname = "net", },
3193         { .procname = "ipv4", },
3194         { },
3195 };
3196
3197 static struct ctl_table ipv4_route_flush_table[] = {
3198         {
3199                 .procname       = "flush",
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0200,
3202                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3203         },
3204         { },
3205 };
3206
3207 static __net_initdata struct ctl_path ipv4_route_path[] = {
3208         { .procname = "net", },
3209         { .procname = "ipv4", },
3210         { .procname = "route", },
3211         { },
3212 };
3213
3214 static __net_init int sysctl_route_net_init(struct net *net)
3215 {
3216         struct ctl_table *tbl;
3217
3218         tbl = ipv4_route_flush_table;
3219         if (!net_eq(net, &init_net)) {
3220                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3221                 if (tbl == NULL)
3222                         goto err_dup;
3223         }
3224         tbl[0].extra1 = net;
3225
3226         net->ipv4.route_hdr =
3227                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3228         if (net->ipv4.route_hdr == NULL)
3229                 goto err_reg;
3230         return 0;
3231
3232 err_reg:
3233         if (tbl != ipv4_route_flush_table)
3234                 kfree(tbl);
3235 err_dup:
3236         return -ENOMEM;
3237 }
3238
3239 static __net_exit void sysctl_route_net_exit(struct net *net)
3240 {
3241         struct ctl_table *tbl;
3242
3243         tbl = net->ipv4.route_hdr->ctl_table_arg;
3244         unregister_net_sysctl_table(net->ipv4.route_hdr);
3245         BUG_ON(tbl == ipv4_route_flush_table);
3246         kfree(tbl);
3247 }
3248
3249 static __net_initdata struct pernet_operations sysctl_route_ops = {
3250         .init = sysctl_route_net_init,
3251         .exit = sysctl_route_net_exit,
3252 };
3253 #endif
3254
3255 static __net_init int rt_genid_init(struct net *net)
3256 {
3257         get_random_bytes(&net->ipv4.rt_genid,
3258                          sizeof(net->ipv4.rt_genid));
3259         get_random_bytes(&net->ipv4.dev_addr_genid,
3260                          sizeof(net->ipv4.dev_addr_genid));
3261         return 0;
3262 }
3263
3264 static __net_initdata struct pernet_operations rt_genid_ops = {
3265         .init = rt_genid_init,
3266 };
3267
3268
3269 #ifdef CONFIG_IP_ROUTE_CLASSID
3270 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3271 #endif /* CONFIG_IP_ROUTE_CLASSID */
3272
3273 static __initdata unsigned long rhash_entries;
3274 static int __init set_rhash_entries(char *str)
3275 {
3276         if (!str)
3277                 return 0;
3278         rhash_entries = simple_strtoul(str, &str, 0);
3279         return 1;
3280 }
3281 __setup("rhash_entries=", set_rhash_entries);
3282
3283 int __init ip_rt_init(void)
3284 {
3285         int rc = 0;
3286
3287 #ifdef CONFIG_IP_ROUTE_CLASSID
3288         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3289         if (!ip_rt_acct)
3290                 panic("IP: failed to allocate ip_rt_acct\n");
3291 #endif
3292
3293         ipv4_dst_ops.kmem_cachep =
3294                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3295                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3296
3297         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3298
3299         if (dst_entries_init(&ipv4_dst_ops) < 0)
3300                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3301
3302         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3303                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3304
3305         rt_hash_table = (struct rt_hash_bucket *)
3306                 alloc_large_system_hash("IP route cache",
3307                                         sizeof(struct rt_hash_bucket),
3308                                         rhash_entries,
3309                                         (totalram_pages >= 128 * 1024) ?
3310                                         15 : 17,
3311                                         0,
3312                                         &rt_hash_log,
3313                                         &rt_hash_mask,
3314                                         rhash_entries ? 0 : 512 * 1024);
3315         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3316         rt_hash_lock_init();
3317
3318         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3319         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3320
3321         devinet_init();
3322         ip_fib_init();
3323
3324         if (ip_rt_proc_init())
3325                 printk(KERN_ERR "Unable to create route proc files\n");
3326 #ifdef CONFIG_XFRM
3327         xfrm_init();
3328         xfrm4_init(ip_rt_max_size);
3329 #endif
3330         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3331
3332 #ifdef CONFIG_SYSCTL
3333         register_pernet_subsys(&sysctl_route_ops);
3334 #endif
3335         register_pernet_subsys(&rt_genid_ops);
3336         return rc;
3337 }
3338
3339 #ifdef CONFIG_SYSCTL
3340 /*
3341  * We really need to sanitize the damn ipv4 init order, then all
3342  * this nonsense will go away.
3343  */
3344 void __init ip_static_sysctl_init(void)
3345 {
3346         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3347 }
3348 #endif