/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
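
/*
 * The table above is indexed by the four IP TOS bits shifted down by
 * one, as done by rt_tos2priority() in <net/route.h>.  A minimal
 * lookup sketch, assuming "tos" holds the raw TOS byte:
 *
 *      prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so, for example, IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE.
 */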


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
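
/*
 * In practice a lookup therefore follows the pattern used throughout
 * this file (a sketch, not a separate interface):
 *
 *      rcu_read_lock();
 *      for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *           rth = rcu_dereference(rth->u.rt_next))
 *              ...;
 *      rcu_read_unlock();
 *
 * while writers bracket chain updates with the per-bucket lock,
 * spin_lock_bh(rt_hash_lock_addr(hash)) (plain spin_lock() where the
 * caller already runs in BH context, as in rt_check_expire()).
 */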

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
        return (jhash_2words(daddr, saddr, rt_hash_rnd)
                & rt_hash_mask);
}
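
/*
 * Callers that also key on an interface index fold it into the second
 * word before hashing, as in ip_rt_redirect() below:
 *
 *      unsigned hash = rt_hash_code(daddr, skeys[i] ^ (ikeys[k] << 5));
 */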

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}
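
/*
 * In rt_may_expire() above, an unreferenced entry survives while it is
 * younger than tmo1 (unless rt_fast_clean() marks it as an easily
 * recreated broadcast/multicast entry) or while it is younger than
 * tmo2 and "valuable".  Callers halve the timeout as they walk along a
 * chain, so entries deep in a long chain are reclaimed more
 * aggressively than entries near the head.
 */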

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
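
/*
 * Note the inversion above: a short idle time yields a large score,
 * with the two top bits reserved for the flags described in the
 * comment.  rt_intern_hash() below evicts the entry with the lowest
 * score, i.e. the least recently used entry, preferring ones that are
 * neither "valuable" nor output/unicast routes.
 */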

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                *rthp = rth->u.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.rt_next;

                        rthp = &rth->u.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */


/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == 0)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
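
/*
 * Worked example of the goal computation above: with the defaults
 * ip_rt_gc_interval = 60*HZ and ip_rt_gc_timeout = 300*HZ, goal is
 * (60/300) << rt_hash_log, i.e. one fifth of the hash buckets, so a
 * complete sweep of the table takes about five timer periods.
 */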

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state */
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the timer
                   to "delay", otherwise fire it at the deadline time.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
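
/*
 * For scale: with the default ip_rt_gc_elasticity of 8, the initial
 * "goal" below is positive only once the cache holds more than eight
 * entries per hash bucket on average (8 << rt_hash_log); under lighter
 * load GC merely trims toward "equilibrium" and usually leaves via
 * work_done without scanning the table.
 */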

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not make it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                                /* remove all related balanced entries
                                 * if necessary
                                 */
                                if (rth->u.dst.flags & DST_BALANCED) {
                                        int r;

                                        rthp = rt_remove_balanced_route(
                                                &rt_hash_table[k].chain,
                                                rth,
                                                &r);
                                        goal -= r;
                                        if (!rthp)
                                                break;
                                } else {
                                        *rthp = rth->u.rt_next;
                                        rt_free(rth);
                                        goal--;
                                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire was reduced to zero (otherwise it is halved);
                   - the table is not full;
                   - we are called from interrupt context.
                   The jiffies check is just a fallback/debug loop breaker;
                   we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an ARP neighbour only if it is an
           output route or on the unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}
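
/*
 * A sketch of the calling pattern (see ip_rt_redirect() below): the
 * caller computes the bucket, then hands over its reference:
 *
 *      hash = rt_hash_code(daddr, saddr ^ (oif << 5));
 *      err = rt_intern_hash(hash, rt, &rt);
 *
 * On success *rp points at the entry that actually lives in the
 * cache, which may be an existing duplicate rather than the rtable
 * that was passed in.
 */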

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If the peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5));

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5));
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
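
/*
 * With the defaults above this works out as follows: the k-th redirect
 * is sent only once (HZ/50) << k jiffies (20ms, 40ms, 80ms, ...) have
 * passed since the previous one, sending stops after
 * ip_rt_redirect_number (9) apparently ignored redirects, and the
 * counters reset after ip_rt_redirect_silence = (HZ/50) << 10, roughly
 * twenty seconds without redirected packets.
 */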

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}


/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
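
/*
 * For example, a bogus old_mtu of 1500 is mapped to the next lower
 * plateau, 1492 (PPPoE), and anything at or below the smallest plateau
 * (128) falls back to 68, the historic IPv4 minimum.
 */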
1394
1395 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1396 {
1397         int i;
1398         unsigned short old_mtu = ntohs(iph->tot_len);
1399         struct rtable *rth;
1400         u32  skeys[2] = { iph->saddr, 0, };
1401         u32  daddr = iph->daddr;
1402         unsigned short est_mtu = 0;
1403
1404         if (ipv4_config.no_pmtu_disc)
1405                 return 0;
1406
1407         for (i = 0; i < 2; i++) {
1408                 unsigned hash = rt_hash_code(daddr, skeys[i]);
1409
1410                 rcu_read_lock();
1411                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1412                      rth = rcu_dereference(rth->u.rt_next)) {
1413                         if (rth->fl.fl4_dst == daddr &&
1414                             rth->fl.fl4_src == skeys[i] &&
1415                             rth->rt_dst  == daddr &&
1416                             rth->rt_src  == iph->saddr &&
1417                             rth->fl.iif == 0 &&
1418                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1419                                 unsigned short mtu = new_mtu;
1420
1421                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1422
1423                                         /* BSD 4.2 compatibility hack :-( */
1424                                         if (mtu == 0 &&
1425                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1426                                             old_mtu >= 68 + (iph->ihl << 2))
1427                                                 old_mtu -= iph->ihl << 2;
1428
1429                                         mtu = guess_mtu(old_mtu);
1430                                 }
1431                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1432                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1433                                                 dst_confirm(&rth->u.dst);
1434                                                 if (mtu < ip_rt_min_pmtu) {
1435                                                         mtu = ip_rt_min_pmtu;
1436                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1437                                                                 (1 << RTAX_MTU);
1438                                                 }
1439                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1440                                                 dst_set_expires(&rth->u.dst,
1441                                                         ip_rt_mtu_expires);
1442                                         }
1443                                         est_mtu = mtu;
1444                                 }
1445                         }
1446                 }
1447                 rcu_read_unlock();
1448         }
1449         return est_mtu ? : new_mtu;
1450 }
1451
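     /* The dst_ops->update_pmtu hook: lower the cached path MTU, never
      * below ip_rt_min_pmtu (locking the metric when clamped so the
      * floor sticks), and arm the expiry timer so learned PMTU data
      * ages out eventually.
      */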
1452 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1453 {
1454         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1455             !(dst_metric_locked(dst, RTAX_MTU))) {
1456                 if (mtu < ip_rt_min_pmtu) {
1457                         mtu = ip_rt_min_pmtu;
1458                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1459                 }
1460                 dst->metrics[RTAX_MTU-1] = mtu;
1461                 dst_set_expires(dst, ip_rt_mtu_expires);
1462                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1463         }
1464 }
1465
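     /* Route cache entries are never revalidated: once an entry has been
      * marked obsolete, dst_check() gets NULL here and the caller has to
      * perform a fresh lookup.
      */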
1466 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1467 {
1468         return NULL;
1469 }
1470
1471 static void ipv4_dst_destroy(struct dst_entry *dst)
1472 {
1473         struct rtable *rt = (struct rtable *) dst;
1474         struct inet_peer *peer = rt->peer;
1475         struct in_device *idev = rt->idev;
1476
1477         if (peer) {
1478                 rt->peer = NULL;
1479                 inet_putpeer(peer);
1480         }
1481
1482         if (idev) {
1483                 rt->idev = NULL;
1484                 in_dev_put(idev);
1485         }
1486 }
1487
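     /* The owning device is going away: repoint the cached in_device
      * reference at loopback so this stale entry does not pin the
      * unregistering device.
      */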
1488 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1489                             int how)
1490 {
1491         struct rtable *rt = (struct rtable *) dst;
1492         struct in_device *idev = rt->idev;
1493         if (dev != &loopback_dev && idev && idev->dev == dev) {
1494                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1495                 if (loopback_idev) {
1496                         rt->idev = loopback_idev;
1497                         in_dev_put(idev);
1498                 }
1499         }
1500 }
1501
1502 static void ipv4_link_failure(struct sk_buff *skb)
1503 {
1504         struct rtable *rt;
1505
1506         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1507
1508         rt = (struct rtable *) skb->dst;
1509         if (rt)
1510                 dst_set_expires(&rt->u.dst, 0);
1511 }
1512
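     /* Output stub installed on routes that must only be delivered
      * locally; reaching it means someone transmitted over an
      * input-only route.
      */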
1513 static int ip_rt_bug(struct sk_buff *skb)
1514 {
1515         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1516                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1517                 skb->dev ? skb->dev->name : "?");
1518         kfree_skb(skb);
1519         return 0;
1520 }
1521
1522 /*
1523    We do not cache the source address of the outgoing interface,
1524    because it is used only by the IP RR, TS and SRR options,
1525    so it is out of the fast path.
1526
1527    BTW remember: "addr" is allowed to be unaligned
1528    in IP options!
1529  */
1530
1531 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1532 {
1533         u32 src;
1534         struct fib_result res;
1535
1536         if (rt->fl.iif == 0)
1537                 src = rt->rt_src;
1538         else if (fib_lookup(&rt->fl, &res) == 0) {
1539                 src = FIB_RES_PREFSRC(res);
1540                 fib_res_put(&res);
1541         } else
1542                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1543                                         RT_SCOPE_UNIVERSE);
1544         memcpy(addr, &src, 4);
1545 }
1546
1547 #ifdef CONFIG_NET_CLS_ROUTE
1548 static void set_class_tag(struct rtable *rt, u32 tag)
1549 {
1550         if (!(rt->u.dst.tclassid & 0xFFFF))
1551                 rt->u.dst.tclassid |= tag & 0xFFFF;
1552         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1553                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1554 }
1555 #endif
1556
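     /* Fill in the nexthop-derived fields of a fresh cache entry:
      * gateway, FIB metrics, and sane defaults for MTU, hoplimit and
      * advmss, plus the classid tag and route type.
      */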
1557 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1558 {
1559         struct fib_info *fi = res->fi;
1560
1561         if (fi) {
1562                 if (FIB_RES_GW(*res) &&
1563                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1564                         rt->rt_gateway = FIB_RES_GW(*res);
1565                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1566                        sizeof(rt->u.dst.metrics));
1567                 if (fi->fib_mtu == 0) {
1568                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1569                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1570                             rt->rt_gateway != rt->rt_dst &&
1571                             rt->u.dst.dev->mtu > 576)
1572                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1573                 }
1574 #ifdef CONFIG_NET_CLS_ROUTE
1575                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1576 #endif
1577         } else
1578                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1579
1580         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1581                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1582         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1583                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1584         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1585                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1586                                        ip_rt_min_advmss);
1587         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1588                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1589
1590 #ifdef CONFIG_NET_CLS_ROUTE
1591 #ifdef CONFIG_IP_MULTIPLE_TABLES
1592         set_class_tag(rt, fib_rules_tclass(res));
1593 #endif
1594         set_class_tag(rt, itag);
1595 #endif
1596         rt->rt_type = res->type;
1597 }
1598
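     /* Slow path for multicast input: validate the source address and
      * build a cache entry that delivers locally and/or, with
      * CONFIG_IP_MROUTE, feeds the multicast forwarding engine.
      */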
1599 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1600                                 u8 tos, struct net_device *dev, int our)
1601 {
1602         unsigned hash;
1603         struct rtable *rth;
1604         u32 spec_dst;
1605         struct in_device *in_dev = in_dev_get(dev);
1606         u32 itag = 0;
1607
1608         /* Primary sanity checks. */
1609
1610         if (in_dev == NULL)
1611                 return -EINVAL;
1612
1613         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1614             skb->protocol != htons(ETH_P_IP))
1615                 goto e_inval;
1616
1617         if (ZERONET(saddr)) {
1618                 if (!LOCAL_MCAST(daddr))
1619                         goto e_inval;
1620                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1621         } else if (fib_validate_source(saddr, 0, tos, 0,
1622                                         dev, &spec_dst, &itag) < 0)
1623                 goto e_inval;
1624
1625         rth = dst_alloc(&ipv4_dst_ops);
1626         if (!rth)
1627                 goto e_nobufs;
1628
1629         rth->u.dst.output= ip_rt_bug;
1630
1631         atomic_set(&rth->u.dst.__refcnt, 1);
1632         rth->u.dst.flags= DST_HOST;
1633         if (in_dev->cnf.no_policy)
1634                 rth->u.dst.flags |= DST_NOPOLICY;
1635         rth->fl.fl4_dst = daddr;
1636         rth->rt_dst     = daddr;
1637         rth->fl.fl4_tos = tos;
1638 #ifdef CONFIG_IP_ROUTE_FWMARK
1639         rth->fl.fl4_fwmark= skb->nfmark;
1640 #endif
1641         rth->fl.fl4_src = saddr;
1642         rth->rt_src     = saddr;
1643 #ifdef CONFIG_NET_CLS_ROUTE
1644         rth->u.dst.tclassid = itag;
1645 #endif
1646         rth->rt_iif     =
1647         rth->fl.iif     = dev->ifindex;
1648         rth->u.dst.dev  = &loopback_dev;
1649         dev_hold(rth->u.dst.dev);
1650         rth->idev       = in_dev_get(rth->u.dst.dev);
1651         rth->fl.oif     = 0;
1652         rth->rt_gateway = daddr;
1653         rth->rt_spec_dst= spec_dst;
1654         rth->rt_type    = RTN_MULTICAST;
1655         rth->rt_flags   = RTCF_MULTICAST;
1656         if (our) {
1657                 rth->u.dst.input= ip_local_deliver;
1658                 rth->rt_flags |= RTCF_LOCAL;
1659         }
1660
1661 #ifdef CONFIG_IP_MROUTE
1662         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1663                 rth->u.dst.input = ip_mr_input;
1664 #endif
1665         RT_CACHE_STAT_INC(in_slow_mc);
1666
1667         in_dev_put(in_dev);
1668         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1669         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1670
1671 e_nobufs:
1672         in_dev_put(in_dev);
1673         return -ENOBUFS;
1674
1675 e_inval:
1676         in_dev_put(in_dev);
1677         return -EINVAL;
1678 }
1679
1680
1681 static void ip_handle_martian_source(struct net_device *dev,
1682                                      struct in_device *in_dev,
1683                                      struct sk_buff *skb,
1684                                      u32 daddr,
1685                                      u32 saddr) 
1686 {
1687         RT_CACHE_STAT_INC(in_martian_src);
1688 #ifdef CONFIG_IP_ROUTE_VERBOSE
1689         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1690                 /*
1691                  *      RFC1812 recommendation, if source is martian,
1692                  *      the only hint is MAC header.
1693                  */
1694                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1695                         "%u.%u.%u.%u, on dev %s\n",
1696                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1697                 if (dev->hard_header_len && skb->mac.raw) {
1698                         int i;
1699                         unsigned char *p = skb->mac.raw;
1700                         printk(KERN_WARNING "ll header: ");
1701                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1702                                 printk("%02x", *p);
1703                                 if (i < (dev->hard_header_len - 1))
1704                                         printk(":");
1705                         }
1706                         printk("\n");
1707                 }
1708         }
1709 #endif
1710 }
1711
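     /* Build a forwarding cache entry for the input path: validate the
      * source address against the FIB, work out the redirect flags,
      * then allocate and fill the rtable.
      */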
1712 static inline int __mkroute_input(struct sk_buff *skb, 
1713                                   struct fib_result* res, 
1714                                   struct in_device *in_dev, 
1715                                   u32 daddr, u32 saddr, u32 tos, 
1716                                   struct rtable **result) 
1717 {
1718
1719         struct rtable *rth;
1720         int err;
1721         struct in_device *out_dev;
1722         unsigned flags = 0;
1723         u32 spec_dst, itag;
1724
1725         /* get a working reference to the output device */
1726         out_dev = in_dev_get(FIB_RES_DEV(*res));
1727         if (out_dev == NULL) {
1728                 if (net_ratelimit())
1729                         printk(KERN_CRIT "Bug in ip_route_input" \
1730                                "_slow(). Please, report\n");
1731                 return -EINVAL;
1732         }
1733
1734
1735         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1736                                   in_dev->dev, &spec_dst, &itag);
1737         if (err < 0) {
1738                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1739                                          saddr);
1740                 
1741                 err = -EINVAL;
1742                 goto cleanup;
1743         }
1744
1745         if (err)
1746                 flags |= RTCF_DIRECTSRC;
1747
1748         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1749             (IN_DEV_SHARED_MEDIA(out_dev) ||
1750              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1751                 flags |= RTCF_DOREDIRECT;
1752
1753         if (skb->protocol != htons(ETH_P_IP)) {
1754                 /* Not IP (i.e. ARP). Do not create a route if it is
1755                  * invalid for proxy ARP. DNAT routes are always valid.
1756                  */
1757                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1758                         err = -EINVAL;
1759                         goto cleanup;
1760                 }
1761         }
1762
1763
1764         rth = dst_alloc(&ipv4_dst_ops);
1765         if (!rth) {
1766                 err = -ENOBUFS;
1767                 goto cleanup;
1768         }
1769
1770         atomic_set(&rth->u.dst.__refcnt, 1);
1771         rth->u.dst.flags= DST_HOST;
1772 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1773         if (res->fi->fib_nhs > 1)
1774                 rth->u.dst.flags |= DST_BALANCED;
1775 #endif
1776         if (in_dev->cnf.no_policy)
1777                 rth->u.dst.flags |= DST_NOPOLICY;
1778         if (in_dev->cnf.no_xfrm)
1779                 rth->u.dst.flags |= DST_NOXFRM;
1780         rth->fl.fl4_dst = daddr;
1781         rth->rt_dst     = daddr;
1782         rth->fl.fl4_tos = tos;
1783 #ifdef CONFIG_IP_ROUTE_FWMARK
1784         rth->fl.fl4_fwmark= skb->nfmark;
1785 #endif
1786         rth->fl.fl4_src = saddr;
1787         rth->rt_src     = saddr;
1788         rth->rt_gateway = daddr;
1789         rth->rt_iif     =
1790                 rth->fl.iif     = in_dev->dev->ifindex;
1791         rth->u.dst.dev  = (out_dev)->dev;
1792         dev_hold(rth->u.dst.dev);
1793         rth->idev       = in_dev_get(rth->u.dst.dev);
1794         rth->fl.oif     = 0;
1795         rth->rt_spec_dst= spec_dst;
1796
1797         rth->u.dst.input = ip_forward;
1798         rth->u.dst.output = ip_output;
1799
1800         rt_set_nexthop(rth, res, itag);
1801
1802         rth->rt_flags = flags;
1803
1804         *result = rth;
1805         err = 0;
1806  cleanup:
1807         /* release the working reference to the output device */
1808         in_dev_put(out_dev);
1809         return err;
1810 }                                               
1811
1812 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1813                                        struct fib_result* res, 
1814                                        const struct flowi *fl,
1815                                        struct in_device *in_dev,
1816                                        u32 daddr, u32 saddr, u32 tos)
1817 {
1818         struct rtable* rth = NULL;
1819         int err;
1820         unsigned hash;
1821
1822 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1823         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1824                 fib_select_multipath(fl, res);
1825 #endif
1826
1827         /* create a routing cache entry */
1828         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1829         if (err)
1830                 return err;
1831
1832         /* put it into the cache */
1833         hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1834         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1835 }
1836
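     /* With CONFIG_IP_ROUTE_MULTIPATH_CACHED, intern one cache entry
      * per alternative nexthop so the multipath algorithm can choose
      * between them; otherwise this collapses to the default
      * single-entry version above.
      */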
1837 static inline int ip_mkroute_input(struct sk_buff *skb, 
1838                                    struct fib_result* res, 
1839                                    const struct flowi *fl,
1840                                    struct in_device *in_dev,
1841                                    u32 daddr, u32 saddr, u32 tos)
1842 {
1843 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1844         struct rtable* rth = NULL, *rtres;
1845         unsigned char hop, hopcount;
1846         int err = -EINVAL;
1847         unsigned int hash;
1848
1849         if (res->fi)
1850                 hopcount = res->fi->fib_nhs;
1851         else
1852                 hopcount = 1;
1853
1854         /* distinguish between multipath and singlepath */
1855         if (hopcount < 2)
1856                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1857                                             saddr, tos);
1858         
1859         /* add all alternatives to the routing cache */
1860         for (hop = 0; hop < hopcount; hop++) {
1861                 res->nh_sel = hop;
1862
1863                 /* put reference to previous result */
1864                 if (hop)
1865                         ip_rt_put(rtres);
1866
1867                 /* create a routing cache entry */
1868                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1869                                       &rth);
1870                 if (err)
1871                         return err;
1872
1873                 /* put it into the cache */
1874                 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1875                 err = rt_intern_hash(hash, rth, &rtres);
1876                 if (err)
1877                         return err;
1878
1879                 /* forward hop information to multipath impl. */
1880                 multipath_set_nhinfo(rth,
1881                                      FIB_RES_NETWORK(*res),
1882                                      FIB_RES_NETMASK(*res),
1883                                      res->prefixlen,
1884                                      &FIB_RES_NH(*res));
1885         }
1886         skb->dst = &rtres->u.dst;
1887         return err;
1888 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1889         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1890 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1891 }
1892
1893
1894 /*
1895  *      NOTE. We drop all packets that have local source
1896  *      addresses, because every properly looped-back packet
1897  *      must already have the correct destination attached by the output routine.
1898  *
1899  *      This approach solves two big problems:
1900  *      1. Non-simplex devices are handled properly.
1901  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1902  */
1903
1904 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1905                                u8 tos, struct net_device *dev)
1906 {
1907         struct fib_result res;
1908         struct in_device *in_dev = in_dev_get(dev);
1909         struct flowi fl = { .nl_u = { .ip4_u =
1910                                       { .daddr = daddr,
1911                                         .saddr = saddr,
1912                                         .tos = tos,
1913                                         .scope = RT_SCOPE_UNIVERSE,
1914 #ifdef CONFIG_IP_ROUTE_FWMARK
1915                                         .fwmark = skb->nfmark
1916 #endif
1917                                       } },
1918                             .iif = dev->ifindex };
1919         unsigned        flags = 0;
1920         u32             itag = 0;
1921         struct rtable * rth;
1922         unsigned        hash;
1923         u32             spec_dst;
1924         int             err = -EINVAL;
1925         int             free_res = 0;
1926
1927         /* IP on this device is disabled. */
1928
1929         if (!in_dev)
1930                 goto out;
1931
1932         /* Check for the most weird martians, which can be not detected
1933            by fib_lookup.
1934          */
1935
1936         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1937                 goto martian_source;
1938
1939         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1940                 goto brd_input;
1941
1942         /* Accept zero addresses only for limited broadcast;
1943          * I do not even know whether to fix this or not. Waiting for complaints :-)
1944          */
1945         if (ZERONET(saddr))
1946                 goto martian_source;
1947
1948         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1949                 goto martian_destination;
1950
1951         /*
1952          *      Now we are ready to route packet.
1953          */
1954         if ((err = fib_lookup(&fl, &res)) != 0) {
1955                 if (!IN_DEV_FORWARD(in_dev))
1956                         goto e_hostunreach;
1957                 goto no_route;
1958         }
1959         free_res = 1;
1960
1961         RT_CACHE_STAT_INC(in_slow_tot);
1962
1963         if (res.type == RTN_BROADCAST)
1964                 goto brd_input;
1965
1966         if (res.type == RTN_LOCAL) {
1967                 int result;
1968                 result = fib_validate_source(saddr, daddr, tos,
1969                                              loopback_dev.ifindex,
1970                                              dev, &spec_dst, &itag);
1971                 if (result < 0)
1972                         goto martian_source;
1973                 if (result)
1974                         flags |= RTCF_DIRECTSRC;
1975                 spec_dst = daddr;
1976                 goto local_input;
1977         }
1978
1979         if (!IN_DEV_FORWARD(in_dev))
1980                 goto e_hostunreach;
1981         if (res.type != RTN_UNICAST)
1982                 goto martian_destination;
1983
1984         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1985         if (err == -ENOBUFS)
1986                 goto e_nobufs;
1987         if (err == -EINVAL)
1988                 goto e_inval;
1989         
1990 done:
1991         in_dev_put(in_dev);
1992         if (free_res)
1993                 fib_res_put(&res);
1994 out:    return err;
1995
1996 brd_input:
1997         if (skb->protocol != htons(ETH_P_IP))
1998                 goto e_inval;
1999
2000         if (ZERONET(saddr))
2001                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2002         else {
2003                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2004                                           &itag);
2005                 if (err < 0)
2006                         goto martian_source;
2007                 if (err)
2008                         flags |= RTCF_DIRECTSRC;
2009         }
2010         flags |= RTCF_BROADCAST;
2011         res.type = RTN_BROADCAST;
2012         RT_CACHE_STAT_INC(in_brd);
2013
2014 local_input:
2015         rth = dst_alloc(&ipv4_dst_ops);
2016         if (!rth)
2017                 goto e_nobufs;
2018
2019         rth->u.dst.output= ip_rt_bug;
2020
2021         atomic_set(&rth->u.dst.__refcnt, 1);
2022         rth->u.dst.flags= DST_HOST;
2023         if (in_dev->cnf.no_policy)
2024                 rth->u.dst.flags |= DST_NOPOLICY;
2025         rth->fl.fl4_dst = daddr;
2026         rth->rt_dst     = daddr;
2027         rth->fl.fl4_tos = tos;
2028 #ifdef CONFIG_IP_ROUTE_FWMARK
2029         rth->fl.fl4_fwmark= skb->nfmark;
2030 #endif
2031         rth->fl.fl4_src = saddr;
2032         rth->rt_src     = saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034         rth->u.dst.tclassid = itag;
2035 #endif
2036         rth->rt_iif     =
2037         rth->fl.iif     = dev->ifindex;
2038         rth->u.dst.dev  = &loopback_dev;
2039         dev_hold(rth->u.dst.dev);
2040         rth->idev       = in_dev_get(rth->u.dst.dev);
2041         rth->rt_gateway = daddr;
2042         rth->rt_spec_dst= spec_dst;
2043         rth->u.dst.input= ip_local_deliver;
2044         rth->rt_flags   = flags|RTCF_LOCAL;
2045         if (res.type == RTN_UNREACHABLE) {
2046                 rth->u.dst.input= ip_error;
2047                 rth->u.dst.error= -err;
2048                 rth->rt_flags   &= ~RTCF_LOCAL;
2049         }
2050         rth->rt_type    = res.type;
2051         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2052         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053         goto done;
2054
2055 no_route:
2056         RT_CACHE_STAT_INC(in_no_route);
2057         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058         res.type = RTN_UNREACHABLE;
2059         goto local_input;
2060
2061         /*
2062          *      Do not cache martian addresses: they should be logged (RFC1812)
2063          */
2064 martian_destination:
2065         RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069                         "%u.%u.%u.%u, dev %s\n",
2070                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071 #endif
2072
2073 e_hostunreach:
2074         err = -EHOSTUNREACH;
2075         goto done;
2076
2077 e_inval:
2078         err = -EINVAL;
2079         goto done;
2080
2081 e_nobufs:
2082         err = -ENOBUFS;
2083         goto done;
2084
2085 martian_source:
2086         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087         goto e_inval;
2088 }
2089
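     /* Input fast path: look the packet up in the route cache, keyed on
      * (daddr, saddr, iif, tos and, optionally, fwmark); fall back to
      * the slow path on a miss.  Multicast is special-cased below.
      */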
2090 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2091                    u8 tos, struct net_device *dev)
2092 {
2093         struct rtable * rth;
2094         unsigned        hash;
2095         int iif = dev->ifindex;
2096
2097         tos &= IPTOS_RT_MASK;
2098         hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2099
2100         rcu_read_lock();
2101         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102              rth = rcu_dereference(rth->u.rt_next)) {
2103                 if (rth->fl.fl4_dst == daddr &&
2104                     rth->fl.fl4_src == saddr &&
2105                     rth->fl.iif == iif &&
2106                     rth->fl.oif == 0 &&
2107 #ifdef CONFIG_IP_ROUTE_FWMARK
2108                     rth->fl.fl4_fwmark == skb->nfmark &&
2109 #endif
2110                     rth->fl.fl4_tos == tos) {
2111                         rth->u.dst.lastuse = jiffies;
2112                         dst_hold(&rth->u.dst);
2113                         rth->u.dst.__use++;
2114                         RT_CACHE_STAT_INC(in_hit);
2115                         rcu_read_unlock();
2116                         skb->dst = (struct dst_entry*)rth;
2117                         return 0;
2118                 }
2119                 RT_CACHE_STAT_INC(in_hlist_search);
2120         }
2121         rcu_read_unlock();
2122
2123         /* Multicast recognition logic is moved from the route cache to here.
2124            The problem was that too many Ethernet cards have broken/missing
2125            hardware multicast filters :-( As a result, a host on a multicast
2126            network acquires a lot of useless route cache entries, such as
2127            SDR messages from all over the world. Now we try to get rid of them.
2128            Really, provided the software IP multicast filter is organized
2129            reasonably (at least, hashed), it does not result in a slowdown
2130            compared with route cache reject entries.
2131            Note that multicast routers are not affected, because a
2132            route cache entry is created eventually.
2133          */
2134         if (MULTICAST(daddr)) {
2135                 struct in_device *in_dev;
2136
2137                 rcu_read_lock();
2138                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2139                         int our = ip_check_mc(in_dev, daddr, saddr,
2140                                 skb->nh.iph->protocol);
2141                         if (our
2142 #ifdef CONFIG_IP_MROUTE
2143                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2144 #endif
2145                             ) {
2146                                 rcu_read_unlock();
2147                                 return ip_route_input_mc(skb, daddr, saddr,
2148                                                          tos, dev, our);
2149                         }
2150                 }
2151                 rcu_read_unlock();
2152                 return -EINVAL;
2153         }
2154         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2155 }
2156
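     /* Build an output cache entry for the given FIB result: classify
      * broadcast/multicast/local destinations, allocate the rtable and
      * wire up the input/output handlers accordingly.
      */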
2157 static inline int __mkroute_output(struct rtable **result,
2158                                    struct fib_result* res, 
2159                                    const struct flowi *fl,
2160                                    const struct flowi *oldflp, 
2161                                    struct net_device *dev_out, 
2162                                    unsigned flags) 
2163 {
2164         struct rtable *rth;
2165         struct in_device *in_dev;
2166         u32 tos = RT_FL_TOS(oldflp);
2167         int err = 0;
2168
2169         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2170                 return -EINVAL;
2171
2172         if (fl->fl4_dst == 0xFFFFFFFF)
2173                 res->type = RTN_BROADCAST;
2174         else if (MULTICAST(fl->fl4_dst))
2175                 res->type = RTN_MULTICAST;
2176         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2177                 return -EINVAL;
2178
2179         if (dev_out->flags & IFF_LOOPBACK)
2180                 flags |= RTCF_LOCAL;
2181
2182         /* get work reference to inet device */
2183         in_dev = in_dev_get(dev_out);
2184         if (!in_dev)
2185                 return -EINVAL;
2186
2187         if (res->type == RTN_BROADCAST) {
2188                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2189                 if (res->fi) {
2190                         fib_info_put(res->fi);
2191                         res->fi = NULL;
2192                 }
2193         } else if (res->type == RTN_MULTICAST) {
2194                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2195                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2196                                  oldflp->proto))
2197                         flags &= ~RTCF_LOCAL;
2198                 /* If a multicast route does not exist, use the
2199                    default one, but do not gateway in this case.
2200                    Yes, it is a hack.
2201                  */
2202                 if (res->fi && res->prefixlen < 4) {
2203                         fib_info_put(res->fi);
2204                         res->fi = NULL;
2205                 }
2206         }
2207
2208
2209         rth = dst_alloc(&ipv4_dst_ops);
2210         if (!rth) {
2211                 err = -ENOBUFS;
2212                 goto cleanup;
2213         }               
2214
2215         atomic_set(&rth->u.dst.__refcnt, 1);
2216         rth->u.dst.flags= DST_HOST;
2217 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2218         if (res->fi) {
2219                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2220                 if (res->fi->fib_nhs > 1)
2221                         rth->u.dst.flags |= DST_BALANCED;
2222         }
2223 #endif
2224         if (in_dev->cnf.no_xfrm)
2225                 rth->u.dst.flags |= DST_NOXFRM;
2226         if (in_dev->cnf.no_policy)
2227                 rth->u.dst.flags |= DST_NOPOLICY;
2228
2229         rth->fl.fl4_dst = oldflp->fl4_dst;
2230         rth->fl.fl4_tos = tos;
2231         rth->fl.fl4_src = oldflp->fl4_src;
2232         rth->fl.oif     = oldflp->oif;
2233 #ifdef CONFIG_IP_ROUTE_FWMARK
2234         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2235 #endif
2236         rth->rt_dst     = fl->fl4_dst;
2237         rth->rt_src     = fl->fl4_src;
2238         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2239         /* get references to the devices that are to be held by the routing
2240            cache entry */
2241         rth->u.dst.dev  = dev_out;
2242         dev_hold(dev_out);
2243         rth->idev       = in_dev_get(dev_out);
2244         rth->rt_gateway = fl->fl4_dst;
2245         rth->rt_spec_dst= fl->fl4_src;
2246
2247         rth->u.dst.output=ip_output;
2248
2249         RT_CACHE_STAT_INC(out_slow_tot);
2250
2251         if (flags & RTCF_LOCAL) {
2252                 rth->u.dst.input = ip_local_deliver;
2253                 rth->rt_spec_dst = fl->fl4_dst;
2254         }
2255         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2256                 rth->rt_spec_dst = fl->fl4_src;
2257                 if (flags & RTCF_LOCAL && 
2258                     !(dev_out->flags & IFF_LOOPBACK)) {
2259                         rth->u.dst.output = ip_mc_output;
2260                         RT_CACHE_STAT_INC(out_slow_mc);
2261                 }
2262 #ifdef CONFIG_IP_MROUTE
2263                 if (res->type == RTN_MULTICAST) {
2264                         if (IN_DEV_MFORWARD(in_dev) &&
2265                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2266                                 rth->u.dst.input = ip_mr_input;
2267                                 rth->u.dst.output = ip_mc_output;
2268                         }
2269                 }
2270 #endif
2271         }
2272
2273         rt_set_nexthop(rth, res, 0);
2274
2275         rth->rt_flags = flags;
2276
2277         *result = rth;
2278  cleanup:
2279         /* release work reference to inet device */
2280         in_dev_put(in_dev);
2281
2282         return err;
2283 }
2284
2285 static inline int ip_mkroute_output_def(struct rtable **rp,
2286                                         struct fib_result* res,
2287                                         const struct flowi *fl,
2288                                         const struct flowi *oldflp,
2289                                         struct net_device *dev_out,
2290                                         unsigned flags)
2291 {
2292         struct rtable *rth = NULL;
2293         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2294         unsigned hash;
2295         if (err == 0) {
2296                 hash = rt_hash_code(oldflp->fl4_dst, 
2297                                     oldflp->fl4_src ^ (oldflp->oif << 5));
2298                 err = rt_intern_hash(hash, rth, rp);
2299         }
2300         
2301         return err;
2302 }
2303
2304 static inline int ip_mkroute_output(struct rtable** rp,
2305                                     struct fib_result* res,
2306                                     const struct flowi *fl,
2307                                     const struct flowi *oldflp,
2308                                     struct net_device *dev_out,
2309                                     unsigned flags)
2310 {
2311 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2312         unsigned char hop;
2313         unsigned hash;
2314         int err = -EINVAL;
2315         struct rtable *rth = NULL;
2316
2317         if (res->fi && res->fi->fib_nhs > 1) {
2318                 unsigned char hopcount = res->fi->fib_nhs;
2319
2320                 for (hop = 0; hop < hopcount; hop++) {
2321                         struct net_device *dev2nexthop;
2322
2323                         res->nh_sel = hop;
2324
2325                         /* hold a work reference to the output device */
2326                         dev2nexthop = FIB_RES_DEV(*res);
2327                         dev_hold(dev2nexthop);
2328
2329                         /* put reference to previous result */
2330                         if (hop)
2331                                 ip_rt_put(*rp);
2332
2333                         err = __mkroute_output(&rth, res, fl, oldflp,
2334                                                dev2nexthop, flags);
2335
2336                         if (err != 0)
2337                                 goto cleanup;
2338
2339                         hash = rt_hash_code(oldflp->fl4_dst, 
2340                                             oldflp->fl4_src ^
2341                                             (oldflp->oif << 5));
2342                         err = rt_intern_hash(hash, rth, rp);
2343
2344                         /* forward hop information to multipath impl. */
2345                         multipath_set_nhinfo(rth,
2346                                              FIB_RES_NETWORK(*res),
2347                                              FIB_RES_NETMASK(*res),
2348                                              res->prefixlen,
2349                                              &FIB_RES_NH(*res));
2350                 cleanup:
2351                         /* release work reference to output device */
2352                         dev_put(dev2nexthop);
2353
2354                         if (err != 0)
2355                                 return err;
2356                 }
2357                 return err;
2358         } else {
2359                 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2360                                              flags);
2361         }
2362 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2363         return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2364 #endif
2365 }
2366
2367 /*
2368  * Major route resolver routine.
2369  */
2370
2371 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2372 {
2373         u32 tos = RT_FL_TOS(oldflp);
2374         struct flowi fl = { .nl_u = { .ip4_u =
2375                                       { .daddr = oldflp->fl4_dst,
2376                                         .saddr = oldflp->fl4_src,
2377                                         .tos = tos & IPTOS_RT_MASK,
2378                                         .scope = ((tos & RTO_ONLINK) ?
2379                                                   RT_SCOPE_LINK :
2380                                                   RT_SCOPE_UNIVERSE),
2381 #ifdef CONFIG_IP_ROUTE_FWMARK
2382                                         .fwmark = oldflp->fl4_fwmark
2383 #endif
2384                                       } },
2385                             .iif = loopback_dev.ifindex,
2386                             .oif = oldflp->oif };
2387         struct fib_result res;
2388         unsigned flags = 0;
2389         struct net_device *dev_out = NULL;
2390         int free_res = 0;
2391         int err;
2392
2393
2394         res.fi          = NULL;
2395 #ifdef CONFIG_IP_MULTIPLE_TABLES
2396         res.r           = NULL;
2397 #endif
2398
2399         if (oldflp->fl4_src) {
2400                 err = -EINVAL;
2401                 if (MULTICAST(oldflp->fl4_src) ||
2402                     BADCLASS(oldflp->fl4_src) ||
2403                     ZERONET(oldflp->fl4_src))
2404                         goto out;
2405
2406                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2407                 dev_out = ip_dev_find(oldflp->fl4_src);
2408                 if (dev_out == NULL)
2409                         goto out;
2410
2411                 /* I removed the check for oif == dev_out->oif here.
2412                    It was wrong for two reasons:
2413                    1. ip_dev_find(saddr) can return the wrong iface if saddr
2414                       is assigned to multiple interfaces.
2415                    2. Moreover, we are allowed to send packets with the saddr
2416                       of another iface. --ANK
2417                  */
2418
2419                 if (oldflp->oif == 0
2420                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2421                         /* Special hack: the user can direct multicasts
2422                            and limited broadcast via the necessary interface
2423                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2424                            This hack is not just for fun, it allows
2425                            vic, vat and friends to work.
2426                            They bind a socket to loopback, set the ttl to zero
2427                            and expect that it will work.
2428                            From the viewpoint of the routing cache they are broken,
2429                            because we are not allowed to build a multicast path
2430                            with a loopback source addr (look, the routing cache
2431                            cannot know that the ttl is zero, so the packet
2432                            will not leave this host and the route is valid).
2433                            Luckily, this hack is a good workaround.
2434                          */
2435
2436                         fl.oif = dev_out->ifindex;
2437                         goto make_route;
2438                 }
2439                 if (dev_out)
2440                         dev_put(dev_out);
2441                 dev_out = NULL;
2442         }
2443
2444
2445         if (oldflp->oif) {
2446                 dev_out = dev_get_by_index(oldflp->oif);
2447                 err = -ENODEV;
2448                 if (dev_out == NULL)
2449                         goto out;
2450
2451                 /* RACE: Check return value of inet_select_addr instead. */
2452                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2453                         dev_put(dev_out);
2454                         goto out;       /* Wrong error code */
2455                 }
2456
2457                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2458                         if (!fl.fl4_src)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_LINK);
2461                         goto make_route;
2462                 }
2463                 if (!fl.fl4_src) {
2464                         if (MULTICAST(oldflp->fl4_dst))
2465                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2466                                                               fl.fl4_scope);
2467                         else if (!oldflp->fl4_dst)
2468                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2469                                                               RT_SCOPE_HOST);
2470                 }
2471         }
2472
2473         if (!fl.fl4_dst) {
2474                 fl.fl4_dst = fl.fl4_src;
2475                 if (!fl.fl4_dst)
2476                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2477                 if (dev_out)
2478                         dev_put(dev_out);
2479                 dev_out = &loopback_dev;
2480                 dev_hold(dev_out);
2481                 fl.oif = loopback_dev.ifindex;
2482                 res.type = RTN_LOCAL;
2483                 flags |= RTCF_LOCAL;
2484                 goto make_route;
2485         }
2486
2487         if (fib_lookup(&fl, &res)) {
2488                 res.fi = NULL;
2489                 if (oldflp->oif) {
2490                         /* Apparently, the routing tables are wrong. Assume
2491                            that the destination is on-link.
2492
2493                            WHY? DW.
2494                            Because we are allowed to send to an iface
2495                            even if it has NO routes and NO assigned
2496                            addresses. When oif is specified, the routing
2497                            tables are looked up with only one purpose:
2498                            to catch whether the destination is gatewayed
2499                            rather than direct. Moreover, if MSG_DONTROUTE is
2500                            set, we send the packet, ignoring both the routing
2501                            tables and the ifaddr state. --ANK
2502
2503
2504                            We could do this even when oif is unknown
2505                            (as IPv6 likely does), but we do not.
2506                          */
2507
2508                         if (fl.fl4_src == 0)
2509                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2510                                                               RT_SCOPE_LINK);
2511                         res.type = RTN_UNICAST;
2512                         goto make_route;
2513                 }
2514                 if (dev_out)
2515                         dev_put(dev_out);
2516                 err = -ENETUNREACH;
2517                 goto out;
2518         }
2519         free_res = 1;
2520
2521         if (res.type == RTN_LOCAL) {
2522                 if (!fl.fl4_src)
2523                         fl.fl4_src = fl.fl4_dst;
2524                 if (dev_out)
2525                         dev_put(dev_out);
2526                 dev_out = &loopback_dev;
2527                 dev_hold(dev_out);
2528                 fl.oif = dev_out->ifindex;
2529                 if (res.fi)
2530                         fib_info_put(res.fi);
2531                 res.fi = NULL;
2532                 flags |= RTCF_LOCAL;
2533                 goto make_route;
2534         }
2535
2536 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2537         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2538                 fib_select_multipath(&fl, &res);
2539         else
2540 #endif
2541         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2542                 fib_select_default(&fl, &res);
2543
2544         if (!fl.fl4_src)
2545                 fl.fl4_src = FIB_RES_PREFSRC(res);
2546
2547         if (dev_out)
2548                 dev_put(dev_out);
2549         dev_out = FIB_RES_DEV(res);
2550         dev_hold(dev_out);
2551         fl.oif = dev_out->ifindex;
2552
2553
2554 make_route:
2555         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2556
2557
2558         if (free_res)
2559                 fib_res_put(&res);
2560         if (dev_out)
2561                 dev_put(dev_out);
2562 out:    return err;
2563 }
2564
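     /* Output fast path: scan the cache bucket under rcu_read_lock_bh();
      * the tos comparison includes the RTO_ONLINK bit.  Falls back to
      * ip_route_output_slow() on a miss.
      */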
2565 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2566 {
2567         unsigned hash;
2568         struct rtable *rth;
2569
2570         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2571
2572         rcu_read_lock_bh();
2573         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2574                 rth = rcu_dereference(rth->u.rt_next)) {
2575                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2576                     rth->fl.fl4_src == flp->fl4_src &&
2577                     rth->fl.iif == 0 &&
2578                     rth->fl.oif == flp->oif &&
2579 #ifdef CONFIG_IP_ROUTE_FWMARK
2580                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2581 #endif
2582                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2583                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2584
2585                         /* check for multipath routes and choose one if
2586                          * necessary
2587                          */
2588                         if (multipath_select_route(flp, rth, rp)) {
2589                                 dst_hold(&(*rp)->u.dst);
2590                                 RT_CACHE_STAT_INC(out_hit);
2591                                 rcu_read_unlock_bh();
2592                                 return 0;
2593                         }
2594
2595                         rth->u.dst.lastuse = jiffies;
2596                         dst_hold(&rth->u.dst);
2597                         rth->u.dst.__use++;
2598                         RT_CACHE_STAT_INC(out_hit);
2599                         rcu_read_unlock_bh();
2600                         *rp = rth;
2601                         return 0;
2602                 }
2603                 RT_CACHE_STAT_INC(out_hlist_search);
2604         }
2605         rcu_read_unlock_bh();
2606
2607         return ip_route_output_slow(rp, flp);
2608 }
2609
2610 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2611
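     /* Resolve an output route and, when a transport protocol is given,
      * run the result through xfrm_lookup() so IPsec policy may wrap it.
      */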
2612 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2613 {
2614         int err;
2615
2616         if ((err = __ip_route_output_key(rp, flp)) != 0)
2617                 return err;
2618
2619         if (flp->proto) {
2620                 if (!flp->fl4_src)
2621                         flp->fl4_src = (*rp)->rt_src;
2622                 if (!flp->fl4_dst)
2623                         flp->fl4_dst = (*rp)->rt_dst;
2624                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2625         }
2626
2627         return 0;
2628 }
2629
2630 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2631
2632 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2633 {
2634         return ip_route_output_flow(rp, flp, NULL, 0);
2635 }
2636
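     /* Encode one route cache entry as an RTM_NEWROUTE netlink message,
      * including the RTA_CACHEINFO statistics block.
      */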
2637 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2638                         int nowait, unsigned int flags)
2639 {
2640         struct rtable *rt = (struct rtable*)skb->dst;
2641         struct rtmsg *r;
2642         struct nlmsghdr  *nlh;
2643         unsigned char    *b = skb->tail;
2644         struct rta_cacheinfo ci;
2645 #ifdef CONFIG_IP_MROUTE
2646         struct rtattr *eptr;
2647 #endif
2648         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2649         r = NLMSG_DATA(nlh);
2650         r->rtm_family    = AF_INET;
2651         r->rtm_dst_len  = 32;
2652         r->rtm_src_len  = 0;
2653         r->rtm_tos      = rt->fl.fl4_tos;
2654         r->rtm_table    = RT_TABLE_MAIN;
2655         r->rtm_type     = rt->rt_type;
2656         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2657         r->rtm_protocol = RTPROT_UNSPEC;
2658         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2659         if (rt->rt_flags & RTCF_NOTIFY)
2660                 r->rtm_flags |= RTM_F_NOTIFY;
2661         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2662         if (rt->fl.fl4_src) {
2663                 r->rtm_src_len = 32;
2664                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2665         }
2666         if (rt->u.dst.dev)
2667                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2668 #ifdef CONFIG_NET_CLS_ROUTE
2669         if (rt->u.dst.tclassid)
2670                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2671 #endif
2672 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2673         if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2674                 __u32 alg = rt->rt_multipath_alg;
2675
2676                 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2677         }
2678 #endif
2679         if (rt->fl.iif)
2680                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2681         else if (rt->rt_src != rt->fl.fl4_src)
2682                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2683         if (rt->rt_dst != rt->rt_gateway)
2684                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2685         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2686                 goto rtattr_failure;
2687         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2688         ci.rta_used     = rt->u.dst.__use;
2689         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2690         if (rt->u.dst.expires)
2691                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2692         else
2693                 ci.rta_expires = 0;
2694         ci.rta_error    = rt->u.dst.error;
2695         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2696         if (rt->peer) {
2697                 ci.rta_id = rt->peer->ip_id_count;
2698                 if (rt->peer->tcp_ts_stamp) {
2699                         ci.rta_ts = rt->peer->tcp_ts;
2700                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2701                 }
2702         }
2703 #ifdef CONFIG_IP_MROUTE
2704         eptr = (struct rtattr*)skb->tail;
2705 #endif
2706         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2707         if (rt->fl.iif) {
2708 #ifdef CONFIG_IP_MROUTE
2709                 u32 dst = rt->rt_dst;
2710
2711                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2712                     ipv4_devconf.mc_forwarding) {
2713                         int err = ipmr_get_route(skb, r, nowait);
2714                         if (err <= 0) {
2715                                 if (!nowait) {
2716                                         if (err == 0)
2717                                                 return 0;
2718                                         goto nlmsg_failure;
2719                                 } else {
2720                                         if (err == -EMSGSIZE)
2721                                                 goto nlmsg_failure;
2722                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2723                                 }
2724                         }
2725                 } else
2726 #endif
2727                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2728         }
2729
2730         nlh->nlmsg_len = skb->tail - b;
2731         return skb->len;
2732
2733 nlmsg_failure:
2734 rtattr_failure:
2735         skb_trim(skb, b - skb->data);
2736         return -1;
2737 }
2738
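     /* RTM_GETROUTE handler: build a dummy skb, resolve it through the
      * input path (when an iif is supplied) or the output path, and
      * unicast the resulting route back to the requester.
      */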
2739 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2740 {
2741         struct rtattr **rta = arg;
2742         struct rtmsg *rtm = NLMSG_DATA(nlh);
2743         struct rtable *rt = NULL;
2744         u32 dst = 0;
2745         u32 src = 0;
2746         int iif = 0;
2747         int err = -ENOBUFS;
2748         struct sk_buff *skb;
2749
2750         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2751         if (!skb)
2752                 goto out;
2753
2754         /* Reserve room for dummy headers; this skb can pass
2755            through a good chunk of the routing engine.
2756          */
2757         skb->mac.raw = skb->nh.raw = skb->data;
2758
2759         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2760         skb->nh.iph->protocol = IPPROTO_ICMP;
2761         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2762
2763         if (rta[RTA_SRC - 1])
2764                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2765         if (rta[RTA_DST - 1])
2766                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2767         if (rta[RTA_IIF - 1])
2768                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2769
2770         if (iif) {
2771                 struct net_device *dev = __dev_get_by_index(iif);
2772                 err = -ENODEV;
2773                 if (!dev)
2774                         goto out_free;
2775                 skb->protocol   = htons(ETH_P_IP);
2776                 skb->dev        = dev;
2777                 local_bh_disable();
2778                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2779                 local_bh_enable();
2780                 rt = (struct rtable*)skb->dst;
2781                 if (!err && rt->u.dst.error)
2782                         err = -rt->u.dst.error;
2783         } else {
2784                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2785                                                          .saddr = src,
2786                                                          .tos = rtm->rtm_tos } } };
2787                 int oif = 0;
2788                 if (rta[RTA_OIF - 1])
2789                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2790                 fl.oif = oif;
2791                 err = ip_route_output_key(&rt, &fl);
2792         }
2793         if (err)
2794                 goto out_free;
2795
2796         skb->dst = &rt->u.dst;
2797         if (rtm->rtm_flags & RTM_F_NOTIFY)
2798                 rt->rt_flags |= RTCF_NOTIFY;
2799
2800         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2801
2802         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2803                                 RTM_NEWROUTE, 0, 0);
2804         if (!err)
2805                 goto out_free;
2806         if (err < 0) {
2807                 err = -EMSGSIZE;
2808                 goto out_free;
2809         }
2810
2811         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2812         if (err > 0)
2813                 err = 0;
2814 out:    return err;
2815
2816 out_free:
2817         kfree_skb(skb);
2818         goto out;
2819 }
2820
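     /* Dump the whole route cache over netlink, resuming from the
      * (hash bucket, chain index) cursor saved in cb->args.
      */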
2821 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2822 {
2823         struct rtable *rt;
2824         int h, s_h;
2825         int idx, s_idx;
2826
2827         s_h = cb->args[0];
2828         s_idx = idx = cb->args[1];
2829         for (h = 0; h <= rt_hash_mask; h++) {
2830                 if (h < s_h) continue;
2831                 if (h > s_h)
2832                         s_idx = 0;
2833                 rcu_read_lock_bh();
2834                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2835                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2836                         if (idx < s_idx)
2837                                 continue;
2838                         skb->dst = dst_clone(&rt->u.dst);
2839                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2840                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2841                                          1, NLM_F_MULTI) <= 0) {
2842                                 dst_release(xchg(&skb->dst, NULL));
2843                                 rcu_read_unlock_bh();
2844                                 goto done;
2845                         }
2846                         dst_release(xchg(&skb->dst, NULL));
2847                 }
2848                 rcu_read_unlock_bh();
2849         }
2850
2851 done:
2852         cb->args[0] = h;
2853         cb->args[1] = idx;
2854         return skb->len;
2855 }
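#if 0
/*
 * Example (editor's sketch, not compiled): dumping the whole cache is
 * the same message with NLM_F_DUMP set; rtnetlink then calls
 * ip_rt_dump() repeatedly, resuming from cb->args[0] (hash chain) and
 * cb->args[1] (index within it).  Socket setup as in the sketch above.
 */
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
        } req = {
                .nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
                .nlh.nlmsg_type  = RTM_GETROUTE,
                .nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
                .rtm.rtm_family  = AF_INET,
        };

        sendto(fd, &req, req.nlh.nlmsg_len, 0,
               (struct sockaddr *)&kernel, sizeof(kernel));
        /* recv() until an NLMSG_DONE header arrives; each RTM_NEWROUTE
         * in between carries NLM_F_MULTI, matching the rt_fill_info()
         * call above. */
#endif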
2856
2857 void ip_rt_multicast_event(struct in_device *in_dev)
2858 {
2859         rt_cache_flush(0);
2860 }
2861
2862 #ifdef CONFIG_SYSCTL
2863 static int flush_delay;
2864
2865 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2866                                         struct file *filp, void __user *buffer,
2867                                         size_t *lenp, loff_t *ppos)
2868 {
2869         if (write) {
2870                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2871                 rt_cache_flush(flush_delay);
2872                 return 0;
2873         } 
2874
2875         return -EINVAL;
2876 }
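#if 0
/*
 * Example (editor's sketch, not compiled): the handler above only acts
 * on writes, so flushing the cache from userspace is a plain write to
 * the proc file; the integer written is passed to rt_cache_flush() as
 * the delay.
 */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        write(fd, "0\n", 2);    /* delay 0: flush immediately */
        close(fd);
        return 0;
}
#endif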
2877
2878 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2879                                                 int __user *name,
2880                                                 int nlen,
2881                                                 void __user *oldval,
2882                                                 size_t __user *oldlenp,
2883                                                 void __user *newval,
2884                                                 size_t newlen,
2885                                                 void **context)
2886 {
2887         int delay;
2888         if (newlen != sizeof(int))
2889                 return -EINVAL;
2890         if (get_user(delay, (int __user *)newval))
2891                 return -EFAULT; 
2892         rt_cache_flush(delay); 
2893         return 0;
2894 }
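#if 0
/*
 * Example (editor's sketch, not compiled): the same flush through the
 * binary sysctl(2) interface, which lands in the strategy routine
 * above rather than in the proc handler.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sysctl.h>

int main(void)
{
        int name[] = { CTL_NET, NET_IPV4, NET_IPV4_ROUTE,
                       NET_IPV4_ROUTE_FLUSH };
        int delay = 0;                          /* flush immediately */
        struct __sysctl_args args = {
                .name   = name,
                .nlen   = 4,
                .newval = &delay,
                .newlen = sizeof(delay),
        };

        syscall(SYS__sysctl, &args);
        return 0;
}
#endif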
2895
2896 ctl_table ipv4_route_table[] = {
2897         {
2898                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2899                 .procname       = "flush",
2900                 .data           = &flush_delay,
2901                 .maxlen         = sizeof(int),
2902                 .mode           = 0200,
2903                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2904                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2905         },
2906         {
2907                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2908                 .procname       = "min_delay",
2909                 .data           = &ip_rt_min_delay,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = &proc_dointvec_jiffies,
2913                 .strategy       = &sysctl_jiffies,
2914         },
2915         {
2916                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2917                 .procname       = "max_delay",
2918                 .data           = &ip_rt_max_delay,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = &proc_dointvec_jiffies,
2922                 .strategy       = &sysctl_jiffies,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2926                 .procname       = "gc_thresh",
2927                 .data           = &ipv4_dst_ops.gc_thresh,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2934                 .procname       = "max_size",
2935                 .data           = &ip_rt_max_size,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = &proc_dointvec,
2939         },
2940         {
2941                 /* Deprecated: use gc_min_interval_ms instead. */
2942  
2943                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2944                 .procname       = "gc_min_interval",
2945                 .data           = &ip_rt_gc_min_interval,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = &proc_dointvec_jiffies,
2949                 .strategy       = &sysctl_jiffies,
2950         },
2951         {
2952                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2953                 .procname       = "gc_min_interval_ms",
2954                 .data           = &ip_rt_gc_min_interval,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = &proc_dointvec_ms_jiffies,
2958                 .strategy       = &sysctl_ms_jiffies,
2959         },
2960         {
2961                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2962                 .procname       = "gc_timeout",
2963                 .data           = &ip_rt_gc_timeout,
2964                 .maxlen         = sizeof(int),
2965                 .mode           = 0644,
2966                 .proc_handler   = &proc_dointvec_jiffies,
2967                 .strategy       = &sysctl_jiffies,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2971                 .procname       = "gc_interval",
2972                 .data           = &ip_rt_gc_interval,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec_jiffies,
2976                 .strategy       = &sysctl_jiffies,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2980                 .procname       = "redirect_load",
2981                 .data           = &ip_rt_redirect_load,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec,
2985         },
2986         {
2987                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2988                 .procname       = "redirect_number",
2989                 .data           = &ip_rt_redirect_number,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = &proc_dointvec,
2993         },
2994         {
2995                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2996                 .procname       = "redirect_silence",
2997                 .data           = &ip_rt_redirect_silence,
2998                 .maxlen         = sizeof(int),
2999                 .mode           = 0644,
3000                 .proc_handler   = &proc_dointvec,
3001         },
3002         {
3003                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3004                 .procname       = "error_cost",
3005                 .data           = &ip_rt_error_cost,
3006                 .maxlen         = sizeof(int),
3007                 .mode           = 0644,
3008                 .proc_handler   = &proc_dointvec,
3009         },
3010         {
3011                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3012                 .procname       = "error_burst",
3013                 .data           = &ip_rt_error_burst,
3014                 .maxlen         = sizeof(int),
3015                 .mode           = 0644,
3016                 .proc_handler   = &proc_dointvec,
3017         },
3018         {
3019                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3020                 .procname       = "gc_elasticity",
3021                 .data           = &ip_rt_gc_elasticity,
3022                 .maxlen         = sizeof(int),
3023                 .mode           = 0644,
3024                 .proc_handler   = &proc_dointvec,
3025         },
3026         {
3027                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3028                 .procname       = "mtu_expires",
3029                 .data           = &ip_rt_mtu_expires,
3030                 .maxlen         = sizeof(int),
3031                 .mode           = 0644,
3032                 .proc_handler   = &proc_dointvec_jiffies,
3033                 .strategy       = &sysctl_jiffies,
3034         },
3035         {
3036                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3037                 .procname       = "min_pmtu",
3038                 .data           = &ip_rt_min_pmtu,
3039                 .maxlen         = sizeof(int),
3040                 .mode           = 0644,
3041                 .proc_handler   = &proc_dointvec,
3042         },
3043         {
3044                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3045                 .procname       = "min_adv_mss",
3046                 .data           = &ip_rt_min_advmss,
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0644,
3049                 .proc_handler   = &proc_dointvec,
3050         },
3051         {
3052                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3053                 .procname       = "secret_interval",
3054                 .data           = &ip_rt_secret_interval,
3055                 .maxlen         = sizeof(int),
3056                 .mode           = 0644,
3057                 .proc_handler   = &proc_dointvec_jiffies,
3058                 .strategy       = &sysctl_jiffies,
3059         },
3060         { .ctl_name = 0 }
3061 };
3062 #endif
3063
3064 #ifdef CONFIG_NET_CLS_ROUTE
3065 struct ip_rt_acct *ip_rt_acct;
3066
3067 /* This code sucks.  But you should have seen it before! --RR */
3068
3069 /* IP route accounting ptr for this logical cpu number. */
3070 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3071
3072 #ifdef CONFIG_PROC_FS
3073 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3074                            int length, int *eof, void *data)
3075 {
3076         unsigned int i;
3077
3078         if ((offset & 3) || (length & 3))
3079                 return -EIO;
3080
3081         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3082                 *eof = 1;
3083                 return 0;
3084         }
3085
3086         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3087                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3088                 *eof = 1;
3089         }
3090
3091         offset /= sizeof(u32);
3092
3093         if (length > 0) {
3094                 u32 *src;
3095                 u32 *dst = (u32 *) buffer;
3096
3097                 /* Zero first; the loop below sums every cpu, cpu 0 included. */
3098                 *start = buffer;
3099                 memset(dst, 0, length);
3100
3101                 /* Add each possible cpu in, one int at a time */
3102                 for_each_possible_cpu(i) {
3103                         unsigned int j;
3104
3105                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3106
3107                         for (j = 0; j < length/4; j++)
3108                                 dst[j] += src[j];
3109                 }
3110         }
3111         return length;
3112 }
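#if 0
/*
 * Example (editor's sketch, not compiled): userspace view of the
 * counters served above.  /proc/net/rt_acct is a binary array of 256
 * per-realm records, already summed across cpus; the field layout
 * below mirrors struct ip_rt_acct and is an assumption of this sketch.
 */
#include <stdio.h>

struct rt_acct_entry {                  /* mirrors struct ip_rt_acct */
        unsigned int o_bytes, o_packets, i_bytes, i_packets;
};

int main(void)
{
        struct rt_acct_entry acct[256];
        FILE *f = fopen("/proc/net/rt_acct", "r");
        unsigned int i;

        fread(acct, sizeof(acct), 1, f);
        for (i = 0; i < 256; i++)
                if (acct[i].i_packets || acct[i].o_packets)
                        printf("realm %3u: %u pkts in, %u pkts out\n",
                               i, acct[i].i_packets, acct[i].o_packets);
        fclose(f);
        return 0;
}
#endif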
3113 #endif /* CONFIG_PROC_FS */
3114 #endif /* CONFIG_NET_CLS_ROUTE */
3115
3116 static __initdata unsigned long rhash_entries;
3117 static int __init set_rhash_entries(char *str)
3118 {
3119         if (!str)
3120                 return 0;
3121         rhash_entries = simple_strtoul(str, &str, 0);
3122         return 1;
3123 }
3124 __setup("rhash_entries=", set_rhash_entries);
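/*
 * Example: booting with "rhash_entries=262144" on the kernel command
 * line pre-sizes the route cache hash instead of letting
 * alloc_large_system_hash() below derive it from available memory.
 */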
3125
3126 int __init ip_rt_init(void)
3127 {
3128         int rc = 0;
3129
3130         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3131                              (jiffies ^ (jiffies >> 7)));
3132
3133 #ifdef CONFIG_NET_CLS_ROUTE
3134         {
3135         int order;
3136         for (order = 0;
3137              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3138                 /* NOTHING */;
3139         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3140         if (!ip_rt_acct)
3141                 panic("IP: failed to allocate ip_rt_acct\n");
3142         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3143         }
3144 #endif
3145
3146         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3147                                                      sizeof(struct rtable),
3148                                                      0, SLAB_HWCACHE_ALIGN,
3149                                                      NULL, NULL);
3150
3151         if (!ipv4_dst_ops.kmem_cachep)
3152                 panic("IP: failed to allocate ip_dst_cache\n");
3153
3154         rt_hash_table = (struct rt_hash_bucket *)
3155                 alloc_large_system_hash("IP route cache",
3156                                         sizeof(struct rt_hash_bucket),
3157                                         rhash_entries,
3158                                         (num_physpages >= 128 * 1024) ?
3159                                         15 : 17,
3160                                         0,
3161                                         &rt_hash_log,
3162                                         &rt_hash_mask,
3163                                         0);
3164         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3165         rt_hash_lock_init();
3166
3167         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3168         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3169
3170         devinet_init();
3171         ip_fib_init();
3172
3173         init_timer(&rt_flush_timer);
3174         rt_flush_timer.function = rt_run_flush;
3175         init_timer(&rt_periodic_timer);
3176         rt_periodic_timer.function = rt_check_expire;
3177         init_timer(&rt_secret_timer);
3178         rt_secret_timer.function = rt_secret_rebuild;
3179
3180         /* All the timers started at system startup tend to
3181            synchronize. Perturb them a bit: the first expiry lands
3182            randomly in [interval, 2*interval). */
3183         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3184                                         ip_rt_gc_interval;
3185         add_timer(&rt_periodic_timer);
3186
3187         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3188                 ip_rt_secret_interval;
3189         add_timer(&rt_secret_timer);
3190
3191 #ifdef CONFIG_PROC_FS
3192         {
3193         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3194         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3195             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3196                                              proc_net_stat))) {
3197                 return -ENOMEM;
3198         }
3199         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3200         }
3201 #ifdef CONFIG_NET_CLS_ROUTE
3202         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3203 #endif
3204 #endif
3205 #ifdef CONFIG_XFRM
3206         xfrm_init();
3207         xfrm4_init();
3208 #endif
3209         return rc;
3210 }
3211
3212 EXPORT_SYMBOL(__ip_select_ident);
3213 EXPORT_SYMBOL(ip_route_input);
3214 EXPORT_SYMBOL(ip_route_output_key);