[IPV4]: ip_build_and_send_pkt() annotations
[linux-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K 
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *              
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * Mask the fl4_tos field of the flow key pointed to by @oldflp down to
 * the IPTOS_RT_MASK bits plus the RTO_ONLINK flag, as a u32.
 *
 * The argument is parenthesised so that any pointer expression
 * (e.g. "p + 1" or "&fl") expands correctly.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_min_delay              = 2 * HZ;
120 static int ip_rt_max_delay              = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval            = 60 * HZ;
124 static int ip_rt_gc_min_interval        = HZ / 2;
125 static int ip_rt_redirect_number        = 9;
126 static int ip_rt_redirect_load          = HZ / 50;
127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost             = HZ;
129 static int ip_rt_error_burst            = 5 * HZ;
130 static int ip_rt_gc_elasticity          = 8;
131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
133 static int ip_rt_min_advmss             = 256;
134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136
137 #define RTprint(a...)   printk(KERN_DEBUG a)
138
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142
143 /*
144  *      Interface to generic destination cache.
145  */
146
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static void              ipv4_dst_ifdown(struct dst_entry *dst,
150                                          struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void              ipv4_link_failure(struct sk_buff *skb);
153 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155
156
157 static struct dst_ops ipv4_dst_ops = {
158         .family =               AF_INET,
159         .protocol =             __constant_htons(ETH_P_IP),
160         .gc =                   rt_garbage_collect,
161         .check =                ipv4_dst_check,
162         .destroy =              ipv4_dst_destroy,
163         .ifdown =               ipv4_dst_ifdown,
164         .negative_advice =      ipv4_negative_advice,
165         .link_failure =         ipv4_link_failure,
166         .update_pmtu =          ip_rt_update_pmtu,
167         .entry_size =           sizeof(struct rtable),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
/* One slot of the route cache hash table: head of a chain of rtables
 * linked through u.rt_next.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash bucket @slot.  Buckets share
 * locks: the slot index is folded into the lock table with a mask.
 * The expansion is fully parenthesised so it is safe inside any
 * larger expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if the allocation
 * fails.  Wrapped in do { } while (0) so that "rt_hash_lock_init();"
 * is a single statement in every context (e.g. an unbraced if/else).
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the lock address is NULL. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
245
246 static struct rt_hash_bucket    *rt_hash_table;
247 static unsigned                 rt_hash_mask;
248 static int                      rt_hash_log;
249 static unsigned int             rt_hash_rnd;
250
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253         (__raw_get_cpu_var(rt_cache_stat).field++)
254
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256                                 struct rtable **res);
257
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 {
260         return (jhash_2words(daddr, saddr, rt_hash_rnd)
261                 & rt_hash_mask);
262 }
263
/*
 * Bucket index for a (daddr, saddr, interface index) triple.  The
 * addresses are force-cast from __be32 to u32 and the index, shifted
 * left by 5, is XORed into the source address before hashing.
 */
#define rt_hash(daddr, saddr, idx)				\
	rt_hash_code((__force u32)(__be32)(daddr),		\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267
268 #ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file: the hash
 * bucket currently being walked (counts down to 0).
 */
struct rt_cache_iter_state {
	int	bucket;
};
272
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 {
275         struct rtable *r = NULL;
276         struct rt_cache_iter_state *st = seq->private;
277
278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279                 rcu_read_lock_bh();
280                 r = rt_hash_table[st->bucket].chain;
281                 if (r)
282                         break;
283                 rcu_read_unlock_bh();
284         }
285         return r;
286 }
287
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 {
290         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291
292         r = r->u.rt_next;
293         while (!r) {
294                 rcu_read_unlock_bh();
295                 if (--st->bucket < 0)
296                         break;
297                 rcu_read_lock_bh();
298                 r = rt_hash_table[st->bucket].chain;
299         }
300         return r;
301 }
302
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 {
305         struct rtable *r = rt_cache_get_first(seq);
306
307         if (r)
308                 while (pos && (r = rt_cache_get_next(seq, r)))
309                         --pos;
310         return pos ? NULL : r;
311 }
312
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 {
315         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316 }
317
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 {
320         struct rtable *r = NULL;
321
322         if (v == SEQ_START_TOKEN)
323                 r = rt_cache_get_first(seq);
324         else
325                 r = rt_cache_get_next(seq, v);
326         ++*pos;
327         return r;
328 }
329
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 {
332         if (v && v != SEQ_START_TOKEN)
333                 rcu_read_unlock_bh();
334 }
335
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 {
338         if (v == SEQ_START_TOKEN)
339                 seq_printf(seq, "%-127s\n",
340                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342                            "HHUptod\tSpecDst");
343         else {
344                 struct rtable *r = v;
345                 char temp[256];
346
347                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349                         r->u.dst.dev ? r->u.dst.dev->name : "*",
350                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
353                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355                         dst_metric(&r->u.dst, RTAX_WINDOW),
356                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
358                         r->fl.fl4_tos,
359                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361                                        dev_queue_xmit) : 0,
362                         r->rt_spec_dst);
363                 seq_printf(seq, "%-127s\n", temp);
364         }
365         return 0;
366 }
367
368 static struct seq_operations rt_cache_seq_ops = {
369         .start  = rt_cache_seq_start,
370         .next   = rt_cache_seq_next,
371         .stop   = rt_cache_seq_stop,
372         .show   = rt_cache_seq_show,
373 };
374
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 {
377         struct seq_file *seq;
378         int rc = -ENOMEM;
379         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380
381         if (!s)
382                 goto out;
383         rc = seq_open(file, &rt_cache_seq_ops);
384         if (rc)
385                 goto out_kfree;
386         seq          = file->private_data;
387         seq->private = s;
388         memset(s, 0, sizeof(*s));
389 out:
390         return rc;
391 out_kfree:
392         kfree(s);
393         goto out;
394 }
395
396 static struct file_operations rt_cache_seq_fops = {
397         .owner   = THIS_MODULE,
398         .open    = rt_cache_seq_open,
399         .read    = seq_read,
400         .llseek  = seq_lseek,
401         .release = seq_release_private,
402 };
403
404
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406 {
407         int cpu;
408
409         if (*pos == 0)
410                 return SEQ_START_TOKEN;
411
412         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413                 if (!cpu_possible(cpu))
414                         continue;
415                 *pos = cpu+1;
416                 return &per_cpu(rt_cache_stat, cpu);
417         }
418         return NULL;
419 }
420
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422 {
423         int cpu;
424
425         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426                 if (!cpu_possible(cpu))
427                         continue;
428                 *pos = cpu+1;
429                 return &per_cpu(rt_cache_stat, cpu);
430         }
431         return NULL;
432         
433 }
434
/* seq_file .stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
439
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 {
442         struct rt_cache_stat *st = v;
443
444         if (v == SEQ_START_TOKEN) {
445                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
446                 return 0;
447         }
448         
449         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
450                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451                    atomic_read(&ipv4_dst_ops.entries),
452                    st->in_hit,
453                    st->in_slow_tot,
454                    st->in_slow_mc,
455                    st->in_no_route,
456                    st->in_brd,
457                    st->in_martian_dst,
458                    st->in_martian_src,
459
460                    st->out_hit,
461                    st->out_slow_tot,
462                    st->out_slow_mc, 
463
464                    st->gc_total,
465                    st->gc_ignored,
466                    st->gc_goal_miss,
467                    st->gc_dst_overflow,
468                    st->in_hlist_search,
469                    st->out_hlist_search
470                 );
471         return 0;
472 }
473
474 static struct seq_operations rt_cpu_seq_ops = {
475         .start  = rt_cpu_seq_start,
476         .next   = rt_cpu_seq_next,
477         .stop   = rt_cpu_seq_stop,
478         .show   = rt_cpu_seq_show,
479 };
480
481
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 {
484         return seq_open(file, &rt_cpu_seq_ops);
485 }
486
487 static struct file_operations rt_cpu_seq_fops = {
488         .owner   = THIS_MODULE,
489         .open    = rt_cpu_seq_open,
490         .read    = seq_read,
491         .llseek  = seq_lseek,
492         .release = seq_release,
493 };
494
495 #endif /* CONFIG_PROC_FS */
496   
497 static __inline__ void rt_free(struct rtable *rt)
498 {
499         multipath_remove(rt);
500         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501 }
502
503 static __inline__ void rt_drop(struct rtable *rt)
504 {
505         multipath_remove(rt);
506         ip_rt_put(rt);
507         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508 }
509
510 static __inline__ int rt_fast_clean(struct rtable *rth)
511 {
512         /* Kill broadcast/multicast entries very aggresively, if they
513            collide in hash table with more useful entries */
514         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515                 rth->fl.iif && rth->u.rt_next;
516 }
517
518 static __inline__ int rt_valuable(struct rtable *rth)
519 {
520         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521                 rth->u.dst.expires;
522 }
523
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525 {
526         unsigned long age;
527         int ret = 0;
528
529         if (atomic_read(&rth->u.dst.__refcnt))
530                 goto out;
531
532         ret = 1;
533         if (rth->u.dst.expires &&
534             time_after_eq(jiffies, rth->u.dst.expires))
535                 goto out;
536
537         age = jiffies - rth->u.dst.lastuse;
538         ret = 0;
539         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540             (age <= tmo2 && rt_valuable(rth)))
541                 goto out;
542         ret = 1;
543 out:    return ret;
544 }
545
546 /* Bits of score are:
547  * 31: very valuable
548  * 30: not quite useless
549  * 29..0: usage counter
550  */
551 static inline u32 rt_score(struct rtable *rt)
552 {
553         u32 score = jiffies - rt->u.dst.lastuse;
554
555         score = ~score & ~(3<<30);
556
557         if (rt_valuable(rt))
558                 score |= (1<<31);
559
560         if (!rt->fl.iif ||
561             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562                 score |= (1<<30);
563
564         return score;
565 }
566
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568 {
569         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
570                fl1->oif     == fl2->oif &&
571                fl1->iif     == fl2->iif;
572 }
573
574 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
575 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
576                                                 struct rtable *expentry,
577                                                 int *removed_count)
578 {
579         int passedexpired = 0;
580         struct rtable **nextstep = NULL;
581         struct rtable **rthp = chain_head;
582         struct rtable *rth;
583
584         if (removed_count)
585                 *removed_count = 0;
586
587         while ((rth = *rthp) != NULL) {
588                 if (rth == expentry)
589                         passedexpired = 1;
590
591                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
592                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
593                         if (*rthp == expentry) {
594                                 *rthp = rth->u.rt_next;
595                                 continue;
596                         } else {
597                                 *rthp = rth->u.rt_next;
598                                 rt_free(rth);
599                                 if (removed_count)
600                                         ++(*removed_count);
601                         }
602                 } else {
603                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
604                             passedexpired && !nextstep)
605                                 nextstep = &rth->u.rt_next;
606
607                         rthp = &rth->u.rt_next;
608                 }
609         }
610
611         rt_free(expentry);
612         if (removed_count)
613                 ++(*removed_count);
614
615         return nextstep;
616 }
617 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
618
619
620 /* This runs via a timer and thus is always in BH context. */
621 static void rt_check_expire(unsigned long dummy)
622 {
623         static unsigned int rover;
624         unsigned int i = rover, goal;
625         struct rtable *rth, **rthp;
626         unsigned long now = jiffies;
627         u64 mult;
628
629         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
630         if (ip_rt_gc_timeout > 1)
631                 do_div(mult, ip_rt_gc_timeout);
632         goal = (unsigned int)mult;
633         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
634         for (; goal > 0; goal--) {
635                 unsigned long tmo = ip_rt_gc_timeout;
636
637                 i = (i + 1) & rt_hash_mask;
638                 rthp = &rt_hash_table[i].chain;
639
640                 if (*rthp == 0)
641                         continue;
642                 spin_lock(rt_hash_lock_addr(i));
643                 while ((rth = *rthp) != NULL) {
644                         if (rth->u.dst.expires) {
645                                 /* Entry is expired even if it is in use */
646                                 if (time_before_eq(now, rth->u.dst.expires)) {
647                                         tmo >>= 1;
648                                         rthp = &rth->u.rt_next;
649                                         continue;
650                                 }
651                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
652                                 tmo >>= 1;
653                                 rthp = &rth->u.rt_next;
654                                 continue;
655                         }
656
657                         /* Cleanup aged off entries. */
658 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
659                         /* remove all related balanced entries if necessary */
660                         if (rth->u.dst.flags & DST_BALANCED) {
661                                 rthp = rt_remove_balanced_route(
662                                         &rt_hash_table[i].chain,
663                                         rth, NULL);
664                                 if (!rthp)
665                                         break;
666                         } else {
667                                 *rthp = rth->u.rt_next;
668                                 rt_free(rth);
669                         }
670 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
671                         *rthp = rth->u.rt_next;
672                         rt_free(rth);
673 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
674                 }
675                 spin_unlock(rt_hash_lock_addr(i));
676
677                 /* Fallback loop breaker. */
678                 if (time_after(jiffies, now))
679                         break;
680         }
681         rover = i;
682         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
683 }
684
685 /* This can run from both BH and non-BH contexts, the latter
686  * in the case of a forced flush event.
687  */
688 static void rt_run_flush(unsigned long dummy)
689 {
690         int i;
691         struct rtable *rth, *next;
692
693         rt_deadline = 0;
694
695         get_random_bytes(&rt_hash_rnd, 4);
696
697         for (i = rt_hash_mask; i >= 0; i--) {
698                 spin_lock_bh(rt_hash_lock_addr(i));
699                 rth = rt_hash_table[i].chain;
700                 if (rth)
701                         rt_hash_table[i].chain = NULL;
702                 spin_unlock_bh(rt_hash_lock_addr(i));
703
704                 for (; rth; rth = next) {
705                         next = rth->u.rt_next;
706                         rt_free(rth);
707                 }
708         }
709 }
710
711 static DEFINE_SPINLOCK(rt_flush_lock);
712
713 void rt_cache_flush(int delay)
714 {
715         unsigned long now = jiffies;
716         int user_mode = !in_softirq();
717
718         if (delay < 0)
719                 delay = ip_rt_min_delay;
720
721         /* flush existing multipath state*/
722         multipath_flush();
723
724         spin_lock_bh(&rt_flush_lock);
725
726         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
727                 long tmo = (long)(rt_deadline - now);
728
729                 /* If flush timer is already running
730                    and flush request is not immediate (delay > 0):
731
732                    if deadline is not achieved, prolongate timer to "delay",
733                    otherwise fire it at deadline time.
734                  */
735
736                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
737                         tmo = 0;
738                 
739                 if (delay > tmo)
740                         delay = tmo;
741         }
742
743         if (delay <= 0) {
744                 spin_unlock_bh(&rt_flush_lock);
745                 rt_run_flush(0);
746                 return;
747         }
748
749         if (rt_deadline == 0)
750                 rt_deadline = now + ip_rt_max_delay;
751
752         mod_timer(&rt_flush_timer, now+delay);
753         spin_unlock_bh(&rt_flush_lock);
754 }
755
756 static void rt_secret_rebuild(unsigned long dummy)
757 {
758         unsigned long now = jiffies;
759
760         rt_cache_flush(0);
761         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
762 }
763
764 /*
765    Short description of GC goals.
766
767    We want to build an algorithm which keeps the routing cache
768    at some equilibrium point, where the number of aged-off entries
769    is kept approximately equal to the number of newly generated ones.
770
771    The current expiration strength is the variable "expire".
772    We try to adjust it dynamically, so that if networking
773    is idle, "expire" is large enough to keep enough warm entries,
774    and when load increases, it is reduced to limit the cache size.
775  */
776
777 static int rt_garbage_collect(void)
778 {
779         static unsigned long expire = RT_GC_TIMEOUT;
780         static unsigned long last_gc;
781         static int rover;
782         static int equilibrium;
783         struct rtable *rth, **rthp;
784         unsigned long now = jiffies;
785         int goal;
786
787         /*
788          * Garbage collection is pretty expensive,
789          * do not make it too frequently.
790          */
791
792         RT_CACHE_STAT_INC(gc_total);
793
794         if (now - last_gc < ip_rt_gc_min_interval &&
795             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
796                 RT_CACHE_STAT_INC(gc_ignored);
797                 goto out;
798         }
799
800         /* Calculate number of entries, which we want to expire now. */
801         goal = atomic_read(&ipv4_dst_ops.entries) -
802                 (ip_rt_gc_elasticity << rt_hash_log);
803         if (goal <= 0) {
804                 if (equilibrium < ipv4_dst_ops.gc_thresh)
805                         equilibrium = ipv4_dst_ops.gc_thresh;
806                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
807                 if (goal > 0) {
808                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
809                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
810                 }
811         } else {
812                 /* We are in dangerous area. Try to reduce cache really
813                  * aggressively.
814                  */
815                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
816                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
817         }
818
819         if (now - last_gc >= ip_rt_gc_min_interval)
820                 last_gc = now;
821
822         if (goal <= 0) {
823                 equilibrium += goal;
824                 goto work_done;
825         }
826
827         do {
828                 int i, k;
829
830                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
831                         unsigned long tmo = expire;
832
833                         k = (k + 1) & rt_hash_mask;
834                         rthp = &rt_hash_table[k].chain;
835                         spin_lock_bh(rt_hash_lock_addr(k));
836                         while ((rth = *rthp) != NULL) {
837                                 if (!rt_may_expire(rth, tmo, expire)) {
838                                         tmo >>= 1;
839                                         rthp = &rth->u.rt_next;
840                                         continue;
841                                 }
842 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
843                                 /* remove all related balanced entries
844                                  * if necessary
845                                  */
846                                 if (rth->u.dst.flags & DST_BALANCED) {
847                                         int r;
848
849                                         rthp = rt_remove_balanced_route(
850                                                 &rt_hash_table[k].chain,
851                                                 rth,
852                                                 &r);
853                                         goal -= r;
854                                         if (!rthp)
855                                                 break;
856                                 } else {
857                                         *rthp = rth->u.rt_next;
858                                         rt_free(rth);
859                                         goal--;
860                                 }
861 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
862                                 *rthp = rth->u.rt_next;
863                                 rt_free(rth);
864                                 goal--;
865 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866                         }
867                         spin_unlock_bh(rt_hash_lock_addr(k));
868                         if (goal <= 0)
869                                 break;
870                 }
871                 rover = k;
872
873                 if (goal <= 0)
874                         goto work_done;
875
876                 /* Goal is not achieved. We stop process if:
877
878                    - if expire reduced to zero. Otherwise, expire is halfed.
879                    - if table is not full.
880                    - if we are called from interrupt.
881                    - jiffies check is just fallback/debug loop breaker.
882                      We will not spin here for long time in any case.
883                  */
884
885                 RT_CACHE_STAT_INC(gc_goal_miss);
886
887                 if (expire == 0)
888                         break;
889
890                 expire >>= 1;
891 #if RT_CACHE_DEBUG >= 2
892                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
893                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
894 #endif
895
896                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
897                         goto out;
898         } while (!in_softirq() && time_before_eq(jiffies, now));
899
900         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901                 goto out;
902         if (net_ratelimit())
903                 printk(KERN_WARNING "dst cache overflow\n");
904         RT_CACHE_STAT_INC(gc_dst_overflow);
905         return 1;
906
907 work_done:
908         expire += ip_rt_gc_min_interval;
909         if (expire > ip_rt_gc_timeout ||
910             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
911                 expire = ip_rt_gc_timeout;
912 #if RT_CACHE_DEBUG >= 2
913         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
914                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
915 #endif
916 out:    return 0;
917 }
918
919 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
920 {
921         struct rtable   *rth, **rthp;
922         unsigned long   now;
923         struct rtable *cand, **candp;
924         u32             min_score;
925         int             chain_length;
926         int attempts = !in_softirq();
927
928 restart:
929         chain_length = 0;
930         min_score = ~(u32)0;
931         cand = NULL;
932         candp = NULL;
933         now = jiffies;
934
935         rthp = &rt_hash_table[hash].chain;
936
937         spin_lock_bh(rt_hash_lock_addr(hash));
938         while ((rth = *rthp) != NULL) {
939 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
940                 if (!(rth->u.dst.flags & DST_BALANCED) &&
941                     compare_keys(&rth->fl, &rt->fl)) {
942 #else
943                 if (compare_keys(&rth->fl, &rt->fl)) {
944 #endif
945                         /* Put it first */
946                         *rthp = rth->u.rt_next;
947                         /*
948                          * Since lookup is lockfree, the deletion
949                          * must be visible to another weakly ordered CPU before
950                          * the insertion at the start of the hash chain.
951                          */
952                         rcu_assign_pointer(rth->u.rt_next,
953                                            rt_hash_table[hash].chain);
954                         /*
955                          * Since lookup is lockfree, the update writes
956                          * must be ordered for consistency on SMP.
957                          */
958                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
959
960                         rth->u.dst.__use++;
961                         dst_hold(&rth->u.dst);
962                         rth->u.dst.lastuse = now;
963                         spin_unlock_bh(rt_hash_lock_addr(hash));
964
965                         rt_drop(rt);
966                         *rp = rth;
967                         return 0;
968                 }
969
970                 if (!atomic_read(&rth->u.dst.__refcnt)) {
971                         u32 score = rt_score(rth);
972
973                         if (score <= min_score) {
974                                 cand = rth;
975                                 candp = rthp;
976                                 min_score = score;
977                         }
978                 }
979
980                 chain_length++;
981
982                 rthp = &rth->u.rt_next;
983         }
984
985         if (cand) {
986                 /* ip_rt_gc_elasticity used to be average length of chain
987                  * length, when exceeded gc becomes really aggressive.
988                  *
989                  * The second limit is less certain. At the moment it allows
990                  * only 2 entries per bucket. We will see.
991                  */
992                 if (chain_length > ip_rt_gc_elasticity) {
993                         *candp = cand->u.rt_next;
994                         rt_free(cand);
995                 }
996         }
997
998         /* Try to bind route to arp only if it is output
999            route or unicast forwarding path.
1000          */
1001         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1002                 int err = arp_bind_neighbour(&rt->u.dst);
1003                 if (err) {
1004                         spin_unlock_bh(rt_hash_lock_addr(hash));
1005
1006                         if (err != -ENOBUFS) {
1007                                 rt_drop(rt);
1008                                 return err;
1009                         }
1010
1011                         /* Neighbour tables are full and nothing
1012                            can be released. Try to shrink route cache,
1013                            it is most likely it holds some neighbour records.
1014                          */
1015                         if (attempts-- > 0) {
1016                                 int saved_elasticity = ip_rt_gc_elasticity;
1017                                 int saved_int = ip_rt_gc_min_interval;
1018                                 ip_rt_gc_elasticity     = 1;
1019                                 ip_rt_gc_min_interval   = 0;
1020                                 rt_garbage_collect();
1021                                 ip_rt_gc_min_interval   = saved_int;
1022                                 ip_rt_gc_elasticity     = saved_elasticity;
1023                                 goto restart;
1024                         }
1025
1026                         if (net_ratelimit())
1027                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1028                         rt_drop(rt);
1029                         return -ENOBUFS;
1030                 }
1031         }
1032
1033         rt->u.rt_next = rt_hash_table[hash].chain;
1034 #if RT_CACHE_DEBUG >= 2
1035         if (rt->u.rt_next) {
1036                 struct rtable *trt;
1037                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1038                        NIPQUAD(rt->rt_dst));
1039                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1040                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1041                 printk("\n");
1042         }
1043 #endif
1044         rt_hash_table[hash].chain = rt;
1045         spin_unlock_bh(rt_hash_lock_addr(hash));
1046         *rp = rt;
1047         return 0;
1048 }
1049
1050 void rt_bind_peer(struct rtable *rt, int create)
1051 {
1052         static DEFINE_SPINLOCK(rt_peer_lock);
1053         struct inet_peer *peer;
1054
1055         peer = inet_getpeer(rt->rt_dst, create);
1056
1057         spin_lock_bh(&rt_peer_lock);
1058         if (rt->peer == NULL) {
1059                 rt->peer = peer;
1060                 peer = NULL;
1061         }
1062         spin_unlock_bh(&rt_peer_lock);
1063         if (peer)
1064                 inet_putpeer(peer);
1065 }
1066
1067 /*
1068  * Peer allocation may fail only in serious out-of-memory conditions.  However
1069  * we still can generate some output.
1070  * Random ID selection looks a bit dangerous because we have no chances to
1071  * select ID being unique in a reasonable period of time.
1072  * But broken packet identifier may be better than no packet at all.
1073  */
1074 static void ip_select_fb_ident(struct iphdr *iph)
1075 {
1076         static DEFINE_SPINLOCK(ip_fb_id_lock);
1077         static u32 ip_fallback_id;
1078         u32 salt;
1079
1080         spin_lock_bh(&ip_fb_id_lock);
1081         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1082         iph->id = htons(salt & 0xFFFF);
1083         ip_fallback_id = salt;
1084         spin_unlock_bh(&ip_fb_id_lock);
1085 }
1086
1087 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1088 {
1089         struct rtable *rt = (struct rtable *) dst;
1090
1091         if (rt) {
1092                 if (rt->peer == NULL)
1093                         rt_bind_peer(rt, 1);
1094
1095                 /* If peer is attached to destination, it is never detached,
1096                    so that we need not to grab a lock to dereference it.
1097                  */
1098                 if (rt->peer) {
1099                         iph->id = htons(inet_getid(rt->peer, more));
1100                         return;
1101                 }
1102         } else
1103                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 
1104                        __builtin_return_address(0));
1105
1106         ip_select_fb_ident(iph);
1107 }
1108
1109 static void rt_del(unsigned hash, struct rtable *rt)
1110 {
1111         struct rtable **rthp;
1112
1113         spin_lock_bh(rt_hash_lock_addr(hash));
1114         ip_rt_put(rt);
1115         for (rthp = &rt_hash_table[hash].chain; *rthp;
1116              rthp = &(*rthp)->u.rt_next)
1117                 if (*rthp == rt) {
1118                         *rthp = rt->u.rt_next;
1119                         rt_free(rt);
1120                         break;
1121                 }
1122         spin_unlock_bh(rt_hash_lock_addr(hash));
1123 }
1124
1125 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1126                     __be32 saddr, struct net_device *dev)
1127 {
1128         int i, k;
1129         struct in_device *in_dev = in_dev_get(dev);
1130         struct rtable *rth, **rthp;
1131         __be32  skeys[2] = { saddr, 0 };
1132         int  ikeys[2] = { dev->ifindex, 0 };
1133         struct netevent_redirect netevent;
1134
1135         if (!in_dev)
1136                 return;
1137
1138         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1139             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1140                 goto reject_redirect;
1141
1142         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1143                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1144                         goto reject_redirect;
1145                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1146                         goto reject_redirect;
1147         } else {
1148                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1149                         goto reject_redirect;
1150         }
1151
1152         for (i = 0; i < 2; i++) {
1153                 for (k = 0; k < 2; k++) {
1154                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1155
1156                         rthp=&rt_hash_table[hash].chain;
1157
1158                         rcu_read_lock();
1159                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1160                                 struct rtable *rt;
1161
1162                                 if (rth->fl.fl4_dst != daddr ||
1163                                     rth->fl.fl4_src != skeys[i] ||
1164                                     rth->fl.oif != ikeys[k] ||
1165                                     rth->fl.iif != 0) {
1166                                         rthp = &rth->u.rt_next;
1167                                         continue;
1168                                 }
1169
1170                                 if (rth->rt_dst != daddr ||
1171                                     rth->rt_src != saddr ||
1172                                     rth->u.dst.error ||
1173                                     rth->rt_gateway != old_gw ||
1174                                     rth->u.dst.dev != dev)
1175                                         break;
1176
1177                                 dst_hold(&rth->u.dst);
1178                                 rcu_read_unlock();
1179
1180                                 rt = dst_alloc(&ipv4_dst_ops);
1181                                 if (rt == NULL) {
1182                                         ip_rt_put(rth);
1183                                         in_dev_put(in_dev);
1184                                         return;
1185                                 }
1186
1187                                 /* Copy all the information. */
1188                                 *rt = *rth;
1189                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1190                                 rt->u.dst.__use         = 1;
1191                                 atomic_set(&rt->u.dst.__refcnt, 1);
1192                                 rt->u.dst.child         = NULL;
1193                                 if (rt->u.dst.dev)
1194                                         dev_hold(rt->u.dst.dev);
1195                                 if (rt->idev)
1196                                         in_dev_hold(rt->idev);
1197                                 rt->u.dst.obsolete      = 0;
1198                                 rt->u.dst.lastuse       = jiffies;
1199                                 rt->u.dst.path          = &rt->u.dst;
1200                                 rt->u.dst.neighbour     = NULL;
1201                                 rt->u.dst.hh            = NULL;
1202                                 rt->u.dst.xfrm          = NULL;
1203
1204                                 rt->rt_flags            |= RTCF_REDIRECTED;
1205
1206                                 /* Gateway is different ... */
1207                                 rt->rt_gateway          = new_gw;
1208
1209                                 /* Redirect received -> path was valid */
1210                                 dst_confirm(&rth->u.dst);
1211
1212                                 if (rt->peer)
1213                                         atomic_inc(&rt->peer->refcnt);
1214
1215                                 if (arp_bind_neighbour(&rt->u.dst) ||
1216                                     !(rt->u.dst.neighbour->nud_state &
1217                                             NUD_VALID)) {
1218                                         if (rt->u.dst.neighbour)
1219                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1220                                         ip_rt_put(rth);
1221                                         rt_drop(rt);
1222                                         goto do_next;
1223                                 }
1224                                 
1225                                 netevent.old = &rth->u.dst;
1226                                 netevent.new = &rt->u.dst;
1227                                 call_netevent_notifiers(NETEVENT_REDIRECT, 
1228                                                         &netevent);
1229
1230                                 rt_del(hash, rth);
1231                                 if (!rt_intern_hash(hash, rt, &rt))
1232                                         ip_rt_put(rt);
1233                                 goto do_next;
1234                         }
1235                         rcu_read_unlock();
1236                 do_next:
1237                         ;
1238                 }
1239         }
1240         in_dev_put(in_dev);
1241         return;
1242
1243 reject_redirect:
1244 #ifdef CONFIG_IP_ROUTE_VERBOSE
1245         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1246                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1247                         "%u.%u.%u.%u ignored.\n"
1248                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1249                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1250                        NIPQUAD(saddr), NIPQUAD(daddr));
1251 #endif
1252         in_dev_put(in_dev);
1253 }
1254
1255 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1256 {
1257         struct rtable *rt = (struct rtable*)dst;
1258         struct dst_entry *ret = dst;
1259
1260         if (rt) {
1261                 if (dst->obsolete) {
1262                         ip_rt_put(rt);
1263                         ret = NULL;
1264                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1265                            rt->u.dst.expires) {
1266                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1267                                                 rt->fl.oif);
1268 #if RT_CACHE_DEBUG >= 1
1269                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1270                                           "%u.%u.%u.%u/%02x dropped\n",
1271                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1272 #endif
1273                         rt_del(hash, rt);
1274                         ret = NULL;
1275                 }
1276         }
1277         return ret;
1278 }
1279
1280 /*
1281  * Algorithm:
1282  *      1. The first ip_rt_redirect_number redirects are sent
1283  *         with exponential backoff, then we stop sending them at all,
1284  *         assuming that the host ignores our redirects.
1285  *      2. If we did not see packets requiring redirects
1286  *         during ip_rt_redirect_silence, we assume that the host
1287  *         forgot redirected route and start to send redirects again.
1288  *
1289  * This algorithm is much cheaper and more intelligent than dumb load limiting
1290  * in icmp.c.
1291  *
1292  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1293  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1294  */
1295
1296 void ip_rt_send_redirect(struct sk_buff *skb)
1297 {
1298         struct rtable *rt = (struct rtable*)skb->dst;
1299         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1300
1301         if (!in_dev)
1302                 return;
1303
1304         if (!IN_DEV_TX_REDIRECTS(in_dev))
1305                 goto out;
1306
1307         /* No redirected packets during ip_rt_redirect_silence;
1308          * reset the algorithm.
1309          */
1310         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1311                 rt->u.dst.rate_tokens = 0;
1312
1313         /* Too many ignored redirects; do not send anything
1314          * set u.dst.rate_last to the last seen redirected packet.
1315          */
1316         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1317                 rt->u.dst.rate_last = jiffies;
1318                 goto out;
1319         }
1320
1321         /* Check for load limit; set rate_last to the latest sent
1322          * redirect.
1323          */
1324         if (time_after(jiffies,
1325                        (rt->u.dst.rate_last +
1326                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1327                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1328                 rt->u.dst.rate_last = jiffies;
1329                 ++rt->u.dst.rate_tokens;
1330 #ifdef CONFIG_IP_ROUTE_VERBOSE
1331                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1332                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1333                     net_ratelimit())
1334                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1335                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1336                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1337                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1338 #endif
1339         }
1340 out:
1341         in_dev_put(in_dev);
1342 }
1343
1344 static int ip_error(struct sk_buff *skb)
1345 {
1346         struct rtable *rt = (struct rtable*)skb->dst;
1347         unsigned long now;
1348         int code;
1349
1350         switch (rt->u.dst.error) {
1351                 case EINVAL:
1352                 default:
1353                         goto out;
1354                 case EHOSTUNREACH:
1355                         code = ICMP_HOST_UNREACH;
1356                         break;
1357                 case ENETUNREACH:
1358                         code = ICMP_NET_UNREACH;
1359                         break;
1360                 case EACCES:
1361                         code = ICMP_PKT_FILTERED;
1362                         break;
1363         }
1364
1365         now = jiffies;
1366         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1367         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1368                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1369         rt->u.dst.rate_last = now;
1370         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1371                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1372                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1373         }
1374
1375 out:    kfree_skb(skb);
1376         return 0;
1377
1378
/*
 *	MTU plateau values, in descending order.  The last two values
 *	are not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (step < end) {
		if (*step < old_mtu)
			return *step;
		step++;
	}
	return 68;
}
1396
1397 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1398 {
1399         int i;
1400         unsigned short old_mtu = ntohs(iph->tot_len);
1401         struct rtable *rth;
1402         __be32  skeys[2] = { iph->saddr, 0, };
1403         __be32  daddr = iph->daddr;
1404         unsigned short est_mtu = 0;
1405
1406         if (ipv4_config.no_pmtu_disc)
1407                 return 0;
1408
1409         for (i = 0; i < 2; i++) {
1410                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1411
1412                 rcu_read_lock();
1413                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1414                      rth = rcu_dereference(rth->u.rt_next)) {
1415                         if (rth->fl.fl4_dst == daddr &&
1416                             rth->fl.fl4_src == skeys[i] &&
1417                             rth->rt_dst  == daddr &&
1418                             rth->rt_src  == iph->saddr &&
1419                             rth->fl.iif == 0 &&
1420                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1421                                 unsigned short mtu = new_mtu;
1422
1423                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1424
1425                                         /* BSD 4.2 compatibility hack :-( */
1426                                         if (mtu == 0 &&
1427                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1428                                             old_mtu >= 68 + (iph->ihl << 2))
1429                                                 old_mtu -= iph->ihl << 2;
1430
1431                                         mtu = guess_mtu(old_mtu);
1432                                 }
1433                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1434                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1435                                                 dst_confirm(&rth->u.dst);
1436                                                 if (mtu < ip_rt_min_pmtu) {
1437                                                         mtu = ip_rt_min_pmtu;
1438                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1439                                                                 (1 << RTAX_MTU);
1440                                                 }
1441                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1442                                                 dst_set_expires(&rth->u.dst,
1443                                                         ip_rt_mtu_expires);
1444                                         }
1445                                         est_mtu = mtu;
1446                                 }
1447                         }
1448                 }
1449                 rcu_read_unlock();
1450         }
1451         return est_mtu ? : new_mtu;
1452 }
1453
1454 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1455 {
1456         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1457             !(dst_metric_locked(dst, RTAX_MTU))) {
1458                 if (mtu < ip_rt_min_pmtu) {
1459                         mtu = ip_rt_min_pmtu;
1460                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1461                 }
1462                 dst->metrics[RTAX_MTU-1] = mtu;
1463                 dst_set_expires(dst, ip_rt_mtu_expires);
1464                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1465         }
1466 }
1467
1468 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1469 {
1470         return NULL;
1471 }
1472
1473 static void ipv4_dst_destroy(struct dst_entry *dst)
1474 {
1475         struct rtable *rt = (struct rtable *) dst;
1476         struct inet_peer *peer = rt->peer;
1477         struct in_device *idev = rt->idev;
1478
1479         if (peer) {
1480                 rt->peer = NULL;
1481                 inet_putpeer(peer);
1482         }
1483
1484         if (idev) {
1485                 rt->idev = NULL;
1486                 in_dev_put(idev);
1487         }
1488 }
1489
1490 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1491                             int how)
1492 {
1493         struct rtable *rt = (struct rtable *) dst;
1494         struct in_device *idev = rt->idev;
1495         if (dev != &loopback_dev && idev && idev->dev == dev) {
1496                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1497                 if (loopback_idev) {
1498                         rt->idev = loopback_idev;
1499                         in_dev_put(idev);
1500                 }
1501         }
1502 }
1503
1504 static void ipv4_link_failure(struct sk_buff *skb)
1505 {
1506         struct rtable *rt;
1507
1508         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1509
1510         rt = (struct rtable *) skb->dst;
1511         if (rt)
1512                 dst_set_expires(&rt->u.dst, 0);
1513 }
1514
1515 static int ip_rt_bug(struct sk_buff *skb)
1516 {
1517         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1518                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1519                 skb->dev ? skb->dev->name : "?");
1520         kfree_skb(skb);
1521         return 0;
1522 }
1523
1524 /*
1525    We do not cache source address of outgoing interface,
1526    because it is used only by IP RR, TS and SRR options,
1527    so that it out of fast path.
1528
1529    BTW remember: "addr" is allowed to be not aligned
1530    in IP options!
1531  */
1532
1533 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1534 {
1535         __be32 src;
1536         struct fib_result res;
1537
1538         if (rt->fl.iif == 0)
1539                 src = rt->rt_src;
1540         else if (fib_lookup(&rt->fl, &res) == 0) {
1541                 src = FIB_RES_PREFSRC(res);
1542                 fib_res_put(&res);
1543         } else
1544                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1545                                         RT_SCOPE_UNIVERSE);
1546         memcpy(addr, &src, 4);
1547 }
1548
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * the corresponding halves of @tag.  A half that already holds a value
 * is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 current_id = rt->u.dst.tclassid;
	u32 fill_mask = 0;

	if (!(current_id & 0xFFFF))
		fill_mask |= 0xFFFF;
	if (!(current_id & 0xFFFF0000))
		fill_mask |= 0xFFFF0000;

	rt->u.dst.tclassid |= tag & fill_mask;
}
#endif
1558
1559 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1560 {
1561         struct fib_info *fi = res->fi;
1562
1563         if (fi) {
1564                 if (FIB_RES_GW(*res) &&
1565                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1566                         rt->rt_gateway = FIB_RES_GW(*res);
1567                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1568                        sizeof(rt->u.dst.metrics));
1569                 if (fi->fib_mtu == 0) {
1570                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1571                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1572                             rt->rt_gateway != rt->rt_dst &&
1573                             rt->u.dst.dev->mtu > 576)
1574                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1575                 }
1576 #ifdef CONFIG_NET_CLS_ROUTE
1577                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1578 #endif
1579         } else
1580                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1581
1582         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1583                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1584         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1585                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1586         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1587                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1588                                        ip_rt_min_advmss);
1589         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1590                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1591
1592 #ifdef CONFIG_NET_CLS_ROUTE
1593 #ifdef CONFIG_IP_MULTIPLE_TABLES
1594         set_class_tag(rt, fib_rules_tclass(res));
1595 #endif
1596         set_class_tag(rt, itag);
1597 #endif
1598         rt->rt_type = res->type;
1599 }
1600
1601 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1602                                 u8 tos, struct net_device *dev, int our)
1603 {
1604         unsigned hash;
1605         struct rtable *rth;
1606         __be32 spec_dst;
1607         struct in_device *in_dev = in_dev_get(dev);
1608         u32 itag = 0;
1609
1610         /* Primary sanity checks. */
1611
1612         if (in_dev == NULL)
1613                 return -EINVAL;
1614
1615         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1616             skb->protocol != htons(ETH_P_IP))
1617                 goto e_inval;
1618
1619         if (ZERONET(saddr)) {
1620                 if (!LOCAL_MCAST(daddr))
1621                         goto e_inval;
1622                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1623         } else if (fib_validate_source(saddr, 0, tos, 0,
1624                                         dev, &spec_dst, &itag) < 0)
1625                 goto e_inval;
1626
1627         rth = dst_alloc(&ipv4_dst_ops);
1628         if (!rth)
1629                 goto e_nobufs;
1630
1631         rth->u.dst.output= ip_rt_bug;
1632
1633         atomic_set(&rth->u.dst.__refcnt, 1);
1634         rth->u.dst.flags= DST_HOST;
1635         if (in_dev->cnf.no_policy)
1636                 rth->u.dst.flags |= DST_NOPOLICY;
1637         rth->fl.fl4_dst = daddr;
1638         rth->rt_dst     = daddr;
1639         rth->fl.fl4_tos = tos;
1640 #ifdef CONFIG_IP_ROUTE_FWMARK
1641         rth->fl.fl4_fwmark= skb->nfmark;
1642 #endif
1643         rth->fl.fl4_src = saddr;
1644         rth->rt_src     = saddr;
1645 #ifdef CONFIG_NET_CLS_ROUTE
1646         rth->u.dst.tclassid = itag;
1647 #endif
1648         rth->rt_iif     =
1649         rth->fl.iif     = dev->ifindex;
1650         rth->u.dst.dev  = &loopback_dev;
1651         dev_hold(rth->u.dst.dev);
1652         rth->idev       = in_dev_get(rth->u.dst.dev);
1653         rth->fl.oif     = 0;
1654         rth->rt_gateway = daddr;
1655         rth->rt_spec_dst= spec_dst;
1656         rth->rt_type    = RTN_MULTICAST;
1657         rth->rt_flags   = RTCF_MULTICAST;
1658         if (our) {
1659                 rth->u.dst.input= ip_local_deliver;
1660                 rth->rt_flags |= RTCF_LOCAL;
1661         }
1662
1663 #ifdef CONFIG_IP_MROUTE
1664         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1665                 rth->u.dst.input = ip_mr_input;
1666 #endif
1667         RT_CACHE_STAT_INC(in_slow_mc);
1668
1669         in_dev_put(in_dev);
1670         hash = rt_hash(daddr, saddr, dev->ifindex);
1671         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1672
1673 e_nobufs:
1674         in_dev_put(in_dev);
1675         return -ENOBUFS;
1676
1677 e_inval:
1678         in_dev_put(in_dev);
1679         return -EINVAL;
1680 }
1681
1682
1683 static void ip_handle_martian_source(struct net_device *dev,
1684                                      struct in_device *in_dev,
1685                                      struct sk_buff *skb,
1686                                      __be32 daddr,
1687                                      __be32 saddr)
1688 {
1689         RT_CACHE_STAT_INC(in_martian_src);
1690 #ifdef CONFIG_IP_ROUTE_VERBOSE
1691         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1692                 /*
1693                  *      RFC1812 recommendation, if source is martian,
1694                  *      the only hint is MAC header.
1695                  */
1696                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1697                         "%u.%u.%u.%u, on dev %s\n",
1698                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1699                 if (dev->hard_header_len && skb->mac.raw) {
1700                         int i;
1701                         unsigned char *p = skb->mac.raw;
1702                         printk(KERN_WARNING "ll header: ");
1703                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1704                                 printk("%02x", *p);
1705                                 if (i < (dev->hard_header_len - 1))
1706                                         printk(":");
1707                         }
1708                         printk("\n");
1709                 }
1710         }
1711 #endif
1712 }
1713
1714 static inline int __mkroute_input(struct sk_buff *skb, 
1715                                   struct fib_result* res, 
1716                                   struct in_device *in_dev, 
1717                                   __be32 daddr, __be32 saddr, u32 tos,
1718                                   struct rtable **result) 
1719 {
1720
1721         struct rtable *rth;
1722         int err;
1723         struct in_device *out_dev;
1724         unsigned flags = 0;
1725         __be32 spec_dst;
1726         u32 itag;
1727
1728         /* get a working reference to the output device */
1729         out_dev = in_dev_get(FIB_RES_DEV(*res));
1730         if (out_dev == NULL) {
1731                 if (net_ratelimit())
1732                         printk(KERN_CRIT "Bug in ip_route_input" \
1733                                "_slow(). Please, report\n");
1734                 return -EINVAL;
1735         }
1736
1737
1738         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1739                                   in_dev->dev, &spec_dst, &itag);
1740         if (err < 0) {
1741                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1742                                          saddr);
1743                 
1744                 err = -EINVAL;
1745                 goto cleanup;
1746         }
1747
1748         if (err)
1749                 flags |= RTCF_DIRECTSRC;
1750
1751         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1752             (IN_DEV_SHARED_MEDIA(out_dev) ||
1753              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1754                 flags |= RTCF_DOREDIRECT;
1755
1756         if (skb->protocol != htons(ETH_P_IP)) {
1757                 /* Not IP (i.e. ARP). Do not create route, if it is
1758                  * invalid for proxy arp. DNAT routes are always valid.
1759                  */
1760                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1761                         err = -EINVAL;
1762                         goto cleanup;
1763                 }
1764         }
1765
1766
1767         rth = dst_alloc(&ipv4_dst_ops);
1768         if (!rth) {
1769                 err = -ENOBUFS;
1770                 goto cleanup;
1771         }
1772
1773         atomic_set(&rth->u.dst.__refcnt, 1);
1774         rth->u.dst.flags= DST_HOST;
1775 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1776         if (res->fi->fib_nhs > 1)
1777                 rth->u.dst.flags |= DST_BALANCED;
1778 #endif
1779         if (in_dev->cnf.no_policy)
1780                 rth->u.dst.flags |= DST_NOPOLICY;
1781         if (in_dev->cnf.no_xfrm)
1782                 rth->u.dst.flags |= DST_NOXFRM;
1783         rth->fl.fl4_dst = daddr;
1784         rth->rt_dst     = daddr;
1785         rth->fl.fl4_tos = tos;
1786 #ifdef CONFIG_IP_ROUTE_FWMARK
1787         rth->fl.fl4_fwmark= skb->nfmark;
1788 #endif
1789         rth->fl.fl4_src = saddr;
1790         rth->rt_src     = saddr;
1791         rth->rt_gateway = daddr;
1792         rth->rt_iif     =
1793                 rth->fl.iif     = in_dev->dev->ifindex;
1794         rth->u.dst.dev  = (out_dev)->dev;
1795         dev_hold(rth->u.dst.dev);
1796         rth->idev       = in_dev_get(rth->u.dst.dev);
1797         rth->fl.oif     = 0;
1798         rth->rt_spec_dst= spec_dst;
1799
1800         rth->u.dst.input = ip_forward;
1801         rth->u.dst.output = ip_output;
1802
1803         rt_set_nexthop(rth, res, itag);
1804
1805         rth->rt_flags = flags;
1806
1807         *result = rth;
1808         err = 0;
1809  cleanup:
1810         /* release the working reference to the output device */
1811         in_dev_put(out_dev);
1812         return err;
1813 }                                               
1814
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1816                                        struct fib_result* res, 
1817                                        const struct flowi *fl,
1818                                        struct in_device *in_dev,
1819                                        __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821         struct rtable* rth = NULL;
1822         int err;
1823         unsigned hash;
1824
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827                 fib_select_multipath(fl, res);
1828 #endif
1829
1830         /* create a routing cache entry */
1831         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832         if (err)
1833                 return err;
1834
1835         /* put it into the cache */
1836         hash = rt_hash(daddr, saddr, fl->iif);
1837         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1838 }
1839
1840 static inline int ip_mkroute_input(struct sk_buff *skb, 
1841                                    struct fib_result* res, 
1842                                    const struct flowi *fl,
1843                                    struct in_device *in_dev,
1844                                    __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847         struct rtable* rth = NULL, *rtres;
1848         unsigned char hop, hopcount;
1849         int err = -EINVAL;
1850         unsigned int hash;
1851
1852         if (res->fi)
1853                 hopcount = res->fi->fib_nhs;
1854         else
1855                 hopcount = 1;
1856
1857         /* distinguish between multipath and singlepath */
1858         if (hopcount < 2)
1859                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860                                             saddr, tos);
1861         
1862         /* add all alternatives to the routing cache */
1863         for (hop = 0; hop < hopcount; hop++) {
1864                 res->nh_sel = hop;
1865
1866                 /* put reference to previous result */
1867                 if (hop)
1868                         ip_rt_put(rtres);
1869
1870                 /* create a routing cache entry */
1871                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872                                       &rth);
1873                 if (err)
1874                         return err;
1875
1876                 /* put it into the cache */
1877                 hash = rt_hash(daddr, saddr, fl->iif);
1878                 err = rt_intern_hash(hash, rth, &rtres);
1879                 if (err)
1880                         return err;
1881
1882                 /* forward hop information to multipath impl. */
1883                 multipath_set_nhinfo(rth,
1884                                      FIB_RES_NETWORK(*res),
1885                                      FIB_RES_NETMASK(*res),
1886                                      res->prefixlen,
1887                                      &FIB_RES_NH(*res));
1888         }
1889         skb->dst = &rtres->u.dst;
1890         return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895
1896
1897 /*
1898  *      NOTE. We drop all the packets that has local source
1899  *      addresses, because every properly looped back packet
1900  *      must have correct destination already attached by output routine.
1901  *
1902  *      Such approach solves two big problems:
1903  *      1. Not simplex devices are handled properly.
1904  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1905  */
1906
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908                                u8 tos, struct net_device *dev)
1909 {
1910         struct fib_result res;
1911         struct in_device *in_dev = in_dev_get(dev);
1912         struct flowi fl = { .nl_u = { .ip4_u =
1913                                       { .daddr = daddr,
1914                                         .saddr = saddr,
1915                                         .tos = tos,
1916                                         .scope = RT_SCOPE_UNIVERSE,
1917 #ifdef CONFIG_IP_ROUTE_FWMARK
1918                                         .fwmark = skb->nfmark
1919 #endif
1920                                       } },
1921                             .iif = dev->ifindex };
1922         unsigned        flags = 0;
1923         u32             itag = 0;
1924         struct rtable * rth;
1925         unsigned        hash;
1926         __be32          spec_dst;
1927         int             err = -EINVAL;
1928         int             free_res = 0;
1929
1930         /* IP on this device is disabled. */
1931
1932         if (!in_dev)
1933                 goto out;
1934
1935         /* Check for the most weird martians, which can be not detected
1936            by fib_lookup.
1937          */
1938
1939         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1940                 goto martian_source;
1941
1942         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1943                 goto brd_input;
1944
1945         /* Accept zero addresses only to limited broadcast;
1946          * I even do not know to fix it or not. Waiting for complains :-)
1947          */
1948         if (ZERONET(saddr))
1949                 goto martian_source;
1950
1951         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1952                 goto martian_destination;
1953
1954         /*
1955          *      Now we are ready to route packet.
1956          */
1957         if ((err = fib_lookup(&fl, &res)) != 0) {
1958                 if (!IN_DEV_FORWARD(in_dev))
1959                         goto e_hostunreach;
1960                 goto no_route;
1961         }
1962         free_res = 1;
1963
1964         RT_CACHE_STAT_INC(in_slow_tot);
1965
1966         if (res.type == RTN_BROADCAST)
1967                 goto brd_input;
1968
1969         if (res.type == RTN_LOCAL) {
1970                 int result;
1971                 result = fib_validate_source(saddr, daddr, tos,
1972                                              loopback_dev.ifindex,
1973                                              dev, &spec_dst, &itag);
1974                 if (result < 0)
1975                         goto martian_source;
1976                 if (result)
1977                         flags |= RTCF_DIRECTSRC;
1978                 spec_dst = daddr;
1979                 goto local_input;
1980         }
1981
1982         if (!IN_DEV_FORWARD(in_dev))
1983                 goto e_hostunreach;
1984         if (res.type != RTN_UNICAST)
1985                 goto martian_destination;
1986
1987         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1988         if (err == -ENOBUFS)
1989                 goto e_nobufs;
1990         if (err == -EINVAL)
1991                 goto e_inval;
1992         
1993 done:
1994         in_dev_put(in_dev);
1995         if (free_res)
1996                 fib_res_put(&res);
1997 out:    return err;
1998
1999 brd_input:
2000         if (skb->protocol != htons(ETH_P_IP))
2001                 goto e_inval;
2002
2003         if (ZERONET(saddr))
2004                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2005         else {
2006                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2007                                           &itag);
2008                 if (err < 0)
2009                         goto martian_source;
2010                 if (err)
2011                         flags |= RTCF_DIRECTSRC;
2012         }
2013         flags |= RTCF_BROADCAST;
2014         res.type = RTN_BROADCAST;
2015         RT_CACHE_STAT_INC(in_brd);
2016
2017 local_input:
2018         rth = dst_alloc(&ipv4_dst_ops);
2019         if (!rth)
2020                 goto e_nobufs;
2021
2022         rth->u.dst.output= ip_rt_bug;
2023
2024         atomic_set(&rth->u.dst.__refcnt, 1);
2025         rth->u.dst.flags= DST_HOST;
2026         if (in_dev->cnf.no_policy)
2027                 rth->u.dst.flags |= DST_NOPOLICY;
2028         rth->fl.fl4_dst = daddr;
2029         rth->rt_dst     = daddr;
2030         rth->fl.fl4_tos = tos;
2031 #ifdef CONFIG_IP_ROUTE_FWMARK
2032         rth->fl.fl4_fwmark= skb->nfmark;
2033 #endif
2034         rth->fl.fl4_src = saddr;
2035         rth->rt_src     = saddr;
2036 #ifdef CONFIG_NET_CLS_ROUTE
2037         rth->u.dst.tclassid = itag;
2038 #endif
2039         rth->rt_iif     =
2040         rth->fl.iif     = dev->ifindex;
2041         rth->u.dst.dev  = &loopback_dev;
2042         dev_hold(rth->u.dst.dev);
2043         rth->idev       = in_dev_get(rth->u.dst.dev);
2044         rth->rt_gateway = daddr;
2045         rth->rt_spec_dst= spec_dst;
2046         rth->u.dst.input= ip_local_deliver;
2047         rth->rt_flags   = flags|RTCF_LOCAL;
2048         if (res.type == RTN_UNREACHABLE) {
2049                 rth->u.dst.input= ip_error;
2050                 rth->u.dst.error= -err;
2051                 rth->rt_flags   &= ~RTCF_LOCAL;
2052         }
2053         rth->rt_type    = res.type;
2054         hash = rt_hash(daddr, saddr, fl.iif);
2055         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2056         goto done;
2057
2058 no_route:
2059         RT_CACHE_STAT_INC(in_no_route);
2060         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2061         res.type = RTN_UNREACHABLE;
2062         goto local_input;
2063
2064         /*
2065          *      Do not cache martian addresses: they should be logged (RFC1812)
2066          */
2067 martian_destination:
2068         RT_CACHE_STAT_INC(in_martian_dst);
2069 #ifdef CONFIG_IP_ROUTE_VERBOSE
2070         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2071                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2072                         "%u.%u.%u.%u, dev %s\n",
2073                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2074 #endif
2075
2076 e_hostunreach:
2077         err = -EHOSTUNREACH;
2078         goto done;
2079
2080 e_inval:
2081         err = -EINVAL;
2082         goto done;
2083
2084 e_nobufs:
2085         err = -ENOBUFS;
2086         goto done;
2087
2088 martian_source:
2089         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2090         goto e_inval;
2091 }
2092
2093 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                    u8 tos, struct net_device *dev)
2095 {
2096         struct rtable * rth;
2097         unsigned        hash;
2098         int iif = dev->ifindex;
2099
2100         tos &= IPTOS_RT_MASK;
2101         hash = rt_hash(daddr, saddr, iif);
2102
2103         rcu_read_lock();
2104         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2105              rth = rcu_dereference(rth->u.rt_next)) {
2106                 if (rth->fl.fl4_dst == daddr &&
2107                     rth->fl.fl4_src == saddr &&
2108                     rth->fl.iif == iif &&
2109                     rth->fl.oif == 0 &&
2110 #ifdef CONFIG_IP_ROUTE_FWMARK
2111                     rth->fl.fl4_fwmark == skb->nfmark &&
2112 #endif
2113                     rth->fl.fl4_tos == tos) {
2114                         rth->u.dst.lastuse = jiffies;
2115                         dst_hold(&rth->u.dst);
2116                         rth->u.dst.__use++;
2117                         RT_CACHE_STAT_INC(in_hit);
2118                         rcu_read_unlock();
2119                         skb->dst = (struct dst_entry*)rth;
2120                         return 0;
2121                 }
2122                 RT_CACHE_STAT_INC(in_hlist_search);
2123         }
2124         rcu_read_unlock();
2125
2126         /* Multicast recognition logic is moved from route cache to here.
2127            The problem was that too many Ethernet cards have broken/missing
2128            hardware multicast filters :-( As result the host on multicasting
2129            network acquires a lot of useless route cache entries, sort of
2130            SDR messages from all the world. Now we try to get rid of them.
2131            Really, provided software IP multicast filter is organized
2132            reasonably (at least, hashed), it does not result in a slowdown
2133            comparing with route cache reject entries.
2134            Note, that multicast routers are not affected, because
2135            route cache entry is created eventually.
2136          */
2137         if (MULTICAST(daddr)) {
2138                 struct in_device *in_dev;
2139
2140                 rcu_read_lock();
2141                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2142                         int our = ip_check_mc(in_dev, daddr, saddr,
2143                                 skb->nh.iph->protocol);
2144                         if (our
2145 #ifdef CONFIG_IP_MROUTE
2146                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2147 #endif
2148                             ) {
2149                                 rcu_read_unlock();
2150                                 return ip_route_input_mc(skb, daddr, saddr,
2151                                                          tos, dev, our);
2152                         }
2153                 }
2154                 rcu_read_unlock();
2155                 return -EINVAL;
2156         }
2157         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2158 }
2159
2160 static inline int __mkroute_output(struct rtable **result,
2161                                    struct fib_result* res, 
2162                                    const struct flowi *fl,
2163                                    const struct flowi *oldflp, 
2164                                    struct net_device *dev_out, 
2165                                    unsigned flags) 
2166 {
2167         struct rtable *rth;
2168         struct in_device *in_dev;
2169         u32 tos = RT_FL_TOS(oldflp);
2170         int err = 0;
2171
2172         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2173                 return -EINVAL;
2174
2175         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2176                 res->type = RTN_BROADCAST;
2177         else if (MULTICAST(fl->fl4_dst))
2178                 res->type = RTN_MULTICAST;
2179         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2180                 return -EINVAL;
2181
2182         if (dev_out->flags & IFF_LOOPBACK)
2183                 flags |= RTCF_LOCAL;
2184
2185         /* get work reference to inet device */
2186         in_dev = in_dev_get(dev_out);
2187         if (!in_dev)
2188                 return -EINVAL;
2189
2190         if (res->type == RTN_BROADCAST) {
2191                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2192                 if (res->fi) {
2193                         fib_info_put(res->fi);
2194                         res->fi = NULL;
2195                 }
2196         } else if (res->type == RTN_MULTICAST) {
2197                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2198                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2199                                  oldflp->proto))
2200                         flags &= ~RTCF_LOCAL;
2201                 /* If multicast route do not exist use
2202                    default one, but do not gateway in this case.
2203                    Yes, it is hack.
2204                  */
2205                 if (res->fi && res->prefixlen < 4) {
2206                         fib_info_put(res->fi);
2207                         res->fi = NULL;
2208                 }
2209         }
2210
2211
2212         rth = dst_alloc(&ipv4_dst_ops);
2213         if (!rth) {
2214                 err = -ENOBUFS;
2215                 goto cleanup;
2216         }               
2217
2218         atomic_set(&rth->u.dst.__refcnt, 1);
2219         rth->u.dst.flags= DST_HOST;
2220 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2221         if (res->fi) {
2222                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2223                 if (res->fi->fib_nhs > 1)
2224                         rth->u.dst.flags |= DST_BALANCED;
2225         }
2226 #endif
2227         if (in_dev->cnf.no_xfrm)
2228                 rth->u.dst.flags |= DST_NOXFRM;
2229         if (in_dev->cnf.no_policy)
2230                 rth->u.dst.flags |= DST_NOPOLICY;
2231
2232         rth->fl.fl4_dst = oldflp->fl4_dst;
2233         rth->fl.fl4_tos = tos;
2234         rth->fl.fl4_src = oldflp->fl4_src;
2235         rth->fl.oif     = oldflp->oif;
2236 #ifdef CONFIG_IP_ROUTE_FWMARK
2237         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2238 #endif
2239         rth->rt_dst     = fl->fl4_dst;
2240         rth->rt_src     = fl->fl4_src;
2241         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2242         /* get references to the devices that are to be hold by the routing 
2243            cache entry */
2244         rth->u.dst.dev  = dev_out;
2245         dev_hold(dev_out);
2246         rth->idev       = in_dev_get(dev_out);
2247         rth->rt_gateway = fl->fl4_dst;
2248         rth->rt_spec_dst= fl->fl4_src;
2249
2250         rth->u.dst.output=ip_output;
2251
2252         RT_CACHE_STAT_INC(out_slow_tot);
2253
2254         if (flags & RTCF_LOCAL) {
2255                 rth->u.dst.input = ip_local_deliver;
2256                 rth->rt_spec_dst = fl->fl4_dst;
2257         }
2258         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2259                 rth->rt_spec_dst = fl->fl4_src;
2260                 if (flags & RTCF_LOCAL && 
2261                     !(dev_out->flags & IFF_LOOPBACK)) {
2262                         rth->u.dst.output = ip_mc_output;
2263                         RT_CACHE_STAT_INC(out_slow_mc);
2264                 }
2265 #ifdef CONFIG_IP_MROUTE
2266                 if (res->type == RTN_MULTICAST) {
2267                         if (IN_DEV_MFORWARD(in_dev) &&
2268                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2269                                 rth->u.dst.input = ip_mr_input;
2270                                 rth->u.dst.output = ip_mc_output;
2271                         }
2272                 }
2273 #endif
2274         }
2275
2276         rt_set_nexthop(rth, res, 0);
2277
2278         rth->rt_flags = flags;
2279
2280         *result = rth;
2281  cleanup:
2282         /* release work reference to inet device */
2283         in_dev_put(in_dev);
2284
2285         return err;
2286 }
2287
2288 static inline int ip_mkroute_output_def(struct rtable **rp,
2289                                         struct fib_result* res,
2290                                         const struct flowi *fl,
2291                                         const struct flowi *oldflp,
2292                                         struct net_device *dev_out,
2293                                         unsigned flags)
2294 {
2295         struct rtable *rth = NULL;
2296         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2297         unsigned hash;
2298         if (err == 0) {
2299                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2300                 err = rt_intern_hash(hash, rth, rp);
2301         }
2302         
2303         return err;
2304 }
2305
/*
 *	Create the output route cache entry (or entries) for @res.
 *
 *	Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the route does not
 *	have more than one next hop, this is ip_mkroute_output_def().
 *	Otherwise one cache entry is created and interned per next hop; the
 *	entry interned last is left in @rp.
 *
 *	Next-hop information is handed to the multipath implementation only
 *	for entries that rt_intern_hash() accepted, matching what
 *	ip_mkroute_input() does on the input side.
 *
 *	Returns 0 on success or the first error reported by
 *	__mkroute_output() / rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);
			if (err == 0) {
				hash = rt_hash(oldflp->fl4_dst,
					       oldflp->fl4_src,
					       oldflp->oif);
				err = rt_intern_hash(hash, rth, rp);
			}

			/*
			 * forward hop information to multipath impl., but
			 * only when the entry was interned successfully
			 */
			if (err == 0)
				multipath_set_nhinfo(rth,
						     FIB_RES_NETWORK(*res),
						     FIB_RES_NETMASK(*res),
						     res->prefixlen,
						     &FIB_RES_NH(*res));

			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2367
2368 /*
2369  * Major route resolver routine.
2370  */
2371
2372 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2373 {
2374         u32 tos = RT_FL_TOS(oldflp);
2375         struct flowi fl = { .nl_u = { .ip4_u =
2376                                       { .daddr = oldflp->fl4_dst,
2377                                         .saddr = oldflp->fl4_src,
2378                                         .tos = tos & IPTOS_RT_MASK,
2379                                         .scope = ((tos & RTO_ONLINK) ?
2380                                                   RT_SCOPE_LINK :
2381                                                   RT_SCOPE_UNIVERSE),
2382 #ifdef CONFIG_IP_ROUTE_FWMARK
2383                                         .fwmark = oldflp->fl4_fwmark
2384 #endif
2385                                       } },
2386                             .iif = loopback_dev.ifindex,
2387                             .oif = oldflp->oif };
2388         struct fib_result res;
2389         unsigned flags = 0;
2390         struct net_device *dev_out = NULL;
2391         int free_res = 0;
2392         int err;
2393
2394
2395         res.fi          = NULL;
2396 #ifdef CONFIG_IP_MULTIPLE_TABLES
2397         res.r           = NULL;
2398 #endif
2399
2400         if (oldflp->fl4_src) {
2401                 err = -EINVAL;
2402                 if (MULTICAST(oldflp->fl4_src) ||
2403                     BADCLASS(oldflp->fl4_src) ||
2404                     ZERONET(oldflp->fl4_src))
2405                         goto out;
2406
2407                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2408                 dev_out = ip_dev_find(oldflp->fl4_src);
2409                 if (dev_out == NULL)
2410                         goto out;
2411
2412                 /* I removed check for oif == dev_out->oif here.
2413                    It was wrong for two reasons:
2414                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2415                       assigned to multiple interfaces.
2416                    2. Moreover, we are allowed to send packets with saddr
2417                       of another iface. --ANK
2418                  */
2419
2420                 if (oldflp->oif == 0
2421                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2422                         /* Special hack: user can direct multicasts
2423                            and limited broadcast via necessary interface
2424                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2425                            This hack is not just for fun, it allows
2426                            vic,vat and friends to work.
2427                            They bind socket to loopback, set ttl to zero
2428                            and expect that it will work.
2429                            From the viewpoint of routing cache they are broken,
2430                            because we are not allowed to build multicast path
2431                            with loopback source addr (look, routing cache
2432                            cannot know, that ttl is zero, so that packet
2433                            will not leave this host and route is valid).
2434                            Luckily, this hack is good workaround.
2435                          */
2436
2437                         fl.oif = dev_out->ifindex;
2438                         goto make_route;
2439                 }
2440                 if (dev_out)
2441                         dev_put(dev_out);
2442                 dev_out = NULL;
2443         }
2444
2445
2446         if (oldflp->oif) {
2447                 dev_out = dev_get_by_index(oldflp->oif);
2448                 err = -ENODEV;
2449                 if (dev_out == NULL)
2450                         goto out;
2451
2452                 /* RACE: Check return value of inet_select_addr instead. */
2453                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2454                         dev_put(dev_out);
2455                         goto out;       /* Wrong error code */
2456                 }
2457
2458                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2459                         if (!fl.fl4_src)
2460                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2461                                                               RT_SCOPE_LINK);
2462                         goto make_route;
2463                 }
2464                 if (!fl.fl4_src) {
2465                         if (MULTICAST(oldflp->fl4_dst))
2466                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2467                                                               fl.fl4_scope);
2468                         else if (!oldflp->fl4_dst)
2469                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2470                                                               RT_SCOPE_HOST);
2471                 }
2472         }
2473
2474         if (!fl.fl4_dst) {
2475                 fl.fl4_dst = fl.fl4_src;
2476                 if (!fl.fl4_dst)
2477                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2478                 if (dev_out)
2479                         dev_put(dev_out);
2480                 dev_out = &loopback_dev;
2481                 dev_hold(dev_out);
2482                 fl.oif = loopback_dev.ifindex;
2483                 res.type = RTN_LOCAL;
2484                 flags |= RTCF_LOCAL;
2485                 goto make_route;
2486         }
2487
2488         if (fib_lookup(&fl, &res)) {
2489                 res.fi = NULL;
2490                 if (oldflp->oif) {
2491                         /* Apparently, routing tables are wrong. Assume,
2492                            that the destination is on link.
2493
2494                            WHY? DW.
2495                            Because we are allowed to send to iface
2496                            even if it has NO routes and NO assigned
2497                            addresses. When oif is specified, routing
2498                            tables are looked up with only one purpose:
2499                            to catch if destination is gatewayed, rather than
2500                            direct. Moreover, if MSG_DONTROUTE is set,
2501                            we send packet, ignoring both routing tables
2502                            and ifaddr state. --ANK
2503
2504
2505                            We could make it even if oif is unknown,
2506                            likely IPv6, but we do not.
2507                          */
2508
2509                         if (fl.fl4_src == 0)
2510                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2511                                                               RT_SCOPE_LINK);
2512                         res.type = RTN_UNICAST;
2513                         goto make_route;
2514                 }
2515                 if (dev_out)
2516                         dev_put(dev_out);
2517                 err = -ENETUNREACH;
2518                 goto out;
2519         }
2520         free_res = 1;
2521
2522         if (res.type == RTN_LOCAL) {
2523                 if (!fl.fl4_src)
2524                         fl.fl4_src = fl.fl4_dst;
2525                 if (dev_out)
2526                         dev_put(dev_out);
2527                 dev_out = &loopback_dev;
2528                 dev_hold(dev_out);
2529                 fl.oif = dev_out->ifindex;
2530                 if (res.fi)
2531                         fib_info_put(res.fi);
2532                 res.fi = NULL;
2533                 flags |= RTCF_LOCAL;
2534                 goto make_route;
2535         }
2536
2537 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2538         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2539                 fib_select_multipath(&fl, &res);
2540         else
2541 #endif
2542         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2543                 fib_select_default(&fl, &res);
2544
2545         if (!fl.fl4_src)
2546                 fl.fl4_src = FIB_RES_PREFSRC(res);
2547
2548         if (dev_out)
2549                 dev_put(dev_out);
2550         dev_out = FIB_RES_DEV(res);
2551         dev_hold(dev_out);
2552         fl.oif = dev_out->ifindex;
2553
2554
2555 make_route:
2556         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2557
2558
2559         if (free_res)
2560                 fib_res_put(&res);
2561         if (dev_out)
2562                 dev_put(dev_out);
2563 out:    return err;
2564 }
2565
2566 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2567 {
2568         unsigned hash;
2569         struct rtable *rth;
2570
2571         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2572
2573         rcu_read_lock_bh();
2574         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2575                 rth = rcu_dereference(rth->u.rt_next)) {
2576                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2577                     rth->fl.fl4_src == flp->fl4_src &&
2578                     rth->fl.iif == 0 &&
2579                     rth->fl.oif == flp->oif &&
2580 #ifdef CONFIG_IP_ROUTE_FWMARK
2581                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2582 #endif
2583                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2584                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2585
2586                         /* check for multipath routes and choose one if
2587                          * necessary
2588                          */
2589                         if (multipath_select_route(flp, rth, rp)) {
2590                                 dst_hold(&(*rp)->u.dst);
2591                                 RT_CACHE_STAT_INC(out_hit);
2592                                 rcu_read_unlock_bh();
2593                                 return 0;
2594                         }
2595
2596                         rth->u.dst.lastuse = jiffies;
2597                         dst_hold(&rth->u.dst);
2598                         rth->u.dst.__use++;
2599                         RT_CACHE_STAT_INC(out_hit);
2600                         rcu_read_unlock_bh();
2601                         *rp = rth;
2602                         return 0;
2603                 }
2604                 RT_CACHE_STAT_INC(out_hlist_search);
2605         }
2606         rcu_read_unlock_bh();
2607
2608         return ip_route_output_slow(rp, flp);
2609 }
2610
2611 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2612
2613 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2614 {
2615         int err;
2616
2617         if ((err = __ip_route_output_key(rp, flp)) != 0)
2618                 return err;
2619
2620         if (flp->proto) {
2621                 if (!flp->fl4_src)
2622                         flp->fl4_src = (*rp)->rt_src;
2623                 if (!flp->fl4_dst)
2624                         flp->fl4_dst = (*rp)->rt_dst;
2625                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2626         }
2627
2628         return 0;
2629 }
2630
2631 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2632
2633 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2634 {
2635         return ip_route_output_flow(rp, flp, NULL, 0);
2636 }
2637
2638 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2639                         int nowait, unsigned int flags)
2640 {
2641         struct rtable *rt = (struct rtable*)skb->dst;
2642         struct rtmsg *r;
2643         struct nlmsghdr *nlh;
2644         struct rta_cacheinfo ci;
2645
2646         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2647         if (nlh == NULL)
2648                 return -ENOBUFS;
2649
2650         r = nlmsg_data(nlh);
2651         r->rtm_family    = AF_INET;
2652         r->rtm_dst_len  = 32;
2653         r->rtm_src_len  = 0;
2654         r->rtm_tos      = rt->fl.fl4_tos;
2655         r->rtm_table    = RT_TABLE_MAIN;
2656         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2657         r->rtm_type     = rt->rt_type;
2658         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2659         r->rtm_protocol = RTPROT_UNSPEC;
2660         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2661         if (rt->rt_flags & RTCF_NOTIFY)
2662                 r->rtm_flags |= RTM_F_NOTIFY;
2663
2664         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2665
2666         if (rt->fl.fl4_src) {
2667                 r->rtm_src_len = 32;
2668                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2669         }
2670         if (rt->u.dst.dev)
2671                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2672 #ifdef CONFIG_NET_CLS_ROUTE
2673         if (rt->u.dst.tclassid)
2674                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2675 #endif
2676 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2677         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2678                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2679 #endif
2680         if (rt->fl.iif)
2681                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2682         else if (rt->rt_src != rt->fl.fl4_src)
2683                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2684
2685         if (rt->rt_dst != rt->rt_gateway)
2686                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2687
2688         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2689                 goto nla_put_failure;
2690
2691         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2692         ci.rta_used     = rt->u.dst.__use;
2693         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2694         if (rt->u.dst.expires)
2695                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2696         else
2697                 ci.rta_expires = 0;
2698         ci.rta_error    = rt->u.dst.error;
2699         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2700         if (rt->peer) {
2701                 ci.rta_id = rt->peer->ip_id_count;
2702                 if (rt->peer->tcp_ts_stamp) {
2703                         ci.rta_ts = rt->peer->tcp_ts;
2704                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2705                 }
2706         }
2707
2708         if (rt->fl.iif) {
2709 #ifdef CONFIG_IP_MROUTE
2710                 __be32 dst = rt->rt_dst;
2711
2712                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2713                     ipv4_devconf.mc_forwarding) {
2714                         int err = ipmr_get_route(skb, r, nowait);
2715                         if (err <= 0) {
2716                                 if (!nowait) {
2717                                         if (err == 0)
2718                                                 return 0;
2719                                         goto nla_put_failure;
2720                                 } else {
2721                                         if (err == -EMSGSIZE)
2722                                                 goto nla_put_failure;
2723                                         ci.rta_error = err;
2724                                 }
2725                         }
2726                 } else
2727 #endif
2728                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2729         }
2730
2731         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2732
2733         return nlmsg_end(skb, nlh);
2734
2735 nla_put_failure:
2736         return nlmsg_cancel(skb, nlh);
2737 }
2738
2739 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2740 {
2741         struct rtmsg *rtm;
2742         struct nlattr *tb[RTA_MAX+1];
2743         struct rtable *rt = NULL;
2744         __be32 dst = 0;
2745         __be32 src = 0;
2746         u32 iif;
2747         int err;
2748         struct sk_buff *skb;
2749
2750         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2751         if (err < 0)
2752                 goto errout;
2753
2754         rtm = nlmsg_data(nlh);
2755
2756         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2757         if (skb == NULL) {
2758                 err = -ENOBUFS;
2759                 goto errout;
2760         }
2761
2762         /* Reserve room for dummy headers, this skb can pass
2763            through good chunk of routing engine.
2764          */
2765         skb->mac.raw = skb->nh.raw = skb->data;
2766
2767         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2768         skb->nh.iph->protocol = IPPROTO_ICMP;
2769         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2770
2771         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2772         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2773         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2774
2775         if (iif) {
2776                 struct net_device *dev;
2777
2778                 dev = __dev_get_by_index(iif);
2779                 if (dev == NULL) {
2780                         err = -ENODEV;
2781                         goto errout_free;
2782                 }
2783
2784                 skb->protocol   = htons(ETH_P_IP);
2785                 skb->dev        = dev;
2786                 local_bh_disable();
2787                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2788                 local_bh_enable();
2789
2790                 rt = (struct rtable*) skb->dst;
2791                 if (err == 0 && rt->u.dst.error)
2792                         err = -rt->u.dst.error;
2793         } else {
2794                 struct flowi fl = {
2795                         .nl_u = {
2796                                 .ip4_u = {
2797                                         .daddr = dst,
2798                                         .saddr = src,
2799                                         .tos = rtm->rtm_tos,
2800                                 },
2801                         },
2802                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2803                 };
2804                 err = ip_route_output_key(&rt, &fl);
2805         }
2806
2807         if (err)
2808                 goto errout_free;
2809
2810         skb->dst = &rt->u.dst;
2811         if (rtm->rtm_flags & RTM_F_NOTIFY)
2812                 rt->rt_flags |= RTCF_NOTIFY;
2813
2814         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2815                                 RTM_NEWROUTE, 0, 0);
2816         if (err <= 0)
2817                 goto errout_free;
2818
2819         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2820 errout:
2821         return err;
2822
2823 errout_free:
2824         kfree_skb(skb);
2825         goto errout;
2826 }
2827
2828 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2829 {
2830         struct rtable *rt;
2831         int h, s_h;
2832         int idx, s_idx;
2833
2834         s_h = cb->args[0];
2835         s_idx = idx = cb->args[1];
2836         for (h = 0; h <= rt_hash_mask; h++) {
2837                 if (h < s_h) continue;
2838                 if (h > s_h)
2839                         s_idx = 0;
2840                 rcu_read_lock_bh();
2841                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2842                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2843                         if (idx < s_idx)
2844                                 continue;
2845                         skb->dst = dst_clone(&rt->u.dst);
2846                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2847                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2848                                          1, NLM_F_MULTI) <= 0) {
2849                                 dst_release(xchg(&skb->dst, NULL));
2850                                 rcu_read_unlock_bh();
2851                                 goto done;
2852                         }
2853                         dst_release(xchg(&skb->dst, NULL));
2854                 }
2855                 rcu_read_unlock_bh();
2856         }
2857
2858 done:
2859         cb->args[0] = h;
2860         cb->args[1] = idx;
2861         return skb->len;
2862 }
2863
/*
 * Multicast state of a device changed: flush the routing cache with a
 * delay of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int no_delay = 0;

	rt_cache_flush(no_delay);
}
2868
2869 #ifdef CONFIG_SYSCTL
2870 static int flush_delay;
2871
2872 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2873                                         struct file *filp, void __user *buffer,
2874                                         size_t *lenp, loff_t *ppos)
2875 {
2876         if (write) {
2877                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2878                 rt_cache_flush(flush_delay);
2879                 return 0;
2880         } 
2881
2882         return -EINVAL;
2883 }
2884
2885 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2886                                                 int __user *name,
2887                                                 int nlen,
2888                                                 void __user *oldval,
2889                                                 size_t __user *oldlenp,
2890                                                 void __user *newval,
2891                                                 size_t newlen,
2892                                                 void **context)
2893 {
2894         int delay;
2895         if (newlen != sizeof(int))
2896                 return -EINVAL;
2897         if (get_user(delay, (int __user *)newval))
2898                 return -EFAULT; 
2899         rt_cache_flush(delay); 
2900         return 0;
2901 }
2902
2903 ctl_table ipv4_route_table[] = {
2904         {
2905                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2906                 .procname       = "flush",
2907                 .data           = &flush_delay,
2908                 .maxlen         = sizeof(int),
2909                 .mode           = 0200,
2910                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2911                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2912         },
2913         {
2914                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2915                 .procname       = "min_delay",
2916                 .data           = &ip_rt_min_delay,
2917                 .maxlen         = sizeof(int),
2918                 .mode           = 0644,
2919                 .proc_handler   = &proc_dointvec_jiffies,
2920                 .strategy       = &sysctl_jiffies,
2921         },
2922         {
2923                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2924                 .procname       = "max_delay",
2925                 .data           = &ip_rt_max_delay,
2926                 .maxlen         = sizeof(int),
2927                 .mode           = 0644,
2928                 .proc_handler   = &proc_dointvec_jiffies,
2929                 .strategy       = &sysctl_jiffies,
2930         },
2931         {
2932                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2933                 .procname       = "gc_thresh",
2934                 .data           = &ipv4_dst_ops.gc_thresh,
2935                 .maxlen         = sizeof(int),
2936                 .mode           = 0644,
2937                 .proc_handler   = &proc_dointvec,
2938         },
2939         {
2940                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2941                 .procname       = "max_size",
2942                 .data           = &ip_rt_max_size,
2943                 .maxlen         = sizeof(int),
2944                 .mode           = 0644,
2945                 .proc_handler   = &proc_dointvec,
2946         },
2947         {
2948                 /*  Deprecated. Use gc_min_interval_ms */
2949  
2950                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2951                 .procname       = "gc_min_interval",
2952                 .data           = &ip_rt_gc_min_interval,
2953                 .maxlen         = sizeof(int),
2954                 .mode           = 0644,
2955                 .proc_handler   = &proc_dointvec_jiffies,
2956                 .strategy       = &sysctl_jiffies,
2957         },
2958         {
2959                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2960                 .procname       = "gc_min_interval_ms",
2961                 .data           = &ip_rt_gc_min_interval,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = &proc_dointvec_ms_jiffies,
2965                 .strategy       = &sysctl_ms_jiffies,
2966         },
2967         {
2968                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2969                 .procname       = "gc_timeout",
2970                 .data           = &ip_rt_gc_timeout,
2971                 .maxlen         = sizeof(int),
2972                 .mode           = 0644,
2973                 .proc_handler   = &proc_dointvec_jiffies,
2974                 .strategy       = &sysctl_jiffies,
2975         },
2976         {
2977                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2978                 .procname       = "gc_interval",
2979                 .data           = &ip_rt_gc_interval,
2980                 .maxlen         = sizeof(int),
2981                 .mode           = 0644,
2982                 .proc_handler   = &proc_dointvec_jiffies,
2983                 .strategy       = &sysctl_jiffies,
2984         },
2985         {
2986                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2987                 .procname       = "redirect_load",
2988                 .data           = &ip_rt_redirect_load,
2989                 .maxlen         = sizeof(int),
2990                 .mode           = 0644,
2991                 .proc_handler   = &proc_dointvec,
2992         },
2993         {
2994                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2995                 .procname       = "redirect_number",
2996                 .data           = &ip_rt_redirect_number,
2997                 .maxlen         = sizeof(int),
2998                 .mode           = 0644,
2999                 .proc_handler   = &proc_dointvec,
3000         },
3001         {
3002                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3003                 .procname       = "redirect_silence",
3004                 .data           = &ip_rt_redirect_silence,
3005                 .maxlen         = sizeof(int),
3006                 .mode           = 0644,
3007                 .proc_handler   = &proc_dointvec,
3008         },
3009         {
3010                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3011                 .procname       = "error_cost",
3012                 .data           = &ip_rt_error_cost,
3013                 .maxlen         = sizeof(int),
3014                 .mode           = 0644,
3015                 .proc_handler   = &proc_dointvec,
3016         },
3017         {
3018                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3019                 .procname       = "error_burst",
3020                 .data           = &ip_rt_error_burst,
3021                 .maxlen         = sizeof(int),
3022                 .mode           = 0644,
3023                 .proc_handler   = &proc_dointvec,
3024         },
3025         {
3026                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3027                 .procname       = "gc_elasticity",
3028                 .data           = &ip_rt_gc_elasticity,
3029                 .maxlen         = sizeof(int),
3030                 .mode           = 0644,
3031                 .proc_handler   = &proc_dointvec,
3032         },
3033         {
3034                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3035                 .procname       = "mtu_expires",
3036                 .data           = &ip_rt_mtu_expires,
3037                 .maxlen         = sizeof(int),
3038                 .mode           = 0644,
3039                 .proc_handler   = &proc_dointvec_jiffies,
3040                 .strategy       = &sysctl_jiffies,
3041         },
3042         {
3043                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3044                 .procname       = "min_pmtu",
3045                 .data           = &ip_rt_min_pmtu,
3046                 .maxlen         = sizeof(int),
3047                 .mode           = 0644,
3048                 .proc_handler   = &proc_dointvec,
3049         },
3050         {
3051                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3052                 .procname       = "min_adv_mss",
3053                 .data           = &ip_rt_min_advmss,
3054                 .maxlen         = sizeof(int),
3055                 .mode           = 0644,
3056                 .proc_handler   = &proc_dointvec,
3057         },
3058         {
3059                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3060                 .procname       = "secret_interval",
3061                 .data           = &ip_rt_secret_interval,
3062                 .maxlen         = sizeof(int),
3063                 .mode           = 0644,
3064                 .proc_handler   = &proc_dointvec_jiffies,
3065                 .strategy       = &sysctl_jiffies,
3066         },
3067         { .ctl_name = 0 }
3068 };
3069 #endif
3070
3071 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 * Each cpu owns a run of 256 consecutive struct ip_rt_acct entries.
 * The argument is parenthesized so that an expression such as
 * IP_RT_ACCT_CPU(a + b) scales the whole cpu number.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3078
3079 #ifdef CONFIG_PROC_FS
3080 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3081                            int length, int *eof, void *data)
3082 {
3083         unsigned int i;
3084
3085         if ((offset & 3) || (length & 3))
3086                 return -EIO;
3087
3088         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3089                 *eof = 1;
3090                 return 0;
3091         }
3092
3093         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3094                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3095                 *eof = 1;
3096         }
3097
3098         offset /= sizeof(u32);
3099
3100         if (length > 0) {
3101                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3102                 u32 *dst = (u32 *) buffer;
3103
3104                 /* Copy first cpu. */
3105                 *start = buffer;
3106                 memcpy(dst, src, length);
3107
3108                 /* Add the other cpus in, one int at a time */
3109                 for_each_possible_cpu(i) {
3110                         unsigned int j;
3111
3112                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3113
3114                         for (j = 0; j < length/4; j++)
3115                                 dst[j] += src[j];
3116                 }
3117         }
3118         return length;
3119 }
3120 #endif /* CONFIG_PROC_FS */
3121 #endif /* CONFIG_NET_CLS_ROUTE */
3122
3123 static __initdata unsigned long rhash_entries;
3124 static int __init set_rhash_entries(char *str)
3125 {
3126         if (!str)
3127                 return 0;
3128         rhash_entries = simple_strtoul(str, &str, 0);
3129         return 1;
3130 }
3131 __setup("rhash_entries=", set_rhash_entries);
3132
3133 int __init ip_rt_init(void)
3134 {
3135         int rc = 0;
3136
3137         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3138                              (jiffies ^ (jiffies >> 7)));
3139
3140 #ifdef CONFIG_NET_CLS_ROUTE
3141         {
3142         int order;
3143         for (order = 0;
3144              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3145                 /* NOTHING */;
3146         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3147         if (!ip_rt_acct)
3148                 panic("IP: failed to allocate ip_rt_acct\n");
3149         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3150         }
3151 #endif
3152
3153         ipv4_dst_ops.kmem_cachep =
3154                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3155                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3156
3157         rt_hash_table = (struct rt_hash_bucket *)
3158                 alloc_large_system_hash("IP route cache",
3159                                         sizeof(struct rt_hash_bucket),
3160                                         rhash_entries,
3161                                         (num_physpages >= 128 * 1024) ?
3162                                         15 : 17,
3163                                         0,
3164                                         &rt_hash_log,
3165                                         &rt_hash_mask,
3166                                         0);
3167         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3168         rt_hash_lock_init();
3169
3170         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3171         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3172
3173         devinet_init();
3174         ip_fib_init();
3175
3176         init_timer(&rt_flush_timer);
3177         rt_flush_timer.function = rt_run_flush;
3178         init_timer(&rt_periodic_timer);
3179         rt_periodic_timer.function = rt_check_expire;
3180         init_timer(&rt_secret_timer);
3181         rt_secret_timer.function = rt_secret_rebuild;
3182
3183         /* All the timers, started at system startup tend
3184            to synchronize. Perturb it a bit.
3185          */
3186         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3187                                         ip_rt_gc_interval;
3188         add_timer(&rt_periodic_timer);
3189
3190         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3191                 ip_rt_secret_interval;
3192         add_timer(&rt_secret_timer);
3193
3194 #ifdef CONFIG_PROC_FS
3195         {
3196         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3197         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3198             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3199                                              proc_net_stat))) {
3200                 return -ENOMEM;
3201         }
3202         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3203         }
3204 #ifdef CONFIG_NET_CLS_ROUTE
3205         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3206 #endif
3207 #endif
3208 #ifdef CONFIG_XFRM
3209         xfrm_init();
3210         xfrm4_init();
3211 #endif
3212         return rc;
3213 }
3214
/*
 * Symbol exports.  These three use plain EXPORT_SYMBOL, unlike
 * __ip_route_output_key and ip_route_output_flow, which are exported
 * with EXPORT_SYMBOL_GPL next to their definitions.
 */
3215 EXPORT_SYMBOL(__ip_select_ident);
3216 EXPORT_SYMBOL(ip_route_input);
3217 EXPORT_SYMBOL(ip_route_output_key);