[IPV4]: rt_cache_get_next should take rt_genid into account.
[linux-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK | RTO_ONLINK from the
 * flow key pointed to by @oldflp, as a u32.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (not just a plain identifier) expands correctly.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
/* One slot of the route cache hash table: head of a chain of rtables. */
struct rt_hash_bucket {
        struct rtable *chain;   /* first entry, linked via u.dst.rt_next */
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep spinlock_t is quite big, so keep the size down there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;

/*
 * Address of the spinlock guarding hash slot @slot.  Slots are folded onto
 * the lock table by masking with RT_HASH_LOCK_SZ - 1.  The whole expansion
 * is parenthesized so the macro composes safely inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
        (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialize the spinlock table.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
                        GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
273
274 #ifdef CONFIG_PROC_FS
/* Private seq_file state for walking the route cache. */
struct rt_cache_iter_state {
        int bucket;     /* hash slot being walked; counts down to -1 */
        int genid;      /* rt_genid snapshot; other generations are skipped */
};
279
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 rcu_read_lock_bh();
286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (r->rt_genid == st->genid)
289                                 return r;
290                         r = rcu_dereference(r->u.dst.rt_next);
291                 }
292                 rcu_read_unlock_bh();
293         }
294         return r;
295 }
296
297 static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
298                                           struct rtable *r)
299 {
300         r = r->u.dst.rt_next;
301         while (!r) {
302                 rcu_read_unlock_bh();
303                 if (--st->bucket < 0)
304                         break;
305                 rcu_read_lock_bh();
306                 r = rt_hash_table[st->bucket].chain;
307         }
308         return rcu_dereference(r);
309 }
310
311 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
312                                         struct rtable *r)
313 {
314         while ((r = __rt_cache_get_next(st, r)) != NULL) {
315                 if (r->rt_genid == st->genid)
316                         break;
317         }
318         return r;
319 }
320
321 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
322 {
323         struct rtable *r = rt_cache_get_first(st);
324
325         if (r)
326                 while (pos && (r = rt_cache_get_next(st, r)))
327                         --pos;
328         return pos ? NULL : r;
329 }
330
331 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
332 {
333         struct rt_cache_iter_state *st = seq->private;
334
335         if (*pos)
336                 return rt_cache_get_idx(st, *pos - 1);
337         st->genid = atomic_read(&rt_genid);
338         return SEQ_START_TOKEN;
339 }
340
341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
342 {
343         struct rtable *r;
344         struct rt_cache_iter_state *st = seq->private;
345
346         if (v == SEQ_START_TOKEN)
347                 r = rt_cache_get_first(st);
348         else
349                 r = rt_cache_get_next(st, v);
350         ++*pos;
351         return r;
352 }
353
354 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
355 {
356         if (v && v != SEQ_START_TOKEN)
357                 rcu_read_unlock_bh();
358 }
359
360 static int rt_cache_seq_show(struct seq_file *seq, void *v)
361 {
362         if (v == SEQ_START_TOKEN)
363                 seq_printf(seq, "%-127s\n",
364                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
365                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
366                            "HHUptod\tSpecDst");
367         else {
368                 struct rtable *r = v;
369                 char temp[256];
370
371                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
372                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
373                         r->u.dst.dev ? r->u.dst.dev->name : "*",
374                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
375                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
376                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
377                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
378                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
379                         dst_metric(&r->u.dst, RTAX_WINDOW),
380                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
381                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
382                         r->fl.fl4_tos,
383                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
384                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
385                                        dev_queue_xmit) : 0,
386                         r->rt_spec_dst);
387                 seq_printf(seq, "%-127s\n", temp);
388         }
389         return 0;
390 }
391
392 static const struct seq_operations rt_cache_seq_ops = {
393         .start  = rt_cache_seq_start,
394         .next   = rt_cache_seq_next,
395         .stop   = rt_cache_seq_stop,
396         .show   = rt_cache_seq_show,
397 };
398
399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
400 {
401         return seq_open_private(file, &rt_cache_seq_ops,
402                         sizeof(struct rt_cache_iter_state));
403 }
404
405 static const struct file_operations rt_cache_seq_fops = {
406         .owner   = THIS_MODULE,
407         .open    = rt_cache_seq_open,
408         .read    = seq_read,
409         .llseek  = seq_lseek,
410         .release = seq_release_private,
411 };
412
413
414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
415 {
416         int cpu;
417
418         if (*pos == 0)
419                 return SEQ_START_TOKEN;
420
421         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
422                 if (!cpu_possible(cpu))
423                         continue;
424                 *pos = cpu+1;
425                 return &per_cpu(rt_cache_stat, cpu);
426         }
427         return NULL;
428 }
429
430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
431 {
432         int cpu;
433
434         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
435                 if (!cpu_possible(cpu))
436                         continue;
437                 *pos = cpu+1;
438                 return &per_cpu(rt_cache_stat, cpu);
439         }
440         return NULL;
441
442 }
443
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
448
449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
450 {
451         struct rt_cache_stat *st = v;
452
453         if (v == SEQ_START_TOKEN) {
454                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
455                 return 0;
456         }
457
458         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
459                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
460                    atomic_read(&ipv4_dst_ops.entries),
461                    st->in_hit,
462                    st->in_slow_tot,
463                    st->in_slow_mc,
464                    st->in_no_route,
465                    st->in_brd,
466                    st->in_martian_dst,
467                    st->in_martian_src,
468
469                    st->out_hit,
470                    st->out_slow_tot,
471                    st->out_slow_mc,
472
473                    st->gc_total,
474                    st->gc_ignored,
475                    st->gc_goal_miss,
476                    st->gc_dst_overflow,
477                    st->in_hlist_search,
478                    st->out_hlist_search
479                 );
480         return 0;
481 }
482
483 static const struct seq_operations rt_cpu_seq_ops = {
484         .start  = rt_cpu_seq_start,
485         .next   = rt_cpu_seq_next,
486         .stop   = rt_cpu_seq_stop,
487         .show   = rt_cpu_seq_show,
488 };
489
490
491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
492 {
493         return seq_open(file, &rt_cpu_seq_ops);
494 }
495
496 static const struct file_operations rt_cpu_seq_fops = {
497         .owner   = THIS_MODULE,
498         .open    = rt_cpu_seq_open,
499         .read    = seq_read,
500         .llseek  = seq_lseek,
501         .release = seq_release,
502 };
503
504 #ifdef CONFIG_NET_CLS_ROUTE
505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
506                            int length, int *eof, void *data)
507 {
508         unsigned int i;
509
510         if ((offset & 3) || (length & 3))
511                 return -EIO;
512
513         if (offset >= sizeof(struct ip_rt_acct) * 256) {
514                 *eof = 1;
515                 return 0;
516         }
517
518         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
519                 length = sizeof(struct ip_rt_acct) * 256 - offset;
520                 *eof = 1;
521         }
522
523         offset /= sizeof(u32);
524
525         if (length > 0) {
526                 u32 *dst = (u32 *) buffer;
527
528                 *start = buffer;
529                 memset(dst, 0, length);
530
531                 for_each_possible_cpu(i) {
532                         unsigned int j;
533                         u32 *src;
534
535                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
536                         for (j = 0; j < length/4; j++)
537                                 dst[j] += src[j];
538                 }
539         }
540         return length;
541 }
542 #endif
543
544 static __init int ip_rt_proc_init(struct net *net)
545 {
546         struct proc_dir_entry *pde;
547
548         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
549                         &rt_cache_seq_fops);
550         if (!pde)
551                 goto err1;
552
553         pde = proc_create("rt_cache", S_IRUGO,
554                           net->proc_net_stat, &rt_cpu_seq_fops);
555         if (!pde)
556                 goto err2;
557
558 #ifdef CONFIG_NET_CLS_ROUTE
559         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
560                         ip_rt_acct_read, NULL);
561         if (!pde)
562                 goto err3;
563 #endif
564         return 0;
565
566 #ifdef CONFIG_NET_CLS_ROUTE
567 err3:
568         remove_proc_entry("rt_cache", net->proc_net_stat);
569 #endif
570 err2:
571         remove_proc_entry("rt_cache", net->proc_net);
572 err1:
573         return -ENOMEM;
574 }
575 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(struct net *net)
{
        return 0;
}
580 #endif /* CONFIG_PROC_FS */
581
582 static __inline__ void rt_free(struct rtable *rt)
583 {
584         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
585 }
586
587 static __inline__ void rt_drop(struct rtable *rt)
588 {
589         ip_rt_put(rt);
590         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
591 }
592
593 static __inline__ int rt_fast_clean(struct rtable *rth)
594 {
595         /* Kill broadcast/multicast entries very aggresively, if they
596            collide in hash table with more useful entries */
597         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
598                 rth->fl.iif && rth->u.dst.rt_next;
599 }
600
601 static __inline__ int rt_valuable(struct rtable *rth)
602 {
603         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
604                 rth->u.dst.expires;
605 }
606
607 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
608 {
609         unsigned long age;
610         int ret = 0;
611
612         if (atomic_read(&rth->u.dst.__refcnt))
613                 goto out;
614
615         ret = 1;
616         if (rth->u.dst.expires &&
617             time_after_eq(jiffies, rth->u.dst.expires))
618                 goto out;
619
620         age = jiffies - rth->u.dst.lastuse;
621         ret = 0;
622         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
623             (age <= tmo2 && rt_valuable(rth)))
624                 goto out;
625         ret = 1;
626 out:    return ret;
627 }
628
629 /* Bits of score are:
630  * 31: very valuable
631  * 30: not quite useless
632  * 29..0: usage counter
633  */
634 static inline u32 rt_score(struct rtable *rt)
635 {
636         u32 score = jiffies - rt->u.dst.lastuse;
637
638         score = ~score & ~(3<<30);
639
640         if (rt_valuable(rt))
641                 score |= (1<<31);
642
643         if (!rt->fl.iif ||
644             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
645                 score |= (1<<30);
646
647         return score;
648 }
649
650 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
651 {
652         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
653                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
654                 (fl1->mark ^ fl2->mark) |
655                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
656                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
657                 (fl1->oif ^ fl2->oif) |
658                 (fl1->iif ^ fl2->iif)) == 0;
659 }
660
661 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
662 {
663         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
664 }
665
666 /*
667  * Perform a full scan of hash table and free all entries.
668  * Can be called by a softirq or a process.
669  * In the later case, we want to be reschedule if necessary
670  */
671 static void rt_do_flush(int process_context)
672 {
673         unsigned int i;
674         struct rtable *rth, *next;
675
676         for (i = 0; i <= rt_hash_mask; i++) {
677                 if (process_context && need_resched())
678                         cond_resched();
679                 rth = rt_hash_table[i].chain;
680                 if (!rth)
681                         continue;
682
683                 spin_lock_bh(rt_hash_lock_addr(i));
684                 rth = rt_hash_table[i].chain;
685                 rt_hash_table[i].chain = NULL;
686                 spin_unlock_bh(rt_hash_lock_addr(i));
687
688                 for (; rth; rth = next) {
689                         next = rth->u.dst.rt_next;
690                         rt_free(rth);
691                 }
692         }
693 }
694
695 static void rt_check_expire(void)
696 {
697         static unsigned int rover;
698         unsigned int i = rover, goal;
699         struct rtable *rth, **rthp;
700         u64 mult;
701
702         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
703         if (ip_rt_gc_timeout > 1)
704                 do_div(mult, ip_rt_gc_timeout);
705         goal = (unsigned int)mult;
706         if (goal > rt_hash_mask)
707                 goal = rt_hash_mask + 1;
708         for (; goal > 0; goal--) {
709                 unsigned long tmo = ip_rt_gc_timeout;
710
711                 i = (i + 1) & rt_hash_mask;
712                 rthp = &rt_hash_table[i].chain;
713
714                 if (need_resched())
715                         cond_resched();
716
717                 if (*rthp == NULL)
718                         continue;
719                 spin_lock_bh(rt_hash_lock_addr(i));
720                 while ((rth = *rthp) != NULL) {
721                         if (rth->rt_genid != atomic_read(&rt_genid)) {
722                                 *rthp = rth->u.dst.rt_next;
723                                 rt_free(rth);
724                                 continue;
725                         }
726                         if (rth->u.dst.expires) {
727                                 /* Entry is expired even if it is in use */
728                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
729                                         tmo >>= 1;
730                                         rthp = &rth->u.dst.rt_next;
731                                         continue;
732                                 }
733                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
734                                 tmo >>= 1;
735                                 rthp = &rth->u.dst.rt_next;
736                                 continue;
737                         }
738
739                         /* Cleanup aged off entries. */
740                         *rthp = rth->u.dst.rt_next;
741                         rt_free(rth);
742                 }
743                 spin_unlock_bh(rt_hash_lock_addr(i));
744         }
745         rover = i;
746 }
747
748 /*
749  * rt_worker_func() is run in process context.
750  * we call rt_check_expire() to scan part of the hash table
751  */
752 static void rt_worker_func(struct work_struct *work)
753 {
754         rt_check_expire();
755         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
756 }
757
758 /*
759  * Pertubation of rt_genid by a small quantity [1..256]
760  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
761  * many times (2^24) without giving recent rt_genid.
762  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
763  */
764 static void rt_cache_invalidate(void)
765 {
766         unsigned char shuffle;
767
768         get_random_bytes(&shuffle, sizeof(shuffle));
769         atomic_add(shuffle + 1U, &rt_genid);
770 }
771
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * The flush runs in "process context" mode unless called from softirq.
 */
void rt_cache_flush(int delay)
{
        rt_cache_invalidate();

        if (delay < 0)
                return;

        rt_do_flush(!in_softirq());
}
782
783 /*
784  * We change rt_genid and let gc do the cleanup
785  */
786 static void rt_secret_rebuild(unsigned long dummy)
787 {
788         rt_cache_invalidate();
789         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
790 }
791
792 /*
793    Short description of GC goals.
794
795    We want to build an algorithm that keeps the routing cache
796    at an equilibrium point, where the number of aged-off entries
797    is kept approximately equal to the number of newly generated ones.
798
799    The current expiration strength is the variable "expire".
800    We try to adjust it dynamically, so that when networking is idle
801    "expire" is large enough to keep enough warm entries,
802    and when load increases it shrinks to limit the cache size.
803  */
804
805 static int rt_garbage_collect(struct dst_ops *ops)
806 {
807         static unsigned long expire = RT_GC_TIMEOUT;
808         static unsigned long last_gc;
809         static int rover;
810         static int equilibrium;
811         struct rtable *rth, **rthp;
812         unsigned long now = jiffies;
813         int goal;
814
815         /*
816          * Garbage collection is pretty expensive,
817          * do not make it too frequently.
818          */
819
820         RT_CACHE_STAT_INC(gc_total);
821
822         if (now - last_gc < ip_rt_gc_min_interval &&
823             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
824                 RT_CACHE_STAT_INC(gc_ignored);
825                 goto out;
826         }
827
828         /* Calculate number of entries, which we want to expire now. */
829         goal = atomic_read(&ipv4_dst_ops.entries) -
830                 (ip_rt_gc_elasticity << rt_hash_log);
831         if (goal <= 0) {
832                 if (equilibrium < ipv4_dst_ops.gc_thresh)
833                         equilibrium = ipv4_dst_ops.gc_thresh;
834                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
835                 if (goal > 0) {
836                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
837                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
838                 }
839         } else {
840                 /* We are in dangerous area. Try to reduce cache really
841                  * aggressively.
842                  */
843                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
844                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
845         }
846
847         if (now - last_gc >= ip_rt_gc_min_interval)
848                 last_gc = now;
849
850         if (goal <= 0) {
851                 equilibrium += goal;
852                 goto work_done;
853         }
854
855         do {
856                 int i, k;
857
858                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
859                         unsigned long tmo = expire;
860
861                         k = (k + 1) & rt_hash_mask;
862                         rthp = &rt_hash_table[k].chain;
863                         spin_lock_bh(rt_hash_lock_addr(k));
864                         while ((rth = *rthp) != NULL) {
865                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
866                                         !rt_may_expire(rth, tmo, expire)) {
867                                         tmo >>= 1;
868                                         rthp = &rth->u.dst.rt_next;
869                                         continue;
870                                 }
871                                 *rthp = rth->u.dst.rt_next;
872                                 rt_free(rth);
873                                 goal--;
874                         }
875                         spin_unlock_bh(rt_hash_lock_addr(k));
876                         if (goal <= 0)
877                                 break;
878                 }
879                 rover = k;
880
881                 if (goal <= 0)
882                         goto work_done;
883
884                 /* Goal is not achieved. We stop process if:
885
886                    - if expire reduced to zero. Otherwise, expire is halfed.
887                    - if table is not full.
888                    - if we are called from interrupt.
889                    - jiffies check is just fallback/debug loop breaker.
890                      We will not spin here for long time in any case.
891                  */
892
893                 RT_CACHE_STAT_INC(gc_goal_miss);
894
895                 if (expire == 0)
896                         break;
897
898                 expire >>= 1;
899 #if RT_CACHE_DEBUG >= 2
900                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
901                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
902 #endif
903
904                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905                         goto out;
906         } while (!in_softirq() && time_before_eq(jiffies, now));
907
908         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
909                 goto out;
910         if (net_ratelimit())
911                 printk(KERN_WARNING "dst cache overflow\n");
912         RT_CACHE_STAT_INC(gc_dst_overflow);
913         return 1;
914
915 work_done:
916         expire += ip_rt_gc_min_interval;
917         if (expire > ip_rt_gc_timeout ||
918             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
919                 expire = ip_rt_gc_timeout;
920 #if RT_CACHE_DEBUG >= 2
921         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
922                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
923 #endif
924 out:    return 0;
925 }
926
927 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
928 {
929         struct rtable   *rth, **rthp;
930         unsigned long   now;
931         struct rtable *cand, **candp;
932         u32             min_score;
933         int             chain_length;
934         int attempts = !in_softirq();
935
936 restart:
937         chain_length = 0;
938         min_score = ~(u32)0;
939         cand = NULL;
940         candp = NULL;
941         now = jiffies;
942
943         rthp = &rt_hash_table[hash].chain;
944
945         spin_lock_bh(rt_hash_lock_addr(hash));
946         while ((rth = *rthp) != NULL) {
947                 if (rth->rt_genid != atomic_read(&rt_genid)) {
948                         *rthp = rth->u.dst.rt_next;
949                         rt_free(rth);
950                         continue;
951                 }
952                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
953                         /* Put it first */
954                         *rthp = rth->u.dst.rt_next;
955                         /*
956                          * Since lookup is lockfree, the deletion
957                          * must be visible to another weakly ordered CPU before
958                          * the insertion at the start of the hash chain.
959                          */
960                         rcu_assign_pointer(rth->u.dst.rt_next,
961                                            rt_hash_table[hash].chain);
962                         /*
963                          * Since lookup is lockfree, the update writes
964                          * must be ordered for consistency on SMP.
965                          */
966                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
967
968                         dst_use(&rth->u.dst, now);
969                         spin_unlock_bh(rt_hash_lock_addr(hash));
970
971                         rt_drop(rt);
972                         *rp = rth;
973                         return 0;
974                 }
975
976                 if (!atomic_read(&rth->u.dst.__refcnt)) {
977                         u32 score = rt_score(rth);
978
979                         if (score <= min_score) {
980                                 cand = rth;
981                                 candp = rthp;
982                                 min_score = score;
983                         }
984                 }
985
986                 chain_length++;
987
988                 rthp = &rth->u.dst.rt_next;
989         }
990
991         if (cand) {
992                 /* ip_rt_gc_elasticity used to be average length of chain
993                  * length, when exceeded gc becomes really aggressive.
994                  *
995                  * The second limit is less certain. At the moment it allows
996                  * only 2 entries per bucket. We will see.
997                  */
998                 if (chain_length > ip_rt_gc_elasticity) {
999                         *candp = cand->u.dst.rt_next;
1000                         rt_free(cand);
1001                 }
1002         }
1003
1004         /* Try to bind route to arp only if it is output
1005            route or unicast forwarding path.
1006          */
1007         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1008                 int err = arp_bind_neighbour(&rt->u.dst);
1009                 if (err) {
1010                         spin_unlock_bh(rt_hash_lock_addr(hash));
1011
1012                         if (err != -ENOBUFS) {
1013                                 rt_drop(rt);
1014                                 return err;
1015                         }
1016
1017                         /* Neighbour tables are full and nothing
1018                            can be released. Try to shrink route cache,
1019                            it is most likely it holds some neighbour records.
1020                          */
1021                         if (attempts-- > 0) {
1022                                 int saved_elasticity = ip_rt_gc_elasticity;
1023                                 int saved_int = ip_rt_gc_min_interval;
1024                                 ip_rt_gc_elasticity     = 1;
1025                                 ip_rt_gc_min_interval   = 0;
1026                                 rt_garbage_collect(&ipv4_dst_ops);
1027                                 ip_rt_gc_min_interval   = saved_int;
1028                                 ip_rt_gc_elasticity     = saved_elasticity;
1029                                 goto restart;
1030                         }
1031
1032                         if (net_ratelimit())
1033                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1034                         rt_drop(rt);
1035                         return -ENOBUFS;
1036                 }
1037         }
1038
1039         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1040 #if RT_CACHE_DEBUG >= 2
1041         if (rt->u.dst.rt_next) {
1042                 struct rtable *trt;
1043                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1044                        NIPQUAD(rt->rt_dst));
1045                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1046                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1047                 printk("\n");
1048         }
1049 #endif
1050         rt_hash_table[hash].chain = rt;
1051         spin_unlock_bh(rt_hash_lock_addr(hash));
1052         *rp = rt;
1053         return 0;
1054 }
1055
1056 void rt_bind_peer(struct rtable *rt, int create)
1057 {
1058         static DEFINE_SPINLOCK(rt_peer_lock);
1059         struct inet_peer *peer;
1060
1061         peer = inet_getpeer(rt->rt_dst, create);
1062
1063         spin_lock_bh(&rt_peer_lock);
1064         if (rt->peer == NULL) {
1065                 rt->peer = peer;
1066                 peer = NULL;
1067         }
1068         spin_unlock_bh(&rt_peer_lock);
1069         if (peer)
1070                 inet_putpeer(peer);
1071 }
1072
1073 /*
1074  * Peer allocation may fail only in serious out-of-memory conditions.  However
1075  * we still can generate some output.
1076  * Random ID selection looks a bit dangerous because we have no chances to
1077  * select ID being unique in a reasonable period of time.
1078  * But broken packet identifier may be better than no packet at all.
1079  */
1080 static void ip_select_fb_ident(struct iphdr *iph)
1081 {
1082         static DEFINE_SPINLOCK(ip_fb_id_lock);
1083         static u32 ip_fallback_id;
1084         u32 salt;
1085
1086         spin_lock_bh(&ip_fb_id_lock);
1087         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1088         iph->id = htons(salt & 0xFFFF);
1089         ip_fallback_id = salt;
1090         spin_unlock_bh(&ip_fb_id_lock);
1091 }
1092
1093 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1094 {
1095         struct rtable *rt = (struct rtable *) dst;
1096
1097         if (rt) {
1098                 if (rt->peer == NULL)
1099                         rt_bind_peer(rt, 1);
1100
1101                 /* If peer is attached to destination, it is never detached,
1102                    so that we need not to grab a lock to dereference it.
1103                  */
1104                 if (rt->peer) {
1105                         iph->id = htons(inet_getid(rt->peer, more));
1106                         return;
1107                 }
1108         } else
1109                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1110                        __builtin_return_address(0));
1111
1112         ip_select_fb_ident(iph);
1113 }
1114
1115 static void rt_del(unsigned hash, struct rtable *rt)
1116 {
1117         struct rtable **rthp, *aux;
1118
1119         rthp = &rt_hash_table[hash].chain;
1120         spin_lock_bh(rt_hash_lock_addr(hash));
1121         ip_rt_put(rt);
1122         while ((aux = *rthp) != NULL) {
1123                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1124                         *rthp = aux->u.dst.rt_next;
1125                         rt_free(aux);
1126                         continue;
1127                 }
1128                 rthp = &aux->u.dst.rt_next;
1129         }
1130         spin_unlock_bh(rt_hash_lock_addr(hash));
1131 }
1132
/*
 * ip_rt_redirect - apply a redirect for @daddr from @old_gw to @new_gw.
 *
 * @old_gw: gateway a cached route must currently use to be affected.
 * @daddr:  destination the redirect is about.
 * @new_gw: gateway to use instead.
 * @saddr:  source address used in the cache lookup key.
 * @dev:    device the affected routes must go through.
 *
 * The redirect is rejected (and logged when CONFIG_IP_ROUTE_VERBOSE is
 * set and martian logging is enabled) if @new_gw equals @old_gw, if
 * redirects are not accepted on @dev, or if @new_gw fails the address
 * checks below.  Otherwise every matching cached output route is cloned
 * with rt_gateway = @new_gw and RTCF_REDIRECTED set, and the clone
 * replaces the old entry in the cache.
 */
1133 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1134                     __be32 saddr, struct net_device *dev)
1135 {
1136         int i, k;
1137         struct in_device *in_dev = in_dev_get(dev);
1138         struct rtable *rth, **rthp;
1139         __be32  skeys[2] = { saddr, 0 };
1140         int  ikeys[2] = { dev->ifindex, 0 };
1141         struct netevent_redirect netevent;
1142         struct net *net;
1143
1144         if (!in_dev)
1145                 return;
1146
1147         net = dev->nd_net;
1148         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1149             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1150             || ipv4_is_zeronet(new_gw))
1151                 goto reject_redirect;
1152
1153         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1154                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1155                         goto reject_redirect;
1156                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1157                         goto reject_redirect;
1158         } else {
1159                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1160                         goto reject_redirect;
1161         }
1162
             /*
              * Probe all four key combinations: source key saddr or 0,
              * interface key dev->ifindex or 0.
              */
1163         for (i = 0; i < 2; i++) {
1164                 for (k = 0; k < 2; k++) {
1165                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1166
1167                         rthp=&rt_hash_table[hash].chain;
1168
1169                         rcu_read_lock();
1170                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1171                                 struct rtable *rt;
1172
                                     /*
                                      * Skip entries whose lookup key, cache
                                      * generation or namespace do not match.
                                      */
1173                                 if (rth->fl.fl4_dst != daddr ||
1174                                     rth->fl.fl4_src != skeys[i] ||
1175                                     rth->fl.oif != ikeys[k] ||
1176                                     rth->fl.iif != 0 ||
1177                                     rth->rt_genid != atomic_read(&rt_genid) ||
1178                                     rth->u.dst.dev->nd_net != net) {
1179                                         rthp = &rth->u.dst.rt_next;
1180                                         continue;
1181                                 }
1182
                                     /*
                                      * The key matched but the route is not
                                      * the one being redirected: stop
                                      * scanning this chain.
                                      */
1183                                 if (rth->rt_dst != daddr ||
1184                                     rth->rt_src != saddr ||
1185                                     rth->u.dst.error ||
1186                                     rth->rt_gateway != old_gw ||
1187                                     rth->u.dst.dev != dev)
1188                                         break;
1189
                                     /*
                                      * Take a reference on rth before
                                      * leaving the RCU read side; it is
                                      * still used below.
                                      */
1190                                 dst_hold(&rth->u.dst);
1191                                 rcu_read_unlock();
1192
1193                                 rt = dst_alloc(&ipv4_dst_ops);
1194                                 if (rt == NULL) {
1195                                         ip_rt_put(rth);
1196                                         in_dev_put(in_dev);
1197                                         return;
1198                                 }
1199
1200                                 /* Copy all the information. */
1201                                 *rt = *rth;
                                     /*
                                      * Re-initialise the per-entry state that
                                      * must not be shared with the original
                                      * and take references for the copied
                                      * dev/idev pointers.
                                      */
1202                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1203                                 rt->u.dst.__use         = 1;
1204                                 atomic_set(&rt->u.dst.__refcnt, 1);
1205                                 rt->u.dst.child         = NULL;
1206                                 if (rt->u.dst.dev)
1207                                         dev_hold(rt->u.dst.dev);
1208                                 if (rt->idev)
1209                                         in_dev_hold(rt->idev);
1210                                 rt->u.dst.obsolete      = 0;
1211                                 rt->u.dst.lastuse       = jiffies;
1212                                 rt->u.dst.path          = &rt->u.dst;
1213                                 rt->u.dst.neighbour     = NULL;
1214                                 rt->u.dst.hh            = NULL;
1215                                 rt->u.dst.xfrm          = NULL;
1216                                 rt->rt_genid            = atomic_read(&rt_genid);
1217                                 rt->rt_flags            |= RTCF_REDIRECTED;
1218
1219                                 /* Gateway is different ... */
1220                                 rt->rt_gateway          = new_gw;
1221
1222                                 /* Redirect received -> path was valid */
1223                                 dst_confirm(&rth->u.dst);
1224
1225                                 if (rt->peer)
1226                                         atomic_inc(&rt->peer->refcnt);
1227
                                     /*
                                      * Use the clone only if a neighbour
                                      * could be bound and is in a NUD_VALID
                                      * state; otherwise poke the neighbour
                                      * (if any) and discard the clone.
                                      */
1228                                 if (arp_bind_neighbour(&rt->u.dst) ||
1229                                     !(rt->u.dst.neighbour->nud_state &
1230                                             NUD_VALID)) {
1231                                         if (rt->u.dst.neighbour)
1232                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1233                                         ip_rt_put(rth);
1234                                         rt_drop(rt);
1235                                         goto do_next;
1236                                 }
1237
1238                                 netevent.old = &rth->u.dst;
1239                                 netevent.new = &rt->u.dst;
1240                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1241                                                         &netevent);
1242
                                     /*
                                      * rt_del() calls ip_rt_put() on rth,
                                      * which balances the dst_hold() above.
                                      * On successful insertion the reference
                                      * on the returned entry is dropped too.
                                      */
1243                                 rt_del(hash, rth);
1244                                 if (!rt_intern_hash(hash, rt, &rt))
1245                                         ip_rt_put(rt);
1246                                 goto do_next;
1247                         }
                             /*
                              * Reached when the chain was exhausted or left
                              * via break; every path that jumps to do_next
                              * has already left the RCU read side.
                              */
1248                         rcu_read_unlock();
1249                 do_next:
1250                         ;
1251                 }
1252         }
1253         in_dev_put(in_dev);
1254         return;
1255
1256 reject_redirect:
1257 #ifdef CONFIG_IP_ROUTE_VERBOSE
1258         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1259                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1260                         "%u.%u.%u.%u ignored.\n"
1261                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1262                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1263                        NIPQUAD(saddr), NIPQUAD(daddr));
1264 #endif
1265         in_dev_put(in_dev);
1266 }
1267
1268 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1269 {
1270         struct rtable *rt = (struct rtable*)dst;
1271         struct dst_entry *ret = dst;
1272
1273         if (rt) {
1274                 if (dst->obsolete) {
1275                         ip_rt_put(rt);
1276                         ret = NULL;
1277                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1278                            rt->u.dst.expires) {
1279                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1280                                                 rt->fl.oif);
1281 #if RT_CACHE_DEBUG >= 1
1282                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1283                                           "%u.%u.%u.%u/%02x dropped\n",
1284                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1285 #endif
1286                         rt_del(hash, rt);
1287                         ret = NULL;
1288                 }
1289         }
1290         return ret;
1291 }
1292
1293 /*
1294  * Algorithm:
1295  *      1. The first ip_rt_redirect_number redirects are sent
1296  *         with exponential backoff, then we stop sending them at all,
1297  *         assuming that the host ignores our redirects.
1298  *      2. If we did not see packets requiring redirects
1299  *         during ip_rt_redirect_silence, we assume that the host
1300  *         forgot redirected route and start to send redirects again.
1301  *
1302  * This algorithm is much cheaper and more intelligent than dumb load limiting
1303  * in icmp.c.
1304  *
1305  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1306  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1307  */
1308
1309 void ip_rt_send_redirect(struct sk_buff *skb)
1310 {
1311         struct rtable *rt = (struct rtable*)skb->dst;
1312         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1313
1314         if (!in_dev)
1315                 return;
1316
1317         if (!IN_DEV_TX_REDIRECTS(in_dev))
1318                 goto out;
1319
1320         /* No redirected packets during ip_rt_redirect_silence;
1321          * reset the algorithm.
1322          */
1323         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1324                 rt->u.dst.rate_tokens = 0;
1325
1326         /* Too many ignored redirects; do not send anything
1327          * set u.dst.rate_last to the last seen redirected packet.
1328          */
1329         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1330                 rt->u.dst.rate_last = jiffies;
1331                 goto out;
1332         }
1333
1334         /* Check for load limit; set rate_last to the latest sent
1335          * redirect.
1336          */
1337         if (rt->u.dst.rate_tokens == 0 ||
1338             time_after(jiffies,
1339                        (rt->u.dst.rate_last +
1340                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1341                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1342                 rt->u.dst.rate_last = jiffies;
1343                 ++rt->u.dst.rate_tokens;
1344 #ifdef CONFIG_IP_ROUTE_VERBOSE
1345                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1346                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1347                     net_ratelimit())
1348                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1349                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1350                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1351                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1352 #endif
1353         }
1354 out:
1355         in_dev_put(in_dev);
1356 }
1357
1358 static int ip_error(struct sk_buff *skb)
1359 {
1360         struct rtable *rt = (struct rtable*)skb->dst;
1361         unsigned long now;
1362         int code;
1363
1364         switch (rt->u.dst.error) {
1365                 case EINVAL:
1366                 default:
1367                         goto out;
1368                 case EHOSTUNREACH:
1369                         code = ICMP_HOST_UNREACH;
1370                         break;
1371                 case ENETUNREACH:
1372                         code = ICMP_NET_UNREACH;
1373                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1374                         break;
1375                 case EACCES:
1376                         code = ICMP_PKT_FILTERED;
1377                         break;
1378         }
1379
1380         now = jiffies;
1381         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1382         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1383                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1384         rt->u.dst.rate_last = now;
1385         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1386                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1387                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1388         }
1389
1390 out:    kfree_skb(skb);
1391         return 0;
1392 }
1393
/*
 *      MTU plateau table, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *const end =
                mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (plateau < end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1411
/*
 * ip_rt_frag_needed - lower the cached path MTU of matching output routes.
 *
 * @net:     namespace whose cache entries may be updated.
 * @iph:     IP header supplying saddr, daddr, tot_len and ihl.
 *           NOTE(review): presumably the header quoted in an ICMP
 *           "fragmentation needed" message - confirm against the caller.
 * @new_mtu: MTU to apply.  A value below 68, or not smaller than the
 *           packet's tot_len, is replaced by a guess_mtu() estimate.
 *
 * Returns 0 when PMTU discovery is disabled (ipv4_config.no_pmtu_disc).
 * Otherwise returns the MTU accepted by the last matching entry, or
 * @new_mtu if no entry accepted one.
 */
1412 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1413                                  unsigned short new_mtu)
1414 {
1415         int i;
1416         unsigned short old_mtu = ntohs(iph->tot_len);
1417         struct rtable *rth;
1418         __be32  skeys[2] = { iph->saddr, 0, };
1419         __be32  daddr = iph->daddr;
1420         unsigned short est_mtu = 0;
1421
1422         if (ipv4_config.no_pmtu_disc)
1423                 return 0;
1424
             /* Look up with the real source address first, then with 0. */
1425         for (i = 0; i < 2; i++) {
1426                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1427
1428                 rcu_read_lock();
1429                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1430                      rth = rcu_dereference(rth->u.dst.rt_next)) {
                             /*
                              * Only output routes (iif == 0) for this
                              * source/destination, whose MTU metric is not
                              * locked, in this namespace and of the current
                              * cache generation.
                              */
1431                         if (rth->fl.fl4_dst == daddr &&
1432                             rth->fl.fl4_src == skeys[i] &&
1433                             rth->rt_dst  == daddr &&
1434                             rth->rt_src  == iph->saddr &&
1435                             rth->fl.iif == 0 &&
1436                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1437                             rth->u.dst.dev->nd_net == net &&
1438                             rth->rt_genid == atomic_read(&rt_genid)) {
1439                                 unsigned short mtu = new_mtu;
1440
                                     /* Reported value is unusable: estimate one. */
1441                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1442
1443                                         /* BSD 4.2 compatibility hack :-( */
                                             /*
                                              * Note: old_mtu is modified here
                                              * and the reduced value is kept
                                              * for all later entries.
                                              */
1444                                         if (mtu == 0 &&
1445                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1446                                             old_mtu >= 68 + (iph->ihl << 2))
1447                                                 old_mtu -= iph->ihl << 2;
1448
1449                                         mtu = guess_mtu(old_mtu);
1450                                 }
1451                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1452                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1453                                                 dst_confirm(&rth->u.dst);
                                                     /*
                                                      * Never go below
                                                      * ip_rt_min_pmtu; set the
                                                      * lock bit so this entry
                                                      * is skipped by the
                                                      * dst_metric_locked()
                                                      * test above next time.
                                                      */
1454                                                 if (mtu < ip_rt_min_pmtu) {
1455                                                         mtu = ip_rt_min_pmtu;
1456                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1457                                                                 (1 << RTAX_MTU);
1458                                                 }
1459                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1460                                                 dst_set_expires(&rth->u.dst,
1461                                                         ip_rt_mtu_expires);
1462                                         }
1463                                         est_mtu = mtu;
1464                                 }
1465                         }
1466                 }
1467                 rcu_read_unlock();
1468         }
             /* GNU "?:" extension: est_mtu if non-zero, else new_mtu. */
1469         return est_mtu ? : new_mtu;
1470 }
1471
1472 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1473 {
1474         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1475             !(dst_metric_locked(dst, RTAX_MTU))) {
1476                 if (mtu < ip_rt_min_pmtu) {
1477                         mtu = ip_rt_min_pmtu;
1478                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1479                 }
1480                 dst->metrics[RTAX_MTU-1] = mtu;
1481                 dst_set_expires(dst, ip_rt_mtu_expires);
1482                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1483         }
1484 }
1485
1486 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1487 {
1488         return NULL;
1489 }
1490
1491 static void ipv4_dst_destroy(struct dst_entry *dst)
1492 {
1493         struct rtable *rt = (struct rtable *) dst;
1494         struct inet_peer *peer = rt->peer;
1495         struct in_device *idev = rt->idev;
1496
1497         if (peer) {
1498                 rt->peer = NULL;
1499                 inet_putpeer(peer);
1500         }
1501
1502         if (idev) {
1503                 rt->idev = NULL;
1504                 in_dev_put(idev);
1505         }
1506 }
1507
1508 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1509                             int how)
1510 {
1511         struct rtable *rt = (struct rtable *) dst;
1512         struct in_device *idev = rt->idev;
1513         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1514                 struct in_device *loopback_idev =
1515                         in_dev_get(dev->nd_net->loopback_dev);
1516                 if (loopback_idev) {
1517                         rt->idev = loopback_idev;
1518                         in_dev_put(idev);
1519                 }
1520         }
1521 }
1522
1523 static void ipv4_link_failure(struct sk_buff *skb)
1524 {
1525         struct rtable *rt;
1526
1527         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1528
1529         rt = (struct rtable *) skb->dst;
1530         if (rt)
1531                 dst_set_expires(&rt->u.dst, 0);
1532 }
1533
1534 static int ip_rt_bug(struct sk_buff *skb)
1535 {
1536         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1537                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1538                 skb->dev ? skb->dev->name : "?");
1539         kfree_skb(skb);
1540         return 0;
1541 }
1542
1543 /*
1544    We do not cache source address of outgoing interface,
1545    because it is used only by IP RR, TS and SRR options,
1546    so that it out of fast path.
1547
1548    BTW remember: "addr" is allowed to be not aligned
1549    in IP options!
1550  */
1551
1552 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1553 {
1554         __be32 src;
1555         struct fib_result res;
1556
1557         if (rt->fl.iif == 0)
1558                 src = rt->rt_src;
1559         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1560                 src = FIB_RES_PREFSRC(res);
1561                 fib_res_put(&res);
1562         } else
1563                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1564                                         RT_SCOPE_UNIVERSE);
1565         memcpy(addr, &src, 4);
1566 }
1567
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in the halves of the route's tclassid that are
 * still zero from the corresponding halves of @tag.  A half that is
 * already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->u.dst.tclassid & low_half) == 0)
                rt->u.dst.tclassid |= tag & low_half;
        if ((rt->u.dst.tclassid & high_half) == 0)
                rt->u.dst.tclassid |= tag & high_half;
}
#endif
1577
1578 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1579 {
1580         struct fib_info *fi = res->fi;
1581
1582         if (fi) {
1583                 if (FIB_RES_GW(*res) &&
1584                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1585                         rt->rt_gateway = FIB_RES_GW(*res);
1586                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1587                        sizeof(rt->u.dst.metrics));
1588                 if (fi->fib_mtu == 0) {
1589                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1590                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1591                             rt->rt_gateway != rt->rt_dst &&
1592                             rt->u.dst.dev->mtu > 576)
1593                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1594                 }
1595 #ifdef CONFIG_NET_CLS_ROUTE
1596                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1597 #endif
1598         } else
1599                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1600
1601         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1602                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1603         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1604                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1605         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1606                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1607                                        ip_rt_min_advmss);
1608         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1609                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1610
1611 #ifdef CONFIG_NET_CLS_ROUTE
1612 #ifdef CONFIG_IP_MULTIPLE_TABLES
1613         set_class_tag(rt, fib_rules_tclass(res));
1614 #endif
1615         set_class_tag(rt, itag);
1616 #endif
1617         rt->rt_type = res->type;
1618 }
1619
1620 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1621                                 u8 tos, struct net_device *dev, int our)
1622 {
1623         unsigned hash;
1624         struct rtable *rth;
1625         __be32 spec_dst;
1626         struct in_device *in_dev = in_dev_get(dev);
1627         u32 itag = 0;
1628
1629         /* Primary sanity checks. */
1630
1631         if (in_dev == NULL)
1632                 return -EINVAL;
1633
1634         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1635             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1636                 goto e_inval;
1637
1638         if (ipv4_is_zeronet(saddr)) {
1639                 if (!ipv4_is_local_multicast(daddr))
1640                         goto e_inval;
1641                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1642         } else if (fib_validate_source(saddr, 0, tos, 0,
1643                                         dev, &spec_dst, &itag) < 0)
1644                 goto e_inval;
1645
1646         rth = dst_alloc(&ipv4_dst_ops);
1647         if (!rth)
1648                 goto e_nobufs;
1649
1650         rth->u.dst.output= ip_rt_bug;
1651
1652         atomic_set(&rth->u.dst.__refcnt, 1);
1653         rth->u.dst.flags= DST_HOST;
1654         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1655                 rth->u.dst.flags |= DST_NOPOLICY;
1656         rth->fl.fl4_dst = daddr;
1657         rth->rt_dst     = daddr;
1658         rth->fl.fl4_tos = tos;
1659         rth->fl.mark    = skb->mark;
1660         rth->fl.fl4_src = saddr;
1661         rth->rt_src     = saddr;
1662 #ifdef CONFIG_NET_CLS_ROUTE
1663         rth->u.dst.tclassid = itag;
1664 #endif
1665         rth->rt_iif     =
1666         rth->fl.iif     = dev->ifindex;
1667         rth->u.dst.dev  = init_net.loopback_dev;
1668         dev_hold(rth->u.dst.dev);
1669         rth->idev       = in_dev_get(rth->u.dst.dev);
1670         rth->fl.oif     = 0;
1671         rth->rt_gateway = daddr;
1672         rth->rt_spec_dst= spec_dst;
1673         rth->rt_genid   = atomic_read(&rt_genid);
1674         rth->rt_flags   = RTCF_MULTICAST;
1675         rth->rt_type    = RTN_MULTICAST;
1676         if (our) {
1677                 rth->u.dst.input= ip_local_deliver;
1678                 rth->rt_flags |= RTCF_LOCAL;
1679         }
1680
1681 #ifdef CONFIG_IP_MROUTE
1682         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1683                 rth->u.dst.input = ip_mr_input;
1684 #endif
1685         RT_CACHE_STAT_INC(in_slow_mc);
1686
1687         in_dev_put(in_dev);
1688         hash = rt_hash(daddr, saddr, dev->ifindex);
1689         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1690
1691 e_nobufs:
1692         in_dev_put(in_dev);
1693         return -ENOBUFS;
1694
1695 e_inval:
1696         in_dev_put(in_dev);
1697         return -EINVAL;
1698 }
1699
1700
1701 static void ip_handle_martian_source(struct net_device *dev,
1702                                      struct in_device *in_dev,
1703                                      struct sk_buff *skb,
1704                                      __be32 daddr,
1705                                      __be32 saddr)
1706 {
1707         RT_CACHE_STAT_INC(in_martian_src);
1708 #ifdef CONFIG_IP_ROUTE_VERBOSE
1709         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1710                 /*
1711                  *      RFC1812 recommendation, if source is martian,
1712                  *      the only hint is MAC header.
1713                  */
1714                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1715                         "%u.%u.%u.%u, on dev %s\n",
1716                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1717                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1718                         int i;
1719                         const unsigned char *p = skb_mac_header(skb);
1720                         printk(KERN_WARNING "ll header: ");
1721                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1722                                 printk("%02x", *p);
1723                                 if (i < (dev->hard_header_len - 1))
1724                                         printk(":");
1725                         }
1726                         printk("\n");
1727                 }
1728         }
1729 #endif
1730 }
1731
1732 static inline int __mkroute_input(struct sk_buff *skb,
1733                                   struct fib_result* res,
1734                                   struct in_device *in_dev,
1735                                   __be32 daddr, __be32 saddr, u32 tos,
1736                                   struct rtable **result)
1737 {
1738
1739         struct rtable *rth;
1740         int err;
1741         struct in_device *out_dev;
1742         unsigned flags = 0;
1743         __be32 spec_dst;
1744         u32 itag;
1745
1746         /* get a working reference to the output device */
1747         out_dev = in_dev_get(FIB_RES_DEV(*res));
1748         if (out_dev == NULL) {
1749                 if (net_ratelimit())
1750                         printk(KERN_CRIT "Bug in ip_route_input" \
1751                                "_slow(). Please, report\n");
1752                 return -EINVAL;
1753         }
1754
1755
1756         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1757                                   in_dev->dev, &spec_dst, &itag);
1758         if (err < 0) {
1759                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1760                                          saddr);
1761
1762                 err = -EINVAL;
1763                 goto cleanup;
1764         }
1765
1766         if (err)
1767                 flags |= RTCF_DIRECTSRC;
1768
1769         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1770             (IN_DEV_SHARED_MEDIA(out_dev) ||
1771              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1772                 flags |= RTCF_DOREDIRECT;
1773
1774         if (skb->protocol != htons(ETH_P_IP)) {
1775                 /* Not IP (i.e. ARP). Do not create route, if it is
1776                  * invalid for proxy arp. DNAT routes are always valid.
1777                  */
1778                 if (out_dev == in_dev) {
1779                         err = -EINVAL;
1780                         goto cleanup;
1781                 }
1782         }
1783
1784
1785         rth = dst_alloc(&ipv4_dst_ops);
1786         if (!rth) {
1787                 err = -ENOBUFS;
1788                 goto cleanup;
1789         }
1790
1791         atomic_set(&rth->u.dst.__refcnt, 1);
1792         rth->u.dst.flags= DST_HOST;
1793         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1794                 rth->u.dst.flags |= DST_NOPOLICY;
1795         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1796                 rth->u.dst.flags |= DST_NOXFRM;
1797         rth->fl.fl4_dst = daddr;
1798         rth->rt_dst     = daddr;
1799         rth->fl.fl4_tos = tos;
1800         rth->fl.mark    = skb->mark;
1801         rth->fl.fl4_src = saddr;
1802         rth->rt_src     = saddr;
1803         rth->rt_gateway = daddr;
1804         rth->rt_iif     =
1805                 rth->fl.iif     = in_dev->dev->ifindex;
1806         rth->u.dst.dev  = (out_dev)->dev;
1807         dev_hold(rth->u.dst.dev);
1808         rth->idev       = in_dev_get(rth->u.dst.dev);
1809         rth->fl.oif     = 0;
1810         rth->rt_spec_dst= spec_dst;
1811
1812         rth->u.dst.input = ip_forward;
1813         rth->u.dst.output = ip_output;
1814         rth->rt_genid = atomic_read(&rt_genid);
1815
1816         rt_set_nexthop(rth, res, itag);
1817
1818         rth->rt_flags = flags;
1819
1820         *result = rth;
1821         err = 0;
1822  cleanup:
1823         /* release the working reference to the output device */
1824         in_dev_put(out_dev);
1825         return err;
1826 }
1827
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829                                    struct fib_result* res,
1830                                    const struct flowi *fl,
1831                                    struct in_device *in_dev,
1832                                    __be32 daddr, __be32 saddr, u32 tos)
1833 {
1834         struct rtable* rth = NULL;
1835         int err;
1836         unsigned hash;
1837
1838 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1839         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1840                 fib_select_multipath(fl, res);
1841 #endif
1842
1843         /* create a routing cache entry */
1844         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1845         if (err)
1846                 return err;
1847
1848         /* put it into the cache */
1849         hash = rt_hash(daddr, saddr, fl->iif);
1850         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1851 }
1852
1853 /*
1854  *      NOTE. We drop all packets that have local source
1855  *      addresses, because every properly looped back packet
1856  *      must already have the correct destination attached by the output routine.
1857  *
1858  *      This approach solves two big problems:
1859  *      1. Non-simplex devices are handled properly.
1860  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1861  */
1862
/*
 * ip_route_input_slow - resolve an input route without using the cache.
 *
 * Called from ip_route_input() after a cache miss for a non-multicast
 * destination.  The packet is classified as martian (source or
 * destination), broadcast, local, unreachable or to-be-forwarded:
 *
 *   - forwarded unicast goes through ip_mkroute_input();
 *   - broadcast, local and "no route" cases share the local_input tail,
 *     which builds an entry on the namespace loopback device and passes it
 *     to rt_intern_hash() with &skb->dst as the result slot;
 *   - martians are counted (and possibly logged) but never cached.
 *
 * Returns 0 or a negative errno.  The codes set directly in this function
 * are -EINVAL (no in_device on dev, martian source, non-IP broadcast),
 * -EHOSTUNREACH (martian destination, or forwarding disabled on in_dev)
 * and -ENOBUFS (dst_alloc() failure); otherwise the value comes from
 * ip_mkroute_input() or rt_intern_hash().
 */
1863 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1864                                u8 tos, struct net_device *dev)
1865 {
1866         struct fib_result res;
1867         struct in_device *in_dev = in_dev_get(dev);
1868         struct flowi fl = { .nl_u = { .ip4_u =
1869                                       { .daddr = daddr,
1870                                         .saddr = saddr,
1871                                         .tos = tos,
1872                                         .scope = RT_SCOPE_UNIVERSE,
1873                                       } },
1874                             .mark = skb->mark,
1875                             .iif = dev->ifindex };
1876         unsigned        flags = 0;
1877         u32             itag = 0;
1878         struct rtable * rth;
1879         unsigned        hash;
1880         __be32          spec_dst;
1881         int             err = -EINVAL;
1882         int             free_res = 0;
1883         struct net    * net = dev->nd_net;
1884
1885         /* IP on this device is disabled. */
1886
1887         if (!in_dev)
1888                 goto out;
1889
1890         /* Check for the most weird martians, which can be not detected
1891            by fib_lookup.
1892          */
1893
1894         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1895             ipv4_is_loopback(saddr))
1896                 goto martian_source;
1897
1898         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1899                 goto brd_input;
1900
1901         /* Accept zero addresses only to limited broadcast;
1902          * I even do not know to fix it or not. Waiting for complains :-)
1903          */
1904         if (ipv4_is_zeronet(saddr))
1905                 goto martian_source;
1906
1907         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1908             ipv4_is_loopback(daddr))
1909                 goto martian_destination;
1910
1911         /*
1912          *      Now we are ready to route packet.
1913          */
             /* On lookup failure err keeps the fib_lookup() code; the no_route
              * path below stores it (negated) in the unreachable entry. */
1914         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1915                 if (!IN_DEV_FORWARD(in_dev))
1916                         goto e_hostunreach;
1917                 goto no_route;
1918         }
             /* Lookup succeeded: res is released with fib_res_put() at "done". */
1919         free_res = 1;
1920
1921         RT_CACHE_STAT_INC(in_slow_tot);
1922
1923         if (res.type == RTN_BROADCAST)
1924                 goto brd_input;
1925
1926         if (res.type == RTN_LOCAL) {
1927                 int result;
1928                 result = fib_validate_source(saddr, daddr, tos,
1929                                              net->loopback_dev->ifindex,
1930                                              dev, &spec_dst, &itag);
1931                 if (result < 0)
1932                         goto martian_source;
1933                 if (result)
1934                         flags |= RTCF_DIRECTSRC;
                     /* Overrides the spec_dst written by fib_validate_source(). */
1935                 spec_dst = daddr;
1936                 goto local_input;
1937         }
1938
1939         if (!IN_DEV_FORWARD(in_dev))
1940                 goto e_hostunreach;
1941         if (res.type != RTN_UNICAST)
1942                 goto martian_destination;
1943
1944         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
             /* Common exit: drop the in_dev reference and, if held, the fib result. */
1945 done:
1946         in_dev_put(in_dev);
1947         if (free_res)
1948                 fib_res_put(&res);
1949 out:    return err;
1950
             /* Broadcast input: only accepted for IP frames. */
1951 brd_input:
1952         if (skb->protocol != htons(ETH_P_IP))
1953                 goto e_inval;
1954
1955         if (ipv4_is_zeronet(saddr))
1956                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957         else {
1958                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959                                           &itag);
1960                 if (err < 0)
1961                         goto martian_source;
1962                 if (err)
1963                         flags |= RTCF_DIRECTSRC;
1964         }
1965         flags |= RTCF_BROADCAST;
1966         res.type = RTN_BROADCAST;
1967         RT_CACHE_STAT_INC(in_brd);
1968
             /* Shared tail for local, broadcast and (from no_route) unreachable
              * destinations: build an entry on the loopback device and cache it. */
1969 local_input:
1970         rth = dst_alloc(&ipv4_dst_ops);
1971         if (!rth)
1972                 goto e_nobufs;
1973
1974         rth->u.dst.output= ip_rt_bug;
1975         rth->rt_genid = atomic_read(&rt_genid);
1976
1977         atomic_set(&rth->u.dst.__refcnt, 1);
1978         rth->u.dst.flags= DST_HOST;
1979         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1980                 rth->u.dst.flags |= DST_NOPOLICY;
1981         rth->fl.fl4_dst = daddr;
1982         rth->rt_dst     = daddr;
1983         rth->fl.fl4_tos = tos;
1984         rth->fl.mark    = skb->mark;
1985         rth->fl.fl4_src = saddr;
1986         rth->rt_src     = saddr;
1987 #ifdef CONFIG_NET_CLS_ROUTE
1988         rth->u.dst.tclassid = itag;
1989 #endif
1990         rth->rt_iif     =
1991         rth->fl.iif     = dev->ifindex;
1992         rth->u.dst.dev  = net->loopback_dev;
1993         dev_hold(rth->u.dst.dev);
1994         rth->idev       = in_dev_get(rth->u.dst.dev);
1995         rth->rt_gateway = daddr;
1996         rth->rt_spec_dst= spec_dst;
1997         rth->u.dst.input= ip_local_deliver;
1998         rth->rt_flags   = flags|RTCF_LOCAL;
             /* Reached via no_route: err is the (negative) lookup error, stored
              * negated in dst.error, with ip_error as the input handler. */
1999         if (res.type == RTN_UNREACHABLE) {
2000                 rth->u.dst.input= ip_error;
2001                 rth->u.dst.error= -err;
2002                 rth->rt_flags   &= ~RTCF_LOCAL;
2003         }
2004         rth->rt_type    = res.type;
2005         hash = rt_hash(daddr, saddr, fl.iif);
2006         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2007         goto done;
2008
             /* fib_lookup() failed but forwarding is enabled: cache an
              * unreachable entry through the local_input tail. */
2009 no_route:
2010         RT_CACHE_STAT_INC(in_no_route);
2011         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2012         res.type = RTN_UNREACHABLE;
2013         if (err == -ESRCH)
2014                 err = -ENETUNREACH;
2015         goto local_input;
2016
2017         /*
2018          *      Do not cache martian addresses: they should be logged (RFC1812)
2019          */
2020 martian_destination:
2021         RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025                         "%u.%u.%u.%u, dev %s\n",
2026                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027 #endif
2028
             /* martian_destination falls through to here. */
2029 e_hostunreach:
2030         err = -EHOSTUNREACH;
2031         goto done;
2032
2033 e_inval:
2034         err = -EINVAL;
2035         goto done;
2036
2037 e_nobufs:
2038         err = -ENOBUFS;
2039         goto done;
2040
2041 martian_source:
2042         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043         goto e_inval;
2044 }
2045
2046 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2047                    u8 tos, struct net_device *dev)
2048 {
2049         struct rtable * rth;
2050         unsigned        hash;
2051         int iif = dev->ifindex;
2052         struct net *net;
2053
2054         net = dev->nd_net;
2055         tos &= IPTOS_RT_MASK;
2056         hash = rt_hash(daddr, saddr, iif);
2057
2058         rcu_read_lock();
2059         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2060              rth = rcu_dereference(rth->u.dst.rt_next)) {
2061                 if (rth->fl.fl4_dst == daddr &&
2062                     rth->fl.fl4_src == saddr &&
2063                     rth->fl.iif == iif &&
2064                     rth->fl.oif == 0 &&
2065                     rth->fl.mark == skb->mark &&
2066                     rth->fl.fl4_tos == tos &&
2067                     rth->u.dst.dev->nd_net == net &&
2068                     rth->rt_genid == atomic_read(&rt_genid)) {
2069                         dst_use(&rth->u.dst, jiffies);
2070                         RT_CACHE_STAT_INC(in_hit);
2071                         rcu_read_unlock();
2072                         skb->dst = (struct dst_entry*)rth;
2073                         return 0;
2074                 }
2075                 RT_CACHE_STAT_INC(in_hlist_search);
2076         }
2077         rcu_read_unlock();
2078
2079         /* Multicast recognition logic is moved from route cache to here.
2080            The problem was that too many Ethernet cards have broken/missing
2081            hardware multicast filters :-( As result the host on multicasting
2082            network acquires a lot of useless route cache entries, sort of
2083            SDR messages from all the world. Now we try to get rid of them.
2084            Really, provided software IP multicast filter is organized
2085            reasonably (at least, hashed), it does not result in a slowdown
2086            comparing with route cache reject entries.
2087            Note, that multicast routers are not affected, because
2088            route cache entry is created eventually.
2089          */
2090         if (ipv4_is_multicast(daddr)) {
2091                 struct in_device *in_dev;
2092
2093                 rcu_read_lock();
2094                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2095                         int our = ip_check_mc(in_dev, daddr, saddr,
2096                                 ip_hdr(skb)->protocol);
2097                         if (our
2098 #ifdef CONFIG_IP_MROUTE
2099                             || (!ipv4_is_local_multicast(daddr) &&
2100                                 IN_DEV_MFORWARD(in_dev))
2101 #endif
2102                             ) {
2103                                 rcu_read_unlock();
2104                                 return ip_route_input_mc(skb, daddr, saddr,
2105                                                          tos, dev, our);
2106                         }
2107                 }
2108                 rcu_read_unlock();
2109                 return -EINVAL;
2110         }
2111         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2112 }
2113
2114 static inline int __mkroute_output(struct rtable **result,
2115                                    struct fib_result* res,
2116                                    const struct flowi *fl,
2117                                    const struct flowi *oldflp,
2118                                    struct net_device *dev_out,
2119                                    unsigned flags)
2120 {
2121         struct rtable *rth;
2122         struct in_device *in_dev;
2123         u32 tos = RT_FL_TOS(oldflp);
2124         int err = 0;
2125
2126         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2127                 return -EINVAL;
2128
2129         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2130                 res->type = RTN_BROADCAST;
2131         else if (ipv4_is_multicast(fl->fl4_dst))
2132                 res->type = RTN_MULTICAST;
2133         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2134                 return -EINVAL;
2135
2136         if (dev_out->flags & IFF_LOOPBACK)
2137                 flags |= RTCF_LOCAL;
2138
2139         /* get work reference to inet device */
2140         in_dev = in_dev_get(dev_out);
2141         if (!in_dev)
2142                 return -EINVAL;
2143
2144         if (res->type == RTN_BROADCAST) {
2145                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2146                 if (res->fi) {
2147                         fib_info_put(res->fi);
2148                         res->fi = NULL;
2149                 }
2150         } else if (res->type == RTN_MULTICAST) {
2151                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2152                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2153                                  oldflp->proto))
2154                         flags &= ~RTCF_LOCAL;
2155                 /* If multicast route do not exist use
2156                    default one, but do not gateway in this case.
2157                    Yes, it is hack.
2158                  */
2159                 if (res->fi && res->prefixlen < 4) {
2160                         fib_info_put(res->fi);
2161                         res->fi = NULL;
2162                 }
2163         }
2164
2165
2166         rth = dst_alloc(&ipv4_dst_ops);
2167         if (!rth) {
2168                 err = -ENOBUFS;
2169                 goto cleanup;
2170         }
2171
2172         atomic_set(&rth->u.dst.__refcnt, 1);
2173         rth->u.dst.flags= DST_HOST;
2174         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2175                 rth->u.dst.flags |= DST_NOXFRM;
2176         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2177                 rth->u.dst.flags |= DST_NOPOLICY;
2178
2179         rth->fl.fl4_dst = oldflp->fl4_dst;
2180         rth->fl.fl4_tos = tos;
2181         rth->fl.fl4_src = oldflp->fl4_src;
2182         rth->fl.oif     = oldflp->oif;
2183         rth->fl.mark    = oldflp->mark;
2184         rth->rt_dst     = fl->fl4_dst;
2185         rth->rt_src     = fl->fl4_src;
2186         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2187         /* get references to the devices that are to be hold by the routing
2188            cache entry */
2189         rth->u.dst.dev  = dev_out;
2190         dev_hold(dev_out);
2191         rth->idev       = in_dev_get(dev_out);
2192         rth->rt_gateway = fl->fl4_dst;
2193         rth->rt_spec_dst= fl->fl4_src;
2194
2195         rth->u.dst.output=ip_output;
2196         rth->rt_genid = atomic_read(&rt_genid);
2197
2198         RT_CACHE_STAT_INC(out_slow_tot);
2199
2200         if (flags & RTCF_LOCAL) {
2201                 rth->u.dst.input = ip_local_deliver;
2202                 rth->rt_spec_dst = fl->fl4_dst;
2203         }
2204         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205                 rth->rt_spec_dst = fl->fl4_src;
2206                 if (flags & RTCF_LOCAL &&
2207                     !(dev_out->flags & IFF_LOOPBACK)) {
2208                         rth->u.dst.output = ip_mc_output;
2209                         RT_CACHE_STAT_INC(out_slow_mc);
2210                 }
2211 #ifdef CONFIG_IP_MROUTE
2212                 if (res->type == RTN_MULTICAST) {
2213                         if (IN_DEV_MFORWARD(in_dev) &&
2214                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2215                                 rth->u.dst.input = ip_mr_input;
2216                                 rth->u.dst.output = ip_mc_output;
2217                         }
2218                 }
2219 #endif
2220         }
2221
2222         rt_set_nexthop(rth, res, 0);
2223
2224         rth->rt_flags = flags;
2225
2226         *result = rth;
2227  cleanup:
2228         /* release work reference to inet device */
2229         in_dev_put(in_dev);
2230
2231         return err;
2232 }
2233
2234 static inline int ip_mkroute_output(struct rtable **rp,
2235                                     struct fib_result* res,
2236                                     const struct flowi *fl,
2237                                     const struct flowi *oldflp,
2238                                     struct net_device *dev_out,
2239                                     unsigned flags)
2240 {
2241         struct rtable *rth = NULL;
2242         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243         unsigned hash;
2244         if (err == 0) {
2245                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2246                 err = rt_intern_hash(hash, rth, rp);
2247         }
2248
2249         return err;
2250 }
2251
2252 /*
2253  * Major route resolver routine.
2254  */
2255
/*
 * ip_route_output_slow - resolve an output route without using the cache.
 *
 * oldflp is the flow as requested by the caller; a local copy (fl) is
 * filled in step by step with the selected source address and output
 * interface.  The function
 *
 *   1. validates a caller supplied source address and, for multicast or
 *      limited broadcast without an oif, routes via the device owning it;
 *   2. resolves a caller supplied output interface;
 *   3. treats an empty destination as a request for a loopback route;
 *   4. otherwise consults fib_lookup() and picks device and source from
 *      the result;
 *
 * and finally builds and caches the entry through ip_mkroute_output().
 * Every path reaching "make_route" holds a reference on dev_out, which is
 * dropped again before returning.
 *
 * Returns a negative errno set here (-EINVAL for an unusable source
 * address, -ENODEV when the oif device does not exist or has no
 * in_device, -ENETUNREACH when the lookup fails and no oif was given),
 * or else the result of ip_mkroute_output().
 */
2256 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2257                                 const struct flowi *oldflp)
2258 {
2259         u32 tos = RT_FL_TOS(oldflp);
2260         struct flowi fl = { .nl_u = { .ip4_u =
2261                                       { .daddr = oldflp->fl4_dst,
2262                                         .saddr = oldflp->fl4_src,
2263                                         .tos = tos & IPTOS_RT_MASK,
2264                                         .scope = ((tos & RTO_ONLINK) ?
2265                                                   RT_SCOPE_LINK :
2266                                                   RT_SCOPE_UNIVERSE),
2267                                       } },
2268                             .mark = oldflp->mark,
2269                             .iif = net->loopback_dev->ifindex,
2270                             .oif = oldflp->oif };
2271         struct fib_result res;
2272         unsigned flags = 0;
2273         struct net_device *dev_out = NULL;
2274         int free_res = 0;
2275         int err;
2276
2277
2278         res.fi          = NULL;
2279 #ifdef CONFIG_IP_MULTIPLE_TABLES
2280         res.r           = NULL;
2281 #endif
2282
2283         if (oldflp->fl4_src) {
                     /* err stays -EINVAL for both early exits in this block. */
2284                 err = -EINVAL;
2285                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2286                     ipv4_is_lbcast(oldflp->fl4_src) ||
2287                     ipv4_is_zeronet(oldflp->fl4_src))
2288                         goto out;
2289
2290                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2291                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2292                 if (dev_out == NULL)
2293                         goto out;
2294
2295                 /* I removed check for oif == dev_out->oif here.
2296                    It was wrong for two reasons:
2297                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2298                       is assigned to multiple interfaces.
2299                    2. Moreover, we are allowed to send packets with saddr
2300                       of another iface. --ANK
2301                  */
2302
2303                 if (oldflp->oif == 0
2304                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2305                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2306                         /* Special hack: user can direct multicasts
2307                            and limited broadcast via necessary interface
2308                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2309                            This hack is not just for fun, it allows
2310                            vic,vat and friends to work.
2311                            They bind socket to loopback, set ttl to zero
2312                            and expect that it will work.
2313                            From the viewpoint of routing cache they are broken,
2314                            because we are not allowed to build multicast path
2315                            with loopback source addr (look, routing cache
2316                            cannot know, that ttl is zero, so that packet
2317                            will not leave this host and route is valid).
2318                            Luckily, this hack is good workaround.
2319                          */
2320
2321                         fl.oif = dev_out->ifindex;
2322                         goto make_route;
2323                 }
                     /* The device was only needed to validate the source address. */
2324                 if (dev_out)
2325                         dev_put(dev_out);
2326                 dev_out = NULL;
2327         }
2328
2329
2330         if (oldflp->oif) {
2331                 dev_out = dev_get_by_index(net, oldflp->oif);
2332                 err = -ENODEV;
2333                 if (dev_out == NULL)
2334                         goto out;
2335
2336                 /* RACE: Check return value of inet_select_addr instead. */
2337                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2338                         dev_put(dev_out);
2339                         goto out;       /* Wrong error code */
2340                 }
2341
2342                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2343                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2344                         if (!fl.fl4_src)
2345                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2346                                                               RT_SCOPE_LINK);
2347                         goto make_route;
2348                 }
2349                 if (!fl.fl4_src) {
2350                         if (ipv4_is_multicast(oldflp->fl4_dst))
2351                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2352                                                               fl.fl4_scope);
2353                         else if (!oldflp->fl4_dst)
2354                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2355                                                               RT_SCOPE_HOST);
2356                 }
2357         }
2358
             /* No destination given: route to ourselves over loopback, using the
              * source address (or INADDR_LOOPBACK if there is none) as destination. */
2359         if (!fl.fl4_dst) {
2360                 fl.fl4_dst = fl.fl4_src;
2361                 if (!fl.fl4_dst)
2362                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2363                 if (dev_out)
2364                         dev_put(dev_out);
2365                 dev_out = net->loopback_dev;
2366                 dev_hold(dev_out);
2367                 fl.oif = net->loopback_dev->ifindex;
2368                 res.type = RTN_LOCAL;
2369                 flags |= RTCF_LOCAL;
2370                 goto make_route;
2371         }
2372
2373         if (fib_lookup(net, &fl, &res)) {
2374                 res.fi = NULL;
2375                 if (oldflp->oif) {
2376                         /* Apparently, routing tables are wrong. Assume,
2377                            that the destination is on link.
2378
2379                            WHY? DW.
2380                            Because we are allowed to send to iface
2381                            even if it has NO routes and NO assigned
2382                            addresses. When oif is specified, routing
2383                            tables are looked up with only one purpose:
2384                            to catch if destination is gatewayed, rather than
2385                            direct. Moreover, if MSG_DONTROUTE is set,
2386                            we send packet, ignoring both routing tables
2387                            and ifaddr state. --ANK
2388
2389
2390                            We could make it even if oif is unknown,
2391                            likely IPv6, but we do not.
2392                          */
2393
2394                         if (fl.fl4_src == 0)
2395                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2396                                                               RT_SCOPE_LINK);
2397                         res.type = RTN_UNICAST;
2398                         goto make_route;
2399                 }
2400                 if (dev_out)
2401                         dev_put(dev_out);
2402                 err = -ENETUNREACH;
2403                 goto out;
2404         }
             /* Lookup succeeded: res is released with fib_res_put() after make_route. */
2405         free_res = 1;
2406
             /* Local destination: send over loopback and drop the fib_info. */
2407         if (res.type == RTN_LOCAL) {
2408                 if (!fl.fl4_src)
2409                         fl.fl4_src = fl.fl4_dst;
2410                 if (dev_out)
2411                         dev_put(dev_out);
2412                 dev_out = net->loopback_dev;
2413                 dev_hold(dev_out);
2414                 fl.oif = dev_out->ifindex;
2415                 if (res.fi)
2416                         fib_info_put(res.fi);
2417                 res.fi = NULL;
2418                 flags |= RTCF_LOCAL;
2419                 goto make_route;
2420         }
2421
             /* With multipath compiled in, the "else" below chains onto the
              * default-route selection that follows the #endif. */
2422 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2423         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2424                 fib_select_multipath(&fl, &res);
2425         else
2426 #endif
2427         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2428                 fib_select_default(net, &fl, &res);
2429
2430         if (!fl.fl4_src)
2431                 fl.fl4_src = FIB_RES_PREFSRC(res);
2432
             /* Switch to the device chosen by the lookup, taking a new reference. */
2433         if (dev_out)
2434                 dev_put(dev_out);
2435         dev_out = FIB_RES_DEV(res);
2436         dev_hold(dev_out);
2437         fl.oif = dev_out->ifindex;
2438
2439
2440 make_route:
2441         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2442
2443
2444         if (free_res)
2445                 fib_res_put(&res);
2446         if (dev_out)
2447                 dev_put(dev_out);
2448 out:    return err;
2449 }
2450
2451 int __ip_route_output_key(struct net *net, struct rtable **rp,
2452                           const struct flowi *flp)
2453 {
2454         unsigned hash;
2455         struct rtable *rth;
2456
2457         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2458
2459         rcu_read_lock_bh();
2460         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2461                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2462                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2463                     rth->fl.fl4_src == flp->fl4_src &&
2464                     rth->fl.iif == 0 &&
2465                     rth->fl.oif == flp->oif &&
2466                     rth->fl.mark == flp->mark &&
2467                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2468                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2469                     rth->u.dst.dev->nd_net == net &&
2470                     rth->rt_genid == atomic_read(&rt_genid)) {
2471                         dst_use(&rth->u.dst, jiffies);
2472                         RT_CACHE_STAT_INC(out_hit);
2473                         rcu_read_unlock_bh();
2474                         *rp = rth;
2475                         return 0;
2476                 }
2477                 RT_CACHE_STAT_INC(out_hlist_search);
2478         }
2479         rcu_read_unlock_bh();
2480
2481         return ip_route_output_slow(net, rp, flp);
2482 }
2483
2484 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2485
2486 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2487 {
2488 }
2489
2490 static struct dst_ops ipv4_dst_blackhole_ops = {
2491         .family                 =       AF_INET,
2492         .protocol               =       __constant_htons(ETH_P_IP),
2493         .destroy                =       ipv4_dst_destroy,
2494         .check                  =       ipv4_dst_check,
2495         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2496         .entry_size             =       sizeof(struct rtable),
2497         .entries                =       ATOMIC_INIT(0),
2498 };
2499
2500
2501 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2502 {
2503         struct rtable *ort = *rp;
2504         struct rtable *rt = (struct rtable *)
2505                 dst_alloc(&ipv4_dst_blackhole_ops);
2506
2507         if (rt) {
2508                 struct dst_entry *new = &rt->u.dst;
2509
2510                 atomic_set(&new->__refcnt, 1);
2511                 new->__use = 1;
2512                 new->input = dst_discard;
2513                 new->output = dst_discard;
2514                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2515
2516                 new->dev = ort->u.dst.dev;
2517                 if (new->dev)
2518                         dev_hold(new->dev);
2519
2520                 rt->fl = ort->fl;
2521
2522                 rt->idev = ort->idev;
2523                 if (rt->idev)
2524                         in_dev_hold(rt->idev);
2525                 rt->rt_genid = atomic_read(&rt_genid);
2526                 rt->rt_flags = ort->rt_flags;
2527                 rt->rt_type = ort->rt_type;
2528                 rt->rt_dst = ort->rt_dst;
2529                 rt->rt_src = ort->rt_src;
2530                 rt->rt_iif = ort->rt_iif;
2531                 rt->rt_gateway = ort->rt_gateway;
2532                 rt->rt_spec_dst = ort->rt_spec_dst;
2533                 rt->peer = ort->peer;
2534                 if (rt->peer)
2535                         atomic_inc(&rt->peer->refcnt);
2536
2537                 dst_free(new);
2538         }
2539
2540         dst_release(&(*rp)->u.dst);
2541         *rp = rt;
2542         return (rt ? 0 : -ENOMEM);
2543 }
2544
2545 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2546                          struct sock *sk, int flags)
2547 {
2548         int err;
2549
2550         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2551                 return err;
2552
2553         if (flp->proto) {
2554                 if (!flp->fl4_src)
2555                         flp->fl4_src = (*rp)->rt_src;
2556                 if (!flp->fl4_dst)
2557                         flp->fl4_dst = (*rp)->rt_dst;
2558                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2559                                     flags ? XFRM_LOOKUP_WAIT : 0);
2560                 if (err == -EREMOTE)
2561                         err = ipv4_dst_blackhole(rp, flp, sk);
2562
2563                 return err;
2564         }
2565
2566         return 0;
2567 }
2568
2569 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2570
2571 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2572 {
2573         return ip_route_output_flow(net, rp, flp, NULL, 0);
2574 }
2575
2576 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2577                         int nowait, unsigned int flags)
2578 {
2579         struct rtable *rt = (struct rtable*)skb->dst;
2580         struct rtmsg *r;
2581         struct nlmsghdr *nlh;
2582         long expires;
2583         u32 id = 0, ts = 0, tsage = 0, error;
2584
2585         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2586         if (nlh == NULL)
2587                 return -EMSGSIZE;
2588
2589         r = nlmsg_data(nlh);
2590         r->rtm_family    = AF_INET;
2591         r->rtm_dst_len  = 32;
2592         r->rtm_src_len  = 0;
2593         r->rtm_tos      = rt->fl.fl4_tos;
2594         r->rtm_table    = RT_TABLE_MAIN;
2595         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2596         r->rtm_type     = rt->rt_type;
2597         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2598         r->rtm_protocol = RTPROT_UNSPEC;
2599         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2600         if (rt->rt_flags & RTCF_NOTIFY)
2601                 r->rtm_flags |= RTM_F_NOTIFY;
2602
2603         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2604
2605         if (rt->fl.fl4_src) {
2606                 r->rtm_src_len = 32;
2607                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2608         }
2609         if (rt->u.dst.dev)
2610                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2611 #ifdef CONFIG_NET_CLS_ROUTE
2612         if (rt->u.dst.tclassid)
2613                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2614 #endif
2615         if (rt->fl.iif)
2616                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2617         else if (rt->rt_src != rt->fl.fl4_src)
2618                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2619
2620         if (rt->rt_dst != rt->rt_gateway)
2621                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2622
2623         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2624                 goto nla_put_failure;
2625
2626         error = rt->u.dst.error;
2627         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2628         if (rt->peer) {
2629                 id = rt->peer->ip_id_count;
2630                 if (rt->peer->tcp_ts_stamp) {
2631                         ts = rt->peer->tcp_ts;
2632                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2633                 }
2634         }
2635
2636         if (rt->fl.iif) {
2637 #ifdef CONFIG_IP_MROUTE
2638                 __be32 dst = rt->rt_dst;
2639
2640                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2641                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2642                         int err = ipmr_get_route(skb, r, nowait);
2643                         if (err <= 0) {
2644                                 if (!nowait) {
2645                                         if (err == 0)
2646                                                 return 0;
2647                                         goto nla_put_failure;
2648                                 } else {
2649                                         if (err == -EMSGSIZE)
2650                                                 goto nla_put_failure;
2651                                         error = err;
2652                                 }
2653                         }
2654                 } else
2655 #endif
2656                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2657         }
2658
2659         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2660                                expires, error) < 0)
2661                 goto nla_put_failure;
2662
2663         return nlmsg_end(skb, nlh);
2664
2665 nla_put_failure:
2666         nlmsg_cancel(skb, nlh);
2667         return -EMSGSIZE;
2668 }
2669
2670 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2671 {
2672         struct net *net = in_skb->sk->sk_net;
2673         struct rtmsg *rtm;
2674         struct nlattr *tb[RTA_MAX+1];
2675         struct rtable *rt = NULL;
2676         __be32 dst = 0;
2677         __be32 src = 0;
2678         u32 iif;
2679         int err;
2680         struct sk_buff *skb;
2681
2682         if (net != &init_net)
2683                 return -EINVAL;
2684
2685         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2686         if (err < 0)
2687                 goto errout;
2688
2689         rtm = nlmsg_data(nlh);
2690
2691         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2692         if (skb == NULL) {
2693                 err = -ENOBUFS;
2694                 goto errout;
2695         }
2696
2697         /* Reserve room for dummy headers, this skb can pass
2698            through good chunk of routing engine.
2699          */
2700         skb_reset_mac_header(skb);
2701         skb_reset_network_header(skb);
2702
2703         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2704         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2705         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2706
2707         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2708         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2709         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2710
2711         if (iif) {
2712                 struct net_device *dev;
2713
2714                 dev = __dev_get_by_index(&init_net, iif);
2715                 if (dev == NULL) {
2716                         err = -ENODEV;
2717                         goto errout_free;
2718                 }
2719
2720                 skb->protocol   = htons(ETH_P_IP);
2721                 skb->dev        = dev;
2722                 local_bh_disable();
2723                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2724                 local_bh_enable();
2725
2726                 rt = (struct rtable*) skb->dst;
2727                 if (err == 0 && rt->u.dst.error)
2728                         err = -rt->u.dst.error;
2729         } else {
2730                 struct flowi fl = {
2731                         .nl_u = {
2732                                 .ip4_u = {
2733                                         .daddr = dst,
2734                                         .saddr = src,
2735                                         .tos = rtm->rtm_tos,
2736                                 },
2737                         },
2738                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2739                 };
2740                 err = ip_route_output_key(&init_net, &rt, &fl);
2741         }
2742
2743         if (err)
2744                 goto errout_free;
2745
2746         skb->dst = &rt->u.dst;
2747         if (rtm->rtm_flags & RTM_F_NOTIFY)
2748                 rt->rt_flags |= RTCF_NOTIFY;
2749
2750         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2751                                 RTM_NEWROUTE, 0, 0);
2752         if (err <= 0)
2753                 goto errout_free;
2754
2755         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2756 errout:
2757         return err;
2758
2759 errout_free:
2760         kfree_skb(skb);
2761         goto errout;
2762 }
2763
2764 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2765 {
2766         struct rtable *rt;
2767         int h, s_h;
2768         int idx, s_idx;
2769
2770         s_h = cb->args[0];
2771         if (s_h < 0)
2772                 s_h = 0;
2773         s_idx = idx = cb->args[1];
2774         for (h = s_h; h <= rt_hash_mask; h++) {
2775                 rcu_read_lock_bh();
2776                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2777                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2778                         if (idx < s_idx)
2779                                 continue;
2780                         if (rt->rt_genid != atomic_read(&rt_genid))
2781                                 continue;
2782                         skb->dst = dst_clone(&rt->u.dst);
2783                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785                                          1, NLM_F_MULTI) <= 0) {
2786                                 dst_release(xchg(&skb->dst, NULL));
2787                                 rcu_read_unlock_bh();
2788                                 goto done;
2789                         }
2790                         dst_release(xchg(&skb->dst, NULL));
2791                 }
2792                 rcu_read_unlock_bh();
2793                 s_idx = 0;
2794         }
2795
2796 done:
2797         cb->args[0] = h;
2798         cb->args[1] = idx;
2799         return skb->len;
2800 }
2801
/*
 * Multicast event hook: flushes the route cache with a delay of 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int no_delay = 0;

	rt_cache_flush(no_delay);
}
2806
2807 #ifdef CONFIG_SYSCTL
2808 static int flush_delay;
2809
2810 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2811                                         struct file *filp, void __user *buffer,
2812                                         size_t *lenp, loff_t *ppos)
2813 {
2814         if (write) {
2815                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2816                 rt_cache_flush(flush_delay);
2817                 return 0;
2818         }
2819
2820         return -EINVAL;
2821 }
2822
2823 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2824                                                 int __user *name,
2825                                                 int nlen,
2826                                                 void __user *oldval,
2827                                                 size_t __user *oldlenp,
2828                                                 void __user *newval,
2829                                                 size_t newlen)
2830 {
2831         int delay;
2832         if (newlen != sizeof(int))
2833                 return -EINVAL;
2834         if (get_user(delay, (int __user *)newval))
2835                 return -EFAULT;
2836         rt_cache_flush(delay);
2837         return 0;
2838 }
2839
2840 ctl_table ipv4_route_table[] = {
2841         {
2842                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2843                 .procname       = "flush",
2844                 .data           = &flush_delay,
2845                 .maxlen         = sizeof(int),
2846                 .mode           = 0200,
2847                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2848                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2849         },
2850         {
2851                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2852                 .procname       = "gc_thresh",
2853                 .data           = &ipv4_dst_ops.gc_thresh,
2854                 .maxlen         = sizeof(int),
2855                 .mode           = 0644,
2856                 .proc_handler   = &proc_dointvec,
2857         },
2858         {
2859                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2860                 .procname       = "max_size",
2861                 .data           = &ip_rt_max_size,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = &proc_dointvec,
2865         },
2866         {
2867                 /*  Deprecated. Use gc_min_interval_ms */
2868
2869                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2870                 .procname       = "gc_min_interval",
2871                 .data           = &ip_rt_gc_min_interval,
2872                 .maxlen         = sizeof(int),
2873                 .mode           = 0644,
2874                 .proc_handler   = &proc_dointvec_jiffies,
2875                 .strategy       = &sysctl_jiffies,
2876         },
2877         {
2878                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2879                 .procname       = "gc_min_interval_ms",
2880                 .data           = &ip_rt_gc_min_interval,
2881                 .maxlen         = sizeof(int),
2882                 .mode           = 0644,
2883                 .proc_handler   = &proc_dointvec_ms_jiffies,
2884                 .strategy       = &sysctl_ms_jiffies,
2885         },
2886         {
2887                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2888                 .procname       = "gc_timeout",
2889                 .data           = &ip_rt_gc_timeout,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = &proc_dointvec_jiffies,
2893                 .strategy       = &sysctl_jiffies,
2894         },
2895         {
2896                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2897                 .procname       = "gc_interval",
2898                 .data           = &ip_rt_gc_interval,
2899                 .maxlen         = sizeof(int),
2900                 .mode           = 0644,
2901                 .proc_handler   = &proc_dointvec_jiffies,
2902                 .strategy       = &sysctl_jiffies,
2903         },
2904         {
2905                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2906                 .procname       = "redirect_load",
2907                 .data           = &ip_rt_redirect_load,
2908                 .maxlen         = sizeof(int),
2909                 .mode           = 0644,
2910                 .proc_handler   = &proc_dointvec,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2914                 .procname       = "redirect_number",
2915                 .data           = &ip_rt_redirect_number,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec,
2919         },
2920         {
2921                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2922                 .procname       = "redirect_silence",
2923                 .data           = &ip_rt_redirect_silence,
2924                 .maxlen         = sizeof(int),
2925                 .mode           = 0644,
2926                 .proc_handler   = &proc_dointvec,
2927         },
2928         {
2929                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2930                 .procname       = "error_cost",
2931                 .data           = &ip_rt_error_cost,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = &proc_dointvec,
2935         },
2936         {
2937                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2938                 .procname       = "error_burst",
2939                 .data           = &ip_rt_error_burst,
2940                 .maxlen         = sizeof(int),
2941                 .mode           = 0644,
2942                 .proc_handler   = &proc_dointvec,
2943         },
2944         {
2945                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2946                 .procname       = "gc_elasticity",
2947                 .data           = &ip_rt_gc_elasticity,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = &proc_dointvec,
2951         },
2952         {
2953                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2954                 .procname       = "mtu_expires",
2955                 .data           = &ip_rt_mtu_expires,
2956                 .maxlen         = sizeof(int),
2957                 .mode           = 0644,
2958                 .proc_handler   = &proc_dointvec_jiffies,
2959                 .strategy       = &sysctl_jiffies,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2963                 .procname       = "min_pmtu",
2964                 .data           = &ip_rt_min_pmtu,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2971                 .procname       = "min_adv_mss",
2972                 .data           = &ip_rt_min_advmss,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec,
2976         },
2977         {
2978                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2979                 .procname       = "secret_interval",
2980                 .data           = &ip_rt_secret_interval,
2981                 .maxlen         = sizeof(int),
2982                 .mode           = 0644,
2983                 .proc_handler   = &proc_dointvec_jiffies,
2984                 .strategy       = &sysctl_jiffies,
2985         },
2986         { .ctl_name = 0 }
2987 };
2988 #endif
2989
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-cpu accounting area: ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries and panics if that fails.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
2993
2994 static __initdata unsigned long rhash_entries;
2995 static int __init set_rhash_entries(char *str)
2996 {
2997         if (!str)
2998                 return 0;
2999         rhash_entries = simple_strtoul(str, &str, 0);
3000         return 1;
3001 }
3002 __setup("rhash_entries=", set_rhash_entries);
3003
3004 int __init ip_rt_init(void)
3005 {
3006         int rc = 0;
3007
3008         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3009                              (jiffies ^ (jiffies >> 7))));
3010
3011 #ifdef CONFIG_NET_CLS_ROUTE
3012         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3013         if (!ip_rt_acct)
3014                 panic("IP: failed to allocate ip_rt_acct\n");
3015 #endif
3016
3017         ipv4_dst_ops.kmem_cachep =
3018                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3019                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3020
3021         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3022
3023         rt_hash_table = (struct rt_hash_bucket *)
3024                 alloc_large_system_hash("IP route cache",
3025                                         sizeof(struct rt_hash_bucket),
3026                                         rhash_entries,
3027                                         (num_physpages >= 128 * 1024) ?
3028                                         15 : 17,
3029                                         0,
3030                                         &rt_hash_log,
3031                                         &rt_hash_mask,
3032                                         0);
3033         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3034         rt_hash_lock_init();
3035
3036         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3037         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3038
3039         devinet_init();
3040         ip_fib_init();
3041
3042         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3043
3044         /* All the timers, started at system startup tend
3045            to synchronize. Perturb it a bit.
3046          */
3047         schedule_delayed_work(&expires_work,
3048                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3049
3050         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3051                 ip_rt_secret_interval;
3052         add_timer(&rt_secret_timer);
3053
3054         if (ip_rt_proc_init(&init_net))
3055                 printk(KERN_ERR "Unable to create route proc files\n");
3056 #ifdef CONFIG_XFRM
3057         xfrm_init();
3058         xfrm4_init();
3059 #endif
3060         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3061
3062         return rc;
3063 }
3064
3065 EXPORT_SYMBOL(__ip_select_ident);
3066 EXPORT_SYMBOL(ip_route_input);
3067 EXPORT_SYMBOL(ip_route_output_key);