85cc053d9d6ec19238ef44861db1ac71a4a4a4b1
[linux-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/atmclip.h>
113 #include <net/secure_seq.h>
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
136 static int redirect_genid;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
/* .ifdown callback of ipv4_dst_ops: intentionally does nothing. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how)
{
}
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         if (!rt->peer)
166                 rt_bind_peer(rt, rt->rt_dst, 1);
167
168         peer = rt->peer;
169         if (peer) {
170                 u32 *old_p = __DST_METRICS_PTR(old);
171                 unsigned long prev, new;
172
173                 p = peer->metrics;
174                 if (inet_metrics_new(peer))
175                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177                 new = (unsigned long) p;
178                 prev = cmpxchg(&dst->_metrics, old, new);
179
180                 if (prev != old) {
181                         p = __DST_METRICS_PTR(prev);
182                         if (prev & DST_METRICS_READ_ONLY)
183                                 p = NULL;
184                 } else {
185                         if (rt->fi) {
186                                 fib_info_put(rt->fi);
187                                 rt->fi = NULL;
188                         }
189                 }
190         }
191         return p;
192 }
193
194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195
196 static struct dst_ops ipv4_dst_ops = {
197         .family =               AF_INET,
198         .protocol =             cpu_to_be16(ETH_P_IP),
199         .gc =                   rt_garbage_collect,
200         .check =                ipv4_dst_check,
201         .default_advmss =       ipv4_default_advmss,
202         .mtu =                  ipv4_mtu,
203         .cow_metrics =          ipv4_cow_metrics,
204         .destroy =              ipv4_dst_destroy,
205         .ifdown =               ipv4_dst_ifdown,
206         .negative_advice =      ipv4_negative_advice,
207         .link_failure =         ipv4_link_failure,
208         .update_pmtu =          ip_rt_update_pmtu,
209         .local_out =            __ip_local_out,
210         .neigh_lookup =         ipv4_neigh_lookup,
211 };
212
213 #define ECN_OR_COST(class)      TC_PRIO_##class
214
215 const __u8 ip_tos2prio[16] = {
216         TC_PRIO_BESTEFFORT,
217         ECN_OR_COST(BESTEFFORT),
218         TC_PRIO_BESTEFFORT,
219         ECN_OR_COST(BESTEFFORT),
220         TC_PRIO_BULK,
221         ECN_OR_COST(BULK),
222         TC_PRIO_BULK,
223         ECN_OR_COST(BULK),
224         TC_PRIO_INTERACTIVE,
225         ECN_OR_COST(INTERACTIVE),
226         TC_PRIO_INTERACTIVE,
227         ECN_OR_COST(INTERACTIVE),
228         TC_PRIO_INTERACTIVE_BULK,
229         ECN_OR_COST(INTERACTIVE_BULK),
230         TC_PRIO_INTERACTIVE_BULK,
231         ECN_OR_COST(INTERACTIVE_BULK)
232 };
233
234
235 /*
236  * Route cache.
237  */
238
239 /* The locking scheme is rather straight forward:
240  *
241  * 1) Read-Copy Update protects the buckets of the central route hash.
242  * 2) Only writers remove entries, and they hold the lock
243  *    as they look at rtable reference counts.
244  * 3) Only readers acquire references to rtable entries,
245  *    they do so with atomic increments and with the
246  *    lock held.
247  */
248
249 struct rt_hash_bucket {
250         struct rtable __rcu     *chain;
251 };
252
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two (it is used as a
 * mask below) and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash slot @slot.  The expansion is fully
 * parenthesized so that it stays a single operand wherever it is used
 * (a bare "&array[i]" would mis-bind if followed by a postfix operator).
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialise the lock table; panics if the allocation fails. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
298
299 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
300 static unsigned                 rt_hash_mask __read_mostly;
301 static unsigned int             rt_hash_log  __read_mostly;
302
303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305
306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307                                    int genid)
308 {
309         return jhash_3words((__force u32)daddr, (__force u32)saddr,
310                             idx, genid)
311                 & rt_hash_mask;
312 }
313
314 static inline int rt_genid(struct net *net)
315 {
316         return atomic_read(&net->ipv4.rt_genid);
317 }
318
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321         struct seq_net_private p;
322         int bucket;
323         int genid;
324 };
325
326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 {
328         struct rt_cache_iter_state *st = seq->private;
329         struct rtable *r = NULL;
330
331         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333                         continue;
334                 rcu_read_lock_bh();
335                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336                 while (r) {
337                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338                             r->rt_genid == st->genid)
339                                 return r;
340                         r = rcu_dereference_bh(r->dst.rt_next);
341                 }
342                 rcu_read_unlock_bh();
343         }
344         return r;
345 }
346
347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348                                           struct rtable *r)
349 {
350         struct rt_cache_iter_state *st = seq->private;
351
352         r = rcu_dereference_bh(r->dst.rt_next);
353         while (!r) {
354                 rcu_read_unlock_bh();
355                 do {
356                         if (--st->bucket < 0)
357                                 return NULL;
358                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359                 rcu_read_lock_bh();
360                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361         }
362         return r;
363 }
364
365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
366                                         struct rtable *r)
367 {
368         struct rt_cache_iter_state *st = seq->private;
369         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370                 if (dev_net(r->dst.dev) != seq_file_net(seq))
371                         continue;
372                 if (r->rt_genid == st->genid)
373                         break;
374         }
375         return r;
376 }
377
378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 {
380         struct rtable *r = rt_cache_get_first(seq);
381
382         if (r)
383                 while (pos && (r = rt_cache_get_next(seq, r)))
384                         --pos;
385         return pos ? NULL : r;
386 }
387
388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 {
390         struct rt_cache_iter_state *st = seq->private;
391         if (*pos)
392                 return rt_cache_get_idx(seq, *pos - 1);
393         st->genid = rt_genid(seq_file_net(seq));
394         return SEQ_START_TOKEN;
395 }
396
397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398 {
399         struct rtable *r;
400
401         if (v == SEQ_START_TOKEN)
402                 r = rt_cache_get_first(seq);
403         else
404                 r = rt_cache_get_next(seq, v);
405         ++*pos;
406         return r;
407 }
408
409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 {
411         if (v && v != SEQ_START_TOKEN)
412                 rcu_read_unlock_bh();
413 }
414
415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 {
417         if (v == SEQ_START_TOKEN)
418                 seq_printf(seq, "%-127s\n",
419                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421                            "HHUptod\tSpecDst");
422         else {
423                 struct rtable *r = v;
424                 struct neighbour *n;
425                 int len, HHUptod;
426
427                 rcu_read_lock();
428                 n = dst_get_neighbour(&r->dst);
429                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430                 rcu_read_unlock();
431
432                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434                         r->dst.dev ? r->dst.dev->name : "*",
435                         (__force u32)r->rt_dst,
436                         (__force u32)r->rt_gateway,
437                         r->rt_flags, atomic_read(&r->dst.__refcnt),
438                         r->dst.__use, 0, (__force u32)r->rt_src,
439                         dst_metric_advmss(&r->dst) + 40,
440                         dst_metric(&r->dst, RTAX_WINDOW),
441                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442                               dst_metric(&r->dst, RTAX_RTTVAR)),
443                         r->rt_key_tos,
444                         -1,
445                         HHUptod,
446                         r->rt_spec_dst, &len);
447
448                 seq_printf(seq, "%*s\n", 127 - len, "");
449         }
450         return 0;
451 }
452
453 static const struct seq_operations rt_cache_seq_ops = {
454         .start  = rt_cache_seq_start,
455         .next   = rt_cache_seq_next,
456         .stop   = rt_cache_seq_stop,
457         .show   = rt_cache_seq_show,
458 };
459
460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 {
462         return seq_open_net(inode, file, &rt_cache_seq_ops,
463                         sizeof(struct rt_cache_iter_state));
464 }
465
466 static const struct file_operations rt_cache_seq_fops = {
467         .owner   = THIS_MODULE,
468         .open    = rt_cache_seq_open,
469         .read    = seq_read,
470         .llseek  = seq_lseek,
471         .release = seq_release_net,
472 };
473
474
475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
476 {
477         int cpu;
478
479         if (*pos == 0)
480                 return SEQ_START_TOKEN;
481
482         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
483                 if (!cpu_possible(cpu))
484                         continue;
485                 *pos = cpu+1;
486                 return &per_cpu(rt_cache_stat, cpu);
487         }
488         return NULL;
489 }
490
491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
492 {
493         int cpu;
494
495         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
496                 if (!cpu_possible(cpu))
497                         continue;
498                 *pos = cpu+1;
499                 return &per_cpu(rt_cache_stat, cpu);
500         }
501         return NULL;
502
503 }
504
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
509
510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 {
512         struct rt_cache_stat *st = v;
513
514         if (v == SEQ_START_TOKEN) {
515                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
516                 return 0;
517         }
518
519         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
520                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
521                    dst_entries_get_slow(&ipv4_dst_ops),
522                    st->in_hit,
523                    st->in_slow_tot,
524                    st->in_slow_mc,
525                    st->in_no_route,
526                    st->in_brd,
527                    st->in_martian_dst,
528                    st->in_martian_src,
529
530                    st->out_hit,
531                    st->out_slow_tot,
532                    st->out_slow_mc,
533
534                    st->gc_total,
535                    st->gc_ignored,
536                    st->gc_goal_miss,
537                    st->gc_dst_overflow,
538                    st->in_hlist_search,
539                    st->out_hlist_search
540                 );
541         return 0;
542 }
543
544 static const struct seq_operations rt_cpu_seq_ops = {
545         .start  = rt_cpu_seq_start,
546         .next   = rt_cpu_seq_next,
547         .stop   = rt_cpu_seq_stop,
548         .show   = rt_cpu_seq_show,
549 };
550
551
552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 {
554         return seq_open(file, &rt_cpu_seq_ops);
555 }
556
557 static const struct file_operations rt_cpu_seq_fops = {
558         .owner   = THIS_MODULE,
559         .open    = rt_cpu_seq_open,
560         .read    = seq_read,
561         .llseek  = seq_lseek,
562         .release = seq_release,
563 };
564
565 #ifdef CONFIG_IP_ROUTE_CLASSID
566 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 {
568         struct ip_rt_acct *dst, *src;
569         unsigned int i, j;
570
571         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
572         if (!dst)
573                 return -ENOMEM;
574
575         for_each_possible_cpu(i) {
576                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
577                 for (j = 0; j < 256; j++) {
578                         dst[j].o_bytes   += src[j].o_bytes;
579                         dst[j].o_packets += src[j].o_packets;
580                         dst[j].i_bytes   += src[j].i_bytes;
581                         dst[j].i_packets += src[j].i_packets;
582                 }
583         }
584
585         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
586         kfree(dst);
587         return 0;
588 }
589
590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 {
592         return single_open(file, rt_acct_proc_show, NULL);
593 }
594
595 static const struct file_operations rt_acct_proc_fops = {
596         .owner          = THIS_MODULE,
597         .open           = rt_acct_proc_open,
598         .read           = seq_read,
599         .llseek         = seq_lseek,
600         .release        = single_release,
601 };
602 #endif
603
604 static int __net_init ip_rt_do_proc_init(struct net *net)
605 {
606         struct proc_dir_entry *pde;
607
608         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
609                         &rt_cache_seq_fops);
610         if (!pde)
611                 goto err1;
612
613         pde = proc_create("rt_cache", S_IRUGO,
614                           net->proc_net_stat, &rt_cpu_seq_fops);
615         if (!pde)
616                 goto err2;
617
618 #ifdef CONFIG_IP_ROUTE_CLASSID
619         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
620         if (!pde)
621                 goto err3;
622 #endif
623         return 0;
624
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 err3:
627         remove_proc_entry("rt_cache", net->proc_net_stat);
628 #endif
629 err2:
630         remove_proc_entry("rt_cache", net->proc_net);
631 err1:
632         return -ENOMEM;
633 }
634
635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 {
637         remove_proc_entry("rt_cache", net->proc_net_stat);
638         remove_proc_entry("rt_cache", net->proc_net);
639 #ifdef CONFIG_IP_ROUTE_CLASSID
640         remove_proc_entry("rt_acct", net->proc_net);
641 #endif
642 }
643
644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
645         .init = ip_rt_do_proc_init,
646         .exit = ip_rt_do_proc_exit,
647 };
648
649 static int __init ip_rt_proc_init(void)
650 {
651         return register_pernet_subsys(&ip_rt_proc_ops);
652 }
653
654 #else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
659 #endif /* CONFIG_PROC_FS */
660
661 static inline void rt_free(struct rtable *rt)
662 {
663         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664 }
665
/* Call ip_rt_put() on @rt, then queue it for freeing exactly as rt_free() does. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
671
672 static inline int rt_fast_clean(struct rtable *rth)
673 {
674         /* Kill broadcast/multicast entries very aggresively, if they
675            collide in hash table with more useful entries */
676         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677                 rt_is_input_route(rth) && rth->dst.rt_next;
678 }
679
680 static inline int rt_valuable(struct rtable *rth)
681 {
682         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683                 (rth->peer && rth->peer->pmtu_expires);
684 }
685
686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687 {
688         unsigned long age;
689         int ret = 0;
690
691         if (atomic_read(&rth->dst.__refcnt))
692                 goto out;
693
694         age = jiffies - rth->dst.lastuse;
695         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696             (age <= tmo2 && rt_valuable(rth)))
697                 goto out;
698         ret = 1;
699 out:    return ret;
700 }
701
702 /* Bits of score are:
703  * 31: very valuable
704  * 30: not quite useless
705  * 29..0: usage counter
706  */
707 static inline u32 rt_score(struct rtable *rt)
708 {
709         u32 score = jiffies - rt->dst.lastuse;
710
711         score = ~score & ~(3<<30);
712
713         if (rt_valuable(rt))
714                 score |= (1<<31);
715
716         if (rt_is_output_route(rt) ||
717             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718                 score |= (1<<30);
719
720         return score;
721 }
722
723 static inline bool rt_caching(const struct net *net)
724 {
725         return net->ipv4.current_rt_cache_rebuild_count <=
726                 net->ipv4.sysctl_rt_cache_rebuild_count;
727 }
728
729 static inline bool compare_hash_inputs(const struct rtable *rt1,
730                                        const struct rtable *rt2)
731 {
732         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735 }
736
737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 {
739         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741                 (rt1->rt_mark ^ rt2->rt_mark) |
742                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745 }
746
747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 {
749         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750 }
751
752 static inline int rt_is_expired(struct rtable *rth)
753 {
754         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755 }
756
757 /*
758  * Perform a full scan of hash table and free all entries.
759  * Can be called by a softirq or a process.
760  * In the later case, we want to be reschedule if necessary
761  */
762 static void rt_do_flush(struct net *net, int process_context)
763 {
764         unsigned int i;
765         struct rtable *rth, *next;
766
767         for (i = 0; i <= rt_hash_mask; i++) {
768                 struct rtable __rcu **pprev;
769                 struct rtable *list;
770
771                 if (process_context && need_resched())
772                         cond_resched();
773                 rth = rcu_access_pointer(rt_hash_table[i].chain);
774                 if (!rth)
775                         continue;
776
777                 spin_lock_bh(rt_hash_lock_addr(i));
778
779                 list = NULL;
780                 pprev = &rt_hash_table[i].chain;
781                 rth = rcu_dereference_protected(*pprev,
782                         lockdep_is_held(rt_hash_lock_addr(i)));
783
784                 while (rth) {
785                         next = rcu_dereference_protected(rth->dst.rt_next,
786                                 lockdep_is_held(rt_hash_lock_addr(i)));
787
788                         if (!net ||
789                             net_eq(dev_net(rth->dst.dev), net)) {
790                                 rcu_assign_pointer(*pprev, next);
791                                 rcu_assign_pointer(rth->dst.rt_next, list);
792                                 list = rth;
793                         } else {
794                                 pprev = &rth->dst.rt_next;
795                         }
796                         rth = next;
797                 }
798
799                 spin_unlock_bh(rt_hash_lock_addr(i));
800
801                 for (; list; list = next) {
802                         next = rcu_dereference_protected(list->dst.rt_next, 1);
803                         rt_free(list);
804                 }
805         }
806 }
807
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
818
819 /*
820  * Given a hash chain and an item in this hash chain,
821  * find if a previous entry has the same hash_inputs
822  * (but differs on tos, mark or oif)
823  * Returns 0 if an alias is found.
824  * Returns ONE if rth has no alias before itself.
825  */
826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 {
828         const struct rtable *aux = head;
829
830         while (aux != rth) {
831                 if (compare_hash_inputs(aux, rth))
832                         return 0;
833                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834         }
835         return ONE;
836 }
837
838 static void rt_check_expire(void)
839 {
840         static unsigned int rover;
841         unsigned int i = rover, goal;
842         struct rtable *rth;
843         struct rtable __rcu **rthp;
844         unsigned long samples = 0;
845         unsigned long sum = 0, sum2 = 0;
846         unsigned long delta;
847         u64 mult;
848
849         delta = jiffies - expires_ljiffies;
850         expires_ljiffies = jiffies;
851         mult = ((u64)delta) << rt_hash_log;
852         if (ip_rt_gc_timeout > 1)
853                 do_div(mult, ip_rt_gc_timeout);
854         goal = (unsigned int)mult;
855         if (goal > rt_hash_mask)
856                 goal = rt_hash_mask + 1;
857         for (; goal > 0; goal--) {
858                 unsigned long tmo = ip_rt_gc_timeout;
859                 unsigned long length;
860
861                 i = (i + 1) & rt_hash_mask;
862                 rthp = &rt_hash_table[i].chain;
863
864                 if (need_resched())
865                         cond_resched();
866
867                 samples++;
868
869                 if (rcu_dereference_raw(*rthp) == NULL)
870                         continue;
871                 length = 0;
872                 spin_lock_bh(rt_hash_lock_addr(i));
873                 while ((rth = rcu_dereference_protected(*rthp,
874                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875                         prefetch(rth->dst.rt_next);
876                         if (rt_is_expired(rth)) {
877                                 *rthp = rth->dst.rt_next;
878                                 rt_free(rth);
879                                 continue;
880                         }
881                         if (rth->dst.expires) {
882                                 /* Entry is expired even if it is in use */
883                                 if (time_before_eq(jiffies, rth->dst.expires)) {
884 nofree:
885                                         tmo >>= 1;
886                                         rthp = &rth->dst.rt_next;
887                                         /*
888                                          * We only count entries on
889                                          * a chain with equal hash inputs once
890                                          * so that entries for different QOS
891                                          * levels, and other non-hash input
892                                          * attributes don't unfairly skew
893                                          * the length computation
894                                          */
895                                         length += has_noalias(rt_hash_table[i].chain, rth);
896                                         continue;
897                                 }
898                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899                                 goto nofree;
900
901                         /* Cleanup aged off entries. */
902                         *rthp = rth->dst.rt_next;
903                         rt_free(rth);
904                 }
905                 spin_unlock_bh(rt_hash_lock_addr(i));
906                 sum += length;
907                 sum2 += length*length;
908         }
909         if (samples) {
910                 unsigned long avg = sum / samples;
911                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912                 rt_chain_length_max = max_t(unsigned long,
913                                         ip_rt_gc_elasticity,
914                                         (avg + 4*sd) >> FRACT_BITS);
915         }
916         rover = i;
917 }
918
919 /*
920  * rt_worker_func() is run in process context.
921  * we call rt_check_expire() to scan part of the hash table
922  */
923 static void rt_worker_func(struct work_struct *work)
924 {
925         rt_check_expire();
926         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927 }
928
929 /*
930  * Perturbation of rt_genid by a small quantity [1..256]
931  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
932  * many times (2^24) without giving recent rt_genid.
933  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
934  */
935 static void rt_cache_invalidate(struct net *net)
936 {
937         unsigned char shuffle;
938
939         get_random_bytes(&shuffle, sizeof(shuffle));
940         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941         redirect_genid++;
942 }
943
944 /*
945  * delay < 0  : invalidate cache (fast : entries will be deleted later)
946  * delay >= 0 : invalidate & flush cache (can be long)
947  */
948 void rt_cache_flush(struct net *net, int delay)
949 {
950         rt_cache_invalidate(net);
951         if (delay >= 0)
952                 rt_do_flush(net, !in_softirq());
953 }
954
955 /* Flush previous cache invalidated entries from the cache */
956 void rt_cache_flush_batch(struct net *net)
957 {
958         rt_do_flush(net, !in_softirq());
959 }
960
961 static void rt_emergency_hash_rebuild(struct net *net)
962 {
963         if (net_ratelimit())
964                 printk(KERN_WARNING "Route hash chain too long!\n");
965         rt_cache_invalidate(net);
966 }
967
968 /*
969    Short description of GC goals.
970
971    We want to build algorithm, which will keep routing cache
972    at some equilibrium point, when number of aged off entries
973    is kept approximately equal to newly generated ones.
974
975    Current expiration strength is variable "expire".
976    We try to adjust it dynamically, so that if networking
977    is idle expires is large enough to keep enough of warm entries,
978    and when load increases it reduces to limit cache size.
979  */
980
981 static int rt_garbage_collect(struct dst_ops *ops)
982 {
983         static unsigned long expire = RT_GC_TIMEOUT;
984         static unsigned long last_gc;
985         static int rover;
986         static int equilibrium;
987         struct rtable *rth;
988         struct rtable __rcu **rthp;
989         unsigned long now = jiffies;
990         int goal;
991         int entries = dst_entries_get_fast(&ipv4_dst_ops);
992
993         /*
994          * Garbage collection is pretty expensive,
995          * do not make it too frequently.
996          */
997
998         RT_CACHE_STAT_INC(gc_total);
999
1000         if (now - last_gc < ip_rt_gc_min_interval &&
1001             entries < ip_rt_max_size) {
1002                 RT_CACHE_STAT_INC(gc_ignored);
1003                 goto out;
1004         }
1005
1006         entries = dst_entries_get_slow(&ipv4_dst_ops);
1007         /* Calculate number of entries, which we want to expire now. */
1008         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009         if (goal <= 0) {
1010                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1011                         equilibrium = ipv4_dst_ops.gc_thresh;
1012                 goal = entries - equilibrium;
1013                 if (goal > 0) {
1014                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                         goal = entries - equilibrium;
1016                 }
1017         } else {
1018                 /* We are in dangerous area. Try to reduce cache really
1019                  * aggressively.
1020                  */
1021                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022                 equilibrium = entries - goal;
1023         }
1024
1025         if (now - last_gc >= ip_rt_gc_min_interval)
1026                 last_gc = now;
1027
1028         if (goal <= 0) {
1029                 equilibrium += goal;
1030                 goto work_done;
1031         }
1032
1033         do {
1034                 int i, k;
1035
1036                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037                         unsigned long tmo = expire;
1038
1039                         k = (k + 1) & rt_hash_mask;
1040                         rthp = &rt_hash_table[k].chain;
1041                         spin_lock_bh(rt_hash_lock_addr(k));
1042                         while ((rth = rcu_dereference_protected(*rthp,
1043                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044                                 if (!rt_is_expired(rth) &&
1045                                         !rt_may_expire(rth, tmo, expire)) {
1046                                         tmo >>= 1;
1047                                         rthp = &rth->dst.rt_next;
1048                                         continue;
1049                                 }
1050                                 *rthp = rth->dst.rt_next;
1051                                 rt_free(rth);
1052                                 goal--;
1053                         }
1054                         spin_unlock_bh(rt_hash_lock_addr(k));
1055                         if (goal <= 0)
1056                                 break;
1057                 }
1058                 rover = k;
1059
1060                 if (goal <= 0)
1061                         goto work_done;
1062
1063                 /* Goal is not achieved. We stop process if:
1064
1065                    - if expire reduced to zero. Otherwise, expire is halfed.
1066                    - if table is not full.
1067                    - if we are called from interrupt.
1068                    - jiffies check is just fallback/debug loop breaker.
1069                      We will not spin here for long time in any case.
1070                  */
1071
1072                 RT_CACHE_STAT_INC(gc_goal_miss);
1073
1074                 if (expire == 0)
1075                         break;
1076
1077                 expire >>= 1;
1078
1079                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                         goto out;
1081         } while (!in_softirq() && time_before_eq(jiffies, now));
1082
1083         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084                 goto out;
1085         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086                 goto out;
1087         if (net_ratelimit())
1088                 printk(KERN_WARNING "dst cache overflow\n");
1089         RT_CACHE_STAT_INC(gc_dst_overflow);
1090         return 1;
1091
1092 work_done:
1093         expire += ip_rt_gc_min_interval;
1094         if (expire > ip_rt_gc_timeout ||
1095             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097                 expire = ip_rt_gc_timeout;
1098 out:    return 0;
1099 }
1100
1101 /*
1102  * Returns number of entries in a hash chain that have different hash_inputs
1103  */
1104 static int slow_chain_length(const struct rtable *head)
1105 {
1106         int length = 0;
1107         const struct rtable *rth = head;
1108
1109         while (rth) {
1110                 length += has_noalias(head, rth);
1111                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1112         }
1113         return length >> FRACT_BITS;
1114 }
1115
1116 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117 {
1118         struct neigh_table *tbl = &arp_tbl;
1119         static const __be32 inaddr_any = 0;
1120         struct net_device *dev = dst->dev;
1121         const __be32 *pkey = daddr;
1122         struct neighbour *n;
1123
1124 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1125         if (dev->type == ARPHRD_ATM)
1126                 tbl = clip_tbl_hook;
1127 #endif
1128         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1129                 pkey = &inaddr_any;
1130
1131         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1132         if (n)
1133                 return n;
1134         return neigh_create(tbl, pkey, dev);
1135 }
1136
1137 static int rt_bind_neighbour(struct rtable *rt)
1138 {
1139         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1140         if (IS_ERR(n))
1141                 return PTR_ERR(n);
1142         dst_set_neighbour(&rt->dst, n);
1143
1144         return 0;
1145 }
1146
1147 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148                                      struct sk_buff *skb, int ifindex)
1149 {
1150         struct rtable   *rth, *cand;
1151         struct rtable __rcu **rthp, **candp;
1152         unsigned long   now;
1153         u32             min_score;
1154         int             chain_length;
1155         int attempts = !in_softirq();
1156
1157 restart:
1158         chain_length = 0;
1159         min_score = ~(u32)0;
1160         cand = NULL;
1161         candp = NULL;
1162         now = jiffies;
1163
1164         if (!rt_caching(dev_net(rt->dst.dev))) {
1165                 /*
1166                  * If we're not caching, just tell the caller we
1167                  * were successful and don't touch the route.  The
1168                  * caller hold the sole reference to the cache entry, and
1169                  * it will be released when the caller is done with it.
1170                  * If we drop it here, the callers have no way to resolve routes
1171                  * when we're not caching.  Instead, just point *rp at rt, so
1172                  * the caller gets a single use out of the route
1173                  * Note that we do rt_free on this new route entry, so that
1174                  * once its refcount hits zero, we are still able to reap it
1175                  * (Thanks Alexey)
1176                  * Note: To avoid expensive rcu stuff for this uncached dst,
1177                  * we set DST_NOCACHE so that dst_release() can free dst without
1178                  * waiting a grace period.
1179                  */
1180
1181                 rt->dst.flags |= DST_NOCACHE;
1182                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183                         int err = rt_bind_neighbour(rt);
1184                         if (err) {
1185                                 if (net_ratelimit())
1186                                         printk(KERN_WARNING
1187                                             "Neighbour table failure & not caching routes.\n");
1188                                 ip_rt_put(rt);
1189                                 return ERR_PTR(err);
1190                         }
1191                 }
1192
1193                 goto skip_hashing;
1194         }
1195
1196         rthp = &rt_hash_table[hash].chain;
1197
1198         spin_lock_bh(rt_hash_lock_addr(hash));
1199         while ((rth = rcu_dereference_protected(*rthp,
1200                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1201                 if (rt_is_expired(rth)) {
1202                         *rthp = rth->dst.rt_next;
1203                         rt_free(rth);
1204                         continue;
1205                 }
1206                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1207                         /* Put it first */
1208                         *rthp = rth->dst.rt_next;
1209                         /*
1210                          * Since lookup is lockfree, the deletion
1211                          * must be visible to another weakly ordered CPU before
1212                          * the insertion at the start of the hash chain.
1213                          */
1214                         rcu_assign_pointer(rth->dst.rt_next,
1215                                            rt_hash_table[hash].chain);
1216                         /*
1217                          * Since lookup is lockfree, the update writes
1218                          * must be ordered for consistency on SMP.
1219                          */
1220                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1221
1222                         dst_use(&rth->dst, now);
1223                         spin_unlock_bh(rt_hash_lock_addr(hash));
1224
1225                         rt_drop(rt);
1226                         if (skb)
1227                                 skb_dst_set(skb, &rth->dst);
1228                         return rth;
1229                 }
1230
1231                 if (!atomic_read(&rth->dst.__refcnt)) {
1232                         u32 score = rt_score(rth);
1233
1234                         if (score <= min_score) {
1235                                 cand = rth;
1236                                 candp = rthp;
1237                                 min_score = score;
1238                         }
1239                 }
1240
1241                 chain_length++;
1242
1243                 rthp = &rth->dst.rt_next;
1244         }
1245
1246         if (cand) {
1247                 /* ip_rt_gc_elasticity used to be average length of chain
1248                  * length, when exceeded gc becomes really aggressive.
1249                  *
1250                  * The second limit is less certain. At the moment it allows
1251                  * only 2 entries per bucket. We will see.
1252                  */
1253                 if (chain_length > ip_rt_gc_elasticity) {
1254                         *candp = cand->dst.rt_next;
1255                         rt_free(cand);
1256                 }
1257         } else {
1258                 if (chain_length > rt_chain_length_max &&
1259                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1260                         struct net *net = dev_net(rt->dst.dev);
1261                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1262                         if (!rt_caching(net)) {
1263                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1264                                         rt->dst.dev->name, num);
1265                         }
1266                         rt_emergency_hash_rebuild(net);
1267                         spin_unlock_bh(rt_hash_lock_addr(hash));
1268
1269                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1270                                         ifindex, rt_genid(net));
1271                         goto restart;
1272                 }
1273         }
1274
1275         /* Try to bind route to arp only if it is output
1276            route or unicast forwarding path.
1277          */
1278         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1279                 int err = rt_bind_neighbour(rt);
1280                 if (err) {
1281                         spin_unlock_bh(rt_hash_lock_addr(hash));
1282
1283                         if (err != -ENOBUFS) {
1284                                 rt_drop(rt);
1285                                 return ERR_PTR(err);
1286                         }
1287
1288                         /* Neighbour tables are full and nothing
1289                            can be released. Try to shrink route cache,
1290                            it is most likely it holds some neighbour records.
1291                          */
1292                         if (attempts-- > 0) {
1293                                 int saved_elasticity = ip_rt_gc_elasticity;
1294                                 int saved_int = ip_rt_gc_min_interval;
1295                                 ip_rt_gc_elasticity     = 1;
1296                                 ip_rt_gc_min_interval   = 0;
1297                                 rt_garbage_collect(&ipv4_dst_ops);
1298                                 ip_rt_gc_min_interval   = saved_int;
1299                                 ip_rt_gc_elasticity     = saved_elasticity;
1300                                 goto restart;
1301                         }
1302
1303                         if (net_ratelimit())
1304                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1305                         rt_drop(rt);
1306                         return ERR_PTR(-ENOBUFS);
1307                 }
1308         }
1309
1310         rt->dst.rt_next = rt_hash_table[hash].chain;
1311
1312         /*
1313          * Since lookup is lockfree, we must make sure
1314          * previous writes to rt are committed to memory
1315          * before making rt visible to other CPUS.
1316          */
1317         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1318
1319         spin_unlock_bh(rt_hash_lock_addr(hash));
1320
1321 skip_hashing:
1322         if (skb)
1323                 skb_dst_set(skb, &rt->dst);
1324         return rt;
1325 }
1326
1327 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1328
1329 static u32 rt_peer_genid(void)
1330 {
1331         return atomic_read(&__rt_peer_genid);
1332 }
1333
1334 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1335 {
1336         struct inet_peer *peer;
1337
1338         peer = inet_getpeer_v4(daddr, create);
1339
1340         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1341                 inet_putpeer(peer);
1342         else
1343                 rt->rt_peer_genid = rt_peer_genid();
1344 }
1345
1346 /*
1347  * Peer allocation may fail only in serious out-of-memory conditions.  However
1348  * we still can generate some output.
1349  * Random ID selection looks a bit dangerous because we have no chances to
1350  * select ID being unique in a reasonable period of time.
1351  * But broken packet identifier may be better than no packet at all.
1352  */
1353 static void ip_select_fb_ident(struct iphdr *iph)
1354 {
1355         static DEFINE_SPINLOCK(ip_fb_id_lock);
1356         static u32 ip_fallback_id;
1357         u32 salt;
1358
1359         spin_lock_bh(&ip_fb_id_lock);
1360         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1361         iph->id = htons(salt & 0xFFFF);
1362         ip_fallback_id = salt;
1363         spin_unlock_bh(&ip_fb_id_lock);
1364 }
1365
1366 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1367 {
1368         struct rtable *rt = (struct rtable *) dst;
1369
1370         if (rt) {
1371                 if (rt->peer == NULL)
1372                         rt_bind_peer(rt, rt->rt_dst, 1);
1373
1374                 /* If peer is attached to destination, it is never detached,
1375                    so that we need not to grab a lock to dereference it.
1376                  */
1377                 if (rt->peer) {
1378                         iph->id = htons(inet_getid(rt->peer, more));
1379                         return;
1380                 }
1381         } else
1382                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1383                        __builtin_return_address(0));
1384
1385         ip_select_fb_ident(iph);
1386 }
1387 EXPORT_SYMBOL(__ip_select_ident);
1388
1389 static void rt_del(unsigned hash, struct rtable *rt)
1390 {
1391         struct rtable __rcu **rthp;
1392         struct rtable *aux;
1393
1394         rthp = &rt_hash_table[hash].chain;
1395         spin_lock_bh(rt_hash_lock_addr(hash));
1396         ip_rt_put(rt);
1397         while ((aux = rcu_dereference_protected(*rthp,
1398                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1399                 if (aux == rt || rt_is_expired(aux)) {
1400                         *rthp = aux->dst.rt_next;
1401                         rt_free(aux);
1402                         continue;
1403                 }
1404                 rthp = &aux->dst.rt_next;
1405         }
1406         spin_unlock_bh(rt_hash_lock_addr(hash));
1407 }
1408
1409 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1410 {
1411         struct rtable *rt = (struct rtable *) dst;
1412         __be32 orig_gw = rt->rt_gateway;
1413         struct neighbour *n, *old_n;
1414
1415         dst_confirm(&rt->dst);
1416
1417         rt->rt_gateway = peer->redirect_learned.a4;
1418
1419         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1420         if (IS_ERR(n)) {
1421                 rt->rt_gateway = orig_gw;
1422                 return;
1423         }
1424         old_n = xchg(&rt->dst._neighbour, n);
1425         if (old_n)
1426                 neigh_release(old_n);
1427         if (!(n->nud_state & NUD_VALID)) {
1428                 neigh_event_send(n, NULL);
1429         } else {
1430                 rt->rt_flags |= RTCF_REDIRECTED;
1431                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1432         }
1433 }
1434
1435 /* called in rcu_read_lock() section */
1436 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1437                     __be32 saddr, struct net_device *dev)
1438 {
1439         int s, i;
1440         struct in_device *in_dev = __in_dev_get_rcu(dev);
1441         __be32 skeys[2] = { saddr, 0 };
1442         int    ikeys[2] = { dev->ifindex, 0 };
1443         struct inet_peer *peer;
1444         struct net *net;
1445
1446         if (!in_dev)
1447                 return;
1448
1449         net = dev_net(dev);
1450         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1451             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1452             ipv4_is_zeronet(new_gw))
1453                 goto reject_redirect;
1454
1455         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1456                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1457                         goto reject_redirect;
1458                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1459                         goto reject_redirect;
1460         } else {
1461                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1462                         goto reject_redirect;
1463         }
1464
1465         for (s = 0; s < 2; s++) {
1466                 for (i = 0; i < 2; i++) {
1467                         unsigned int hash;
1468                         struct rtable __rcu **rthp;
1469                         struct rtable *rt;
1470
1471                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1472
1473                         rthp = &rt_hash_table[hash].chain;
1474
1475                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1476                                 rthp = &rt->dst.rt_next;
1477
1478                                 if (rt->rt_key_dst != daddr ||
1479                                     rt->rt_key_src != skeys[s] ||
1480                                     rt->rt_oif != ikeys[i] ||
1481                                     rt_is_input_route(rt) ||
1482                                     rt_is_expired(rt) ||
1483                                     !net_eq(dev_net(rt->dst.dev), net) ||
1484                                     rt->dst.error ||
1485                                     rt->dst.dev != dev ||
1486                                     rt->rt_gateway != old_gw)
1487                                         continue;
1488
1489                                 if (!rt->peer)
1490                                         rt_bind_peer(rt, rt->rt_dst, 1);
1491
1492                                 peer = rt->peer;
1493                                 if (peer) {
1494                                         if (peer->redirect_learned.a4 != new_gw ||
1495                                             peer->redirect_genid != redirect_genid) {
1496                                                 peer->redirect_learned.a4 = new_gw;
1497                                                 peer->redirect_genid = redirect_genid;
1498                                                 atomic_inc(&__rt_peer_genid);
1499                                         }
1500                                         check_peer_redir(&rt->dst, peer);
1501                                 }
1502                         }
1503                 }
1504         }
1505         return;
1506
1507 reject_redirect:
1508 #ifdef CONFIG_IP_ROUTE_VERBOSE
1509         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1510                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1511                         "  Advised path = %pI4 -> %pI4\n",
1512                        &old_gw, dev->name, &new_gw,
1513                        &saddr, &daddr);
1514 #endif
1515         ;
1516 }
1517
1518 static bool peer_pmtu_expired(struct inet_peer *peer)
1519 {
1520         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522         return orig &&
1523                time_after_eq(jiffies, orig) &&
1524                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525 }
1526
1527 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1528 {
1529         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1530
1531         return orig &&
1532                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1533 }
1534
1535 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1536 {
1537         struct rtable *rt = (struct rtable *)dst;
1538         struct dst_entry *ret = dst;
1539
1540         if (rt) {
1541                 if (dst->obsolete > 0) {
1542                         ip_rt_put(rt);
1543                         ret = NULL;
1544                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1545                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1546                                                 rt->rt_oif,
1547                                                 rt_genid(dev_net(dst->dev)));
1548                         rt_del(hash, rt);
1549                         ret = NULL;
1550                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1551                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1552                 }
1553         }
1554         return ret;
1555 }
1556
1557 /*
1558  * Algorithm:
1559  *      1. The first ip_rt_redirect_number redirects are sent
1560  *         with exponential backoff, then we stop sending them at all,
1561  *         assuming that the host ignores our redirects.
1562  *      2. If we did not see packets requiring redirects
1563  *         during ip_rt_redirect_silence, we assume that the host
1564  *         forgot redirected route and start to send redirects again.
1565  *
1566  * This algorithm is much cheaper and more intelligent than dumb load limiting
1567  * in icmp.c.
1568  *
1569  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1570  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1571  */
1572
1573 void ip_rt_send_redirect(struct sk_buff *skb)
1574 {
1575         struct rtable *rt = skb_rtable(skb);
1576         struct in_device *in_dev;
1577         struct inet_peer *peer;
1578         int log_martians;
1579
1580         rcu_read_lock();
1581         in_dev = __in_dev_get_rcu(rt->dst.dev);
1582         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1583                 rcu_read_unlock();
1584                 return;
1585         }
1586         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1587         rcu_read_unlock();
1588
1589         if (!rt->peer)
1590                 rt_bind_peer(rt, rt->rt_dst, 1);
1591         peer = rt->peer;
1592         if (!peer) {
1593                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1594                 return;
1595         }
1596
1597         /* No redirected packets during ip_rt_redirect_silence;
1598          * reset the algorithm.
1599          */
1600         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1601                 peer->rate_tokens = 0;
1602
1603         /* Too many ignored redirects; do not send anything
1604          * set dst.rate_last to the last seen redirected packet.
1605          */
1606         if (peer->rate_tokens >= ip_rt_redirect_number) {
1607                 peer->rate_last = jiffies;
1608                 return;
1609         }
1610
1611         /* Check for load limit; set rate_last to the latest sent
1612          * redirect.
1613          */
1614         if (peer->rate_tokens == 0 ||
1615             time_after(jiffies,
1616                        (peer->rate_last +
1617                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1618                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1619                 peer->rate_last = jiffies;
1620                 ++peer->rate_tokens;
1621 #ifdef CONFIG_IP_ROUTE_VERBOSE
1622                 if (log_martians &&
1623                     peer->rate_tokens == ip_rt_redirect_number &&
1624                     net_ratelimit())
1625                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1626                                &ip_hdr(skb)->saddr, rt->rt_iif,
1627                                 &rt->rt_dst, &rt->rt_gateway);
1628 #endif
1629         }
1630 }
1631
1632 static int ip_error(struct sk_buff *skb)
1633 {
1634         struct rtable *rt = skb_rtable(skb);
1635         struct inet_peer *peer;
1636         unsigned long now;
1637         bool send;
1638         int code;
1639
1640         switch (rt->dst.error) {
1641         case EINVAL:
1642         default:
1643                 goto out;
1644         case EHOSTUNREACH:
1645                 code = ICMP_HOST_UNREACH;
1646                 break;
1647         case ENETUNREACH:
1648                 code = ICMP_NET_UNREACH;
1649                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1650                                 IPSTATS_MIB_INNOROUTES);
1651                 break;
1652         case EACCES:
1653                 code = ICMP_PKT_FILTERED;
1654                 break;
1655         }
1656
1657         if (!rt->peer)
1658                 rt_bind_peer(rt, rt->rt_dst, 1);
1659         peer = rt->peer;
1660
1661         send = true;
1662         if (peer) {
1663                 now = jiffies;
1664                 peer->rate_tokens += now - peer->rate_last;
1665                 if (peer->rate_tokens > ip_rt_error_burst)
1666                         peer->rate_tokens = ip_rt_error_burst;
1667                 peer->rate_last = now;
1668                 if (peer->rate_tokens >= ip_rt_error_cost)
1669                         peer->rate_tokens -= ip_rt_error_cost;
1670                 else
1671                         send = false;
1672         }
1673         if (send)
1674                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1675
1676 out:    kfree_skb(skb);
1677         return 0;
1678 }
1679
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (i.e. largest) plateau that is strictly smaller than
 * old_mtu, or 68 when old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(mtu_plateau)) {
		if (mtu_plateau[idx] < old_mtu)
			return mtu_plateau[idx];
		idx++;
	}
	return 68;
}
1697
1698 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1699                                  unsigned short new_mtu,
1700                                  struct net_device *dev)
1701 {
1702         unsigned short old_mtu = ntohs(iph->tot_len);
1703         unsigned short est_mtu = 0;
1704         struct inet_peer *peer;
1705
1706         peer = inet_getpeer_v4(iph->daddr, 1);
1707         if (peer) {
1708                 unsigned short mtu = new_mtu;
1709
1710                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1711                         /* BSD 4.2 derived systems incorrectly adjust
1712                          * tot_len by the IP header length, and report
1713                          * a zero MTU in the ICMP message.
1714                          */
1715                         if (mtu == 0 &&
1716                             old_mtu >= 68 + (iph->ihl << 2))
1717                                 old_mtu -= iph->ihl << 2;
1718                         mtu = guess_mtu(old_mtu);
1719                 }
1720
1721                 if (mtu < ip_rt_min_pmtu)
1722                         mtu = ip_rt_min_pmtu;
1723                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1724                         unsigned long pmtu_expires;
1725
1726                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1727                         if (!pmtu_expires)
1728                                 pmtu_expires = 1UL;
1729
1730                         est_mtu = mtu;
1731                         peer->pmtu_learned = mtu;
1732                         peer->pmtu_expires = pmtu_expires;
1733                         atomic_inc(&__rt_peer_genid);
1734                 }
1735
1736                 inet_putpeer(peer);
1737         }
1738         return est_mtu ? : new_mtu;
1739 }
1740
1741 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1742 {
1743         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1744
1745         if (!expires)
1746                 return;
1747         if (time_before(jiffies, expires)) {
1748                 u32 orig_dst_mtu = dst_mtu(dst);
1749                 if (peer->pmtu_learned < orig_dst_mtu) {
1750                         if (!peer->pmtu_orig)
1751                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1752                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1753                 }
1754         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1755                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1756 }
1757
1758 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1759 {
1760         struct rtable *rt = (struct rtable *) dst;
1761         struct inet_peer *peer;
1762
1763         dst_confirm(dst);
1764
1765         if (!rt->peer)
1766                 rt_bind_peer(rt, rt->rt_dst, 1);
1767         peer = rt->peer;
1768         if (peer) {
1769                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1770
1771                 if (mtu < ip_rt_min_pmtu)
1772                         mtu = ip_rt_min_pmtu;
1773                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1774
1775                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1776                         if (!pmtu_expires)
1777                                 pmtu_expires = 1UL;
1778
1779                         peer->pmtu_learned = mtu;
1780                         peer->pmtu_expires = pmtu_expires;
1781
1782                         atomic_inc(&__rt_peer_genid);
1783                         rt->rt_peer_genid = rt_peer_genid();
1784                 }
1785                 check_peer_pmtu(dst, peer);
1786         }
1787 }
1788
1789
1790 static void ipv4_validate_peer(struct rtable *rt)
1791 {
1792         if (rt->rt_peer_genid != rt_peer_genid()) {
1793                 struct inet_peer *peer;
1794
1795                 if (!rt->peer)
1796                         rt_bind_peer(rt, rt->rt_dst, 0);
1797
1798                 peer = rt->peer;
1799                 if (peer) {
1800                         check_peer_pmtu(&rt->dst, peer);
1801
1802                         if (peer->redirect_genid != redirect_genid)
1803                                 peer->redirect_learned.a4 = 0;
1804                         if (peer->redirect_learned.a4 &&
1805                             peer->redirect_learned.a4 != rt->rt_gateway)
1806                                 check_peer_redir(&rt->dst, peer);
1807                 }
1808
1809                 rt->rt_peer_genid = rt_peer_genid();
1810         }
1811 }
1812
1813 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1814 {
1815         struct rtable *rt = (struct rtable *) dst;
1816
1817         if (rt_is_expired(rt))
1818                 return NULL;
1819         ipv4_validate_peer(rt);
1820         return dst;
1821 }
1822
1823 static void ipv4_dst_destroy(struct dst_entry *dst)
1824 {
1825         struct rtable *rt = (struct rtable *) dst;
1826         struct inet_peer *peer = rt->peer;
1827
1828         if (rt->fi) {
1829                 fib_info_put(rt->fi);
1830                 rt->fi = NULL;
1831         }
1832         if (peer) {
1833                 rt->peer = NULL;
1834                 inet_putpeer(peer);
1835         }
1836 }
1837
1838
1839 static void ipv4_link_failure(struct sk_buff *skb)
1840 {
1841         struct rtable *rt;
1842
1843         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1844
1845         rt = skb_rtable(skb);
1846         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1847                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1848 }
1849
1850 static int ip_rt_bug(struct sk_buff *skb)
1851 {
1852         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1853                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1854                 skb->dev ? skb->dev->name : "?");
1855         kfree_skb(skb);
1856         WARN_ON(1);
1857         return 0;
1858 }
1859
1860 /*
1861    We do not cache source address of outgoing interface,
1862    because it is used only by IP RR, TS and SRR options,
1863    so that it out of fast path.
1864
1865    BTW remember: "addr" is allowed to be not aligned
1866    in IP options!
1867  */
1868
1869 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1870 {
1871         __be32 src;
1872
1873         if (rt_is_output_route(rt))
1874                 src = ip_hdr(skb)->saddr;
1875         else {
1876                 struct fib_result res;
1877                 struct flowi4 fl4;
1878                 struct iphdr *iph;
1879
1880                 iph = ip_hdr(skb);
1881
1882                 memset(&fl4, 0, sizeof(fl4));
1883                 fl4.daddr = iph->daddr;
1884                 fl4.saddr = iph->saddr;
1885                 fl4.flowi4_tos = RT_TOS(iph->tos);
1886                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1887                 fl4.flowi4_iif = skb->dev->ifindex;
1888                 fl4.flowi4_mark = skb->mark;
1889
1890                 rcu_read_lock();
1891                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1892                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1893                 else
1894                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1895                                         RT_SCOPE_UNIVERSE);
1896                 rcu_read_unlock();
1897         }
1898         memcpy(addr, &src, 4);
1899 }
1900
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid one 16-bit half at a time: a half is
 * taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1910
1911 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1912 {
1913         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1914
1915         if (advmss == 0) {
1916                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1917                                ip_rt_min_advmss);
1918                 if (advmss > 65535 - 40)
1919                         advmss = 65535 - 40;
1920         }
1921         return advmss;
1922 }
1923
1924 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1925 {
1926         const struct rtable *rt = (const struct rtable *) dst;
1927         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1928
1929         if (mtu && rt_is_output_route(rt))
1930                 return mtu;
1931
1932         mtu = dst->dev->mtu;
1933
1934         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1935
1936                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1937                         mtu = 576;
1938         }
1939
1940         if (mtu > IP_MAX_MTU)
1941                 mtu = IP_MAX_MTU;
1942
1943         return mtu;
1944 }
1945
1946 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1947                             struct fib_info *fi)
1948 {
1949         struct inet_peer *peer;
1950         int create = 0;
1951
1952         /* If a peer entry exists for this destination, we must hook
1953          * it up in order to get at cached metrics.
1954          */
1955         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1956                 create = 1;
1957
1958         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1959         if (peer) {
1960                 rt->rt_peer_genid = rt_peer_genid();
1961                 if (inet_metrics_new(peer))
1962                         memcpy(peer->metrics, fi->fib_metrics,
1963                                sizeof(u32) * RTAX_MAX);
1964                 dst_init_metrics(&rt->dst, peer->metrics, false);
1965
1966                 check_peer_pmtu(&rt->dst, peer);
1967                 if (peer->redirect_genid != redirect_genid)
1968                         peer->redirect_learned.a4 = 0;
1969                 if (peer->redirect_learned.a4 &&
1970                     peer->redirect_learned.a4 != rt->rt_gateway) {
1971                         rt->rt_gateway = peer->redirect_learned.a4;
1972                         rt->rt_flags |= RTCF_REDIRECTED;
1973                 }
1974         } else {
1975                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1976                         rt->fi = fi;
1977                         atomic_inc(&fi->fib_clntref);
1978                 }
1979                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1980         }
1981 }
1982
1983 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1984                            const struct fib_result *res,
1985                            struct fib_info *fi, u16 type, u32 itag)
1986 {
1987         struct dst_entry *dst = &rt->dst;
1988
1989         if (fi) {
1990                 if (FIB_RES_GW(*res) &&
1991                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1992                         rt->rt_gateway = FIB_RES_GW(*res);
1993                 rt_init_metrics(rt, fl4, fi);
1994 #ifdef CONFIG_IP_ROUTE_CLASSID
1995                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1996 #endif
1997         }
1998
1999         if (dst_mtu(dst) > IP_MAX_MTU)
2000                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2001         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2002                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2003
2004 #ifdef CONFIG_IP_ROUTE_CLASSID
2005 #ifdef CONFIG_IP_MULTIPLE_TABLES
2006         set_class_tag(rt, fib_rules_tclass(res));
2007 #endif
2008         set_class_tag(rt, itag);
2009 #endif
2010 }
2011
2012 static struct rtable *rt_dst_alloc(struct net_device *dev,
2013                                    bool nopolicy, bool noxfrm)
2014 {
2015         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2016                          DST_HOST |
2017                          (nopolicy ? DST_NOPOLICY : 0) |
2018                          (noxfrm ? DST_NOXFRM : 0));
2019 }
2020
2021 /* called in rcu_read_lock() section */
2022 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2023                                 u8 tos, struct net_device *dev, int our)
2024 {
2025         unsigned int hash;
2026         struct rtable *rth;
2027         __be32 spec_dst;
2028         struct in_device *in_dev = __in_dev_get_rcu(dev);
2029         u32 itag = 0;
2030         int err;
2031
2032         /* Primary sanity checks. */
2033
2034         if (in_dev == NULL)
2035                 return -EINVAL;
2036
2037         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2038             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2039                 goto e_inval;
2040
2041         if (ipv4_is_zeronet(saddr)) {
2042                 if (!ipv4_is_local_multicast(daddr))
2043                         goto e_inval;
2044                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2045         } else {
2046                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2047                                           &itag);
2048                 if (err < 0)
2049                         goto e_err;
2050         }
2051         rth = rt_dst_alloc(init_net.loopback_dev,
2052                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2053         if (!rth)
2054                 goto e_nobufs;
2055
2056 #ifdef CONFIG_IP_ROUTE_CLASSID
2057         rth->dst.tclassid = itag;
2058 #endif
2059         rth->dst.output = ip_rt_bug;
2060
2061         rth->rt_key_dst = daddr;
2062         rth->rt_key_src = saddr;
2063         rth->rt_genid   = rt_genid(dev_net(dev));
2064         rth->rt_flags   = RTCF_MULTICAST;
2065         rth->rt_type    = RTN_MULTICAST;
2066         rth->rt_key_tos = tos;
2067         rth->rt_dst     = daddr;
2068         rth->rt_src     = saddr;
2069         rth->rt_route_iif = dev->ifindex;
2070         rth->rt_iif     = dev->ifindex;
2071         rth->rt_oif     = 0;
2072         rth->rt_mark    = skb->mark;
2073         rth->rt_gateway = daddr;
2074         rth->rt_spec_dst= spec_dst;
2075         rth->rt_peer_genid = 0;
2076         rth->peer = NULL;
2077         rth->fi = NULL;
2078         if (our) {
2079                 rth->dst.input= ip_local_deliver;
2080                 rth->rt_flags |= RTCF_LOCAL;
2081         }
2082
2083 #ifdef CONFIG_IP_MROUTE
2084         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2085                 rth->dst.input = ip_mr_input;
2086 #endif
2087         RT_CACHE_STAT_INC(in_slow_mc);
2088
2089         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2090         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2091         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2092
2093 e_nobufs:
2094         return -ENOBUFS;
2095 e_inval:
2096         return -EINVAL;
2097 e_err:
2098         return err;
2099 }
2100
2101
2102 static void ip_handle_martian_source(struct net_device *dev,
2103                                      struct in_device *in_dev,
2104                                      struct sk_buff *skb,
2105                                      __be32 daddr,
2106                                      __be32 saddr)
2107 {
2108         RT_CACHE_STAT_INC(in_martian_src);
2109 #ifdef CONFIG_IP_ROUTE_VERBOSE
2110         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2111                 /*
2112                  *      RFC1812 recommendation, if source is martian,
2113                  *      the only hint is MAC header.
2114                  */
2115                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2116                         &daddr, &saddr, dev->name);
2117                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2118                         int i;
2119                         const unsigned char *p = skb_mac_header(skb);
2120                         printk(KERN_WARNING "ll header: ");
2121                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2122                                 printk("%02x", *p);
2123                                 if (i < (dev->hard_header_len - 1))
2124                                         printk(":");
2125                         }
2126                         printk("\n");
2127                 }
2128         }
2129 #endif
2130 }
2131
2132 /* called in rcu_read_lock() section */
2133 static int __mkroute_input(struct sk_buff *skb,
2134                            const struct fib_result *res,
2135                            struct in_device *in_dev,
2136                            __be32 daddr, __be32 saddr, u32 tos,
2137                            struct rtable **result)
2138 {
2139         struct rtable *rth;
2140         int err;
2141         struct in_device *out_dev;
2142         unsigned int flags = 0;
2143         __be32 spec_dst;
2144         u32 itag;
2145
2146         /* get a working reference to the output device */
2147         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2148         if (out_dev == NULL) {
2149                 if (net_ratelimit())
2150                         printk(KERN_CRIT "Bug in ip_route_input" \
2151                                "_slow(). Please, report\n");
2152                 return -EINVAL;
2153         }
2154
2155
2156         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2157                                   in_dev->dev, &spec_dst, &itag);
2158         if (err < 0) {
2159                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2160                                          saddr);
2161
2162                 goto cleanup;
2163         }
2164
2165         if (err)
2166                 flags |= RTCF_DIRECTSRC;
2167
2168         if (out_dev == in_dev && err &&
2169             (IN_DEV_SHARED_MEDIA(out_dev) ||
2170              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2171                 flags |= RTCF_DOREDIRECT;
2172
2173         if (skb->protocol != htons(ETH_P_IP)) {
2174                 /* Not IP (i.e. ARP). Do not create route, if it is
2175                  * invalid for proxy arp. DNAT routes are always valid.
2176                  *
2177                  * Proxy arp feature have been extended to allow, ARP
2178                  * replies back to the same interface, to support
2179                  * Private VLAN switch technologies. See arp.c.
2180                  */
2181                 if (out_dev == in_dev &&
2182                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2183                         err = -EINVAL;
2184                         goto cleanup;
2185                 }
2186         }
2187
2188         rth = rt_dst_alloc(out_dev->dev,
2189                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2190                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2191         if (!rth) {
2192                 err = -ENOBUFS;
2193                 goto cleanup;
2194         }
2195
2196         rth->rt_key_dst = daddr;
2197         rth->rt_key_src = saddr;
2198         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2199         rth->rt_flags = flags;
2200         rth->rt_type = res->type;
2201         rth->rt_key_tos = tos;
2202         rth->rt_dst     = daddr;
2203         rth->rt_src     = saddr;
2204         rth->rt_route_iif = in_dev->dev->ifindex;
2205         rth->rt_iif     = in_dev->dev->ifindex;
2206         rth->rt_oif     = 0;
2207         rth->rt_mark    = skb->mark;
2208         rth->rt_gateway = daddr;
2209         rth->rt_spec_dst= spec_dst;
2210         rth->rt_peer_genid = 0;
2211         rth->peer = NULL;
2212         rth->fi = NULL;
2213
2214         rth->dst.input = ip_forward;
2215         rth->dst.output = ip_output;
2216
2217         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2218
2219         *result = rth;
2220         err = 0;
2221  cleanup:
2222         return err;
2223 }
2224
2225 static int ip_mkroute_input(struct sk_buff *skb,
2226                             struct fib_result *res,
2227                             const struct flowi4 *fl4,
2228                             struct in_device *in_dev,
2229                             __be32 daddr, __be32 saddr, u32 tos)
2230 {
2231         struct rtable* rth = NULL;
2232         int err;
2233         unsigned hash;
2234
2235 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2236         if (res->fi && res->fi->fib_nhs > 1)
2237                 fib_select_multipath(res);
2238 #endif
2239
2240         /* create a routing cache entry */
2241         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2242         if (err)
2243                 return err;
2244
2245         /* put it into the cache */
2246         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2247                        rt_genid(dev_net(rth->dst.dev)));
2248         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2249         if (IS_ERR(rth))
2250                 return PTR_ERR(rth);
2251         return 0;
2252 }
2253
2254 /*
2255  *      NOTE. We drop all the packets that has local source
2256  *      addresses, because every properly looped back packet
2257  *      must have correct destination already attached by output routine.
2258  *
2259  *      Such approach solves two big problems:
2260  *      1. Not simplex devices are handled properly.
2261  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2262  *      called with rcu_read_lock()
2263  */
2264
2265 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2266                                u8 tos, struct net_device *dev)
2267 {
2268         struct fib_result res;
2269         struct in_device *in_dev = __in_dev_get_rcu(dev);
2270         struct flowi4   fl4;
2271         unsigned        flags = 0;
2272         u32             itag = 0;
2273         struct rtable * rth;
2274         unsigned        hash;
2275         __be32          spec_dst;
2276         int             err = -EINVAL;
2277         struct net    * net = dev_net(dev);
2278
2279         /* IP on this device is disabled. */
2280
2281         if (!in_dev)
2282                 goto out;
2283
2284         /* Check for the most weird martians, which can be not detected
2285            by fib_lookup.
2286          */
2287
2288         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2289             ipv4_is_loopback(saddr))
2290                 goto martian_source;
2291
2292         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2293                 goto brd_input;
2294
2295         /* Accept zero addresses only to limited broadcast;
2296          * I even do not know to fix it or not. Waiting for complains :-)
2297          */
2298         if (ipv4_is_zeronet(saddr))
2299                 goto martian_source;
2300
2301         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2302                 goto martian_destination;
2303
2304         /*
2305          *      Now we are ready to route packet.
2306          */
2307         fl4.flowi4_oif = 0;
2308         fl4.flowi4_iif = dev->ifindex;
2309         fl4.flowi4_mark = skb->mark;
2310         fl4.flowi4_tos = tos;
2311         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2312         fl4.daddr = daddr;
2313         fl4.saddr = saddr;
2314         err = fib_lookup(net, &fl4, &res);
2315         if (err != 0) {
2316                 if (!IN_DEV_FORWARD(in_dev))
2317                         goto e_hostunreach;
2318                 goto no_route;
2319         }
2320
2321         RT_CACHE_STAT_INC(in_slow_tot);
2322
2323         if (res.type == RTN_BROADCAST)
2324                 goto brd_input;
2325
2326         if (res.type == RTN_LOCAL) {
2327                 err = fib_validate_source(skb, saddr, daddr, tos,
2328                                           net->loopback_dev->ifindex,
2329                                           dev, &spec_dst, &itag);
2330                 if (err < 0)
2331                         goto martian_source_keep_err;
2332                 if (err)
2333                         flags |= RTCF_DIRECTSRC;
2334                 spec_dst = daddr;
2335                 goto local_input;
2336         }
2337
2338         if (!IN_DEV_FORWARD(in_dev))
2339                 goto e_hostunreach;
2340         if (res.type != RTN_UNICAST)
2341                 goto martian_destination;
2342
2343         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2344 out:    return err;
2345
2346 brd_input:
2347         if (skb->protocol != htons(ETH_P_IP))
2348                 goto e_inval;
2349
2350         if (ipv4_is_zeronet(saddr))
2351                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2352         else {
2353                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2354                                           &itag);
2355                 if (err < 0)
2356                         goto martian_source_keep_err;
2357                 if (err)
2358                         flags |= RTCF_DIRECTSRC;
2359         }
2360         flags |= RTCF_BROADCAST;
2361         res.type = RTN_BROADCAST;
2362         RT_CACHE_STAT_INC(in_brd);
2363
2364 local_input:
2365         rth = rt_dst_alloc(net->loopback_dev,
2366                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2367         if (!rth)
2368                 goto e_nobufs;
2369
2370         rth->dst.input= ip_local_deliver;
2371         rth->dst.output= ip_rt_bug;
2372 #ifdef CONFIG_IP_ROUTE_CLASSID
2373         rth->dst.tclassid = itag;
2374 #endif
2375
2376         rth->rt_key_dst = daddr;
2377         rth->rt_key_src = saddr;
2378         rth->rt_genid = rt_genid(net);
2379         rth->rt_flags   = flags|RTCF_LOCAL;
2380         rth->rt_type    = res.type;
2381         rth->rt_key_tos = tos;
2382         rth->rt_dst     = daddr;
2383         rth->rt_src     = saddr;
2384 #ifdef CONFIG_IP_ROUTE_CLASSID
2385         rth->dst.tclassid = itag;
2386 #endif
2387         rth->rt_route_iif = dev->ifindex;
2388         rth->rt_iif     = dev->ifindex;
2389         rth->rt_oif     = 0;
2390         rth->rt_mark    = skb->mark;
2391         rth->rt_gateway = daddr;
2392         rth->rt_spec_dst= spec_dst;
2393         rth->rt_peer_genid = 0;
2394         rth->peer = NULL;
2395         rth->fi = NULL;
2396         if (res.type == RTN_UNREACHABLE) {
2397                 rth->dst.input= ip_error;
2398                 rth->dst.error= -err;
2399                 rth->rt_flags   &= ~RTCF_LOCAL;
2400         }
2401         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2402         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2403         err = 0;
2404         if (IS_ERR(rth))
2405                 err = PTR_ERR(rth);
2406         goto out;
2407
2408 no_route:
2409         RT_CACHE_STAT_INC(in_no_route);
2410         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2411         res.type = RTN_UNREACHABLE;
2412         if (err == -ESRCH)
2413                 err = -ENETUNREACH;
2414         goto local_input;
2415
2416         /*
2417          *      Do not cache martian addresses: they should be logged (RFC1812)
2418          */
2419 martian_destination:
2420         RT_CACHE_STAT_INC(in_martian_dst);
2421 #ifdef CONFIG_IP_ROUTE_VERBOSE
2422         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2423                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2424                         &daddr, &saddr, dev->name);
2425 #endif
2426
2427 e_hostunreach:
2428         err = -EHOSTUNREACH;
2429         goto out;
2430
2431 e_inval:
2432         err = -EINVAL;
2433         goto out;
2434
2435 e_nobufs:
2436         err = -ENOBUFS;
2437         goto out;
2438
2439 martian_source:
2440         err = -EINVAL;
2441 martian_source_keep_err:
2442         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2443         goto out;
2444 }
2445
2446 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2447                            u8 tos, struct net_device *dev, bool noref)
2448 {
2449         struct rtable * rth;
2450         unsigned        hash;
2451         int iif = dev->ifindex;
2452         struct net *net;
2453         int res;
2454
2455         net = dev_net(dev);
2456
2457         rcu_read_lock();
2458
2459         if (!rt_caching(net))
2460                 goto skip_cache;
2461
2462         tos &= IPTOS_RT_MASK;
2463         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2464
2465         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2466              rth = rcu_dereference(rth->dst.rt_next)) {
2467                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2468                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2469                      (rth->rt_route_iif ^ iif) |
2470                      (rth->rt_key_tos ^ tos)) == 0 &&
2471                     rth->rt_mark == skb->mark &&
2472                     net_eq(dev_net(rth->dst.dev), net) &&
2473                     !rt_is_expired(rth)) {
2474                         ipv4_validate_peer(rth);
2475                         if (noref) {
2476                                 dst_use_noref(&rth->dst, jiffies);
2477                                 skb_dst_set_noref(skb, &rth->dst);
2478                         } else {
2479                                 dst_use(&rth->dst, jiffies);
2480                                 skb_dst_set(skb, &rth->dst);
2481                         }
2482                         RT_CACHE_STAT_INC(in_hit);
2483                         rcu_read_unlock();
2484                         return 0;
2485                 }
2486                 RT_CACHE_STAT_INC(in_hlist_search);
2487         }
2488
2489 skip_cache:
2490         /* Multicast recognition logic is moved from route cache to here.
2491            The problem was that too many Ethernet cards have broken/missing
2492            hardware multicast filters :-( As result the host on multicasting
2493            network acquires a lot of useless route cache entries, sort of
2494            SDR messages from all the world. Now we try to get rid of them.
2495            Really, provided software IP multicast filter is organized
2496            reasonably (at least, hashed), it does not result in a slowdown
2497            comparing with route cache reject entries.
2498            Note, that multicast routers are not affected, because
2499            route cache entry is created eventually.
2500          */
2501         if (ipv4_is_multicast(daddr)) {
2502                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2503
2504                 if (in_dev) {
2505                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2506                                                   ip_hdr(skb)->protocol);
2507                         if (our
2508 #ifdef CONFIG_IP_MROUTE
2509                                 ||
2510                             (!ipv4_is_local_multicast(daddr) &&
2511                              IN_DEV_MFORWARD(in_dev))
2512 #endif
2513                            ) {
2514                                 int res = ip_route_input_mc(skb, daddr, saddr,
2515                                                             tos, dev, our);
2516                                 rcu_read_unlock();
2517                                 return res;
2518                         }
2519                 }
2520                 rcu_read_unlock();
2521                 return -EINVAL;
2522         }
2523         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2524         rcu_read_unlock();
2525         return res;
2526 }
2527 EXPORT_SYMBOL(ip_route_input_common);
2528
2529 /* called with rcu_read_lock() */
2530 static struct rtable *__mkroute_output(const struct fib_result *res,
2531                                        const struct flowi4 *fl4,
2532                                        __be32 orig_daddr, __be32 orig_saddr,
2533                                        int orig_oif, __u8 orig_rtos,
2534                                        struct net_device *dev_out,
2535                                        unsigned int flags)
2536 {
2537         struct fib_info *fi = res->fi;
2538         struct in_device *in_dev;
2539         u16 type = res->type;
2540         struct rtable *rth;
2541
2542         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2543                 return ERR_PTR(-EINVAL);
2544
2545         if (ipv4_is_lbcast(fl4->daddr))
2546                 type = RTN_BROADCAST;
2547         else if (ipv4_is_multicast(fl4->daddr))
2548                 type = RTN_MULTICAST;
2549         else if (ipv4_is_zeronet(fl4->daddr))
2550                 return ERR_PTR(-EINVAL);
2551
2552         if (dev_out->flags & IFF_LOOPBACK)
2553                 flags |= RTCF_LOCAL;
2554
2555         in_dev = __in_dev_get_rcu(dev_out);
2556         if (!in_dev)
2557                 return ERR_PTR(-EINVAL);
2558
2559         if (type == RTN_BROADCAST) {
2560                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2561                 fi = NULL;
2562         } else if (type == RTN_MULTICAST) {
2563                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2564                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2565                                      fl4->flowi4_proto))
2566                         flags &= ~RTCF_LOCAL;
2567                 /* If multicast route do not exist use
2568                  * default one, but do not gateway in this case.
2569                  * Yes, it is hack.
2570                  */
2571                 if (fi && res->prefixlen < 4)
2572                         fi = NULL;
2573         }
2574
2575         rth = rt_dst_alloc(dev_out,
2576                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2577                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2578         if (!rth)
2579                 return ERR_PTR(-ENOBUFS);
2580
2581         rth->dst.output = ip_output;
2582
2583         rth->rt_key_dst = orig_daddr;
2584         rth->rt_key_src = orig_saddr;
2585         rth->rt_genid = rt_genid(dev_net(dev_out));
2586         rth->rt_flags   = flags;
2587         rth->rt_type    = type;
2588         rth->rt_key_tos = orig_rtos;
2589         rth->rt_dst     = fl4->daddr;
2590         rth->rt_src     = fl4->saddr;
2591         rth->rt_route_iif = 0;
2592         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2593         rth->rt_oif     = orig_oif;
2594         rth->rt_mark    = fl4->flowi4_mark;
2595         rth->rt_gateway = fl4->daddr;
2596         rth->rt_spec_dst= fl4->saddr;
2597         rth->rt_peer_genid = 0;
2598         rth->peer = NULL;
2599         rth->fi = NULL;
2600
2601         RT_CACHE_STAT_INC(out_slow_tot);
2602
2603         if (flags & RTCF_LOCAL) {
2604                 rth->dst.input = ip_local_deliver;
2605                 rth->rt_spec_dst = fl4->daddr;
2606         }
2607         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2608                 rth->rt_spec_dst = fl4->saddr;
2609                 if (flags & RTCF_LOCAL &&
2610                     !(dev_out->flags & IFF_LOOPBACK)) {
2611                         rth->dst.output = ip_mc_output;
2612                         RT_CACHE_STAT_INC(out_slow_mc);
2613                 }
2614 #ifdef CONFIG_IP_MROUTE
2615                 if (type == RTN_MULTICAST) {
2616                         if (IN_DEV_MFORWARD(in_dev) &&
2617                             !ipv4_is_local_multicast(fl4->daddr)) {
2618                                 rth->dst.input = ip_mr_input;
2619                                 rth->dst.output = ip_mc_output;
2620                         }
2621                 }
2622 #endif
2623         }
2624
2625         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2626
2627         return rth;
2628 }
2629
2630 /*
2631  * Major route resolver routine.
2632  * called with rcu_read_lock();
2633  */
2634
2635 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2636 {
2637         struct net_device *dev_out = NULL;
2638         __u8 tos = RT_FL_TOS(fl4);
2639         unsigned int flags = 0;
2640         struct fib_result res;
2641         struct rtable *rth;
2642         __be32 orig_daddr;
2643         __be32 orig_saddr;
2644         int orig_oif;
2645
2646         res.fi          = NULL;
2647 #ifdef CONFIG_IP_MULTIPLE_TABLES
2648         res.r           = NULL;
2649 #endif
2650
2651         orig_daddr = fl4->daddr;
2652         orig_saddr = fl4->saddr;
2653         orig_oif = fl4->flowi4_oif;
2654
2655         fl4->flowi4_iif = net->loopback_dev->ifindex;
2656         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2657         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2658                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2659
2660         rcu_read_lock();
2661         if (fl4->saddr) {
2662                 rth = ERR_PTR(-EINVAL);
2663                 if (ipv4_is_multicast(fl4->saddr) ||
2664                     ipv4_is_lbcast(fl4->saddr) ||
2665                     ipv4_is_zeronet(fl4->saddr))
2666                         goto out;
2667
2668                 /* I removed check for oif == dev_out->oif here.
2669                    It was wrong for two reasons:
2670                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2671                       is assigned to multiple interfaces.
2672                    2. Moreover, we are allowed to send packets with saddr
2673                       of another iface. --ANK
2674                  */
2675
2676                 if (fl4->flowi4_oif == 0 &&
2677                     (ipv4_is_multicast(fl4->daddr) ||
2678                      ipv4_is_lbcast(fl4->daddr))) {
2679                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2680                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2681                         if (dev_out == NULL)
2682                                 goto out;
2683
2684                         /* Special hack: user can direct multicasts
2685                            and limited broadcast via necessary interface
2686                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2687                            This hack is not just for fun, it allows
2688                            vic,vat and friends to work.
2689                            They bind socket to loopback, set ttl to zero
2690                            and expect that it will work.
2691                            From the viewpoint of routing cache they are broken,
2692                            because we are not allowed to build multicast path
2693                            with loopback source addr (look, routing cache
2694                            cannot know, that ttl is zero, so that packet
2695                            will not leave this host and route is valid).
2696                            Luckily, this hack is good workaround.
2697                          */
2698
2699                         fl4->flowi4_oif = dev_out->ifindex;
2700                         goto make_route;
2701                 }
2702
2703                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2704                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2705                         if (!__ip_dev_find(net, fl4->saddr, false))
2706                                 goto out;
2707                 }
2708         }
2709
2710
2711         if (fl4->flowi4_oif) {
2712                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2713                 rth = ERR_PTR(-ENODEV);
2714                 if (dev_out == NULL)
2715                         goto out;
2716
2717                 /* RACE: Check return value of inet_select_addr instead. */
2718                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2719                         rth = ERR_PTR(-ENETUNREACH);
2720                         goto out;
2721                 }
2722                 if (ipv4_is_local_multicast(fl4->daddr) ||
2723                     ipv4_is_lbcast(fl4->daddr)) {
2724                         if (!fl4->saddr)
2725                                 fl4->saddr = inet_select_addr(dev_out, 0,
2726                                                               RT_SCOPE_LINK);
2727                         goto make_route;
2728                 }
2729                 if (fl4->saddr) {
2730                         if (ipv4_is_multicast(fl4->daddr))
2731                                 fl4->saddr = inet_select_addr(dev_out, 0,
2732                                                               fl4->flowi4_scope);
2733                         else if (!fl4->daddr)
2734                                 fl4->saddr = inet_select_addr(dev_out, 0,
2735                                                               RT_SCOPE_HOST);
2736                 }
2737         }
2738
2739         if (!fl4->daddr) {
2740                 fl4->daddr = fl4->saddr;
2741                 if (!fl4->daddr)
2742                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2743                 dev_out = net->loopback_dev;
2744                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2745                 res.type = RTN_LOCAL;
2746                 flags |= RTCF_LOCAL;
2747                 goto make_route;
2748         }
2749
2750         if (fib_lookup(net, fl4, &res)) {
2751                 res.fi = NULL;
2752                 if (fl4->flowi4_oif) {
2753                         /* Apparently, routing tables are wrong. Assume,
2754                            that the destination is on link.
2755
2756                            WHY? DW.
2757                            Because we are allowed to send to iface
2758                            even if it has NO routes and NO assigned
2759                            addresses. When oif is specified, routing
2760                            tables are looked up with only one purpose:
2761                            to catch if destination is gatewayed, rather than
2762                            direct. Moreover, if MSG_DONTROUTE is set,
2763                            we send packet, ignoring both routing tables
2764                            and ifaddr state. --ANK
2765
2766
2767                            We could make it even if oif is unknown,
2768                            likely IPv6, but we do not.
2769                          */
2770
2771                         if (fl4->saddr == 0)
2772                                 fl4->saddr = inet_select_addr(dev_out, 0,
2773                                                               RT_SCOPE_LINK);
2774                         res.type = RTN_UNICAST;
2775                         goto make_route;
2776                 }
2777                 rth = ERR_PTR(-ENETUNREACH);
2778                 goto out;
2779         }
2780
2781         if (res.type == RTN_LOCAL) {
2782                 if (!fl4->saddr) {
2783                         if (res.fi->fib_prefsrc)
2784                                 fl4->saddr = res.fi->fib_prefsrc;
2785                         else
2786                                 fl4->saddr = fl4->daddr;
2787                 }
2788                 dev_out = net->loopback_dev;
2789                 fl4->flowi4_oif = dev_out->ifindex;
2790                 res.fi = NULL;
2791                 flags |= RTCF_LOCAL;
2792                 goto make_route;
2793         }
2794
2795 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2796         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2797                 fib_select_multipath(&res);
2798         else
2799 #endif
2800         if (!res.prefixlen &&
2801             res.table->tb_num_default > 1 &&
2802             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2803                 fib_select_default(&res);
2804
2805         if (!fl4->saddr)
2806                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2807
2808         dev_out = FIB_RES_DEV(res);
2809         fl4->flowi4_oif = dev_out->ifindex;
2810
2811
2812 make_route:
2813         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2814                                tos, dev_out, flags);
2815         if (!IS_ERR(rth)) {
2816                 unsigned int hash;
2817
2818                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2819                                rt_genid(dev_net(dev_out)));
2820                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2821         }
2822
2823 out:
2824         rcu_read_unlock();
2825         return rth;
2826 }
2827
2828 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2829 {
2830         struct rtable *rth;
2831         unsigned int hash;
2832
2833         if (!rt_caching(net))
2834                 goto slow_output;
2835
2836         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2837
2838         rcu_read_lock_bh();
2839         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2840                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2841                 if (rth->rt_key_dst == flp4->daddr &&
2842                     rth->rt_key_src == flp4->saddr &&
2843                     rt_is_output_route(rth) &&
2844                     rth->rt_oif == flp4->flowi4_oif &&
2845                     rth->rt_mark == flp4->flowi4_mark &&
2846                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2847                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2848                     net_eq(dev_net(rth->dst.dev), net) &&
2849                     !rt_is_expired(rth)) {
2850                         ipv4_validate_peer(rth);
2851                         dst_use(&rth->dst, jiffies);
2852                         RT_CACHE_STAT_INC(out_hit);
2853                         rcu_read_unlock_bh();
2854                         if (!flp4->saddr)
2855                                 flp4->saddr = rth->rt_src;
2856                         if (!flp4->daddr)
2857                                 flp4->daddr = rth->rt_dst;
2858                         return rth;
2859                 }
2860                 RT_CACHE_STAT_INC(out_hlist_search);
2861         }
2862         rcu_read_unlock_bh();
2863
2864 slow_output:
2865         return ip_route_output_slow(net, flp4);
2866 }
2867 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2868
2869 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2870 {
2871         return NULL;
2872 }
2873
2874 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2875 {
2876         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2877
2878         return mtu ? : dst->dev->mtu;
2879 }
2880
2881 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2882 {
2883 }
2884
2885 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2886                                           unsigned long old)
2887 {
2888         return NULL;
2889 }
2890
2891 static struct dst_ops ipv4_dst_blackhole_ops = {
2892         .family                 =       AF_INET,
2893         .protocol               =       cpu_to_be16(ETH_P_IP),
2894         .destroy                =       ipv4_dst_destroy,
2895         .check                  =       ipv4_blackhole_dst_check,
2896         .mtu                    =       ipv4_blackhole_mtu,
2897         .default_advmss         =       ipv4_default_advmss,
2898         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2899         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2900         .neigh_lookup           =       ipv4_neigh_lookup,
2901 };
2902
2903 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2904 {
2905         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2906         struct rtable *ort = (struct rtable *) dst_orig;
2907
2908         if (rt) {
2909                 struct dst_entry *new = &rt->dst;
2910
2911                 new->__use = 1;
2912                 new->input = dst_discard;
2913                 new->output = dst_discard;
2914                 dst_copy_metrics(new, &ort->dst);
2915
2916                 new->dev = ort->dst.dev;
2917                 if (new->dev)
2918                         dev_hold(new->dev);
2919
2920                 rt->rt_key_dst = ort->rt_key_dst;
2921                 rt->rt_key_src = ort->rt_key_src;
2922                 rt->rt_key_tos = ort->rt_key_tos;
2923                 rt->rt_route_iif = ort->rt_route_iif;
2924                 rt->rt_iif = ort->rt_iif;
2925                 rt->rt_oif = ort->rt_oif;
2926                 rt->rt_mark = ort->rt_mark;
2927
2928                 rt->rt_genid = rt_genid(net);
2929                 rt->rt_flags = ort->rt_flags;
2930                 rt->rt_type = ort->rt_type;
2931                 rt->rt_dst = ort->rt_dst;
2932                 rt->rt_src = ort->rt_src;
2933                 rt->rt_gateway = ort->rt_gateway;
2934                 rt->rt_spec_dst = ort->rt_spec_dst;
2935                 rt->peer = ort->peer;
2936                 if (rt->peer)
2937                         atomic_inc(&rt->peer->refcnt);
2938                 rt->fi = ort->fi;
2939                 if (rt->fi)
2940                         atomic_inc(&rt->fi->fib_clntref);
2941
2942                 dst_free(new);
2943         }
2944
2945         dst_release(dst_orig);
2946
2947         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2948 }
2949
2950 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2951                                     struct sock *sk)
2952 {
2953         struct rtable *rt = __ip_route_output_key(net, flp4);
2954
2955         if (IS_ERR(rt))
2956                 return rt;
2957
2958         if (flp4->flowi4_proto)
2959                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2960                                                    flowi4_to_flowi(flp4),
2961                                                    sk, 0);
2962
2963         return rt;
2964 }
2965 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2966
2967 static int rt_fill_info(struct net *net,
2968                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2969                         int nowait, unsigned int flags)
2970 {
2971         struct rtable *rt = skb_rtable(skb);
2972         struct rtmsg *r;
2973         struct nlmsghdr *nlh;
2974         unsigned long expires = 0;
2975         const struct inet_peer *peer = rt->peer;
2976         u32 id = 0, ts = 0, tsage = 0, error;
2977
2978         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2979         if (nlh == NULL)
2980                 return -EMSGSIZE;
2981
2982         r = nlmsg_data(nlh);
2983         r->rtm_family    = AF_INET;
2984         r->rtm_dst_len  = 32;
2985         r->rtm_src_len  = 0;
2986         r->rtm_tos      = rt->rt_key_tos;
2987         r->rtm_table    = RT_TABLE_MAIN;
2988         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2989         r->rtm_type     = rt->rt_type;
2990         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2991         r->rtm_protocol = RTPROT_UNSPEC;
2992         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2993         if (rt->rt_flags & RTCF_NOTIFY)
2994                 r->rtm_flags |= RTM_F_NOTIFY;
2995
2996         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2997
2998         if (rt->rt_key_src) {
2999                 r->rtm_src_len = 32;
3000                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3001         }
3002         if (rt->dst.dev)
3003                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3004 #ifdef CONFIG_IP_ROUTE_CLASSID
3005         if (rt->dst.tclassid)
3006                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3007 #endif
3008         if (rt_is_input_route(rt))
3009                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3010         else if (rt->rt_src != rt->rt_key_src)
3011                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3012
3013         if (rt->rt_dst != rt->rt_gateway)
3014                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3015
3016         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3017                 goto nla_put_failure;
3018
3019         if (rt->rt_mark)
3020                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3021
3022         error = rt->dst.error;
3023         if (peer) {
3024                 inet_peer_refcheck(rt->peer);
3025                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3026                 if (peer->tcp_ts_stamp) {
3027                         ts = peer->tcp_ts;
3028                         tsage = get_seconds() - peer->tcp_ts_stamp;
3029                 }
3030                 expires = ACCESS_ONCE(peer->pmtu_expires);
3031                 if (expires) {
3032                         if (time_before(jiffies, expires))
3033                                 expires -= jiffies;
3034                         else
3035                                 expires = 0;
3036                 }
3037         }
3038
3039         if (rt_is_input_route(rt)) {
3040 #ifdef CONFIG_IP_MROUTE
3041                 __be32 dst = rt->rt_dst;
3042
3043                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3044                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3045                         int err = ipmr_get_route(net, skb,
3046                                                  rt->rt_src, rt->rt_dst,
3047                                                  r, nowait);
3048                         if (err <= 0) {
3049                                 if (!nowait) {
3050                                         if (err == 0)
3051                                                 return 0;
3052                                         goto nla_put_failure;
3053                                 } else {
3054                                         if (err == -EMSGSIZE)
3055                                                 goto nla_put_failure;
3056                                         error = err;
3057                                 }
3058                         }
3059                 } else
3060 #endif
3061                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3062         }
3063
3064         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3065                                expires, error) < 0)
3066                 goto nla_put_failure;
3067
3068         return nlmsg_end(skb, nlh);
3069
3070 nla_put_failure:
3071         nlmsg_cancel(skb, nlh);
3072         return -EMSGSIZE;
3073 }
3074
3075 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3076 {
3077         struct net *net = sock_net(in_skb->sk);
3078         struct rtmsg *rtm;
3079         struct nlattr *tb[RTA_MAX+1];
3080         struct rtable *rt = NULL;
3081         __be32 dst = 0;
3082         __be32 src = 0;
3083         u32 iif;
3084         int err;
3085         int mark;
3086         struct sk_buff *skb;
3087
3088         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3089         if (err < 0)
3090                 goto errout;
3091
3092         rtm = nlmsg_data(nlh);
3093
3094         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3095         if (skb == NULL) {
3096                 err = -ENOBUFS;
3097                 goto errout;
3098         }
3099
3100         /* Reserve room for dummy headers, this skb can pass
3101            through good chunk of routing engine.
3102          */
3103         skb_reset_mac_header(skb);
3104         skb_reset_network_header(skb);
3105
3106         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3107         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3108         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3109
3110         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3111         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3112         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3113         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3114
3115         if (iif) {
3116                 struct net_device *dev;
3117
3118                 dev = __dev_get_by_index(net, iif);
3119                 if (dev == NULL) {
3120                         err = -ENODEV;
3121                         goto errout_free;
3122                 }
3123
3124                 skb->protocol   = htons(ETH_P_IP);
3125                 skb->dev        = dev;
3126                 skb->mark       = mark;
3127                 local_bh_disable();
3128                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3129                 local_bh_enable();
3130
3131                 rt = skb_rtable(skb);
3132                 if (err == 0 && rt->dst.error)
3133                         err = -rt->dst.error;
3134         } else {
3135                 struct flowi4 fl4 = {
3136                         .daddr = dst,
3137                         .saddr = src,
3138                         .flowi4_tos = rtm->rtm_tos,
3139                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3140                         .flowi4_mark = mark,
3141                 };
3142                 rt = ip_route_output_key(net, &fl4);
3143
3144                 err = 0;
3145                 if (IS_ERR(rt))
3146                         err = PTR_ERR(rt);
3147         }
3148
3149         if (err)
3150                 goto errout_free;
3151
3152         skb_dst_set(skb, &rt->dst);
3153         if (rtm->rtm_flags & RTM_F_NOTIFY)
3154                 rt->rt_flags |= RTCF_NOTIFY;
3155
3156         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3157                            RTM_NEWROUTE, 0, 0);
3158         if (err <= 0)
3159                 goto errout_free;
3160
3161         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3162 errout:
3163         return err;
3164
3165 errout_free:
3166         kfree_skb(skb);
3167         goto errout;
3168 }
3169
3170 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3171 {
3172         struct rtable *rt;
3173         int h, s_h;
3174         int idx, s_idx;
3175         struct net *net;
3176
3177         net = sock_net(skb->sk);
3178
3179         s_h = cb->args[0];
3180         if (s_h < 0)
3181                 s_h = 0;
3182         s_idx = idx = cb->args[1];
3183         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3184                 if (!rt_hash_table[h].chain)
3185                         continue;
3186                 rcu_read_lock_bh();
3187                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3188                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3189                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3190                                 continue;
3191                         if (rt_is_expired(rt))
3192                                 continue;
3193                         skb_dst_set_noref(skb, &rt->dst);
3194                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3195                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3196                                          1, NLM_F_MULTI) <= 0) {
3197                                 skb_dst_drop(skb);
3198                                 rcu_read_unlock_bh();
3199                                 goto done;
3200                         }
3201                         skb_dst_drop(skb);
3202                 }
3203                 rcu_read_unlock_bh();
3204         }
3205
3206 done:
3207         cb->args[0] = h;
3208         cb->args[1] = idx;
3209         return skb->len;
3210 }
3211
3212 void ip_rt_multicast_event(struct in_device *in_dev)
3213 {
3214         rt_cache_flush(dev_net(in_dev->dev), 0);
3215 }
3216
3217 #ifdef CONFIG_SYSCTL
3218 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3219                                         void __user *buffer,
3220                                         size_t *lenp, loff_t *ppos)
3221 {
3222         if (write) {
3223                 int flush_delay;
3224                 ctl_table ctl;
3225                 struct net *net;
3226
3227                 memcpy(&ctl, __ctl, sizeof(ctl));
3228                 ctl.data = &flush_delay;
3229                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3230
3231                 net = (struct net *)__ctl->extra1;
3232                 rt_cache_flush(net, flush_delay);
3233                 return 0;
3234         }
3235
3236         return -EINVAL;
3237 }
3238
3239 static ctl_table ipv4_route_table[] = {
3240         {
3241                 .procname       = "gc_thresh",
3242                 .data           = &ipv4_dst_ops.gc_thresh,
3243                 .maxlen         = sizeof(int),
3244                 .mode           = 0644,
3245                 .proc_handler   = proc_dointvec,
3246         },
3247         {
3248                 .procname       = "max_size",
3249                 .data           = &ip_rt_max_size,
3250                 .maxlen         = sizeof(int),
3251                 .mode           = 0644,
3252                 .proc_handler   = proc_dointvec,
3253         },
3254         {
3255                 /*  Deprecated. Use gc_min_interval_ms */
3256
3257                 .procname       = "gc_min_interval",
3258                 .data           = &ip_rt_gc_min_interval,
3259                 .maxlen         = sizeof(int),
3260                 .mode           = 0644,
3261                 .proc_handler   = proc_dointvec_jiffies,
3262         },
3263         {
3264                 .procname       = "gc_min_interval_ms",
3265                 .data           = &ip_rt_gc_min_interval,
3266                 .maxlen         = sizeof(int),
3267                 .mode           = 0644,
3268                 .proc_handler   = proc_dointvec_ms_jiffies,
3269         },
3270         {
3271                 .procname       = "gc_timeout",
3272                 .data           = &ip_rt_gc_timeout,
3273                 .maxlen         = sizeof(int),
3274                 .mode           = 0644,
3275                 .proc_handler   = proc_dointvec_jiffies,
3276         },
3277         {
3278                 .procname       = "gc_interval",
3279                 .data           = &ip_rt_gc_interval,
3280                 .maxlen         = sizeof(int),
3281                 .mode           = 0644,
3282                 .proc_handler   = proc_dointvec_jiffies,
3283         },
3284         {
3285                 .procname       = "redirect_load",
3286                 .data           = &ip_rt_redirect_load,
3287                 .maxlen         = sizeof(int),
3288                 .mode           = 0644,
3289                 .proc_handler   = proc_dointvec,
3290         },
3291         {
3292                 .procname       = "redirect_number",
3293                 .data           = &ip_rt_redirect_number,
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0644,
3296                 .proc_handler   = proc_dointvec,
3297         },
3298         {
3299                 .procname       = "redirect_silence",
3300                 .data           = &ip_rt_redirect_silence,
3301                 .maxlen         = sizeof(int),
3302                 .mode           = 0644,
3303                 .proc_handler   = proc_dointvec,
3304         },
3305         {
3306                 .procname       = "error_cost",
3307                 .data           = &ip_rt_error_cost,
3308                 .maxlen         = sizeof(int),
3309                 .mode           = 0644,
3310                 .proc_handler   = proc_dointvec,
3311         },
3312         {
3313                 .procname       = "error_burst",
3314                 .data           = &ip_rt_error_burst,
3315                 .maxlen         = sizeof(int),
3316                 .mode           = 0644,
3317                 .proc_handler   = proc_dointvec,
3318         },
3319         {
3320                 .procname       = "gc_elasticity",
3321                 .data           = &ip_rt_gc_elasticity,
3322                 .maxlen         = sizeof(int),
3323                 .mode           = 0644,
3324                 .proc_handler   = proc_dointvec,
3325         },
3326         {
3327                 .procname       = "mtu_expires",
3328                 .data           = &ip_rt_mtu_expires,
3329                 .maxlen         = sizeof(int),
3330                 .mode           = 0644,
3331                 .proc_handler   = proc_dointvec_jiffies,
3332         },
3333         {
3334                 .procname       = "min_pmtu",
3335                 .data           = &ip_rt_min_pmtu,
3336                 .maxlen         = sizeof(int),
3337                 .mode           = 0644,
3338                 .proc_handler   = proc_dointvec,
3339         },
3340         {
3341                 .procname       = "min_adv_mss",
3342                 .data           = &ip_rt_min_advmss,
3343                 .maxlen         = sizeof(int),
3344                 .mode           = 0644,
3345                 .proc_handler   = proc_dointvec,
3346         },
3347         { }
3348 };
3349
3350 static struct ctl_table empty[1];
3351
3352 static struct ctl_table ipv4_skeleton[] =
3353 {
3354         { .procname = "route", 
3355           .mode = 0555, .child = ipv4_route_table},
3356         { .procname = "neigh", 
3357           .mode = 0555, .child = empty},
3358         { }
3359 };
3360
3361 static __net_initdata struct ctl_path ipv4_path[] = {
3362         { .procname = "net", },
3363         { .procname = "ipv4", },
3364         { },
3365 };
3366
3367 static struct ctl_table ipv4_route_flush_table[] = {
3368         {
3369                 .procname       = "flush",
3370                 .maxlen         = sizeof(int),
3371                 .mode           = 0200,
3372                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3373         },
3374         { },
3375 };
3376
3377 static __net_initdata struct ctl_path ipv4_route_path[] = {
3378         { .procname = "net", },
3379         { .procname = "ipv4", },
3380         { .procname = "route", },
3381         { },
3382 };
3383
3384 static __net_init int sysctl_route_net_init(struct net *net)
3385 {
3386         struct ctl_table *tbl;
3387
3388         tbl = ipv4_route_flush_table;
3389         if (!net_eq(net, &init_net)) {
3390                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3391                 if (tbl == NULL)
3392                         goto err_dup;
3393         }
3394         tbl[0].extra1 = net;
3395
3396         net->ipv4.route_hdr =
3397                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3398         if (net->ipv4.route_hdr == NULL)
3399                 goto err_reg;
3400         return 0;
3401
3402 err_reg:
3403         if (tbl != ipv4_route_flush_table)
3404                 kfree(tbl);
3405 err_dup:
3406         return -ENOMEM;
3407 }
3408
3409 static __net_exit void sysctl_route_net_exit(struct net *net)
3410 {
3411         struct ctl_table *tbl;
3412
3413         tbl = net->ipv4.route_hdr->ctl_table_arg;
3414         unregister_net_sysctl_table(net->ipv4.route_hdr);
3415         BUG_ON(tbl == ipv4_route_flush_table);
3416         kfree(tbl);
3417 }
3418
3419 static __net_initdata struct pernet_operations sysctl_route_ops = {
3420         .init = sysctl_route_net_init,
3421         .exit = sysctl_route_net_exit,
3422 };
3423 #endif
3424
3425 static __net_init int rt_genid_init(struct net *net)
3426 {
3427         get_random_bytes(&net->ipv4.rt_genid,
3428                          sizeof(net->ipv4.rt_genid));
3429         get_random_bytes(&net->ipv4.dev_addr_genid,
3430                          sizeof(net->ipv4.dev_addr_genid));
3431         return 0;
3432 }
3433
3434 static __net_initdata struct pernet_operations rt_genid_ops = {
3435         .init = rt_genid_init,
3436 };
3437
3438
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; allocated (256 entries) in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3442
3443 static __initdata unsigned long rhash_entries;
3444 static int __init set_rhash_entries(char *str)
3445 {
3446         if (!str)
3447                 return 0;
3448         rhash_entries = simple_strtoul(str, &str, 0);
3449         return 1;
3450 }
3451 __setup("rhash_entries=", set_rhash_entries);
3452
3453 int __init ip_rt_init(void)
3454 {
3455         int rc = 0;
3456
3457 #ifdef CONFIG_IP_ROUTE_CLASSID
3458         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3459         if (!ip_rt_acct)
3460                 panic("IP: failed to allocate ip_rt_acct\n");
3461 #endif
3462
3463         ipv4_dst_ops.kmem_cachep =
3464                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3465                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3466
3467         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3468
3469         if (dst_entries_init(&ipv4_dst_ops) < 0)
3470                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3471
3472         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3473                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3474
3475         rt_hash_table = (struct rt_hash_bucket *)
3476                 alloc_large_system_hash("IP route cache",
3477                                         sizeof(struct rt_hash_bucket),
3478                                         rhash_entries,
3479                                         (totalram_pages >= 128 * 1024) ?
3480                                         15 : 17,
3481                                         0,
3482                                         &rt_hash_log,
3483                                         &rt_hash_mask,
3484                                         rhash_entries ? 0 : 512 * 1024);
3485         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3486         rt_hash_lock_init();
3487
3488         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3489         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3490
3491         devinet_init();
3492         ip_fib_init();
3493
3494         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3495         expires_ljiffies = jiffies;
3496         schedule_delayed_work(&expires_work,
3497                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3498
3499         if (ip_rt_proc_init())
3500                 printk(KERN_ERR "Unable to create route proc files\n");
3501 #ifdef CONFIG_XFRM
3502         xfrm_init();
3503         xfrm4_init(ip_rt_max_size);
3504 #endif
3505         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3506
3507 #ifdef CONFIG_SYSCTL
3508         register_pernet_subsys(&sysctl_route_ops);
3509 #endif
3510         register_pernet_subsys(&rt_genid_ops);
3511         return rc;
3512 }
3513
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4_skeleton table under ipv4_path.
 *
 * This separate entry point is a workaround for the current ipv4
 * initialisation order; once that order is sanitized it can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif