IPv6: fix race between cleanup and add/delete address
[linux-2.6.git] / net / netfilter / nf_conntrack_expect.c
1 /* Expectation handling for nf_conntrack. */
2
3 /* (C) 1999-2001 Paul `Rusty' Russell
4  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11
12 #include <linux/types.h>
13 #include <linux/netfilter.h>
14 #include <linux/skbuff.h>
15 #include <linux/proc_fs.h>
16 #include <linux/seq_file.h>
17 #include <linux/stddef.h>
18 #include <linux/slab.h>
19 #include <linux/err.h>
20 #include <linux/percpu.h>
21 #include <linux/kernel.h>
22 #include <linux/jhash.h>
23 #include <net/net_namespace.h>
24
25 #include <net/netfilter/nf_conntrack.h>
26 #include <net/netfilter/nf_conntrack_core.h>
27 #include <net/netfilter/nf_conntrack_expect.h>
28 #include <net/netfilter/nf_conntrack_helper.h>
29 #include <net/netfilter/nf_conntrack_tuple.h>
30 #include <net/netfilter/nf_conntrack_zones.h>
31
32 unsigned int nf_ct_expect_hsize __read_mostly;
33 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
34
35 static unsigned int nf_ct_expect_hash_rnd __read_mostly;
36 unsigned int nf_ct_expect_max __read_mostly;
37 static int nf_ct_expect_hash_rnd_initted __read_mostly;
38
39 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
40
/* nf_conntrack_expect helper functions */

/* Remove @exp from the global expectation hash and from its master
 * conntrack's per-connection list.  Caller must hold nf_conntrack_lock
 * and must already have stopped the expectation's timer (asserted
 * below).  Drops the reference the hash table held on @exp. */
void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);

        NF_CT_ASSERT(master_help);
        NF_CT_ASSERT(!timer_pending(&exp->timeout));

        /* Lookups walk the hash under RCU only, so use the RCU deletion
         * primitive; the object itself is freed via call_rcu later. */
        hlist_del_rcu(&exp->hnode);
        net->ct.expect_count--;

        hlist_del(&exp->lnode);
        master_help->expecting[exp->class]--;
        nf_ct_expect_put(exp);  /* drop the hash table's reference */

        NF_CT_STAT_INC(net, expect_delete);
}
EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
60
/* Timer callback: the expectation expired without being matched.
 * Unlink it under nf_conntrack_lock, then drop the reference that the
 * running timer held on it. */
static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
        struct nf_conntrack_expect *exp = (void *)ul_expect;

        spin_lock_bh(&nf_conntrack_lock);
        nf_ct_unlink_expect(exp);
        spin_unlock_bh(&nf_conntrack_lock);
        nf_ct_expect_put(exp);  /* timer's reference */
}
70
71 static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
72 {
73         unsigned int hash;
74
75         if (unlikely(!nf_ct_expect_hash_rnd_initted)) {
76                 get_random_bytes(&nf_ct_expect_hash_rnd,
77                                  sizeof(nf_ct_expect_hash_rnd));
78                 nf_ct_expect_hash_rnd_initted = 1;
79         }
80
81         hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
82                       (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
83                        (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd);
84         return ((u64)hash * nf_ct_expect_hsize) >> 32;
85 }
86
/* Look up an expectation matching @tuple within @zone without taking a
 * reference.  Caller must hold rcu_read_lock() (or nf_conntrack_lock);
 * the returned pointer is only valid for the duration of that critical
 * section.  Returns NULL when no expectation matches. */
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, u16 zone,
                    const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;
        struct hlist_node *n;
        unsigned int h;

        /* Fast path: no expectations registered at all. */
        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) {
                if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
                    nf_ct_zone(i->master) == zone)
                        return i;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
107
/* Find an expectation corresponding to @tuple and return it with its
 * refcount incremented, or NULL.  atomic_inc_not_zero() skips entries
 * whose last reference was already dropped but which remain visible
 * for the duration of the RCU grace period. */
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, u16 zone,
                      const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;

        rcu_read_lock();
        i = __nf_ct_expect_find(net, zone, tuple);
        if (i && !atomic_inc_not_zero(&i->use))
                i = NULL;
        rcu_read_unlock();

        return i;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
124
/* If an expectation for this connection is found, it gets deleted from
 * the global list then returned with a reference held.
 * Caller must hold nf_conntrack_lock (non-RCU hash walk, del_timer).
 * Returns NULL when nothing matches or the match cannot be used. */
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net, u16 zone,
                       const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i, *exp = NULL;
        struct hlist_node *n;
        unsigned int h;

        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
                if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
                    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
                    nf_ct_zone(i->master) == zone) {
                        exp = i;
                        break;
                }
        }
        if (!exp)
                return NULL;

        /* If master is not in hash table yet (ie. packet hasn't left
           this machine yet), how can other end know about expected?
           Hence these are not the droids you are looking for (if
           master ct never got confirmed, we'd hold a reference to it
           and weird things would happen to future packets). */
        if (!nf_ct_is_confirmed(exp->master))
                return NULL;

        if (exp->flags & NF_CT_EXPECT_PERMANENT) {
                /* Permanent expectations stay in the table; just take
                 * an extra reference for the caller. */
                atomic_inc(&exp->use);
                return exp;
        } else if (del_timer(&exp->timeout)) {
                /* One-shot: unlink it, transferring the timer's
                 * reference to the caller.  If del_timer() failed the
                 * timeout handler is already destroying it. */
                nf_ct_unlink_expect(exp);
                return exp;
        }

        return NULL;
}
168
/* Delete all expectations for this conntrack.  Caller must hold
 * nf_conntrack_lock.  Only expectations whose timer we manage to stop
 * are unlinked here; for the rest, the timeout handler is already
 * running and will perform the cleanup itself. */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_expect *exp;
        struct hlist_node *n, *next;

        /* Optimization: most connection never expect any others. */
        if (!help)
                return;

        hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect(exp);
                        nf_ct_expect_put(exp);  /* drop the timer's ref */
                }
        }
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
188
189 /* Would two expected things clash? */
190 static inline int expect_clash(const struct nf_conntrack_expect *a,
191                                const struct nf_conntrack_expect *b)
192 {
193         /* Part covered by intersection of masks must be unequal,
194            otherwise they clash */
195         struct nf_conntrack_tuple_mask intersect_mask;
196         int count;
197
198         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
199
200         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
201                 intersect_mask.src.u3.all[count] =
202                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
203         }
204
205         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
206 }
207
208 static inline int expect_matches(const struct nf_conntrack_expect *a,
209                                  const struct nf_conntrack_expect *b)
210 {
211         return a->master == b->master && a->class == b->class &&
212                 nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
213                 nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
214                 nf_ct_zone(a->master) == nf_ct_zone(b->master);
215 }
216
/* Generally a bad idea to call this: could have matched already.
 * Removes @exp from the tables if (and only if) we win the race with
 * its timeout handler. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
        spin_lock_bh(&nf_conntrack_lock);
        if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                nf_ct_expect_put(exp);  /* drop the timer's reference */
        }
        spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
228
229 /* We don't increase the master conntrack refcount for non-fulfilled
230  * conntracks. During the conntrack destruction, the expectations are
231  * always killed before the conntrack itself */
232 struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
233 {
234         struct nf_conntrack_expect *new;
235
236         new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
237         if (!new)
238                 return NULL;
239
240         new->master = me;
241         atomic_set(&new->use, 1);
242         return new;
243 }
244 EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
245
246 void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
247                        u_int8_t family,
248                        const union nf_inet_addr *saddr,
249                        const union nf_inet_addr *daddr,
250                        u_int8_t proto, const __be16 *src, const __be16 *dst)
251 {
252         int len;
253
254         if (family == AF_INET)
255                 len = 4;
256         else
257                 len = 16;
258
259         exp->flags = 0;
260         exp->class = class;
261         exp->expectfn = NULL;
262         exp->helper = NULL;
263         exp->tuple.src.l3num = family;
264         exp->tuple.dst.protonum = proto;
265
266         if (saddr) {
267                 memcpy(&exp->tuple.src.u3, saddr, len);
268                 if (sizeof(exp->tuple.src.u3) > len)
269                         /* address needs to be cleared for nf_ct_tuple_equal */
270                         memset((void *)&exp->tuple.src.u3 + len, 0x00,
271                                sizeof(exp->tuple.src.u3) - len);
272                 memset(&exp->mask.src.u3, 0xFF, len);
273                 if (sizeof(exp->mask.src.u3) > len)
274                         memset((void *)&exp->mask.src.u3 + len, 0x00,
275                                sizeof(exp->mask.src.u3) - len);
276         } else {
277                 memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
278                 memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
279         }
280
281         if (src) {
282                 exp->tuple.src.u.all = *src;
283                 exp->mask.src.u.all = htons(0xFFFF);
284         } else {
285                 exp->tuple.src.u.all = 0;
286                 exp->mask.src.u.all = 0;
287         }
288
289         memcpy(&exp->tuple.dst.u3, daddr, len);
290         if (sizeof(exp->tuple.dst.u3) > len)
291                 /* address needs to be cleared for nf_ct_tuple_equal */
292                 memset((void *)&exp->tuple.dst.u3 + len, 0x00,
293                        sizeof(exp->tuple.dst.u3) - len);
294
295         exp->tuple.dst.u.all = *dst;
296 }
297 EXPORT_SYMBOL_GPL(nf_ct_expect_init);
298
/* RCU callback: free the expectation once all readers are done. */
static void nf_ct_expect_free_rcu(struct rcu_head *head)
{
        struct nf_conntrack_expect *exp;

        exp = container_of(head, struct nf_conntrack_expect, rcu);
        kmem_cache_free(nf_ct_expect_cachep, exp);
}
306
/* Drop a reference to @exp; schedules an RCU-deferred free when the
 * last reference goes away (lookups may still be traversing it). */
void nf_ct_expect_put(struct nf_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);
313
/* Insert @exp into the master's list and the global hash and start its
 * timeout.  Caller must hold nf_conntrack_lock and have verified the
 * expectation with __nf_ct_expect_check().  Takes two references: one
 * owned by the hash table, one owned by the running timer. */
static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);
        const struct nf_conntrack_expect_policy *p;
        unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);

        atomic_inc(&exp->use);  /* reference held by the hash table */

        hlist_add_head(&exp->lnode, &master_help->expectations);
        master_help->expecting[exp->class]++;

        hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
        net->ct.expect_count++;

        /* Timeout comes from the helper's per-class policy (seconds). */
        setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
                    (unsigned long)exp);
        p = &master_help->helper->expect_policy[exp->class];
        exp->timeout.expires = jiffies + p->timeout * HZ;
        add_timer(&exp->timeout);

        atomic_inc(&exp->use);  /* reference held by the timer */
        NF_CT_STAT_INC(net, expect_create);
}
338
339 /* Race with expectations being used means we could have none to find; OK. */
340 static void evict_oldest_expect(struct nf_conn *master,
341                                 struct nf_conntrack_expect *new)
342 {
343         struct nf_conn_help *master_help = nfct_help(master);
344         struct nf_conntrack_expect *exp, *last = NULL;
345         struct hlist_node *n;
346
347         hlist_for_each_entry(exp, n, &master_help->expectations, lnode) {
348                 if (exp->class == new->class)
349                         last = exp;
350         }
351
352         if (last && del_timer(&last->timeout)) {
353                 nf_ct_unlink_expect(last);
354                 nf_ct_expect_put(last);
355         }
356 }
357
/* Restart @i's timeout using its class policy.  Returns 1 on success,
 * 0 if the timer could not be stopped (the expectation is already
 * being destroyed by its timeout handler). */
static inline int refresh_timer(struct nf_conntrack_expect *i)
{
        struct nf_conn_help *master_help = nfct_help(i->master);
        const struct nf_conntrack_expect_policy *p;

        if (!del_timer(&i->timeout))
                return 0;

        p = &master_help->helper->expect_policy[i->class];
        i->timeout.expires = jiffies + p->timeout * HZ;
        add_timer(&i->timeout);
        return 1;
}
371
/* Validate @expect prior to insertion.  Caller holds nf_conntrack_lock.
 *
 * Returns:
 *   1          - no conflict; caller should insert the expectation
 *   0          - an identical expectation existed and its timer was
 *                refreshed instead; nothing to insert
 *   -ESHUTDOWN - the master's helper is gone (e.g. module unloading)
 *   -EBUSY     - clashes with a different existing expectation
 *   -EMFILE    - per-class limit (even after evicting the oldest) or
 *                global expectation table limit reached */
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
        const struct nf_conntrack_expect_policy *p;
        struct nf_conntrack_expect *i;
        struct nf_conn *master = expect->master;
        struct nf_conn_help *master_help = nfct_help(master);
        struct net *net = nf_ct_exp_net(expect);
        struct hlist_node *n;
        unsigned int h;
        int ret = 1;

        if (!master_help->helper) {
                ret = -ESHUTDOWN;
                goto out;
        }
        h = nf_ct_expect_dst_hash(&expect->tuple);
        hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }
        /* Will be over limit? */
        p = &master_help->helper->expect_policy[expect->class];
        if (p->max_expected &&
            master_help->expecting[expect->class] >= p->max_expected) {
                evict_oldest_expect(master, expect);
                if (master_help->expecting[expect->class] >= p->max_expected) {
                        ret = -EMFILE;
                        goto out;
                }
        }

        if (net->ct.expect_count >= nf_ct_expect_max) {
                if (net_ratelimit())
                        printk(KERN_WARNING
                               "nf_conntrack: expectation table full\n");
                ret = -EMFILE;
        }
out:
        return ret;
}
420
/* Check and insert @expect, then emit an IPEXP_NEW ctnetlink event
 * attributed to @pid/@report.  Returns 0 on success (including the
 * "identical expectation refreshed" case) or a negative errno from
 * __nf_ct_expect_check(). */
int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
                                u32 pid, int report)
{
        int ret;

        spin_lock_bh(&nf_conntrack_lock);
        ret = __nf_ct_expect_check(expect);
        if (ret <= 0)   /* 0: refreshed existing; <0: error */
                goto out;

        ret = 0;
        nf_ct_expect_insert(expect);
        spin_unlock_bh(&nf_conntrack_lock);
        /* Event is sent outside the lock, after successful insertion. */
        nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
        return ret;
out:
        spin_unlock_bh(&nf_conntrack_lock);
        return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
441
442 #ifdef CONFIG_PROC_FS
/* seq_file iteration state: which hash bucket we are walking. */
struct ct_expect_iter_state {
        struct seq_net_private p;
        unsigned int bucket;    /* current bucket in net->ct.expect_hash */
};
447
448 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
449 {
450         struct net *net = seq_file_net(seq);
451         struct ct_expect_iter_state *st = seq->private;
452         struct hlist_node *n;
453
454         for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
455                 n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
456                 if (n)
457                         return n;
458         }
459         return NULL;
460 }
461
/* Advance to the entry after @head, moving on to subsequent buckets as
 * each one is exhausted.  Returns NULL at the end of the table.  Must
 * be called under rcu_read_lock(). */
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
                                             struct hlist_node *head)
{
        struct net *net = seq_file_net(seq);
        struct ct_expect_iter_state *st = seq->private;

        head = rcu_dereference(head->next);
        while (head == NULL) {
                if (++st->bucket >= nf_ct_expect_hsize)
                        return NULL;
                head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
        }
        return head;
}
476
477 static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
478 {
479         struct hlist_node *head = ct_expect_get_first(seq);
480
481         if (head)
482                 while (pos && (head = ct_expect_get_next(seq, head)))
483                         pos--;
484         return pos ? NULL : head;
485 }
486
/* seq_file start: take the RCU read lock (released in exp_seq_stop)
 * and position the iterator at *pos. */
static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        rcu_read_lock();
        return ct_expect_get_idx(seq, *pos);
}
493
/* seq_file next: advance position and return the following entry. */
static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        (*pos)++;
        return ct_expect_get_next(seq, v);
}
499
/* seq_file stop: drop the RCU read lock taken in exp_seq_start. */
static void exp_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
505
/* Format one expectation as a line of /proc/net/nf_conntrack_expect:
 * remaining timeout, l3/l4 proto numbers, tuple, flags, and the
 * helper/policy names when a helper is attached. */
static int exp_seq_show(struct seq_file *s, void *v)
{
        struct nf_conntrack_expect *expect;
        struct nf_conntrack_helper *helper;
        struct hlist_node *n = v;
        char *delim = "";

        expect = hlist_entry(n, struct nf_conntrack_expect, hnode);

        /* Remaining lifetime in seconds; "-" if no timer was set up. */
        if (expect->timeout.function)
                seq_printf(s, "%ld ", timer_pending(&expect->timeout)
                           ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
        else
                seq_printf(s, "- ");
        seq_printf(s, "l3proto = %u proto=%u ",
                   expect->tuple.src.l3num,
                   expect->tuple.dst.protonum);
        print_tuple(s, &expect->tuple,
                    __nf_ct_l3proto_find(expect->tuple.src.l3num),
                    __nf_ct_l4proto_find(expect->tuple.src.l3num,
                                       expect->tuple.dst.protonum));

        if (expect->flags & NF_CT_EXPECT_PERMANENT) {
                seq_printf(s, "PERMANENT");
                delim = ",";
        }
        if (expect->flags & NF_CT_EXPECT_INACTIVE)
                seq_printf(s, "%sINACTIVE", delim);

        /* Helper pointer may change under us; we hold rcu_read_lock
         * via exp_seq_start. */
        helper = rcu_dereference(nfct_help(expect->master)->helper);
        if (helper) {
                seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
                if (helper->expect_policy[expect->class].name)
                        seq_printf(s, "/%s",
                                   helper->expect_policy[expect->class].name);
        }

        return seq_putc(s, '\n');
}
545
/* seq_file iterator over the expectation table (RCU-protected walk). */
static const struct seq_operations exp_seq_ops = {
        .start = exp_seq_start,
        .next = exp_seq_next,
        .stop = exp_seq_stop,
        .show = exp_seq_show
};
552
/* open() handler: per-netns seq_file with our iterator state. */
static int exp_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &exp_seq_ops,
                        sizeof(struct ct_expect_iter_state));
}
558
/* File operations for /proc/net/nf_conntrack_expect. */
static const struct file_operations exp_file_ops = {
        .owner   = THIS_MODULE,
        .open    = exp_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
566 #endif /* CONFIG_PROC_FS */
567
/* Create /proc/net/nf_conntrack_expect for @net; no-op without
 * CONFIG_PROC_FS.  Returns 0 or -ENOMEM. */
static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry *proc;

        proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops);
        if (!proc)
                return -ENOMEM;
#endif /* CONFIG_PROC_FS */
        return 0;
}
579
/* Remove the proc entry created by exp_proc_init(). */
static void exp_proc_remove(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_remove(net, "nf_conntrack_expect");
#endif /* CONFIG_PROC_FS */
}
586
/* Expectation hash size; read-only module parameter (perm 0400). */
module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
588
/* Per-netns initialization of the expectation subsystem.  The globals
 * (hash size, global maximum, slab cache) are set up only when called
 * for init_net.  Returns 0 or a negative errno; on failure everything
 * allocated here is unwound via the goto ladder. */
int nf_conntrack_expect_init(struct net *net)
{
        int err = -ENOMEM;

        if (net_eq(net, &init_net)) {
                if (!nf_ct_expect_hsize) {
                        /* Default: 1/256th of the conntrack hash, min 1. */
                        nf_ct_expect_hsize = net->ct.htable_size / 256;
                        if (!nf_ct_expect_hsize)
                                nf_ct_expect_hsize = 1;
                }
                nf_ct_expect_max = nf_ct_expect_hsize * 4;
        }

        net->ct.expect_count = 0;
        net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
                                                  &net->ct.expect_vmalloc, 0);
        if (net->ct.expect_hash == NULL)
                goto err1;

        /* The slab cache is shared by all namespaces. */
        if (net_eq(net, &init_net)) {
                nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
                                        sizeof(struct nf_conntrack_expect),
                                        0, 0, NULL);
                if (!nf_ct_expect_cachep)
                        goto err2;
        }

        err = exp_proc_init(net);
        if (err < 0)
                goto err3;

        return 0;

err3:
        if (net_eq(net, &init_net))
                kmem_cache_destroy(nf_ct_expect_cachep);
err2:
        nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
                             nf_ct_expect_hsize);
err1:
        return err;
}
631
/* Per-netns teardown.  The shared slab cache is destroyed only for
 * init_net, after rcu_barrier() has let all pending
 * nf_ct_expect_free_rcu() callbacks complete. */
void nf_conntrack_expect_fini(struct net *net)
{
        exp_proc_remove(net);
        if (net_eq(net, &init_net)) {
                rcu_barrier(); /* Wait for call_rcu() before destroy */
                kmem_cache_destroy(nf_ct_expect_cachep);
        }
        nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
                             nf_ct_expect_hsize);
}