[NETFILTER]: nf_conntrack_expect: use RCU for expectation hash
[linux-3.10.git] / net / netfilter / nf_conntrack_expect.c
1 /* Expectation handling for nf_conntrack. */
2
3 /* (C) 1999-2001 Paul `Rusty' Russell
4  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11
12 #include <linux/types.h>
13 #include <linux/netfilter.h>
14 #include <linux/skbuff.h>
15 #include <linux/proc_fs.h>
16 #include <linux/seq_file.h>
17 #include <linux/stddef.h>
18 #include <linux/slab.h>
19 #include <linux/err.h>
20 #include <linux/percpu.h>
21 #include <linux/kernel.h>
22 #include <linux/jhash.h>
23 #include <net/net_namespace.h>
24
25 #include <net/netfilter/nf_conntrack.h>
26 #include <net/netfilter/nf_conntrack_core.h>
27 #include <net/netfilter/nf_conntrack_expect.h>
28 #include <net/netfilter/nf_conntrack_helper.h>
29 #include <net/netfilter/nf_conntrack_tuple.h>
30
31 struct hlist_head *nf_ct_expect_hash __read_mostly;
32 EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
33
34 unsigned int nf_ct_expect_hsize __read_mostly;
35 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
36
37 static unsigned int nf_ct_expect_hash_rnd __read_mostly;
38 static unsigned int nf_ct_expect_count;
39 unsigned int nf_ct_expect_max __read_mostly;
40 static int nf_ct_expect_hash_rnd_initted __read_mostly;
41 static int nf_ct_expect_vmalloc;
42
43 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
44
45 /* nf_conntrack_expect helper functions */
46 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
47 {
48         struct nf_conn_help *master_help = nfct_help(exp->master);
49
50         NF_CT_ASSERT(master_help);
51         NF_CT_ASSERT(!timer_pending(&exp->timeout));
52
53         hlist_del_rcu(&exp->hnode);
54         nf_ct_expect_count--;
55
56         hlist_del(&exp->lnode);
57         master_help->expecting--;
58         nf_ct_expect_put(exp);
59
60         NF_CT_STAT_INC(expect_delete);
61 }
62 EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
63
64 static void nf_ct_expectation_timed_out(unsigned long ul_expect)
65 {
66         struct nf_conntrack_expect *exp = (void *)ul_expect;
67
68         write_lock_bh(&nf_conntrack_lock);
69         nf_ct_unlink_expect(exp);
70         write_unlock_bh(&nf_conntrack_lock);
71         nf_ct_expect_put(exp);
72 }
73
74 static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
75 {
76         unsigned int hash;
77
78         if (unlikely(!nf_ct_expect_hash_rnd_initted)) {
79                 get_random_bytes(&nf_ct_expect_hash_rnd, 4);
80                 nf_ct_expect_hash_rnd_initted = 1;
81         }
82
83         hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
84                       (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
85                        (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd);
86         return ((u64)hash * nf_ct_expect_hsize) >> 32;
87 }
88
89 struct nf_conntrack_expect *
90 __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple)
91 {
92         struct nf_conntrack_expect *i;
93         struct hlist_node *n;
94         unsigned int h;
95
96         if (!nf_ct_expect_count)
97                 return NULL;
98
99         h = nf_ct_expect_dst_hash(tuple);
100         hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) {
101                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
102                         return i;
103         }
104         return NULL;
105 }
106 EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
107
108 /* Just find a expectation corresponding to a tuple. */
109 struct nf_conntrack_expect *
110 nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple)
111 {
112         struct nf_conntrack_expect *i;
113
114         rcu_read_lock();
115         i = __nf_ct_expect_find(tuple);
116         if (i && !atomic_inc_not_zero(&i->use))
117                 i = NULL;
118         rcu_read_unlock();
119
120         return i;
121 }
122 EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
123
124 /* If an expectation for this connection is found, it gets delete from
125  * global list then returned. */
126 struct nf_conntrack_expect *
127 nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple)
128 {
129         struct nf_conntrack_expect *exp;
130
131         exp = __nf_ct_expect_find(tuple);
132         if (!exp)
133                 return NULL;
134
135         /* If master is not in hash table yet (ie. packet hasn't left
136            this machine yet), how can other end know about expected?
137            Hence these are not the droids you are looking for (if
138            master ct never got confirmed, we'd hold a reference to it
139            and weird things would happen to future packets). */
140         if (!nf_ct_is_confirmed(exp->master))
141                 return NULL;
142
143         if (exp->flags & NF_CT_EXPECT_PERMANENT) {
144                 atomic_inc(&exp->use);
145                 return exp;
146         } else if (del_timer(&exp->timeout)) {
147                 nf_ct_unlink_expect(exp);
148                 return exp;
149         }
150
151         return NULL;
152 }
153
154 /* delete all expectations for this conntrack */
155 void nf_ct_remove_expectations(struct nf_conn *ct)
156 {
157         struct nf_conn_help *help = nfct_help(ct);
158         struct nf_conntrack_expect *exp;
159         struct hlist_node *n, *next;
160
161         /* Optimization: most connection never expect any others. */
162         if (!help || help->expecting == 0)
163                 return;
164
165         hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
166                 if (del_timer(&exp->timeout)) {
167                         nf_ct_unlink_expect(exp);
168                         nf_ct_expect_put(exp);
169                 }
170         }
171 }
172 EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
173
174 /* Would two expected things clash? */
175 static inline int expect_clash(const struct nf_conntrack_expect *a,
176                                const struct nf_conntrack_expect *b)
177 {
178         /* Part covered by intersection of masks must be unequal,
179            otherwise they clash */
180         struct nf_conntrack_tuple_mask intersect_mask;
181         int count;
182
183         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
184
185         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
186                 intersect_mask.src.u3.all[count] =
187                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
188         }
189
190         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
191 }
192
193 static inline int expect_matches(const struct nf_conntrack_expect *a,
194                                  const struct nf_conntrack_expect *b)
195 {
196         return a->master == b->master
197                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
198                 && nf_ct_tuple_mask_equal(&a->mask, &b->mask);
199 }
200
201 /* Generally a bad idea to call this: could have matched already. */
202 void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
203 {
204         write_lock_bh(&nf_conntrack_lock);
205         if (del_timer(&exp->timeout)) {
206                 nf_ct_unlink_expect(exp);
207                 nf_ct_expect_put(exp);
208         }
209         write_unlock_bh(&nf_conntrack_lock);
210 }
211 EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
212
213 /* We don't increase the master conntrack refcount for non-fulfilled
214  * conntracks. During the conntrack destruction, the expectations are
215  * always killed before the conntrack itself */
216 struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
217 {
218         struct nf_conntrack_expect *new;
219
220         new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
221         if (!new)
222                 return NULL;
223
224         new->master = me;
225         atomic_set(&new->use, 1);
226         INIT_RCU_HEAD(&new->rcu);
227         return new;
228 }
229 EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
230
231 void nf_ct_expect_init(struct nf_conntrack_expect *exp, int family,
232                        union nf_inet_addr *saddr,
233                        union nf_inet_addr *daddr,
234                        u_int8_t proto, __be16 *src, __be16 *dst)
235 {
236         int len;
237
238         if (family == AF_INET)
239                 len = 4;
240         else
241                 len = 16;
242
243         exp->flags = 0;
244         exp->expectfn = NULL;
245         exp->helper = NULL;
246         exp->tuple.src.l3num = family;
247         exp->tuple.dst.protonum = proto;
248
249         if (saddr) {
250                 memcpy(&exp->tuple.src.u3, saddr, len);
251                 if (sizeof(exp->tuple.src.u3) > len)
252                         /* address needs to be cleared for nf_ct_tuple_equal */
253                         memset((void *)&exp->tuple.src.u3 + len, 0x00,
254                                sizeof(exp->tuple.src.u3) - len);
255                 memset(&exp->mask.src.u3, 0xFF, len);
256                 if (sizeof(exp->mask.src.u3) > len)
257                         memset((void *)&exp->mask.src.u3 + len, 0x00,
258                                sizeof(exp->mask.src.u3) - len);
259         } else {
260                 memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
261                 memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
262         }
263
264         if (src) {
265                 exp->tuple.src.u.all = *src;
266                 exp->mask.src.u.all = htons(0xFFFF);
267         } else {
268                 exp->tuple.src.u.all = 0;
269                 exp->mask.src.u.all = 0;
270         }
271
272         memcpy(&exp->tuple.dst.u3, daddr, len);
273         if (sizeof(exp->tuple.dst.u3) > len)
274                 /* address needs to be cleared for nf_ct_tuple_equal */
275                 memset((void *)&exp->tuple.dst.u3 + len, 0x00,
276                        sizeof(exp->tuple.dst.u3) - len);
277
278         exp->tuple.dst.u.all = *dst;
279 }
280 EXPORT_SYMBOL_GPL(nf_ct_expect_init);
281
282 static void nf_ct_expect_free_rcu(struct rcu_head *head)
283 {
284         struct nf_conntrack_expect *exp;
285
286         exp = container_of(head, struct nf_conntrack_expect, rcu);
287         kmem_cache_free(nf_ct_expect_cachep, exp);
288 }
289
290 void nf_ct_expect_put(struct nf_conntrack_expect *exp)
291 {
292         if (atomic_dec_and_test(&exp->use))
293                 call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
294 }
295 EXPORT_SYMBOL_GPL(nf_ct_expect_put);
296
297 static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
298 {
299         struct nf_conn_help *master_help = nfct_help(exp->master);
300         unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
301
302         atomic_inc(&exp->use);
303
304         hlist_add_head(&exp->lnode, &master_help->expectations);
305         master_help->expecting++;
306
307         hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
308         nf_ct_expect_count++;
309
310         setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
311                     (unsigned long)exp);
312         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
313         add_timer(&exp->timeout);
314
315         atomic_inc(&exp->use);
316         NF_CT_STAT_INC(expect_create);
317 }
318
319 /* Race with expectations being used means we could have none to find; OK. */
320 static void evict_oldest_expect(struct nf_conn *master)
321 {
322         struct nf_conn_help *master_help = nfct_help(master);
323         struct nf_conntrack_expect *exp = NULL;
324         struct hlist_node *n;
325
326         hlist_for_each_entry(exp, n, &master_help->expectations, lnode)
327                 ; /* nothing */
328
329         if (exp && del_timer(&exp->timeout)) {
330                 nf_ct_unlink_expect(exp);
331                 nf_ct_expect_put(exp);
332         }
333 }
334
335 static inline int refresh_timer(struct nf_conntrack_expect *i)
336 {
337         struct nf_conn_help *master_help = nfct_help(i->master);
338
339         if (!del_timer(&i->timeout))
340                 return 0;
341
342         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
343         add_timer(&i->timeout);
344         return 1;
345 }
346
347 int nf_ct_expect_related(struct nf_conntrack_expect *expect)
348 {
349         struct nf_conntrack_expect *i;
350         struct nf_conn *master = expect->master;
351         struct nf_conn_help *master_help = nfct_help(master);
352         struct hlist_node *n;
353         unsigned int h;
354         int ret;
355
356         NF_CT_ASSERT(master_help);
357
358         write_lock_bh(&nf_conntrack_lock);
359         if (!master_help->helper) {
360                 ret = -ESHUTDOWN;
361                 goto out;
362         }
363         h = nf_ct_expect_dst_hash(&expect->tuple);
364         hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) {
365                 if (expect_matches(i, expect)) {
366                         /* Refresh timer: if it's dying, ignore.. */
367                         if (refresh_timer(i)) {
368                                 ret = 0;
369                                 goto out;
370                         }
371                 } else if (expect_clash(i, expect)) {
372                         ret = -EBUSY;
373                         goto out;
374                 }
375         }
376         /* Will be over limit? */
377         if (master_help->helper->max_expected &&
378             master_help->expecting >= master_help->helper->max_expected)
379                 evict_oldest_expect(master);
380
381         if (nf_ct_expect_count >= nf_ct_expect_max) {
382                 if (net_ratelimit())
383                         printk(KERN_WARNING
384                                "nf_conntrack: expectation table full");
385                 ret = -EMFILE;
386                 goto out;
387         }
388
389         nf_ct_expect_insert(expect);
390         nf_ct_expect_event(IPEXP_NEW, expect);
391         ret = 0;
392 out:
393         write_unlock_bh(&nf_conntrack_lock);
394         return ret;
395 }
396 EXPORT_SYMBOL_GPL(nf_ct_expect_related);
397
398 #ifdef CONFIG_PROC_FS
399 struct ct_expect_iter_state {
400         unsigned int bucket;
401 };
402
403 static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
404 {
405         struct ct_expect_iter_state *st = seq->private;
406         struct hlist_node *n;
407
408         for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
409                 n = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
410                 if (n)
411                         return n;
412         }
413         return NULL;
414 }
415
416 static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
417                                              struct hlist_node *head)
418 {
419         struct ct_expect_iter_state *st = seq->private;
420
421         head = rcu_dereference(head->next);
422         while (head == NULL) {
423                 if (++st->bucket >= nf_ct_expect_hsize)
424                         return NULL;
425                 head = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
426         }
427         return head;
428 }
429
430 static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
431 {
432         struct hlist_node *head = ct_expect_get_first(seq);
433
434         if (head)
435                 while (pos && (head = ct_expect_get_next(seq, head)))
436                         pos--;
437         return pos ? NULL : head;
438 }
439
440 static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
441         __acquires(RCU)
442 {
443         rcu_read_lock();
444         return ct_expect_get_idx(seq, *pos);
445 }
446
447 static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
448 {
449         (*pos)++;
450         return ct_expect_get_next(seq, v);
451 }
452
453 static void exp_seq_stop(struct seq_file *seq, void *v)
454         __releases(RCU)
455 {
456         rcu_read_unlock();
457 }
458
459 static int exp_seq_show(struct seq_file *s, void *v)
460 {
461         struct nf_conntrack_expect *expect;
462         struct hlist_node *n = v;
463
464         expect = hlist_entry(n, struct nf_conntrack_expect, hnode);
465
466         if (expect->timeout.function)
467                 seq_printf(s, "%ld ", timer_pending(&expect->timeout)
468                            ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
469         else
470                 seq_printf(s, "- ");
471         seq_printf(s, "l3proto = %u proto=%u ",
472                    expect->tuple.src.l3num,
473                    expect->tuple.dst.protonum);
474         print_tuple(s, &expect->tuple,
475                     __nf_ct_l3proto_find(expect->tuple.src.l3num),
476                     __nf_ct_l4proto_find(expect->tuple.src.l3num,
477                                        expect->tuple.dst.protonum));
478         return seq_putc(s, '\n');
479 }
480
481 static const struct seq_operations exp_seq_ops = {
482         .start = exp_seq_start,
483         .next = exp_seq_next,
484         .stop = exp_seq_stop,
485         .show = exp_seq_show
486 };
487
488 static int exp_open(struct inode *inode, struct file *file)
489 {
490         return seq_open_private(file, &exp_seq_ops,
491                         sizeof(struct ct_expect_iter_state));
492 }
493
494 static const struct file_operations exp_file_ops = {
495         .owner   = THIS_MODULE,
496         .open    = exp_open,
497         .read    = seq_read,
498         .llseek  = seq_lseek,
499         .release = seq_release_private,
500 };
501 #endif /* CONFIG_PROC_FS */
502
503 static int __init exp_proc_init(void)
504 {
505 #ifdef CONFIG_PROC_FS
506         struct proc_dir_entry *proc;
507
508         proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops);
509         if (!proc)
510                 return -ENOMEM;
511 #endif /* CONFIG_PROC_FS */
512         return 0;
513 }
514
515 static void exp_proc_remove(void)
516 {
517 #ifdef CONFIG_PROC_FS
518         proc_net_remove(&init_net, "nf_conntrack_expect");
519 #endif /* CONFIG_PROC_FS */
520 }
521
522 module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600);
523
524 int __init nf_conntrack_expect_init(void)
525 {
526         int err = -ENOMEM;
527
528         if (!nf_ct_expect_hsize) {
529                 nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
530                 if (!nf_ct_expect_hsize)
531                         nf_ct_expect_hsize = 1;
532         }
533         nf_ct_expect_max = nf_ct_expect_hsize * 4;
534
535         nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
536                                                   &nf_ct_expect_vmalloc);
537         if (nf_ct_expect_hash == NULL)
538                 goto err1;
539
540         nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
541                                         sizeof(struct nf_conntrack_expect),
542                                         0, 0, NULL);
543         if (!nf_ct_expect_cachep)
544                 goto err2;
545
546         err = exp_proc_init();
547         if (err < 0)
548                 goto err3;
549
550         return 0;
551
552 err3:
553         nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
554                              nf_ct_expect_hsize);
555 err2:
556         kmem_cache_destroy(nf_ct_expect_cachep);
557 err1:
558         return err;
559 }
560
561 void nf_conntrack_expect_fini(void)
562 {
563         exp_proc_remove();
564         kmem_cache_destroy(nf_ct_expect_cachep);
565         nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
566                              nf_ct_expect_hsize);
567 }