519d3f00ef9ee4fde8c616125c55e17f79b38651
[linux-2.6.git] / lib / radix-tree.c
1 /*
2  * Copyright (C) 2001 Momchil Velikov
3  * Portions Copyright (C) 2001 Christoph Hellwig
4  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5  * Copyright (C) 2006 Nick Piggin
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2, or (at
10  * your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35
36
37 #ifdef __KERNEL__
38 #define RADIX_TREE_MAP_SHIFT    (CONFIG_BASE_SMALL ? 4 : 6)
39 #else
40 #define RADIX_TREE_MAP_SHIFT    3       /* For more stressful testing */
41 #endif
42
43 #define RADIX_TREE_MAP_SIZE     (1UL << RADIX_TREE_MAP_SHIFT)
44 #define RADIX_TREE_MAP_MASK     (RADIX_TREE_MAP_SIZE-1)
45
46 #define RADIX_TREE_TAG_LONGS    \
47         ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
48
49 struct radix_tree_node {
50         unsigned int    height;         /* Height from the bottom */
51         unsigned int    count;
52         struct rcu_head rcu_head;
53         void            *slots[RADIX_TREE_MAP_SIZE];
54         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
55 };
56
57 struct radix_tree_path {
58         struct radix_tree_node *node;
59         int offset;
60 };
61
62 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
63 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
64                                           RADIX_TREE_MAP_SHIFT))
65
66 /*
67  * The height_to_maxindex array needs to be one deeper than the maximum
68  * path as height 0 holds only 1 entry.
69  */
70 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
71
72 /*
73  * Radix tree node cache.
74  */
75 static struct kmem_cache *radix_tree_node_cachep;
76
77 /*
78  * Per-cpu pool of preloaded nodes
79  */
80 struct radix_tree_preload {
81         int nr;
82         struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
83 };
84 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
85
86 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
87 {
88         return root->gfp_mask & __GFP_BITS_MASK;
89 }
90
91 /*
92  * This assumes that the caller has performed appropriate preallocation, and
93  * that the caller has pinned this thread of control to the current CPU.
94  */
95 static struct radix_tree_node *
96 radix_tree_node_alloc(struct radix_tree_root *root)
97 {
98         struct radix_tree_node *ret;
99         gfp_t gfp_mask = root_gfp_mask(root);
100
101         ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
102         if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
103                 struct radix_tree_preload *rtp;
104
105                 rtp = &__get_cpu_var(radix_tree_preloads);
106                 if (rtp->nr) {
107                         ret = rtp->nodes[rtp->nr - 1];
108                         rtp->nodes[rtp->nr - 1] = NULL;
109                         rtp->nr--;
110                 }
111         }
112         BUG_ON(radix_tree_is_indirect_ptr(ret));
113         return ret;
114 }
115
116 static void radix_tree_node_rcu_free(struct rcu_head *head)
117 {
118         struct radix_tree_node *node =
119                         container_of(head, struct radix_tree_node, rcu_head);
120         kmem_cache_free(radix_tree_node_cachep, node);
121 }
122
123 static inline void
124 radix_tree_node_free(struct radix_tree_node *node)
125 {
126         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
127 }
128
129 /*
130  * Load up this CPU's radix_tree_node buffer with sufficient objects to
131  * ensure that the addition of a single element in the tree cannot fail.  On
132  * success, return zero, with preemption disabled.  On error, return -ENOMEM
133  * with preemption not disabled.
134  */
135 int radix_tree_preload(gfp_t gfp_mask)
136 {
137         struct radix_tree_preload *rtp;
138         struct radix_tree_node *node;
139         int ret = -ENOMEM;
140
141         preempt_disable();
142         rtp = &__get_cpu_var(radix_tree_preloads);
143         while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
144                 preempt_enable();
145                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
146                 if (node == NULL)
147                         goto out;
148                 preempt_disable();
149                 rtp = &__get_cpu_var(radix_tree_preloads);
150                 if (rtp->nr < ARRAY_SIZE(rtp->nodes))
151                         rtp->nodes[rtp->nr++] = node;
152                 else
153                         kmem_cache_free(radix_tree_node_cachep, node);
154         }
155         ret = 0;
156 out:
157         return ret;
158 }
159 EXPORT_SYMBOL(radix_tree_preload);
160
161 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
162                 int offset)
163 {
164         __set_bit(offset, node->tags[tag]);
165 }
166
167 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
168                 int offset)
169 {
170         __clear_bit(offset, node->tags[tag]);
171 }
172
173 static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
174                 int offset)
175 {
176         return test_bit(offset, node->tags[tag]);
177 }
178
179 static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
180 {
181         root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
182 }
183
184
185 static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
186 {
187         root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
188 }
189
190 static inline void root_tag_clear_all(struct radix_tree_root *root)
191 {
192         root->gfp_mask &= __GFP_BITS_MASK;
193 }
194
195 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
196 {
197         return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
198 }
199
200 /*
201  * Returns 1 if any slot in the node has this tag set.
202  * Otherwise returns 0.
203  */
204 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
205 {
206         int idx;
207         for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
208                 if (node->tags[tag][idx])
209                         return 1;
210         }
211         return 0;
212 }
213
214 /*
215  *      Return the maximum key which can be store into a
216  *      radix tree with height HEIGHT.
217  */
218 static inline unsigned long radix_tree_maxindex(unsigned int height)
219 {
220         return height_to_maxindex[height];
221 }
222
223 /*
224  *      Extend a radix tree so it can store key @index.
225  */
226 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
227 {
228         struct radix_tree_node *node;
229         unsigned int height;
230         int tag;
231
232         /* Figure out what the height should be.  */
233         height = root->height + 1;
234         while (index > radix_tree_maxindex(height))
235                 height++;
236
237         if (root->rnode == NULL) {
238                 root->height = height;
239                 goto out;
240         }
241
242         do {
243                 unsigned int newheight;
244                 if (!(node = radix_tree_node_alloc(root)))
245                         return -ENOMEM;
246
247                 /* Increase the height.  */
248                 node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
249
250                 /* Propagate the aggregated tag info into the new root */
251                 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
252                         if (root_tag_get(root, tag))
253                                 tag_set(node, tag, 0);
254                 }
255
256                 newheight = root->height+1;
257                 node->height = newheight;
258                 node->count = 1;
259                 node = radix_tree_ptr_to_indirect(node);
260                 rcu_assign_pointer(root->rnode, node);
261                 root->height = newheight;
262         } while (height > root->height);
263 out:
264         return 0;
265 }
266
267 /**
268  *      radix_tree_insert    -    insert into a radix tree
269  *      @root:          radix tree root
270  *      @index:         index key
271  *      @item:          item to insert
272  *
273  *      Insert an item into the radix tree at position @index.
274  */
275 int radix_tree_insert(struct radix_tree_root *root,
276                         unsigned long index, void *item)
277 {
278         struct radix_tree_node *node = NULL, *slot;
279         unsigned int height, shift;
280         int offset;
281         int error;
282
283         BUG_ON(radix_tree_is_indirect_ptr(item));
284
285         /* Make sure the tree is high enough.  */
286         if (index > radix_tree_maxindex(root->height)) {
287                 error = radix_tree_extend(root, index);
288                 if (error)
289                         return error;
290         }
291
292         slot = radix_tree_indirect_to_ptr(root->rnode);
293
294         height = root->height;
295         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
296
297         offset = 0;                     /* uninitialised var warning */
298         while (height > 0) {
299                 if (slot == NULL) {
300                         /* Have to add a child node.  */
301                         if (!(slot = radix_tree_node_alloc(root)))
302                                 return -ENOMEM;
303                         slot->height = height;
304                         if (node) {
305                                 rcu_assign_pointer(node->slots[offset], slot);
306                                 node->count++;
307                         } else
308                                 rcu_assign_pointer(root->rnode,
309                                         radix_tree_ptr_to_indirect(slot));
310                 }
311
312                 /* Go a level down */
313                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
314                 node = slot;
315                 slot = node->slots[offset];
316                 shift -= RADIX_TREE_MAP_SHIFT;
317                 height--;
318         }
319
320         if (slot != NULL)
321                 return -EEXIST;
322
323         if (node) {
324                 node->count++;
325                 rcu_assign_pointer(node->slots[offset], item);
326                 BUG_ON(tag_get(node, 0, offset));
327                 BUG_ON(tag_get(node, 1, offset));
328         } else {
329                 rcu_assign_pointer(root->rnode, item);
330                 BUG_ON(root_tag_get(root, 0));
331                 BUG_ON(root_tag_get(root, 1));
332         }
333
334         return 0;
335 }
336 EXPORT_SYMBOL(radix_tree_insert);
337
338 /**
339  *      radix_tree_lookup_slot    -    lookup a slot in a radix tree
340  *      @root:          radix tree root
341  *      @index:         index key
342  *
343  *      Returns:  the slot corresponding to the position @index in the
344  *      radix tree @root. This is useful for update-if-exists operations.
345  *
346  *      This function cannot be called under rcu_read_lock, it must be
347  *      excluded from writers, as must the returned slot for subsequent
348  *      use by radix_tree_deref_slot() and radix_tree_replace slot.
349  *      Caller must hold tree write locked across slot lookup and
350  *      replace.
351  */
352 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
353 {
354         unsigned int height, shift;
355         struct radix_tree_node *node, **slot;
356
357         node = root->rnode;
358         if (node == NULL)
359                 return NULL;
360
361         if (!radix_tree_is_indirect_ptr(node)) {
362                 if (index > 0)
363                         return NULL;
364                 return (void **)&root->rnode;
365         }
366         node = radix_tree_indirect_to_ptr(node);
367
368         height = node->height;
369         if (index > radix_tree_maxindex(height))
370                 return NULL;
371
372         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
373
374         do {
375                 slot = (struct radix_tree_node **)
376                         (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
377                 node = *slot;
378                 if (node == NULL)
379                         return NULL;
380
381                 shift -= RADIX_TREE_MAP_SHIFT;
382                 height--;
383         } while (height > 0);
384
385         return (void **)slot;
386 }
387 EXPORT_SYMBOL(radix_tree_lookup_slot);
388
389 /**
390  *      radix_tree_lookup    -    perform lookup operation on a radix tree
391  *      @root:          radix tree root
392  *      @index:         index key
393  *
394  *      Lookup the item at the position @index in the radix tree @root.
395  *
396  *      This function can be called under rcu_read_lock, however the caller
397  *      must manage lifetimes of leaf nodes (eg. RCU may also be used to free
398  *      them safely). No RCU barriers are required to access or modify the
399  *      returned item, however.
400  */
401 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
402 {
403         unsigned int height, shift;
404         struct radix_tree_node *node, **slot;
405
406         node = rcu_dereference(root->rnode);
407         if (node == NULL)
408                 return NULL;
409
410         if (!radix_tree_is_indirect_ptr(node)) {
411                 if (index > 0)
412                         return NULL;
413                 return node;
414         }
415         node = radix_tree_indirect_to_ptr(node);
416
417         height = node->height;
418         if (index > radix_tree_maxindex(height))
419                 return NULL;
420
421         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
422
423         do {
424                 slot = (struct radix_tree_node **)
425                         (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
426                 node = rcu_dereference(*slot);
427                 if (node == NULL)
428                         return NULL;
429
430                 shift -= RADIX_TREE_MAP_SHIFT;
431                 height--;
432         } while (height > 0);
433
434         return node;
435 }
436 EXPORT_SYMBOL(radix_tree_lookup);
437
438 /**
439  *      radix_tree_tag_set - set a tag on a radix tree node
440  *      @root:          radix tree root
441  *      @index:         index key
442  *      @tag:           tag index
443  *
444  *      Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
445  *      corresponding to @index in the radix tree.  From
446  *      the root all the way down to the leaf node.
447  *
448  *      Returns the address of the tagged item.   Setting a tag on a not-present
449  *      item is a bug.
450  */
451 void *radix_tree_tag_set(struct radix_tree_root *root,
452                         unsigned long index, unsigned int tag)
453 {
454         unsigned int height, shift;
455         struct radix_tree_node *slot;
456
457         height = root->height;
458         BUG_ON(index > radix_tree_maxindex(height));
459
460         slot = radix_tree_indirect_to_ptr(root->rnode);
461         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
462
463         while (height > 0) {
464                 int offset;
465
466                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
467                 if (!tag_get(slot, tag, offset))
468                         tag_set(slot, tag, offset);
469                 slot = slot->slots[offset];
470                 BUG_ON(slot == NULL);
471                 shift -= RADIX_TREE_MAP_SHIFT;
472                 height--;
473         }
474
475         /* set the root's tag bit */
476         if (slot && !root_tag_get(root, tag))
477                 root_tag_set(root, tag);
478
479         return slot;
480 }
481 EXPORT_SYMBOL(radix_tree_tag_set);
482
483 /**
484  *      radix_tree_tag_clear - clear a tag on a radix tree node
485  *      @root:          radix tree root
486  *      @index:         index key
487  *      @tag:           tag index
488  *
489  *      Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
490  *      corresponding to @index in the radix tree.  If
491  *      this causes the leaf node to have no tags set then clear the tag in the
492  *      next-to-leaf node, etc.
493  *
494  *      Returns the address of the tagged item on success, else NULL.  ie:
495  *      has the same return value and semantics as radix_tree_lookup().
496  */
497 void *radix_tree_tag_clear(struct radix_tree_root *root,
498                         unsigned long index, unsigned int tag)
499 {
500         /*
501          * The radix tree path needs to be one longer than the maximum path
502          * since the "list" is null terminated.
503          */
504         struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
505         struct radix_tree_node *slot = NULL;
506         unsigned int height, shift;
507
508         height = root->height;
509         if (index > radix_tree_maxindex(height))
510                 goto out;
511
512         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
513         pathp->node = NULL;
514         slot = radix_tree_indirect_to_ptr(root->rnode);
515
516         while (height > 0) {
517                 int offset;
518
519                 if (slot == NULL)
520                         goto out;
521
522                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
523                 pathp[1].offset = offset;
524                 pathp[1].node = slot;
525                 slot = slot->slots[offset];
526                 pathp++;
527                 shift -= RADIX_TREE_MAP_SHIFT;
528                 height--;
529         }
530
531         if (slot == NULL)
532                 goto out;
533
534         while (pathp->node) {
535                 if (!tag_get(pathp->node, tag, pathp->offset))
536                         goto out;
537                 tag_clear(pathp->node, tag, pathp->offset);
538                 if (any_tag_set(pathp->node, tag))
539                         goto out;
540                 pathp--;
541         }
542
543         /* clear the root's tag bit */
544         if (root_tag_get(root, tag))
545                 root_tag_clear(root, tag);
546
547 out:
548         return slot;
549 }
550 EXPORT_SYMBOL(radix_tree_tag_clear);
551
552 #ifndef __KERNEL__      /* Only the test harness uses this at present */
553 /**
554  * radix_tree_tag_get - get a tag on a radix tree node
555  * @root:               radix tree root
556  * @index:              index key
557  * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
558  *
559  * Return values:
560  *
561  *  0: tag not present or not set
562  *  1: tag set
563  */
564 int radix_tree_tag_get(struct radix_tree_root *root,
565                         unsigned long index, unsigned int tag)
566 {
567         unsigned int height, shift;
568         struct radix_tree_node *node;
569         int saw_unset_tag = 0;
570
571         /* check the root's tag bit */
572         if (!root_tag_get(root, tag))
573                 return 0;
574
575         node = rcu_dereference(root->rnode);
576         if (node == NULL)
577                 return 0;
578
579         if (!radix_tree_is_indirect_ptr(node))
580                 return (index == 0);
581         node = radix_tree_indirect_to_ptr(node);
582
583         height = node->height;
584         if (index > radix_tree_maxindex(height))
585                 return 0;
586
587         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
588
589         for ( ; ; ) {
590                 int offset;
591
592                 if (node == NULL)
593                         return 0;
594
595                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
596
597                 /*
598                  * This is just a debug check.  Later, we can bale as soon as
599                  * we see an unset tag.
600                  */
601                 if (!tag_get(node, tag, offset))
602                         saw_unset_tag = 1;
603                 if (height == 1) {
604                         int ret = tag_get(node, tag, offset);
605
606                         BUG_ON(ret && saw_unset_tag);
607                         return !!ret;
608                 }
609                 node = rcu_dereference(node->slots[offset]);
610                 shift -= RADIX_TREE_MAP_SHIFT;
611                 height--;
612         }
613 }
614 EXPORT_SYMBOL(radix_tree_tag_get);
615 #endif
616
617 /**
618  *      radix_tree_next_hole    -    find the next hole (not-present entry)
619  *      @root:          tree root
620  *      @index:         index key
621  *      @max_scan:      maximum range to search
622  *
623  *      Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
624  *      indexed hole.
625  *
626  *      Returns: the index of the hole if found, otherwise returns an index
627  *      outside of the set specified (in which case 'return - index >= max_scan'
628  *      will be true).
629  *
630  *      radix_tree_next_hole may be called under rcu_read_lock. However, like
631  *      radix_tree_gang_lookup, this will not atomically search a snapshot of the
632  *      tree at a single point in time. For example, if a hole is created at index
633  *      5, then subsequently a hole is created at index 10, radix_tree_next_hole
634  *      covering both indexes may return 10 if called under rcu_read_lock.
635  */
636 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
637                                 unsigned long index, unsigned long max_scan)
638 {
639         unsigned long i;
640
641         for (i = 0; i < max_scan; i++) {
642                 if (!radix_tree_lookup(root, index))
643                         break;
644                 index++;
645                 if (index == 0)
646                         break;
647         }
648
649         return index;
650 }
651 EXPORT_SYMBOL(radix_tree_next_hole);
652
653 static unsigned int
654 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
655         unsigned int max_items, unsigned long *next_index)
656 {
657         unsigned int nr_found = 0;
658         unsigned int shift, height;
659         unsigned long i;
660
661         height = slot->height;
662         if (height == 0)
663                 goto out;
664         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
665
666         for ( ; height > 1; height--) {
667                 i = (index >> shift) & RADIX_TREE_MAP_MASK;
668                 for (;;) {
669                         if (slot->slots[i] != NULL)
670                                 break;
671                         index &= ~((1UL << shift) - 1);
672                         index += 1UL << shift;
673                         if (index == 0)
674                                 goto out;       /* 32-bit wraparound */
675                         i++;
676                         if (i == RADIX_TREE_MAP_SIZE)
677                                 goto out;
678                 }
679
680                 shift -= RADIX_TREE_MAP_SHIFT;
681                 slot = rcu_dereference(slot->slots[i]);
682                 if (slot == NULL)
683                         goto out;
684         }
685
686         /* Bottom level: grab some items */
687         for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
688                 struct radix_tree_node *node;
689                 index++;
690                 node = slot->slots[i];
691                 if (node) {
692                         results[nr_found++] = rcu_dereference(node);
693                         if (nr_found == max_items)
694                                 goto out;
695                 }
696         }
697 out:
698         *next_index = index;
699         return nr_found;
700 }
701
702 /**
703  *      radix_tree_gang_lookup - perform multiple lookup on a radix tree
704  *      @root:          radix tree root
705  *      @results:       where the results of the lookup are placed
706  *      @first_index:   start the lookup from this key
707  *      @max_items:     place up to this many items at *results
708  *
709  *      Performs an index-ascending scan of the tree for present items.  Places
710  *      them at *@results and returns the number of items which were placed at
711  *      *@results.
712  *
713  *      The implementation is naive.
714  *
715  *      Like radix_tree_lookup, radix_tree_gang_lookup may be called under
716  *      rcu_read_lock. In this case, rather than the returned results being
717  *      an atomic snapshot of the tree at a single point in time, the semantics
718  *      of an RCU protected gang lookup are as though multiple radix_tree_lookups
719  *      have been issued in individual locks, and results stored in 'results'.
720  */
721 unsigned int
722 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
723                         unsigned long first_index, unsigned int max_items)
724 {
725         unsigned long max_index;
726         struct radix_tree_node *node;
727         unsigned long cur_index = first_index;
728         unsigned int ret;
729
730         node = rcu_dereference(root->rnode);
731         if (!node)
732                 return 0;
733
734         if (!radix_tree_is_indirect_ptr(node)) {
735                 if (first_index > 0)
736                         return 0;
737                 results[0] = node;
738                 return 1;
739         }
740         node = radix_tree_indirect_to_ptr(node);
741
742         max_index = radix_tree_maxindex(node->height);
743
744         ret = 0;
745         while (ret < max_items) {
746                 unsigned int nr_found;
747                 unsigned long next_index;       /* Index of next search */
748
749                 if (cur_index > max_index)
750                         break;
751                 nr_found = __lookup(node, results + ret, cur_index,
752                                         max_items - ret, &next_index);
753                 ret += nr_found;
754                 if (next_index == 0)
755                         break;
756                 cur_index = next_index;
757         }
758
759         return ret;
760 }
761 EXPORT_SYMBOL(radix_tree_gang_lookup);
762
763 /*
764  * FIXME: the two tag_get()s here should use find_next_bit() instead of
765  * open-coding the search.
766  */
767 static unsigned int
768 __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
769         unsigned int max_items, unsigned long *next_index, unsigned int tag)
770 {
771         unsigned int nr_found = 0;
772         unsigned int shift, height;
773
774         height = slot->height;
775         if (height == 0)
776                 goto out;
777         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
778
779         while (height > 0) {
780                 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
781
782                 for (;;) {
783                         if (tag_get(slot, tag, i))
784                                 break;
785                         index &= ~((1UL << shift) - 1);
786                         index += 1UL << shift;
787                         if (index == 0)
788                                 goto out;       /* 32-bit wraparound */
789                         i++;
790                         if (i == RADIX_TREE_MAP_SIZE)
791                                 goto out;
792                 }
793                 height--;
794                 if (height == 0) {      /* Bottom level: grab some items */
795                         unsigned long j = index & RADIX_TREE_MAP_MASK;
796
797                         for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
798                                 struct radix_tree_node *node;
799                                 index++;
800                                 if (!tag_get(slot, tag, j))
801                                         continue;
802                                 node = slot->slots[j];
803                                 /*
804                                  * Even though the tag was found set, we need to
805                                  * recheck that we have a non-NULL node, because
806                                  * if this lookup is lockless, it may have been
807                                  * subsequently deleted.
808                                  *
809                                  * Similar care must be taken in any place that
810                                  * lookup ->slots[x] without a lock (ie. can't
811                                  * rely on its value remaining the same).
812                                  */
813                                 if (node) {
814                                         node = rcu_dereference(node);
815                                         results[nr_found++] = node;
816                                         if (nr_found == max_items)
817                                                 goto out;
818                                 }
819                         }
820                 }
821                 shift -= RADIX_TREE_MAP_SHIFT;
822                 slot = rcu_dereference(slot->slots[i]);
823                 if (slot == NULL)
824                         break;
825         }
826 out:
827         *next_index = index;
828         return nr_found;
829 }
830
831 /**
832  *      radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
833  *                                   based on a tag
834  *      @root:          radix tree root
835  *      @results:       where the results of the lookup are placed
836  *      @first_index:   start the lookup from this key
837  *      @max_items:     place up to this many items at *results
838  *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
839  *
840  *      Performs an index-ascending scan of the tree for present items which
841  *      have the tag indexed by @tag set.  Places the items at *@results and
842  *      returns the number of items which were placed at *@results.
843  */
844 unsigned int
845 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
846                 unsigned long first_index, unsigned int max_items,
847                 unsigned int tag)
848 {
849         struct radix_tree_node *node;
850         unsigned long max_index;
851         unsigned long cur_index = first_index;
852         unsigned int ret;
853
854         /* check the root's tag bit */
855         if (!root_tag_get(root, tag))
856                 return 0;
857
858         node = rcu_dereference(root->rnode);
859         if (!node)
860                 return 0;
861
862         if (!radix_tree_is_indirect_ptr(node)) {
863                 if (first_index > 0)
864                         return 0;
865                 results[0] = node;
866                 return 1;
867         }
868         node = radix_tree_indirect_to_ptr(node);
869
870         max_index = radix_tree_maxindex(node->height);
871
872         ret = 0;
873         while (ret < max_items) {
874                 unsigned int nr_found;
875                 unsigned long next_index;       /* Index of next search */
876
877                 if (cur_index > max_index)
878                         break;
879                 nr_found = __lookup_tag(node, results + ret, cur_index,
880                                         max_items - ret, &next_index, tag);
881                 ret += nr_found;
882                 if (next_index == 0)
883                         break;
884                 cur_index = next_index;
885         }
886
887         return ret;
888 }
889 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
890
891 /**
892  *      radix_tree_shrink    -    shrink height of a radix tree to minimal
893  *      @root           radix tree root
894  */
895 static inline void radix_tree_shrink(struct radix_tree_root *root)
896 {
897         /* try to shrink tree height */
898         while (root->height > 0) {
899                 struct radix_tree_node *to_free = root->rnode;
900                 void *newptr;
901
902                 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
903                 to_free = radix_tree_indirect_to_ptr(to_free);
904
905                 /*
906                  * The candidate node has more than one child, or its child
907                  * is not at the leftmost slot, we cannot shrink.
908                  */
909                 if (to_free->count != 1)
910                         break;
911                 if (!to_free->slots[0])
912                         break;
913
914                 /*
915                  * We don't need rcu_assign_pointer(), since we are simply
916                  * moving the node from one part of the tree to another. If
917                  * it was safe to dereference the old pointer to it
918                  * (to_free->slots[0]), it will be safe to dereference the new
919                  * one (root->rnode).
920                  */
921                 newptr = to_free->slots[0];
922                 if (root->height > 1)
923                         newptr = radix_tree_ptr_to_indirect(newptr);
924                 root->rnode = newptr;
925                 root->height--;
926                 /* must only free zeroed nodes into the slab */
927                 tag_clear(to_free, 0, 0);
928                 tag_clear(to_free, 1, 0);
929                 to_free->slots[0] = NULL;
930                 to_free->count = 0;
931                 radix_tree_node_free(to_free);
932         }
933 }
934
935 /**
936  *      radix_tree_delete    -    delete an item from a radix tree
937  *      @root:          radix tree root
938  *      @index:         index key
939  *
940  *      Remove the item at @index from the radix tree rooted at @root.
941  *
942  *      Returns the address of the deleted item, or NULL if it was not present.
943  */
944 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
945 {
946         /*
947          * The radix tree path needs to be one longer than the maximum path
948          * since the "list" is null terminated.
949          */
950         struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
951         struct radix_tree_node *slot = NULL;
952         struct radix_tree_node *to_free;
953         unsigned int height, shift;
954         int tag;
955         int offset;
956
957         height = root->height;
958         if (index > radix_tree_maxindex(height))
959                 goto out;
960
961         slot = root->rnode;
962         if (height == 0) {
963                 root_tag_clear_all(root);
964                 root->rnode = NULL;
965                 goto out;
966         }
967         slot = radix_tree_indirect_to_ptr(slot);
968
969         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
970         pathp->node = NULL;
971
972         do {
973                 if (slot == NULL)
974                         goto out;
975
976                 pathp++;
977                 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
978                 pathp->offset = offset;
979                 pathp->node = slot;
980                 slot = slot->slots[offset];
981                 shift -= RADIX_TREE_MAP_SHIFT;
982                 height--;
983         } while (height > 0);
984
985         if (slot == NULL)
986                 goto out;
987
988         /*
989          * Clear all tags associated with the just-deleted item
990          */
991         for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
992                 if (tag_get(pathp->node, tag, pathp->offset))
993                         radix_tree_tag_clear(root, index, tag);
994         }
995
996         to_free = NULL;
997         /* Now free the nodes we do not need anymore */
998         while (pathp->node) {
999                 pathp->node->slots[pathp->offset] = NULL;
1000                 pathp->node->count--;
1001                 /*
1002                  * Queue the node for deferred freeing after the
1003                  * last reference to it disappears (set NULL, above).
1004                  */
1005                 if (to_free)
1006                         radix_tree_node_free(to_free);
1007
1008                 if (pathp->node->count) {
1009                         if (pathp->node ==
1010                                         radix_tree_indirect_to_ptr(root->rnode))
1011                                 radix_tree_shrink(root);
1012                         goto out;
1013                 }
1014
1015                 /* Node with zero slots in use so free it */
1016                 to_free = pathp->node;
1017                 pathp--;
1018
1019         }
1020         root_tag_clear_all(root);
1021         root->height = 0;
1022         root->rnode = NULL;
1023         if (to_free)
1024                 radix_tree_node_free(to_free);
1025
1026 out:
1027         return slot;
1028 }
1029 EXPORT_SYMBOL(radix_tree_delete);
1030
1031 /**
1032  *      radix_tree_tagged - test whether any items in the tree are tagged
1033  *      @root:          radix tree root
1034  *      @tag:           tag to test
1035  */
1036 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
1037 {
1038         return root_tag_get(root, tag);
1039 }
1040 EXPORT_SYMBOL(radix_tree_tagged);
1041
1042 static void
1043 radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
1044 {
1045         memset(node, 0, sizeof(struct radix_tree_node));
1046 }
1047
1048 static __init unsigned long __maxindex(unsigned int height)
1049 {
1050         unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
1051         unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
1052
1053         if (tmp >= RADIX_TREE_INDEX_BITS)
1054                 index = ~0UL;
1055         return index;
1056 }
1057
1058 static __init void radix_tree_init_maxindex(void)
1059 {
1060         unsigned int i;
1061
1062         for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
1063                 height_to_maxindex[i] = __maxindex(i);
1064 }
1065
1066 static int radix_tree_callback(struct notifier_block *nfb,
1067                             unsigned long action,
1068                             void *hcpu)
1069 {
1070        int cpu = (long)hcpu;
1071        struct radix_tree_preload *rtp;
1072
1073        /* Free per-cpu pool of perloaded nodes */
1074        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1075                rtp = &per_cpu(radix_tree_preloads, cpu);
1076                while (rtp->nr) {
1077                        kmem_cache_free(radix_tree_node_cachep,
1078                                        rtp->nodes[rtp->nr-1]);
1079                        rtp->nodes[rtp->nr-1] = NULL;
1080                        rtp->nr--;
1081                }
1082        }
1083        return NOTIFY_OK;
1084 }
1085
1086 void __init radix_tree_init(void)
1087 {
1088         radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1089                         sizeof(struct radix_tree_node), 0,
1090                         SLAB_PANIC, radix_tree_node_ctor);
1091         radix_tree_init_maxindex();
1092         hotcpu_notifier(radix_tree_callback, 0);
1093 }