kill-the-bkl/reiserfs: unlock only when needed in search_by_key
[linux-2.6.git] / fs / reiserfs / stree.c
1 /*
2  *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /*
6  *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
7  *  Programm System Institute
8  *  Pereslavl-Zalessky Russia
9  */
10
11 /*
12  *  This file contains functions dealing with S+tree
13  *
14  * B_IS_IN_TREE
15  * copy_item_head
16  * comp_short_keys
17  * comp_keys
18  * comp_short_le_keys
19  * le_key2cpu_key
20  * comp_le_keys
21  * bin_search
22  * get_lkey
23  * get_rkey
24  * key_in_buffer
25  * decrement_bcount
26  * reiserfs_check_path
27  * pathrelse_and_restore
28  * pathrelse
29  * search_by_key_reada
30  * search_by_key
31  * search_for_position_by_key
32  * comp_items
33  * prepare_for_direct_item
34  * prepare_for_direntry_item
35  * prepare_for_delete_or_cut
36  * calc_deleted_bytes_number
37  * init_tb_struct
38  * padd_item
39  * reiserfs_delete_item
40  * reiserfs_delete_solid_item
41  * reiserfs_delete_object
42  * maybe_indirect_to_direct
43  * indirect_to_direct_roll_back
44  * reiserfs_cut_from_item
45  * truncate_directory
46  * reiserfs_do_truncate
47  * reiserfs_paste_into_item
48  * reiserfs_insert_item
49  */
50
51 #include <linux/time.h>
52 #include <linux/string.h>
53 #include <linux/pagemap.h>
54 #include <linux/reiserfs_fs.h>
55 #include <linux/buffer_head.h>
56 #include <linux/quotaops.h>
57
58 /* Does the buffer contain a disk block which is in the tree. */
59 inline int B_IS_IN_TREE(const struct buffer_head *bh)
60 {
61
62         RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
63                "PAP-1010: block (%b) has too big level (%z)", bh, bh);
64
65         return (B_LEVEL(bh) != FREE_LEVEL);
66 }
67
68 //
69 // to gets item head in le form
70 //
71 inline void copy_item_head(struct item_head *to,
72                            const struct item_head *from)
73 {
74         memcpy(to, from, IH_SIZE);
75 }
76
77 /* k1 is pointer to on-disk structure which is stored in little-endian
78    form. k2 is pointer to cpu variable. For key of items of the same
79    object this returns 0.
80    Returns: -1 if key1 < key2
81    0 if key1 == key2
82    1 if key1 > key2 */
83 inline int comp_short_keys(const struct reiserfs_key *le_key,
84                            const struct cpu_key *cpu_key)
85 {
86         __u32 n;
87         n = le32_to_cpu(le_key->k_dir_id);
88         if (n < cpu_key->on_disk_key.k_dir_id)
89                 return -1;
90         if (n > cpu_key->on_disk_key.k_dir_id)
91                 return 1;
92         n = le32_to_cpu(le_key->k_objectid);
93         if (n < cpu_key->on_disk_key.k_objectid)
94                 return -1;
95         if (n > cpu_key->on_disk_key.k_objectid)
96                 return 1;
97         return 0;
98 }
99
100 /* k1 is pointer to on-disk structure which is stored in little-endian
101    form. k2 is pointer to cpu variable.
102    Compare keys using all 4 key fields.
103    Returns: -1 if key1 < key2 0
104    if key1 = key2 1 if key1 > key2 */
105 static inline int comp_keys(const struct reiserfs_key *le_key,
106                             const struct cpu_key *cpu_key)
107 {
108         int retval;
109
110         retval = comp_short_keys(le_key, cpu_key);
111         if (retval)
112                 return retval;
113         if (le_key_k_offset(le_key_version(le_key), le_key) <
114             cpu_key_k_offset(cpu_key))
115                 return -1;
116         if (le_key_k_offset(le_key_version(le_key), le_key) >
117             cpu_key_k_offset(cpu_key))
118                 return 1;
119
120         if (cpu_key->key_length == 3)
121                 return 0;
122
123         /* this part is needed only when tail conversion is in progress */
124         if (le_key_k_type(le_key_version(le_key), le_key) <
125             cpu_key_k_type(cpu_key))
126                 return -1;
127
128         if (le_key_k_type(le_key_version(le_key), le_key) >
129             cpu_key_k_type(cpu_key))
130                 return 1;
131
132         return 0;
133 }
134
135 inline int comp_short_le_keys(const struct reiserfs_key *key1,
136                               const struct reiserfs_key *key2)
137 {
138         __u32 *k1_u32, *k2_u32;
139         int key_length = REISERFS_SHORT_KEY_LEN;
140
141         k1_u32 = (__u32 *) key1;
142         k2_u32 = (__u32 *) key2;
143         for (; key_length--; ++k1_u32, ++k2_u32) {
144                 if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
145                         return -1;
146                 if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
147                         return 1;
148         }
149         return 0;
150 }
151
152 inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
153 {
154         int version;
155         to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
156         to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
157
158         // find out version of the key
159         version = le_key_version(from);
160         to->version = version;
161         to->on_disk_key.k_offset = le_key_k_offset(version, from);
162         to->on_disk_key.k_type = le_key_k_type(version, from);
163 }
164
165 // this does not say which one is bigger, it only returns 1 if keys
166 // are not equal, 0 otherwise
167 inline int comp_le_keys(const struct reiserfs_key *k1,
168                         const struct reiserfs_key *k2)
169 {
170         return memcmp(k1, k2, sizeof(struct reiserfs_key));
171 }
172
173 /**************************************************************************
174  *  Binary search toolkit function                                        *
175  *  Search for an item in the array by the item key                       *
176  *  Returns:    1 if found,  0 if not found;                              *
177  *        *pos = number of the searched element if found, else the        *
178  *        number of the first element that is larger than key.            *
179  **************************************************************************/
180 /* For those not familiar with binary search: lbound is the leftmost item that it
181  could be, rbound the rightmost item that it could be.  We examine the item
182  halfway between lbound and rbound, and that tells us either that we can increase
183  lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that
184  there are no possible items, and we have not found it. With each examination we
185  cut the number of possible items it could be by one more than half rounded down,
186  or we find it. */
187 static inline int bin_search(const void *key,   /* Key to search for. */
188                              const void *base,  /* First item in the array. */
189                              int num,   /* Number of items in the array. */
190                              int width, /* Item size in the array.
191                                            searched. Lest the reader be
192                                            confused, note that this is crafted
193                                            as a general function, and when it
194                                            is applied specifically to the array
195                                            of item headers in a node, width
196                                            is actually the item header size not
197                                            the item size. */
198                              int *pos /* Number of the searched for element. */
199     )
200 {
201         int rbound, lbound, j;
202
203         for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
204              lbound <= rbound; j = (rbound + lbound) / 2)
205                 switch (comp_keys
206                         ((struct reiserfs_key *)((char *)base + j * width),
207                          (struct cpu_key *)key)) {
208                 case -1:
209                         lbound = j + 1;
210                         continue;
211                 case 1:
212                         rbound = j - 1;
213                         continue;
214                 case 0:
215                         *pos = j;
216                         return ITEM_FOUND;      /* Key found in the array.  */
217                 }
218
219         /* bin_search did not find given key, it returns position of key,
220            that is minimal and greater than the given one. */
221         *pos = lbound;
222         return ITEM_NOT_FOUND;
223 }
224
225 #ifdef CONFIG_REISERFS_CHECK
226 extern struct tree_balance *cur_tb;
227 #endif
228
229 /* Minimal possible key. It is never in the tree. */
230 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
231
232 /* Maximal possible key. It is never in the tree. */
233 static const struct reiserfs_key MAX_KEY = {
234         __constant_cpu_to_le32(0xffffffff),
235         __constant_cpu_to_le32(0xffffffff),
236         {{__constant_cpu_to_le32(0xffffffff),
237           __constant_cpu_to_le32(0xffffffff)},}
238 };
239
240 /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
241    of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
242    the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
243    case we return a special key, either MIN_KEY or MAX_KEY. */
244 static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
245                                                   const struct super_block *sb)
246 {
247         int position, path_offset = chk_path->path_length;
248         struct buffer_head *parent;
249
250         RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
251                "PAP-5010: invalid offset in the path");
252
253         /* While not higher in path than first element. */
254         while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
255
256                 RFALSE(!buffer_uptodate
257                        (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
258                        "PAP-5020: parent is not uptodate");
259
260                 /* Parent at the path is not in the tree now. */
261                 if (!B_IS_IN_TREE
262                     (parent =
263                      PATH_OFFSET_PBUFFER(chk_path, path_offset)))
264                         return &MAX_KEY;
265                 /* Check whether position in the parent is correct. */
266                 if ((position =
267                      PATH_OFFSET_POSITION(chk_path,
268                                           path_offset)) >
269                     B_NR_ITEMS(parent))
270                         return &MAX_KEY;
271                 /* Check whether parent at the path really points to the child. */
272                 if (B_N_CHILD_NUM(parent, position) !=
273                     PATH_OFFSET_PBUFFER(chk_path,
274                                         path_offset + 1)->b_blocknr)
275                         return &MAX_KEY;
276                 /* Return delimiting key if position in the parent is not equal to zero. */
277                 if (position)
278                         return B_N_PDELIM_KEY(parent, position - 1);
279         }
280         /* Return MIN_KEY if we are in the root of the buffer tree. */
281         if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
282             b_blocknr == SB_ROOT_BLOCK(sb))
283                 return &MIN_KEY;
284         return &MAX_KEY;
285 }
286
287 /* Get delimiting key of the buffer at the path and its right neighbor. */
288 inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
289                                            const struct super_block *sb)
290 {
291         int position, path_offset = chk_path->path_length;
292         struct buffer_head *parent;
293
294         RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
295                "PAP-5030: invalid offset in the path");
296
297         while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
298
299                 RFALSE(!buffer_uptodate
300                        (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
301                        "PAP-5040: parent is not uptodate");
302
303                 /* Parent at the path is not in the tree now. */
304                 if (!B_IS_IN_TREE
305                     (parent =
306                      PATH_OFFSET_PBUFFER(chk_path, path_offset)))
307                         return &MIN_KEY;
308                 /* Check whether position in the parent is correct. */
309                 if ((position =
310                      PATH_OFFSET_POSITION(chk_path,
311                                           path_offset)) >
312                     B_NR_ITEMS(parent))
313                         return &MIN_KEY;
314                 /* Check whether parent at the path really points to the child. */
315                 if (B_N_CHILD_NUM(parent, position) !=
316                     PATH_OFFSET_PBUFFER(chk_path,
317                                         path_offset + 1)->b_blocknr)
318                         return &MIN_KEY;
319                 /* Return delimiting key if position in the parent is not the last one. */
320                 if (position != B_NR_ITEMS(parent))
321                         return B_N_PDELIM_KEY(parent, position);
322         }
323         /* Return MAX_KEY if we are in the root of the buffer tree. */
324         if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
325             b_blocknr == SB_ROOT_BLOCK(sb))
326                 return &MAX_KEY;
327         return &MIN_KEY;
328 }
329
330 /* Check whether a key is contained in the tree rooted from a buffer at a path. */
331 /* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
332    the path.  These delimiting keys are stored at least one level above that buffer in the tree. If the
333    buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
334    this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
335 static inline int key_in_buffer(struct treepath *chk_path,      /* Path which should be checked.  */
336                                 const struct cpu_key *key,      /* Key which should be checked.   */
337                                 struct super_block *sb
338     )
339 {
340
341         RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
342                || chk_path->path_length > MAX_HEIGHT,
343                "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
344                key, chk_path->path_length);
345         RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
346                "PAP-5060: device must not be NODEV");
347
348         if (comp_keys(get_lkey(chk_path, sb), key) == 1)
349                 /* left delimiting key is bigger, that the key we look for */
350                 return 0;
351         /*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
352         if (comp_keys(get_rkey(chk_path, sb), key) != 1)
353                 /* key must be less than right delimitiing key */
354                 return 0;
355         return 1;
356 }
357
358 int reiserfs_check_path(struct treepath *p)
359 {
360         RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
361                "path not properly relsed");
362         return 0;
363 }
364
365 /* Drop the reference to each buffer in a path and restore
366  * dirty bits clean when preparing the buffer for the log.
367  * This version should only be called from fix_nodes() */
368 void pathrelse_and_restore(struct super_block *sb,
369                            struct treepath *search_path)
370 {
371         int path_offset = search_path->path_length;
372
373         RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
374                "clm-4000: invalid path offset");
375
376         while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
377                 struct buffer_head *bh;
378                 bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
379                 reiserfs_restore_prepared_buffer(sb, bh);
380                 brelse(bh);
381         }
382         search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
383 }
384
385 /* Drop the reference to each buffer in a path */
386 void pathrelse(struct treepath *search_path)
387 {
388         int path_offset = search_path->path_length;
389
390         RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
391                "PAP-5090: invalid path offset");
392
393         while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
394                 brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
395
396         search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
397 }
398
399 static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
400 {
401         struct block_head *blkh;
402         struct item_head *ih;
403         int used_space;
404         int prev_location;
405         int i;
406         int nr;
407
408         blkh = (struct block_head *)buf;
409         if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
410                 reiserfs_warning(NULL, "reiserfs-5080",
411                                  "this should be caught earlier");
412                 return 0;
413         }
414
415         nr = blkh_nr_item(blkh);
416         if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
417                 /* item number is too big or too small */
418                 reiserfs_warning(NULL, "reiserfs-5081",
419                                  "nr_item seems wrong: %z", bh);
420                 return 0;
421         }
422         ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
423         used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
424         if (used_space != blocksize - blkh_free_space(blkh)) {
425                 /* free space does not match to calculated amount of use space */
426                 reiserfs_warning(NULL, "reiserfs-5082",
427                                  "free space seems wrong: %z", bh);
428                 return 0;
429         }
430         // FIXME: it is_leaf will hit performance too much - we may have
431         // return 1 here
432
433         /* check tables of item heads */
434         ih = (struct item_head *)(buf + BLKH_SIZE);
435         prev_location = blocksize;
436         for (i = 0; i < nr; i++, ih++) {
437                 if (le_ih_k_type(ih) == TYPE_ANY) {
438                         reiserfs_warning(NULL, "reiserfs-5083",
439                                          "wrong item type for item %h",
440                                          ih);
441                         return 0;
442                 }
443                 if (ih_location(ih) >= blocksize
444                     || ih_location(ih) < IH_SIZE * nr) {
445                         reiserfs_warning(NULL, "reiserfs-5084",
446                                          "item location seems wrong: %h",
447                                          ih);
448                         return 0;
449                 }
450                 if (ih_item_len(ih) < 1
451                     || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
452                         reiserfs_warning(NULL, "reiserfs-5085",
453                                          "item length seems wrong: %h",
454                                          ih);
455                         return 0;
456                 }
457                 if (prev_location - ih_location(ih) != ih_item_len(ih)) {
458                         reiserfs_warning(NULL, "reiserfs-5086",
459                                          "item location seems wrong "
460                                          "(second one): %h", ih);
461                         return 0;
462                 }
463                 prev_location = ih_location(ih);
464         }
465
466         // one may imagine much more checks
467         return 1;
468 }
469
470 /* returns 1 if buf looks like an internal node, 0 otherwise */
471 static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
472 {
473         struct block_head *blkh;
474         int nr;
475         int used_space;
476
477         blkh = (struct block_head *)buf;
478         nr = blkh_level(blkh);
479         if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
480                 /* this level is not possible for internal nodes */
481                 reiserfs_warning(NULL, "reiserfs-5087",
482                                  "this should be caught earlier");
483                 return 0;
484         }
485
486         nr = blkh_nr_item(blkh);
487         if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
488                 /* for internal which is not root we might check min number of keys */
489                 reiserfs_warning(NULL, "reiserfs-5088",
490                                  "number of key seems wrong: %z", bh);
491                 return 0;
492         }
493
494         used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
495         if (used_space != blocksize - blkh_free_space(blkh)) {
496                 reiserfs_warning(NULL, "reiserfs-5089",
497                                  "free space seems wrong: %z", bh);
498                 return 0;
499         }
500         // one may imagine much more checks
501         return 1;
502 }
503
504 // make sure that bh contains formatted node of reiserfs tree of
505 // 'level'-th level
506 static int is_tree_node(struct buffer_head *bh, int level)
507 {
508         if (B_LEVEL(bh) != level) {
509                 reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
510                                  "not match to the expected one %d",
511                                  B_LEVEL(bh), level);
512                 return 0;
513         }
514         if (level == DISK_LEAF_NODE_LEVEL)
515                 return is_leaf(bh->b_data, bh->b_size, bh);
516
517         return is_internal(bh->b_data, bh->b_size, bh);
518 }
519
520 #define SEARCH_BY_KEY_READA 16
521
522 /*
523  * The function is NOT SCHEDULE-SAFE!
524  * It might unlock the write lock if we needed to wait for a block
525  * to be read. Note that in this case it won't recover the lock to avoid
526  * high contention resulting from too much lock requests, especially
527  * the caller (search_by_key) will perform other schedule-unsafe
528  * operations just after calling this function.
529  *
530  * @return true if we have unlocked
531  */
532 static bool search_by_key_reada(struct super_block *s,
533                                 struct buffer_head **bh,
534                                 b_blocknr_t *b, int num)
535 {
536         int i, j;
537         bool unlocked = false;
538
539         for (i = 0; i < num; i++) {
540                 bh[i] = sb_getblk(s, b[i]);
541         }
542         /*
543          * We are going to read some blocks on which we
544          * have a reference. It's safe, though we might be
545          * reading blocks concurrently changed if we release
546          * the lock. But it's still fine because we check later
547          * if the tree changed
548          */
549         for (j = 0; j < i; j++) {
550                 /*
551                  * note, this needs attention if we are getting rid of the BKL
552                  * you have to make sure the prepared bit isn't set on this buffer
553                  */
554                 if (!buffer_uptodate(bh[j])) {
555                         if (!unlocked) {
556                                 reiserfs_write_unlock(s);
557                                 unlocked = true;
558                         }
559                         ll_rw_block(READA, 1, bh + j);
560                 }
561                 brelse(bh[j]);
562         }
563         return unlocked;
564 }
565
566 /**************************************************************************
567  * Algorithm   SearchByKey                                                *
568  *             look for item in the Disk S+Tree by its key                *
569  * Input:  sb   -  super block                                            *
570  *         key  - pointer to the key to search                            *
571  * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR                         *
572  *         search_path - path from the root to the needed leaf            *
573  **************************************************************************/
574
575 /* This function fills up the path from the root to the leaf as it
576    descends the tree looking for the key.  It uses reiserfs_bread to
577    try to find buffers in the cache given their block number.  If it
578    does not find them in the cache it reads them from disk.  For each
579    node search_by_key finds using reiserfs_bread it then uses
580    bin_search to look through that node.  bin_search will find the
581    position of the block_number of the next node if it is looking
582    through an internal node.  If it is looking through a leaf node
583    bin_search will find the position of the item which has key either
584    equal to given key, or which is the maximal key less than the given
585    key.  search_by_key returns a path that must be checked for the
586    correctness of the top of the path but need not be checked for the
587    correctness of the bottom of the path */
588 /* The function is NOT SCHEDULE-SAFE! */
589 int search_by_key(struct super_block *sb, const struct cpu_key *key,    /* Key to search. */
590                   struct treepath *search_path,/* This structure was
591                                                    allocated and initialized
592                                                    by the calling
593                                                    function. It is filled up
594                                                    by this function.  */
595                   int stop_level        /* How far down the tree to search. To
596                                            stop at leaf level - set to
597                                            DISK_LEAF_NODE_LEVEL */
598     )
599 {
600         b_blocknr_t block_number;
601         int expected_level;
602         struct buffer_head *bh;
603         struct path_element *last_element;
604         int node_level, retval;
605         int right_neighbor_of_leaf_node;
606         int fs_gen;
607         struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
608         b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
609         int reada_count = 0;
610
611 #ifdef CONFIG_REISERFS_CHECK
612         int repeat_counter = 0;
613 #endif
614
615         PROC_INFO_INC(sb, search_by_key);
616
617         /* As we add each node to a path we increase its count.  This means that
618            we must be careful to release all nodes in a path before we either
619            discard the path struct or re-use the path struct, as we do here. */
620
621         pathrelse(search_path);
622
623         right_neighbor_of_leaf_node = 0;
624
625         /* With each iteration of this loop we search through the items in the
626            current node, and calculate the next current node(next path element)
627            for the next iteration of this loop.. */
628         block_number = SB_ROOT_BLOCK(sb);
629         expected_level = -1;
630         while (1) {
631
632 #ifdef CONFIG_REISERFS_CHECK
633                 if (!(++repeat_counter % 50000))
634                         reiserfs_warning(sb, "PAP-5100",
635                                          "%s: there were %d iterations of "
636                                          "while loop looking for key %K",
637                                          current->comm, repeat_counter,
638                                          key);
639 #endif
640
641                 /* prep path to have another element added to it. */
642                 last_element =
643                     PATH_OFFSET_PELEMENT(search_path,
644                                          ++search_path->path_length);
645                 fs_gen = get_generation(sb);
646
647                 /* Read the next tree node, and set the last element in the path to
648                    have a pointer to it. */
649                 if ((bh = last_element->pe_buffer =
650                      sb_getblk(sb, block_number))) {
651                         bool unlocked = false;
652
653                         if (!buffer_uptodate(bh) && reada_count > 1)
654                                 /* may unlock the write lock */
655                                 unlocked = search_by_key_reada(sb, reada_bh,
656                                                     reada_blocks, reada_count);
657                         /*
658                          * If we haven't already unlocked the write lock,
659                          * then we need to do that here before reading
660                          * the current block
661                          */
662                         if (!buffer_uptodate(bh) && !unlocked) {
663                                 reiserfs_write_unlock(sb);
664                                 unlocked = true;
665                         }
666                         ll_rw_block(READ, 1, &bh);
667                         wait_on_buffer(bh);
668
669                         if (unlocked)
670                                 reiserfs_write_lock(sb);
671                         if (!buffer_uptodate(bh))
672                                 goto io_error;
673                 } else {
674                       io_error:
675                         search_path->path_length--;
676                         pathrelse(search_path);
677                         return IO_ERROR;
678                 }
679                 reada_count = 0;
680                 if (expected_level == -1)
681                         expected_level = SB_TREE_HEIGHT(sb);
682                 expected_level--;
683
684                 /* It is possible that schedule occurred. We must check whether the key
685                    to search is still in the tree rooted from the current buffer. If
686                    not then repeat search from the root. */
687                 if (fs_changed(fs_gen, sb) &&
688                     (!B_IS_IN_TREE(bh) ||
689                      B_LEVEL(bh) != expected_level ||
690                      !key_in_buffer(search_path, key, sb))) {
691                         PROC_INFO_INC(sb, search_by_key_fs_changed);
692                         PROC_INFO_INC(sb, search_by_key_restarted);
693                         PROC_INFO_INC(sb,
694                                       sbk_restarted[expected_level - 1]);
695                         pathrelse(search_path);
696
697                         /* Get the root block number so that we can repeat the search
698                            starting from the root. */
699                         block_number = SB_ROOT_BLOCK(sb);
700                         expected_level = -1;
701                         right_neighbor_of_leaf_node = 0;
702
703                         /* repeat search from the root */
704                         continue;
705                 }
706
707                 /* only check that the key is in the buffer if key is not
708                    equal to the MAX_KEY. Latter case is only possible in
709                    "finish_unfinished()" processing during mount. */
710                 RFALSE(comp_keys(&MAX_KEY, key) &&
711                        !key_in_buffer(search_path, key, sb),
712                        "PAP-5130: key is not in the buffer");
713 #ifdef CONFIG_REISERFS_CHECK
714                 if (cur_tb) {
715                         print_cur_tb("5140");
716                         reiserfs_panic(sb, "PAP-5140",
717                                        "schedule occurred in do_balance!");
718                 }
719 #endif
720
721                 // make sure, that the node contents look like a node of
722                 // certain level
723                 if (!is_tree_node(bh, expected_level)) {
724                         reiserfs_error(sb, "vs-5150",
725                                        "invalid format found in block %ld. "
726                                        "Fsck?", bh->b_blocknr);
727                         pathrelse(search_path);
728                         return IO_ERROR;
729                 }
730
731                 /* ok, we have acquired next formatted node in the tree */
732                 node_level = B_LEVEL(bh);
733
734                 PROC_INFO_BH_STAT(sb, bh, node_level - 1);
735
736                 RFALSE(node_level < stop_level,
737                        "vs-5152: tree level (%d) is less than stop level (%d)",
738                        node_level, stop_level);
739
740                 retval = bin_search(key, B_N_PITEM_HEAD(bh, 0),
741                                       B_NR_ITEMS(bh),
742                                       (node_level ==
743                                        DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
744                                       KEY_SIZE,
745                                       &(last_element->pe_position));
746                 if (node_level == stop_level) {
747                         return retval;
748                 }
749
750                 /* we are not in the stop level */
751                 if (retval == ITEM_FOUND)
752                         /* item has been found, so we choose the pointer which is to the right of the found one */
753                         last_element->pe_position++;
754
755                 /* if item was not found we choose the position which is to
756                    the left of the found item. This requires no code,
757                    bin_search did it already. */
758
759                 /* So we have chosen a position in the current node which is
760                    an internal node.  Now we calculate child block number by
761                    position in the node. */
762                 block_number =
763                     B_N_CHILD_NUM(bh, last_element->pe_position);
764
765                 /* if we are going to read leaf nodes, try for read ahead as well */
766                 if ((search_path->reada & PATH_READA) &&
767                     node_level == DISK_LEAF_NODE_LEVEL + 1) {
768                         int pos = last_element->pe_position;
769                         int limit = B_NR_ITEMS(bh);
770                         struct reiserfs_key *le_key;
771
772                         if (search_path->reada & PATH_READA_BACK)
773                                 limit = 0;
774                         while (reada_count < SEARCH_BY_KEY_READA) {
775                                 if (pos == limit)
776                                         break;
777                                 reada_blocks[reada_count++] =
778                                     B_N_CHILD_NUM(bh, pos);
779                                 if (search_path->reada & PATH_READA_BACK)
780                                         pos--;
781                                 else
782                                         pos++;
783
784                                 /*
785                                  * check to make sure we're in the same object
786                                  */
787                                 le_key = B_N_PDELIM_KEY(bh, pos);
788                                 if (le32_to_cpu(le_key->k_objectid) !=
789                                     key->on_disk_key.k_objectid) {
790                                         break;
791                                 }
792                         }
793                 }
794         }
795 }
796
797 /* Form the path to an item and position in this item which contains
798    file byte defined by key. If there is no such item
799    corresponding to the key, we point the path to the item with
800    maximal key less than key, and *pos_in_item is set to one
801    past the last entry/byte in the item.  If searching for entry in a
802    directory item, and it is not found, *pos_in_item is set to one
803    entry more than the entry with maximal key which is less than the
804    sought key.
805
806    Note that if there is no entry in this same node which is one more,
807    then we point to an imaginary entry.  for direct items, the
808    position is in units of bytes, for indirect items the position is
809    in units of blocknr entries, for directory items the position is in
810    units of directory entries.  */
811
812 /* The function is NOT SCHEDULE-SAFE! */
813 int search_for_position_by_key(struct super_block *sb,  /* Pointer to the super block.          */
814                                const struct cpu_key *p_cpu_key, /* Key to search (cpu variable)         */
815                                struct treepath *search_path     /* Filled up by this function.          */
816     )
817 {
818         struct item_head *p_le_ih;      /* pointer to on-disk structure */
819         int blk_size;
820         loff_t item_offset, offset;
821         struct reiserfs_dir_entry de;
822         int retval;
823
824         /* If searching for directory entry. */
825         if (is_direntry_cpu_key(p_cpu_key))
826                 return search_by_entry_key(sb, p_cpu_key, search_path,
827                                            &de);
828
829         /* If not searching for directory entry. */
830
831         /* If item is found. */
832         retval = search_item(sb, p_cpu_key, search_path);
833         if (retval == IO_ERROR)
834                 return retval;
835         if (retval == ITEM_FOUND) {
836
837                 RFALSE(!ih_item_len
838                        (B_N_PITEM_HEAD
839                         (PATH_PLAST_BUFFER(search_path),
840                          PATH_LAST_POSITION(search_path))),
841                        "PAP-5165: item length equals zero");
842
843                 pos_in_item(search_path) = 0;
844                 return POSITION_FOUND;
845         }
846
847         RFALSE(!PATH_LAST_POSITION(search_path),
848                "PAP-5170: position equals zero");
849
850         /* Item is not found. Set path to the previous item. */
851         p_le_ih =
852             B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path),
853                            --PATH_LAST_POSITION(search_path));
854         blk_size = sb->s_blocksize;
855
856         if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
857                 return FILE_NOT_FOUND;
858         }
859         // FIXME: quite ugly this far
860
861         item_offset = le_ih_k_offset(p_le_ih);
862         offset = cpu_key_k_offset(p_cpu_key);
863
864         /* Needed byte is contained in the item pointed to by the path. */
865         if (item_offset <= offset &&
866             item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
867                 pos_in_item(search_path) = offset - item_offset;
868                 if (is_indirect_le_ih(p_le_ih)) {
869                         pos_in_item(search_path) /= blk_size;
870                 }
871                 return POSITION_FOUND;
872         }
873
874         /* Needed byte is not contained in the item pointed to by the
875            path. Set pos_in_item out of the item. */
876         if (is_indirect_le_ih(p_le_ih))
877                 pos_in_item(search_path) =
878                     ih_item_len(p_le_ih) / UNFM_P_SIZE;
879         else
880                 pos_in_item(search_path) = ih_item_len(p_le_ih);
881
882         return POSITION_NOT_FOUND;
883 }
884
885 /* Compare given item and item pointed to by the path. */
886 int comp_items(const struct item_head *stored_ih, const struct treepath *path)
887 {
888         struct buffer_head *bh = PATH_PLAST_BUFFER(path);
889         struct item_head *ih;
890
891         /* Last buffer at the path is not in the tree. */
892         if (!B_IS_IN_TREE(bh))
893                 return 1;
894
895         /* Last path position is invalid. */
896         if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
897                 return 1;
898
899         /* we need only to know, whether it is the same item */
900         ih = get_ih(path);
901         return memcmp(stored_ih, ih, IH_SIZE);
902 }
903
904 /* unformatted nodes are not logged anymore, ever.  This is safe
905 ** now
906 */
907 #define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
908
909 // block can not be forgotten as it is in I/O or held by someone
910 #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
911
912 // prepare for delete or cut of direct item
913 static inline int prepare_for_direct_item(struct treepath *path,
914                                           struct item_head *le_ih,
915                                           struct inode *inode,
916                                           loff_t new_file_length, int *cut_size)
917 {
918         loff_t round_len;
919
920         if (new_file_length == max_reiserfs_offset(inode)) {
921                 /* item has to be deleted */
922                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
923                 return M_DELETE;
924         }
925         // new file gets truncated
926         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
927                 //
928                 round_len = ROUND_UP(new_file_length);
929                 /* this was new_file_length < le_ih ... */
930                 if (round_len < le_ih_k_offset(le_ih)) {
931                         *cut_size = -(IH_SIZE + ih_item_len(le_ih));
932                         return M_DELETE;        /* Delete this item. */
933                 }
934                 /* Calculate first position and size for cutting from item. */
935                 pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
936                 *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
937
938                 return M_CUT;   /* Cut from this item. */
939         }
940
941         // old file: items may have any length
942
943         if (new_file_length < le_ih_k_offset(le_ih)) {
944                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
945                 return M_DELETE;        /* Delete this item. */
946         }
947         /* Calculate first position and size for cutting from item. */
948         *cut_size = -(ih_item_len(le_ih) -
949                       (pos_in_item(path) =
950                        new_file_length + 1 - le_ih_k_offset(le_ih)));
951         return M_CUT;           /* Cut from this item. */
952 }
953
954 static inline int prepare_for_direntry_item(struct treepath *path,
955                                             struct item_head *le_ih,
956                                             struct inode *inode,
957                                             loff_t new_file_length,
958                                             int *cut_size)
959 {
960         if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
961             new_file_length == max_reiserfs_offset(inode)) {
962                 RFALSE(ih_entry_count(le_ih) != 2,
963                        "PAP-5220: incorrect empty directory item (%h)", le_ih);
964                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
965                 return M_DELETE;        /* Delete the directory item containing "." and ".." entry. */
966         }
967
968         if (ih_entry_count(le_ih) == 1) {
969                 /* Delete the directory item such as there is one record only
970                    in this item */
971                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
972                 return M_DELETE;
973         }
974
975         /* Cut one record from the directory item. */
976         *cut_size =
977             -(DEH_SIZE +
978               entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
979         return M_CUT;
980 }
981
982 #define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
983
984 /*  If the path points to a directory or direct item, calculate mode and the size cut, for balance.
985     If the path points to an indirect item, remove some number of its unformatted nodes.
986     In case of file truncate calculate whether this item must be deleted/truncated or last
987     unformatted node of this item will be converted to a direct item.
988     This function returns a determination of what balance mode the calling function should employ. */
989 static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed
990                                                                                                                                                                                    from end of the file. */
991                                       int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */
992     )
993 {
994         struct super_block *sb = inode->i_sb;
995         struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
996         struct buffer_head *bh = PATH_PLAST_BUFFER(path);
997
998         BUG_ON(!th->t_trans_id);
999
1000         /* Stat_data item. */
1001         if (is_statdata_le_ih(p_le_ih)) {
1002
1003                 RFALSE(new_file_length != max_reiserfs_offset(inode),
1004                        "PAP-5210: mode must be M_DELETE");
1005
1006                 *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
1007                 return M_DELETE;
1008         }
1009
1010         /* Directory item. */
1011         if (is_direntry_le_ih(p_le_ih))
1012                 return prepare_for_direntry_item(path, p_le_ih, inode,
1013                                                  new_file_length,
1014                                                  cut_size);
1015
1016         /* Direct item. */
1017         if (is_direct_le_ih(p_le_ih))
1018                 return prepare_for_direct_item(path, p_le_ih, inode,
1019                                                new_file_length, cut_size);
1020
1021         /* Case of an indirect item. */
1022         {
1023             int blk_size = sb->s_blocksize;
1024             struct item_head s_ih;
1025             int need_re_search;
1026             int delete = 0;
1027             int result = M_CUT;
1028             int pos = 0;
1029
1030             if ( new_file_length == max_reiserfs_offset (inode) ) {
1031                 /* prepare_for_delete_or_cut() is called by
1032                  * reiserfs_delete_item() */
1033                 new_file_length = 0;
1034                 delete = 1;
1035             }
1036
1037             do {
1038                 need_re_search = 0;
1039                 *cut_size = 0;
1040                 bh = PATH_PLAST_BUFFER(path);
1041                 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
1042                 pos = I_UNFM_NUM(&s_ih);
1043
1044                 while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
1045                     __le32 *unfm;
1046                     __u32 block;
1047
1048                     /* Each unformatted block deletion may involve one additional
1049                      * bitmap block into the transaction, thereby the initial
1050                      * journal space reservation might not be enough. */
1051                     if (!delete && (*cut_size) != 0 &&
1052                         reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
1053                         break;
1054
1055                     unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1;
1056                     block = get_block_num(unfm, 0);
1057
1058                     if (block != 0) {
1059                         reiserfs_prepare_for_journal(sb, bh, 1);
1060                         put_block_num(unfm, 0, 0);
1061                         journal_mark_dirty(th, sb, bh);
1062                         reiserfs_free_block(th, inode, block, 1);
1063                     }
1064
1065                     reiserfs_write_unlock(sb);
1066                     cond_resched();
1067                     reiserfs_write_lock(sb);
1068
1069                     if (item_moved (&s_ih, path))  {
1070                         need_re_search = 1;
1071                         break;
1072                     }
1073
1074                     pos --;
1075                     (*removed)++;
1076                     (*cut_size) -= UNFM_P_SIZE;
1077
1078                     if (pos == 0) {
1079                         (*cut_size) -= IH_SIZE;
1080                         result = M_DELETE;
1081                         break;
1082                     }
1083                 }
1084                 /* a trick.  If the buffer has been logged, this will do nothing.  If
1085                 ** we've broken the loop without logging it, it will restore the
1086                 ** buffer */
1087                 reiserfs_restore_prepared_buffer(sb, bh);
1088             } while (need_re_search &&
1089                      search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
1090             pos_in_item(path) = pos * UNFM_P_SIZE;
1091
1092             if (*cut_size == 0) {
1093                 /* Nothing were cut. maybe convert last unformatted node to the
1094                  * direct item? */
1095                 result = M_CONVERT;
1096             }
1097             return result;
1098         }
1099 }
1100
1101 /* Calculate number of bytes which will be deleted or cut during balance */
1102 static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
1103 {
1104         int del_size;
1105         struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path);
1106
1107         if (is_statdata_le_ih(p_le_ih))
1108                 return 0;
1109
1110         del_size =
1111             (mode ==
1112              M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
1113         if (is_direntry_le_ih(p_le_ih)) {
1114                 /* return EMPTY_DIR_SIZE; We delete emty directoris only.
1115                  * we can't use EMPTY_DIR_SIZE, as old format dirs have a different
1116                  * empty size.  ick. FIXME, is this right? */
1117                 return del_size;
1118         }
1119
1120         if (is_indirect_le_ih(p_le_ih))
1121                 del_size = (del_size / UNFM_P_SIZE) *
1122                                 (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
1123         return del_size;
1124 }
1125
1126 static void init_tb_struct(struct reiserfs_transaction_handle *th,
1127                            struct tree_balance *tb,
1128                            struct super_block *sb,
1129                            struct treepath *path, int size)
1130 {
1131
1132         BUG_ON(!th->t_trans_id);
1133
1134         memset(tb, '\0', sizeof(struct tree_balance));
1135         tb->transaction_handle = th;
1136         tb->tb_sb = sb;
1137         tb->tb_path = path;
1138         PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
1139         PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
1140         tb->insert_size[0] = size;
1141 }
1142
1143 void padd_item(char *item, int total_length, int length)
1144 {
1145         int i;
1146
1147         for (i = total_length; i > length;)
1148                 item[--i] = 0;
1149 }
1150
1151 #ifdef REISERQUOTA_DEBUG
1152 char key2type(struct reiserfs_key *ih)
1153 {
1154         if (is_direntry_le_key(2, ih))
1155                 return 'd';
1156         if (is_direct_le_key(2, ih))
1157                 return 'D';
1158         if (is_indirect_le_key(2, ih))
1159                 return 'i';
1160         if (is_statdata_le_key(2, ih))
1161                 return 's';
1162         return 'u';
1163 }
1164
1165 char head2type(struct item_head *ih)
1166 {
1167         if (is_direntry_le_ih(ih))
1168                 return 'd';
1169         if (is_direct_le_ih(ih))
1170                 return 'D';
1171         if (is_indirect_le_ih(ih))
1172                 return 'i';
1173         if (is_statdata_le_ih(ih))
1174                 return 's';
1175         return 'u';
1176 }
1177 #endif
1178
1179 /* Delete object item.
1180  * th       - active transaction handle
1181  * path     - path to the deleted item
1182  * item_key - key to search for the deleted item
1183  * indode   - used for updating i_blocks and quotas
1184  * un_bh    - NULL or unformatted node pointer
1185  */
1186 int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1187                          struct treepath *path, const struct cpu_key *item_key,
1188                          struct inode *inode, struct buffer_head *un_bh)
1189 {
1190         struct super_block *sb = inode->i_sb;
1191         struct tree_balance s_del_balance;
1192         struct item_head s_ih;
1193         struct item_head *q_ih;
1194         int quota_cut_bytes;
1195         int ret_value, del_size, removed;
1196
1197 #ifdef CONFIG_REISERFS_CHECK
1198         char mode;
1199         int iter = 0;
1200 #endif
1201
1202         BUG_ON(!th->t_trans_id);
1203
1204         init_tb_struct(th, &s_del_balance, sb, path,
1205                        0 /*size is unknown */ );
1206
1207         while (1) {
1208                 removed = 0;
1209
1210 #ifdef CONFIG_REISERFS_CHECK
1211                 iter++;
1212                 mode =
1213 #endif
1214                     prepare_for_delete_or_cut(th, inode, path,
1215                                               item_key, &removed,
1216                                               &del_size,
1217                                               max_reiserfs_offset(inode));
1218
1219                 RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
1220
1221                 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
1222                 s_del_balance.insert_size[0] = del_size;
1223
1224                 ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
1225                 if (ret_value != REPEAT_SEARCH)
1226                         break;
1227
1228                 PROC_INFO_INC(sb, delete_item_restarted);
1229
1230                 // file system changed, repeat search
1231                 ret_value =
1232                     search_for_position_by_key(sb, item_key, path);
1233                 if (ret_value == IO_ERROR)
1234                         break;
1235                 if (ret_value == FILE_NOT_FOUND) {
1236                         reiserfs_warning(sb, "vs-5340",
1237                                          "no items of the file %K found",
1238                                          item_key);
1239                         break;
1240                 }
1241         }                       /* while (1) */
1242
1243         if (ret_value != CARRY_ON) {
1244                 unfix_nodes(&s_del_balance);
1245                 return 0;
1246         }
1247         // reiserfs_delete_item returns item length when success
1248         ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
1249         q_ih = get_ih(path);
1250         quota_cut_bytes = ih_item_len(q_ih);
1251
1252         /* hack so the quota code doesn't have to guess if the file
1253          ** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
1254          ** We test the offset because the tail might have been
1255          ** split into multiple items, and we only want to decrement for
1256          ** the unfm node once
1257          */
1258         if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
1259                 if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
1260                         quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1261                 } else {
1262                         quota_cut_bytes = 0;
1263                 }
1264         }
1265
1266         if (un_bh) {
1267                 int off;
1268                 char *data;
1269
1270                 /* We are in direct2indirect conversion, so move tail contents
1271                    to the unformatted node */
1272                 /* note, we do the copy before preparing the buffer because we
1273                  ** don't care about the contents of the unformatted node yet.
1274                  ** the only thing we really care about is the direct item's data
1275                  ** is in the unformatted node.
1276                  **
1277                  ** Otherwise, we would have to call reiserfs_prepare_for_journal on
1278                  ** the unformatted node, which might schedule, meaning we'd have to
1279                  ** loop all the way back up to the start of the while loop.
1280                  **
1281                  ** The unformatted node must be dirtied later on.  We can't be
1282                  ** sure here if the entire tail has been deleted yet.
1283                  **
1284                  ** un_bh is from the page cache (all unformatted nodes are
1285                  ** from the page cache) and might be a highmem page.  So, we
1286                  ** can't use un_bh->b_data.
1287                  ** -clm
1288                  */
1289
1290                 data = kmap_atomic(un_bh->b_page, KM_USER0);
1291                 off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
1292                 memcpy(data + off,
1293                        B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
1294                        ret_value);
1295                 kunmap_atomic(data, KM_USER0);
1296         }
1297         /* Perform balancing after all resources have been collected at once. */
1298         do_balance(&s_del_balance, NULL, NULL, M_DELETE);
1299
1300 #ifdef REISERQUOTA_DEBUG
1301         reiserfs_debug(sb, REISERFS_DEBUG_CODE,
1302                        "reiserquota delete_item(): freeing %u, id=%u type=%c",
1303                        quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1304 #endif
1305         vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
1306
1307         /* Return deleted body length */
1308         return ret_value;
1309 }
1310
1311 /* Summary Of Mechanisms For Handling Collisions Between Processes:
1312
1313  deletion of the body of the object is performed by iput(), with the
1314  result that if multiple processes are operating on a file, the
1315  deletion of the body of the file is deferred until the last process
1316  that has an open inode performs its iput().
1317
1318  writes and truncates are protected from collisions by use of
1319  semaphores.
1320
1321  creates, linking, and mknod are protected from collisions with other
1322  processes by making the reiserfs_add_entry() the last step in the
1323  creation, and then rolling back all changes if there was a collision.
1324  - Hans
1325 */
1326
1327 /* this deletes item which never gets split */
1328 void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1329                                 struct inode *inode, struct reiserfs_key *key)
1330 {
1331         struct tree_balance tb;
1332         INITIALIZE_PATH(path);
1333         int item_len = 0;
1334         int tb_init = 0;
1335         struct cpu_key cpu_key;
1336         int retval;
1337         int quota_cut_bytes = 0;
1338
1339         BUG_ON(!th->t_trans_id);
1340
1341         le_key2cpu_key(&cpu_key, key);
1342
1343         while (1) {
1344                 retval = search_item(th->t_super, &cpu_key, &path);
1345                 if (retval == IO_ERROR) {
1346                         reiserfs_error(th->t_super, "vs-5350",
1347                                        "i/o failure occurred trying "
1348                                        "to delete %K", &cpu_key);
1349                         break;
1350                 }
1351                 if (retval != ITEM_FOUND) {
1352                         pathrelse(&path);
1353                         // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
1354                         if (!
1355                             ((unsigned long long)
1356                              GET_HASH_VALUE(le_key_k_offset
1357                                             (le_key_version(key), key)) == 0
1358                              && (unsigned long long)
1359                              GET_GENERATION_NUMBER(le_key_k_offset
1360                                                    (le_key_version(key),
1361                                                     key)) == 1))
1362                                 reiserfs_warning(th->t_super, "vs-5355",
1363                                                  "%k not found", key);
1364                         break;
1365                 }
1366                 if (!tb_init) {
1367                         tb_init = 1;
1368                         item_len = ih_item_len(PATH_PITEM_HEAD(&path));
1369                         init_tb_struct(th, &tb, th->t_super, &path,
1370                                        -(IH_SIZE + item_len));
1371                 }
1372                 quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path));
1373
1374                 retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
1375                 if (retval == REPEAT_SEARCH) {
1376                         PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
1377                         continue;
1378                 }
1379
1380                 if (retval == CARRY_ON) {
1381                         do_balance(&tb, NULL, NULL, M_DELETE);
1382                         if (inode) {    /* Should we count quota for item? (we don't count quotas for save-links) */
1383 #ifdef REISERQUOTA_DEBUG
1384                                 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
1385                                                "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
1386                                                quota_cut_bytes, inode->i_uid,
1387                                                key2type(key));
1388 #endif
1389                                 vfs_dq_free_space_nodirty(inode,
1390                                                          quota_cut_bytes);
1391                         }
1392                         break;
1393                 }
1394                 // IO_ERROR, NO_DISK_SPACE, etc
1395                 reiserfs_warning(th->t_super, "vs-5360",
1396                                  "could not delete %K due to fix_nodes failure",
1397                                  &cpu_key);
1398                 unfix_nodes(&tb);
1399                 break;
1400         }
1401
1402         reiserfs_check_path(&path);
1403 }
1404
1405 int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
1406                            struct inode *inode)
1407 {
1408         int err;
1409         inode->i_size = 0;
1410         BUG_ON(!th->t_trans_id);
1411
1412         /* for directory this deletes item containing "." and ".." */
1413         err =
1414             reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
1415         if (err)
1416                 return err;
1417
1418 #if defined( USE_INODE_GENERATION_COUNTER )
1419         if (!old_format_only(th->t_super)) {
1420                 __le32 *inode_generation;
1421
1422                 inode_generation =
1423                     &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
1424                 le32_add_cpu(inode_generation, 1);
1425         }
1426 /* USE_INODE_GENERATION_COUNTER */
1427 #endif
1428         reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
1429
1430         return err;
1431 }
1432
1433 static void unmap_buffers(struct page *page, loff_t pos)
1434 {
1435         struct buffer_head *bh;
1436         struct buffer_head *head;
1437         struct buffer_head *next;
1438         unsigned long tail_index;
1439         unsigned long cur_index;
1440
1441         if (page) {
1442                 if (page_has_buffers(page)) {
1443                         tail_index = pos & (PAGE_CACHE_SIZE - 1);
1444                         cur_index = 0;
1445                         head = page_buffers(page);
1446                         bh = head;
1447                         do {
1448                                 next = bh->b_this_page;
1449
1450                                 /* we want to unmap the buffers that contain the tail, and
1451                                  ** all the buffers after it (since the tail must be at the
1452                                  ** end of the file).  We don't want to unmap file data
1453                                  ** before the tail, since it might be dirty and waiting to
1454                                  ** reach disk
1455                                  */
1456                                 cur_index += bh->b_size;
1457                                 if (cur_index > tail_index) {
1458                                         reiserfs_unmap_buffer(bh);
1459                                 }
1460                                 bh = next;
1461                         } while (bh != head);
1462                 }
1463         }
1464 }
1465
1466 static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
1467                                     struct inode *inode,
1468                                     struct page *page,
1469                                     struct treepath *path,
1470                                     const struct cpu_key *item_key,
1471                                     loff_t new_file_size, char *mode)
1472 {
1473         struct super_block *sb = inode->i_sb;
1474         int block_size = sb->s_blocksize;
1475         int cut_bytes;
1476         BUG_ON(!th->t_trans_id);
1477         BUG_ON(new_file_size != inode->i_size);
1478
1479         /* the page being sent in could be NULL if there was an i/o error
1480          ** reading in the last block.  The user will hit problems trying to
1481          ** read the file, but for now we just skip the indirect2direct
1482          */
1483         if (atomic_read(&inode->i_count) > 1 ||
1484             !tail_has_to_be_packed(inode) ||
1485             !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
1486                 /* leave tail in an unformatted node */
1487                 *mode = M_SKIP_BALANCING;
1488                 cut_bytes =
1489                     block_size - (new_file_size & (block_size - 1));
1490                 pathrelse(path);
1491                 return cut_bytes;
1492         }
1493         /* Perform the conversion to a direct_item. */
1494         /* return indirect_to_direct(inode, path, item_key,
1495                                   new_file_size, mode); */
1496         return indirect2direct(th, inode, page, path, item_key,
1497                                new_file_size, mode);
1498 }
1499
1500 /* we did indirect_to_direct conversion. And we have inserted direct
1501    item successesfully, but there were no disk space to cut unfm
1502    pointer being converted. Therefore we have to delete inserted
1503    direct item(s) */
1504 static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
1505                                          struct inode *inode, struct treepath *path)
1506 {
1507         struct cpu_key tail_key;
1508         int tail_len;
1509         int removed;
1510         BUG_ON(!th->t_trans_id);
1511
1512         make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);      // !!!!
1513         tail_key.key_length = 4;
1514
1515         tail_len =
1516             (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
1517         while (tail_len) {
1518                 /* look for the last byte of the tail */
1519                 if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
1520                     POSITION_NOT_FOUND)
1521                         reiserfs_panic(inode->i_sb, "vs-5615",
1522                                        "found invalid item");
1523                 RFALSE(path->pos_in_item !=
1524                        ih_item_len(PATH_PITEM_HEAD(path)) - 1,
1525                        "vs-5616: appended bytes found");
1526                 PATH_LAST_POSITION(path)--;
1527
1528                 removed =
1529                     reiserfs_delete_item(th, path, &tail_key, inode,
1530                                          NULL /*unbh not needed */ );
1531                 RFALSE(removed <= 0
1532                        || removed > tail_len,
1533                        "vs-5617: there was tail %d bytes, removed item length %d bytes",
1534                        tail_len, removed);
1535                 tail_len -= removed;
1536                 set_cpu_key_k_offset(&tail_key,
1537                                      cpu_key_k_offset(&tail_key) - removed);
1538         }
1539         reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
1540                          "conversion has been rolled back due to "
1541                          "lack of disk space");
1542         //mark_file_without_tail (inode);
1543         mark_inode_dirty(inode);
1544 }
1545
1546 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
1547 int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1548                            struct treepath *path,
1549                            struct cpu_key *item_key,
1550                            struct inode *inode,
1551                            struct page *page, loff_t new_file_size)
1552 {
1553         struct super_block *sb = inode->i_sb;
1554         /* Every function which is going to call do_balance must first
1555            create a tree_balance structure.  Then it must fill up this
1556            structure by using the init_tb_struct and fix_nodes functions.
1557            After that we can make tree balancing. */
1558         struct tree_balance s_cut_balance;
1559         struct item_head *p_le_ih;
1560         int cut_size = 0,       /* Amount to be cut. */
1561             ret_value = CARRY_ON, removed = 0,  /* Number of the removed unformatted nodes. */
1562             is_inode_locked = 0;
1563         char mode;              /* Mode of the balance. */
1564         int retval2 = -1;
1565         int quota_cut_bytes;
1566         loff_t tail_pos = 0;
1567
1568         BUG_ON(!th->t_trans_id);
1569
1570         init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
1571                        cut_size);
1572
1573         /* Repeat this loop until we either cut the item without needing
1574            to balance, or we fix_nodes without schedule occurring */
1575         while (1) {
1576                 /* Determine the balance mode, position of the first byte to
1577                    be cut, and size to be cut.  In case of the indirect item
1578                    free unformatted nodes which are pointed to by the cut
1579                    pointers. */
1580
1581                 mode =
1582                     prepare_for_delete_or_cut(th, inode, path,
1583                                               item_key, &removed,
1584                                               &cut_size, new_file_size);
1585                 if (mode == M_CONVERT) {
1586                         /* convert last unformatted node to direct item or leave
1587                            tail in the unformatted node */
1588                         RFALSE(ret_value != CARRY_ON,
1589                                "PAP-5570: can not convert twice");
1590
1591                         ret_value =
1592                             maybe_indirect_to_direct(th, inode, page,
1593                                                      path, item_key,
1594                                                      new_file_size, &mode);
1595                         if (mode == M_SKIP_BALANCING)
1596                                 /* tail has been left in the unformatted node */
1597                                 return ret_value;
1598
1599                         is_inode_locked = 1;
1600
1601                         /* removing of last unformatted node will change value we
1602                            have to return to truncate. Save it */
1603                         retval2 = ret_value;
1604                         /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */
1605
1606                         /* So, we have performed the first part of the conversion:
1607                            inserting the new direct item.  Now we are removing the
1608                            last unformatted node pointer. Set key to search for
1609                            it. */
1610                         set_cpu_key_k_type(item_key, TYPE_INDIRECT);
1611                         item_key->key_length = 4;
1612                         new_file_size -=
1613                             (new_file_size & (sb->s_blocksize - 1));
1614                         tail_pos = new_file_size;
1615                         set_cpu_key_k_offset(item_key, new_file_size + 1);
1616                         if (search_for_position_by_key
1617                             (sb, item_key,
1618                              path) == POSITION_NOT_FOUND) {
1619                                 print_block(PATH_PLAST_BUFFER(path), 3,
1620                                             PATH_LAST_POSITION(path) - 1,
1621                                             PATH_LAST_POSITION(path) + 1);
1622                                 reiserfs_panic(sb, "PAP-5580", "item to "
1623                                                "convert does not exist (%K)",
1624                                                item_key);
1625                         }
1626                         continue;
1627                 }
1628                 if (cut_size == 0) {
1629                         pathrelse(path);
1630                         return 0;
1631                 }
1632
1633                 s_cut_balance.insert_size[0] = cut_size;
1634
1635                 ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
1636                 if (ret_value != REPEAT_SEARCH)
1637                         break;
1638
1639                 PROC_INFO_INC(sb, cut_from_item_restarted);
1640
1641                 ret_value =
1642                     search_for_position_by_key(sb, item_key, path);
1643                 if (ret_value == POSITION_FOUND)
1644                         continue;
1645
1646                 reiserfs_warning(sb, "PAP-5610", "item %K not found",
1647                                  item_key);
1648                 unfix_nodes(&s_cut_balance);
1649                 return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
1650         }                       /* while */
1651
1652         // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
1653         if (ret_value != CARRY_ON) {
1654                 if (is_inode_locked) {
1655                         // FIXME: this seems to be not needed: we are always able
1656                         // to cut item
1657                         indirect_to_direct_roll_back(th, inode, path);
1658                 }
1659                 if (ret_value == NO_DISK_SPACE)
1660                         reiserfs_warning(sb, "reiserfs-5092",
1661                                          "NO_DISK_SPACE");
1662                 unfix_nodes(&s_cut_balance);
1663                 return -EIO;
1664         }
1665
1666         /* go ahead and perform balancing */
1667
1668         RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
1669
1670         /* Calculate number of bytes that need to be cut from the item. */
1671         quota_cut_bytes =
1672             (mode ==
1673              M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance.
1674             insert_size[0];
1675         if (retval2 == -1)
1676                 ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
1677         else
1678                 ret_value = retval2;
1679
1680         /* For direct items, we only change the quota when deleting the last
1681          ** item.
1682          */
1683         p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
1684         if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
1685                 if (mode == M_DELETE &&
1686                     (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
1687                     1) {
1688                         // FIXME: this is to keep 3.5 happy
1689                         REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
1690                         quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1691                 } else {
1692                         quota_cut_bytes = 0;
1693                 }
1694         }
1695 #ifdef CONFIG_REISERFS_CHECK
1696         if (is_inode_locked) {
1697                 struct item_head *le_ih =
1698                     PATH_PITEM_HEAD(s_cut_balance.tb_path);
1699                 /* we are going to complete indirect2direct conversion. Make
1700                    sure, that we exactly remove last unformatted node pointer
1701                    of the item */
1702                 if (!is_indirect_le_ih(le_ih))
1703                         reiserfs_panic(sb, "vs-5652",
1704                                        "item must be indirect %h", le_ih);
1705
1706                 if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
1707                         reiserfs_panic(sb, "vs-5653", "completing "
1708                                        "indirect2direct conversion indirect "
1709                                        "item %h being deleted must be of "
1710                                        "4 byte long", le_ih);
1711
1712                 if (mode == M_CUT
1713                     && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
1714                         reiserfs_panic(sb, "vs-5654", "can not complete "
1715                                        "indirect2direct conversion of %h "
1716                                        "(CUT, insert_size==%d)",
1717                                        le_ih, s_cut_balance.insert_size[0]);
1718                 }
1719                 /* it would be useful to make sure, that right neighboring
1720                    item is direct item of this file */
1721         }
1722 #endif
1723
1724         do_balance(&s_cut_balance, NULL, NULL, mode);
1725         if (is_inode_locked) {
1726                 /* we've done an indirect->direct conversion.  when the data block
1727                  ** was freed, it was removed from the list of blocks that must
1728                  ** be flushed before the transaction commits, make sure to
1729                  ** unmap and invalidate it
1730                  */
1731                 unmap_buffers(page, tail_pos);
1732                 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
1733         }
1734 #ifdef REISERQUOTA_DEBUG
1735         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
1736                        "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1737                        quota_cut_bytes, inode->i_uid, '?');
1738 #endif
1739         vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
1740         return ret_value;
1741 }
1742
1743 static void truncate_directory(struct reiserfs_transaction_handle *th,
1744                                struct inode *inode)
1745 {
1746         BUG_ON(!th->t_trans_id);
1747         if (inode->i_nlink)
1748                 reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
1749
1750         set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
1751         set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
1752         reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
1753         reiserfs_update_sd(th, inode);
1754         set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
1755         set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
1756 }
1757
1758 /* Truncate file to the new size. Note, this must be called with a transaction
1759    already started */
1760 int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
1761                           struct inode *inode,  /* ->i_size contains new size */
1762                          struct page *page,     /* up to date for last block */
1763                          int update_timestamps  /* when it is called by
1764                                                    file_release to convert
1765                                                    the tail - no timestamps
1766                                                    should be updated */
1767     )
1768 {
1769         INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
1770         struct item_head *p_le_ih;      /* Pointer to an item header. */
1771         struct cpu_key s_item_key;      /* Key to search for a previous file item. */
1772         loff_t file_size,       /* Old file size. */
1773          new_file_size; /* New file size. */
1774         int deleted;            /* Number of deleted or truncated bytes. */
1775         int retval;
1776         int err = 0;
1777
1778         BUG_ON(!th->t_trans_id);
1779         if (!
1780             (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1781              || S_ISLNK(inode->i_mode)))
1782                 return 0;
1783
1784         if (S_ISDIR(inode->i_mode)) {
1785                 // deletion of directory - no need to update timestamps
1786                 truncate_directory(th, inode);
1787                 return 0;
1788         }
1789
1790         /* Get new file size. */
1791         new_file_size = inode->i_size;
1792
1793         // FIXME: note, that key type is unimportant here
1794         make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
1795                      TYPE_DIRECT, 3);
1796
1797         retval =
1798             search_for_position_by_key(inode->i_sb, &s_item_key,
1799                                        &s_search_path);
1800         if (retval == IO_ERROR) {
1801                 reiserfs_error(inode->i_sb, "vs-5657",
1802                                "i/o failure occurred trying to truncate %K",
1803                                &s_item_key);
1804                 err = -EIO;
1805                 goto out;
1806         }
1807         if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
1808                 reiserfs_error(inode->i_sb, "PAP-5660",
1809                                "wrong result %d of search for %K", retval,
1810                                &s_item_key);
1811
1812                 err = -EIO;
1813                 goto out;
1814         }
1815
1816         s_search_path.pos_in_item--;
1817
1818         /* Get real file size (total length of all file items) */
1819         p_le_ih = PATH_PITEM_HEAD(&s_search_path);
1820         if (is_statdata_le_ih(p_le_ih))
1821                 file_size = 0;
1822         else {
1823                 loff_t offset = le_ih_k_offset(p_le_ih);
1824                 int bytes =
1825                     op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
1826
1827                 /* this may mismatch with real file size: if last direct item
1828                    had no padding zeros and last unformatted node had no free
1829                    space, this file would have this file size */
1830                 file_size = offset + bytes - 1;
1831         }
1832         /*
1833          * are we doing a full truncate or delete, if so
1834          * kick in the reada code
1835          */
1836         if (new_file_size == 0)
1837                 s_search_path.reada = PATH_READA | PATH_READA_BACK;
1838
1839         if (file_size == 0 || file_size < new_file_size) {
1840                 goto update_and_out;
1841         }
1842
1843         /* Update key to search for the last file item. */
1844         set_cpu_key_k_offset(&s_item_key, file_size);
1845
1846         do {
1847                 /* Cut or delete file item. */
1848                 deleted =
1849                     reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
1850                                            inode, page, new_file_size);
1851                 if (deleted < 0) {
1852                         reiserfs_warning(inode->i_sb, "vs-5665",
1853                                          "reiserfs_cut_from_item failed");
1854                         reiserfs_check_path(&s_search_path);
1855                         return 0;
1856                 }
1857
1858                 RFALSE(deleted > file_size,
1859                        "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
1860                        deleted, file_size, &s_item_key);
1861
1862                 /* Change key to search the last file item. */
1863                 file_size -= deleted;
1864
1865                 set_cpu_key_k_offset(&s_item_key, file_size);
1866
1867                 /* While there are bytes to truncate and previous file item is presented in the tree. */
1868
1869                 /*
1870                  ** This loop could take a really long time, and could log
1871                  ** many more blocks than a transaction can hold.  So, we do a polite
1872                  ** journal end here, and if the transaction needs ending, we make
1873                  ** sure the file is consistent before ending the current trans
1874                  ** and starting a new one
1875                  */
1876                 if (journal_transaction_should_end(th, 0) ||
1877                     reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
1878                         int orig_len_alloc = th->t_blocks_allocated;
1879                         pathrelse(&s_search_path);
1880
1881                         if (update_timestamps) {
1882                                 inode->i_mtime = CURRENT_TIME_SEC;
1883                                 inode->i_ctime = CURRENT_TIME_SEC;
1884                         }
1885                         reiserfs_update_sd(th, inode);
1886
1887                         err = journal_end(th, inode->i_sb, orig_len_alloc);
1888                         if (err)
1889                                 goto out;
1890                         err = journal_begin(th, inode->i_sb,
1891                                             JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
1892                         if (err)
1893                                 goto out;
1894                         reiserfs_update_inode_transaction(inode);
1895                 }
1896         } while (file_size > ROUND_UP(new_file_size) &&
1897                  search_for_position_by_key(inode->i_sb, &s_item_key,
1898                                             &s_search_path) == POSITION_FOUND);
1899
1900         RFALSE(file_size > ROUND_UP(new_file_size),
1901                "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
1902                new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
1903
1904       update_and_out:
1905         if (update_timestamps) {
1906                 // this is truncate, not file closing
1907                 inode->i_mtime = CURRENT_TIME_SEC;
1908                 inode->i_ctime = CURRENT_TIME_SEC;
1909         }
1910         reiserfs_update_sd(th, inode);
1911
1912       out:
1913         pathrelse(&s_search_path);
1914         return err;
1915 }
1916
1917 #ifdef CONFIG_REISERFS_CHECK
1918 // this makes sure, that we __append__, not overwrite or add holes
1919 static void check_research_for_paste(struct treepath *path,
1920                                      const struct cpu_key *key)
1921 {
1922         struct item_head *found_ih = get_ih(path);
1923
1924         if (is_direct_le_ih(found_ih)) {
1925                 if (le_ih_k_offset(found_ih) +
1926                     op_bytes_number(found_ih,
1927                                     get_last_bh(path)->b_size) !=
1928                     cpu_key_k_offset(key)
1929                     || op_bytes_number(found_ih,
1930                                        get_last_bh(path)->b_size) !=
1931                     pos_in_item(path))
1932                         reiserfs_panic(NULL, "PAP-5720", "found direct item "
1933                                        "%h or position (%d) does not match "
1934                                        "to key %K", found_ih,
1935                                        pos_in_item(path), key);
1936         }
1937         if (is_indirect_le_ih(found_ih)) {
1938                 if (le_ih_k_offset(found_ih) +
1939                     op_bytes_number(found_ih,
1940                                     get_last_bh(path)->b_size) !=
1941                     cpu_key_k_offset(key)
1942                     || I_UNFM_NUM(found_ih) != pos_in_item(path)
1943                     || get_ih_free_space(found_ih) != 0)
1944                         reiserfs_panic(NULL, "PAP-5730", "found indirect "
1945                                        "item (%h) or position (%d) does not "
1946                                        "match to key (%K)",
1947                                        found_ih, pos_in_item(path), key);
1948         }
1949 }
1950 #endif                          /* config reiserfs check */
1951
1952 /* Paste bytes to the existing item. Returns bytes number pasted into the item. */
1953 int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path,      /* Path to the pasted item.       */
1954                              const struct cpu_key *key, /* Key to search for the needed item. */
1955                              struct inode *inode,       /* Inode item belongs to */
1956                              const char *body,  /* Pointer to the bytes to paste.    */
1957                              int pasted_size)
1958 {                               /* Size of pasted bytes.             */
1959         struct tree_balance s_paste_balance;
1960         int retval;
1961         int fs_gen;
1962
1963         BUG_ON(!th->t_trans_id);
1964
1965         fs_gen = get_generation(inode->i_sb);
1966
1967 #ifdef REISERQUOTA_DEBUG
1968         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
1969                        "reiserquota paste_into_item(): allocating %u id=%u type=%c",
1970                        pasted_size, inode->i_uid,
1971                        key2type(&(key->on_disk_key)));
1972 #endif
1973
1974         if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
1975                 pathrelse(search_path);
1976                 return -EDQUOT;
1977         }
1978         init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1979                        pasted_size);
1980 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1981         s_paste_balance.key = key->on_disk_key;
1982 #endif
1983
1984         /* DQUOT_* can schedule, must check before the fix_nodes */
1985         if (fs_changed(fs_gen, inode->i_sb)) {
1986                 goto search_again;
1987         }
1988
1989         while ((retval =
1990                 fix_nodes(M_PASTE, &s_paste_balance, NULL,
1991                           body)) == REPEAT_SEARCH) {
1992               search_again:
1993                 /* file system changed while we were in the fix_nodes */
1994                 PROC_INFO_INC(th->t_super, paste_into_item_restarted);
1995                 retval =
1996                     search_for_position_by_key(th->t_super, key,
1997                                                search_path);
1998                 if (retval == IO_ERROR) {
1999                         retval = -EIO;
2000                         goto error_out;
2001                 }
2002                 if (retval == POSITION_FOUND) {
2003                         reiserfs_warning(inode->i_sb, "PAP-5710",
2004                                          "entry or pasted byte (%K) exists",
2005                                          key);
2006                         retval = -EEXIST;
2007                         goto error_out;
2008                 }
2009 #ifdef CONFIG_REISERFS_CHECK
2010                 check_research_for_paste(search_path, key);
2011 #endif
2012         }
2013
2014         /* Perform balancing after all resources are collected by fix_nodes, and
2015            accessing them will not risk triggering schedule. */
2016         if (retval == CARRY_ON) {
2017                 do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
2018                 return 0;
2019         }
2020         retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2021       error_out:
2022         /* this also releases the path */
2023         unfix_nodes(&s_paste_balance);
2024 #ifdef REISERQUOTA_DEBUG
2025         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2026                        "reiserquota paste_into_item(): freeing %u id=%u type=%c",
2027                        pasted_size, inode->i_uid,
2028                        key2type(&(key->on_disk_key)));
2029 #endif
2030         vfs_dq_free_space_nodirty(inode, pasted_size);
2031         return retval;
2032 }
2033
2034 /* Insert new item into the buffer at the path.
2035  * th   - active transaction handle
2036  * path - path to the inserted item
2037  * ih   - pointer to the item header to insert
2038  * body - pointer to the bytes to insert
2039  */
2040 int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2041                          struct treepath *path, const struct cpu_key *key,
2042                          struct item_head *ih, struct inode *inode,
2043                          const char *body)
2044 {
2045         struct tree_balance s_ins_balance;
2046         int retval;
2047         int fs_gen = 0;
2048         int quota_bytes = 0;
2049
2050         BUG_ON(!th->t_trans_id);
2051
2052         if (inode) {            /* Do we count quotas for item? */
2053                 fs_gen = get_generation(inode->i_sb);
2054                 quota_bytes = ih_item_len(ih);
2055
2056                 /* hack so the quota code doesn't have to guess if the file has
2057                  ** a tail, links are always tails, so there's no guessing needed
2058                  */
2059                 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
2060                         quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
2061 #ifdef REISERQUOTA_DEBUG
2062                 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2063                                "reiserquota insert_item(): allocating %u id=%u type=%c",
2064                                quota_bytes, inode->i_uid, head2type(ih));
2065 #endif
2066                 /* We can't dirty inode here. It would be immediately written but
2067                  * appropriate stat item isn't inserted yet... */
2068                 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
2069                         pathrelse(path);
2070                         return -EDQUOT;
2071                 }
2072         }
2073         init_tb_struct(th, &s_ins_balance, th->t_super, path,
2074                        IH_SIZE + ih_item_len(ih));
2075 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
2076         s_ins_balance.key = key->on_disk_key;
2077 #endif
2078         /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
2079         if (inode && fs_changed(fs_gen, inode->i_sb)) {
2080                 goto search_again;
2081         }
2082
2083         while ((retval =
2084                 fix_nodes(M_INSERT, &s_ins_balance, ih,
2085                           body)) == REPEAT_SEARCH) {
2086               search_again:
2087                 /* file system changed while we were in the fix_nodes */
2088                 PROC_INFO_INC(th->t_super, insert_item_restarted);
2089                 retval = search_item(th->t_super, key, path);
2090                 if (retval == IO_ERROR) {
2091                         retval = -EIO;
2092                         goto error_out;
2093                 }
2094                 if (retval == ITEM_FOUND) {
2095                         reiserfs_warning(th->t_super, "PAP-5760",
2096                                          "key %K already exists in the tree",
2097                                          key);
2098                         retval = -EEXIST;
2099                         goto error_out;
2100                 }
2101         }
2102
2103         /* make balancing after all resources will be collected at a time */
2104         if (retval == CARRY_ON) {
2105                 do_balance(&s_ins_balance, ih, body, M_INSERT);
2106                 return 0;
2107         }
2108
2109         retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2110       error_out:
2111         /* also releases the path */
2112         unfix_nodes(&s_ins_balance);
2113 #ifdef REISERQUOTA_DEBUG
2114         reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
2115                        "reiserquota insert_item(): freeing %u id=%u type=%c",
2116                        quota_bytes, inode->i_uid, head2type(ih));
2117 #endif
2118         if (inode)
2119                 vfs_dq_free_space_nodirty(inode, quota_bytes);
2120         return retval;
2121 }