[PATCH] mark struct inode_operations const 3
[linux-2.6.git] / fs / reiserfs / file.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 #include <linux/time.h>
6 #include <linux/reiserfs_fs.h>
7 #include <linux/reiserfs_acl.h>
8 #include <linux/reiserfs_xattr.h>
9 #include <linux/smp_lock.h>
10 #include <asm/uaccess.h>
11 #include <linux/pagemap.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/blkdev.h>
15 #include <linux/buffer_head.h>
16 #include <linux/quotaops.h>
17
18 /*
19 ** We pack the tails of files on file close, not at the time they are written.
20 ** This implies an unnecessary copy of the tail and an unnecessary indirect item
21 ** insertion/balancing, for files that are written in one write.
22 ** It avoids unnecessary tail packings (balances) for files that are written in
23 ** multiple writes and are small enough to have tails.
24 **
25 ** file_release is called by the VFS layer when the file is closed.  If
26 ** this is the last open file descriptor, and the file is small enough to
27 ** have a tail, and the tail is currently in an unformatted node, the tail
28 ** is converted back into a direct item.  Before that, any preallocation is
29 ** discarded (when REISERFS_PREALLOCATE is defined).  Returns 0 on success,
30 ** otherwise the first error seen from the journal calls or the truncate.
31 ** We use reiserfs_truncate_file to pack the tail, since it already has
32 ** all the conditions coded. */
33 static int reiserfs_file_release(struct inode *inode, struct file *filp)
34 {
35
36         struct reiserfs_transaction_handle th;
37         int err;
38         int jbegin_failure = 0; /* journal_begin error, reported if nothing later fails */
39
40         BUG_ON(!S_ISREG(inode->i_mode));
41
42         /* fast out: no tail packing applies and no preallocation is pending */
43         if ((atomic_read(&inode->i_count) > 1 ||
44              !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
45              !tail_has_to_be_packed(inode)) &&
46             REISERFS_I(inode)->i_prealloc_count <= 0) {
47                 return 0;
48         }
49
50         mutex_lock(&inode->i_mutex);
51         /* a file that was ever mmapped (flag set under i_mmap) is not packed on close */
52         mutex_lock(&(REISERFS_I(inode)->i_mmap));
53         if (REISERFS_I(inode)->i_flags & i_ever_mapped)
54                 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
55
56         reiserfs_write_lock(inode->i_sb);
57         /* Freeing preallocation only involves relogging blocks that
58          * are already in the current transaction.  Preallocation gets
59          * freed at the end of each transaction, so it is impossible for
60          * us to log any additional blocks (including quota blocks).
61          */
62         err = journal_begin(&th, inode->i_sb, 1);
63         if (err) {
64                 /* Uh oh, we can't allow the inode to go away while there
65                  * are still preallocation blocks pending.  Try to join the
66                  * aborted transaction.
67                  */
68                 jbegin_failure = err;
69                 err = journal_join_abort(&th, inode->i_sb, 1);
70
71                 if (err) {
72                         /* Hmpf, our choices here aren't good.  We can pin the inode,
73                          * which will prevent unmount from ever happening; we can
74                          * do nothing, which will corrupt random memory on unmount;
75                          * or we can forcibly remove the file from the preallocation
76                          * list, which will leak blocks on disk.  Let's pin the inode
77                          * and let the admin know what is going on.
78                          */
79                         igrab(inode);
80                         reiserfs_warning(inode->i_sb,
81                                          "pinning inode %lu because the "
82                                          "preallocation can't be freed",
83                                          inode->i_ino);
84                         goto out;
85                 }
86         }
87         reiserfs_update_inode_transaction(inode);
88
89 #ifdef REISERFS_PREALLOCATE
90         reiserfs_discard_prealloc(&th, inode);
91 #endif
92         err = journal_end(&th, inode->i_sb, 1);
93
94         /* journal_end succeeded: fall back to the error code from journal_begin */
95         if (!err)
96                 err = jbegin_failure;
97
98         if (!err && atomic_read(&inode->i_count) <= 1 &&
99             (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
100             tail_has_to_be_packed(inode)) {
101                 /* If a regular file is released by its last holder and it has
102                    been appended (we append by unformatted node only) or its
103                    direct item(s) had to be converted, then it may have to be
104                    indirect2direct converted. */
105                 err = reiserfs_truncate_file(inode, 0);
106         }
107       out: /* reached with i_mutex, i_mmap and the reiserfs write lock all held */
108         mutex_unlock(&(REISERFS_I(inode)->i_mmap));
109         mutex_unlock(&inode->i_mutex);
110         reiserfs_write_unlock(inode->i_sb);
111         return err;
112 }
113 /* Remember that this inode has been mmapped, then hand off to generic_file_mmap(). */
114 static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
115 {
116         struct inode *host = file->f_path.dentry->d_inode;
117
118         /* i_ever_mapped is tested under the same mutex in reiserfs_file_release() */
119         mutex_lock(&(REISERFS_I(host)->i_mmap));
120         REISERFS_I(host)->i_flags |= i_ever_mapped;
121         mutex_unlock(&(REISERFS_I(host)->i_mmap));
122
123         return generic_file_mmap(file, vma);
124 }
125 /* Thin wrapper around reiserfs_truncate_file(inode, 1); the flag's meaning is not visible here (tail packing on close passes 0) - TODO confirm. */
126 static void reiserfs_vfs_truncate_file(struct inode *inode)
127 {
128         reiserfs_truncate_file(inode, 1);
129 }
130
131 /*
132  * Sync a reiserfs file: write out the mapping's buffers, commit the inode's
133  * transaction under the reiserfs write lock, then issue a block device flush
134  * unless the commit returned 1 or reiserfs_barrier_flush() is false for the
135  * superblock.  Returns a negative commit result as-is, -EIO if the buffer
136  * sync failed, else 0.  p_s_filp and datasync are not used.
137  * FIXME: sync_mapping_buffers() never has anything to sync.  Can be removed... */
138 static int reiserfs_sync_file(struct file *p_s_filp,
139                               struct dentry *p_s_dentry, int datasync)
140 {
141         struct inode *host = p_s_dentry->d_inode;
142         struct super_block *sb = host->i_sb;
143         int sync_err, commit_res;
144
145         BUG_ON(!S_ISREG(host->i_mode));
146         sync_err = sync_mapping_buffers(host->i_mapping);
147         reiserfs_write_lock(sb);
148         commit_res = reiserfs_commit_for_inode(host);
149         reiserfs_write_unlock(sb);
150         if (commit_res != 1 && reiserfs_barrier_flush(sb))
151                 blkdev_issue_flush(sb->s_bdev, NULL);
152         if (commit_res < 0)
153                 return commit_res;
154         return (sync_err < 0) ? -EIO : 0;
155 }
156
157 /* To keep the code simple and avoid dealing with memory shortage, we never
158    write more than this many pages at a time.  This should still improve
159    performance considerably compared to writing 4k at a time.  The value is
160    128k worth of pages, i.e. 32 pages when pages are 4k in size. */
161 #define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
162
163 /* Allocates blocks for a file to fulfil write request.
164    Maps all unmapped but prepared pages from the list.
165    Updates metadata with newly allocated blocknumbers as needed */
166 static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,     /* Inode we work with */
167                                                loff_t pos,      /* Writing position */
168                                                int num_pages,   /* number of pages write going
169                                                                    to touch */
170                                                int write_bytes, /* amount of bytes to write */
171                                                struct page **prepared_pages,    /* array of
172                                                                                    prepared pages
173                                                                                  */
174                                                int blocks_to_allocate   /* Amount of blocks we
175                                                                            need to allocate to
176                                                                            fit the data into file
177                                                                          */
178     )
179 {
180         struct cpu_key key;     // cpu key of item that we are going to deal with
181         struct item_head *ih;   // pointer to item head that we are going to deal with
182         struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
183         __le32 *item;           // pointer to item we are going to deal with
184         INITIALIZE_PATH(path);  // path to item, that we are going to deal with.
185         b_blocknr_t *allocated_blocks;  // Pointer to a place where allocated blocknumbers would be stored.
186         reiserfs_blocknr_hint_t hint;   // hint structure for block allocator.
187         size_t res;             // return value of various functions that we call.
188         int curr_block;         // current block used to keep track of unmapped blocks.
189         int i;                  // loop counter
190         int itempos;            // position in item
191         unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));      // writing position in
192         // first page
193         unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;        /* last modified byte offset in last page */
194         __u64 hole_size;        // amount of blocks for a file hole, if it needed to be created.
195         int modifying_this_item = 0;    // Flag for items traversal code to keep track
196         // of the fact that we already prepared
197         // current block for journal
198         int will_prealloc = 0;
199         RFALSE(!blocks_to_allocate,
200                "green-9004: tried to allocate zero blocks?");
201
202         /* only preallocate if this is a small write */
203         if (REISERFS_I(inode)->i_prealloc_count ||
204             (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
205              blocks_to_allocate <
206              REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
207                 will_prealloc =
208                     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
209
210         allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
211                                    sizeof(b_blocknr_t), GFP_NOFS);
212         if (!allocated_blocks)
213                 return -ENOMEM;
214
215         /* First we compose a key to point at the writing position, we want to do
216            that outside of any locking region. */
217         make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
218
219         /* If we came here, it means we absolutely need to open a transaction,
220            since we need to allocate some blocks */
221         reiserfs_write_lock(inode->i_sb);       // Journaling stuff and we need that.
222         res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));   // Wish I know if this number enough
223         if (res)
224                 goto error_exit;
225         reiserfs_update_inode_transaction(inode);
226
227         /* Look for the in-tree position of our write, need path for block allocator */
228         res = search_for_position_by_key(inode->i_sb, &key, &path);
229         if (res == IO_ERROR) {
230                 res = -EIO;
231                 goto error_exit;
232         }
233
234         /* Allocate blocks */
235         /* First fill in "hint" structure for block allocator */
236         hint.th = th;           // transaction handle.
237         hint.path = &path;      // Path, so that block allocator can determine packing locality or whatever it needs to determine.
238         hint.inode = inode;     // Inode is needed by block allocator too.
239         hint.search_start = 0;  // We have no hint on where to search free blocks for block allocator.
240         hint.key = key.on_disk_key;     // on disk key of file.
241         hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);    // Number of disk blocks this file occupies already.
242         hint.formatted_node = 0;        // We are allocating blocks for unformatted node.
243         hint.preallocate = will_prealloc;
244
245         /* Call block allocator to allocate blocks */
246         res =
247             reiserfs_allocate_blocknrs(&hint, allocated_blocks,
248                                        blocks_to_allocate, blocks_to_allocate);
249         if (res != CARRY_ON) {
250                 if (res == NO_DISK_SPACE) {
251                         /* We flush the transaction in case of no space. This way some
252                            blocks might become free */
253                         SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
254                         res = restart_transaction(th, inode, &path);
255                         if (res)
256                                 goto error_exit;
257
258                         /* We might have scheduled, so search again */
259                         res =
260                             search_for_position_by_key(inode->i_sb, &key,
261                                                        &path);
262                         if (res == IO_ERROR) {
263                                 res = -EIO;
264                                 goto error_exit;
265                         }
266
267                         /* update changed info for hint structure. */
268                         res =
269                             reiserfs_allocate_blocknrs(&hint, allocated_blocks,
270                                                        blocks_to_allocate,
271                                                        blocks_to_allocate);
272                         if (res != CARRY_ON) {
273                                 res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
274                                 pathrelse(&path);
275                                 goto error_exit;
276                         }
277                 } else {
278                         res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
279                         pathrelse(&path);
280                         goto error_exit;
281                 }
282         }
283 #ifdef __BIG_ENDIAN
284         // Too bad, I have not found any way to convert a given region from
285         // cpu format to little endian format
286         {
287                 int i;
288                 for (i = 0; i < blocks_to_allocate; i++)
289                         allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
290         }
291 #endif
292
293         /* Blocks allocating well might have scheduled and tree might have changed,
294            let's search the tree again */
295         /* find where in the tree our write should go */
296         res = search_for_position_by_key(inode->i_sb, &key, &path);
297         if (res == IO_ERROR) {
298                 res = -EIO;
299                 goto error_exit_free_blocks;
300         }
301
302         bh = get_last_bh(&path);        // Get a bufferhead for last element in path.
303         ih = get_ih(&path);     // Get a pointer to last item head in path.
304         item = get_item(&path); // Get a pointer to last item in path
305
306         /* Let's see what we have found */
307         if (res != POSITION_FOUND) {    /* position not found, this means that we
308                                            might need to append file with holes
309                                            first */
310                 // Since we are writing past the file's end, we need to find out if
311                 // there is a hole that needs to be inserted before our writing
312                 // position, and how many blocks it is going to cover (we need to
313                 //  populate pointers to file blocks representing the hole with zeros)
314
315                 {
316                         int item_offset = 1;
317                         /*
318                          * if ih is stat data, its offset is 0 and we don't want to
319                          * add 1 to pos in the hole_size calculation
320                          */
321                         if (is_statdata_le_ih(ih))
322                                 item_offset = 0;
323                         hole_size = (pos + item_offset -
324                                      (le_key_k_offset
325                                       (get_inode_item_key_version(inode),
326                                        &(ih->ih_key)) + op_bytes_number(ih,
327                                                                         inode->
328                                                                         i_sb->
329                                                                         s_blocksize)))
330                             >> inode->i_sb->s_blocksize_bits;
331                 }
332
333                 if (hole_size > 0) {
334                         int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);   // How much data to insert first time.
335                         /* area filled with zeroes, to supply as list of zero blocknumbers
336                            We allocate it outside of loop just in case loop would spin for
337                            several iterations. */
338                         char *zeros = kzalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);      // We cannot insert more than MAX_ITEM_LEN bytes anyway.
339                         if (!zeros) {
340                                 res = -ENOMEM;
341                                 goto error_exit_free_blocks;
342                         }
343                         do {
344                                 to_paste =
345                                     min_t(__u64, hole_size,
346                                           MAX_ITEM_LEN(inode->i_sb->
347                                                        s_blocksize) /
348                                           UNFM_P_SIZE);
349                                 if (is_indirect_le_ih(ih)) {
350                                         /* Ok, there is existing indirect item already. Need to append it */
351                                         /* Calculate position past inserted item */
352                                         make_cpu_key(&key, inode,
353                                                      le_key_k_offset
354                                                      (get_inode_item_key_version
355                                                       (inode),
356                                                       &(ih->ih_key)) +
357                                                      op_bytes_number(ih,
358                                                                      inode->
359                                                                      i_sb->
360                                                                      s_blocksize),
361                                                      TYPE_INDIRECT, 3);
362                                         res =
363                                             reiserfs_paste_into_item(th, &path,
364                                                                      &key,
365                                                                      inode,
366                                                                      (char *)
367                                                                      zeros,
368                                                                      UNFM_P_SIZE
369                                                                      *
370                                                                      to_paste);
371                                         if (res) {
372                                                 kfree(zeros);
373                                                 goto error_exit_free_blocks;
374                                         }
375                                 } else if (is_statdata_le_ih(ih)) {
376                                         /* No existing item, create it */
377                                         /* item head for new item */
378                                         struct item_head ins_ih;
379
380                                         /* create a key for our new item */
381                                         make_cpu_key(&key, inode, 1,
382                                                      TYPE_INDIRECT, 3);
383
384                                         /* Create new item head for our new item */
385                                         make_le_item_head(&ins_ih, &key,
386                                                           key.version, 1,
387                                                           TYPE_INDIRECT,
388                                                           to_paste *
389                                                           UNFM_P_SIZE,
390                                                           0 /* free space */ );
391
392                                         /* Find where such item should live in the tree */
393                                         res =
394                                             search_item(inode->i_sb, &key,
395                                                         &path);
396                                         if (res != ITEM_NOT_FOUND) {
397                                                 /* item should not exist, otherwise we have error */
398                                                 if (res != -ENOSPC) {
399                                                         reiserfs_warning(inode->
400                                                                          i_sb,
401                                                                          "green-9008: search_by_key (%K) returned %d",
402                                                                          &key,
403                                                                          res);
404                                                 }
405                                                 res = -EIO;
406                                                 kfree(zeros);
407                                                 goto error_exit_free_blocks;
408                                         }
409                                         res =
410                                             reiserfs_insert_item(th, &path,
411                                                                  &key, &ins_ih,
412                                                                  inode,
413                                                                  (char *)zeros);
414                                 } else {
415                                         reiserfs_panic(inode->i_sb,
416                                                        "green-9011: Unexpected key type %K\n",
417                                                        &key);
418                                 }
419                                 if (res) {
420                                         kfree(zeros);
421                                         goto error_exit_free_blocks;
422                                 }
423                                 /* Now we want to check if transaction is too full, and if it is
424                                    we restart it. This will also free the path. */
425                                 if (journal_transaction_should_end
426                                     (th, th->t_blocks_allocated)) {
427                                         inode->i_size = cpu_key_k_offset(&key) +
428                                                 (to_paste << inode->i_blkbits);
429                                         res =
430                                             restart_transaction(th, inode,
431                                                                 &path);
432                                         if (res) {
433                                                 pathrelse(&path);
434                                                 kfree(zeros);
435                                                 goto error_exit;
436                                         }
437                                 }
438
439                                 /* Well, need to recalculate path and stuff */
440                                 set_cpu_key_k_offset(&key,
441                                                      cpu_key_k_offset(&key) +
442                                                      (to_paste << inode->
443                                                       i_blkbits));
444                                 res =
445                                     search_for_position_by_key(inode->i_sb,
446                                                                &key, &path);
447                                 if (res == IO_ERROR) {
448                                         res = -EIO;
449                                         kfree(zeros);
450                                         goto error_exit_free_blocks;
451                                 }
452                                 bh = get_last_bh(&path);
453                                 ih = get_ih(&path);
454                                 item = get_item(&path);
455                                 hole_size -= to_paste;
456                         } while (hole_size);
457                         kfree(zeros);
458                 }
459         }
460         // Go through existing indirect items first
461         // replace all zeroes with blocknumbers from list
462         // Note that if no corresponding item was found, by previous search,
463         // it means there are no existing in-tree representation for file area
464         // we are going to overwrite, so there is nothing to scan through for holes.
465         for (curr_block = 0, itempos = path.pos_in_item;
466              curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
467               retry:
468
469                 if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
470                         /* We run out of data in this indirect item, let's look for another
471                            one. */
472                         /* First if we are already modifying current item, log it */
473                         if (modifying_this_item) {
474                                 journal_mark_dirty(th, inode->i_sb, bh);
475                                 modifying_this_item = 0;
476                         }
477                         /* Then set the key to look for a new indirect item (offset of old
478                            item is added to old item length */
479                         set_cpu_key_k_offset(&key,
480                                              le_key_k_offset
481                                              (get_inode_item_key_version(inode),
482                                               &(ih->ih_key)) +
483                                              op_bytes_number(ih,
484                                                              inode->i_sb->
485                                                              s_blocksize));
486                         /* Search for position of new key in the tree. */
487                         res =
488                             search_for_position_by_key(inode->i_sb, &key,
489                                                        &path);
490                         if (res == IO_ERROR) {
491                                 res = -EIO;
492                                 goto error_exit_free_blocks;
493                         }
494                         bh = get_last_bh(&path);
495                         ih = get_ih(&path);
496                         item = get_item(&path);
497                         itempos = path.pos_in_item;
498                         continue;       // loop to check all kinds of conditions and so on.
499                 }
500                 /* Ok, we have correct position in item now, so let's see if it is
501                    representing file hole (blocknumber is zero) and fill it if needed */
502                 if (!item[itempos]) {
503                         /* Ok, a hole. Now we need to check if we already prepared this
504                            block to be journaled */
505                         while (!modifying_this_item) {  // loop until succeed
506                                 /* Well, this item is not journaled yet, so we must prepare
507                                    it for journal first, before we can change it */
508                                 struct item_head tmp_ih;        // We copy item head of found item,
509                                 // here to detect if fs changed under
510                                 // us while we were preparing for
511                                 // journal.
512                                 int fs_gen;     // We store fs generation here to find if someone
513                                 // changes fs under our feet
514
515                                 copy_item_head(&tmp_ih, ih);    // Remember itemhead
516                                 fs_gen = get_generation(inode->i_sb);   // remember fs generation
517                                 reiserfs_prepare_for_journal(inode->i_sb, bh, 1);       // Prepare a buffer within which indirect item is stored for changing.
518                                 if (fs_changed(fs_gen, inode->i_sb)
519                                     && item_moved(&tmp_ih, &path)) {
520                                         // Sigh, fs was changed under us, we need to look for new
521                                         // location of item we are working with
522
523                                         /* unmark prepared area as journaled and search for its
524                                            new position */
525                                         reiserfs_restore_prepared_buffer(inode->
526                                                                          i_sb,
527                                                                          bh);
528                                         res =
529                                             search_for_position_by_key(inode->
530                                                                        i_sb,
531                                                                        &key,
532                                                                        &path);
533                                         if (res == IO_ERROR) {
534                                                 res = -EIO;
535                                                 goto error_exit_free_blocks;
536                                         }
537                                         bh = get_last_bh(&path);
538                                         ih = get_ih(&path);
539                                         item = get_item(&path);
540                                         itempos = path.pos_in_item;
541                                         goto retry;
542                                 }
543                                 modifying_this_item = 1;
544                         }
545                         item[itempos] = allocated_blocks[curr_block];   // Assign new block
546                         curr_block++;
547                 }
548                 itempos++;
549         }
550
551         if (modifying_this_item) {      // We need to log last-accessed block, if it
552                 // was modified, but not logged yet.
553                 journal_mark_dirty(th, inode->i_sb, bh);
554         }
555
556         if (curr_block < blocks_to_allocate) {
557                 // Oh, well need to append to indirect item, or to create indirect item
558                 // if there weren't any
559                 if (is_indirect_le_ih(ih)) {
560                         // Existing indirect item - append. First calculate key for append
561                         // position. We do not need to recalculate path as it should
562                         // already point to correct place.
563                         make_cpu_key(&key, inode,
564                                      le_key_k_offset(get_inode_item_key_version
565                                                      (inode),
566                                                      &(ih->ih_key)) +
567                                      op_bytes_number(ih,
568                                                      inode->i_sb->s_blocksize),
569                                      TYPE_INDIRECT, 3);
570                         res =
571                             reiserfs_paste_into_item(th, &path, &key, inode,
572                                                      (char *)(allocated_blocks +
573                                                               curr_block),
574                                                      UNFM_P_SIZE *
575                                                      (blocks_to_allocate -
576                                                       curr_block));
577                         if (res) {
578                                 goto error_exit_free_blocks;
579                         }
580                 } else if (is_statdata_le_ih(ih)) {
581                         // Last found item was statdata. That means we need to create indirect item.
582                         struct item_head ins_ih;        /* itemhead for new item */
583
584                         /* create a key for our new item */
585                         make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3); // Position one,
586                         // because that's
587                         // where first
588                         // indirect item
589                         // begins
590                         /* Create new item head for our new item */
591                         make_le_item_head(&ins_ih, &key, key.version, 1,
592                                           TYPE_INDIRECT,
593                                           (blocks_to_allocate -
594                                            curr_block) * UNFM_P_SIZE,
595                                           0 /* free space */ );
596                         /* Find where such item should live in the tree */
597                         res = search_item(inode->i_sb, &key, &path);
598                         if (res != ITEM_NOT_FOUND) {
599                                 /* Well, if we have found such item already, or some error
600                                    occured, we need to warn user and return error */
601                                 if (res != -ENOSPC) {
602                                         reiserfs_warning(inode->i_sb,
603                                                          "green-9009: search_by_key (%K) "
604                                                          "returned %d", &key,
605                                                          res);
606                                 }
607                                 res = -EIO;
608                                 goto error_exit_free_blocks;
609                         }
610                         /* Insert item into the tree with the data as its body */
611                         res =
612                             reiserfs_insert_item(th, &path, &key, &ins_ih,
613                                                  inode,
614                                                  (char *)(allocated_blocks +
615                                                           curr_block));
616                 } else {
617                         reiserfs_panic(inode->i_sb,
618                                        "green-9010: unexpected item type for key %K\n",
619                                        &key);
620                 }
621         }
622         // the caller is responsible for closing the transaction
623         // unless we return an error, they are also responsible for logging
624         // the inode.
625         //
626         pathrelse(&path);
627         /*
628          * cleanup prellocation from previous writes
629          * if this is a partial block write
630          */
631         if (write_bytes & (inode->i_sb->s_blocksize - 1))
632                 reiserfs_discard_prealloc(th, inode);
633         reiserfs_write_unlock(inode->i_sb);
634
635         // go through all the pages/buffers and map the buffers to newly allocated
636         // blocks (so that system knows where to write these pages later).
637         curr_block = 0;
638         for (i = 0; i < num_pages; i++) {
639                 struct page *page = prepared_pages[i];  //current page
640                 struct buffer_head *head = page_buffers(page);  // first buffer for a page
641                 int block_start, block_end;     // in-page offsets for buffers.
642
643                 if (!page_buffers(page))
644                         reiserfs_panic(inode->i_sb,
645                                        "green-9005: No buffers for prepared page???");
646
647                 /* For each buffer in page */
648                 for (bh = head, block_start = 0; bh != head || !block_start;
649                      block_start = block_end, bh = bh->b_this_page) {
650                         if (!bh)
651                                 reiserfs_panic(inode->i_sb,
652                                                "green-9006: Allocated but absent buffer for a page?");
653                         block_end = block_start + inode->i_sb->s_blocksize;
654                         if (i == 0 && block_end <= from)
655                                 /* if this buffer is before requested data to map, skip it */
656                                 continue;
657                         if (i == num_pages - 1 && block_start >= to)
658                                 /* If this buffer is after requested data to map, abort
659                                    processing of current page */
660                                 break;
661
662                         if (!buffer_mapped(bh)) {       // Ok, unmapped buffer, need to map it
663                                 map_bh(bh, inode->i_sb,
664                                        le32_to_cpu(allocated_blocks
665                                                    [curr_block]));
666                                 curr_block++;
667                                 set_buffer_new(bh);
668                         }
669                 }
670         }
671
672         RFALSE(curr_block > blocks_to_allocate,
673                "green-9007: Used too many blocks? weird");
674
675         kfree(allocated_blocks);
676         return 0;
677
678 // Need to deal with transaction here.
679       error_exit_free_blocks:
680         pathrelse(&path);
681         // free blocks
682         for (i = 0; i < blocks_to_allocate; i++)
683                 reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
684                                     1);
685
686       error_exit:
687         if (th->t_trans_id) {
688                 int err;
689                 // update any changes we made to blk count
690                 mark_inode_dirty(inode);
691                 err =
692                     journal_end(th, inode->i_sb,
693                                 JOURNAL_PER_BALANCE_CNT * 3 + 1 +
694                                 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
695                 if (err)
696                         res = err;
697         }
698         reiserfs_write_unlock(inode->i_sb);
699         kfree(allocated_blocks);
700
701         return res;
702 }
703
704 /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
705 static void reiserfs_unprepare_pages(struct page **prepared_pages,      /* list of locked pages */
706                                      size_t num_pages /* amount of pages */ )
707 {
708         int i;                  // loop counter
709
710         for (i = 0; i < num_pages; i++) {
711                 struct page *page = prepared_pages[i];
712
713                 try_to_free_buffers(page);
714                 unlock_page(page);
715                 page_cache_release(page);
716         }
717 }
718
719 /* This function will copy data from userspace to specified pages within
720    supplied byte range */
721 static int reiserfs_copy_from_user_to_file_region(loff_t pos,   /* In-file position */
722                                                   int num_pages,        /* Number of pages affected */
723                                                   int write_bytes,      /* Amount of bytes to write */
724                                                   struct page **prepared_pages, /* pointer to 
725                                                                                    array to
726                                                                                    prepared pages
727                                                                                  */
728                                                   const char __user * buf       /* Pointer to user-supplied
729                                                                                    data */
730     )
731 {
732         long page_fault = 0;    // status of copy_from_user.
733         int i;                  // loop counter.
734         int offset;             // offset in page
735
736         for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
737              i++, offset = 0) {
738                 size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);    // How much of bytes to write to this page
739                 struct page *page = prepared_pages[i];  // Current page we process.
740
741                 fault_in_pages_readable(buf, count);
742
743                 /* Copy data from userspace to the current page */
744                 kmap(page);
745                 page_fault = __copy_from_user(page_address(page) + offset, buf, count); // Copy the data.
746                 /* Flush processor's dcache for this page */
747                 flush_dcache_page(page);
748                 kunmap(page);
749                 buf += count;
750                 write_bytes -= count;
751
752                 if (page_fault)
753                         break;  // Was there a fault? abort.
754         }
755
756         return page_fault ? -EFAULT : 0;
757 }
758
759 /* taken fs/buffer.c:__block_commit_write */
760 int reiserfs_commit_page(struct inode *inode, struct page *page,
761                          unsigned from, unsigned to)
762 {
763         unsigned block_start, block_end;
764         int partial = 0;
765         unsigned blocksize;
766         struct buffer_head *bh, *head;
767         unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
768         int new;
769         int logit = reiserfs_file_data_log(inode);
770         struct super_block *s = inode->i_sb;
771         int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
772         struct reiserfs_transaction_handle th;
773         int ret = 0;
774
775         th.t_trans_id = 0;
776         blocksize = 1 << inode->i_blkbits;
777
778         if (logit) {
779                 reiserfs_write_lock(s);
780                 ret = journal_begin(&th, s, bh_per_page + 1);
781                 if (ret)
782                         goto drop_write_lock;
783                 reiserfs_update_inode_transaction(inode);
784         }
785         for (bh = head = page_buffers(page), block_start = 0;
786              bh != head || !block_start;
787              block_start = block_end, bh = bh->b_this_page) {
788
789                 new = buffer_new(bh);
790                 clear_buffer_new(bh);
791                 block_end = block_start + blocksize;
792                 if (block_end <= from || block_start >= to) {
793                         if (!buffer_uptodate(bh))
794                                 partial = 1;
795                 } else {
796                         set_buffer_uptodate(bh);
797                         if (logit) {
798                                 reiserfs_prepare_for_journal(s, bh, 1);
799                                 journal_mark_dirty(&th, s, bh);
800                         } else if (!buffer_dirty(bh)) {
801                                 mark_buffer_dirty(bh);
802                                 /* do data=ordered on any page past the end
803                                  * of file and any buffer marked BH_New.
804                                  */
805                                 if (reiserfs_data_ordered(inode->i_sb) &&
806                                     (new || page->index >= i_size_index)) {
807                                         reiserfs_add_ordered_list(inode, bh);
808                                 }
809                         }
810                 }
811         }
812         if (logit) {
813                 ret = journal_end(&th, s, bh_per_page + 1);
814               drop_write_lock:
815                 reiserfs_write_unlock(s);
816         }
817         /*
818          * If this is a partial write which happened to make all buffers
819          * uptodate then we can optimize away a bogus readpage() for
820          * the next read(). Here we 'discover' whether the page went
821          * uptodate as a result of this (potentially partial) write.
822          */
823         if (!partial)
824                 SetPageUptodate(page);
825         return ret;
826 }
827
828 /* Submit pages for write. This was separated from actual file copying
829    because we might want to allocate block numbers in-between.
830    This function assumes that caller will adjust file size to correct value. */
831 static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,       /* Writing position offset */
832                                                  size_t num_pages,      /* Number of pages to write */
833                                                  size_t write_bytes,    /* number of bytes to write */
834                                                  struct page **prepared_pages   /* list of pages */
835     )
836 {
837         int status;             // return status of block_commit_write.
838         int retval = 0;         // Return value we are going to return.
839         int i;                  // loop counter
840         int offset;             // Writing offset in page.
841         int orig_write_bytes = write_bytes;
842         int sd_update = 0;
843
844         for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
845              i++, offset = 0) {
846                 int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);  // How much of bytes to write to this page
847                 struct page *page = prepared_pages[i];  // Current page we process.
848
849                 status =
850                     reiserfs_commit_page(inode, page, offset, offset + count);
851                 if (status)
852                         retval = status;        // To not overcomplicate matters We are going to
853                 // submit all the pages even if there was error.
854                 // we only remember error status to report it on
855                 // exit.
856                 write_bytes -= count;
857         }
858         /* now that we've gotten all the ordered buffers marked dirty,
859          * we can safely update i_size and close any running transaction
860          */
861         if (pos + orig_write_bytes > inode->i_size) {
862                 inode->i_size = pos + orig_write_bytes; // Set new size
863                 /* If the file have grown so much that tail packing is no
864                  * longer possible, reset "need to pack" flag */
865                 if ((have_large_tails(inode->i_sb) &&
866                      inode->i_size > i_block_size(inode) * 4) ||
867                     (have_small_tails(inode->i_sb) &&
868                      inode->i_size > i_block_size(inode)))
869                         REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
870                 else if ((have_large_tails(inode->i_sb) &&
871                           inode->i_size < i_block_size(inode) * 4) ||
872                          (have_small_tails(inode->i_sb) &&
873                           inode->i_size < i_block_size(inode)))
874                         REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
875
876                 if (th->t_trans_id) {
877                         reiserfs_write_lock(inode->i_sb);
878                         // this sets the proper flags for O_SYNC to trigger a commit
879                         mark_inode_dirty(inode);
880                         reiserfs_write_unlock(inode->i_sb);
881                 } else {
882                         reiserfs_write_lock(inode->i_sb);
883                         reiserfs_update_inode_transaction(inode);
884                         mark_inode_dirty(inode);
885                         reiserfs_write_unlock(inode->i_sb);
886                 }
887
888                 sd_update = 1;
889         }
890         if (th->t_trans_id) {
891                 reiserfs_write_lock(inode->i_sb);
892                 if (!sd_update)
893                         mark_inode_dirty(inode);
894                 status = journal_end(th, th->t_super, th->t_blocks_allocated);
895                 if (status)
896                         retval = status;
897                 reiserfs_write_unlock(inode->i_sb);
898         }
899         th->t_trans_id = 0;
900
901         /* 
902          * we have to unlock the pages after updating i_size, otherwise
903          * we race with writepage
904          */
905         for (i = 0; i < num_pages; i++) {
906                 struct page *page = prepared_pages[i];
907                 unlock_page(page);
908                 mark_page_accessed(page);
909                 page_cache_release(page);
910         }
911         return retval;
912 }
913
914 /* Look if passed writing region is going to touch file's tail
915    (if it is present). And if it is, convert the tail to unformatted node */
916 static int reiserfs_check_for_tail_and_convert(struct inode *inode,     /* inode to deal with */
917                                                loff_t pos,      /* Writing position */
918                                                int write_bytes  /* amount of bytes to write */
919     )
920 {
921         INITIALIZE_PATH(path);  // needed for search_for_position
922         struct cpu_key key;     // Key that would represent last touched writing byte.
923         struct item_head *ih;   // item header of found block;
924         int res;                // Return value of various functions we call.
925         int cont_expand_offset; // We will put offset for generic_cont_expand here
926         // This can be int just because tails are created
927         // only for small files.
928
929 /* this embodies a dependency on a particular tail policy */
930         if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
931                 /* such a big files do not have tails, so we won't bother ourselves
932                    to look for tails, simply return */
933                 return 0;
934         }
935
936         reiserfs_write_lock(inode->i_sb);
937         /* find the item containing the last byte to be written, or if
938          * writing past the end of the file then the last item of the
939          * file (and then we check its type). */
940         make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
941                      3 /*key length */ );
942         res = search_for_position_by_key(inode->i_sb, &key, &path);
943         if (res == IO_ERROR) {
944                 reiserfs_write_unlock(inode->i_sb);
945                 return -EIO;
946         }
947         ih = get_ih(&path);
948         res = 0;
949         if (is_direct_le_ih(ih)) {
950                 /* Ok, closest item is file tail (tails are stored in "direct"
951                  * items), so we need to unpack it. */
952                 /* To not overcomplicate matters, we just call generic_cont_expand
953                    which will in turn call other stuff and finally will boil down to
954                    reiserfs_get_block() that would do necessary conversion. */
955                 cont_expand_offset =
956                     le_key_k_offset(get_inode_item_key_version(inode),
957                                     &(ih->ih_key));
958                 pathrelse(&path);
959                 res = generic_cont_expand(inode, cont_expand_offset);
960         } else
961                 pathrelse(&path);
962
963         reiserfs_write_unlock(inode->i_sb);
964         return res;
965 }
966
967 /* This function locks pages starting from @pos for @inode.
968    @num_pages pages are locked and stored in
969    @prepared_pages array. Also buffers are allocated for these pages.
970    First and last page of the region is read if it is overwritten only
971    partially. If last page did not exist before write (file hole or file
972    append), it is zeroed, then. 
973    Returns number of unallocated blocks that should be allocated to cover
974    new file data.*/
975 static int reiserfs_prepare_file_region_for_write(struct inode *inode
976                                                   /* Inode of the file */ ,
977                                                   loff_t pos,   /* position in the file */
978                                                   size_t num_pages,     /* number of pages to
979                                                                            prepare */
980                                                   size_t write_bytes,   /* Amount of bytes to be
981                                                                            overwritten from
982                                                                            @pos */
983                                                   struct page **prepared_pages  /* pointer to array
984                                                                                    where to store
985                                                                                    prepared pages */
986     )
987 {
988         int res = 0;            // Return values of different functions we call.
989         unsigned long index = pos >> PAGE_CACHE_SHIFT;  // Offset in file in pages.
990         int from = (pos & (PAGE_CACHE_SIZE - 1));       // Writing offset in first page
991         int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
992         /* offset of last modified byte in last
993            page */
994         struct address_space *mapping = inode->i_mapping;       // Pages are mapped here.
995         int i;                  // Simple counter
996         int blocks = 0;         /* Return value (blocks that should be allocated) */
997         struct buffer_head *bh, *head;  // Current bufferhead and first bufferhead
998         // of a page.
999         unsigned block_start, block_end;        // Starting and ending offsets of current
1000         // buffer in the page.
1001         struct buffer_head *wait[2], **wait_bh = wait;  // Buffers for page, if
1002         // Page appeared to be not up
1003         // to date. Note how we have
1004         // at most 2 buffers, this is
1005         // because we at most may
1006         // partially overwrite two
1007         // buffers for one page. One at                                                 // the beginning of write area
1008         // and one at the end.
1009         // Everything inthe middle gets                                                 // overwritten totally.
1010
1011         struct cpu_key key;     // cpu key of item that we are going to deal with
1012         struct item_head *ih = NULL;    // pointer to item head that we are going to deal with
1013         struct buffer_head *itembuf = NULL;     // Buffer head that contains items that we are going to deal with
1014         INITIALIZE_PATH(path);  // path to item, that we are going to deal with.
1015         __le32 *item = NULL;    // pointer to item we are going to deal with
1016         int item_pos = -1;      /* Position in indirect item */
1017
1018         if (num_pages < 1) {
1019                 reiserfs_warning(inode->i_sb,
1020                                  "green-9001: reiserfs_prepare_file_region_for_write "
1021                                  "called with zero number of pages to process");
1022                 return -EFAULT;
1023         }
1024
1025         /* We have 2 loops for pages. In first loop we grab and lock the pages, so
1026            that nobody would touch these until we release the pages. Then
1027            we'd start to deal with mapping buffers to blocks. */
1028         for (i = 0; i < num_pages; i++) {
1029                 prepared_pages[i] = grab_cache_page(mapping, index + i);        // locks the page
1030                 if (!prepared_pages[i]) {
1031                         res = -ENOMEM;
1032                         goto failed_page_grabbing;
1033                 }
1034                 if (!page_has_buffers(prepared_pages[i]))
1035                         create_empty_buffers(prepared_pages[i],
1036                                              inode->i_sb->s_blocksize, 0);
1037         }
1038
1039         /* Let's count amount of blocks for a case where all the blocks
1040            overwritten are new (we will substract already allocated blocks later) */
1041         if (num_pages > 2)
1042                 /* These are full-overwritten pages so we count all the blocks in
1043                    these pages are counted as needed to be allocated */
1044                 blocks =
1045                     (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1046
1047         /* count blocks needed for first page (possibly partially written) */
1048         blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));   /* roundup */
1049
1050         /* Now we account for last page. If last page == first page (we
1051            overwrite only one page), we substract all the blocks past the
1052            last writing position in a page out of already calculated number
1053            of blocks */
1054         blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
1055             ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
1056         /* Note how we do not roundup here since partial blocks still
1057            should be allocated */
1058
1059         /* Now if all the write area lies past the file end, no point in
1060            maping blocks, since there is none, so we just zero out remaining
1061            parts of first and last pages in write area (if needed) */
1062         if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
1063                 if (from != 0) {        /* First page needs to be partially zeroed */
1064                         char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
1065                         memset(kaddr, 0, from);
1066                         kunmap_atomic(kaddr, KM_USER0);
1067                         flush_dcache_page(prepared_pages[0]);
1068                 }
1069                 if (to != PAGE_CACHE_SIZE) {    /* Last page needs to be partially zeroed */
1070                         char *kaddr =
1071                             kmap_atomic(prepared_pages[num_pages - 1],
1072                                         KM_USER0);
1073                         memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
1074                         kunmap_atomic(kaddr, KM_USER0);
1075                         flush_dcache_page(prepared_pages[num_pages - 1]);
1076                 }
1077
1078                 /* Since all blocks are new - use already calculated value */
1079                 return blocks;
1080         }
1081
1082         /* Well, since we write somewhere into the middle of a file, there is
1083            possibility we are writing over some already allocated blocks, so
1084            let's map these blocks and substract number of such blocks out of blocks
1085            we need to allocate (calculated above) */
1086         /* Mask write position to start on blocksize, we do it out of the
1087            loop for performance reasons */
1088         pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
1089         /* Set cpu key to the starting position in a file (on left block boundary) */
1090         make_cpu_key(&key, inode,
1091                      1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
1092                      TYPE_ANY, 3 /*key length */ );
1093
1094         reiserfs_write_lock(inode->i_sb);       // We need that for at least search_by_key()
1095         for (i = 0; i < num_pages; i++) {
1096
1097                 head = page_buffers(prepared_pages[i]);
1098                 /* For each buffer in the page */
1099                 for (bh = head, block_start = 0; bh != head || !block_start;
1100                      block_start = block_end, bh = bh->b_this_page) {
1101                         if (!bh)
1102                                 reiserfs_panic(inode->i_sb,
1103                                                "green-9002: Allocated but absent buffer for a page?");
1104                         /* Find where this buffer ends */
1105                         block_end = block_start + inode->i_sb->s_blocksize;
1106                         if (i == 0 && block_end <= from)
1107                                 /* if this buffer is before requested data to map, skip it */
1108                                 continue;
1109
1110                         if (i == num_pages - 1 && block_start >= to) {
1111                                 /* If this buffer is after requested data to map, abort
1112                                    processing of current page */
1113                                 break;
1114                         }
1115
1116                         if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1117                                 /* This is optimisation for a case where buffer is mapped
1118                                    and have blocknumber assigned. In case significant amount
1119                                    of such buffers are present, we may avoid some amount
1120                                    of search_by_key calls.
1121                                    Probably it would be possible to move parts of this code
1122                                    out of BKL, but I afraid that would overcomplicate code
1123                                    without any noticeable benefit.
1124                                  */
1125                                 item_pos++;
1126                                 /* Update the key */
1127                                 set_cpu_key_k_offset(&key,
1128                                                      cpu_key_k_offset(&key) +
1129                                                      inode->i_sb->s_blocksize);
1130                                 blocks--;       // Decrease the amount of blocks that need to be
1131                                 // allocated
1132                                 continue;       // Go to the next buffer
1133                         }
1134
1135                         if (!itembuf || /* if first iteration */
1136                             item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {        /* or if we progressed past the
1137                                                                                    current unformatted_item */
1138                                 /* Try to find next item */
1139                                 res =
1140                                     search_for_position_by_key(inode->i_sb,
1141                                                                &key, &path);
1142                                 /* Abort if no more items */
1143                                 if (res != POSITION_FOUND) {
1144                                         /* make sure later loops don't use this item */
1145                                         itembuf = NULL;
1146                                         item = NULL;
1147                                         break;
1148                                 }
1149
1150                                 /* Update information about current indirect item */
1151                                 itembuf = get_last_bh(&path);
1152                                 ih = get_ih(&path);
1153                                 item = get_item(&path);
1154                                 item_pos = path.pos_in_item;
1155
1156                                 RFALSE(!is_indirect_le_ih(ih),
1157                                        "green-9003: indirect item expected");
1158                         }
1159
1160                         /* See if there is some block associated with the file
1161                            at that position, map the buffer to this block */
1162                         if (get_block_num(item, item_pos)) {
1163                                 map_bh(bh, inode->i_sb,
1164                                        get_block_num(item, item_pos));
1165                                 blocks--;       // Decrease the amount of blocks that need to be
1166                                 // allocated
1167                         }
1168                         item_pos++;
1169                         /* Update the key */
1170                         set_cpu_key_k_offset(&key,
1171                                              cpu_key_k_offset(&key) +
1172                                              inode->i_sb->s_blocksize);
1173                 }
1174         }
1175         pathrelse(&path);       // Free the path
1176         reiserfs_write_unlock(inode->i_sb);
1177
1178         /* Now zero out unmapped buffers for the first and last pages of
1179            the write area, or issue read requests if the buffer is mapped. */
1180         /* First page, see if it is not uptodate */
1181         if (!PageUptodate(prepared_pages[0])) {
1182                 head = page_buffers(prepared_pages[0]);
1183
1184                 /* For each buffer in page */
1185                 for (bh = head, block_start = 0; bh != head || !block_start;
1186                      block_start = block_end, bh = bh->b_this_page) {
1187
1188                         if (!bh)
1189                                 reiserfs_panic(inode->i_sb,
1190                                                "green-9002: Allocated but absent buffer for a page?");
1191                         /* Find where this buffer ends */
1192                         block_end = block_start + inode->i_sb->s_blocksize;
1193                         if (block_end <= from)
1194                                 /* if this buffer is before requested data to map, skip it */
1195                                 continue;
1196                         if (block_start < from) {       /* Aha, our partial buffer */
1197                                 if (buffer_mapped(bh)) {        /* If it is mapped, we need to
1198                                                                    issue READ request for it to
1199                                                                    not lose data */
1200                                         ll_rw_block(READ, 1, &bh);
1201                                         *wait_bh++ = bh;
1202                                 } else {        /* Not mapped, zero it */
1203                                         char *kaddr =
1204                                             kmap_atomic(prepared_pages[0],
1205                                                         KM_USER0);
1206                                         memset(kaddr + block_start, 0,
1207                                                from - block_start);
1208                                         kunmap_atomic(kaddr, KM_USER0);
1209                                         flush_dcache_page(prepared_pages[0]);
1210                                         set_buffer_uptodate(bh);
1211                                 }
1212                         }
1213                 }
1214         }
1215
1216         /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
1217         if (!PageUptodate(prepared_pages[num_pages - 1]) ||
1218             ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
1219             (inode->i_size >> PAGE_CACHE_SHIFT)) {
1220                 head = page_buffers(prepared_pages[num_pages - 1]);
1221
1222                 /* for each buffer in page */
1223                 for (bh = head, block_start = 0; bh != head || !block_start;
1224                      block_start = block_end, bh = bh->b_this_page) {
1225
1226                         if (!bh)
1227                                 reiserfs_panic(inode->i_sb,
1228                                                "green-9002: Allocated but absent buffer for a page?");
1229                         /* Find where this buffer ends */
1230                         block_end = block_start + inode->i_sb->s_blocksize;
1231                         if (block_start >= to)
1232                                 /* if this buffer is after requested data to map, skip it */
1233                                 break;
1234                         if (block_end > to) {   /* Aha, our partial buffer */
1235                                 if (buffer_mapped(bh)) {        /* If it is mapped, we need to
1236                                                                    issue READ request for it to
1237                                                                    not lose data */
1238                                         ll_rw_block(READ, 1, &bh);
1239                                         *wait_bh++ = bh;
1240                                 } else {        /* Not mapped, zero it */
1241                                         char *kaddr =
1242                                             kmap_atomic(prepared_pages
1243                                                         [num_pages - 1],
1244                                                         KM_USER0);
1245                                         memset(kaddr + to, 0, block_end - to);
1246                                         kunmap_atomic(kaddr, KM_USER0);
1247                                         flush_dcache_page(prepared_pages[num_pages - 1]);
1248                                         set_buffer_uptodate(bh);
1249                                 }
1250                         }
1251                 }
1252         }
1253
1254         /* Wait for read requests we made to happen, if necessary */
1255         while (wait_bh > wait) {
1256                 wait_on_buffer(*--wait_bh);
1257                 if (!buffer_uptodate(*wait_bh)) {
1258                         res = -EIO;
1259                         goto failed_read;
1260                 }
1261         }
1262
1263         return blocks;
1264       failed_page_grabbing:
1265         num_pages = i;
1266       failed_read:
1267         reiserfs_unprepare_pages(prepared_pages, num_pages);
1268         return res;
1269 }
1270
1271 /* Write @count bytes at position @ppos in a file indicated by @file
1272    from the buffer @buf.  
1273
1274    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1275    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1276    written for (ext2/3).  This is for several reasons:
1277
1278    * It has no understanding of any filesystem specific optimizations.
1279
1280    * It enters the filesystem repeatedly for each page that is written.
1281
1282    * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
1283    * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
1284    * to reiserfs which allows for fewer tree traversals.
1285
1286    * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1287
1288    * Asking the block allocation code for blocks one at a time is slightly less efficient.
1289
1290    All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1291    use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
1292    things right finally.
1293
1294    Future Features: providing search_by_key with hints.
1295
        Outline of the implementation below:

        1. For inodes whose item key version is KEY_FORMAT_3_5 the write is clipped so that it does not
           cross MAX_NON_LFS; a write starting at or beyond that limit raises SIGXFSZ and fails with -EFBIG.
        2. O_DIRECT files are handed to do_sync_write() and its result is returned as is.
        3. Otherwise, with inode->i_mutex held, the data is written in chunks of at most
           REISERFS_WRITE_PAGES_AT_A_TIME pages. For each chunk: claim blocks, prepare (lock/read) the pages,
           allocate the blocks that are really missing, copy the user data, submit the pages. *ppos is advanced
           only after a chunk has been submitted successfully.

        Return value:
        - -EFBIG, -EINVAL (count negative when viewed as ssize_t) or -EFAULT (access_ok() failed) from the
          early checks, or whatever do_sync_write() returns for O_DIRECT;
        - via the "out" label: the result of generic_write_checks(), 0 for a zero-length write, or the error of
          remove_suid(), reiserfs_check_for_tail_and_convert() or journal_end();
        - otherwise the number of bytes written if that is non-zero, else the last value of res.

1296 */
1297 static ssize_t reiserfs_file_write(struct file *file,   /* the file we are going to write into */
1298                                    const char __user * buf,     /*  pointer to user supplied data
1299                                                                    (in userspace) */
1300                                    size_t count,        /* amount of bytes to write */
1301                                    loff_t * ppos        /* pointer to position in file that we start writing at. Should be updated to
1302                                                          * new current position before returning. */
1303                                    )
1304 {
1305         size_t already_written = 0;     // Number of bytes already written to the file.
1306         loff_t pos;             // Current position in the file.
1307         ssize_t res;            // return value of various functions that we call.
1308         int err = 0;
1309         struct inode *inode = file->f_path.dentry->d_inode;     // Inode of the file that we are writing to.
1310         /* To simplify coding at this time, we store
1311            locked pages in array for now */
1312         struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1313         struct reiserfs_transaction_handle th;
             /* A zero t_trans_id is used as the "no transaction pending" marker
                that is tested after the write loop. */
1314         th.t_trans_id = 0;
1315
1316         /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
1317         * lying around (most of the disk, in fact). Despite the filesystem
1318         * now being a v3.6 format, the old items still can't support large
1319         * file sizes. Catch this case here, as the rest of the VFS layer is
1320         * oblivious to the different limitations between old and new items.
1321         * reiserfs_setattr catches this for truncates. This chunk is lifted
1322         * from generic_write_checks. */
1323         if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1324             *ppos + count > MAX_NON_LFS) {
1325                 if (*ppos >= MAX_NON_LFS) {
                             /* Already at/after the limit: nothing can be written. */
1326                         send_sig(SIGXFSZ, current, 0);
1327                         return -EFBIG;
1328                 }
                     /* Otherwise shorten the write so that it ends at the limit. */
1329                 if (count > MAX_NON_LFS - (unsigned long)*ppos)
1330                         count = MAX_NON_LFS - (unsigned long)*ppos;
1331         }
1332
             /* Direct I/O does not use the page-batching path below. */
1333         if (file->f_flags & O_DIRECT)
1334                 return do_sync_write(file, buf, count, ppos);
1335
             /* The byte count is returned as ssize_t, so it must not look negative. */
1336         if (unlikely((ssize_t) count < 0))
1337                 return -EINVAL;
1338
1339         if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1340                 return -EFAULT;
1341
1342         mutex_lock(&inode->i_mutex);    // locks the entire file for just us
1343
1344         pos = *ppos;
1345
1346         /* Check if we can write to specified region of file, file
1347            is not overly big and this kind of stuff. Adjust pos and
1348            count, if needed */
1349         res = generic_write_checks(file, &pos, &count, 0);
1350         if (res)
1351                 goto out;
1352
             /* res is 0 at this point, so a zero-length write returns 0. */
1353         if (count == 0)
1354                 goto out;
1355
1356         res = remove_suid(file->f_path.dentry);
1357         if (res)
1358                 goto out;
1359
1360         file_update_time(file);
1361
1362         // Ok, we are done with all the checks.
1363
1364         // Now we should start real work
1365
1366         /* If we are going to write past the file's packed tail or if we are going
1367            to overwrite part of the tail, we need that tail to be converted into
1368            unformatted node */
1369         res = reiserfs_check_for_tail_and_convert(inode, pos, count);
1370         if (res)
1371                 goto out;
1372
1373         while (count > 0) {
1374                 /* This is the main loop; it runs until some error occurs
1375                    or until we write all of the data. */
1376                 size_t num_pages;       /* amount of pages we are going to write this iteration */
1377                 size_t write_bytes;     /* amount of bytes to write during this iteration */
1378                 size_t blocks_to_allocate;      /* how many blocks we need to allocate for this iteration */
1379
1380                 /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
1381                 num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
1382                                                                            pages */
1383                     ((count +
1384                       (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
1385                 /* convert size to amount of
1386                    pages */
                     /* The free-space query and the block claim below are done
                        together under the reiserfs write lock. */
1387                 reiserfs_write_lock(inode->i_sb);
1388                 if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1389                     || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
1390                         /* If we were asked to write more data than we want to or if there
1391                            is not that much space, then we shorten amount of data to write
1392                            for this iteration. */
1393                         num_pages =
1394                             min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
1395                                   reiserfs_can_fit_pages(inode->i_sb));
1396                         /* Also we should not forget to set size in bytes accordingly */
1397                         write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1398                             (pos & (PAGE_CACHE_SIZE - 1));
1399                         /* If position is not on the
1400                            start of the page, we need
1401                            to subtract the offset
1402                            within page */
                             /* If num_pages came out as 0 this value is not used:
                                the !num_pages branch below either breaks out or
                                recomputes write_bytes. */
1403                 } else
1404                         write_bytes = count;
1405
1406                 /* reserve the blocks to be allocated later, so that later on
1407                    we still have the space to write the blocks to */
1408                 reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1409                                                       num_pages <<
1410                                                       (PAGE_CACHE_SHIFT -
1411                                                        inode->i_blkbits));
1412                 reiserfs_write_unlock(inode->i_sb);
1413
1414                 if (!num_pages) {       /* If we do not have enough space even for a single page... */
1415                         if (pos >
1416                             inode->i_size + inode->i_sb->s_blocksize -
1417                             (pos & (inode->i_sb->s_blocksize - 1))) {
1418                                 res = -ENOSPC;
1419                                 break;  // In case we are writing past the end of the last file block, break.
1420                         }
1421                         // Otherwise we are possibly overwriting the file, so
1422                         // let's set write size to be equal or less than blocksize.
1423                         // This way we get it correctly for file holes.
1424                         // But overwriting files on absolutely full volumes would not
1425                         // be very efficient. Well, people are not supposed to fill
1426                         // 100% of disk space anyway.
1427                         write_bytes =
1428                             min_t(size_t, count,
1429                                   inode->i_sb->s_blocksize -
1430                                   (pos & (inode->i_sb->s_blocksize - 1)));
1431                         num_pages = 1;
1432                         // No blocks were claimed before, so do it now.
1433                         reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1434                                                               1 <<
1435                                                               (PAGE_CACHE_SHIFT
1436                                                                -
1437                                                                inode->
1438                                                                i_blkbits));
1439                 }
1440
1441                 /* Prepare for writing into the region, read in all the
1442                    partially overwritten pages, if needed. And lock the pages,
1443                    so that nobody else can access these until we are done.
1444                    We get number of actual blocks needed as a result. */
1445                 res = reiserfs_prepare_file_region_for_write(inode, pos,
1446                                                              num_pages,
1447                                                              write_bytes,
1448                                                              prepared_pages);
1449                 if (res < 0) {
                             /* Nothing was prepared: give back the whole claim
                                made for this iteration. */
1450                         reiserfs_release_claimed_blocks(inode->i_sb,
1451                                                         num_pages <<
1452                                                         (PAGE_CACHE_SHIFT -
1453                                                          inode->i_blkbits));
1454                         break;
1455                 }
1456
1457                 blocks_to_allocate = res;
1458
1459                 /* First we correct our estimate of how many blocks we need */
                     /* NOTE(review): this uses i_sb->s_blocksize_bits while the claim
                        above used inode->i_blkbits; presumably both hold the same
                        value -- confirm. */
1460                 reiserfs_release_claimed_blocks(inode->i_sb,
1461                                                 (num_pages <<
1462                                                  (PAGE_CACHE_SHIFT -
1463                                                   inode->i_sb->
1464                                                   s_blocksize_bits)) -
1465                                                 blocks_to_allocate);
1466
                     /* When no block is missing, res still equals blocks_to_allocate,
                        i.e. 0, so the "if (res)" error check below is not triggered. */
1467                 if (blocks_to_allocate > 0) {   /*We only allocate blocks if we need to */
1468                         /* Fill in all the possible holes and append the file if needed */
1469                         res =
1470                             reiserfs_allocate_blocks_for_region(&th, inode, pos,
1471                                                                 num_pages,
1472                                                                 write_bytes,
1473                                                                 prepared_pages,
1474                                                                 blocks_to_allocate);
1475                 }
1476
1477                 /* well, we have allocated the blocks, so it is time to free
1478                    the reservation we made earlier. */
1479                 reiserfs_release_claimed_blocks(inode->i_sb,
1480                                                 blocks_to_allocate);
1481                 if (res) {
1482                         reiserfs_unprepare_pages(prepared_pages, num_pages);
1483                         break;
1484                 }
1485
1486 /* NOTE that allocating blocks and filling blocks can be done in reverse order
1487    and probably we would do that just to get rid of garbage in files after a
1488    crash */
1489
1490                 /* Copy data from user-supplied buffer to file's pages */
1491                 res =
1492                     reiserfs_copy_from_user_to_file_region(pos, num_pages,
1493                                                            write_bytes,
1494                                                            prepared_pages, buf);
1495                 if (res) {
1496                         reiserfs_unprepare_pages(prepared_pages, num_pages);
1497                         break;
1498                 }
1499
1500                 /* Send the pages to disk and unlock them. */
1501                 res =
1502                     reiserfs_submit_file_region_for_write(&th, inode, pos,
1503                                                           num_pages,
1504                                                           write_bytes,
1505                                                           prepared_pages);
                     /* NOTE(review): unlike the two failure paths above, the pages are
                        not unprepared here; presumably the submit helper releases
                        them itself even on failure -- confirm. */
1506                 if (res)
1507                         break;
1508
                     /* The chunk went through: advance the counters, the user buffer
                        pointer and the caller-visible file position. */
1509                 already_written += write_bytes;
1510                 buf += write_bytes;
1511                 *ppos = pos += write_bytes;
1512                 count -= write_bytes;
1513                 balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
1514         }
1515
1516         /* this is only true on error */
             /* t_trans_id was zeroed on entry, so a non-zero value means one of the
                helpers that were given &th left a transaction behind (presumably
                after failing part-way); it is ended here under the write lock.
                NOTE(review): if journal_end() fails, its error is returned through
                "out" even when already_written is non-zero. */
1517         if (th.t_trans_id) {
1518                 reiserfs_write_lock(inode->i_sb);
1519                 err = journal_end(&th, th.t_super, th.t_blocks_allocated);
1520                 reiserfs_write_unlock(inode->i_sb);
1521                 if (err) {
1522                         res = err;
1523                         goto out;
1524                 }
1525         }
1526
             /* Synchronous files: flush data and metadata now, but only when the
                loop did not end with an error.
                NOTE(review): the return statement below prefers already_written
                over res, so a failure reported by generic_osync_inode() reaches
                the caller only when no bytes were written. */
1527         if (likely(res >= 0) &&
1528             (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))))
1529                 res = generic_osync_inode(inode, file->f_mapping,
1530                                           OSYNC_METADATA | OSYNC_DATA);
1531
1532         mutex_unlock(&inode->i_mutex);
1533         reiserfs_async_progress_wait(inode->i_sb);
             /* A partial write reports the bytes written rather than the error. */
1534         return (already_written != 0) ? already_written : res;
1535
             /* Exit path for early failures, zero-length writes and journal_end()
                errors: drops i_mutex and returns res without waiting on async
                progress. */
1536       out:
1537         mutex_unlock(&inode->i_mutex);  // unlock the file on exit.
1538         return res;
1539 }
1540
/*
 * File method table used for reiserfs files.
 *
 * Of the entries below, .write (reiserfs_file_write) and .release
 * (reiserfs_file_release) are the handlers visible in this file excerpt; the
 * other reiserfs_* handlers are defined outside of it.  The read, aio,
 * sendfile, splice and open entries use do_sync_read / generic_* helpers.
 *
 * NOTE(review): .write and .aio_write point at different implementations
 * (reiserfs_file_write vs. generic_file_aio_write), so the checks done at the
 * top of reiserfs_file_write presumably do not apply to callers that enter
 * through .aio_write directly -- confirm.
 */
1541 const struct file_operations reiserfs_file_operations = {
1542         .read = do_sync_read,
1543         .write = reiserfs_file_write,
1544         .ioctl = reiserfs_ioctl,
     /* the compat ioctl handler is only wired up when CONFIG_COMPAT is set */
1545 #ifdef CONFIG_COMPAT
1546         .compat_ioctl = reiserfs_compat_ioctl,
1547 #endif
1548         .mmap = reiserfs_file_mmap,
1549         .open = generic_file_open,
1550         .release = reiserfs_file_release,
1551         .fsync = reiserfs_sync_file,
1552         .sendfile = generic_file_sendfile,
1553         .aio_read = generic_file_aio_read,
1554         .aio_write = generic_file_aio_write,
1555         .splice_read = generic_file_splice_read,
1556         .splice_write = generic_file_splice_write,
1557 };
1558
/*
 * Inode method table used for reiserfs files: truncate, attribute changes,
 * the four extended-attribute entry points (set/get/list/remove) and the
 * permission check.  All handlers are reiserfs_* functions defined outside
 * this excerpt.  The table is declared const, so it is never modified after
 * initialisation.
 */
1559 const struct inode_operations reiserfs_file_inode_operations = {
1560         .truncate = reiserfs_vfs_truncate_file,
1561         .setattr = reiserfs_setattr,
1562         .setxattr = reiserfs_setxattr,
1563         .getxattr = reiserfs_getxattr,
1564         .listxattr = reiserfs_listxattr,
1565         .removexattr = reiserfs_removexattr,
1566         .permission = reiserfs_permission,
1567 };