Merge commit 'v2.6.32' into reiserfs/kill-bkl
author Frederic Weisbecker <fweisbec@gmail.com>
Mon, 7 Dec 2009 06:28:35 +0000 (07:28 +0100)
committer Frederic Weisbecker <fweisbec@gmail.com>
Mon, 7 Dec 2009 06:29:22 +0000 (07:29 +0100)
Merge-reason: The tree was based on 2.6.31. It's better to be up to date
with 2.6.32. Although no conflicting changes were made in between,
it gives benchmarking results closer to the latest kernel behaviour.

18 files changed:
fs/reiserfs/Makefile
fs/reiserfs/bitmap.c
fs/reiserfs/dir.c
fs/reiserfs/do_balan.c
fs/reiserfs/file.c
fs/reiserfs/fix_node.c
fs/reiserfs/inode.c
fs/reiserfs/ioctl.c
fs/reiserfs/journal.c
fs/reiserfs/lock.c [new file with mode: 0644]
fs/reiserfs/namei.c
fs/reiserfs/prints.c
fs/reiserfs/resize.c
fs/reiserfs/stree.c
fs/reiserfs/super.c
fs/reiserfs/xattr.c
include/linux/reiserfs_fs.h
include/linux/reiserfs_fs_sb.h

index 7c5ab6330dd6bbe4edbd1f6b62a3a1d5a3f6b645..6a9e30c041dda2a52a44bbe14d9aa664dae14bb7 100644 (file)
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
                 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
                 hashes.o tail_conversion.o journal.o resize.o \
-                item_ops.o ioctl.o procfs.o xattr.o
+                item_ops.o ioctl.o procfs.o xattr.o lock.o
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
index e716161ab325c8f246b33c11110a60fff351100f..685495707181c7f168ec280e69b72ba3a1bcc230 100644 (file)
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
 
+       reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
+       reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
        else {
                if (buffer_locked(bh)) {
                        PROC_INFO_INC(sb, scan_bitmap.wait);
+                       reiserfs_write_unlock(sb);
                        __wait_on_buffer(bh);
+                       reiserfs_write_lock(sb);
                }
                BUG_ON(!buffer_uptodate(bh));
                BUG_ON(atomic_read(&bh->b_count) == 0);
index 6d2668fdc3848eb5b2be29d027c5634c727148f6..c094f58c7448b06d1da88d87ebbbeb4e27dd7872 100644 (file)
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
        .read = generic_read_dir,
        .readdir = reiserfs_readdir,
        .fsync = reiserfs_dir_fsync,
-       .ioctl = reiserfs_ioctl,
+       .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
                                // user space buffer is swapped out. At that time
                                // entry can move to somewhere else
                                memcpy(local_buf, d_name, d_reclen);
+
+                               /*
+                                * Since filldir might sleep, we can release
+                                * the write lock here for other waiters
+                                */
+                               reiserfs_write_unlock(inode->i_sb);
                                if (filldir
                                    (dirent, local_buf, d_reclen, d_off, d_ino,
                                     DT_UNKNOWN) < 0) {
+                                       reiserfs_write_lock(inode->i_sb);
                                        if (local_buf != small_buf) {
                                                kfree(local_buf);
                                        }
                                        goto end;
                                }
+                               reiserfs_write_lock(inode->i_sb);
                                if (local_buf != small_buf) {
                                        kfree(local_buf);
                                }
index 128d3f7c8aa5a56a9ca6ad38487034dbdcaf9b92..60c08044066160e7b2289c265ab33d7ee00c3c96 100644 (file)
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 
-#ifdef CONFIG_REISERFS_CHECK
-
-struct tree_balance *cur_tb = NULL;    /* detects whether more than one
-                                          copy of tb exists as a means
-                                          of checking whether schedule
-                                          is interrupting do_balance */
-#endif
-
 static inline void buffer_info_init_left(struct tree_balance *tb,
                                          struct buffer_info *bi)
 {
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
 {
        int retval = 0;
 
-       if (cur_tb) {
+       if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
                               "occurred based on cur_tb not being null at "
                               "this point in code. do_balance cannot properly "
-                              "handle schedule occurring while it runs.");
+                              "handle concurrent tree accesses on a same "
+                              "mount point.");
        }
 
        /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
             "check");*/
        RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-       cur_tb = tb;
+       REISERFS_SB(tb->tb_sb)->cur_tb = tb;
 #endif
 }
 
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
 #ifdef CONFIG_REISERFS_CHECK
        check_leaf_level(tb);
        check_internal_levels(tb);
-       cur_tb = NULL;
+       REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
 #endif
 
        /* reiserfs_free_block is no longer schedule safe.  So, we need to
index 9f436668b7f816e6b40bad3759fb7f32268344c6..da2dba082e2d4e6ba5191bf938d134ca647d3877 100644 (file)
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file,       /* the file we are going t
 const struct file_operations reiserfs_file_operations = {
        .read = do_sync_read,
        .write = reiserfs_file_write,
-       .ioctl = reiserfs_ioctl,
+       .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
index 5e5a4e6fbaf8290d2bf91799d7e116390256ccf0..d2f31330dcae7ecd80a67a0eec638c941822bd2c 100644 (file)
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
        return needed_nodes;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Set parameters for balancing.
  * Performs write of results of analysis of balancing into structure tb,
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
        /* Check whether the common parent is locked. */
 
        if (buffer_locked(*pcom_father)) {
+
+               /* Release the write lock while the buffer is busy */
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(*pcom_father);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb)) {
                        brelse(*pcom_father);
                        return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
                return REPEAT_SEARCH;
 
        if (buffer_locked(bh)) {
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(bh);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
                                                                       FL[h]);
                son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+               reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+               reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                child_position =
                    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
                son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+               reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+               reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
                                    REPEAT_SEARCH : CARRY_ON;
                        }
 #endif
+                       reiserfs_write_unlock(tb->tb_sb);
                        __wait_on_buffer(locked);
+                       reiserfs_write_lock(tb->tb_sb);
                        if (FILESYSTEM_CHANGED_TB(tb))
                                return REPEAT_SEARCH;
                }
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
 
        /* if it possible in indirect_to_direct conversion */
        if (buffer_locked(tbS0)) {
+               reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(tbS0);
+               reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
 #ifdef CONFIG_REISERFS_CHECK
-       if (cur_tb) {
+       if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                print_cur_tb("fix_nodes");
                reiserfs_panic(tb->tb_sb, "PAP-8305",
                               "there is pending do_balance");
index a14d6cd9eeda2670251c0ec3377518912cb465a2..3a28e7751b3c714da6e3b2c0b7fe9ad89e82a87b 100644 (file)
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        struct cpu_key key;
        struct buffer_head *bh;
        struct item_head *ih, tmp_ih;
-       int fs_gen;
        b_blocknr_t blocknr;
        char *p = NULL;
        int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
 
-      research:
        result = search_for_position_by_key(inode->i_sb, &key, &path);
        if (result != POSITION_FOUND) {
                pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        }
        // read file tail into part of page
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
-       fs_gen = get_generation(inode->i_sb);
        copy_item_head(&tmp_ih, ih);
 
        /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
         ** sure we need to.  But, this means the item might move if
         ** kmap schedules
         */
-       if (!p) {
+       if (!p)
                p = (char *)kmap(bh_result->b_page);
-               if (fs_changed(fs_gen, inode->i_sb)
-                   && item_moved(&tmp_ih, &path)) {
-                       goto research;
-               }
-       }
+
        p += offset;
        memset(p, 0, inode->i_sb->s_blocksize);
        do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
           disappeared */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
-               lock_kernel();
+
+               reiserfs_write_lock(inode->i_sb);
+
                err = reiserfs_commit_for_inode(inode);
                REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-               unlock_kernel();
+
+               reiserfs_write_unlock(inode->i_sb);
+
                if (err < 0)
                        ret = err;
        }
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        __le32 *item;
        int done;
        int fs_gen;
+       int lock_depth;
        struct reiserfs_transaction_handle *th = NULL;
        /* space reserved in transaction batch:
           . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        loff_t new_offset =
            (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
 
-       /* bad.... */
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
        version = get_inode_item_key_version(inode);
 
        if (!file_capable(inode, block)) {
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return -EFBIG;
        }
 
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                /* find number of block-th logical block of the file */
                ret = _get_block_create_0(inode, block, bh_result,
                                          create | GET_BLOCK_READ_DIRECT);
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return ret;
        }
        /*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                if (!dangle && th)
                        retval = reiserfs_end_persistent_transaction(th);
 
-               reiserfs_write_unlock(inode->i_sb);
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 
                /* the item was found, so new blocks were not added to the file
                 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (blocks_needed == 1) {
                                un = &unf_single;
                        } else {
-                               un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);      // We need to avoid scheduling.
+                               un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
                                if (!un) {
                                        un = &unf_single;
                                        blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (retval)
                                goto failure;
                }
-               /* inserting indirect pointers for a hole can take a
-                ** long time.  reschedule if needed
+               /*
+                * inserting indirect pointers for a hole can take a
+                * long time.  reschedule if needed and also release the write
+                * lock for others.
                 */
-               cond_resched();
+               if (need_resched()) {
+                       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+                       schedule();
+                       lock_depth = reiserfs_write_lock_once(inode->i_sb);
+               }
 
                retval = search_for_position_by_key(inode->i_sb, &key, &path);
                if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        retval = err;
        }
 
-       reiserfs_write_unlock(inode->i_sb);
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        reiserfs_check_path(&path);
        return retval;
 }
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
        int error;
        struct buffer_head *bh = NULL;
        int err2;
+       int lock_depth;
 
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
                page_cache_release(page);
        }
 
-       reiserfs_write_unlock(inode->i_sb);
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
        return 0;
       out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
-       reiserfs_write_unlock(inode->i_sb);
+
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+
        return error;
 }
 
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
        int ret;
        int old_ref = 0;
 
+       reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+       reiserfs_write_lock(inode->i_sb);
+
        fix_tail_page_for_writing(page);
        if (reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th;
        unsigned start;
+       int lock_depth = 0;
+       bool locked = false;
 
        if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
                pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
         ** to do the i_size updates here.
         */
        pos += copied;
+
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-               reiserfs_write_lock(inode->i_sb);
+               lock_depth = reiserfs_write_lock_once(inode->i_sb);
+               locked = true;
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
                ret = journal_begin(&myth, inode->i_sb, 1);
-               if (ret) {
-                       reiserfs_write_unlock(inode->i_sb);
+               if (ret)
                        goto journal_error;
-               }
+
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
+               if (!locked) {
+                       lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                       locked = true;
+               }
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
 
       out:
+       if (locked)
+               reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        unlock_page(page);
        page_cache_release(page);
        return ret == 0 ? copied : ret;
 
       journal_error:
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+       locked = false;
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
        }
-
        goto out;
 }
 
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th = NULL;
 
+       reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+       reiserfs_write_lock(inode->i_sb);
+
        if (reiserfs_transaction_running(inode->i_sb)) {
                th = current->journal_info;
        }
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-               reiserfs_write_lock(inode->i_sb);
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 
                ret = journal_begin(&myth, inode->i_sb, 1);
-               if (ret) {
-                       reiserfs_write_unlock(inode->i_sb);
+               if (ret)
                        goto journal_error;
-               }
+
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 
       journal_error:
        if (th) {
-               reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-               reiserfs_write_unlock(inode->i_sb);
        }
 
        return ret;
index 0ccc3fdda7bfb7d5d00e59e8b26e74a0331e6d3c..ace77451ceb16d3280a94e4bebc1044e0c6920f3 100644 (file)
 #include <linux/compat.h>
 
 /*
-** reiserfs_ioctl - handler for ioctl for inode
-** supported commands:
-**  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
-**                           and prevent packing file (argument arg has to be non-zero)
-**  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
-**  3) That's all for a while ...
-*/
-int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-                  unsigned long arg)
+ * reiserfs_ioctl - handler for ioctl for inode
+ * supported commands:
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *                           and prevent packing file (argument arg has to be non-zero)
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  3) That's all for a while ...
+ */
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+       struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int err = 0;
 
+       reiserfs_write_lock(inode->i_sb);
+
        switch (cmd) {
        case REISERFS_IOC_UNPACK:
                if (S_ISREG(inode->i_mode)) {
                        if (arg)
-                               return reiserfs_unpack(inode, filp);
-                       else
-                               return 0;
+                               err = reiserfs_unpack(inode, filp);
                } else
-                       return -ENOTTY;
-               /* following two cases are taken from fs/ext2/ioctl.c by Remy
-                  Card (card@masi.ibp.fr) */
+                       err = -ENOTTY;
+               break;
+               /*
+                * following two cases are taken from fs/ext2/ioctl.c by Remy
+                * Card (card@masi.ibp.fr)
+                */
        case REISERFS_IOC_GETFLAGS:
-               if (!reiserfs_attrs(inode->i_sb))
-                       return -ENOTTY;
+               if (!reiserfs_attrs(inode->i_sb)) {
+                       err = -ENOTTY;
+                       break;
+               }
 
                flags = REISERFS_I(inode)->i_attrs;
                i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
-               return put_user(flags, (int __user *)arg);
+               err = put_user(flags, (int __user *)arg);
+               break;
        case REISERFS_IOC_SETFLAGS:{
-                       if (!reiserfs_attrs(inode->i_sb))
-                               return -ENOTTY;
+                       if (!reiserfs_attrs(inode->i_sb)) {
+                               err = -ENOTTY;
+                               break;
+                       }
 
                        err = mnt_want_write(filp->f_path.mnt);
                        if (err)
-                               return err;
+                               break;
 
                        if (!is_owner_or_cap(inode)) {
                                err = -EPERM;
@@ -90,16 +98,18 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        mark_inode_dirty(inode);
 setflags_out:
                        mnt_drop_write(filp->f_path.mnt);
-                       return err;
+                       break;
                }
        case REISERFS_IOC_GETVERSION:
-               return put_user(inode->i_generation, (int __user *)arg);
+               err = put_user(inode->i_generation, (int __user *)arg);
+               break;
        case REISERFS_IOC_SETVERSION:
                if (!is_owner_or_cap(inode))
-                       return -EPERM;
+                       err = -EPERM;
+                       break;
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
-                       return err;
+                       break;
                if (get_user(inode->i_generation, (int __user *)arg)) {
                        err = -EFAULT;
                        goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
                mark_inode_dirty(inode);
 setversion_out:
                mnt_drop_write(filp->f_path.mnt);
-               return err;
+               break;
        default:
-               return -ENOTTY;
+               err = -ENOTTY;
        }
+
+       reiserfs_write_unlock(inode->i_sb);
+
+       return err;
 }
 
 #ifdef CONFIG_COMPAT
 long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
 {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       int ret;
-
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
        default:
                return -ENOIOCTLCMD;
        }
-       lock_kernel();
-       ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-       unlock_kernel();
-       return ret;
+
+       return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
 
index 90622200b39c0622e0f159d423c929a036d76257..2f8a7e7b8dabf04b8641e7b4426b02b3bba8e570 100644 (file)
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
        clear_buffer_journal_restore_dirty(bh);
 }
 
-/* utility function to force a BUG if it is called without the big
-** kernel lock held.  caller is the string printed just before calling BUG()
-*/
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-#ifdef CONFIG_SMP
-       if (current->lock_depth < 0) {
-               reiserfs_panic(sb, "journal-1", "%s called without kernel "
-                              "lock held", caller);
-       }
-#else
-       ;
-#endif
-}
-
 /* return a cnode with same dev, block number and size in table, or null if not found */
 static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
                                                                  super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *sb)
 {
        PROC_INFO_INC(sb, journal.lock_journal);
-       mutex_lock(&SB_JOURNAL(sb)->j_mutex);
+
+       reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
 }
 
 /* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
                disable_barrier(s);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
+               reiserfs_write_unlock(s);
                sync_dirty_buffer(bh);
+               reiserfs_write_lock(s);
        }
 }
 
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 {
        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
-       if (atomic_read(&j->j_async_throttle))
+
+       if (atomic_read(&j->j_async_throttle)) {
+               reiserfs_write_unlock(s);
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
+               reiserfs_write_lock(s);
+       }
+
        return 0;
 }
 
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
        }
 
        /* make sure nobody is trying to flush this one at the same time */
-       mutex_lock(&jl->j_commit_mutex);
+       reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+
        if (!journal_list_still_alive(s, trans_id)) {
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
 
        if (!list_empty(&jl->j_bh_list)) {
                int ret;
-               unlock_kernel();
+
+               /*
+                * We might sleep in numerous places inside
+                * write_ordered_buffers. Relax the write lock.
+                */
+               reiserfs_write_unlock(s);
                ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                            journal, jl, &jl->j_bh_list);
                if (ret < 0 && retval == 0)
                        retval = ret;
-               lock_kernel();
+               reiserfs_write_lock(s);
        }
        BUG_ON(!list_empty(&jl->j_bh_list));
        /*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (tbh) {
-                       if (buffer_dirty(tbh))
-                           ll_rw_block(WRITE, 1, &tbh) ;
+                       if (buffer_dirty(tbh)) {
+                           reiserfs_write_unlock(s);
+                           ll_rw_block(WRITE, 1, &tbh);
+                           reiserfs_write_lock(s);
+                       }
                        put_bh(tbh) ;
                }
        }
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
+
+               reiserfs_write_unlock(s);
                wait_on_buffer(tbh);
+               reiserfs_write_lock(s);
                // since we're using ll_rw_blk above, it might have skipped over
                // a locked buffer.  Double check here
                //
-               if (buffer_dirty(tbh))  /* redundant, sync_dirty_buffer() checks */
+               /* redundant, sync_dirty_buffer() checks */
+               if (buffer_dirty(tbh)) {
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(tbh);
+                       reiserfs_write_lock(s);
+               }
                if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
                        reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
                        if (buffer_dirty(jl->j_commit_bh))
                                BUG();
                        mark_buffer_dirty(jl->j_commit_bh) ;
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(jl->j_commit_bh) ;
+                       reiserfs_write_lock(s);
                }
-       } else
+       } else {
+               reiserfs_write_unlock(s);
                wait_on_buffer(jl->j_commit_bh);
+               reiserfs_write_lock(s);
+       }
 
        check_barrier_completion(s, jl->j_commit_bh);
 
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
 
        if (trans_id >= journal->j_last_flush_trans_id) {
                if (buffer_locked((journal->j_header_bh))) {
+                       reiserfs_write_unlock(sb);
                        wait_on_buffer((journal->j_header_bh));
+                       reiserfs_write_lock(sb);
                        if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
                                reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
                                disable_barrier(sb);
                                goto sync;
                        }
+                       reiserfs_write_unlock(sb);
                        wait_on_buffer(journal->j_header_bh);
+                       reiserfs_write_lock(sb);
                        check_barrier_completion(sb, journal->j_header_bh);
                } else {
                      sync:
                        set_buffer_dirty(journal->j_header_bh);
+                       reiserfs_write_unlock(sb);
                        sync_dirty_buffer(journal->j_header_bh);
+                       reiserfs_write_lock(sb);
                }
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
 
        /* if flushall == 0, the lock is already held */
        if (flushall) {
-               mutex_lock(&journal->j_flush_mutex);
+               reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        } else if (mutex_trylock(&journal->j_flush_mutex)) {
                BUG();
        }
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
                                        reiserfs_panic(s, "journal-1011",
                                                       "cn->bh is NULL");
                                }
+
+                               reiserfs_write_unlock(s);
                                wait_on_buffer(cn->bh);
+                               reiserfs_write_lock(s);
+
                                if (!cn->bh) {
                                        reiserfs_panic(s, "journal-1012",
                                                       "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        chunk.nr = 0;
 
-       mutex_lock(&journal->j_flush_mutex);
+       reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        if (!journal_list_still_alive(s, orig_trans_id)) {
                goto done;
        }
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
        reiserfs_mounted_fs_count--;
        /* wait for all commits to finish */
        cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
+
+       /*
+        * We must release the write lock here because
+        * the workqueue job (flush_async_commit) needs this lock
+        */
+       reiserfs_write_unlock(sb);
        flush_workqueue(commit_wq);
+
        if (!reiserfs_mounted_fs_count) {
                destroy_workqueue(commit_wq);
                commit_wq = NULL;
        }
+       reiserfs_write_lock(sb);
 
        free_journal_ram(sb);
 
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
        /* read in the log blocks, memcpy to the corresponding real block */
        ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
        for (i = 0; i < get_desc_trans_len(desc); i++) {
+
+               reiserfs_write_unlock(sb);
                wait_on_buffer(log_blocks[i]);
+               reiserfs_write_lock(sb);
+
                if (!buffer_uptodate(log_blocks[i])) {
                        reiserfs_warning(sb, "journal-1212",
                                         "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
 
+       /*
+        * We need to unlock here to avoid creating the following
+        * dependency:
+        * reiserfs_lock -> sysfs_mutex
+        * Because the reiserfs mmap path creates the following dependency:
+        * mm->mmap -> reiserfs_lock, hence we have
+        * mm->mmap -> reiserfs_lock ->sysfs_mutex
+        * This would end up in a circular dependency with sysfs readdir path
+        * which does sysfs_mutex -> mm->mmap_sem
+        * This is fine because the reiserfs lock is useless in mount path,
+        * at least until we call journal_begin. We keep it for paranoid
+        * reasons.
+        */
+       reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+               reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
+       reiserfs_write_lock(sb);
 
        rs = SB_DISK_SUPER_BLOCK(sb);
 
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        }
 
        reiserfs_mounted_fs_count++;
-       if (reiserfs_mounted_fs_count <= 1)
+       if (reiserfs_mounted_fs_count <= 1) {
+               reiserfs_write_unlock(sb);
                commit_wq = create_workqueue("reiserfs");
+               reiserfs_write_lock(sb);
+       }
 
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&journal->j_join_wait, &wait);
        set_current_state(TASK_UNINTERRUPTIBLE);
-       if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
+       if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+               reiserfs_write_unlock(s);
                schedule();
+               reiserfs_write_lock(s);
+       }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&journal->j_join_wait, &wait);
 }
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        unsigned long bcount = journal->j_bcount;
        while (1) {
+               reiserfs_write_unlock(sb);
                schedule_timeout_uninterruptible(1);
+               reiserfs_write_lock(sb);
                journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
                while ((atomic_read(&journal->j_wcount) > 0 ||
                        atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 
        if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
                unlock_journal(sb);
+               reiserfs_write_unlock(sb);
                reiserfs_wait_on_write_block(sb);
+               reiserfs_write_lock(sb);
                PROC_INFO_INC(sb, journal.journal_relock_writers);
                goto relock;
        }
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
        struct reiserfs_journal_list *jl;
        struct list_head *entry;
 
-       lock_kernel();
+       reiserfs_write_lock(sb);
        if (!list_empty(&journal->j_journal_list)) {
                /* last entry is the youngest, commit it and you get everything */
                entry = journal->j_journal_list.prev;
                jl = JOURNAL_LIST_ENTRY(entry);
                flush_commit_list(sb, jl, 1);
        }
-       unlock_kernel();
+       reiserfs_write_unlock(sb);
 }
 
 /*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * the new transaction is fully setup, and we've already flushed the
         * ordered bh list
         */
-       mutex_lock(&jl->j_commit_mutex);
+       reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
 
        /* save the transaction id in case we need to commit it later */
        commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
                next = cn->next;
                free_cnode(sb, cn);
                cn = next;
+               reiserfs_write_unlock(sb);
                cond_resched();
+               reiserfs_write_lock(sb);
        }
 
        /* we are done  with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * is lost.
         */
        if (!list_empty(&jl->j_tail_bh_list)) {
-               unlock_kernel();
+               reiserfs_write_unlock(sb);
                write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                      journal, jl, &jl->j_tail_bh_list);
-               lock_kernel();
+               reiserfs_write_lock(sb);
        }
        BUG_ON(!list_empty(&jl->j_tail_bh_list));
        mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644 (file)
index 0000000..ee2cfc0
--- /dev/null
@@ -0,0 +1,88 @@
+#include <linux/reiserfs_fs.h>
+#include <linux/mutex.h>
+
+/*
+ * The previous reiserfs locking scheme was heavily based on two
+ * tricky properties of the Bkl:
+ *
+ * - it could be acquired recursively by the same task
+ * - performance relied on its release-while-schedule() property
+ *
+ * Now that it is replaced by a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We track the owner ourselves (lock_owner) because the owner field of
+ * a mutex is only available in SMP or mutex debugging; also only this
+ * mutex needs it, so there is no need for a system wide mutex facility.
+ *
+ * This lock is also often released before a call that could block,
+ * because reiserfs performance was partially based on the
+ * release-while-schedule() property of the Bkl.
+ *
+ * reiserfs_write_lock() takes the per-superblock write lock, or only
+ * increments the nesting depth when the current task already owns it.
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+       /* Nested acquisition by the owner: only the depth changes. */
+       if (sbi->lock_owner == current) {
+               sbi->lock_depth += 1;
+               return;
+       }
+
+       mutex_lock(&sbi->lock);
+       sbi->lock_owner = current;
+
+       /* No need to protect the depth, only the owning task touches it */
+       sbi->lock_depth += 1;
+}
+
+/*
+ * Undo one reiserfs_write_lock(). The underlying mutex is only released
+ * when the nesting depth returns to -1.
+ */
+void reiserfs_write_unlock(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+       /*
+        * Unlocking without even holding the lock must raise a BUG()
+        * if we don't want to corrupt the data.
+        */
+       BUG_ON(sbi->lock_owner != current);
+
+       sbi->lock_depth -= 1;
+       if (sbi->lock_depth != -1)
+               return;         /* still held by an outer caller */
+
+       sbi->lock_owner = NULL;
+       mutex_unlock(&sbi->lock);
+}
+
+/*
+ * Take the write lock unless the current task already owns it; in that
+ * case return immediately without increasing the depth.
+ * Useful when we don't want to lock more than once.
+ *
+ * The return value is always the lock_depth as it was before this call.
+ */
+int reiserfs_write_lock_once(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+       int depth_before;
+
+       if (sbi->lock_owner == current)
+               return sbi->lock_depth;
+
+       mutex_lock(&sbi->lock);
+       sbi->lock_owner = current;
+
+       depth_before = sbi->lock_depth;
+       sbi->lock_depth = depth_before + 1;
+       return depth_before;
+}
+
+/*
+ * Release the write lock only when lock_depth is -1; for any other
+ * value nothing is released here. Callers in this code pass the value
+ * returned by reiserfs_write_lock_once().
+ */
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+{
+       if (lock_depth != -1)
+               return;
+
+       reiserfs_write_unlock(s);
+}
+
+/*
+ * Utility function to force a panic if it is called without the superblock
+ * write lock held (lock_depth < 0).  caller is a string naming the call
+ * site; it is printed in the panic message together with the lock depth.
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+       struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+
+       /*
+        * reiserfs_panic() expects an id string before the format, and the
+        * format's %d needs the depth passed explicitly.
+        * NOTE(review): "lock-1" is a newly chosen id; adjust it if the
+        * project has a numbering scheme for this file.
+        */
+       if (sb_i->lock_depth < 0)
+               reiserfs_panic(sb, "lock-1",
+                              "%s called without kernel lock held %d",
+                              caller, sb_i->lock_depth);
+}
index 271579128634242b64c2b96f97c2f86dc0e3435f..e296ff72a6ccb2bc2e99cfbb97472c7da39f3253 100644 (file)
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nd)
 {
        int retval;
+       int lock_depth;
        struct inode *inode = NULL;
        struct reiserfs_dir_entry de;
        INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
        if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
                return ERR_PTR(-ENAMETOOLONG);
 
-       reiserfs_write_lock(dir->i_sb);
+       /*
+        * Might be called with or without the write lock, must be careful
+        * to not recursively hold it in case we want to release the lock
+        * before rescheduling.
+        */
+       lock_depth = reiserfs_write_lock_once(dir->i_sb);
+
        de.de_gen_number_bit_string = NULL;
        retval =
            reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                inode = reiserfs_iget(dir->i_sb,
                                      (struct cpu_key *)&(de.de_dir_id));
                if (!inode || IS_ERR(inode)) {
-                       reiserfs_write_unlock(dir->i_sb);
+                       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
                        return ERR_PTR(-EACCES);
                }
 
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                if (IS_PRIVATE(dir))
                        inode->i_flags |= S_PRIVATE;
        }
-       reiserfs_write_unlock(dir->i_sb);
+       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        if (retval == IO_ERROR) {
                return ERR_PTR(-EIO);
        }
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct reiserfs_transaction_handle th;
        struct reiserfs_security_handle security;
+       int lock_depth;
        /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return retval;
        }
        jbegin_count += retval;
-       reiserfs_write_lock(dir->i_sb);
+       lock_depth = reiserfs_write_lock_once(dir->i_sb);
 
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
-      out_failed:
-       reiserfs_write_unlock(dir->i_sb);
+out_failed:
+       reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        return retval;
 }
 
index 536eacaeb71005a935a95c88dd7afba67535a4ac..adbc6f538515e15f7d7982d97ab1d3fb30728551 100644 (file)
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 
    .  */
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
-
 void __reiserfs_panic(struct super_block *sb, const char *id,
                      const char *function, const char *fmt, ...)
 {
index 18b315d3d104ea0cebfbb5a1991356038dc8c4fd..b3a94d20f0fcd37b044596369687a7d1cebfa2f5 100644 (file)
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
+                       reiserfs_write_unlock(s);
                        sync_dirty_buffer(bh);
+                       reiserfs_write_lock(s);
                        // update bitmap_info stuff
                        bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
                        brelse(bh);
index d036ee5b1c81a8bd43d8f8836fbd78b68c462acb..5fa7118f04e117c079a14b7da3811450f6d72573 100644 (file)
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key,       /* Key to search for. */
        return ITEM_NOT_FOUND;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 
 /* Minimal possible key. It is never in the tree. */
 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
 
 #define SEARCH_BY_KEY_READA 16
 
-/* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada(struct super_block *s,
+/*
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't recover the lock to avoid
+ * high contention resulting from too much lock requests, especially
+ * the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
                                struct buffer_head **bh,
                                b_blocknr_t *b, int num)
 {
        int i, j;
+       bool unlocked = false;
 
        for (i = 0; i < num; i++) {
                bh[i] = sb_getblk(s, b[i]);
        }
+       /*
+        * We are going to read some blocks on which we
+        * have a reference. It's safe, though we might be
+        * reading blocks concurrently changed if we release
+        * the lock. But it's still fine because we check later
+        * if the tree changed
+        */
        for (j = 0; j < i; j++) {
                /*
                 * note, this needs attention if we are getting rid of the BKL
                 * you have to make sure the prepared bit isn't set on this buffer
                 */
-               if (!buffer_uptodate(bh[j]))
+               if (!buffer_uptodate(bh[j])) {
+                       if (!unlocked) {
+                               reiserfs_write_unlock(s);
+                               unlocked = true;
+                       }
                        ll_rw_block(READA, 1, bh + j);
+               }
                brelse(bh[j]);
        }
+       return unlocked;
 }
 
 /**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,      /* Key to s
                   have a pointer to it. */
                if ((bh = last_element->pe_buffer =
                     sb_getblk(sb, block_number))) {
+                       bool unlocked = false;
+
                        if (!buffer_uptodate(bh) && reada_count > 1)
-                               search_by_key_reada(sb, reada_bh,
+                               /* may unlock the write lock */
+                               unlocked = search_by_key_reada(sb, reada_bh,
                                                    reada_blocks, reada_count);
+                       /*
+                        * If we haven't already unlocked the write lock,
+                        * then we need to do that here before reading
+                        * the current block
+                        */
+                       if (!buffer_uptodate(bh) && !unlocked) {
+                               reiserfs_write_unlock(sb);
+                               unlocked = true;
+                       }
                        ll_rw_block(READ, 1, &bh);
                        wait_on_buffer(bh);
+
+                       if (unlocked)
+                               reiserfs_write_lock(sb);
                        if (!buffer_uptodate(bh))
                                goto io_error;
                } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,        /* Key to s
                       !key_in_buffer(search_path, key, sb),
                       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-               if (cur_tb) {
+               if (REISERFS_SB(sb)->cur_tb) {
                        print_cur_tb("5140");
                        reiserfs_panic(sb, "PAP-5140",
                                       "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
                        reiserfs_free_block(th, inode, block, 1);
                    }
 
+                   reiserfs_write_unlock(sb);
                    cond_resched();
+                   reiserfs_write_lock(sb);
 
                    if (item_moved (&s_ih, path))  {
                        need_re_search = 1;
index f0ad05f380223736cd9a8a121c59d6f1c7ecd7e3..339b0baf2af6ed0a721c3aa3b1f4941f633ec59a 100644 (file)
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
 
-       lock_kernel();
+       reiserfs_write_lock(s);
 
        if (s->s_dirt)
                reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
 
        reiserfs_proc_info_done(s);
 
+       reiserfs_write_unlock(s);
+       mutex_destroy(&REISERFS_SB(s)->lock);
        kfree(s->s_fs_info);
        s->s_fs_info = NULL;
-
-       unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
        struct reiserfs_transaction_handle th;
 
        int err = 0;
+       int lock_depth;
+
        if (inode->i_sb->s_flags & MS_RDONLY) {
                reiserfs_warning(inode->i_sb, "clm-6006",
                                 "writing inode %lu on readonly FS",
                                 inode->i_ino);
                return;
        }
-       reiserfs_write_lock(inode->i_sb);
+       lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
        /* this is really only used for atime updates, so they don't have
         ** to be included in O_SYNC or fsync
         */
        err = journal_begin(&th, inode->i_sb, 1);
-       if (err) {
-               reiserfs_write_unlock(inode->i_sb);
-               return;
-       }
+       if (err)
+               goto out;
+
        reiserfs_update_sd(&th, inode);
        journal_end(&th, inode->i_sb, 1);
-       reiserfs_write_unlock(inode->i_sb);
+
+out:
+       reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
 #ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
        int i;
+#endif
+
+       reiserfs_write_lock(s);
 
+#ifdef CONFIG_QUOTA
        memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
-       lock_kernel();
        rs = SB_DISK_SUPER_BLOCK(s);
 
        if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
        replace_mount_options(s, new_opts);
-       unlock_kernel();
+       reiserfs_write_unlock(s);
        return 0;
 
 out_err:
        kfree(new_opts);
-       unlock_kernel();
+       reiserfs_write_unlock(s);
        return err;
 }
 
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+       reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
+       reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
        if (!sbi) {
                errval = -ENOMEM;
-               goto error;
+               goto error_alloc;
        }
        s->s_fs_info = sbi;
        /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        /* setup default block allocator options */
        reiserfs_init_alloc_options(s);
 
+       mutex_init(&REISERFS_SB(s)->lock);
+       REISERFS_SB(s)->lock_depth = -1;
+
+       /*
+        * This function is called with the bkl, which also was the old
+        * locking used here.
+        * do_journal_begin() will soon check if we hold the lock (ie: was the
+        * bkl). This is likely because do_journal_begin() has several another
+        * callers because at this time, it doesn't seem to be necessary to
+        * protect against anything.
+        * Anyway, let's be conservative and lock for now.
+        */
+       reiserfs_write_lock(s);
+
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        init_waitqueue_head(&(sbi->s_wait));
        spin_lock_init(&sbi->bitmap_lock);
 
+       reiserfs_write_unlock(s);
+
        return (0);
 
 error:
+       reiserfs_write_unlock(s);
+error_alloc:
        if (jinit_done) {       /* kill the commit thread, free journal ram */
                journal_release_error(NULL, s);
        }
index 6925b835a43b6f2f94e8a4217180b2e8d1735ab7..58aa8e75f7f5a8de1608dd7df0d6e4cf707d7a54 100644 (file)
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
        int err = 0;
 
        /* If we don't have the privroot located yet - go find it */
-       mutex_lock(&s->s_root->d_inode->i_mutex);
+       reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
        dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                goto error;
 
        if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-               mutex_lock(&s->s_root->d_inode->i_mutex);
+               reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
                err = create_privroot(REISERFS_SB(s)->priv_root);
                mutex_unlock(&s->s_root->d_inode->i_mutex);
        }
 
        if (privroot->d_inode) {
                s->s_xattr = reiserfs_xattr_handlers;
-               mutex_lock(&privroot->d_inode->i_mutex);
+               reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
                if (!REISERFS_SB(s)->xattr_root) {
                        struct dentry *dentry;
                        dentry = lookup_one_len(XAROOT_NAME, privroot,
index dd31e7bae35cd606943ca4f52bc585c42a9a65f7..a05b4a20768d9026c939f1244ec2897dbdf4ab74 100644 (file)
 #define REISERFS_IOC32_GETVERSION      FS_IOC32_GETVERSION
 #define REISERFS_IOC32_SETVERSION      FS_IOC32_SETVERSION
 
-/* Locking primitives */
-/* Right now we are still falling back to (un)lock_kernel, but eventually that
-   would evolve into real per-fs locks */
-#define reiserfs_write_lock( sb ) lock_kernel()
-#define reiserfs_write_unlock( sb ) unlock_kernel()
+/*
+ * Locking primitives. The write lock is a per superblock
+ * special mutex that has properties close to the Big Kernel Lock
+ * which was used in the previous locking scheme.
+ */
+void reiserfs_write_lock(struct super_block *s);
+void reiserfs_write_unlock(struct super_block *s);
+int reiserfs_write_lock_once(struct super_block *s);
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth);
+
+/*
+ * Several mutexes depend on the write lock.
+ * However sometimes we want to relax the write lock while we hold
+ * these mutexes, according to the release/reacquire on schedule()
+ * properties of the Bkl that were used.
+ * Reiserfs performance and locking were based on this scheme.
+ * Now that the write lock is a mutex and not the bkl anymore, doing so
+ * may result in a deadlock:
+ *
+ * A acquires write_lock
+ * A acquires j_commit_mutex
+ * A releases write_lock and waits for something
+ * B acquires write_lock
+ * B can't acquire j_commit_mutex and sleeps
+ * A can't acquire write lock anymore
+ * deadlock
+ *
+ * What we do here is avoid such a deadlock by playing the same game
+ * as the Bkl: before blocking on a mutex that depends on the write lock,
+ * we release the write lock, take the mutex and then retake the write lock.
+ *
+ * The mutexes concerned by this hack are:
+ * - The commit mutex of a journal list
+ * - The flush mutex
+ * - The journal lock
+ * - The inode mutex
+ *
+ * m is the mutex to acquire, s the superblock whose write lock is relaxed.
+ * NOTE(review): reiserfs_write_unlock() only releases the underlying mutex
+ * when the nesting depth drops back to -1, so this presumably relies on
+ * the caller holding the write lock exactly once -- confirm against callers.
+ */
+static inline void reiserfs_mutex_lock_safe(struct mutex *m,
+                              struct super_block *s)
+{
+       reiserfs_write_unlock(s);       /* drop one level of the write lock before we may sleep on m */
+       mutex_lock(m);
+       reiserfs_write_lock(s);         /* take it back now that m is held */
+}
+
+/*
+ * Reschedule point that follows the previous bkl based locking scheme of
+ * reiserfs: when a reschedule is pending, the write lock is released
+ * around schedule() and taken back afterwards. Does nothing otherwise.
+ */
+static inline void reiserfs_cond_resched(struct super_block *s)
+{
+       if (!need_resched())
+               return;
+
+       reiserfs_write_unlock(s);
+       schedule();
+       reiserfs_write_lock(s);
+}
 
 struct fid;
 
@@ -1329,7 +1381,11 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
 #define get_generation(s) atomic_read (&fs_generation(s))
 #define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
 #define __fs_changed(gen,s) (gen != get_generation (s))
-#define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);})
+#define fs_changed(gen,s)              \
+({                                     \
+       reiserfs_cond_resched(s);       /* may drop and retake the write lock */ \
+       __fs_changed(gen, s);           /* value: generation differs from gen */ \
+})
 
 /***************************************************************************/
 /*                  FIXATE NODES                                           */
@@ -2258,8 +2314,7 @@ __u32 r5_hash(const signed char *msg, int len);
 #define SPARE_SPACE 500
 
 /* prototypes from ioctl.c */
-int reiserfs_ioctl(struct inode *inode, struct file *filp,
-                  unsigned int cmd, unsigned long arg);
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long reiserfs_compat_ioctl(struct file *filp,
                   unsigned int cmd, unsigned long arg);
 int reiserfs_unpack(struct inode *inode, struct file *filp);
index dab68bbed6757d950afa3987135b5e34cea405ae..52c83b6a758a3133121a9394a955719acee10478 100644 (file)
@@ -7,6 +7,8 @@
 #ifdef __KERNEL__
 #include <linux/workqueue.h>
 #include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
 #endif
 
 typedef enum {
@@ -355,6 +357,13 @@ struct reiserfs_sb_info {
        struct reiserfs_journal *s_journal;     /* pointer to journal information */
        unsigned short s_mount_state;   /* reiserfs state (valid, invalid) */
 
+       /* Serializes writers' access; replaces the old BKL */
+       struct mutex lock;
+       /* Owner of the lock (can be recursive) */
+       struct task_struct *lock_owner;
+       /* Depth of the lock, start from -1 like the bkl */
+       int lock_depth;
+
        /* Comment? -Hans */
        void (*end_io_handler) (struct buffer_head *, int);
        hashf_t s_hash_function;        /* pointer to function which is used
@@ -408,6 +417,17 @@ struct reiserfs_sb_info {
        char *s_qf_names[MAXQUOTAS];
        int s_jquota_fmt;
 #endif
+#ifdef CONFIG_REISERFS_CHECK
+
+       struct tree_balance *cur_tb;    /*
+                                        * Detects whether more than one
+                                        * copy of tb exists per superblock
+                                        * as a means of checking whether
+                                        * do_balance is executing concurrently
+                                        * against another tree reader/writer
+                                        * on the same mount point.
+                                        */
+#endif
 };
 
 /* Definitions of reiserfs on-disk properties: */