[GFS2] The core of GFS2
David Teigland [Mon, 16 Jan 2006 16:50:04 +0000 (16:50 +0000)]
This patch contains all the core files for GFS2.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

81 files changed:
fs/gfs2/Kconfig [new file with mode: 0644]
fs/gfs2/Makefile [new file with mode: 0644]
fs/gfs2/acl.c [new file with mode: 0644]
fs/gfs2/acl.h [new file with mode: 0644]
fs/gfs2/bits.c [new file with mode: 0644]
fs/gfs2/bits.h [new file with mode: 0644]
fs/gfs2/bmap.c [new file with mode: 0644]
fs/gfs2/bmap.h [new file with mode: 0644]
fs/gfs2/daemon.c [new file with mode: 0644]
fs/gfs2/daemon.h [new file with mode: 0644]
fs/gfs2/dir.c [new file with mode: 0644]
fs/gfs2/dir.h [new file with mode: 0644]
fs/gfs2/eaops.c [new file with mode: 0644]
fs/gfs2/eaops.h [new file with mode: 0644]
fs/gfs2/eattr.c [new file with mode: 0644]
fs/gfs2/eattr.h [new file with mode: 0644]
fs/gfs2/format.h [new file with mode: 0644]
fs/gfs2/gfs2.h [new file with mode: 0644]
fs/gfs2/glock.c [new file with mode: 0644]
fs/gfs2/glock.h [new file with mode: 0644]
fs/gfs2/glops.c [new file with mode: 0644]
fs/gfs2/glops.h [new file with mode: 0644]
fs/gfs2/incore.h [new file with mode: 0644]
fs/gfs2/inode.c [new file with mode: 0644]
fs/gfs2/inode.h [new file with mode: 0644]
fs/gfs2/jdata.c [new file with mode: 0644]
fs/gfs2/jdata.h [new file with mode: 0644]
fs/gfs2/lm.c [new file with mode: 0644]
fs/gfs2/lm.h [new file with mode: 0644]
fs/gfs2/lm_interface.h [new file with mode: 0644]
fs/gfs2/locking.c [new file with mode: 0644]
fs/gfs2/log.c [new file with mode: 0644]
fs/gfs2/log.h [new file with mode: 0644]
fs/gfs2/lops.c [new file with mode: 0644]
fs/gfs2/lops.h [new file with mode: 0644]
fs/gfs2/lvb.c [new file with mode: 0644]
fs/gfs2/lvb.h [new file with mode: 0644]
fs/gfs2/main.c [new file with mode: 0644]
fs/gfs2/meta_io.c [new file with mode: 0644]
fs/gfs2/meta_io.h [new file with mode: 0644]
fs/gfs2/mount.c [new file with mode: 0644]
fs/gfs2/mount.h [new file with mode: 0644]
fs/gfs2/ondisk.c [new file with mode: 0644]
fs/gfs2/ops_address.c [new file with mode: 0644]
fs/gfs2/ops_address.h [new file with mode: 0644]
fs/gfs2/ops_dentry.c [new file with mode: 0644]
fs/gfs2/ops_dentry.h [new file with mode: 0644]
fs/gfs2/ops_export.c [new file with mode: 0644]
fs/gfs2/ops_export.h [new file with mode: 0644]
fs/gfs2/ops_file.c [new file with mode: 0644]
fs/gfs2/ops_file.h [new file with mode: 0644]
fs/gfs2/ops_fstype.c [new file with mode: 0644]
fs/gfs2/ops_fstype.h [new file with mode: 0644]
fs/gfs2/ops_inode.c [new file with mode: 0644]
fs/gfs2/ops_inode.h [new file with mode: 0644]
fs/gfs2/ops_super.c [new file with mode: 0644]
fs/gfs2/ops_super.h [new file with mode: 0644]
fs/gfs2/ops_vm.c [new file with mode: 0644]
fs/gfs2/ops_vm.h [new file with mode: 0644]
fs/gfs2/page.c [new file with mode: 0644]
fs/gfs2/page.h [new file with mode: 0644]
fs/gfs2/quota.c [new file with mode: 0644]
fs/gfs2/quota.h [new file with mode: 0644]
fs/gfs2/recovery.c [new file with mode: 0644]
fs/gfs2/recovery.h [new file with mode: 0644]
fs/gfs2/resize.c [new file with mode: 0644]
fs/gfs2/resize.h [new file with mode: 0644]
fs/gfs2/rgrp.c [new file with mode: 0644]
fs/gfs2/rgrp.h [new file with mode: 0644]
fs/gfs2/super.c [new file with mode: 0644]
fs/gfs2/super.h [new file with mode: 0644]
fs/gfs2/sys.c [new file with mode: 0644]
fs/gfs2/sys.h [new file with mode: 0644]
fs/gfs2/trans.c [new file with mode: 0644]
fs/gfs2/trans.h [new file with mode: 0644]
fs/gfs2/unlinked.c [new file with mode: 0644]
fs/gfs2/unlinked.h [new file with mode: 0644]
fs/gfs2/util.c [new file with mode: 0644]
fs/gfs2/util.h [new file with mode: 0644]
include/linux/gfs2_ioctl.h [new file with mode: 0644]
include/linux/gfs2_ondisk.h [new file with mode: 0644]

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644 (file)
index 0000000..17cb44b
--- /dev/null
@@ -0,0 +1,46 @@
+config GFS2_FS
+        tristate "GFS2 file system support"
+       default m
+       depends on EXPERIMENTAL
+        select FS_POSIX_ACL
+        select SYSFS
+        help
+        A cluster filesystem.
+
+        Allows a cluster of computers to simultaneously use a block device
+        that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
+        and writes to the block device like a local filesystem, but also uses
+        a lock module to allow the computers to coordinate their I/O so
+        filesystem consistency is maintained.  One of the nifty features of
+        GFS is perfect consistency -- changes made to the filesystem on one
+        machine show up immediately on all other machines in the cluster.
+
+       To use the GFS2 filesystem, you will need to enable one or more of
+       the below locking modules. Documentation and utilities for GFS2 can
+       be found here: http://sources.redhat.com/cluster/gfs/
+
+config GFS2_FS_LOCKING_NOLOCK
+       tristate "GFS2 \"nolock\" locking module"
+       depends on GFS2_FS
+       help
+       Single node locking module for GFS2.
+
+       Use this module if you want to use GFS2 on a single node without
+       its clustering features. You can still take advantage of the
+       large file support, and upgrade to running a full cluster later on
+       if required.
+
+       If you will only be using GFS2 in cluster mode, you do not need this
+       module.
+
+config GFS2_FS_LOCKING_DLM
+       tristate "GFS2 DLM locking module"
+       depends on GFS2_FS
+       select DLM
+       help
+       Multiple node locking module for GFS2
+
+       Most users of GFS2 will require this module. It provides the locking
+       interface between GFS2 and the DLM, which is required to use GFS2
+       in a cluster environment.
+
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644 (file)
index 0000000..b1bac4f
--- /dev/null
@@ -0,0 +1,44 @@
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := \
+       acl.o \
+       bits.o \
+       bmap.o \
+       daemon.o \
+       dir.o \
+       eaops.o \
+       eattr.o \
+       glock.o \
+       glops.o \
+       inode.o \
+       jdata.o \
+       lm.o \
+       log.o \
+       lops.o \
+       locking.o \
+       lvb.o \
+       main.o \
+       meta_io.o \
+       mount.o \
+       ondisk.o \
+       ops_address.o \
+       ops_dentry.o \
+       ops_export.o \
+       ops_file.o \
+       ops_fstype.o \
+       ops_inode.o \
+       ops_super.o \
+       ops_vm.o \
+       page.o \
+       quota.o \
+       resize.o \
+       recovery.o \
+       rgrp.o \
+       super.o \
+       sys.o \
+       trans.o \
+       unlinked.o \
+       util.o
+
+# Lock module subdirectories.  The symbols must match the config entries
+# declared in fs/gfs2/Kconfig (GFS2_FS_LOCKING_NOLOCK, GFS2_FS_LOCKING_DLM).
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644 (file)
index 0000000..33c465a
--- /dev/null
@@ -0,0 +1,312 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+
+/* Values for the "access" argument used in this file: non-zero selects the
+ * access ACL (GFS2_POSIX_ACL_ACCESS), zero selects the default ACL
+ * (GFS2_POSIX_ACL_DEFAULT). */
+#define ACL_ACCESS 1
+#define ACL_DEFAULT 0
+
+/**
+ * gfs2_acl_validate_set - Check a request to set a POSIX ACL
+ * @ip: the inode the ACL would be set on
+ * @access: non-zero for the access ACL, zero for the default ACL
+ * @er: the request; er_data/er_data_len hold the ACL in xattr format
+ * @remove: set to 1 when the data parses to no ACL at all, or when
+ *          posix_acl_equiv_mode() returns 0 for an access ACL
+ * @mode: handed to posix_acl_equiv_mode() for access ACLs
+ *
+ * Returns: errno
+ */
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+                     struct gfs2_ea_request *er,
+                     int *remove, mode_t *mode)
+{
+       struct posix_acl *parsed;
+       int rc;
+
+       /* Setting is subject to the same permission checks as removing */
+       rc = gfs2_acl_validate_remove(ip, access);
+       if (rc)
+               return rc;
+
+       if (!er->er_data)
+               return -EINVAL;
+
+       parsed = posix_acl_from_xattr(er->er_data, er->er_data_len);
+       if (IS_ERR(parsed))
+               return PTR_ERR(parsed);
+       if (!parsed) {
+               *remove = 1;
+               return 0;
+       }
+
+       rc = posix_acl_valid(parsed);
+       if (!rc && access) {
+               rc = posix_acl_equiv_mode(parsed, mode);
+               if (rc > 0)
+                       rc = 0;
+               else if (!rc)
+                       *remove = 1;
+       }
+
+       posix_acl_release(parsed);
+
+       return rc;
+}
+
+/**
+ * gfs2_acl_validate_remove - Check whether the caller may change an ACL
+ * @ip: the inode
+ * @access: non-zero for the access ACL, zero for the default ACL
+ *
+ * Returns: 0 if allowed, otherwise -EOPNOTSUPP, -EPERM or -EACCES
+ */
+
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+{
+       int is_owner = (current->fsuid == ip->i_di.di_uid);
+
+       if (!ip->i_sbd->sd_args.ar_posix_acl)
+               return -EOPNOTSUPP;
+
+       if (!is_owner && !capable(CAP_FOWNER))
+               return -EPERM;
+
+       if (S_ISLNK(ip->i_di.di_mode))
+               return -EOPNOTSUPP;
+
+       /* Default ACLs are only accepted on directories */
+       if (!access && !S_ISDIR(ip->i_di.di_mode))
+               return -EACCES;
+
+       return 0;
+}
+
+/**
+ * acl_get - Look up one of an inode's POSIX ACL extended attributes
+ * @ip: the inode
+ * @access: non-zero to fetch the access ACL, zero for the default ACL
+ * @acl: if not NULL, receives the ACL parsed by posix_acl_from_xattr()
+ * @el: if not NULL, receives the location of the EA; a local scratch
+ *      location is used otherwise
+ * @data: if not NULL, receives the kmalloc()ed raw EA data on success
+ * @len: receives the length of @data (written only together with @data)
+ *
+ * @acl, @data and @len are left untouched, and 0 is returned, when the
+ * inode has no extended attributes, the EA is not found, or the EA has a
+ * data length of zero.
+ *
+ * Returns: errno
+ */
+
+static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
+                  struct gfs2_ea_location *el, char **data, unsigned int *len)
+{
+       struct gfs2_ea_request er;
+       struct gfs2_ea_location el_this;
+       int error;
+
+       if (!ip->i_di.di_eattr)
+               return 0;
+
+       memset(&er, 0, sizeof(struct gfs2_ea_request));
+       if (access) {
+               er.er_name = GFS2_POSIX_ACL_ACCESS;
+               er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+       } else {
+               er.er_name = GFS2_POSIX_ACL_DEFAULT;
+               er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+       }
+       er.er_type = GFS2_EATYPE_SYS;
+
+       /* The caller does not want the location: use a scratch one */
+       if (!el)
+               el = &el_this;
+
+       error = gfs2_ea_find(ip, &er, el);
+       if (error)
+               return error;
+       if (!el->el_ea)
+               return 0;
+       if (!GFS2_EA_DATA_LEN(el->el_ea))
+               goto out;
+
+       er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
+       er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+       error = -ENOMEM;
+       if (!er.er_data)
+               goto out;
+
+       error = gfs2_ea_get_copy(ip, el, er.er_data);
+       if (error)
+               goto out_kfree;
+
+       if (acl) {
+               *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+               if (IS_ERR(*acl))
+                       error = PTR_ERR(*acl);
+       }
+
+ out_kfree:
+       /* The raw buffer goes to the caller only on success and on request */
+       if (error || !data)
+               kfree(er.er_data);
+       else {
+               *data = er.er_data;
+               *len = er.er_data_len;
+       }
+
+ out:
+       /* On success a caller-supplied @el keeps el_bh, so the caller must
+          brelse() it; in every other case it is dropped here */
+       if (error || el == &el_this)
+               brelse(el->el_bh);
+
+       return error;
+}
+
+/**
+ * gfs2_check_acl_locked - Evaluate a permission request against the access ACL
+ * @inode: the file being accessed
+ * @mask: the access being requested
+ *
+ * Does not take the inode glock itself; gfs2_check_acl() is the variant
+ * that acquires it around this call.
+ *
+ * Returns: the result of posix_acl_permission() when the inode has an
+ *          access ACL, -EAGAIN when it has none, or the errno from
+ *          reading the ACL
+ */
+
+int gfs2_check_acl_locked(struct inode *inode, int mask)
+{
+       struct posix_acl *acl = NULL;
+       int rc;
+
+       rc = acl_get(get_v2ip(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+       if (rc)
+               return rc;
+       if (!acl)
+               return -EAGAIN;
+
+       rc = posix_acl_permission(inode, acl, mask);
+       posix_acl_release(acl);
+
+       return rc;
+}
+
+/**
+ * gfs2_check_acl - Check the access ACL while holding the inode glock
+ * @inode: the file being accessed
+ * @mask: the access being requested
+ *
+ * Acquires the inode's glock in the shared state, runs
+ * gfs2_check_acl_locked() and drops the glock again.
+ *
+ * Returns: errno (from the glock request or from the ACL check)
+ */
+
+int gfs2_check_acl(struct inode *inode, int mask)
+{
+       struct gfs2_inode *ip = get_v2ip(inode);
+       struct gfs2_holder holder;
+       int rc;
+
+       rc = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &holder);
+       if (rc)
+               return rc;
+
+       rc = gfs2_check_acl_locked(inode, mask);
+       gfs2_glock_dq_uninit(&holder);
+
+       return rc;
+}
+
+/**
+ * munge_mode - Store a new mode in an inode's dinode
+ * @ip: the inode
+ * @mode: the new mode; its file type bits must equal the current ones
+ *
+ * The update is done in a transaction of its own.
+ *
+ * Returns: errno
+ */
+
+static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *dibh;
+       int error;
+
+       error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+       if (error)
+               return error;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               gfs2_assert_withdraw(sdp,
+                               (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
+               ip->i_di.di_mode = mode;
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       /* The transaction is ended whether or not the dinode was read */
+       gfs2_trans_end(sdp);
+
+       /* Report a failed dinode read instead of claiming success */
+       return error;
+}
+
+/**
+ * gfs2_acl_create - Set up the ACLs and mode of a newly created inode
+ * @dip: the directory the inode is created in
+ * @ip: the new inode
+ *
+ * When @dip has a default ACL, its raw data is set as the default ACL of
+ * @ip if @ip is a directory.  The ACL is then run through
+ * posix_acl_create_masq(): a positive result stores it as the access ACL
+ * of @ip together with the new mode, otherwise only the mode is updated.
+ * Without a default ACL the mode is just filtered through the umask.
+ *
+ * Returns: errno
+ */
+
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct posix_acl *acl = NULL, *clone;
+       struct gfs2_ea_request er;
+       mode_t mode = ip->i_di.di_mode;
+       int error;
+
+       if (!sdp->sd_args.ar_posix_acl)
+               return 0;
+       if (S_ISLNK(ip->i_di.di_mode))
+               return 0;
+
+       memset(&er, 0, sizeof(struct gfs2_ea_request));
+       er.er_type = GFS2_EATYPE_SYS;
+
+       error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
+                       &er.er_data, &er.er_data_len);
+       if (error)
+               return error;
+       if (!acl) {
+               mode &= ~current->fs->umask;
+               if (mode != ip->i_di.di_mode)
+                       error = munge_mode(ip, mode);
+               /* acl_get() may have handed back raw data even though it
+                  parsed to no ACL; er_data is NULL otherwise */
+               kfree(er.er_data);
+               return error;
+       }
+
+       /* Work on a private copy; the masq call below modifies the ACL */
+       clone = posix_acl_clone(acl, GFP_KERNEL);
+       error = -ENOMEM;
+       if (!clone)
+               goto out;
+       posix_acl_release(acl);
+       acl = clone;
+
+       if (S_ISDIR(ip->i_di.di_mode)) {
+               er.er_name = GFS2_POSIX_ACL_DEFAULT;
+               er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+               error = gfs2_system_eaops.eo_set(ip, &er);
+               if (error)
+                       goto out;
+       }
+
+       error = posix_acl_create_masq(acl, &mode);
+       if (error < 0)
+               goto out;
+       if (error > 0) {
+               er.er_name = GFS2_POSIX_ACL_ACCESS;
+               er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+               posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
+               er.er_mode = mode;
+               er.er_flags = GFS2_ERF_MODE;
+               error = gfs2_system_eaops.eo_set(ip, &er);
+               if (error)
+                       goto out;
+       } else
+               /* Propagate failures, as the no-ACL path above does */
+               error = munge_mode(ip, mode);
+
+ out:
+       posix_acl_release(acl);
+       kfree(er.er_data);
+       return error;
+}
+
+/**
+ * gfs2_acl_chmod - Change an inode's mode, keeping its access ACL in step
+ * @ip: the inode
+ * @attr: the new attributes; ia_mode is applied to the ACL
+ *
+ * Without an access ACL this is just gfs2_setattr_simple().  Otherwise a
+ * copy of the ACL is adjusted with posix_acl_chmod_masq(), converted back
+ * to xattr format in the buffer obtained from acl_get(), and passed to
+ * gfs2_ea_acl_chmod().
+ *
+ * Returns: errno
+ */
+
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
+{
+       struct posix_acl *acl = NULL, *clone;
+       struct gfs2_ea_location el;
+       char *data;
+       unsigned int len;
+       int error;
+
+       error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+       if (error)
+               return error;
+       /* NOTE(review): if acl_get() located an EA but produced no ACL,
+          el.el_bh (and possibly data) may still be held when returning
+          here -- confirm against gfs2_ea_find() */
+       if (!acl)
+               return gfs2_setattr_simple(ip, attr);
+
+       /* Modify a private copy rather than the ACL acl_get() returned */
+       clone = posix_acl_clone(acl, GFP_KERNEL);
+       error = -ENOMEM;
+       if (!clone)
+               goto out;
+       posix_acl_release(acl);
+       acl = clone;
+
+       error = posix_acl_chmod_masq(acl, attr->ia_mode);
+       if (!error) {
+               posix_acl_to_xattr(acl, data, len);
+               error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+       }
+
+ out:
+       /* el.el_bh and data were handed over by acl_get() on success */
+       posix_acl_release(acl);
+       brelse(el.el_bh);
+       kfree(data);
+
+       return error;
+}
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644 (file)
index 0000000..a174b4f
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+
+/* Extended attribute names of the two POSIX ACLs; each _LEN is the length
+   of the corresponding name without the terminating NUL */
+#define GFS2_POSIX_ACL_ACCESS          "posix_acl_access"
+#define GFS2_POSIX_ACL_ACCESS_LEN      16
+#define GFS2_POSIX_ACL_DEFAULT         "posix_acl_default"
+#define GFS2_POSIX_ACL_DEFAULT_LEN     17
+
+/* True if the (name, len) pair is exactly the access ACL name */
+#define GFS2_ACL_IS_ACCESS(name, len) \
+         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+
+/* True if the (name, len) pair is exactly the default ACL name */
+#define GFS2_ACL_IS_DEFAULT(name, len) \
+         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
+         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
+
+struct gfs2_ea_request;
+
+/* In the prototypes below, "access" is non-zero for the access ACL and
+   zero for the default ACL */
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+                         struct gfs2_ea_request *er,
+                         int *remove, mode_t *mode);
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
+int gfs2_check_acl_locked(struct inode *inode, int mask);
+int gfs2_check_acl(struct inode *inode, int mask);
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+
+#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bits.c b/fs/gfs2/bits.c
new file mode 100644 (file)
index 0000000..57d420a
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+/*
+ * These routines are used by the resource group routines (rgrp.c)
+ * to keep track of block allocation.  Each block is represented by two
+ * bits.  One bit indicates whether or not the block is used.  (1=used,
+ * 0=free)  The other bit indicates whether or not the block contains a
+ * dinode or not.  (1=dinode, 0=not-dinode) So, each byte represents
+ * GFS2_NBBY (i.e. 4) blocks.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "bits.h"
+
+/*
+ * Permitted block state changes, indexed by (new_state * 4 + cur_state):
+ * each row is a new state, each column a current state, and a 1 marks a
+ * change that gfs2_setbit() will carry out.  Any other change makes
+ * gfs2_setbit() call gfs2_consist_rgrpd() instead.
+ */
+static const char valid_change[16] = {
+               /* current */
+       /* n */ 0, 1, 0, 1,
+       /* e */ 1, 0, 0, 0,
+       /* w */ 0, 0, 0, 0,
+               1, 0, 0, 0
+};
+
+/**
+ * gfs2_setbit - Change the state of one block in a bitmap
+ * @rgd: the resource group the bitmap belongs to (used for error reporting)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to change, relative to the start of @buffer
+ * @new_state: the new state of the block
+ *
+ * The change is only made if valid_change[] allows it for the block's
+ * current state; otherwise gfs2_consist_rgrpd() is called and the bitmap
+ * is left alone.
+ */
+
+void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                unsigned int buflen, uint32_t block, unsigned char new_state)
+{
+       unsigned char *bp = buffer + (block / GFS2_NBBY);
+       unsigned char *limit = buffer + buflen;
+       unsigned int shift = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+       unsigned char old_state;
+
+       gfs2_assert(rgd->rd_sbd, bp < limit);
+
+       old_state = (*bp >> shift) & GFS2_BIT_MASK;
+
+       if (!valid_change[new_state * 4 + old_state]) {
+               gfs2_consist_rgrpd(rgd);
+               return;
+       }
+
+       /* XOR with the old state clears the field, then OR in the new one */
+       *bp ^= old_state << shift;
+       *bp |= new_state << shift;
+}
+
+/**
+ * gfs2_testbit - Read the state of one block from a bitmap
+ * @rgd: the resource group the bitmap belongs to (used by the assertion)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to read, relative to the start of @buffer
+ *
+ * Returns: the block's state
+ */
+
+unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                          unsigned int buflen, uint32_t block)
+{
+       unsigned char *bp = buffer + (block / GFS2_NBBY);
+       unsigned char *limit = buffer + buflen;
+       unsigned int shift = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+
+       gfs2_assert(rgd->rd_sbd, bp < limit);
+
+       return (*bp >> shift) & GFS2_BIT_MASK;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ *       a block in a given allocation state.
+ * @rgd: the resource group (not used by this function)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
+ *       bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures.
+ *
+ * Return: the block number (bitmap buffer scope) that was found, or
+ *         BFITNOENT if the end of the buffer is reached without a match
+ */
+
+uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                    unsigned int buflen, uint32_t goal,
+                    unsigned char old_state)
+{
+       unsigned char *byte, *end, alloc;
+       uint32_t blk = goal;
+       unsigned int bit;
+
+       byte = buffer + (goal / GFS2_NBBY);
+       bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+       end = buffer + buflen;
+       /* Value of (*byte & 0x55) when every block in the byte has the
+          opposite allocation bit to the one being searched for */
+       alloc = (old_state & 1) ? 0 : 0x55;
+
+       while (byte < end) {
+               /* Fast path: no block in this byte can match, so skip the
+                  bit-pairs that remain in it */
+               if ((*byte & 0x55) == alloc) {
+                       blk += (8 - bit) >> 1;
+
+                       bit = 0;
+                       byte++;
+
+                       continue;
+               }
+
+               if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
+                       return blk;
+
+               /* Step to the next bit-pair, moving on a byte when needed */
+               bit += GFS2_BIT_SIZE;
+               if (bit >= 8) {
+                       bit = 0;
+                       byte++;
+               }
+
+               blk++;
+       }
+
+       return BFITNOENT;
+}
+
+/**
+ * gfs2_bitcount - count the blocks in a bitmap that are in a given state
+ * @rgd: the resource group (not used by this function)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @state: the state of the block we're looking for
+ *
+ * Returns: The number of bit-pairs in @buffer that equal @state
+ */
+
+uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                      unsigned int buflen, unsigned char state)
+{
+       /* One mask per bit-pair of a byte, lowest pair first */
+       const unsigned char field[4] = { 0x03, 0x0C, 0x30, 0xC0 };
+       unsigned char want[4];
+       unsigned char *bp;
+       unsigned char *limit = buffer + buflen;
+       unsigned int i;
+       uint32_t total = 0;
+
+       /* @state shifted into the position of each bit-pair */
+       for (i = 0; i < 4; i++)
+               want[i] = state << (2 * i);
+
+       for (bp = buffer; bp < limit; bp++)
+               for (i = 0; i < 4; i++)
+                       if ((*bp & field[i]) == want[i])
+                               total++;
+
+       return total;
+}
+
diff --git a/fs/gfs2/bits.h b/fs/gfs2/bits.h
new file mode 100644 (file)
index 0000000..36ccbdc
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __BITS_DOT_H__
+#define __BITS_DOT_H__
+
+/* Returned by gfs2_bitfit() when no block in the requested state is found */
+#define BFITNOENT 0xFFFFFFFF
+
+/* Block numbers passed to and returned by these functions are relative to
+   the start of the given buffer */
+void gfs2_setbit(struct gfs2_rgrpd *rgd,
+               unsigned char *buffer, unsigned int buflen,
+               uint32_t block, unsigned char new_state);
+unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
+                         unsigned char *buffer, unsigned int buflen,
+                         uint32_t block);
+uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd,
+                   unsigned char *buffer, unsigned int buflen,
+                   uint32_t goal, unsigned char old_state);
+uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd,
+                     unsigned char *buffer, unsigned int buflen,
+                     unsigned char state);
+
+#endif /* __BITS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644 (file)
index 0000000..4b4e295
--- /dev/null
@@ -0,0 +1,1206 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "jdata.h"
+#include "meta_io.h"
+#include "page.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+
+/* Path through an inode's metadata tree: one pointer index per tree level.
+ * The entries don't need to be large, as the maximum number of 64 bit
+ * pointers in a 4k block is 512, so __u16 is fine for that. Keeping them
+ * small saves stack space.
+ */
+struct metapath {
+       __u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+
+/* Callback type used within this file.
+ * NOTE(review): its users lie outside this excerpt; presumably it is invoked
+ * for metadata blocks while walking an inode's tree -- confirm there. */
+typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
+                            struct buffer_head *bh, uint64_t *top,
+                            uint64_t *bottom, unsigned int height,
+                            void *data);
+
+/* NOTE(review): presumably state handed to a block_call_t callback through
+ * its "data" argument; the users are not visible here -- confirm. */
+struct strip_mine {
+       int sm_first;
+       unsigned int sm_height;
+};
+
+/**
+ * gfs2_unstuffer_sync - Synchronously unstuff a dinode
+ * @ip: the inode being unstuffed
+ * @dibh: the buffer holding the dinode and its stuffed data
+ * @block: the block that receives the data
+ * @private: not used
+ *
+ * Cheat and use a metadata buffer instead of a data page: the part of
+ * @dibh that follows the dinode structure is copied into a new buffer
+ * for @block, which is marked dirty and written out synchronously.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
+                       uint64_t block, void *private)
+{
+       struct buffer_head *new_bh = gfs2_meta_new(ip->i_gl, block);
+       int rc;
+
+       gfs2_buffer_copy_tail(new_bh, 0, dibh, sizeof(struct gfs2_dinode));
+       set_buffer_dirty(new_bh);
+
+       rc = sync_dirty_buffer(new_bh);
+       brelse(new_bh);
+
+       return rc;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @unstuffer: the routine that handles unstuffing a non-zero length file
+ *             (only called for inodes that are not journaled)
+ * @private: private data for the unstuffer
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.  It runs with
+ * ip->i_rw_mutex held for writing.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
+                       void *private)
+{
+       struct buffer_head *bh, *dibh;
+       uint64_t block = 0;
+       int journaled = gfs2_is_jdata(ip);
+       int error;
+
+       down_write(&ip->i_rw_mutex);
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (error)
+               goto out;
+               
+       if (ip->i_di.di_size) {
+               /* Get a free block, fill it with the stuffed data,
+                  and write it out to disk */
+
+               if (journaled) {
+                       /* Journaled data lives in a metadata block, after
+                          a struct gfs2_meta_header */
+                       block = gfs2_alloc_meta(ip);
+
+                       error = gfs2_jdata_get_buffer(ip, block, 1, &bh);
+                       if (error)
+                               goto out_brelse;
+                       gfs2_buffer_copy_tail(bh,
+                                             sizeof(struct gfs2_meta_header),
+                                             dibh, sizeof(struct gfs2_dinode));
+                       brelse(bh);
+               } else {
+                       block = gfs2_alloc_data(ip);
+
+                       error = unstuffer(ip, dibh, block, private);
+                       if (error)
+                               goto out_brelse;
+               }
+               /* NOTE(review): on the error paths above the block just
+                  allocated is not released here -- confirm the caller's
+                  transaction handling covers it */
+       }
+
+       /*  Set up the pointer to the new block  */
+
+       gfs2_trans_add_bh(ip->i_gl, dibh);
+
+       gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+       if (ip->i_di.di_size) {
+               /* First pointer slot after the dinode, stored big-endian */
+               *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
+               ip->i_di.di_blocks++;
+       }
+
+       /* A zero length file gets height 1 with no block pointer */
+       ip->i_di.di_height = 1;
+
+       gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+ out_brelse:
+       brelse(dibh);
+
+ out:
+       up_write(&ip->i_rw_mutex);
+
+       return error;
+}
+
+/**
+ * calc_tree_height - Calculate the height of a metadata tree
+ * @ip: The GFS2 inode
+ * @size: The proposed size of the file
+ *
+ * Determine the metadata tree height needed for a file of @size bytes,
+ * using the superblock's height/size table for journaled or ordinary
+ * files as appropriate.  If @size is smaller than the inode's current
+ * size, the current size is used instead.
+ *
+ * Returns: the height the tree should be
+ */
+
+static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       int jdata = gfs2_is_jdata(ip);
+       uint64_t *limits = jdata ? sdp->sd_jheightsize : sdp->sd_heightsize;
+       unsigned int top = jdata ? sdp->sd_max_jheight : sdp->sd_max_height;
+       unsigned int h = 0;
+
+       if (size < ip->i_di.di_size)
+               size = ip->i_di.di_size;
+
+       /* First height whose capacity covers the size, capped at the max */
+       while (h < top && limits[h] < size)
+               h++;
+
+       return h;
+}
+
+/**
+ * build_height - Build a metadata tree of the requested height
+ * @ip: The GFS2 inode
+ * @height: The height to build to
+ *
+ * This routine grows the metadata tree one level at a time until the
+ * inode's height is at least @height.  Each step moves the dinode's
+ * current pointers into a newly allocated indirect block (if any of them
+ * is non-zero) and makes the dinode point at that block.
+ *
+ * Returns: errno
+ */
+
+static int build_height(struct gfs2_inode *ip, int height)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *bh, *dibh;
+       uint64_t block = 0, *bp;
+       unsigned int x;
+       int new_block;
+       int error;
+
+       while (ip->i_di.di_height < height) {
+               error = gfs2_meta_inode_buffer(ip, &dibh);
+               if (error)
+                       return error;
+
+               /* An indirect block is only needed if the dinode holds at
+                  least one non-zero pointer */
+               new_block = 0;
+               bp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
+               for (x = 0; x < sdp->sd_diptrs; x++, bp++)
+                       if (*bp) {
+                               new_block = 1;
+                               break;
+                       }
+
+               if (new_block) {
+                       /* Get a new block, fill it with the old direct
+                          pointers, and write it out */
+
+                       block = gfs2_alloc_meta(ip);
+
+                       bh = gfs2_meta_new(ip->i_gl, block);
+                       gfs2_trans_add_bh(ip->i_gl, bh);
+                       gfs2_metatype_set(bh,
+                                         GFS2_METATYPE_IN,
+                                         GFS2_FORMAT_IN);
+                       gfs2_buffer_copy_tail(bh,
+                                             sizeof(struct gfs2_meta_header),
+                                             dibh, sizeof(struct gfs2_dinode));
+
+                       brelse(bh);
+               }
+
+               /*  Set up the new direct pointer and write it out to disk  */
+
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+
+               gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+               if (new_block) {
+                       /* First pointer slot after the dinode, big-endian */
+                       *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
+                       ip->i_di.di_blocks++;
+               }
+
+               /* The height grows even when no block had to be added */
+               ip->i_di.di_height++;
+
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       return 0;
+}
+
+/**
+ * find_metapath - Find path through the metadata tree
+ * @ip: The inode pointer
+ * @mp: The metapath to return the result in
+ * @block: The disk block to look up
+ *
+ *   This routine returns a struct metapath structure that defines a path
+ *   through the metadata of inode "ip" to get to block "block".
+ *
+ *   Example:
+ *   Given:  "ip" is a height 3 file, "block" is the logical block holding
+ *   byte offset 101342453, and this is a filesystem with a blocksize of 4096.
+ *
+ *   find_metapath() would fill in the struct metapath so that
+ *   mp_list[0] = 0, mp_list[1] = 48,
+ *   and mp_list[2] = 165.
+ *
+ *   That means that in order to get to the block containing the byte at
+ *   offset 101342453, we would load the indirect block pointed to by pointer
+ *   0 in the dinode.  We would then load the indirect block pointed to by
+ *   pointer 48 in that indirect block.  We would then load the data block
+ *   pointed to by pointer 165 in that indirect block.
+ *
+ *             ----------------------------------------
+ *             | Dinode |                             |
+ *             |        |                            4|
+ *             |        |0 1 2 3 4 5                 9|
+ *             |        |                            6|
+ *             ----------------------------------------
+ *                       |
+ *                       |
+ *                       V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                                     5|
+ *             |            4 4 4 4 4 5 5            1|
+ *             |0           5 6 7 8 9 0 1            2|
+ *             ----------------------------------------
+ *                                |
+ *                                |
+ *                                V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                         1 1 1 1 1   5|
+ *             |                         6 6 6 6 6   1|
+ *             |0                        3 4 5 6 7   2|
+ *             ----------------------------------------
+ *                                           |
+ *                                           |
+ *                                           V
+ *             ----------------------------------------
+ *             | Data block containing offset         |
+ *             |            101342453                 |
+ *             |                                      |
+ *             |                                      |
+ *             ----------------------------------------
+ *
+ */
+
+static void find_metapath(struct gfs2_inode *ip, uint64_t block, struct metapath *mp)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       uint64_t remaining = block;
+       unsigned int level = ip->i_di.di_height;
+
+       /*
+        * Peel off one index per tree level, lowest level first: each
+        * do_div() leaves the quotient in "remaining" and hands back the
+        * remainder, which is the pointer slot to use at that level.
+        */
+       while (level > 0) {
+               level--;
+               mp->mp_list[level] = (__u16)do_div(remaining, sdp->sd_inptrs);
+       }
+}
+
+/**
+ * metapointer - Return pointer to start of metadata in a buffer
+ * @bh: The buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the block number of the next height of the metadata
+ * tree given a buffer containing the pointer to the current height of the
+ * metadata tree.
+ */
+
+static inline uint64_t *metapointer(struct buffer_head *bh,
+                                   unsigned int height, struct metapath *mp)
+{
+       unsigned int hdr_len;
+       uint64_t *first_ptr;
+
+       /* Height 0 is the dinode block; every other height is an indirect
+          block that only carries a meta header in front of its pointers */
+       if (height > 0)
+               hdr_len = sizeof(struct gfs2_meta_header);
+       else
+               hdr_len = sizeof(struct gfs2_dinode);
+
+       first_ptr = (uint64_t *)(bh->b_data + hdr_len);
+
+       return &first_ptr[mp->mp_list[height]];
+}
+
+/**
+ * lookup_block - Get the next metadata block in metadata tree
+ * @ip: The GFS2 inode
+ * @bh: Buffer containing the pointers to metadata blocks
+ * @height: The height of the tree (0 = dinode)
+ * @mp: The metapath
+ * @create: Non-zero if we may create a new metadata block
+ * @new: Used to indicate if we did create a new metadata block
+ * @block: the returned disk block number
+ *
+ * Given a metatree, complete to a particular height, checks to see if the next
+ * height of the tree exists. If not the next height of the tree is created.
+ * The block number of the next height of the metadata tree is returned.
+ *
+ */
+
+static void lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+                        unsigned int height, struct metapath *mp, int create,
+                        int *new, uint64_t *block)
+{
+       uint64_t *slot = metapointer(bh, height, mp);
+       uint64_t fresh;
+
+       /* The slot is already populated: report where it points */
+       if (*slot != 0) {
+               *block = be64_to_cpu(*slot);
+               return;
+       }
+
+       /* Hole: report block 0 unless the caller allows allocation */
+       *block = 0;
+       if (!create)
+               return;
+
+       /* Only the lowest level of a non-jdata inode is allocated as data;
+          everything else is allocated as metadata */
+       if (height != ip->i_di.di_height - 1 || gfs2_is_jdata(ip))
+               fresh = gfs2_alloc_meta(ip);
+       else
+               fresh = gfs2_alloc_data(ip);
+       *block = fresh;
+
+       /* Add the pointer buffer to the transaction before modifying it */
+       gfs2_trans_add_bh(ip->i_gl, bh);
+       *slot = cpu_to_be64(fresh);
+
+       ip->i_di.di_blocks++;
+       *new = 1;
+}
+
+/**
+ * gfs2_block_map - Map a block from an inode to a disk block
+ * @ip: The GFS2 inode
+ * @lblock: The logical block number
+ * @new: Value/Result argument (1 = may create/did create new blocks)
+ * @dblock: the disk block number of the start of an extent
+ * @extlen: the size of the extent
+ *
+ * Find the block number on the current device which corresponds to an
+ * inode's block. If the block had to be created, "new" will be set.
+ *
+ * Returns: errno
+ */
+
+int gfs2_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new,
+                  uint64_t *dblock, uint32_t *extlen)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *bh;
+       struct metapath mp;
+       /* *new is in/out: remember the caller's request before clearing it */
+       int create = *new;
+       unsigned int bsize;
+       unsigned int height;
+       unsigned int end_of_metadata;
+       unsigned int x;
+       int error = 0;
+
+       /* Default results: nothing created, no mapping, empty extent */
+       *new = 0;
+       *dblock = 0;
+       if (extlen)
+               *extlen = 0;
+
+       /* Take i_rw_mutex for writing only when allocation is allowed;
+          pure lookups take it for reading */
+       if (create)
+               down_write(&ip->i_rw_mutex);
+       else
+               down_read(&ip->i_rw_mutex);
+
+       /* A stuffed inode is not expected here.  If the check trips we leave
+          with error still 0 and *dblock still 0. */
+       if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+               goto out;
+
+       /* Journaled-data inodes use sd_jbsize bytes per block, all others
+          use the filesystem block size */
+       bsize = (gfs2_is_jdata(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+
+       /* Height the tree needs in order to reach the end of lblock */
+       height = calc_tree_height(ip, (lblock + 1) * bsize);
+       if (ip->i_di.di_height < height) {
+               /* Tree too short: on a lookup that simply means "no block" */
+               if (!create)
+                       goto out;
+
+               error = build_height(ip, height);
+               if (error)
+                       goto out;
+       }
+
+       find_metapath(ip, lblock, &mp);
+       /* Index of the lowest level of pointers in the tree */
+       end_of_metadata = ip->i_di.di_height - 1;
+
+       /* Start the walk at the dinode (height 0) */
+       error = gfs2_meta_inode_buffer(ip, &bh);
+       if (error)
+               goto out;
+
+       /* Walk down through the intermediate levels.  The buffer of the
+          current level is released as soon as its pointer has been read,
+          so no buffer is held on either "goto out" below. */
+       for (x = 0; x < end_of_metadata; x++) {
+               lookup_block(ip, bh, x, &mp, create, new, dblock);
+               brelse(bh);
+               if (!*dblock)
+                       goto out;
+
+               error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
+               if (error)
+                       goto out;
+       }
+
+       /* bh now holds the lowest level of pointers: look up the block */
+       lookup_block(ip, bh, end_of_metadata, &mp, create, new, dblock);
+
+       if (extlen && *dblock) {
+               *extlen = 1;
+
+               /* The extent is only extended when nothing was allocated by
+                  this call */
+               if (!*new) {
+                       uint64_t tmp_dblock;
+                       int tmp_new;
+                       unsigned int nptrs;
+
+                       /* Number of pointers in this buffer: dinode vs.
+                          indirect block */
+                       nptrs = (end_of_metadata) ? sdp->sd_inptrs :
+                                                   sdp->sd_diptrs;
+
+                       /* Step through the following pointers of the same
+                          buffer (mp is advanced in place) for as long as
+                          they are physically contiguous.  create is 0, so
+                          tmp_new is never written by lookup_block(). */
+                       while (++mp.mp_list[end_of_metadata] < nptrs) {
+                               lookup_block(ip, bh, end_of_metadata, &mp,
+                                            0, &tmp_new, &tmp_dblock);
+
+                               if (*dblock + *extlen != tmp_dblock)
+                                       break;
+
+                               (*extlen)++;
+                       }
+               }
+       }
+
+       brelse(bh);
+
+       /* lookup_block() changed the in-core dinode (di_blocks) when it
+          allocated, so write the dinode back out */
+       if (*new) {
+               error = gfs2_meta_inode_buffer(ip, &bh);
+               if (!error) {
+                       gfs2_trans_add_bh(ip->i_gl, bh);
+                       gfs2_dinode_out(&ip->i_di, bh->b_data);
+                       brelse(bh);
+               }
+       }
+
+ out:
+       /* Drop i_rw_mutex in the same mode it was taken */
+       if (create)
+               up_write(&ip->i_rw_mutex);
+       else
+               up_read(&ip->i_rw_mutex);
+
+       return error;
+}
+
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @bc: the call to make for each piece of metadata
+ * @data: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+                         struct metapath *mp, unsigned int height,
+                         uint64_t block, int first, block_call_t bc,
+                         void *data)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *bh = NULL;
+       uint64_t *top, *bottom;
+       uint64_t bn;
+       int error;
+       int mh_size = sizeof(struct gfs2_meta_header);
+
+       if (!height) {
+               /* Top of the tree: read the dinode itself.  The dibh argument
+                  is replaced by this buffer and handed down to every
+                  deeper call. */
+               error = gfs2_meta_inode_buffer(ip, &bh);
+               if (error)
+                       return error;
+               dibh = bh;
+
+               /* Scan from the metapath's slot to the last dinode pointer */
+               top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
+                       mp->mp_list[0];
+               bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
+                       sdp->sd_diptrs;
+       } else {
+               error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+               if (error)
+                       return error;
+
+               /* Only the first block visited at this height starts at the
+                  metapath's slot; all later blocks start at slot 0 */
+               top = (uint64_t *)(bh->b_data + mh_size) +
+                                 ((first) ? mp->mp_list[height] : 0);
+
+               bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+       }
+
+       /* The callback sees this buffer before any of its children */
+       error = bc(ip, dibh, bh, top, bottom, height, data);
+       if (error)
+               goto out;
+
+       /* Descend only while the pointers in this buffer lead to a further
+          level of pointers; "first" is cleared after the first slot */
+       if (height < ip->i_di.di_height - 1)
+               for (; top < bottom; top++, first = 0) {
+                       if (!*top)
+                               continue;
+
+                       bn = be64_to_cpu(*top);
+
+                       error = recursive_scan(ip, dibh, mp, height + 1, bn,
+                                              first, bc, data);
+                       if (error)
+                               break;
+               }
+
+ out:
+       /* Releases the buffer read by this call (the dinode at height 0) */
+       brelse(bh);
+
+       return error;
+}
+
+/**
+ * do_strip - Look for a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @data: a pointer to a struct strip_mine
+ *
+ * Returns: errno
+ */
+
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+                   struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
+                   unsigned int height, void *data)
+{
+       struct strip_mine *sm = (struct strip_mine *)data;
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_rgrp_list rlist;
+       uint64_t bn, bstart;
+       uint32_t blen;
+       uint64_t *p;
+       unsigned int rg_blocks = 0;
+       int metadata;
+       unsigned int revokes = 0;
+       int x;
+       int error;
+
+       /* An unallocated first pointer clears sm_first.  This is checked at
+          every height, before the height filter below. */
+       if (!*top)
+               sm->sm_first = 0;
+
+       /* Only the layer selected by the caller is stripped in this pass */
+       if (height != sm->sm_height)
+               return 0;
+
+       /* While sm_first is set, the first pointer in range is stepped over
+          (it is kept, not freed); this happens at most once per strip_mine */
+       if (sm->sm_first) {
+               top++;
+               sm->sm_first = 0;
+       }
+
+       /* Blocks are freed as metadata unless they hang off the lowest level
+          of a non-jdata inode.  Revokes are requested for metadata only,
+          one per pointer slot in this kind of buffer. */
+       metadata = (height != ip->i_di.di_height - 1) || gfs2_is_jdata(ip);
+       if (metadata)
+               revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+
+       error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+       if (error)
+               return error;
+
+       memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+       bstart = 0;
+       blen = 0;
+
+       /* Pass 1: coalesce the pointers into contiguous runs and add the
+          start block of each run to rlist */
+       for (p = top; p < bottom; p++) {
+               if (!*p)
+                       continue;
+
+               bn = be64_to_cpu(*p);
+
+               if (bstart + blen == bn)
+                       blen++;
+               else {
+                       if (bstart)
+                               gfs2_rlist_add(sdp, &rlist, bstart);
+
+                       bstart = bn;
+                       blen = 1;
+               }
+       }
+
+       if (bstart)
+               gfs2_rlist_add(sdp, &rlist, bstart);
+       else
+               goto out; /* Nothing to do */
+
+       gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+       /* Sum ri_length over every resource group in the list; the total is
+          part of the block count requested from gfs2_trans_begin() */
+       for (x = 0; x < rlist.rl_rgrps; x++) {
+               struct gfs2_rgrpd *rgd;
+               rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
+               rg_blocks += rgd->rd_ri.ri_length;
+       }
+
+       error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+       if (error)
+               goto out_rlist;
+
+       error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+                                RES_INDIRECT + RES_STATFS + RES_QUOTA,
+                                revokes);
+       if (error)
+               goto out_rg_gunlock;
+
+       down_write(&ip->i_rw_mutex);
+
+       /* Both the dinode and the pointer buffer are modified below */
+       gfs2_trans_add_bh(ip->i_gl, dibh);
+       gfs2_trans_add_bh(ip->i_gl, bh);
+
+       bstart = 0;
+       blen = 0;
+
+       /* Pass 2: same run detection as pass 1, but now free each completed
+          run, zero every pointer and decrement di_blocks per pointer */
+       for (p = top; p < bottom; p++) {
+               if (!*p)
+                       continue;
+
+               bn = be64_to_cpu(*p);
+
+               if (bstart + blen == bn)
+                       blen++;
+               else {
+                       if (bstart) {
+                               if (metadata)
+                                       gfs2_free_meta(ip, bstart, blen);
+                               else
+                                       gfs2_free_data(ip, bstart, blen);
+                       }
+
+                       bstart = bn;
+                       blen = 1;
+               }
+
+               *p = 0;
+               /* di_blocks is about to be decremented; a zero count here is
+                  reported as an inode consistency problem */
+               if (!ip->i_di.di_blocks)
+                       gfs2_consist_inode(ip);
+               ip->i_di.di_blocks--;
+       }
+       /* Free the final run, which the loop above leaves pending */
+       if (bstart) {
+               if (metadata)
+                       gfs2_free_meta(ip, bstart, blen);
+               else
+                       gfs2_free_data(ip, bstart, blen);
+       }
+
+       ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+       gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+       up_write(&ip->i_rw_mutex);
+
+       gfs2_trans_end(sdp);
+
+       /* The labels below undo the setup steps in reverse order */
+ out_rg_gunlock:
+       gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+
+ out_rlist:
+       gfs2_rlist_free(&rlist);
+
+ out:
+       gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+
+       return error;
+}
+
+/**
+ * do_grow - Make a file look bigger than it is
+ * @ip: the inode
+ * @size: the size to set the file to
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_grow(struct gfs2_inode *ip, uint64_t size)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_alloc *al;
+       struct buffer_head *dibh;
+       unsigned int h;
+       int error;
+
+       /* Paired with gfs2_alloc_put() at "out" */
+       al = gfs2_alloc_get(ip);
+
+       error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (error)
+               goto out;
+
+       error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+       if (error)
+               goto out_gunlock_q;
+
+       /* Ask for sd_max_height + RES_DATA blocks before reserving */
+       al->al_requested = sdp->sd_max_height + RES_DATA;
+
+       error = gfs2_inplace_reserve(ip);
+       if (error)
+               goto out_gunlock_q;
+
+       error = gfs2_trans_begin(sdp,
+                       sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
+                       RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+       if (error)
+               goto out_ipres;
+
+       /* A size larger than the space behind the dinode header in one block
+          needs an unstuffed inode with a tall enough metadata tree */
+       if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+               if (gfs2_is_stuffed(ip)) {
+                       error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
+                                                   NULL);
+                       if (error)
+                               goto out_end_trans;
+               }
+
+               h = calc_tree_height(ip, size);
+               if (ip->i_di.di_height < h) {
+                       /* build_height() runs with i_rw_mutex held for
+                          writing */
+                       down_write(&ip->i_rw_mutex);
+                       error = build_height(ip, h);
+                       up_write(&ip->i_rw_mutex);
+                       if (error)
+                               goto out_end_trans;
+               }
+       }
+
+       /* Only the recorded size and the timestamps change here */
+       ip->i_di.di_size = size;
+       ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (error)
+               goto out_end_trans;
+
+       gfs2_trans_add_bh(ip->i_gl, dibh);
+       gfs2_dinode_out(&ip->i_di, dibh->b_data);
+       brelse(dibh);
+
+       /* The labels below undo the setup steps in reverse order; the
+          success path falls through all of them */
+ out_end_trans:
+       gfs2_trans_end(sdp);
+
+ out_ipres:
+       gfs2_inplace_release(ip);
+
+ out_gunlock_q:
+       gfs2_quota_unlock(ip);
+
+ out:
+       gfs2_alloc_put(ip);
+
+       return error;
+}
+
+static int truncator_journaled(struct gfs2_inode *ip, uint64_t size)
+{
+       struct buffer_head *jbh;
+       uint64_t logical = size;
+       uint64_t disk;
+       uint32_t tail_off;
+       int created = 0;
+       int rc;
+
+       /* Split the new size into the index of the block that will become
+          the last one and the byte offset of the new end within it */
+       tail_off = do_div(logical, ip->i_sbd->sd_jbsize);
+
+       /* Lookup only (created stays 0), no extent length wanted */
+       rc = gfs2_block_map(ip, logical, &created, &disk, NULL);
+       if (rc)
+               return rc;
+       if (!disk)
+               return 0;       /* nothing mapped there, nothing to clear */
+
+       rc = gfs2_jdata_get_buffer(ip, disk, 0, &jbh);
+       if (rc)
+               return rc;
+
+       /* Clear everything in the block past the new end of file */
+       gfs2_trans_add_bh(ip->i_gl, jbh);
+       gfs2_buffer_clear_tail(jbh, sizeof(struct gfs2_meta_header) + tail_off);
+       brelse(jbh);
+
+       return 0;
+}
+
+/*
+ * trunc_start - first stage of shrinking a file: record the new size
+ *
+ * Returns: a negative errno on failure, 1 when the inode is stuffed and
+ * the truncate is therefore already complete, 0 when the block
+ * deallocation stage still has to run (do_shrink() acts on all three).
+ */
+static int trunc_start(struct gfs2_inode *ip, uint64_t size,
+                      gfs2_truncator_t truncator)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *dibh;
+       int journaled = gfs2_is_jdata(ip);
+       int error;
+
+       /* RES_JDATA is only added to the reservation for journaled files */
+       error = gfs2_trans_begin(sdp,
+                                RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
+       if (error)
+               return error;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (error)
+               goto out;
+
+       if (gfs2_is_stuffed(ip)) {
+               /* All data lives in the dinode block: set the size and clear
+                  the bytes past it */
+               ip->i_di.di_size = size;
+               ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+               error = 1;
+
+       } else {
+               /* The truncator is only called when the new size does not
+                  fall on a block boundary.  error is still 0 here if no
+                  truncator call is needed. */
+               if (journaled) {
+                       uint64_t junk = size;
+                       /* we're just interested in the modulus */
+                       if (do_div(junk, sdp->sd_jbsize))
+                               error = truncator_journaled(ip, size);
+               } else if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
+                       error = truncator(ip, size);
+
+               if (!error) {
+                       /* Set GFS2_DIF_TRUNC_IN_PROG together with the new
+                          size; trunc_end() clears the flag again */
+                       ip->i_di.di_size = size;
+                       ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+                       ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+                       gfs2_trans_add_bh(ip->i_gl, dibh);
+                       gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               }
+       }
+
+       brelse(dibh);
+
+ out:
+       gfs2_trans_end(sdp);
+
+       return error;
+}
+
+static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
+{
+       unsigned int level = ip->i_di.di_height;
+       uint64_t last_lblock = 0;
+       struct metapath mp;
+       struct strip_mine sm;
+       int rc;
+
+       /* Logical block holding the last byte that is kept; stays 0 when
+          the whole file goes away */
+       if (size) {
+               if (gfs2_is_jdata(ip)) {
+                       last_lblock = size - 1;
+                       do_div(last_lblock, ip->i_sbd->sd_jbsize);
+               } else {
+                       last_lblock = (size - 1) >>
+                                     ip->i_sbd->sd_sb.sb_bsize_shift;
+               }
+       }
+
+       find_metapath(ip, last_lblock, &mp);
+       gfs2_alloc_get(ip);
+
+       rc = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (!rc) {
+               /* Strip one layer per scan, lowest layer first */
+               while (level > 0) {
+                       level--;
+
+                       sm.sm_first = (size) ? 1 : 0;
+                       sm.sm_height = level;
+
+                       rc = recursive_scan(ip, NULL, &mp, 0, 0, 1,
+                                           do_strip, &sm);
+                       if (rc)
+                               break;
+               }
+
+               gfs2_quota_unhold(ip);
+       }
+
+       gfs2_alloc_put(ip);
+       return rc;
+}
+
+static int trunc_end(struct gfs2_inode *ip)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *bh;
+       int rc;
+
+       rc = gfs2_trans_begin(sdp, RES_DINODE, 0);
+       if (rc)
+               return rc;
+
+       down_write(&ip->i_rw_mutex);
+
+       rc = gfs2_meta_inode_buffer(ip, &bh);
+       if (!rc) {
+               /* A file truncated to nothing goes back to height 0, with
+                  both allocation goals reset to the inode's own address
+                  and the pointer area of the dinode wiped */
+               if (ip->i_di.di_size == 0) {
+                       ip->i_di.di_height = 0;
+                       ip->i_di.di_goal_meta =
+                               ip->i_di.di_goal_data =
+                               ip->i_num.no_addr;
+                       gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_dinode));
+               }
+
+               /* The truncate is finished: stamp the times and clear the
+                  in-progress flag */
+               ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+               ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+
+               gfs2_trans_add_bh(ip->i_gl, bh);
+               gfs2_dinode_out(&ip->i_di, bh->b_data);
+               brelse(bh);
+       }
+
+       up_write(&ip->i_rw_mutex);
+       gfs2_trans_end(sdp);
+
+       return rc;
+}
+
+/**
+ * do_shrink - make a file smaller
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_shrink(struct gfs2_inode *ip, uint64_t size,
+                    gfs2_truncator_t truncator)
+{
+       int rc = trunc_start(ip, size, truncator);
+
+       /* Negative: failure.  Positive: trunc_start() already finished the
+          whole job, so report success. */
+       if (rc < 0)
+               return rc;
+       if (rc > 0)
+               return 0;
+
+       /* Zero: free the blocks past the new size, then finish up */
+       rc = trunc_dealloc(ip, size);
+       if (rc)
+               return rc;
+
+       return trunc_end(ip);
+}
+
+/**
+ * gfs2_truncatei - make a file a given size
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * The file size can grow, shrink, or stay the same size.
+ *
+ * Returns: errno
+ */
+
+int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size,
+                  gfs2_truncator_t truncator)
+{
+       /* Only regular files are expected here */
+       if (gfs2_assert_warn(ip->i_sbd, S_ISREG(ip->i_di.di_mode)))
+               return -EINVAL;
+
+       /* Strictly larger grows; equal or smaller takes the shrink path */
+       if (size > ip->i_di.di_size)
+               return do_grow(ip, size);
+
+       return do_shrink(ip, size, truncator);
+}
+
+int gfs2_truncatei_resume(struct gfs2_inode *ip)
+{
+       /* Rerun the deallocation stage against the size already recorded in
+          the dinode, then run the final stage */
+       int rc = trunc_dealloc(ip, ip->i_di.di_size);
+
+       if (rc)
+               return rc;
+
+       return trunc_end(ip);
+}
+
+/* Run the block deallocation stage for size 0, i.e. over the whole tree.
+   Unlike do_shrink(), neither trunc_start() nor trunc_end() is called. */
+int gfs2_file_dealloc(struct gfs2_inode *ip)
+{
+       return trunc_dealloc(ip, 0);
+}
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+                           unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       unsigned int level_blocks;
+
+       /* Base estimate; journaled-data files have their own block size
+          and maximum tree height */
+       if (!gfs2_is_jdata(ip)) {
+               *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+               *ind_blocks = 3 * (sdp->sd_max_height - 1);
+       } else {
+               *data_blocks = DIV_RU(len, sdp->sd_jbsize) + 2;
+               *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
+       }
+
+       /* Add the blocks needed at each pointer level above the data,
+          until what is left fits into the dinode's own pointers */
+       level_blocks = *data_blocks;
+       while (level_blocks > sdp->sd_diptrs) {
+               level_blocks = DIV_RU(level_blocks, sdp->sd_inptrs);
+               *ind_blocks += level_blocks;
+       }
+}
+
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
+                             unsigned int len, int *alloc_required)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       uint64_t cur, stop, dblock;
+       uint32_t extlen;
+       int new = 0;
+       int rc;
+
+       *alloc_required = 0;
+
+       /* An empty write never allocates */
+       if (!len)
+               return 0;
+
+       /* Stuffed: allocation is needed only if the write ends beyond the
+          space behind the dinode header */
+       if (gfs2_is_stuffed(ip)) {
+               if (offset + len >
+                   sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+                       *alloc_required = 1;
+               return 0;
+       }
+
+       /* Logical block range [cur, stop) touched by the write */
+       if (!gfs2_is_jdata(ip)) {
+               unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+
+               cur = offset >> shift;
+               stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+       } else {
+               unsigned int bsize = sdp->sd_jbsize;
+
+               cur = offset;
+               do_div(cur, bsize);
+               stop = offset + len + bsize - 1;
+               do_div(stop, bsize);
+       }
+
+       /* Look up one extent at a time (no creation); the first hole found
+          means an allocation is required */
+       while (cur < stop) {
+               rc = gfs2_block_map(ip, cur, &new, &dblock, &extlen);
+               if (rc)
+                       return rc;
+
+               if (!dblock) {
+                       *alloc_required = 1;
+                       return 0;
+               }
+
+               cur += extlen;
+       }
+
+       return 0;
+}
+
+/**
+ * do_gfm - Copy out the dinode/indirect blocks of a file
+ * @ip: the file
+ * @dibh: the dinode buffer
+ * @bh: the indirect buffer we're looking at
+ * @top: the first pointer in the block
+ * @bottom: one more than the last pointer in the block
+ * @height: the height the block is at
+ * @data: a pointer to a struct gfs2_user_buffer structure
+ *
+ * If this is a directory, also copy out the blocks its lowest-level pointers reference.
+ *
+ * Returns: errno
+ */
+
+static int do_gfm(struct gfs2_inode *ip, struct buffer_head *dibh,
+                 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
+                 unsigned int height, void *data)
+{
+       struct gfs2_user_buffer *ub = data;
+       uint64_t *slot;
+       int rc;
+
+       /* The pointer block itself is always copied out */
+       rc = gfs2_add_bh_to_ub(ub, bh);
+       if (rc)
+               return rc;
+
+       /* The referenced blocks are copied too only for a directory, and
+          only from its lowest level of pointers */
+       if (!S_ISDIR(ip->i_di.di_mode))
+               return 0;
+       if (height + 1 != ip->i_di.di_height)
+               return 0;
+
+       for (slot = top; slot < bottom; slot++) {
+               struct buffer_head *leaf_bh;
+
+               if (!*slot)
+                       continue;
+
+               rc = gfs2_meta_read(ip->i_gl, be64_to_cpu(*slot),
+                                   DIO_START | DIO_WAIT,
+                                   &leaf_bh);
+               if (rc)
+                       return rc;
+
+               rc = gfs2_add_bh_to_ub(ub, leaf_bh);
+               brelse(leaf_bh);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_get_file_meta - return all the metadata for a file
+ * @ip: the file
+ * @ub: the structure representing the meta
+ *
+ * Returns: errno
+ */
+
+int gfs2_get_file_meta(struct gfs2_inode *ip, struct gfs2_user_buffer *ub)
+{
+       struct buffer_head *dibh;
+       struct metapath mp;
+       int rc;
+
+       /* Unstuffed: scan the whole tree starting from logical block 0 */
+       if (!gfs2_is_stuffed(ip)) {
+               find_metapath(ip, 0, &mp);
+               return recursive_scan(ip, NULL, &mp, 0, 0, 1, do_gfm, ub);
+       }
+
+       /* Stuffed: the dinode block is the only block copied out */
+       rc = gfs2_meta_inode_buffer(ip, &dibh);
+       if (rc)
+               return rc;
+
+       rc = gfs2_add_bh_to_ub(ub, dibh);
+       brelse(dibh);
+
+       return rc;
+}
+
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644 (file)
index 0000000..de16e44
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __BMAP_DOT_H__
+#define __BMAP_DOT_H__
+
+/* Callback type accepted by gfs2_unstuff_dinode(); @private is passed
+   through to it unchanged (NOTE(review): confirm against bmap.c) */
+typedef int (*gfs2_unstuffer_t) (struct gfs2_inode * ip,
+                                struct buffer_head * dibh, uint64_t block,
+                                void *private);
+int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
+                       uint64_t block, void *private);
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
+                       void *private);
+
+/* Map logical block @lblock to a disk block.  *new is in/out: non-zero on
+   entry allows allocation, non-zero on return means blocks were created.
+   @extlen may be NULL when the extent length is not wanted. */
+int gfs2_block_map(struct gfs2_inode *ip,
+                  uint64_t lblock, int *new,
+                  uint64_t *dblock, uint32_t *extlen);
+
+/* Callback used when shrinking a non-journaled file to a size that is not
+   a multiple of the block size */
+typedef int (*gfs2_truncator_t) (struct gfs2_inode * ip, uint64_t size);
+int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size,
+                  gfs2_truncator_t truncator);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+
+/* Block estimate for writing @len bytes, returned through @data_blocks and
+   @ind_blocks */
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+                           unsigned int *data_blocks,
+                           unsigned int *ind_blocks);
+/* Sets *alloc_required to 1 if writing @len bytes at @offset would need
+   an allocation, 0 otherwise */
+int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
+                             unsigned int len, int *alloc_required);
+
+int gfs2_get_file_meta(struct gfs2_inode *ip, struct gfs2_user_buffer *ub);
+
+#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644 (file)
index 0000000..cff8d53
--- /dev/null
@@ -0,0 +1,225 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "daemon.h"
+#include "glock.h"
+#include "log.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super.h"
+#include "unlinked.h"
+
+/* This uses schedule_timeout() instead of msleep() because it's good for
+   the daemons to wake up more often than the timeout when unmounting so
+   the user's unmount doesn't sit there forever.
+   
+   The kthread functions used to start these daemons block and flush signals. */
+
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @data: Pointer to the GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ *
+ * Each pass calls gfs2_scand_internal() and then sleeps interruptibly for
+ * gt_scand_secs seconds.  The loop ends once kthread_should_stop() is true.
+ *
+ * Returns: 0
+ */
+
+int gfs2_scand(void *data)
+{
+       struct gfs2_sbd *sdp = data;
+
+       for (;;) {
+               unsigned long nap;
+
+               if (kthread_should_stop())
+                       break;
+
+               gfs2_scand_internal(sdp);
+
+               nap = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
+               schedule_timeout_interruptible(nap);
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_glockd - Reclaim unused glock structures
+ * @data: Pointer to GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
+ * Number of daemons can be set by user, with num_glockd mount option.
+ *
+ * Unlike the timer-driven daemons in this file, this one sleeps on
+ * sd_reclaim_wq with no timeout and relies on being woken.
+ *
+ * Returns: 0 once kthread_should_stop() is true
+ */
+
+int gfs2_glockd(void *data)
+{
+       struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
+       DECLARE_WAITQUEUE(wait_chan, current);
+
+       while (!kthread_should_stop()) {
+               /* Drain: reclaim for as long as the counter is non-zero. */
+               while (atomic_read(&sdp->sd_reclaim_count))
+                       gfs2_reclaim_glock(sdp);
+
+               /* The counter and the stop flag are re-tested only after the
+                  task state is set and the task is on the wait queue;
+                  schedule() is skipped if there is already work or a stop
+                  request. */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
+               if (!atomic_read(&sdp->sd_reclaim_count) &&
+                   !kthread_should_stop())
+                       schedule();
+               remove_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
+               /* Covers the path where schedule() was not called. */
+               set_current_state(TASK_RUNNING);
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @data: Pointer to the GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Calls gfs2_check_journals() once per pass, then sleeps interruptibly for
+ * gt_recoverd_secs seconds, until kthread_should_stop() is true.
+ *
+ * Returns: 0
+ */
+
+int gfs2_recoverd(void *data)
+{
+       struct gfs2_sbd *sdp = data;
+       unsigned long pause;
+
+       do {
+               if (kthread_should_stop())
+                       break;
+
+               gfs2_check_journals(sdp);
+
+               pause = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
+               schedule_timeout_interruptible(pause);
+       } while (1);
+
+       return 0;
+}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @data: Pointer to the GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ *
+ * Returns: 0 once kthread_should_stop() is true
+ */
+
+int gfs2_logd(void *data)
+{
+       struct gfs2_sbd *sdp = data;
+       struct gfs2_holder ji_gh;
+       unsigned long due;
+
+       for (;;) {
+               if (kthread_should_stop())
+                       break;
+
+               /* Log tail: the flush deadline is computed before the AIL
+                  pass and tested after it. */
+
+               due = sdp->sd_log_flush_time +
+                     gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+               gfs2_ail1_empty(sdp, DIO_ALL);
+
+               if (time_after_eq(jiffies, due)) {
+                       gfs2_log_flush(sdp);
+                       sdp->sd_log_flush_time = jiffies;
+               }
+
+               /* Journal index: take and immediately drop the hold; a
+                  failed hold is ignored, but the timestamp still moves. */
+
+               due = sdp->sd_jindex_refresh_time +
+                     gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+
+               if (time_after_eq(jiffies, due)) {
+                       int held = !gfs2_jindex_hold(sdp, &ji_gh);
+
+                       if (held)
+                               gfs2_glock_dq_uninit(&ji_gh);
+                       sdp->sd_jindex_refresh_time = jiffies;
+               }
+
+               due = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+               schedule_timeout_interruptible(due);
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @data: Pointer to the GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Each pass syncs the master statfs file and the quota file when their
+ * respective quanta have elapsed, runs gfs2_quota_scan(), then sleeps
+ * interruptibly for gt_quotad_secs seconds.  Sync errors other than -EROFS
+ * are logged unless SDF_SHUTDOWN is set.
+ *
+ * Returns: 0 once kthread_should_stop() is true
+ */
+
+int gfs2_quotad(void *data)
+{
+       struct gfs2_sbd *sdp = data;
+       unsigned long when;
+       int rc;
+
+       for (;;) {
+               if (kthread_should_stop())
+                       break;
+
+               /* Master statfs file */
+
+               when = sdp->sd_statfs_sync_time +
+                      gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+
+               if (time_after_eq(jiffies, when)) {
+                       rc = gfs2_statfs_sync(sdp);
+                       switch (rc) {
+                       case 0:
+                       case -EROFS:
+                               break;
+                       default:
+                               if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                                       fs_err(sdp, "quotad: (1) error=%d\n", rc);
+                               break;
+                       }
+                       sdp->sd_statfs_sync_time = jiffies;
+               }
+
+               /* Quota file */
+
+               when = sdp->sd_quota_sync_time +
+                      gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
+
+               if (time_after_eq(jiffies, when)) {
+                       rc = gfs2_quota_sync(sdp);
+                       switch (rc) {
+                       case 0:
+                       case -EROFS:
+                               break;
+                       default:
+                               if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                                       fs_err(sdp, "quotad: (2) error=%d\n", rc);
+                               break;
+                       }
+                       sdp->sd_quota_sync_time = jiffies;
+               }
+
+               gfs2_quota_scan(sdp);
+
+               when = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
+               schedule_timeout_interruptible(when);
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_inoded - Deallocate unlinked inodes
+ * @data: Pointer to the GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Calls gfs2_unlinked_dealloc() once per pass and logs any error other than
+ * -EROFS unless SDF_SHUTDOWN is set, then sleeps interruptibly for
+ * gt_inoded_secs seconds.
+ *
+ * Returns: 0 once kthread_should_stop() is true
+ */
+
+int gfs2_inoded(void *data)
+{
+       struct gfs2_sbd *sdp = data;
+
+       for (;;) {
+               unsigned long nap;
+               int rc;
+
+               if (kthread_should_stop())
+                       break;
+
+               rc = gfs2_unlinked_dealloc(sdp);
+               switch (rc) {
+               case 0:
+               case -EROFS:
+                       break;
+               default:
+                       if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                               fs_err(sdp, "inoded: error = %d\n", rc);
+                       break;
+               }
+
+               nap = gfs2_tune_get(sdp, gt_inoded_secs) * HZ;
+               schedule_timeout_interruptible(nap);
+       }
+
+       return 0;
+}
+
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644 (file)
index 0000000..a27fded
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __DAEMON_DOT_H__
+#define __DAEMON_DOT_H__
+
+/* Daemon thread bodies defined in daemon.c.  Each one casts @data to a
+   struct gfs2_sbd pointer, loops until kthread_should_stop() is true and
+   then returns 0. */
+int gfs2_scand(void *data);
+int gfs2_glockd(void *data);
+int gfs2_recoverd(void *data);
+int gfs2_logd(void *data);
+int gfs2_quotad(void *data);
+int gfs2_inoded(void *data);
+
+#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644 (file)
index 0000000..6b1dc3d
--- /dev/null
@@ -0,0 +1,2157 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+/*
+* Implements Extendible Hashing as described in:
+*   "Extendible Hashing" by Fagin, et al in
+*     __ACM Trans. on Database Systems__, Sept 1979.
+*
+*
+* Here's the layout of dirents which is essentially the same as that of ext2
+* within a single block. The field de_name_len is the number of bytes
+* actually required for the name (no null terminator). The field de_rec_len
+* is the number of bytes allocated to the dirent. The offset of the next
+* dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+* deleted, the preceding dirent inherits its allocated space, ie
+* prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+* by adding de_rec_len to the current dirent, this essentially causes the
+* deleted dirent to get jumped over when iterating through all the dirents.
+*
+* When deleting the first dirent in a block, there is no previous dirent so
+* the field de_ino is set to zero to designate it as deleted. When allocating
+* a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
+* first dirent has (de_ino == 0) and de_rec_len is large enough, this first
+* dirent is allocated. Otherwise it must go through all the 'used' dirents
+* searching for one in which the amount of total space minus the amount of
+* used space will provide enough space for the new dirent.
+*
+* There are two types of blocks in which dirents reside. In a stuffed dinode,
+* the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+* the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+* beginning of the leaf block. The dirents reside in leaves when
+*
+* dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+*
+* Otherwise, the dirents are "linear", within a single stuffed dinode block.
+*
+* When the dirents are in leaves, the actual contents of the directory file are
+* used as an array of 64-bit block pointers pointing to the leaf blocks. The
+* dirents are NOT in the directory file itself. There can be more than one block
+* pointer in the array that points to the same leaf. In fact, when a directory
+* is first converted from linear to exhash, all of the pointers point to the
+* same leaf.
+*
+* When a leaf is completely full, the size of the hash table can be
+* doubled unless it is already at the maximum size which is hard coded into
+* GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
+* but never before the maximum hash table size has been reached.
+*/
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "jdata.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+
+/* Block kinds reported by dirent_first(). */
+#define IS_LEAF     1 /* Hashed (leaf) directory */
+#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
+
+/* Conversions between a 32-bit hash and a 64-bit directory offset.  The
+   active ("#if 1") pair shifts the hash right by one bit to form the offset
+   and shifts it back left by one, truncated to 32 bits, to recover it, so
+   the hash's lowest bit does not survive a round trip.  The "#else" pair is
+   a plain widening/narrowing with no shift.
+   NOTE(review): presumably the shift keeps offsets within 31 bits for
+   readdir consumers -- confirm against the users of these macros. */
+#if 1
+#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
+#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
+#else
+#define gfs2_disk_hash2offset(h) (((uint64_t)(h)))
+#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
+#endif
+
+/* Per-leaf callback type: receives the directory, a hash-table index and
+   length, the leaf's block number, and an opaque @data pointer.  Its users
+   are further down the file. */
+typedef int (*leaf_call_t) (struct gfs2_inode *dip,
+                           uint32_t index, uint32_t len, uint64_t leaf_no,
+                           void *data);
+
+/**
+ * gfs2_filecmp - Compare two filenames
+ * @file1: The first filename, as a qstr (name pointer plus length)
+ * @file2: The second filename, as raw bytes (not necessarily terminated)
+ * @len_of_file2: The length of the second filename in bytes
+ *
+ * The names are equal when the lengths match and the first file1->len bytes
+ * compare equal with memcmp().
+ *
+ * Returns: 1 if the files are the same, otherwise 0.
+ */
+
+int gfs2_filecmp(struct qstr *file1, char *file2, int len_of_file2)
+{
+       return file1->len == len_of_file2 &&
+              memcmp(file1->name, file2, file1->len) == 0;
+}
+
+/**
+ * dirent_first - Return the first dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Receives a pointer to the first dirent in @bh
+ *
+ * A block whose meta header type is GFS2_METATYPE_LF is treated as a leaf and
+ * its dirents start after struct gfs2_leaf.  Any other block must pass the
+ * GFS2_METATYPE_DI check and its dirents start after struct gfs2_dinode.
+ * @dent is left untouched when a check fails.
+ *
+ * Returns: IS_LEAF, IS_DINODE, or -errno
+ */
+
+static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
+                       struct gfs2_dirent **dent)
+{
+       struct gfs2_meta_header *hdr = (struct gfs2_meta_header *)bh->b_data;
+       unsigned int skip;
+       int kind;
+
+       if (be16_to_cpu(hdr->mh_type) == GFS2_METATYPE_LF) {
+               if (gfs2_meta_check(dip->i_sbd, bh))
+                       return -EIO;
+               skip = sizeof(struct gfs2_leaf);
+               kind = IS_LEAF;
+       } else {
+               if (gfs2_metatype_check(dip->i_sbd, bh, GFS2_METATYPE_DI))
+                       return -EIO;
+               skip = sizeof(struct gfs2_dinode);
+               kind = IS_DINODE;
+       }
+
+       *dent = (struct gfs2_dirent *)(bh->b_data + skip);
+
+       return kind;
+}
+
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: In: the current dirent (must lie inside @bh).  Out: the following
+ *        dirent; only updated when 0 is returned.
+ *
+ * Record lengths come straight from disk, so they are validated with length
+ * arithmetic before any pointer derived from them is formed or followed.
+ * Every failed validation marks the inode inconsistent.
+ *
+ * Returns: 0 on success, -ENOENT if the current dirent is the last one in
+ *          the block, -EIO if the block contents are inconsistent
+ */
+
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+                      struct gfs2_dirent **dent)
+{
+       struct gfs2_dirent *tmp, *cur;
+       char *bh_end;
+       size_t avail;
+       uint32_t cur_rec_len;
+
+       cur = *dent;
+       bh_end = bh->b_data + bh->b_size;
+       avail = bh_end - (char *)cur;
+       cur_rec_len = be32_to_cpu(cur->de_rec_len);
+
+       /* A record must hold at least its own header and must not run past
+          the end of the block.  Without the lower bound a zero length would
+          make the caller revisit this same dirent forever. */
+       if (cur_rec_len < sizeof(struct gfs2_dirent) || cur_rec_len > avail) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       /* Ends exactly at the block boundary: this was the last dirent. */
+       if (cur_rec_len == avail)
+               return -ENOENT;
+
+       tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
+       avail -= cur_rec_len;
+
+       /* The next header must fit in the block before its fields are read,
+          and its record must not overrun the block either. */
+       if (avail < sizeof(struct gfs2_dirent) ||
+           be32_to_cpu(tmp->de_rec_len) > avail) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+       /* Only the first dent could ever have de_inum.no_addr == 0 */
+       if (!tmp->de_inum.no_addr) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       *dent = tmp;
+
+       return 0;
+}
+
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent, or NULL if @cur is the first one in the block
+ * @cur: The current dirent
+ *
+ * With no @prev the entry is marked unused by zeroing its inode address.
+ * Otherwise @prev absorbs @cur's record length.  Layout problems are
+ * reported through gfs2_consist_inode(), but the merge is still carried out.
+ */
+
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+                      struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+       char *victim = (char *)cur;
+       uint32_t victim_len, keeper_len;
+
+       /* Deleting an already unused entry means something is wrong. */
+       if (!cur->de_inum.no_addr) {
+               gfs2_consist_inode(dip);
+               return;
+       }
+
+       gfs2_trans_add_bh(dip->i_gl, bh);
+
+       /* First entry in the block: its record length stays as it is and
+          a zero inode address (same in any byte order) marks it free. */
+       if (!prev) {
+               cur->de_inum.no_addr = 0;
+               return;
+       }
+
+       /* Fold the deleted record into its predecessor. */
+       keeper_len = be32_to_cpu(prev->de_rec_len);
+       victim_len = be32_to_cpu(cur->de_rec_len);
+
+       if ((char *)prev + keeper_len != victim)
+               gfs2_consist_inode(dip);
+       if (victim + victim_len > bh->b_data + bh->b_size)
+               gfs2_consist_inode(dip);
+
+       prev->de_rec_len = cpu_to_be32(keeper_len + victim_len);
+}
+
+/**
+ * gfs2_dirent_alloc - Allocate a directory entry
+ * @dip: The GFS2 inode
+ * @bh: The buffer (a leaf block or a stuffed dinode block)
+ * @name_len: The length of the name
+ * @dent_out: Receives the dirent that was carved out for the new name
+ *
+ * Only de_rec_len and de_name_len of the returned dirent are set here (a
+ * dirent split off from an in-use one also has its header zeroed first).
+ * The buffer is added to the transaction before it is modified.
+ *
+ * Returns: 0 on success, -EIO on an inconsistent block, -ENOSPC if no dirent
+ *          in the block has room.  Any dirent_next() failure, not just
+ *          end-of-block, ends the scan and is reported as -ENOSPC.
+ */
+
+int gfs2_dirent_alloc(struct gfs2_inode *dip, struct buffer_head *bh,
+                     int name_len, struct gfs2_dirent **dent_out)
+{
+       struct gfs2_dirent *dent, *new;
+       /* Bytes a record for a name of this length occupies. */
+       unsigned int rec_len = GFS2_DIRENT_SIZE(name_len);
+       unsigned int entries = 0, offset = 0;
+       int type;
+
+       type = dirent_first(dip, bh, &dent);
+       if (type < 0)
+               return type;
+
+       /* Entry count and header size depend on the kind of block. */
+       if (type == IS_LEAF) {
+               struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+               entries = be16_to_cpu(leaf->lf_entries);
+               offset = sizeof(struct gfs2_leaf);
+       } else {
+               struct gfs2_dinode *dinode = (struct gfs2_dinode *)bh->b_data;
+               entries = be32_to_cpu(dinode->di_entries);
+               offset = sizeof(struct gfs2_dinode);
+       }
+
+       /* Empty block: the first dirent must be unused, and it is given
+          everything after the block header. */
+       if (!entries) {
+               if (dent->de_inum.no_addr) {
+                       gfs2_consist_inode(dip);
+                       return -EIO;
+               }
+
+               gfs2_trans_add_bh(dip->i_gl, bh);
+
+               /* Stored in host order first, then converted in place. */
+               dent->de_rec_len = bh->b_size - offset;
+               dent->de_rec_len = cpu_to_be32(dent->de_rec_len);
+               dent->de_name_len = name_len;
+
+               *dent_out = dent;
+               return 0;
+       }
+
+       do {
+               uint32_t cur_rec_len, cur_name_len;
+
+               cur_rec_len = be32_to_cpu(dent->de_rec_len);
+               cur_name_len = dent->de_name_len;
+
+               /* Usable if it is an unused dirent that is big enough, or if
+                  it has enough slack beyond what its own name needs. */
+               if ((!dent->de_inum.no_addr && cur_rec_len >= rec_len) ||
+                   (cur_rec_len >= GFS2_DIRENT_SIZE(cur_name_len) + rec_len)) {
+                       gfs2_trans_add_bh(dip->i_gl, bh);
+
+                       /* In use: split.  The existing dirent keeps only the
+                          space its name needs; the new one gets the rest. */
+                       if (dent->de_inum.no_addr) {
+                               new = (struct gfs2_dirent *)((char *)dent +
+                                                           GFS2_DIRENT_SIZE(cur_name_len));
+                               memset(new, 0, sizeof(struct gfs2_dirent));
+
+                               new->de_rec_len = cur_rec_len - GFS2_DIRENT_SIZE(cur_name_len);
+                               new->de_rec_len = cpu_to_be32(new->de_rec_len);
+                               new->de_name_len = name_len;
+
+                               dent->de_rec_len = cur_rec_len - be32_to_cpu(new->de_rec_len);
+                               dent->de_rec_len = cpu_to_be32(dent->de_rec_len);
+
+                               *dent_out = new;
+                               return 0;
+                       }
+
+                       /* Unused: reuse it whole, record length unchanged. */
+                       dent->de_name_len = name_len;
+
+                       *dent_out = dent;
+                       return 0;
+               }
+       } while (dirent_next(dip, bh, &dent) == 0);
+
+       return -ENOSPC;
+}
+
+/**
+ * dirent_fits - See if we can fit an entry in this buffer
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @name_len: The length of the name
+ *
+ * Applies the same space test as gfs2_dirent_alloc() without modifying the
+ * block.
+ *
+ * Returns: 1 if it can fit, 0 otherwise.  A dirent_first() failure is passed
+ *          back as its negative errno, which is also non-zero.
+ */
+
+static int dirent_fits(struct gfs2_inode *dip, struct buffer_head *bh,
+                      int name_len)
+{
+       struct gfs2_dirent *dent;
+       unsigned int need = GFS2_DIRENT_SIZE(name_len);
+       unsigned int count;
+       int type = dirent_first(dip, bh, &dent);
+
+       if (type < 0)
+               return type;
+
+       if (type == IS_LEAF)
+               count = be16_to_cpu(((struct gfs2_leaf *)bh->b_data)->lf_entries);
+       else
+               count = be32_to_cpu(((struct gfs2_dinode *)bh->b_data)->di_entries);
+
+       /* An empty block always has room. */
+       if (count == 0)
+               return 1;
+
+       do {
+               uint32_t have = be32_to_cpu(dent->de_rec_len);
+               uint32_t this_name_len = dent->de_name_len;
+
+               /* Unused dirent that is large enough as it stands. */
+               if (!dent->de_inum.no_addr && have >= need)
+                       return 1;
+
+               /* Enough slack behind this dirent's own name. */
+               if (have >= GFS2_DIRENT_SIZE(this_name_len) + need)
+                       return 1;
+       } while (dirent_next(dip, bh, &dent) == 0);
+
+       return 0;
+}
+
+/**
+ * leaf_search - Look a name up in a single directory block
+ * @dip: The GFS2 inode
+ * @bh: The buffer (a leaf block or a stuffed dinode block)
+ * @filename: The name to look for
+ * @dent_out: Receives the matching dirent; only set when 0 is returned
+ * @dent_prev: If not NULL, receives the dirent visited immediately before
+ *             the match, or NULL when the match is the first dirent
+ *
+ * An entry matches when its stored hash equals the hash of @filename and the
+ * name bytes that follow the dirent header compare equal.
+ *
+ * Returns: 0 if found, -ENOENT if not, or the error from dirent_first().
+ *          A dirent_next() failure ends the scan and also yields -ENOENT.
+ */
+
+static int leaf_search(struct gfs2_inode *dip, struct buffer_head *bh,
+                      struct qstr *filename, struct gfs2_dirent **dent_out,
+                      struct gfs2_dirent **dent_prev)
+{
+       uint32_t hash;
+       struct gfs2_dirent *dent, *prev = NULL;
+       int type;
+
+       /* Only the sign of the result matters here; the block's entry count
+          is not needed because the walk is bounded by dirent_next(). */
+       type = dirent_first(dip, bh, &dent);
+       if (type < 0)
+               return type;
+
+       hash = gfs2_disk_hash(filename->name, filename->len);
+
+       do {
+               /* An unused dirent cannot match, but it still counts as the
+                  predecessor of whatever follows it. */
+               if (!dent->de_inum.no_addr) {
+                       prev = dent;
+                       continue;
+               }
+
+               /* Cheap hash comparison first, then the name itself. */
+               if (be32_to_cpu(dent->de_hash) == hash &&
+                   gfs2_filecmp(filename, (char *)(dent + 1),
+                                dent->de_name_len)) {
+                       *dent_out = dent;
+                       if (dent_prev)
+                               *dent_prev = prev;
+
+                       return 0;
+               }
+
+               prev = dent;
+       } while (dirent_next(dip, bh, &dent) == 0);
+
+       return -ENOENT;
+}
+
+/**
+ * get_leaf - Read a leaf block and verify its type
+ * @dip: The GFS2 inode
+ * @leaf_no: Block number of the leaf
+ * @bhp: Receives the buffer on success
+ *
+ * On success the caller owns the buffer reference and must brelse() it.
+ * If the block was read but is not a leaf, the reference is dropped here and
+ * *@bhp is set to NULL, because callers do not release the buffer when an
+ * error is returned.
+ *
+ * Returns: 0 on success, -EIO if the block is not of type GFS2_METATYPE_LF,
+ *          or the error from gfs2_meta_read()
+ */
+
+static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
+                   struct buffer_head **bhp)
+{
+       int error;
+
+       error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
+       if (error)
+               return error;
+
+       if (gfs2_metatype_check(dip->i_sbd, *bhp, GFS2_METATYPE_LF)) {
+               /* Do not leak the reference taken by the successful read. */
+               brelse(*bhp);
+               *bhp = NULL;
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index: Slot in the directory's table of 64-bit leaf pointers
+ * @leaf_out: Receives the leaf block number in CPU byte order
+ *
+ * Reads one big-endian 64-bit pointer from byte offset @index * 8 of the
+ * directory file.
+ *
+ * Returns: 0 on success, the negative error from the read, or -EIO on a
+ *          short read
+ */
+
+static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
+                      uint64_t *leaf_out)
+{
+       uint64_t raw;
+       int got;
+
+       got = gfs2_jdata_read_mem(dip, (char *)&raw,
+                                 index * sizeof(uint64_t),
+                                 sizeof(uint64_t));
+       if (got == sizeof(uint64_t)) {
+               *leaf_out = be64_to_cpu(raw);
+               return 0;
+       }
+
+       return (got < 0) ? got : -EIO;
+}
+
+/**
+ * get_first_leaf - Read the leaf that a hash table slot points at
+ * @dip: The GFS2 inode
+ * @index: Slot in the hash table
+ * @bh_out: Receives the leaf buffer on success
+ *
+ * Returns: 0 on success, otherwise the error from get_leaf_nr() or get_leaf()
+ */
+
+static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
+                         struct buffer_head **bh_out)
+{
+       uint64_t leaf_no;
+       int error = get_leaf_nr(dip, index, &leaf_no);
+
+       if (error)
+               return error;
+
+       return get_leaf(dip, leaf_no, bh_out);
+}
+
+/**
+ * get_next_leaf - Follow a leaf's lf_next link
+ * @dip: The GFS2 inode
+ * @bh_in: Buffer holding the current leaf
+ * @bh_out: Receives the buffer of the next leaf on success
+ *
+ * Returns: 0 on success, -ENOENT if lf_next is zero, otherwise the error
+ *          from get_leaf()
+ */
+
+static int get_next_leaf(struct gfs2_inode *dip, struct buffer_head *bh_in,
+                        struct buffer_head **bh_out)
+{
+       struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh_in->b_data;
+
+       if (!leaf->lf_next)
+               return -ENOENT;
+
+       return get_leaf(dip, be64_to_cpu(leaf->lf_next), bh_out);
+}
+
+/**
+ * linked_leaf_search - Look a name up in a hashed directory
+ * @dip: The GFS2 inode
+ * @filename: The name to look for
+ * @dent_out: Receives the matching dirent (passed through to leaf_search())
+ * @dent_prev: Passed through to leaf_search(); may be NULL
+ * @bh_out: Receives the buffer holding the match; only set when 0 is returned
+ *
+ * Picks the hash table slot from the top di_depth bits of the name's hash,
+ * then searches that leaf and every leaf linked behind it through lf_next.
+ * On success the buffer in @bh_out is still referenced and is the caller's
+ * to release.
+ *
+ * Returns: 0 if found, -ENOENT if the chain ends without a match (that is
+ *          what get_next_leaf() returns at the end of the chain), -EIO if
+ *          the hash table size disagrees with di_size, or another error
+ *          from the helpers
+ */
+
+static int linked_leaf_search(struct gfs2_inode *dip, struct qstr *filename,
+                             struct gfs2_dirent **dent_out,
+                             struct gfs2_dirent **dent_prev,
+                             struct buffer_head **bh_out)
+{
+       struct buffer_head *bh = NULL, *bh_next;
+       uint32_t hsize, index;
+       uint32_t hash;
+       int error;
+
+       /* The table holds 2^di_depth 64-bit pointers; di_size must agree. */
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       /*  Figure out the address of the leaf node.  */
+
+       hash = gfs2_disk_hash(filename->name, filename->len);
+       index = hash >> (32 - dip->i_di.di_depth);
+
+       error = get_first_leaf(dip, index, &bh_next);
+       if (error)
+               return error;
+
+       /*  Find the entry  */
+
+       do {
+               /* Drops the previous leaf; bh is NULL on the first pass. */
+               brelse(bh);
+
+               bh = bh_next;
+
+               error = leaf_search(dip, bh, filename, dent_out, dent_prev);
+               switch (error) {
+               case 0:
+                       /* Found: hand the still-referenced buffer over. */
+                       *bh_out = bh;
+                       return 0;
+
+               case -ENOENT:
+                       /* Not in this leaf; try the next one in the chain. */
+                       break;
+
+               default:
+                       brelse(bh);
+                       return error;
+               }
+
+               error = get_next_leaf(dip, bh, &bh_next);
+       }
+       while (!error);
+
+       /* Chain exhausted or read error: release the last leaf searched. */
+       brelse(bh);
+
+       return error;
+}
+
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @dip: The GFS2 inode
+ *
+ * Allocates one leaf block, moves the dirents from the dinode block into it,
+ * and then turns the dinode's data area into a hash table of sd_hash_ptrs
+ * pointers that all refer to that single leaf.
+ *
+ * NOTE(review): the results of gfs2_alloc_meta(), gfs2_meta_new() and
+ * dirent_first() are used without an error check here -- confirm they cannot
+ * fail in this context.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dir_make_exhash(struct gfs2_inode *dip)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct gfs2_dirent *dent;
+       struct buffer_head *bh, *dibh;
+       struct gfs2_leaf *leaf;
+       int y;
+       uint32_t x;
+       uint64_t *lp, bn;
+       int error;
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       /*  Allocate a new block for the first leaf node  */
+
+       bn = gfs2_alloc_meta(dip);
+
+       /*  Turn over a new leaf  */
+
+       bh = gfs2_meta_new(dip->i_gl, bn);
+       gfs2_trans_add_bh(dip->i_gl, bh);
+       gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+       gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+
+       /*  Fill in the leaf structure  */
+
+       leaf = (struct gfs2_leaf *)bh->b_data;
+
+       /* lf_entries is written as a 16-bit value below, so the directory's
+          entry count has to fit in 16 bits. */
+       gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+
+       leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+       leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+
+       /*  Copy dirents  */
+
+       gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+                            sizeof(struct gfs2_dinode));
+
+       /*  Find last entry  */
+
+       /* Count in-use dirents until di_entries of them have been seen;
+          dent is then left pointing at the last one. */
+       x = 0;
+       dirent_first(dip, bh, &dent);
+
+       do {
+               if (!dent->de_inum.no_addr)
+                       continue;
+               if (++x == dip->i_di.di_entries)
+                       break;
+       }
+       while (dirent_next(dip, bh, &dent) == 0);
+
+       /*  Adjust the last dirent's record length
+          (Remember that dent still points to the last entry.)  */
+
+       /* The dirents now start sizeof(struct gfs2_leaf) into the block
+          rather than sizeof(struct gfs2_dinode), so the last record changes
+          by the difference between the two header sizes. */
+       dent->de_rec_len = be32_to_cpu(dent->de_rec_len) +
+               sizeof(struct gfs2_dinode) -
+               sizeof(struct gfs2_leaf);
+       dent->de_rec_len = cpu_to_be32(dent->de_rec_len);
+
+       brelse(bh);
+
+       /*  We're done with the new leaf block, now setup the new
+           hash table.  */
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+       gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+       lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
+
+       /* Every slot of the initial table points at the one new leaf. */
+       for (x = sdp->sd_hash_ptrs; x--; lp++)
+               *lp = cpu_to_be64(bn);
+
+       dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+       dip->i_di.di_blocks++;
+       dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+       dip->i_di.di_payload_format = 0;
+
+       /* y counts the halvings needed to reduce sd_hash_ptrs to zero, less
+          one: floor(log2(sd_hash_ptrs)), which becomes the table depth. */
+       for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+       dip->i_di.di_depth = y;
+
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+       brelse(dibh);
+
+       return 0;
+}
+
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @dip: The GFS2 inode
+ * @index: A hash table index that refers to the leaf; it is rounded down to
+ *         the first slot of the leaf's pointer range
+ * @leaf_no: Block number of the leaf to split
+ *
+ * Allocates a new leaf, repoints the first half of the old leaf's hash table
+ * slots at it, and moves every dirent whose hash is below the divider into
+ * the new leaf.  Both leaves end up one level deeper.
+ *
+ * NOTE(review): the return values of gfs2_alloc_meta(), gfs2_meta_new(),
+ * dirent_first() and gfs2_dirent_alloc() are not checked in this function.
+ * Confirm they cannot fail here, since "new" is used right after each
+ * gfs2_dirent_alloc() call.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_split_leaf(struct gfs2_inode *dip, uint32_t index,
+                         uint64_t leaf_no)
+{
+       struct buffer_head *nbh, *obh, *dibh;
+       struct gfs2_leaf *nleaf, *oleaf;
+       struct gfs2_dirent *dent, *prev = NULL, *next = NULL, *new;
+       uint32_t start, len, half_len, divider;
+       uint64_t bn, *lp;
+       uint32_t name_len;
+       int x, moved = 0;
+       int error;
+
+       /*  Allocate the new leaf block  */
+
+       bn = gfs2_alloc_meta(dip);
+
+       /*  Get the new leaf block  */
+
+       nbh = gfs2_meta_new(dip->i_gl, bn);
+       gfs2_trans_add_bh(dip->i_gl, nbh);
+       gfs2_metatype_set(nbh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+       gfs2_buffer_clear_tail(nbh, sizeof(struct gfs2_meta_header));
+
+       nleaf = (struct gfs2_leaf *)nbh->b_data;
+
+       nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+
+       /*  Get the old leaf block  */
+
+       error = get_leaf(dip, leaf_no, &obh);
+       if (error)
+               goto fail;
+
+       gfs2_trans_add_bh(dip->i_gl, obh);
+
+       oleaf = (struct gfs2_leaf *)obh->b_data;
+
+       /*  Compute the start and len of leaf pointers in the hash table.  */
+
+       /* A leaf at depth lf_depth is referenced by 2^(di_depth - lf_depth)
+          consecutive slots.  With only one slot there is nothing to split,
+          which is reported as an inconsistency. */
+       len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+       half_len = len >> 1;
+       if (!half_len) {
+               gfs2_consist_inode(dip);
+               error = -EIO;
+               goto fail_brelse;
+       }
+
+       /* First slot of the leaf's range (len is a power of two). */
+       start = (index & ~(len - 1));
+
+       /* Change the pointers.
+          Don't bother distinguishing stuffed from non-stuffed.
+          This code is complicated enough already. */
+
+       lp = kcalloc(half_len, sizeof(uint64_t), GFP_KERNEL | __GFP_NOFAIL);
+
+       /* The values read here are all overwritten below; the read result
+          is only used to detect a failed or short read. */
+       error = gfs2_jdata_read_mem(dip, (char *)lp, start * sizeof(uint64_t),
+                                   half_len * sizeof(uint64_t));
+       if (error != half_len * sizeof(uint64_t)) {
+               if (error >= 0)
+                       error = -EIO;
+               goto fail_lpfree;
+       }
+
+       /*  Change the pointers  */
+
+       for (x = 0; x < half_len; x++)
+               lp[x] = cpu_to_be64(bn);
+
+       error = gfs2_jdata_write_mem(dip, (char *)lp, start * sizeof(uint64_t),
+                                    half_len * sizeof(uint64_t));
+       if (error != half_len * sizeof(uint64_t)) {
+               if (error >= 0)
+                       error = -EIO;
+               goto fail_lpfree;
+       }
+
+       kfree(lp);
+
+       /*  Compute the divider  */
+
+       /* Smallest hash that stays in the old leaf: the first slot of the
+          second half, shifted back up into the top di_depth hash bits. */
+       divider = (start + half_len) << (32 - dip->i_di.di_depth);
+
+       /*  Copy the entries  */
+
+       dirent_first(dip, obh, &dent);
+
+       do {
+               /* Look the successor up before dent is possibly deleted;
+                  dirent_del() rewrites record lengths. */
+               next = dent;
+               if (dirent_next(dip, obh, &next))
+                       next = NULL;
+
+               if (dent->de_inum.no_addr &&
+                   be32_to_cpu(dent->de_hash) < divider) {
+                       name_len = dent->de_name_len;
+
+                       gfs2_dirent_alloc(dip, nbh, name_len, &new);
+
+                       new->de_inum = dent->de_inum; /* No endian worries */
+                       new->de_hash = dent->de_hash; /* No endian worries */
+                       new->de_type = dent->de_type; /* No endian worries */
+                       memcpy((char *)(new + 1), (char *)(dent + 1),
+                              name_len);
+
+                       nleaf->lf_entries = be16_to_cpu(nleaf->lf_entries)+1;
+                       nleaf->lf_entries = cpu_to_be16(nleaf->lf_entries);
+
+                       dirent_del(dip, obh, prev, dent);
+
+                       if (!oleaf->lf_entries)
+                               gfs2_consist_inode(dip);
+                       oleaf->lf_entries = be16_to_cpu(oleaf->lf_entries)-1;
+                       oleaf->lf_entries = cpu_to_be16(oleaf->lf_entries);
+
+                       /* The first dirent was only marked unused, not
+                          merged away, so it is the predecessor of
+                          whatever follows. */
+                       if (!prev)
+                               prev = dent;
+
+                       moved = 1;
+               } else
+                       prev = dent;
+
+               dent = next;
+       }
+       while (dent);
+
+       /* If none of the entries got moved into the new leaf,
+          artificially fill in the first entry. */
+
+       if (!moved) {
+               gfs2_dirent_alloc(dip, nbh, 0, &new);
+               new->de_inum.no_addr = 0;
+       }
+
+       /* Both halves are now one level deeper than the original leaf. */
+       oleaf->lf_depth = be16_to_cpu(oleaf->lf_depth) + 1;
+       oleaf->lf_depth = cpu_to_be16(oleaf->lf_depth);
+       nleaf->lf_depth = oleaf->lf_depth;
+
+       /* Account for the new block in the dinode.  The update runs only
+          when gfs2_assert_withdraw() returns zero for the !error check. */
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (!gfs2_assert_withdraw(dip->i_sbd, !error)) {
+               dip->i_di.di_blocks++;
+               gfs2_dinode_out(&dip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       brelse(obh);
+       brelse(nbh);
+
+       return error;
+
+       /* Error exits unwind in reverse order of acquisition. */
+ fail_lpfree:
+       kfree(lp);
+
+ fail_brelse:
+       brelse(obh);
+
+ fail:
+       brelse(nbh);
+       return error;
+}
+
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * Rewrites the hash table so that every pointer appears twice in a row and
+ * then increments di_depth.  The table is walked from its last hash block
+ * back to the first: the chunk read at (block * sd_hash_bsize) is expanded
+ * and written out at (block * sb_bsize).
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct buffer_head *dibh;
+       uint64_t *buf, *doubled;
+       uint64_t blk;
+       uint32_t hsize;
+       unsigned int i;
+       int error = 0;
+
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       /*  One allocation serves both halves: the chunk that was read sits
+           at the start, the doubled copy is built sd_hash_bsize bytes in  */
+
+       buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+       doubled = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
+
+       blk = dip->i_di.di_size >> sdp->sd_hash_bsize_shift;
+       while (blk--) {
+               error = gfs2_jdata_read_mem(dip, (char *)buf,
+                                           blk * sdp->sd_hash_bsize,
+                                           sdp->sd_hash_bsize);
+               if (error != sdp->sd_hash_bsize) {
+                       if (error >= 0)
+                               error = -EIO;
+                       goto fail;
+               }
+
+               /*  Pointers are copied verbatim, so no endianness worries  */
+               for (i = 0; i < sdp->sd_hash_ptrs; i++) {
+                       doubled[2 * i] = buf[i];
+                       doubled[2 * i + 1] = buf[i];
+               }
+
+               error = gfs2_jdata_write_mem(dip, (char *)doubled,
+                                            blk * sdp->sd_sb.sb_bsize,
+                                            sdp->sd_sb.sb_bsize);
+               if (error != sdp->sd_sb.sb_bsize) {
+                       if (error >= 0)
+                               error = -EIO;
+                       goto fail;
+               }
+       }
+
+       kfree(buf);
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (!gfs2_assert_withdraw(sdp, !error)) {
+               dip->i_di.di_depth++;
+               gfs2_dinode_out(&dip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       return error;
+
+ fail:
+       kfree(buf);
+
+       return error;
+}
+
+/**
+ * compare_dents - compare directory entries by hash value
+ * @a: pointer to the first dent pointer
+ * @b: pointer to the second dent pointer
+ *
+ * Entries are ordered by de_hash (converted to cpu endianness), then by
+ * name length, then by the bytes of the name.
+ *
+ * Returns: 1 if @a sorts after @b, -1 if it sorts before; when both the
+ *          hash and the name length match, the memcmp() result of the names
+ */
+
+static int compare_dents(const void *a, const void *b)
+{
+       struct gfs2_dirent *first = *(struct gfs2_dirent **)a;
+       struct gfs2_dirent *second = *(struct gfs2_dirent **)b;
+       uint32_t hash_first, hash_second;
+       unsigned int len_first, len_second;
+
+       hash_first = first->de_hash;
+       hash_first = be32_to_cpu(hash_first);
+       hash_second = second->de_hash;
+       hash_second = be32_to_cpu(hash_second);
+
+       if (hash_first != hash_second)
+               return (hash_first > hash_second) ? 1 : -1;
+
+       len_first = first->de_name_len;
+       len_second = second->de_name_len;
+
+       if (len_first != len_second)
+               return (len_first > len_second) ? 1 : -1;
+
+       /*  The name is stored directly behind the dirent header  */
+       return memcmp((char *)(first + 1), (char *)(second + 1), len_first);
+}
+
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr
+ * @copied: pointer to int that's non-zero if a entry has been copied out
+ *
+ * Sorts @darr with compare_dents() and hands every entry whose offset is
+ * not below *@offset to @filldir.
+ *
+ * Jump through some hoops to make sure that if there are hash collisions,
+ * they are read out at the beginning of a buffer.  We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * Returns: 0 once every entry has been handled (*@offset is then advanced
+ *          by one, unless @entries is 0), 1 if it stopped early because
+ *          @filldir returned non-zero or a collision run would have started
+ *          after entries had already been copied
+ */
+
+static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
+                          void *opaque, gfs2_filldir_t filldir,
+                          struct gfs2_dirent **darr, uint32_t entries,
+                          int *copied)
+{
+       struct gfs2_dirent *dent, *dent_next;
+       struct gfs2_inum inum;
+       uint64_t off, off_next;
+       unsigned int x, y;
+       int run = 0;
+       int error = 0;
+
+       /* Nothing to hand out, and darr[0] below must not be read */
+
+       if (!entries)
+               return 0;
+
+       sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+
+       dent_next = darr[0];
+       off_next = be32_to_cpu(dent_next->de_hash);
+       off_next = gfs2_disk_hash2offset(off_next);
+
+       for (x = 0, y = 1; x < entries; x++, y++) {
+               dent = dent_next;
+               off = off_next;
+
+               /* Look one entry ahead so a collision run can be spotted */
+
+               if (y < entries) {
+                       dent_next = darr[y];
+                       off_next = be32_to_cpu(dent_next->de_hash);
+                       off_next = gfs2_disk_hash2offset(off_next);
+               }
+
+               /* Skip everything in front of the requested position */
+
+               if (off < *offset)
+                       continue;
+               *offset = off;
+
+               if (y < entries) {
+                       if (off_next == off) {
+                               /* A run of equal offsets begins here; if
+                                  something was already copied, stop so the
+                                  run starts a fresh buffer */
+                               if (*copied && !run)
+                                       return 1;
+                               run = 1;
+                       } else
+                               run = 0;
+               }
+
+               gfs2_inum_in(&inum, (char *)&dent->de_inum);
+
+               error = filldir(opaque, (char *)(dent + 1),
+                               dent->de_name_len,
+                               off, &inum,
+                               dent->de_type);
+               if (error)
+                       return 1;
+
+               *copied = 1;
+       }
+
+       /* Increment the *offset by one, so the next time we come into the
+          do_filldir fxn, we get the next entry instead of the last one in the
+          current leaf */
+
+       (*offset)++;
+
+       return 0;
+}
+
+/**
+ * do_filldir_single - Read directory entries out of a single block
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @bh: the block
+ * @entries: the number of entries in the block
+ * @copied: pointer to int that's non-zero if a entry has been copied out
+ *
+ * Collects a pointer to every dirent with a non-zero no_addr and passes
+ * the array to do_filldir_main().  Finding a number of dirents different
+ * from @entries is flagged as an inconsistency.
+ *
+ * Returns: errno, >0 on exception from filldir
+ */
+
+static int do_filldir_single(struct gfs2_inode *dip, uint64_t *offset,
+                            void *opaque, gfs2_filldir_t filldir,
+                            struct buffer_head *bh, uint32_t entries,
+                            int *copied)
+{
+       struct gfs2_dirent **darr;
+       struct gfs2_dirent *de;
+       unsigned int found = 0;
+       int too_many = 0;
+       int error;
+
+       if (!entries)
+               return 0;
+
+       darr = kcalloc(entries, sizeof(struct gfs2_dirent *), GFP_KERNEL);
+       if (!darr)
+               return -ENOMEM;
+
+       dirent_first(dip, bh, &de);
+       do {
+               if (de->de_inum.no_addr) {
+                       if (found >= entries) {
+                               too_many = 1;
+                               break;
+                       }
+                       darr[found++] = de;
+               }
+       } while (dirent_next(dip, bh, &de) == 0);
+
+       if (too_many || found != entries) {
+               gfs2_consist_inode(dip);
+               error = -EIO;
+       } else
+               error = do_filldir_main(dip, offset, opaque, filldir, darr,
+                                       entries, copied);
+
+       kfree(darr);
+
+       return error;
+}
+
+/**
+ * filldir_gather_dents - append the in-use dirents of one block to an array
+ * @dip: The GFS2 inode
+ * @bh: the block to scan
+ * @darr: the array of dirent pointers being filled
+ * @e: in/out, the number of slots of @darr used so far
+ * @entries: the total number of slots in @darr
+ *
+ * Dirents whose no_addr is zero are skipped.
+ *
+ * Returns: 0 on success, -EIO (after flagging the inode as inconsistent)
+ *          if the block holds more dirents than @darr has room for
+ */
+
+static int filldir_gather_dents(struct gfs2_inode *dip, struct buffer_head *bh,
+                               struct gfs2_dirent **darr, unsigned int *e,
+                               unsigned int entries)
+{
+       struct gfs2_dirent *de;
+
+       dirent_first(dip, bh, &de);
+       do {
+               if (!de->de_inum.no_addr)
+                       continue;
+               if (*e >= entries) {
+                       gfs2_consist_inode(dip);
+                       return -EIO;
+               }
+               darr[(*e)++] = de;
+       } while (dirent_next(dip, bh, &de) == 0);
+
+       return 0;
+}
+
+/**
+ * do_filldir_multi - Read directory entries out of a linked leaf list
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @bh: the first leaf in the list
+ * @copied: pointer to int that's non-zero if a entry has been copied out
+ *
+ * Walks the lf_next chain twice: once to count leaves and entries so the
+ * arrays can be sized, once to collect the dirent pointers.  Chained leaves
+ * that contribute dirents stay referenced in a local array until
+ * do_filldir_main() has finished with the pointers into them; @bh itself
+ * is left for the caller to release.
+ *
+ * Returns: errno, >0 on exception from filldir
+ */
+
+static int do_filldir_multi(struct gfs2_inode *dip, uint64_t *offset,
+                           void *opaque, gfs2_filldir_t filldir,
+                           struct buffer_head *bh, int *copied)
+{
+       struct buffer_head **larr = NULL;
+       struct gfs2_dirent **darr;
+       struct gfs2_leaf *leaf;
+       struct buffer_head *tmp_bh;
+       unsigned int entries, e = 0;
+       unsigned int leaves = 0, l = 0;
+       unsigned int x;
+       uint64_t ln;
+       int error = 0;
+
+       /*  Count leaves and entries  */
+
+       leaf = (struct gfs2_leaf *)bh->b_data;
+       entries = be16_to_cpu(leaf->lf_entries);
+       ln = leaf->lf_next;
+
+       while (ln) {
+               ln = be64_to_cpu(ln);
+
+               error = get_leaf(dip, ln, &tmp_bh);
+               if (error)
+                       return error;
+
+               leaf = (struct gfs2_leaf *)tmp_bh->b_data;
+               if (leaf->lf_entries) {
+                       entries += be16_to_cpu(leaf->lf_entries);
+                       leaves++;
+               }
+               ln = leaf->lf_next;
+
+               brelse(tmp_bh);
+       }
+
+       if (!entries)
+               return 0;
+
+       if (leaves) {
+               larr = kcalloc(leaves, sizeof(struct buffer_head *),GFP_KERNEL);
+               if (!larr)
+                       return -ENOMEM;
+       }
+
+       darr = kcalloc(entries, sizeof(struct gfs2_dirent *), GFP_KERNEL);
+       if (!darr) {
+               kfree(larr);
+               return -ENOMEM;
+       }
+
+       /*  Collect the dirents, starting with the leaf we were handed  */
+
+       leaf = (struct gfs2_leaf *)bh->b_data;
+       if (leaf->lf_entries) {
+               error = filldir_gather_dents(dip, bh, darr, &e, entries);
+               if (error)
+                       goto out;
+       }
+       ln = leaf->lf_next;
+
+       while (ln) {
+               ln = be64_to_cpu(ln);
+
+               error = get_leaf(dip, ln, &tmp_bh);
+               if (error)
+                       goto out;
+
+               leaf = (struct gfs2_leaf *)tmp_bh->b_data;
+               ln = leaf->lf_next;
+
+               if (!leaf->lf_entries) {
+                       brelse(tmp_bh);
+                       continue;
+               }
+
+               error = filldir_gather_dents(dip, tmp_bh, darr, &e, entries);
+               if (error) {
+                       /* tmp_bh is not in larr yet, so the cleanup at
+                          "out" would miss it; release it here */
+                       brelse(tmp_bh);
+                       goto out;
+               }
+
+               larr[l++] = tmp_bh;
+       }
+
+       if (gfs2_assert_withdraw(dip->i_sbd, l == leaves)) {
+               error = -EIO;
+               goto out;
+       }
+       if (e != entries) {
+               gfs2_consist_inode(dip);
+               error = -EIO;
+               goto out;
+       }
+
+       error = do_filldir_main(dip, offset, opaque, filldir, darr,
+                               entries, copied);
+
+ out:
+       kfree(darr);
+       for (x = 0; x < l; x++)
+               brelse(larr[x]);
+       kfree(larr);
+
+       return error;
+}
+
+/**
+ * dir_e_search - Search exhash (leaf) dir for inode matching name
+ * @dip: The GFS2 inode
+ * @filename: Filename string
+ * @inum: If non-NULL, function fills with the entry's inode number
+ * @type: If non-NULL, function fills with the entry's de_type
+ *
+ * Returns: 0 if the name was found, otherwise the error returned by
+ *          linked_leaf_search()
+ */
+
+static int dir_e_search(struct gfs2_inode *dip, struct qstr *filename,
+                       struct gfs2_inum *inum, unsigned int *type)
+{
+       struct buffer_head *leaf_bh;
+       struct gfs2_dirent *found;
+       int error;
+
+       error = linked_leaf_search(dip, filename, &found, NULL, &leaf_bh);
+       if (!error) {
+               if (inum)
+                       gfs2_inum_in(inum, (char *)&found->de_inum);
+               if (type)
+                       *type = found->de_type;
+
+               /*  The search handed us the buffer; release it when done  */
+               brelse(leaf_bh);
+       }
+
+       return error;
+}
+
+/**
+ * dir_e_add - Add an entry to an exhash (leaf) directory
+ * @dip: The GFS2 inode
+ * @filename: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Looks up the leaf for the name's hash and tries to allocate a dirent in
+ * it.  When there is no room the options are tried in this order: split
+ * the leaf, double the hash table, move on to the next leaf in the chain,
+ * and finally chain a newly allocated leaf.  After a split or a doubling
+ * the whole lookup starts over.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_e_add(struct gfs2_inode *dip, struct qstr *filename,
+                    struct gfs2_inum *inum, unsigned int type)
+{
+       struct buffer_head *bh, *nbh, *dibh;
+       struct gfs2_leaf *leaf, *nleaf;
+       struct gfs2_dirent *dent;
+       uint32_t hsize, index;
+       uint32_t hash;
+       uint64_t leaf_no, bn;
+       int error;
+
+       /*  Re-entered after a leaf split or a hash table doubling  */
+ restart:
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       /*  Figure out the address of the leaf node.  */
+
+       hash = gfs2_disk_hash(filename->name, filename->len);
+       index = hash >> (32 - dip->i_di.di_depth);
+
+       error = get_leaf_nr(dip, index, &leaf_no);
+       if (error)
+               return error;
+
+       /*  Add entry to the leaf  */
+
+       for (;;) {
+               error = get_leaf(dip, leaf_no, &bh);
+               if (error)
+                       return error;
+
+               leaf = (struct gfs2_leaf *)bh->b_data;
+
+               /*  A non-zero return means no dirent could be allocated
+                   in this leaf  */
+               if (gfs2_dirent_alloc(dip, bh, filename->len, &dent)) {
+
+                       if (be16_to_cpu(leaf->lf_depth) < dip->i_di.di_depth) {
+                               /* Can we split the leaf? */
+
+                               brelse(bh);
+
+                               error = dir_split_leaf(dip, index, leaf_no);
+                               if (error)
+                                       return error;
+
+                               goto restart;
+
+                       } else if (dip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+                               /* Can we double the hash table? */
+
+                               brelse(bh);
+
+                               error = dir_double_exhash(dip);
+                               if (error)
+                                       return error;
+
+                               goto restart;
+
+                       } else if (leaf->lf_next) {
+                               /* Can we try the next leaf in the list? */
+                               leaf_no = be64_to_cpu(leaf->lf_next);
+                               brelse(bh);
+                               continue;
+
+                       } else {
+                               /* Create a new leaf and add it to the list. */
+
+                               bn = gfs2_alloc_meta(dip);
+
+                               nbh = gfs2_meta_new(dip->i_gl, bn);
+                               gfs2_trans_add_bh(dip->i_gl, nbh);
+                               gfs2_metatype_set(nbh,
+                                                GFS2_METATYPE_LF,
+                                                GFS2_FORMAT_LF);
+                               gfs2_buffer_clear_tail(nbh,
+                                       sizeof(struct gfs2_meta_header));
+
+                               /* Link the new leaf behind the full one */
+                               gfs2_trans_add_bh(dip->i_gl, bh);
+                               leaf->lf_next = cpu_to_be64(bn);
+
+                               nleaf = (struct gfs2_leaf *)nbh->b_data;
+                               nleaf->lf_depth = leaf->lf_depth;
+                               nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+
+                               /* NOTE(review): the return value is not
+                                  checked here; presumably a freshly cleared
+                                  leaf always has room - confirm */
+                               gfs2_dirent_alloc(dip, nbh, filename->len,
+                                                 &dent);
+
+                               dip->i_di.di_blocks++;
+
+                               brelse(bh);
+
+                               /* Fill in the entry in the new leaf below */
+                               bh = nbh;
+                               leaf = nleaf;
+                       }
+               }
+
+               /* If the gfs2_dirent_alloc() succeeded, it pinned the "bh" */
+
+               gfs2_inum_out(inum, (char *)&dent->de_inum);
+               dent->de_hash = cpu_to_be32(hash);
+               dent->de_type = type;
+               memcpy((char *)(dent + 1), filename->name, filename->len);
+
+               leaf->lf_entries = be16_to_cpu(leaf->lf_entries) + 1;
+               leaf->lf_entries = cpu_to_be16(leaf->lf_entries);
+
+               brelse(bh);
+
+               error = gfs2_meta_inode_buffer(dip, &dibh);
+               if (error)
+                       return error;
+
+               dip->i_di.di_entries++;
+               dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+               gfs2_trans_add_bh(dip->i_gl, dibh);
+               gfs2_dinode_out(&dip->i_di, dibh->b_data);
+               brelse(dibh);
+
+               return 0;
+       }
+
+       /*  Not reached: the loop above is only left through a return or
+           the jump back to restart  */
+       return -ENOENT;
+}
+
+/**
+ * dir_e_del - Delete an entry from an exhash (leaf) directory
+ * @dip: The GFS2 inode
+ * @filename: The filename
+ *
+ * Removes the dirent, decrements the leaf's lf_entries and the dinode's
+ * di_entries, and stamps mtime/ctime.  A name that cannot be found is
+ * treated as an inconsistency.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_e_del(struct gfs2_inode *dip, struct qstr *filename)
+{
+       struct buffer_head *leaf_bh, *dibh;
+       struct gfs2_dirent *victim, *before;
+       struct gfs2_leaf *leaf;
+       unsigned int count;
+       int error;
+
+       error = linked_leaf_search(dip, filename, &victim, &before, &leaf_bh);
+       if (error) {
+               if (error != -ENOENT)
+                       return error;
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       dirent_del(dip, leaf_bh, before, victim); /* Pins leaf_bh */
+
+       leaf = (struct gfs2_leaf *)leaf_bh->b_data;
+       count = be16_to_cpu(leaf->lf_entries);
+       if (!count)
+               gfs2_consist_inode(dip);
+       count--;
+       leaf->lf_entries = cpu_to_be16(count);
+
+       brelse(leaf_bh);
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       if (!dip->i_di.di_entries)
+               gfs2_consist_inode(dip);
+       dip->i_di.di_entries--;
+       dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+       brelse(dibh);
+
+       return 0;
+}
+
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @dip: dinode pointer
+ * @offset: the hash of the last entry read shifted to the right once
+ * @opaque: buffer for the filldir function to fill
+ * @filldir: points to the filldir function to use
+ *
+ * Starts at the hash table slot that @offset maps to and visits one leaf
+ * (or leaf chain) after another.  Hash table pointers are read one
+ * sd_hash_bsize chunk at a time and the chunk is reused while the index
+ * stays inside it.  A positive result from the filldir helpers ends the
+ * walk and is reported as success.
+ *
+ * Returns: errno
+ */
+
+static int dir_e_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
+                     gfs2_filldir_t filldir)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct buffer_head *bh;
+       struct gfs2_leaf leaf;
+       uint32_t hsize, span;
+       uint32_t chunk_start, slot, chunk_cached = -1;
+       uint32_t hash, index;
+       uint64_t *lp;
+       int copied = 0;
+       int error = 0;
+
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       hash = gfs2_dir_offset2hash(*offset);
+       index = hash >> (32 - dip->i_di.di_depth);
+
+       lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+       if (!lp)
+               return -ENOMEM;
+
+       while (index < hsize) {
+               slot = index & (sdp->sd_hash_ptrs - 1);
+               chunk_start = index - slot;
+
+               /*  Only hit the hash table again when we leave the chunk
+                   that is already in lp  */
+               if (chunk_cached != chunk_start) {
+                       error = gfs2_jdata_read_mem(dip, (char *)lp,
+                                               chunk_start * sizeof(uint64_t),
+                                               sdp->sd_hash_bsize);
+                       if (error != sdp->sd_hash_bsize) {
+                               if (error >= 0)
+                                       error = -EIO;
+                               break;
+                       }
+                       chunk_cached = chunk_start;
+               }
+
+               error = get_leaf(dip, be64_to_cpu(lp[slot]), &bh);
+               if (error)
+                       break;
+
+               gfs2_leaf_in(&leaf, bh->b_data);
+
+               if (!leaf.lf_next)
+                       error = do_filldir_single(dip, offset, opaque, filldir,
+                                                 bh, leaf.lf_entries, &copied);
+               else
+                       error = do_filldir_multi(dip, offset, opaque, filldir,
+                                                bh, &copied);
+
+               brelse(bh);
+
+               if (error) {
+                       /*  >0 only means filldir stopped us, not a failure  */
+                       if (error > 0)
+                               error = 0;
+                       break;
+               }
+
+               /*  Step over every hash table slot that points at this leaf  */
+               span = 1 << (dip->i_di.di_depth - leaf.lf_depth);
+               index = (index & ~(span - 1)) + span;
+       }
+
+       kfree(lp);
+
+       return error;
+}
+
+/**
+ * dir_e_mvino - Change inode number and type of an exhash dir entry
+ * @dip: The GFS2 inode
+ * @filename: The name of the entry to change
+ * @inum: The new inode number
+ * @new_type: The new de_type
+ *
+ * A name that cannot be found is treated as an inconsistency.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_e_mvino(struct gfs2_inode *dip, struct qstr *filename,
+                      struct gfs2_inum *inum, unsigned int new_type)
+{
+       struct buffer_head *leaf_bh, *dibh;
+       struct gfs2_dirent *target;
+       int error;
+
+       error = linked_leaf_search(dip, filename, &target, NULL, &leaf_bh);
+       if (error) {
+               if (error != -ENOENT)
+                       return error;
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       gfs2_trans_add_bh(dip->i_gl, leaf_bh);
+
+       gfs2_inum_out(inum, (char *)&target->de_inum);
+       target->de_type = new_type;
+
+       brelse(leaf_bh);
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+       brelse(dibh);
+
+       return 0;
+}
+
+/**
+ * dir_l_search - Search linear (stuffed dinode) dir for inode matching name
+ * @dip: The GFS2 inode
+ * @filename: Filename string
+ * @inum: If non-NULL, function fills with the entry's inode number
+ * @type: If non-NULL, function fills with the entry's de_type
+ *
+ * Returns: 0 if the name was found, -EIO if the dinode is not stuffed,
+ *          otherwise the error from reading the dinode or from leaf_search()
+ */
+
+static int dir_l_search(struct gfs2_inode *dip, struct qstr *filename,
+                       struct gfs2_inum *inum, unsigned int *type)
+{
+       struct buffer_head *dibh;
+       struct gfs2_dirent *found;
+       int error;
+
+       if (!gfs2_is_stuffed(dip)) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       error = leaf_search(dip, dibh, filename, &found, NULL);
+       if (error)
+               goto out;
+
+       if (inum)
+               gfs2_inum_in(inum, (char *)&found->de_inum);
+       if (type)
+               *type = found->de_type;
+
+ out:
+       brelse(dibh);
+
+       return error;
+}
+
+/**
+ * dir_l_add - Add an entry to a linear (stuffed dinode) directory
+ * @dip: The GFS2 inode
+ * @filename: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * If no dirent can be allocated in the dinode block, the directory is
+ * converted with dir_make_exhash() and the entry is added through
+ * dir_e_add() instead.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_l_add(struct gfs2_inode *dip, struct qstr *filename,
+                    struct gfs2_inum *inum, unsigned int type)
+{
+       struct buffer_head *dibh;
+       struct gfs2_dirent *slot;
+       uint32_t hash;
+       int error;
+
+       if (!gfs2_is_stuffed(dip)) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       if (gfs2_dirent_alloc(dip, dibh, filename->len, &slot)) {
+               /*  No dirent could be had here; switch to exhash  */
+               brelse(dibh);
+
+               error = dir_make_exhash(dip);
+               if (error)
+                       return error;
+
+               return dir_e_add(dip, filename, inum, type);
+       }
+
+       /*  gfs2_dirent_alloc() pins  */
+
+       gfs2_inum_out(inum, (char *)&slot->de_inum);
+       hash = gfs2_disk_hash(filename->name, filename->len);
+       slot->de_hash = cpu_to_be32(hash);
+       slot->de_type = type;
+       memcpy((char *)(slot + 1), filename->name, filename->len);
+
+       dip->i_di.di_entries++;
+       dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+       brelse(dibh);
+
+       return 0;
+}
+
+/**
+ * dir_l_del - Delete an entry from a linear (stuffed dinode) directory
+ * @dip: The GFS2 inode
+ * @filename: The filename
+ *
+ * A name that cannot be found is treated as an inconsistency.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_l_del(struct gfs2_inode *dip, struct qstr *filename)
+{
+       struct buffer_head *dibh;
+       struct gfs2_dirent *victim, *before;
+       int error;
+
+       if (!gfs2_is_stuffed(dip)) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       error = leaf_search(dip, dibh, filename, &victim, &before);
+       if (error) {
+               if (error == -ENOENT) {
+                       gfs2_consist_inode(dip);
+                       error = -EIO;
+               }
+               goto out;
+       }
+
+       dirent_del(dip, dibh, before, victim);
+
+       /*  dirent_del() pins  */
+
+       if (!dip->i_di.di_entries)
+               gfs2_consist_inode(dip);
+       dip->i_di.di_entries--;
+
+       dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+ out:
+       brelse(dibh);
+
+       return error;
+}
+
+/**
+ * dir_l_read - Read the entries of a linear (stuffed dinode) directory
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ *
+ * A positive result from do_filldir_single() is reported as success.
+ *
+ * Returns: errno
+ */
+
+static int dir_l_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
+                     gfs2_filldir_t filldir)
+{
+       struct buffer_head *dibh;
+       int copied = 0;
+       int error;
+
+       if (!gfs2_is_stuffed(dip)) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       /*  Empty directory: nothing to read  */
+       if (!dip->i_di.di_entries)
+               return 0;
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       error = do_filldir_single(dip, offset,
+                                 opaque, filldir,
+                                 dibh, dip->i_di.di_entries,
+                                 &copied);
+
+       brelse(dibh);
+
+       return (error > 0) ? 0 : error;
+}
+
+/**
+ * dir_l_mvino - Change inode number and type of a linear dir entry
+ * @dip: The GFS2 inode
+ * @filename: The name of the entry to change
+ * @inum: The new inode number
+ * @new_type: The new de_type
+ *
+ * A name that cannot be found is treated as an inconsistency.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_l_mvino(struct gfs2_inode *dip, struct qstr *filename,
+                      struct gfs2_inum *inum, unsigned int new_type)
+{
+       struct buffer_head *dibh;
+       struct gfs2_dirent *target;
+       int error;
+
+       if (!gfs2_is_stuffed(dip)) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               return error;
+
+       error = leaf_search(dip, dibh, filename, &target, NULL);
+       if (error) {
+               if (error == -ENOENT) {
+                       gfs2_consist_inode(dip);
+                       error = -EIO;
+               }
+               goto out;
+       }
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+
+       gfs2_inum_out(inum, (char *)&target->de_inum);
+       target->de_type = new_type;
+
+       dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+
+       gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+ out:
+       brelse(dibh);
+
+       return error;
+}
+
+/**
+ * gfs2_dir_search - Search a directory
+ * @dip: The GFS2 inode
+ * @filename: The name to look for
+ * @inum: If non-NULL, filled with the inode number of the entry
+ * @type: If non-NULL, filled with the type of the entry
+ *
+ * This routine searches a directory for a file or another directory.
+ * It dispatches on GFS2_DIF_EXHASH to the exhash or the linear variant.
+ * Assumes a glock is held on dip.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_search(struct gfs2_inode *dip, struct qstr *filename,
+                   struct gfs2_inum *inum, unsigned int *type)
+{
+       if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+               return dir_e_search(dip, filename, inum, type);
+
+       return dir_l_search(dip, filename, inum, type);
+}
+
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @dip: The GFS2 inode
+ * @filename: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Dispatches on GFS2_DIF_EXHASH to the exhash or the linear variant.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_add(struct gfs2_inode *dip, struct qstr *filename,
+                struct gfs2_inum *inum, unsigned int type)
+{
+       if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+               return dir_e_add(dip, filename, inum, type);
+
+       return dir_l_add(dip, filename, inum, type);
+}
+
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @filename: The filename
+ *
+ * Dispatches on GFS2_DIF_EXHASH to the exhash or the linear variant.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_del(struct gfs2_inode *dip, struct qstr *filename)
+{
+       if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+               return dir_e_del(dip, filename);
+
+       return dir_l_del(dip, filename);
+}
+
+/**
+ * gfs2_dir_read - Read directory entries into a filldir buffer
+ * @dip: The GFS2 inode
+ * @offset: The offset in the directory to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ *
+ * Dispatches on GFS2_DIF_EXHASH to the exhash or the linear variant.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
+                 gfs2_filldir_t filldir)
+{
+       if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+               return dir_e_read(dip, offset, opaque, filldir);
+
+       return dir_l_read(dip, offset, opaque, filldir);
+}
+
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode
+ * @filename: The name of the entry to change
+ * @inum: The new inode number
+ * @new_type: The new type of the entry
+ *
+ * This routine changes the inode number of a directory entry.  It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on dip.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_mvino(struct gfs2_inode *dip, struct qstr *filename,
+                  struct gfs2_inum *inum, unsigned int new_type)
+{
+       if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+               return dir_e_mvino(dip, filename, inum, new_type);
+
+       return dir_l_mvino(dip, filename, inum, new_type);
+}
+
+/**
+ * foreach_leaf - call a function for each leaf in a directory
+ * @dip: the directory
+ * @lc: the function to call for each leaf
+ * @data: private data to pass to it
+ *
+ * Walks the hash table from index 0.  Slots holding a zero pointer are
+ * stepped over one at a time; for any other slot @lc is called once and
+ * the index then jumps past all slots that share the leaf.  Ending up
+ * anywhere but exactly at the end of the table is flagged as an
+ * inconsistency.
+ *
+ * Returns: errno
+ */
+
+static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct buffer_head *bh;
+       struct gfs2_leaf leaf;
+       uint32_t hsize, span;
+       uint32_t chunk_start, slot, chunk_cached = -1;
+       uint32_t index = 0;
+       uint64_t *lp;
+       uint64_t leaf_no;
+       int error = 0;
+
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+       if (!lp)
+               return -ENOMEM;
+
+       while (index < hsize) {
+               slot = index & (sdp->sd_hash_ptrs - 1);
+               chunk_start = index - slot;
+
+               /*  Re-read the hash table only when the index has left
+                   the chunk held in lp  */
+               if (chunk_cached != chunk_start) {
+                       error = gfs2_jdata_read_mem(dip, (char *)lp,
+                                               chunk_start * sizeof(uint64_t),
+                                               sdp->sd_hash_bsize);
+                       if (error != sdp->sd_hash_bsize) {
+                               if (error >= 0)
+                                       error = -EIO;
+                               goto out;
+                       }
+                       chunk_cached = chunk_start;
+               }
+
+               leaf_no = be64_to_cpu(lp[slot]);
+               if (!leaf_no) {
+                       index++;
+                       continue;
+               }
+
+               /*  Only the leaf header is needed, so the buffer can be
+                   dropped right after copying it in  */
+               error = get_leaf(dip, leaf_no, &bh);
+               if (error)
+                       goto out;
+               gfs2_leaf_in(&leaf, bh->b_data);
+               brelse(bh);
+
+               span = 1 << (dip->i_di.di_depth - leaf.lf_depth);
+
+               error = lc(dip, index, span, leaf_no, data);
+               if (error)
+                       goto out;
+
+               index = (index & ~(span - 1)) + span;
+       }
+
+       if (index != hsize) {
+               gfs2_consist_inode(dip);
+               error = -EIO;
+       }
+
+ out:
+       kfree(lp);
+
+       return error;
+}
+
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @data: not used
+ *
+ * Has the signature foreach_leaf() expects of its callback.  The chain that
+ * starts at @leaf_no and is linked through lf_next is walked twice: the
+ * first pass counts the blocks and adds each one to an rgrp list, the
+ * second pass (inside the transaction, with the listed rgrp glocks held)
+ * frees each block with gfs2_free_meta() and decrements di_blocks.
+ * Afterwards @len zeroed pointers are written over the hash table starting
+ * at slot @index and the dinode is written back.
+ *
+ * Returns: errno
+ */
+
+static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
+                       uint64_t leaf_no, void *data)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct gfs2_leaf tmp_leaf;
+       struct gfs2_rgrp_list rlist;
+       struct buffer_head *bh, *dibh;
+       uint64_t blk;
+       unsigned int rg_blocks = 0, l_blocks = 0;
+       char *ht;
+       unsigned int x, size = len * sizeof(uint64_t); /* bytes of hash table covered by this leaf */
+       int error;
+
+       memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+       ht = kzalloc(size, GFP_KERNEL); /* zero-filled image written over the hash slots below */
+       if (!ht)
+               return -ENOMEM;
+
+       gfs2_alloc_get(dip);
+
+       error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (error)
+               goto out;
+
+       error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+       if (error)
+               goto out_qs;
+
+       /*  Count the number of leaves in the chain and add each block to
+           the rgrp list  */
+
+       for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
+               error = get_leaf(dip, blk, &bh);
+               if (error)
+                       goto out_rlist;
+               gfs2_leaf_in(&tmp_leaf, (bh)->b_data);
+               brelse(bh);
+
+               gfs2_rlist_add(sdp, &rlist, blk);
+               l_blocks++;
+       }
+
+       gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+       for (x = 0; x < rlist.rl_rgrps; x++) { /* total ri_length of every listed rgrp, for the reservation */
+               struct gfs2_rgrpd *rgd;
+               rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
+               rg_blocks += rgd->rd_ri.ri_length;
+       }
+
+       error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+       if (error)
+               goto out_rlist;
+
+       error = gfs2_trans_begin(sdp,
+                       rg_blocks + (DIV_RU(size, sdp->sd_jbsize) + 1) +
+                       RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+       if (error)
+               goto out_rg_gunlock;
+
+       for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) { /* second pass: actually free the chain */
+               error = get_leaf(dip, blk, &bh);
+               if (error)
+                       goto out_end_trans;
+               gfs2_leaf_in(&tmp_leaf, bh->b_data);
+               brelse(bh);
+
+               gfs2_free_meta(dip, blk, 1);
+
+               if (!dip->i_di.di_blocks) /* about to underflow: flag the inode as inconsistent */
+                       gfs2_consist_inode(dip);
+               dip->i_di.di_blocks--;
+       }
+
+       error = gfs2_jdata_write_mem(dip, ht, index * sizeof(uint64_t), size); /* clear the hash slots */
+       if (error != size) { /* a short write is turned into -EIO, a negative errno is kept */
+               if (error >= 0)
+                       error = -EIO;
+               goto out_end_trans;
+       }
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               goto out_end_trans;
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+       gfs2_dinode_out(&dip->i_di, dibh->b_data); /* write back the updated di_blocks */
+       brelse(dibh);
+
+ out_end_trans:
+       gfs2_trans_end(sdp);
+
+ out_rg_gunlock:
+       gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+
+ out_rlist:
+       gfs2_rlist_free(&rlist);
+       gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+
+ out_qs:
+       gfs2_quota_unhold(dip);
+
+ out:
+       gfs2_alloc_put(dip);
+       kfree(ht);
+
+       return error;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Runs leaf_dealloc() over every leaf via foreach_leaf(), then, in a
+ * separate transaction, rewrites the on-disk di_mode to S_IFREG.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+       struct gfs2_sbd *sdp = dip->i_sbd;
+       struct buffer_head *dibh;
+       struct gfs2_dinode *di;
+       int error;
+
+       /* Dealloc on-disk leaves to FREEMETA state */
+       error = foreach_leaf(dip, leaf_dealloc, NULL);
+       if (error)
+               return error;
+
+       /* Make this a regular file in case we crash.
+          (We don't want to free these blocks a second time.)  */
+
+       error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+       if (error)
+               return error;
+
+       error = gfs2_meta_inode_buffer(dip, &dibh);
+       if (error)
+               goto out_end_trans;
+
+       gfs2_trans_add_bh(dip->i_gl, dibh);
+       di = (struct gfs2_dinode *)dibh->b_data;
+       di->di_mode = cpu_to_be32(S_IFREG);
+       brelse(dibh);
+
+ out_end_trans:
+       gfs2_trans_end(sdp);
+
+       return error;
+}
+
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @dip: the directory the entry would be added to
+ * @filename: the filename that's going to be added
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * For a linear directory the dinode block itself is tested with
+ * dirent_fits().  For an exhash directory the leaf chain selected by the
+ * name's hash is walked; an allocation is required only when no leaf in
+ * the chain has room (get_next_leaf() reports -ENOENT).
+ *
+ * Returns: errno
+ */
+
+int gfs2_diradd_alloc_required(struct gfs2_inode *dip, struct qstr *filename,
+                              int *alloc_required)
+{
+       struct buffer_head *bh, *bh_next;
+       uint32_t hsize, hash, index;
+       int error;
+
+       *alloc_required = 0;
+
+       if (!(dip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+               error = gfs2_meta_inode_buffer(dip, &bh);
+               if (error)
+                       return error;
+
+               if (!dirent_fits(dip, bh, filename->len))
+                       *alloc_required = 1;
+
+               brelse(bh);
+
+               return 0;
+       }
+
+       hsize = 1 << dip->i_di.di_depth;
+       if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
+               gfs2_consist_inode(dip);
+               return -EIO;
+       }
+
+       hash = gfs2_disk_hash(filename->name, filename->len);
+       index = hash >> (32 - dip->i_di.di_depth);
+
+       error = get_first_leaf(dip, index, &bh);
+       if (error)
+               return error;
+
+       /* Walk the chain until some leaf has room or the chain ends */
+       while (!dirent_fits(dip, bh, filename->len)) {
+               error = get_next_leaf(dip, bh, &bh_next);
+               if (error == -ENOENT) {
+                       *alloc_required = 1;
+                       error = 0;
+                       break;
+               }
+               if (error)
+                       break;
+
+               brelse(bh);
+               bh = bh_next;
+       }
+
+       brelse(bh);
+
+       return error;
+}
+
+/**
+ * do_gdm - copy out one leaf (or list of leaves)
+ * @dip: the directory
+ * @index: the hash table offset in the directory (not used here)
+ * @len: the number of pointers to this leaf (not used here)
+ * @leaf_no: the leaf number
+ * @data: a pointer to a struct gfs2_user_buffer structure
+ *
+ * Follows the lf_next chain starting at @leaf_no and hands every leaf
+ * buffer to gfs2_add_bh_to_ub(), stopping at the first error.
+ *
+ * Returns: errno
+ */
+
+static int do_gdm(struct gfs2_inode *dip, uint32_t index, uint32_t len,
+                 uint64_t leaf_no, void *data)
+{
+       struct gfs2_user_buffer *ub = data;
+       struct buffer_head *bh;
+       struct gfs2_leaf leaf;
+       uint64_t blk = leaf_no;
+       int error;
+
+       while (blk) {
+               error = get_leaf(dip, blk, &bh);
+               if (error)
+                       return error;
+
+               /* Grab the header first: lf_next is needed after brelse() */
+               gfs2_leaf_in(&leaf, bh->b_data);
+
+               error = gfs2_add_bh_to_ub(ub, bh);
+
+               brelse(bh);
+
+               if (error)
+                       return error;
+
+               blk = leaf.lf_next;
+       }
+
+       return 0;
+}
+
+/**
+ * gfs2_get_dir_meta - return all the leaf blocks of a directory
+ * @dip: the directory
+ * @ub: the structure representing the meta
+ *
+ * Runs do_gdm() over every leaf through foreach_leaf(), passing @ub as the
+ * callback's private data.
+ *
+ * Returns: errno
+ */
+
+int gfs2_get_dir_meta(struct gfs2_inode *dip, struct gfs2_user_buffer *ub)
+{
+       return foreach_leaf(dip, do_gdm, ub);
+}
+
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644 (file)
index 0000000..79f77aa
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+/**
+ * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * gfs2_dir_read() takes a callback of this type together with the @opaque
+ * pointer.
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+typedef int (*gfs2_filldir_t) (void *opaque,
+                             const char *name, unsigned int length,
+                             uint64_t offset,
+                             struct gfs2_inum *inum, unsigned int type);
+
+int gfs2_filecmp(struct qstr *file1, char *file2, int len_of_file2);
+int gfs2_dirent_alloc(struct gfs2_inode *dip, struct buffer_head *bh,
+                    int name_len, struct gfs2_dirent **dent_out);
+
+int gfs2_dir_search(struct gfs2_inode *dip, struct qstr *filename,
+                  struct gfs2_inum *inum, unsigned int *type);
+int gfs2_dir_add(struct gfs2_inode *dip, struct qstr *filename,
+               struct gfs2_inum *inum, unsigned int type);
+int gfs2_dir_del(struct gfs2_inode *dip, struct qstr *filename);
+int gfs2_dir_read(struct gfs2_inode *dip, uint64_t * offset, void *opaque,
+                gfs2_filldir_t filldir); /* picks the exhash or linear reader from di_flags */
+int gfs2_dir_mvino(struct gfs2_inode *dip, struct qstr *filename,
+                 struct gfs2_inum *new_inum, unsigned int new_type); /* used by rename to change ".." */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); /* frees every leaf, then sets on-disk mode to S_IFREG */
+
+int gfs2_diradd_alloc_required(struct gfs2_inode *dip, struct qstr *filename,
+                             int *alloc_required); /* *alloc_required is set to 1 or 0 */
+
+int gfs2_get_dir_meta(struct gfs2_inode *ip, struct gfs2_user_buffer *ub); /* hands every leaf block to @ub */
+
+#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644 (file)
index 0000000..2914731
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+
+/**
+ * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
+ * @name: ea name, possibly starting with a "system." or "user." prefix
+ * @truncated_name: if not NULL, receives a pointer to the part of @name
+ *                  after the prefix, or NULL when no prefix is recognised
+ *
+ * Returns: GFS2_EATYPE_XXX (GFS2_EATYPE_UNUSED when no prefix matches)
+ */
+
+unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
+{
+       unsigned int type = GFS2_EATYPE_UNUSED;
+       char *rest = NULL;
+
+       if (!strncmp(name, "system.", 7)) {
+               type = GFS2_EATYPE_SYS;
+               rest = strchr(name, '.') + 1;
+       } else if (!strncmp(name, "user.", 5)) {
+               type = GFS2_EATYPE_USR;
+               rest = strchr(name, '.') + 1;
+       }
+
+       if (truncated_name)
+               *truncated_name = rest;
+
+       return type;
+}
+
+/* Read a "user" attribute; the caller needs MAY_READ on the inode. */
+static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       int error;
+
+       error = permission(ip->i_vnode, MAY_READ, NULL);
+       if (error)
+               return error;
+
+       return gfs2_ea_get_i(ip, er);
+}
+
+/* Set a "user" attribute.  Only regular files and directories without the
+   sticky bit qualify (-EPERM otherwise), and the caller needs MAY_WRITE. */
+static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct inode *inode = ip->i_vnode;
+       int error;
+
+       if (!S_ISREG(inode->i_mode) &&
+           !(S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX)))
+               return -EPERM;
+
+       error = permission(inode, MAY_WRITE, NULL);
+       if (error)
+               return error;
+
+       return gfs2_ea_set_i(ip, er);
+}
+
+/* Remove a "user" attribute.  Only regular files and directories without
+   the sticky bit qualify (-EPERM otherwise), and the caller needs
+   MAY_WRITE. */
+static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct inode *inode = ip->i_vnode;
+       int error;
+
+       if (!S_ISREG(inode->i_mode) &&
+           !(S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX)))
+               return -EPERM;
+
+       error = permission(inode, MAY_WRITE, NULL);
+       if (error)
+               return error;
+
+       return gfs2_ea_remove_i(ip, er);
+}
+
+/* Read a "system" attribute.  Anything other than the two ACL names needs
+   CAP_SYS_ADMIN (-EPERM otherwise); the ACL names are refused with
+   -EOPNOTSUPP when ar_posix_acl is 0. */
+static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       int is_acl = GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
+                    GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len);
+
+       if (!is_acl && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (is_acl && ip->i_sbd->sd_args.ar_posix_acl == 0)
+               return -EOPNOTSUPP;
+
+       return gfs2_ea_get_i(ip, er);
+}
+
+/* Set a "system" attribute.  Only the access and default ACL names are
+   accepted; every other name gets -EPERM.  gfs2_acl_validate_set() may ask
+   (through its "remove" output) for the attribute to be removed. */
+static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       int remove = 0;
+       int error;
+
+       if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+               /* Default the request's mode to the inode's current mode */
+               if (!(er->er_flags & GFS2_ERF_MODE)) {
+                       er->er_mode = ip->i_di.di_mode;
+                       er->er_flags |= GFS2_ERF_MODE;
+               }
+
+               error = gfs2_acl_validate_set(ip, 1, er,
+                                             &remove, &er->er_mode);
+               if (!error)
+                       error = gfs2_ea_set_i(ip, er);
+               if (error)
+                       return error;
+
+               /* The result of this removal is deliberately ignored */
+               if (remove)
+                       gfs2_ea_remove_i(ip, er);
+
+               return 0;
+       }
+
+       if (!GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))
+               return -EPERM;
+
+       error = gfs2_acl_validate_set(ip, 0, er, &remove, NULL);
+       if (error)
+               return error;
+
+       if (!remove)
+               return gfs2_ea_set_i(ip, er);
+
+       /* Removing something that is already absent counts as success */
+       error = gfs2_ea_remove_i(ip, er);
+
+       return (error == -ENODATA) ? 0 : error;
+}
+
+/* Remove a "system" attribute.  Only the access and default ACL names are
+   accepted (-EPERM otherwise), and gfs2_acl_validate_remove() must agree
+   before the attribute is removed. */
+static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       int error;
+
+       if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len))
+               error = gfs2_acl_validate_remove(ip, 1);
+       else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))
+               error = gfs2_acl_validate_remove(ip, 0);
+       else
+               return -EPERM;
+
+       if (error)
+               return error;
+
+       return gfs2_ea_remove_i(ip, er);
+}
+
+struct gfs2_eattr_operations gfs2_user_eaops = { /* handlers for "user" attributes */
+       .eo_get = user_eo_get,
+       .eo_set = user_eo_set,
+       .eo_remove = user_eo_remove,
+       .eo_name = "user",
+};
+
+struct gfs2_eattr_operations gfs2_system_eaops = { /* handlers for "system" attributes */
+       .eo_get = system_eo_get,
+       .eo_set = system_eo_set,
+       .eo_remove = system_eo_remove,
+       .eo_name = "system",
+};
+
+struct gfs2_eattr_operations *gfs2_ea_ops[] = { /* NOTE(review): presumably indexed by GFS2_EATYPE_* -- confirm */
+       NULL, /* slot 0 has no handlers */
+       &gfs2_user_eaops,
+       &gfs2_system_eaops,
+};
+
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644 (file)
index 0000000..f83c497
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __EAOPS_DOT_H__
+#define __EAOPS_DOT_H__
+
+struct gfs2_ea_request;
+
+struct gfs2_eattr_operations { /* one set of get/set/remove handlers per attribute name prefix */
+       int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+       int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+       int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+       char *eo_name; /* prefix name without the dot: "user" or "system" in eaops.c */
+};
+
+unsigned int gfs2_ea_name2type(const char *name, char **truncated_name); /* returns GFS2_EATYPE_XXX */
+
+extern struct gfs2_eattr_operations gfs2_user_eaops;
+extern struct gfs2_eattr_operations gfs2_system_eaops;
+
+extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+
+#endif /* __EAOPS_DOT_H__ */
+
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644 (file)
index 0000000..63a5cf1
--- /dev/null
@@ -0,0 +1,1620 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+
+/**
+ * ea_calc_size - returns the actual number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp: the filesystem; its sd_jbsize is the limit for a stuffed EA
+ * @er: the request being sized
+ * @size: receives GFS2_EAREQ_SIZE_STUFFED() when that fits in sd_jbsize,
+ *        otherwise GFS2_EAREQ_SIZE_UNSTUFFED()
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+                       unsigned int *size)
+{
+       unsigned int stuffed = GFS2_EAREQ_SIZE_STUFFED(er);
+
+       if (stuffed <= sdp->sd_jbsize) {
+               *size = stuffed;
+               return 1;
+       }
+
+       *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+
+       return 0;
+}
+
+/* Reject a request whose data is longer than GFS2_EA_MAX_DATA_LEN or whose
+   computed on-disk size exceeds sd_jbsize.  Returns 0 or -ERANGE. */
+static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+{
+       unsigned int size;
+
+       if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+               return -ERANGE;
+
+       ea_calc_size(sdp, er, &size);
+
+       /* This can only happen with 512 byte blocks */
+       return (size > sdp->sd_jbsize) ? -ERANGE : 0;
+}
+
+/* Callback run by ea_foreach_i() on each record of an EA block: @ea is the
+   current record, @prev the record before it in the same block (NULL for
+   the first) and @private the caller's cookie.  A non-zero return stops
+   the walk and is handed back to the caller. */
+typedef int (*ea_call_t) (struct gfs2_inode *ip,
+                         struct buffer_head *bh,
+                         struct gfs2_ea_header *ea,
+                         struct gfs2_ea_header *prev,
+                         void *private);
+
+/* Call @ea_call on every record in the EA block @bh.  Each record is
+   validated first (non-zero length, inside the buffer, valid type) and the
+   last record must end exactly at the end of the buffer; a violation marks
+   the inode inconsistent and returns -EIO. */
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                       ea_call_t ea_call, void *data)
+{
+       struct gfs2_ea_header *ea, *prev = NULL;
+       int error = 0;
+
+       if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_EA))
+               return -EIO;
+
+       ea = GFS2_EA_BH2FIRST(bh);
+
+       for (;;) {
+               if (!GFS2_EA_REC_LEN(ea))
+                       goto fail;
+               if ((char *)ea < bh->b_data ||
+                   (char *)GFS2_EA2NEXT(ea) > bh->b_data + bh->b_size)
+                       goto fail;
+               if (!GFS2_EATYPE_VALID(ea->ea_type))
+                       goto fail;
+
+               error = ea_call(ip, bh, ea, prev, data);
+               if (error)
+                       return error;
+
+               if (GFS2_EA_IS_LAST(ea)) {
+                       if ((char *)GFS2_EA2NEXT(ea) !=
+                           bh->b_data + bh->b_size)
+                               goto fail;
+                       break;
+               }
+
+               prev = ea;
+               ea = GFS2_EA2NEXT(ea);
+       }
+
+       return error;
+
+ fail:
+       gfs2_consist_inode(ip);
+       return -EIO;
+}
+
+/* Run @ea_call over every EA record of @ip.  The block at di_eattr is
+   either a single EA block or, when GFS2_DIF_EA_INDIRECT is set, an
+   indirect block whose pointers (up to sd_inptrs, ending at the first
+   zero) each name an EA block. */
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+       struct buffer_head *bh, *eabh;
+       uint64_t *ptr, *ptr_end;
+       int error;
+
+       error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
+                              DIO_START | DIO_WAIT, &bh);
+       if (error)
+               return error;
+
+       if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+               error = ea_foreach_i(ip, bh, ea_call, data);
+               goto out;
+       }
+
+       if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) {
+               error = -EIO;
+               goto out;
+       }
+
+       ptr = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
+       ptr_end = ptr + ip->i_sbd->sd_inptrs;
+
+       while (ptr < ptr_end && *ptr) {
+               error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*ptr),
+                                      DIO_START | DIO_WAIT, &eabh);
+               if (error)
+                       break;
+
+               error = ea_foreach_i(ip, eabh, ea_call, data);
+               brelse(eabh);
+               if (error)
+                       break;
+
+               ptr++;
+       }
+
+ out:
+       brelse(bh);
+
+       return error;
+}
+
+struct ea_find {
+       struct gfs2_ea_request *ef_er;
+       struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                    struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                    void *private)
+{
+       struct ea_find *ef = private;
+       struct gfs2_ea_request *er = ef->ef_er;
+
+       if (ea->ea_type == GFS2_EATYPE_UNUSED)
+               return 0;
+
+       if (ea->ea_type == er->er_type) {
+               if (ea->ea_name_len == er->er_name_len &&
+                   !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+                       struct gfs2_ea_location *el = ef->ef_el;
+                       get_bh(bh);
+                       el->el_bh = bh;
+                       el->el_ea = ea;
+                       el->el_prev = prev;
+                       return 1;
+               }
+       }
+
+#if 0
+       else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
+                er->er_type == GFS2_EATYPE_SYS)
+               return 1;
+#endif
+
+       return 0;
+}
+
+/* Look up the EA described by @er.  @el is always zeroed first; on a match
+   it holds the buffer, record and previous record, otherwise el_ea stays
+   NULL.  Returns 0 whether or not a match was found, or a negative errno
+   from the walk. */
+int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                struct gfs2_ea_location *el)
+{
+       struct ea_find ef = { .ef_er = er, .ef_el = el };
+       int error;
+
+       memset(el, 0, sizeof(*el));
+
+       error = ea_foreach(ip, ea_find_i, &ef);
+
+       /* A positive value is ea_find_i() reporting "found", not an error */
+       return (error > 0) ? 0 : error;
+}
+
+/**
+ * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
+ * @ip: the inode that owns the EA
+ * @bh: the buffer holding the EA record; added to the transaction
+ * @ea: the EA record whose data blocks are freed
+ * @prev: the record before @ea in the same block, or NULL
+ * @private: only tested against NULL: when non-NULL (or when @prev is
+ *           NULL) the record is left in place and marked unused instead
+ *           of being merged into @prev
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Stuffed records and records with no non-zero data pointer are left
+ * alone (returns 0).  Otherwise the rgrp of the last non-zero pointer is
+ * locked exclusively, the data blocks are freed in contiguous runs, their
+ * pointers are zeroed, di_blocks is decremented once per block, and the
+ * dinode is written back with a new ctime.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+                               struct gfs2_ea_header *ea,
+                               struct gfs2_ea_header *prev, void *private)
+{
+       int *leave = private;
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_rgrpd *rgd;
+       struct gfs2_holder rg_gh;
+       struct buffer_head *dibh;
+       uint64_t *dataptrs, bn = 0;
+       uint64_t bstart = 0;
+       unsigned int blen = 0;
+       unsigned int blks = 0;
+       unsigned int x;
+       int error;
+
+       if (GFS2_EA_IS_STUFFED(ea))
+               return 0;
+
+       dataptrs = GFS2_EA2DATAPTRS(ea);
+       for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) /* count non-zero pointers, remember the last one */
+               if (*dataptrs) {
+                       blks++;
+                       bn = be64_to_cpu(*dataptrs);
+               }
+       if (!blks)
+               return 0;
+
+       rgd = gfs2_blk2rgrpd(sdp, bn);
+       if (!rgd) {
+               gfs2_consist_inode(ip);
+               return -EIO;
+       }
+
+       error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+       if (error)
+               return error;
+
+       error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
+                                RES_DINODE + RES_EATTR + RES_STATFS +
+                                RES_QUOTA, blks);
+       if (error)
+               goto out_gunlock;
+
+       gfs2_trans_add_bh(ip->i_gl, bh);
+
+       dataptrs = GFS2_EA2DATAPTRS(ea);
+       for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+               if (!*dataptrs) /* unlike the counting loop, this one stops at the first zero pointer */
+                       break;
+               bn = be64_to_cpu(*dataptrs);
+
+               if (bstart + blen == bn) /* block extends the current run */
+                       blen++;
+               else {
+                       if (bstart) /* flush the finished run before starting a new one */
+                               gfs2_free_meta(ip, bstart, blen);
+                       bstart = bn;
+                       blen = 1;
+               }
+
+               *dataptrs = 0;
+               if (!ip->i_di.di_blocks) /* about to underflow: flag the inode as inconsistent */
+                       gfs2_consist_inode(ip);
+               ip->i_di.di_blocks--;
+       }
+       if (bstart) /* free the final run */
+               gfs2_free_meta(ip, bstart, blen);
+
+       if (prev && !leave) { /* fold this record's space (and LAST flag) into the previous record */
+               uint32_t len;
+
+               len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+               prev->ea_rec_len = cpu_to_be32(len);
+
+               if (GFS2_EA_IS_LAST(ea))
+                       prev->ea_flags |= GFS2_EAFLAG_LAST;
+       } else { /* keep the record but mark it unused */
+               ea->ea_type = GFS2_EATYPE_UNUSED;
+               ea->ea_num_ptrs = 0;
+       }
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               ip->i_di.di_ctime = get_seconds();
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       gfs2_trans_end(sdp);
+
+ out_gunlock:
+       gfs2_glock_dq_uninit(&rg_gh);
+
+       return error;
+}
+
+/* Free the data blocks of the unstuffed EA @ea, wrapping
+   ea_dealloc_unstuffed() in the alloc/quota/rindex holds.  A non-zero
+   @leave asks for the record to be marked unused in place rather than
+   merged into @prev. */
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+                              struct gfs2_ea_header *ea,
+                              struct gfs2_ea_header *prev, int leave)
+{
+       struct gfs2_alloc *al = gfs2_alloc_get(ip);
+       int error;
+
+       error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (!error) {
+               error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
+               if (!error) {
+                       /* The callee only tests this pointer against NULL,
+                          so any non-NULL address stands for "leave" */
+                       error = ea_dealloc_unstuffed(ip, bh, ea, prev,
+                                                    leave ? &leave : NULL);
+
+                       gfs2_glock_dq_uninit(&al->al_ri_gh);
+               }
+
+               gfs2_quota_unhold(ip);
+       }
+
+       gfs2_alloc_put(ip);
+
+       return error;
+}
+
+/******************************************************************************/
+
+static int gfs2_ea_repack_i(struct gfs2_inode *ip)
+{
+       return -EOPNOTSUPP; /* stub: always fails, @ip is not used */
+}
+
+/* Take the inode glock exclusively and run gfs2_ea_repack_i() under it.
+   Returns the glock error, or whatever gfs2_ea_repack_i() returns. */
+int gfs2_ea_repack(struct gfs2_inode *ip)
+{
+       struct gfs2_holder gh;
+       int error;
+
+       error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+       if (!error) {
+               /* Some sort of permissions checking would be nice */
+
+               error = gfs2_ea_repack_i(ip);
+
+               gfs2_glock_dq_uninit(&gh);
+       }
+
+       return error;
+}
+
+/* Cookie passed to ea_list_i() by gfs2_ea_list(). */
+struct ea_list {
+       struct gfs2_ea_request *ei_er;  /* request whose er_data receives the names */
+       unsigned int ei_size;           /* bytes accounted for so far */
+};
+
+/* ea_call_t callback: account for (and, when the request has a buffer,
+   copy out) the name of one EA as "user.<name>" or "system.<name>"
+   followed by a NUL byte.  Unused records are skipped.  Returns -ERANGE
+   when the name would not fit in er_data_len. */
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                    struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                    void *private)
+{
+       struct ea_list *ei = private;
+       struct gfs2_ea_request *er = ei->ei_er;
+       unsigned int ea_size = GFS2_EA_STRLEN(ea);
+       const char *prefix;
+       unsigned int prefix_len;
+       char nul = 0;
+
+       if (ea->ea_type == GFS2_EATYPE_UNUSED)
+               return 0;
+
+       /* No buffer supplied: only the total size is wanted */
+       if (!er->er_data_len)
+               goto count;
+
+       if (ei->ei_size + ea_size > er->er_data_len)
+               return -ERANGE;
+
+       if (ea->ea_type == GFS2_EATYPE_USR) {
+               prefix = "user.";
+               prefix_len = 5;
+       } else {
+               prefix = "system.";
+               prefix_len = 7;
+       }
+
+       memcpy(er->er_data + ei->ei_size, prefix, prefix_len);
+       memcpy(er->er_data + ei->ei_size + prefix_len,
+              GFS2_EA2NAME(ea), ea->ea_name_len);
+       memcpy(er->er_data + ei->ei_size + ea_size - 1, &nul, 1);
+
+ count:
+       ei->ei_size += ea_size;
+
+       return 0;
+}
+
+/**
+ * gfs2_ea_list - list the names of an inode's extended attributes
+ * @ip: the inode; its glock is taken shared for the duration
+ * @er: the request; er_data/er_data_len describe the output buffer.  If
+ *      either is zero both are cleared and only the size is computed.
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_holder i_gh;
+       struct ea_list ei;
+       int error;
+
+       if (!er->er_data || !er->er_data_len) {
+               er->er_data = NULL;
+               er->er_data_len = 0;
+       }
+
+       error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+                                  &i_gh);
+       if (error)
+               return error;
+
+       /* No EA block at all: the list is empty and error is still 0 */
+       if (!ip->i_di.di_eattr)
+               goto out_unlock;
+
+       ei.ei_er = er;
+       ei.ei_size = 0;
+
+       error = ea_foreach(ip, ea_list_i, &ei);
+       if (!error)
+               error = ei.ei_size;
+
+ out_unlock:
+       gfs2_glock_dq_uninit(&i_gh);
+
+       return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: the inode that owns the EA
+ * @ea: the unstuffed EA record; its data pointers name the blocks to read
+ * @data: destination; GFS2_EA_DATA_LEN(@ea) bytes are written to it
+ *
+ * Reads are started on all data blocks first (DIO_START) and then waited
+ * for one by one (DIO_WAIT).  Each block must be of type
+ * GFS2_METATYPE_ED; its payload after the meta header is copied out, at
+ * most sd_jbsize bytes per block.
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+                           char *data)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head **bh;
+       unsigned int amount = GFS2_EA_DATA_LEN(ea);
+       unsigned int nptrs = DIV_RU(amount, sdp->sd_jbsize); /* number of data blocks holding the value */
+       uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
+       unsigned int x;
+       int error = 0;
+
+       bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+       if (!bh)
+               return -ENOMEM;
+
+       for (x = 0; x < nptrs; x++) {
+               error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
+                                      DIO_START, bh + x);
+               if (error) {
+                       while (x--) /* release the buffers obtained so far */
+                               brelse(bh[x]);
+                       goto out;
+               }
+               dataptrs++;
+       }
+
+       for (x = 0; x < nptrs; x++) {
+               error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
+               if (error) {
+                       for (; x < nptrs; x++) /* earlier buffers were already released below */
+                               brelse(bh[x]);
+                       goto out;
+               }
+               if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+                       for (; x < nptrs; x++)
+                               brelse(bh[x]);
+                       error = -EIO;
+                       goto out;
+               }
+
+               memcpy(data,
+                      bh[x]->b_data + sizeof(struct gfs2_meta_header),
+                      (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+               amount -= sdp->sd_jbsize; /* wraps on the final block, but the loop has ended by then */
+               data += sdp->sd_jbsize;
+
+               brelse(bh[x]);
+       }
+
+ out:
+       kfree(bh);
+
+       return error;
+}
+
+/**
+ * gfs2_ea_get_copy - copy the value of a located attribute into a buffer
+ * @ip: the inode that owns the attribute
+ * @el: location of the attribute (el_ea is its header)
+ * @data: destination; must have room for GFS2_EA_DATA_LEN(el->el_ea) bytes
+ *
+ * A stuffed value is copied straight out of the record; an unstuffed one
+ * is fetched through ea_get_unstuffed().
+ *
+ * Returns: 0 for a stuffed attribute, otherwise the result of
+ *          ea_get_unstuffed()
+ */
+
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                    char *data)
+{
+       struct gfs2_ea_header *hdr = el->el_ea;
+
+       if (!GFS2_EA_IS_STUFFED(hdr))
+               return ea_get_unstuffed(ip, hdr, data);
+
+       memcpy(data, GFS2_EA2DATA(hdr), GFS2_EA_DATA_LEN(hdr));
+       return 0;
+}
+
+/**
+ * gfs2_ea_get_i - look up one attribute and optionally copy its value
+ * @ip: the inode to search
+ * @er: the request; when er_data_len is zero only the size is reported
+ *
+ * Returns: the attribute's data length on success, -ENODATA when the inode
+ *          has no eattr block or the name is not found, -ERANGE when the
+ *          value does not fit in er_data_len, or another -errno
+ */
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_ea_location el;
+       unsigned int value_len;
+       int ret;
+
+       if (!ip->i_di.di_eattr)
+               return -ENODATA;
+
+       ret = gfs2_ea_find(ip, er, &el);
+       if (ret)
+               return ret;
+       if (!el.el_ea)
+               return -ENODATA;
+
+       value_len = GFS2_EA_DATA_LEN(el.el_ea);
+
+       if (er->er_data_len == 0) {
+               /* size query only */
+               ret = value_len;
+       } else if (value_len > er->er_data_len) {
+               ret = -ERANGE;
+       } else {
+               ret = gfs2_ea_get_copy(ip, &el, er->er_data);
+               if (ret == 0)
+                       ret = value_len;
+       }
+
+       brelse(el.el_bh);
+       return ret;
+}
+
+/**
+ * gfs2_ea_get - fetch an extended attribute under the inode glock
+ * @ip: the inode to read from
+ * @er: the request; er_name_len must be in 1..GFS2_EA_MAX_NAME_LEN, and an
+ *      unusable buffer is normalised to er_data = NULL, er_data_len = 0
+ *
+ * Takes the inode glock in LM_ST_SHARED and dispatches to the eo_get
+ * handler registered in gfs2_ea_ops[] for er->er_type.
+ *
+ * Returns: whatever the eo_get handler returns, -EINVAL for a bad name
+ *          length, or the -errno from acquiring the glock
+ */
+
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_holder i_gh;
+       int ret;
+
+       if (er->er_name_len == 0 || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+               return -EINVAL;
+
+       if (er->er_data == NULL || er->er_data_len == 0) {
+               er->er_data = NULL;
+               er->er_data_len = 0;
+       }
+
+       ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+       if (ret == 0) {
+               ret = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
+               gfs2_glock_dq_uninit(&i_gh);
+       }
+
+       return ret;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: set to the buffer of the newly allocated block; the callers in
+ *       this file (ea_init_i, ea_set_block) brelse() it when done
+ *
+ * The block is added to the current transaction, stamped as
+ * GFS2_METATYPE_EA and initialised with a single record of type
+ * GFS2_EATYPE_UNUSED that spans sd_jbsize bytes and carries
+ * GFS2_EAFLAG_LAST.  The inode's di_blocks count is incremented.
+ *
+ * Returns: errno (this implementation always returns 0)
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_ea_header *ea;
+       uint64_t block;
+
+       block = gfs2_alloc_meta(ip);
+
+       *bhp = gfs2_meta_new(ip->i_gl, block);
+       gfs2_trans_add_bh(ip->i_gl, *bhp);
+       gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+       /* presumably clears everything after the meta header - TODO confirm */
+       gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+       /* one empty record covering the whole usable area of the block */
+       ea = GFS2_EA_BH2FIRST(*bhp);
+       ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+       ea->ea_type = GFS2_EATYPE_UNUSED;
+       ea->ea_flags = GFS2_EAFLAG_LAST;
+       ea->ea_num_ptrs = 0;
+
+       ip->i_di.di_blocks++;
+
+       return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip:  inode that is being modified
+ * @ea:  the location of the new ea in a block
+ * @er: the write request
+ *
+ * If the stuffed size of the request fits in sd_jbsize the value is stored
+ * inline after the name; otherwise one GFS2_METATYPE_ED block is allocated
+ * per sd_jbsize bytes of data and the block numbers are recorded in the
+ * record's data pointer array.
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
+ *
+ * returns : errno (this implementation always returns 0)
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+                   struct gfs2_ea_request *er)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+
+       ea->ea_data_len = cpu_to_be32(er->er_data_len);
+       ea->ea_name_len = er->er_name_len;
+       ea->ea_type = er->er_type;
+       ea->__pad = 0;
+
+       memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+       if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+               /* stuffed: the value lives in the record itself */
+               ea->ea_num_ptrs = 0;
+               memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+       } else {
+               /* unstuffed: spill the value into separate data blocks */
+               uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
+               const char *data = er->er_data;
+               unsigned int data_len = er->er_data_len;
+               unsigned int copy;
+               unsigned int x;
+
+               ea->ea_num_ptrs = DIV_RU(er->er_data_len, sdp->sd_jbsize);
+               for (x = 0; x < ea->ea_num_ptrs; x++) {
+                       struct buffer_head *bh;
+                       uint64_t block;
+                       int mh_size = sizeof(struct gfs2_meta_header);
+
+                       block = gfs2_alloc_meta(ip);
+
+                       bh = gfs2_meta_new(ip->i_gl, block);
+                       gfs2_trans_add_bh(ip->i_gl, bh);
+                       gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+                       ip->i_di.di_blocks++;
+
+                       /* fill the block, zero-padding a short final chunk */
+                       copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
+                                                            data_len;
+                       memcpy(bh->b_data + mh_size, data, copy);
+                       if (copy < sdp->sd_jbsize)
+                               memset(bh->b_data + mh_size + copy, 0,
+                                      sdp->sd_jbsize - copy);
+
+                       *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
+                       data += copy;
+                       data_len -= copy;
+
+                       brelse(bh);
+               }
+
+               /* every byte of the request must have been placed */
+               gfs2_assert_withdraw(sdp, !data_len);
+       }
+
+       return 0;
+}
+
+/*
+ * Callback run by ea_alloc_skeleton() inside its transaction.  It gets the
+ * inode, the request and the opaque @private pointer handed to
+ * ea_alloc_skeleton(); a non-zero return aborts the update.
+ */
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+                                  struct gfs2_ea_request *er,
+                                  void *private);
+
+/**
+ * ea_alloc_skeleton - common frame for EA updates that allocate blocks
+ * @ip: the inode being modified
+ * @er: the request, passed through to @skeleton_call
+ * @blks: number of blocks requested from the allocator and added to the
+ *        transaction size
+ * @skeleton_call: the operation to perform inside the transaction
+ * @private: opaque argument for @skeleton_call
+ *
+ * Sets up, in order, the alloc structure, the quota lock and check, the
+ * in-place reservation and the transaction; runs @skeleton_call; then
+ * writes the dinode back (updating di_mode when GFS2_ERF_MODE is set, and
+ * di_ctime).  Everything is unwound in reverse order on the way out.
+ *
+ * Returns: errno
+ */
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                            unsigned int blks,
+                            ea_skeleton_call_t skeleton_call,
+                            void *private)
+{
+       struct gfs2_alloc *al;
+       struct buffer_head *dibh;
+       int error;
+
+       al = gfs2_alloc_get(ip);
+
+       error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (error)
+               goto out;
+
+       error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+       if (error)
+               goto out_gunlock_q;
+
+       al->al_requested = blks;
+
+       error = gfs2_inplace_reserve(ip);
+       if (error)
+               goto out_ipres;
+
+       /* al->al_rgd is read here, after the reservation call above */
+       error = gfs2_trans_begin(ip->i_sbd,
+                                blks + al->al_rgd->rd_ri.ri_length +
+                                RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+       if (error)
+               goto out_ipres;
+
+       error = skeleton_call(ip, er, private);
+       if (error)
+               goto out_end_trans;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               if (er->er_flags & GFS2_ERF_MODE) {
+                       /* only permission bits may change, never the type */
+                       gfs2_assert_withdraw(ip->i_sbd,
+                                           (ip->i_di.di_mode & S_IFMT) ==
+                                           (er->er_mode & S_IFMT));
+                       ip->i_di.di_mode = er->er_mode;
+               }
+               ip->i_di.di_ctime = get_seconds();
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+ out_end_trans:
+       gfs2_trans_end(ip->i_sbd);
+
+ out_ipres:
+       gfs2_inplace_release(ip);
+
+ out_gunlock_q:
+       gfs2_quota_unlock(ip);
+
+ out:
+       gfs2_alloc_put(ip);
+
+       return error;
+}
+
+/*
+ * ea_init_i - ea_alloc_skeleton() callback that creates the first EA block
+ *
+ * Allocates a fresh EA block, points di_eattr at it and writes the
+ * requested attribute into the block's first record.  @private is unused.
+ *
+ * Returns: errno
+ */
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                    void *private)
+{
+       struct buffer_head *eabh;
+       int ret = ea_alloc_blk(ip, &eabh);
+
+       if (!ret) {
+               ip->i_di.di_eattr = eabh->b_blocknr;
+               ret = ea_write(ip, GFS2_EA_BH2FIRST(eabh), er);
+               brelse(eabh);
+       }
+
+       return ret;
+}
+
+/**
+ * ea_init - give an inode its first eattr block and store one attribute
+ * @ip: the inode, which has no eattr block yet
+ * @er: the attribute to store
+ *
+ * One block is needed for the EA block itself, plus one data block per
+ * sd_jbsize bytes of value when the request cannot be stuffed.
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       const unsigned int bsize = ip->i_sbd->sd_jbsize;
+       unsigned int nblks;
+
+       if (GFS2_EAREQ_SIZE_STUFFED(er) <= bsize)
+               nblks = 1;
+       else
+               nblks = 1 + DIV_RU(er->er_data_len, bsize);
+
+       return ea_alloc_skeleton(ip, er, nblks, ea_init_i, NULL);
+}
+
+/*
+ * ea_split_ea - carve the unused tail of a record into a record of its own
+ * @ea: an in-use record whose ea_rec_len exceeds GFS2_EA_SIZE(@ea)
+ *
+ * @ea is shrunk to exactly GFS2_EA_SIZE(@ea) bytes and a new header is
+ * placed right behind it covering the remainder.  If @ea carried
+ * GFS2_EAFLAG_LAST the flag moves to the new record.  Only ea_rec_len and
+ * ea_flags of the new record are initialised here.
+ *
+ * Returns: the new record
+ */
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+       uint32_t ea_size = GFS2_EA_SIZE(ea);
+       struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + ea_size);
+       uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+       int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+       ea->ea_rec_len = cpu_to_be32(ea_size);
+       /* clears GFS2_EAFLAG_LAST on ea if it was set, otherwise a no-op */
+       ea->ea_flags ^= last;
+
+       new->ea_rec_len = cpu_to_be32(new_size);
+       new->ea_flags = last;
+
+       return new;
+}
+
+/*
+ * ea_set_remove_stuffed - retire the old copy of an attribute being replaced
+ * @ip: the inode being modified
+ * @el: location of the old record
+ *
+ * The old record is merged into its predecessor.  When there is no
+ * predecessor, or the old record is unstuffed, it is only marked
+ * GFS2_EATYPE_UNUSED.  Runs inside the caller's transaction.
+ */
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+                                 struct gfs2_ea_location *el)
+{
+       struct gfs2_ea_header *ea = el->el_ea;
+       struct gfs2_ea_header *prev = el->el_prev;
+       uint32_t len;
+
+       gfs2_trans_add_bh(ip->i_gl, el->el_bh);
+
+       if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+               ea->ea_type = GFS2_EATYPE_UNUSED;
+               return;
+       } else if (GFS2_EA2NEXT(prev) != ea) {
+               /* prev no longer borders ea - presumably because the new
+                  attribute was split off prev's tail; step one record
+                  forward and verify that one does border ea */
+               prev = GFS2_EA2NEXT(prev);
+               gfs2_assert_withdraw(ip->i_sbd, GFS2_EA2NEXT(prev) == ea);
+       }
+
+       /* absorb ea's space into prev */
+       len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+       prev->ea_rec_len = cpu_to_be32(len);
+
+       if (GFS2_EA_IS_LAST(ea))
+               prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+/* State shared between ea_set_i() and the ea_set_simple*() helpers. */
+struct ea_set {
+       int ea_split;                     /* non-zero: split the record found */
+
+       struct gfs2_ea_request *es_er;    /* the attribute being stored */
+       struct gfs2_ea_location *es_el;   /* old copy to retire, or NULL */
+
+       struct buffer_head *es_bh;        /* block holding es_ea */
+       struct gfs2_ea_header *es_ea;     /* record chosen by ea_set_simple() */
+};
+
+/*
+ * ea_set_simple_noalloc - store a stuffed attribute into an existing record
+ * @ip: the inode being modified
+ * @bh: the EA block that contains @ea
+ * @ea: the record to reuse, or to split when es->ea_split is set
+ * @es: the set state (request, split flag, old copy to retire)
+ *
+ * Opens its own transaction, writes the attribute, retires the old copy
+ * if there is one and writes the dinode back (updating di_mode when
+ * GFS2_ERF_MODE is set, and di_ctime).
+ *
+ * Returns: errno
+ */
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+                                struct gfs2_ea_header *ea, struct ea_set *es)
+{
+       struct gfs2_ea_request *er = es->es_er;
+       struct buffer_head *dibh;
+       int error;
+
+       error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + 2 * RES_EATTR, 0);
+       if (error)
+               return error;
+
+       gfs2_trans_add_bh(ip->i_gl, bh);
+
+       if (es->ea_split)
+               ea = ea_split_ea(ea);
+
+       /* propagate a failed write, as ea_set_simple_alloc() does, rather
+          than retiring the old copy on top of it */
+       error = ea_write(ip, ea, er);
+       if (error)
+               goto out;
+
+       if (es->es_el)
+               ea_set_remove_stuffed(ip, es->es_el);
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (error)
+               goto out;
+
+       if (er->er_flags & GFS2_ERF_MODE) {
+               /* only permission bits may change, never the file type */
+               gfs2_assert_withdraw(ip->i_sbd,
+                       (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
+               ip->i_di.di_mode = er->er_mode;
+       }
+       ip->i_di.di_ctime = get_seconds();
+       gfs2_trans_add_bh(ip->i_gl, dibh);
+       gfs2_dinode_out(&ip->i_di, dibh->b_data);
+       brelse(dibh);
+ out:
+       gfs2_trans_end(ip->i_sbd);
+
+       return error;
+}
+
+/*
+ * ea_set_simple_alloc - ea_alloc_skeleton() callback for ea_set_simple()
+ * @ip: the inode being modified
+ * @er: the attribute to store
+ * @private: the struct ea_set prepared by ea_set_simple(); es_bh and es_ea
+ *           name the record to use
+ *
+ * Writes the attribute into the chosen record (splitting it first when
+ * es->ea_split is set) and retires the old copy, if any.
+ *
+ * Returns: errno
+ */
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+                              struct gfs2_ea_request *er, void *private)
+{
+       struct ea_set *es = private;
+       struct gfs2_ea_header *ea = es->es_ea;
+       int error;
+
+       gfs2_trans_add_bh(ip->i_gl, es->es_bh);
+
+       if (es->ea_split)
+               ea = ea_split_ea(ea);
+
+       error = ea_write(ip, ea, er);
+       if (error)
+               return error;
+
+       if (es->es_el)
+               ea_set_remove_stuffed(ip, es->es_el);
+
+       return 0;
+}
+
+/*
+ * ea_set_simple - ea_foreach() callback: try to store the attribute in @ea
+ * @ip: the inode being modified
+ * @bh: the EA block that contains @ea
+ * @ea: the record under consideration
+ * @prev: the record before @ea
+ * @private: the struct ea_set describing the request
+ *
+ * An unused record is taken over whole if it is large enough; an in-use
+ * record is split if its slack is large enough.
+ *
+ * Returns: 0 if @ea is unsuitable, 1 once the attribute has been stored
+ *          (ea_set_i() maps a positive result to success), -errno on
+ *          failure
+ */
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+                        struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                        void *private)
+{
+       struct ea_set *es = private;
+       unsigned int size;
+       int stuffed;
+       int error;
+
+       stuffed = ea_calc_size(ip->i_sbd, es->es_er, &size);
+
+       if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+               if (GFS2_EA_REC_LEN(ea) < size)
+                       return 0;
+               /* an unused record that still owns data pointers must give
+                  them up before being reused */
+               if (!GFS2_EA_IS_STUFFED(ea)) {
+                       error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+                       if (error)
+                               return error;
+               }
+               es->ea_split = 0;
+       } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+               es->ea_split = 1;
+       else
+               return 0;
+
+       if (stuffed) {
+               error = ea_set_simple_noalloc(ip, bh, ea, es);
+               if (error)
+                       return error;
+       } else {
+               unsigned int blks;
+
+               es->es_bh = bh;
+               es->es_ea = ea;
+               blks = 2 + DIV_RU(es->es_er->er_data_len, ip->i_sbd->sd_jbsize);
+
+               error = ea_alloc_skeleton(ip, es->es_er, blks,
+                                         ea_set_simple_alloc, es);
+               if (error)
+                       return error;
+       }
+
+       return 1;
+}
+
+/*
+ * ea_set_block - ea_alloc_skeleton() callback: store the attribute in a
+ *                newly allocated EA block
+ * @ip: the inode being modified
+ * @er: the attribute to store
+ * @private: location of the old copy to retire (struct gfs2_ea_location *),
+ *           or NULL
+ *
+ * If the inode already has an indirect EA block, the first free pointer
+ * slot in it is used (-ENOSPC if there is none).  Otherwise an indirect
+ * block is created, the existing EA block becomes its first entry and
+ * di_eattr is redirected to the indirect block.
+ *
+ * Returns: errno
+ */
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                       void *private)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head *indbh, *newbh;
+       uint64_t *eablk;
+       int error;
+       int mh_size = sizeof(struct gfs2_meta_header);
+
+       if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+               uint64_t *end;
+
+               error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
+                                      DIO_START | DIO_WAIT, &indbh);
+               if (error)
+                       return error;
+
+               if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+                       error = -EIO;
+                       goto out;
+               }
+
+               eablk = (uint64_t *)(indbh->b_data + mh_size);
+               end = eablk + sdp->sd_inptrs;
+
+               /* find the first unused pointer slot */
+               for (; eablk < end; eablk++)
+                       if (!*eablk)
+                               break;
+
+               if (eablk == end) {
+                       error = -ENOSPC;
+                       goto out;
+               }
+
+               gfs2_trans_add_bh(ip->i_gl, indbh);
+       } else {
+               uint64_t blk;
+
+               blk = gfs2_alloc_meta(ip);
+
+               indbh = gfs2_meta_new(ip->i_gl, blk);
+               gfs2_trans_add_bh(ip->i_gl, indbh);
+               gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+               gfs2_buffer_clear_tail(indbh, mh_size);
+
+               /* the old direct EA block becomes entry 0 of the new
+                  indirect block */
+               eablk = (uint64_t *)(indbh->b_data + mh_size);
+               *eablk = cpu_to_be64(ip->i_di.di_eattr);
+               ip->i_di.di_eattr = blk;
+               ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+               ip->i_di.di_blocks++;
+
+               eablk++;
+       }
+
+       error = ea_alloc_blk(ip, &newbh);
+       if (error)
+               goto out;
+
+       /* eablk now addresses the slot reserved for the new EA block */
+       *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
+       error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+       brelse(newbh);
+       if (error)
+               goto out;
+
+       if (private)
+               ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
+
+ out:
+       brelse(indbh);
+
+       return error;
+}
+
+/*
+ * ea_set_i - store an attribute, reusing existing space when possible
+ * @ip: the inode being modified
+ * @er: the attribute to store
+ * @el: location of the old copy being replaced, or NULL
+ *
+ * First lets ea_set_simple() try every existing record.  If none can hold
+ * the attribute, a new EA block is added through ea_set_block().
+ *
+ * Returns: errno
+ */
+static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                   struct gfs2_ea_location *el)
+{
+       struct ea_set es = { .es_er = er, .es_el = el };
+       unsigned int jbsize = ip->i_sbd->sd_jbsize;
+       unsigned int nblks;
+       int ret;
+
+       ret = ea_foreach(ip, ea_set_simple, &es);
+       if (ret)
+               /* a positive result means ea_set_simple() stored it */
+               return (ret > 0) ? 0 : ret;
+
+       /* an extra block is needed if the indirect block does not exist */
+       nblks = (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) ? 2 : 3;
+       if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
+               nblks += DIV_RU(er->er_data_len, jbsize);
+
+       return ea_alloc_skeleton(ip, er, nblks, ea_set_block, el);
+}
+
+/*
+ * ea_set_remove_unstuffed - remove the old unstuffed copy of an attribute
+ *                           after its replacement has been stored
+ * @ip: the inode being modified
+ * @el: location of the old record; el_prev is corrected in place if it no
+ *      longer borders el_ea
+ *
+ * Returns: the result of ea_remove_unstuffed()
+ */
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+                                  struct gfs2_ea_location *el)
+{
+       /* step el_prev forward one record when it is no longer adjacent to
+          el_ea - presumably because the set split el_prev; TODO confirm */
+       if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+               el->el_prev = GFS2_EA2NEXT(el->el_prev);
+               gfs2_assert_withdraw(ip->i_sbd,
+                                    GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+       }
+
+       return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+
+/**
+ * gfs2_ea_set_i - create or replace an extended attribute
+ * @ip: the inode being modified
+ * @er: the request; XATTR_CREATE / XATTR_REPLACE in er_flags select the
+ *      create-only / replace-only behaviour
+ *
+ * Returns: 0 on success, -ENODATA when XATTR_REPLACE is set but the
+ *          attribute does not exist, -EEXIST when XATTR_CREATE is set but
+ *          it does, -EPERM when replacing on an inode flagged
+ *          GFS2_DIF_APPENDONLY, or another -errno
+ */
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_ea_location el;
+       int error;
+
+       /* no eattr block at all yet: nothing to replace, so create it */
+       if (!ip->i_di.di_eattr) {
+               if (er->er_flags & XATTR_REPLACE)
+                       return -ENODATA;
+               return ea_init(ip, er);
+       }
+
+       error = gfs2_ea_find(ip, er, &el);
+       if (error)
+               return error;
+
+       if (el.el_ea) {
+               if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+                       brelse(el.el_bh);
+                       return -EPERM;
+               }
+
+               error = -EEXIST;
+               if (!(er->er_flags & XATTR_CREATE)) {
+                       /* sample this before ea_set_i() touches the record */
+                       int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+                       error = ea_set_i(ip, er, &el);
+                       /* NOTE(review): the result of the removal is
+                          dropped; a failure here is not reported to the
+                          caller - confirm this is intentional */
+                       if (!error && unstuffed)
+                               ea_set_remove_unstuffed(ip, &el);
+               }
+
+               brelse(el.el_bh);
+       } else {
+               error = -ENODATA;
+               if (!(er->er_flags & XATTR_REPLACE))
+                       error = ea_set_i(ip, er, NULL);
+       }
+
+       return error;
+}
+
+/**
+ * gfs2_ea_set - set an extended attribute under the inode glock
+ * @ip: the inode being modified
+ * @er: the request; er_name_len must be in 1..GFS2_EA_MAX_NAME_LEN, and an
+ *      unusable buffer is normalised to er_data = NULL, er_data_len = 0
+ *
+ * After ea_check_size() accepts the request, the inode glock is taken in
+ * LM_ST_EXCLUSIVE and the eo_set handler registered in gfs2_ea_ops[] for
+ * er->er_type is called.  Immutable inodes are refused with -EPERM.
+ *
+ * Returns: errno
+ */
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_holder i_gh;
+       int ret;
+
+       if (er->er_name_len == 0 || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+               return -EINVAL;
+
+       if (er->er_data == NULL || er->er_data_len == 0) {
+               er->er_data = NULL;
+               er->er_data_len = 0;
+       }
+
+       ret = ea_check_size(ip->i_sbd, er);
+       if (ret)
+               return ret;
+
+       ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+       if (ret)
+               return ret;
+
+       if (!IS_IMMUTABLE(ip->i_vnode))
+               ret = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
+       else
+               ret = -EPERM;
+
+       gfs2_glock_dq_uninit(&i_gh);
+       return ret;
+}
+
+/*
+ * ea_remove_stuffed - remove a stuffed attribute record
+ * @ip: the inode being modified
+ * @el: location of the record to remove
+ *
+ * Inside its own transaction the record is merged into its predecessor,
+ * or marked GFS2_EATYPE_UNUSED when it has none.  The dinode is then
+ * written back with an updated di_ctime.
+ *
+ * Returns: errno
+ */
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+       struct gfs2_ea_header *victim = el->el_ea;
+       struct gfs2_ea_header *before = el->el_prev;
+       struct buffer_head *dibh;
+       int ret;
+
+       ret = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
+       if (ret)
+               return ret;
+
+       gfs2_trans_add_bh(ip->i_gl, el->el_bh);
+
+       if (!before) {
+               victim->ea_type = GFS2_EATYPE_UNUSED;
+       } else {
+               uint32_t merged = GFS2_EA_REC_LEN(before) +
+                                 GFS2_EA_REC_LEN(victim);
+
+               before->ea_rec_len = cpu_to_be32(merged);
+               if (GFS2_EA_IS_LAST(victim))
+                       before->ea_flags |= GFS2_EAFLAG_LAST;
+       }
+
+       ret = gfs2_meta_inode_buffer(ip, &dibh);
+       if (ret == 0) {
+               ip->i_di.di_ctime = get_seconds();
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       gfs2_trans_end(ip->i_sbd);
+       return ret;
+}
+
+/**
+ * gfs2_ea_remove_i - locate an attribute and remove it
+ * @ip: the inode being modified
+ * @er: the request naming the attribute
+ *
+ * Returns: -ENODATA when the inode has no eattr block or the attribute is
+ *          not found, otherwise the result of the stuffed or unstuffed
+ *          removal
+ */
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_ea_location el;
+       int ret;
+
+       if (!ip->i_di.di_eattr)
+               return -ENODATA;
+
+       ret = gfs2_ea_find(ip, er, &el);
+       if (ret)
+               return ret;
+       if (!el.el_ea)
+               return -ENODATA;
+
+       if (!GFS2_EA_IS_STUFFED(el.el_ea))
+               ret = ea_remove_unstuffed(ip, el.el_bh, el.el_ea,
+                                         el.el_prev, 0);
+       else
+               ret = ea_remove_stuffed(ip, &el);
+
+       brelse(el.el_bh);
+       return ret;
+}
+
+/**
+ * gfs2_ea_remove - remove an extended attribute under the inode glock
+ * @ip: pointer to the inode of the target file
+ * @er: request information; er_name_len must be in
+ *      1..GFS2_EA_MAX_NAME_LEN
+ *
+ * Takes the inode glock in LM_ST_EXCLUSIVE and calls the eo_remove handler
+ * registered in gfs2_ea_ops[] for er->er_type.  Immutable and append-only
+ * inodes are refused with -EPERM.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+       struct gfs2_holder i_gh;
+       int ret;
+
+       if (er->er_name_len == 0 || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+               return -EINVAL;
+
+       ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+       if (ret)
+               return ret;
+
+       if (!IS_IMMUTABLE(ip->i_vnode) && !IS_APPEND(ip->i_vnode))
+               ret = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+       else
+               ret = -EPERM;
+
+       gfs2_glock_dq_uninit(&i_gh);
+       return ret;
+}
+
+/*
+ * ea_acl_chmod_unstuffed - overwrite the data blocks of an unstuffed EA
+ * @ip: the inode that owns the attribute
+ * @ea: header of the unstuffed attribute
+ * @data: new value; GFS2_EA_DATA_LEN(@ea) bytes are read from it
+ *
+ * Begins a transaction and rewrites every data block in place.  On
+ * success the transaction is deliberately left open: the function returns
+ * through "out" without gfs2_trans_end(), and the caller
+ * (gfs2_ea_acl_chmod) ends it.  On failure after the transaction has begun
+ * it is ended here.
+ *
+ * Returns: errno
+ */
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+                                 struct gfs2_ea_header *ea, char *data)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct buffer_head **bh;
+       unsigned int amount = GFS2_EA_DATA_LEN(ea);
+       unsigned int nptrs = DIV_RU(amount, sdp->sd_jbsize);
+       uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
+       unsigned int x;
+       int error;
+
+       bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+       if (!bh)
+               return -ENOMEM;
+
+       error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+       if (error)
+               goto out;
+
+       /* Pass 1: issue the read for every data block. */
+       for (x = 0; x < nptrs; x++) {
+               error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
+                                      DIO_START, bh + x);
+               if (error) {
+                       /* release the buffers obtained so far */
+                       while (x--)
+                               brelse(bh[x]);
+                       goto fail;
+               }
+               dataptrs++;
+       }
+
+       /* Pass 2: wait for each block, validate it and overwrite it. */
+       for (x = 0; x < nptrs; x++) {
+               error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
+               if (error) {
+                       /* buffers below x were already released */
+                       for (; x < nptrs; x++)
+                               brelse(bh[x]);
+                       goto fail;
+               }
+               if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+                       for (; x < nptrs; x++)
+                               brelse(bh[x]);
+                       error = -EIO;
+                       goto fail;
+               }
+
+               gfs2_trans_add_bh(ip->i_gl, bh[x]);
+
+               memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
+                      data,
+                      (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+               /* may wrap on a short final block, but amount is not read
+                  again once the loop ends */
+               amount -= sdp->sd_jbsize;
+               data += sdp->sd_jbsize;
+
+               brelse(bh[x]);
+       }
+
+       /* success and "transaction never started" both leave through here */
+ out:
+       kfree(bh);
+
+       return error;
+
+ fail:
+       gfs2_trans_end(sdp);
+       kfree(bh);
+
+       return error;
+}
+
+/**
+ * gfs2_ea_acl_chmod - rewrite an ACL attribute's value and apply an iattr
+ * @ip: the inode being modified
+ * @el: location of the existing attribute; its length is not changed
+ * @attr: attribute changes handed to inode_setattr()
+ * @data: replacement value, GFS2_EA_DATA_LEN(el->el_ea) bytes long
+ *
+ * Both the stuffed and the unstuffed path leave a transaction open on
+ * success; it is ended at the bottom of this function after the dinode
+ * has been written back.
+ *
+ * Returns: errno
+ */
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                     struct iattr *attr, char *data)
+{
+       struct buffer_head *dibh;
+       int error;
+
+       if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+               error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
+               if (error)
+                       return error;
+
+               gfs2_trans_add_bh(ip->i_gl, el->el_bh);
+               memcpy(GFS2_EA2DATA(el->el_ea),
+                      data,
+                      GFS2_EA_DATA_LEN(el->el_ea));
+       } else
+               error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+       /* only the unstuffed helper can have failed here; it has already
+          ended its own transaction in that case */
+       if (error)
+               return error;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               error = inode_setattr(ip->i_vnode, attr);
+               gfs2_assert_warn(ip->i_sbd, !error);
+               gfs2_inode_attr_out(ip);
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       gfs2_trans_end(ip->i_sbd);
+
+       return error;
+}
+
+/*
+ * ea_dealloc_indirect - free every EA block listed in the indirect block
+ * @ip: the inode whose di_eattr names an indirect EA block
+ *
+ * The pointer array is walked twice.  The first walk groups the block
+ * numbers into contiguous extents, registers each extent start in a
+ * resource group list and counts the blocks.  The second walk runs inside
+ * a transaction with the resource groups locked: it frees the extents and
+ * zeroes the pointers.  GFS2_DIF_EA_INDIRECT is then cleared.  The
+ * indirect block itself is not freed here; di_eattr still refers to it.
+ *
+ * Returns: errno (0 without doing anything if the indirect block is empty)
+ */
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_rgrp_list rlist;
+       struct buffer_head *indbh, *dibh;
+       uint64_t *eablk, *end;
+       unsigned int rg_blocks = 0;
+       uint64_t bstart = 0;
+       unsigned int blen = 0;
+       unsigned int blks = 0;
+       unsigned int x;
+       int error;
+
+       memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+       error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
+                              DIO_START | DIO_WAIT, &indbh);
+       if (error)
+               return error;
+
+       if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+               error = -EIO;
+               goto out;
+       }
+
+       eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+       end = eablk + sdp->sd_inptrs;
+
+       /* Walk 1: collect the resource groups involved and count blocks. */
+       for (; eablk < end; eablk++) {
+               uint64_t bn;
+
+               /* the first zero pointer ends the list */
+               if (!*eablk)
+                       break;
+               bn = be64_to_cpu(*eablk);
+
+               /* extend the current extent, or close it and start anew */
+               if (bstart + blen == bn)
+                       blen++;
+               else {
+                       if (bstart)
+                               gfs2_rlist_add(sdp, &rlist, bstart);
+                       bstart = bn;
+                       blen = 1;
+               }
+               blks++;
+       }
+       if (bstart)
+               gfs2_rlist_add(sdp, &rlist, bstart);
+       else
+               goto out;
+
+       gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+       /* size the transaction by the length of every rgrp involved */
+       for (x = 0; x < rlist.rl_rgrps; x++) {
+               struct gfs2_rgrpd *rgd;
+               rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
+               rg_blocks += rgd->rd_ri.ri_length;
+       }
+
+       error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+       if (error)
+               goto out_rlist_free;
+
+       error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+                                RES_INDIRECT + RES_STATFS +
+                                RES_QUOTA, blks);
+       if (error)
+               goto out_gunlock;
+
+       gfs2_trans_add_bh(ip->i_gl, indbh);
+
+       /* Walk 2: free the extents and clear the pointers. */
+       eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+       bstart = 0;
+       blen = 0;
+
+       for (; eablk < end; eablk++) {
+               uint64_t bn;
+
+               if (!*eablk)
+                       break;
+               bn = be64_to_cpu(*eablk);
+
+               if (bstart + blen == bn)
+                       blen++;
+               else {
+                       if (bstart)
+                               gfs2_free_meta(ip, bstart, blen);
+                       bstart = bn;
+                       blen = 1;
+               }
+
+               *eablk = 0;
+               /* di_blocks must not underflow; flag the inode if it would */
+               if (!ip->i_di.di_blocks)
+                       gfs2_consist_inode(ip);
+               ip->i_di.di_blocks--;
+       }
+       if (bstart)
+               gfs2_free_meta(ip, bstart, blen);
+
+       ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       gfs2_trans_end(sdp);
+
+ out_gunlock:
+       gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+
+ out_rlist_free:
+       gfs2_rlist_free(&rlist);
+
+ out:
+       brelse(indbh);
+
+       return error;
+}
+
+/*
+ * ea_dealloc_block - free the block that di_eattr points at
+ * @ip: the inode being modified
+ *
+ * Locks the resource group that contains the block, frees the block in a
+ * transaction, clears di_eattr, decrements di_blocks and writes the dinode
+ * back.  Uses the rgrp holder embedded in the inode's alloc structure.
+ *
+ * Returns: errno (-EIO if no resource group contains the block)
+ */
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+       struct gfs2_sbd *sdp = ip->i_sbd;
+       struct gfs2_alloc *al = &ip->i_alloc;
+       struct gfs2_rgrpd *rgd;
+       struct buffer_head *dibh;
+       int error;
+
+       rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+       if (!rgd) {
+               gfs2_consist_inode(ip);
+               return -EIO;
+       }
+
+       error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+                                  &al->al_rgd_gh);
+       if (error)
+               return error;
+
+       error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
+                                RES_STATFS + RES_QUOTA, 1);
+       if (error)
+               goto out_gunlock;
+
+       gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+
+       ip->i_di.di_eattr = 0;
+       /* di_blocks must not underflow; flag the inode if it would */
+       if (!ip->i_di.di_blocks)
+               gfs2_consist_inode(ip);
+       ip->i_di.di_blocks--;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (!error) {
+               gfs2_trans_add_bh(ip->i_gl, dibh);
+               gfs2_dinode_out(&ip->i_di, dibh->b_data);
+               brelse(dibh);
+       }
+
+       gfs2_trans_end(sdp);
+
+ out_gunlock:
+       gfs2_glock_dq_uninit(&al->al_rgd_gh);
+
+       return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Works from the leaves inwards: first the data blocks of unstuffed
+ * attributes (ea_dealloc_unstuffed via ea_foreach), then the EA blocks
+ * listed in the indirect block if there is one, and finally the block
+ * di_eattr itself points at.  The quota hold and the rindex hold are kept
+ * for the whole operation.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+       struct gfs2_alloc *al;
+       int error;
+
+       al = gfs2_alloc_get(ip);
+
+       error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+       if (error)
+               goto out_alloc;
+
+       error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
+       if (error)
+               goto out_quota;
+
+       error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+       if (error)
+               goto out_rindex;
+
+       if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+               error = ea_dealloc_indirect(ip);
+               if (error)
+                       goto out_rindex;
+       }
+
+       /* frees the direct EA block, or the indirect block whose entries
+          were released just above */
+       error = ea_dealloc_block(ip);
+
+ out_rindex:
+       gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+ out_quota:
+       gfs2_quota_unhold(ip);
+
+ out_alloc:
+       gfs2_alloc_put(ip);
+
+       return error;
+}
+
+/**
+ * gfs2_get_eattr_meta - return all the eattr blocks of a file
+ * @ip: the inode whose eattr blocks are wanted
+ * @ub: the structure representing the user buffer to copy to
+ *
+ * Copies the block named by di_eattr and, when GFS2_DIF_EA_INDIRECT is
+ * set, every EA block listed in it.  A failure reported by
+ * gfs2_add_bh_to_ub() stops the copy and is returned to the caller
+ * instead of being dropped.
+ *
+ * Returns: errno
+ */
+
+int gfs2_get_eattr_meta(struct gfs2_inode *ip, struct gfs2_user_buffer *ub)
+{
+       struct buffer_head *bh;
+       int error;
+
+       error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
+                              DIO_START | DIO_WAIT, &bh);
+       if (error)
+               return error;
+
+       error = gfs2_add_bh_to_ub(ub, bh);
+       if (error)
+               goto out;
+
+       if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+               struct buffer_head *eabh;
+               uint64_t *eablk, *end;
+
+               if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) {
+                       error = -EIO;
+                       goto out;
+               }
+
+               eablk = (uint64_t *)(bh->b_data +
+                                    sizeof(struct gfs2_meta_header));
+               end = eablk + ip->i_sbd->sd_inptrs;
+
+               for (; eablk < end; eablk++) {
+                       uint64_t bn;
+
+                       /* the first zero pointer ends the list */
+                       if (!*eablk)
+                               break;
+                       bn = be64_to_cpu(*eablk);
+
+                       error = gfs2_meta_read(ip->i_gl, bn,
+                                              DIO_START | DIO_WAIT, &eabh);
+                       if (error)
+                               break;
+                       /* keep the result so the check after brelse() is
+                          live; the buffer is released either way */
+                       error = gfs2_add_bh_to_ub(ub, eabh);
+                       brelse(eabh);
+                       if (error)
+                               break;
+               }
+       }
+
+ out:
+       brelse(bh);
+
+       return error;
+}
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644 (file)
index 0000000..a640392
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+/* Length fields of an EA header, converted from big-endian to CPU order */
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+/* Header + name + (inline data if stuffed, else the array of 64-bit block
+   pointers), rounded up to a multiple of 8 */
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+                                  (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
+
+/* 5 (GFS2_EATYPE_USR) or 7 (any other type) + name length + 1.
+   NOTE(review): presumably the "user."/"system." prefix lengths and a
+   trailing NUL -- confirm against the listing code. */
+#define GFS2_EA_STRLEN(ea) \
+((((ea)->ea_type == GFS2_EATYPE_USR) ? 5 : 7) + (ea)->ea_name_len + 1)
+
+/* An EA with no block pointers is "stuffed" */
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+/* Space needed for a request stored stuffed: header + name + data,
+   rounded up to a multiple of 8 */
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+/* Space needed for a request stored unstuffed: header + name + one 64-bit
+   pointer per sd_jbsize-sized piece of data, rounded up to a multiple of 8 */
+#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
+      sizeof(uint64_t) * DIV_RU((er)->er_data_len, (sdp)->sd_jbsize), 8)
+
+/* The name starts immediately after the header; the data follows the name */
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+/* The pointer array starts after the name padded to a multiple of 8 */
+#define GFS2_EA2DATAPTRS(ea) \
+((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+/* Step ea_rec_len bytes forward from this header */
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+/* The first EA header in a buffer sits right after the meta header */
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+#define GFS2_ERF_MODE 0x80000000
+
+/* Parameters of a single extended attribute operation.  er_name_len and
+   er_data_len are the lengths the GFS2_EAREQ_SIZE_*() macros add up.
+   NOTE(review): er_flags/er_mode semantics are not visible in this
+   header -- see the users in eattr.c. */
+struct gfs2_ea_request {
+       char *er_name;
+       char *er_data;
+       unsigned int er_name_len;
+       unsigned int er_data_len;
+       unsigned int er_type; /* GFS2_EATYPE_... */
+       int er_flags;
+       mode_t er_mode;
+};
+
+/* Where an extended attribute was found: a buffer and EA headers in it.
+   NOTE(review): el_prev is presumably the header preceding el_ea in the
+   same block -- confirm against gfs2_ea_find(). */
+struct gfs2_ea_location {
+       struct buffer_head *el_bh;
+       struct gfs2_ea_header *el_ea;
+       struct gfs2_ea_header *el_prev;
+};
+
+int gfs2_ea_repack(struct gfs2_inode *ip);
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+int gfs2_get_eattr_meta(struct gfs2_inode *ip, struct gfs2_user_buffer *ub);
+
+/* Exported to acl.c */
+
+int gfs2_ea_find(struct gfs2_inode *ip,
+                struct gfs2_ea_request *er,
+                struct gfs2_ea_location *el);
+int gfs2_ea_get_copy(struct gfs2_inode *ip,
+                    struct gfs2_ea_location *el,
+                    char *data);
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                     struct iattr *attr, char *data);
+
+#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644 (file)
index 0000000..c7bf32c
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __FORMAT_DOT_H__
+#define __FORMAT_DOT_H__
+
+/* NOTE(review): both tables currently hold a single 0 entry; presumably
+   they are zero-terminated lists of older format numbers that can still
+   be handled -- confirm against the code that walks them. */
+static const uint32_t gfs2_old_fs_formats[] = {
+       0
+};
+
+static const uint32_t gfs2_old_multihost_formats[] = {
+       0
+};
+
+#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644 (file)
index 0000000..a5d1182
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+
+#include <linux/gfs2_ondisk.h>
+
+#include "lm_interface.h"
+#include "lvb.h"
+#include "incore.h"
+#include "util.h"
+
+/* Symbolic 0/1 values.  NOTE(review): presumably passed as int flag
+   arguments (e.g. a "create" parameter) for readability -- see callers. */
+enum {
+       NO_CREATE = 0,
+       CREATE = 1,
+};
+
+enum {
+       NO_WAIT = 0,
+       WAIT = 1,
+};
+
+enum {
+       NO_FORCE = 0,
+       FORCE = 1,
+};
+
+/*  Divide num by den.  Round up if there is a remainder.  */
+#define DIV_RU(num, den) (((num) + (den) - 1) / (den))
+
+#define GFS2_FAST_NAME_SIZE 8
+
+/*
+ * Typed accessors for the private pointers GFS2 hangs off VFS and glock
+ * objects.  Each get_*() casts the stored pointer to the GFS2 type; each
+ * set_*() stores it.  The set_*() expansions are fully parenthesized so
+ * they behave as a single expression wherever they are used.
+ */
+#define get_v2sdp(sb) ((struct gfs2_sbd *)(sb)->s_fs_info)
+#define set_v2sdp(sb, sdp) ((sb)->s_fs_info = (sdp))
+#define get_v2ip(inode) ((struct gfs2_inode *)(inode)->u.generic_ip)
+#define set_v2ip(inode, ip) ((inode)->u.generic_ip = (ip))
+#define get_v2fp(file) ((struct gfs2_file *)(file)->private_data)
+#define set_v2fp(file, fp) ((file)->private_data = (fp))
+#define get_v2bd(bh) ((struct gfs2_bufdata *)(bh)->b_private)
+#define set_v2bd(bh, bd) ((bh)->b_private = (bd))
+#define get_v2db(bh) ((struct gfs2_databuf *)(bh)->b_private)
+#define set_v2db(bh, db) ((bh)->b_private = (db))
+
+/* The current task's transaction is kept in current->journal_info */
+#define get_transaction ((struct gfs2_trans *)(current->journal_info))
+#define set_transaction(tr) ((current->journal_info) = (tr))
+
+/* gl_object is shared storage; which type it holds depends on the glock */
+#define get_gl2ip(gl) ((struct gfs2_inode *)(gl)->gl_object)
+#define set_gl2ip(gl, ip) ((gl)->gl_object = (ip))
+#define get_gl2rgd(gl) ((struct gfs2_rgrpd *)(gl)->gl_object)
+#define set_gl2rgd(gl, rgd) ((gl)->gl_object = (rgd))
+#define get_gl2gl(gl) ((struct gfs2_glock *)(gl)->gl_object)
+#define set_gl2gl(gl, gl2) ((gl)->gl_object = (gl2))
+
+#endif /* __GFS2_DOT_H__ */
+
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644 (file)
index 0000000..321945f
--- /dev/null
@@ -0,0 +1,2513 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+
+/*  Must be kept in sync with the beginning of struct gfs2_glock  */
+/*  search_bucket() skips hash-list entries that have GLF_PLUG set in
+    gl_flags, so an entry of this shape can sit on a bucket list without
+    being mistaken for a real glock.  */
+struct glock_plug {
+       struct list_head gl_list;
+       unsigned long gl_flags;
+};
+
+/*  A holder bundled with a work item; rq_greedy() recovers the enclosing
+    struct from the holder with container_of() and frees it.  */
+struct greedy {
+       struct gfs2_holder gr_gh;
+       struct work_struct gr_work;
+};
+
+/*  Callback type taking a single glock  */
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+/**
+ * relaxed_state_ok - decide whether a held lock state satisfies a request
+ * @actual: the state the lock is currently in
+ * @requested: the state the caller asked for
+ * @flags: the caller's modifier flags (GL_EXACT and LM_FLAG_ANY are examined)
+ *
+ * An identical state always satisfies the request.  Otherwise, unless
+ * GL_EXACT was given, an exclusive lock satisfies a shared request and,
+ * with LM_FLAG_ANY, so does any state other than unlocked.
+ *
+ * Returns: 1 if the locks are compatible, 0 otherwise
+ */
+
+static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
+                                  int flags)
+{
+       int excl_covers_shared;
+       int any_held_state;
+
+       if (actual == requested)
+               return 1;
+       if (flags & GL_EXACT)
+               return 0;
+
+       excl_covers_shared = (actual == LM_ST_EXCLUSIVE &&
+                             requested == LM_ST_SHARED);
+       any_held_state = (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY));
+
+       return excl_covers_shared || any_held_state;
+}
+
+/**
+ * gl_hash() - Turn a lock name into a hash bucket number
+ * @name: The lock name (number and type are both hashed)
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+
+static unsigned int gl_hash(struct lm_lockname *name)
+{
+       unsigned int bucket_nr = jhash(&name->ln_number, sizeof(uint64_t), 0);
+
+       /* Chain the type into the hash of the number */
+       bucket_nr = jhash(&name->ln_type, sizeof(unsigned int), bucket_nr);
+
+       return bucket_nr & GFS2_GL_HASH_MASK;
+}
+
+/**
+ * glock_free() - Release a struct gfs2_glock and what hangs off it
+ * @gl: The glock to release
+ *
+ * Hands the lock module's structure back, drops the address space inode
+ * if the glock has one, returns the glock to its slab cache and lowers
+ * the superblock's glock count.
+ *
+ */
+
+static void glock_free(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct inode *mapping_inode = gl->gl_aspace;
+
+       gfs2_lm_put_lock(sdp, gl->gl_lock);
+
+       if (mapping_inode != NULL)
+               gfs2_aspace_put(mapping_inode);
+
+       kmem_cache_free(gfs2_glock_cachep, gl);
+
+       /* gl is gone now; sdp was read out of it beforehand */
+       atomic_dec(&sdp->sd_glock_count);
+}
+
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ * Takes one reference on gl->gl_ref with kref_get().
+ *
+ */
+
+void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+       kref_get(&gl->gl_ref);
+}
+
+/* All work is done after the return from kref_put() so we
+   can release the write_lock before the free. */
+
+/* kref release callback for gl_ref.  It only asserts that the glock is
+   unlocked and that its reclaim, holder and waiter lists are empty; it
+   does not free anything itself. */
+static void kill_glock(struct kref *kref)
+{
+       struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+
+       gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+       gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+       gfs2_assert(sdp, list_empty(&gl->gl_holders));
+       gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
+       gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
+       gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+}
+
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * The reference is dropped with the hash bucket write-locked so that the
+ * glock can be unhashed before anyone else finds it.  The bucket lock is
+ * released before the glock is actually freed.
+ *
+ * Returns: 1 if this was the last reference and the glock was freed,
+ *          0 otherwise
+ */
+
+int gfs2_glock_put(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
+       int freed = 0;
+
+       down(&sdp->sd_invalidate_inodes_mutex);
+
+       write_lock(&bucket->hb_lock);
+       if (kref_put(&gl->gl_ref, kill_glock)) {
+               /* Last reference: unhash, unlock, then free */
+               list_del_init(&gl->gl_list);
+               write_unlock(&bucket->hb_lock);
+               glock_free(gl);
+               freed = 1;
+       } else {
+               write_unlock(&bucket->hb_lock);
+       }
+
+       up(&sdp->sd_invalidate_inodes_mutex);
+       return freed;
+}
+
+/**
+ * queue_empty - check to see if a glock's queue is empty
+ * @gl: the glock
+ * @head: the head of the queue to check
+ *
+ * The check is made under gl_spin.  glmutex normally keeps processes from
+ * working on the same glock at once, but a process that already has a
+ * holder on the list may add a second holder for itself ("recursive"
+ * locking) without taking glmutex, so the spin lock is what protects the
+ * list here.
+ *
+ * Returns: 1 if the queue is empty
+ */
+
+static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
+{
+       int is_empty;
+
+       spin_lock(&gl->gl_spin);
+       is_empty = list_empty(head);
+       spin_unlock(&gl->gl_spin);
+
+       return is_empty;
+}
+
+/**
+ * search_bucket() - Find struct gfs2_glock by lock name
+ * @bucket: the bucket to search
+ * @name: The lock name
+ *
+ * Entries flagged GLF_PLUG are skipped.  A matching glock is returned
+ * with an extra reference taken on it.
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested name
+ */
+
+static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
+                                       struct lm_lockname *name)
+{
+       struct gfs2_glock *cur;
+
+       list_for_each_entry(cur, &bucket->hb_list, gl_list) {
+               if (test_bit(GLF_PLUG, &cur->gl_flags) ||
+                   !lm_name_equal(&cur->gl_name, name))
+                       continue;
+
+               kref_get(&cur->gl_ref);
+               return cur;
+       }
+
+       return NULL;
+}
+
+/**
+ * gfs2_glock_find() - Find glock by lock number
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * Searches the hash bucket selected by gl_hash() under the bucket's read
+ * lock.  search_bucket() takes a reference on the glock it returns, so a
+ * non-NULL result comes with its reference count raised.
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
+                                  struct lm_lockname *name)
+{
+       struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
+       struct gfs2_glock *gl;
+
+       read_lock(&bucket->hb_lock);
+       gl = search_bucket(bucket, name);
+       read_unlock(&bucket->hb_lock);
+
+       return gl;
+}
+
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ *
+ * When @create is 0 and no glock exists, 0 is returned and *@glp is set
+ * to NULL.  On error *@glp is left untouched.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
+                  struct gfs2_glock_operations *glops, int create,
+                  struct gfs2_glock **glp)
+{
+       struct lm_lockname name;
+       struct gfs2_glock *gl, *tmp;
+       struct gfs2_gl_hash_bucket *bucket;
+       int error;
+
+       name.ln_number = number;
+       name.ln_type = glops->go_type;
+       bucket = &sdp->sd_gl_hash[gl_hash(&name)];
+
+       /* Fast path: the glock may already be hashed */
+       read_lock(&bucket->hb_lock);
+       gl = search_bucket(bucket, &name);
+       read_unlock(&bucket->hb_lock);
+
+       if (gl || !create) {
+               *glp = gl;
+               return 0;
+       }
+
+       /* Not found: build a new glock without holding the bucket lock */
+       gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+       if (!gl)
+               return -ENOMEM;
+
+       memset(gl, 0, sizeof(struct gfs2_glock));
+
+       INIT_LIST_HEAD(&gl->gl_list);
+       gl->gl_name = name;
+       kref_init(&gl->gl_ref);
+
+       spin_lock_init(&gl->gl_spin);
+
+       gl->gl_state = LM_ST_UNLOCKED;
+       INIT_LIST_HEAD(&gl->gl_holders);
+       INIT_LIST_HEAD(&gl->gl_waiters1);
+       INIT_LIST_HEAD(&gl->gl_waiters2);
+       INIT_LIST_HEAD(&gl->gl_waiters3);
+
+       gl->gl_ops = glops;
+
+       gl->gl_bucket = bucket;
+       INIT_LIST_HEAD(&gl->gl_reclaim);
+
+       gl->gl_sbd = sdp;
+
+       lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+       INIT_LIST_HEAD(&gl->gl_ail_list);
+
+       /* If this glock protects actual on-disk data or metadata blocks,
+          create a VFS inode to manage the pages/buffers holding them. */
+       if (glops == &gfs2_inode_glops ||
+           glops == &gfs2_rgrp_glops ||
+           glops == &gfs2_meta_glops) {
+               gl->gl_aspace = gfs2_aspace_get(sdp);
+               if (!gl->gl_aspace) {
+                       error = -ENOMEM;
+                       goto fail;
+               }
+       }
+
+       error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
+       if (error)
+               goto fail_aspace;
+
+       atomic_inc(&sdp->sd_glock_count);
+
+       /* The bucket was unlocked while we were allocating, so somebody
+          else may have hashed the same name meanwhile.  Re-check under
+          the write lock and, if so, throw ours away and use theirs. */
+       write_lock(&bucket->hb_lock);
+       tmp = search_bucket(bucket, &name);
+       if (tmp) {
+               write_unlock(&bucket->hb_lock);
+               glock_free(gl);
+               gl = tmp;
+       } else {
+               list_add_tail(&gl->gl_list, &bucket->hb_list);
+               write_unlock(&bucket->hb_lock);
+       }
+
+       *glp = gl;
+
+       return 0;
+
+ fail_aspace:
+       if (gl->gl_aspace)
+               gfs2_aspace_put(gl->gl_aspace);
+
+ fail:
+       kmem_cache_free(gfs2_glock_cachep, gl); 
+
+       return error;
+}
+
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * The holder is owned by the current task unless GL_NEVER_RECURSE is
+ * given, in which case it has no owner.  A request for LM_ST_EXCLUSIVE
+ * always gets GL_LOCAL_EXCL added to its flags.  A reference is taken on
+ * @gl on behalf of the holder.
+ *
+ */
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, int flags,
+                     struct gfs2_holder *gh)
+{
+       INIT_LIST_HEAD(&gh->gh_list);
+       init_completion(&gh->gh_wait);
+
+       gh->gh_gl = gl;
+       if (flags & GL_NEVER_RECURSE)
+               gh->gh_owner = NULL;
+       else
+               gh->gh_owner = current;
+
+       gh->gh_state = state;
+       gh->gh_flags = flags;
+       if (gh->gh_state == LM_ST_EXCLUSIVE)
+               gh->gh_flags |= GL_LOCAL_EXCL;
+
+       gh->gh_error = 0;
+       gh->gh_iflags = 0;
+
+       gfs2_glock_hold(gl);
+}
+
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock.
+ *
+ * The requested state and flags are replaced (LM_ST_EXCLUSIVE again
+ * implies GL_LOCAL_EXCL).  Of the internal flags only HIF_ALLOCED is
+ * kept; every other bit of gh_iflags is cleared.
+ *
+ */
+
+void gfs2_holder_reinit(unsigned int state, int flags, struct gfs2_holder *gh)
+{
+       gh->gh_state = state;
+       gh->gh_flags = flags;
+       if (gh->gh_state == LM_ST_EXCLUSIVE)
+               gh->gh_flags |= GL_LOCAL_EXCL;
+
+       /* Mask, not assignment: preserves HIF_ALLOCED if it was set */
+       gh->gh_iflags &= 1 << HIF_ALLOCED;
+}
+
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ * Puts the glock reference held through gh->gh_gl and clears the pointer.
+ * The holder structure itself is not freed.
+ *
+ */
+
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+       gfs2_glock_put(gh->gh_gl);
+       gh->gh_gl = NULL;
+}
+
+/**
+ * gfs2_holder_get - get a struct gfs2_holder structure
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gfp_flags: allocation flags handed to kmalloc() (e.g. __GFP_NOFAIL)
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
+ * 2) Leave it like it is
+ *
+ * The new holder is initialized with gfs2_holder_init() and marked
+ * HIF_ALLOCED.
+ *
+ * Returns: the holder structure, NULL on ENOMEM
+ */
+
+struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
+                                   int flags, gfp_t gfp_flags)
+{
+       struct gfs2_holder *gh;
+
+       gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
+       if (!gh)
+               return NULL;
+
+       gfs2_holder_init(gl, state, flags, gh);
+       set_bit(HIF_ALLOCED, &gh->gh_iflags);
+
+       return gh;
+}
+
+/**
+ * gfs2_holder_put - get rid of a struct gfs2_holder structure
+ * @gh: the holder structure
+ *
+ * Uninitializes the holder (dropping its glock reference) and then frees
+ * it with kfree().
+ *
+ */
+
+void gfs2_holder_put(struct gfs2_holder *gh)
+{
+       gfs2_holder_uninit(gh);
+       kfree(gh);
+}
+
+/**
+ * handle_recurse - put other holder structures (marked recursive)
+ *                  into the holders list
+ * @gh: the holder structure
+ *
+ * Every holder on gl_waiters3 with the same owner as @gh is moved to
+ * gl_holders, marked HIF_HOLDER with a zero error, and its waiter is
+ * completed.  Warns if @gh has no owner or if no such holder was found.
+ *
+ * NOTE(review): touches the glock's lists without locking here, so the
+ * caller presumably holds gl_spin -- confirm.
+ *
+ */
+
+static void handle_recurse(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_holder *tmp_gh, *safe;
+       int found = 0;
+
+       if (gfs2_assert_warn(sdp, gh->gh_owner))
+               return;
+
+       /* _safe iteration: entries are moved off the list as we go */
+       list_for_each_entry_safe(tmp_gh, safe, &gl->gl_waiters3, gh_list) {
+               if (tmp_gh->gh_owner != gh->gh_owner)
+                       continue;
+
+               gfs2_assert_warn(sdp,
+                                test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));
+
+               list_move_tail(&tmp_gh->gh_list, &gl->gl_holders);
+               tmp_gh->gh_error = 0;
+               set_bit(HIF_HOLDER, &tmp_gh->gh_iflags);
+
+               complete(&tmp_gh->gh_wait);
+
+               found = 1;
+       }
+
+       gfs2_assert_warn(sdp, found);
+}
+
+/**
+ * do_unrecurse - a recursive holder was just dropped off the waiters3 list
+ * @gh: the holder
+ *
+ * If there is only one other recursive holder, clear its HIF_RECURSE bit.
+ * If there is more than one, leave them alone.
+ *
+ * "Other" means a holder on gl_waiters3 with the same owner as @gh.
+ * Warns if @gh has no owner or if no other holder was found.
+ *
+ */
+
+static void do_unrecurse(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_holder *tmp_gh, *last_gh = NULL;
+       int found = 0;
+
+       if (gfs2_assert_warn(sdp, gh->gh_owner))
+               return;
+
+       list_for_each_entry(tmp_gh, &gl->gl_waiters3, gh_list) {
+               if (tmp_gh->gh_owner != gh->gh_owner)
+                       continue;
+
+               gfs2_assert_warn(sdp,
+                                test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));
+
+               /* A second match: more than one remains, nothing to do */
+               if (found)
+                       return;
+
+               found = 1;
+               last_gh = tmp_gh;
+       }
+
+       /* Exactly one match: it is no longer recursive */
+       if (!gfs2_assert_warn(sdp, found))
+               clear_bit(HIF_RECURSE, &last_gh->gh_iflags);
+}
+
+/**
+ * rq_mutex - process a mutex request in the queue
+ * @gh: the glock holder
+ *
+ * Takes the holder off its list, sets GLF_LOCK on the glock on behalf of
+ * the waiter and wakes it up.  The queue is always reported as blocked
+ * afterwards, since GLF_LOCK is now set.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_mutex(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+
+       list_del_init(&gh->gh_list);
+       /*  gh->gh_error never examined.  */
+       set_bit(GLF_LOCK, &gl->gl_flags);
+       complete(&gh->gh_wait);
+
+       return 1;
+}
+
+/**
+ * rq_promote - process a promote request in the queue
+ * @gh: the glock holder
+ *
+ * Acquire a new inter-node lock, or change a lock state to more restrictive.
+ *
+ * Entered with gl_spin held; the lock is dropped and retaken around the
+ * calls into reclaim and the glock operations.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_promote(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+       int recurse;
+
+       /* The current state does not satisfy the request.  If nobody holds
+          the glock, start a state change for this holder; either way the
+          queue is blocked until something changes. */
+       if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+               if (list_empty(&gl->gl_holders)) {
+                       gl->gl_req_gh = gh;
+                       set_bit(GLF_LOCK, &gl->gl_flags);
+                       spin_unlock(&gl->gl_spin);
+
+                       /* Over the reclaim limit: reclaim two glocks
+                          first, unless this is a priority request */
+                       if (atomic_read(&sdp->sd_reclaim_count) >
+                           gfs2_tune_get(sdp, gt_reclaim_limit) &&
+                           !(gh->gh_flags & LM_FLAG_PRIORITY)) {
+                               gfs2_reclaim_glock(sdp);
+                               gfs2_reclaim_glock(sdp);
+                       }
+
+                       glops->go_xmote_th(gl, gh->gh_state,
+                                          gh->gh_flags);
+
+                       spin_lock(&gl->gl_spin);
+               }
+               return 1;
+       }
+
+       /* The state is compatible: grant locally if possible */
+       if (list_empty(&gl->gl_holders)) {
+               set_bit(HIF_FIRST, &gh->gh_iflags);
+               set_bit(GLF_LOCK, &gl->gl_flags);
+               recurse = 0;
+       } else {
+               struct gfs2_holder *next_gh;
+               /* GL_LOCAL_EXCL on either this request or the first
+                  existing holder means they cannot coexist */
+               if (gh->gh_flags & GL_LOCAL_EXCL)
+                       return 1;
+               next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
+                                    gh_list);
+               if (next_gh->gh_flags & GL_LOCAL_EXCL)
+                        return 1;
+               recurse = test_bit(HIF_RECURSE, &gh->gh_iflags);
+       }
+
+       list_move_tail(&gh->gh_list, &gl->gl_holders);
+       gh->gh_error = 0;
+       set_bit(HIF_HOLDER, &gh->gh_iflags);
+
+       if (recurse)
+               handle_recurse(gh);
+
+       complete(&gh->gh_wait);
+
+       return 0;
+}
+
+/**
+ * rq_demote - process a demote request in the queue
+ * @gh: the glock holder
+ *
+ * A demote cannot proceed while the glock has holders.  If the glock is
+ * already in the requested state (or unlocked) the request is finished
+ * immediately; otherwise a state change is started through the glock
+ * operations.  gl_spin is dropped and retaken around those steps.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_demote(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       if (!list_empty(&gl->gl_holders))
+               return 1;
+
+       if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
+               /* Nothing to change: retire the request */
+               list_del_init(&gh->gh_list);
+               gh->gh_error = 0;
+               spin_unlock(&gl->gl_spin);
+               /* HIF_DEALLOC holders have no waiter and free themselves */
+               if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                       gfs2_holder_put(gh);
+               else
+                       complete(&gh->gh_wait);
+               spin_lock(&gl->gl_spin);
+       } else {
+               gl->gl_req_gh = gh;
+               set_bit(GLF_LOCK, &gl->gl_flags);
+               spin_unlock(&gl->gl_spin);
+
+               /* Drop the lock entirely when unlocking is requested or
+                  the glock is not held exclusively; otherwise change
+                  the state to the requested one */
+               if (gh->gh_state == LM_ST_UNLOCKED ||
+                   gl->gl_state != LM_ST_EXCLUSIVE)
+                       glops->go_drop_th(gl);
+               else
+                       glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+
+               spin_lock(&gl->gl_spin);
+       }
+
+       return 0;
+}
+
+/**
+ * rq_greedy - process a queued request to drop greedy status
+ * @gh: the glock holder
+ *
+ * Clears GLF_GREEDY on the glock and frees the struct greedy that embeds
+ * @gh.  gl_spin is dropped while the holder is torn down and retaken
+ * before returning.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_greedy(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+
+       list_del_init(&gh->gh_list);
+       /*  gh->gh_error never examined.  */
+       clear_bit(GLF_GREEDY, &gl->gl_flags);
+       spin_unlock(&gl->gl_spin);
+
+       /* gl was saved above: gfs2_holder_uninit() clears gh->gh_gl */
+       gfs2_holder_uninit(gh);
+       kfree(container_of(gh, struct greedy, gr_gh));
+
+       spin_lock(&gl->gl_spin);                
+
+       return 0;
+}
+
+/**
+ * run_queue - process holder structures on a glock
+ * @gl: the glock
+ *
+ * Repeatedly services the first entry of the highest-priority non-empty
+ * queue: gl_waiters1 (mutex requests), then gl_waiters2 (demote and
+ * greedy requests, unless GLF_SKIP_WAITERS2 is set), then gl_waiters3
+ * (promote requests).  Stops when GLF_LOCK is set, when all eligible
+ * queues are empty, or when a handler reports the queue as blocked.
+ *
+ */
+
+static void run_queue(struct gfs2_glock *gl)
+{
+       struct gfs2_holder *gh;
+       int blocked = 1;
+
+       for (;;) {
+               if (test_bit(GLF_LOCK, &gl->gl_flags))
+                       break;
+
+               if (!list_empty(&gl->gl_waiters1)) {
+                       gh = list_entry(gl->gl_waiters1.next,
+                                       struct gfs2_holder, gh_list);
+
+                       if (test_bit(HIF_MUTEX, &gh->gh_iflags))
+                               blocked = rq_mutex(gh);
+                       else
+                               gfs2_assert_warn(gl->gl_sbd, 0);
+
+               } else if (!list_empty(&gl->gl_waiters2) &&
+                          !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
+                       gh = list_entry(gl->gl_waiters2.next,
+                                       struct gfs2_holder, gh_list);
+
+                       if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
+                               blocked = rq_demote(gh);
+                       else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
+                               blocked = rq_greedy(gh);
+                       else
+                               gfs2_assert_warn(gl->gl_sbd, 0);
+
+               } else if (!list_empty(&gl->gl_waiters3)) {
+                       gh = list_entry(gl->gl_waiters3.next,
+                                       struct gfs2_holder, gh_list);
+
+                       if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
+                               blocked = rq_promote(gh);
+                       else
+                               gfs2_assert_warn(gl->gl_sbd, 0);
+
+               } else
+                       break;
+
+               /* In the unexpected-entry (assert) paths above, blocked
+                  keeps whatever value it had before */
+               if (blocked)
+                       break;
+       }
+}
+
+/**
+ * gfs2_glmutex_lock - acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Gives caller exclusive access to manipulate a glock structure.
+ *
+ * Uses an on-stack holder marked HIF_MUTEX.  If GLF_LOCK was already set
+ * the holder is queued on gl_waiters1 and the caller sleeps until it is
+ * completed; otherwise the caller now owns GLF_LOCK and the holder is
+ * completed right away so the wait below returns immediately.
+ */
+
+void gfs2_glmutex_lock(struct gfs2_glock *gl)
+{
+       struct gfs2_holder gh;
+
+       gfs2_holder_init(gl, 0, 0, &gh);
+       set_bit(HIF_MUTEX, &gh.gh_iflags);
+
+       spin_lock(&gl->gl_spin);
+       if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+               list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+       else
+               complete(&gh.gh_wait);
+       spin_unlock(&gl->gl_spin);
+
+       wait_for_completion(&gh.gh_wait);
+       gfs2_holder_uninit(&gh);
+}
+
+/**
+ * gfs2_glmutex_trylock - try to acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Sets GLF_LOCK under gl_spin without ever sleeping.
+ *
+ * Returns: 1 if the glock is acquired
+ */
+
+int gfs2_glmutex_trylock(struct gfs2_glock *gl)
+{
+       int was_locked;
+
+       spin_lock(&gl->gl_spin);
+       was_locked = test_and_set_bit(GLF_LOCK, &gl->gl_flags);
+       spin_unlock(&gl->gl_spin);
+
+       /* We own it only if the bit was clear before we set it */
+       return !was_locked;
+}
+
+/**
+ * gfs2_glmutex_unlock - release a local lock on a glock
+ * @gl: the glock
+ *
+ * Clears GLF_LOCK and, still under gl_spin, runs the glock's queues so
+ * that pending requests can make progress.
+ *
+ */
+
+void gfs2_glmutex_unlock(struct gfs2_glock *gl)
+{
+       spin_lock(&gl->gl_spin);
+       clear_bit(GLF_LOCK, &gl->gl_flags);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * handle_callback - add a demote request to a lock's queue
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * If a demote request that is not the one currently being serviced is
+ * already queued on gl_waiters2, it is reused: when it asks for a state
+ * other than @state it is turned into a request for LM_ST_UNLOCKED.
+ * Otherwise a new self-freeing (HIF_DEALLOC) demote holder is allocated
+ * with gl_spin dropped, and the search is repeated before it is queued.
+ *
+ */
+
+static void handle_callback(struct gfs2_glock *gl, unsigned int state)
+{
+       struct gfs2_holder *gh, *new_gh = NULL;
+
+ restart:
+       spin_lock(&gl->gl_spin);
+
+       list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+               if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
+                   gl->gl_req_gh != gh) {
+                       if (gh->gh_state != state)
+                               gh->gh_state = LM_ST_UNLOCKED;
+                       goto out;
+               }
+       }
+
+       if (new_gh) {
+               list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
+               new_gh = NULL;
+       } else {
+               /* Allocate with the spin lock dropped, then look again:
+                  a matching request may have been queued meanwhile */
+               spin_unlock(&gl->gl_spin);
+
+               new_gh = gfs2_holder_get(gl, state,
+                                        LM_FLAG_TRY | GL_NEVER_RECURSE,
+                                        GFP_KERNEL | __GFP_NOFAIL);
+               set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
+               set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+
+               goto restart;
+       }
+
+ out:
+       spin_unlock(&gl->gl_spin);
+
+       /* Still set only if an existing request was reused after we had
+          already allocated a new one */
+       if (new_gh)
+               gfs2_holder_put(new_gh);
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ * A glock in any state other than LM_ST_UNLOCKED counts as "held": on the
+ * transition to held a reference is taken and sd_glock_held_count is
+ * raised; on the transition back both are undone.
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       int held1, held2;
+
+       held1 = (gl->gl_state != LM_ST_UNLOCKED);
+       held2 = (new_state != LM_ST_UNLOCKED);
+
+       if (held1 != held2) {
+               if (held2) {
+                       atomic_inc(&sdp->sd_glock_held_count);
+                       gfs2_glock_hold(gl);
+               } else {
+                       atomic_dec(&sdp->sd_glock_held_count);
+                       gfs2_glock_put(gl);
+               }
+       }
+
+       /* NOTE(review): gl is written after the put above, so callers
+          presumably hold another reference of their own -- confirm. */
+       gl->gl_state = new_state;
+}
+
+/**
+ * xmote_bh - Called after the lock module is done acquiring a lock
+ * @gl: The glock in question
+ * @ret: the int returned from the lock module
+ *
+ * Records the new state, invalidates cached data where required, and then
+ * resolves the request that triggered the state change (gl->gl_req_gh, if
+ * any) according to the outcome.  Finally drops one glock reference and
+ * either frees the holder (HIF_DEALLOC) or wakes its waiter.
+ *
+ */
+
+static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+       struct gfs2_holder *gh = gl->gl_req_gh;
+       int prev_state = gl->gl_state;
+       int op_done = 1;
+
+       gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+       gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+       gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
+
+       state_change(gl, ret & LM_OUT_ST_MASK);
+
+       /* A previously held lock whose contents are not reported as
+          cacheable must have its cached metadata and data invalidated */
+       if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
+               if (glops->go_inval)
+                       glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+       } else if (gl->gl_state == LM_ST_DEFERRED) {
+               /* We might not want to do this here.
+                  Look at moving to the inode glops. */
+               if (glops->go_inval)
+                       glops->go_inval(gl, DIO_DATA);
+       }
+
+       /*  Deal with each possible exit condition  */
+
+       /* No requesting holder: just timestamp the glock */
+       if (!gh)
+               gl->gl_stamp = jiffies;
+
+       /* Filesystem is shut down: fail the request with -EIO */
+       else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+               spin_lock(&gl->gl_spin);
+               list_del_init(&gh->gh_list);
+               gh->gh_error = -EIO;
+               if (test_bit(HIF_RECURSE, &gh->gh_iflags))
+                       do_unrecurse(gh);
+               spin_unlock(&gl->gl_spin);
+
+       /* Demote request: succeeded if we reached the requested state or
+          ended up unlocked, otherwise report a failed try */
+       } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
+               spin_lock(&gl->gl_spin);
+               list_del_init(&gh->gh_list);
+               if (gl->gl_state == gh->gh_state ||
+                   gl->gl_state == LM_ST_UNLOCKED)
+                       gh->gh_error = 0;
+               else {
+                       if (gfs2_assert_warn(sdp, gh->gh_flags &
+                                       (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
+                               fs_warn(sdp, "ret = 0x%.8X\n", ret);
+                       gh->gh_error = GLR_TRYFAILED;
+               }
+               spin_unlock(&gl->gl_spin);
+
+               if (ret & LM_OUT_CANCELED)
+                       handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
+
+       /* Promote request that was canceled */
+       } else if (ret & LM_OUT_CANCELED) {
+               spin_lock(&gl->gl_spin);
+               list_del_init(&gh->gh_list);
+               gh->gh_error = GLR_CANCELED;
+               if (test_bit(HIF_RECURSE, &gh->gh_iflags))
+                       do_unrecurse(gh);
+               spin_unlock(&gl->gl_spin);
+
+       /* Promote request satisfied: the holder becomes the first holder.
+          GLF_LOCK is deliberately left set (op_done = 0). */
+       } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+               spin_lock(&gl->gl_spin);
+               list_move_tail(&gh->gh_list, &gl->gl_holders);
+               gh->gh_error = 0;
+               set_bit(HIF_HOLDER, &gh->gh_iflags);
+               spin_unlock(&gl->gl_spin);
+
+               set_bit(HIF_FIRST, &gh->gh_iflags);
+
+               op_done = 0;
+
+       /* "Try" request that did not get the state it wanted */
+       } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+               spin_lock(&gl->gl_spin);
+               list_del_init(&gh->gh_list);
+               gh->gh_error = GLR_TRYFAILED;
+               if (test_bit(HIF_RECURSE, &gh->gh_iflags))
+                       do_unrecurse(gh);
+               spin_unlock(&gl->gl_spin);
+
+       /* Anything else is unexpected */
+       } else {
+               if (gfs2_assert_withdraw(sdp, 0) == -1)
+                       fs_err(sdp, "ret = 0x%.8X\n", ret);
+       }
+
+       if (glops->go_xmote_bh)
+               glops->go_xmote_bh(gl);
+
+       /* Request finished: release GLF_LOCK and let the queues run */
+       if (op_done) {
+               spin_lock(&gl->gl_spin);
+               gl->gl_req_gh = NULL;
+               gl->gl_req_bh = NULL;
+               clear_bit(GLF_LOCK, &gl->gl_flags);
+               run_queue(gl);
+               spin_unlock(&gl->gl_spin);
+       }
+
+       gfs2_glock_put(gl);
+
+       if (gh) {
+               if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                       gfs2_holder_put(gh);
+               else
+                       complete(&gh->gh_wait);
+       }
+}
+
+/**
+ * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
+ * @gl: The glock in question
+ * @state: the requested state (must differ from the current state and must
+ *         not be LM_ST_UNLOCKED; both are asserted)
+ * @flags: modifier flags to the lock call; only the LM_FLAG_TRY,
+ *         LM_FLAG_TRY_1CB, LM_FLAG_NOEXP, LM_FLAG_ANY and LM_FLAG_PRIORITY
+ *         bits are passed on to the lock module
+ *
+ * Installs xmote_bh() as the glock's bottom half.  If the lock module
+ * completes the request synchronously, xmote_bh() is called directly with
+ * the result; if it returns LM_OUT_ASYNC, nothing more is done here.
+ */
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+       int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
+                                LM_FLAG_NOEXP | LM_FLAG_ANY |
+                                LM_FLAG_PRIORITY);
+       unsigned int lck_ret;
+
+       gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+       gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+       gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
+       gfs2_assert_warn(sdp, state != gl->gl_state);
+
+       /* Leaving the exclusive state: sync metadata and data (with
+          DIO_RELEASE) before asking for the new state. */
+       if (gl->gl_state == LM_ST_EXCLUSIVE) {
+               if (glops->go_sync)
+                       glops->go_sync(gl,
+                                      DIO_METADATA | DIO_DATA | DIO_RELEASE);
+       }
+
+       /* Reference for the bottom half; xmote_bh() does the matching put. */
+       gfs2_glock_hold(gl);
+       gl->gl_req_bh = xmote_bh;
+
+       atomic_inc(&sdp->sd_lm_lock_calls);
+
+       lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
+                              lck_flags);
+
+       /* NOTE(review): this early return skips xmote_bh(), so the hold taken
+          above is not dropped in this function - confirm that is intended
+          on the withdraw path. */
+       if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
+               return;
+
+       if (lck_ret & LM_OUT_ASYNC)
+               gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
+       else
+               xmote_bh(gl, lck_ret);
+}
+
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status (expected to be 0; asserted)
+ *
+ * Moves the glock to LM_ST_UNLOCKED, invalidates metadata and data via
+ * go_inval, clears the pending request and GLF_LOCK, and reruns the queue.
+ *
+ * If there is a requesting holder it is dequeued with gh_error = 0 and then
+ * either put (HIF_DEALLOC) or woken through complete(&gh->gh_wait).
+ * Calls gfs2_glock_put() on the glock; this presumably pairs with the
+ * gfs2_glock_hold() taken by gfs2_glock_drop_th().
+ *
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+       struct gfs2_holder *gh = gl->gl_req_gh;
+
+       clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+       gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+       gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+       gfs2_assert_warn(sdp, !ret);
+
+       state_change(gl, LM_ST_UNLOCKED);
+
+       if (glops->go_inval)
+               glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+
+       /* Take the requesting holder (if any) off its queue and mark the
+          request successful. */
+       if (gh) {
+               spin_lock(&gl->gl_spin);
+               list_del_init(&gh->gh_list);
+               gh->gh_error = 0;
+               spin_unlock(&gl->gl_spin);
+       }
+
+       if (glops->go_drop_bh)
+               glops->go_drop_bh(gl);
+
+       /* The operation is finished: clear the request, release GLF_LOCK
+          and let the next queued request run. */
+       spin_lock(&gl->gl_spin);
+       gl->gl_req_gh = NULL;
+       gl->gl_req_bh = NULL;
+       clear_bit(GLF_LOCK, &gl->gl_flags);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+
+       gfs2_glock_put(gl);
+
+       if (gh) {
+               if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                       gfs2_holder_put(gh);
+               else
+                       complete(&gh->gh_wait);
+       }
+}
+
+/**
+ * gfs2_glock_drop_th - call into the lock module to unlock a lock
+ * @gl: the glock (expected to have GLF_LOCK set, no holders, and not to be
+ *      in LM_ST_UNLOCKED already; all three are asserted)
+ *
+ * Installs drop_bh() as the glock's bottom half.  If gfs2_lm_unlock()
+ * returns 0, drop_bh() is called directly; otherwise the result is expected
+ * to be LM_OUT_ASYNC and nothing more is done here.
+ */
+
+void gfs2_glock_drop_th(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+       unsigned int ret;
+
+       gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+       gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+       gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
+
+       /* Leaving the exclusive state: sync metadata and data (with
+          DIO_RELEASE) before unlocking. */
+       if (gl->gl_state == LM_ST_EXCLUSIVE) {
+               if (glops->go_sync)
+                       glops->go_sync(gl,
+                                      DIO_METADATA | DIO_DATA | DIO_RELEASE);
+       }
+
+       /* Reference for the bottom half; drop_bh() does the matching put. */
+       gfs2_glock_hold(gl);
+       gl->gl_req_bh = drop_bh;
+
+       atomic_inc(&sdp->sd_lm_unlock_calls);
+
+       ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
+
+       /* NOTE(review): this early return skips drop_bh(), so the hold taken
+          above is not dropped in this function - confirm that is intended
+          on the withdraw path. */
+       if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
+               return;
+
+       if (!ret)
+               drop_bh(gl, ret);
+       else
+               gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
+}
+
+/**
+ * do_cancels - cancel requests for locks stuck waiting on an expire flag
+ * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
+ *
+ * Polls every 100ms until @gh becomes the active request, becomes a holder,
+ * or is taken off its queue.  On each pass, an outstanding lock module
+ * request is cancelled unless its holder has GL_NOCANCEL set.
+ */
+
+static void do_cancels(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       int cancel;
+
+       spin_lock(&gl->gl_spin);
+
+       for (;;) {
+               if (gl->gl_req_gh == gh ||
+                   test_bit(HIF_HOLDER, &gh->gh_iflags) ||
+                   list_empty(&gh->gh_list))
+                       break;
+
+               /* Decide under the spinlock whether the pending request
+                  may be cancelled. */
+               cancel = gl->gl_req_bh &&
+                        (!gl->gl_req_gh ||
+                         !(gl->gl_req_gh->gh_flags & GL_NOCANCEL));
+
+               spin_unlock(&gl->gl_spin);
+               if (cancel)
+                       gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+               msleep(100);
+               spin_lock(&gl->gl_spin);
+       }
+
+       spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * glock_wait_internal - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success, -EIO if the holder is marked HIF_ABORTED,
+ *          otherwise the value left in gh->gh_error (e.g. GLR_TRYFAILED,
+ *          GLR_CANCELED, or the error from the go_lock operation)
+ */
+
+static int glock_wait_internal(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       if (test_bit(HIF_ABORTED, &gh->gh_iflags))
+               return -EIO;
+
+       /* A TRY request that is still just sitting on a queue (not the
+          active request, not a holder) is failed right away instead of
+          being waited for. */
+       if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+               spin_lock(&gl->gl_spin);
+               if (gl->gl_req_gh != gh &&
+                   !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+                   !list_empty(&gh->gh_list)) {
+                       list_del_init(&gh->gh_list);
+                       gh->gh_error = GLR_TRYFAILED;
+                       if (test_bit(HIF_RECURSE, &gh->gh_iflags))
+                               do_unrecurse(gh);
+                       run_queue(gl);
+                       spin_unlock(&gl->gl_spin);
+                       return gh->gh_error;
+               }
+               spin_unlock(&gl->gl_spin);
+       }
+
+       if (gh->gh_flags & LM_FLAG_PRIORITY)
+               do_cancels(gh);
+
+       wait_for_completion(&gh->gh_wait);
+
+       if (gh->gh_error)
+               return gh->gh_error;
+
+       gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
+       gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
+                                                  gh->gh_state,
+                                                  gh->gh_flags));
+
+       /* The first holder after a state change still has GLF_LOCK set
+          (xmote_bh() leaves it set when it marks HIF_FIRST); run the
+          go_lock operation and then release GLF_LOCK here. */
+       if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
+               gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+
+               if (glops->go_lock) {
+                       gh->gh_error = glops->go_lock(gh);
+                       if (gh->gh_error) {
+                               /* go_lock failed: take the holder back off
+                                  the glock. */
+                               spin_lock(&gl->gl_spin);
+                               list_del_init(&gh->gh_list);
+                               if (test_and_clear_bit(HIF_RECURSE,
+                                                      &gh->gh_iflags))
+                                       do_unrecurse(gh);
+                               spin_unlock(&gl->gl_spin);
+                       }
+               }
+
+               spin_lock(&gl->gl_spin);
+               gl->gl_req_gh = NULL;
+               gl->gl_req_bh = NULL;
+               clear_bit(GLF_LOCK, &gl->gl_flags);
+               if (test_bit(HIF_RECURSE, &gh->gh_iflags))
+                       handle_recurse(gh);
+               run_queue(gl);
+               spin_unlock(&gl->gl_spin);
+       }
+
+       return gh->gh_error;
+}
+
+/**
+ * find_holder_by_owner - look up the first holder on a list with a given owner
+ * @head: the list of holders (linked through gh_list)
+ * @owner: the task to match against gh_owner
+ *
+ * Returns: the first matching holder, or NULL if there is none
+ */
+
+static inline struct gfs2_holder *
+find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+{
+       struct gfs2_holder *found = NULL;
+       struct gfs2_holder *pos;
+
+       list_for_each_entry(pos, head, gh_list) {
+               if (pos->gh_owner != owner)
+                       continue;
+               found = pos;
+               break;
+       }
+
+       return found;
+}
+
+/**
+ * recurse_check - check a recursive request against the owner's earlier one
+ * @existing: the holder the same owner already has on this glock
+ * @new: the holder being added
+ * @state: the state to test @new against (callers pass either the glock's
+ *         current state or @existing's requested state)
+ *
+ * Make sure the new holder is compatible with the pre-existing one.
+ * Three conditions are asserted:
+ *   - if @existing has LM_FLAG_ANY, @new must have it too
+ *   - if @new has GL_LOCAL_EXCL, @existing must have it too
+ *   - @state must satisfy @new according to relaxed_state_ok()
+ *
+ * Returns: 0 if compatible; otherwise sets HIF_ABORTED on @new and
+ *          returns -EINVAL
+ */
+
+static int recurse_check(struct gfs2_holder *existing, struct gfs2_holder *new,
+                        unsigned int state)
+{
+       struct gfs2_sbd *sdp = existing->gh_gl->gl_sbd;
+
+       /* gfs2_assert_warn() is used here as returning non-zero when the
+          asserted condition is false - TODO confirm against its
+          definition. */
+       if (gfs2_assert_warn(sdp, (new->gh_flags & LM_FLAG_ANY) ||
+                                 !(existing->gh_flags & LM_FLAG_ANY)))
+               goto fail;
+
+       if (gfs2_assert_warn(sdp, (existing->gh_flags & GL_LOCAL_EXCL) ||
+                                 !(new->gh_flags & GL_LOCAL_EXCL)))
+               goto fail;
+
+       if (gfs2_assert_warn(sdp, relaxed_state_ok(state, new->gh_state,
+                                                  new->gh_flags)))
+               goto fail;
+
+       return 0;
+
+ fail:
+       set_bit(HIF_ABORTED, &new->gh_iflags);
+       return -EINVAL;
+}
+
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ * If the owner of @gh already holds the glock, @gh is granted immediately.
+ * If the owner already has a request waiting on gl_waiters3, both requests
+ * are flagged HIF_RECURSE and @gh is queued behind it.  Otherwise @gh goes
+ * onto gl_waiters3, at the head for LM_FLAG_PRIORITY requests and at the
+ * tail for everything else.  If recurse_check() rejects @gh it is not
+ * queued at all (recurse_check() marks it HIF_ABORTED).
+ *
+ * Called from gfs2_glock_nq() with gl_spin held.
+ */
+
+static void add_to_queue(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_holder *existing;
+
+       /* Holders without an owner skip the recursion checks. */
+       if (!gh->gh_owner)
+               goto out;
+
+       existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+       if (existing) {
+               if (recurse_check(existing, gh, gl->gl_state))
+                       return;
+
+               /* The owner already holds this glock: grant the new holder
+                  straight away and complete its wait. */
+               list_add_tail(&gh->gh_list, &gl->gl_holders);
+               set_bit(HIF_HOLDER, &gh->gh_iflags);
+
+               gh->gh_error = 0;
+               complete(&gh->gh_wait);
+
+               return;
+       }
+
+       existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+       if (existing) {
+               if (recurse_check(existing, gh, existing->gh_state))
+                       return;
+
+               set_bit(HIF_RECURSE, &gh->gh_iflags);
+               set_bit(HIF_RECURSE, &existing->gh_iflags);
+
+               list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+
+               return;
+       }
+
+ out:
+       if (gh->gh_flags & LM_FLAG_PRIORITY)
+               list_add(&gh->gh_list, &gl->gl_waiters3);
+       else
+               list_add_tail(&gh->gh_list, &gl->gl_waiters3);  
+}
+
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), the request is only queued and not waited
+ * for; the only error returned in that case is -EIO when the filesystem has
+ * SDF_SHUTDOWN set.
+ *
+ * A synchronous request that comes back as GLR_CANCELED is retried after a
+ * one second sleep.
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       int error = 0;
+
+       atomic_inc(&sdp->sd_glock_nq_calls);
+
+ restart:
+       if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+               set_bit(HIF_ABORTED, &gh->gh_iflags);
+               return -EIO;
+       }
+
+       set_bit(HIF_PROMOTE, &gh->gh_iflags);
+
+       spin_lock(&gl->gl_spin);
+       add_to_queue(gh);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+
+       if (!(gh->gh_flags & GL_ASYNC)) {
+               error = glock_wait_internal(gh);
+               if (error == GLR_CANCELED) {
+                       msleep(1000);
+                       goto restart;
+               }
+       }
+
+       clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+       return error;
+}
+
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * A request that was cancelled (GLR_CANCELED) is re-enqueued here after a
+ * one second sleep; in that case 1 is returned only if the re-enqueue
+ * itself failed, so that the caller collects the error.
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       int ready = 0;
+
+       spin_lock(&gl->gl_spin);
+
+       if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+               ready = 1;
+       else if (list_empty(&gh->gh_list)) {
+               /* Off every queue without being a holder: the request has
+                  finished with some result other than being granted. */
+               if (gh->gh_error == GLR_CANCELED) {
+                       spin_unlock(&gl->gl_spin);
+                       msleep(1000);
+                       if (gfs2_glock_nq(gh))
+                               return 1;
+                       return 0;
+               } else
+                       ready = 1;
+       }
+
+       spin_unlock(&gl->gl_spin);
+
+       return ready;
+}
+
+/**
+ * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
+ * @gh: the holder structure
+ *
+ * If the request was cancelled (GLR_CANCELED), GL_ASYNC is cleared from the
+ * holder's flags and, after a one second sleep, the request is re-issued
+ * synchronously through gfs2_glock_nq().
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+       int error;
+
+       error = glock_wait_internal(gh);
+       if (error == GLR_CANCELED) {
+               msleep(1000);
+               gh->gh_flags &= ~GL_ASYNC;
+               error = gfs2_glock_nq(gh);
+       }
+
+       return error;
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ * GL_SYNC on the holder sets GLF_SYNC on the glock; GL_NOCACHE queues a
+ * callback asking for the glock to be unlocked.  When the last holder is
+ * removed, the go_unlock operation is run, the glock is synced if GLF_SYNC
+ * is set, and gl_stamp is refreshed.
+ */
+
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       atomic_inc(&sdp->sd_glock_dq_calls);
+
+       if (gh->gh_flags & GL_SYNC)
+               set_bit(GLF_SYNC, &gl->gl_flags);
+
+       if (gh->gh_flags & GL_NOCACHE)
+               handle_callback(gl, LM_ST_UNLOCKED);
+
+       /* Released by the clear_bit(GLF_LOCK) at the end of this function
+          (assumes gfs2_glmutex_lock() acquires GLF_LOCK - TODO confirm). */
+       gfs2_glmutex_lock(gl);
+
+       spin_lock(&gl->gl_spin);
+       list_del_init(&gh->gh_list);
+
+       if (list_empty(&gl->gl_holders)) {
+               /* Last holder gone: drop the spinlock around the unlock and
+                  sync operations. */
+               spin_unlock(&gl->gl_spin);
+
+               if (glops->go_unlock)
+                       glops->go_unlock(gh);
+
+               if (test_bit(GLF_SYNC, &gl->gl_flags)) {
+                       if (glops->go_sync)
+                               glops->go_sync(gl, DIO_METADATA | DIO_DATA);
+               }
+
+               gl->gl_stamp = jiffies;
+
+               spin_lock(&gl->gl_spin);
+       }
+
+       clear_bit(GLF_LOCK, &gl->gl_flags);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * gfs2_glock_prefetch - Try to prefetch a glock
+ * @gl: the glock
+ * @state: the state to prefetch in
+ * @flags: flags passed to go_xmote_th()
+ *
+ * Does nothing if the glock is busy (GLF_LOCK set), has any holders or
+ * waiters, or is already in a state that satisfies @state.  Otherwise marks
+ * the glock GLF_PREFETCH, takes GLF_LOCK and starts the state change.
+ */
+
+void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       spin_lock(&gl->gl_spin);
+
+       if (test_bit(GLF_LOCK, &gl->gl_flags) ||
+           !list_empty(&gl->gl_holders) ||
+           !list_empty(&gl->gl_waiters1) ||
+           !list_empty(&gl->gl_waiters2) ||
+           !list_empty(&gl->gl_waiters3) ||
+           relaxed_state_ok(gl->gl_state, state, flags)) {
+               spin_unlock(&gl->gl_spin);
+               return;
+       }
+
+       set_bit(GLF_PREFETCH, &gl->gl_flags);
+       set_bit(GLF_LOCK, &gl->gl_flags);
+       spin_unlock(&gl->gl_spin);
+
+       /* go_xmote_th is called without a NULL check here. */
+       glops->go_xmote_th(gl, state, flags);
+
+       atomic_inc(&gl->gl_sbd->sd_glock_prefetch_calls);
+}
+
+/**
+ * gfs2_glock_force_drop - Force a glock to be uncached
+ * @gl: the glock
+ *
+ * Queues an on-stack HIF_DEMOTE holder for LM_ST_UNLOCKED on gl_waiters2
+ * and blocks until that holder's gh_wait is completed.
+ */
+
+void gfs2_glock_force_drop(struct gfs2_glock *gl)
+{
+       struct gfs2_holder gh;
+
+       gfs2_holder_init(gl, LM_ST_UNLOCKED, GL_NEVER_RECURSE, &gh);
+       set_bit(HIF_DEMOTE, &gh.gh_iflags);
+
+       spin_lock(&gl->gl_spin);
+       list_add_tail(&gh.gh_list, &gl->gl_waiters2);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+
+       /* gh lives on this stack frame, so it must not be left queued when
+          we return; wait for the request to finish first. */
+       wait_for_completion(&gh.gh_wait);
+       gfs2_holder_uninit(&gh);
+}
+
+/**
+ * greedy_work - delayed work scheduled by gfs2_glock_be_greedy()
+ * @data: the struct greedy allocated by gfs2_glock_be_greedy()
+ *
+ * Clears GLF_SKIP_WAITERS2 and calls the go_greedy operation.  If nothing
+ * is waiting on gl_waiters2, the greedy state is torn down here (GLF_GREEDY
+ * cleared, holder uninitialized, struct greedy freed).  Otherwise the
+ * embedded holder is queued on gl_waiters2 and the queue is run; the
+ * struct greedy is not freed on that path.
+ */
+
+static void greedy_work(void *data)
+{
+       struct greedy *gr = (struct greedy *)data;
+       struct gfs2_holder *gh = &gr->gr_gh;
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+
+       if (glops->go_greedy)
+               glops->go_greedy(gl);
+
+       spin_lock(&gl->gl_spin);
+
+       if (list_empty(&gl->gl_waiters2)) {
+               clear_bit(GLF_GREEDY, &gl->gl_flags);
+               spin_unlock(&gl->gl_spin);
+               gfs2_holder_uninit(gh);
+               kfree(gr);
+       } else {
+               /* Hold the glock across run_queue().
+                  NOTE(review): presumably the HIF_GREEDY holder is cleaned
+                  up when run_queue() processes it - confirm, since gr is
+                  not freed here. */
+               gfs2_glock_hold(gl);
+               list_add_tail(&gh->gh_list, &gl->gl_waiters2);
+               run_queue(gl);
+               spin_unlock(&gl->gl_spin);
+               gfs2_glock_put(gl);
+       }
+}
+
+/**
+ * gfs2_glock_be_greedy - defer gl_waiters2 processing on a glock for a while
+ * @gl: the glock
+ * @time: delay handed to schedule_delayed_work() before greedy_work() runs
+ *        (presumably in jiffies - TODO confirm); 0 disables greediness
+ *
+ * Sets GLF_GREEDY and GLF_SKIP_WAITERS2 on the glock and schedules
+ * greedy_work(), which clears GLF_SKIP_WAITERS2 again.  Nothing is done if
+ * @time is 0, the ar_localcaching mount argument is set, GLF_GREEDY was
+ * already set, or the allocation fails.
+ *
+ * Returns: 0 if go_greedy will be called, 1 otherwise
+ */
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
+{
+       struct greedy *gr;
+       struct gfs2_holder *gh;
+
+       /* test_and_set_bit() makes sure only one greedy request is active
+          per glock at a time. */
+       if (!time ||
+           gl->gl_sbd->sd_args.ar_localcaching ||
+           test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
+               return 1;
+
+       gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
+       if (!gr) {
+               /* Undo the GLF_GREEDY claim taken above. */
+               clear_bit(GLF_GREEDY, &gl->gl_flags);
+               return 1;
+       }
+       gh = &gr->gr_gh;
+
+       gfs2_holder_init(gl, 0, GL_NEVER_RECURSE, gh);
+       set_bit(HIF_GREEDY, &gh->gh_iflags);
+       INIT_WORK(&gr->gr_work, greedy_work, gr);
+
+       set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+       schedule_delayed_work(&gr->gr_work, time);
+
+       return 0;
+}
+
+/**
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * If the enqueue fails, the holder is uninitialized again before returning.
+ *
+ * Returns: 0, GLR_*, or errno
+ */
+
+int gfs2_glock_nq_init(struct gfs2_glock *gl, unsigned int state, int flags,
+                      struct gfs2_holder *gh)
+{
+       int ret;
+
+       gfs2_holder_init(gl, state, flags, gh);
+
+       ret = gfs2_glock_nq(gh);
+       if (!ret)
+               return 0;
+
+       /* Enqueue failed: leave the holder uninitialized for the caller. */
+       gfs2_holder_uninit(gh);
+       return ret;
+}
+
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
+ * @gh: the holder structure
+ *
+ * Counterpart of gfs2_glock_nq_init().
+ */
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+       gfs2_glock_dq(gh);
+       gfs2_holder_uninit(gh);
+}
+
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Looks the glock up (creating it if necessary), enqueues @gh on it, and
+ * drops the lookup reference again.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
+                     struct gfs2_glock_operations *glops, unsigned int state,
+                     int flags, struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl;
+       int ret;
+
+       ret = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+       if (ret)
+               return ret;
+
+       ret = gfs2_glock_nq_init(gl, state, flags, gh);
+       gfs2_glock_put(gl);
+
+       return ret;
+}
+
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: the first structure (pointer to a struct gfs2_holder pointer)
+ * @arg_b: the second structure (pointer to a struct gfs2_holder pointer)
+ *
+ * Holders are ordered by lock number first.  For equal lock numbers an
+ * LM_ST_EXCLUSIVE request sorts before an LM_ST_SHARED one; failing that,
+ * a GL_LOCAL_EXCL request sorts before one without that flag.
+ *
+ * The result is antisymmetric: glock_compare(a, b) == -glock_compare(b, a).
+ * Previously the equal-lock-number cases never returned -1, and could
+ * return 1 for both argument orders, which gives sort() an inconsistent
+ * ordering.
+ *
+ * Returns: <0 if a sorts before b, >0 if a sorts after b, 0 if equivalent
+ */
+
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+       struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
+       struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
+       struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+       struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+       int excl_a, excl_b;
+
+       if (a->ln_number > b->ln_number)
+               return 1;
+       if (a->ln_number < b->ln_number)
+               return -1;
+
+       /* Same lock number: exclusive requests go ahead of shared ones. */
+       if (gh_a->gh_state == LM_ST_SHARED &&
+           gh_b->gh_state == LM_ST_EXCLUSIVE)
+               return 1;
+       if (gh_a->gh_state == LM_ST_EXCLUSIVE &&
+           gh_b->gh_state == LM_ST_SHARED)
+               return -1;
+
+       /* Then GL_LOCAL_EXCL requests go ahead of those without it. */
+       excl_a = !!(gh_a->gh_flags & GL_LOCAL_EXCL);
+       excl_b = !!(gh_b->gh_flags & GL_LOCAL_EXCL);
+
+       return excl_b - excl_a;
+}
+
+/**
+ * nq_m_sync - synchronously acquire more than one glock in deadlock free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ * @p: caller-supplied scratch array with room for @num_gh holder pointers;
+ *     it is filled with pointers into @ghs and sorted with glock_compare()
+ *
+ * LM_FLAG_TRY and GL_ASYNC are stripped from every holder before it is
+ * enqueued, so each acquisition blocks.
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+                    struct gfs2_holder **p)
+{
+       unsigned int x;
+       int error = 0;
+
+       for (x = 0; x < num_gh; x++)
+               p[x] = &ghs[x];
+
+       sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+
+       for (x = 0; x < num_gh; x++) {
+               p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+               error = gfs2_glock_nq(p[x]);
+               if (error) {
+                       /* Back out: release the glocks acquired so far. */
+                       while (x--)
+                               gfs2_glock_dq(p[x]);
+                       break;
+               }
+       }
+
+       return error;
+}
+
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * First tries to get all the glocks at once with LM_FLAG_TRY | GL_ASYNC
+ * requests.  If any of them fails with only GLR_TRYFAILED or GLR_CANCELED,
+ * whatever was acquired is released and the whole set is retried one at a
+ * time in sorted order through nq_m_sync().  Any other error is returned
+ * after releasing what was acquired.
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace this code with code that calls gfs2_glock_prefetch()
+ * 2) Forget async stuff and just call nq_m_sync()
+ * 3) Leave it like it is
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+       int *e;
+       unsigned int x;
+       int borked = 0, serious = 0;
+       int error = 0;
+
+       if (!num_gh)
+               return 0;
+
+       if (num_gh == 1) {
+               ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+               return gfs2_glock_nq(ghs);
+       }
+
+       /* e does double duty: first as an array of per-holder int results,
+          later (cast) as the pointer scratch array for nq_m_sync().  That
+          is why it is sized in pointer-sized elements. */
+       e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+       if (!e)
+               return -ENOMEM;
+
+       for (x = 0; x < num_gh; x++) {
+               ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
+               error = gfs2_glock_nq(&ghs[x]);
+               if (error) {
+                       /* Only the first x holders were queued; restrict
+                          the rest of the function to those. */
+                       borked = 1;
+                       serious = error;
+                       num_gh = x;
+                       break;
+               }
+       }
+
+       for (x = 0; x < num_gh; x++) {
+               error = e[x] = glock_wait_internal(&ghs[x]);
+               if (error) {
+                       borked = 1;
+                       if (error != GLR_TRYFAILED && error != GLR_CANCELED)
+                               serious = error;
+               }
+       }
+
+       if (!borked) {
+               kfree(e);
+               return 0;
+       }
+
+       /* Something failed: release every glock that was acquired. */
+       for (x = 0; x < num_gh; x++)
+               if (!e[x])
+                       gfs2_glock_dq(&ghs[x]);
+
+       if (serious)
+               error = serious;
+       else {
+               for (x = 0; x < num_gh; x++)
+                       gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
+                                         &ghs[x]);
+               error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
+       }
+
+       kfree(e);
+
+       return error;
+}
+
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Calls gfs2_glock_dq() on each holder, in array order.
+ */
+
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+       struct gfs2_holder *gh;
+
+       for (gh = ghs; num_gh; num_gh--, gh++)
+               gfs2_glock_dq(gh);
+}
+
+/**
+ * gfs2_glock_dq_uninit_m - release and uninitialize multiple glock holders
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Calls gfs2_glock_dq_uninit() on each holder, in array order.
+ */
+
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+       struct gfs2_holder *gh;
+
+       for (gh = ghs; num_gh; num_gh--, gh++)
+               gfs2_glock_dq_uninit(gh);
+}
+
+/**
+ * gfs2_glock_prefetch_num - prefetch a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ *
+ * Best effort: nothing is reported back to the caller.  The prefetch is
+ * skipped when sd_reclaim_count has reached the gt_reclaim_limit tunable
+ * or when the glock lookup fails.
+ */
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
+                            struct gfs2_glock_operations *glops,
+                            unsigned int state, int flags)
+{
+       struct gfs2_glock *gl;
+
+       if (atomic_read(&sdp->sd_reclaim_count) >=
+           gfs2_tune_get(sdp, gt_reclaim_limit))
+               return;
+
+       if (gfs2_glock_get(sdp, number, glops, CREATE, &gl))
+               return;
+
+       gfs2_glock_prefetch(gl, state, flags);
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_lvb_hold - attach a LVB from a glock
+ * @gl: The glock in question
+ *
+ * The first hold asks the lock module for the LVB (stored in gl->gl_lvb)
+ * and takes a reference on the glock; later holds only bump gl_lvb_count.
+ *
+ * Returns: 0 on success, or the error from gfs2_lm_hold_lvb()
+ */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl)
+{
+       int error;
+
+       gfs2_glmutex_lock(gl);
+
+       if (!atomic_read(&gl->gl_lvb_count)) {
+               error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
+               if (error) {
+                       gfs2_glmutex_unlock(gl);
+                       return error;
+               }
+               /* Dropped again in gfs2_lvb_unhold() when the count
+                  returns to zero. */
+               gfs2_glock_hold(gl);
+       }
+       atomic_inc(&gl->gl_lvb_count);
+
+       gfs2_glmutex_unlock(gl);
+
+       return 0;
+}
+
+/**
+ * gfs2_lvb_unhold - detach a LVB from a glock
+ * @gl: The glock in question
+ *
+ * When the last LVB hold goes away, the LVB is released back to the lock
+ * module, gl->gl_lvb is cleared and the glock reference taken by the first
+ * gfs2_lvb_hold() is dropped.
+ */
+
+void gfs2_lvb_unhold(struct gfs2_glock *gl)
+{
+       /* Temporary reference so the glock stays valid until after
+          gfs2_glmutex_unlock(), even if the put inside the branch below
+          drops the LVB's reference. */
+       gfs2_glock_hold(gl);
+       gfs2_glmutex_lock(gl);
+
+       gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
+       if (atomic_dec_and_test(&gl->gl_lvb_count)) {
+               gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+               gl->gl_lvb = NULL;
+               gfs2_glock_put(gl);
+       }
+
+       gfs2_glmutex_unlock(gl);
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_lvb_sync - ask the lock module to sync a glock's LVB
+ * @gl: The glock in question
+ *
+ * Asserts that an LVB is attached (gl_lvb_count is non-zero).  The sync is
+ * only issued if the gfs2_glock_is_held_excl() assertion passes.
+ */
+
+void gfs2_lvb_sync(struct gfs2_glock *gl)
+{
+       gfs2_glmutex_lock(gl);
+
+       gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
+       if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
+               gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+
+       gfs2_glmutex_unlock(gl);
+}
+
+/**
+ * blocking_cb - handle a LM_CB_NEED_* callback from the lock module
+ * @sdp: the filesystem
+ * @name: the name of the lock the callback is about
+ * @state: the state passed on to go_callback and handle_callback()
+ *
+ * Silently ignored if no glock with that name is found.
+ */
+
+static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                       unsigned int state)
+{
+       struct gfs2_glock *gl;
+
+       gl = gfs2_glock_find(sdp, name);
+       if (!gl)
+               return;
+
+       if (gl->gl_ops->go_callback)
+               gl->gl_ops->go_callback(gl, state);
+       handle_callback(gl, state);
+
+       spin_lock(&gl->gl_spin);
+       run_queue(gl);
+       spin_unlock(&gl->gl_spin);
+
+       /* presumably drops the reference obtained by gfs2_glock_find() */
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_cb - Callback used by locking module
+ * @fsdata: Pointer to the superblock
+ * @type: Type of callback
+ * @data: Type dependent data pointer: a struct lm_lockname for the
+ *        LM_CB_NEED_* types, a struct lm_async_cb for LM_CB_ASYNC, and an
+ *        unsigned int for LM_CB_NEED_RECOVERY
+ *
+ * Called by the locking module when it wants to tell us something.
+ * Either we need to drop a lock, one of our ASYNC requests completed, or
+ * a journal from another client needs to be recovered.
+ */
+
+void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
+{
+       struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
+
+       atomic_inc(&sdp->sd_lm_callbacks);
+
+       switch (type) {
+       /* The three NEED_* callbacks differ only in the state handed to
+          blocking_cb(). */
+       case LM_CB_NEED_E:
+               blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_UNLOCKED);
+               return;
+
+       case LM_CB_NEED_D:
+               blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_DEFERRED);
+               return;
+
+       case LM_CB_NEED_S:
+               blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_SHARED);
+               return;
+
+       case LM_CB_ASYNC: {
+               /* An asynchronous request finished: run the bottom half
+                  that the top half stored in gl_req_bh. */
+               struct lm_async_cb *async = (struct lm_async_cb *)data;
+               struct gfs2_glock *gl;
+
+               gl = gfs2_glock_find(sdp, &async->lc_name);
+               if (gfs2_assert_warn(sdp, gl))
+                       return;
+               if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+                       gl->gl_req_bh(gl, async->lc_ret);
+               gfs2_glock_put(gl);
+
+               return;
+       }
+
+       case LM_CB_NEED_RECOVERY:
+               /* Mark the journal dirty and wake the recovery thread if
+                  one is recorded. */
+               gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
+               if (sdp->sd_recoverd_process)
+                       wake_up_process(sdp->sd_recoverd_process);
+               return;
+
+       case LM_CB_DROPLOCKS:
+               gfs2_gl_hash_clear(sdp, NO_WAIT);
+               gfs2_quota_scan(sdp);
+               return;
+
+       default:
+               /* Unknown callback type. */
+               gfs2_assert_warn(sdp, 0);
+               return;
+       }
+}
+
+/**
+ * gfs2_try_toss_inode - try to remove a particular inode struct from cache
+ * @sdp: the filesystem
+ * @inum: the inode number
+ *
+ * Best effort: gives up quietly if the inode's glock does not exist, the
+ * glock mutex cannot be taken without blocking, no inode is attached to the
+ * glock, or the inode's i_count is non-zero.
+ */
+
+void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum)
+{
+       struct gfs2_glock *gl;
+       struct gfs2_inode *ip;
+       int error;
+
+       /* NO_CREATE: only look up an existing glock. */
+       error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
+                              NO_CREATE, &gl);
+       if (error || !gl)
+               return;
+
+       if (!gfs2_glmutex_trylock(gl))
+               goto out;
+
+       ip = get_gl2ip(gl);
+       if (!ip)
+               goto out_unlock;
+
+       if (atomic_read(&ip->i_count))
+               goto out_unlock;
+
+       gfs2_inode_destroy(ip);
+
+ out_unlock:
+       gfs2_glmutex_unlock(gl);
+
+ out:
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
+ *                          iopen glock from memory
+ * @io_gl: the iopen glock
+ * @state: the state into which the glock should be put
+ *
+ * Only acts on requests for LM_ST_UNLOCKED.  If the inode glock attached to
+ * @io_gl can be locked without blocking and has an inode, the vnode is
+ * tossed and the inode glock is scheduled for reclaim.
+ */
+
+void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
+{
+       struct gfs2_glock *i_gl;
+
+       if (state != LM_ST_UNLOCKED)
+               return;
+
+       /* Grab a reference on the inode glock while io_gl's spinlock keeps
+          the association stable. */
+       spin_lock(&io_gl->gl_spin);
+       i_gl = get_gl2gl(io_gl);
+       if (i_gl)
+               gfs2_glock_hold(i_gl);
+       spin_unlock(&io_gl->gl_spin);
+
+       if (!i_gl)
+               return;
+
+       if (gfs2_glmutex_trylock(i_gl)) {
+               struct gfs2_inode *ip = get_gl2ip(i_gl);
+
+               if (!ip) {
+                       gfs2_glmutex_unlock(i_gl);
+               } else {
+                       gfs2_try_toss_vnode(ip);
+                       gfs2_glmutex_unlock(i_gl);
+                       gfs2_glock_schedule_for_reclaim(i_gl);
+               }
+       }
+
+       gfs2_glock_put(i_gl);
+}
+
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * The checks are made in this order, the first match deciding the result:
+ * GLF_STICKY forbids the demote; GLF_PREFETCH allows it once gt_prefetch_secs
+ * have passed since gl_stamp; otherwise the glock type's go_demote_ok()
+ * decides, if it has one.
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(struct gfs2_glock *gl)
+{
+       struct gfs2_glock_operations *glops = gl->gl_ops;
+
+       if (test_bit(GLF_STICKY, &gl->gl_flags))
+               return 0;
+
+       if (test_bit(GLF_PREFETCH, &gl->gl_flags)) {
+               struct gfs2_sbd *sdp = gl->gl_sbd;
+
+               return time_after_eq(jiffies,
+                                    gl->gl_stamp +
+                                    gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
+       }
+
+       if (glops->go_demote_ok)
+               return glops->go_demote_ok(gl);
+
+       /* No type-specific objection possible: allow the demote */
+       return 1;
+}
+
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ * If @gl->gl_reclaim is not already linked into a list, a reference is taken
+ * on @gl, it is added to the head of sd_reclaim_list and sd_reclaim_count is
+ * incremented, all under sd_reclaim_lock.  Waiters on sd_reclaim_wq are
+ * woken whether or not the glock was newly queued.
+ */
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+
+       spin_lock(&sdp->sd_reclaim_lock);
+       /* An empty gl_reclaim node means the glock is not queued yet */
+       if (list_empty(&gl->gl_reclaim)) {
+               /* The list entry carries its own reference on the glock */
+               gfs2_glock_hold(gl);
+               list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+               atomic_inc(&sdp->sd_reclaim_count);
+       }
+       spin_unlock(&sdp->sd_reclaim_lock);
+
+       wake_up(&sdp->sd_reclaim_wq);
+}
+
+/**
+ * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
+ * @sdp: the filesystem
+ *
+ * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
+ * different glock and we notice that there are a lot of glocks in the
+ * reclaim list.
+ *
+ * Takes the first glock off sd_reclaim_list.  If its mutex can be taken with
+ * a trylock, an attached inode with a zero i_count is destroyed, and
+ * handle_callback(gl, LM_ST_UNLOCKED) is called when the glock has no
+ * holders, is not already unlocked and demote_ok() agrees.
+ */
+
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+{
+       struct gfs2_glock *gl;
+
+       spin_lock(&sdp->sd_reclaim_lock);
+       if (list_empty(&sdp->sd_reclaim_list)) {
+               spin_unlock(&sdp->sd_reclaim_lock);
+               return;
+       }
+       gl = list_entry(sdp->sd_reclaim_list.next,
+                       struct gfs2_glock, gl_reclaim);
+       /* _init so that list_empty(&gl->gl_reclaim) is true again */
+       list_del_init(&gl->gl_reclaim);
+       spin_unlock(&sdp->sd_reclaim_lock);
+
+       /* Counters are atomics and are updated outside sd_reclaim_lock */
+       atomic_dec(&sdp->sd_reclaim_count);
+       atomic_inc(&sdp->sd_reclaimed);
+
+       if (gfs2_glmutex_trylock(gl)) {
+               if (gl->gl_ops == &gfs2_inode_glops) {
+                       struct gfs2_inode *ip = get_gl2ip(gl);
+                       if (ip && !atomic_read(&ip->i_count))
+                               gfs2_inode_destroy(ip);
+               }
+               if (queue_empty(gl, &gl->gl_holders) &&
+                   gl->gl_state != LM_ST_UNLOCKED &&
+                   demote_ok(gl))
+                       handle_callback(gl, LM_ST_UNLOCKED);
+               gfs2_glmutex_unlock(gl);
+       }
+
+       /* NOTE(review): presumably balances the gfs2_glock_hold() taken in
+          gfs2_glock_schedule_for_reclaim() - confirm */
+       gfs2_glock_put(gl);
+}
+
+/**
+ * examine_bucket - Call a function for each glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem (not used by this function itself)
+ * @bucket: the bucket
+ *
+ * A dummy "plug" entry is kept in the bucket list to remember the current
+ * position, so that hb_lock can be dropped while @examiner runs.  Each real
+ * glock is referenced with gfs2_glock_hold() before the lock is dropped;
+ * @examiner is responsible for the matching gfs2_glock_put().
+ *
+ * Returns: 1 if the bucket has entries
+ */
+
+static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+                         struct gfs2_gl_hash_bucket *bucket)
+{
+       struct glock_plug plug;
+       struct list_head *tmp;
+       struct gfs2_glock *gl;
+       int entries;
+
+       /* list_add() inserts "plug" at the head of the bucket list; the loop
+          below then moves it forward one entry at a time towards the tail */
+       memset(&plug.gl_flags, 0, sizeof(unsigned long));
+       set_bit(GLF_PLUG, &plug.gl_flags);
+
+       write_lock(&bucket->hb_lock);
+       list_add(&plug.gl_list, &bucket->hb_list);
+       write_unlock(&bucket->hb_lock);
+
+       for (;;) {
+               write_lock(&bucket->hb_lock);
+
+               for (;;) {
+                       tmp = plug.gl_list.next;
+
+                       /* Wrapped round to the list head: the walk is done */
+                       if (tmp == &bucket->hb_list) {
+                               list_del(&plug.gl_list);
+                               /* Sampled after our own plug is removed */
+                               entries = !list_empty(&bucket->hb_list);
+                               write_unlock(&bucket->hb_lock);
+                               return entries;
+                       }
+                       gl = list_entry(tmp, struct gfs2_glock, gl_list);
+
+                       /* Move plug up list */
+                       list_move(&plug.gl_list, &gl->gl_list);
+
+                       /* Another plug (presumably from a different walker of
+                          this bucket): step over it without examining it */
+                       if (test_bit(GLF_PLUG, &gl->gl_flags))
+                               continue;
+
+                       /* examiner() must glock_put() */
+                       gfs2_glock_hold(gl);
+
+                       break;
+               }
+
+               write_unlock(&bucket->hb_lock);
+
+               /* Called without hb_lock held */
+               examiner(gl);
+       }
+}
+
+/**
+ * scan_glock - look at a glock and see if we can reclaim it
+ * @gl: the glock to look at
+ *
+ * With the glock mutex held (trylock only), the glock is considered
+ * reclaimable if it is an inode glock whose inode has a zero i_count, or if
+ * it has no holders, is not unlocked and demote_ok() agrees.  Reclaimable
+ * glocks are handed to gfs2_glock_schedule_for_reclaim().  The caller's
+ * reference on @gl is dropped in every case.
+ */
+
+static void scan_glock(struct gfs2_glock *gl)
+{
+       int reclaim = 0;
+
+       if (gfs2_glmutex_trylock(gl)) {
+               if (gl->gl_ops == &gfs2_inode_glops) {
+                       struct gfs2_inode *ip = get_gl2ip(gl);
+
+                       if (ip && !atomic_read(&ip->i_count))
+                               reclaim = 1;
+               }
+
+               /* Only consulted when the inode check did not already hit */
+               if (!reclaim &&
+                   queue_empty(gl, &gl->gl_holders) &&
+                   gl->gl_state != LM_ST_UNLOCKED &&
+                   demote_ok(gl))
+                       reclaim = 1;
+
+               gfs2_glmutex_unlock(gl);
+       }
+
+       if (reclaim)
+               gfs2_glock_schedule_for_reclaim(gl);
+
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_scand_internal - Look for glocks and inodes to toss from memory
+ * @sdp: the filesystem
+ *
+ * Runs scan_glock() over every bucket of the glock hash table, offering to
+ * reschedule after each bucket.
+ */
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp)
+{
+       unsigned int hash = 0;
+
+       while (hash < GFS2_GL_HASH_SIZE) {
+               examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[hash]);
+               cond_resched();
+               hash++;
+       }
+}
+
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ * Takes @gl off the reclaim list if it is on it, then, if the glock mutex
+ * can be taken with a trylock, destroys an attached inode with a zero
+ * i_count and calls handle_callback(gl, LM_ST_UNLOCKED) when the glock has
+ * no holders and is not already unlocked.  Unlike scan_glock() and
+ * gfs2_reclaim_glock(), demote_ok() is not consulted here.
+ */
+
+static void clear_glock(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       int released;
+
+       spin_lock(&sdp->sd_reclaim_lock);
+       if (!list_empty(&gl->gl_reclaim)) {
+               list_del_init(&gl->gl_reclaim);
+               atomic_dec(&sdp->sd_reclaim_count);
+               /* This put must not be reported as having released the
+                  glock; we are still going to use gl below */
+               released = gfs2_glock_put(gl);
+               gfs2_assert(sdp, !released);
+       }
+       spin_unlock(&sdp->sd_reclaim_lock);
+
+       if (gfs2_glmutex_trylock(gl)) {
+               if (gl->gl_ops == &gfs2_inode_glops) {
+                       struct gfs2_inode *ip = get_gl2ip(gl);
+                       if (ip && !atomic_read(&ip->i_count))
+                               gfs2_inode_destroy(ip);
+               }
+               if (queue_empty(gl, &gl->gl_holders) &&
+                   gl->gl_state != LM_ST_UNLOCKED)
+                       handle_callback(gl, LM_ST_UNLOCKED);
+
+               gfs2_glmutex_unlock(gl);
+       }
+
+       gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem, or when inter-node lock manager
+ * requests DROPLOCKS because it is running out of capacity.
+ *
+ * Runs clear_glock() over every hash bucket.  With @wait zero only a single
+ * pass is made; otherwise passes are repeated until examine_bucket() reports
+ * every bucket empty.  Each time gt_stall_secs elapse without finishing, a
+ * warning is logged and the lock state is dumped.
+ */
+
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+{
+       unsigned long t;
+       unsigned int x;
+       int cont;
+
+       t = jiffies;
+
+       for (;;) {
+               /* Set when any bucket still has entries after this pass */
+               cont = 0;
+
+               for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+                       if (examine_bucket(clear_glock, sdp,
+                                          &sdp->sd_gl_hash[x]))
+                               cont = 1;
+
+               if (!wait || !cont)
+                       break;
+
+               if (time_after_eq(jiffies,
+                                 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
+                       fs_warn(sdp, "Unmount seems to be stalled. "
+                                    "Dumping lock state...\n");
+                       gfs2_dump_lockstate(sdp);
+                       /* Restart the stall timer after each report */
+                       t = jiffies;
+               }
+
+               /* invalidate_inodes() requires that the sb inodes list
+                  not change, but an async completion callback for an
+                  unlock can occur which does glock_put() which
+                  can call iput() which will change the sb inodes list.
+                  invalidate_inodes_mutex prevents glock_put()'s during
+                  an invalidate_inodes() */
+
+               down(&sdp->sd_invalidate_inodes_mutex);
+               invalidate_inodes(sdp->sd_vfs);
+               up(&sdp->sd_invalidate_inodes_mutex);
+               yield();
+       }
+}
+
+/*
+ *  Diagnostic routines to help debug distributed deadlock
+ */
+
+/**
+ * dump_holder - print information about a glock holder
+ * @str: a string naming the type of holder
+ * @gh: the glock holder
+ *
+ * Everything is written with printk(), so there is no output buffer that
+ * could fill up and this function cannot currently fail.
+ *
+ * Returns: 0 (always); the int return is kept for the callers' error checks
+ */
+
+static int dump_holder(char *str, struct gfs2_holder *gh)
+{
+       unsigned int x;
+
+       printk("  %s\n", str);
+       printk("    owner = %ld\n",
+                  (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
+       printk("    gh_state = %u\n", gh->gh_state);
+       printk("    gh_flags =");
+       for (x = 0; x < 32; x++)
+               /* 1U: shifting a signed 1 into bit 31 is undefined in C */
+               if (gh->gh_flags & (1U << x))
+                       printk(" %u", x);
+       printk(" \n");
+       printk("    error = %d\n", gh->gh_error);
+       printk("    gh_iflags =");
+       for (x = 0; x < 32; x++)
+               if (test_bit(x, &gh->gh_iflags))
+                       printk(" %u", x);
+       printk(" \n");
+
+       return 0;
+}
+
+/**
+ * dump_inode - print information about an inode
+ * @ip: the inode
+ *
+ * Everything is written with printk(), so there is no output buffer that
+ * could fill up and this function cannot currently fail.
+ *
+ * Returns: 0 (always); the int return is kept for the caller's error check
+ */
+
+static int dump_inode(struct gfs2_inode *ip)
+{
+       unsigned int x;
+
+       printk("  Inode:\n");
+       /* Cast so the arguments match %llu however the kernel defines the
+          64-bit type of these fields on the target architecture */
+       printk("    num = %llu %llu\n",
+                   (unsigned long long)ip->i_num.no_formal_ino,
+                   (unsigned long long)ip->i_num.no_addr);
+       printk("    type = %u\n", IF2DT(ip->i_di.di_mode));
+       printk("    i_count = %d\n", atomic_read(&ip->i_count));
+       printk("    i_flags =");
+       for (x = 0; x < 32; x++)
+               if (test_bit(x, &ip->i_flags))
+                       printk(" %u", x);
+       printk(" \n");
+       printk("    vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
+
+       return 0;
+}
+
+/**
+ * dump_glock - print information about a glock
+ * @gl: the glock
+ *
+ * Prints the glock's fields, its pending request and every holder and
+ * waiter with printk() while holding gl->gl_spin.  For an inode glock with
+ * an inode attached, the inode is dumped too unless GLF_LOCK is set or the
+ * glock has holders, in which case it is only reported as busy.
+ *
+ * Returns: 0 on success, otherwise the first non-zero value returned by
+ *          dump_holder() or dump_inode()
+ */
+
+static int dump_glock(struct gfs2_glock *gl)
+{
+       struct gfs2_holder *gh;
+       unsigned int x;
+       int error = 0;
+
+       spin_lock(&gl->gl_spin);
+
+       /* Cast so the argument matches %llu however the kernel defines the
+          64-bit type of ln_number on the target architecture */
+       printk("Glock (%u, %llu)\n",
+                   gl->gl_name.ln_type,
+                   (unsigned long long)gl->gl_name.ln_number);
+       printk("  gl_flags =");
+       for (x = 0; x < 32; x++)
+               if (test_bit(x, &gl->gl_flags))
+                       printk(" %u", x);
+       printk(" \n");
+       printk("  gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
+       printk("  gl_state = %u\n", gl->gl_state);
+       printk("  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+       printk("  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+       printk("  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+       printk("  object = %s\n", (gl->gl_object) ? "yes" : "no");
+       printk("  le = %s\n",
+                  (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
+       printk("  reclaim = %s\n",
+                   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+       if (gl->gl_aspace)
+               printk("  aspace = %lu\n",
+                           gl->gl_aspace->i_mapping->nrpages);
+       else
+               printk("  aspace = no\n");
+       printk("  ail = %d\n", atomic_read(&gl->gl_ail_count));
+       if (gl->gl_req_gh) {
+               error = dump_holder("Request", gl->gl_req_gh);
+               if (error)
+                       goto out;
+       }
+       list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+               error = dump_holder("Holder", gh);
+               if (error)
+                       goto out;
+       }
+       list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+               error = dump_holder("Waiter1", gh);
+               if (error)
+                       goto out;
+       }
+       list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+               error = dump_holder("Waiter2", gh);
+               if (error)
+                       goto out;
+       }
+       list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
+               error = dump_holder("Waiter3", gh);
+               if (error)
+                       goto out;
+       }
+       if (gl->gl_ops == &gfs2_inode_glops && get_gl2ip(gl)) {
+               if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
+                   list_empty(&gl->gl_holders)) {
+                       error = dump_inode(get_gl2ip(gl));
+                       if (error)
+                               goto out;
+               } else {
+                       /* A busy inode is reported but is not an error */
+                       printk("  Inode: busy\n");
+               }
+       }
+
+ out:
+       spin_unlock(&gl->gl_spin);
+
+       return error;
+}
+
+/**
+ * gfs2_dump_lockstate - print out the current lockstate
+ * @sdp: the filesystem
+ *
+ * Walks every bucket of the glock hash table under its read lock and dumps
+ * each glock found with dump_glock(), stopping at the first failure.
+ *
+ * Returns: 0, or the first non-zero value returned by dump_glock()
+ */
+
+int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
+{
+       struct gfs2_gl_hash_bucket *bucket;
+       struct gfs2_glock *gl;
+       unsigned int hash;
+       int error = 0;
+
+       for (hash = 0; !error && hash < GFS2_GL_HASH_SIZE; hash++) {
+               bucket = &sdp->sd_gl_hash[hash];
+
+               read_lock(&bucket->hb_lock);
+               list_for_each_entry(gl, &bucket->hb_list, gl_list) {
+                       /* Entries flagged GLF_PLUG are not dumped */
+                       if (!test_bit(GLF_PLUG, &gl->gl_flags)) {
+                               error = dump_glock(gl);
+                               if (error)
+                                       break;
+                       }
+               }
+               read_unlock(&bucket->hb_lock);
+       }
+
+       return error;
+}
+
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644 (file)
index 0000000..06847eb
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+
+/* Flags for lock requests; used in gfs2_holder gh_flags field.
+   From lm_interface.h:
+#define LM_FLAG_TRY            0x00000001
+#define LM_FLAG_TRY_1CB                0x00000002
+#define LM_FLAG_NOEXP          0x00000004
+#define LM_FLAG_ANY            0x00000008
+#define LM_FLAG_PRIORITY       0x00000010 */
+
+/* GFS2's own holder flags.  The values start at 0x20, i.e. at the first bit
+   above the LM_FLAG_* values listed above, since both sets are stored in
+   the same field. */
+#define GL_LOCAL_EXCL          0x00000020
+#define GL_ASYNC               0x00000040
+#define GL_EXACT               0x00000080
+#define GL_SKIP                        0x00000100
+#define GL_ATIME               0x00000200
+#define GL_NOCACHE             0x00000400
+#define GL_SYNC                        0x00000800
+#define GL_NOCANCEL            0x00001000
+#define GL_NEVER_RECURSE       0x00002000
+
+/* NOTE(review): plain numbers rather than bit masks; presumably result
+   codes for a lock request - confirm against their users in glock.c */
+#define GLR_TRYFAILED          13
+#define GLR_CANCELED           14
+
+/* Returns 1 if one of @gl's holders is owned by the current task, else 0 */
+static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+       struct gfs2_holder *holder;
+       int mine = 0;
+
+       /* The holders list is walked under the glock's spinlock */
+       spin_lock(&gl->gl_spin);
+       list_for_each_entry(holder, &gl->gl_holders, gh_list) {
+               mine = (holder->gh_owner == current);
+               if (mine)
+                       break;
+       }
+       spin_unlock(&gl->gl_spin);
+
+       return mine;
+}
+
+/* Non-zero if gl_state is LM_ST_EXCLUSIVE; gl_spin is not taken */
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+       return gl->gl_state == LM_ST_EXCLUSIVE;
+}
+
+/* Non-zero if gl_state is LM_ST_DEFERRED; gl_spin is not taken */
+static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
+{
+       return gl->gl_state == LM_ST_DEFERRED;
+}
+
+/* Non-zero if gl_state is LM_ST_SHARED; gl_spin is not taken */
+static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
+{
+       return gl->gl_state == LM_ST_SHARED;
+}
+
+/* Returns 1 if @gl has anything queued on gl_waiters2 or gl_waiters3 */
+static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
+{
+       int blocking;
+
+       spin_lock(&gl->gl_spin);
+       blocking = !(list_empty(&gl->gl_waiters2) &&
+                    list_empty(&gl->gl_waiters3));
+       spin_unlock(&gl->gl_spin);
+
+       return blocking;
+}
+
+struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
+                                  struct lm_lockname *name);
+int gfs2_glock_get(struct gfs2_sbd *sdp,
+                  uint64_t number, struct gfs2_glock_operations *glops,
+                  int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+int gfs2_glock_put(struct gfs2_glock *gl);
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, int flags,
+                     struct gfs2_holder *gh);
+void gfs2_holder_reinit(unsigned int state, int flags, struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
+                                   int flags, gfp_t gfp_flags);
+void gfs2_holder_put(struct gfs2_holder *gh);
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
+void gfs2_glock_drop_th(struct gfs2_glock *gl);
+
+void gfs2_glmutex_lock(struct gfs2_glock *gl);
+int gfs2_glmutex_trylock(struct gfs2_glock *gl);
+void gfs2_glmutex_unlock(struct gfs2_glock *gl);
+
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+
+void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags);
+void gfs2_glock_force_drop(struct gfs2_glock *gl);
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
+
+int gfs2_glock_nq_init(struct gfs2_glock *gl, unsigned int state, int flags,
+                      struct gfs2_holder *gh);
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
+                     uint64_t number, struct gfs2_glock_operations *glops,
+                     unsigned int state, int flags, struct gfs2_holder *gh);
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
+                            struct gfs2_glock_operations *glops,
+                            unsigned int state, int flags);
+
+/*  Lock Value Block functions  */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl);
+void gfs2_lvb_unhold(struct gfs2_glock *gl);
+void gfs2_lvb_sync(struct gfs2_glock *gl);
+
+void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
+
+void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum);
+void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+
+int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
+
+#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644 (file)
index 0000000..1270081
--- /dev/null
@@ -0,0 +1,487 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <asm/semaphore.h>
+
+#include "gfs2.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "page.h"
+#include "recovery.h"
+#include "rgrp.h"
+
+/**
+ * meta_go_sync - sync out the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_*
+ *
+ * Called when demoting or unlocking an EX glock.  We must flush
+ * to disk all dirty buffers/pages relating to this glock, and must
+ * not return to caller to demote/unlock the glock until I/O is complete.
+ *
+ * Nothing at all is done (GLF_SYNC included) unless DIO_METADATA is set in
+ * @flags.
+ */
+
+static void meta_go_sync(struct gfs2_glock *gl, int flags)
+{
+       if (!(flags & DIO_METADATA))
+               return;
+
+       /* GLF_DIRTY is cleared up front, before the flush is started */
+       if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
+               gfs2_log_flush_glock(gl);
+               gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
+               if (flags & DIO_RELEASE)
+                       gfs2_ail_empty_gl(gl);
+       }
+
+       /* Cleared whether or not the glock was dirty */
+       clear_bit(GLF_SYNC, &gl->gl_flags);
+}
+
+/**
+ * meta_go_inval - invalidate the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_*; only DIO_METADATA is looked at
+ *
+ * When DIO_METADATA is set, the glock's metadata is invalidated and gl_vn
+ * is incremented.  Otherwise nothing is done.
+ */
+
+static void meta_go_inval(struct gfs2_glock *gl, int flags)
+{
+       if (flags & DIO_METADATA) {
+               gfs2_meta_inval(gl);
+               gl->gl_vn++;
+       }
+}
+
+/**
+ * meta_go_demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if the glock's address space holds no pages (nrpages is zero),
+ *          0 otherwise
+ */
+
+static int meta_go_demote_ok(struct gfs2_glock *gl)
+{
+       return gl->gl_aspace->i_mapping->nrpages == 0;
+}
+
+/**
+ * inode_go_xmote_th - promote/demote a glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags: passed through unchanged to gfs2_glock_xmote_th()
+ *
+ * Calls gfs2_pte_inval() first unless the glock is currently unlocked, then
+ * hands the state change to gfs2_glock_xmote_th().
+ */
+
+static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+                             int flags)
+{
+       if (gl->gl_state != LM_ST_UNLOCKED)
+               gfs2_pte_inval(gl);
+       gfs2_glock_xmote_th(gl, state, flags);
+}
+
+/**
+ * inode_go_xmote_bh - After promoting/demoting a glock
+ * @gl: the glock
+ *
+ * Unless the glock ended up unlocked, or the pending request holder asked
+ * for GL_SKIP, a read of the block at gl_name.ln_number is started with
+ * DIO_START.  The buffer is released again straight away and a read error
+ * is ignored.
+ */
+
+static void inode_go_xmote_bh(struct gfs2_glock *gl)
+{
+       struct gfs2_holder *gh = gl->gl_req_gh;
+       struct buffer_head *bh;
+       int error;
+
+       if (gl->gl_state == LM_ST_UNLOCKED)
+               return;
+
+       if (gh && (gh->gh_flags & GL_SKIP))
+               return;
+
+       error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START, &bh);
+       if (error)
+               return;
+
+       brelse(bh);
+}
+
+/**
+ * inode_go_drop_th - unlock a glock
+ * @gl: the glock
+ *
+ * Invoked from rq_demote().
+ * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
+ * is being purged from our node's glock cache; we're dropping lock.
+ *
+ * gfs2_pte_inval() is always called before the drop is handed to
+ * gfs2_glock_drop_th().
+ */
+
+static void inode_go_drop_th(struct gfs2_glock *gl)
+{
+       gfs2_pte_inval(gl);
+       gfs2_glock_drop_th(gl);
+}
+
+/**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ * @flags: DIO_* flags; DIO_METADATA and DIO_DATA select what is synced,
+ *         DIO_RELEASE additionally calls gfs2_ail_empty_gl()
+ *
+ * Nothing is synced unless GLF_DIRTY is set.  GLF_DIRTY is only cleared when
+ * both metadata and data were synced; GLF_SYNC is always cleared.
+ */
+
+static void inode_go_sync(struct gfs2_glock *gl, int flags)
+{
+       int meta = (flags & DIO_METADATA);
+       int data = (flags & DIO_DATA);
+
+       if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
+               if (meta && data) {
+                       /* Start the data pages, flush the log and sync the
+                          metadata, and only then wait for the data pages */
+                       gfs2_page_sync(gl, flags | DIO_START);
+                       gfs2_log_flush_glock(gl);
+                       gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
+                       gfs2_page_sync(gl, flags | DIO_WAIT);
+                       clear_bit(GLF_DIRTY, &gl->gl_flags);
+               } else if (meta) {
+                       /* GLF_DIRTY stays set: the data was not synced */
+                       gfs2_log_flush_glock(gl);
+                       gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
+               } else if (data)
+                       /* GLF_DIRTY stays set: the metadata was not synced */
+                       gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
+               if (flags & DIO_RELEASE)
+                       gfs2_ail_empty_gl(gl);
+       }
+
+       clear_bit(GLF_SYNC, &gl->gl_flags);
+}
+
+/**
+ * inode_go_inval - prepare a inode glock to be released
+ * @gl: the glock
+ * @flags: DIO_METADATA and/or DIO_DATA select what is invalidated
+ *
+ * With DIO_METADATA the glock's metadata is invalidated and gl_vn is
+ * incremented; with DIO_DATA gfs2_page_inval() is called as well.
+ */
+
+static void inode_go_inval(struct gfs2_glock *gl, int flags)
+{
+       if (flags & DIO_METADATA) {
+               gfs2_meta_inval(gl);
+               gl->gl_vn++;
+       }
+
+       if (flags & DIO_DATA)
+               gfs2_page_inval(gl);
+}
+
+/**
+ * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
+ * @gl: the glock
+ *
+ * The demote is allowed when the glock has neither an inode attached nor
+ * any pages in its address space.  Failing that, it is allowed only when
+ * ar_localcaching is off and gt_demote_secs have passed since gl_stamp.
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int inode_go_demote_ok(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+
+       if (!get_gl2ip(gl) && !gl->gl_aspace->i_mapping->nrpages)
+               return 1;
+
+       if (sdp->sd_args.ar_localcaching)
+               return 0;
+
+       if (time_after_eq(jiffies, gl->gl_stamp +
+                         gfs2_tune_get(sdp, gt_demote_secs) * HZ))
+               return 1;
+
+       return 0;
+}
+
+/**
+ * inode_go_lock - operation done after an inode lock is locked by a process
+ * @gh: the holder that has just been granted the lock
+ *
+ * Does nothing if the glock has no inode attached.  Otherwise the inode is
+ * refreshed when its i_vn differs from the glock's gl_vn, and an
+ * interrupted truncate (GFS2_DIF_TRUNC_IN_PROG) is resumed when the glock
+ * is held exclusively and @gh carries GL_LOCAL_EXCL.
+ *
+ * Returns: errno
+ */
+
+static int inode_go_lock(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_inode *ip = get_gl2ip(gl);
+       int error;
+
+       if (!ip)
+               return 0;
+
+       if (ip->i_vn != gl->gl_vn) {
+               error = gfs2_inode_refresh(ip);
+               if (error)
+                       return error;
+               gfs2_inode_attr_in(ip);
+       }
+
+       /* All three conditions must hold before the truncate is resumed */
+       if (!(ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG))
+               return 0;
+       if (gl->gl_state != LM_ST_EXCLUSIVE)
+               return 0;
+       if (!(gh->gh_flags & GL_LOCAL_EXCL))
+               return 0;
+
+       return gfs2_truncatei_resume(ip);
+}
+
+/**
+ * inode_go_unlock - operation done before an inode lock is unlocked by a
+ *                  process
+ * @gh: the holder that is about to give up the lock
+ *
+ * Does nothing if the glock has no inode attached.  Otherwise
+ * gfs2_inode_attr_in() is called when the glock is flagged GLF_DIRTY, and
+ * gfs2_meta_cache_flush() is called in any case.
+ */
+
+static void inode_go_unlock(struct gfs2_holder *gh)
+{
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_inode *ip = get_gl2ip(gl);
+
+       if (!ip)
+               return;
+
+       if (test_bit(GLF_DIRTY, &gl->gl_flags))
+               gfs2_inode_attr_in(ip);
+
+       gfs2_meta_cache_flush(ip);
+}
+
+/**
+ * inode_greedy -
+ * @gl: the glock
+ *
+ */
+
+static void inode_greedy(struct gfs2_glock *gl)
+{
+       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_inode *ip = get_gl2ip(gl);
+       unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
+       unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
+       unsigned int new_time;
+
+       spin_lock(&ip->i_spin);
+
+       if (time_after(ip->i_last_pfault + quantum, jiffies)) {
+               new_time = ip->i_greedy + quantum;
+               if (new_time > max)
+                       new_time = max;
+       } else {
+               new_time =&nb