Merge http://oss.oracle.com/git/ocfs2
Linus Torvalds [Fri, 6 Jan 2006 04:43:11 +0000 (20:43 -0800)]
119 files changed:
Documentation/filesystems/00-INDEX
Documentation/filesystems/configfs/configfs.txt [new file with mode: 0644]
Documentation/filesystems/configfs/configfs_example.c [new file with mode: 0644]
Documentation/filesystems/dlmfs.txt [new file with mode: 0644]
Documentation/filesystems/ocfs2.txt [new file with mode: 0644]
MAINTAINERS
drivers/block/loop.c
drivers/block/rd.c
fs/Kconfig
fs/Makefile
fs/configfs/Makefile [new file with mode: 0644]
fs/configfs/configfs_internal.h [new file with mode: 0644]
fs/configfs/dir.c [new file with mode: 0644]
fs/configfs/file.c [new file with mode: 0644]
fs/configfs/inode.c [new file with mode: 0644]
fs/configfs/item.c [new file with mode: 0644]
fs/configfs/mount.c [new file with mode: 0644]
fs/configfs/symlink.c [new file with mode: 0644]
fs/mpage.c
fs/ocfs2/Makefile [new file with mode: 0644]
fs/ocfs2/alloc.c [new file with mode: 0644]
fs/ocfs2/alloc.h [new file with mode: 0644]
fs/ocfs2/aops.c [new file with mode: 0644]
fs/ocfs2/aops.h [new file with mode: 0644]
fs/ocfs2/buffer_head_io.c [new file with mode: 0644]
fs/ocfs2/buffer_head_io.h [new file with mode: 0644]
fs/ocfs2/cluster/Makefile [new file with mode: 0644]
fs/ocfs2/cluster/endian.h [new file with mode: 0644]
fs/ocfs2/cluster/heartbeat.c [new file with mode: 0644]
fs/ocfs2/cluster/heartbeat.h [new file with mode: 0644]
fs/ocfs2/cluster/masklog.c [new file with mode: 0644]
fs/ocfs2/cluster/masklog.h [new file with mode: 0644]
fs/ocfs2/cluster/nodemanager.c [new file with mode: 0644]
fs/ocfs2/cluster/nodemanager.h [new file with mode: 0644]
fs/ocfs2/cluster/ocfs2_heartbeat.h [new file with mode: 0644]
fs/ocfs2/cluster/ocfs2_nodemanager.h [new file with mode: 0644]
fs/ocfs2/cluster/quorum.c [new file with mode: 0644]
fs/ocfs2/cluster/quorum.h [new file with mode: 0644]
fs/ocfs2/cluster/sys.c [new file with mode: 0644]
fs/ocfs2/cluster/sys.h [new file with mode: 0644]
fs/ocfs2/cluster/tcp.c [new file with mode: 0644]
fs/ocfs2/cluster/tcp.h [new file with mode: 0644]
fs/ocfs2/cluster/tcp_internal.h [new file with mode: 0644]
fs/ocfs2/cluster/ver.c [new file with mode: 0644]
fs/ocfs2/cluster/ver.h [new file with mode: 0644]
fs/ocfs2/dcache.c [new file with mode: 0644]
fs/ocfs2/dcache.h [new file with mode: 0644]
fs/ocfs2/dir.c [new file with mode: 0644]
fs/ocfs2/dir.h [new file with mode: 0644]
fs/ocfs2/dlm/Makefile [new file with mode: 0644]
fs/ocfs2/dlm/dlmapi.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmast.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmcommon.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmconvert.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmconvert.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmdebug.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmdebug.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmdomain.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmdomain.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmfs.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmfsver.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmfsver.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmlock.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmmaster.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmrecovery.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmthread.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmunlock.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmver.c [new file with mode: 0644]
fs/ocfs2/dlm/dlmver.h [new file with mode: 0644]
fs/ocfs2/dlm/userdlm.c [new file with mode: 0644]
fs/ocfs2/dlm/userdlm.h [new file with mode: 0644]
fs/ocfs2/dlmglue.c [new file with mode: 0644]
fs/ocfs2/dlmglue.h [new file with mode: 0644]
fs/ocfs2/endian.h [new file with mode: 0644]
fs/ocfs2/export.c [new file with mode: 0644]
fs/ocfs2/export.h [new file with mode: 0644]
fs/ocfs2/extent_map.c [new file with mode: 0644]
fs/ocfs2/extent_map.h [new file with mode: 0644]
fs/ocfs2/file.c [new file with mode: 0644]
fs/ocfs2/file.h [new file with mode: 0644]
fs/ocfs2/heartbeat.c [new file with mode: 0644]
fs/ocfs2/heartbeat.h [new file with mode: 0644]
fs/ocfs2/inode.c [new file with mode: 0644]
fs/ocfs2/inode.h [new file with mode: 0644]
fs/ocfs2/journal.c [new file with mode: 0644]
fs/ocfs2/journal.h [new file with mode: 0644]
fs/ocfs2/localalloc.c [new file with mode: 0644]
fs/ocfs2/localalloc.h [new file with mode: 0644]
fs/ocfs2/mmap.c [new file with mode: 0644]
fs/ocfs2/mmap.h [new file with mode: 0644]
fs/ocfs2/namei.c [new file with mode: 0644]
fs/ocfs2/namei.h [new file with mode: 0644]
fs/ocfs2/ocfs1_fs_compat.h [new file with mode: 0644]
fs/ocfs2/ocfs2.h [new file with mode: 0644]
fs/ocfs2/ocfs2_fs.h [new file with mode: 0644]
fs/ocfs2/ocfs2_lockid.h [new file with mode: 0644]
fs/ocfs2/slot_map.c [new file with mode: 0644]
fs/ocfs2/slot_map.h [new file with mode: 0644]
fs/ocfs2/suballoc.c [new file with mode: 0644]
fs/ocfs2/suballoc.h [new file with mode: 0644]
fs/ocfs2/super.c [new file with mode: 0644]
fs/ocfs2/super.h [new file with mode: 0644]
fs/ocfs2/symlink.c [new file with mode: 0644]
fs/ocfs2/symlink.h [new file with mode: 0644]
fs/ocfs2/sysfile.c [new file with mode: 0644]
fs/ocfs2/sysfile.h [new file with mode: 0644]
fs/ocfs2/uptodate.c [new file with mode: 0644]
fs/ocfs2/uptodate.h [new file with mode: 0644]
fs/ocfs2/ver.c [new file with mode: 0644]
fs/ocfs2/ver.h [new file with mode: 0644]
fs/ocfs2/vote.c [new file with mode: 0644]
fs/ocfs2/vote.h [new file with mode: 0644]
include/linux/configfs.h [new file with mode: 0644]
include/linux/fs.h
include/linux/writeback.h
mm/filemap.c
mm/readahead.c
mm/shmem.c
mm/vmscan.c

index 7e17712..74052d2 100644
@@ -12,10 +12,14 @@ cifs.txt
        - description of the CIFS filesystem
 coda.txt
        - description of the CODA filesystem.
+configfs/
+       - directory containing configfs documentation and example code.
 cramfs.txt
        - info on the cram filesystem for small storage (ROMs etc)
 devfs/
        - directory containing devfs documentation.
+dlmfs.txt
+       - info on the userspace interface to the OCFS2 DLM.
 ext2.txt
        - info, mount options and specifications for the Ext2 filesystem.
 hpfs.txt
@@ -30,6 +34,8 @@ ntfs.txt
        - info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
        - info on Linux's /proc filesystem.
+ocfs2.txt
+       - info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
        - Description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 0000000..c4ff96b
--- /dev/null
@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+       Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+       mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+       # ls /config
+       fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+       # mkdir /config/fakenbd/disk1
+       # ls /config/fakenbd/disk1
+       target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+       # echo 10.0.0.1 > /config/fakenbd/disk1/target
+       # echo /dev/sda1 > /config/fakenbd/disk1/device
+       # echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, and the
+subsystem appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+       struct config_item {
+               char                    *ci_name;
+               char                    ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
+               struct kref             ci_kref;
+               struct list_head        ci_entry;
+               struct config_item      *ci_parent;
+               struct config_group     *ci_group;
+               struct config_item_type *ci_type;
+               struct dentry           *ci_dentry;
+       };
+
+       void config_item_init(struct config_item *);
+       void config_item_init_type_name(struct config_item *,
+                                       const char *name,
+                                       struct config_item_type *type);
+       struct config_item *config_item_get(struct config_item *);
+       void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
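+
+As an illustration only (a hedged sketch; the "widget" names here are
+hypothetical and not part of the configfs API), a subsystem might embed
+a config_item in its container like this:
+
+       struct widget {
+               struct config_item item;
+               int length;
+       };
+
+       static inline struct widget *to_widget(struct config_item *item)
+       {
+               return item ? container_of(item, struct widget, item) : NULL;
+       }
+
+One of the _init() functions would then be called on &widget->item
+before the item is handed to configfs.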
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+       struct configfs_item_operations {
+               void (*release)(struct config_item *);
+               ssize_t (*show_attribute)(struct config_item *,
+                                         struct configfs_attribute *,
+                                         char *);
+               ssize_t (*store_attribute)(struct config_item *,
+                                          struct configfs_attribute *,
+                                          const char *, size_t);
+               int (*allow_link)(struct config_item *src,
+                                 struct config_item *target);
+               int (*drop_link)(struct config_item *src,
+                                struct config_item *target);
+       };
+
+       struct config_item_type {
+               struct module                           *ct_owner;
+               struct configfs_item_operations         *ct_item_ops;
+               struct configfs_group_operations        *ct_group_ops;
+               struct configfs_attribute               **ct_attrs;
+       };
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
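+
+Continuing the hypothetical widget sketch from above, the type of a
+dynamically allocated item might look like this (illustrative names,
+not part of configfs):
+
+       static void widget_release(struct config_item *item)
+       {
+               kfree(to_widget(item));
+       }
+
+       static struct configfs_item_operations widget_item_ops = {
+               .release        = widget_release,
+       };
+
+       static struct config_item_type widget_type = {
+               .ct_owner       = THIS_MODULE,
+               .ct_item_ops    = &widget_item_ops,
+       };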
+
+[struct configfs_attribute]
+
+       struct configfs_attribute {
+               char                    *ca_name;
+               struct module           *ca_owner;
+               mode_t                  ca_mode;
+       };
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
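+
+A hedged sketch of wiring up one read-only attribute for the
+hypothetical widget (again, the names are illustrative):
+
+       static struct configfs_attribute widget_attr_length = {
+               .ca_owner       = THIS_MODULE,
+               .ca_name        = "length",
+               .ca_mode        = S_IRUGO,
+       };
+
+       static struct configfs_attribute *widget_attrs[] = {
+               &widget_attr_length,
+               NULL,
+       };
+
+       static ssize_t widget_show(struct config_item *item,
+                                  struct configfs_attribute *attr,
+                                  char *page)
+       {
+               return sprintf(page, "%d\n", to_widget(item)->length);
+       }
+
+widget_show() would be plugged into ct_item_ops->show_attribute, and
+widget_attrs into ct_attrs, on the widget's config_item_type.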
+
+[struct config_group]
+
+A config_item cannot live in a vacuum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+       struct config_group {
+               struct config_item              cg_item;
+               struct list_head                cg_children;
+               struct configfs_subsystem       *cg_subsys;
+               struct config_group             **default_groups;
+       };
+
+       void config_group_init(struct config_group *group);
+       void config_group_init_type_name(struct config_group *group,
+                                        const char *name,
+                                        struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+       struct configfs_group_operations {
+               struct config_item *(*make_item)(struct config_group *group,
+                                                const char *name);
+               struct config_group *(*make_group)(struct config_group *group,
+                                                  const char *name);
+               int (*commit_item)(struct config_item *item);
+               void (*drop_item)(struct config_group *group,
+                                 struct config_item *item);
+       };
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called
+from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, there is no need for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
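+
+A hedged sketch of the two group operations for the hypothetical
+widget item (illustrative names only):
+
+       static struct config_item *widget_make_item(struct config_group *group,
+                                                   const char *name)
+       {
+               struct widget *w;
+
+               w = kmalloc(sizeof(*w), GFP_KERNEL);
+               if (!w)
+                       return NULL;
+               memset(w, 0, sizeof(*w));
+
+               /* This reference is dropped again in drop_item(). */
+               config_item_init_type_name(&w->item, name, &widget_type);
+
+               return &w->item;
+       }
+
+       static void widget_drop_item(struct config_group *group,
+                                    struct config_item *item)
+       {
+               config_item_put(item);
+       }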
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+       struct configfs_subsystem {
+               struct config_group     su_group;
+               struct semaphore        su_sem;
+       };
+
+       int configfs_register_subsystem(struct configfs_subsystem *subsys);
+       void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+       A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+       When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c.  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy because they appear in a
+filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
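+
+For example, a subsystem might walk its top-level children like this
+(a sketch; "subsys" and "group" are assumed to be in scope):
+
+       struct config_item *child;
+
+       down(&subsys->su_sem);
+       list_for_each_entry(child, &group->cg_children, ci_entry)
+               printk(KERN_INFO "child: %s\n", config_item_name(child));
+       up(&subsys->su_sem);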
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to the target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
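+
+A hedged sketch of the two methods, again using the hypothetical
+widget type (a real subsystem would record the link in its own state):
+
+       static int widget_allow_link(struct config_item *src,
+                                    struct config_item *target)
+       {
+               /* Only accept links to other widgets. */
+               if (target->ci_type != &widget_type)
+                       return -EPERM;
+               return 0;
+       }
+
+       static void widget_drop_link(struct config_item *src,
+                                    struct config_item *target)
+       {
+               /* Tear down whatever was set up in widget_allow_link(). */
+       }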
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent) results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem that the parent group is going away,
+that notification also covers every default group child associated with
+the parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
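+
+A hedged sketch of setting up two default groups on a parent (the
+names and types are hypothetical; "m" is the parent's container,
+filled in before the parent is returned from make_group()):
+
+       struct mutant_parent {
+               struct config_group group;
+               struct config_group type1_group;
+               struct config_group type2_group;
+               struct config_group *my_default_groups[3];
+       };
+
+       config_group_init_type_name(&m->type1_group, "subgroup1",
+                                   &type1_type);
+       config_group_init_type_name(&m->type2_group, "subgroup2",
+                                   &type2_type);
+       m->my_default_groups[0] = &m->type1_group;
+       m->my_default_groups[1] = &m->type2_group;
+       m->my_default_groups[2] = NULL;
+       m->group.default_groups = m->my_default_groups;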
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem receives the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shut down, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
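+
+Were this implemented, a session might look like the following
+(hypothetical; reusing the FakeNBD example from above):
+
+       # ls /config/fakenbd
+       live  pending
+       # mkdir /config/fakenbd/pending/disk1
+       # echo 10.0.0.1 > /config/fakenbd/pending/disk1/target
+       # mv /config/fakenbd/pending/disk1 /config/fakenbd/live/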
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 0000000..f3c6e49
--- /dev/null
@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+       struct configfs_subsystem subsys;
+       int showme;
+       int storeme;
+};
+
+struct childless_attribute {
+       struct configfs_attribute attr;
+       ssize_t (*show)(struct childless *, char *);
+       ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+       return item ? container_of(to_configfs_subsystem(to_config_group(item)),
+                                  struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+                                    char *page)
+{
+       ssize_t pos;
+
+       pos = sprintf(page, "%d\n", childless->showme);
+       childless->showme++;
+
+       return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+                                     char *page)
+{
+       return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+                                      const char *page,
+                                      size_t count)
+{
+       unsigned long tmp;
+       char *p = (char *) page;
+
+       tmp = simple_strtoul(p, &p, 10);
+       if (!p || (*p && (*p != '\n')))
+               return -EINVAL;
+
+       if (tmp > INT_MAX)
+               return -ERANGE;
+
+       childless->storeme = tmp;
+
+       return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+                                         char *page)
+{
+       return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+       .attr   = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+       .show   = childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+       .attr   = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+       .show   = childless_storeme_read,
+       .store  = childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+       .attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+       .show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+       &childless_attr_showme.attr,
+       &childless_attr_storeme.attr,
+       &childless_attr_description.attr,
+       NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+                                  struct configfs_attribute *attr,
+                                  char *page)
+{
+       struct childless *childless = to_childless(item);
+       struct childless_attribute *childless_attr =
+               container_of(attr, struct childless_attribute, attr);
+       ssize_t ret = 0;
+
+       if (childless_attr->show)
+               ret = childless_attr->show(childless, page);
+       return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+                                   struct configfs_attribute *attr,
+                                   const char *page, size_t count)
+{
+       struct childless *childless = to_childless(item);
+       struct childless_attribute *childless_attr =
+               container_of(attr, struct childless_attribute, attr);
+       ssize_t ret = -EINVAL;
+
+       if (childless_attr->store)
+               ret = childless_attr->store(childless, page, count);
+       return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+       .show_attribute         = childless_attr_show,
+       .store_attribute        = childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+       .ct_item_ops    = &childless_item_ops,
+       .ct_attrs       = childless_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+       .subsys = {
+               .su_group = {
+                       .cg_item = {
+                               .ci_namebuf = "01-childless",
+                               .ci_type = &childless_type,
+                       },
+               },
+       },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+       struct config_item item;
+       int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+       return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+       .ca_owner = THIS_MODULE,
+       .ca_name = "storeme",
+       .ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+       &simple_child_attr_storeme,
+       NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+                                     struct configfs_attribute *attr,
+                                     char *page)
+{
+       ssize_t count;
+       struct simple_child *simple_child = to_simple_child(item);
+
+       count = sprintf(page, "%d\n", simple_child->storeme);
+
+       return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+                                      struct configfs_attribute *attr,
+                                      const char *page, size_t count)
+{
+       struct simple_child *simple_child = to_simple_child(item);
+       unsigned long tmp;
+       char *p = (char *) page;
+
+       tmp = simple_strtoul(p, &p, 10);
+       if (!p || (*p && (*p != '\n')))
+               return -EINVAL;
+
+       if (tmp > INT_MAX)
+               return -ERANGE;
+
+       simple_child->storeme = tmp;
+
+       return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+       kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+       .release                = simple_child_release,
+       .show_attribute         = simple_child_attr_show,
+       .store_attribute        = simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+       .ct_item_ops    = &simple_child_item_ops,
+       .ct_attrs       = simple_child_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group,
+                                                     const char *name)
+{
+       struct simple_child *simple_child;
+
+       simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+       if (!simple_child)
+               return NULL;
+
+       memset(simple_child, 0, sizeof(struct simple_child));
+
+       config_item_init_type_name(&simple_child->item, name,
+                                  &simple_child_type);
+
+       simple_child->storeme = 0;
+
+       return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+       .ca_owner = THIS_MODULE,
+       .ca_name = "description",
+       .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+       &simple_children_attr_description,
+       NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+                                        struct configfs_attribute *attr,
+                                        char *page)
+{
+       return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+       .show_attribute = simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+       .make_item      = simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+       .ct_item_ops    = &simple_children_item_ops,
+       .ct_group_ops   = &simple_children_group_ops,
+       .ct_attrs       = simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "02-simple-children",
+                       .ci_type = &simple_children_type,
+               },
+       },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+       struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group,
+                                                      const char *name)
+{
+       struct simple_children *simple_children;
+
+       simple_children = kmalloc(sizeof(struct simple_children),
+                                 GFP_KERNEL);
+       if (!simple_children)
+               return NULL;
+
+       memset(simple_children, 0, sizeof(struct simple_children));
+
+       config_group_init_type_name(&simple_children->group, name,
+                                   &simple_children_type);
+
+       return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+       .ca_owner = THIS_MODULE,
+       .ca_name = "description",
+       .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+       &group_children_attr_description,
+       NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+                                       struct configfs_attribute *attr,
+                                       char *page)
+{
+       return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+       .show_attribute = group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+       .make_group     = group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+       .ct_item_ops    = &group_children_item_ops,
+       .ct_group_ops   = &group_children_group_ops,
+       .ct_attrs       = group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "03-group-children",
+                       .ci_type = &group_children_type,
+               },
+       },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+       &childless_subsys.subsys,
+       &simple_children_subsys,
+       &group_children_subsys,
+       NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+       int ret;
+       int i;
+       struct configfs_subsystem *subsys;
+
+       for (i = 0; example_subsys[i]; i++) {
+               subsys = example_subsys[i];
+
+               config_group_init(&subsys->su_group);
+               init_MUTEX(&subsys->su_sem);
+               ret = configfs_register_subsystem(subsys);
+               if (ret) {
+                       printk(KERN_ERR "Error %d while registering subsystem %s\n",
+                              ret,
+                              subsys->su_group.cg_item.ci_namebuf);
+                       goto out_unregister;
+               }
+       }
+
+       return 0;
+
+out_unregister:
+       /* Unwind only the subsystems that registered successfully. */
+       for (i--; i >= 0; i--) {
+               configfs_unregister_subsystem(example_subsys[i]);
+       }
+
+       return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+       int i;
+
+       for (i = 0; example_subsys[i]; i++) {
+               configfs_unregister_subsystem(example_subsys[i]);
+       }
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 0000000..9afab84
--- /dev/null
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy-to-set-up, easy-to-use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
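+
+For example (a sketch; the mountpoint and domain name are arbitrary):
+
+       # mount -t ocfs2_dlmfs dlmfs /dlm
+       # mkdir /dlm/mydomain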
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking API.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory).
+
+rmdir(2) signals dlmfs to leave the domain.
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+Use open(2) with O_CREAT to ensure the resource inode is created - dlmfs
+does not automatically create inodes for existing lock resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes that obtain Read
+Only or higher level locks on the resource.
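+
+Putting the above together, a userspace sketch (the path assumes a
+domain "mydomain" under a dlmfs mount at /dlm):
+
+       #include <fcntl.h>
+       #include <unistd.h>
+
+       /* Trylock "mylock" in Exclusive mode, creating it if needed. */
+       int fd = open("/dlm/mydomain/mylock",
+                     O_RDWR | O_CREAT | O_NONBLOCK, 0600);
+       if (fd >= 0) {
+               char lvb[64];
+
+               read(fd, lvb, sizeof(lvb));     /* read the LVB */
+               close(fd);                      /* drops the lock */
+       }
+       /* fd < 0 with errno == ETXTBSY means the trylock failed. */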
+
+See Also
+========
+For more information on the VMS distributed locking API, see:
+
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 0000000..f2595ca
--- /dev/null
@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+       - sparse files
+       - extended attributes
+       - shared writeable mmap
+       - loopback is supported, but data written will not
+         be cluster coherent.
+       - quotas
+       - cluster aware flock
+       - Directory change notification (F_NOTIFY)
+       - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+       - POSIX ACLs
+       - readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1              This enables/disables barriers. barrier=0 disables it,
+                       barrier=1 enables it.
+errors=remount-ro(*)   Remount the filesystem read-only on an error.
+errors=panic           Panic and halt the machine if an error occurs.
+intr           (*)     Allow signals to interrupt cluster operations.
+nointr                 Do not allow signals to interrupt cluster
+                       operations.
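+
+For example, mounting with a few of these options (device and
+mountpoint are illustrative; the cluster must already be configured):
+
+       # mount -t ocfs2 -o errors=panic,nointr /dev/sda1 /mnt/ocfs2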
index 5daae53..e9db0d6 100644
@@ -554,6 +554,11 @@ W: http://us1.samba.org/samba/Linux_CIFS_client.html
 T:     git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:     Supported       
 
+CONFIGFS
+P:     Joel Becker
+M:     Joel Becker <joel.becker@oracle.com>
+S:     Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:     Jeff Garzik
 M:     jgarzik@pobox.com
@@ -1898,6 +1903,15 @@ M:       ajoshi@shell.unixbox.com
 L:     linux-nvidia@lists.surfsouth.com
 S:     Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:     Mark Fasheh
+M:     mark.fasheh@oracle.com
+P:     Kurt Hackel
+M:     kurt.hackel@oracle.com
+L:     ocfs2-devel@oss.oracle.com
+W:     http://oss.oracle.com/projects/ocfs2/
+S:     Supported       
+
 OLYMPIC NETWORK DRIVER
 P:     Peter De Shrijver
 M:     p2@ace.ulyssis.student.kuleuven.ac.be
index 96c664a..a452b13 100644
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
        struct address_space_operations *aops = mapping->a_ops;
        pgoff_t index;
        unsigned offset, bv_offs;
-       int len, ret = 0;
+       int len, ret;
 
        down(&mapping->host->i_sem);
        index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
                page = grab_cache_page(mapping, index);
                if (unlikely(!page))
                        goto fail;
-               if (unlikely(aops->prepare_write(file, page, offset,
-                               offset + size)))
+               ret = aops->prepare_write(file, page, offset,
+                                         offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                        goto unlock;
+               }
                transfer_result = lo_do_transfer(lo, WRITE, page, offset,
                                bvec->bv_page, bv_offs, size, IV);
                if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
                        kunmap_atomic(kaddr, KM_USER0);
                }
                flush_dcache_page(page);
-               if (unlikely(aops->commit_write(file, page, offset,
-                               offset + size)))
+               ret = aops->commit_write(file, page, offset,
+                                        offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                        goto unlock;
+               }
                if (unlikely(transfer_result))
                        goto unlock;
                bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
                unlock_page(page);
                page_cache_release(page);
        }
+       ret = 0;
 out:
        up(&mapping->host->i_sem);
        return ret;
index 68c60a5..ffd6abd 100644
@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
 * ->writepage to the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
                make_page_uptodate(page);
        SetPageDirty(page);
        if (wbc->for_reclaim)
-               return WRITEPAGE_ACTIVATE;
+               return AOP_WRITEPAGE_ACTIVATE;
        unlock_page(page);
        return 0;
 }
index d5255e6..382e3b2 100644
@@ -70,6 +70,7 @@ config FS_XIP
 
 config EXT3_FS
        tristate "Ext3 journalling file system support"
+       select JBD
        help
          This is the journaling version of the Second extended file system
          (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
          extended attributes for file security labels, say N.
 
 config JBD
-# CONFIG_JBD could be its own option (even modular), but until there are
-# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
-# dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
        tristate
-       default EXT3_FS
        help
          This is a generic journaling layer for block devices.  It is
-         currently used by the ext3 file system, but it could also be used to
-         add journal support to other file systems or block devices such as
-         RAID or LVM.
+         currently used by the ext3 and OCFS2 file systems, but it could
+         also be used to add journal support to other file systems or block
+         devices such as RAID or LVM.
 
-         If you are using the ext3 file system, you need to say Y here. If
-         you are not using ext3 then you will probably want to say N.
+         If you are using the ext3 or OCFS2 file systems, you need to
+         say Y here. If you are not using ext3 or OCFS2 then you will probably
+         want to say N.
 
          To compile this device as a module, choose M here: the module will be
-         called jbd.  If you are compiling ext3 into the kernel, you cannot
-         compile this code as a module.
+         called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
+         you cannot compile this code as a module.
 
 config JBD_DEBUG
        bool "JBD (ext3) debugging support"
@@ -326,6 +324,38 @@ config FS_POSIX_ACL
 
 source "fs/xfs/Kconfig"
 
+config OCFS2_FS
+       tristate "OCFS2 file system support (EXPERIMENTAL)"
+       depends on NET && EXPERIMENTAL
+       select CONFIGFS_FS
+       select JBD
+       select CRC32
+       select INET
+       help
+         OCFS2 is a general purpose extent based shared disk cluster file
+         system with many similarities to ext3. It supports 64 bit inode
+         numbers, and has automatically extending metadata groups which may
+         also make it attractive for non-clustered use.
+
+         You'll want to install the ocfs2-tools package in order to at least
+         get "mount.ocfs2".
+
+         Project web page:    http://oss.oracle.com/projects/ocfs2
+         Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+         OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+         Note: Features which OCFS2 does not support yet:
+                 - extended attributes
+                 - shared writeable mmap
+                 - loopback is supported, but data written will not
+                   be cluster coherent.
+                 - quotas
+                 - cluster aware flock
+                 - Directory change notification (F_NOTIFY)
+                 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+                 - POSIX ACLs
+                 - readpages / writepages (not user visible)
+
 config MINIX_FS
        tristate "Minix fs support"
        help
@@ -841,6 +871,20 @@ config RELAYFS_FS
 
          If unsure, say N.
 
+config CONFIGFS_FS
+       tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         configfs is a ram-based filesystem that provides the converse
+         of sysfs's functionality. Where sysfs is a filesystem-based
+         view of kernel objects, configfs is a filesystem-based manager
+         of kernel objects, or config_items.
+
+         Both sysfs and configfs can and should exist together on the
+         same system. One is not a replacement for the other.
+
+         If unsure, say N.
+
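
A hedged userspace sketch of reaching configfs once this option is enabled (the /config mount point is conventional, not mandated by anything in this patch; any empty directory works):

	#include <stdio.h>
	#include <sys/mount.h>

	/* Equivalent to `mount -t configfs none /config'; the /config
	 * path is an assumed mount point. */
	int main(void)
	{
		if (mount("none", "/config", "configfs", 0, NULL)) {
			perror("mount configfs");
			return 1;
		}
		return 0;
	}
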
 endmenu
 
 menu "Miscellaneous filesystems"
index 4c26557..7367611 100644 (file)
@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS)               += befs/
 obj-$(CONFIG_HOSTFS)           += hostfs/
 obj-$(CONFIG_HPPFS)            += hppfs/
 obj-$(CONFIG_DEBUG_FS)         += debugfs/
+obj-$(CONFIG_CONFIGFS_FS)      += configfs/
+obj-$(CONFIG_OCFS2_FS)         += ocfs2/
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644 (file)
index 0000000..00ffb27
--- /dev/null
@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)      += configfs.o
+
+configfs-objs  := inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644 (file)
index 0000000..8899d9c
--- /dev/null
@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+       atomic_t                s_count;
+       struct list_head        s_sibling;
+       struct list_head        s_children;
+       struct list_head        s_links;
+       void                    * s_element;
+       int                     s_type;
+       umode_t                 s_mode;
+       struct dentry           * s_dentry;
+};
+
+#define CONFIGFS_ROOT          0x0001
+#define CONFIGFS_DIR           0x0002
+#define CONFIGFS_ITEM_ATTR     0x0004
+#define CONFIGFS_ITEM_LINK     0x0020
+#define CONFIGFS_USET_DIR      0x0040
+#define CONFIGFS_USET_DEFAULT  0x0080
+#define CONFIGFS_USET_DROPPING 0x0100
+#define CONFIGFS_NOT_PINNED    (CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+                               struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+                           const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+       struct list_head sl_list;
+       struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+                               struct dentry *parent,
+                               struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+       struct configfs_dirent * sd = dentry->d_fsdata;
+       return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+       struct configfs_dirent * sd = dentry->d_fsdata;
+       return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+       struct config_item * item = NULL;
+
+       spin_lock(&dcache_lock);
+       if (!d_unhashed(dentry)) {
+               struct configfs_dirent * sd = dentry->d_fsdata;
+               if (sd->s_type & CONFIGFS_ITEM_LINK) {
+                       struct configfs_symlink * sl = sd->s_element;
+                       item = config_item_get(sl->sl_target);
+               } else
+                       item = config_item_get(sd->s_element);
+       }
+       spin_unlock(&dcache_lock);
+
+       return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+       if (!(sd->s_type & CONFIGFS_ROOT))
+               kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+       if (sd) {
+               WARN_ON(!atomic_read(&sd->s_count));
+               atomic_inc(&sd->s_count);
+       }
+       return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+       WARN_ON(!atomic_read(&sd->s_count));
+       if (atomic_dec_and_test(&sd->s_count))
+               release_configfs_dirent(sd);
+}
+
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644 (file)
index 0000000..e48b539
--- /dev/null
@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+                           struct inode * inode)
+{
+       struct configfs_dirent * sd = dentry->d_fsdata;
+
+       if (sd) {
+               BUG_ON(sd->s_dentry != dentry);
+               sd->s_dentry = NULL;
+               configfs_put(sd);
+       }
+       iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+       return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+       .d_iput         = configfs_d_iput,
+       /* simple_delete_dentry() isn't exported */
+       .d_delete       = configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+                                               void * element)
+{
+       struct configfs_dirent * sd;
+
+       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+       if (!sd)
+               return NULL;
+
+       memset(sd, 0, sizeof(*sd));
+       atomic_set(&sd->s_count, 1);
+       INIT_LIST_HEAD(&sd->s_links);
+       INIT_LIST_HEAD(&sd->s_children);
+       list_add(&sd->s_sibling, &parent_sd->s_children);
+       sd->s_element = element;
+
+       return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+                        struct dentry * dentry, void * element,
+                        umode_t mode, int type)
+{
+       struct configfs_dirent * sd;
+
+       sd = configfs_new_dirent(parent_sd, element);
+       if (!sd)
+               return -ENOMEM;
+
+       sd->s_mode = mode;
+       sd->s_type = type;
+       sd->s_dentry = dentry;
+       if (dentry) {
+               dentry->d_fsdata = configfs_get(sd);
+               dentry->d_op = &configfs_dentry_ops;
+       }
+
+       return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+       inode->i_op = &configfs_dir_inode_operations;
+       inode->i_fop = &configfs_dir_operations;
+
+       /* directory inodes start off with i_nlink == 2 (for "." entry) */
+       inode->i_nlink++;
+       return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+       inode->i_size = PAGE_SIZE;
+       inode->i_fop = &configfs_file_operations;
+       return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+       inode->i_op = &configfs_symlink_inode_operations;
+       return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+                     struct dentry * d)
+{
+       int error;
+       umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+       error = configfs_create(d, mode, init_dir);
+       if (!error) {
+               error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+                                          CONFIGFS_DIR);
+               if (!error) {
+                       p->d_inode->i_nlink++;
+                       (d)->d_op = &configfs_dentry_ops;
+               }
+       }
+       return error;
+}
+
+
+/**
+ *     configfs_create_dir - create a directory for a config_item.
+ *     @item:          config_item we're creating a directory for.
+ *     @dentry:        config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+       struct dentry * parent;
+       int error = 0;
+
+       BUG_ON(!item);
+
+       if (item->ci_parent)
+               parent = item->ci_parent->ci_dentry;
+       else if (configfs_mount && configfs_mount->mnt_sb)
+               parent = configfs_mount->mnt_sb->s_root;
+       else
+               return -EFAULT;
+
+       error = create_dir(item,parent,dentry);
+       if (!error)
+               item->ci_dentry = dentry;
+       return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+                        struct dentry *parent,
+                        struct dentry *dentry)
+{
+       int err = 0;
+       umode_t mode = S_IFLNK | S_IRWXUGO;
+
+       err = configfs_create(dentry, mode, init_symlink);
+       if (!err) {
+               err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+                                        mode, CONFIGFS_ITEM_LINK);
+               if (!err)
+                       dentry->d_op = &configfs_dentry_ops;
+       }
+       return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+       struct dentry * parent = dget(d->d_parent);
+       struct configfs_dirent * sd;
+
+       sd = d->d_fsdata;
+       list_del_init(&sd->s_sibling);
+       configfs_put(sd);
+       if (d->d_inode)
+               simple_rmdir(parent->d_inode,d);
+
+       pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+                atomic_read(&d->d_count));
+
+       dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove a config_item's directory.
+ * @item:      config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling it separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+       struct dentry * dentry = dget(item->ci_dentry);
+
+       if (!dentry)
+               return;
+
+       remove_dir(dentry);
+       /**
+        * Drop reference from dget() on entrance.
+        */
+       dput(dentry);
+}
+
+
+/* Attaches the attribute's configfs_dirent to the dentry corresponding
+ * to the attribute file.
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+       struct configfs_attribute * attr = sd->s_element;
+       int error;
+
+       error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+       if (error)
+               return error;
+
+       dentry->d_op = &configfs_dentry_ops;
+       dentry->d_fsdata = configfs_get(sd);
+       sd->s_dentry = dentry;
+       d_rehash(dentry);
+
+       return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+                                      struct dentry *dentry,
+                                      struct nameidata *nd)
+{
+       struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+       struct configfs_dirent * sd;
+       int found = 0;
+       int err = 0;
+
+       list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+               if (sd->s_type & CONFIGFS_NOT_PINNED) {
+                       const unsigned char * name = configfs_get_name(sd);
+
+                       if (strcmp(name, dentry->d_name.name))
+                               continue;
+
+                       found = 1;
+                       err = configfs_attach_attr(sd, dentry);
+                       break;
+               }
+       }
+
+       if (!found) {
+               /*
+                * If it doesn't exist and it isn't a NOT_PINNED item,
+                * it must be negative.
+                */
+               return simple_lookup(dir, dentry, nd);
+       }
+
+       return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+       struct configfs_dirent *parent_sd = dentry->d_fsdata;
+       struct configfs_dirent *sd;
+       int ret;
+
+       ret = -EBUSY;
+       if (!list_empty(&parent_sd->s_links))
+               goto out;
+
+       ret = 0;
+       list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+               if (sd->s_type & CONFIGFS_NOT_PINNED)
+                       continue;
+               if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+                       down(&sd->s_dentry->d_inode->i_sem);
+                       /* Mark that we've taken i_sem */
+                       sd->s_type |= CONFIGFS_USET_DROPPING;
+
+                       ret = configfs_detach_prep(sd->s_dentry);
+                       if (!ret)
+                               continue;
+               } else
+                       ret = -ENOTEMPTY;
+
+               break;
+       }
+
+out:
+       return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+       struct configfs_dirent *parent_sd = dentry->d_fsdata;
+       struct configfs_dirent *sd;
+
+       list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+               if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+                       configfs_detach_rollback(sd->s_dentry);
+
+                       if (sd->s_type & CONFIGFS_USET_DROPPING) {
+                               sd->s_type &= ~CONFIGFS_USET_DROPPING;
+                               up(&sd->s_dentry->d_inode->i_sem);
+                       }
+               }
+       }
+}
+
+static void detach_attrs(struct config_item * item)
+{
+       struct dentry * dentry = dget(item->ci_dentry);
+       struct configfs_dirent * parent_sd;
+       struct configfs_dirent * sd, * tmp;
+
+       if (!dentry)
+               return;
+
+       pr_debug("configfs %s: dropping attrs for  dir\n",
+                dentry->d_name.name);
+
+       parent_sd = dentry->d_fsdata;
+       list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+               if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+                       continue;
+               list_del_init(&sd->s_sibling);
+               configfs_drop_dentry(sd, dentry);
+               configfs_put(sd);
+       }
+
+       /**
+        * Drop reference from dget() on entrance.
+        */
+       dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+       struct config_item_type *t = item->ci_type;
+       struct configfs_attribute *attr;
+       int error = 0;
+       int i;
+
+       if (!t)
+               return -EINVAL;
+       if (t->ct_attrs) {
+               for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+                       if ((error = configfs_create_file(item, attr)))
+                               break;
+               }
+       }
+
+       if (error)
+               detach_attrs(item);
+
+       return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+                                struct config_item *item,
+                                struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+       struct dentry * dentry = dget(group->cg_item.ci_dentry);
+       struct dentry *child;
+       struct configfs_dirent *parent_sd;
+       struct configfs_dirent *sd, *tmp;
+
+       if (!dentry)
+               return;
+
+       parent_sd = dentry->d_fsdata;
+       list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+               if (!sd->s_element ||
+                   !(sd->s_type & CONFIGFS_USET_DEFAULT))
+                       continue;
+
+               child = sd->s_dentry;
+
+               configfs_detach_group(sd->s_element);
+               child->d_inode->i_flags |= S_DEAD;
+
+               /*
+                * From rmdir/unregister, a configfs_detach_prep() pass
+                * has taken our i_sem for us.  Drop it.
+                * From mkdir/register cleanup, there is no sem held.
+                */
+               if (sd->s_type & CONFIGFS_USET_DROPPING)
+                       up(&child->d_inode->i_sem);
+
+               d_delete(child);
+               dput(child);
+       }
+
+       /**
+        * Drop reference from dget() on entrance.
+        */
+       dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attaches it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+                               struct config_group *group)
+{
+       int ret;
+       struct qstr name;
+       struct configfs_dirent *sd;
+       /* We trust the caller holds a reference to parent */
+       struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+       if (!group->cg_item.ci_name)
+               group->cg_item.ci_name = group->cg_item.ci_namebuf;
+       name.name = group->cg_item.ci_name;
+       name.len = strlen(name.name);
+       name.hash = full_name_hash(name.name, name.len);
+
+       ret = -ENOMEM;
+       child = d_alloc(parent, &name);
+       if (child) {
+               d_add(child, NULL);
+
+               ret = configfs_attach_group(&parent_group->cg_item,
+                                           &group->cg_item, child);
+               if (!ret) {
+                       sd = child->d_fsdata;
+                       sd->s_type |= CONFIGFS_USET_DEFAULT;
+               } else {
+                       d_delete(child);
+                       dput(child);
+               }
+       }
+
+       return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+       struct config_group *new_group;
+       struct dentry *dentry = group->cg_item.ci_dentry;
+       int ret = 0;
+       int i;
+
+       if (group && group->default_groups) {
+               /* FYI, we're faking mkdir here
+                * I'm not sure we need this semaphore, as we're called
+                * from our parent's mkdir.  That holds our parent's
+                * i_sem, so afaik lookup cannot continue through our
+                * parent to find us, let alone mess with our tree.
+                * That said, taking our i_sem is closer to mkdir
+                * emulation, and shouldn't hurt. */
+               down(&dentry->d_inode->i_sem);
+
+               for (i = 0; group->default_groups[i]; i++) {
+                       new_group = group->default_groups[i];
+
+                       ret = create_default_group(group, new_group);
+                       if (ret)
+                               break;
+               }
+
+               up(&dentry->d_inode->i_sem);
+       }
+
+       if (ret)
+               detach_groups(group);
+
+       return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+       struct config_group *group;
+
+       group = item->ci_group;
+       if (group) {
+               list_del_init(&item->ci_entry);
+
+               item->ci_group = NULL;
+               item->ci_parent = NULL;
+               config_item_put(item);
+
+               config_group_put(group);
+       }
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+       /* Parent seems redundant with group, but it makes certain
+        * traversals much nicer. */
+       item->ci_parent = parent_item;
+       item->ci_group = config_group_get(to_config_group(parent_item));
+       list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+       config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+       int i;
+       struct config_group *new_group;
+
+       if (group->default_groups) {
+               for (i = 0; group->default_groups[i]; i++) {
+                       new_group = group->default_groups[i];
+                       unlink_group(new_group);
+               }
+       }
+
+       group->cg_subsys = NULL;
+       unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+       int i;
+       struct config_group *new_group;
+       struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+       link_obj(&parent_group->cg_item, &group->cg_item);
+
+       if (parent_group->cg_subsys)
+               subsys = parent_group->cg_subsys;
+       else if (configfs_is_root(&parent_group->cg_item))
+               subsys = to_configfs_subsystem(group);
+       else
+               BUG();
+       group->cg_subsys = subsys;
+
+       if (group->default_groups) {
+               for (i = 0; group->default_groups[i]; i++) {
+                       new_group = group->default_groups[i];
+                       link_group(group, new_group);
+               }
+       }
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+                               struct config_item *item,
+                               struct dentry *dentry)
+{
+       int ret;
+
+       ret = configfs_create_dir(item, dentry);
+       if (!ret) {
+               ret = populate_attrs(item);
+               if (ret) {
+                       configfs_remove_dir(item);
+                       d_delete(dentry);
+               }
+       }
+
+       return ret;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+       detach_attrs(item);
+       configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+                                struct config_item *item,
+                                struct dentry *dentry)
+{
+       int ret;
+       struct configfs_dirent *sd;
+
+       ret = configfs_attach_item(parent_item, item, dentry);
+       if (!ret) {
+               sd = dentry->d_fsdata;
+               sd->s_type |= CONFIGFS_USET_DIR;
+
+               ret = populate_groups(to_config_group(item));
+               if (ret) {
+                       configfs_detach_item(item);
+                       d_delete(dentry);
+               }
+       }
+
+       return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+       detach_groups(to_config_group(item));
+       configfs_detach_item(item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group().
+ * This function assumes that a reference is held on the item
+ * and that the item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+                            struct config_item *item)
+{
+       struct config_item_type *type;
+
+       type = parent_item->ci_type;
+       BUG_ON(!type);
+
+       if (type->ct_group_ops && type->ct_group_ops->drop_item)
+               type->ct_group_ops->drop_item(to_config_group(parent_item),
+                                               item);
+       else
+               config_item_put(item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       int ret;
+       struct config_group *group;
+       struct config_item *item;
+       struct config_item *parent_item;
+       struct configfs_subsystem *subsys;
+       struct configfs_dirent *sd;
+       struct config_item_type *type;
+       struct module *owner;
+       char *name;
+
+       if (dentry->d_parent == configfs_sb->s_root)
+               return -EPERM;
+
+       sd = dentry->d_parent->d_fsdata;
+       if (!(sd->s_type & CONFIGFS_USET_DIR))
+               return -EPERM;
+
+       parent_item = configfs_get_config_item(dentry->d_parent);
+       type = parent_item->ci_type;
+       subsys = to_config_group(parent_item)->cg_subsys;
+       BUG_ON(!subsys);
+
+       if (!type || !type->ct_group_ops ||
+           (!type->ct_group_ops->make_group &&
+            !type->ct_group_ops->make_item)) {
+               config_item_put(parent_item);
+               return -EPERM;  /* What lack-of-mkdir returns */
+       }
+
+       name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+       if (!name) {
+               config_item_put(parent_item);
+               return -ENOMEM;
+       }
+       snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+       down(&subsys->su_sem);
+       group = NULL;
+       item = NULL;
+       if (type->ct_group_ops->make_group) {
+               group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+               if (group) {
+                       link_group(to_config_group(parent_item), group);
+                       item = &group->cg_item;
+               }
+       } else {
+               item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+               if (item)
+                       link_obj(parent_item, item);
+       }
+       up(&subsys->su_sem);
+
+       kfree(name);
+       if (!item) {
+               config_item_put(parent_item);
+               return -ENOMEM;
+       }
+
+       ret = -EINVAL;
+       type = item->ci_type;
+       if (type) {
+               owner = type->ct_owner;
+               if (try_module_get(owner)) {
+                       if (group) {
+                               ret = configfs_attach_group(parent_item,
+                                                           item,
+                                                           dentry);
+                       } else {
+                               ret = configfs_attach_item(parent_item,
+                                                          item,
+                                                          dentry);
+                       }
+
+                       if (ret) {
+                               down(&subsys->su_sem);
+                               if (group)
+                                       unlink_group(group);
+                               else
+                                       unlink_obj(item);
+                               client_drop_item(parent_item, item);
+                               up(&subsys->su_sem);
+
+                               config_item_put(parent_item);
+                               module_put(owner);
+                       }
+               }
+       }
+
+       return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       struct config_item *parent_item;
+       struct config_item *item;
+       struct configfs_subsystem *subsys;
+       struct configfs_dirent *sd;
+       struct module *owner = NULL;
+       int ret;
+
+       if (dentry->d_parent == configfs_sb->s_root)
+               return -EPERM;
+
+       sd = dentry->d_fsdata;
+       if (sd->s_type & CONFIGFS_USET_DEFAULT)
+               return -EPERM;
+
+       parent_item = configfs_get_config_item(dentry->d_parent);
+       subsys = to_config_group(parent_item)->cg_subsys;
+       BUG_ON(!subsys);
+
+       if (!parent_item->ci_type) {
+               config_item_put(parent_item);
+               return -EINVAL;
+       }
+
+       ret = configfs_detach_prep(dentry);
+       if (ret) {
+               configfs_detach_rollback(dentry);
+               config_item_put(parent_item);
+               return ret;
+       }
+
+       item = configfs_get_config_item(dentry);
+
+       /* Drop reference from above, item already holds one. */
+       config_item_put(parent_item);
+
+       if (item->ci_type)
+               owner = item->ci_type->ct_owner;
+
+       if (sd->s_type & CONFIGFS_USET_DIR) {
+               configfs_detach_group(item);
+
+               down(&subsys->su_sem);
+               unlink_group(to_config_group(item));
+       } else {
+               configfs_detach_item(item);
+
+               down(&subsys->su_sem);
+               unlink_obj(item);
+       }
+
+       client_drop_item(parent_item, item);
+       up(&subsys->su_sem);
+
+       /* Drop our reference from above */
+       config_item_put(item);
+
+       module_put(owner);
+
+       return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {
+       .mkdir          = configfs_mkdir,
+       .rmdir          = configfs_rmdir,
+       .symlink        = configfs_symlink,
+       .unlink         = configfs_unlink,
+       .lookup         = configfs_lookup,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+       int error = 0;
+       struct dentry * new_dentry, * parent;
+
+       if (!strcmp(config_item_name(item), new_name))
+               return -EINVAL;
+
+       if (!item->ci_parent)
+               return -EINVAL;
+
+       down_write(&configfs_rename_sem);
+       parent = item->ci_parent->ci_dentry;
+
+       down(&parent->d_inode->i_sem);
+
+       new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+       if (!IS_ERR(new_dentry)) {
+               if (!new_dentry->d_inode) {
+                       error = config_item_set_name(item, "%s", new_name);
+                       if (!error) {
+                               d_add(new_dentry, NULL);
+                               d_move(item->ci_dentry, new_dentry);
+                       }
+                       else
+                               d_delete(new_dentry);
+               } else
+                       error = -EEXIST;
+               dput(new_dentry);
+       }
+       up(&parent->d_inode->i_sem);
+       up_write(&configfs_rename_sem);
+
+       return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+       struct dentry * dentry = file->f_dentry;
+       struct configfs_dirent * parent_sd = dentry->d_fsdata;
+
+       down(&dentry->d_inode->i_sem);
+       file->private_data = configfs_new_dirent(parent_sd, NULL);
+       up(&dentry->d_inode->i_sem);
+
+       return file->private_data ? 0 : -ENOMEM;
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+       struct dentry * dentry = file->f_dentry;
+       struct configfs_dirent * cursor = file->private_data;
+
+       down(&dentry->d_inode->i_sem);
+       list_del_init(&cursor->s_sibling);
+       up(&dentry->d_inode->i_sem);
+
+       release_configfs_dirent(cursor);
+
+       return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+       return (sd->s_mode >> 12) & 15;
+}
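
A worked example of the arithmetic (illustration only, not patch content): each DT_* constant equals the corresponding S_IF* file-type field shifted down 12 bits, which is exactly what dt_type() exploits.

	/*
	 *   S_IFDIR = 0040000; (S_IFDIR >> 12) & 15 ==  4 == DT_DIR
	 *   S_IFREG = 0100000; (S_IFREG >> 12) & 15 ==  8 == DT_REG
	 *   S_IFLNK = 0120000; (S_IFLNK >> 12) & 15 == 10 == DT_LNK
	 */
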
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+       struct dentry *dentry = filp->f_dentry;
+       struct configfs_dirent * parent_sd = dentry->d_fsdata;
+       struct configfs_dirent *cursor = filp->private_data;
+       struct list_head *p, *q = &cursor->s_sibling;
+       ino_t ino;
+       int i = filp->f_pos;
+
+       switch (i) {
+               case 0:
+                       ino = dentry->d_inode->i_ino;
+                       if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+                               break;
+                       filp->f_pos++;
+                       i++;
+                       /* fallthrough */
+               case 1:
+                       ino = parent_ino(dentry);
+                       if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+                               break;
+                       filp->f_pos++;
+                       i++;
+                       /* fallthrough */
+               default:
+                       if (filp->f_pos == 2) {
+                               list_del(q);
+                               list_add(q, &parent_sd->s_children);
+                       }
+                       for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+                               struct configfs_dirent *next;
+                               const char * name;
+                               int len;
+
+                               next = list_entry(p, struct configfs_dirent,
+                                                  s_sibling);
+                               if (!next->s_element)
+                                       continue;
+
+                               name = configfs_get_name(next);
+                               len = strlen(name);
+                               if (next->s_dentry)
+                                       ino = next->s_dentry->d_inode->i_ino;
+                               else
+                                       ino = iunique(configfs_sb, 2);
+
+                               if (filldir(dirent, name, len, filp->f_pos, ino,
+                                                dt_type(next)) < 0)
+                                       return 0;
+
+                               list_del(q);
+                               list_add(q, p);
+                               p = q;
+                               filp->f_pos++;
+                       }
+       }
+       return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+       struct dentry * dentry = file->f_dentry;
+
+       down(&dentry->d_inode->i_sem);
+       switch (origin) {
+               case 1:
+                       offset += file->f_pos;
+               case 0:
+                       if (offset >= 0)
+                               break;
+               default:
+                       up(&file->f_dentry->d_inode->i_sem);
+                       return -EINVAL;
+       }
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               if (file->f_pos >= 2) {
+                       struct configfs_dirent *sd = dentry->d_fsdata;
+                       struct configfs_dirent *cursor = file->private_data;
+                       struct list_head *p;
+                       loff_t n = file->f_pos - 2;
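
The helpers above define the reference-counting contract for configfs_dirent. A minimal sketch of the expected pairing (the caller and surrounding locking are hypothetical):

	/* Pin the dirent while we look at its element, then release it;
	 * configfs_put() frees the dirent on the last reference, unless
	 * it is the pinned CONFIGFS_ROOT entry. */
	struct configfs_dirent *sd = configfs_get(dentry->d_fsdata);
	/* ... use sd->s_element ... */
	configfs_put(sd);
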
+
+                       list_del(&cursor->s_sibling);
+                       p = sd->s_children.next;
+                       while (n && p != &sd->s_children) {
+                               struct configfs_dirent *next;
+                               next = list_entry(p, struct configfs_dirent,
+                                                  s_sibling);
+                               if (next->s_element)
+                                       n--;
+                               p = p->next;
+                       }
+                       list_add_tail(&cursor->s_sibling, p);
+               }
+       }
+       up(&dentry->d_inode->i_sem);
+       return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+       .open           = configfs_dir_open,
+       .release        = configfs_dir_close,
+       .llseek         = configfs_dir_lseek,
+       .read           = generic_read_dir,
+       .readdir        = configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+       int err;
+       struct config_group *group = &subsys->su_group;
+       struct qstr name;
+       struct dentry *dentry;
+       struct configfs_dirent *sd;
+
+       err = configfs_pin_fs();
+       if (err)
+               return err;
+
+       if (!group->cg_item.ci_name)
+               group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+       sd = configfs_sb->s_root->d_fsdata;
+       link_group(to_config_group(sd->s_element), group);
+
+       down(&configfs_sb->s_root->d_inode->i_sem);
+
+       name.name = group->cg_item.ci_name;
+       name.len = strlen(name.name);
+       name.hash = full_name_hash(name.name, name.len);
+
+       err = -ENOMEM;
+       dentry = d_alloc(configfs_sb->s_root, &name);
+       if (!dentry)
+               goto out_release;
+
+       d_add(dentry, NULL);
+
+       err = configfs_attach_group(sd->s_element, &group->cg_item,
+                                   dentry);
+       if (!err)
+               dentry = NULL;
+       else
+               d_delete(dentry);
+
+       up(&configfs_sb->s_root->d_inode->i_sem);
+
+       if (dentry) {
+           dput(dentry);
+out_release:
+           unlink_group(group);
+           configfs_release_fs();
+       }
+
+       return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+       struct config_group *group = &subsys->su_group;
+       struct dentry *dentry = group->cg_item.ci_dentry;
+
+       if (dentry->d_parent != configfs_sb->s_root) {
+               printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+               return;
+       }
+
+       down(&configfs_sb->s_root->d_inode->i_sem);
+       down(&dentry->d_inode->i_sem);
+       if (configfs_detach_prep(dentry)) {
+               printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+       }
+       configfs_detach_group(&group->cg_item);
+       dentry->d_inode->i_flags |= S_DEAD;
+       up(&dentry->d_inode->i_sem);
+
+       d_delete(dentry);
+
+       up(&configfs_sb->s_root->d_inode->i_sem);
+
+       dput(dentry);
+
+       unlink_group(group);
+       configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);
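
A minimal sketch of a client of this registration API (all "my_*" names are hypothetical; it uses only structures and exports visible in this patch, so the group's child list is initialized by hand rather than through any group-level init helper):

	#include <linux/module.h>
	#include <linux/configfs.h>
	#include <asm/semaphore.h>

	static struct config_item_type my_subsys_type = {
		.ct_owner	= THIS_MODULE,
	};

	static struct configfs_subsystem my_subsys = {
		.su_group = {
			.cg_item = {
				.ci_namebuf	= "my_subsys",
				.ci_type	= &my_subsys_type,
			},
		},
	};

	static int __init my_subsys_init(void)
	{
		/* Registration picks up ci_namebuf since ci_name is NULL. */
		config_item_init(&my_subsys.su_group.cg_item);
		INIT_LIST_HEAD(&my_subsys.su_group.cg_children);
		init_MUTEX(&my_subsys.su_sem);
		return configfs_register_subsystem(&my_subsys);
	}

	static void __exit my_subsys_exit(void)
	{
		configfs_unregister_subsystem(&my_subsys);
	}

	module_init(my_subsys_init);
	module_exit(my_subsys_exit);
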
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644 (file)
index 0000000..af1ffc9
--- /dev/null
@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+       size_t                  count;
+       loff_t                  pos;
+       char                    * page;
+       struct configfs_item_operations * ops;
+       struct semaphore        sem;
+       int                     needs_read_fill;
+};
+
+
+/**
+ *     fill_read_buffer - allocate and fill buffer from item.
+ *     @dentry:        dentry pointer.
+ *     @buffer:        data buffer for file.
+ *
+ *     Allocate @buffer->page, if it hasn't been already, then call the
+ *     config_item's show() method to fill the buffer with this attribute's
+ *     data.
+ *     This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+       struct configfs_attribute * attr = to_attr(dentry);
+       struct config_item * item = to_item(dentry->d_parent);
+       struct configfs_item_operations * ops = buffer->ops;
+       int ret = 0;
+       ssize_t count;
+
+       if (!buffer->page)
+               buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+       if (!buffer->page)
+               return -ENOMEM;
+
+       count = ops->show_attribute(item,attr,buffer->page);
+       buffer->needs_read_fill = 0;
+       BUG_ON(count > (ssize_t)PAGE_SIZE);
+       if (count >= 0)
+               buffer->count = count;
+       else
+               ret = count;
+       return ret;
+}
+
+
+/**
+ *     flush_read_buffer - push buffer to userspace.
+ *     @buffer:        data buffer for file.
+ *     @userbuf:       user-passed buffer.
+ *     @count:         number of bytes requested.
+ *     @ppos:          file position.
+ *
+ *     Copy the buffer we filled in fill_read_buffer() to userspace.
+ *     This is done at the reader's leisure, copying and advancing
+ *     the amount they specify each time.
+ *     This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+                            size_t count, loff_t * ppos)
+{
+       int error;
+
+       if (*ppos > buffer->count)
+               return 0;
+
+       if (count > (buffer->count - *ppos))
+               count = buffer->count - *ppos;
+
+       error = copy_to_user(buf,buffer->page + *ppos,count);
+       if (!error)
+               *ppos += count;
+       return error ? -EFAULT : count;
+}
+
+/**
+ *     configfs_read_file - read an attribute.
+ *     @file:  file pointer.
+ *     @buf:   buffer to fill.
+ *     @count: number of bytes to read.
+ *     @ppos:  starting offset in file.
+ *
+ *     Userspace wants to read an attribute file. The attribute descriptor
+ *     is in the file dentry's ->d_fsdata. The target item is in the
+ *     parent directory dentry's ->d_fsdata.
+ *
+ *     We call fill_read_buffer() to allocate and fill the buffer from the
+ *     item's show() method exactly once (if the read is happening from
+ *     the beginning of the file). That should fill the entire buffer with
+ *     all the data the item has to offer for that attribute.
+ *     We then call flush_read_buffer() to copy the buffer to userspace
+ *     in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+       struct configfs_buffer * buffer = file->private_data;
+       ssize_t retval = 0;
+
+       down(&buffer->sem);
+       if (buffer->needs_read_fill) {
+               if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+                       goto out;
+       }
+       pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
+                __FUNCTION__,count,*ppos,buffer->page);
+       retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+       up(&buffer->sem);
+       return retval;
+}
+
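
For the other side of this call chain, a sketch of the show method a client might register (struct my_item and its field are hypothetical; the signature is inferred from the ops->show_attribute() call in fill_read_buffer() above):

	#include <linux/configfs.h>

	struct my_item {
		struct config_item	ci;
		int			value;
	};

	static ssize_t my_show_attribute(struct config_item *item,
					 struct configfs_attribute *attr,
					 char *page)
	{
		struct my_item *mi = container_of(item, struct my_item, ci);

		/* Must fit in one page; fill_read_buffer() BUG()s on more. */
		return sprintf(page, "%d\n", mi->value);
	}
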
+
+/**
+ *     fill_write_buffer - copy buffer from userspace.
+ *     @buffer:        data buffer for file.
+ *     @userbuf:       data from user.
+ *     @count:         number of bytes in @userbuf.
+ *
+ *     Allocate @buffer->page if it hasn't been already, then
+ *     copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+       int error;
+
+       if (!buffer->page)
+               buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+       if (!buffer->page)
+               return -ENOMEM;
+
+       if (count > PAGE_SIZE)
+               count = PAGE_SIZE;
+       error = copy_from_user(buffer->page,buf,count);
+       buffer->needs_read_fill = 1;
+       return error ? -EFAULT : count;
+}
+
+
+/**
+ *     flush_write_buffer - push buffer to config_item.
+ *     @file:          file pointer.
+ *     @buffer:        data buffer for file.
+ *
+ *     Get the correct pointers for the config_item and the attribute we're
+ *     dealing with, then call the store() method for the attribute,
+ *     passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+       struct configfs_attribute * attr = to_attr(dentry);
+       struct config_item * item = to_item(dentry->d_parent);
+       struct configfs_item_operations * ops = buffer->ops;
+
+       return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *     configfs_write_file - write an attribute.
+ *     @file:  file pointer
+ *     @buf:   data to write
+ *     @count: number of bytes
+ *     @ppos:  starting offset
+ *
+ *     Similar to configfs_read_file(), though working in the opposite direction.
+ *     We allocate and fill the data from the user in fill_write_buffer(),
+ *     then push it to the config_item in flush_write_buffer().
+ *     There is no easy way for us to know if userspace is only doing a
+ *     partial write, so we don't support partial writes. We expect the
+ *     entire buffer to come in on the first write.
+ *     Hint: if you're writing a value, first read the file, modify only
+ *     the value you're changing, then write the entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+       struct configfs_buffer * buffer = file->private_data;
+
+       down(&buffer->sem);
+       count = fill_write_buffer(buffer,buf,count);
+       if (count > 0)
+               count = flush_write_buffer(file->f_dentry,buffer,count);
+       if (count > 0)
+               *ppos += count;
+       up(&buffer->sem);
+       return count;
+}
+
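
And the mirror-image store method (again hypothetical, reusing struct my_item from the sketch above; the signature is inferred from the ops->store_attribute() call in flush_write_buffer()):

	static ssize_t my_store_attribute(struct config_item *item,
					  struct configfs_attribute *attr,
					  const char *page, size_t count)
	{
		struct my_item *mi = container_of(item, struct my_item, ci);

		/* Whole-buffer semantics: parse the complete value each time. */
		mi->value = simple_strtol(page, NULL, 10);
		return count;
	}
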
+static int check_perm(struct inode * inode, struct file * file)
+{
+       struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+       struct configfs_attribute * attr = to_attr(file->f_dentry);
+       struct configfs_buffer * buffer;
+       struct configfs_item_operations * ops = NULL;
+       int error = 0;
+
+       if (!item || !attr)
+               goto Einval;
+
+       /* Grab the module reference for this attribute if we have one */
+       if (!try_module_get(attr->ca_owner)) {
+               error = -ENODEV;
+               goto Done;
+       }
+
+       if (item->ci_type)
+               ops = item->ci_type->ct_item_ops;
+       else
+               goto Eaccess;
+
+       /* File needs write support.
+        * The inode's perms must say it's ok,
+        * and we must have a store method.
+        */
+       if (file->f_mode & FMODE_WRITE) {
+
+               if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+                       goto Eaccess;
+
+       }
+
+       /* File needs read support.
+        * The inode's perms must say it's ok, and there
+        * must be a show method for it.
+        */
+       if (file->f_mode & FMODE_READ) {
+               if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+                       goto Eaccess;
+       }
+
+       /* No error? Great, allocate a buffer for the file, and store it
+        * in file->private_data for easy access.
+        */
+       buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+       if (buffer) {
+               memset(buffer,0,sizeof(struct configfs_buffer));
+               init_MUTEX(&buffer->sem);
+               buffer->needs_read_fill = 1;
+               buffer->ops = ops;
+               file->private_data = buffer;
+       } else
+               error = -ENOMEM;
+       goto Done;
+
+ Einval:
+       error = -EINVAL;
+       goto Done;
+ Eaccess:
+       error = -EACCES;
+       module_put(attr->ca_owner);
+ Done:
+       if (error && item)
+               config_item_put(item);
+       return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+       return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+       struct config_item * item = to_item(filp->f_dentry->d_parent);
+       struct configfs_attribute * attr = to_attr(filp->f_dentry);
+       struct module * owner = attr->ca_owner;
+       struct configfs_buffer * buffer = filp->private_data;
+
+       if (item)
+               config_item_put(item);
+       /* After this point, attr should not be accessed. */
+       module_put(owner);
+
+       if (buffer) {
+               if (buffer->page)
+                       free_page((unsigned long)buffer->page);
+               kfree(buffer);
+       }
+       return 0;
+}
+
+struct file_operations configfs_file_operations = {
+       .read           = configfs_read_file,
+       .write          = configfs_write_file,
+       .llseek         = generic_file_llseek,
+       .open           = configfs_open_file,
+       .release        = configfs_release,
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+       struct configfs_dirent * parent_sd = dir->d_fsdata;
+       umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+       int error = 0;
+
+       down(&dir->d_inode->i_sem);
+       error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
+       up(&dir->d_inode->i_sem);
+
+       return error;
+}
+
+
+/**
+ *     configfs_create_file - create an attribute file for an item.
+ *     @item:  item we're creating for.
+ *     @attr:  attribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+       BUG_ON(!item || !item->ci_dentry || !attr);
+
+       return configfs_add_file(item->ci_dentry, attr,
+                                CONFIGFS_ITEM_ATTR);
+}
+
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644 (file)
index 0000000..6b274c6
--- /dev/null
@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {
+       .readpage       = simple_readpage,
+       .prepare_write  = simple_prepare_write,
+       .commit_write   = simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {
+       .ra_pages       = 0,    /* No readahead */
+       .capabilities   = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+       struct inode * inode = new_inode(configfs_sb);
+       if (inode) {
+               inode->i_mode = mode;
+               inode->i_uid = 0;
+               inode->i_gid = 0;
+               inode->i_blksize = PAGE_CACHE_SIZE;
+               inode->i_blocks = 0;
+               inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               inode->i_mapping->a_ops = &configfs_aops;
+               inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+       }
+       return inode;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+       int error = 0;
+       struct inode * inode = NULL;
+       if (dentry) {
+               if (!dentry->d_inode) {
+                       if ((inode = configfs_new_inode(mode))) {
+                               if (dentry->d_parent && dentry->d_parent->d_inode) {
+                                       struct inode *p_inode = dentry->d_parent->d_inode;
+                                       p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+                               }
+                               goto Proceed;
+                       }
+                       else
+                               error = -ENOMEM;
+               } else
+                       error = -EEXIST;
+       } else
+               error = -ENOENT;
+       goto Done;
+
+ Proceed:
+       if (init)
+               error = init(inode);
+       if (!error) {
+               d_instantiate(dentry, inode);
+               if (S_ISDIR(mode) || S_ISLNK(mode))
+                       dget(dentry);  /* pin link and directory dentries in core */
+       } else
+               iput(inode);
+ Done:
+       return error;
+}
+
+/*
+ * Get the name of the element represented by the given configfs_dirent.
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+       struct attribute * attr;
+
+       if (!sd || !sd->s_element)
+               BUG();
+
+       /* These always have a dentry, so use that */
+       if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+               return sd->s_dentry->d_name.name;
+
+       if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+               attr = sd->s_element;
+               return attr->name;
+       }
+       return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to the given configfs_dirent.
+ * Called with the parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+       struct dentry * dentry = sd->s_dentry;
+
+       if (dentry) {
+               spin_lock(&dcache_lock);
+               if (!(d_unhashed(dentry) && dentry->d_inode)) {
+                       dget_locked(dentry);
+                       __d_drop(dentry);
+                       spin_unlock(&dcache_lock);
+                       simple_unlink(parent->d_inode, dentry);
+               } else
+                       spin_unlock(&dcache_lock);
+       }
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+       struct configfs_dirent * sd;
+       struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+       down(&dir->d_inode->i_sem);
+       list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+               if (!sd->s_element)
+                       continue;
+               if (!strcmp(configfs_get_name(sd), name)) {
+                       list_del_init(&sd->s_sibling);
+                       configfs_drop_dentry(sd, dir);
+                       configfs_put(sd);
+                       break;
+               }
+       }
+       up(&dir->d_inode->i_sem);
+}
+
+
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644 (file)
index 0000000..e07485a
--- /dev/null
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on kobject:
+ *     kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+       return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Forward declaration for the kref release callback defined below. */
+static void config_item_release(struct kref *kref);
+
+/**
+ *     config_item_init - initialize item.
+ *     @item:  item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+       kref_init(&item->ci_kref);
+       INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *     config_item_set_name - Set the name of an item
+ *     @item:  item.
+ *     @fmt:   printf-style format string for the name.
+ *
+ *     If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *     dynamically allocated string that @item->ci_name points to.
+ *     Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+       int error = 0;
+       int limit = CONFIGFS_ITEM_NAME_LEN;
+       int need;
+       va_list args;
+       char * name;
+
+       /*
+        * First, try the static array
+        */
+       va_start(args,fmt);
+       need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+       va_end(args);
+       if (need < limit)
+               name = item->ci_namebuf;
+       else {
+               /*
+                * Need more space? Allocate it and try again
+                */
+               limit = need + 1;
+               name = kmalloc(limit,GFP_KERNEL);
+               if (!name) {
+                       error = -ENOMEM;
+                       goto Done;
+               }
+               va_start(args,fmt);
+               need = vsnprintf(name,limit,fmt,args);
+               va_end(args);
+
+               /* Still? Give up. */
+               if (need >= limit) {
+                       kfree(name);
+                       error = -EFAULT;
+                       goto Done;
+               }
+       }
+
+       /* Free the old name, if necessary. */
+       if (item->ci_name && item->ci_name != item->ci_namebuf)
+               kfree(item->ci_name);
+
+       /* Now, set the new name */
+       item->ci_name = name;
+ Done:
+       return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
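+
+/*
+ * A usage sketch (item and format below are hypothetical): names
+ * shorter than CONFIGFS_ITEM_NAME_LEN land in the static ci_namebuf,
+ * longer ones get a kmalloc()ed buffer, and the caller mainly has to
+ * watch for -ENOMEM.
+ *
+ *	error = config_item_set_name(item, "widget-%d", nr);
+ *	if (error)
+ *		return error;
+ */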
+
+void config_item_init_type_name(struct config_item *item,
+                               const char *name,
+                               struct config_item_type *type)
+{
+       config_item_set_name(item, name);
+       item->ci_type = type;
+       config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+                        struct config_item_type *type)
+{
+       config_item_set_name(&group->cg_item, name);
+       group->cg_item.ci_type = type;
+       config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+       if (item)
+               kref_get(&item->ci_kref);
+       return item;
+}
+
+/**
+ *     config_item_cleanup - free config_item resources.
+ *     @item:  item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+       struct config_item_type * t = item->ci_type;
+       struct config_group * s = item->ci_group;
+       struct config_item * parent = item->ci_parent;
+
+       pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+       if (item->ci_name != item->ci_namebuf)
+               kfree(item->ci_name);
+       item->ci_name = NULL;
+       if (t && t->ct_item_ops && t->ct_item_ops->release)
+               t->ct_item_ops->release(item);
+       if (s)
+               config_group_put(s);
+       if (parent)
+               config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+       config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *     config_item_put - decrement refcount for item.
+ *     @item:  item.
+ *
+ *     Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+       if (item)
+               kref_put(&item->ci_kref, config_item_release);
+}
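+
+/*
+ * Reference counting sketch (pointer names hypothetical): get/put
+ * pair like any kref user, and the final put winds up in
+ * config_item_cleanup() via config_item_release().
+ *
+ *	struct config_item * item = config_item_get(some_item);
+ *	...use item...
+ *	config_item_put(item);
+ */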
+
+
+/**
+ *     config_group_init - initialize a group for use
+ *     @group: group to initialize.
+ */
+
+void config_group_init(struct config_group *group)
+{
+       config_item_init(&group->cg_item);
+       INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *     config_group_find_obj - search for item in group.
+ *     @group: group we're looking in.
+ *     @name:  item's name.
+ *
+ *     Iterate over @group->cg_children looking for a matching
+ *     config_item.  If a matching item is found, take a reference
+ *     and return it.  Locking is left to the caller for now (note
+ *     the XXX comment below).
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+       struct list_head * entry;
+       struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+       list_for_each(entry,&group->cg_children) {
+               struct config_item * item = to_item(entry);
+               if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+                       ret = config_item_get(item);
+                       break;
+               }
+       }
+       return ret;
+}
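+
+/*
+ * Lookup sketch (group and name are hypothetical): the item, if
+ * found, comes back with a reference held that the caller must drop.
+ *
+ *	struct config_item * child;
+ *
+ *	child = config_group_find_obj(group, "alpha");
+ *	if (child) {
+ *		...use child...
+ *		config_item_put(child);
+ *	}
+ */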
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644 (file)
index 0000000..1a2f6f6
--- /dev/null
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+       .statfs         = simple_statfs,
+       .drop_inode     = generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+       .cg_item = {
+               .ci_namebuf     = "root",
+               .ci_name        = configfs_root_group.cg_item.ci_namebuf,
+       },
+};
+
+int configfs_is_root(struct config_item *item)
+{
+       return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+       .s_sibling      = LIST_HEAD_INIT(configfs_root.s_sibling),
+       .s_children     = LIST_HEAD_INIT(configfs_root.s_children),
+       .s_element      = &configfs_root_group.cg_item,
+       .s_type         = CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+       struct inode *inode;
+       struct dentry *root;
+
+       sb->s_blocksize = PAGE_CACHE_SIZE;
+       sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+       sb->s_magic = CONFIGFS_MAGIC;
+       sb->s_op = &configfs_ops;
+       configfs_sb = sb;
+
+       inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+       if (inode) {
+               inode->i_op = &configfs_dir_inode_operations;
+               inode->i_fop = &configfs_dir_operations;
+               /* directory inodes start off with i_nlink == 2 (for "." entry) */
+               inode->i_nlink++;
+       } else {
+               pr_debug("configfs: could not get root inode\n");
+               return -ENOMEM;
+       }
+
+       root = d_alloc_root(inode);
+       if (!root) {
+               pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+               iput(inode);
+               return -ENOMEM;
+       }
+       config_group_init(&configfs_root_group);
+       configfs_root_group.cg_item.ci_dentry = root;
+       root->d_fsdata = &configfs_root;
+       sb->s_root = root;
+       return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+       int flags, const char *dev_name, void *data)
+{
+       return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "configfs",
+       .get_sb         = configfs_get_sb,
+       .kill_sb        = kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+       return simple_pin_fs("configfs", &configfs_mount,
+                            &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+       simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
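+
+/*
+ * Pinning sketch (the error handling belongs to a hypothetical
+ * caller): code that needs the configfs superblock to stay around
+ * brackets its work with these two helpers.
+ *
+ *	rc = configfs_pin_fs();
+ *	if (rc)
+ *		return rc;
+ *	...create dentries under configfs_sb...
+ *	configfs_release_fs();
+ */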
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+       int err;
+
+       kset_set_kset_s(&config_subsys, kernel_subsys);
+       err = subsystem_register(&config_subsys);
+       if (err)
+               return err;
+
+       err = register_filesystem(&configfs_fs_type);
+       if (err) {
+               printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+               subsystem_unregister(&config_subsys);
+       }
+
+       return err;
+}
+
+static void __exit configfs_exit(void)
+{
+       unregister_filesystem(&configfs_fs_type);
+       subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644 (file)
index 0000000..50f5840
--- /dev/null
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *     sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+       struct config_item * p = item;
+       int depth = 0;
+       do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+       return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+       struct config_item * p = item;
+       int length = 1;
+       do {
+               length += strlen(config_item_name(p)) + 1;
+               p = p->ci_parent;
+       } while (p && !configfs_is_root(p));
+       return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+       struct config_item * p;
+
+       --length;
+       for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+               int cur = strlen(config_item_name(p));
+
+               /* back up enough to print this item's name with '/' */
+               length -= cur;
+               strncpy(buffer + length,config_item_name(p),cur);
+               *(buffer + --length) = '/';
+       }
+}
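+
+/*
+ * A worked example with a hypothetical item "b" in group "a":
+ * item_path_length() returns 1 + 2 + 2 = 5, and fill_item_path()
+ * writes the buffer from the tail backwards, yielding "/a/b" with
+ * the final byte left untouched as the terminating NUL.
+ */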
+
+static int create_link(struct config_item *parent_item,
+                      struct config_item *item,
+                      struct dentry *dentry)
+{
+       struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+       struct configfs_symlink *sl;
+       int ret;
+
+       ret = -ENOMEM;
+       sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+       if (sl) {
+               sl->sl_target = config_item_get(item);
+               /* FIXME: needs a lock, I'd bet */
+               list_add(&sl->sl_list, &target_sd->s_links);
+               ret = configfs_create_link(sl, parent_item->ci_dentry,
+                                          dentry);
+               if (ret) {
+                       list_del_init(&sl->sl_list);
+                       config_item_put(item);
+                       kfree(sl);
+               }
+       }
+
+       return ret;
+}
+
+
+static int get_target(const char *symname, struct nameidata *nd,
+                     struct config_item **target)
+{
+       int ret;
+
+       ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+       if (!ret) {
+               if (nd->dentry->d_sb == configfs_sb) {
+                       *target = configfs_get_config_item(nd->dentry);
+                       if (!*target) {
+                               ret = -ENOENT;
+                               path_release(nd);
+                       }
+               } else
+                       ret = -EPERM;
+       }
+
+       return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+       int ret;
+       struct nameidata nd;
+       struct config_item *parent_item;
+       struct config_item *target_item;
+       struct config_item_type *type;
+
+       ret = -EPERM;  /* What lack-of-symlink returns */
+       if (dentry->d_parent == configfs_sb->s_root)
+               goto out;
+
+       parent_item = configfs_get_config_item(dentry->d_parent);
+       type = parent_item->ci_type;
+
+       if (!type || !type->ct_item_ops ||
+           !type->ct_item_ops->allow_link)
+               goto out_put;
+
+       ret = get_target(symname, &nd, &target_item);
+       if (ret)
+               goto out_put;
+
+       ret = type->ct_item_ops->allow_link(parent_item, target_item);
+       if (!ret)
+               ret = create_link(parent_item, target_item, dentry);
+
+       config_item_put(target_item);
+       path_release(&nd);
+
+out_put:
+       config_item_put(parent_item);
+
+out:
+       return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+       struct configfs_dirent *sd = dentry->d_fsdata;
+       struct configfs_symlink *sl;
+       struct config_item *parent_item;
+       struct config_item_type *type;
+       int ret;
+
+       ret = -EPERM;  /* What lack-of-symlink returns */
+       if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+               goto out;
+
+       if (dentry->d_parent == configfs_sb->s_root)
+               BUG();
+
+       sl = sd->s_element;
+
+       parent_item = configfs_get_config_item(dentry->d_parent);
+       type = parent_item->ci_type;
+
+       list_del_init(&sd->s_sibling);
+       configfs_drop_dentry(sd, dentry->d_parent);
+       dput(dentry);
+       configfs_put(sd);
+
+       /*
+        * drop_link() must be called before
+        * list_del_init(&sl->sl_list), so that the order of
+        * drop_link(this, target) and drop_item(target) is preserved.
+        */
+       if (type && type->ct_item_ops &&
+           type->ct_item_ops->drop_link)
+               type->ct_item_ops->drop_link(parent_item,
+                                              sl->sl_target);
+
+       /* FIXME: Needs lock */
+       list_del_init(&sl->sl_list);
+
+       /* Put reference from create_link() */
+       config_item_put(sl->sl_target);
+       kfree(sl);
+
+       config_item_put(parent_item);
+
+       ret = 0;
+
+out:
+       return ret;
+}
+
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+                                  char *path)
+{
+       char * s;
+       int depth, size;
+
+       depth = item_depth(item);
+       size = item_path_length(target) + depth * 3 - 1;
+       if (size > PATH_MAX)
+               return -ENAMETOOLONG;
+
+       pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+       for (s = path; depth--; s += 3)
+               strcpy(s,"../");
+
+       fill_item_path(target, path, size);
+       pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+       return 0;
+}
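+
+/*
+ * A worked example with hypothetical items: a link item at depth 2
+ * whose target's path is "/a/b" gives size = 5 + 2 * 3 - 1 = 10.
+ * The loop writes "../../", then fill_item_path() lays "/a/b" over
+ * the tail so the target's leading '/' lands on the final '/' of
+ * the last "../", producing "../../a/b".
+ */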
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+       struct config_item *item, *target_item;
+       int error = 0;
+
+       item = configfs_get_config_item(dentry->d_parent);
+       if (!item)
+               return -EINVAL;
+
+       target_item = configfs_get_config_item(dentry);
+       if (!target_item) {
+               config_item_put(item);
+               return -EINVAL;
+       }
+
+       down_read(&configfs_rename_sem);
+       error = configfs_get_target_path(item, target_item, path);
+       up_read(&configfs_rename_sem);
+
+       config_item_put(item);
+       config_item_put(target_item);
+       return error;
+}
+
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+       int error = -ENOMEM;
+       unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+       if (page) {
+               error = configfs_getlink(dentry, (char *)page);
+               if (!error) {
+                       nd_set_link(nd, (char *)page);
+                       return (void *)page;
+               }
+       }
+
+       nd_set_link(nd, ERR_PTR(error));
+       return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+                             void *cookie)
+{
+       if (cookie) {
+               unsigned long page = (unsigned long)cookie;
+               free_page(page);
+       }
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+       .follow_link = configfs_follow_link,
+       .readlink = generic_readlink,
+       .put_link = configfs_put_link,
+};
+
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdd..f1d2d02 100644 (file)
@@ -721,7 +721,7 @@ retry:
                                                &last_block_in_bio, &ret, wbc,
                                                page->mapping->a_ops->writepage);
                        }
-                       if (unlikely(ret == WRITEPAGE_ACTIVATE))
+                       if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
                                unlock_page(page);
                        if (ret || (--(wbc->nr_to_write) <= 0))
                                done = 1;
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644 (file)
index 0000000..7d3be84
--- /dev/null
@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+       alloc.o                 \
+       aops.o                  \
+       buffer_head_io.o        \
+       dcache.o                \
+       dir.o                   \
+       dlmglue.o               \
+       export.o                \
+       extent_map.o            \
+       file.o                  \
+       heartbeat.o             \
+       inode.o                 \
+       journal.o               \
+       localalloc.o            \
+       mmap.o                  \
+       namei.o                 \
+       slot_map.o              \
+       suballoc.o              \
+       super.o                 \
+       symlink.o               \
+       sysfile.o               \
+       uptodate.o              \
+       ver.o                   \
+       vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644 (file)
index 0000000..465f797
--- /dev/null
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+                              struct ocfs2_extent_rec *ext,
+                              u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+                                    struct ocfs2_journal_handle *handle,
+                                    struct inode *inode,
+                                    int wanted,
+                                    struct ocfs2_alloc_context *meta_ac,
+                                    struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+                           struct ocfs2_journal_handle *handle,
+                           struct inode *inode,
+                           struct buffer_head *fe_bh,
+                           struct buffer_head *eb_bh,
+                           struct buffer_head *last_eb_bh,
+                           struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+                                 struct ocfs2_journal_handle *handle,
+                                 struct inode *inode,
+                                 struct buffer_head *fe_bh,
+                                 struct ocfs2_alloc_context *meta_ac,
+                                 struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+                                 struct ocfs2_journal_handle *handle,
+                                 struct inode *inode,
+                                 struct buffer_head *fe_bh,
+                                 u64 blkno,
+                                 u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+                                   struct inode *inode,
+                                   struct buffer_head *fe_bh,
+                                   struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+                                      struct inode *inode,
+                                      struct ocfs2_dinode *fe,
+                                      unsigned int new_i_clusters,
+                                      struct buffer_head *old_last_eb,
+                                      struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+                              struct ocfs2_extent_rec *ext,
+                              u64 blkno)
+{
+       return blkno == (le64_to_cpu(ext->e_blkno) +
+                        ocfs2_clusters_to_blocks(inode->i_sb,
+                                                 le32_to_cpu(ext->e_clusters)));
+}
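+
+/*
+ * Worked example with made-up numbers: an extent with e_blkno 100
+ * and e_clusters 2 on a 4-blocks-per-cluster filesystem covers
+ * blocks 100-107, so only blkno == 108 is contiguous with it.
+ */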
+
+/*
+ * How many free extents have we got before we need more metadata?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+                          struct inode *inode,
+                          struct ocfs2_dinode *fe)
+{
+       int retval;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_block *eb;
+       struct buffer_head *eb_bh = NULL;
+
+       mlog_entry_void();
+
+       if (!OCFS2_IS_VALID_DINODE(fe)) {
+               OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+               retval = -EIO;
+               goto bail;
+       }
+
+       if (fe->i_last_eb_blk) {
+               retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+                                         &eb_bh, OCFS2_BH_CACHED, inode);
+               if (retval < 0) {
+                       mlog_errno(retval);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+               el = &eb->h_list;
+       } else
+               el = &fe->id2.i_list;
+
+       BUG_ON(el->l_tree_depth != 0);
+
+       retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+       if (eb_bh)
+               brelse(eb_bh);
+
+       mlog_exit(retval);
+       return retval;
+}
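+
+/*
+ * Caller-side sketch (names hypothetical): compare the returned
+ * count against the records about to be inserted and reserve more
+ * metadata up front if the list could overflow.
+ *
+ *	num_free = ocfs2_num_free_extents(osb, inode, fe);
+ *	if (num_free < 0) {
+ *		status = num_free;
+ *		goto bail;
+ *	}
+ *	if (num_free < wanted)
+ *		...reserve extent blocks before inserting...
+ */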
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+                                    struct ocfs2_journal_handle *handle,
+                                    struct inode *inode,
+                                    int wanted,
+                                    struct ocfs2_alloc_context *meta_ac,
+                                    struct buffer_head *bhs[])
+{
+       int count, status, i;
+       u16 suballoc_bit_start;
+       u32 num_got;
+       u64 first_blkno;
+       struct ocfs2_extent_block *eb;
+
+       mlog_entry_void();
+
+       count = 0;
+       while (count < wanted) {
+               status = ocfs2_claim_metadata(osb,
+                                             handle,
+                                             meta_ac,
+                                             wanted - count,
+                                             &suballoc_bit_start,
+                                             &num_got,
+                                             &first_blkno);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               for(i = count;  i < (num_got + count); i++) {
+                       bhs[i] = sb_getblk(osb->sb, first_blkno);
+                       if (bhs[i] == NULL) {
+                               status = -EIO;
+                               mlog_errno(status);
+                               goto bail;
+                       }
+                       ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+                       status = ocfs2_journal_access(handle, inode, bhs[i],
+                                                     OCFS2_JOURNAL_ACCESS_CREATE);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+
+                       memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+                       eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+                       /* Ok, setup the minimal stuff here. */
+                       strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+                       eb->h_blkno = cpu_to_le64(first_blkno);
+                       eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+                       /* we always use slot zero's suballocator */
+                       eb->h_suballoc_slot = 0;
+#else
+                       eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+                       eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+                       eb->h_list.l_count =
+                               cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+                       suballoc_bit_start++;
+                       first_blkno++;
+
+                       /* We'll also be dirtied by the caller, so
+                        * this isn't absolutely necessary. */
+                       status = ocfs2_journal_dirty(handle, bhs[i]);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+
+               count += num_got;
+       }
+
+       status = 0;
+bail:
+       if (status < 0) {
+               for(i = 0; i < wanted; i++) {
+                       if (bhs[i])
+                               brelse(bhs[i]);
+                       bhs[i] = NULL;
+               }
+       }
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+                           struct ocfs2_journal_handle *handle,
+                           struct inode *inode,
+                           struct buffer_head *fe_bh,
+                           struct buffer_head *eb_bh,
+                           struct buffer_head *last_eb_bh,
+                           struct ocfs2_alloc_context *meta_ac)
+{
+       int status, new_blocks, i;
+       u64 next_blkno, new_last_eb_blk;
+       struct buffer_head *bh;
+       struct buffer_head **new_eb_bhs = NULL;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *eb_el;
+       struct ocfs2_extent_list  *el;
+
+       mlog_entry_void();
+
+       BUG_ON(!last_eb_bh);
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+       if (eb_bh) {
+               eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+               el = &eb->h_list;
+       } else
+               el = &fe->id2.i_list;
+
+       /* we never add a branch to a leaf. */
+       BUG_ON(!el->l_tree_depth);
+
+       new_blocks = le16_to_cpu(el->l_tree_depth);
+
+       /* allocate the number of new eb blocks we need */
+       new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+                            GFP_KERNEL);
+       if (!new_eb_bhs) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+                                          meta_ac, new_eb_bhs);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+        * linked with the rest of the tree.
+        * conversely, new_eb_bhs[0] is the new bottommost leaf.
+        *
+        * when we leave the loop, new_last_eb_blk will point to the
+        * newest leaf, and next_blkno will point to the topmost extent
+        * block. */
+       next_blkno = new_last_eb_blk = 0;
+       for(i = 0; i < new_blocks; i++) {
+               bh = new_eb_bhs[i];
+               eb = (struct ocfs2_extent_block *) bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       status = -EIO;
+                       goto bail;
+               }
+               eb_el = &eb->h_list;
+
+               status = ocfs2_journal_access(handle, inode, bh,
+                                             OCFS2_JOURNAL_ACCESS_CREATE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               eb->h_next_leaf_blk = 0;
+               eb_el->l_tree_depth = cpu_to_le16(i);
+               eb_el->l_next_free_rec = cpu_to_le16(1);
+               eb_el->l_recs[0].e_cpos = fe->i_clusters;
+               eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+               eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+               if (!eb_el->l_tree_depth)
+                       new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+               status = ocfs2_journal_dirty(handle, bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               next_blkno = le64_to_cpu(eb->h_blkno);
+       }
+
+       /* This is a bit hairy. We want to update up to three blocks
+        * here without leaving any of them in an inconsistent state
+        * in case of error. We don't have to worry about
+        * journal_dirty erroring as it won't unless we've aborted the
+        * handle (in which case we would never be here) so reserving
+        * the write with journal_access is all we need to do. */
+       status = ocfs2_journal_access(handle, inode, last_eb_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       if (eb_bh) {
+               status = ocfs2_journal_access(handle, inode, eb_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       /* Link the new branch into the rest of the tree (el will
+        * either be on the fe, or the extent block passed in). */
+       i = le16_to_cpu(el->l_next_free_rec);
+       el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+       el->l_recs[i].e_cpos = fe->i_clusters;
+       el->l_recs[i].e_clusters = 0;
+       le16_add_cpu(&el->l_next_free_rec, 1);
+
+       /* fe needs a new last extent block pointer, as does the
+        * next_leaf on the previously last-extent-block. */
+       fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+       eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+       eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+       status = ocfs2_journal_dirty(handle, last_eb_bh);
+       if (status < 0)
+               mlog_errno(status);
+       status = ocfs2_journal_dirty(handle, fe_bh);
+       if (status < 0)
+               mlog_errno(status);
+       if (eb_bh) {
+               status = ocfs2_journal_dirty(handle, eb_bh);
+               if (status < 0)
+                       mlog_errno(status);
+       }
+
+       status = 0;
+bail:
+       if (new_eb_bhs) {
+               for (i = 0; i < new_blocks; i++)
+                       if (new_eb_bhs[i])
+                               brelse(new_eb_bhs[i]);
+               kfree(new_eb_bhs);
+       }
+
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+                                 struct ocfs2_journal_handle *handle,
+                                 struct inode *inode,
+                                 struct buffer_head *fe_bh,
+                                 struct ocfs2_alloc_context *meta_ac,
+                                 struct buffer_head **ret_new_eb_bh)
+{
+       int status, i;
+       struct buffer_head *new_eb_bh = NULL;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *fe_el;
+       struct ocfs2_extent_list  *eb_el;
+
+       mlog_entry_void();
+
+       status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+                                          &new_eb_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+       if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+               status = -EIO;
+               goto bail;
+       }
+
+       eb_el = &eb->h_list;
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+       fe_el = &fe->id2.i_list;
+
+       status = ocfs2_journal_access(handle, inode, new_eb_bh,
+                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       /* copy the fe data into the new extent block */
+       eb_el->l_tree_depth = fe_el->l_tree_depth;
+       eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+       for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+               eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+               eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+               eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+       }
+
+       status = ocfs2_journal_dirty(handle, new_eb_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       /* update fe now */
+       le16_add_cpu(&fe_el->l_tree_depth, 1);
+       fe_el->l_recs[0].e_cpos = 0;
+       fe_el->l_recs[0].e_blkno = eb->h_blkno;
+       fe_el->l_recs[0].e_clusters = fe->i_clusters;
+       for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+               fe_el->l_recs[i].e_cpos = 0;
+               fe_el->l_recs[i].e_clusters = 0;
+               fe_el->l_recs[i].e_blkno = 0;
+       }
+       fe_el->l_next_free_rec = cpu_to_le16(1);
+
+       /* If this is our 1st tree depth shift, then last_eb_blk
+        * becomes the allocated extent block */
+       if (fe_el->l_tree_depth == cpu_to_le16(1))
+               fe->i_last_eb_blk = eb->h_blkno;
+
+       status = ocfs2_journal_dirty(handle, fe_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       *ret_new_eb_bh = new_eb_bh;
+       new_eb_bh = NULL;
+       status = 0;
+bail:
+       if (new_eb_bh)
+               brelse(new_eb_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+                                 struct ocfs2_journal_handle *handle,
+                                 struct inode *inode,
+                                 struct buffer_head *fe_bh,
+                                 u64 start_blk,
+                                 u32 new_clusters)
+{
+       int status, i, num_bhs = 0;
+       u64 next_blkno;
+       u16 next_free;
+       struct buffer_head **eb_bhs = NULL;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *el;
+
+       mlog_entry_void();
+
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+       el = &fe->id2.i_list;
+       if (el->l_tree_depth) {
+               /* This is another operation where we want to be
+                * careful about our tree updates. An error here means
+                * none of the previous changes we made should roll
+                * forward. As a result, we have to record the buffers
+                * for this part of the tree in an array and reserve a
+                * journal write to them before making any changes. */
+               num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+               eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+                                GFP_KERNEL);
+               if (!eb_bhs) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               i = 0;
+               while(el->l_tree_depth) {
+                       next_free = le16_to_cpu(el->l_next_free_rec);
+                       if (next_free == 0) {
+                               ocfs2_error(inode->i_sb,
+                                           "Dinode %"MLFu64" has a bad "
+                                           "extent list",
+                                           OCFS2_I(inode)->ip_blkno);
+                               status = -EIO;
+                               goto bail;
+                       }
+                       next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+                       BUG_ON(i >= num_bhs);
+                       status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+                                                 OCFS2_BH_CACHED, inode);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+                       eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+                       if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                                                eb);
+                               status = -EIO;
+                               goto bail;
+                       }
+
+                       status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+                                                     OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+
+                       el = &eb->h_list;
+                       i++;
+                       /* When we leave this loop, eb_bhs[num_bhs - 1] will
+                        * hold the bottom-most leaf extent block. */
+               }
+               BUG_ON(el->l_tree_depth);
+
+               el = &fe->id2.i_list;
+               /* If we have tree depth, then the fe update is
+                * trivial, and we want to switch el out for the
+                * bottom-most leaf in order to update it with the
+                * actual extent data below. */
+               next_free = le16_to_cpu(el->l_next_free_rec);
+               if (next_free == 0) {
+                       ocfs2_error(inode->i_sb,
+                                   "Dinode %"MLFu64" has a bad "
+                                   "extent list",
+                                   OCFS2_I(inode)->ip_blkno);
+                       status = -EIO;
+                       goto bail;
+               }
+               le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+                            new_clusters);
+               /* (num_bhs - 1) to avoid the leaf */
+               for(i = 0; i < (num_bhs - 1); i++) {
+                       eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+                       el = &eb->h_list;
+
+                       /* finally, make our actual change to the
+                        * intermediate extent blocks. */
+                       next_free = le16_to_cpu(el->l_next_free_rec);
+                       le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+                                    new_clusters);
+
+                       status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+                       if (status < 0)
+                               mlog_errno(status);
+               }
+               BUG_ON(i != (num_bhs - 1));
+               /* note that the leaf block wasn't touched in
+                * the loop above */
+               eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+               el = &eb->h_list;
+               BUG_ON(el->l_tree_depth);
+       }
+
+       /* yay, we can finally add the actual extent now! */
+       i = le16_to_cpu(el->l_next_free_rec) - 1;
+       if (le16_to_cpu(el->l_next_free_rec) &&
+           ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+               le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+       } else if (le16_to_cpu(el->l_next_free_rec) &&
+                  (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+               /* having an empty extent at eof is legal. */
+               if (el->l_recs[i].e_cpos != fe->i_clusters) {
+                       ocfs2_error(inode->i_sb,
+                                   "Dinode %"MLFu64" trailing extent is bad: "
+                                   "cpos (%u) != number of clusters (%u)",
+                                   OCFS2_I(inode)->ip_blkno,
+                                   le32_to_cpu(el->l_recs[i].e_cpos),
+                                   le32_to_cpu(fe->i_clusters));
+                       status = -EIO;
+                       goto bail;
+               }
+               el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+               el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+       } else {
+               /* No contiguous record, or no empty record at eof, so
+                * we add a new one. */
+
+               BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+                      le16_to_cpu(el->l_count));
+               i = le16_to_cpu(el->l_next_free_rec);
+
+               el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+               el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+               el->l_recs[i].e_cpos = fe->i_clusters;
+               le16_add_cpu(&el->l_next_free_rec, 1);
+       }
+
+       /*
+        * extent_map errors are not fatal, so they are ignored outside
+        * of flushing the thing.
+        */
+       status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+                                        new_clusters);
+       if (status) {
+               mlog_errno(status);
+               ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+       }
+
+       status = ocfs2_journal_dirty(handle, fe_bh);
+       if (status < 0)
+               mlog_errno(status);
+       if (fe->id2.i_list.l_tree_depth) {
+               status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+               if (status < 0)
+                       mlog_errno(status);
+       }
+
+       status = 0;
+bail:
+       if (eb_bhs) {
+               for (i = 0; i < num_bhs; i++)
+                       if (eb_bhs[i])
+                               brelse(eb_bhs[i]);
+               kfree(eb_bhs);
+       }
+
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+                                   struct inode *inode,
+                                   struct buffer_head *fe_bh,
+                                   struct buffer_head **target_bh)
+{
+       int status = 0, i;
+       u64 blkno;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *el;
+       struct buffer_head *bh = NULL;
+       struct buffer_head *lowest_bh = NULL;
+
+       mlog_entry_void();
+
+       *target_bh = NULL;
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+       el = &fe->id2.i_list;
+
+       while(le16_to_cpu(el->l_tree_depth) > 1) {
+               if (le16_to_cpu(el->l_next_free_rec) == 0) {
+                       ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+                                   "extent list (next_free_rec == 0)",
+                                   OCFS2_I(inode)->ip_blkno);
+                       status = -EIO;
+                       goto bail;
+               }
+               i = le16_to_cpu(el->l_next_free_rec) - 1;
+               blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+               if (!blkno) {
+                       ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+                                   "list where extent # %d has no physical "
+                                   "block start",
+                                   OCFS2_I(inode)->ip_blkno, i);
+                       status = -EIO;
+                       goto bail;
+               }
+
+               if (bh) {
+                       brelse(bh);
+                       bh = NULL;
+               }
+
+               status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+                                         inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               eb = (struct ocfs2_extent_block *) bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       status = -EIO;
+                       goto bail;
+               }
+               el = &eb->h_list;
+
+               if (le16_to_cpu(el->l_next_free_rec) <
+                   le16_to_cpu(el->l_count)) {
+                       if (lowest_bh)
+                               brelse(lowest_bh);
+                       lowest_bh = bh;
+                       get_bh(lowest_bh);
+               }
+       }
+
+       /* If we didn't find one and the fe doesn't have any room,
+        * then return '1' */
+       if (!lowest_bh
+           && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+               status = 1;
+
+       *target_bh = lowest_bh;
+bail:
+       if (bh)
+               brelse(bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+                       struct ocfs2_journal_handle *handle,
+                       struct inode *inode,
+                       struct buffer_head *fe_bh,
+                       u64 start_blk,
+                       u32 new_clusters,
+                       struct ocfs2_alloc_context *meta_ac)
+{
+       int status, i, shift;
+       struct buffer_head *last_eb_bh = NULL;
+       struct buffer_head *bh = NULL;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *el;
+
+       mlog_entry_void();
+
+       mlog(0, "add %u clusters starting at block %"MLFu64" to "
+               "inode %"MLFu64"\n",
+            new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+       el = &fe->id2.i_list;
+
+       if (el->l_tree_depth) {
+               /* jump to end of tree */
+               status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+                                         &last_eb_bh, OCFS2_BH_CACHED, inode);
+               if (status < 0) {
+                       mlog_exit(status);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               el = &eb->h_list;
+       }
+
+       /* Can we allocate without adding/shifting tree bits? */
+       i = le16_to_cpu(el->l_next_free_rec) - 1;
+       if (le16_to_cpu(el->l_next_free_rec) == 0
+           || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+           || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+           || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+               goto out_add;
+
+       mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+            "tree now.\n");
+
+       shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+       if (shift < 0) {
+               status = shift;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       /* We traveled all the way to the bottom of the allocation tree
+        * and didn't find room for any more extents - we need to add
+        * another tree level */
+       if (shift) {
+               /* if we hit a leaf, we'd better be empty :) */
+               BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+                      le16_to_cpu(el->l_count));
+               BUG_ON(bh);
+               mlog(0, "ocfs2_insert_extent: need to shift tree depth "
+                    "(current = %u)\n",
+                    le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+               /* ocfs2_shift_tree_depth will return us a buffer with
+                * the new extent block (so we can pass that to
+                * ocfs2_add_branch). */
+               status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+                                               meta_ac, &bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               /* Special case: we have room now if we shifted from
+                * tree_depth 0 */
+               if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+                       goto out_add;
+       }
+
+       /* call ocfs2_add_branch to add the final part of the tree with
+        * the new data. */
+       mlog(0, "ocfs2_insert_extent: add branch. bh = %p\n", bh);
+       status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+                                 meta_ac);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+out_add:
+       /* Finally, we can add clusters. */
+       status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+                                       start_blk, new_clusters);
+       if (status < 0)
+               mlog_errno(status);
+
+bail:
+       if (bh)
+               brelse(bh);
+
+       if (last_eb_bh)
+               brelse(last_eb_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+       struct buffer_head *tl_bh = osb->osb_tl_bh;
+       struct ocfs2_dinode *di;
+       struct ocfs2_truncate_log *tl;
+
+       di = (struct ocfs2_dinode *) tl_bh->b_data;
+       tl = &di->id2.i_dealloc;
+
+       mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+                       "slot %d, invalid truncate log parameters: used = "
+                       "%u, count = %u\n", osb->slot_num,
+                       le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+       return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
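+/*
+ * A new range can be coalesced with the tail record of the truncate
+ * log when it begins exactly where that record ends.  For example, a
+ * tail record with t_start = 100 and t_clusters = 8 covers clusters
+ * [100, 108), so a new_start of 108 extends it in place.
+ */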
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+                                          unsigned int new_start)
+{
+       unsigned int tail_index;
+       unsigned int current_tail;
+
+       /* No records, nothing to coalesce */
+       if (!le16_to_cpu(tl->tl_used))
+               return 0;
+
+       tail_index = le16_to_cpu(tl->tl_used) - 1;
+       current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+       current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+       return current_tail == new_start;
+}
+
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+                                    struct ocfs2_journal_handle *handle,
+                                    u64 start_blk,
+                                    unsigned int num_clusters)
+{
+       int status, index;
+       unsigned int start_cluster, tl_count;
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct buffer_head *tl_bh = osb->osb_tl_bh;
+       struct ocfs2_dinode *di;
+       struct ocfs2_truncate_log *tl;
+
+       mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+                  num_clusters);
+
+       BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+       start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+       di = (struct ocfs2_dinode *) tl_bh->b_data;
+       tl = &di->id2.i_dealloc;
+       if (!OCFS2_IS_VALID_DINODE(di)) {
+               OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+               status = -EIO;
+               goto bail;
+       }
+
+       tl_count = le16_to_cpu(tl->tl_count);
+       mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+                       tl_count == 0,
+                       "Truncate record count on #%"MLFu64" invalid "
+                       "(wanted %u, actual %u)\n", OCFS2_I(tl_inode)->ip_blkno,
+                       ocfs2_truncate_recs_per_inode(osb->sb),
+                       le16_to_cpu(tl->tl_count));
+
+       /* Caller should have known to flush before calling us. */
+       index = le16_to_cpu(tl->tl_used);
+       if (index >= tl_count) {
+               status = -ENOSPC;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+            "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+            OCFS2_I(tl_inode)->ip_blkno, index);
+
+       if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+               /*
+                * Move index back to the record we are coalescing with.
+                * ocfs2_truncate_log_can_coalesce() guarantees that
+                * tl_used is nonzero, so index cannot go negative.
+                */
+               index--;
+
+               num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+               mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+                    index, le32_to_cpu(tl->tl_recs[index].t_start),
+                    num_clusters);
+       } else {
+               tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+               tl->tl_used = cpu_to_le16(index + 1);
+       }
+       tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+       status = ocfs2_journal_dirty(handle, tl_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+bail:
+       mlog_exit(status);
+       return status;
+}
+
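+/*
+ * Replay the truncate log records from the tail down.  tl_used is
+ * decremented and journaled before each record's clusters are freed,
+ * so a crash mid-replay leaves only the not-yet-processed records in
+ * the log.
+ */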
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+                                        struct ocfs2_journal_handle *handle,
+                                        struct inode *data_alloc_inode,
+                                        struct buffer_head *data_alloc_bh)
+{
+       int status = 0;
+       int i;
+       unsigned int num_clusters;
+       u64 start_blk;
+       struct ocfs2_truncate_rec rec;
+       struct ocfs2_dinode *di;
+       struct ocfs2_truncate_log *tl;
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+       mlog_entry_void();
+
+       di = (struct ocfs2_dinode *) tl_bh->b_data;
+       tl = &di->id2.i_dealloc;
+       i = le16_to_cpu(tl->tl_used) - 1;
+       while (i >= 0) {
+               /* Caller has given us at least enough credits to
+                * update the truncate log dinode */
+               status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               tl->tl_used = cpu_to_le16(i);
+
+               status = ocfs2_journal_dirty(handle, tl_bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               /* TODO: Perhaps we can calculate the bulk of the
+                * credits up front rather than extending like
+                * this. */
+               status = ocfs2_extend_trans(handle,
+                                           OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               rec = tl->tl_recs[i];
+               start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+                                                   le32_to_cpu(rec.t_start));
+               num_clusters = le32_to_cpu(rec.t_clusters);
+
+               /* if start_blk is not set, we ignore the record as
+                * invalid. */
+               if (start_blk) {
+                       mlog(0, "free record %d, start = %u, clusters = %u\n",
+                            i, le32_to_cpu(rec.t_start), num_clusters);
+
+                       status = ocfs2_free_clusters(handle, data_alloc_inode,
+                                                    data_alloc_bh, start_blk,
+                                                    num_clusters);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+               i--;
+       }
+
+bail:
+       mlog_exit(status);
+       return status;
+}
+
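+/*
+ * Flush the truncate log: take the global bitmap's meta lock, start
+ * a transaction and replay every record, handing the clusters back
+ * to the global bitmap.
+ */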
+/* Expects you to already be holding tl_inode->i_sem */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+       int status;
+       unsigned int num_to_flush;
+       struct ocfs2_journal_handle *handle = NULL;
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct inode *data_alloc_inode = NULL;
+       struct buffer_head *tl_bh = osb->osb_tl_bh;
+       struct buffer_head *data_alloc_bh = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_truncate_log *tl;
+
+       mlog_entry_void();
+
+       BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+       di = (struct ocfs2_dinode *) tl_bh->b_data;
+       tl = &di->id2.i_dealloc;
+       if (!OCFS2_IS_VALID_DINODE(di)) {
+               OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+               status = -EIO;
+               goto bail;
+       }
+
+       num_to_flush = le16_to_cpu(tl->tl_used);
+       mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+            num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+       if (!num_to_flush) {
+               status = 0;
+               goto bail;
+       }
+
+       handle = ocfs2_alloc_handle(osb);
+       if (!handle) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       data_alloc_inode = ocfs2_get_system_file_inode(osb,
+                                                      GLOBAL_BITMAP_SYSTEM_INODE,
+                                                      OCFS2_INVALID_SLOT);
+       if (!data_alloc_inode) {
+               status = -EINVAL;
+               mlog(ML_ERROR, "Could not get bitmap inode!\n");
+               goto bail;
+       }
+
+       ocfs2_handle_add_inode(handle, data_alloc_inode);
+       status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               handle = NULL;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+                                              data_alloc_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+bail:
+       if (handle)
+               ocfs2_commit_trans(handle);
+
+       if (data_alloc_inode)
+               iput(data_alloc_inode);
+
+       if (data_alloc_bh)
+               brelse(data_alloc_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+       int status;
+       struct inode *tl_inode = osb->osb_tl_inode;
+
+       down(&tl_inode->i_sem);
+       status = __ocfs2_flush_truncate_log(osb);
+       up(&tl_inode->i_sem);
+
+       return status;
+}
+
+static void ocfs2_truncate_log_worker(void *data)
+{
+       int status;
+       struct ocfs2_super *osb = data;
+
+       mlog_entry_void();
+
+       status = ocfs2_flush_truncate_log(osb);
+       if (status < 0)
+               mlog_errno(status);
+
+       mlog_exit(status);
+}
+
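+/*
+ * Flushes are normally driven from a delayed work item so that a
+ * burst of truncates can batch their log records.  Callers pass
+ * cancel = 1 to push an already-queued flush further out while they
+ * are still appending records.
+ */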
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+                                      int cancel)
+{
+       if (osb->osb_tl_inode) {
+               /* We want to push off log flushes while truncates are
+                * still running. */
+               if (cancel)
+                       cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+               queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+                                  OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+       }
+}
+
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+                                      int slot_num,
+                                      struct inode **tl_inode,
+                                      struct buffer_head **tl_bh)
+{
+       int status;
+       struct inode *inode = NULL;
+       struct buffer_head *bh = NULL;
+
+       inode = ocfs2_get_system_file_inode(osb,
+                                          TRUNCATE_LOG_SYSTEM_INODE,
+                                          slot_num);
+       if (!inode) {
+               status = -EINVAL;
+               mlog(ML_ERROR, "Could not load truncate log inode!\n");
+               goto bail;
+       }
+
+       status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+                                 OCFS2_BH_CACHED, inode);
+       if (status < 0) {
+               iput(inode);
+               mlog_errno(status);
+               goto bail;
+       }
+
+       *tl_inode = inode;
+       *tl_bh    = bh;
+bail:
+       mlog_exit(status);
+       return status;
+}
+
+/* Called during the first stage of node recovery. We stamp a clean
+ * truncate log and pass back a copy for processing later. If the
+ * truncate log does not require processing, *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+                                     int slot_num,
+                                     struct ocfs2_dinode **tl_copy)
+{
+       int status;
+       struct inode *tl_inode = NULL;
+       struct buffer_head *tl_bh = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_truncate_log *tl;
+
+       *tl_copy = NULL;
+
+       mlog(0, "recover truncate log from slot %d\n", slot_num);
+
+       status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       di = (struct ocfs2_dinode *) tl_bh->b_data;
+       tl = &di->id2.i_dealloc;
+       if (!OCFS2_IS_VALID_DINODE(di)) {
+               OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
+               status = -EIO;
+               goto bail;
+       }
+
+       if (le16_to_cpu(tl->tl_used)) {
+               mlog(0, "We'll have %u records to recover\n",
+                    le16_to_cpu(tl->tl_used));
+
+               *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+               if (!(*tl_copy)) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               /* Assuming the write-out below goes well, this copy
+                * will be passed back to recovery for processing. */
+               memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
+
+               /* All we need to do to clear the truncate log is set
+                * tl_used. */
+               tl->tl_used = 0;
+
+               status = ocfs2_write_block(osb, tl_bh, tl_inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+bail:
+       if (tl_inode)
+               iput(tl_inode);
+       if (tl_bh)
+               brelse(tl_bh);
+
+       if (status < 0 && (*tl_copy)) {
+               kfree(*tl_copy);
+               *tl_copy = NULL;
+       }
+
+       mlog_exit(status);
+       return status;
+}
+
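+/*
+ * Second stage of truncate log recovery: take the copy saved by
+ * ocfs2_begin_truncate_log_recovery() and re-append each of its
+ * records to our own truncate log, flushing ours first whenever it
+ * fills up.  Each append runs in its own small transaction.
+ */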
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+                                        struct ocfs2_dinode *tl_copy)
+{
+       int status = 0;
+       int i;
+       unsigned int clusters, num_recs, start_cluster;
+       u64 start_blk;
+       struct ocfs2_journal_handle *handle;
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct ocfs2_truncate_log *tl;
+
+       mlog_entry_void();
+
+       if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+               mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+               return -EINVAL;
+       }
+
+       tl = &tl_copy->id2.i_dealloc;
+       num_recs = le16_to_cpu(tl->tl_used);
+       mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
+            le64_to_cpu(tl_copy->i_blkno));
+
+       down(&tl_inode->i_sem);
+       for (i = 0; i < num_recs; i++) {
+               if (ocfs2_truncate_log_needs_flush(osb)) {
+                       status = __ocfs2_flush_truncate_log(osb);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail_up;
+                       }
+               }
+
+               handle = ocfs2_start_trans(osb, NULL,
+                                          OCFS2_TRUNCATE_LOG_UPDATE);
+               if (IS_ERR(handle)) {
+                       status = PTR_ERR(handle);
+                       mlog_errno(status);
+                       goto bail_up;
+               }
+
+               clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+               start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+               start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+               status = ocfs2_truncate_log_append(osb, handle,
+                                                  start_blk, clusters);
+               ocfs2_commit_trans(handle);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail_up;
+               }
+       }
+
+bail_up:
+       up(&tl_inode->i_sem);
+
+       mlog_exit(status);
+       return status;
+}
+
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+       int status;
+       struct inode *tl_inode = osb->osb_tl_inode;
+
+       mlog_entry_void();
+
+       if (tl_inode) {
+               cancel_delayed_work(&osb->osb_truncate_log_wq);
+               flush_workqueue(ocfs2_wq);
+
+               status = ocfs2_flush_truncate_log(osb);
+               if (status < 0)
+                       mlog_errno(status);
+
+               brelse(osb->osb_tl_bh);
+               iput(osb->osb_tl_inode);
+       }
+
+       mlog_exit_void();
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+       int status;
+       struct inode *tl_inode = NULL;
+       struct buffer_head *tl_bh = NULL;
+
+       mlog_entry_void();
+
+       status = ocfs2_get_truncate_log_info(osb,
+                                            osb->slot_num,
+                                            &tl_inode,
+                                            &tl_bh);
+       if (status < 0)
+               mlog_errno(status);
+
+       /* ocfs2_truncate_log_shutdown keys on the existence of
+        * osb->osb_tl_inode so we don't set any of the osb variables
+        * until we're sure all is well. */
+       INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+       osb->osb_tl_bh    = tl_bh;
+       osb->osb_tl_inode = tl_inode;
+
+       mlog_exit(status);
+       return status;
+}
+
+/* This function will figure out whether the currently last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update its h_next_leaf_blk field, as well
+ * as the dinode's i_last_eb_blk. */
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+                                      struct inode *inode,
+                                      struct ocfs2_dinode *fe,
+                                      u32 new_i_clusters,
+                                      struct buffer_head *old_last_eb,
+                                      struct buffer_head **new_last_eb)
+{
+       int i, status = 0;
+       u64 block = 0;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *el;
+       struct buffer_head *bh = NULL;
+
+       *new_last_eb = NULL;
+
+       if (!OCFS2_IS_VALID_DINODE(fe)) {
+               OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+               status = -EIO;
+               goto bail;
+       }
+
+       /* we have no tree, so of course, no last_eb. */
+       if (!fe->id2.i_list.l_tree_depth)
+               goto bail;
+
+       /* trunc to zero special case - this makes tree_depth = 0
+        * regardless of what it is.  */
+       if (!new_i_clusters)
+               goto bail;
+
+       eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+       el = &(eb->h_list);
+       BUG_ON(!el->l_next_free_rec);
+
+       /* Make sure that this guy will actually be empty after we
+        * clear away the data. */
+       if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+               goto bail;
+
+       /* Ok, at this point, we know that last_eb will definitely
+        * change, so let's traverse the tree and find the second to
+        * last extent block. */
+       el = &(fe->id2.i_list);
+       /* go down the tree, following the rightmost surviving record */
+       do {
+               for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+                       if (le32_to_cpu(el->l_recs[i].e_cpos) <
+                           new_i_clusters) {
+                               block = le64_to_cpu(el->l_recs[i].e_blkno);
+                               break;
+                       }
+               }
+               BUG_ON(i < 0);
+
+               if (bh) {
+                       brelse(bh);
+                       bh = NULL;
+               }
+
+               status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+                                        inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *) bh->b_data;
+               el = &eb->h_list;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       status = -EIO;
+                       goto bail;
+               }
+       } while (el->l_tree_depth);
+
+       *new_last_eb = bh;
+       get_bh(*new_last_eb);
+       mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
+bail:
+       if (bh)
+               brelse(bh);
+
+       return status;
+}
+
+static int ocfs2_do_truncate(struct ocfs2_super *osb,
+                            unsigned int clusters_to_del,
+                            struct inode *inode,
+                            struct buffer_head *fe_bh,
+                            struct buffer_head *old_last_eb_bh,
+                            struct ocfs2_journal_handle *handle,
+                            struct ocfs2_truncate_context *tc)
+{
+       int status, i, depth;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_block *last_eb = NULL;
+       struct ocfs2_extent_list *el;
+       struct buffer_head *eb_bh = NULL;
+       struct buffer_head *last_eb_bh = NULL;
+       u64 next_eb = 0;
+       u64 delete_blk = 0;
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+       status = ocfs2_find_new_last_ext_blk(osb,
+                                            inode,
+                                            fe,
+                                            le32_to_cpu(fe->i_clusters) -
+                                                       clusters_to_del,
+                                            old_last_eb_bh,
+                                            &last_eb_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       if (last_eb_bh)
+               last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       el = &(fe->id2.i_list);
+
+       spin_lock(&OCFS2_I(inode)->ip_lock);
+       OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
+                                     clusters_to_del;
+       spin_unlock(&OCFS2_I(inode)->ip_lock);
+       le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+       fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+       fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+
+       i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+       BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+       le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+       /* tree depth zero, we can just delete the clusters, otherwise
+        * we need to record the offset of the next level extent block
+        * as we may overwrite it. */
+       if (!el->l_tree_depth)
+               delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+                       + ocfs2_clusters_to_blocks(osb->sb,
+                                       le32_to_cpu(el->l_recs[i].e_clusters));
+       else
+               next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+
+       if (!el->l_recs[i].e_clusters) {
+               /* if we deleted the whole extent record, then clear
+                * out the other fields and update the extent
+                * list. For depth > 0 trees, we've already recorded
+                * the extent block in 'next_eb' */
+               el->l_recs[i].e_cpos = 0;
+               el->l_recs[i].e_blkno = 0;
+               BUG_ON(!el->l_next_free_rec);
+               le16_add_cpu(&el->l_next_free_rec, -1);
+       }
+
+       depth = le16_to_cpu(el->l_tree_depth);
+       if (!fe->i_clusters) {
+               /* trunc to zero is a special case. */
+               el->l_tree_depth = 0;
+               fe->i_last_eb_blk = 0;
+       } else if (last_eb)
+               fe->i_last_eb_blk = last_eb->h_blkno;
+
+       status = ocfs2_journal_dirty(handle, fe_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       if (last_eb) {
+               /* If there will be a new last extent block, then by
+                * definition, there cannot be any leaves to the right of
+                * it. */
+               status = ocfs2_journal_access(handle, inode, last_eb_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               last_eb->h_next_leaf_blk = 0;
+               status = ocfs2_journal_dirty(handle, last_eb_bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       /* if our tree depth > 0, update all the tree blocks below us. */
+       while (depth) {
+               mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
+                    depth,  next_eb);
+               status = ocfs2_read_block(osb, next_eb, &eb_bh,
+                                         OCFS2_BH_CACHED, inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       status = -EIO;
+                       goto bail;
+               }
+               el = &(eb->h_list);
+
+               status = ocfs2_journal_access(handle, inode, eb_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+               BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+
+               i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+               mlog(0, "extent block %"MLFu64", before: record %d: "
+                    "(%u, %u, %"MLFu64"), next = %u\n",
+                    le64_to_cpu(eb->h_blkno), i,
+                    le32_to_cpu(el->l_recs[i].e_cpos),
+                    le32_to_cpu(el->l_recs[i].e_clusters),
+                    le64_to_cpu(el->l_recs[i].e_blkno),
+                    le16_to_cpu(el->l_next_free_rec));
+
+               BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+               le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+
+               next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+               /* bottom-most block requires us to delete data.*/
+               if (!el->l_tree_depth)
+                       delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+                               + ocfs2_clusters_to_blocks(osb->sb,
+                                       le32_to_cpu(el->l_recs[i].e_clusters));
+               if (!el->l_recs[i].e_clusters) {
+                       el->l_recs[i].e_cpos = 0;
+                       el->l_recs[i].e_blkno = 0;
+                       BUG_ON(!el->l_next_free_rec);
+                       le16_add_cpu(&el->l_next_free_rec, -1);
+               }
+               mlog(0, "extent block %"MLFu64", after: record %d: "
+                    "(%u, %u, %"MLFu64"), next = %u\n",
+                    le64_to_cpu(eb->h_blkno), i,
+                    le32_to_cpu(el->l_recs[i].e_cpos),
+                    le32_to_cpu(el->l_recs[i].e_clusters),
+                    le64_to_cpu(el->l_recs[i].e_blkno),
+                    le16_to_cpu(el->l_next_free_rec));
+
+               status = ocfs2_journal_dirty(handle, eb_bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               if (!el->l_next_free_rec) {
+                       mlog(0, "deleting this extent block.\n");
+
+                       ocfs2_remove_from_cache(inode, eb_bh);
+
+                       BUG_ON(eb->h_suballoc_slot);
+                       BUG_ON(el->l_recs[0].e_clusters);
+                       BUG_ON(el->l_recs[0].e_cpos);
+                       BUG_ON(el->l_recs[0].e_blkno);
+                       status = ocfs2_free_extent_block(handle,
+                                                        tc->tc_ext_alloc_inode,
+                                                        tc->tc_ext_alloc_bh,
+                                                        eb);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+               brelse(eb_bh);
+               eb_bh = NULL;
+               depth--;
+       }
+
+       BUG_ON(!delete_blk);
+       status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+                                          clusters_to_del);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       status = 0;
+bail:
+       if (!status)
+               ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+       else
+               ocfs2_extent_map_drop(inode, 0);
+       mlog_exit(status);
+       return status;
+}
+
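+/*
+ * Truncation proceeds as a loop: each pass trims clusters_to_del
+ * clusters off the rightmost extent record in its own transaction,
+ * logging the freed range in the truncate log, and we jump back to
+ * 'start' until i_clusters reaches the target.
+ */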
+/*
+ * It is expected that, by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * WARNING: This will kfree the truncate context.
+ */
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+                         struct inode *inode,
+                         struct buffer_head *fe_bh,
+                         struct ocfs2_truncate_context *tc)
+{
+       int status, i, credits, tl_sem = 0;
+       u32 clusters_to_del, target_i_clusters;
+       u64 last_eb = 0;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *el;
+       struct buffer_head *last_eb_bh;
+       struct ocfs2_journal_handle *handle = NULL;
+       struct inode *tl_inode = osb->osb_tl_inode;
+
+       mlog_entry_void();
+
+       down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+                                                    i_size_read(inode));
+
+       last_eb_bh = tc->tc_last_eb_bh;
+       tc->tc_last_eb_bh = NULL;
+
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+       if (fe->id2.i_list.l_tree_depth) {
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               el = &eb->h_list;
+       } else
+               el = &fe->id2.i_list;
+       last_eb = le64_to_cpu(fe->i_last_eb_blk);
+start:
+       mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+            "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
+            "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+            le32_to_cpu(fe->i_clusters), last_eb,
+            le64_to_cpu(fe->i_last_eb_blk),
+            le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+
+       if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
+               mlog(0, "last_eb changed!\n");
+               BUG_ON(!fe->id2.i_list.l_tree_depth);
+               last_eb = le64_to_cpu(fe->i_last_eb_blk);
+               /* i_last_eb_blk may have changed, read it if
+                * necessary. We don't have to worry about the
+                * truncate-to-zero case here (where there is no longer
+                * a last_eb) because we never loop back after our work
+                * is done. */
+               if (last_eb_bh) {
+                       brelse(last_eb_bh);
+                       last_eb_bh = NULL;
+               }
+
+               status = ocfs2_read_block(osb, last_eb,
+                                         &last_eb_bh, OCFS2_BH_CACHED,
+                                         inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                       status = -EIO;
+                       goto bail;
+               }
+               el = &(eb->h_list);
+       }
+
+       /* by now, el will point to the extent list on the bottom most
+        * portion of this tree. */
+       i = le16_to_cpu(el->l_next_free_rec) - 1;
+       if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+               clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+       else
+               clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+                                  le32_to_cpu(el->l_recs[i].e_cpos)) -
+                                 target_i_clusters;
+
+       mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+
+       down(&tl_inode->i_sem);
+       tl_sem = 1;
+       /* ocfs2_truncate_log_needs_flush tells us whether the log is
+        * full. If it is, we flush it to guarantee that at least one
+        * record is free for use. */
+       if (ocfs2_truncate_log_needs_flush(osb)) {
+               status = __ocfs2_flush_truncate_log(osb);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+                                               fe, el);
+       handle = ocfs2_start_trans(osb, NULL, credits);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               handle = NULL;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+       status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+       if (status < 0)
+               mlog_errno(status);
+
+       status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
+                                  last_eb_bh, handle, tc);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       up(&tl_inode->i_sem);
+       tl_sem = 0;
+
+       ocfs2_commit_trans(handle);
+       handle = NULL;
+
+       BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+       if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
+               goto start;
+bail:
+       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ocfs2_schedule_truncate_log_flush(osb, 1);
+
+       if (tl_sem)
+               up(&tl_inode->i_sem);
+
+       if (handle)
+               ocfs2_commit_trans(handle);
+
+       if (last_eb_bh)
+               brelse(last_eb_bh);
+
+       /* This will drop the ext_alloc cluster lock for us */
+       ocfs2_free_truncate_context(tc);
+
+       mlog_exit(status);
+       return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. This will figure out which
+ * inodes need to be locked and will put them on the returned truncate
+ * context.
+ */
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+                          struct inode *inode,
+                          struct buffer_head *fe_bh,
+                          struct ocfs2_truncate_context **tc)
+{
+       int status, metadata_delete;
+       unsigned int new_i_clusters;
+       struct ocfs2_dinode *fe;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list *el;
+       struct buffer_head *last_eb_bh = NULL;
+       struct inode *ext_alloc_inode = NULL;
+       struct buffer_head *ext_alloc_bh = NULL;
+
+       mlog_entry_void();
+
+       *tc = NULL;
+
+       new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+                                                 i_size_read(inode));
+       fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+       mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
+            "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+            le64_to_cpu(fe->i_size));
+
+       if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
+               ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
+                           "%u and size %"MLFu64" whereas struct inode has "
+                           "cluster count %u and size %llu which caused an "
+                           "invalid truncate to %u clusters.",
+                           le64_to_cpu(fe->i_blkno),
+                           le32_to_cpu(fe->i_clusters),
+                           le64_to_cpu(fe->i_size),
+                           OCFS2_I(inode)->ip_clusters, i_size_read(inode),
+                           new_i_clusters);
+               mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
+               status = -EIO;
+               goto bail;
+       }
+
+       *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+       if (!(*tc)) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       metadata_delete = 0;
+       if (fe->id2.i_list.l_tree_depth) {
+               /* If we have a tree, then the truncate may result in
+                * metadata deletes. Figure this out from the
+                * rightmost leaf block.*/
+               status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+                                         &last_eb_bh, OCFS2_BH_CACHED, inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+               if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                       OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+
+                       brelse(last_eb_bh);
+                       status = -EIO;
+                       goto bail;
+               }
+               el = &(eb->h_list);
+               if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+                       metadata_delete = 1;
+       }
+
+       (*tc)->tc_last_eb_bh = last_eb_bh;
+
+       if (metadata_delete) {
+               mlog(0, "Will have to delete metadata for this trunc. "
+                    "Locking allocator.\n");
+               ext_alloc_inode = ocfs2_get_system_file_inode(osb,
+                                               EXTENT_ALLOC_SYSTEM_INODE, 0);
+               if (!ext_alloc_inode) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               down(&ext_alloc_inode->i_sem);
+               (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+
+               status = ocfs2_meta_lock(ext_alloc_inode,
+                                        NULL,
+                                        &ext_alloc_bh,
+                                        1);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
+               (*tc)->tc_ext_alloc_locked = 1;
+       }
+
+       status = 0;
+bail:
+       if (status < 0) {
+               if (*tc)
+                       ocfs2_free_truncate_context(*tc);
+               *tc = NULL;
+       }
+       mlog_exit(status);
+       return status;
+}
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+{
+       if (tc->tc_ext_alloc_inode) {
+               if (tc->tc_ext_alloc_locked)
+                       ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+
+               up(&tc->tc_ext_alloc_inode->i_sem);
+               iput(tc->tc_ext_alloc_inode);
+       }
+
+       if (tc->tc_ext_alloc_bh)
+               brelse(tc->tc_ext_alloc_bh);
+
+       if (tc->tc_last_eb_bh)
+               brelse(tc->tc_last_eb_bh);
+
+       kfree(tc);
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644 (file)
index 0000000..12ba897
--- /dev/null
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+                       struct ocfs2_journal_handle *handle,
+                       struct inode *inode,
+                       struct buffer_head *fe_bh,
+                       u64 blkno,
+                       u32 new_clusters,
+                       struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+                          struct inode *inode,
+                          struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+       /*
+        * Rather than do all the work of determining how much we need
+        * (involves a ton of reads and locks), just ask for the
+        * maximal limit.  That's a tree depth shift.  So, one block for
+        * each level of the tree (current l_tree_depth), one block for
+        * the new tree_depth==0 extent_block, and one block at the new
+        * top of the tree.
+        */
+       return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+                                      int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+                                     int slot_num,
+                                     struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+                                        struct ocfs2_dinode *tl_copy);
+
+struct ocfs2_truncate_context {
+       struct inode *tc_ext_alloc_inode;
+       struct buffer_head *tc_ext_alloc_bh;
+       int tc_ext_alloc_locked; /* is it cluster locked? */
+       /* these get destroyed once it's passed to ocfs2_commit_truncate. */
+       struct buffer_head *tc_last_eb_bh;
+};
+
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+                          struct inode *inode,
+                          struct buffer_head *fe_bh,
+                          struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+                         struct inode *inode,
+                         struct buffer_head *fe_bh,
+                         struct ocfs2_truncate_context *tc);
+
+#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644 (file)
index 0000000..8f4467a
--- /dev/null
@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       int err = -EIO;
+       int status;
+       struct ocfs2_dinode *fe = NULL;
+       struct buffer_head *bh = NULL;
+       struct buffer_head *buffer_cache_bh = NULL;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       void *kaddr;
+
+       mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+                  (unsigned long long)iblock, bh_result, create);
+
+       BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+
+       if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+               mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+                    (unsigned long long)iblock);
+               goto bail;
+       }
+
+       status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                 OCFS2_I(inode)->ip_blkno,
+                                 &bh, OCFS2_BH_CACHED, inode);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       fe = (struct ocfs2_dinode *) bh->b_data;
+
+       if (!OCFS2_IS_VALID_DINODE(fe)) {
+               mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+                    le64_to_cpu(fe->i_blkno), 7, fe->i_signature);
+               goto bail;
+       }
+
+       if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+                                                   le32_to_cpu(fe->i_clusters))) {
+               mlog(ML_ERROR, "block offset is outside the allocated size: "
+                    "%llu\n", (unsigned long long)iblock);
+               goto bail;
+       }
+
+       /* We don't use the page cache to create symlink data, so if
+        * need be, copy it over from the buffer cache. */
+       if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+               u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+                           iblock;
+               buffer_cache_bh = sb_getblk(osb->sb, blkno);
+               if (!buffer_cache_bh) {
+                       mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+                       goto bail;
+               }
+
+               /* we haven't locked out transactions, so a commit
+                * could've happened. Since we've got a reference on
+                * the bh, even if it commits while we're doing the
+                * copy, the data is still good. */
+               if (buffer_jbd(buffer_cache_bh)
+                   && ocfs2_inode_is_new(inode)) {
+                       kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+                       if (!kaddr) {
+                               mlog(ML_ERROR, "couldn't kmap!\n");
+                               goto bail;
+                       }
+                       memcpy(kaddr + (bh_result->b_size * iblock),
+                              buffer_cache_bh->b_data,
+                              bh_result->b_size);
+                       kunmap_atomic(kaddr, KM_USER0);
+                       set_buffer_uptodate(bh_result);
+               }
+               brelse(buffer_cache_bh);
+       }
+
+       map_bh(bh_result, inode->i_sb,
+              le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+
+       err = 0;
+
+bail:
+       if (bh)
+               brelse(bh);
+
+       mlog_exit(err);
+       return err;
+}
+
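+/*
+ * Map a logical block in the file to a physical block via the extent
+ * map.  No allocation happens here; since ocfs2 does not support
+ * holes, a mapped block of zero is treated as an error (-EIO).
+ */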
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+                          struct buffer_head *bh_result, int create)
+{
+       int err = 0;
+       u64 p_blkno, past_eof;
+
+       mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+                  (unsigned long long)iblock, bh_result, create);
+
+       if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+               mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+                    inode, inode->i_ino);
+
+       if (S_ISLNK(inode->i_mode)) {
+               /* this always does I/O for some reason. */
+               err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+               goto bail;
+       }
+
+       /* this can happen if another node truncs after our extend! */
+       spin_lock(&OCFS2_I(inode)->ip_lock);
+       if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+                                              OCFS2_I(inode)->ip_clusters))
+               err = -EIO;
+       spin_unlock(&OCFS2_I(inode)->ip_lock);
+       if (err)
+               goto bail;
+
+       err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+                                         NULL);
+       if (err) {
+               mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+                    "%"MLFu64", NULL)\n", err, inode,
+                    (unsigned long long)iblock, p_blkno);
+               goto bail;
+       }
+
+       map_bh(bh_result, inode->i_sb, p_blkno);
+
+       if (bh_result->b_blocknr == 0) {
+               err = -EIO;
+               mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+                    "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+                    p_blkno, OCFS2_I(inode)->ip_blkno);
+       }
+
+       past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+       mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+
+       if (create && (iblock >= past_eof))
+               set_buffer_new(bh_result);
+
+bail:
+       if (err < 0)
+               err = -EIO;
+
+       mlog_exit(err);
+       return err;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+       int ret, unlock = 1;
+
+       mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+       ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+       if (ret != 0) {
+               if (ret == AOP_TRUNCATED_PAGE)
+                       unlock = 0;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       /*
+        * i_size might have just been updated as we grabbed the meta lock.  We
+        * might now be discovering a truncate that hit on another node.
+        * block_read_full_page->get_block freaks out if it is asked to read
+        * beyond the end of a file, so we check here.  Callers
+        * (generic_file_read, fault->nopage) are clever enough to check i_size
+        * and notice that the page they just read isn't needed.
+        *
+        * XXX sys_readahead() seems to get that wrong?
+        */
+       if (start >= i_size_read(inode)) {
+               char *addr = kmap(page);
+               memset(addr, 0, PAGE_SIZE);
+               flush_dcache_page(page);
+               kunmap(page);
+               SetPageUptodate(page);
+               ret = 0;
+               goto out_alloc;
+       }
+
+       ret = ocfs2_data_lock_with_page(inode, 0, page);
+       if (ret != 0) {
+               if (ret == AOP_TRUNCATED_PAGE)
+                       unlock = 0;
+               mlog_errno(ret);
+               goto out_alloc;
+       }
+
+       ret = block_read_full_page(page, ocfs2_get_block);
+       unlock = 0;
+
+       ocfs2_data_unlock(inode, 0);
+out_alloc:
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+       ocfs2_meta_unlock(inode, 0);
+out:
+       if (unlock)
+               unlock_page(page);
+       mlog_exit(ret);
+       return ret;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+       int ret;
+
+       mlog_entry("(0x%p)\n", page);
+
+       ret = block_write_full_page(page, ocfs2_get_block, wbc);
+
+       mlog_exit(ret);
+
+       return ret;
+}
+
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+                       unsigned from, unsigned to)
+{
+       struct inode *inode = page->mapping->host;
+       int ret;
+
+       mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+       ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+       if (ret != 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ocfs2_meta_unlock(inode, 0);
+out:
+       mlog_exit(ret);
+       return ret;
+}
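+
+/* For reference, prepare_write/commit_write are driven as a pair by
+ * the generic write paths, roughly:
+ *
+ *     a_ops->prepare_write(file, page, from, to);
+ *     ...copy user data into the page between from and to...
+ *     a_ops->commit_write(file, page, from, to);
+ *
+ * so the locking here only needs to cover the block mapping; the
+ * i_size update happens in ocfs2_commit_write() below.
+ */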
+
+/* Taken from ext3. We don't necessarily need the full-blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(  handle_t *handle,
+                               struct buffer_head *head,
+                               unsigned from,
+                               unsigned to,
+                               int *partial,
+                               int (*fn)(      handle_t *handle,
+                                               struct buffer_head *bh))
+{
+       struct buffer_head *bh;
+       unsigned block_start, block_end;
+       unsigned blocksize = head->b_size;
+       int err, ret = 0;
+       struct buffer_head *next;
+
+       for (   bh = head, block_start = 0;
+               ret == 0 && (bh != head || !block_start);
+               block_start = block_end, bh = next)
+       {
+               next = bh->b_this_page;
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (partial && !buffer_uptodate(bh))
+                               *partial = 1;
+                       continue;
+               }
+               err = (*fn)(handle, bh);
+               if (!ret)
+                       ret = err;
+       }
+       return ret;
+}
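+
+/* For a 4k page with 1k blocks and (from, to) = (1024, 3072), say, the
+ * loop above visits four buffers covering [0,1k), [1k,2k), [2k,3k) and
+ * [3k,4k); fn runs on the middle two, while the first and last only
+ * contribute to *partial if they aren't uptodate. */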
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+                                                        struct page *page,
+                                                        unsigned from,
+                                                        unsigned to)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_journal_handle *handle = NULL;
+       int ret = 0;
+
+       handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+       if (!handle) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (ocfs2_should_order_data(inode)) {
+               ret = walk_page_buffers(handle->k_handle,
+                                       page_buffers(page),
+                                       from, to, NULL,
+                                       ocfs2_journal_dirty_data);
+               if (ret < 0) 
+                       mlog_errno(ret);
+       }
+out:
+       if (ret) {
+               if (handle)
+                       ocfs2_commit_trans(handle);
+               handle = ERR_PTR(ret);
+       }
+       return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+                             unsigned from, unsigned to)
+{
+       int ret, extending = 0, locklevel = 0;
+       loff_t new_i_size;
+       struct buffer_head *di_bh = NULL;
+       struct inode *inode = page->mapping->host;
+       struct ocfs2_journal_handle *handle = NULL;
+
+       mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+       /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+        * us to sample inode->i_size here without the metadata lock:
+        *
+        * 1) We're currently holding the inode alloc lock, so no
+        *    nodes can change it underneath us.
+        *
+        * 2) We've had to take the metadata lock at least once
+        *    already to check for extending writes, hence ensuring
+        *    that our current copy is also up to date.
+        */
+       new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+       if (new_i_size > i_size_read(inode)) {
+               extending = 1;
+               locklevel = 1;
+       }
+
+       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+       if (ret != 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_data_lock_with_page(inode, 1, page);
+       if (ret != 0) {
+               mlog_errno(ret);
+               goto out_unlock_meta;
+       }
+
+       if (extending) {
+               handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       handle = NULL;
+                       goto out_unlock_data;
+               }
+
+               /* Mark our buffer early. We'd rather catch this error
+                * here than after a successful commit_write, which would
+                * require us to roll back inode->i_size. */
+               ret = ocfs2_journal_access(handle, inode, di_bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+       }
+
+       /* might update i_size */
+       ret = generic_commit_write(file, page, from, to);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       if (extending) {
+               loff_t size = (u64) i_size_read(inode);
+               struct ocfs2_dinode *di =
+                       (struct ocfs2_dinode *)di_bh->b_data;
+
+               /* ocfs2_mark_inode_dirty is too heavy to use here. */
+               inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+               di->i_size = cpu_to_le64(size);
+               di->i_ctime = di->i_mtime = 
+                               cpu_to_le64(inode->i_mtime.tv_sec);
+               di->i_ctime_nsec = di->i_mtime_nsec = 
+                               cpu_to_le32(inode->i_mtime.tv_nsec);
+
+               ret = ocfs2_journal_dirty(handle, di_bh);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+       }
+
+       BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+       if (handle)
+               ocfs2_commit_trans(handle);
+out_unlock_data:
+       ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+       ocfs2_meta_unlock(inode, locklevel);
+out:
+       if (di_bh)
+               brelse(di_bh);
+
+       mlog_exit(ret);
+       return ret;
+}
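+
+/* The extending-write dance above, in brief: sample the new i_size,
+ * take the meta lock at a write level (locklevel 1) if we'd grow the
+ * file, journal the inode *before* generic_commit_write() moves
+ * i_size, then push the new size and times into the dinode within the
+ * same transaction. */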
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+       sector_t status;
+       u64 p_blkno = 0;
+       int err = 0;
+       struct inode *inode = mapping->host;
+
+       mlog_entry("(block = %llu)\n", (unsigned long long)block);
+
+       /* We don't need to lock journal system files, since they aren't
+        * accessed concurrently from multiple nodes.
+        */
+       if (!INODE_JOURNAL(inode)) {
+               err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+               if (err) {
+                       if (err != -ENOENT)
+                               mlog_errno(err);
+                       goto bail;
+               }
+               down_read(&OCFS2_I(inode)->ip_alloc_sem);
+       }
+
+       err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+                                         NULL);
+
+       if (!INODE_JOURNAL(inode)) {
+               up_read(&OCFS2_I(inode)->ip_alloc_sem);
+               ocfs2_meta_unlock(inode, 0);
+       }
+
+       if (err) {
+               mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+                    (unsigned long long)block);
+               mlog_errno(err);
+               goto bail;
+       }
+
+bail:
+       status = err ? 0 : p_blkno;
+
+       mlog_exit((int)status);
+
+       return status;
+}
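+
+/* ->bmap backs the FIBMAP ioctl; from user space, roughly:
+ *
+ *     int blk = n;              // logical file block in
+ *     ioctl(fd, FIBMAP, &blk);  // disk block out
+ *
+ * with 0 meaning "unmapped", which is why errors map to 0 above. */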
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ *                                     fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+                                    unsigned long max_blocks,
+                                    struct buffer_head *bh_result, int create)
+{
+       int ret;
+       u64 vbo_max; /* file offset, max_blocks from iblock */
+       u64 p_blkno;
+       int contig_blocks;
+       unsigned char blocksize_bits;
+
+       if (!inode || !bh_result) {
+               mlog(ML_ERROR, "inode or bh_result is null\n");
+               return -EIO;
+       }
+
+       blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+       /* This function won't even be called if the request isn't all
+        * nicely aligned and of the right size, so there's no need
+        * for us to check any of that. */
+
+       vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
+
+       spin_lock(&OCFS2_I(inode)->ip_lock);
+       if ((iblock + max_blocks) >
+           ocfs2_clusters_to_blocks(inode->i_sb,
+                                    OCFS2_I(inode)->ip_clusters)) {
+               spin_unlock(&OCFS2_I(inode)->ip_lock);
+               ret = -EIO;
+               goto bail;
+       }
+       spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+       /* This figures out the size of the next contiguous block, and
+        * our logical offset */
+       ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+                                         &contig_blocks);
+       if (ret) {
+               mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+                    (unsigned long long)iblock);
+               ret = -EIO;
+               goto bail;
+       }
+
+       map_bh(bh_result, inode->i_sb, p_blkno);
+
+       /* make sure we don't map more than max_blocks blocks here as
+          that's all the kernel will handle at this point. */
+       if (max_blocks < contig_blocks)
+               contig_blocks = max_blocks;
+       bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+       return ret;
+}
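+
+/* The contract with direct-io.c, sketched: we map the first physical
+ * block for iblock and advertise how far the extent stays contiguous
+ * via b_size.  E.g. if the extent map reports 32 contiguous blocks but
+ * max_blocks is 16, b_size is capped at 16 << blocksize_bits and dio
+ * calls back in for the rest. */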
+
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Just as the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect I/O on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+                            loff_t offset,
+                            ssize_t bytes,
+                            void *private)
+{
+       struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+       /* this io's submitter should not have unlocked this before we could */
+       BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+       ocfs2_iocb_clear_rw_locked(iocb);
+       up_read(&inode->i_alloc_sem);
+       ocfs2_rw_unlock(inode, 0);
+}
+
+static ssize_t ocfs2_direct_IO(int rw,
+                              struct kiocb *iocb,
+                              const struct iovec *iov,
+                              loff_t offset,
+                              unsigned long nr_segs)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+       int ret;
+
+       mlog_entry_void();
+       ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+                                           inode->i_sb->s_bdev, iov, offset,
+                                           nr_segs, 
+                                           ocfs2_direct_IO_get_blocks,
+                                           ocfs2_dio_end_io);
+       mlog_exit(ret);
+       return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+       .readpage       = ocfs2_readpage,
+       .writepage      = ocfs2_writepage,
+       .prepare_write  = ocfs2_prepare_write,
+       .commit_write   = ocfs2_commit_write,
+       .bmap           = ocfs2_bmap,
+       .sync_page      = block_sync_page,
+       .direct_IO      = ocfs2_direct_IO
+};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644 (file)
index 0000000..d40456d
--- /dev/null
@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+                       unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+                                                        struct page *page,
+                                                        unsigned from,
+                                                        unsigned to);
+
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+       test_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+       set_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+       clear_bit(0, (unsigned long *)&iocb->private)
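+
+/* These pair up around a dio: the submitter is expected to set the bit
+ * after taking ocfs2_rw_lock(), and ocfs2_dio_end_io() clears it and
+ * drops the lock.  Stealing bit 0 of iocb->private like this is only
+ * safe as long as nothing else stores a pointer there for these iocbs. */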
+
+#endif /* OCFS2_AOPS_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644 (file)
index 0000000..d424041
--- /dev/null
@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+                     struct inode *inode)
+{
+       int ret = 0;
+
+       mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+                  (unsigned long long)bh->b_blocknr, inode);
+
+       BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+       BUG_ON(buffer_jbd(bh));
+
+       /* No need to check for a soft readonly file system here. non
+        * journalled writes are only ever done on system files which
+        * can get modified during recovery even if read-only. */
+       if (ocfs2_is_hard_readonly(osb)) {
+               ret = -EROFS;
+               goto out;
+       }
+
+       down(&OCFS2_I(inode)->ip_io_sem);
+
+       lock_buffer(bh);
+       set_buffer_uptodate(bh);
+
+       /* remove from dirty list before I/O. */
+       clear_buffer_dirty(bh);
+
+       get_bh(bh); /* for end_buffer_write_sync() */
+       bh->b_end_io = end_buffer_write_sync;
+       submit_bh(WRITE, bh);
+
+       wait_on_buffer(bh);
+
+       if (buffer_uptodate(bh)) {
+               ocfs2_set_buffer_uptodate(inode, bh);
+       } else {
+               /* We don't need to remove the clustered uptodate
+                * information for this bh as it's not marked locally
+                * uptodate. */
+               ret = -EIO;
+               brelse(bh);
+       }
+
+       up(&OCFS2_I(inode)->ip_io_sem);
+out:
+       mlog_exit(ret);
+       return ret;
+}
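+
+/* This is the stock synchronous buffer write idiom: mark the bh
+ * uptodate, clear the dirty bit before I/O so writeback won't resubmit
+ * it, complete via end_buffer_write_sync() and block in
+ * wait_on_buffer() until the write finishes. */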
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+                     struct buffer_head *bhs[], int flags,
+                     struct inode *inode)
+{
+       int status = 0;
+       struct super_block *sb;
+       int i, ignore_cache = 0;
+       struct buffer_head *bh;
+
+       mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+                  block, nr, flags, inode);
+
+       if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+               status = -EINVAL;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       if (nr < 0) {
+               mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+               status = -EINVAL;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       if (nr == 0) {
+               mlog(ML_BH_IO, "No buffers will be read!\n");
+               status = 0;
+               goto bail;
+       }
+
+       sb = osb->sb;
+
+       if (flags & OCFS2_BH_CACHED && !inode)
+               flags &= ~OCFS2_BH_CACHED;
+
+       if (inode)
+               down(&OCFS2_I(inode)->ip_io_sem);
+       for (i = 0 ; i < nr ; i++) {
+               if (bhs[i] == NULL) {
+                       bhs[i] = sb_getblk(sb, block++);
+                       if (bhs[i] == NULL) {
+                               if (inode)
+                                       up(&OCFS2_I(inode)->ip_io_sem);
+                               status = -EIO;
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+               bh = bhs[i];
+               ignore_cache = 0;
+
+               if (flags & OCFS2_BH_CACHED &&
+                   !ocfs2_buffer_uptodate(inode, bh)) {
+                       mlog(ML_UPTODATE,
+                            "bh (%llu), inode %"MLFu64" not uptodate\n",
+                            (unsigned long long)bh->b_blocknr,
+                            OCFS2_I(inode)->ip_blkno);
+                       ignore_cache = 1;
+               }
+
+               /* XXX: Can we ever get this and *not* have the cached
+                * flag set? */
+               if (buffer_jbd(bh)) {
+                       if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+                               mlog(ML_BH_IO, "trying to sync read a jbd "
+                                              "managed bh (blocknr = %llu)\n",
+                                    (unsigned long long)bh->b_blocknr);
+                       continue;
+               }
+
+               if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+                       if (buffer_dirty(bh)) {
+                               /* This should probably be a BUG, or
+                                * at least return an error. */
+                               mlog(ML_BH_IO, "asking me to sync read a dirty "
+                                              "buffer! (blocknr = %llu)\n",
+                                    (unsigned long long)bh->b_blocknr);
+                               continue;
+                       }
+
+                       lock_buffer(bh);
+                       if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+                               mlog(ML_ERROR, "block %llu had the JBD bit set "
+                                              "while I was in lock_buffer!",
+                                    (unsigned long long)bh->b_blocknr);
+                               BUG();
+#else
+                               unlock_buffer(bh);
+                               continue;
+#endif
+                       }
+                       clear_buffer_uptodate(bh);
+                       get_bh(bh); /* for end_buffer_read_sync() */
+                       bh->b_end_io = end_buffer_read_sync;
+                       if (flags & OCFS2_BH_READAHEAD)
+                               submit_bh(READA, bh);
+                       else
+                               submit_bh(READ, bh);
+                       continue;
+               }
+       }
+
+       status = 0;
+
+       for (i = (nr - 1); i >= 0; i--) {
+               bh = bhs[i];
+
+               /* We know this can't have changed as we hold the
+                * inode sem. Avoid doing any work on the bh if the
+                * journal has it. */
+               if (!buffer_jbd(bh))
+                       wait_on_buffer(bh);
+
+               if (!buffer_uptodate(bh)) {
+                       /* Status won't be cleared from here on out,
+                        * so we can safely record this and loop back
+                        * to cleanup the other buffers. Don't need to
+                        * remove the clustered uptodate information
+                        * for this bh as it's not marked locally
+                        * uptodate. */
+                       status = -EIO;
+                       brelse(bh);
+                       bhs[i] = NULL;
+                       continue;
+               }
+
+               if (inode)
+                       ocfs2_set_buffer_uptodate(inode, bh);
+       }
+       if (inode)
+               up(&OCFS2_I(inode)->ip_io_sem);
+
+       mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+            (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+
+bail:
+
+       mlog_exit(status);
+       return status;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644 (file)
index 0000000..6ecb909
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_buffer_head.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+                            int uptodate);
+
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+                                  u64                  off,
+                                  struct buffer_head **bh,
+                                  int                  flags,
+                                  struct inode        *inode);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+                     struct buffer_head  *bh,
+                     struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+                     u64                  block,
+                     int                  nr,
+                     struct buffer_head  *bhs[],
+                     int                  flags,
+                     struct inode        *inode);
+
+
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8   /* use this to pass READA down to submit_bh */
+
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+                                  struct buffer_head **bh, int flags,
+                                  struct inode *inode)
+{
+       int status = 0;
+
+       if (bh == NULL) {
+               printk("ocfs2: bh == NULL\n");
+               status = -EINVAL;
+               goto bail;
+       }
+
+       status = ocfs2_read_blocks(osb, off, 1, bh,
+                                  flags, inode);
+
+bail:
+       return status;
+}
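+
+/* Typical use is a single cached metadata read, e.g. (illustrative
+ * only; blkno is wherever the caller got the block number):
+ *
+ *     struct buffer_head *bh = NULL;
+ *
+ *     status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, inode);
+ *     if (!status) {
+ *             ...use bh->b_data...
+ *             brelse(bh);
+ *     }
+ */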
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
new file mode 100644 (file)
index 0000000..cdd162f
--- /dev/null
@@ -0,0 +1,4 @@
+obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
+
+ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
+       quorum.o tcp.o ver.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
new file mode 100644 (file)
index 0000000..2df9082
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_CLUSTER_ENDIAN_H
+#define OCFS2_CLUSTER_ENDIAN_H
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+       *var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
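+
+/* Convenience for read-modify-write on big-endian fields, e.g. for a
+ * hypothetical __be32 counter hdr->h_count:
+ *
+ *     be32_add_cpu(&hdr->h_count, 1);   // h_count += 1, still big-endian
+ */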
+
+#endif /* OCFS2_CLUSTER_ENDIAN_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
new file mode 100644 (file)
index 0000000..7307ba5
--- /dev/null
@@ -0,0 +1,1797 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "quorum.h"
+
+#include "masklog.h"
+
+
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls.  This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(o2hb_callback_sem);
+
+/*
+ * multiple hb threads are watching multiple regions.  A node is live
+ * whenever any of the threads sees activity from the node in its region.
+ */
+static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
+static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+static LIST_HEAD(o2hb_node_events);
+static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+
+static LIST_HEAD(o2hb_all_regions);
+
+static struct o2hb_callback {
+       struct list_head list;
+} o2hb_callbacks[O2HB_NUM_CB];
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
+
+#define O2HB_DEFAULT_BLOCK_BITS       9
+
+unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+
+/* Only sets a new threshold if there are no active regions. 
+ *
+ * No locking or otherwise interesting code is required for reading
+ * o2hb_dead_threshold as it can't change once regions are active and
+ * it's not interesting to anyone until then anyway. */
+static void o2hb_dead_threshold_set(unsigned int threshold)
+{
+       if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
+               spin_lock(&o2hb_live_lock);
+               if (list_empty(&o2hb_all_regions))
+                       o2hb_dead_threshold = threshold;
+               spin_unlock(&o2hb_live_lock);
+       }
+}
+
+struct o2hb_node_event {
+       struct list_head        hn_item;
+       enum o2hb_callback_type hn_event_type;
+       struct o2nm_node        *hn_node;
+       int                     hn_node_num;
+};
+
+struct o2hb_disk_slot {
+       struct o2hb_disk_heartbeat_block *ds_raw_block;
+       u8                      ds_node_num;
+       u64                     ds_last_time;
+       u64                     ds_last_generation;
+       u16                     ds_equal_samples;
+       u16                     ds_changed_samples;
+       struct list_head        ds_live_item;
+};
+
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, and it cleans up the region */
+struct o2hb_region {
+       struct config_item      hr_item;
+
+       struct list_head        hr_all_item;
+       unsigned                hr_unclean_stop:1;
+
+       /* protected by the hr_callback_sem */
+       struct task_struct      *hr_task;
+
+       unsigned int            hr_blocks;
+       unsigned long long      hr_start_block;
+
+       unsigned int            hr_block_bits;
+       unsigned int            hr_block_bytes;
+
+       unsigned int            hr_slots_per_page;
+       unsigned int            hr_num_pages;
+
+       struct page             **hr_slot_data;
+       struct block_device     *hr_bdev;
+       struct o2hb_disk_slot   *hr_slots;
+
+       /* let the process setting up hb block until the region has
+        * reached a 'steady' state.  This will be fixed when we have
+        * a more complete api that doesn't lead to this sort of fragility. */
+       atomic_t                hr_steady_iterations;
+
+       char                    hr_dev_name[BDEVNAME_SIZE];
+
+       unsigned int            hr_timeout_ms;
+
+       /* randomized as the region goes up and down so that a node
+        * recognizes a node going up and down in one iteration */
+       u64                     hr_generation;
+
+       struct work_struct      hr_write_timeout_work;
+       unsigned long           hr_last_timeout_start;
+
+       /* Used during o2hb_check_slot to hold a copy of the block
+        * being checked because we temporarily have to zero out the
+        * crc field. */
+       struct o2hb_disk_heartbeat_block *hr_tmp_block;
+};
+
+struct o2hb_bio_wait_ctxt {
+       atomic_t          wc_num_reqs;
+       struct completion wc_io_complete;
+};
+
+static void o2hb_write_timeout(void *arg)
+{
+       struct o2hb_region *reg = arg;
+
+       mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
+            "milliseconds\n", reg->hr_dev_name,
+            jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 
+       o2quo_disk_timeout();
+}
+
+static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+{
+       mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+
+       cancel_delayed_work(&reg->hr_write_timeout_work);
+       reg->hr_last_timeout_start = jiffies;
+       schedule_delayed_work(&reg->hr_write_timeout_work,
+                             msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+}
+
+static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+{
+       cancel_delayed_work(&reg->hr_write_timeout_work);
+       flush_scheduled_work();
+}
+
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
+                                     unsigned int num_ios)
+{
+       atomic_set(&wc->wc_num_reqs, num_ios);
+       init_completion(&wc->wc_io_complete);
+}
+
+/* Used in error paths too */
+static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
+                                    unsigned int num)
+{
+       /* sadly atomic_sub_and_test() isn't available on all platforms.  The
+        * good news is that the fast path only completes one at a time */
+       while(num--) {
+               if (atomic_dec_and_test(&wc->wc_num_reqs)) {
+                       BUG_ON(num > 0);
+                       complete(&wc->wc_io_complete);
+               }
+       }
+}
+
+static void o2hb_wait_on_io(struct o2hb_region *reg,
+                           struct o2hb_bio_wait_ctxt *wc)
+{
+       struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
+
+       blk_run_address_space(mapping);
+
+       wait_for_completion(&wc->wc_io_complete);
+}
+
+static int o2hb_bio_end_io(struct bio *bio,
+                          unsigned int bytes_done,
+                          int error)
+{
+       struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
+
+       if (error)
+               mlog(ML_ERROR, "IO Error %d\n", error);
+
+       if (bio->bi_size)
+               return 1;
+
+       o2hb_bio_wait_dec(wc, 1);
+       return 0;
+}
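+
+/* Note that in this completion scheme ->bi_end_io can be called for
+ * partial completions; bi_size only hits zero on the final call, and
+ * that's the one where we drop our count on the wait context. */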
+
+/* Set up a bio to cover I/O against num_slots slots starting at
+ * start_slot. */
+static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
+                                     struct o2hb_bio_wait_ctxt *wc,
+                                     unsigned int start_slot,
+                                     unsigned int num_slots)
+{
+       int i, nr_vecs, len, first_page, last_page;
+       unsigned int vec_len, vec_start;
+       unsigned int bits = reg->hr_block_bits;
+       unsigned int spp = reg->hr_slots_per_page;
+       struct bio *bio;
+       struct page *page;
+
+       nr_vecs = (num_slots + spp - 1) / spp;
+
+       /* Testing has shown this allocation to take long enough under
+        * GFP_KERNEL that the local node can get fenced. It would be
+        * nicest if we could pre-allocate these bios and avoid this
+        * altogether. */
+       bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+       if (!bio) {
+               mlog(ML_ERROR, "Could not alloc slots BIO!\n");
+               bio = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /* Must put everything in 512 byte sectors for the bio... */
+       bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+       bio->bi_bdev = reg->hr_bdev;
+       bio->bi_private = wc;
+       bio->bi_end_io = o2hb_bio_end_io;
+
+       first_page = start_slot / spp;
+       last_page = first_page + nr_vecs;
+       vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
+       for(i = first_page; i < last_page; i++) {
+               page = reg->hr_slot_data[i];
+
+               vec_len = PAGE_CACHE_SIZE;
+               /* last page might be short */
+               if (((i + 1) * spp) > (start_slot + num_slots))
+                       vec_len = ((num_slots + start_slot) % spp) << bits;
+               vec_len -=  vec_start;
+
+               mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+                    i, vec_len, vec_start);
+
+               len = bio_add_page(bio, page, vec_len, vec_start);
+               if (len != vec_len) {
+                       bio_put(bio);
+                       bio = ERR_PTR(-EIO);
+
+                       mlog(ML_ERROR, "Error adding page to bio i = %d, "
+                            "vec_len = %u, len = %d\n, start = %u\n",
+                            i, vec_len, len, vec_start);
+                       goto bail;
+               }
+
+               vec_start = 0;
+       }
+
+bail:
+       return bio;
+}
+
+/*
+ * Compute the maximum number of sectors the bdev can handle in one bio,
+ * as a power of two.
+ *
+ * Stolen from oracleasm, thanks Joel!
+ */
+static int compute_max_sectors(struct block_device *bdev)
+{
+       int max_pages, max_sectors, pow_two_sectors;
+
+       struct request_queue *q;
+
+       q = bdev_get_queue(bdev);
+       max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
+       if (max_pages > BIO_MAX_PAGES)
+               max_pages = BIO_MAX_PAGES;
+       if (max_pages > q->max_phys_segments)
+               max_pages = q->max_phys_segments;
+       if (max_pages > q->max_hw_segments)
+               max_pages = q->max_hw_segments;
+       max_pages--; /* Handle I/Os that straddle a page */
+
+       max_sectors = max_pages << (PAGE_SHIFT - 9);
+
+       /* Why is fls() 1-based???? */
+       pow_two_sectors = 1 << (fls(max_sectors) - 1);
+
+       return pow_two_sectors;
+}
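+
+/* e.g. if the queue limits work out to max_pages = 38 on a 4k-page
+ * box, max_sectors = 38 << 3 = 304 and we return
+ * 1 << (fls(304) - 1) = 256 sectors. */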
+
+static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
+                                              unsigned int num_slots,
+                                              unsigned int *num_bios,
+                                              unsigned int *slots_per_bio)
+{
+       unsigned int max_sectors, io_sectors;
+
+       max_sectors = compute_max_sectors(reg->hr_bdev);
+
+       io_sectors = num_slots << (reg->hr_block_bits - 9);
+
+       *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
+       *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
+
+       mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
+            "device can handle %u sectors of I/O\n", io_sectors, num_slots,
+            max_sectors);
+       mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
+            *num_bios, *slots_per_bio);
+}
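+
+/* Worked example: 254 slots of 512 bytes (hr_block_bits = 9) against a
+ * 256-sector device limit gives io_sectors = 254, num_bios = 1 and
+ * slots_per_bio = 256 -- one bio covers the whole region. */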
+
+static int o2hb_read_slots(struct o2hb_region *reg,
+                          unsigned int max_slots)
+{
+       unsigned int num_bios, slots_per_bio, start_slot, num_slots;
+       int i, status;
+       struct o2hb_bio_wait_ctxt wc;
+       struct bio **bios;
+       struct bio *bio;
+
+       o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+
+       bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
+       if (!bios) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               return status;
+       }
+
+       o2hb_bio_wait_init(&wc, num_bios);
+
+       num_slots = slots_per_bio;
+       for(i = 0; i < num_bios; i++) {
+               start_slot = i * slots_per_bio;
+
+               /* adjust num_slots at last bio */
+               if (max_slots < (start_slot + num_slots))
+                       num_slots = max_slots - start_slot;
+
+               bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
+               if (IS_ERR(bio)) {
+                       o2hb_bio_wait_dec(&wc, num_bios - i);
+
+                       status = PTR_ERR(bio);
+                       mlog_errno(status);
+                       goto bail_and_wait;
+               }
+               bios[i] = bio;
+
+               submit_bio(READ, bio);
+       }
+
+       status = 0;
+
+bail_and_wait:
+       o2hb_wait_on_io(reg, &wc);
+
+       if (bios) {
+               for(i = 0; i < num_bios; i++)
+                       if (bios[i])
+                               bio_put(bios[i]);
+               kfree(bios);
+       }
+
+       return status;
+}
+
+static int o2hb_issue_node_write(struct o2hb_region *reg,
+                                struct bio **write_bio,
+                                struct o2hb_bio_wait_ctxt *write_wc)
+{
+       int status;
+       unsigned int slot;
+       struct bio *bio;
+
+       o2hb_bio_wait_init(write_wc, 1);
+
+       slot = o2nm_this_node();
+
+       bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+       if (IS_ERR(bio)) {
+               status = PTR_ERR(bio);
+               mlog_errno(status);
+               goto bail;
+       }
+
+       submit_bio(WRITE, bio);
+
+       *write_bio = bio;
+       status = 0;
+bail:
+       return status;
+}
+
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+                                    struct o2hb_disk_heartbeat_block *hb_block)
+{
+       __le32 old_cksum;
+       u32 ret;
+
+       /* We want to compute the block crc with a 0 value in the
+        * hb_cksum field. Save it off here and replace after the
+        * crc. */
+       old_cksum = hb_block->hb_cksum;
+       hb_block->hb_cksum = 0;
+
+       ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+       hb_block->hb_cksum = old_cksum;
+
+       return ret;
+}
+
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{
+       mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
+            "cksum = 0x%x, generation 0x%"MLFx64"\n",
+            le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
+            le32_to_cpu(hb_block->hb_cksum),
+            le64_to_cpu(hb_block->hb_generation));
+}
+
+static int o2hb_verify_crc(struct o2hb_region *reg,
+                          struct o2hb_disk_heartbeat_block *hb_block)
+{
+       u32 read, computed;
+
+       read = le32_to_cpu(hb_block->hb_cksum);
+       computed = o2hb_compute_block_crc_le(reg, hb_block);
+
+       return read == computed;
+}
+
+/* We want to make sure that nobody is heartbeating on top of us --
+ * this will help detect an invalid configuration. */
+static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+{
+       int node_num, ret;
+       struct o2hb_disk_slot *slot;
+       struct o2hb_disk_heartbeat_block *hb_block;
+
+       node_num = o2nm_this_node();
+
+       ret = 1;
+       slot = &reg->hr_slots[node_num];
+       /* Don't check on our 1st timestamp */
+       if (slot->ds_last_time) {
+               hb_block = slot->ds_raw_block;
+
+               if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
+                       ret = 0;
+       }
+
+       return ret;
+}
+
+static inline void o2hb_prepare_block(struct o2hb_region *reg,
+                                     u64 generation)
+{
+       int node_num;
+       u64 cputime;
+       struct o2hb_disk_slot *slot;
+       struct o2hb_disk_heartbeat_block *hb_block;
+
+       node_num = o2nm_this_node();
+       slot = &reg->hr_slots[node_num];
+
+       hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
+       memset(hb_block, 0, reg->hr_block_bytes);
+       /* TODO: time stuff */
+       cputime = CURRENT_TIME.tv_sec;
+       if (!cputime)
+               cputime = 1;
+
+       hb_block->hb_seq = cpu_to_le64(cputime);
+       hb_block->hb_node = node_num;
+       hb_block->hb_generation = cpu_to_le64(generation);
+
+       /* This step must always happen last! */
+       hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+                                                                  hb_block));
+
+       mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+            cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
+}
+
+static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
+                               struct o2nm_node *node,
+                               int idx)
+{
+       struct list_head *iter;
+       struct o2hb_callback_func *f;
+
+       list_for_each(iter, &hbcall->list) {
+               f = list_entry(iter, struct o2hb_callback_func, hc_item);
+               mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
+               (f->hc_func)(node, idx, f->hc_data);
+       }
+}
+
+/* Will run the list in order until we process the passed event */
+static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
+{
+       int empty;
+       struct o2hb_callback *hbcall;
+       struct o2hb_node_event *event;
+
+       spin_lock(&o2hb_live_lock);
+       empty = list_empty(&queued_event->hn_item);
+       spin_unlock(&o2hb_live_lock);
+       if (empty)
+               return;
+
+       /* Holding callback sem assures we don't alter the callback
+        * lists when doing this, and serializes ourselves with other
+        * processes wanting callbacks. */
+       down_write(&o2hb_callback_sem);
+
+       spin_lock(&o2hb_live_lock);
+       while (!list_empty(&o2hb_node_events)
+              && !list_empty(&queued_event->hn_item)) {
+               event = list_entry(o2hb_node_events.next,
+                                  struct o2hb_node_event,
+                                  hn_item);
+               list_del_init(&event->hn_item);
+               spin_unlock(&o2hb_live_lock);
+
+               mlog(ML_HEARTBEAT, "Node %s event for %d\n",
+                    event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
+                    event->hn_node_num);
+
+               hbcall = hbcall_from_type(event->hn_event_type);
+
+               /* We should *never* have gotten on to the list with a
+                * bad type... This isn't something that we should try
+                * to recover from. */
+               BUG_ON(IS_ERR(hbcall));
+
+               o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
+
+               spin_lock(&o2hb_live_lock);
+       }
+       spin_unlock(&o2hb_live_lock);
+
+       up_write(&o2hb_callback_sem);
+}
+
+static void o2hb_queue_node_event(struct o2hb_node_event *event,
+                                 enum o2hb_callback_type type,
+                                 struct o2nm_node *node,
+                                 int node_num)
+{
+       assert_spin_locked(&o2hb_live_lock);
+
+       event->hn_event_type = type;
+       event->hn_node = node;
+       event->hn_node_num = node_num;
+
+       mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
+            type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
+
+       list_add_tail(&event->hn_item, &o2hb_node_events);
+}
+
+static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
+{
+       struct o2hb_node_event event =
+               { .hn_item = LIST_HEAD_INIT(event.hn_item), };
+       struct o2nm_node *node;
+
+       node = o2nm_get_node_by_num(slot->ds_node_num);
+       if (!node)
+               return;
+
+       spin_lock(&o2hb_live_lock);
+       if (!list_empty(&slot->ds_live_item)) {
+               mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
+                    slot->ds_node_num);
+
+               list_del_init(&slot->ds_live_item);
+
+               if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+                       clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+                       o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+                                             slot->ds_node_num);
+               }
+       }
+       spin_unlock(&o2hb_live_lock);
+
+       o2hb_run_event_list(&event);
+
+       o2nm_node_put(node);
+}
+
+static int o2hb_check_slot(struct o2hb_region *reg,
+                          struct o2hb_disk_slot *slot)
+{
+       int changed = 0, gen_changed = 0;
+       struct o2hb_node_event event =
+               { .hn_item = LIST_HEAD_INIT(event.hn_item), };
+       struct o2nm_node *node;
+       struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
+       u64 cputime;
+
+       memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
+       /* Is this correct? Do we assume that the node doesn't exist
+        * if we're not configured for it? */
+       node = o2nm_get_node_by_num(slot->ds_node_num);
+       if (!node)
+               return 0;
+
+       if (!o2hb_verify_crc(reg, hb_block)) {
+               /* all paths from here will drop o2hb_live_lock for
+                * us. */
+               spin_lock(&o2hb_live_lock);
+
+               /* Don't print an error on the console in this case -
+                * a freshly formatted heartbeat area will not have a
+                * crc set on it. */
+               if (list_empty(&slot->ds_live_item))
+                       goto out;
+
+               /* The node is live but pushed out a bad crc. We
+                * consider it a transient miss but don't populate any
+                * other values as they may be junk. */
+               mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+                    slot->ds_node_num, reg->hr_dev_name);
+               o2hb_dump_slot(hb_block);
+
+               slot->ds_equal_samples++;
+               goto fire_callbacks;
+       }
+
+       /* we don't care if these wrap.. the state transitions below
+        * clear at the right places */
+       cputime = le64_to_cpu(hb_block->hb_seq);
+       if (slot->ds_last_time != cputime)
+               slot->ds_changed_samples++;
+       else
+               slot->ds_equal_samples++;
+       slot->ds_last_time = cputime;
+
+       /* The node changed heartbeat generations. We assume this to
+        * mean it dropped off but came back before we timed out. We
+        * want to consider it down for the time being but don't want
+        * to lose any changed_samples state we might build up to
+        * considering it live again. */
+       if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+               gen_changed = 1;
+               slot->ds_equal_samples = 0;
+               mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+                    "to 0x%"MLFx64")\n", slot->ds_node_num,
+                    slot->ds_last_generation,
+                    le64_to_cpu(hb_block->hb_generation));
+       }
+
+       slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+       mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
+            "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
+            slot->ds_node_num, slot->ds_last_generation,
+            le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), 
+            slot->ds_last_time, slot->ds_changed_samples,
+            slot->ds_equal_samples);
+
+       spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
+       /* dead nodes only come to life after some number of
+        * changes at any time during their dead time */
+       if (list_empty(&slot->ds_live_item) &&
+           slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
+               mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+                    "region\n", slot->ds_node_num, slot->ds_last_generation);
+
+               /* first on the list generates a callback */
+               if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+                       set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+                       o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
+                                             slot->ds_node_num);
+
+                       changed = 1;
+               }
+
+               list_add_tail(&slot->ds_live_item,
+                             &o2hb_live_slots[slot->ds_node_num]);
+
+               slot->ds_equal_samples = 0;
+               goto out;
+       }
+
+       /* if the list is dead, we're done.. */
+       if (list_empty(&slot->ds_live_item))
+               goto out;
+
+       /* live nodes only go dead after enough consecutive missed
+        * samples..  reset the missed counter whenever we see
+        * activity */
+       if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
+               mlog(ML_HEARTBEAT, "Node %d left my region\n",
+                    slot->ds_node_num);
+
+               /* last off the live_slot generates a callback */
+               list_del_init(&slot->ds_live_item);
+               if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+                       clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+                       o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+                                             slot->ds_node_num);
+
+                       changed = 1;
+               }
+
+               /* We don't clear this because the node is still
+                * actually writing new blocks. */
+               if (!gen_changed)
+                       slot->ds_changed_samples = 0;
+               goto out;
+       }
+       if (slot->ds_changed_samples) {
+               slot->ds_changed_samples = 0;
+               slot->ds_equal_samples = 0;
+       }
+out:
+       spin_unlock(&o2hb_live_lock);
+
+       o2hb_run_event_list(&event);
+
+       o2nm_node_put(node);
+       return changed;
+}
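+
+/* In short, each slot runs a small state machine per sample:
+ *
+ *     dead, changed_samples >= O2HB_LIVE_THRESHOLD  -> node comes up
+ *     live, equal_samples >= o2hb_dead_threshold    -> node goes down
+ *     live, generation changed                      -> down immediately,
+ *                                                      changed_samples kept
+ */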
+
+/* This could be faster if we just implemented a find_last_bit, but I
+ * don't think the circumstances warrant it. */
+static int o2hb_highest_node(unsigned long *nodes,
+                            int numbits)
+{
+       int highest, node;
+
+       highest = numbits;
+       node = -1;
+       while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
+               if (node >= numbits)
+                       break;
+
+               highest = node;
+       }
+
+       return highest;
+}
+
+static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+{
+       int i, ret, highest_node, change = 0;
+       unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       struct bio *write_bio;
+       struct o2hb_bio_wait_ctxt write_wc;
+
+       if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
+               return;
+
+       highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+       if (highest_node >= O2NM_MAX_NODES) {
+               mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
+               return;
+       }
+
+       /* No sense in reading the slots of nodes that don't exist
+        * yet. Of course, if the node definitions have holes in them
+        * then we're reading an empty slot anyway... Consider this
+        * best-effort. */
+       ret = o2hb_read_slots(reg, highest_node + 1);
+       if (ret < 0) {
+               mlog_errno(ret);
+               return;
+       }
+
+       /* With an up to date view of the slots, we can check that no
+        * other node has been improperly configured to heartbeat in
+        * our slot. */
+       if (!o2hb_check_last_timestamp(reg))
+               mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
+                    "in our slot!\n", reg->hr_dev_name);
+
+       /* fill in the proper info for our next heartbeat */
+       o2hb_prepare_block(reg, reg->hr_generation);
+
+       /* And fire off the write. Note that we don't wait on this I/O
+        * until later. */
+       ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+       if (ret < 0) {
+               mlog_errno(ret);
+               return;
+       }
+
+       i = -1;
+       while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+
+               change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+       }
+
+       /*
+        * We have to be sure we've advertised ourselves on disk
+        * before we can go to steady state.  This ensures that
+        * people we find in our steady state have seen us.
+        */
+       o2hb_wait_on_io(reg, &write_wc);
+       bio_put(write_bio);
+       o2hb_arm_write_timeout(reg);
+
+       /* let the person who launched us know when things are steady */
+       if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
+               if (atomic_dec_and_test(&reg->hr_steady_iterations))
+                       wake_up(&o2hb_steady_queue);
+       }
+}
+
+/* Subtract b from a, storing the result in a.  If b is after a, the
+ * result is clamped to zero. */
+static void o2hb_tv_subtract(struct timeval *a,
+                            struct timeval *b)
+{
+       /* just return 0 when b is after a */
+       if (a->tv_sec < b->tv_sec ||
+           (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
+               a->tv_sec = 0;
+               a->tv_usec = 0;
+               return;
+       }
+
+       a->tv_sec -= b->tv_sec;
+       a->tv_usec -= b->tv_usec;
+       while (a->tv_usec < 0) {
+               a->tv_sec--;
+               a->tv_usec += 1000000;
+       }
+}
+
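+/* Wall-clock milliseconds from start to end; zero if end precedes
+ * start, thanks to the clamping in o2hb_tv_subtract(). */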
+static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+                                      struct timeval *end)
+{
+       struct timeval res = *end;
+
+       o2hb_tv_subtract(&res, start);
+
+       return res.tv_sec * 1000 + res.tv_usec / 1000;
+}
+
+/*
+ * We ride the region ref that the region dir holds.  Before the region
+ * dir is removed and drops its ref it will wait to tear down this
+ * thread.
+ */
+static int o2hb_thread(void *data)
+{
+       int i, ret;
+       struct o2hb_region *reg = data;
+       struct bio *write_bio;
+       struct o2hb_bio_wait_ctxt write_wc;
+       struct timeval before_hb, after_hb;
+       unsigned int elapsed_msec;
+
+       mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+
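+       /* Run at high priority - scheduling delay on a loaded box eats
+        * directly into the heartbeat interval. */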
+       set_user_nice(current, -20);
+
+       while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+               /* We track the time spent inside
+                * o2hb_do_disk_heartbeat so that we avoid more than
+                * hr_timeout_ms between disk writes. On busy systems
+                * this should result in a heartbeat which is less
+                * likely to time itself out. */
+               do_gettimeofday(&before_hb);
+
+               o2hb_do_disk_heartbeat(reg);
+
+               do_gettimeofday(&after_hb);
+               elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+
+               mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+                    before_hb.tv_sec, before_hb.tv_usec,
+                    after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+
+               if (elapsed_msec < reg->hr_timeout_ms) {
+                       /* the kthread api has blocked signals for us so no
+                        * need to record the return value. */
+                       msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
+               }
+       }
+
+       o2hb_disarm_write_timeout(reg);
+
+       /* an unclean stop is only used in very bad situations */
+       for (i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
+               o2hb_shutdown_slot(&reg->hr_slots[i]);
+
+       /* Explicit down notification - avoid forcing the other nodes
+        * to timeout on this region when we could just as easily
+        * write a clear generation - thus indicating to them that
+        * this node has left this region.
+        *
+        * XXX: Should we skip this on unclean_stop? */
+       o2hb_prepare_block(reg, 0);
+       ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+       if (ret == 0) {
+               o2hb_wait_on_io(reg, &write_wc);
+               bio_put(write_bio);
+       } else {
+               mlog_errno(ret);
+       }
+
+       mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+
+       return 0;
+}
+
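+/* Initialize the global callback lists, per-node live slot lists and
+ * the node event list before any heartbeat regions start up. */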
+void o2hb_init(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
+               INIT_LIST_HEAD(&o2hb_callbacks[i].list);
+
+       for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
+               INIT_LIST_HEAD(&o2hb_live_slots[i]);
+
+       INIT_LIST_HEAD(&o2hb_node_events);
+
+       memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+}
+
+/* if we're already in a callback then we're already serialized by the sem */
+static void o2hb_fill_node_map_from_callback(unsigned long *map,
+                                            unsigned bytes)
+{
+       BUG_ON(