fs/btrfs/inode.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/kernel.h>
  20 #include <linux/bio.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/file.h>
  23 #include <linux/fs.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/highmem.h>
  26 #include <linux/time.h>
  27 #include <linux/init.h>
  28 #include <linux/string.h>
  29 #include <linux/backing-dev.h>
  30 #include <linux/mpage.h>
  31 #include <linux/swap.h>
  32 #include <linux/writeback.h>
  33 #include <linux/statfs.h>
  34 #include <linux/compat.h>
  35 #include <linux/bit_spinlock.h>
  36 #include <linux/xattr.h>
  37 #include <linux/posix_acl.h>
  38 #include <linux/falloc.h>
  39 #include "compat.h"
  40 #include "ctree.h"
  41 #include "disk-io.h"
  42 #include "transaction.h"
  43 #include "btrfs_inode.h"
  44 #include "ioctl.h"
  45 #include "print-tree.h"
  46 #include "volumes.h"
  47 #include "ordered-data.h"
  48 #include "xattr.h"
  49 #include "tree-log.h"
  50 #include "compression.h"
  51 #include "locking.h"
  52
/*
 * Pairs an inode number with a root pointer.
 * NOTE(review): judging by the name this is presumably the lookup argument
 * for the iget helpers further down in this file -- confirm against them.
 */
struct btrfs_iget_args {
        u64 ino;                        /* inode number */
        struct btrfs_root *root;        /* root that goes with @ino */
};
  57
/*
 * Forward declarations of the operation tables used by this file.
 * NOTE(review): their definitions are not visible in this part of the
 * file; presumably they follow further down.
 */
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
  67
/*
 * Slab cache pointers.  btrfs_inode_cachep is private to this file; the
 * other three are defined here with external linkage.
 */
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
  72
/*
 * Lookup table indexed by the file-type bits of a mode shifted down by
 * S_SHIFT (i.e. S_IFxxx >> S_SHIFT); each listed slot holds the matching
 * BTRFS_FT_* constant.  Slots that are not listed are zero-initialized.
 */
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};
  83
/*
 * Prototypes for static functions that are referenced before they are
 * defined (cow_file_range(), for instance, is called from
 * submit_compressed_extents() ahead of its definition).
 */
static void btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
  90
  91 static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
  92 {
  93         int err;
  94
  95         err = btrfs_init_acl(inode, dir);
  96         if (!err)
  97                 err = btrfs_xattr_security_init(inode, dir);
  98         return err;
  99 }
 100
 101 /*
 102  * this does all the hard work for inserting an inline extent into
 103  * the btree.  The caller should have done a btrfs_drop_extents so that
 104  * no overlapping inline items exist in the btree
 105  */
 106 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 107                                 struct btrfs_root *root, struct inode *inode,
 108                                 u64 start, size_t size, size_t compressed_size,
 109                                 struct page **compressed_pages)
 110 {
 111         struct btrfs_key key;
 112         struct btrfs_path *path;
 113         struct extent_buffer *leaf;
 114         struct page *page = NULL;
 115         char *kaddr;
 116         unsigned long ptr;
 117         struct btrfs_file_extent_item *ei;
 118         int err = 0;
 119         int ret;
 120         size_t cur_size = size;
 121         size_t datasize;
 122         unsigned long offset;
 123         int use_compress = 0;
 124
 125         if (compressed_size && compressed_pages) {
 126                 use_compress = 1;
 127                 cur_size = compressed_size;
 128         }
 129
 130         path = btrfs_alloc_path();
 131         if (!path)
 132                 return -ENOMEM;
 133
 134         path->leave_spinning = 1;
 135         btrfs_set_trans_block_group(trans, inode);
 136
 137         key.objectid = inode->i_ino;
 138         key.offset = start;
 139         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
 140         datasize = btrfs_file_extent_calc_inline_size(cur_size);
 141
 142         inode_add_bytes(inode, size);
 143         ret = btrfs_insert_empty_item(trans, root, path, &key,
 144                                       datasize);
 145         BUG_ON(ret);
 146         if (ret) {
 147                 err = ret;
 148                 goto fail;
 149         }
 150         leaf = path->nodes[0];
 151         ei = btrfs_item_ptr(leaf, path->slots[0],
 152                             struct btrfs_file_extent_item);
 153         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 154         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 155         btrfs_set_file_extent_encryption(leaf, ei, 0);
 156         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 157         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 158         ptr = btrfs_file_extent_inline_start(ei);
 159
 160         if (use_compress) {
 161                 struct page *cpage;
 162                 int i = 0;
 163                 while (compressed_size > 0) {
 164                         cpage = compressed_pages[i];
 165                         cur_size = min_t(unsigned long, compressed_size,
 166                                        PAGE_CACHE_SIZE);
 167
 168                         kaddr = kmap_atomic(cpage, KM_USER0);
 169                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
 170                         kunmap_atomic(kaddr, KM_USER0);
 171
 172                         i++;
 173                         ptr += cur_size;
 174                         compressed_size -= cur_size;
 175                 }
 176                 btrfs_set_file_extent_compression(leaf, ei,
 177                                                   BTRFS_COMPRESS_ZLIB);
 178         } else {
 179                 page = find_get_page(inode->i_mapping,
 180                                      start >> PAGE_CACHE_SHIFT);
 181                 btrfs_set_file_extent_compression(leaf, ei, 0);
 182                 kaddr = kmap_atomic(page, KM_USER0);
 183                 offset = start & (PAGE_CACHE_SIZE - 1);
 184                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
 185                 kunmap_atomic(kaddr, KM_USER0);
 186                 page_cache_release(page);
 187         }
 188         btrfs_mark_buffer_dirty(leaf);
 189         btrfs_free_path(path);
 190
 191         /*
 192          * we're an inline extent, so nobody can
 193          * extend the file past i_size without locking
 194          * a page we already have locked.
 195          *
 196          * We must do any isize and inode updates
 197          * before we unlock the pages.  Otherwise we
 198          * could end up racing with unlink.
 199          */
 200         BTRFS_I(inode)->disk_i_size = inode->i_size;
 201         btrfs_update_inode(trans, root, inode);
 202
 203         return 0;
 204 fail:
 205         btrfs_free_path(path);
 206         return err;
 207 }
 208
 209
 210 /*
 211  * conditionally insert an inline extent into the file.  This
 212  * does the checks required to make sure the data is small enough
 213  * to fit as an inline extent.
 214  */
 215 static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 216                                  struct btrfs_root *root,
 217                                  struct inode *inode, u64 start, u64 end,
 218                                  size_t compressed_size,
 219                                  struct page **compressed_pages)
 220 {
 221         u64 isize = i_size_read(inode);
 222         u64 actual_end = min(end + 1, isize);
 223         u64 inline_len = actual_end - start;
 224         u64 aligned_end = (end + root->sectorsize - 1) &
 225                         ~((u64)root->sectorsize - 1);
 226         u64 hint_byte;
 227         u64 data_len = inline_len;
 228         int ret;
 229
 230         if (compressed_size)
 231                 data_len = compressed_size;
 232
 233         if (start > 0 ||
 234             actual_end >= PAGE_CACHE_SIZE ||
 235             data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 236             (!compressed_size &&
 237             (actual_end & (root->sectorsize - 1)) == 0) ||
 238             end + 1 < isize ||
 239             data_len > root->fs_info->max_inline) {
 240                 return 1;
 241         }
 242
 243         ret = btrfs_drop_extents(trans, inode, start, aligned_end,
 244                                  &hint_byte, 1);
 245         BUG_ON(ret);
 246
 247         if (isize > actual_end)
 248                 inline_len = min_t(u64, isize, actual_end);
 249         ret = insert_inline_extent(trans, root, inode, start,
 250                                    inline_len, compressed_size,
 251                                    compressed_pages);
 252         BUG_ON(ret);
 253         btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 254         return 0;
 255 }
 256
/*
 * One unit of work queued by add_async_extent() and consumed by
 * submit_compressed_extents().
 */
struct async_extent {
        u64 start;              /* file offset where the range begins */
        u64 ram_size;           /* length of the range in the file */
        u64 compressed_size;    /* bytes to reserve on disk; 0 when queued
                                 * without compressed pages */
        struct page **pages;    /* compressed pages, or NULL when the range
                                 * is to be written uncompressed */
        unsigned long nr_pages; /* number of entries in @pages */
        struct list_head list;  /* link in async_cow->extents */
};
 265
/*
 * Container for the async_extent entries produced for one piece of work.
 * NOTE(review): the code that fills in inode/root/start/end/work is not
 * visible here; the field notes below only cover what this part of the
 * file shows.
 */
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;       /* handed to cow_file_range() by
                                         * submit_compressed_extents() */
        u64 start;
        u64 end;
        struct list_head extents;       /* list of struct async_extent */
        struct btrfs_work work;
};
 275
 276 static noinline int add_async_extent(struct async_cow *cow,
 277                                      u64 start, u64 ram_size,
 278                                      u64 compressed_size,
 279                                      struct page **pages,
 280                                      unsigned long nr_pages)
 281 {
 282         struct async_extent *async_extent;
 283
 284         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 285         async_extent->start = start;
 286         async_extent->ram_size = ram_size;
 287         async_extent->compressed_size = compressed_size;
 288         async_extent->pages = pages;
 289         async_extent->nr_pages = nr_pages;
 290         list_add_tail(&async_extent->list, &cow->extents);
 291         return 0;
 292 }
 293
 294 /*
 295  * we create compressed extents in two phases.  The first
 296  * phase compresses a range of pages that have already been
 297  * locked (both pages and state bits are locked).
 298  *
 299  * This is done inside an ordered work queue, and the compression
 300  * is spread across many cpus.  The actual IO submission is step
 301  * two, and the ordered work queue takes care of making sure that
 302  * happens in the same order things were put onto the queue by
 303  * writepages and friends.
 304  *
 305  * If this code finds it can't get good compression, it puts an
 306  * entry onto the work queue to write the uncompressed bytes.  This
 307  * makes sure that both compressed inodes and uncompressed inodes
 308  * are written in the same order that pdflush sent them down.
 309  */
 310 static noinline int compress_file_range(struct inode *inode,
 311                                         struct page *locked_page,
 312                                         u64 start, u64 end,
 313                                         struct async_cow *async_cow,
 314                                         int *num_added)
 315 {
 316         struct btrfs_root *root = BTRFS_I(inode)->root;
 317         struct btrfs_trans_handle *trans;
 318         u64 num_bytes;
 319         u64 orig_start;
 320         u64 disk_num_bytes;
 321         u64 blocksize = root->sectorsize;
 322         u64 actual_end;
 323         u64 isize = i_size_read(inode);
 324         int ret = 0;
 325         struct page **pages = NULL;
 326         unsigned long nr_pages;
 327         unsigned long nr_pages_ret = 0;
 328         unsigned long total_compressed = 0;
 329         unsigned long total_in = 0;
 330         unsigned long max_compressed = 128 * 1024;
 331         unsigned long max_uncompressed = 128 * 1024;
 332         int i;
 333         int will_compress;
 334
 335         orig_start = start;
 336
 337         actual_end = min_t(u64, isize, end + 1);
 338 again:
 339         will_compress = 0;
 340         nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 341         nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 342
 343         /*
 344          * we don't want to send crud past the end of i_size through
 345          * compression, that's just a waste of CPU time.  So, if the
 346          * end of the file is before the start of our current
 347          * requested range of bytes, we bail out to the uncompressed
 348          * cleanup code that can deal with all of this.
 349          *
 350          * It isn't really the fastest way to fix things, but this is a
 351          * very uncommon corner.
 352          */
 353         if (actual_end <= start)
 354                 goto cleanup_and_bail_uncompressed;
 355
 356         total_compressed = actual_end - start;
 357
 358         /* we want to make sure that amount of ram required to uncompress
 359          * an extent is reasonable, so we limit the total size in ram
 360          * of a compressed extent to 128k.  This is a crucial number
 361          * because it also controls how easily we can spread reads across
 362          * cpus for decompression.
 363          *
 364          * We also want to make sure the amount of IO required to do
 365          * a random read is reasonably small, so we limit the size of
 366          * a compressed extent to 128k.
 367          */
 368         total_compressed = min(total_compressed, max_uncompressed);
 369         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 370         num_bytes = max(blocksize,  num_bytes);
 371         disk_num_bytes = num_bytes;
 372         total_in = 0;
 373         ret = 0;
 374
 375         /*
 376          * we do compression for mount -o compress and when the
 377          * inode has not been flagged as nocompress.  This flag can
 378          * change at any time if we discover bad compression ratios.
 379          */
 380         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
 381             btrfs_test_opt(root, COMPRESS)) {
 382                 WARN_ON(pages);
 383                 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 384
 385                 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
 386                                                 total_compressed, pages,
 387                                                 nr_pages, &nr_pages_ret,
 388                                                 &total_in,
 389                                                 &total_compressed,
 390                                                 max_compressed);
 391
 392                 if (!ret) {
 393                         unsigned long offset = total_compressed &
 394                                 (PAGE_CACHE_SIZE - 1);
 395                         struct page *page = pages[nr_pages_ret - 1];
 396                         char *kaddr;
 397
 398                         /* zero the tail end of the last page, we might be
 399                          * sending it down to disk
 400                          */
 401                         if (offset) {
 402                                 kaddr = kmap_atomic(page, KM_USER0);
 403                                 memset(kaddr + offset, 0,
 404                                        PAGE_CACHE_SIZE - offset);
 405                                 kunmap_atomic(kaddr, KM_USER0);
 406                         }
 407                         will_compress = 1;
 408                 }
 409         }
 410         if (start == 0) {
 411                 trans = btrfs_join_transaction(root, 1);
 412                 BUG_ON(!trans);
 413                 btrfs_set_trans_block_group(trans, inode);
 414
 415                 /* lets try to make an inline extent */
 416                 if (ret || total_in < (actual_end - start)) {
 417                         /* we didn't compress the entire range, try
 418                          * to make an uncompressed inline extent.
 419                          */
 420                         ret = cow_file_range_inline(trans, root, inode,
 421                                                     start, end, 0, NULL);
 422                 } else {
 423                         /* try making a compressed inline extent */
 424                         ret = cow_file_range_inline(trans, root, inode,
 425                                                     start, end,
 426                                                     total_compressed, pages);
 427                 }
 428                 if (ret == 0) {
 429                         /*
 430                          * inline extent creation worked, we don't need
 431                          * to create any more async work items.  Unlock
 432                          * and free up our temp pages.
 433                          */
 434                         extent_clear_unlock_delalloc(inode,
 435                              &BTRFS_I(inode)->io_tree,
 436                              start, end, NULL,
 437                              EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 438                              EXTENT_CLEAR_DELALLOC |
 439                              EXTENT_CLEAR_ACCOUNTING |
 440                              EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 441
 442                         btrfs_end_transaction(trans, root);
 443                         goto free_pages_out;
 444                 }
 445                 btrfs_end_transaction(trans, root);
 446         }
 447
 448         if (will_compress) {
 449                 /*
 450                  * we aren't doing an inline extent round the compressed size
 451                  * up to a block size boundary so the allocator does sane
 452                  * things
 453                  */
 454                 total_compressed = (total_compressed + blocksize - 1) &
 455                         ~(blocksize - 1);
 456
 457                 /*
 458                  * one last check to make sure the compression is really a
 459                  * win, compare the page count read with the blocks on disk
 460                  */
 461                 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
 462                         ~(PAGE_CACHE_SIZE - 1);
 463                 if (total_compressed >= total_in) {
 464                         will_compress = 0;
 465                 } else {
 466                         disk_num_bytes = total_compressed;
 467                         num_bytes = total_in;
 468                 }
 469         }
 470         if (!will_compress && pages) {
 471                 /*
 472                  * the compression code ran but failed to make things smaller,
 473                  * free any pages it allocated and our page pointer array
 474                  */
 475                 for (i = 0; i < nr_pages_ret; i++) {
 476                         WARN_ON(pages[i]->mapping);
 477                         page_cache_release(pages[i]);
 478                 }
 479                 kfree(pages);
 480                 pages = NULL;
 481                 total_compressed = 0;
 482                 nr_pages_ret = 0;
 483
 484                 /* flag the file so we don't compress in the future */
 485                 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 486         }
 487         if (will_compress) {
 488                 *num_added += 1;
 489
 490                 /* the async work queues will take care of doing actual
 491                  * allocation on disk for these compressed pages,
 492                  * and will submit them to the elevator.
 493                  */
 494                 add_async_extent(async_cow, start, num_bytes,
 495                                  total_compressed, pages, nr_pages_ret);
 496
 497                 if (start + num_bytes < end && start + num_bytes < actual_end) {
 498                         start += num_bytes;
 499                         pages = NULL;
 500                         cond_resched();
 501                         goto again;
 502                 }
 503         } else {
 504 cleanup_and_bail_uncompressed:
 505                 /*
 506                  * No compression, but we still need to write the pages in
 507                  * the file we've been given so far.  redirty the locked
 508                  * page if it corresponds to our extent and set things up
 509                  * for the async work queue to run cow_file_range to do
 510                  * the normal delalloc dance
 511                  */
 512                 if (page_offset(locked_page) >= start &&
 513                     page_offset(locked_page) <= end) {
 514                         __set_page_dirty_nobuffers(locked_page);
 515                         /* unlocked later on in the async handlers */
 516                 }
 517                 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
 518                 *num_added += 1;
 519         }
 520
 521 out:
 522         return 0;
 523
 524 free_pages_out:
 525         for (i = 0; i < nr_pages_ret; i++) {
 526                 WARN_ON(pages[i]->mapping);
 527                 page_cache_release(pages[i]);
 528         }
 529         kfree(pages);
 530
 531         goto out;
 532 }
 533
 534 /*
 535  * phase two of compressed writeback.  This is the ordered portion
 536  * of the code, which only gets called in the order the work was
 537  * queued.  We walk all the async extents created by compress_file_range
 538  * and send them down to the disk.
 539  */
 540 static noinline int submit_compressed_extents(struct inode *inode,
 541                                               struct async_cow *async_cow)
 542 {
 543         struct async_extent *async_extent;
 544         u64 alloc_hint = 0;
 545         struct btrfs_trans_handle *trans;
 546         struct btrfs_key ins;
 547         struct extent_map *em;
 548         struct btrfs_root *root = BTRFS_I(inode)->root;
 549         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 550         struct extent_io_tree *io_tree;
 551         int ret = 0;
 552
 553         if (list_empty(&async_cow->extents))
 554                 return 0;
 555
 556
 557         while (!list_empty(&async_cow->extents)) {
 558                 async_extent = list_entry(async_cow->extents.next,
 559                                           struct async_extent, list);
 560                 list_del(&async_extent->list);
 561
 562                 io_tree = &BTRFS_I(inode)->io_tree;
 563
 564 retry:
 565                 /* did the compression code fall back to uncompressed IO? */
 566                 if (!async_extent->pages) {
 567                         int page_started = 0;
 568                         unsigned long nr_written = 0;
 569
 570                         lock_extent(io_tree, async_extent->start,
 571                                     async_extent->start +
 572                                     async_extent->ram_size - 1, GFP_NOFS);
 573
 574                         /* allocate blocks */
 575                         ret = cow_file_range(inode, async_cow->locked_page,
 576                                              async_extent->start,
 577                                              async_extent->start +
 578                                              async_extent->ram_size - 1,
 579                                              &page_started, &nr_written, 0);
 580
 581                         /*
 582                          * if page_started, cow_file_range inserted an
 583                          * inline extent and took care of all the unlocking
 584                          * and IO for us.  Otherwise, we need to submit
 585                          * all those pages down to the drive.
 586                          */
 587                         if (!page_started && !ret)
 588                                 extent_write_locked_range(io_tree,
 589                                                   inode, async_extent->start,
 590                                                   async_extent->start +
 591                                                   async_extent->ram_size - 1,
 592                                                   btrfs_get_extent,
 593                                                   WB_SYNC_ALL);
 594                         kfree(async_extent);
 595                         cond_resched();
 596                         continue;
 597                 }
 598
 599                 lock_extent(io_tree, async_extent->start,
 600                             async_extent->start + async_extent->ram_size - 1,
 601                             GFP_NOFS);
 602
 603                 trans = btrfs_join_transaction(root, 1);
 604                 ret = btrfs_reserve_extent(trans, root,
 605                                            async_extent->compressed_size,
 606                                            async_extent->compressed_size,
 607                                            0, alloc_hint,
 608                                            (u64)-1, &ins, 1);
 609                 btrfs_end_transaction(trans, root);
 610
 611                 if (ret) {
 612                         int i;
 613                         for (i = 0; i < async_extent->nr_pages; i++) {
 614                                 WARN_ON(async_extent->pages[i]->mapping);
 615                                 page_cache_release(async_extent->pages[i]);
 616                         }
 617                         kfree(async_extent->pages);
 618                         async_extent->nr_pages = 0;
 619                         async_extent->pages = NULL;
 620                         unlock_extent(io_tree, async_extent->start,
 621                                       async_extent->start +
 622                                       async_extent->ram_size - 1, GFP_NOFS);
 623                         goto retry;
 624                 }
 625
 626                 /*
 627                  * here we're doing allocation and writeback of the
 628                  * compressed pages
 629                  */
 630                 btrfs_drop_extent_cache(inode, async_extent->start,
 631                                         async_extent->start +
 632                                         async_extent->ram_size - 1, 0);
 633
 634                 em = alloc_extent_map(GFP_NOFS);
 635                 em->start = async_extent->start;
 636                 em->len = async_extent->ram_size;
 637                 em->orig_start = em->start;
 638
 639                 em->block_start = ins.objectid;
 640                 em->block_len = ins.offset;
 641                 em->bdev = root->fs_info->fs_devices->latest_bdev;
 642                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
 643                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 644
 645                 while (1) {
 646                         write_lock(&em_tree->lock);
 647                         ret = add_extent_mapping(em_tree, em);
 648                         write_unlock(&em_tree->lock);
 649                         if (ret != -EEXIST) {
 650                                 free_extent_map(em);
 651                                 break;
 652                         }
 653                         btrfs_drop_extent_cache(inode, async_extent->start,
 654                                                 async_extent->start +
 655                                                 async_extent->ram_size - 1, 0);
 656                 }
 657
 658                 ret = btrfs_add_ordered_extent(inode, async_extent->start,
 659                                                ins.objectid,
 660                                                async_extent->ram_size,
 661                                                ins.offset,
 662                                                BTRFS_ORDERED_COMPRESSED);
 663                 BUG_ON(ret);
 664
 665                 /*
 666                  * clear dirty, set writeback and unlock the pages.
 667                  */
 668                 extent_clear_unlock_delalloc(inode,
 669                                 &BTRFS_I(inode)->io_tree,
 670                                 async_extent->start,
 671                                 async_extent->start +
 672                                 async_extent->ram_size - 1,
 673                                 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
 674                                 EXTENT_CLEAR_UNLOCK |
 675                                 EXTENT_CLEAR_DELALLOC |
 676                                 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
 677
 678                 ret = btrfs_submit_compressed_write(inode,
 679                                     async_extent->start,
 680                                     async_extent->ram_size,
 681                                     ins.objectid,
 682                                     ins.offset, async_extent->pages,
 683                                     async_extent->nr_pages);
 684
 685                 BUG_ON(ret);
 686                 alloc_hint = ins.objectid + ins.offset;
 687                 kfree(async_extent);
 688                 cond_resched();
 689         }
 690
 691         return 0;
 692 }
 693
 694 /*
 695  * when extent_io.c finds a delayed allocation range in the file,
 696  * the call backs end up in this code.  The basic idea is to
 697  * allocate extents on disk for the range, and create ordered data structs
 698  * in ram to track those extents.
 699  *
 700  * locked_page is the page that writepage had locked already.  We use
 701  * it to make sure we don't do extra locks or unlocks.
 702  *
 703  * *page_started is set to one if we unlock locked_page and do everything
 704  * required to start IO on it.  It may be clean and already done with
 705  * IO when we return.
 706  */
 707 static noinline int cow_file_range(struct inode *inode,
 708                                    struct page *locked_page,
 709                                    u64 start, u64 end, int *page_started,
 710                                    unsigned long *nr_written,
 711                                    int unlock)
 712 {
 713         struct btrfs_root *root = BTRFS_I(inode)->root;
 714         struct btrfs_trans_handle *trans;
 715         u64 alloc_hint = 0;
 716         u64 num_bytes;
 717         unsigned long ram_size;
 718         u64 disk_num_bytes;
 719         u64 cur_alloc_size;
 720         u64 blocksize = root->sectorsize;
 721         u64 actual_end;
 722         u64 isize = i_size_read(inode);
 723         struct btrfs_key ins;
 724         struct extent_map *em;
 725         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 726         int ret = 0;
 727
 728         trans = btrfs_join_transaction(root, 1);
 729         BUG_ON(!trans);
 730         btrfs_set_trans_block_group(trans, inode);
 731
 732         actual_end = min_t(u64, isize, end + 1);
 733
 734         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 735         num_bytes = max(blocksize,  num_bytes);
 736         disk_num_bytes = num_bytes;
 737         ret = 0;
 738
 739         if (start == 0) {
 740                 /* lets try to make an inline extent */
 741                 ret = cow_file_range_inline(trans, root, inode,
 742                                             start, end, 0, NULL);
 743                 if (ret == 0) {
 744                         extent_clear_unlock_delalloc(inode,
 745                                      &BTRFS_I(inode)->io_tree,
 746                                      start, end, NULL,
 747                                      EXTENT_CLEAR_UNLOCK_PAGE |
 748                                      EXTENT_CLEAR_UNLOCK |
 749                                      EXTENT_CLEAR_DELALLOC |
 750                                      EXTENT_CLEAR_ACCOUNTING |
 751                                      EXTENT_CLEAR_DIRTY |
 752                                      EXTENT_SET_WRITEBACK |
 753                                      EXTENT_END_WRITEBACK);
 754
 755                         *nr_written = *nr_written +
 756                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 757                         *page_started = 1;
 758                         ret = 0;
 759                         goto out;
 760                 }
 761         }
 762
 763         BUG_ON(disk_num_bytes >
 764                btrfs_super_total_bytes(&root->fs_info->super_copy));
 765
 766
 767         read_lock(&BTRFS_I(inode)->extent_tree.lock);
 768         em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
 769                                    start, num_bytes);
 770         if (em) {
 771                 /*
 772                  * if block start isn't an actual block number then find the
 773                  * first block in this inode and use that as a hint.  If that
 774                  * block is also bogus then just don't worry about it.
 775                  */
 776                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 777                         free_extent_map(em);
 778                         em = search_extent_mapping(em_tree, 0, 0);
 779                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
 780                                 alloc_hint = em->block_start;
 781                         if (em)
 782                                 free_extent_map(em);
 783                 } else {
 784                         alloc_hint = em->block_start;
 785                         free_extent_map(em);
 786                 }
 787         }
 788         read_unlock(&BTRFS_I(inode)->extent_tree.lock);
 789         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 790
 791         while (disk_num_bytes > 0) {
 792                 unsigned long op;
 793
 794                 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 795                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 796                                            root->sectorsize, 0, alloc_hint,
 797                                            (u64)-1, &ins, 1);
 798                 BUG_ON(ret);
 799
 800                 em = alloc_extent_map(GFP_NOFS);
 801                 em->start = start;
 802                 em->orig_start = em->start;
 803                 ram_size = ins.offset;
 804                 em->len = ins.offset;
 805
 806                 em->block_start = ins.objectid;
 807                 em->block_len = ins.offset;
 808                 em->bdev = root->fs_info->fs_devices->latest_bdev;
 809                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
 810
 811                 while (1) {
 812                         write_lock(&em_tree->lock);
 813                         ret = add_extent_mapping(em_tree, em);
 814                         write_unlock(&em_tree->lock);
 815                         if (ret != -EEXIST) {
 816                                 free_extent_map(em);
 817                                 break;
 818                         }
 819                         btrfs_drop_extent_cache(inode, start,
 820                                                 start + ram_size - 1, 0);
 821                 }
 822
 823                 cur_alloc_size = ins.offset;
 824                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 825                                                ram_size, cur_alloc_size, 0);
 826                 BUG_ON(ret);
 827
 828                 if (root->root_key.objectid ==
 829                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
 830                         ret = btrfs_reloc_clone_csums(inode, start,
 831                                                       cur_alloc_size);
 832                         BUG_ON(ret);
 833                 }
 834
 835                 if (disk_num_bytes < cur_alloc_size)
 836                         break;
 837
 838                 /* we're not doing compressed IO, don't unlock the first
 839                  * page (which the caller expects to stay locked), don't
 840                  * clear any dirty bits and don't set any writeback bits
 841                  *
 842                  * Do set the Private2 bit so we know this page was properly
 843                  * setup for writepage
 844                  */
 845                 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
 846                 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
 847                         EXTENT_SET_PRIVATE2;
 848
 849                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 850                                              start, start + ram_size - 1,
 851                                              locked_page, op);
 852                 disk_num_bytes -= cur_alloc_size;
 853                 num_bytes -= cur_alloc_size;
 854                 alloc_hint = ins.objectid + ins.offset;
 855                 start += cur_alloc_size;
 856         }
 857 out:
 858         ret = 0;
 859         btrfs_end_transaction(trans, root);
 860
 861         return ret;
 862 }
 863
 864 /*
 865  * work queue call back to started compression on a file and pages
 866  */
 867 static noinline void async_cow_start(struct btrfs_work *work)
 868 {
 869         struct async_cow *async_cow;
 870         int num_added = 0;
 871         async_cow = container_of(work, struct async_cow, work);
 872
 873         compress_file_range(async_cow->inode, async_cow->locked_page,
 874                             async_cow->start, async_cow->end, async_cow,
 875                             &num_added);
 876         if (num_added == 0)
 877                 async_cow->inode = NULL;
 878 }
 879
 880 /*
 881  * work queue call back to submit previously compressed pages
 882  */
 883 static noinline void async_cow_submit(struct btrfs_work *work)
 884 {
 885         struct async_cow *async_cow;
 886         struct btrfs_root *root;
 887         unsigned long nr_pages;
 888
 889         async_cow = container_of(work, struct async_cow, work);
 890
 891         root = async_cow->root;
 892         nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
 893                 PAGE_CACHE_SHIFT;
 894
 895         atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
 896
 897         if (atomic_read(&root->fs_info->async_delalloc_pages) <
 898             5 * 1042 * 1024 &&
 899             waitqueue_active(&root->fs_info->async_submit_wait))
 900                 wake_up(&root->fs_info->async_submit_wait);
 901
 902         if (async_cow->inode)
 903                 submit_compressed_extents(async_cow->inode, async_cow);
 904 }
 905
 906 static noinline void async_cow_free(struct btrfs_work *work)
 907 {
 908         struct async_cow *async_cow;
 909         async_cow = container_of(work, struct async_cow, work);
 910         kfree(async_cow);
 911 }
 912
 913 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 914                                 u64 start, u64 end, int *page_started,
 915                                 unsigned long *nr_written)
 916 {
 917         struct async_cow *async_cow;
 918         struct btrfs_root *root = BTRFS_I(inode)->root;
 919         unsigned long nr_pages;
 920         u64 cur_end;
 921         int limit = 10 * 1024 * 1042;
 922
 923         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
 924                          1, 0, NULL, GFP_NOFS);
 925         while (start < end) {
 926                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 927                 async_cow->inode = inode;
 928                 async_cow->root = root;
 929                 async_cow->locked_page = locked_page;
 930                 async_cow->start = start;
 931
 932                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
 933                         cur_end = end;
 934                 else
 935                         cur_end = min(end, start + 512 * 1024 - 1);
 936
 937                 async_cow->end = cur_end;
 938                 INIT_LIST_HEAD(&async_cow->extents);
 939
 940                 async_cow->work.func = async_cow_start;
 941                 async_cow->work.ordered_func = async_cow_submit;
 942                 async_cow->work.ordered_free = async_cow_free;
 943                 async_cow->work.flags = 0;
 944
 945                 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
 946                         PAGE_CACHE_SHIFT;
 947                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
 948
 949                 btrfs_queue_worker(&root->fs_info->delalloc_workers,
 950                                    &async_cow->work);
 951
 952                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
 953                         wait_event(root->fs_info->async_submit_wait,
 954                            (atomic_read(&root->fs_info->async_delalloc_pages) <
 955                             limit));
 956                 }
 957
 958                 while (atomic_read(&root->fs_info->async_submit_draining) &&
 959                       atomic_read(&root->fs_info->async_delalloc_pages)) {
 960                         wait_event(root->fs_info->async_submit_wait,
 961                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
 962                            0));
 963                 }
 964
 965                 *nr_written += nr_pages;
 966                 start = cur_end + 1;
 967         }
 968         *page_started = 1;
 969         return 0;
 970 }
 971
 972 static noinline int csum_exist_in_range(struct btrfs_root *root,
 973                                         u64 bytenr, u64 num_bytes)
 974 {
 975         int ret;
 976         struct btrfs_ordered_sum *sums;
 977         LIST_HEAD(list);
 978
 979         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
 980                                        bytenr + num_bytes - 1, &list);
 981         if (ret == 0 && list_empty(&list))
 982                 return 0;
 983
 984         while (!list_empty(&list)) {
 985                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 986                 list_del(&sums->list);
 987                 kfree(sums);
 988         }
 989         return 1;
 990 }
 991
 992 /*
 993  * when nowcow writeback call back.  This checks for snapshots or COW copies
 994  * of the extents that exist in the file, and COWs the file as required.
 995  *
 996  * If no cow copies or snapshots exist, we write directly to the existing
 997  * blocks on disk
 998  */
 999 static noinline int run_delalloc_nocow(struct inode *inode,
1000                                        struct page *locked_page,
1001                               u64 start, u64 end, int *page_started, int force,
1002                               unsigned long *nr_written)
1003 {
1004         struct btrfs_root *root = BTRFS_I(inode)->root;
1005         struct btrfs_trans_handle *trans;
1006         struct extent_buffer *leaf;
1007         struct btrfs_path *path;
1008         struct btrfs_file_extent_item *fi;
1009         struct btrfs_key found_key;
1010         u64 cow_start;
1011         u64 cur_offset;
1012         u64 extent_end;
1013         u64 extent_offset;
1014         u64 disk_bytenr;
1015         u64 num_bytes;
1016         int extent_type;
1017         int ret;
1018         int type;
1019         int nocow;
1020         int check_prev = 1;
1021
1022         path = btrfs_alloc_path();
1023         BUG_ON(!path);
1024         trans = btrfs_join_transaction(root, 1);
1025         BUG_ON(!trans);
1026
1027         cow_start = (u64)-1;
1028         cur_offset = start;
1029         while (1) {
1030                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
1031                                                cur_offset, 0);
1032                 BUG_ON(ret < 0);
1033                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1034                         leaf = path->nodes[0];
1035                         btrfs_item_key_to_cpu(leaf, &found_key,
1036                                               path->slots[0] - 1);
1037                         if (found_key.objectid == inode->i_ino &&
1038                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1039                                 path->slots[0]--;
1040                 }
1041                 check_prev = 0;
1042 next_slot:
1043                 leaf = path->nodes[0];
1044                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1045                         ret = btrfs_next_leaf(root, path);
1046                         if (ret < 0)
1047                                 BUG_ON(1);
1048                         if (ret > 0)
1049                                 break;
1050                         leaf = path->nodes[0];
1051                 }
1052
1053                 nocow = 0;
1054                 disk_bytenr = 0;
1055                 num_bytes = 0;
1056                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1057
1058                 if (found_key.objectid > inode->i_ino ||
1059                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1060                     found_key.offset > end)
1061                         break;
1062
1063                 if (found_key.offset > cur_offset) {
1064                         extent_end = found_key.offset;
1065                         extent_type = 0;
1066                         goto out_check;
1067                 }
1068
1069                 fi = btrfs_item_ptr(leaf, path->slots[0],
1070                                     struct btrfs_file_extent_item);
1071                 extent_type = btrfs_file_extent_type(leaf, fi);
1072
1073                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1074                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1075                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1076                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1077                         extent_end = found_key.offset +
1078                                 btrfs_file_extent_num_bytes(leaf, fi);
1079                         if (extent_end <= start) {
1080                                 path->slots[0]++;
1081                                 goto next_slot;
1082                         }
1083                         if (disk_bytenr == 0)
1084                                 goto out_check;
1085                         if (btrfs_file_extent_compression(leaf, fi) ||
1086                             btrfs_file_extent_encryption(leaf, fi) ||
1087                             btrfs_file_extent_other_encoding(leaf, fi))
1088                                 goto out_check;
1089                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1090                                 goto out_check;
1091                         if (btrfs_extent_readonly(root, disk_bytenr))
1092                                 goto out_check;
1093                         if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1094                                                   found_key.offset -
1095                                                   extent_offset, disk_bytenr))
1096                                 goto out_check;
1097                         disk_bytenr += extent_offset;
1098                         disk_bytenr += cur_offset - found_key.offset;
1099                         num_bytes = min(end + 1, extent_end) - cur_offset;
1100                         /*
1101                          * force cow if csum exists in the range.
1102                          * this ensure that csum for a given extent are
1103                          * either valid or do not exist.
1104                          */
1105                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1106                                 goto out_check;
1107                         nocow = 1;
1108                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1109                         extent_end = found_key.offset +
1110                                 btrfs_file_extent_inline_len(leaf, fi);
1111                         extent_end = ALIGN(extent_end, root->sectorsize);
1112                 } else {
1113                         BUG_ON(1);
1114                 }
1115 out_check:
1116                 if (extent_end <= start) {
1117                         path->slots[0]++;
1118                         goto next_slot;
1119                 }
1120                 if (!nocow) {
1121                         if (cow_start == (u64)-1)
1122                                 cow_start = cur_offset;
1123                         cur_offset = extent_end;
1124                         if (cur_offset > end)
1125                                 break;
1126                         path->slots[0]++;
1127                         goto next_slot;
1128                 }
1129
1130                 btrfs_release_path(root, path);
1131                 if (cow_start != (u64)-1) {
1132                         ret = cow_file_range(inode, locked_page, cow_start,
1133                                         found_key.offset - 1, page_started,
1134                                         nr_written, 1);
1135                         BUG_ON(ret);
1136                         cow_start = (u64)-1;
1137                 }
1138
1139                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1140                         struct extent_map *em;
1141                         struct extent_map_tree *em_tree;
1142                         em_tree = &BTRFS_I(inode)->extent_tree;
1143                         em = alloc_extent_map(GFP_NOFS);
1144                         em->start = cur_offset;
1145                         em->orig_start = em->start;
1146                         em->len = num_bytes;
1147                         em->block_len = num_bytes;
1148                         em->block_start = disk_bytenr;
1149                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1150                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1151                         while (1) {
1152                                 write_lock(&em_tree->lock);
1153                                 ret = add_extent_mapping(em_tree, em);
1154                                 write_unlock(&em_tree->lock);
1155                                 if (ret != -EEXIST) {
1156                                         free_extent_map(em);
1157                                         break;
1158                                 }
1159                                 btrfs_drop_extent_cache(inode, em->start,
1160                                                 em->start + em->len - 1, 0);
1161                         }
1162                         type = BTRFS_ORDERED_PREALLOC;
1163                 } else {
1164                         type = BTRFS_ORDERED_NOCOW;
1165                 }
1166
1167                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1168                                                num_bytes, num_bytes, type);
1169                 BUG_ON(ret);
1170
1171                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1172                                 cur_offset, cur_offset + num_bytes - 1,
1173                                 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1174                                 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1175                                 EXTENT_SET_PRIVATE2);
1176                 cur_offset = extent_end;
1177                 if (cur_offset > end)
1178                         break;
1179         }
1180         btrfs_release_path(root, path);
1181
1182         if (cur_offset <= end && cow_start == (u64)-1)
1183                 cow_start = cur_offset;
1184         if (cow_start != (u64)-1) {
1185                 ret = cow_file_range(inode, locked_page, cow_start, end,
1186                                      page_started, nr_written, 1);
1187                 BUG_ON(ret);
1188         }
1189
1190         ret = btrfs_end_transaction(trans, root);
1191         BUG_ON(ret);
1192         btrfs_free_path(path);
1193         return 0;
1194 }
1195
1196 /*
1197  * extent_io.c call back to do delayed allocation processing
1198  */
1199 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1200                               u64 start, u64 end, int *page_started,
1201                               unsigned long *nr_written)
1202 {
1203         int ret;
1204         struct btrfs_root *root = BTRFS_I(inode)->root;
1205
1206         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
1207                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1208                                          page_started, 1, nr_written);
1209         else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1210                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1211                                          page_started, 0, nr_written);
1212         else if (!btrfs_test_opt(root, COMPRESS))
1213                 ret = cow_file_range(inode, locked_page, start, end,
1214                                       page_started, nr_written, 1);
1215         else
1216                 ret = cow_file_range_async(inode, locked_page, start, end,
1217                                            page_started, nr_written);
1218         return ret;
1219 }
1220
1221 static int btrfs_split_extent_hook(struct inode *inode,
1222                                     struct extent_state *orig, u64 split)
1223 {
1224         struct btrfs_root *root = BTRFS_I(inode)->root;
1225         u64 size;
1226
1227         if (!(orig->state & EXTENT_DELALLOC))
1228                 return 0;
1229
1230         size = orig->end - orig->start + 1;
1231         if (size > root->fs_info->max_extent) {
1232                 u64 num_extents;
1233                 u64 new_size;
1234
1235                 new_size = orig->end - split + 1;
1236                 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1237                                         root->fs_info->max_extent);
1238
1239                 /*
1240                  * if we break a large extent up then leave oustanding_extents
1241                  * be, since we've already accounted for the large extent.
1242                  */
1243                 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1244                               root->fs_info->max_extent) < num_extents)
1245                         return 0;
1246         }
1247
1248         spin_lock(&BTRFS_I(inode)->accounting_lock);
1249         BTRFS_I(inode)->outstanding_extents++;
1250         spin_unlock(&BTRFS_I(inode)->accounting_lock);
1251
1252         return 0;
1253 }
1254
1255 /*
1256  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1257  * extents so we can keep track of new extents that are just merged onto old
1258  * extents, such as when we are doing sequential writes, so we can properly
1259  * account for the metadata space we'll need.
1260  */
1261 static int btrfs_merge_extent_hook(struct inode *inode,
1262                                    struct extent_state *new,
1263                                    struct extent_state *other)
1264 {
1265         struct btrfs_root *root = BTRFS_I(inode)->root;
1266         u64 new_size, old_size;
1267         u64 num_extents;
1268
1269         /* not delalloc, ignore it */
1270         if (!(other->state & EXTENT_DELALLOC))
1271                 return 0;
1272
1273         old_size = other->end - other->start + 1;
1274         if (new->start < other->start)
1275                 new_size = other->end - new->start + 1;
1276         else
1277                 new_size = new->end - other->start + 1;
1278
1279         /* we're not bigger than the max, unreserve the space and go */
1280         if (new_size <= root->fs_info->max_extent) {
1281                 spin_lock(&BTRFS_I(inode)->accounting_lock);
1282                 BTRFS_I(inode)->outstanding_extents--;
1283                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1284                 return 0;
1285         }
1286
1287         /*
1288          * If we grew by another max_extent, just return, we want to keep that
1289          * reserved amount.
1290          */
1291         num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1292                                 root->fs_info->max_extent);
1293         if (div64_u64(new_size + root->fs_info->max_extent - 1,
1294                       root->fs_info->max_extent) > num_extents)
1295                 return 0;
1296
1297         spin_lock(&BTRFS_I(inode)->accounting_lock);
1298         BTRFS_I(inode)->outstanding_extents--;
1299         spin_unlock(&BTRFS_I(inode)->accounting_lock);
1300
1301         return 0;
1302 }
1303
1304 /*
1305  * extent_io.c set_bit_hook, used to track delayed allocation
1306  * bytes in this file, and to maintain the list of inodes that
1307  * have pending delalloc work to be done.
1308  */
1309 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1310                        unsigned long old, unsigned long bits)
1311 {
1312
1313         /*
1314          * set_bit and clear bit hooks normally require _irqsave/restore
1315          * but in this case, we are only testeing for the DELALLOC
1316          * bit, which is only set or cleared with irqs on
1317          */
1318         if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1319                 struct btrfs_root *root = BTRFS_I(inode)->root;
1320
1321                 spin_lock(&BTRFS_I(inode)->accounting_lock);
1322                 BTRFS_I(inode)->outstanding_extents++;
1323                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1324                 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1325                 spin_lock(&root->fs_info->delalloc_lock);
1326                 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1327                 root->fs_info->delalloc_bytes += end - start + 1;
1328                 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1329                         list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1330                                       &root->fs_info->delalloc_inodes);
1331                 }
1332                 spin_unlock(&root->fs_info->delalloc_lock);
1333         }
1334         return 0;
1335 }
1336
1337 /*
1338  * extent_io.c clear_bit_hook, see set_bit_hook for why
1339  */
1340 static int btrfs_clear_bit_hook(struct inode *inode,
1341                                 struct extent_state *state, unsigned long bits)
1342 {
1343         /*
1344          * set_bit and clear bit hooks normally require _irqsave/restore
1345          * but in this case, we are only testeing for the DELALLOC
1346          * bit, which is only set or cleared with irqs on
1347          */
1348         if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1349                 struct btrfs_root *root = BTRFS_I(inode)->root;
1350
1351                 if (bits & EXTENT_DO_ACCOUNTING) {
1352                         spin_lock(&BTRFS_I(inode)->accounting_lock);
1353                         BTRFS_I(inode)->outstanding_extents--;
1354                         spin_unlock(&BTRFS_I(inode)->accounting_lock);
1355                         btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1356                 }
1357
1358                 spin_lock(&root->fs_info->delalloc_lock);
1359                 if (state->end - state->start + 1 >
1360                     root->fs_info->delalloc_bytes) {
1361                         printk(KERN_INFO "btrfs warning: delalloc account "
1362                                "%llu %llu\n",
1363                                (unsigned long long)
1364                                state->end - state->start + 1,
1365                                (unsigned long long)
1366                                root->fs_info->delalloc_bytes);
1367                         btrfs_delalloc_free_space(root, inode, (u64)-1);
1368                         root->fs_info->delalloc_bytes = 0;
1369                         BTRFS_I(inode)->delalloc_bytes = 0;
1370                 } else {
1371                         btrfs_delalloc_free_space(root, inode,
1372                                                   state->end -
1373                                                   state->start + 1);
1374                         root->fs_info->delalloc_bytes -= state->end -
1375                                 state->start + 1;
1376                         BTRFS_I(inode)->delalloc_bytes -= state->end -
1377                                 state->start + 1;
1378                 }
1379                 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1380                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1381                         list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1382                 }
1383                 spin_unlock(&root->fs_info->delalloc_lock);
1384         }
1385         return 0;
1386 }
1387
1388 /*
1389  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1390  * we don't create bios that span stripes or chunks
1391  */
1392 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1393                          size_t size, struct bio *bio,
1394                          unsigned long bio_flags)
1395 {
1396         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1397         struct btrfs_mapping_tree *map_tree;
1398         u64 logical = (u64)bio->bi_sector << 9;
1399         u64 length = 0;
1400         u64 map_length;
1401         int ret;
1402
1403         if (bio_flags & EXTENT_BIO_COMPRESSED)
1404                 return 0;
1405
1406         length = bio->bi_size;
1407         map_tree = &root->fs_info->mapping_tree;
1408         map_length = length;
1409         ret = btrfs_map_block(map_tree, READ, logical,
1410                               &map_length, NULL, 0);
1411
1412         if (map_length < length + size)
1413                 return 1;
1414         return 0;
1415 }
1416
1417 /*
1418  * in order to insert checksums into the metadata in large chunks,
1419  * we wait until bio submission time.   All the pages in the bio are
1420  * checksummed and sums are attached onto the ordered extent record.
1421  *
1422  * At IO completion time the cums attached on the ordered extent record
1423  * are inserted into the btree
1424  */
1425 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1426                                     struct bio *bio, int mirror_num,
1427                                     unsigned long bio_flags)
1428 {
1429         struct btrfs_root *root = BTRFS_I(inode)->root;
1430         int ret = 0;
1431
1432         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1433         BUG_ON(ret);
1434         return 0;
1435 }
1436
1437 /*
1438  * in order to insert checksums into the metadata in large chunks,
1439  * we wait until bio submission time.   All the pages in the bio are
1440  * checksummed and sums are attached onto the ordered extent record.
1441  *
1442  * At IO completion time the cums attached on the ordered extent record
1443  * are inserted into the btree
1444  */
1445 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1446                           int mirror_num, unsigned long bio_flags)
1447 {
1448         struct btrfs_root *root = BTRFS_I(inode)->root;
1449         return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1450 }
1451
1452 /*
1453  * extent_io.c submission hook. This does the right thing for csum calculation
1454  * on write, or reading the csums from the tree before a read
1455  */
1456 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1457                           int mirror_num, unsigned long bio_flags)
1458 {
1459         struct btrfs_root *root = BTRFS_I(inode)->root;
1460         int ret = 0;
1461         int skip_sum;
1462
1463         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1464
1465         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1466         BUG_ON(ret);
1467
1468         if (!(rw & (1 << BIO_RW))) {
1469                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1470                         return btrfs_submit_compressed_read(inode, bio,
1471                                                     mirror_num, bio_flags);
1472                 } else if (!skip_sum)
1473                         btrfs_lookup_bio_sums(root, inode, bio, NULL);
1474                 goto mapit;
1475         } else if (!skip_sum) {
1476                 /* csum items have already been cloned */
1477                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1478                         goto mapit;
1479                 /* we're doing a write, do the async checksumming */
1480                 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1481                                    inode, rw, bio, mirror_num,
1482                                    bio_flags, __btrfs_submit_bio_start,
1483                                    __btrfs_submit_bio_done);
1484         }
1485
1486 mapit:
1487         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1488 }
1489
1490 /*
1491  * given a list of ordered sums record them in the inode.  This happens
1492  * at IO completion time based on sums calculated at bio submission time.
1493  */
1494 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1495                              struct inode *inode, u64 file_offset,
1496                              struct list_head *list)
1497 {
1498         struct btrfs_ordered_sum *sum;
1499
1500         btrfs_set_trans_block_group(trans, inode);
1501
1502         list_for_each_entry(sum, list, list) {
1503                 btrfs_csum_file_blocks(trans,
1504                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1505         }
1506         return 0;
1507 }
1508
1509 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1510 {
1511         if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1512                 WARN_ON(1);
1513         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1514                                    GFP_NOFS);
1515 }
1516
/*
 * see btrfs_writepage_start_hook for details on why this is required
 *
 * One of these is allocated for each page that needs fixing up and is
 * handed to btrfs_writepage_fixup_worker through the embedded work item.
 */
struct btrfs_writepage_fixup {
        /* page to fix up; the start hook takes a reference before queueing */
        struct page *page;
        /* work item queued on fs_info->fixup_workers */
        struct btrfs_work work;
};
1522
1523 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1524 {
1525         struct btrfs_writepage_fixup *fixup;
1526         struct btrfs_ordered_extent *ordered;
1527         struct page *page;
1528         struct inode *inode;
1529         u64 page_start;
1530         u64 page_end;
1531
1532         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1533         page = fixup->page;
1534 again:
1535         lock_page(page);
1536         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1537                 ClearPageChecked(page);
1538                 goto out_page;
1539         }
1540
1541         inode = page->mapping->host;
1542         page_start = page_offset(page);
1543         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1544
1545         lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1546
1547         /* already ordered? We're done */
1548         if (PagePrivate2(page))
1549                 goto out;
1550
1551         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1552         if (ordered) {
1553                 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1554                               page_end, GFP_NOFS);
1555                 unlock_page(page);
1556                 btrfs_start_ordered_extent(inode, ordered, 1);
1557                 goto again;
1558         }
1559
1560         btrfs_set_extent_delalloc(inode, page_start, page_end);
1561         ClearPageChecked(page);
1562 out:
1563         unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1564 out_page:
1565         unlock_page(page);
1566         page_cache_release(page);
1567 }
1568
1569 /*
1570  * There are a few paths in the higher layers of the kernel that directly
1571  * set the page dirty bit without asking the filesystem if it is a
1572  * good idea.  This causes problems because we want to make sure COW
1573  * properly happens and the data=ordered rules are followed.
1574  *
1575  * In our case any range that doesn't have the ORDERED bit set
1576  * hasn't been properly setup for IO.  We kick off an async process
1577  * to fix it up.  The async helper will wait for ordered extents, set
1578  * the delalloc bit and make it safe to write the page.
1579  */
1580 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1581 {
1582         struct inode *inode = page->mapping->host;
1583         struct btrfs_writepage_fixup *fixup;
1584         struct btrfs_root *root = BTRFS_I(inode)->root;
1585
1586         /* this page is properly in the ordered list */
1587         if (TestClearPagePrivate2(page))
1588                 return 0;
1589
1590         if (PageChecked(page))
1591                 return -EAGAIN;
1592
1593         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1594         if (!fixup)
1595                 return -EAGAIN;
1596
1597         SetPageChecked(page);
1598         page_cache_get(page);
1599         fixup->work.func = btrfs_writepage_fixup_worker;
1600         fixup->page = page;
1601         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1602         return -EAGAIN;
1603 }
1604
1605 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1606                                        struct inode *inode, u64 file_pos,
1607                                        u64 disk_bytenr, u64 disk_num_bytes,
1608                                        u64 num_bytes, u64 ram_bytes,
1609                                        u8 compression, u8 encryption,
1610                                        u16 other_encoding, int extent_type)
1611 {
1612         struct btrfs_root *root = BTRFS_I(inode)->root;
1613         struct btrfs_file_extent_item *fi;
1614         struct btrfs_path *path;
1615         struct extent_buffer *leaf;
1616         struct btrfs_key ins;
1617         u64 hint;
1618         int ret;
1619
1620         path = btrfs_alloc_path();
1621         BUG_ON(!path);
1622
1623         path->leave_spinning = 1;
1624
1625         /*
1626          * we may be replacing one extent in the tree with another.
1627          * The new extent is pinned in the extent map, and we don't want
1628          * to drop it from the cache until it is completely in the btree.
1629          *
1630          * So, tell btrfs_drop_extents to leave this extent in the cache.
1631          * the caller is expected to unpin it and allow it to be merged
1632          * with the others.
1633          */
1634         ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
1635                                  &hint, 0);
1636         BUG_ON(ret);
1637
1638         ins.objectid = inode->i_ino;
1639         ins.offset = file_pos;
1640         ins.type = BTRFS_EXTENT_DATA_KEY;
1641         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1642         BUG_ON(ret);
1643         leaf = path->nodes[0];
1644         fi = btrfs_item_ptr(leaf, path->slots[0],
1645                             struct btrfs_file_extent_item);
1646         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1647         btrfs_set_file_extent_type(leaf, fi, extent_type);
1648         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1649         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1650         btrfs_set_file_extent_offset(leaf, fi, 0);
1651         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1652         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1653         btrfs_set_file_extent_compression(leaf, fi, compression);
1654         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1655         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1656
1657         btrfs_unlock_up_safe(path, 1);
1658         btrfs_set_lock_blocking(leaf);
1659
1660         btrfs_mark_buffer_dirty(leaf);
1661
1662         inode_add_bytes(inode, num_bytes);
1663
1664         ins.objectid = disk_bytenr;
1665         ins.offset = disk_num_bytes;
1666         ins.type = BTRFS_EXTENT_ITEM_KEY;
1667         ret = btrfs_alloc_reserved_file_extent(trans, root,
1668                                         root->root_key.objectid,
1669                                         inode->i_ino, file_pos, &ins);
1670         BUG_ON(ret);
1671         btrfs_free_path(path);
1672
1673         return 0;
1674 }
1675
1676 /*
1677  * helper function for btrfs_finish_ordered_io, this
1678  * just reads in some of the csum leaves to prime them into ram
1679  * before we start the transaction.  It limits the amount of btree
1680  * reads required while inside the transaction.
1681  */
1682 static noinline void reada_csum(struct btrfs_root *root,
1683                                 struct btrfs_path *path,
1684                                 struct btrfs_ordered_extent *ordered_extent)
1685 {
1686         struct btrfs_ordered_sum *sum;
1687         u64 bytenr;
1688
1689         sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1690                          list);
1691         bytenr = sum->sums[0].bytenr;
1692
1693         /*
1694          * we don't care about the results, the point of this search is
1695          * just to get the btree leaves into ram
1696          */
1697         btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1698 }
1699
1700 /* as ordered data IO finishes, this gets called so we can finish
1701  * an ordered extent if the range of bytes in the file it covers are
1702  * fully written.
1703  */
1704 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1705 {
1706         struct btrfs_root *root = BTRFS_I(inode)->root;
1707         struct btrfs_trans_handle *trans;
1708         struct btrfs_ordered_extent *ordered_extent = NULL;
1709         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1710         struct btrfs_path *path;
1711         int compressed = 0;
1712         int ret;
1713
1714         ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1715         if (!ret)
1716                 return 0;
1717
1718         /*
1719          * before we join the transaction, try to do some of our IO.
1720          * This will limit the amount of IO that we have to do with
1721          * the transaction running.  We're unlikely to need to do any
1722          * IO if the file extents are new, the disk_i_size checks
1723          * covers the most common case.
1724          */
1725         if (start < BTRFS_I(inode)->disk_i_size) {
1726                 path = btrfs_alloc_path();
1727                 if (path) {
1728                         ret = btrfs_lookup_file_extent(NULL, root, path,
1729                                                        inode->i_ino,
1730                                                        start, 0);
1731                         ordered_extent = btrfs_lookup_ordered_extent(inode,
1732                                                                      start);
1733                         if (!list_empty(&ordered_extent->list)) {
1734                                 btrfs_release_path(root, path);
1735                                 reada_csum(root, path, ordered_extent);
1736                         }
1737                         btrfs_free_path(path);
1738                 }
1739         }
1740
1741         if (!ordered_extent)
1742                 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1743         BUG_ON(!ordered_extent);
1744         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1745                 BUG_ON(!list_empty(&ordered_extent->list));
1746                 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1747                 if (!ret) {
1748                         trans = btrfs_join_transaction(root, 1);
1749                         ret = btrfs_update_inode(trans, root, inode);
1750                         BUG_ON(ret);
1751                         btrfs_end_transaction(trans, root);
1752                 }
1753                 goto out;
1754         }
1755
1756         lock_extent(io_tree, ordered_extent->file_offset,
1757                     ordered_extent->file_offset + ordered_extent->len - 1,
1758                     GFP_NOFS);
1759
1760         trans = btrfs_join_transaction(root, 1);
1761
1762         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1763                 compressed = 1;
1764         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1765                 BUG_ON(compressed);
1766                 ret = btrfs_mark_extent_written(trans, inode,
1767                                                 ordered_extent->file_offset,
1768                                                 ordered_extent->file_offset +
1769                                                 ordered_extent->len);
1770                 BUG_ON(ret);
1771         } else {
1772                 ret = insert_reserved_file_extent(trans, inode,
1773                                                 ordered_extent->file_offset,
1774                                                 ordered_extent->start,
1775                                                 ordered_extent->disk_len,
1776                                                 ordered_extent->len,
1777                                                 ordered_extent->len,
1778                                                 compressed, 0, 0,
1779                                                 BTRFS_FILE_EXTENT_REG);
1780                 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1781                                    ordered_extent->file_offset,
1782                                    ordered_extent->len);
1783                 BUG_ON(ret);
1784         }
1785         unlock_extent(io_tree, ordered_extent->file_offset,
1786                     ordered_extent->file_offset + ordered_extent->len - 1,
1787                     GFP_NOFS);
1788         add_pending_csums(trans, inode, ordered_extent->file_offset,
1789                           &ordered_extent->list);
1790
1791         /* this also removes the ordered extent from the tree */
1792         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1793         ret = btrfs_update_inode(trans, root, inode);
1794         BUG_ON(ret);
1795         btrfs_end_transaction(trans, root);
1796 out:
1797         /* once for us */
1798         btrfs_put_ordered_extent(ordered_extent);
1799         /* once for the tree */
1800         btrfs_put_ordered_extent(ordered_extent);
1801
1802         return 0;
1803 }
1804
1805 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1806                                 struct extent_state *state, int uptodate)
1807 {
1808         ClearPagePrivate2(page);
1809         return btrfs_finish_ordered_io(page->mapping->host, start, end);
1810 }
1811
/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 *
 * A pointer to the record is stored as the private value of the range's
 * state in the inode's io_failure_tree.
 */
struct io_failure_record {
        /* NOTE(review): btrfs_io_failed_hook never assigns this -- confirm it is used */
        struct page *page;
        /* file offset of the first byte of the failed range */
        u64 start;
        /* length of the failed range in bytes (end - start + 1) */
        u64 len;
        /* logical address the retry bio is aimed at, derived from the extent map */
        u64 logical;
        /* flags handed to the submit hook on retry (EXTENT_BIO_COMPRESSED or 0) */
        unsigned long bio_flags;
        /* mirror number of the latest retry; incremented before each attempt */
        int last_mirror;
};
1828
1829 static int btrfs_io_failed_hook(struct bio *failed_bio,
1830                          struct page *page, u64 start, u64 end,
1831                          struct extent_state *state)
1832 {
1833         struct io_failure_record *failrec = NULL;
1834         u64 private;
1835         struct extent_map *em;
1836         struct inode *inode = page->mapping->host;
1837         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1838         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1839         struct bio *bio;
1840         int num_copies;
1841         int ret;
1842         int rw;
1843         u64 logical;
1844
1845         ret = get_state_private(failure_tree, start, &private);
1846         if (ret) {
1847                 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1848                 if (!failrec)
1849                         return -ENOMEM;
1850                 failrec->start = start;
1851                 failrec->len = end - start + 1;
1852                 failrec->last_mirror = 0;
1853                 failrec->bio_flags = 0;
1854
1855                 read_lock(&em_tree->lock);
1856                 em = lookup_extent_mapping(em_tree, start, failrec->len);
1857                 if (em->start > start || em->start + em->len < start) {
1858                         free_extent_map(em);
1859                         em = NULL;
1860                 }
1861                 read_unlock(&em_tree->lock);
1862
1863                 if (!em || IS_ERR(em)) {
1864                         kfree(failrec);
1865                         return -EIO;
1866                 }
1867                 logical = start - em->start;
1868                 logical = em->block_start + logical;
1869                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1870                         logical = em->block_start;
1871                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1872                 }
1873                 failrec->logical = logical;
1874                 free_extent_map(em);
1875                 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1876                                 EXTENT_DIRTY, GFP_NOFS);
1877                 set_state_private(failure_tree, start,
1878                                  (u64)(unsigned long)failrec);
1879         } else {
1880                 failrec = (struct io_failure_record *)(unsigned long)private;
1881         }
1882         num_copies = btrfs_num_copies(
1883                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
1884                               failrec->logical, failrec->len);
1885         failrec->last_mirror++;
1886         if (!state) {
1887                 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1888                 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1889                                                     failrec->start,
1890                                                     EXTENT_LOCKED);
1891                 if (state && state->start != failrec->start)
1892                         state = NULL;
1893                 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1894         }
1895         if (!state || failrec->last_mirror > num_copies) {
1896                 set_state_private(failure_tree, failrec->start, 0);
1897                 clear_extent_bits(failure_tree, failrec->start,
1898                                   failrec->start + failrec->len - 1,
1899                                   EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1900                 kfree(failrec);
1901                 return -EIO;
1902         }
1903         bio = bio_alloc(GFP_NOFS, 1);
1904         bio->bi_private = state;
1905         bio->bi_end_io = failed_bio->bi_end_io;
1906         bio->bi_sector = failrec->logical >> 9;
1907         bio->bi_bdev = failed_bio->bi_bdev;
1908         bio->bi_size = 0;
1909
1910         bio_add_page(bio, page, failrec->len, start - page_offset(page));
1911         if (failed_bio->bi_rw & (1 << BIO_RW))
1912                 rw = WRITE;
1913         else
1914                 rw = READ;
1915
1916         BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1917                                                       failrec->last_mirror,
1918                                                       failrec->bio_flags);
1919         return 0;
1920 }
1921
1922 /*
1923  * each time an IO finishes, we do a fast check in the IO failure tree
1924  * to see if we need to process or clean up an io_failure_record
1925  */
1926 static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1927 {
1928         u64 private;
1929         u64 private_failure;
1930         struct io_failure_record *failure;
1931         int ret;
1932
1933         private = 0;
1934         if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1935                              (u64)-1, 1, EXTENT_DIRTY)) {
1936                 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1937                                         start, &private_failure);
1938                 if (ret == 0) {
1939                         failure = (struct io_failure_record *)(unsigned long)
1940                                    private_failure;
1941                         set_state_private(&BTRFS_I(inode)->io_failure_tree,
1942                                           failure->start, 0);
1943                         clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1944                                           failure->start,
1945                                           failure->start + failure->len - 1,
1946                                           EXTENT_DIRTY | EXTENT_LOCKED,
1947                                           GFP_NOFS);
1948                         kfree(failure);
1949                 }
1950         }
1951         return 0;
1952 }
1953
1954 /*
1955  * when reads are done, we need to check csums to verify the data is correct
1956  * if there's a match, we allow the bio to finish.  If not, we go through
1957  * the io_failure_record routines to find good copies
1958  */
1959 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1960                                struct extent_state *state)
1961 {
1962         size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1963         struct inode *inode = page->mapping->host;
1964         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1965         char *kaddr;
1966         u64 private = ~(u32)0;
1967         int ret;
1968         struct btrfs_root *root = BTRFS_I(inode)->root;
1969         u32 csum = ~(u32)0;
1970
1971         if (PageChecked(page)) {
1972                 ClearPageChecked(page);
1973                 goto good;
1974         }
1975
1976         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1977                 return 0;
1978
1979         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1980             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
1981                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1982                                   GFP_NOFS);
1983                 return 0;
1984         }
1985
1986         if (state && state->start == start) {
1987                 private = state->private;
1988                 ret = 0;
1989         } else {
1990                 ret = get_state_private(io_tree, start, &private);
1991         }
1992         kaddr = kmap_atomic(page, KM_USER0);
1993         if (ret)
1994                 goto zeroit;
1995
1996         csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
1997         btrfs_csum_final(csum, (char *)&csum);
1998         if (csum != private)
1999                 goto zeroit;
2000
2001         kunmap_atomic(kaddr, KM_USER0);
2002 good:
2003         /* if the io failure tree for this inode is non-empty,
2004          * check to see if we've recovered from a failed IO
2005          */
2006         btrfs_clean_io_failures(inode, start);
2007         return 0;
2008
2009 zeroit:
2010         if (printk_ratelimit()) {
2011                 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
2012                        "private %llu\n", page->mapping->host->i_ino,
2013                        (unsigned long long)start, csum,
2014                        (unsigned long long)private);
2015         }
2016         memset(kaddr + offset, 1, end - start + 1);
2017         flush_dcache_page(page);
2018         kunmap_atomic(kaddr, KM_USER0);
2019         if (private == 0)
2020                 return 0;
2021         return -EIO;
2022 }
2023
2024 /*
2025  * This creates an orphan entry for the given inode in case something goes
2026  * wrong in the middle of an unlink/truncate.
2027  */
2028 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2029 {
2030         struct btrfs_root *root = BTRFS_I(inode)->root;
2031         int ret = 0;
2032
2033         spin_lock(&root->list_lock);
2034
2035         /* already on the orphan list, we're good */
2036         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2037                 spin_unlock(&root->list_lock);
2038                 return 0;
2039         }
2040
2041         list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2042
2043         spin_unlock(&root->list_lock);
2044
2045         /*
2046          * insert an orphan item to track this unlinked/truncated file
2047          */
2048         ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2049
2050         return ret;
2051 }
2052
2053 /*
2054  * We have done the truncate/delete so we can go ahead and remove the orphan
2055  * item for this particular inode.
2056  */
2057 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2058 {
2059         struct btrfs_root *root = BTRFS_I(inode)->root;
2060         int ret = 0;
2061
2062         spin_lock(&root->list_lock);
2063
2064         if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2065                 spin_unlock(&root->list_lock);
2066                 return 0;
2067         }
2068
2069         list_del_init(&BTRFS_I(inode)->i_orphan);
2070         if (!trans) {
2071                 spin_unlock(&root->list_lock);
2072                 return 0;
2073         }
2074
2075         spin_unlock(&root->list_lock);
2076
2077         ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2078
2079         return ret;
2080 }
2081
2082 /*
2083  * this cleans up any orphans that may be left on the list from the last use
2084  * of this root.
2085  */
2086 void btrfs_orphan_cleanup(struct btrfs_root *root)
2087 {
2088         struct btrfs_path *path;
2089         struct extent_buffer *leaf;
2090         struct btrfs_item *item;
2091         struct btrfs_key key, found_key;
2092         struct btrfs_trans_handle *trans;
2093         struct inode *inode;
2094         int ret = 0, nr_unlink = 0, nr_truncate = 0;
2095
2096         if (!xchg(&root->clean_orphans, 0))
2097                 return;
2098
2099         path = btrfs_alloc_path();
2100         BUG_ON(!path);
2101         path->reada = -1;
2102
2103         key.objectid = BTRFS_ORPHAN_OBJECTID;
2104         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
2105         key.offset = (u64)-1;
2106
2107         while (1) {
2108                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2109                 if (ret < 0) {
2110                         printk(KERN_ERR "Error searching slot for orphan: %d"
2111                                "\n", ret);
2112                         break;
2113                 }
2114
2115                 /*
2116                  * if ret == 0 means we found what we were searching for, which
2117                  * is weird, but possible, so only screw with path if we didnt
2118                  * find the key and see if we have stuff that matches
2119                  */
2120                 if (ret > 0) {
2121                         if (path->slots[0] == 0)
2122                                 break;
2123                         path->slots[0]--;
2124                 }
2125
2126                 /* pull out the item */
2127                 leaf = path->nodes[0];
2128                 item = btrfs_item_nr(leaf, path->slots[0]);
2129                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2130
2131                 /* make sure the item matches what we want */
2132                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
2133                         break;
2134                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
2135                         break;
2136
2137                 /* release the path since we're done with it */
2138                 btrfs_release_path(root, path);
2139
2140                 /*
2141                  * this is where we are basically btrfs_lookup, without the
2142                  * crossing root thing.  we store the inode number in the
2143                  * offset of the orphan item.
2144                  */
2145                 found_key.objectid = found_key.offset;
2146                 found_key.type = BTRFS_INODE_ITEM_KEY;
2147                 found_key.offset = 0;
2148                 inode = btrfs_iget(root->fs_info->sb, &found_key, root);
2149                 if (IS_ERR(inode))
2150                         break;
2151
2152                 /*
2153                  * add this inode to the orphan list so btrfs_orphan_del does
2154                  * the proper thing when we hit it
2155                  */
2156                 spin_lock(&root->list_lock);
2157                 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2158                 spin_unlock(&root->list_lock);
2159
2160                 /*
2161                  * if this is a bad inode, means we actually succeeded in
2162                  * removing the inode, but not the orphan record, which means
2163                  * we need to manually delete the orphan since iput will just
2164                  * do a destroy_inode
2165                  */
2166                 if (is_bad_inode(inode)) {
2167                         trans = btrfs_start_transaction(root, 1);
2168                         btrfs_orphan_del(trans, inode);
2169                         btrfs_end_transaction(trans, root);
2170                         iput(inode);
2171                         continue;
2172                 }
2173
2174                 /* if we have links, this was a truncate, lets do that */
2175                 if (inode->i_nlink) {
2176                         nr_truncate++;
2177                         btrfs_truncate(inode);
2178                 } else {
2179                         nr_unlink++;
2180                 }
2181
2182                 /* this will do delete_inode and everything for us */
2183                 iput(inode);
2184         }
2185
2186         if (nr_unlink)
2187                 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2188         if (nr_truncate)
2189                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2190
2191         btrfs_free_path(path);
2192 }
2193
2194 /*
2195  * very simple check to peek ahead in the leaf looking for xattrs.  If we
2196  * don't find any xattrs, we know there can't be any acls.
2197  *
2198  * slot is the slot the inode is in, objectid is the objectid of the inode
2199  */
2200 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2201                                           int slot, u64 objectid)
2202 {
2203         u32 nritems = btrfs_header_nritems(leaf);
2204         struct btrfs_key found_key;
2205         int scanned = 0;
2206
2207         slot++;
2208         while (slot < nritems) {
2209                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2210
2211                 /* we found a different objectid, there must not be acls */
2212                 if (found_key.objectid != objectid)
2213                         return 0;
2214
2215                 /* we found an xattr, assume we've got an acl */
2216                 if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2217                         return 1;
2218
2219                 /*
2220                  * we found a key greater than an xattr key, there can't
2221                  * be any acls later on
2222                  */
2223                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2224                         return 0;
2225
2226                 slot++;
2227                 scanned++;
2228
2229                 /*
2230                  * it goes inode, inode backrefs, xattrs, extents,
2231                  * so if there are a ton of hard links to an inode there can
2232                  * be a lot of backrefs.  Don't waste time searching too hard,
2233                  * this is just an optimization
2234                  */
2235                 if (scanned >= 8)
2236                         break;
2237         }
2238         /* we hit the end of the leaf before we found an xattr or
2239          * something larger than an xattr.  We have to assume the inode
2240          * has acls
2241          */
2242         return 1;
2243 }
2244
2245 /*
2246  * read an inode from the btree into the in-memory inode
2247  */
2248 static void btrfs_read_locked_inode(struct inode *inode)
2249 {
2250         struct btrfs_path *path;
2251         struct extent_buffer *leaf;
2252         struct btrfs_inode_item *inode_item;
2253         struct btrfs_timespec *tspec;
2254         struct btrfs_root *root = BTRFS_I(inode)->root;
2255         struct btrfs_key location;
2256         int maybe_acls;
2257         u64 alloc_group_block;
2258         u32 rdev;
2259         int ret;
2260
2261         path = btrfs_alloc_path();
2262         BUG_ON(!path);
2263         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2264
2265         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
2266         if (ret)
2267                 goto make_bad;
2268
2269         leaf = path->nodes[0];
2270         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2271                                     struct btrfs_inode_item);
2272
2273         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2274         inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
2275         inode->i_uid = btrfs_inode_uid(leaf, inode_item);
2276         inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2277         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2278
2279         tspec = btrfs_inode_atime(inode_item);
2280         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2281         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2282
2283         tspec = btrfs_inode_mtime(inode_item);
2284         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2285         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2286
2287         tspec = btrfs_inode_ctime(inode_item);
2288         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2289         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2290
2291         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2292         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2293         BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2294         inode->i_generation = BTRFS_I(inode)->generation;
2295         inode->i_rdev = 0;
2296         rdev = btrfs_inode_rdev(leaf, inode_item);
2297
2298         BTRFS_I(inode)->index_cnt = (u64)-1;
2299         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2300
2301         alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2302
2303         /*
2304          * try to precache a NULL acl entry for files that don't have
2305          * any xattrs or acls
2306          */
2307         maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2308         if (!maybe_acls)
2309                 cache_no_acl(inode);
2310
2311         BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2312                                                 alloc_group_block, 0);
2313         btrfs_free_path(path);
2314         inode_item = NULL;
2315
2316         switch (inode->i_mode & S_IFMT) {
2317         case S_IFREG:
2318                 inode->i_mapping->a_ops = &btrfs_aops;
2319                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2320                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2321                 inode->i_fop = &btrfs_file_operations;
2322                 inode->i_op = &btrfs_file_inode_operations;
2323                 break;
2324         case S_IFDIR:
2325                 inode->i_fop = &btrfs_dir_file_operations;
2326                 if (root == root->fs_info->tree_root)
2327                         inode->i_op = &btrfs_dir_ro_inode_operations;
2328                 else
2329                         inode->i_op = &btrfs_dir_inode_operations;
2330                 break;
2331         case S_IFLNK:
2332                 inode->i_op = &btrfs_symlink_inode_operations;
2333                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2334                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2335                 break;
2336         default:
2337                 inode->i_op = &btrfs_special_inode_operations;
2338                 init_special_inode(inode, inode->i_mode, rdev);
2339                 break;
2340         }
2341
2342         btrfs_update_iflags(inode);
2343         return;
2344
2345 make_bad:
2346         btrfs_free_path(path);
2347         make_bad_inode(inode);
2348 }
2349
2350 /*
2351  * given a leaf and an inode, copy the inode fields into the leaf
2352  */
2353 static void fill_inode_item(struct btrfs_trans_handle *trans,
2354                             struct extent_buffer *leaf,
2355                             struct btrfs_inode_item *item,
2356                             struct inode *inode)
2357 {
2358         btrfs_set_inode_uid(leaf, item, inode->i_uid);
2359         btrfs_set_inode_gid(leaf, item, inode->i_gid);
2360         btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2361         btrfs_set_inode_mode(leaf, item, inode->i_mode);
2362         btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2363
2364         btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2365                                inode->i_atime.tv_sec);
2366         btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2367                                 inode->i_atime.tv_nsec);
2368
2369         btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2370                                inode->i_mtime.tv_sec);
2371         btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2372                                 inode->i_mtime.tv_nsec);
2373
2374         btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2375                                inode->i_ctime.tv_sec);
2376         btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2377                                 inode->i_ctime.tv_nsec);
2378
2379         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2380         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2381         btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2382         btrfs_set_inode_transid(leaf, item, trans->transid);
2383         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2384         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2385         btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2386 }
2387
2388 /*
2389  * copy everything in the in-memory inode into the btree.
2390  */
2391 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2392                                 struct btrfs_root *root, struct inode *inode)
2393 {
2394         struct btrfs_inode_item *inode_item;
2395         struct btrfs_path *path;
2396         struct extent_buffer *leaf;
2397         int ret;
2398
2399         path = btrfs_alloc_path();
2400         BUG_ON(!path);
2401         path->leave_spinning = 1;
2402         ret = btrfs_lookup_inode(trans, root, path,
2403                                  &BTRFS_I(inode)->location, 1);
2404         if (ret) {
2405                 if (ret > 0)
2406                         ret = -ENOENT;
2407                 goto failed;
2408         }
2409
2410         btrfs_unlock_up_safe(path, 1);
2411         leaf = path->nodes[0];
2412         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2413                                   struct btrfs_inode_item);
2414
2415         fill_inode_item(trans, leaf, inode_item, inode);
2416         btrfs_mark_buffer_dirty(leaf);
2417         btrfs_set_inode_last_trans(trans, inode);
2418         ret = 0;
2419 failed:
2420         btrfs_free_path(path);
2421         return ret;
2422 }
2423
2424
2425 /*
2426  * unlink helper that gets used here in inode.c and in the tree logging
2427  * recovery code.  It remove a link in a directory with a given name, and
2428  * also drops the back refs in the inode to the directory
2429  */
2430 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2431                        struct btrfs_root *root,
2432                        struct inode *dir, struct inode *inode,
2433                        const char *name, int name_len)
2434 {
2435         struct btrfs_path *path;
2436         int ret = 0;
2437         struct extent_buffer *leaf;
2438         struct btrfs_dir_item *di;
2439         struct btrfs_key key;
2440         u64 index;
2441
2442         path = btrfs_alloc_path();
2443         if (!path) {
2444                 ret = -ENOMEM;
2445                 goto err;
2446         }
2447
2448         path->leave_spinning = 1;
2449         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2450                                     name, name_len, -1);
2451         if (IS_ERR(di)) {
2452                 ret = PTR_ERR(di);
2453                 goto err;
2454         }
2455         if (!di) {
2456                 ret = -ENOENT;
2457                 goto err;
2458         }
2459         leaf = path->nodes[0];
2460         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2461         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2462         if (ret)
2463                 goto err;
2464         btrfs_release_path(root, path);
2465
2466         ret = btrfs_del_inode_ref(trans, root, name, name_len,
2467                                   inode->i_ino,
2468                                   dir->i_ino, &index);
2469         if (ret) {
2470                 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2471                        "inode %lu parent %lu\n", name_len, name,
2472                        inode->i_ino, dir->i_ino);
2473                 goto err;
2474         }
2475
2476         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2477                                          index, name, name_len, -1);
2478         if (IS_ERR(di)) {
2479                 ret = PTR_ERR(di);
2480                 goto err;
2481         }
2482         if (!di) {
2483                 ret = -ENOENT;
2484                 goto err;
2485         }
2486         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2487         btrfs_release_path(root, path);
2488
2489         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2490                                          inode, dir->i_ino);
2491         BUG_ON(ret != 0 && ret != -ENOENT);
2492
2493         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2494                                            dir, index);
2495         BUG_ON(ret);
2496 err:
2497         btrfs_free_path(path);
2498         if (ret)
2499                 goto out;
2500
2501         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2502         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2503         btrfs_update_inode(trans, root, dir);
2504         btrfs_drop_nlink(inode);
2505         ret = btrfs_update_inode(trans, root, inode);
2506 out:
2507         return ret;
2508 }
2509
2510 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2511 {
2512         struct btrfs_root *root;
2513         struct btrfs_trans_handle *trans;
2514         struct inode *inode = dentry->d_inode;
2515         int ret;
2516         unsigned long nr = 0;
2517
2518         root = BTRFS_I(dir)->root;
2519
2520         /*
2521          * 5 items for unlink inode
2522          * 1 for orphan
2523          */
2524         ret = btrfs_reserve_metadata_space(root, 6);
2525         if (ret)
2526                 return ret;
2527
2528         trans = btrfs_start_transaction(root, 1);
2529         if (IS_ERR(trans)) {
2530                 btrfs_unreserve_metadata_space(root, 6);
2531                 return PTR_ERR(trans);
2532         }
2533
2534         btrfs_set_trans_block_group(trans, dir);
2535
2536         btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2537
2538         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2539                                  dentry->d_name.name, dentry->d_name.len);
2540
2541         if (inode->i_nlink == 0)
2542                 ret = btrfs_orphan_add(trans, inode);
2543
2544         nr = trans->blocks_used;
2545
2546         btrfs_end_transaction_throttle(trans, root);
2547         btrfs_unreserve_metadata_space(root, 6);
2548         btrfs_btree_balance_dirty(root, nr);
2549         return ret;
2550 }
2551
2552 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2553                         struct btrfs_root *root,
2554                         struct inode *dir, u64 objectid,
2555                         const char *name, int name_len)
2556 {
2557         struct btrfs_path *path;
2558         struct extent_buffer *leaf;
2559         struct btrfs_dir_item *di;
2560         struct btrfs_key key;
2561         u64 index;
2562         int ret;
2563
2564         path = btrfs_alloc_path();
2565         if (!path)
2566                 return -ENOMEM;
2567
2568         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2569                                    name, name_len, -1);
2570         BUG_ON(!di || IS_ERR(di));
2571
2572         leaf = path->nodes[0];
2573         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2574         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2575         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2576         BUG_ON(ret);
2577         btrfs_release_path(root, path);
2578
2579         ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2580                                  objectid, root->root_key.objectid,
2581                                  dir->i_ino, &index, name, name_len);
2582         if (ret < 0) {
2583                 BUG_ON(ret != -ENOENT);
2584                 di = btrfs_search_dir_index_item(root, path, dir->i_ino,
2585                                                  name, name_len);
2586                 BUG_ON(!di || IS_ERR(di));
2587
2588                 leaf = path->nodes[0];
2589                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2590                 btrfs_release_path(root, path);
2591                 index = key.offset;
2592         }
2593
2594         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2595                                          index, name, name_len, -1);
2596         BUG_ON(!di || IS_ERR(di));
2597
2598         leaf = path->nodes[0];
2599         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2600         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2601         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2602         BUG_ON(ret);
2603         btrfs_release_path(root, path);
2604
2605         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2606         dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2607         ret = btrfs_update_inode(trans, root, dir);
2608         BUG_ON(ret);
2609         dir->i_sb->s_dirt = 1;
2610
2611         btrfs_free_path(path);
2612         return 0;
2613 }
2614
2615 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2616 {
2617         struct inode *inode = dentry->d_inode;
2618         int err = 0;
2619         int ret;
2620         struct btrfs_root *root = BTRFS_I(dir)->root;
2621         struct btrfs_trans_handle *trans;
2622         unsigned long nr = 0;
2623
2624         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2625             inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2626                 return -ENOTEMPTY;
2627
2628         ret = btrfs_reserve_metadata_space(root, 5);
2629         if (ret)
2630                 return ret;
2631
2632         trans = btrfs_start_transaction(root, 1);
2633         if (IS_ERR(trans)) {
2634                 btrfs_unreserve_metadata_space(root, 5);
2635                 return PTR_ERR(trans);
2636         }
2637
2638         btrfs_set_trans_block_group(trans, dir);
2639
2640         if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2641                 err = btrfs_unlink_subvol(trans, root, dir,
2642                                           BTRFS_I(inode)->location.objectid,
2643                                           dentry->d_name.name,
2644                                           dentry->d_name.len);
2645                 goto out;
2646         }
2647
2648         err = btrfs_orphan_add(trans, inode);
2649         if (err)
2650                 goto out;
2651
2652         /* now the directory is empty */
2653         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2654                                  dentry->d_name.name, dentry->d_name.len);
2655         if (!err)
2656                 btrfs_i_size_write(inode, 0);
2657 out:
2658         nr = trans->blocks_used;
2659         ret = btrfs_end_transaction_throttle(trans, root);
2660         btrfs_unreserve_metadata_space(root, 5);
2661         btrfs_btree_balance_dirty(root, nr);
2662
2663         if (ret && !err)
2664                 err = ret;
2665         return err;
2666 }
2667
#if 0
/*
 * Truncating a large file can spend most of its IO reading leaves that
 * hold nothing but checksum items.  Those reads can be skipped, but only
 * with great care.
 *
 * The search stops one level above the leaves.  There we check that the
 * lowest key of the leaf we point at is a csum key of this inode at or
 * beyond the new size.
 *
 * We then look at the key of the following leaf.  If that one is a csum
 * key of the same inode as well, the leaf in between holds only csum
 * items and can be freed without ever being read.
 *
 * This is purely an optimization for large files and may end up doing
 * nothing at all.  It returns 0 unless things went badly.
 *
 * This code is compiled out.
 */
static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct inode *inode, u64 new_size)
{
	struct btrfs_key key;
	struct btrfs_key node_key;
	struct btrfs_key sibling_key;
	struct btrfs_key max_key;
	struct btrfs_leaf_ref *ref;
	u64 leaf_bytenr;
	u64 leaf_gen;
	int nr_ptrs;
	int ret;

	path->lowest_level = 1;
	key.objectid = inode->i_ino;
	key.type = BTRFS_CSUM_ITEM_KEY;
	key.offset = new_size;

again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	/* from here on, giving up early is not an error */
	ret = 0;
	if (path->nodes[1] == NULL)
		goto out;

	btrfs_node_key_to_cpu(path->nodes[1], &node_key, path->slots[1]);
	nr_ptrs = btrfs_header_nritems(path->nodes[1]);
	if (nr_ptrs == 0)
		goto out;

	if (path->slots[1] >= nr_ptrs)
		goto next_node;

	/* did we find a key greater than anything we want to delete? */
	if (node_key.objectid > inode->i_ino ||
	    (node_key.objectid == inode->i_ino && node_key.type > key.type))
		goto out;

	/*
	 * The key of the following leaf is needed to prove that our leaf
	 * contains only checksum items.  That does not work when our leaf
	 * is the last one in this node, so move on to the next node.
	 */
	if (path->slots[1] + 1 >= nr_ptrs)
		goto next_node;

	/*
	 * There is one more slot after us in this node, read its key so we
	 * can verify it is a checksum item too.
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &sibling_key,
			      path->slots[1] + 1);

	if (node_key.objectid < inode->i_ino)
		goto next_key;

	if (node_key.type != key.type || node_key.offset < new_size)
		goto next_key;

	/*
	 * If the next leaf does not start with a csum key of this objectid
	 * we cannot be sure there are no good items inside this leaf.
	 * Bail out.
	 */
	if (sibling_key.objectid != inode->i_ino ||
	    sibling_key.type != key.type)
		goto out;

	leaf_bytenr = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);

	/*
	 * Safe to delete: the leaf holds only csum items of this inode at
	 * an offset >= new_size.
	 */
	ret = btrfs_del_leaf(trans, root, path, leaf_bytenr);
	BUG_ON(ret);

	if (root->ref_cows && leaf_gen < trans->transid) {
		ref = btrfs_alloc_leaf_ref(root, 0);
		if (!ref) {
			WARN_ON(1);
		} else {
			ref->root_gen = root->root_key.offset;
			ref->bytenr = leaf_bytenr;
			ref->owner = 0;
			ref->generation = leaf_gen;
			ref->nritems = 0;

			btrfs_sort_leaf_ref(ref);

			ret = btrfs_add_leaf_ref(root, ref, 0);
			WARN_ON(ret);
			btrfs_free_leaf_ref(root, ref);
		}
	}

next_key:
	btrfs_release_path(root, path);

	/* keep going from the next leaf as long as it is still ours */
	if (sibling_key.objectid == inode->i_ino &&
	    sibling_key.type == key.type &&
	    sibling_key.offset > key.offset) {
		key.offset = sibling_key.offset;
		cond_resched();
		goto again;
	}
	ret = 0;
	goto out;

next_node:
	/*
	 * Search forward from the last key in the node, this will bring us
	 * into the next node in the tree.
	 */
	btrfs_node_key_to_cpu(path->nodes[1], &node_key, nr_ptrs - 1);

	/* unlikely, but we increment below, so check to be safe */
	if (node_key.offset == (u64)-1)
		goto out;

	/*
	 * search_forward needs a path with locks held, so the search is
	 * redone.  This may race with a balance and return a path that we
	 * could modify, but this drop is just an optimization and is
	 * allowed to miss some leaves.
	 */
	btrfs_release_path(root, path);
	node_key.offset++;

	/* largest csum key of this inode, the upper bound of the search */
	max_key.objectid = key.objectid;
	max_key.type = key.type;
	max_key.offset = (u64)-1;

	path->keep_locks = 1;
	ret = btrfs_search_forward(root, &node_key, &max_key,
				   path, 0, 0);
	path->keep_locks = 0;
	if (ret || node_key.objectid != key.objectid ||
	    node_key.type != key.type) {
		ret = 0;
		goto out;
	}

	key.offset = node_key.offset;
	btrfs_release_path(root, path);
	cond_resched();
	goto again;

out:
	/* fixup any changes we've made to the path */
	path->lowest_level = 0;
	path->keep_locks = 0;
	btrfs_release_path(root, path);
	return ret;
}

#endif
2839
2840 /*
2841  * this can truncate away extent items, csum items and directory items.
2842  * It starts at a high offset and removes keys until it can't find
2843  * any higher than new_size
2844  *
2845  * csum items that cross the new i_size are truncated to the new size
2846  * as well.
2847  *
2848  * min_type is the minimum key type to truncate down to.  If set to 0, this
2849  * will kill all the items on this inode, including the INODE_ITEM_KEY.
2850  */
2851 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2852                                         struct btrfs_root *root,
2853                                         struct inode *inode,
2854                                         u64 new_size, u32 min_type)
2855 {
2856         int ret;
2857         struct btrfs_path *path;
2858         struct btrfs_key key;
2859         struct btrfs_key found_key;
2860         u32 found_type = (u8)-1;
2861         struct extent_buffer *leaf;
2862         struct btrfs_file_extent_item *fi;
2863         u64 extent_start = 0;
2864         u64 extent_num_bytes = 0;
2865         u64 extent_offset = 0;
2866         u64 item_end = 0;
2867         int found_extent;
2868         int del_item;
2869         int pending_del_nr = 0;
2870         int pending_del_slot = 0;
2871         int extent_type = -1;
2872         int encoding;
2873         u64 mask = root->sectorsize - 1;
2874
2875         if (root->ref_cows)
2876                 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2877         path = btrfs_alloc_path();
2878         BUG_ON(!path);
2879         path->reada = -1;
2880
2881         /* FIXME, add redo link to tree so we don't leak on crash */
2882         key.objectid = inode->i_ino;
2883         key.offset = (u64)-1;
2884         key.type = (u8)-1;
2885
2886 search_again:
2887         path->leave_spinning = 1;
2888         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2889         if (ret < 0)
2890                 goto error;
2891
2892         if (ret > 0) {
2893                 /* there are no items in the tree for us to truncate, we're
2894                  * done
2895                  */
2896                 if (path->slots[0] == 0) {
2897                         ret = 0;
2898                         goto error;
2899                 }
2900                 path->slots[0]--;
2901         }
2902
2903         while (1) {
2904                 fi = NULL;
2905                 leaf = path->nodes[0];
2906                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2907                 found_type = btrfs_key_type(&found_key);
2908                 encoding = 0;
2909
2910                 if (found_key.objectid != inode->i_ino)
2911                         break;
2912
2913                 if (found_type < min_type)
2914                         break;
2915
2916                 item_end = found_key.offset;
2917                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2918                         fi = btrfs_item_ptr(leaf, path->slots[0],
2919                                             struct btrfs_file_extent_item);
2920                         extent_type = btrfs_file_extent_type(leaf, fi);
2921                         encoding = btrfs_file_extent_compression(leaf, fi);
2922                         encoding |= btrfs_file_extent_encryption(leaf, fi);
2923                         encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2924
2925                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2926                                 item_end +=
2927                                     btrfs_file_extent_num_bytes(leaf, fi);
2928                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2929                                 item_end += btrfs_file_extent_inline_len(leaf,
2930                                                                          fi);
2931                         }
2932                         item_end--;
2933                 }
2934                 if (item_end < new_size) {
2935                         if (found_type == BTRFS_DIR_ITEM_KEY)
2936                                 found_type = BTRFS_INODE_ITEM_KEY;
2937                         else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2938                                 found_type = BTRFS_EXTENT_DATA_KEY;
2939                         else if (found_type == BTRFS_EXTENT_DATA_KEY)
2940                                 found_type = BTRFS_XATTR_ITEM_KEY;
2941                         else if (found_type == BTRFS_XATTR_ITEM_KEY)
2942                                 found_type = BTRFS_INODE_REF_KEY;
2943                         else if (found_type)
2944                                 found_type--;
2945                         else
2946                                 break;
2947                         btrfs_set_key_type(&key, found_type);
2948                         goto next;
2949                 }
2950                 if (found_key.offset >= new_size)
2951                         del_item = 1;
2952                 else
2953                         del_item = 0;
2954                 found_extent = 0;
2955
2956                 /* FIXME, shrink the extent if the ref count is only 1 */
2957                 if (found_type != BTRFS_EXTENT_DATA_KEY)
2958                         goto delete;
2959
2960                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2961                         u64 num_dec;
2962                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2963                         if (!del_item && !encoding) {
2964                                 u64 orig_num_bytes =
2965                                         btrfs_file_extent_num_bytes(leaf, fi);
2966                                 extent_num_bytes = new_size -
2967                                         found_key.offset + root->sectorsize - 1;
2968                                 extent_num_bytes = extent_num_bytes &
2969                                         ~((u64)root->sectorsize - 1);
2970                                 btrfs_set_file_extent_num_bytes(leaf, fi,
2971                                                          extent_num_bytes);
2972                                 num_dec = (orig_num_bytes -
2973                                            extent_num_bytes);
2974                                 if (root->ref_cows && extent_start != 0)
2975                                         inode_sub_bytes(inode, num_dec);
2976                                 btrfs_mark_buffer_dirty(leaf);
2977                         } else {
2978                                 extent_num_bytes =
2979                                         btrfs_file_extent_disk_num_bytes(leaf,
2980                                                                          fi);
2981                                 extent_offset = found_key.offset -
2982                                         btrfs_file_extent_offset(leaf, fi);
2983
2984                                 /* FIXME blocksize != 4096 */
2985                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2986                                 if (extent_start != 0) {
2987                                         found_extent = 1;
2988                                         if (root->ref_cows)
2989                                                 inode_sub_bytes(inode, num_dec);
2990                                 }
2991                         }
2992                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2993                         /*
2994                          * we can't truncate inline items that have had
2995                          * special encodings
2996                          */
2997                         if (!del_item &&
2998                             btrfs_file_extent_compression(leaf, fi) == 0 &&
2999                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
3000                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
3001                                 u32 size = new_size - found_key.offset;
3002
3003                                 if (root->ref_cows) {
3004                                         inode_sub_bytes(inode, item_end + 1 -
3005                                                         new_size);
3006                                 }
3007                                 size =
3008                                     btrfs_file_extent_calc_inline_size(size);
3009                                 ret = btrfs_truncate_item(trans, root, path,
3010                                                           size, 1);
3011                                 BUG_ON(ret);
3012                         } else if (root->ref_cows) {
3013                                 inode_sub_bytes(inode, item_end + 1 -
3014                                                 found_key.offset);
3015                         }
3016                 }
3017 delete:
3018                 if (del_item) {
3019                         if (!pending_del_nr) {
3020                                 /* no pending yet, add ourselves */
3021                                 pending_del_slot = path->slots[0];
3022                                 pending_del_nr = 1;
3023                         } else if (pending_del_nr &&
3024                                    path->slots[0] + 1 == pending_del_slot) {
3025                                 /* hop on the pending chunk */
3026                                 pending_del_nr++;
3027                                 pending_del_slot = path->slots[0];
3028                         } else {
3029                                 BUG();
3030                         }
3031                 } else {
3032                         break;
3033                 }
3034                 if (found_extent && root->ref_cows) {
3035                         btrfs_set_path_blocking(path);
3036                         ret = btrfs_free_extent(trans, root, extent_start,
3037                                                 extent_num_bytes, 0,
3038                                                 btrfs_header_owner(leaf),
3039                                                 inode->i_ino, extent_offset);
3040                         BUG_ON(ret);
3041                 }
3042 next:
3043                 if (path->slots[0] == 0) {
3044                         if (pending_del_nr)
3045                                 goto del_pending;
3046                         btrfs_release_path(root, path);
3047                         if (found_type == BTRFS_INODE_ITEM_KEY)
3048                                 break;
3049                         goto search_again;
3050                 }
3051
3052                 path->slots[0]--;
3053                 if (pending_del_nr &&
3054                     path->slots[0] + 1 != pending_del_slot) {
3055                         struct btrfs_key debug;
3056 del_pending:
3057                         btrfs_item_key_to_cpu(path->nodes[0], &debug,
3058                                               pending_del_slot);
3059                         ret = btrfs_del_items(trans, root, path,
3060                                               pending_del_slot,
3061                                               pending_del_nr);
3062                         BUG_ON(ret);
3063                         pending_del_nr = 0;
3064                         btrfs_release_path(root, path);
3065                         if (found_type == BTRFS_INODE_ITEM_KEY)
3066                                 break;
3067                         goto search_again;
3068                 }
3069         }
3070         ret = 0;
3071 error:
3072         if (pending_del_nr) {
3073                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3074                                       pending_del_nr);
3075         }
3076         btrfs_free_path(path);
3077         return ret;
3078 }
3079
3080 /*
3081  * taken from block_truncate_page, but does cow as it zeros out
3082  * any bytes left in the last page in the file.
3083  */
3084 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3085 {
3086         struct inode *inode = mapping->host;
3087         struct btrfs_root *root = BTRFS_I(inode)->root;
3088         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3089         struct btrfs_ordered_extent *ordered;
3090         char *kaddr;
3091         u32 blocksize = root->sectorsize;
3092         pgoff_t index = from >> PAGE_CACHE_SHIFT;
3093         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3094         struct page *page;
3095         int ret = 0;
3096         u64 page_start;
3097         u64 page_end;
3098
3099         if ((offset & (blocksize - 1)) == 0)
3100                 goto out;
3101         ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
3102         if (ret)
3103                 goto out;
3104
3105         ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3106         if (ret)
3107                 goto out;
3108
3109         ret = -ENOMEM;
3110 again:
3111         page = grab_cache_page(mapping, index);
3112         if (!page) {
3113                 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3114                 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3115                 goto out;
3116         }
3117
3118         page_start = page_offset(page);
3119         page_end = page_start + PAGE_CACHE_SIZE - 1;
3120
3121         if (!PageUptodate(page)) {
3122                 ret = btrfs_readpage(NULL, page);
3123                 lock_page(page);
3124                 if (page->mapping != mapping) {
3125                         unlock_page(page);
3126                         page_cache_release(page);
3127                         goto again;
3128                 }
3129                 if (!PageUptodate(page)) {
3130                         ret = -EIO;
3131                         goto out_unlock;
3132                 }
3133         }
3134         wait_on_page_writeback(page);
3135
3136         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3137         set_page_extent_mapped(page);
3138
3139         ordered = btrfs_lookup_ordered_extent(inode, page_start);
3140         if (ordered) {
3141                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3142                 unlock_page(page);
3143                 page_cache_release(page);
3144                 btrfs_start_ordered_extent(inode, ordered, 1);
3145                 btrfs_put_ordered_extent(ordered);
3146                 goto again;
3147         }
3148
3149         clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
3150                           EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3151                           GFP_NOFS);
3152
3153         ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3154         if (ret) {
3155                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3156                 goto out_unlock;
3157         }
3158
3159         ret = 0;
3160         if (offset != PAGE_CACHE_SIZE) {
3161                 kaddr = kmap(page);
3162                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
3163                 flush_dcache_page(page);
3164                 kunmap(page);
3165         }
3166         ClearPageChecked(page);
3167         set_page_dirty(page);
3168         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3169
3170 out_unlock:
3171         if (ret)
3172                 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3173         btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3174         unlock_page(page);
3175         page_cache_release(page);
3176 out:
3177         return ret;
3178 }
3179
3180 int btrfs_cont_expand(struct inode *inode, loff_t size)
3181 {
3182         struct btrfs_trans_handle *trans;
3183         struct btrfs_root *root = BTRFS_I(inode)->root;
3184         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3185         struct extent_map *em;
3186         u64 mask = root->sectorsize - 1;
3187         u64 hole_start = (inode->i_size + mask) & ~mask;
3188         u64 block_end = (size + mask) & ~mask;
3189         u64 last_byte;
3190         u64 cur_offset;
3191         u64 hole_size;
3192         int err = 0;
3193
3194         if (size <= hole_start)
3195                 return 0;
3196
3197         err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
3198         if (err)
3199                 return err;
3200
3201         while (1) {
3202                 struct btrfs_ordered_extent *ordered;
3203                 btrfs_wait_ordered_range(inode, hole_start,
3204                                          block_end - hole_start);
3205                 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
3206                 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3207                 if (!ordered)
3208                         break;
3209                 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
3210                 btrfs_put_ordered_extent(ordered);
3211         }
3212
3213         trans = btrfs_start_transaction(root, 1);
3214         btrfs_set_trans_block_group(trans, inode);
3215
3216         cur_offset = hole_start;
3217         while (1) {
3218                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
3219                                 block_end - cur_offset, 0);
3220                 BUG_ON(IS_ERR(em) || !em);
3221                 last_byte = min(extent_map_end(em), block_end);
3222                 last_byte = (last_byte + mask) & ~mask;
3223                 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
3224                         u64 hint_byte = 0;
3225                         hole_size = last_byte - cur_offset;
3226                         err = btrfs_drop_extents(trans, inode, cur_offset,
3227                                                  cur_offset + hole_size,
3228                                                  &hint_byte, 1);
3229                         if (err)
3230                                 break;
3231
3232                         err = btrfs_reserve_metadata_space(root, 1);
3233                         if (err)
3234                                 break;
3235
3236                         err = btrfs_insert_file_extent(trans, root,
3237                                         inode->i_ino, cur_offset, 0,
3238                                         0, hole_size, 0, hole_size,
3239                                         0, 0, 0);
3240                         btrfs_drop_extent_cache(inode, hole_start,
3241                                         last_byte - 1, 0);
3242                         btrfs_unreserve_metadata_space(root, 1);
3243                 }
3244                 free_extent_map(em);
3245                 cur_offset = last_byte;
3246                 if (err || cur_offset >= block_end)
3247                         break;
3248         }
3249
3250         btrfs_end_transaction(trans, root);
3251         unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
3252         return err;
3253 }
3254
3255 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3256 {
3257         struct inode *inode = dentry->d_inode;
3258         int err;
3259
3260         err = inode_change_ok(inode, attr);
3261         if (err)
3262                 return err;
3263
3264         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3265                 if (attr->ia_size > inode->i_size) {
3266                         err = btrfs_cont_expand(inode, attr->ia_size);
3267                         if (err)
3268                                 return err;
3269                 } else if (inode->i_size > 0 &&
3270                            attr->ia_size == 0) {
3271
3272                         /* we're truncating a file that used to have good
3273                          * data down to zero.  Make sure it gets into
3274                          * the ordered flush list so that any new writes
3275                          * get down to disk quickly.
3276                          */
3277                         BTRFS_I(inode)->ordered_data_close = 1;
3278                 }
3279         }
3280
3281         err = inode_setattr(inode, attr);
3282
3283         if (!err && ((attr->ia_valid & ATTR_MODE)))
3284                 err = btrfs_acl_chmod(inode);
3285         return err;
3286 }
3287
/*
 * Tear down an inode that is going away.
 *
 * The page cache is always dropped and clear_inode() is always called.
 * The on-disk items are only truncated (inside a joined transaction) when
 * the inode is not bad, fs_info->log_root_recovering is not set, and
 * i_nlink is zero.
 */
3288 void btrfs_delete_inode(struct inode *inode)
3289 {
3290         struct btrfs_trans_handle *trans;
3291         struct btrfs_root *root = BTRFS_I(inode)->root;
3292         unsigned long nr;
3293         int ret;
3294
3295         truncate_inode_pages(&inode->i_data, 0);
3296         if (is_bad_inode(inode)) {
                     /* no transaction is open yet, hence the NULL handle */
3297                 btrfs_orphan_del(NULL, inode);
3298                 goto no_delete;
3299         }
3300         btrfs_wait_ordered_range(inode, 0, (u64)-1);
3301
3302         if (root->fs_info->log_root_recovering) {
3303                 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
3304                 goto no_delete;
3305         }
3306
             /*
              * A still-linked inode is only expected here when its root has
              * no references left; anything else trips the BUG_ON.
              */
3307         if (inode->i_nlink > 0) {
3308                 BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3309                 goto no_delete;
3310         }
3311
             /* size is set to 0 first, so the truncate below targets size 0 */
3312         btrfs_i_size_write(inode, 0);
3313         trans = btrfs_join_transaction(root, 1);
3314
3315         btrfs_set_trans_block_group(trans, inode);
3316         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
3317         if (ret) {
3318                 btrfs_orphan_del(NULL, inode);
3319                 goto no_delete_lock;
3320         }
3321
3322         btrfs_orphan_del(trans, inode);
3323
             /* sample blocks_used before the handle is ended */
3324         nr = trans->blocks_used;
3325         clear_inode(inode);
3326
3327         btrfs_end_transaction(trans, root);
3328         btrfs_btree_balance_dirty(root, nr);
3329         return;
3330
     /* failure with a transaction open: end it, then fall into no_delete */
3331 no_delete_lock:
3332         nr = trans->blocks_used;
3333         btrfs_end_transaction(trans, root);
3334         btrfs_btree_balance_dirty(root, nr);
3335 no_delete:
3336         clear_inode(inode);
3337 }
3338
3339 /*
3340  * this returns the key found in the dir entry in the location pointer.
3341  * If no dir entries were found, location->objectid is 0.
3342  */
3343 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3344                                struct btrfs_key *location)
3345 {
3346         const char *name = dentry->d_name.name;
3347         int namelen = dentry->d_name.len;
3348         struct btrfs_dir_item *di;
3349         struct btrfs_path *path;
3350         struct btrfs_root *root = BTRFS_I(dir)->root;
3351         int ret = 0;
3352
3353         path = btrfs_alloc_path();
3354         BUG_ON(!path);
3355
3356         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
3357                                     namelen, 0);
3358         if (IS_ERR(di))
3359                 ret = PTR_ERR(di);
3360
3361         if (!di || IS_ERR(di))
3362                 goto out_err;
3363
3364         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
3365 out:
3366         btrfs_free_path(path);
3367         return ret;
3368 out_err:
3369         location->objectid = 0;
3370         goto out;
3371 }
3372
3373 /*
3374  * when we hit a tree root in a directory, the btrfs part of the inode
3375  * needs to be changed to reflect the root directory of the tree root.  This
3376  * is kind of like crossing a mount point.
3377  */
3378 static int fixup_tree_root_location(struct btrfs_root *root,
3379                                     struct inode *dir,
3380                                     struct dentry *dentry,
3381                                     struct btrfs_key *location,
3382                                     struct btrfs_root **sub_root)
3383 {
3384         struct btrfs_path *path;
3385         struct btrfs_root *new_root;
3386         struct btrfs_root_ref *ref;
3387         struct extent_buffer *leaf;
3388         int ret;
3389         int err = 0;
3390
3391         path = btrfs_alloc_path();
3392         if (!path) {
3393                 err = -ENOMEM;
3394                 goto out;
3395         }
3396
3397         err = -ENOENT;
3398         ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
3399                                   BTRFS_I(dir)->root->root_key.objectid,
3400                                   location->objectid);
3401         if (ret) {
3402                 if (ret < 0)
3403                         err = ret;
3404                 goto out;
3405         }
3406
3407         leaf = path->nodes[0];
3408         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3409         if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
3410             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3411                 goto out;
3412
3413         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
3414                                    (unsigned long)(ref + 1),
3415                                    dentry->d_name.len);
3416         if (ret)
3417                 goto out;
3418
3419         btrfs_release_path(root->fs_info->tree_root, path);
3420
3421         new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3422         if (IS_ERR(new_root)) {
3423                 err = PTR_ERR(new_root);
3424                 goto out;
3425         }
3426
3427         if (btrfs_root_refs(&new_root->root_item) == 0) {
3428                 err = -ENOENT;
3429                 goto out;
3430         }
3431
3432         *sub_root = new_root;
3433         location->objectid = btrfs_root_dirid(&new_root->root_item);
3434         location->type = BTRFS_INODE_ITEM_KEY;
3435         location->offset = 0;
3436         err = 0;
3437 out:
3438         btrfs_free_path(path);
3439         return err;
3440 }
3441
/*
 * Link the inode into root->inode_tree, an rbtree ordered by i_ino, while
 * holding root->inode_lock.
 *
 * Unhashed inodes are not added.  If an entry with the same i_ino is
 * already in the tree it is erased (with a warning unless it is in the
 * I_WILL_FREE, I_FREEING or I_CLEAR state) and the insertion starts over.
 */
3442 static void inode_tree_add(struct inode *inode)
3443 {
3444         struct btrfs_root *root = BTRFS_I(inode)->root;
3445         struct btrfs_inode *entry;
3446         struct rb_node **p;
3447         struct rb_node *parent;
3448 again:
             /* restart the descent from the tree root on every attempt */
3449         p = &root->inode_tree.rb_node;
3450         parent = NULL;
3451
3452         if (hlist_unhashed(&inode->i_hash))
3453                 return;
3454
3455         spin_lock(&root->inode_lock);
3456         while (*p) {
3457                 parent = *p;
3458                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3459
3460                 if (inode->i_ino < entry->vfs_inode.i_ino)
3461                         p = &parent->rb_left;
3462                 else if (inode->i_ino > entry->vfs_inode.i_ino)
3463                         p = &parent->rb_right;
3464                 else {
                             /*
                              * Same inode number already present: unlink the
                              * old node, drop the lock and try again.
                              */
3465                         WARN_ON(!(entry->vfs_inode.i_state &
3466                                   (I_WILL_FREE | I_FREEING | I_CLEAR)));
3467                         rb_erase(parent, &root->inode_tree);
3468                         RB_CLEAR_NODE(parent);
3469                         spin_unlock(&root->inode_lock);
3470                         goto again;
3471                 }
3472         }
3473         rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3474         rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3475         spin_unlock(&root->inode_lock);
3476 }
3477
/*
 * Unlink the inode from root->inode_tree if it is currently linked.
 *
 * When that removal leaves the tree empty and the root has zero refs, wait
 * for subvol_srcu readers, re-check emptiness under the lock, and if the
 * tree is still empty pass the root to btrfs_add_dead_root().
 */
3478 static void inode_tree_del(struct inode *inode)
3479 {
3480         struct btrfs_root *root = BTRFS_I(inode)->root;
3481         int empty = 0;
3482
3483         spin_lock(&root->inode_lock);
3484         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3485                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3486                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3487                 empty = RB_EMPTY_ROOT(&root->inode_tree);
3488         }
3489         spin_unlock(&root->inode_lock);
3490
3491         if (empty && btrfs_root_refs(&root->root_item) == 0) {
                     /*
                      * synchronize_srcu() runs with the spinlock released, so
                      * the tree may have gained entries meanwhile; hence the
                      * second emptiness check.
                      */
3492                 synchronize_srcu(&root->fs_info->subvol_srcu);
3493                 spin_lock(&root->inode_lock);
3494                 empty = RB_EMPTY_ROOT(&root->inode_tree);
3495                 spin_unlock(&root->inode_lock);
3496                 if (empty)
3497                         btrfs_add_dead_root(root);
3498         }
3499 }
3500
/*
 * Walk root->inode_tree in increasing i_ino order and, for every inode
 * that igrab() can pin, prune its dentry aliases (when someone else also
 * holds a reference) and drop the reference again.
 *
 * A warning is emitted if the root still has references.  Always returns 0.
 */
3501 int btrfs_invalidate_inodes(struct btrfs_root *root)
3502 {
3503         struct rb_node *node;
3504         struct rb_node *prev;
3505         struct btrfs_inode *entry;
3506         struct inode *inode;
             /* lowest inode number still to visit; lets the walk restart */
3507         u64 objectid = 0;
3508
3509         WARN_ON(btrfs_root_refs(&root->root_item) != 0);
3510
3511         spin_lock(&root->inode_lock);
3512 again:
             /* descend looking for objectid, remembering the last node seen */
3513         node = root->inode_tree.rb_node;
3514         prev = NULL;
3515         while (node) {
3516                 prev = node;
3517                 entry = rb_entry(node, struct btrfs_inode, rb_node);
3518
3519                 if (objectid < entry->vfs_inode.i_ino)
3520                         node = node->rb_left;
3521                 else if (objectid > entry->vfs_inode.i_ino)
3522                         node = node->rb_right;
3523                 else
3524                         break;
3525         }
             /*
              * No exact match: step forward from the last node seen to the
              * first one whose i_ino is >= objectid.
              */
3526         if (!node) {
3527                 while (prev) {
3528                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
3529                         if (objectid <= entry->vfs_inode.i_ino) {
3530                                 node = prev;
3531                                 break;
3532                         }
3533                         prev = rb_next(prev);
3534                 }
3535         }
3536         while (node) {
3537                 entry = rb_entry(node, struct btrfs_inode, rb_node);
                     /* advance first so a restart resumes after this inode */
3538                 objectid = entry->vfs_inode.i_ino + 1;
3539                 inode = igrab(&entry->vfs_inode);
3540                 if (inode) {
                             /*
                              * The lock is released around the dentry/iput
                              * work, so the search has to be redone afterwards.
                              */
3541                         spin_unlock(&root->inode_lock);
3542                         if (atomic_read(&inode->i_count) > 1)
3543                                 d_prune_aliases(inode);
3544                         /*
3545                          * btrfs_drop_inode will remove it from
3546                          * the inode cache when its usage count
3547                          * hits zero.
3548                          */
3549                         iput(inode);
3550                         cond_resched();
3551                         spin_lock(&root->inode_lock);
3552                         goto again;
3553                 }
3554
                     /* a non-zero return sends the walk back to the search */
3555                 if (cond_resched_lock(&root->inode_lock))
3556                         goto again;
3557
3558                 node = rb_next(node);
3559         }
3560         spin_unlock(&root->inode_lock);
3561         return 0;
3562 }
3563
3564 static noinline void init_btrfs_i(struct inode *inode)
3565 {
3566         struct btrfs_inode *bi = BTRFS_I(inode);
3567
3568         bi->generation = 0;
3569         bi->sequence = 0;
3570         bi->last_trans = 0;
3571         bi->last_sub_trans = 0;
3572         bi->logged_trans = 0;
3573         bi->delalloc_bytes = 0;
3574         bi->reserved_bytes = 0;
3575         bi->disk_i_size = 0;
3576         bi->flags = 0;
3577         bi->index_cnt = (u64)-1;
3578         bi->last_unlink_trans = 0;
3579         bi->ordered_data_close = 0;
3580         extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3581         extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3582                              inode->i_mapping, GFP_NOFS);
3583         extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3584                              inode->i_mapping, GFP_NOFS);
3585         INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3586         INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3587         RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3588         btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3589         mutex_init(&BTRFS_I(inode)->log_mutex);
3590 }
3591
3592 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3593 {
3594         struct btrfs_iget_args *args = p;
3595         inode->i_ino = args->ino;
3596         init_btrfs_i(inode);
3597         BTRFS_I(inode)->root = args->root;
3598         btrfs_set_inode_space_info(args->root, inode);
3599         return 0;
3600 }
3601
3602 static int btrfs_find_actor(struct inode *inode, void *opaque)
3603 {
3604         struct btrfs_iget_args *args = opaque;
3605         return args->ino == inode->i_ino &&
3606                 args->root == BTRFS_I(inode)->root;
3607 }
3608
3609 static struct inode *btrfs_iget_locked(struct super_block *s,
3610                                        u64 objectid,
3611                                        struct btrfs_root *root)
3612 {
3613         struct inode *inode;
3614         struct btrfs_iget_args args;
3615         args.ino = objectid;
3616         args.root = root;
3617
3618         inode = iget5_locked(s, objectid, btrfs_find_actor,
3619                              btrfs_init_locked_inode,
3620                              (void *)&args);
3621         return inode;
3622 }
3623
3624 /* Get an inode object given its location and corresponding root.
3625  *
      * Only location->objectid is used for the lookup.  When the inode comes
      * back with I_NEW set, its root and location are filled in, it is read
      * with btrfs_read_locked_inode(), added to the root's inode tree and
      * unlocked.  There is no out-parameter reporting whether that happened.
      *
      * Returns the inode, or ERR_PTR(-ENOMEM) when none could be obtained.
3626  */
3627 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3628                          struct btrfs_root *root)
3629 {
3630         struct inode *inode;
3631
3632         inode = btrfs_iget_locked(s, location->objectid, root);
3633         if (!inode)
3634                 return ERR_PTR(-ENOMEM);
3635
3636         if (inode->i_state & I_NEW) {
3637                 BTRFS_I(inode)->root = root;
3638                 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3639                 btrfs_read_locked_inode(inode);
3640
3641                 inode_tree_add(inode);
3642                 unlock_new_inode(inode);
3643         }
3644
3645         return inode;
3646 }
3647
3648 static struct inode *new_simple_dir(struct super_block *s,
3649                                     struct btrfs_key *key,
3650                                     struct btrfs_root *root)
3651 {
3652         struct inode *inode = new_inode(s);
3653
3654         if (!inode)
3655                 return ERR_PTR(-ENOMEM);
3656
3657         init_btrfs_i(inode);
3658
3659         BTRFS_I(inode)->root = root;
3660         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3661         BTRFS_I(inode)->dummy_inode = 1;
3662
3663         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
3664         inode->i_op = &simple_dir_inode_operations;
3665         inode->i_fop = &simple_dir_operations;
3666         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
3667         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3668
3669         return inode;
3670 }
3671
3672 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3673 {
3674         struct inode *inode;
3675         struct btrfs_root *root = BTRFS_I(dir)->root;
3676         struct btrfs_root *sub_root = root;
3677         struct btrfs_key location;
3678         int index;
3679         int ret;
3680
3681         dentry->d_op = &btrfs_dentry_operations;
3682
3683         if (dentry->d_name.len > BTRFS_NAME_LEN)
3684                 return ERR_PTR(-ENAMETOOLONG);
3685
3686         ret = btrfs_inode_by_name(dir, dentry, &location);
3687
3688         if (ret < 0)
3689                 return ERR_PTR(ret);
3690
3691         if (location.objectid == 0)
3692                 return NULL;
3693
3694         if (location.type == BTRFS_INODE_ITEM_KEY) {
3695                 inode = btrfs_iget(dir->i_sb, &location, root);
3696                 return inode;
3697         }
3698
3699         BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
3700
3701         index = srcu_read_lock(&root->fs_info->subvol_srcu);
3702         ret = fixup_tree_root_location(root, dir, dentry,
3703                                        &location, &sub_root);
3704         if (ret < 0) {
3705                 if (ret != -ENOENT)
3706                         inode = ERR_PTR(ret);
3707                 else
3708                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
3709         } else {
3710                 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3711         }
3712         srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3713
3714         if (root != sub_root) {
3715                 down_read(&root->fs_info->cleanup_work_sem);
3716                 if (!(inode->i_sb->s_flags & MS_RDONLY))
3717                         btrfs_orphan_cleanup(sub_root);
3718                 up_read(&root->fs_info->cleanup_work_sem);
3719         }
3720
3721         return inode;
3722 }
3723
3724 static int btrfs_dentry_delete(struct dentry *dentry)
3725 {
3726         struct btrfs_root *root;
3727
3728         if (!dentry->d_inode && !IS_ROOT(dentry))
3729                 dentry = dentry->d_parent;
3730
3731         if (dentry->d_inode) {
3732                 root = BTRFS_I(dentry->d_inode)->root;
3733                 if (btrfs_root_refs(&root->root_item) == 0)
3734                         return 1;
3735         }
3736         return 0;
3737 }
3738
/*
 * Lookup entry point: resolve the name with btrfs_lookup_dentry() and
 * attach the result to the dentry via d_splice_alias().  Errors from the
 * lookup are passed through as ERR_PTR() values; 'nd' is not used.
 */
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
				   struct nameidata *nd)
{
	struct inode *found = btrfs_lookup_dentry(dir, dentry);

	if (!IS_ERR(found))
		return d_splice_alias(found, dentry);

	return ERR_CAST(found);
}
3750
3751 static unsigned char btrfs_filetype_table[] = {
3752         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3753 };
3754
3755 static int btrfs_real_readdir(struct file *filp, void *dirent,
3756                               filldir_t filldir)
3757 {
3758         struct inode *inode = filp->f_dentry->d_inode;
3759         struct btrfs_root *root = BTRFS_I(inode)->root;
3760         struct btrfs_item *item;
3761         struct btrfs_dir_item *di;
3762         struct btrfs_key key;
3763         struct btrfs_key found_key;
3764         struct btrfs_path *path;
3765         int ret;
3766         u32 nritems;
3767         struct extent_buffer *leaf;
3768         int slot;
3769         int advance;
3770         unsigned char d_type;
3771         int over = 0;
3772         u32 di_cur;
3773         u32 di_total;
3774         u32 di_len;
3775         int key_type = BTRFS_DIR_INDEX_KEY;
3776         char tmp_name[32];
3777         char *name_ptr;
3778         int name_len;
3779
3780         /* FIXME, use a real flag for deciding about the key type */
3781         if (root->fs_info->tree_root == root)
3782                 key_type = BTRFS_DIR_ITEM_KEY;
3783
3784         /* special case for "." */
3785         if (filp->f_pos == 0) {
3786                 over = filldir(dirent, ".", 1,
3787                                1, inode->i_ino,
3788                                DT_DIR);
3789                 if (over)
3790                         return 0;
3791                 filp->f_pos = 1;
3792         }
3793         /* special case for .., just use the back ref */
3794         if (filp->f_pos == 1) {
3795                 u64 pino = parent_ino(filp->f_path.dentry);
3796                 over = filldir(dirent, "..", 2,
3797                                2, pino, DT_DIR);
3798                 if (over)
3799                         return 0;
3800                 filp->f_pos = 2;
3801         }
3802         path = btrfs_alloc_path();
3803         path->reada = 2;
3804
3805         btrfs_set_key_type(&key, key_type);
3806         key.offset = filp->f_pos;
3807         key.objectid = inode->i_ino;
3808
3809         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3810         if (ret < 0)
3811                 goto err;
3812         advance = 0;
3813
3814         while (1) {
3815                 leaf = path->nodes[0];
3816                 nritems = btrfs_header_nritems(leaf);
3817                 slot = path->slots[0];
3818                 if (advance || slot >= nritems) {
3819                         if (slot >= nritems - 1) {
3820                                 ret = btrfs_next_leaf(root, path);
3821                                 if (ret)
3822                                         break;
3823                                 leaf = path->nodes[0];
3824                                 nritems = btrfs_header_nritems(leaf);
3825                                 slot = path->slots[0];
3826                         } else {
3827                                 slot++;
3828                                 path->slots[0]++;
3829                         }
3830                 }
3831
3832                 advance = 1;
3833                 item = btrfs_item_nr(leaf, slot);
3834                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3835
3836                 if (found_key.objectid != key.objectid)
3837                         break;
3838                 if (btrfs_key_type(&found_key) != key_type)
3839                         break;
3840                 if (found_key.offset < filp->f_pos)
3841                         continue;
3842
3843                 filp->f_pos = found_key.offset;
3844
3845                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3846                 di_cur = 0;
3847                 di_total = btrfs_item_size(leaf, item);
3848
3849                 while (di_cur < di_total) {
3850                         struct btrfs_key location;
3851
3852                         name_len = btrfs_dir_name_len(leaf, di);
3853                         if (name_len <= sizeof(tmp_name)) {
3854                                 name_ptr = tmp_name;
3855                         } else {
3856                                 name_ptr = kmalloc(name_len, GFP_NOFS);
3857                                 if (!name_ptr) {
3858                                         ret = -ENOMEM;
3859                                         goto err;
3860                                 }
3861                         }
3862                         read_extent_buffer(leaf, name_ptr,
3863                                            (unsigned long)(di + 1), name_len);
3864
3865                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3866                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
3867
3868                         /* is this a reference to our own snapshot? If so
3869                          * skip it
3870                          */
3871                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
3872                             location.objectid == root->root_key.objectid) {
3873                                 over = 0;
3874                                 goto skip;
3875                         }
3876                         over = filldir(dirent, name_ptr, name_len,
3877                                        found_key.offset, location.objectid,
3878                                        d_type);
3879
3880 skip:
3881                         if (name_ptr != tmp_name)
3882                                 kfree(name_ptr);
3883
3884                         if (over)
3885                                 goto nopos;
3886                         di_len = btrfs_dir_name_len(leaf, di) +
3887                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
3888                         di_cur += di_len;
3889                         di = (struct btrfs_dir_item *)((char *)di + di_len);
3890                 }
3891         }
3892
3893         /* Reached end of directory/root. Bump pos past the last item. */
3894         if (key_type == BTRFS_DIR_INDEX_KEY)
3895                 filp->f_pos = INT_LIMIT(off_t);
3896         else
3897                 filp->f_pos++;
3898 nopos:
3899         ret = 0;
3900 err:
3901         btrfs_free_path(path);
3902         return ret;
3903 }
3904
3905 int btrfs_write_inode(struct inode *inode, int wait)
3906 {
3907         struct btrfs_root *root = BTRFS_I(inode)->root;
3908         struct btrfs_trans_handle *trans;
3909         int ret = 0;
3910
3911         if (root->fs_info->btree_inode == inode)
3912                 return 0;
3913
3914         if (wait) {
3915                 trans = btrfs_join_transaction(root, 1);
3916                 btrfs_set_trans_block_group(trans, inode);
3917                 ret = btrfs_commit_transaction(trans, root);
3918         }
3919         return ret;
3920 }
3921
3922 /*
3923  * This is somewhat expensive, updating the tree every time the
3924  * inode changes.  But, it is most likely to find the inode in cache.
3925  * FIXME, needs more benchmarking...there are no reasons other than performance
3926  * to keep or drop this code.
3927  */
3928 void btrfs_dirty_inode(struct inode *inode)
3929 {
3930         struct btrfs_root *root = BTRFS_I(inode)->root;
3931         struct btrfs_trans_handle *trans;
3932
3933         trans = btrfs_join_transaction(root, 1);
3934         btrfs_set_trans_block_group(trans, inode);
3935         btrfs_update_inode(trans, root, inode);
3936         btrfs_end_transaction(trans, root);
3937 }
3938
3939 /*
3940  * find the highest existing sequence number in a directory
3941  * and then set the in-memory index_cnt variable to reflect
3942  * free sequence numbers
3943  */
3944 static int btrfs_set_inode_index_count(struct inode *inode)
3945 {
3946         struct btrfs_root *root = BTRFS_I(inode)->root;
3947         struct btrfs_key key, found_key;
3948         struct btrfs_path *path;
3949         struct extent_buffer *leaf;
3950         int ret;
3951
3952         key.objectid = inode->i_ino;
3953         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3954         key.offset = (u64)-1;
3955
3956         path = btrfs_alloc_path();
3957         if (!path)
3958                 return -ENOMEM;
3959
3960         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3961         if (ret < 0)
3962                 goto out;
3963         /* FIXME: we should be able to handle this */
3964         if (ret == 0)
3965                 goto out;
3966         ret = 0;
3967
3968         /*
3969          * MAGIC NUMBER EXPLANATION:
3970          * since we search a directory based on f_pos we have to start at 2
3971          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3972          * else has to start at 2
3973          */
3974         if (path->slots[0] == 0) {
3975                 BTRFS_I(inode)->index_cnt = 2;
3976                 goto out;
3977         }
3978
3979         path->slots[0]--;
3980
3981         leaf = path->nodes[0];
3982         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983
3984         if (found_key.objectid != inode->i_ino ||
3985             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3986                 BTRFS_I(inode)->index_cnt = 2;
3987                 goto out;
3988         }
3989
3990         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3991 out:
3992         btrfs_free_path(path);
3993         return ret;
3994 }
3995
3996 /*
3997  * helper to find a free sequence number in a given directory.  This current
3998  * code is very simple, later versions will do smarter things in the btree
3999  */
4000 int btrfs_set_inode_index(struct inode *dir, u64 *index)
4001 {
4002         int ret = 0;
4003
4004         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
4005                 ret = btrfs_set_inode_index_count(dir);
4006                 if (ret)
4007                         return ret;
4008         }
4009
4010         *index = BTRFS_I(dir)->index_cnt;
4011         BTRFS_I(dir)->index_cnt++;
4012
4013         return ret;
4014 }
4015
4016 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4017                                      struct btrfs_root *root,
4018                                      struct inode *dir,
4019                                      const char *name, int name_len,
4020                                      u64 ref_objectid, u64 objectid,
4021                                      u64 alloc_hint, int mode, u64 *index)
4022 {
4023         struct inode *inode;
4024         struct btrfs_inode_item *inode_item;
4025         struct btrfs_key *location;
4026         struct btrfs_path *path;
4027         struct btrfs_inode_ref *ref;
4028         struct btrfs_key key[2];
4029         u32 sizes[2];
4030         unsigned long ptr;
4031         int ret;
4032         int owner;
4033
4034         path = btrfs_alloc_path();
4035         BUG_ON(!path);
4036
4037         inode = new_inode(root->fs_info->sb);
4038         if (!inode)
4039                 return ERR_PTR(-ENOMEM);
4040
4041         if (dir) {
4042                 ret = btrfs_set_inode_index(dir, index);
4043                 if (ret) {
4044                         iput(inode);
4045                         return ERR_PTR(ret);
4046                 }
4047         }
4048         /*
4049          * index_cnt is ignored for everything but a dir,
4050          * btrfs_get_inode_index_count has an explanation for the magic
4051          * number
4052          */
4053         init_btrfs_i(inode);
4054         BTRFS_I(inode)->index_cnt = 2;
4055         BTRFS_I(inode)->root = root;
4056         BTRFS_I(inode)->generation = trans->transid;
4057         btrfs_set_inode_space_info(root, inode);
4058
4059         if (mode & S_IFDIR)
4060                 owner = 0;
4061         else
4062                 owner = 1;
4063         BTRFS_I(inode)->block_group =
4064                         btrfs_find_block_group(root, 0, alloc_hint, owner);
4065
4066         key[0].objectid = objectid;
4067         btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4068         key[0].offset = 0;
4069
4070         key[1].objectid = objectid;
4071         btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4072         key[1].offset = ref_objectid;
4073
4074         sizes[0] = sizeof(struct btrfs_inode_item);
4075         sizes[1] = name_len + sizeof(*ref);
4076
4077         path->leave_spinning = 1;
4078         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
4079         if (ret != 0)
4080                 goto fail;
4081
4082         inode->i_uid = current_fsuid();
4083
4084         if (dir && (dir->i_mode & S_ISGID)) {
4085                 inode->i_gid = dir->i_gid;
4086                 if (S_ISDIR(mode))
4087                         mode |= S_ISGID;
4088         } else
4089                 inode->i_gid = current_fsgid();
4090
4091         inode->i_mode = mode;
4092         inode->i_ino = objectid;
4093         inode_set_bytes(inode, 0);
4094         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4095         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4096                                   struct btrfs_inode_item);
4097         fill_inode_item(trans, path->nodes[0], inode_item, inode);
4098
4099         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
4100                              struct btrfs_inode_ref);
4101         btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
4102         btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
4103         ptr = (unsigned long)(ref + 1);
4104         write_extent_buffer(path->nodes[0], name, ptr, name_len);
4105
4106         btrfs_mark_buffer_dirty(path->nodes[0]);
4107         btrfs_free_path(path);
4108
4109         location = &BTRFS_I(inode)->location;
4110         location->objectid = objectid;
4111         location->offset = 0;
4112         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
4113
4114         btrfs_inherit_iflags(inode, dir);
4115
4116         if ((mode & S_IFREG)) {
4117                 if (btrfs_test_opt(root, NODATASUM))
4118                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4119                 if (btrfs_test_opt(root, NODATACOW))
4120                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4121         }
4122
4123         insert_inode_hash(inode);
4124         inode_tree_add(inode);
4125         return inode;
4126 fail:
4127         if (dir)
4128                 BTRFS_I(dir)->index_cnt--;
4129         btrfs_free_path(path);
4130         iput(inode);
4131         return ERR_PTR(ret);
4132 }
4133
4134 static inline u8 btrfs_inode_type(struct inode *inode)
4135 {
4136         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
4137 }
4138
4139 /*
4140  * utility function to add 'inode' into 'parent_inode' with
4141  * a give name and a given sequence number.
4142  * if 'add_backref' is true, also insert a backref from the
4143  * inode to the parent directory.
4144  */
4145 int btrfs_add_link(struct btrfs_trans_handle *trans,
4146                    struct inode *parent_inode, struct inode *inode,
4147                    const char *name, int name_len, int add_backref, u64 index)
4148 {
4149         int ret = 0;
4150         struct btrfs_key key;
4151         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
4152
4153         if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4154                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
4155         } else {
4156                 key.objectid = inode->i_ino;
4157                 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4158                 key.offset = 0;
4159         }
4160
4161         if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4162                 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4163                                          key.objectid, root->root_key.objectid,
4164                                          parent_inode->i_ino,
4165                                          index, name, name_len);
4166         } else if (add_backref) {
4167                 ret = btrfs_insert_inode_ref(trans, root,
4168                                              name, name_len, inode->i_ino,
4169                                              parent_inode->i_ino, index);
4170         }
4171
4172         if (ret == 0) {
4173                 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4174                                             parent_inode->i_ino, &key,
4175                                             btrfs_inode_type(inode), index);
4176                 BUG_ON(ret);
4177
4178                 btrfs_i_size_write(parent_inode, parent_inode->i_size +
4179                                    name_len * 2);
4180                 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4181                 ret = btrfs_update_inode(trans, root, parent_inode);
4182         }
4183         return ret;
4184 }
4185
4186 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4187                             struct dentry *dentry, struct inode *inode,
4188                             int backref, u64 index)
4189 {
4190         int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
4191                                  inode, dentry->d_name.name,
4192                                  dentry->d_name.len, backref, index);
4193         if (!err) {
4194                 d_instantiate(dentry, inode);
4195                 return 0;
4196         }
4197         if (err > 0)
4198                 err = -EEXIST;
4199         return err;
4200 }
4201
4202 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4203                         int mode, dev_t rdev)
4204 {
4205         struct btrfs_trans_handle *trans;
4206         struct btrfs_root *root = BTRFS_I(dir)->root;
4207         struct inode *inode = NULL;
4208         int err;
4209         int drop_inode = 0;
4210         u64 objectid;
4211         unsigned long nr = 0;
4212         u64 index = 0;
4213
4214         if (!new_valid_dev(rdev))
4215                 return -EINVAL;
4216
4217         /*
4218          * 2 for inode item and ref
4219          * 2 for dir items
4220          * 1 for xattr if selinux is on
4221          */
4222         err = btrfs_reserve_metadata_space(root, 5);
4223         if (err)
4224                 return err;
4225
4226         trans = btrfs_start_transaction(root, 1);
4227         if (!trans)
4228                 goto fail;
4229         btrfs_set_trans_block_group(trans, dir);
4230
4231         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4232         if (err) {
4233                 err = -ENOSPC;
4234                 goto out_unlock;
4235         }
4236
4237         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4238                                 dentry->d_name.len,
4239                                 dentry->d_parent->d_inode->i_ino, objectid,
4240                                 BTRFS_I(dir)->block_group, mode, &index);
4241         err = PTR_ERR(inode);
4242         if (IS_ERR(inode))
4243                 goto out_unlock;
4244
4245         err = btrfs_init_inode_security(inode, dir);
4246         if (err) {
4247                 drop_inode = 1;
4248                 goto out_unlock;
4249         }
4250
4251         btrfs_set_trans_block_group(trans, inode);
4252         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4253         if (err)
4254                 drop_inode = 1;
4255         else {
4256                 inode->i_op = &btrfs_special_inode_operations;
4257                 init_special_inode(inode, inode->i_mode, rdev);
4258                 btrfs_update_inode(trans, root, inode);
4259         }
4260         btrfs_update_inode_block_group(trans, inode);
4261         btrfs_update_inode_block_group(trans, dir);
4262 out_unlock:
4263         nr = trans->blocks_used;
4264         btrfs_end_transaction_throttle(trans, root);
4265 fail:
4266         btrfs_unreserve_metadata_space(root, 5);
4267         if (drop_inode) {
4268                 inode_dec_link_count(inode);
4269                 iput(inode);
4270         }
4271         btrfs_btree_balance_dirty(root, nr);
4272         return err;
4273 }
4274
4275 static int btrfs_create(struct inode *dir, struct dentry *dentry,
4276                         int mode, struct nameidata *nd)
4277 {
4278         struct btrfs_trans_handle *trans;
4279         struct btrfs_root *root = BTRFS_I(dir)->root;
4280         struct inode *inode = NULL;
4281         int err;
4282         int drop_inode = 0;
4283         unsigned long nr = 0;
4284         u64 objectid;
4285         u64 index = 0;
4286
4287         /*
4288          * 2 for inode item and ref
4289          * 2 for dir items
4290          * 1 for xattr if selinux is on
4291          */
4292         err = btrfs_reserve_metadata_space(root, 5);
4293         if (err)
4294                 return err;
4295
4296         trans = btrfs_start_transaction(root, 1);
4297         if (!trans)
4298                 goto fail;
4299         btrfs_set_trans_block_group(trans, dir);
4300
4301         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4302         if (err) {
4303                 err = -ENOSPC;
4304                 goto out_unlock;
4305         }
4306
4307         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4308                                 dentry->d_name.len,
4309                                 dentry->d_parent->d_inode->i_ino,
4310                                 objectid, BTRFS_I(dir)->block_group, mode,
4311                                 &index);
4312         err = PTR_ERR(inode);
4313         if (IS_ERR(inode))
4314                 goto out_unlock;
4315
4316         err = btrfs_init_inode_security(inode, dir);
4317         if (err) {
4318                 drop_inode = 1;
4319                 goto out_unlock;
4320         }
4321
4322         btrfs_set_trans_block_group(trans, inode);
4323         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4324         if (err)
4325                 drop_inode = 1;
4326         else {
4327                 inode->i_mapping->a_ops = &btrfs_aops;
4328                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4329                 inode->i_fop = &btrfs_file_operations;
4330                 inode->i_op = &btrfs_file_inode_operations;
4331                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4332         }
4333         btrfs_update_inode_block_group(trans, inode);
4334         btrfs_update_inode_block_group(trans, dir);
4335 out_unlock:
4336         nr = trans->blocks_used;
4337         btrfs_end_transaction_throttle(trans, root);
4338 fail:
4339         btrfs_unreserve_metadata_space(root, 5);
4340         if (drop_inode) {
4341                 inode_dec_link_count(inode);
4342                 iput(inode);
4343         }
4344         btrfs_btree_balance_dirty(root, nr);
4345         return err;
4346 }
4347
4348 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4349                       struct dentry *dentry)
4350 {
4351         struct btrfs_trans_handle *trans;
4352         struct btrfs_root *root = BTRFS_I(dir)->root;
4353         struct inode *inode = old_dentry->d_inode;
4354         u64 index;
4355         unsigned long nr = 0;
4356         int err;
4357         int drop_inode = 0;
4358
4359         if (inode->i_nlink == 0)
4360                 return -ENOENT;
4361
4362         /*
4363          * 1 item for inode ref
4364          * 2 items for dir items
4365          */
4366         err = btrfs_reserve_metadata_space(root, 3);
4367         if (err)
4368                 return err;
4369
4370         btrfs_inc_nlink(inode);
4371
4372         err = btrfs_set_inode_index(dir, &index);
4373         if (err)
4374                 goto fail;
4375
4376         trans = btrfs_start_transaction(root, 1);
4377
4378         btrfs_set_trans_block_group(trans, dir);
4379         atomic_inc(&inode->i_count);
4380
4381         err = btrfs_add_nondir(trans, dentry, inode, 1, index);
4382
4383         if (err) {
4384                 drop_inode = 1;
4385         } else {
4386                 btrfs_update_inode_block_group(trans, dir);
4387                 err = btrfs_update_inode(trans, root, inode);
4388                 BUG_ON(err);
4389                 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
4390         }
4391
4392         nr = trans->blocks_used;
4393         btrfs_end_transaction_throttle(trans, root);
4394 fail:
4395         btrfs_unreserve_metadata_space(root, 3);
4396         if (drop_inode) {
4397                 inode_dec_link_count(inode);
4398                 iput(inode);
4399         }
4400         btrfs_btree_balance_dirty(root, nr);
4401         return err;
4402 }
4403
4404 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4405 {
4406         struct inode *inode = NULL;
4407         struct btrfs_trans_handle *trans;
4408         struct btrfs_root *root = BTRFS_I(dir)->root;
4409         int err = 0;
4410         int drop_on_err = 0;
4411         u64 objectid = 0;
4412         u64 index = 0;
4413         unsigned long nr = 1;
4414
4415         /*
4416          * 2 items for inode and ref
4417          * 2 items for dir items
4418          * 1 for xattr if selinux is on
4419          */
4420         err = btrfs_reserve_metadata_space(root, 5);
4421         if (err)
4422                 return err;
4423
4424         trans = btrfs_start_transaction(root, 1);
4425         if (!trans) {
4426                 err = -ENOMEM;
4427                 goto out_unlock;
4428         }
4429         btrfs_set_trans_block_group(trans, dir);
4430
4431         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4432         if (err) {
4433                 err = -ENOSPC;
4434                 goto out_unlock;
4435         }
4436
4437         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4438                                 dentry->d_name.len,
4439                                 dentry->d_parent->d_inode->i_ino, objectid,
4440                                 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4441                                 &index);
4442         if (IS_ERR(inode)) {
4443                 err = PTR_ERR(inode);
4444                 goto out_fail;
4445         }
4446
4447         drop_on_err = 1;
4448
4449         err = btrfs_init_inode_security(inode, dir);
4450         if (err)
4451                 goto out_fail;
4452
4453         inode->i_op = &btrfs_dir_inode_operations;
4454         inode->i_fop = &btrfs_dir_file_operations;
4455         btrfs_set_trans_block_group(trans, inode);
4456
4457         btrfs_i_size_write(inode, 0);
4458         err = btrfs_update_inode(trans, root, inode);
4459         if (err)
4460                 goto out_fail;
4461
4462         err = btrfs_add_link(trans, dentry->d_parent->d_inode,
4463                                  inode, dentry->d_name.name,
4464                                  dentry->d_name.len, 0, index);
4465         if (err)
4466                 goto out_fail;
4467
4468         d_instantiate(dentry, inode);
4469         drop_on_err = 0;
4470         btrfs_update_inode_block_group(trans, inode);
4471         btrfs_update_inode_block_group(trans, dir);
4472
4473 out_fail:
4474         nr = trans->blocks_used;
4475         btrfs_end_transaction_throttle(trans, root);
4476
4477 out_unlock:
4478         btrfs_unreserve_metadata_space(root, 5);
4479         if (drop_on_err)
4480                 iput(inode);
4481         btrfs_btree_balance_dirty(root, nr);
4482         return err;
4483 }
4484
4485 /* helper for btfs_get_extent.  Given an existing extent in the tree,
4486  * and an extent that you want to insert, deal with overlap and insert
4487  * the new extent into the tree.
4488  */
4489 static int merge_extent_mapping(struct extent_map_tree *em_tree,
4490                                 struct extent_map *existing,
4491                                 struct extent_map *em,
4492                                 u64 map_start, u64 map_len)
4493 {
4494         u64 start_diff;
4495
4496         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
4497         start_diff = map_start - em->start;
4498         em->start = map_start;
4499         em->len = map_len;
4500         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
4501             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4502                 em->block_start += start_diff;
4503                 em->block_len -= start_diff;
4504         }
4505         return add_extent_mapping(em_tree, em);
4506 }
4507
4508 static noinline int uncompress_inline(struct btrfs_path *path,
4509                                       struct inode *inode, struct page *page,
4510                                       size_t pg_offset, u64 extent_offset,
4511                                       struct btrfs_file_extent_item *item)
4512 {
4513         int ret;
4514         struct extent_buffer *leaf = path->nodes[0];
4515         char *tmp;
4516         size_t max_size;
4517         unsigned long inline_size;
4518         unsigned long ptr;
4519
4520         WARN_ON(pg_offset != 0);
4521         max_size = btrfs_file_extent_ram_bytes(leaf, item);
4522         inline_size = btrfs_file_extent_inline_item_len(leaf,
4523                                         btrfs_item_nr(leaf, path->slots[0]));
4524         tmp = kmalloc(inline_size, GFP_NOFS);
4525         ptr = btrfs_file_extent_inline_start(item);
4526
4527         read_extent_buffer(leaf, tmp, ptr, inline_size);
4528
4529         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4530         ret = btrfs_zlib_decompress(tmp, page, extent_offset,
4531                                     inline_size, max_size);
4532         if (ret) {
4533                 char *kaddr = kmap_atomic(page, KM_USER0);
4534                 unsigned long copy_size = min_t(u64,
4535                                   PAGE_CACHE_SIZE - pg_offset,
4536                                   max_size - extent_offset);
4537                 memset(kaddr + pg_offset, 0, copy_size);
4538                 kunmap_atomic(kaddr, KM_USER0);
4539         }
4540         kfree(tmp);
4541         return 0;
4542 }
4543
4544 /*
4545  * a bit scary, this does extent mapping from logical file offset to the disk.
4546  * the ugly parts come from merging extents from the disk with the in-ram
4547  * representation.  This gets more complex because of the data=ordered code,
4548  * where the in-ram extents might be locked pending data=ordered completion.
4549  *
4550  * This also copies inline extents directly into the page.
4551  */
4552
4553 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4554                                     size_t pg_offset, u64 start, u64 len,
4555                                     int create)
4556 {
4557         int ret;
4558         int err = 0;
4559         u64 bytenr;
4560         u64 extent_start = 0;
4561         u64 extent_end = 0;
4562         u64 objectid = inode->i_ino;
4563         u32 found_type;
4564         struct btrfs_path *path = NULL;
4565         struct btrfs_root *root = BTRFS_I(inode)->root;
4566         struct btrfs_file_extent_item *item;
4567         struct extent_buffer *leaf;
4568         struct btrfs_key found_key;
4569         struct extent_map *em = NULL;
4570         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4571         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4572         struct btrfs_trans_handle *trans = NULL;
4573         int compressed;
4574
4575 again:
4576         read_lock(&em_tree->lock);
4577         em = lookup_extent_mapping(em_tree, start, len);
4578         if (em)
4579                 em->bdev = root->fs_info->fs_devices->latest_bdev;
4580         read_unlock(&em_tree->lock);
4581
4582         if (em) {
4583                 if (em->start > start || em->start + em->len <= start)
4584                         free_extent_map(em);
4585                 else if (em->block_start == EXTENT_MAP_INLINE && page)
4586                         free_extent_map(em);
4587                 else
4588                         goto out;
4589         }
4590         em = alloc_extent_map(GFP_NOFS);
4591         if (!em) {
4592                 err = -ENOMEM;
4593                 goto out;
4594         }
4595         em->bdev = root->fs_info->fs_devices->latest_bdev;
4596         em->start = EXTENT_MAP_HOLE;
4597         em->orig_start = EXTENT_MAP_HOLE;
4598         em->len = (u64)-1;
4599         em->block_len = (u64)-1;
4600
4601         if (!path) {
4602                 path = btrfs_alloc_path();
4603                 BUG_ON(!path);
4604         }
4605
4606         ret = btrfs_lookup_file_extent(trans, root, path,
4607                                        objectid, start, trans != NULL);
4608         if (ret < 0) {
4609                 err = ret;
4610                 goto out;
4611         }
4612
4613         if (ret != 0) {
4614                 if (path->slots[0] == 0)
4615                         goto not_found;
4616                 path->slots[0]--;
4617         }
4618
4619         leaf = path->nodes[0];
4620         item = btrfs_item_ptr(leaf, path->slots[0],
4621                               struct btrfs_file_extent_item);
4622         /* are we inside the extent that was found? */
4623         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4624         found_type = btrfs_key_type(&found_key);
4625         if (found_key.objectid != objectid ||
4626             found_type != BTRFS_EXTENT_DATA_KEY) {
4627                 goto not_found;
4628         }
4629
4630         found_type = btrfs_file_extent_type(leaf, item);
4631         extent_start = found_key.offset;
4632         compressed = btrfs_file_extent_compression(leaf, item);
4633         if (found_type == BTRFS_FILE_EXTENT_REG ||
4634             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4635                 extent_end = extent_start +
4636                        btrfs_file_extent_num_bytes(leaf, item);
4637         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4638                 size_t size;
4639                 size = btrfs_file_extent_inline_len(leaf, item);
4640                 extent_end = (extent_start + size + root->sectorsize - 1) &
4641                         ~((u64)root->sectorsize - 1);
4642         }
4643
4644         if (start >= extent_end) {
4645                 path->slots[0]++;
4646                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
4647                         ret = btrfs_next_leaf(root, path);
4648                         if (ret < 0) {
4649                                 err = ret;
4650                                 goto out;
4651                         }
4652                         if (ret > 0)
4653                                 goto not_found;
4654                         leaf = path->nodes[0];
4655                 }
4656                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4657                 if (found_key.objectid != objectid ||
4658                     found_key.type != BTRFS_EXTENT_DATA_KEY)
4659                         goto not_found;
4660                 if (start + len <= found_key.offset)
4661                         goto not_found;
4662                 em->start = start;
4663                 em->len = found_key.offset - start;
4664                 goto not_found_em;
4665         }
4666
4667         if (found_type == BTRFS_FILE_EXTENT_REG ||
4668             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4669                 em->start = extent_start;
4670                 em->len = extent_end - extent_start;
4671                 em->orig_start = extent_start -
4672                                  btrfs_file_extent_offset(leaf, item);
4673                 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4674                 if (bytenr == 0) {
4675                         em->block_start = EXTENT_MAP_HOLE;
4676                         goto insert;
4677                 }
4678                 if (compressed) {
4679                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4680                         em->block_start = bytenr;
4681                         em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4682                                                                          item);
4683                 } else {
4684                         bytenr += btrfs_file_extent_offset(leaf, item);
4685                         em->block_start = bytenr;
4686                         em->block_len = em->len;
4687                         if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4688                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4689                 }
4690                 goto insert;
4691         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4692                 unsigned long ptr;
4693                 char *map;
4694                 size_t size;
4695                 size_t extent_offset;
4696                 size_t copy_size;
4697
4698                 em->block_start = EXTENT_MAP_INLINE;
4699                 if (!page || create) {
4700                         em->start = extent_start;
4701                         em->len = extent_end - extent_start;
4702                         goto out;
4703                 }
4704
4705                 size = btrfs_file_extent_inline_len(leaf, item);
4706                 extent_offset = page_offset(page) + pg_offset - extent_start;
4707                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4708                                 size - extent_offset);
4709                 em->start = extent_start + extent_offset;
4710                 em->len = (copy_size + root->sectorsize - 1) &
4711                         ~((u64)root->sectorsize - 1);
4712                 em->orig_start = EXTENT_MAP_INLINE;
4713                 if (compressed)
4714                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4715                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4716                 if (create == 0 && !PageUptodate(page)) {
4717                         if (btrfs_file_extent_compression(leaf, item) ==
4718                             BTRFS_COMPRESS_ZLIB) {
4719                                 ret = uncompress_inline(path, inode, page,
4720                                                         pg_offset,
4721                                                         extent_offset, item);
4722                                 BUG_ON(ret);
4723                         } else {
4724                                 map = kmap(page);
4725                                 read_extent_buffer(leaf, map + pg_offset, ptr,
4726                                                    copy_size);
4727                                 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
4728                                         memset(map + pg_offset + copy_size, 0,
4729                                                PAGE_CACHE_SIZE - pg_offset -
4730                                                copy_size);
4731                                 }
4732                                 kunmap(page);
4733                         }
4734                         flush_dcache_page(page);
4735                 } else if (create && PageUptodate(page)) {
4736                         if (!trans) {
4737                                 kunmap(page);
4738                                 free_extent_map(em);
4739                                 em = NULL;
4740                                 btrfs_release_path(root, path);
4741                                 trans = btrfs_join_transaction(root, 1);
4742                                 goto again;
4743                         }
4744                         map = kmap(page);
4745                         write_extent_buffer(leaf, map + pg_offset, ptr,
4746                                             copy_size);
4747                         kunmap(page);
4748                         btrfs_mark_buffer_dirty(leaf);
4749                 }
4750                 set_extent_uptodate(io_tree, em->start,
4751                                     extent_map_end(em) - 1, GFP_NOFS);
4752                 goto insert;
4753         } else {
4754                 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4755                 WARN_ON(1);
4756         }
4757 not_found:
4758         em->start = start;
4759         em->len = len;
4760 not_found_em:
4761         em->block_start = EXTENT_MAP_HOLE;
4762         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4763 insert:
4764         btrfs_release_path(root, path);
4765         if (em->start > start || extent_map_end(em) <= start) {
4766                 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4767                        "[%llu %llu]\n", (unsigned long long)em->start,
4768                        (unsigned long long)em->len,
4769                        (unsigned long long)start,
4770                        (unsigned long long)len);
4771                 err = -EIO;
4772                 goto out;
4773         }
4774
4775         err = 0;
4776         write_lock(&em_tree->lock);
4777         ret = add_extent_mapping(em_tree, em);
4778         /* it is possible that someone inserted the extent into the tree
4779          * while we had the lock dropped.  It is also possible that
4780          * an overlapping map exists in the tree
4781          */
4782         if (ret == -EEXIST) {
4783                 struct extent_map *existing;
4784
4785                 ret = 0;
4786
4787                 existing = lookup_extent_mapping(em_tree, start, len);
4788                 if (existing && (existing->start > start ||
4789                     existing->start + existing->len <= start)) {
4790                         free_extent_map(existing);
4791                         existing = NULL;
4792                 }
4793                 if (!existing) {
4794                         existing = lookup_extent_mapping(em_tree, em->start,
4795                                                          em->len);
4796                         if (existing) {
4797                                 err = merge_extent_mapping(em_tree, existing,
4798                                                            em, start,
4799                                                            root->sectorsize);
4800                                 free_extent_map(existing);
4801                                 if (err) {
4802                                         free_extent_map(em);
4803                                         em = NULL;
4804                                 }
4805                         } else {
4806                                 err = -EIO;
4807                                 free_extent_map(em);
4808                                 em = NULL;
4809                         }
4810                 } else {
4811                         free_extent_map(em);
4812                         em = existing;
4813                         err = 0;
4814                 }
4815         }
4816         write_unlock(&em_tree->lock);
4817 out:
4818         if (path)
4819                 btrfs_free_path(path);
4820         if (trans) {
4821                 ret = btrfs_end_transaction(trans, root);
4822                 if (!err)
4823                         err = ret;
4824         }
4825         if (err) {
4826                 free_extent_map(em);
4827                 return ERR_PTR(err);
4828         }
4829         return em;
4830 }
4831
4832 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4833                         const struct iovec *iov, loff_t offset,
4834                         unsigned long nr_segs)
4835 {
4836         return -EINVAL;
4837 }
4838
4839 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4840                 __u64 start, __u64 len)
4841 {
4842         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4843 }
4844
4845 int btrfs_readpage(struct file *file, struct page *page)
4846 {
4847         struct extent_io_tree *tree;
4848         tree = &BTRFS_I(page->mapping->host)->io_tree;
4849         return extent_read_full_page(tree, page, btrfs_get_extent);
4850 }
4851
4852 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4853 {
4854         struct extent_io_tree *tree;
4855
4856
4857         if (current->flags & PF_MEMALLOC) {
4858                 redirty_page_for_writepage(wbc, page);
4859                 unlock_page(page);
4860                 return 0;
4861         }
4862         tree = &BTRFS_I(page->mapping->host)->io_tree;
4863         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4864 }
4865
4866 int btrfs_writepages(struct address_space *mapping,
4867                      struct writeback_control *wbc)
4868 {
4869         struct extent_io_tree *tree;
4870
4871         tree = &BTRFS_I(mapping->host)->io_tree;
4872         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4873 }
4874
4875 static int
4876 btrfs_readpages(struct file *file, struct address_space *mapping,
4877                 struct list_head *pages, unsigned nr_pages)
4878 {
4879         struct extent_io_tree *tree;
4880         tree = &BTRFS_I(mapping->host)->io_tree;
4881         return extent_readpages(tree, mapping, pages, nr_pages,
4882                                 btrfs_get_extent);
4883 }
4884 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4885 {
4886         struct extent_io_tree *tree;
4887         struct extent_map_tree *map;
4888         int ret;
4889
4890         tree = &BTRFS_I(page->mapping->host)->io_tree;
4891         map = &BTRFS_I(page->mapping->host)->extent_tree;
4892         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4893         if (ret == 1) {
4894                 ClearPagePrivate(page);
4895                 set_page_private(page, 0);
4896                 page_cache_release(page);
4897         }
4898         return ret;
4899 }
4900
4901 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4902 {
4903         if (PageWriteback(page) || PageDirty(page))
4904                 return 0;
4905         return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4906 }
4907
4908 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4909 {
4910         struct extent_io_tree *tree;
4911         struct btrfs_ordered_extent *ordered;
4912         u64 page_start = page_offset(page);
4913         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4914
4915
4916         /*
4917          * we have the page locked, so new writeback can't start,
4918          * and the dirty bit won't be cleared while we are here.
4919          *
4920          * Wait for IO on this page so that we can safely clear
4921          * the PagePrivate2 bit and do ordered accounting
4922          */
4923         wait_on_page_writeback(page);
4924
4925         tree = &BTRFS_I(page->mapping->host)->io_tree;
4926         if (offset) {
4927                 btrfs_releasepage(page, GFP_NOFS);
4928                 return;
4929         }
4930         lock_extent(tree, page_start, page_end, GFP_NOFS);
4931         ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4932                                            page_offset(page));
4933         if (ordered) {
4934                 /*
4935                  * IO on this page will never be started, so we need
4936                  * to account for any ordered extents now
4937                  */
4938                 clear_extent_bit(tree, page_start, page_end,
4939                                  EXTENT_DIRTY | EXTENT_DELALLOC |
4940                                  EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
4941                                  NULL, GFP_NOFS);
4942                 /*
4943                  * whoever cleared the private bit is responsible
4944                  * for the finish_ordered_io
4945                  */
4946                 if (TestClearPagePrivate2(page)) {
4947                         btrfs_finish_ordered_io(page->mapping->host,
4948                                                 page_start, page_end);
4949                 }
4950                 btrfs_put_ordered_extent(ordered);
4951                 lock_extent(tree, page_start, page_end, GFP_NOFS);
4952         }
4953         clear_extent_bit(tree, page_start, page_end,
4954                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4955                  EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
4956         __btrfs_releasepage(page, GFP_NOFS);
4957
4958         ClearPageChecked(page);
4959         if (PagePrivate(page)) {
4960                 ClearPagePrivate(page);
4961                 set_page_private(page, 0);
4962                 page_cache_release(page);
4963         }
4964 }
4965
4966 /*
4967  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4968  * called from a page fault handler when a page is first dirtied. Hence we must
4969  * be careful to check for EOF conditions here. We set the page up correctly
4970  * for a written page which means we get ENOSPC checking when writing into
4971  * holes and correct delalloc and unwritten extent mapping on filesystems that
4972  * support these features.
4973  *
4974  * We are not allowed to take the i_mutex here so we have to play games to
4975  * protect against truncate races as the page could now be beyond EOF.  Because
4976  * vmtruncate() writes the inode size before removing pages, once we have the
4977  * page lock we can determine safely if the page is beyond EOF. If it is not
4978  * beyond EOF, then the page is guaranteed safe against truncation until we
4979  * unlock the page.
4980  */
4981 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4982 {
4983         struct page *page = vmf->page;
4984         struct inode *inode = fdentry(vma->vm_file)->d_inode;
4985         struct btrfs_root *root = BTRFS_I(inode)->root;
4986         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4987         struct btrfs_ordered_extent *ordered;
4988         char *kaddr;
4989         unsigned long zero_start;
4990         loff_t size;
4991         int ret;
4992         u64 page_start;
4993         u64 page_end;
4994
4995         ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4996         if (ret) {
4997                 if (ret == -ENOMEM)
4998                         ret = VM_FAULT_OOM;
4999                 else /* -ENOSPC, -EIO, etc */
5000                         ret = VM_FAULT_SIGBUS;
5001                 goto out;
5002         }
5003
5004         ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5005         if (ret) {
5006                 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5007                 ret = VM_FAULT_SIGBUS;
5008                 goto out;
5009         }
5010
5011         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5012 again:
5013         lock_page(page);
5014         size = i_size_read(inode);
5015         page_start = page_offset(page);
5016         page_end = page_start + PAGE_CACHE_SIZE - 1;
5017
5018         if ((page->mapping != inode->i_mapping) ||
5019             (page_start >= size)) {
5020                 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5021                 /* page got truncated out from underneath us */
5022                 goto out_unlock;
5023         }
5024         wait_on_page_writeback(page);
5025
5026         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
5027         set_page_extent_mapped(page);
5028
5029         /*
5030          * we can't set the delalloc bits if there are pending ordered
5031          * extents.  Drop our locks and wait for them to finish
5032          */
5033         ordered = btrfs_lookup_ordered_extent(inode, page_start);
5034         if (ordered) {
5035                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5036                 unlock_page(page);
5037                 btrfs_start_ordered_extent(inode, ordered, 1);
5038                 btrfs_put_ordered_extent(ordered);
5039                 goto again;
5040         }
5041
5042         /*
5043          * XXX - page_mkwrite gets called every time the page is dirtied, even
5044          * if it was already dirty, so for space accounting reasons we need to
5045          * clear any delalloc bits for the range we are fixing to save.  There
5046          * is probably a better way to do this, but for now keep consistent with
5047          * prepare_pages in the normal write path.
5048          */
5049         clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
5050                           EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5051                           GFP_NOFS);
5052
5053         ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
5054         if (ret) {
5055                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5056                 ret = VM_FAULT_SIGBUS;
5057                 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5058                 goto out_unlock;
5059         }
5060         ret = 0;
5061
5062         /* page is wholly or partially inside EOF */
5063         if (page_start + PAGE_CACHE_SIZE > size)
5064                 zero_start = size & ~PAGE_CACHE_MASK;
5065         else
5066                 zero_start = PAGE_CACHE_SIZE;
5067
5068         if (zero_start != PAGE_CACHE_SIZE) {
5069                 kaddr = kmap(page);
5070                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
5071                 flush_dcache_page(page);
5072                 kunmap(page);
5073         }
5074         ClearPageChecked(page);
5075         set_page_dirty(page);
5076         SetPageUptodate(page);
5077
5078         BTRFS_I(inode)->last_trans = root->fs_info->generation;
5079         BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5080
5081         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5082
5083 out_unlock:
5084         btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5085         if (!ret)
5086                 return VM_FAULT_LOCKED;
5087         unlock_page(page);
5088 out:
5089         return ret;
5090 }
5091
5092 static void btrfs_truncate(struct inode *inode)
5093 {
5094         struct btrfs_root *root = BTRFS_I(inode)->root;
5095         int ret;
5096         struct btrfs_trans_handle *trans;
5097         unsigned long nr;
5098         u64 mask = root->sectorsize - 1;
5099
5100         if (!S_ISREG(inode->i_mode))
5101                 return;
5102         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
5103                 return;
5104
5105         ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
5106         if (ret)
5107                 return;
5108         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5109
5110         trans = btrfs_start_transaction(root, 1);
5111
5112         /*
5113          * setattr is responsible for setting the ordered_data_close flag,
5114          * but that is only tested during the last file release.  That
5115          * could happen well after the next commit, leaving a great big
5116          * window where new writes may get lost if someone chooses to write
5117          * to this file after truncating to zero
5118          *
5119          * The inode doesn't have any dirty data here, and so if we commit
5120          * this is a noop.  If someone immediately starts writing to the inode
5121          * it is very likely we'll catch some of their writes in this
5122          * transaction, and the commit will find this file on the ordered
5123          * data list with good things to send down.
5124          *
5125          * This is a best effort solution, there is still a window where
5126          * using truncate to replace the contents of the file will
5127          * end up with a zero length file after a crash.
5128          */
5129         if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
5130                 btrfs_add_ordered_operation(trans, root, inode);
5131
5132         btrfs_set_trans_block_group(trans, inode);
5133         btrfs_i_size_write(inode, inode->i_size);
5134
5135         ret = btrfs_orphan_add(trans, inode);
5136         if (ret)
5137                 goto out;
5138         /* FIXME, add redo link to tree so we don't leak on crash */
5139         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
5140                                       BTRFS_EXTENT_DATA_KEY);
5141         btrfs_update_inode(trans, root, inode);
5142
5143         ret = btrfs_orphan_del(trans, inode);
5144         BUG_ON(ret);
5145
5146 out:
5147         nr = trans->blocks_used;
5148         ret = btrfs_end_transaction_throttle(trans, root);
5149         BUG_ON(ret);
5150         btrfs_btree_balance_dirty(root, nr);
5151 }
5152
5153 /*
5154  * create a new subvolume directory/inode (helper for the ioctl).
5155  */
5156 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
5157                              struct btrfs_root *new_root,
5158                              u64 new_dirid, u64 alloc_hint)
5159 {
5160         struct inode *inode;
5161         int err;
5162         u64 index = 0;
5163
5164         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
5165                                 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
5166         if (IS_ERR(inode))
5167                 return PTR_ERR(inode);
5168         inode->i_op = &btrfs_dir_inode_operations;
5169         inode->i_fop = &btrfs_dir_file_operations;
5170
5171         inode->i_nlink = 1;
5172         btrfs_i_size_write(inode, 0);
5173
5174         err = btrfs_update_inode(trans, new_root, inode);
5175         BUG_ON(err);
5176
5177         iput(inode);
5178         return 0;
5179 }
5180
5181 /* helper function for file defrag and space balancing.  This
5182  * forces readahead on a given range of bytes in an inode
5183  */
5184 unsigned long btrfs_force_ra(struct address_space *mapping,
5185                               struct file_ra_state *ra, struct file *file,
5186                               pgoff_t offset, pgoff_t last_index)
5187 {
5188         pgoff_t req_size = last_index - offset + 1;
5189
5190         page_cache_sync_readahead(mapping, ra, file, offset, req_size);
5191         return offset + req_size;
5192 }
5193
5194 struct inode *btrfs_alloc_inode(struct super_block *sb)
5195 {
5196         struct btrfs_inode *ei;
5197
5198         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5199         if (!ei)
5200                 return NULL;
5201         ei->last_trans = 0;
5202         ei->last_sub_trans = 0;
5203         ei->logged_trans = 0;
5204         ei->outstanding_extents = 0;
5205         ei->reserved_extents = 0;
5206         ei->root = NULL;
5207         spin_lock_init(&ei->accounting_lock);
5208         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5209         INIT_LIST_HEAD(&ei->i_orphan);
5210         INIT_LIST_HEAD(&ei->ordered_operations);
5211         return &ei->vfs_inode;
5212 }
5213
5214 void btrfs_destroy_inode(struct inode *inode)
5215 {
5216         struct btrfs_ordered_extent *ordered;
5217         struct btrfs_root *root = BTRFS_I(inode)->root;
5218
5219         WARN_ON(!list_empty(&inode->i_dentry));
5220         WARN_ON(inode->i_data.nrpages);
5221
5222         /*
5223          * This can happen where we create an inode, but somebody else also
5224          * created the same inode and we need to destroy the one we already
5225          * created.
5226          */
5227         if (!root)
5228                 goto free;
5229
5230         /*
5231          * Make sure we're properly removed from the ordered operation
5232          * lists.
5233          */
5234         smp_mb();
5235         if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
5236                 spin_lock(&root->fs_info->ordered_extent_lock);
5237                 list_del_init(&BTRFS_I(inode)->ordered_operations);
5238                 spin_unlock(&root->fs_info->ordered_extent_lock);
5239         }
5240
5241         spin_lock(&root->list_lock);
5242         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5243                 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
5244                        " list\n", inode->i_ino);
5245                 dump_stack();
5246         }
5247         spin_unlock(&root->list_lock);
5248
5249         while (1) {
5250                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
5251                 if (!ordered)
5252                         break;
5253                 else {
5254                         printk(KERN_ERR "btrfs found ordered "
5255                                "extent %llu %llu on inode cleanup\n",
5256                                (unsigned long long)ordered->file_offset,
5257                                (unsigned long long)ordered->len);
5258                         btrfs_remove_ordered_extent(inode, ordered);
5259                         btrfs_put_ordered_extent(ordered);
5260                         btrfs_put_ordered_extent(ordered);
5261                 }
5262         }
5263         inode_tree_del(inode);
5264         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
5265 free:
5266         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5267 }
5268
5269 void btrfs_drop_inode(struct inode *inode)
5270 {
5271         struct btrfs_root *root = BTRFS_I(inode)->root;
5272
5273         if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5274                 generic_delete_inode(inode);
5275         else
5276                 generic_drop_inode(inode);
5277 }
5278
5279 static void init_once(void *foo)
5280 {
5281         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
5282
5283         inode_init_once(&ei->vfs_inode);
5284 }
5285
5286 void btrfs_destroy_cachep(void)
5287 {
5288         if (btrfs_inode_cachep)
5289                 kmem_cache_destroy(btrfs_inode_cachep);
5290         if (btrfs_trans_handle_cachep)
5291                 kmem_cache_destroy(btrfs_trans_handle_cachep);
5292         if (btrfs_transaction_cachep)
5293                 kmem_cache_destroy(btrfs_transaction_cachep);
5294         if (btrfs_path_cachep)
5295                 kmem_cache_destroy(btrfs_path_cachep);
5296 }
5297
5298 int btrfs_init_cachep(void)
5299 {
5300         btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
5301                         sizeof(struct btrfs_inode), 0,
5302                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
5303         if (!btrfs_inode_cachep)
5304                 goto fail;
5305
5306         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
5307                         sizeof(struct btrfs_trans_handle), 0,
5308                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5309         if (!btrfs_trans_handle_cachep)
5310                 goto fail;
5311
5312         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
5313                         sizeof(struct btrfs_transaction), 0,
5314                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5315         if (!btrfs_transaction_cachep)
5316                 goto fail;
5317
5318         btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
5319                         sizeof(struct btrfs_path), 0,
5320                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
5321         if (!btrfs_path_cachep)
5322                 goto fail;
5323
5324         return 0;
5325 fail:
5326         btrfs_destroy_cachep();
5327         return -ENOMEM;
5328 }
5329
5330 static int btrfs_getattr(struct vfsmount *mnt,
5331                          struct dentry *dentry, struct kstat *stat)
5332 {
5333         struct inode *inode = dentry->d_inode;
5334         generic_fillattr(inode, stat);
5335         stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
5336         stat->blksize = PAGE_CACHE_SIZE;
5337         stat->blocks = (inode_get_bytes(inode) +
5338                         BTRFS_I(inode)->delalloc_bytes) >> 9;
5339         return 0;
5340 }
5341
5342 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5343                            struct inode *new_dir, struct dentry *new_dentry)
5344 {
5345         struct btrfs_trans_handle *trans;
5346         struct btrfs_root *root = BTRFS_I(old_dir)->root;
5347         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
5348         struct inode *new_inode = new_dentry->d_inode;
5349         struct inode *old_inode = old_dentry->d_inode;
5350         struct timespec ctime = CURRENT_TIME;
5351         u64 index = 0;
5352         u64 root_objectid;
5353         int ret;
5354
5355         if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5356                 return -EPERM;
5357
5358         /* we only allow rename subvolume link between subvolumes */
5359         if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
5360                 return -EXDEV;
5361
5362         if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
5363             (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
5364                 return -ENOTEMPTY;
5365
5366         if (S_ISDIR(old_inode->i_mode) && new_inode &&
5367             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5368                 return -ENOTEMPTY;
5369
5370         /*
5371          * We want to reserve the absolute worst case amount of items.  So if
5372          * both inodes are subvols and we need to unlink them then that would
5373          * require 4 item modifications, but if they are both normal inodes it
5374          * would require 5 item modifications, so we'll assume their normal
5375          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5376          * should cover the worst case number of items we'll modify.
5377          */
5378         ret = btrfs_reserve_metadata_space(root, 11);
5379         if (ret)
5380                 return ret;
5381
5382         /*
5383          * we're using rename to replace one file with another.
5384          * and the replacement file is large.  Start IO on it now so
5385          * we don't add too much work to the end of the transaction
5386          */
5387         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
5388             old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
5389                 filemap_flush(old_inode->i_mapping);
5390
5391         /* close the racy window with snapshot create/destroy ioctl */
5392         if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5393                 down_read(&root->fs_info->subvol_sem);
5394
5395         trans = btrfs_start_transaction(root, 1);
5396         btrfs_set_trans_block_group(trans, new_dir);
5397
5398         if (dest != root)
5399                 btrfs_record_root_in_trans(trans, dest);
5400
5401         ret = btrfs_set_inode_index(new_dir, &index);
5402         if (ret)
5403                 goto out_fail;
5404
5405         if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
5406                 /* force full log commit if subvolume involved. */
5407                 root->fs_info->last_trans_log_full_commit = trans->transid;
5408         } else {
5409                 ret = btrfs_insert_inode_ref(trans, dest,
5410                                              new_dentry->d_name.name,
5411                                              new_dentry->d_name.len,
5412                                              old_inode->i_ino,
5413                                              new_dir->i_ino, index);
5414                 if (ret)
5415                         goto out_fail;
5416                 /*
5417                  * this is an ugly little race, but the rename is required
5418                  * to make sure that if we crash, the inode is either at the
5419                  * old name or the new one.  pinning the log transaction lets
5420                  * us make sure we don't allow a log commit to come in after
5421                  * we unlink the name but before we add the new name back in.
5422                  */
5423                 btrfs_pin_log_trans(root);
5424         }
5425         /*
5426          * make sure the inode gets flushed if it is replacing
5427          * something.
5428          */
5429         if (new_inode && new_inode->i_size &&
5430             old_inode && S_ISREG(old_inode->i_mode)) {
5431                 btrfs_add_ordered_operation(trans, root, old_inode);
5432         }
5433
5434         old_dir->i_ctime = old_dir->i_mtime = ctime;
5435         new_dir->i_ctime = new_dir->i_mtime = ctime;
5436         old_inode->i_ctime = ctime;
5437
5438         if (old_dentry->d_parent != new_dentry->d_parent)
5439                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
5440
5441         if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
5442                 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
5443                 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
5444                                         old_dentry->d_name.name,
5445                                         old_dentry->d_name.len);
5446         } else {
5447                 btrfs_inc_nlink(old_dentry->d_inode);
5448                 ret = btrfs_unlink_inode(trans, root, old_dir,
5449                                          old_dentry->d_inode,
5450                                          old_dentry->d_name.name,
5451                                          old_dentry->d_name.len);
5452         }
5453         BUG_ON(ret);
5454
5455         if (new_inode) {
5456                 new_inode->i_ctime = CURRENT_TIME;
5457                 if (unlikely(new_inode->i_ino ==
5458                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
5459                         root_objectid = BTRFS_I(new_inode)->location.objectid;
5460                         ret = btrfs_unlink_subvol(trans, dest, new_dir,
5461                                                 root_objectid,
5462                                                 new_dentry->d_name.name,
5463                                                 new_dentry->d_name.len);
5464                         BUG_ON(new_inode->i_nlink == 0);
5465                 } else {
5466                         ret = btrfs_unlink_inode(trans, dest, new_dir,
5467                                                  new_dentry->d_inode,
5468                                                  new_dentry->d_name.name,
5469                                                  new_dentry->d_name.len);
5470                 }
5471                 BUG_ON(ret);
5472                 if (new_inode->i_nlink == 0) {
5473                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
5474                         BUG_ON(ret);
5475                 }
5476         }
5477
5478         ret = btrfs_add_link(trans, new_dir, old_inode,
5479                              new_dentry->d_name.name,
5480                              new_dentry->d_name.len, 0, index);
5481         BUG_ON(ret);
5482
5483         if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
5484                 btrfs_log_new_name(trans, old_inode, old_dir,
5485                                    new_dentry->d_parent);
5486                 btrfs_end_log_trans(root);
5487         }
5488 out_fail:
5489         btrfs_end_transaction_throttle(trans, root);
5490
5491         if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5492                 up_read(&root->fs_info->subvol_sem);
5493
5494         btrfs_unreserve_metadata_space(root, 11);
5495         return ret;
5496 }
5497
5498 /*
5499  * some fairly slow code that needs optimization. This walks the list
5500  * of all the inodes with pending delalloc and forces them to disk.
5501  */
5502 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
5503 {
5504         struct list_head *head = &root->fs_info->delalloc_inodes;
5505         struct btrfs_inode *binode;
5506         struct inode *inode;
5507
5508         if (root->fs_info->sb->s_flags & MS_RDONLY)
5509                 return -EROFS;
5510
5511         spin_lock(&root->fs_info->delalloc_lock);
5512         while (!list_empty(head)) {
5513                 binode = list_entry(head->next, struct btrfs_inode,
5514                                     delalloc_inodes);
5515                 inode = igrab(&binode->vfs_inode);
5516                 if (!inode)
5517                         list_del_init(&binode->delalloc_inodes);
5518                 spin_unlock(&root->fs_info->delalloc_lock);
5519                 if (inode) {
5520                         filemap_flush(inode->i_mapping);
5521                         iput(inode);
5522                 }
5523                 cond_resched();
5524                 spin_lock(&root->fs_info->delalloc_lock);
5525         }
5526         spin_unlock(&root->fs_info->delalloc_lock);
5527
5528         /* the filemap_flush will queue IO into the worker threads, but
5529          * we have to make sure the IO is actually started and that
5530          * ordered extents get created before we return
5531          */
5532         atomic_inc(&root->fs_info->async_submit_draining);
5533         while (atomic_read(&root->fs_info->nr_async_submits) ||
5534               atomic_read(&root->fs_info->async_delalloc_pages)) {
5535                 wait_event(root->fs_info->async_submit_wait,
5536                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
5537                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
5538         }
5539         atomic_dec(&root->fs_info->async_submit_draining);
5540         return 0;
5541 }
5542
5543 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5544                          const char *symname)
5545 {
5546         struct btrfs_trans_handle *trans;
5547         struct btrfs_root *root = BTRFS_I(dir)->root;
5548         struct btrfs_path *path;
5549         struct btrfs_key key;
5550         struct inode *inode = NULL;
5551         int err;
5552         int drop_inode = 0;
5553         u64 objectid;
5554         u64 index = 0 ;
5555         int name_len;
5556         int datasize;
5557         unsigned long ptr;
5558         struct btrfs_file_extent_item *ei;
5559         struct extent_buffer *leaf;
5560         unsigned long nr = 0;
5561
5562         name_len = strlen(symname) + 1;
5563         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5564                 return -ENAMETOOLONG;
5565
5566         /*
5567          * 2 items for inode item and ref
5568          * 2 items for dir items
5569          * 1 item for xattr if selinux is on
5570          */
5571         err = btrfs_reserve_metadata_space(root, 5);
5572         if (err)
5573                 return err;
5574
5575         trans = btrfs_start_transaction(root, 1);
5576         if (!trans)
5577                 goto out_fail;
5578         btrfs_set_trans_block_group(trans, dir);
5579
5580         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5581         if (err) {
5582                 err = -ENOSPC;
5583                 goto out_unlock;
5584         }
5585
5586         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5587                                 dentry->d_name.len,
5588                                 dentry->d_parent->d_inode->i_ino, objectid,
5589                                 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
5590                                 &index);
5591         err = PTR_ERR(inode);
5592         if (IS_ERR(inode))
5593                 goto out_unlock;
5594
5595         err = btrfs_init_inode_security(inode, dir);
5596         if (err) {
5597                 drop_inode = 1;
5598                 goto out_unlock;
5599         }
5600
5601         btrfs_set_trans_block_group(trans, inode);
5602         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
5603         if (err)
5604                 drop_inode = 1;
5605         else {
5606                 inode->i_mapping->a_ops = &btrfs_aops;
5607                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5608                 inode->i_fop = &btrfs_file_operations;
5609                 inode->i_op = &btrfs_file_inode_operations;
5610                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5611         }
5612         btrfs_update_inode_block_group(trans, inode);
5613         btrfs_update_inode_block_group(trans, dir);
5614         if (drop_inode)
5615                 goto out_unlock;
5616
5617         path = btrfs_alloc_path();
5618         BUG_ON(!path);
5619         key.objectid = inode->i_ino;
5620         key.offset = 0;
5621         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
5622         datasize = btrfs_file_extent_calc_inline_size(name_len);
5623         err = btrfs_insert_empty_item(trans, root, path, &key,
5624                                       datasize);
5625         if (err) {
5626                 drop_inode = 1;
5627                 goto out_unlock;
5628         }
5629         leaf = path->nodes[0];
5630         ei = btrfs_item_ptr(leaf, path->slots[0],
5631                             struct btrfs_file_extent_item);
5632         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
5633         btrfs_set_file_extent_type(leaf, ei,
5634                                    BTRFS_FILE_EXTENT_INLINE);
5635         btrfs_set_file_extent_encryption(leaf, ei, 0);
5636         btrfs_set_file_extent_compression(leaf, ei, 0);
5637         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
5638         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
5639
5640         ptr = btrfs_file_extent_inline_start(ei);
5641         write_extent_buffer(leaf, symname, ptr, name_len);
5642         btrfs_mark_buffer_dirty(leaf);
5643         btrfs_free_path(path);
5644
5645         inode->i_op = &btrfs_symlink_inode_operations;
5646         inode->i_mapping->a_ops = &btrfs_symlink_aops;
5647         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5648         inode_set_bytes(inode, name_len);
5649         btrfs_i_size_write(inode, name_len - 1);
5650         err = btrfs_update_inode(trans, root, inode);
5651         if (err)
5652                 drop_inode = 1;
5653
5654 out_unlock:
5655         nr = trans->blocks_used;
5656         btrfs_end_transaction_throttle(trans, root);
5657 out_fail:
5658         btrfs_unreserve_metadata_space(root, 5);
5659         if (drop_inode) {
5660                 inode_dec_link_count(inode);
5661                 iput(inode);
5662         }
5663         btrfs_btree_balance_dirty(root, nr);
5664         return err;
5665 }
5666
5667 static int prealloc_file_range(struct btrfs_trans_handle *trans,
5668                                struct inode *inode, u64 start, u64 end,
5669                                u64 alloc_hint, int mode)
5670 {
5671         struct btrfs_root *root = BTRFS_I(inode)->root;
5672         struct btrfs_key ins;
5673         u64 alloc_size;
5674         u64 cur_offset = start;
5675         u64 num_bytes = end - start;
5676         int ret = 0;
5677
5678         while (num_bytes > 0) {
5679                 alloc_size = min(num_bytes, root->fs_info->max_extent);
5680
5681                 ret = btrfs_reserve_metadata_space(root, 1);
5682                 if (ret)
5683                         goto out;
5684
5685                 ret = btrfs_reserve_extent(trans, root, alloc_size,
5686                                            root->sectorsize, 0, alloc_hint,
5687                                            (u64)-1, &ins, 1);
5688                 if (ret) {
5689                         WARN_ON(1);
5690                         goto out;
5691                 }
5692                 ret = insert_reserved_file_extent(trans, inode,
5693                                                   cur_offset, ins.objectid,
5694                                                   ins.offset, ins.offset,
5695                                                   ins.offset, 0, 0, 0,
5696                                                   BTRFS_FILE_EXTENT_PREALLOC);
5697                 BUG_ON(ret);
5698                 btrfs_drop_extent_cache(inode, cur_offset,
5699                                         cur_offset + ins.offset -1, 0);
5700                 num_bytes -= ins.offset;
5701                 cur_offset += ins.offset;
5702                 alloc_hint = ins.objectid + ins.offset;
5703                 btrfs_unreserve_metadata_space(root, 1);
5704         }
5705 out:
5706         if (cur_offset > start) {
5707                 inode->i_ctime = CURRENT_TIME;
5708                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5709                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5710                     cur_offset > i_size_read(inode))
5711                         btrfs_i_size_write(inode, cur_offset);
5712                 ret = btrfs_update_inode(trans, root, inode);
5713                 BUG_ON(ret);
5714         }
5715
5716         return ret;
5717 }
5718
5719 static long btrfs_fallocate(struct inode *inode, int mode,
5720                             loff_t offset, loff_t len)
5721 {
5722         u64 cur_offset;
5723         u64 last_byte;
5724         u64 alloc_start;
5725         u64 alloc_end;
5726         u64 alloc_hint = 0;
5727         u64 locked_end;
5728         u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5729         struct extent_map *em;
5730         struct btrfs_trans_handle *trans;
5731         struct btrfs_root *root;
5732         int ret;
5733
5734         alloc_start = offset & ~mask;
5735         alloc_end =  (offset + len + mask) & ~mask;
5736
5737         /*
5738          * wait for ordered IO before we have any locks.  We'll loop again
5739          * below with the locks held.
5740          */
5741         btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
5742
5743         mutex_lock(&inode->i_mutex);
5744         if (alloc_start > inode->i_size) {
5745                 ret = btrfs_cont_expand(inode, alloc_start);
5746                 if (ret)
5747                         goto out;
5748         }
5749
5750         root = BTRFS_I(inode)->root;
5751
5752         ret = btrfs_check_data_free_space(root, inode,
5753                                           alloc_end - alloc_start);
5754         if (ret)
5755                 goto out;
5756
5757         locked_end = alloc_end - 1;
5758         while (1) {
5759                 struct btrfs_ordered_extent *ordered;
5760
5761                 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5762                 if (!trans) {
5763                         ret = -EIO;
5764                         goto out_free;
5765                 }
5766
5767                 /* the extent lock is ordered inside the running
5768                  * transaction
5769                  */
5770                 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5771                             GFP_NOFS);
5772                 ordered = btrfs_lookup_first_ordered_extent(inode,
5773                                                             alloc_end - 1);
5774                 if (ordered &&
5775                     ordered->file_offset + ordered->len > alloc_start &&
5776                     ordered->file_offset < alloc_end) {
5777                         btrfs_put_ordered_extent(ordered);
5778                         unlock_extent(&BTRFS_I(inode)->io_tree,
5779                                       alloc_start, locked_end, GFP_NOFS);
5780                         btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5781
5782                         /*
5783                          * we can't wait on the range with the transaction
5784                          * running or with the extent lock held
5785                          */
5786                         btrfs_wait_ordered_range(inode, alloc_start,
5787                                                  alloc_end - alloc_start);
5788                 } else {
5789                         if (ordered)
5790                                 btrfs_put_ordered_extent(ordered);
5791                         break;
5792                 }
5793         }
5794
5795         cur_offset = alloc_start;
5796         while (1) {
5797                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
5798                                       alloc_end - cur_offset, 0);
5799                 BUG_ON(IS_ERR(em) || !em);
5800                 last_byte = min(extent_map_end(em), alloc_end);
5801                 last_byte = (last_byte + mask) & ~mask;
5802                 if (em->block_start == EXTENT_MAP_HOLE) {
5803                         ret = prealloc_file_range(trans, inode, cur_offset,
5804                                                 last_byte, alloc_hint, mode);
5805                         if (ret < 0) {
5806                                 free_extent_map(em);
5807                                 break;
5808                         }
5809                 }
5810                 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5811                         alloc_hint = em->block_start;
5812                 free_extent_map(em);
5813
5814                 cur_offset = last_byte;
5815                 if (cur_offset >= alloc_end) {
5816                         ret = 0;
5817                         break;
5818                 }
5819         }
5820         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5821                       GFP_NOFS);
5822
5823         btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5824 out_free:
5825         btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
5826 out:
5827         mutex_unlock(&inode->i_mutex);
5828         return ret;
5829 }
5830
/*
 * set_page_dirty hook for btrfs_aops: hands @page straight to
 * __set_page_dirty_nobuffers() and passes its result back.
 */
static int btrfs_set_page_dirty(struct page *page)
{
        int dirtied = __set_page_dirty_nobuffers(page);

        return dirtied;
}
5835
5836 static int btrfs_permission(struct inode *inode, int mask)
5837 {
5838         if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
5839                 return -EACCES;
5840         return generic_permission(inode, mask, btrfs_check_acl);
5841 }
5842
5843 static const struct inode_operations btrfs_dir_inode_operations = {
5844         .getattr        = btrfs_getattr,
5845         .lookup         = btrfs_lookup,
5846         .create         = btrfs_create,
5847         .unlink         = btrfs_unlink,
5848         .link           = btrfs_link,
5849         .mkdir          = btrfs_mkdir,
5850         .rmdir          = btrfs_rmdir,
5851         .rename         = btrfs_rename,
5852         .symlink        = btrfs_symlink,
5853         .setattr        = btrfs_setattr,
5854         .mknod          = btrfs_mknod,
5855         .setxattr       = btrfs_setxattr,
5856         .getxattr       = btrfs_getxattr,
5857         .listxattr      = btrfs_listxattr,
5858         .removexattr    = btrfs_removexattr,
5859         .permission     = btrfs_permission,
5860 };
5861 static const struct inode_operations btrfs_dir_ro_inode_operations = {
5862         .lookup         = btrfs_lookup,
5863         .permission     = btrfs_permission,
5864 };
5865
5866 static const struct file_operations btrfs_dir_file_operations = {
5867         .llseek         = generic_file_llseek,
5868         .read           = generic_read_dir,
5869         .readdir        = btrfs_real_readdir,
5870         .unlocked_ioctl = btrfs_ioctl,
5871 #ifdef CONFIG_COMPAT
5872         .compat_ioctl   = btrfs_ioctl,
5873 #endif
5874         .release        = btrfs_release_file,
5875         .fsync          = btrfs_sync_file,
5876 };
5877
5878 static struct extent_io_ops btrfs_extent_io_ops = {
5879         .fill_delalloc = run_delalloc_range,
5880         .submit_bio_hook = btrfs_submit_bio_hook,
5881         .merge_bio_hook = btrfs_merge_bio_hook,
5882         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
5883         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
5884         .writepage_start_hook = btrfs_writepage_start_hook,
5885         .readpage_io_failed_hook = btrfs_io_failed_hook,
5886         .set_bit_hook = btrfs_set_bit_hook,
5887         .clear_bit_hook = btrfs_clear_bit_hook,
5888         .merge_extent_hook = btrfs_merge_extent_hook,
5889         .split_extent_hook = btrfs_split_extent_hook,
5890 };
5891
5892 /*
5893  * btrfs doesn't support the bmap operation because swapfiles
5894  * use bmap to make a mapping of extents in the file.  They assume
5895  * these extents won't change over the life of the file and they
5896  * use the bmap result to do IO directly to the drive.
5897  *
5898  * the btrfs bmap call would return logical addresses that aren't
5899  * suitable for IO and they also will change frequently as COW
5900  * operations happen.  So, swapfile + btrfs == corruption.
5901  *
5902  * For now we're avoiding this by dropping bmap.
5903  */
5904 static const struct address_space_operations btrfs_aops = {
5905         .readpage       = btrfs_readpage,
5906         .writepage      = btrfs_writepage,
5907         .writepages     = btrfs_writepages,
5908         .readpages      = btrfs_readpages,
5909         .sync_page      = block_sync_page,
5910         .direct_IO      = btrfs_direct_IO,
5911         .invalidatepage = btrfs_invalidatepage,
5912         .releasepage    = btrfs_releasepage,
5913         .set_page_dirty = btrfs_set_page_dirty,
5914         .error_remove_page = generic_error_remove_page,
5915 };
5916
5917 static const struct address_space_operations btrfs_symlink_aops = {
5918         .readpage       = btrfs_readpage,
5919         .writepage      = btrfs_writepage,
5920         .invalidatepage = btrfs_invalidatepage,
5921         .releasepage    = btrfs_releasepage,
5922 };
5923
5924 static const struct inode_operations btrfs_file_inode_operations = {
5925         .truncate       = btrfs_truncate,
5926         .getattr        = btrfs_getattr,
5927         .setattr        = btrfs_setattr,
5928         .setxattr       = btrfs_setxattr,
5929         .getxattr       = btrfs_getxattr,
5930         .listxattr      = btrfs_listxattr,
5931         .removexattr    = btrfs_removexattr,
5932         .permission     = btrfs_permission,
5933         .fallocate      = btrfs_fallocate,
5934         .fiemap         = btrfs_fiemap,
5935 };
5936 static const struct inode_operations btrfs_special_inode_operations = {
5937         .getattr        = btrfs_getattr,
5938         .setattr        = btrfs_setattr,
5939         .permission     = btrfs_permission,
5940         .setxattr       = btrfs_setxattr,
5941         .getxattr       = btrfs_getxattr,
5942         .listxattr      = btrfs_listxattr,
5943         .removexattr    = btrfs_removexattr,
5944 };
5945 static const struct inode_operations btrfs_symlink_inode_operations = {
5946         .readlink       = generic_readlink,
5947         .follow_link    = page_follow_link_light,
5948         .put_link       = page_put_link,
5949         .permission     = btrfs_permission,
5950         .setxattr       = btrfs_setxattr,
5951         .getxattr       = btrfs_getxattr,
5952         .listxattr      = btrfs_listxattr,
5953         .removexattr    = btrfs_removexattr,
5954 };
5955
5956 const struct dentry_operations btrfs_dentry_operations = {
5957         .d_delete       = btrfs_dentry_delete,
5958 };