Btrfs: add new defrag-range ioctl.
[linux-2.6.git] / fs / btrfs / ioctl.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/fsnotify.h>
25 #include <linux/pagemap.h>
26 #include <linux/highmem.h>
27 #include <linux/time.h>
28 #include <linux/init.h>
29 #include <linux/string.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mount.h>
32 #include <linux/mpage.h>
33 #include <linux/namei.h>
34 #include <linux/swap.h>
35 #include <linux/writeback.h>
36 #include <linux/statfs.h>
37 #include <linux/compat.h>
38 #include <linux/bit_spinlock.h>
39 #include <linux/security.h>
40 #include <linux/xattr.h>
41 #include <linux/vmalloc.h>
42 #include "compat.h"
43 #include "ctree.h"
44 #include "disk-io.h"
45 #include "transaction.h"
46 #include "btrfs_inode.h"
47 #include "ioctl.h"
48 #include "print-tree.h"
49 #include "volumes.h"
50 #include "locking.h"
51 #include "ctree.h"
52
53 /* Mask out flags that are inappropriate for the given type of inode. */
54 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
55 {
56         if (S_ISDIR(mode))
57                 return flags;
58         else if (S_ISREG(mode))
59                 return flags & ~FS_DIRSYNC_FL;
60         else
61                 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
62 }
63
64 /*
65  * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
66  */
67 static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
68 {
69         unsigned int iflags = 0;
70
71         if (flags & BTRFS_INODE_SYNC)
72                 iflags |= FS_SYNC_FL;
73         if (flags & BTRFS_INODE_IMMUTABLE)
74                 iflags |= FS_IMMUTABLE_FL;
75         if (flags & BTRFS_INODE_APPEND)
76                 iflags |= FS_APPEND_FL;
77         if (flags & BTRFS_INODE_NODUMP)
78                 iflags |= FS_NODUMP_FL;
79         if (flags & BTRFS_INODE_NOATIME)
80                 iflags |= FS_NOATIME_FL;
81         if (flags & BTRFS_INODE_DIRSYNC)
82                 iflags |= FS_DIRSYNC_FL;
83
84         return iflags;
85 }
86
87 /*
88  * Update inode->i_flags based on the btrfs internal flags.
89  */
90 void btrfs_update_iflags(struct inode *inode)
91 {
92         struct btrfs_inode *ip = BTRFS_I(inode);
93
94         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
95
96         if (ip->flags & BTRFS_INODE_SYNC)
97                 inode->i_flags |= S_SYNC;
98         if (ip->flags & BTRFS_INODE_IMMUTABLE)
99                 inode->i_flags |= S_IMMUTABLE;
100         if (ip->flags & BTRFS_INODE_APPEND)
101                 inode->i_flags |= S_APPEND;
102         if (ip->flags & BTRFS_INODE_NOATIME)
103                 inode->i_flags |= S_NOATIME;
104         if (ip->flags & BTRFS_INODE_DIRSYNC)
105                 inode->i_flags |= S_DIRSYNC;
106 }
107
108 /*
109  * Inherit flags from the parent inode.
110  *
111  * Unlike extN we don't have any flags we don't want to inherit currently.
112  */
113 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
114 {
115         unsigned int flags;
116
117         if (!dir)
118                 return;
119
120         flags = BTRFS_I(dir)->flags;
121
122         if (S_ISREG(inode->i_mode))
123                 flags &= ~BTRFS_INODE_DIRSYNC;
124         else if (!S_ISDIR(inode->i_mode))
125                 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
126
127         BTRFS_I(inode)->flags = flags;
128         btrfs_update_iflags(inode);
129 }
130
131 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
132 {
133         struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
134         unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
135
136         if (copy_to_user(arg, &flags, sizeof(flags)))
137                 return -EFAULT;
138         return 0;
139 }
140
141 static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142 {
143         struct inode *inode = file->f_path.dentry->d_inode;
144         struct btrfs_inode *ip = BTRFS_I(inode);
145         struct btrfs_root *root = ip->root;
146         struct btrfs_trans_handle *trans;
147         unsigned int flags, oldflags;
148         int ret;
149
150         if (copy_from_user(&flags, arg, sizeof(flags)))
151                 return -EFAULT;
152
153         if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
154                       FS_NOATIME_FL | FS_NODUMP_FL | \
155                       FS_SYNC_FL | FS_DIRSYNC_FL))
156                 return -EOPNOTSUPP;
157
158         if (!is_owner_or_cap(inode))
159                 return -EACCES;
160
161         mutex_lock(&inode->i_mutex);
162
163         flags = btrfs_mask_flags(inode->i_mode, flags);
164         oldflags = btrfs_flags_to_ioctl(ip->flags);
165         if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
166                 if (!capable(CAP_LINUX_IMMUTABLE)) {
167                         ret = -EPERM;
168                         goto out_unlock;
169                 }
170         }
171
172         ret = mnt_want_write(file->f_path.mnt);
173         if (ret)
174                 goto out_unlock;
175
176         if (flags & FS_SYNC_FL)
177                 ip->flags |= BTRFS_INODE_SYNC;
178         else
179                 ip->flags &= ~BTRFS_INODE_SYNC;
180         if (flags & FS_IMMUTABLE_FL)
181                 ip->flags |= BTRFS_INODE_IMMUTABLE;
182         else
183                 ip->flags &= ~BTRFS_INODE_IMMUTABLE;
184         if (flags & FS_APPEND_FL)
185                 ip->flags |= BTRFS_INODE_APPEND;
186         else
187                 ip->flags &= ~BTRFS_INODE_APPEND;
188         if (flags & FS_NODUMP_FL)
189                 ip->flags |= BTRFS_INODE_NODUMP;
190         else
191                 ip->flags &= ~BTRFS_INODE_NODUMP;
192         if (flags & FS_NOATIME_FL)
193                 ip->flags |= BTRFS_INODE_NOATIME;
194         else
195                 ip->flags &= ~BTRFS_INODE_NOATIME;
196         if (flags & FS_DIRSYNC_FL)
197                 ip->flags |= BTRFS_INODE_DIRSYNC;
198         else
199                 ip->flags &= ~BTRFS_INODE_DIRSYNC;
200
201
202         trans = btrfs_join_transaction(root, 1);
203         BUG_ON(!trans);
204
205         ret = btrfs_update_inode(trans, root, inode);
206         BUG_ON(ret);
207
208         btrfs_update_iflags(inode);
209         inode->i_ctime = CURRENT_TIME;
210         btrfs_end_transaction(trans, root);
211
212         mnt_drop_write(file->f_path.mnt);
213  out_unlock:
214         mutex_unlock(&inode->i_mutex);
215         return 0;
216 }
217
218 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
219 {
220         struct inode *inode = file->f_path.dentry->d_inode;
221
222         return put_user(inode->i_generation, arg);
223 }
224
225 static noinline int create_subvol(struct btrfs_root *root,
226                                   struct dentry *dentry,
227                                   char *name, int namelen)
228 {
229         struct btrfs_trans_handle *trans;
230         struct btrfs_key key;
231         struct btrfs_root_item root_item;
232         struct btrfs_inode_item *inode_item;
233         struct extent_buffer *leaf;
234         struct btrfs_root *new_root;
235         struct inode *dir = dentry->d_parent->d_inode;
236         int ret;
237         int err;
238         u64 objectid;
239         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240         u64 index = 0;
241
242         /*
243          * 1 - inode item
244          * 2 - refs
245          * 1 - root item
246          * 2 - dir items
247          */
248         ret = btrfs_reserve_metadata_space(root, 6);
249         if (ret)
250                 return ret;
251
252         trans = btrfs_start_transaction(root, 1);
253         BUG_ON(!trans);
254
255         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256                                        0, &objectid);
257         if (ret)
258                 goto fail;
259
260         leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261                                       0, objectid, NULL, 0, 0, 0);
262         if (IS_ERR(leaf)) {
263                 ret = PTR_ERR(leaf);
264                 goto fail;
265         }
266
267         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
268         btrfs_set_header_bytenr(leaf, leaf->start);
269         btrfs_set_header_generation(leaf, trans->transid);
270         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
271         btrfs_set_header_owner(leaf, objectid);
272
273         write_extent_buffer(leaf, root->fs_info->fsid,
274                             (unsigned long)btrfs_header_fsid(leaf),
275                             BTRFS_FSID_SIZE);
276         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
277                             (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
278                             BTRFS_UUID_SIZE);
279         btrfs_mark_buffer_dirty(leaf);
280
281         inode_item = &root_item.inode;
282         memset(inode_item, 0, sizeof(*inode_item));
283         inode_item->generation = cpu_to_le64(1);
284         inode_item->size = cpu_to_le64(3);
285         inode_item->nlink = cpu_to_le32(1);
286         inode_item->nbytes = cpu_to_le64(root->leafsize);
287         inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
288
289         btrfs_set_root_bytenr(&root_item, leaf->start);
290         btrfs_set_root_generation(&root_item, trans->transid);
291         btrfs_set_root_level(&root_item, 0);
292         btrfs_set_root_refs(&root_item, 1);
293         btrfs_set_root_used(&root_item, leaf->len);
294         btrfs_set_root_last_snapshot(&root_item, 0);
295
296         memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
297         root_item.drop_level = 0;
298
299         btrfs_tree_unlock(leaf);
300         free_extent_buffer(leaf);
301         leaf = NULL;
302
303         btrfs_set_root_dirid(&root_item, new_dirid);
304
305         key.objectid = objectid;
306         key.offset = 0;
307         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
308         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
309                                 &root_item);
310         if (ret)
311                 goto fail;
312
313         key.offset = (u64)-1;
314         new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
315         BUG_ON(IS_ERR(new_root));
316
317         btrfs_record_root_in_trans(trans, new_root);
318
319         ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
320                                        BTRFS_I(dir)->block_group);
321         /*
322          * insert the directory item
323          */
324         ret = btrfs_set_inode_index(dir, &index);
325         BUG_ON(ret);
326
327         ret = btrfs_insert_dir_item(trans, root,
328                                     name, namelen, dir->i_ino, &key,
329                                     BTRFS_FT_DIR, index);
330         if (ret)
331                 goto fail;
332
333         btrfs_i_size_write(dir, dir->i_size + namelen * 2);
334         ret = btrfs_update_inode(trans, root, dir);
335         BUG_ON(ret);
336
337         ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
338                                  objectid, root->root_key.objectid,
339                                  dir->i_ino, index, name, namelen);
340
341         BUG_ON(ret);
342
343         d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
344 fail:
345         err = btrfs_commit_transaction(trans, root);
346         if (err && !ret)
347                 ret = err;
348
349         btrfs_unreserve_metadata_space(root, 6);
350         return ret;
351 }
352
353 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354                            char *name, int namelen)
355 {
356         struct inode *inode;
357         struct btrfs_pending_snapshot *pending_snapshot;
358         struct btrfs_trans_handle *trans;
359         int ret;
360
361         if (!root->ref_cows)
362                 return -EINVAL;
363
364         /*
365          * 1 - inode item
366          * 2 - refs
367          * 1 - root item
368          * 2 - dir items
369          */
370         ret = btrfs_reserve_metadata_space(root, 6);
371         if (ret)
372                 goto fail;
373
374         pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375         if (!pending_snapshot) {
376                 ret = -ENOMEM;
377                 btrfs_unreserve_metadata_space(root, 6);
378                 goto fail;
379         }
380         pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381         if (!pending_snapshot->name) {
382                 ret = -ENOMEM;
383                 kfree(pending_snapshot);
384                 btrfs_unreserve_metadata_space(root, 6);
385                 goto fail;
386         }
387         memcpy(pending_snapshot->name, name, namelen);
388         pending_snapshot->name[namelen] = '\0';
389         pending_snapshot->dentry = dentry;
390         trans = btrfs_start_transaction(root, 1);
391         BUG_ON(!trans);
392         pending_snapshot->root = root;
393         list_add(&pending_snapshot->list,
394                  &trans->transaction->pending_snapshots);
395         ret = btrfs_commit_transaction(trans, root);
396         BUG_ON(ret);
397         btrfs_unreserve_metadata_space(root, 6);
398
399         inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400         if (IS_ERR(inode)) {
401                 ret = PTR_ERR(inode);
402                 goto fail;
403         }
404         BUG_ON(!inode);
405         d_instantiate(dentry, inode);
406         ret = 0;
407 fail:
408         return ret;
409 }
410
411 /* copy of may_create in fs/namei.c() */
412 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
413 {
414         if (child->d_inode)
415                 return -EEXIST;
416         if (IS_DEADDIR(dir))
417                 return -ENOENT;
418         return inode_permission(dir, MAY_WRITE | MAY_EXEC);
419 }
420
421 /*
422  * Create a new subvolume below @parent.  This is largely modeled after
423  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
424  * inside this filesystem so it's quite a bit simpler.
425  */
426 static noinline int btrfs_mksubvol(struct path *parent,
427                                    char *name, int namelen,
428                                    struct btrfs_root *snap_src)
429 {
430         struct inode *dir  = parent->dentry->d_inode;
431         struct dentry *dentry;
432         int error;
433
434         mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
435
436         dentry = lookup_one_len(name, parent->dentry, namelen);
437         error = PTR_ERR(dentry);
438         if (IS_ERR(dentry))
439                 goto out_unlock;
440
441         error = -EEXIST;
442         if (dentry->d_inode)
443                 goto out_dput;
444
445         error = mnt_want_write(parent->mnt);
446         if (error)
447                 goto out_dput;
448
449         error = btrfs_may_create(dir, dentry);
450         if (error)
451                 goto out_drop_write;
452
453         down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
454
455         if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
456                 goto out_up_read;
457
458         if (snap_src) {
459                 error = create_snapshot(snap_src, dentry,
460                                         name, namelen);
461         } else {
462                 error = create_subvol(BTRFS_I(dir)->root, dentry,
463                                       name, namelen);
464         }
465         if (!error)
466                 fsnotify_mkdir(dir, dentry);
467 out_up_read:
468         up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
469 out_drop_write:
470         mnt_drop_write(parent->mnt);
471 out_dput:
472         dput(dentry);
473 out_unlock:
474         mutex_unlock(&dir->i_mutex);
475         return error;
476 }
477
478 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479                                int thresh, u64 *last_len, u64 *skip,
480                                u64 *defrag_end)
481 {
482         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483         struct extent_map *em = NULL;
484         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485         int ret = 1;
486
487
488         if (thresh == 0)
489                 thresh = 256 * 1024;
490
491         /*
492          * make sure that once we start defragging and extent, we keep on
493          * defragging it
494          */
495         if (start < *defrag_end)
496                 return 1;
497
498         *skip = 0;
499
500         /*
501          * hopefully we have this extent in the tree already, try without
502          * the full extent lock
503          */
504         read_lock(&em_tree->lock);
505         em = lookup_extent_mapping(em_tree, start, len);
506         read_unlock(&em_tree->lock);
507
508         if (!em) {
509                 /* get the big lock and read metadata off disk */
510                 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511                 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512                 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514                 if (!em)
515                         return 0;
516         }
517
518         /* this will cover holes, and inline extents */
519         if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520                 ret = 0;
521
522         /*
523          * we hit a real extent, if it is big don't bother defragging it again
524          */
525         if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526                 ret = 0;
527
528         /*
529          * last_len ends up being a counter of how many bytes we've defragged.
530          * every time we choose not to defrag an extent, we reset *last_len
531          * so that the next tiny extent will force a defrag.
532          *
533          * The end result of this is that tiny extents before a single big
534          * extent will force at least part of that big extent to be defragged.
535          */
536         if (ret) {
537                 *last_len += len;
538                 *defrag_end = extent_map_end(em);
539         } else {
540                 *last_len = 0;
541                 *skip = extent_map_end(em);
542                 *defrag_end = 0;
543         }
544
545         free_extent_map(em);
546         return ret;
547 }
548
549 static int btrfs_defrag_file(struct file *file,
550                              struct btrfs_ioctl_defrag_range_args *range)
551 {
552         struct inode *inode = fdentry(file)->d_inode;
553         struct btrfs_root *root = BTRFS_I(inode)->root;
554         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
555         struct btrfs_ordered_extent *ordered;
556         struct page *page;
557         unsigned long last_index;
558         unsigned long ra_pages = root->fs_info->bdi.ra_pages;
559         unsigned long total_read = 0;
560         u64 page_start;
561         u64 page_end;
562         u64 last_len = 0;
563         u64 skip = 0;
564         u64 defrag_end = 0;
565         unsigned long i;
566         int ret;
567
568         if (inode->i_size == 0)
569                 return 0;
570
571         if (range->start + range->len > range->start) {
572                 last_index = min_t(u64, inode->i_size - 1,
573                          range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574         } else {
575                 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576         }
577
578         i = range->start >> PAGE_CACHE_SHIFT;
579         while (i <= last_index) {
580                 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581                                         PAGE_CACHE_SIZE,
582                                         range->extent_thresh,
583                                         &last_len, &skip,
584                                         &defrag_end)) {
585                         unsigned long next;
586                         /*
587                          * the should_defrag function tells us how much to skip
588                          * bump our counter by the suggested amount
589                          */
590                         next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591                         i = max(i + 1, next);
592                         continue;
593                 }
594
595                 if (total_read % ra_pages == 0) {
596                         btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
597                                        min(last_index, i + ra_pages - 1));
598                 }
599                 total_read++;
600                 mutex_lock(&inode->i_mutex);
601                 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602                         BTRFS_I(inode)->force_compress = 1;
603
604                 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605                 if (ret) {
606                         ret = -ENOSPC;
607                         break;
608                 }
609
610                 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611                 if (ret) {
612                         btrfs_free_reserved_data_space(root, inode,
613                                                        PAGE_CACHE_SIZE);
614                         ret = -ENOSPC;
615                         break;
616                 }
617 again:
618                 if (inode->i_size == 0 ||
619                     i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620                         ret = 0;
621                         goto err_reservations;
622                 }
623
624                 page = grab_cache_page(inode->i_mapping, i);
625                 if (!page)
626                         goto err_reservations;
627
628                 if (!PageUptodate(page)) {
629                         btrfs_readpage(NULL, page);
630                         lock_page(page);
631                         if (!PageUptodate(page)) {
632                                 unlock_page(page);
633                                 page_cache_release(page);
634                                 goto err_reservations;
635                         }
636                 }
637
638                 if (page->mapping != inode->i_mapping) {
639                         unlock_page(page);
640                         page_cache_release(page);
641                         goto again;
642                 }
643
644                 wait_on_page_writeback(page);
645
646                 if (PageDirty(page)) {
647                         btrfs_free_reserved_data_space(root, inode,
648                                                        PAGE_CACHE_SIZE);
649                         goto loop_unlock;
650                 }
651
652                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
653                 page_end = page_start + PAGE_CACHE_SIZE - 1;
654                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
655
656                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
657                 if (ordered) {
658                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
659                         unlock_page(page);
660                         page_cache_release(page);
661                         btrfs_start_ordered_extent(inode, ordered, 1);
662                         btrfs_put_ordered_extent(ordered);
663                         goto again;
664                 }
665                 set_page_extent_mapped(page);
666
667                 /*
668                  * this makes sure page_mkwrite is called on the
669                  * page if it is dirtied again later
670                  */
671                 clear_page_dirty_for_io(page);
672                 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673                                   page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674                                   EXTENT_DO_ACCOUNTING, GFP_NOFS);
675
676                 btrfs_set_extent_delalloc(inode, page_start, page_end);
677                 ClearPageChecked(page);
678                 set_page_dirty(page);
679                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681 loop_unlock:
682                 unlock_page(page);
683                 page_cache_release(page);
684                 mutex_unlock(&inode->i_mutex);
685
686                 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687                 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688                 i++;
689         }
690
691         if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692                 filemap_flush(inode->i_mapping);
693
694         if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695                 /* the filemap_flush will queue IO into the worker threads, but
696                  * we have to make sure the IO is actually started and that
697                  * ordered extents get created before we return
698                  */
699                 atomic_inc(&root->fs_info->async_submit_draining);
700                 while (atomic_read(&root->fs_info->nr_async_submits) ||
701                       atomic_read(&root->fs_info->async_delalloc_pages)) {
702                         wait_event(root->fs_info->async_submit_wait,
703                            (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704                             atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705                 }
706                 atomic_dec(&root->fs_info->async_submit_draining);
707
708                 mutex_lock(&inode->i_mutex);
709                 BTRFS_I(inode)->force_compress = 0;
710                 mutex_unlock(&inode->i_mutex);
711         }
712
713         return 0;
714
715 err_reservations:
716         mutex_unlock(&inode->i_mutex);
717         btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718         btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719         return ret;
720 }
721
722 static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
723                                         void __user *arg)
724 {
725         u64 new_size;
726         u64 old_size;
727         u64 devid = 1;
728         struct btrfs_ioctl_vol_args *vol_args;
729         struct btrfs_trans_handle *trans;
730         struct btrfs_device *device = NULL;
731         char *sizestr;
732         char *devstr = NULL;
733         int ret = 0;
734         int namelen;
735         int mod = 0;
736
737         if (root->fs_info->sb->s_flags & MS_RDONLY)
738                 return -EROFS;
739
740         if (!capable(CAP_SYS_ADMIN))
741                 return -EPERM;
742
743         vol_args = memdup_user(arg, sizeof(*vol_args));
744         if (IS_ERR(vol_args))
745                 return PTR_ERR(vol_args);
746
747         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
748         namelen = strlen(vol_args->name);
749
750         mutex_lock(&root->fs_info->volume_mutex);
751         sizestr = vol_args->name;
752         devstr = strchr(sizestr, ':');
753         if (devstr) {
754                 char *end;
755                 sizestr = devstr + 1;
756                 *devstr = '\0';
757                 devstr = vol_args->name;
758                 devid = simple_strtoull(devstr, &end, 10);
759                 printk(KERN_INFO "resizing devid %llu\n",
760                        (unsigned long long)devid);
761         }
762         device = btrfs_find_device(root, devid, NULL, NULL);
763         if (!device) {
764                 printk(KERN_INFO "resizer unable to find device %llu\n",
765                        (unsigned long long)devid);
766                 ret = -EINVAL;
767                 goto out_unlock;
768         }
769         if (!strcmp(sizestr, "max"))
770                 new_size = device->bdev->bd_inode->i_size;
771         else {
772                 if (sizestr[0] == '-') {
773                         mod = -1;
774                         sizestr++;
775                 } else if (sizestr[0] == '+') {
776                         mod = 1;
777                         sizestr++;
778                 }
779                 new_size = btrfs_parse_size(sizestr);
780                 if (new_size == 0) {
781                         ret = -EINVAL;
782                         goto out_unlock;
783                 }
784         }
785
786         old_size = device->total_bytes;
787
788         if (mod < 0) {
789                 if (new_size > old_size) {
790                         ret = -EINVAL;
791                         goto out_unlock;
792                 }
793                 new_size = old_size - new_size;
794         } else if (mod > 0) {
795                 new_size = old_size + new_size;
796         }
797
798         if (new_size < 256 * 1024 * 1024) {
799                 ret = -EINVAL;
800                 goto out_unlock;
801         }
802         if (new_size > device->bdev->bd_inode->i_size) {
803                 ret = -EFBIG;
804                 goto out_unlock;
805         }
806
807         do_div(new_size, root->sectorsize);
808         new_size *= root->sectorsize;
809
810         printk(KERN_INFO "new size for %s is %llu\n",
811                 device->name, (unsigned long long)new_size);
812
813         if (new_size > old_size) {
814                 trans = btrfs_start_transaction(root, 1);
815                 ret = btrfs_grow_device(trans, device, new_size);
816                 btrfs_commit_transaction(trans, root);
817         } else {
818                 ret = btrfs_shrink_device(device, new_size);
819         }
820
821 out_unlock:
822         mutex_unlock(&root->fs_info->volume_mutex);
823         kfree(vol_args);
824         return ret;
825 }
826
827 static noinline int btrfs_ioctl_snap_create(struct file *file,
828                                             void __user *arg, int subvol)
829 {
830         struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
831         struct btrfs_ioctl_vol_args *vol_args;
832         struct file *src_file;
833         int namelen;
834         int ret = 0;
835
836         if (root->fs_info->sb->s_flags & MS_RDONLY)
837                 return -EROFS;
838
839         vol_args = memdup_user(arg, sizeof(*vol_args));
840         if (IS_ERR(vol_args))
841                 return PTR_ERR(vol_args);
842
843         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
844         namelen = strlen(vol_args->name);
845         if (strchr(vol_args->name, '/')) {
846                 ret = -EINVAL;
847                 goto out;
848         }
849
850         if (subvol) {
851                 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
852                                      NULL);
853         } else {
854                 struct inode *src_inode;
855                 src_file = fget(vol_args->fd);
856                 if (!src_file) {
857                         ret = -EINVAL;
858                         goto out;
859                 }
860
861                 src_inode = src_file->f_path.dentry->d_inode;
862                 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
863                         printk(KERN_INFO "btrfs: Snapshot src from "
864                                "another FS\n");
865                         ret = -EINVAL;
866                         fput(src_file);
867                         goto out;
868                 }
869                 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
870                                      BTRFS_I(src_inode)->root);
871                 fput(src_file);
872         }
873 out:
874         kfree(vol_args);
875         return ret;
876 }
877
878 /*
879  * helper to check if the subvolume references other subvolumes
880  */
881 static noinline int may_destroy_subvol(struct btrfs_root *root)
882 {
883         struct btrfs_path *path;
884         struct btrfs_key key;
885         int ret;
886
887         path = btrfs_alloc_path();
888         if (!path)
889                 return -ENOMEM;
890
891         key.objectid = root->root_key.objectid;
892         key.type = BTRFS_ROOT_REF_KEY;
893         key.offset = (u64)-1;
894
895         ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
896                                 &key, path, 0, 0);
897         if (ret < 0)
898                 goto out;
899         BUG_ON(ret == 0);
900
901         ret = 0;
902         if (path->slots[0] > 0) {
903                 path->slots[0]--;
904                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
905                 if (key.objectid == root->root_key.objectid &&
906                     key.type == BTRFS_ROOT_REF_KEY)
907                         ret = -ENOTEMPTY;
908         }
909 out:
910         btrfs_free_path(path);
911         return ret;
912 }
913
914 static noinline int key_in_sk(struct btrfs_key *key,
915                               struct btrfs_ioctl_search_key *sk)
916 {
917         if (key->objectid < sk->min_objectid)
918                 return 0;
919         if (key->offset < sk->min_offset)
920                 return 0;
921         if (key->type < sk->min_type)
922                 return 0;
923         if (key->objectid > sk->max_objectid)
924                 return 0;
925         if (key->type > sk->max_type)
926                 return 0;
927         if (key->offset > sk->max_offset)
928                 return 0;
929         return 1;
930 }
931
932 static noinline int copy_to_sk(struct btrfs_root *root,
933                                struct btrfs_path *path,
934                                struct btrfs_key *key,
935                                struct btrfs_ioctl_search_key *sk,
936                                char *buf,
937                                unsigned long *sk_offset,
938                                int *num_found)
939 {
940         u64 found_transid;
941         struct extent_buffer *leaf;
942         struct btrfs_ioctl_search_header sh;
943         unsigned long item_off;
944         unsigned long item_len;
945         int nritems;
946         int i;
947         int slot;
948         int found = 0;
949         int ret = 0;
950
951         leaf = path->nodes[0];
952         slot = path->slots[0];
953         nritems = btrfs_header_nritems(leaf);
954
955         if (btrfs_header_generation(leaf) > sk->max_transid) {
956                 i = nritems;
957                 goto advance_key;
958         }
959         found_transid = btrfs_header_generation(leaf);
960
961         for (i = slot; i < nritems; i++) {
962                 item_off = btrfs_item_ptr_offset(leaf, i);
963                 item_len = btrfs_item_size_nr(leaf, i);
964
965                 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
966                         item_len = 0;
967
968                 if (sizeof(sh) + item_len + *sk_offset >
969                     BTRFS_SEARCH_ARGS_BUFSIZE) {
970                         ret = 1;
971                         goto overflow;
972                 }
973
974                 btrfs_item_key_to_cpu(leaf, key, i);
975                 if (!key_in_sk(key, sk))
976                         continue;
977
978                 sh.objectid = key->objectid;
979                 sh.offset = key->offset;
980                 sh.type = key->type;
981                 sh.len = item_len;
982                 sh.transid = found_transid;
983
984                 /* copy search result header */
985                 memcpy(buf + *sk_offset, &sh, sizeof(sh));
986                 *sk_offset += sizeof(sh);
987
988                 if (item_len) {
989                         char *p = buf + *sk_offset;
990                         /* copy the item */
991                         read_extent_buffer(leaf, p,
992                                            item_off, item_len);
993                         *sk_offset += item_len;
994                         found++;
995                 }
996
997                 if (*num_found >= sk->nr_items)
998                         break;
999         }
1000 advance_key:
1001         if (key->offset < (u64)-1)
1002                 key->offset++;
1003         else if (key->type < (u64)-1)
1004                 key->type++;
1005         else if (key->objectid < (u64)-1)
1006                 key->objectid++;
1007         ret = 0;
1008 overflow:
1009         *num_found += found;
1010         return ret;
1011 }
1012
1013 static noinline int search_ioctl(struct inode *inode,
1014                                  struct btrfs_ioctl_search_args *args)
1015 {
1016         struct btrfs_root *root;
1017         struct btrfs_key key;
1018         struct btrfs_key max_key;
1019         struct btrfs_path *path;
1020         struct btrfs_ioctl_search_key *sk = &args->key;
1021         struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1022         int ret;
1023         int num_found = 0;
1024         unsigned long sk_offset = 0;
1025
1026         path = btrfs_alloc_path();
1027         if (!path)
1028                 return -ENOMEM;
1029
1030         if (sk->tree_id == 0) {
1031                 /* search the root of the inode that was passed */
1032                 root = BTRFS_I(inode)->root;
1033         } else {
1034                 key.objectid = sk->tree_id;
1035                 key.type = BTRFS_ROOT_ITEM_KEY;
1036                 key.offset = (u64)-1;
1037                 root = btrfs_read_fs_root_no_name(info, &key);
1038                 if (IS_ERR(root)) {
1039                         printk(KERN_ERR "could not find root %llu\n",
1040                                sk->tree_id);
1041                         btrfs_free_path(path);
1042                         return -ENOENT;
1043                 }
1044         }
1045
1046         key.objectid = sk->min_objectid;
1047         key.type = sk->min_type;
1048         key.offset = sk->min_offset;
1049
1050         max_key.objectid = sk->max_objectid;
1051         max_key.type = sk->max_type;
1052         max_key.offset = sk->max_offset;
1053
1054         path->keep_locks = 1;
1055
1056         while(1) {
1057                 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1058                                            sk->min_transid);
1059                 if (ret != 0) {
1060                         if (ret > 0)
1061                                 ret = 0;
1062                         goto err;
1063                 }
1064                 ret = copy_to_sk(root, path, &key, sk, args->buf,
1065                                  &sk_offset, &num_found);
1066                 btrfs_release_path(root, path);
1067                 if (ret || num_found >= sk->nr_items)
1068                         break;
1069
1070         }
1071         ret = 0;
1072 err:
1073         sk->nr_items = num_found;
1074         btrfs_free_path(path);
1075         return ret;
1076 }
1077
1078 static noinline int btrfs_ioctl_tree_search(struct file *file,
1079                                            void __user *argp)
1080 {
1081          struct btrfs_ioctl_search_args *args;
1082          struct inode *inode;
1083          int ret;
1084
1085         if (!capable(CAP_SYS_ADMIN))
1086                 return -EPERM;
1087
1088         args = kmalloc(sizeof(*args), GFP_KERNEL);
1089         if (!args)
1090                 return -ENOMEM;
1091
1092         if (copy_from_user(args, argp, sizeof(*args))) {
1093                 kfree(args);
1094                 return -EFAULT;
1095         }
1096         inode = fdentry(file)->d_inode;
1097         ret = search_ioctl(inode, args);
1098         if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1099                 ret = -EFAULT;
1100         kfree(args);
1101         return ret;
1102 }
1103
1104 /*
1105  * Search INODE_REFs to identify path name of 'dirid' directory
1106  * in a 'tree_id' tree. and sets path name to 'name'.
1107  */
1108 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1109                                 u64 tree_id, u64 dirid, char *name)
1110 {
1111         struct btrfs_root *root;
1112         struct btrfs_key key;
1113         char *ptr;
1114         int ret = -1;
1115         int slot;
1116         int len;
1117         int total_len = 0;
1118         struct btrfs_inode_ref *iref;
1119         struct extent_buffer *l;
1120         struct btrfs_path *path;
1121
1122         if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1123                 name[0]='\0';
1124                 return 0;
1125         }
1126
1127         path = btrfs_alloc_path();
1128         if (!path)
1129                 return -ENOMEM;
1130
1131         ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1132
1133         key.objectid = tree_id;
1134         key.type = BTRFS_ROOT_ITEM_KEY;
1135         key.offset = (u64)-1;
1136         root = btrfs_read_fs_root_no_name(info, &key);
1137         if (IS_ERR(root)) {
1138                 printk(KERN_ERR "could not find root %llu\n", tree_id);
1139                 return -ENOENT;
1140         }
1141
1142         key.objectid = dirid;
1143         key.type = BTRFS_INODE_REF_KEY;
1144         key.offset = 0;
1145
1146         while(1) {
1147                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1148                 if (ret < 0)
1149                         goto out;
1150
1151                 l = path->nodes[0];
1152                 slot = path->slots[0];
1153                 btrfs_item_key_to_cpu(l, &key, slot);
1154
1155                 if (ret > 0 && (key.objectid != dirid ||
1156                                 key.type != BTRFS_INODE_REF_KEY)) {
1157                         ret = -ENOENT;
1158                         goto out;
1159                 }
1160
1161                 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1162                 len = btrfs_inode_ref_name_len(l, iref);
1163                 ptr -= len + 1;
1164                 total_len += len + 1;
1165                 if (ptr < name)
1166                         goto out;
1167
1168                 *(ptr + len) = '/';
1169                 read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
1170
1171                 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1172                         break;
1173
1174                 btrfs_release_path(root, path);
1175                 key.objectid = key.offset;
1176                 key.offset = 0;
1177                 dirid = key.objectid;
1178
1179         }
1180         if (ptr < name)
1181                 goto out;
1182         memcpy(name, ptr, total_len);
1183         name[total_len]='\0';
1184         ret = 0;
1185 out:
1186         btrfs_free_path(path);
1187         return ret;
1188 }
1189
1190 static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1191                                            void __user *argp)
1192 {
1193          struct btrfs_ioctl_ino_lookup_args *args;
1194          struct inode *inode;
1195          int ret;
1196
1197         if (!capable(CAP_SYS_ADMIN))
1198                 return -EPERM;
1199
1200         args = kmalloc(sizeof(*args), GFP_KERNEL);
1201         if (copy_from_user(args, argp, sizeof(*args))) {
1202                 kfree(args);
1203                 return -EFAULT;
1204         }
1205         inode = fdentry(file)->d_inode;
1206
1207         ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1208                                         args->treeid, args->objectid,
1209                                         args->name);
1210
1211         if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1212                 ret = -EFAULT;
1213
1214         kfree(args);
1215         return ret;
1216 }
1217
1218 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1219                                              void __user *arg)
1220 {
1221         struct dentry *parent = fdentry(file);
1222         struct dentry *dentry;
1223         struct inode *dir = parent->d_inode;
1224         struct inode *inode;
1225         struct btrfs_root *root = BTRFS_I(dir)->root;
1226         struct btrfs_root *dest = NULL;
1227         struct btrfs_ioctl_vol_args *vol_args;
1228         struct btrfs_trans_handle *trans;
1229         int namelen;
1230         int ret;
1231         int err = 0;
1232
1233         if (!capable(CAP_SYS_ADMIN))
1234                 return -EPERM;
1235
1236         vol_args = memdup_user(arg, sizeof(*vol_args));
1237         if (IS_ERR(vol_args))
1238                 return PTR_ERR(vol_args);
1239
1240         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1241         namelen = strlen(vol_args->name);
1242         if (strchr(vol_args->name, '/') ||
1243             strncmp(vol_args->name, "..", namelen) == 0) {
1244                 err = -EINVAL;
1245                 goto out;
1246         }
1247
1248         err = mnt_want_write(file->f_path.mnt);
1249         if (err)
1250                 goto out;
1251
1252         mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
1253         dentry = lookup_one_len(vol_args->name, parent, namelen);
1254         if (IS_ERR(dentry)) {
1255                 err = PTR_ERR(dentry);
1256                 goto out_unlock_dir;
1257         }
1258
1259         if (!dentry->d_inode) {
1260                 err = -ENOENT;
1261                 goto out_dput;
1262         }
1263
1264         inode = dentry->d_inode;
1265         if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
1266                 err = -EINVAL;
1267                 goto out_dput;
1268         }
1269
1270         dest = BTRFS_I(inode)->root;
1271
1272         mutex_lock(&inode->i_mutex);
1273         err = d_invalidate(dentry);
1274         if (err)
1275                 goto out_unlock;
1276
1277         down_write(&root->fs_info->subvol_sem);
1278
1279         err = may_destroy_subvol(dest);
1280         if (err)
1281                 goto out_up_write;
1282
1283         trans = btrfs_start_transaction(root, 1);
1284         ret = btrfs_unlink_subvol(trans, root, dir,
1285                                 dest->root_key.objectid,
1286                                 dentry->d_name.name,
1287                                 dentry->d_name.len);
1288         BUG_ON(ret);
1289
1290         btrfs_record_root_in_trans(trans, dest);
1291
1292         memset(&dest->root_item.drop_progress, 0,
1293                 sizeof(dest->root_item.drop_progress));
1294         dest->root_item.drop_level = 0;
1295         btrfs_set_root_refs(&dest->root_item, 0);
1296
1297         ret = btrfs_insert_orphan_item(trans,
1298                                 root->fs_info->tree_root,
1299                                 dest->root_key.objectid);
1300         BUG_ON(ret);
1301
1302         ret = btrfs_commit_transaction(trans, root);
1303         BUG_ON(ret);
1304         inode->i_flags |= S_DEAD;
1305 out_up_write:
1306         up_write(&root->fs_info->subvol_sem);
1307 out_unlock:
1308         mutex_unlock(&inode->i_mutex);
1309         if (!err) {
1310                 shrink_dcache_sb(root->fs_info->sb);
1311                 btrfs_invalidate_inodes(dest);
1312                 d_delete(dentry);
1313         }
1314 out_dput:
1315         dput(dentry);
1316 out_unlock_dir:
1317         mutex_unlock(&dir->i_mutex);
1318         mnt_drop_write(file->f_path.mnt);
1319 out:
1320         kfree(vol_args);
1321         return err;
1322 }
1323
1324 static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1325 {
1326         struct inode *inode = fdentry(file)->d_inode;
1327         struct btrfs_root *root = BTRFS_I(inode)->root;
1328         struct btrfs_ioctl_defrag_range_args *range;
1329         int ret;
1330
1331         ret = mnt_want_write(file->f_path.mnt);
1332         if (ret)
1333                 return ret;
1334
1335         switch (inode->i_mode & S_IFMT) {
1336         case S_IFDIR:
1337                 if (!capable(CAP_SYS_ADMIN)) {
1338                         ret = -EPERM;
1339                         goto out;
1340                 }
1341                 btrfs_defrag_root(root, 0);
1342                 btrfs_defrag_root(root->fs_info->extent_root, 0);
1343                 break;
1344         case S_IFREG:
1345                 if (!(file->f_mode & FMODE_WRITE)) {
1346                         ret = -EINVAL;
1347                         goto out;
1348                 }
1349
1350                 range = kzalloc(sizeof(*range), GFP_KERNEL);
1351                 if (!range) {
1352                         ret = -ENOMEM;
1353                         goto out;
1354                 }
1355
1356                 if (argp) {
1357                         if (copy_from_user(range, argp,
1358                                            sizeof(*range))) {
1359                                 ret = -EFAULT;
1360                                 kfree(range);
1361                         }
1362                         /* compression requires us to start the IO */
1363                         if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1364                                 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1365                                 range->extent_thresh = (u32)-1;
1366                         }
1367                 } else {
1368                         /* the rest are all set to zero by kzalloc */
1369                         range->len = (u64)-1;
1370                 }
1371                 btrfs_defrag_file(file, range);
1372                 kfree(range);
1373                 break;
1374         }
1375 out:
1376         mnt_drop_write(file->f_path.mnt);
1377         return ret;
1378 }
1379
1380 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
1381 {
1382         struct btrfs_ioctl_vol_args *vol_args;
1383         int ret;
1384
1385         if (!capable(CAP_SYS_ADMIN))
1386                 return -EPERM;
1387
1388         vol_args = memdup_user(arg, sizeof(*vol_args));
1389         if (IS_ERR(vol_args))
1390                 return PTR_ERR(vol_args);
1391
1392         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1393         ret = btrfs_init_new_device(root, vol_args->name);
1394
1395         kfree(vol_args);
1396         return ret;
1397 }
1398
1399 static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
1400 {
1401         struct btrfs_ioctl_vol_args *vol_args;
1402         int ret;
1403
1404         if (!capable(CAP_SYS_ADMIN))
1405                 return -EPERM;
1406
1407         if (root->fs_info->sb->s_flags & MS_RDONLY)
1408                 return -EROFS;
1409
1410         vol_args = memdup_user(arg, sizeof(*vol_args));
1411         if (IS_ERR(vol_args))
1412                 return PTR_ERR(vol_args);
1413
1414         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1415         ret = btrfs_rm_device(root, vol_args->name);
1416
1417         kfree(vol_args);
1418         return ret;
1419 }
1420
1421 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1422                                        u64 off, u64 olen, u64 destoff)
1423 {
1424         struct inode *inode = fdentry(file)->d_inode;
1425         struct btrfs_root *root = BTRFS_I(inode)->root;
1426         struct file *src_file;
1427         struct inode *src;
1428         struct btrfs_trans_handle *trans;
1429         struct btrfs_path *path;
1430         struct extent_buffer *leaf;
1431         char *buf;
1432         struct btrfs_key key;
1433         u32 nritems;
1434         int slot;
1435         int ret;
1436         u64 len = olen;
1437         u64 bs = root->fs_info->sb->s_blocksize;
1438         u64 hint_byte;
1439
1440         /*
1441          * TODO:
1442          * - split compressed inline extents.  annoying: we need to
1443          *   decompress into destination's address_space (the file offset
1444          *   may change, so source mapping won't do), then recompress (or
1445          *   otherwise reinsert) a subrange.
1446          * - allow ranges within the same file to be cloned (provided
1447          *   they don't overlap)?
1448          */
1449
1450         /* the destination must be opened for writing */
1451         if (!(file->f_mode & FMODE_WRITE))
1452                 return -EINVAL;
1453
1454         ret = mnt_want_write(file->f_path.mnt);
1455         if (ret)
1456                 return ret;
1457
1458         src_file = fget(srcfd);
1459         if (!src_file) {
1460                 ret = -EBADF;
1461                 goto out_drop_write;
1462         }
1463         src = src_file->f_dentry->d_inode;
1464
1465         ret = -EINVAL;
1466         if (src == inode)
1467                 goto out_fput;
1468
1469         ret = -EISDIR;
1470         if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
1471                 goto out_fput;
1472
1473         ret = -EXDEV;
1474         if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
1475                 goto out_fput;
1476
1477         ret = -ENOMEM;
1478         buf = vmalloc(btrfs_level_size(root, 0));
1479         if (!buf)
1480                 goto out_fput;
1481
1482         path = btrfs_alloc_path();
1483         if (!path) {
1484                 vfree(buf);
1485                 goto out_fput;
1486         }
1487         path->reada = 2;
1488
1489         if (inode < src) {
1490                 mutex_lock(&inode->i_mutex);
1491                 mutex_lock(&src->i_mutex);
1492         } else {
1493                 mutex_lock(&src->i_mutex);
1494                 mutex_lock(&inode->i_mutex);
1495         }
1496
1497         /* determine range to clone */
1498         ret = -EINVAL;
1499         if (off >= src->i_size || off + len > src->i_size)
1500                 goto out_unlock;
1501         if (len == 0)
1502                 olen = len = src->i_size - off;
1503         /* if we extend to eof, continue to block boundary */
1504         if (off + len == src->i_size)
1505                 len = ((src->i_size + bs-1) & ~(bs-1))
1506                         - off;
1507
1508         /* verify the end result is block aligned */
1509         if ((off & (bs-1)) ||
1510             ((off + len) & (bs-1)))
1511                 goto out_unlock;
1512
1513         /* do any pending delalloc/csum calc on src, one way or
1514            another, and lock file content */
1515         while (1) {
1516                 struct btrfs_ordered_extent *ordered;
1517                 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1518                 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
1519                 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
1520                         break;
1521                 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1522                 if (ordered)
1523                         btrfs_put_ordered_extent(ordered);
1524                 btrfs_wait_ordered_range(src, off, off+len);
1525         }
1526
1527         trans = btrfs_start_transaction(root, 1);
1528         BUG_ON(!trans);
1529
1530         /* punch hole in destination first */
1531         btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1532
1533         /* clone data */
1534         key.objectid = src->i_ino;
1535         key.type = BTRFS_EXTENT_DATA_KEY;
1536         key.offset = 0;
1537
1538         while (1) {
1539                 /*
1540                  * note the key will change type as we walk through the
1541                  * tree.
1542                  */
1543                 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
1544                 if (ret < 0)
1545                         goto out;
1546
1547                 nritems = btrfs_header_nritems(path->nodes[0]);
1548                 if (path->slots[0] >= nritems) {
1549                         ret = btrfs_next_leaf(root, path);
1550                         if (ret < 0)
1551                                 goto out;
1552                         if (ret > 0)
1553                                 break;
1554                         nritems = btrfs_header_nritems(path->nodes[0]);
1555                 }
1556                 leaf = path->nodes[0];
1557                 slot = path->slots[0];
1558
1559                 btrfs_item_key_to_cpu(leaf, &key, slot);
1560                 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
1561                     key.objectid != src->i_ino)
1562                         break;
1563
1564                 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
1565                         struct btrfs_file_extent_item *extent;
1566                         int type;
1567                         u32 size;
1568                         struct btrfs_key new_key;
1569                         u64 disko = 0, diskl = 0;
1570                         u64 datao = 0, datal = 0;
1571                         u8 comp;
1572
1573                         size = btrfs_item_size_nr(leaf, slot);
1574                         read_extent_buffer(leaf, buf,
1575                                            btrfs_item_ptr_offset(leaf, slot),
1576                                            size);
1577
1578                         extent = btrfs_item_ptr(leaf, slot,
1579                                                 struct btrfs_file_extent_item);
1580                         comp = btrfs_file_extent_compression(leaf, extent);
1581                         type = btrfs_file_extent_type(leaf, extent);
1582                         if (type == BTRFS_FILE_EXTENT_REG ||
1583                             type == BTRFS_FILE_EXTENT_PREALLOC) {
1584                                 disko = btrfs_file_extent_disk_bytenr(leaf,
1585                                                                       extent);
1586                                 diskl = btrfs_file_extent_disk_num_bytes(leaf,
1587                                                                  extent);
1588                                 datao = btrfs_file_extent_offset(leaf, extent);
1589                                 datal = btrfs_file_extent_num_bytes(leaf,
1590                                                                     extent);
1591                         } else if (type == BTRFS_FILE_EXTENT_INLINE) {
1592                                 /* take upper bound, may be compressed */
1593                                 datal = btrfs_file_extent_ram_bytes(leaf,
1594                                                                     extent);
1595                         }
1596                         btrfs_release_path(root, path);
1597
1598                         if (key.offset + datal < off ||
1599                             key.offset >= off+len)
1600                                 goto next;
1601
1602                         memcpy(&new_key, &key, sizeof(new_key));
1603                         new_key.objectid = inode->i_ino;
1604                         new_key.offset = key.offset + destoff - off;
1605
1606                         if (type == BTRFS_FILE_EXTENT_REG ||
1607                             type == BTRFS_FILE_EXTENT_PREALLOC) {
1608                                 ret = btrfs_insert_empty_item(trans, root, path,
1609                                                               &new_key, size);
1610                                 if (ret)
1611                                         goto out;
1612
1613                                 leaf = path->nodes[0];
1614                                 slot = path->slots[0];
1615                                 write_extent_buffer(leaf, buf,
1616                                             btrfs_item_ptr_offset(leaf, slot),
1617                                             size);
1618
1619                                 extent = btrfs_item_ptr(leaf, slot,
1620                                                 struct btrfs_file_extent_item);
1621
1622                                 if (off > key.offset) {
1623                                         datao += off - key.offset;
1624                                         datal -= off - key.offset;
1625                                 }
1626
1627                                 if (key.offset + datal > off + len)
1628                                         datal = off + len - key.offset;
1629
1630                                 /* disko == 0 means it's a hole */
1631                                 if (!disko)
1632                                         datao = 0;
1633
1634                                 btrfs_set_file_extent_offset(leaf, extent,
1635                                                              datao);
1636                                 btrfs_set_file_extent_num_bytes(leaf, extent,
1637                                                                 datal);
1638                                 if (disko) {
1639                                         inode_add_bytes(inode, datal);
1640                                         ret = btrfs_inc_extent_ref(trans, root,
1641                                                         disko, diskl, 0,
1642                                                         root->root_key.objectid,
1643                                                         inode->i_ino,
1644                                                         new_key.offset - datao);
1645                                         BUG_ON(ret);
1646                                 }
1647                         } else if (type == BTRFS_FILE_EXTENT_INLINE) {
1648                                 u64 skip = 0;
1649                                 u64 trim = 0;
1650                                 if (off > key.offset) {
1651                                         skip = off - key.offset;
1652                                         new_key.offset += skip;
1653                                 }
1654
1655                                 if (key.offset + datal > off+len)
1656                                         trim = key.offset + datal - (off+len);
1657
1658                                 if (comp && (skip || trim)) {
1659                                         ret = -EINVAL;
1660                                         goto out;
1661                                 }
1662                                 size -= skip + trim;
1663                                 datal -= skip + trim;
1664                                 ret = btrfs_insert_empty_item(trans, root, path,
1665                                                               &new_key, size);
1666                                 if (ret)
1667                                         goto out;
1668
1669                                 if (skip) {
1670                                         u32 start =
1671                                           btrfs_file_extent_calc_inline_size(0);
1672                                         memmove(buf+start, buf+start+skip,
1673                                                 datal);
1674                                 }
1675
1676                                 leaf = path->nodes[0];
1677                                 slot = path->slots[0];
1678                                 write_extent_buffer(leaf, buf,
1679                                             btrfs_item_ptr_offset(leaf, slot),
1680                                             size);
1681                                 inode_add_bytes(inode, datal);
1682                         }
1683
1684                         btrfs_mark_buffer_dirty(leaf);
1685                 }
1686
1687 next:
1688                 btrfs_release_path(root, path);
1689                 key.offset++;
1690         }
1691         ret = 0;
1692 out:
1693         btrfs_release_path(root, path);
1694         if (ret == 0) {
1695                 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1696                 if (destoff + olen > inode->i_size)
1697                         btrfs_i_size_write(inode, destoff + olen);
1698                 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1699                 ret = btrfs_update_inode(trans, root, inode);
1700         }
1701         btrfs_end_transaction(trans, root);
1702         unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1703         if (ret)
1704                 vmtruncate(inode, 0);
1705 out_unlock:
1706         mutex_unlock(&src->i_mutex);
1707         mutex_unlock(&inode->i_mutex);
1708         vfree(buf);
1709         btrfs_free_path(path);
1710 out_fput:
1711         fput(src_file);
1712 out_drop_write:
1713         mnt_drop_write(file->f_path.mnt);
1714         return ret;
1715 }
1716
1717 static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1718 {
1719         struct btrfs_ioctl_clone_range_args args;
1720
1721         if (copy_from_user(&args, argp, sizeof(args)))
1722                 return -EFAULT;
1723         return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1724                                  args.src_length, args.dest_offset);
1725 }
1726
1727 /*
1728  * there are many ways the trans_start and trans_end ioctls can lead
1729  * to deadlocks.  They should only be used by applications that
1730  * basically own the machine, and have a very in depth understanding
1731  * of all the possible deadlocks and enospc problems.
1732  */
1733 static long btrfs_ioctl_trans_start(struct file *file)
1734 {
1735         struct inode *inode = fdentry(file)->d_inode;
1736         struct btrfs_root *root = BTRFS_I(inode)->root;
1737         struct btrfs_trans_handle *trans;
1738         int ret;
1739
1740         ret = -EPERM;
1741         if (!capable(CAP_SYS_ADMIN))
1742                 goto out;
1743
1744         ret = -EINPROGRESS;
1745         if (file->private_data)
1746                 goto out;
1747
1748         ret = mnt_want_write(file->f_path.mnt);
1749         if (ret)
1750                 goto out;
1751
1752         mutex_lock(&root->fs_info->trans_mutex);
1753         root->fs_info->open_ioctl_trans++;
1754         mutex_unlock(&root->fs_info->trans_mutex);
1755
1756         ret = -ENOMEM;
1757         trans = btrfs_start_ioctl_transaction(root, 0);
1758         if (!trans)
1759                 goto out_drop;
1760
1761         file->private_data = trans;
1762         return 0;
1763
1764 out_drop:
1765         mutex_lock(&root->fs_info->trans_mutex);
1766         root->fs_info->open_ioctl_trans--;
1767         mutex_unlock(&root->fs_info->trans_mutex);
1768         mnt_drop_write(file->f_path.mnt);
1769 out:
1770         return ret;
1771 }
1772
1773 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1774 {
1775         struct inode *inode = fdentry(file)->d_inode;
1776         struct btrfs_root *root = BTRFS_I(inode)->root;
1777         struct btrfs_root *new_root;
1778         struct btrfs_dir_item *di;
1779         struct btrfs_trans_handle *trans;
1780         struct btrfs_path *path;
1781         struct btrfs_key location;
1782         struct btrfs_disk_key disk_key;
1783         struct btrfs_super_block *disk_super;
1784         u64 features;
1785         u64 objectid = 0;
1786         u64 dir_id;
1787
1788         if (!capable(CAP_SYS_ADMIN))
1789                 return -EPERM;
1790
1791         if (copy_from_user(&objectid, argp, sizeof(objectid)))
1792                 return -EFAULT;
1793
1794         if (!objectid)
1795                 objectid = root->root_key.objectid;
1796
1797         location.objectid = objectid;
1798         location.type = BTRFS_ROOT_ITEM_KEY;
1799         location.offset = (u64)-1;
1800
1801         new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1802         if (IS_ERR(new_root))
1803                 return PTR_ERR(new_root);
1804
1805         if (btrfs_root_refs(&new_root->root_item) == 0)
1806                 return -ENOENT;
1807
1808         path = btrfs_alloc_path();
1809         if (!path)
1810                 return -ENOMEM;
1811         path->leave_spinning = 1;
1812
1813         trans = btrfs_start_transaction(root, 1);
1814         if (!trans) {
1815                 btrfs_free_path(path);
1816                 return -ENOMEM;
1817         }
1818
1819         dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1820         di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1821                                    dir_id, "default", 7, 1);
1822         if (!di) {
1823                 btrfs_free_path(path);
1824                 btrfs_end_transaction(trans, root);
1825                 printk(KERN_ERR "Umm, you don't have the default dir item, "
1826                        "this isn't going to work\n");
1827                 return -ENOENT;
1828         }
1829
1830         btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1831         btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1832         btrfs_mark_buffer_dirty(path->nodes[0]);
1833         btrfs_free_path(path);
1834
1835         disk_super = &root->fs_info->super_copy;
1836         features = btrfs_super_incompat_flags(disk_super);
1837         if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1838                 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1839                 btrfs_set_super_incompat_flags(disk_super, features);
1840         }
1841         btrfs_end_transaction(trans, root);
1842
1843         return 0;
1844 }
1845
1846 /*
1847  * there are many ways the trans_start and trans_end ioctls can lead
1848  * to deadlocks.  They should only be used by applications that
1849  * basically own the machine, and have a very in depth understanding
1850  * of all the possible deadlocks and enospc problems.
1851  */
1852 long btrfs_ioctl_trans_end(struct file *file)
1853 {
1854         struct inode *inode = fdentry(file)->d_inode;
1855         struct btrfs_root *root = BTRFS_I(inode)->root;
1856         struct btrfs_trans_handle *trans;
1857
1858         trans = file->private_data;
1859         if (!trans)
1860                 return -EINVAL;
1861         file->private_data = NULL;
1862
1863         btrfs_end_transaction(trans, root);
1864
1865         mutex_lock(&root->fs_info->trans_mutex);
1866         root->fs_info->open_ioctl_trans--;
1867         mutex_unlock(&root->fs_info->trans_mutex);
1868
1869         mnt_drop_write(file->f_path.mnt);
1870         return 0;
1871 }
1872
1873 long btrfs_ioctl(struct file *file, unsigned int
1874                 cmd, unsigned long arg)
1875 {
1876         struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1877         void __user *argp = (void __user *)arg;
1878
1879         switch (cmd) {
1880         case FS_IOC_GETFLAGS:
1881                 return btrfs_ioctl_getflags(file, argp);
1882         case FS_IOC_SETFLAGS:
1883                 return btrfs_ioctl_setflags(file, argp);
1884         case FS_IOC_GETVERSION:
1885                 return btrfs_ioctl_getversion(file, argp);
1886         case BTRFS_IOC_SNAP_CREATE:
1887                 return btrfs_ioctl_snap_create(file, argp, 0);
1888         case BTRFS_IOC_SUBVOL_CREATE:
1889                 return btrfs_ioctl_snap_create(file, argp, 1);
1890         case BTRFS_IOC_SNAP_DESTROY:
1891                 return btrfs_ioctl_snap_destroy(file, argp);
1892         case BTRFS_IOC_DEFAULT_SUBVOL:
1893                 return btrfs_ioctl_default_subvol(file, argp);
1894         case BTRFS_IOC_DEFRAG:
1895                 return btrfs_ioctl_defrag(file, NULL);
1896         case BTRFS_IOC_DEFRAG_RANGE:
1897                 return btrfs_ioctl_defrag(file, argp);
1898         case BTRFS_IOC_RESIZE:
1899                 return btrfs_ioctl_resize(root, argp);
1900         case BTRFS_IOC_ADD_DEV:
1901                 return btrfs_ioctl_add_dev(root, argp);
1902         case BTRFS_IOC_RM_DEV:
1903                 return btrfs_ioctl_rm_dev(root, argp);
1904         case BTRFS_IOC_BALANCE:
1905                 return btrfs_balance(root->fs_info->dev_root);
1906         case BTRFS_IOC_CLONE:
1907                 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1908         case BTRFS_IOC_CLONE_RANGE:
1909                 return btrfs_ioctl_clone_range(file, argp);
1910         case BTRFS_IOC_TRANS_START:
1911                 return btrfs_ioctl_trans_start(file);
1912         case BTRFS_IOC_TRANS_END:
1913                 return btrfs_ioctl_trans_end(file);
1914         case BTRFS_IOC_TREE_SEARCH:
1915                 return btrfs_ioctl_tree_search(file, argp);
1916         case BTRFS_IOC_INO_LOOKUP:
1917                 return btrfs_ioctl_ino_lookup(file, argp);
1918         case BTRFS_IOC_SYNC:
1919                 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1920                 return 0;
1921         }
1922
1923         return -ENOTTY;
1924 }