Commit c5c9cd4d authored by Sage Weil's avatar Sage Weil Committed by Chris Mason

Btrfs: allow clone of an arbitrary file range

This patch adds an additional CLONE_RANGE ioctl to clone an arbitrary 
(block-aligned) file range to another file.  The original CLONE ioctl 
becomes a special case of cloning the entire file range.  The logic is a 
bit more complex now since ranges may be cloned to different offsets, and 
because we may only be cloning the beginning or end of a particular extent 
or checksum item.

An additional sanity check ensures the source and destination files aren't 
the same (which would previously deadlock), although eventually this could 
be extended to allow the duplication of file data at a different offset 
within the same file.

Any extents within the destination range in the target file are dropped.

We currently do not cope with the case where a compressed inline extent 
needs to be split.  This will probably require decompressing the extent 
into a temporary address_space, and inserting just the cloned portion as a 
new compressed inline extent.  For now, just return -EINVAL in this case.  
Note that this never comes up in the more common case of cloning an entire 
file.
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 2ed6d664
...@@ -592,7 +592,8 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) ...@@ -592,7 +592,8 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret; return ret;
} }
long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
u64 olen, u64 destoff)
{ {
struct inode *inode = fdentry(file)->d_inode; struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
...@@ -606,12 +607,29 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -606,12 +607,29 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
u32 nritems; u32 nritems;
int slot; int slot;
int ret; int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
u64 hint_byte;
src_file = fget(src_fd); /*
* TODO:
* - split compressed inline extents. annoying: we need to
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
* - allow ranges within the same file to be cloned (provided
* they don't overlap)?
*/
src_file = fget(srcfd);
if (!src_file) if (!src_file)
return -EBADF; return -EBADF;
src = src_file->f_dentry->d_inode; src = src_file->f_dentry->d_inode;
ret = -EINVAL;
if (src == inode)
goto out_fput;
ret = -EISDIR; ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
goto out_fput; goto out_fput;
...@@ -640,27 +658,46 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -640,27 +658,46 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
} }
ret = -ENOTEMPTY; /* determine range to clone */
if (inode->i_size) ret = -EINVAL;
if (off >= src->i_size || off + len > src->i_size)
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
/* if we extend to eof, continue to block boundary */
if (off + len == src->i_size)
len = ((src->i_size + bs-1) & ~(bs-1))
- off;
/* verify the end result is block aligned */
if ((off & (bs-1)) ||
((off + len) & (bs-1)))
goto out_unlock; goto out_unlock;
printk("final src extent is %llu~%llu\n", off, len);
printk("final dst extent is %llu~%llu\n", destoff, len);
/* do any pending delalloc/csum calc on src, one way or /* do any pending delalloc/csum calc on src, one way or
another, and lock file content */ another, and lock file content */
while (1) { while (1) {
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
break; break;
unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
if (ordered) if (ordered)
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
btrfs_wait_ordered_range(src, 0, (u64)-1); btrfs_wait_ordered_range(src, off, off+len);
} }
trans = btrfs_start_transaction(root, 1); trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans); BUG_ON(!trans);
/* punch hole in destination first */
btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
/* clone data */
key.objectid = src->i_ino; key.objectid = src->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY; key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0; key.offset = 0;
...@@ -691,19 +728,45 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -691,19 +728,45 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
key.objectid != src->i_ino) key.objectid != src->i_ino)
break; break;
if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY || if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { struct btrfs_file_extent_item *extent;
int type;
u32 size; u32 size;
struct btrfs_key new_key; struct btrfs_key new_key;
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
size = btrfs_item_size_nr(leaf, slot); size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf, read_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot), btrfs_item_ptr_offset(leaf, slot),
size); size);
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG) {
disko = btrfs_file_extent_disk_bytenr(leaf, extent);
diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
datao = btrfs_file_extent_offset(leaf, extent);
datal = btrfs_file_extent_num_bytes(leaf, extent);
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
/* take upper bound, may be compressed */
datal = btrfs_file_extent_ram_bytes(leaf,
extent);
}
btrfs_release_path(root, path); btrfs_release_path(root, path);
if (key.offset + datal < off ||
key.offset >= off+len)
goto next;
memcpy(&new_key, &key, sizeof(new_key)); memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = inode->i_ino; new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
if (type == BTRFS_FILE_EXTENT_REG) {
ret = btrfs_insert_empty_item(trans, root, path, ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size); &new_key, size);
if (ret) if (ret)
...@@ -714,33 +777,129 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -714,33 +777,129 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
write_extent_buffer(leaf, buf, write_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot), btrfs_item_ptr_offset(leaf, slot),
size); size);
btrfs_mark_buffer_dirty(leaf);
}
if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
struct btrfs_file_extent_item *extent;
int found_type;
extent = btrfs_item_ptr(leaf, slot, extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item); struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, extent); printk(" orig disk %llu~%llu data %llu~%llu\n",
if (found_type == BTRFS_FILE_EXTENT_REG || disko, diskl, datao, datal);
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 ds = btrfs_file_extent_disk_bytenr(leaf, if (off > key.offset) {
extent); datao += off - key.offset;
u64 dl = btrfs_file_extent_disk_num_bytes(leaf, datal -= off - key.offset;
extent); }
/* ds == 0 means there's a hole */ if (key.offset + datao + datal + key.offset >
if (ds != 0) { off + len)
datal = off + len - key.offset - datao;
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
printk(" final disk %llu~%llu data %llu~%llu\n",
disko, diskl, datao, datal);
btrfs_set_file_extent_offset(leaf, extent,
datao);
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root, ret = btrfs_inc_extent_ref(trans, root,
ds, dl, leaf->start, disko, diskl, leaf->start,
root->root_key.objectid, root->root_key.objectid,
trans->transid, trans->transid,
inode->i_ino); inode->i_ino);
BUG_ON(ret); BUG_ON(ret);
} }
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
} }
if (key.offset + datal > off+len)
trim = key.offset + datal - (off+len);
printk("len %lld skip %lld trim %lld\n",
datal, skip, trim);
if (comp && (skip || trim)) {
printk("btrfs clone_range can't split compressed inline extents yet\n");
ret = -EINVAL;
goto out;
} }
size -= skip + trim;
datal -= skip + trim;
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
if (ret)
goto out;
if (skip) {
u32 start = btrfs_file_extent_calc_inline_size(0);
memmove(buf+start, buf+start+skip,
datal);
}
leaf = path->nodes[0];
slot = path->slots[0];
write_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot),
size);
inode_add_bytes(inode, datal);
}
btrfs_mark_buffer_dirty(leaf);
}
if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
u32 size;
struct btrfs_key new_key;
u64 coverslen;
int coff, clen;
size = btrfs_item_size_nr(leaf, slot);
coverslen = (size / BTRFS_CRC32_SIZE) <<
root->fs_info->sb->s_blocksize_bits;
printk("csums for %llu~%llu\n",
key.offset, coverslen);
if (key.offset + coverslen < off ||
key.offset >= off+len)
goto next;
read_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot),
size);
btrfs_release_path(root, path);
coff = 0;
if (off > key.offset)
coff = ((off - key.offset) >>
root->fs_info->sb->s_blocksize_bits) *
BTRFS_CRC32_SIZE;
clen = size - coff;
if (key.offset + coverslen > off+len)
clen -= ((key.offset+coverslen-off-len) >>
root->fs_info->sb->s_blocksize_bits) *
BTRFS_CRC32_SIZE;
printk(" will dup %d~%d of %d\n",
coff, clen, size);
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, clen);
if (ret)
goto out;
leaf = path->nodes[0];
slot = path->slots[0];
write_extent_buffer(leaf, buf + coff,
btrfs_item_ptr_offset(leaf, slot),
clen);
btrfs_mark_buffer_dirty(leaf);
}
next:
btrfs_release_path(root, path); btrfs_release_path(root, path);
key.offset++; key.offset++;
} }
...@@ -749,13 +908,13 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -749,13 +908,13 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
btrfs_release_path(root, path); btrfs_release_path(root, path);
if (ret == 0) { if (ret == 0) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode_set_bytes(inode, inode_get_bytes(src)); if (destoff + olen > inode->i_size)
btrfs_i_size_write(inode, src->i_size); btrfs_i_size_write(inode, destoff + olen);
BTRFS_I(inode)->flags = BTRFS_I(src)->flags; BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
ret = btrfs_update_inode(trans, root, inode); ret = btrfs_update_inode(trans, root, inode);
} }
btrfs_end_transaction(trans, root); btrfs_end_transaction(trans, root);
unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
if (ret) if (ret)
vmtruncate(inode, 0); vmtruncate(inode, 0);
out_unlock: out_unlock:
...@@ -768,6 +927,16 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) ...@@ -768,6 +927,16 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
return ret; return ret;
} }
long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
{
struct btrfs_ioctl_clone_range_args args;
if (copy_from_user(&args, (void *)argptr, sizeof(args)))
return -EFAULT;
return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
args.src_length, args.dest_offset);
}
/* /*
* there are many ways the trans_start and trans_end ioctls can lead * there are many ways the trans_start and trans_end ioctls can lead
* to deadlocks. They should only be used by applications that * to deadlocks. They should only be used by applications that
...@@ -851,7 +1020,9 @@ long btrfs_ioctl(struct file *file, unsigned int ...@@ -851,7 +1020,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_BALANCE: case BTRFS_IOC_BALANCE:
return btrfs_balance(root->fs_info->dev_root); return btrfs_balance(root->fs_info->dev_root);
case BTRFS_IOC_CLONE: case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg); return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
return btrfs_ioctl_clone_range(file, arg);
case BTRFS_IOC_TRANS_START: case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file); return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END: case BTRFS_IOC_TRANS_END:
......
...@@ -52,4 +52,13 @@ struct btrfs_ioctl_vol_args { ...@@ -52,4 +52,13 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
struct btrfs_ioctl_vol_args) struct btrfs_ioctl_vol_args)
struct btrfs_ioctl_clone_range_args {
__s64 src_fd;
__u64 src_offset, src_length;
__u64 dest_offset;
};
#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
struct btrfs_ioctl_clone_range_args)
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment