Commit eee2a817 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (25 commits)
  Btrfs: forced readonly mounts on errors
  btrfs: Require CAP_SYS_ADMIN for filesystem rebalance
  Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check
  btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()
  btrfs: check NULL or not
  btrfs: Don't pass NULL ptr to func that may deref it.
  btrfs: mount failure return value fix
  btrfs: Mem leak in btrfs_get_acl()
  btrfs: fix wrong free space information of btrfs
  btrfs: make the chunk allocator utilize the devices better
  btrfs: restructure find_free_dev_extent()
  btrfs: fix wrong calculation of stripe size
  btrfs: try to reclaim some space when chunk allocation fails
  btrfs: fix wrong data space statistics
  fs/btrfs: Fix build of ctree
  Btrfs: fix off by one while setting block groups readonly
  Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls
  Btrfs: Add readonly snapshots support
  Btrfs: Refactor btrfs_ioctl_snap_create()
  btrfs: Extract duplicate decompress code
  ...
parents 83896fb5 acce952b
......@@ -4,6 +4,8 @@ config BTRFS_FS
select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
......
......@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o
......@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
if (IS_ERR(acl))
if (IS_ERR(acl)) {
kfree(value);
return acl;
}
set_cached_acl(inode, type, acl);
}
kfree(value);
......
......@@ -157,7 +157,7 @@ struct btrfs_inode {
/*
* always compress this one file
*/
unsigned force_compress:1;
unsigned force_compress:4;
struct inode vfs_inode;
};
......
......@@ -62,6 +62,9 @@ struct compressed_bio {
/* number of bytes on disk */
unsigned long compressed_len;
/* the compression algorithm for this bio */
int compress_type;
/* number of compressed pages in the array */
unsigned long nr_pages;
......@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
cb->start,
cb->orig_bio->bi_io_vec,
cb->orig_bio->bi_vcnt,
cb->compressed_len);
ret = btrfs_decompress_biovec(cb->compress_type,
cb->compressed_pages,
cb->start,
cb->orig_bio->bi_io_vec,
cb->orig_bio->bi_vcnt,
cb->compressed_len);
csum_failed:
if (ret)
cb->errors = 1;
......@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->len = uncompressed_len;
cb->compressed_len = compressed_len;
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
......@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_put(comp_bio);
return 0;
}
static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
};
int __init btrfs_init_compress(void)
{
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
INIT_LIST_HEAD(&comp_idle_workspace[i]);
spin_lock_init(&comp_workspace_lock[i]);
atomic_set(&comp_alloc_workspace[i], 0);
init_waitqueue_head(&comp_workspace_wait[i]);
}
return 0;
}
/*
* this finds an available workspace or allocates a new one
* ERR_PTR is returned if things go bad.
*/
static struct list_head *find_workspace(int type)
{
struct list_head *workspace;
int cpus = num_online_cpus();
int idx = type - 1;
struct list_head *idle_workspace = &comp_idle_workspace[idx];
spinlock_t *workspace_lock = &comp_workspace_lock[idx];
atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
int *num_workspace = &comp_num_workspace[idx];
again:
spin_lock(workspace_lock);
if (!list_empty(idle_workspace)) {
workspace = idle_workspace->next;
list_del(workspace);
(*num_workspace)--;
spin_unlock(workspace_lock);
return workspace;
}
if (atomic_read(alloc_workspace) > cpus) {
DEFINE_WAIT(wait);
spin_unlock(workspace_lock);
prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
schedule();
finish_wait(workspace_wait, &wait);
goto again;
}
atomic_inc(alloc_workspace);
spin_unlock(workspace_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
atomic_dec(alloc_workspace);
wake_up(workspace_wait);
}
return workspace;
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
static void free_workspace(int type, struct list_head *workspace)
{
int idx = type - 1;
struct list_head *idle_workspace = &comp_idle_workspace[idx];
spinlock_t *workspace_lock = &comp_workspace_lock[idx];
atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
int *num_workspace = &comp_num_workspace[idx];
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
list_add_tail(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
}
spin_unlock(workspace_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(alloc_workspace);
wake:
if (waitqueue_active(workspace_wait))
wake_up(workspace_wait);
}
/*
* cleanup function for module exit
*/
static void free_workspaces(void)
{
struct list_head *workspace;
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
while (!list_empty(&comp_idle_workspace[i])) {
workspace = comp_idle_workspace[i].next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
atomic_dec(&comp_alloc_workspace[i]);
}
}
}
/*
* given an address space and start/len, compress the bytes.
*
* pages are allocated to hold the compressed result and stored
* in 'pages'
*
* out_pages is used to return the number of pages allocated. There
* may be pages allocated even if we return an error
*
* total_in is used to return the number of bytes actually read. It
* may be smaller then len if we had to exit early because we
* ran out of room in the pages array or because we cross the
* max_out threshold.
*
* total_out is used to return the total number of compressed bytes
*
* max_out tells us the max number of bytes that we're allowed to
* stuff into pages
*/
int btrfs_compress_pages(int type, struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out)
{
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
if (IS_ERR(workspace))
return -1;
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
nr_dest_pages, out_pages,
total_in, total_out,
max_out);
free_workspace(type, workspace);
return ret;
}
/*
* pages_in is an array of pages with compressed data.
*
* disk_start is the starting logical offset of this array in the file
*
* bvec is a bio_vec of pages from the file that we want to decompress into
*
* vcnt is the count of pages in the biovec
*
* srclen is the number of bytes in pages_in
*
* The basic idea is that we have a bio that was created by readpages.
* The pages in the bio are for the uncompressed data, and they may not
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
struct bio_vec *bvec, int vcnt, size_t srclen)
{
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
if (IS_ERR(workspace))
return -ENOMEM;
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
bvec, vcnt, srclen);
free_workspace(type, workspace);
return ret;
}
/*
* a less complex decompression routine. Our compressed data fits in a
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
if (IS_ERR(workspace))
return -ENOMEM;
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
free_workspace(type, workspace);
return ret;
}
void __exit btrfs_exit_compress(void)
{
free_workspaces();
}
/*
* Copy uncompressed data from working buffer to pages.
*
* buf_start is the byte offset we're of the start of our workspace buffer.
*
* total_out is the last byte of the buffer
*/
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
unsigned long *page_index,
unsigned long *pg_offset)
{
unsigned long buf_offset;
unsigned long current_buf_start;
unsigned long start_byte;
unsigned long working_bytes = total_out - buf_start;
unsigned long bytes;
char *kaddr;
struct page *page_out = bvec[*page_index].bv_page;
/*
* start byte is the first byte of the page we're currently
* copying into relative to the start of the compressed data.
*/
start_byte = page_offset(page_out) - disk_start;
/* we haven't yet hit data corresponding to this page */
if (total_out <= start_byte)
return 1;
/*
* the start of the data we care about is offset into
* the middle of our working buffer
*/
if (total_out > start_byte && buf_start < start_byte) {
buf_offset = start_byte - buf_start;
working_bytes -= buf_offset;
} else {
buf_offset = 0;
}
current_buf_start = buf_start;
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
bytes = min(PAGE_CACHE_SIZE - *pg_offset,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out, KM_USER0);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page_out);
*pg_offset += bytes;
buf_offset += bytes;
working_bytes -= bytes;
current_buf_start += bytes;
/* check if we need to pick another page */
if (*pg_offset == PAGE_CACHE_SIZE) {
(*page_index)++;
if (*page_index >= vcnt)
return 0;
page_out = bvec[*page_index].bv_page;
*pg_offset = 0;
start_byte = page_offset(page_out) - disk_start;
/*
* make sure our new page is covered by this
* working buffer
*/
if (total_out <= start_byte)
return 1;
/*
* the next page in the biovec might not be adjacent
* to the last page, but it might still be found
* inside this working buffer. bump our offset pointer
*/
if (total_out > start_byte &&
current_buf_start < start_byte) {
buf_offset = start_byte - buf_start;
working_bytes = total_out - start_byte;
current_buf_start = buf_start + buf_offset;
}
}
}
return 1;
}
......@@ -19,24 +19,27 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
int btrfs_zlib_decompress(unsigned char *data_in,
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen);
int btrfs_zlib_compress_pages(struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out);
int btrfs_zlib_decompress_biovec(struct page **pages_in,
u64 disk_start,
struct bio_vec *bvec,
int vcnt,
size_t srclen);
void btrfs_zlib_exit(void);
int btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(int type, struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out);
int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
struct bio_vec *bvec, int vcnt, size_t srclen);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
unsigned long *page_index,
unsigned long *pg_offset);
int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
......@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long nr_pages);
int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
struct btrfs_compress_op {
struct list_head *(*alloc_workspace)(void);
void (*free_workspace)(struct list_head *workspace);
int (*compress_pages)(struct list_head *workspace,
struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out);
int (*decompress_biovec)(struct list_head *workspace,
struct page **pages_in,
u64 disk_start,
struct bio_vec *bvec,
int vcnt,
size_t srclen);
int (*decompress)(struct list_head *workspace,
unsigned char *data_in,
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen);
};
extern struct btrfs_compress_op btrfs_zlib_compress;
extern struct btrfs_compress_op btrfs_lzo_compress;
#endif
......@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
if (!p)
return;
btrfs_release_path(NULL, p);
kmem_cache_free(btrfs_path_cachep, p);
}
......@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
right = read_node_slot(root, upper, slot + 1);
if (right == NULL)
return 1;
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
......@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
left = read_node_slot(root, path->nodes[1], slot - 1);
if (left == NULL)
return 1;
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
......
......@@ -295,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
/*
* File system states
*/
/* Errors detected */
#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
......@@ -399,13 +407,15 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
* A leaf is full of items. offset and size tell us where to find
......@@ -552,9 +562,11 @@ struct btrfs_timespec {
} __attribute__ ((__packed__));
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LAST = 2,
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_TYPES = 2,
BTRFS_COMPRESS_LAST = 3,
};
struct btrfs_inode_item {
......@@ -598,6 +610,8 @@ struct btrfs_dir_item {
u8 type;
} __attribute__ ((__packed__));
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
......@@ -896,7 +910,8 @@ struct btrfs_fs_info {
*/
u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
unsigned long mount_opt;
unsigned long mount_opt:20;
unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
struct btrfs_transaction *running_transaction;
......@@ -1051,6 +1066,9 @@ struct btrfs_fs_info {
unsigned metadata_ratio;
void *bdev_holder;
/* filesystem state */
u64 fs_state;
};
/*
......@@ -1894,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
}
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
......@@ -2146,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
......@@ -2189,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
u64 start, u64 end);
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
u64 num_bytes);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
......@@ -2542,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno);
#define btrfs_std_error(fs_info, errno) \
do { \
if ((errno)) \
__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
} while (0)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
......
......@@ -44,6 +44,20 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
......@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
if (eb == NULL) {
WARN_ON(1);
goto out;
}
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
......@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
if (eb == NULL) {
ret = -EIO;
goto out;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
......@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
}
btrfs_free_path(path);
if (ret) {
kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
......@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh)
if (!bh) {
err = -EINVAL;
goto fail_iput;
}
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
......@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!btrfs_super_root(disk_super))
goto fail_iput;
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
......@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
features = btrfs_super_incompat_flags(disk_super);
if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
btrfs_set_super_incompat_flags(disk_super, features);
}
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
......@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_set_opt(fs_info->mount_opt, SSD);
}
if (btrfs_super_log_root(disk_super) != 0) {
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0 &&
!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
......@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
btrfs_put_block_group_cache(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
* 1. when btrfs flips readonly somewhere else before
* btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
* and btrfs will skip to write sb directly to keep
* ERROR state on disk.
*
* 2. when btrfs flips readonly just in btrfs_commit_super,
* and in such case, btrfs cannnot write sb via btrfs_commit_super,
* and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
* btrfs will cleanup all FS resources first and write sb then.
*/
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
ret = btrfs_error_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
......@@ -2619,6 +2671,352 @@ int btree_lock_page_hook(struct page *page)
return 0;
}
static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
if (read_only)
return;
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
printk(KERN_WARNING "warning: mount fs with errors, "
"running btrfsck is recommended\n");
}
int btrfs_error_commit_super(struct btrfs_root *root)
{
int ret;
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
/* cleanup FS via transaction */
btrfs_cleanup_transaction(root);
ret = write_ctree_super(NULL, root, 0);
return ret;
}
static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_init(&root->fs_info->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
btrfs_invalidate_inodes(btrfs_inode->root);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
return 0;
}
static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct list_head splice;
struct btrfs_ordered_extent *ordered;
struct inode *inode;
INIT_LIST_HEAD(&splice);
spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_init(&root->fs_info->ordered_extents, &splice);
while (!list_empty(&splice)) {
ordered = list_entry(splice.next, struct btrfs_ordered_extent,
root_extent_list);
list_del_init(&ordered->root_extent_list);
atomic_inc(&ordered->refs);
/* the inode may be getting freed (in sys_unlink path). */
inode = igrab(ordered->inode);
spin_unlock(&root->fs_info->ordered_extent_lock);
if (inode)
iput(inode);
atomic_set(&ordered->refs, 1);
btrfs_put_ordered_extent(ordered);
spin_lock(&root->fs_info->ordered_extent_lock);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
return 0;
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
int ret = 0;
delayed_refs = &trans->delayed_refs;
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
printk(KERN_INFO "delayed_refs has NO entry\n");
return ret;
}
node = rb_first(&delayed_refs->root);
while (node) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
node = rb_next(node);
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
mutex_lock(&head->mutex);
kfree(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
cond_resched();
spin_lock(&delayed_refs->lock);
}
spin_unlock(&delayed_refs->lock);
return ret;
}
static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
{
struct btrfs_pending_snapshot *snapshot;
struct list_head splice;
INIT_LIST_HEAD(&splice);
list_splice_init(&t->pending_snapshots, &splice);
while (!list_empty(&splice)) {
snapshot = list_entry(splice.next,
struct btrfs_pending_snapshot,
list);
list_del_init(&snapshot->list);
kfree(snapshot);
}
return 0;
}
static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
list_splice_init(&root->fs_info->delalloc_inodes, &splice);
spin_lock(&root->fs_info->delalloc_lock);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
btrfs_invalidate_inodes(btrfs_inode->root);
}
spin_unlock(&root->fs_info->delalloc_lock);
return 0;
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark)
{
int ret;
struct page *page;
struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_buffer *eb;
u64 start = 0;
u64 end;
u64 offset;
unsigned long index;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
mark);
if (ret)
break;
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
page = find_get_page(btree_inode->i_mapping, index);
if (!page)
continue;
offset = page_offset(page);
spin_lock(&dirty_pages->buffer_lock);
eb = radix_tree_lookup(
&(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
offset >> PAGE_CACHE_SHIFT);
spin_unlock(&dirty_pages->buffer_lock);
if (eb) {
ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
&eb->bflags);
atomic_set(&eb->refs, 1);
}
if (PageWriteback(page))
end_page_writeback(page);
lock_page(page);
if (PageDirty(page)) {
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
radix_tree_tag_clear(&page->mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&page->mapping->tree_lock);
}
page->mapping->a_ops->invalidatepage(page, 0);
unlock_page(page);
}
}
return ret;
}
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents)
{
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
unpin = pinned_extents;
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
if (ret)
break;
/* opt_discard */
ret = btrfs_error_discard_extent(root, start, end + 1 - start);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
}
return 0;
}
static int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
LIST_HEAD(list);
WARN_ON(1);
mutex_lock(&root->fs_info->trans_mutex);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
list_splice_init(&root->fs_info->trans_list, &list);
while (!list_empty(&list)) {
t = list_entry(list.next, struct btrfs_transaction, list);
if (!t)
break;
btrfs_destroy_ordered_operations(root);
btrfs_destroy_ordered_extents(root);
btrfs_destroy_delayed_refs(t, root);
btrfs_block_rsv_release(root,
&root->fs_info->trans_block_rsv,
t->dirty_pages.dirty_bytes);
/* FIXME: cleanup wait for commit */
t->in_commit = 1;
t->blocked = 1;
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
t->blocked = 0;
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
mutex_unlock(&root->fs_info->trans_mutex);
mutex_lock(&root->fs_info->trans_mutex);
t->commit_done = 1;
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
mutex_unlock(&root->fs_info->trans_mutex);
mutex_lock(&root->fs_info->trans_mutex);
btrfs_destroy_pending_snapshots(t);
btrfs_destroy_delalloc_inodes(root);
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = NULL;
spin_unlock(&root->fs_info->new_trans_lock);
btrfs_destroy_marked_extents(root, &t->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
t->use_count = 0;
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);
}
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
......
......@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
int btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
......
......@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
return btrfs_reduce_alloc_profile(root, flags);
}
static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
......@@ -3161,8 +3161,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
if (ret < 0)
return ret;
if (ret < 0) {
if (ret != -ENOSPC)
return ret;
else
goto commit_trans;
}
if (!data_sinfo) {
btrfs_set_inode_space_info(root, inode);
......@@ -3173,6 +3177,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
commit_trans:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
......@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
return 0;
}
WARN_ON(1);
printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
block_rsv->size, block_rsv->reserved,
block_rsv->freed[0], block_rsv->freed[1]);
return -ENOSPC;
}
......@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
sinfo->bytes_may_use + sinfo->bytes_readonly +
cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
sinfo->bytes_reserved += cache->reserved_pinned;
cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
......@@ -8012,6 +8013,62 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
return ret;
}
/*
* helper to account the unused space of all the readonly block group in the
* list. takes mirrors into account.
*/
static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
{
struct btrfs_block_group_cache *block_group;
u64 free_bytes = 0;
int factor;
list_for_each_entry(block_group, groups_list, list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
spin_unlock(&block_group->lock);
continue;
}
if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP))
factor = 2;
else
factor = 1;
free_bytes += (block_group->key.offset -
btrfs_block_group_used(&block_group->item)) *
factor;
spin_unlock(&block_group->lock);
}
return free_bytes;
}
/*
* helper to account the unused space of all the readonly block group in the
* space_info. takes mirrors into account.
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
int i;
u64 free_bytes = 0;
spin_lock(&sinfo->lock);
for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
if (!list_empty(&sinfo->block_groups[i]))
free_bytes += __btrfs_get_ro_block_group_free_space(
&sinfo->block_groups[i]);
spin_unlock(&sinfo->lock);
return free_bytes;
}
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
......@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 min_free = btrfs_block_group_used(&block_group->item);
u64 dev_offset, max_avail;
u64 dev_offset;
/*
* check to make sure we can actually find a chunk with enough
......@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free) {
ret = find_free_dev_extent(NULL, device, min_free,
&dev_offset, &max_avail);
&dev_offset, NULL);
if (!ret)
break;
ret = -1;
......@@ -8584,3 +8641,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
return ret;
}
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
return unpin_extent_range(root, start, end);
}
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
u64 num_bytes)
{
return btrfs_discard_extent(root, bytenr, num_bytes);
}
......@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
this_bio_flag = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&this_bio_flag,
em->compress_type);
}
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
......@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
#endif
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
if (eb == NULL)
return NULL;
eb->start = start;
eb->len = len;
spin_lock_init(&eb->lock);
......
......@@ -20,8 +20,12 @@
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/* flags for bio submission */
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
......@@ -135,6 +139,17 @@ struct extent_buffer {
wait_queue_head_t lock_wq;
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
}
static inline int extent_compress_type(unsigned long bio_flags)
{
return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}
struct extent_map_tree;
static inline struct extent_state *extent_state_next(struct extent_state *state)
......
......@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include "ctree.h"
#include "extent_map.h"
......@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
return em;
em->in_tree = 0;
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
atomic_set(&em->refs, 1);
return em;
}
......
......@@ -26,7 +26,8 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
int in_tree;
unsigned int in_tree:1;
unsigned int compress_type:4;
};
struct extent_map_tree {
......
......@@ -225,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
......@@ -239,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
if (compressed) {
split->block_len = em->block_len;
......@@ -891,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (err)
goto out;
/*
* If BTRFS flips readonly due to some impossible error
* (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
* although we have opened a file as writable, we have
* to stop this write operation to ensure FS consistency.
*/
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
err = -EROFS;
goto out;
}
file_update_time(file);
BTRFS_I(inode)->sequence++;
......
......@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
size_t datasize;
unsigned long offset;
int use_compress = 0;
int compress_type = BTRFS_COMPRESS_NONE;
if (compressed_size && compressed_pages) {
use_compress = 1;
compress_type = root->fs_info->compress_type;
cur_size = compressed_size;
}
......@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
ptr = btrfs_file_extent_inline_start(ei);
if (use_compress) {
if (compress_type != BTRFS_COMPRESS_NONE) {
struct page *cpage;
int i = 0;
while (compressed_size > 0) {
......@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
compressed_size -= cur_size;
}
btrfs_set_file_extent_compression(leaf, ei,
BTRFS_COMPRESS_ZLIB);
compress_type);
} else {
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
......@@ -263,6 +263,7 @@ struct async_extent {
u64 compressed_size;
struct page **pages;
unsigned long nr_pages;
int compress_type;
struct list_head list;
};
......@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
unsigned long nr_pages)
unsigned long nr_pages,
int compress_type)
{
struct async_extent *async_extent;
......@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
async_extent->compressed_size = compressed_size;
async_extent->pages = pages;
async_extent->nr_pages = nr_pages;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
......@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
unsigned long max_uncompressed = 128 * 1024;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
actual_end = min_t(u64, isize, end + 1);
again:
......@@ -381,12 +385,16 @@ static noinline int compress_file_range(struct inode *inode,
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
total_compressed, pages,
nr_pages, &nr_pages_ret,
&total_in,
&total_compressed,
max_compressed);
if (BTRFS_I(inode)->force_compress)
compress_type = BTRFS_I(inode)->force_compress;
ret = btrfs_compress_pages(compress_type,
inode->i_mapping, start,
total_compressed, pages,
nr_pages, &nr_pages_ret,
&total_in,
&total_compressed,
max_compressed);
if (!ret) {
unsigned long offset = total_compressed &
......@@ -493,7 +501,8 @@ static noinline int compress_file_range(struct inode *inode,
* and will submit them to the elevator.
*/
add_async_extent(async_cow, start, num_bytes,
total_compressed, pages, nr_pages_ret);
total_compressed, pages, nr_pages_ret,
compress_type);
if (start + num_bytes < end) {
start += num_bytes;
......@@ -515,7 +524,8 @@ static noinline int compress_file_range(struct inode *inode,
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
add_async_extent(async_cow, start, end - start + 1,
0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
}
......@@ -640,6 +650,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
......@@ -656,11 +667,13 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->ram_size - 1, 0);
}
ret = btrfs_add_ordered_extent(inode, async_extent->start,
ins.objectid,
async_extent->ram_size,
ins.offset,
BTRFS_ORDERED_COMPRESSED);
ret = btrfs_add_ordered_extent_compress(inode,
async_extent->start,
ins.objectid,
async_extent->ram_size,
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
BUG_ON(ret);
/*
......@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
int compressed = 0;
int compress_type = 0;
int ret;
bool nolock = false;
......@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compressed = 1;
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compressed);
BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
......@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
compressed, 0, 0,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
......@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
failrec->logical = logical;
free_extent_map(em);
......@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
if (btrfs_root_readonly(root))
return -EROFS;
err = inode_change_ok(inode, attr);
if (err)
return err;
......@@ -4928,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
size_t max_size;
unsigned long inline_size;
unsigned long ptr;
int compress_type;
WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(leaf, path->slots[0]));
......@@ -4939,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_zlib_decompress(tmp, page, extent_offset,
inline_size, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
if (ret) {
char *kaddr = kmap_atomic(page, KM_USER0);
unsigned long copy_size = min_t(u64,
......@@ -4982,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
int compressed;
int compress_type;
again:
read_lock(&em_tree->lock);
......@@ -5041,7 +5062,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
compressed = btrfs_file_extent_compression(leaf, item);
compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
......@@ -5087,8 +5108,9 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
if (compressed) {
if (compress_type != BTRFS_COMPRESS_NONE) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
em->block_start = bytenr;
em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
item);
......@@ -5122,12 +5144,14 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
em->orig_start = EXTENT_MAP_INLINE;
if (compressed)
if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
}
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) ==
BTRFS_COMPRESS_ZLIB) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
......@@ -6477,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
ei->force_compress = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
......@@ -7105,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
return -EROFS;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
return -EACCES;
return generic_permission(inode, mask, flags, btrfs_check_acl);
......
......@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int flags, oldflags;
int ret;
if (btrfs_root_readonly(root))
return -EROFS;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
......@@ -360,7 +363,8 @@ static noinline int create_subvol(struct btrfs_root *root,
}
static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
char *name, int namelen, u64 *async_transid)
char *name, int namelen, u64 *async_transid,
bool readonly)
{
struct inode *inode;
struct dentry *parent;
......@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
if (IS_ERR(trans)) {
......@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid)
u64 *async_transid, bool readonly)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
......@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (snap_src) {
error = create_snapshot(snap_src, dentry,
name, namelen, async_transid);
name, namelen, async_transid, readonly);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen, async_transid);
......@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct page *page;
struct btrfs_super_block *disk_super;
unsigned long last_index;
unsigned long ra_pages = root->fs_info->bdi.ra_pages;
unsigned long total_read = 0;
u64 features;
u64 page_start;
u64 page_end;
u64 last_len = 0;
......@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
u64 defrag_end = 0;
unsigned long i;
int ret;
int compress_type = BTRFS_COMPRESS_ZLIB;
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
if (range->compress_type > BTRFS_COMPRESS_TYPES)
return -EINVAL;
if (range->compress_type)
compress_type = range->compress_type;
}
if (inode->i_size == 0)
return 0;
......@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
total_read++;
mutex_lock(&inode->i_mutex);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = 1;
BTRFS_I(inode)->force_compress = compress_type;
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
......@@ -781,10 +796,17 @@ static int btrfs_defrag_file(struct file *file,
atomic_dec(&root->fs_info->async_submit_draining);
mutex_lock(&inode->i_mutex);
BTRFS_I(inode)->force_compress = 0;
BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
mutex_unlock(&inode->i_mutex);
}
disk_super = &root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
btrfs_set_super_incompat_flags(disk_super, features);
}
return 0;
err_reservations:
......@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
char *name,
unsigned long fd,
int subvol,
u64 *transid)
u64 *transid,
bool readonly)
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct file *src_file;
......@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
NULL, transid);
NULL, transid, readonly);
} else {
struct inode *src_inode;
src_file = fget(fd);
......@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
}
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
transid);
transid, readonly);
fput(src_file);
}
out:
......@@ -946,58 +969,139 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
void __user *arg, int subvol,
int v2)
void __user *arg, int subvol)
{
struct btrfs_ioctl_vol_args *vol_args = NULL;
struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
char *name;
u64 fd;
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (v2) {
u64 transid = 0;
u64 *ptr = NULL;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
if (IS_ERR(vol_args_v2))
return PTR_ERR(vol_args_v2);
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol,
NULL, false);
if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
ret = -EINVAL;
goto out;
}
name = vol_args_v2->name;
fd = vol_args_v2->fd;
vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
kfree(vol_args);
return ret;
}
if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
ptr = &transid;
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
void __user *arg, int subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
u64 transid = 0;
u64 *ptr = NULL;
bool readonly = false;
ret = btrfs_ioctl_snap_create_transid(file, name, fd,
subvol, ptr);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
if (ret == 0 && ptr &&
copy_to_user(arg +
offsetof(struct btrfs_ioctl_vol_args_v2,
transid), ptr, sizeof(*ptr)))
ret = -EFAULT;
} else {
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
name = vol_args->name;
fd = vol_args->fd;
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_ioctl_snap_create_transid(file, name, fd,
subvol, NULL);
if (vol_args->flags &
~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
ret = -EOPNOTSUPP;
goto out;
}
if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
ptr = &transid;
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol,
ptr, readonly);
if (ret == 0 && ptr &&
copy_to_user(arg +
offsetof(struct btrfs_ioctl_vol_args_v2,
transid), ptr, sizeof(*ptr)))
ret = -EFAULT;
out:
kfree(vol_args);
kfree(vol_args_v2);
return ret;
}
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&root->fs_info->subvol_sem);
if (btrfs_root_readonly(root))
flags |= BTRFS_SUBVOL_RDONLY;
up_read(&root->fs_info->subvol_sem);
if (copy_to_user(arg, &flags, sizeof(flags)))
ret = -EFAULT;
return ret;
}
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
u64 flags;
int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
return -EINVAL;
if (flags & ~BTRFS_SUBVOL_RDONLY)
return -EOPNOTSUPP;
down_write(&root->fs_info->subvol_sem);
/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
goto out;
root_flags = btrfs_root_flags(&root->root_item);
if (flags & BTRFS_SUBVOL_RDONLY)
btrfs_set_root_flags(&root->root_item,
root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
else
btrfs_set_root_flags(&root->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_reset;
}
ret = btrfs_update_root(trans, root,
&root->root_key, &root->root_item);
btrfs_commit_transaction(trans, root);
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
out:
up_write(&root->fs_info->subvol_sem);
return ret;
}
......@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
struct btrfs_ioctl_defrag_range_args *range;
int ret;
if (btrfs_root_readonly(root))
return -EROFS;
ret = mnt_want_write(file->f_path.mnt);
if (ret)
return ret;
......@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
return -EINVAL;
if (btrfs_root_readonly(root))
return -EROFS;
ret = mnt_want_write(file->f_path.mnt);
if (ret)
return ret;
......@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (file->private_data)
goto out;
ret = -EROFS;
if (btrfs_root_readonly(root))
goto out;
ret = mnt_want_write(file->f_path.mnt);
if (ret)
goto out;
......@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0, 0);
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
return btrfs_ioctl_snap_create(file, argp, 0, 1);
return btrfs_ioctl_snap_create_v2(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1, 0);
return btrfs_ioctl_snap_create(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_SUBVOL_GETFLAGS:
return btrfs_ioctl_subvol_getflags(file, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
return btrfs_ioctl_default_subvol(file, argp);
case BTRFS_IOC_DEFRAG:
......
......@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
};
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_SUBVOL_NAME_MAX 4039
struct btrfs_ioctl_vol_args_v2 {
......@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
*/
__u32 extent_thresh;
/*
* which compression method to use if turning on compression
* for this defrag operation. If unspecified, zlib will
* be used
*/
__u32 compress_type;
/* spare for later */
__u32 unused[5];
__u32 unused[4];
};
struct btrfs_ioctl_space_info {
......@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
#endif
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/lzo.h>
#include "compression.h"
#define LZO_LEN 4
struct workspace {
void *mem;
void *buf; /* where compressed data goes */
void *cbuf; /* where decompressed data goes */
struct list_head list;
};
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
vfree(workspace->buf);
vfree(workspace->cbuf);
vfree(workspace->mem);
kfree(workspace);
}
static struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
if (!workspace)
return ERR_PTR(-ENOMEM);
workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
return &workspace->list;
fail:
lzo_free_workspace(&workspace->list);
return ERR_PTR(-ENOMEM);
}
static inline void write_compress_length(char *buf, size_t len)
{
__le32 dlen;
dlen = cpu_to_le32(len);
memcpy(buf, &dlen, LZO_LEN);
}
static inline size_t read_compress_length(char *buf)
{
__le32 dlen;
memcpy(&dlen, buf, LZO_LEN);
return le32_to_cpu(dlen);
}
static int lzo_compress_pages(struct list_head *ws,
struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
char *data_in;
char *cpage_out;
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
unsigned long bytes_left;
size_t in_len;
size_t out_len;
char *buf;
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long pg_bytes_left;
unsigned long out_offset;
unsigned long bytes;
*out_pages = 0;
*total_out = 0;
*total_in = 0;
in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
/*
* store the size of all chunks of compressed data in
* the first 4 bytes
*/
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
out_offset = LZO_LEN;
tot_out = LZO_LEN;
pages[0] = out_page;
nr_pages = 1;
pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
/* compress at most one page of data each time */
in_len = min(len, PAGE_CACHE_SIZE);
while (tot_in < len) {
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
if (ret != LZO_E_OK) {
printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
ret);
ret = -1;
goto out;
}
/* store the size of this chunk of compressed data */
write_compress_length(cpage_out + out_offset, out_len);
tot_out += LZO_LEN;
out_offset += LZO_LEN;
pg_bytes_left -= LZO_LEN;
tot_in += in_len;
tot_out += out_len;
/* copy bytes from the working buffer into the pages */
buf = workspace->cbuf;
while (out_len) {
bytes = min_t(unsigned long, pg_bytes_left, out_len);
memcpy(cpage_out + out_offset, buf, bytes);
out_len -= bytes;
pg_bytes_left -= bytes;
buf += bytes;
out_offset += bytes;
/*
* we need another page for writing out.
*
* Note if there's less than 4 bytes left, we just
* skip to a new page.
*/
if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
pg_bytes_left == 0) {
if (pg_bytes_left) {
memset(cpage_out + out_offset, 0,
pg_bytes_left);
tot_out += pg_bytes_left;
}
/* we're done, don't allocate new page */
if (out_len == 0 && tot_in >= len)
break;
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -1;
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
pages[nr_pages++] = out_page;
pg_bytes_left = PAGE_CACHE_SIZE;
out_offset = 0;
}
}
/* we're making it bigger, give up */
if (tot_in > 8192 && tot_in < tot_out)
goto out;
/* we're all done */
if (tot_in >= len)
break;
if (tot_out > max_out)
break;
bytes_left = len - tot_in;
kunmap(in_page);
page_cache_release(in_page);
start += PAGE_CACHE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
in_len = min(bytes_left, PAGE_CACHE_SIZE);
}
if (tot_out > tot_in)
goto out;
/* store the size of all chunks of compressed data */
cpage_out = kmap(pages[0]);
write_compress_length(cpage_out, tot_out);
kunmap(pages[0]);
ret = 0;
*total_out = tot_out;
*total_in = tot_in;
out:
*out_pages = nr_pages;
if (out_page)
kunmap(out_page);
if (in_page) {
kunmap(in_page);
page_cache_release(in_page);
}
return ret;
}
static int lzo_decompress_biovec(struct list_head *ws,
struct page **pages_in,
u64 disk_start,
struct bio_vec *bvec,
int vcnt,
size_t srclen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE;
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
unsigned long working_bytes;
unsigned long pg_offset;
size_t in_len;
size_t out_len;
unsigned long in_offset;
unsigned long in_page_bytes_left;
unsigned long tot_in;
unsigned long tot_out;
unsigned long tot_len;
char *buf;
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
tot_in = LZO_LEN;
in_offset = LZO_LEN;
tot_len = min_t(size_t, srclen, tot_len);
in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
tot_out = 0;
pg_offset = 0;
while (tot_in < tot_len) {
in_len = read_compress_length(data_in + in_offset);
in_page_bytes_left -= LZO_LEN;
in_offset += LZO_LEN;
tot_in += LZO_LEN;
tot_in += in_len;
working_bytes = in_len;
/* fast path: avoid using the working buffer */
if (in_page_bytes_left >= in_len) {
buf = data_in + in_offset;
bytes = in_len;
goto cont;
}
/* copy bytes from the pages into the working buffer */
buf = workspace->cbuf;
buf_offset = 0;
while (working_bytes) {
bytes = min(working_bytes, in_page_bytes_left);
memcpy(buf + buf_offset, data_in + in_offset, bytes);
buf_offset += bytes;
cont:
working_bytes -= bytes;
in_page_bytes_left -= bytes;
in_offset += bytes;
/* check if we need to pick another page */
if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
|| in_page_bytes_left == 0) {
tot_in += in_page_bytes_left;
if (working_bytes == 0 && tot_in >= tot_len)
break;
kunmap(pages_in[page_in_index]);
page_in_index++;
if (page_in_index >= total_pages_in) {
ret = -1;
data_in = NULL;
goto done;
}
data_in = kmap(pages_in[page_in_index]);
in_page_bytes_left = PAGE_CACHE_SIZE;
in_offset = 0;
}
}
out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "btrfs decompress failed\n");
ret = -1;
break;
}
buf_start = tot_out;
tot_out += out_len;
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
tot_out, disk_start,
bvec, vcnt,
&page_out_index, &pg_offset);
if (ret2 == 0)
break;
}
done:
if (data_in)
kunmap(pages_in[page_in_index]);
return ret;
}
static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
size_t tot_len;
int ret = 0;
char *kaddr;
unsigned long bytes;
BUG_ON(srclen < LZO_LEN);
tot_len = read_compress_length(data_in);
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
data_in += LZO_LEN;
out_len = PAGE_CACHE_SIZE;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "btrfs decompress failed!\n");
ret = -1;
goto out;
}
if (out_len < start_byte) {
ret = -1;
goto out;
}
bytes = min_t(unsigned long, destlen, out_len - start_byte);
kaddr = kmap_atomic(dest_page, KM_USER0);
memcpy(kaddr, workspace->buf + start_byte, bytes);
kunmap_atomic(kaddr, KM_USER0);
out:
return ret;
}
struct btrfs_compress_op btrfs_lzo_compress = {
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
.decompress_biovec = lzo_decompress_biovec,
.decompress = lzo_decompress,
};
......@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*/
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio)
int type, int dio, int compress_type)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
......@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = inode;
entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
......@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 0);
disk_len, type, 0,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 1);
disk_len, type, 1,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 0,
compress_type);
}
/*
......
......@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
......@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
/* flags (described above) */
unsigned long flags;
/* compression algorithm */
int compress_type;
/* reference count */
atomic_t refs;
......@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
......
......@@ -54,6 +54,90 @@
static const struct super_operations btrfs_super_ops;
static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
char nbuf[16])
{
char *errstr = NULL;
switch (errno) {
case -EIO:
errstr = "IO failure";
break;
case -ENOMEM:
errstr = "Out of memory";
break;
case -EROFS:
errstr = "Readonly filesystem";
break;
default:
if (nbuf) {
if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
errstr = nbuf;
}
break;
}
return errstr;
}
static void __save_error_info(struct btrfs_fs_info *fs_info)
{
/*
* today we only save the error info into ram. Long term we'll
* also send it down to the disk
*/
fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
}
/* NOTE:
* We move write_super stuff at umount in order to avoid deadlock
* for umount hold all lock.
*/
static void save_error_info(struct btrfs_fs_info *fs_info)
{
__save_error_info(fs_info);
}
/* btrfs handle error by forcing the filesystem readonly */
static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
{
struct super_block *sb = fs_info->sb;
if (sb->s_flags & MS_RDONLY)
return;
if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
sb->s_flags |= MS_RDONLY;
printk(KERN_INFO "btrfs is forced readonly\n");
}
}
/*
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
*/
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno)
{
struct super_block *sb = fs_info->sb;
char nbuf[16];
const char *errstr;
/*
* Special case: if the error is EROFS, and we're already
* under MS_RDONLY, then it is safe here.
*/
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
errstr = btrfs_decode_error(fs_info, errno, nbuf);
printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
save_error_info(fs_info);
btrfs_handle_error(fs_info);
}
static void btrfs_put_super(struct super_block *sb)
{
struct btrfs_root *root = btrfs_sb(sb);
......@@ -69,9 +153,9 @@ enum {
Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
Opt_user_subvol_rm_allowed,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
};
static match_table_t tokens = {
......@@ -86,7 +170,9 @@ static match_table_t tokens = {
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
{Opt_compress, "compress"},
{Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
{Opt_compress_force_type, "compress-force=%s"},
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
......@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
char *p, *num, *orig;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
if (!options)
return 0;
......@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_compress:
printk(KERN_INFO "btrfs: use compression\n");
btrfs_set_opt(info->mount_opt, COMPRESS);
break;
case Opt_compress_force:
printk(KERN_INFO "btrfs: forcing compression\n");
btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
case Opt_compress_force_type:
compress_force = true;
case Opt_compress:
case Opt_compress_type:
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
} else if (strcmp(args[0].from, "lzo") == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
} else {
ret = -EINVAL;
goto out;
}
btrfs_set_opt(info->mount_opt, COMPRESS);
if (compress_force) {
btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
pr_info("btrfs: force %s compression\n",
compress_type);
} else
pr_info("btrfs: use %s compression\n",
compress_type);
break;
case Opt_ssd:
printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
......@@ -753,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
/*
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 skip_space;
u64 type;
u64 avail_space;
u64 used_space;
u64 min_stripe_size;
int min_stripes = 1;
int i = 0, nr_devices;
int ret;
nr_devices = fs_info->fs_devices->rw_devices;
BUG_ON(!nr_devices);
devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
/* calc min stripe number for data space alloction */
type = btrfs_get_alloc_profile(root, 1);
if (type & BTRFS_BLOCK_GROUP_RAID0)
min_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
min_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
min_stripes = 4;
if (type & BTRFS_BLOCK_GROUP_DUP)
min_stripe_size = 2 * BTRFS_STRIPE_LEN;
else
min_stripe_size = BTRFS_STRIPE_LEN;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
if (!device->in_fs_metadata)
continue;
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
do_div(avail_space, BTRFS_STRIPE_LEN);
avail_space *= BTRFS_STRIPE_LEN;
/*
* In order to avoid overwritting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
skip_space = 1024 * 1024;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
device->total_bytes)
skip_space = max(fs_info->alloc_start, skip_space);
/*
* btrfs can not use the free space in [0, skip_space - 1],
* we must subtract it from the total. In order to implement
* it, we account the used space in this range first.
*/
ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
&used_space);
if (ret) {
kfree(devices_info);
return ret;
}
/* calc the free space in [0, skip_space - 1] */
skip_space -= used_space;
/*
* we can use the free space in [0, skip_space - 1], subtract
* it from the total.
*/
if (avail_space && avail_space >= skip_space)
avail_space -= skip_space;
else
avail_space = 0;
if (avail_space < min_stripe_size)
continue;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
i++;
}
nr_devices = i;
btrfs_descending_sort_devices(devices_info, nr_devices);
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= min_stripes) {
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
avail_space += devices_info[i].max_avail * min_stripes;
alloc_size = devices_info[i].max_avail;
for (j = i + 1 - min_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
nr_devices--;
}
kfree(devices_info);
*free_bytes = avail_space;
return 0;
}
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
......@@ -760,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_used_data = 0;
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
int ret;
/* holding chunk_muext to avoid allocating new chunks */
mutex_lock(&root->fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_SYSTEM))
total_used_data += found->disk_total;
else
total_used_data += found->disk_used;
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
total_free_data += found->disk_total - found->disk_used;
total_free_data -=
btrfs_account_ro_block_groups_free_space(found);
}
total_used += found->disk_used;
}
rcu_read_unlock();
......@@ -778,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bavail = total_free_data;
ret = btrfs_calc_avail_data_space(root, &total_free_data);
if (ret) {
mutex_unlock(&root->fs_info->chunk_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&root->fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
......@@ -897,10 +1136,14 @@ static int __init init_btrfs_fs(void)
if (err)
return err;
err = btrfs_init_cachep();
err = btrfs_init_compress();
if (err)
goto free_sysfs;
err = btrfs_init_cachep();
if (err)
goto free_compress;
err = extent_io_init();
if (err)
goto free_cachep;
......@@ -928,6 +1171,8 @@ static int __init init_btrfs_fs(void)
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
free_sysfs:
btrfs_exit_sysfs();
return err;
......@@ -942,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_zlib_exit();
btrfs_exit_compress();
}
module_init(init_btrfs_fs)
......
......@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
int ret;
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
return ERR_PTR(-EROFS);
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h)
......@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 to_reserve = 0;
u64 index = 0;
u64 objectid;
u64 root_flags;
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
......@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
root_flags = btrfs_root_flags(new_root_item);
if (pending->readonly)
root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
else
root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
btrfs_set_root_flags(new_root_item, root_flags);
old = btrfs_lock_root_node(root);
btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_set_lock_blocking(old);
......
......@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reseration for relocation */
int error;
bool readonly;
struct list_head list;
};
......
......@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
......@@ -600,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh)
if (!bh) {
ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
......@@ -703,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error_close;
bh = btrfs_read_dev_super(bdev);
if (!bh) {
ret = -EIO;
ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
......@@ -729,59 +732,167 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
return ret;
}
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 extent_end;
int ret;
int slot;
struct extent_buffer *l;
*length = 0;
if (start >= device->total_bytes)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (key.offset <= start && extent_end > end) {
*length = end - start + 1;
break;
} else if (key.offset <= start && extent_end > start)
*length += extent_end - start;
else if (key.offset > start && extent_end <= end)
*length += extent_end - key.offset;
else if (key.offset > start && key.offset <= end) {
*length += end - key.offset + 1;
break;
} else if (key.offset > end)
break;
next:
path->slots[0]++;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* find_free_dev_extent - find free space in the specified device
* @trans: transaction handler
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
* @len: the size of the free space. that we find, or the size of the max
* free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
* of the max free space.
*
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail)
u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size = 0;
u64 last_byte = 0;
u64 search_start = 0;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot = 0;
int start_found;
int slot;
struct extent_buffer *l;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
start_found = 0;
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
search_start = max((u64)1024 * 1024, search_start);
search_start = 1024 * 1024;
if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
if (root->fs_info->alloc_start + num_bytes <= search_end)
search_start = max(root->fs_info->alloc_start, search_start);
max_hole_start = search_start;
max_hole_size = 0;
if (search_start >= search_end) {
ret = -ENOSPC;
goto error;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto error;
}
path->reada = 2;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
goto error;
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto error;
if (ret > 0)
start_found = 1;
goto out;
}
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
while (1) {
l = path->nodes[0];
slot = path->slots[0];
......@@ -790,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
if (ret == 0)
continue;
if (ret < 0)
goto error;
no_more_items:
if (!start_found) {
if (search_start >= search_end) {
ret = -ENOSPC;
goto error;
}
*start = search_start;
start_found = 1;
goto check_pending;
}
*start = last_byte > search_start ?
last_byte : search_start;
if (search_end <= *start) {
ret = -ENOSPC;
goto error;
}
goto check_pending;
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
......@@ -815,48 +911,62 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
goto next;
if (key.objectid > device->devid)
goto no_more_items;
break;
if (key.offset >= search_start && key.offset > last_byte &&
start_found) {
if (last_byte < search_start)
last_byte = search_start;
hole_size = key.offset - last_byte;
if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
goto next;
if (hole_size > *max_avail)
*max_avail = hole_size;
if (key.offset > search_start) {
hole_size = key.offset - search_start;
if (key.offset > last_byte &&
hole_size >= num_bytes) {
*start = last_byte;
goto check_pending;
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/*
* If this free space is greater than which we need,
* it must be the max free space that we have found
* until now, so max_hole_start must point to the start
* of this free space and the length of this free space
* is stored in max_hole_size. Thus, we return
* max_hole_start and max_hole_size and go back to the
* caller.
*/
if (hole_size >= num_bytes) {
ret = 0;
goto out;
}
}
if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
goto next;
start_found = 1;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (extent_end > search_start)
search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
check_pending:
/* we have to make sure we didn't find an extent that has already
* been allocated by the map tree or the original allocation
*/
BUG_ON(*start < search_start);
if (*start + num_bytes > search_end) {
ret = -ENOSPC;
goto error;
hole_size = search_end- search_start;
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/* check for pending inserts here */
ret = 0;
error:
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
out:
btrfs_free_path(path);
error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
......@@ -1196,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
ret = -EIO;
ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
......@@ -1916,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
......@@ -2154,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
return calc_size * num_stripes;
}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
struct map_lookup **map_ret,
u64 *num_bytes, u64 *stripe_size,
u64 start, u64 type)
/* Used to sort the devices by max_avail(descending sort) */
int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct list_head *cur;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
struct extent_map *em;
struct list_head private_devs;
int min_stripe_size = 1 * 1024 * 1024;
u64 calc_size = 1024 * 1024 * 1024;
u64 max_chunk_size = calc_size;
u64 min_free;
u64 avail;
u64 max_avail = 0;
u64 dev_offset;
int num_stripes = 1;
int min_stripes = 1;
int sub_stripes = 0;
int looped = 0;
int ret;
int index;
int stripe_len = 64 * 1024;
if (((struct btrfs_device_info *)dev_info1)->max_avail >
((struct btrfs_device_info *)dev_info2)->max_avail)
return -1;
else if (((struct btrfs_device_info *)dev_info1)->max_avail <
((struct btrfs_device_info *)dev_info2)->max_avail)
return 1;
else
return 0;
}
if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
(type & BTRFS_BLOCK_GROUP_DUP)) {
WARN_ON(1);
type &= ~BTRFS_BLOCK_GROUP_DUP;
}
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
int *num_stripes, int *min_stripes,
int *sub_stripes)
{
*num_stripes = 1;
*min_stripes = 1;
*sub_stripes = 0;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
num_stripes = fs_devices->rw_devices;
min_stripes = 2;
*num_stripes = fs_devices->rw_devices;
*min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
num_stripes = 2;
min_stripes = 2;
*num_stripes = 2;
*min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
if (fs_devices->rw_devices < 2)
return -ENOSPC;
num_stripes = 2;
min_stripes = 2;
*num_stripes = 2;
*min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
num_stripes = fs_devices->rw_devices;
if (num_stripes < 4)
*num_stripes = fs_devices->rw_devices;
if (*num_stripes < 4)
return -ENOSPC;
num_stripes &= ~(u32)1;
sub_stripes = 2;
min_stripes = 4;
*num_stripes &= ~(u32)1;
*sub_stripes = 2;
*min_stripes = 4;
}
return 0;
}
static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
u64 proposed_size, u64 type,
int num_stripes, int small_stripe)
{
int min_stripe_size = 1 * 1024 * 1024;
u64 calc_size = proposed_size;
u64 max_chunk_size = calc_size;
int ncopies = 1;
if (type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID10))
ncopies = 2;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
......@@ -2230,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
again:
max_avail = 0;
if (!map || map->num_stripes != num_stripes) {
kfree(map);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map)
return -ENOMEM;
map->num_stripes = num_stripes;
}
if (calc_size * num_stripes > max_chunk_size) {
calc_size = max_chunk_size;
if (calc_size * num_stripes > max_chunk_size * ncopies) {
calc_size = max_chunk_size * ncopies;
do_div(calc_size, num_stripes);
do_div(calc_size, stripe_len);
calc_size *= stripe_len;
do_div(calc_size, BTRFS_STRIPE_LEN);
calc_size *= BTRFS_STRIPE_LEN;
}
/* we don't want tiny stripes */
if (!looped)
if (!small_stripe)
calc_size = max_t(u64, min_stripe_size, calc_size);
/*
* we're about to do_div by the stripe_len so lets make sure
* we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
* we end up with something bigger than a stripe
*/
calc_size = max_t(u64, calc_size, stripe_len * 4);
calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
do_div(calc_size, BTRFS_STRIPE_LEN);
calc_size *= BTRFS_STRIPE_LEN;
return calc_size;
}
static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
int num_stripes)
{
struct map_lookup *new;
size_t len = map_lookup_size(num_stripes);
BUG_ON(map->num_stripes < num_stripes);
if (map->num_stripes == num_stripes)
return map;
new = kmalloc(len, GFP_NOFS);
if (!new) {
/* just change map->num_stripes */
map->num_stripes = num_stripes;
return map;
}
memcpy(new, map, len);
new->num_stripes = num_stripes;
kfree(map);
return new;
}
/*
* helper to allocate device space from btrfs_device_info, in which we stored
* max free space information of every device. It is used when we can not
* allocate chunks by default size.
*
* By this helper, we can allocate a new chunk as larger as possible.
*/
static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
struct btrfs_fs_devices *fs_devices,
struct btrfs_device_info *devices,
int nr_device, u64 type,
struct map_lookup **map_lookup,
int min_stripes, u64 *stripe_size)
{
int i, index, sort_again = 0;
int min_devices = min_stripes;
u64 max_avail, min_free;
struct map_lookup *map = *map_lookup;
int ret;
if (nr_device < min_stripes)
return -ENOSPC;
btrfs_descending_sort_devices(devices, nr_device);
max_avail = devices[0].max_avail;
if (!max_avail)
return -ENOSPC;
for (i = 0; i < nr_device; i++) {
/*
* if dev_offset = 0, it means the free space of this device
* is less than what we need, and we didn't search max avail
* extent on this device, so do it now.
*/
if (!devices[i].dev_offset) {
ret = find_free_dev_extent(trans, devices[i].dev,
max_avail,
&devices[i].dev_offset,
&devices[i].max_avail);
if (ret != 0 && ret != -ENOSPC)
return ret;
sort_again = 1;
}
}
/* we update the max avail free extent of each devices, sort again */
if (sort_again)
btrfs_descending_sort_devices(devices, nr_device);
if (type & BTRFS_BLOCK_GROUP_DUP)
min_devices = 1;
if (!devices[min_devices - 1].max_avail)
return -ENOSPC;
max_avail = devices[min_devices - 1].max_avail;
if (type & BTRFS_BLOCK_GROUP_DUP)
do_div(max_avail, 2);
max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
min_stripes, 1);
if (type & BTRFS_BLOCK_GROUP_DUP)
min_free = max_avail * 2;
else
min_free = max_avail;
if (min_free > devices[min_devices - 1].max_avail)
return -ENOSPC;
map = __shrink_map_lookup_stripes(map, min_stripes);
*stripe_size = max_avail;
index = 0;
for (i = 0; i < min_stripes; i++) {
map->stripes[i].dev = devices[index].dev;
map->stripes[i].physical = devices[index].dev_offset;
if (type & BTRFS_BLOCK_GROUP_DUP) {
i++;
map->stripes[i].dev = devices[index].dev;
map->stripes[i].physical = devices[index].dev_offset +
max_avail;
}
index++;
}
*map_lookup = map;
return 0;
}
do_div(calc_size, stripe_len);
calc_size *= stripe_len;
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
struct map_lookup **map_ret,
u64 *num_bytes, u64 *stripe_size,
u64 start, u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct list_head *cur;
struct map_lookup *map;
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_device_info *devices_info;
struct list_head private_devs;
u64 calc_size = 1024 * 1024 * 1024;
u64 min_free;
u64 avail;
u64 dev_offset;
int num_stripes;
int min_stripes;
int sub_stripes;
int min_devices; /* the min number of devices we need */
int i;
int ret;
int index;
if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
(type & BTRFS_BLOCK_GROUP_DUP)) {
WARN_ON(1);
type &= ~BTRFS_BLOCK_GROUP_DUP;
}
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
&min_stripes, &sub_stripes);
if (ret)
return ret;
devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
ret = -ENOMEM;
goto error;
}
map->num_stripes = num_stripes;
cur = fs_devices->alloc_list.next;
index = 0;
i = 0;
if (type & BTRFS_BLOCK_GROUP_DUP)
calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
num_stripes, 0);
if (type & BTRFS_BLOCK_GROUP_DUP) {
min_free = calc_size * 2;
else
min_devices = 1;
} else {
min_free = calc_size;
/*
* we add 1MB because we never use the first 1MB of the device, unless
* we've looped, then we are likely allocating the maximum amount of
* space left already
*/
if (!looped)
min_free += 1024 * 1024;
min_devices = min_stripes;
}
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
......@@ -2287,27 +2559,39 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
cur = cur->next;
if (device->in_fs_metadata && avail >= min_free) {
ret = find_free_dev_extent(trans, device,
min_free, &dev_offset,
&max_avail);
ret = find_free_dev_extent(trans, device, min_free,
&devices_info[i].dev_offset,
&devices_info[i].max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
map->stripes[index].dev = device;
map->stripes[index].physical = dev_offset;
map->stripes[index].physical =
devices_info[i].dev_offset;
index++;
if (type & BTRFS_BLOCK_GROUP_DUP) {
map->stripes[index].dev = device;
map->stripes[index].physical =
dev_offset + calc_size;
devices_info[i].dev_offset +
calc_size;
index++;
}
}
} else if (device->in_fs_metadata && avail > max_avail)
max_avail = avail;
} else if (ret != -ENOSPC)
goto error;
devices_info[i].dev = device;
i++;
} else if (device->in_fs_metadata &&
avail >= BTRFS_STRIPE_LEN) {
devices_info[i].dev = device;
devices_info[i].max_avail = avail;
i++;
}
if (cur == &fs_devices->alloc_list)
break;
}
list_splice(&private_devs, &fs_devices->alloc_list);
if (index < num_stripes) {
if (index >= min_stripes) {
......@@ -2316,34 +2600,36 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
num_stripes /= sub_stripes;
num_stripes *= sub_stripes;
}
looped = 1;
goto again;
}
if (!looped && max_avail > 0) {
looped = 1;
calc_size = max_avail;
goto again;
map = __shrink_map_lookup_stripes(map, num_stripes);
} else if (i >= min_devices) {
ret = __btrfs_alloc_tiny_space(trans, fs_devices,
devices_info, i, type,
&map, min_stripes,
&calc_size);
if (ret)
goto error;
} else {
ret = -ENOSPC;
goto error;
}
kfree(map);
return -ENOSPC;
}
map->sector_size = extent_root->sectorsize;
map->stripe_len = stripe_len;
map->io_align = stripe_len;
map->io_width = stripe_len;
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
map->num_stripes = num_stripes;
map->sub_stripes = sub_stripes;
*map_ret = map;
*stripe_size = calc_size;
*num_bytes = chunk_bytes_by_type(type, calc_size,
num_stripes, sub_stripes);
map->num_stripes, sub_stripes);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
kfree(map);
return -ENOMEM;
ret = -ENOMEM;
goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
......@@ -2376,7 +2662,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
index++;
}
kfree(devices_info);
return 0;
error:
kfree(map);
kfree(devices_info);
return ret;
}
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
......
......@@ -20,8 +20,11 @@
#define __BTRFS_VOLUMES_
#include <linux/bio.h>
#include <linux/sort.h>
#include "async-thread.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
struct buffer_head;
struct btrfs_pending_bios {
struct bio *head;
......@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
struct btrfs_bio_stripe stripes[];
};
struct btrfs_device_info {
struct btrfs_device *dev;
u64 dev_offset;
u64 max_avail;
};
/* Used to sort the devices by max_avail(descending sort) */
int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
/*
* sort the devices by max_avail, in which max free extent size of each device
* is stored.(Descending Sort)
*/
static inline void btrfs_descending_sort_devices(
struct btrfs_device_info *devices,
size_t nr_devices)
{
sort(devices, nr_devices, sizeof(struct btrfs_device_info),
btrfs_cmp_device_free_bytes, NULL);
}
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
......
......@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
/*
* The permission on security.* and system.* is not checked
* in permission().
*/
if (btrfs_root_readonly(root))
return -EROFS;
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
......@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
/*
* The permission on security.* and system.* is not checked
* in permission().
*/
if (btrfs_root_readonly(root))
return -EROFS;
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
......
......@@ -32,15 +32,6 @@
#include <linux/bio.h>
#include "compression.h"
/* Plan: call deflate() with avail_in == *sourcelen,
avail_out = *dstlen - 12 and flush == Z_FINISH.
If it doesn't manage to finish, call it again with
avail_in == 0 and avail_out set to the remaining 12
bytes for it to clean up.
Q: Is 12 bytes sufficient?
*/
#define STREAM_END_SPACE 12
struct workspace {
z_stream inf_strm;
z_stream def_strm;
......@@ -48,152 +39,51 @@ struct workspace {
struct list_head list;
};
static LIST_HEAD(idle_workspace);
static DEFINE_SPINLOCK(workspace_lock);
static unsigned long num_workspace;
static atomic_t alloc_workspace = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
/*
* this finds an available zlib workspace or allocates a new one
* NULL or an ERR_PTR is returned if things go bad.
*/
static struct workspace *find_zlib_workspace(void)
vfree(workspace->def_strm.workspace);
vfree(workspace->inf_strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
static struct list_head *zlib_alloc_workspace(void)
{
struct workspace *workspace;
int ret;
int cpus = num_online_cpus();
again:
spin_lock(&workspace_lock);
if (!list_empty(&idle_workspace)) {
workspace = list_entry(idle_workspace.next, struct workspace,
list);
list_del(&workspace->list);
num_workspace--;
spin_unlock(&workspace_lock);
return workspace;
}
spin_unlock(&workspace_lock);
if (atomic_read(&alloc_workspace) > cpus) {
DEFINE_WAIT(wait);
prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&alloc_workspace) > cpus)
schedule();
finish_wait(&workspace_wait, &wait);
goto again;
}
atomic_inc(&alloc_workspace);
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
if (!workspace) {
ret = -ENOMEM;
goto fail;
}
if (!workspace)
return ERR_PTR(-ENOMEM);
workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
if (!workspace->def_strm.workspace) {
ret = -ENOMEM;
goto fail;
}
workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
if (!workspace->inf_strm.workspace) {
ret = -ENOMEM;
goto fail_inflate;
}
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
if (!workspace->buf) {
ret = -ENOMEM;
goto fail_kmalloc;
}
return workspace;
fail_kmalloc:
vfree(workspace->inf_strm.workspace);
fail_inflate:
vfree(workspace->def_strm.workspace);
fail:
kfree(workspace);
atomic_dec(&alloc_workspace);
wake_up(&workspace_wait);
return ERR_PTR(ret);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
static int free_workspace(struct workspace *workspace)
{
spin_lock(&workspace_lock);
if (num_workspace < num_online_cpus()) {
list_add_tail(&workspace->list, &idle_workspace);
num_workspace++;
spin_unlock(&workspace_lock);
if (waitqueue_active(&workspace_wait))
wake_up(&workspace_wait);
return 0;
}
spin_unlock(&workspace_lock);
vfree(workspace->def_strm.workspace);
vfree(workspace->inf_strm.workspace);
kfree(workspace->buf);
kfree(workspace);
if (!workspace->def_strm.workspace ||
!workspace->inf_strm.workspace || !workspace->buf)
goto fail;
atomic_dec(&alloc_workspace);
if (waitqueue_active(&workspace_wait))
wake_up(&workspace_wait);
return 0;
}
INIT_LIST_HEAD(&workspace->list);
/*
* cleanup function for module exit
*/
static void free_workspaces(void)
{
struct workspace *workspace;
while (!list_empty(&idle_workspace)) {
workspace = list_entry(idle_workspace.next, struct workspace,
list);
list_del(&workspace->list);
vfree(workspace->def_strm.workspace);
vfree(workspace->inf_strm.workspace);
kfree(workspace->buf);
kfree(workspace);
atomic_dec(&alloc_workspace);
}
return &workspace->list;
fail:
zlib_free_workspace(&workspace->list);
return ERR_PTR(-ENOMEM);
}
/*
* given an address space and start/len, compress the bytes.
*
* pages are allocated to hold the compressed result and stored
* in 'pages'
*
* out_pages is used to return the number of pages allocated. There
* may be pages allocated even if we return an error
*
* total_in is used to return the number of bytes actually read. It
* may be smaller then len if we had to exit early because we
* ran out of room in the pages array or because we cross the
* max_out threshold.
*
* total_out is used to return the total number of compressed bytes
*
* max_out tells us the max number of bytes that we're allowed to
* stuff into pages
*/
int btrfs_zlib_compress_pages(struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out)
static int zlib_compress_pages(struct list_head *ws,
struct address_space *mapping,
u64 start, unsigned long len,
struct page **pages,
unsigned long nr_dest_pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
struct workspace *workspace;
char *data_in;
char *cpage_out;
int nr_pages = 0;
......@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
*total_out = 0;
*total_in = 0;
workspace = find_zlib_workspace();
if (IS_ERR(workspace))
return -1;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
printk(KERN_WARNING "deflateInit failed\n");
ret = -1;
......@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
data_in = kmap(in_page);
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -1;
goto out;
}
cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
......@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -1;
goto out;
}
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
......@@ -314,55 +208,26 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
kunmap(in_page);
page_cache_release(in_page);
}
free_workspace(workspace);
return ret;
}
/*
* pages_in is an array of pages with compressed data.
*
* disk_start is the starting logical offset of this array in the file
*
* bvec is a bio_vec of pages from the file that we want to decompress into
*
* vcnt is the count of pages in the biovec
*
* srclen is the number of bytes in pages_in
*
* The basic idea is that we have a bio that was created by readpages.
* The pages in the bio are for the uncompressed data, and they may not
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
int btrfs_zlib_decompress_biovec(struct page **pages_in,
u64 disk_start,
struct bio_vec *bvec,
int vcnt,
size_t srclen)
static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
u64 disk_start,
struct bio_vec *bvec,
int vcnt,
size_t srclen)
{
int ret = 0;
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
int wbits = MAX_WBITS;
struct workspace *workspace;
char *data_in;
size_t total_out = 0;
unsigned long page_bytes_left;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
struct page *page_out;
unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE;
unsigned long buf_start;
unsigned long buf_offset;
unsigned long bytes;
unsigned long working_bytes;
unsigned long pg_offset;
unsigned long start_byte;
unsigned long current_buf_start;
char *kaddr;
workspace = find_zlib_workspace();
if (IS_ERR(workspace))
return -ENOMEM;
data_in = kmap(pages_in[page_in_index]);
workspace->inf_strm.next_in = data_in;
......@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
workspace->inf_strm.total_out = 0;
workspace->inf_strm.next_out = workspace->buf;
workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
page_out = bvec[page_out_index].bv_page;
page_bytes_left = PAGE_CACHE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
......@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "inflateInit failed\n");
ret = -1;
goto out;
return -1;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
/*
* buf start is the byte offset we're of the start of
* our workspace buffer
*/
buf_start = total_out;
/* total_out is the last byte of the workspace buffer */
buf_start = total_out;
total_out = workspace->inf_strm.total_out;
working_bytes = total_out - buf_start;
/*
* start byte is the first byte of the page we're currently
* copying into relative to the start of the compressed data.
*/
start_byte = page_offset(page_out) - disk_start;
if (working_bytes == 0) {
/* we didn't make progress in this inflate
* call, we're done
*/
if (ret != Z_STREAM_END)
ret = -1;
/* we didn't make progress in this inflate call, we're done */
if (buf_start == total_out)
break;
}
/* we haven't yet hit data corresponding to this page */
if (total_out <= start_byte)
goto next;
/*
* the start of the data we care about is offset into
* the middle of our working buffer
*/
if (total_out > start_byte && buf_start < start_byte) {
buf_offset = start_byte - buf_start;
working_bytes -= buf_offset;
} else {
buf_offset = 0;
}
current_buf_start = buf_start;
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
bytes = min(PAGE_CACHE_SIZE - pg_offset,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out, KM_USER0);
memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
bytes);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page_out);
pg_offset += bytes;
page_bytes_left -= bytes;
buf_offset += bytes;
working_bytes -= bytes;
current_buf_start += bytes;
/* check if we need to pick another page */
if (page_bytes_left == 0) {
page_out_index++;
if (page_out_index >= vcnt) {
ret = 0;
goto done;
}
page_out = bvec[page_out_index].bv_page;
pg_offset = 0;
page_bytes_left = PAGE_CACHE_SIZE;
start_byte = page_offset(page_out) - disk_start;
/*
* make sure our new page is covered by this
* working buffer
*/
if (total_out <= start_byte)
goto next;
/* the next page in the biovec might not
* be adjacent to the last page, but it
* might still be found inside this working
* buffer. bump our offset pointer
*/
if (total_out > start_byte &&
current_buf_start < start_byte) {
buf_offset = start_byte - buf_start;
working_bytes = total_out - start_byte;
current_buf_start = buf_start +
buf_offset;
}
}
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
total_out, disk_start,
bvec, vcnt,
&page_out_index, &pg_offset);
if (ret2 == 0) {
ret = 0;
goto done;
}
next:
workspace->inf_strm.next_out = workspace->buf;
workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
......@@ -516,35 +301,21 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
zlib_inflateEnd(&workspace->inf_strm);
if (data_in)
kunmap(pages_in[page_in_index]);
out:
free_workspace(workspace);
return ret;
}
/*
* a less complex decompression routine. Our compressed data fits in a
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_zlib_decompress(unsigned char *data_in,
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen)
static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
struct page *dest_page,
unsigned long start_byte,
size_t srclen, size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
struct workspace *workspace;
unsigned long bytes_left = destlen;
unsigned long total_out = 0;
char *kaddr;
if (destlen > PAGE_CACHE_SIZE)
return -ENOMEM;
workspace = find_zlib_workspace();
if (IS_ERR(workspace))
return -ENOMEM;
workspace->inf_strm.next_in = data_in;
workspace->inf_strm.avail_in = srclen;
workspace->inf_strm.total_in = 0;
......@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "inflateInit failed\n");
ret = -1;
goto out;
return -1;
}
while (bytes_left > 0) {
......@@ -616,12 +386,13 @@ int btrfs_zlib_decompress(unsigned char *data_in,
ret = 0;
zlib_inflateEnd(&workspace->inf_strm);
out:
free_workspace(workspace);
return ret;
}
void btrfs_zlib_exit(void)
{
free_workspaces();
}
struct btrfs_compress_op btrfs_zlib_compress = {
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
.decompress_biovec = zlib_decompress_biovec,
.decompress = zlib_decompress,
};
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment