Commit 4506cfb6 authored by Ryan Ding, committed by Linus Torvalds

ocfs2: record UNWRITTEN extents when populate write desc

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

There is still one issue in the direct write procedure.

phase 1: alloc extent with UNWRITTEN flag
phase 2: submit direct data to disk, add zero page to page cache
phase 3: clear UNWRITTEN flag when data has been written to disk

Suppose two direct writes, A (0~3KB) and B (4~7KB), target the same
cluster 0~7KB (cluster size 8KB).  Write request A arrives at phase 2
first and zeroes the region (4~7KB).  Before request A enters phase 3,
request B arrives at phase 2 and zeroes the region (0~3KB).  In effect,
request B stomps on the data written by request A.

To resolve this issue, we should let request B know that this cluster is
already being zeroed, to prevent it from stomping on the previous write request.

This patch adds the function ocfs2_unwritten_check() to do this job.  It
records all clusters that are under direct write (in the
'ip_unwritten_list' member of the inode info) and prevents a later
direct write to the same cluster from doing the zero work again.
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 2de6a3c7
...@@ -1201,6 +1201,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, ...@@ -1201,6 +1201,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
/*
 * One cluster recorded by ocfs2_unwritten_check() on behalf of a direct
 * write. Each record sits on two lists at once: the list owned by the
 * write context that created it and the per-inode list consulted by
 * other writers.
 */
struct ocfs2_unwritten_extent {
/* Link in the creating write context's w_unwritten_list. */
struct list_head ue_node;
/* Link in the inode's ip_unwritten_list; added and removed under ip_lock. */
struct list_head ue_ip_node;
/* Copied from the write descriptor's c_cpos when the record is created. */
u32 ue_cpos;
/* Copied from the write descriptor's c_phys when the record is created. */
u32 ue_phys;
};
/* /*
* Describe the state of a single cluster to be written to. * Describe the state of a single cluster to be written to.
*/ */
...@@ -1275,6 +1282,8 @@ struct ocfs2_write_ctxt { ...@@ -1275,6 +1282,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh; struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc; struct ocfs2_cached_dealloc_ctxt w_dealloc;
struct list_head w_unwritten_list;
}; };
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
...@@ -1313,8 +1322,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) ...@@ -1313,8 +1322,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
} }
/*
 * ocfs2_free_unwritten_list() - release every ocfs2_unwritten_extent on @head.
 *
 * Each entry is unlinked from @head (via ue_node), unlinked from the list it
 * is on via ue_ip_node while holding the inode's ip_lock, and then freed.
 */
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) static void ocfs2_free_unwritten_list(struct inode *inode,
struct list_head *head)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_unwritten_extent *dz = NULL, *tmp = NULL;
/* _safe variant: the current entry is unlinked and freed inside the loop. */
list_for_each_entry_safe(dz, tmp, head, ue_node) {
list_del(&dz->ue_node);
/* Only the ue_ip_node unlink is done under ip_lock. */
spin_lock(&oi->ip_lock);
list_del(&dz->ue_ip_node);
spin_unlock(&oi->ip_lock);
kfree(dz);
}
}
static void ocfs2_free_write_ctxt(struct inode *inode,
struct ocfs2_write_ctxt *wc)
{ {
ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc); ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh); brelse(wc->w_di_bh);
kfree(wc); kfree(wc);
...@@ -1346,6 +1372,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, ...@@ -1346,6 +1372,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_large_pages = 0; wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc); ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc; *wcp = wc;
...@@ -1795,6 +1822,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, ...@@ -1795,6 +1822,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
} }
} }
/*
 * Check whether the cluster described by @desc is already recorded in the
 * inode's ip_unwritten_list.
 *
 * If it is, another writer is already zeroing this cluster and will clear
 * the UNWRITTEN flag itself, so neither c_needs_zero nor c_clear_unwritten
 * is left set on @desc.
 *
 * If it is not, and this is a direct write (wc->w_type ==
 * OCFS2_WRITE_DIRECT), a new record is added to both the inode's
 * ip_unwritten_list and wc->w_unwritten_list, and c_clear_unwritten is
 * cleared on @desc.
 *
 * Descriptors that do not need zeroing are left untouched.
 *
 * Returns 0 on success, or -ENOMEM if the record could not be allocated.
 */
static int ocfs2_unwritten_check(struct inode *inode,
				 struct ocfs2_write_ctxt *wc,
				 struct ocfs2_write_cluster_desc *desc)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_unwritten_extent *dz = NULL, *new = NULL;
	int ret = 0;

	if (!desc->c_needs_zero)
		return 0;

retry:
	spin_lock(&oi->ip_lock);
	/*
	 * No zeroing is needed here, whether this is a buffered or a direct
	 * write: whoever recorded the cluster is doing the zeroing and will
	 * clear UNWRITTEN once all cluster io has finished.
	 */
	list_for_each_entry(dz, &oi->ip_unwritten_list, ue_ip_node) {
		if (desc->c_cpos == dz->ue_cpos) {
			BUG_ON(desc->c_new);
			desc->c_needs_zero = 0;
			desc->c_clear_unwritten = 0;
			goto unlock;
		}
	}

	/* Only direct writes add records to the list. */
	if (wc->w_type != OCFS2_WRITE_DIRECT)
		goto unlock;

	if (new == NULL) {
		/*
		 * Allocate with ip_lock dropped, then rescan from the top:
		 * the list may have changed while the lock was not held.
		 */
		spin_unlock(&oi->ip_lock);
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (new == NULL) {
			ret = -ENOMEM;
			goto out;
		}
		goto retry;
	}

	/* This direct write will do the zeroing. */
	new->ue_cpos = desc->c_cpos;
	new->ue_phys = desc->c_phys;
	desc->c_clear_unwritten = 0;
	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
	/* Ownership moved to the lists; do not free it below. */
	new = NULL;
unlock:
	spin_unlock(&oi->ip_lock);
out:
	/* Frees an allocation left unused after the rescan; kfree(NULL) is a no-op. */
	kfree(new);
	return ret;
}
/* /*
* Populate each single-cluster write descriptor in the write context * Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done. * with information about the i/o to be done.
...@@ -1879,6 +1966,12 @@ static int ocfs2_populate_write_desc(struct inode *inode, ...@@ -1879,6 +1966,12 @@ static int ocfs2_populate_write_desc(struct inode *inode,
desc->c_needs_zero = 1; desc->c_needs_zero = 1;
} }
ret = ocfs2_unwritten_check(inode, wc, desc);
if (ret) {
mlog_errno(ret);
goto out;
}
num_clusters--; num_clusters--;
} }
...@@ -2215,9 +2308,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, ...@@ -2215,9 +2308,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* and non-sparse clusters we just extended. For non-sparse writes, * and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster. * we know zeros will only be needed in the first and/or last cluster.
*/ */
if (clusters_to_alloc || extents_to_split || if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
(wc->w_clen && (wc->w_desc[0].c_needs_zero || wc->w_desc[wc->w_clen - 1].c_needs_zero))
wc->w_desc[wc->w_clen - 1].c_needs_zero)))
cluster_of_pages = 1; cluster_of_pages = 1;
else else
cluster_of_pages = 0; cluster_of_pages = 0;
...@@ -2296,7 +2388,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, ...@@ -2296,7 +2388,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
ocfs2_commit_trans(osb, handle); ocfs2_commit_trans(osb, handle);
out: out:
ocfs2_free_write_ctxt(wc); ocfs2_free_write_ctxt(inode, wc);
if (data_ac) { if (data_ac) {
ocfs2_free_alloc_context(data_ac); ocfs2_free_alloc_context(data_ac);
...@@ -2406,6 +2498,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, ...@@ -2406,6 +2498,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle; handle_t *handle = wc->w_handle;
struct page *tmppage; struct page *tmppage;
BUG_ON(!list_empty(&wc->w_unwritten_list));
if (handle) { if (handle) {
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
......
...@@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode) ...@@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n", "Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno); (unsigned long long)oi->ip_blkno);
mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
"Clear inode of %llu, inode has unwritten extents\n",
(unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0); ocfs2_extent_map_trunc(inode, 0);
......
...@@ -57,6 +57,9 @@ struct ocfs2_inode_info ...@@ -57,6 +57,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */ u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */ u32 ip_attr; /* inode attributes */
/* Record unwritten extents during direct io. */
struct list_head ip_unwritten_list;
/* protected by recovery_lock. */ /* protected by recovery_lock. */
struct inode *ip_next_orphan; struct inode *ip_next_orphan;
......
...@@ -1745,6 +1745,7 @@ static void ocfs2_inode_init_once(void *data) ...@@ -1745,6 +1745,7 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock); spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode); ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers); INIT_LIST_HEAD(&oi->ip_io_markers);
INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0; oi->ip_dir_start_lookup = 0;
mutex_init(&oi->ip_unaligned_aio); mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_alloc_sem);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment