Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
linux
Commits
542c3118
Commit
542c3118
authored
Apr 16, 2015
by
Dave Chinner
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'xfs-dio-extend-fix' into for-next
Conflicts: fs/xfs/xfs_file.c
parents
6a63ef06
0cefb29e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
239 additions
and
82 deletions
+239
-82
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.c
+192
-78
fs/xfs/xfs_file.c
fs/xfs/xfs_file.c
+42
-4
fs/xfs/xfs_trace.h
fs/xfs/xfs_trace.h
+5
-0
No files found.
fs/xfs/xfs_aops.c
View file @
542c3118
...
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
...
@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
return
try_to_free_buffers
(
page
);
return
try_to_free_buffers
(
page
);
}
}
/*
* When we map a DIO buffer, we may need to attach an ioend that describes the
* type of write IO we are doing. This passes to the completion function the
* operations it needs to perform. If the mapping is for an overwrite wholly
* within the EOF then we don't need an ioend and so we don't allocate one.
* This avoids the unnecessary overhead of allocating and freeing ioends for
* workloads that don't require transactions on IO completion.
*
* If we get multiple mappings in a single IO, we might be mapping different
* types. But because the direct IO can only have a single private pointer, we
* need to ensure that:
*
* a) i) the ioend spans the entire region of unwritten mappings; or
* ii) the ioend spans all the mappings that cross or are beyond EOF; and
* b) if it contains unwritten extents, it is *permanently* marked as such
*
* We could do this by chaining ioends like buffered IO does, but we only
* actually get one IO completion callback from the direct IO, and that spans
* the entire IO regardless of how many mappings and IOs are needed to complete
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
*/
static
void
xfs_map_direct
(
struct
inode
*
inode
,
struct
buffer_head
*
bh_result
,
struct
xfs_bmbt_irec
*
imap
,
xfs_off_t
offset
)
{
struct
xfs_ioend
*
ioend
;
xfs_off_t
size
=
bh_result
->
b_size
;
int
type
;
if
(
ISUNWRITTEN
(
imap
))
type
=
XFS_IO_UNWRITTEN
;
else
type
=
XFS_IO_OVERWRITE
;
trace_xfs_gbmap_direct
(
XFS_I
(
inode
),
offset
,
size
,
type
,
imap
);
if
(
bh_result
->
b_private
)
{
ioend
=
bh_result
->
b_private
;
ASSERT
(
ioend
->
io_size
>
0
);
ASSERT
(
offset
>=
ioend
->
io_offset
);
if
(
offset
+
size
>
ioend
->
io_offset
+
ioend
->
io_size
)
ioend
->
io_size
=
offset
-
ioend
->
io_offset
+
size
;
if
(
type
==
XFS_IO_UNWRITTEN
&&
type
!=
ioend
->
io_type
)
ioend
->
io_type
=
XFS_IO_UNWRITTEN
;
trace_xfs_gbmap_direct_update
(
XFS_I
(
inode
),
ioend
->
io_offset
,
ioend
->
io_size
,
ioend
->
io_type
,
imap
);
}
else
if
(
type
==
XFS_IO_UNWRITTEN
||
offset
+
size
>
i_size_read
(
inode
))
{
ioend
=
xfs_alloc_ioend
(
inode
,
type
);
ioend
->
io_offset
=
offset
;
ioend
->
io_size
=
size
;
bh_result
->
b_private
=
ioend
;
set_buffer_defer_completion
(
bh_result
);
trace_xfs_gbmap_direct_new
(
XFS_I
(
inode
),
offset
,
size
,
type
,
imap
);
}
else
{
trace_xfs_gbmap_direct_none
(
XFS_I
(
inode
),
offset
,
size
,
type
,
imap
);
}
}
/*
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
* is, so that we can avoid repeated get_blocks calls.
*
* If the mapping spans EOF, then we have to break the mapping up as the mapping
* for blocks beyond EOF must be marked new so that sub block regions can be
* correctly zeroed. We can't do this for mappings within EOF unless the mapping
* was just allocated or is unwritten, otherwise the callers would overwrite
* existing data with zeros. Hence we have to split the mapping into a range up
* to and including EOF, and a second mapping for beyond EOF.
*/
static
void
xfs_map_trim_size
(
struct
inode
*
inode
,
sector_t
iblock
,
struct
buffer_head
*
bh_result
,
struct
xfs_bmbt_irec
*
imap
,
xfs_off_t
offset
,
ssize_t
size
)
{
xfs_off_t
mapping_size
;
mapping_size
=
imap
->
br_startoff
+
imap
->
br_blockcount
-
iblock
;
mapping_size
<<=
inode
->
i_blkbits
;
ASSERT
(
mapping_size
>
0
);
if
(
mapping_size
>
size
)
mapping_size
=
size
;
if
(
offset
<
i_size_read
(
inode
)
&&
offset
+
mapping_size
>=
i_size_read
(
inode
))
{
/* limit mapping to block that spans EOF */
mapping_size
=
roundup_64
(
i_size_read
(
inode
)
-
offset
,
1
<<
inode
->
i_blkbits
);
}
if
(
mapping_size
>
LONG_MAX
)
mapping_size
=
LONG_MAX
;
bh_result
->
b_size
=
mapping_size
;
}
STATIC
int
STATIC
int
__xfs_get_blocks
(
__xfs_get_blocks
(
struct
inode
*
inode
,
struct
inode
*
inode
,
...
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(
...
@@ -1321,31 +1432,37 @@ __xfs_get_blocks(
xfs_iunlock
(
ip
,
lockmode
);
xfs_iunlock
(
ip
,
lockmode
);
}
}
trace_xfs_get_blocks_alloc
(
ip
,
offset
,
size
,
trace_xfs_get_blocks_alloc
(
ip
,
offset
,
size
,
0
,
&
imap
);
ISUNWRITTEN
(
&
imap
)
?
XFS_IO_UNWRITTEN
:
XFS_IO_DELALLOC
,
&
imap
);
}
else
if
(
nimaps
)
{
}
else
if
(
nimaps
)
{
trace_xfs_get_blocks_found
(
ip
,
offset
,
size
,
0
,
&
imap
);
trace_xfs_get_blocks_found
(
ip
,
offset
,
size
,
ISUNWRITTEN
(
&
imap
)
?
XFS_IO_UNWRITTEN
:
XFS_IO_OVERWRITE
,
&
imap
);
xfs_iunlock
(
ip
,
lockmode
);
xfs_iunlock
(
ip
,
lockmode
);
}
else
{
}
else
{
trace_xfs_get_blocks_notfound
(
ip
,
offset
,
size
);
trace_xfs_get_blocks_notfound
(
ip
,
offset
,
size
);
goto
out_unlock
;
goto
out_unlock
;
}
}
/* trim mapping down to size requested */
if
(
direct
||
size
>
(
1
<<
inode
->
i_blkbits
))
xfs_map_trim_size
(
inode
,
iblock
,
bh_result
,
&
imap
,
offset
,
size
);
/*
* For unwritten extents do not report a disk address in the buffered
* read case (treat as if we're reading into a hole).
*/
if
(
imap
.
br_startblock
!=
HOLESTARTBLOCK
&&
if
(
imap
.
br_startblock
!=
HOLESTARTBLOCK
&&
imap
.
br_startblock
!=
DELAYSTARTBLOCK
)
{
imap
.
br_startblock
!=
DELAYSTARTBLOCK
&&
/*
(
create
||
!
ISUNWRITTEN
(
&
imap
)))
{
* For unwritten extents do not report a disk address on
xfs_map_buffer
(
inode
,
bh_result
,
&
imap
,
offset
);
* the read case (treat as if we're reading into a hole).
if
(
ISUNWRITTEN
(
&
imap
))
*/
if
(
create
||
!
ISUNWRITTEN
(
&
imap
))
xfs_map_buffer
(
inode
,
bh_result
,
&
imap
,
offset
);
if
(
create
&&
ISUNWRITTEN
(
&
imap
))
{
if
(
direct
)
{
bh_result
->
b_private
=
inode
;
set_buffer_defer_completion
(
bh_result
);
}
set_buffer_unwritten
(
bh_result
);
set_buffer_unwritten
(
bh_result
);
}
/* direct IO needs special help */
if
(
create
&&
direct
)
xfs_map_direct
(
inode
,
bh_result
,
&
imap
,
offset
);
}
}
/*
/*
...
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
...
@@ -1378,39 +1495,6 @@ __xfs_get_blocks(
}
}
}
}
/*
* If this is O_DIRECT or the mpage code calling tell them how large
* the mapping is, so that we can avoid repeated get_blocks calls.
*
* If the mapping spans EOF, then we have to break the mapping up as the
* mapping for blocks beyond EOF must be marked new so that sub block
* regions can be correctly zeroed. We can't do this for mappings within
* EOF unless the mapping was just allocated or is unwritten, otherwise
* the callers would overwrite existing data with zeros. Hence we have
* to split the mapping into a range up to and including EOF, and a
* second mapping for beyond EOF.
*/
if
(
direct
||
size
>
(
1
<<
inode
->
i_blkbits
))
{
xfs_off_t
mapping_size
;
mapping_size
=
imap
.
br_startoff
+
imap
.
br_blockcount
-
iblock
;
mapping_size
<<=
inode
->
i_blkbits
;
ASSERT
(
mapping_size
>
0
);
if
(
mapping_size
>
size
)
mapping_size
=
size
;
if
(
offset
<
i_size_read
(
inode
)
&&
offset
+
mapping_size
>=
i_size_read
(
inode
))
{
/* limit mapping to block that spans EOF */
mapping_size
=
roundup_64
(
i_size_read
(
inode
)
-
offset
,
1
<<
inode
->
i_blkbits
);
}
if
(
mapping_size
>
LONG_MAX
)
mapping_size
=
LONG_MAX
;
bh_result
->
b_size
=
mapping_size
;
}
return
0
;
return
0
;
out_unlock:
out_unlock:
...
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
...
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
/*
/*
* Complete a direct I/O write request.
* Complete a direct I/O write request.
*
*
* If the private argument is non-NULL __xfs_get_blocks signals us that we
* The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
* need to issue a transaction to convert the range from unwritten to written
* If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
* extents.
* wholly within the EOF and so there is nothing for us to do. Note that in this
* case the completion can be called in interrupt context, whereas if we have an
* ioend we will always be called in task context (i.e. from a workqueue).
*/
*/
STATIC
void
STATIC
void
xfs_end_io_direct_write
(
xfs_end_io_direct_write
(
...
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
...
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
struct
inode
*
inode
=
file_inode
(
iocb
->
ki_filp
);
struct
inode
*
inode
=
file_inode
(
iocb
->
ki_filp
);
struct
xfs_inode
*
ip
=
XFS_I
(
inode
);
struct
xfs_inode
*
ip
=
XFS_I
(
inode
);
struct
xfs_mount
*
mp
=
ip
->
i_mount
;
struct
xfs_mount
*
mp
=
ip
->
i_mount
;
struct
xfs_ioend
*
ioend
=
private
;
if
(
XFS_FORCED_SHUTDOWN
(
mp
))
trace_xfs_gbmap_direct_endio
(
ip
,
offset
,
size
,
ioend
?
ioend
->
io_type
:
0
,
NULL
);
if
(
!
ioend
)
{
ASSERT
(
offset
+
size
<=
i_size_read
(
inode
));
return
;
return
;
}
if
(
XFS_FORCED_SHUTDOWN
(
mp
))
goto
out_end_io
;
/*
/*
* While the generic direct I/O code updates the inode size, it does
* dio completion end_io functions are only called on writes if more
* so only after the end_io handler is called, which means our
* than 0 bytes was written.
* end_io handler thinks the on-disk size is outside the in-core
* size. To prevent this just update it a little bit earlier here.
*/
*/
ASSERT
(
size
>
0
);
/*
* The ioend only maps whole blocks, while the IO may be sector aligned.
* Hence the ioend offset/size may not match the IO offset/size exactly.
* Because we don't map overwrites within EOF into the ioend, the offset
* may not match, but only if the endio spans EOF. Either way, write
* the IO sizes into the ioend so that completion processing does the
* right thing.
*/
ASSERT
(
offset
+
size
<=
ioend
->
io_offset
+
ioend
->
io_size
);
ioend
->
io_size
=
size
;
ioend
->
io_offset
=
offset
;
/*
* The ioend tells us whether we are doing unwritten extent conversion
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
* if necessary.
*
* We need to lock the test/set EOF update as we can be racing with
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
spin_lock
(
&
ip
->
i_flags_lock
);
if
(
offset
+
size
>
i_size_read
(
inode
))
if
(
offset
+
size
>
i_size_read
(
inode
))
i_size_write
(
inode
,
offset
+
size
);
i_size_write
(
inode
,
offset
+
size
);
spin_unlock
(
&
ip
->
i_flags_lock
);
/*
/*
* For direct I/O we do not know if we need to allocate blocks or not,
* If we are doing an append IO that needs to update the EOF on disk,
* so we can't preallocate an append transaction, as that results in
* do the transaction reserve now so we can use common end io
* nested reservations and log space deadlocks. Hence allocate the
* processing. Stashing the error (if there is one) in the ioend will
* transaction here. While this is sub-optimal and can block IO
* result in the ioend processing passing on the error if it is
* completion for some time, we're stuck with doing it this way until
* possible as we can't return it from here.
* we can pass the ioend to the direct IO allocation callbacks and
* avoid nesting that way.
*/
*/
if
(
private
&&
size
>
0
)
{
if
(
ioend
->
io_type
==
XFS_IO_OVERWRITE
)
xfs_iomap_write_unwritten
(
ip
,
offset
,
size
);
ioend
->
io_error
=
xfs_setfilesize_trans_alloc
(
ioend
);
}
else
if
(
offset
+
size
>
ip
->
i_d
.
di_size
)
{
struct
xfs_trans
*
tp
;
int
error
;
tp
=
xfs_trans_alloc
(
mp
,
XFS_TRANS_FSYNC_TS
);
error
=
xfs_trans_reserve
(
tp
,
&
M_RES
(
mp
)
->
tr_fsyncts
,
0
,
0
);
if
(
error
)
{
xfs_trans_cancel
(
tp
,
0
);
return
;
}
xfs_setfilesize
(
ip
,
tp
,
offset
,
size
);
out_end_io:
}
xfs_end_io
(
&
ioend
->
io_work
);
return
;
}
}
STATIC
ssize_t
STATIC
ssize_t
...
...
fs/xfs/xfs_file.c
View file @
542c3118
...
@@ -569,20 +569,41 @@ xfs_file_aio_write_checks(
...
@@ -569,20 +569,41 @@ xfs_file_aio_write_checks(
* write. If zeroing is needed and we are currently holding the
* write. If zeroing is needed and we are currently holding the
* iolock shared, we need to update it to exclusive which implies
* iolock shared, we need to update it to exclusive which implies
* having to redo all checks before.
* having to redo all checks before.
*
* We need to serialise against EOF updates that occur in IO
* completions here. We want to make sure that nobody is changing the
* size while we do this check until we have placed an IO barrier (i.e.
* hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
* The spinlock effectively forms a memory barrier once we have the
* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
* and hence be able to correctly determine if we need to run zeroing.
*/
*/
spin_lock
(
&
ip
->
i_flags_lock
);
if
(
*
pos
>
i_size_read
(
inode
))
{
if
(
*
pos
>
i_size_read
(
inode
))
{
bool
zero
=
false
;
bool
zero
=
false
;
spin_unlock
(
&
ip
->
i_flags_lock
);
if
(
*
iolock
==
XFS_IOLOCK_SHARED
)
{
if
(
*
iolock
==
XFS_IOLOCK_SHARED
)
{
xfs_rw_iunlock
(
ip
,
*
iolock
);
xfs_rw_iunlock
(
ip
,
*
iolock
);
*
iolock
=
XFS_IOLOCK_EXCL
;
*
iolock
=
XFS_IOLOCK_EXCL
;
xfs_rw_ilock
(
ip
,
*
iolock
);
xfs_rw_ilock
(
ip
,
*
iolock
);
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
* we now need to wait for all of them to drain. Non-AIO
* DIO will have drained before we are given the
* XFS_IOLOCK_EXCL, and so for most cases this wait is a
* no-op.
*/
inode_dio_wait
(
inode
);
goto
restart
;
goto
restart
;
}
}
error
=
xfs_zero_eof
(
ip
,
*
pos
,
i_size_read
(
inode
),
&
zero
);
error
=
xfs_zero_eof
(
ip
,
*
pos
,
i_size_read
(
inode
),
&
zero
);
if
(
error
)
if
(
error
)
return
error
;
return
error
;
}
}
else
spin_unlock
(
&
ip
->
i_flags_lock
);
/*
/*
* Updating the timestamps will grab the ilock again from
* Updating the timestamps will grab the ilock again from
...
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
...
@@ -644,6 +665,8 @@ xfs_file_dio_aio_write(
int
iolock
;
int
iolock
;
size_t
count
=
iov_iter_count
(
from
);
size_t
count
=
iov_iter_count
(
from
);
loff_t
pos
=
iocb
->
ki_pos
;
loff_t
pos
=
iocb
->
ki_pos
;
loff_t
end
;
struct
iov_iter
data
;
struct
xfs_buftarg
*
target
=
XFS_IS_REALTIME_INODE
(
ip
)
?
struct
xfs_buftarg
*
target
=
XFS_IS_REALTIME_INODE
(
ip
)
?
mp
->
m_rtdev_targp
:
mp
->
m_ddev_targp
;
mp
->
m_rtdev_targp
:
mp
->
m_ddev_targp
;
...
@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
...
@@ -683,10 +706,11 @@ xfs_file_dio_aio_write(
if
(
ret
)
if
(
ret
)
goto
out
;
goto
out
;
iov_iter_truncate
(
from
,
count
);
iov_iter_truncate
(
from
,
count
);
end
=
pos
+
count
-
1
;
if
(
mapping
->
nrpages
)
{
if
(
mapping
->
nrpages
)
{
ret
=
filemap_write_and_wait_range
(
VFS_I
(
ip
)
->
i_mapping
,
ret
=
filemap_write_and_wait_range
(
VFS_I
(
ip
)
->
i_mapping
,
pos
,
pos
+
count
-
1
);
pos
,
end
);
if
(
ret
)
if
(
ret
)
goto
out
;
goto
out
;
/*
/*
...
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
...
@@ -696,7 +720,7 @@ xfs_file_dio_aio_write(
*/
*/
ret
=
invalidate_inode_pages2_range
(
VFS_I
(
ip
)
->
i_mapping
,
ret
=
invalidate_inode_pages2_range
(
VFS_I
(
ip
)
->
i_mapping
,
pos
>>
PAGE_CACHE_SHIFT
,
pos
>>
PAGE_CACHE_SHIFT
,
(
pos
+
count
-
1
)
>>
PAGE_CACHE_SHIFT
);
end
>>
PAGE_CACHE_SHIFT
);
WARN_ON_ONCE
(
ret
);
WARN_ON_ONCE
(
ret
);
ret
=
0
;
ret
=
0
;
}
}
...
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
...
@@ -713,8 +737,22 @@ xfs_file_dio_aio_write(
}
}
trace_xfs_file_direct_write
(
ip
,
count
,
iocb
->
ki_pos
,
0
);
trace_xfs_file_direct_write
(
ip
,
count
,
iocb
->
ki_pos
,
0
);
ret
=
generic_file_direct_write
(
iocb
,
from
,
pos
);
data
=
*
from
;
ret
=
mapping
->
a_ops
->
direct_IO
(
WRITE
,
iocb
,
&
data
,
pos
);
/* see generic_file_direct_write() for why this is necessary */
if
(
mapping
->
nrpages
)
{
invalidate_inode_pages2_range
(
mapping
,
pos
>>
PAGE_CACHE_SHIFT
,
end
>>
PAGE_CACHE_SHIFT
);
}
if
(
ret
>
0
)
{
pos
+=
ret
;
iov_iter_advance
(
from
,
ret
);
iocb
->
ki_pos
=
pos
;
}
out:
out:
xfs_rw_iunlock
(
ip
,
iolock
);
xfs_rw_iunlock
(
ip
,
iolock
);
...
...
fs/xfs/xfs_trace.h
View file @
542c3118
...
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
...
@@ -1221,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT
(
xfs_map_blocks_alloc
);
DEFINE_IOMAP_EVENT
(
xfs_map_blocks_alloc
);
DEFINE_IOMAP_EVENT
(
xfs_get_blocks_found
);
DEFINE_IOMAP_EVENT
(
xfs_get_blocks_found
);
DEFINE_IOMAP_EVENT
(
xfs_get_blocks_alloc
);
DEFINE_IOMAP_EVENT
(
xfs_get_blocks_alloc
);
DEFINE_IOMAP_EVENT
(
xfs_gbmap_direct
);
DEFINE_IOMAP_EVENT
(
xfs_gbmap_direct_new
);
DEFINE_IOMAP_EVENT
(
xfs_gbmap_direct_update
);
DEFINE_IOMAP_EVENT
(
xfs_gbmap_direct_none
);
DEFINE_IOMAP_EVENT
(
xfs_gbmap_direct_endio
);
DECLARE_EVENT_CLASS
(
xfs_simple_io_class
,
DECLARE_EVENT_CLASS
(
xfs_simple_io_class
,
TP_PROTO
(
struct
xfs_inode
*
ip
,
xfs_off_t
offset
,
ssize_t
count
),
TP_PROTO
(
struct
xfs_inode
*
ip
,
xfs_off_t
offset
,
ssize_t
count
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment