Commit ee13fc67, authored by Darrick J. Wong, committed by Chandan Babu R

xfs: convert xfarray_pagesort to deal with large folios

Convert xfarray_pagesort to handle large folios by introducing a new
xfile_get_folio routine that can return a folio of arbitrary size, and
using heapsort on the full folio.  This also corrects an off-by-one bug
in the calculation of len in xfarray_pagesort that was papered over by
xfarray_want_pagesort.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
parent b2fdfe19
...@@ -956,7 +956,7 @@ TRACE_EVENT(xfarray_isort, ...@@ -956,7 +956,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo) __entry->hi - __entry->lo)
); );
TRACE_EVENT(xfarray_pagesort, TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi), TP_ARGS(si, lo, hi),
TP_STRUCT__entry( TP_STRUCT__entry(
...@@ -1027,6 +1027,47 @@ TRACE_EVENT(xfarray_sort, ...@@ -1027,6 +1027,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes) __entry->bytes)
); );
/*
 * Trace a sort scan of array element @idx.
 *
 * Records the inode number of the xfile backing the array, the array's
 * record count and record size, and the index being scanned.  If the
 * sortinfo currently has a folio cached (si->folio is set), the folio's
 * position and size are recorded along with the first/last folio index
 * values held in the sortinfo; otherwise all four folio fields are zero.
 */
TRACE_EVENT(xfarray_sort_scan,
	TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
	TP_ARGS(si, idx),
	TP_STRUCT__entry(
		__field(unsigned long, ino)
		__field(unsigned long long, nr)
		__field(size_t, obj_size)
		__field(unsigned long long, idx)
		__field(unsigned long long, folio_pos)
		__field(unsigned long, folio_bytes)
		__field(unsigned long long, first_idx)
		__field(unsigned long long, last_idx)
	),
	TP_fast_assign(
		/* Identity of the array being sorted and the element scanned. */
		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
		__entry->nr = si->array->nr;
		__entry->obj_size = si->array->obj_size;
		__entry->idx = idx;

		/* Folio details are only meaningful while a folio is cached. */
		if (!si->folio) {
			__entry->folio_pos = 0;
			__entry->folio_bytes = 0;
			__entry->first_idx = 0;
			__entry->last_idx = 0;
		} else {
			__entry->folio_pos = folio_pos(si->folio);
			__entry->folio_bytes = folio_size(si->folio);
			__entry->first_idx = si->first_folio_idx;
			__entry->last_idx = si->last_folio_idx;
		}
	),
	TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
		  __entry->ino,
		  __entry->nr,
		  __entry->obj_size,
		  __entry->idx,
		  __entry->folio_pos,
		  __entry->folio_bytes,
		  __entry->first_idx,
		  __entry->last_idx)
);
TRACE_EVENT(xfarray_sort_stats, TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error), TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error), TP_ARGS(si, error),
......
...@@ -563,70 +563,42 @@ xfarray_isort( ...@@ -563,70 +563,42 @@ xfarray_isort(
return xfile_store(si->array->xfile, scratch, len, lo_pos); return xfile_store(si->array->xfile, scratch, len, lo_pos);
} }
/* Grab a page for sorting records. */ /*
static inline int * Sort the records from lo to hi (inclusive) if they are all backed by the
xfarray_sort_get_page( * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
struct xfarray_sortinfo *si, * errno.
loff_t pos, */
uint64_t len)
{
return xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
}
/* Release a page we grabbed for sorting records. */
static inline int
xfarray_sort_put_page(
struct xfarray_sortinfo *si)
{
if (!xfile_page_cached(&si->xfpage))
return 0;
return xfile_put_page(si->array->xfile, &si->xfpage);
}
/* Decide if these records are eligible for in-page sorting. */
static inline bool
xfarray_want_pagesort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
pgoff_t lo_page;
pgoff_t hi_page;
loff_t end_pos;
/* We can only map one page at a time. */
lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
hi_page = end_pos >> PAGE_SHIFT;
return lo_page == hi_page;
}
/* Sort a bunch of records that all live in the same memory page. */
STATIC int STATIC int
xfarray_pagesort( xfarray_foliosort(
struct xfarray_sortinfo *si, struct xfarray_sortinfo *si,
xfarray_idx_t lo, xfarray_idx_t lo,
xfarray_idx_t hi) xfarray_idx_t hi)
{ {
struct folio *folio;
void *startp; void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo); loff_t lo_pos = xfarray_pos(si->array, lo);
uint64_t len = xfarray_pos(si->array, hi - lo); uint64_t len = xfarray_pos(si->array, hi - lo + 1);
int error = 0;
trace_xfarray_pagesort(si, lo, hi); /* No single folio could back this many records. */
if (len > XFILE_MAX_FOLIO_SIZE)
return 0;
xfarray_sort_bump_loads(si); xfarray_sort_bump_loads(si);
error = xfarray_sort_get_page(si, lo_pos, len); folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
if (error) if (IS_ERR(folio))
return error; return PTR_ERR(folio);
if (!folio)
return 0;
trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si); xfarray_sort_bump_heapsorts(si);
startp = page_address(si->xfpage.page) + offset_in_page(lo_pos); startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si); xfarray_sort_bump_stores(si);
return xfarray_sort_put_page(si); xfile_put_folio(si->array->xfile, folio);
return 1;
} }
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ /* Return a pointer to the xfarray pivot record within the sortinfo struct. */
...@@ -814,63 +786,78 @@ xfarray_qsort_push( ...@@ -814,63 +786,78 @@ xfarray_qsort_push(
return 0; return 0;
} }
/*
 * Finish a scan: if the sortinfo has a folio cached, hand it back to the
 * xfile and clear the cached pointer.  Does nothing when no folio is cached.
 */
static inline void
xfarray_sort_scan_done(
	struct xfarray_sortinfo	*si)
{
	/* Nothing cached; si->folio is already NULL. */
	if (!si->folio)
		return;

	xfile_put_folio(si->array->xfile, si->folio);
	si->folio = NULL;
}
/* /*
* Load an element from the array into the first scratchpad and cache the page, * Cache the folio backing the start of the given array element. If the array
* if possible. * element is contained entirely within the folio, return a pointer to the
* cached folio. Otherwise, load the element into the scratchpad and return a
* pointer to the scratchpad.
*/ */
static inline int static inline int
xfarray_sort_load_cached( xfarray_sort_scan(
struct xfarray_sortinfo *si, struct xfarray_sortinfo *si,
xfarray_idx_t idx, xfarray_idx_t idx,
void *ptr) void **ptrp)
{ {
loff_t idx_pos = xfarray_pos(si->array, idx); loff_t idx_pos = xfarray_pos(si->array, idx);
pgoff_t startpage;
pgoff_t endpage;
int error = 0; int error = 0;
/* if (xfarray_sort_terminated(si, &error))
* If this load would split a page, release the cached page, if any, return error;
* and perform a traditional read.
*/
startpage = idx_pos >> PAGE_SHIFT;
endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
if (startpage != endpage) {
error = xfarray_sort_put_page(si);
if (error)
return error;
if (xfarray_sort_terminated(si, &error)) trace_xfarray_sort_scan(si, idx);
return error;
return xfile_load(si->array->xfile, ptr, /* If the cached folio doesn't cover this index, release it. */
si->array->obj_size, idx_pos); if (si->folio &&
} (idx < si->first_folio_idx || idx > si->last_folio_idx))
xfarray_sort_scan_done(si);
/* If the cached page is not the one we want, release it. */ /* Grab the first folio that backs this array element. */
if (xfile_page_cached(&si->xfpage) && if (!si->folio) {
xfile_page_index(&si->xfpage) != startpage) { loff_t next_pos;
error = xfarray_sort_put_page(si);
if (error) si->folio = xfile_get_folio(si->array->xfile, idx_pos,
return error; si->array->obj_size, XFILE_ALLOC);
if (IS_ERR(si->folio))
return PTR_ERR(si->folio);
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
next_pos = folio_pos(si->folio) + folio_size(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
trace_xfarray_sort_scan(si, idx);
} }
/* /*
* If we don't have a cached page (and we know the load is contained * If this folio still doesn't cover the desired element, it must cross
* in a single page) then grab it. * a folio boundary. Read into the scratchpad and we're done.
*/ */
if (!xfile_page_cached(&si->xfpage)) { if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
if (xfarray_sort_terminated(si, &error)) void *temp = xfarray_scratch(si->array);
return error;
error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, error = xfile_load(si->array->xfile, temp, si->array->obj_size,
PAGE_SIZE); idx_pos);
if (error) if (error)
return error; return error;
*ptrp = temp;
return 0;
} }
memcpy(ptr, page_address(si->xfpage.page) + offset_in_page(idx_pos), /* Otherwise return a pointer to the array element in the folio. */
si->array->obj_size); *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0; return 0;
} }
...@@ -937,6 +924,8 @@ xfarray_sort( ...@@ -937,6 +924,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si); pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) { while (si->stack_depth >= 0) {
int ret;
lo = si_lo[si->stack_depth]; lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth]; hi = si_hi[si->stack_depth];
...@@ -949,13 +938,13 @@ xfarray_sort( ...@@ -949,13 +938,13 @@ xfarray_sort(
} }
/* /*
* If directly mapping the page and sorting can solve our * If directly mapping the folio and sorting can solve our
* problems, we're done. * problems, we're done.
*/ */
if (xfarray_want_pagesort(si, lo, hi)) { ret = xfarray_foliosort(si, lo, hi);
error = xfarray_pagesort(si, lo, hi); if (ret < 0)
if (error) goto out_free;
goto out_free; if (ret == 1) {
si->stack_depth--; si->stack_depth--;
continue; continue;
} }
...@@ -980,25 +969,24 @@ xfarray_sort( ...@@ -980,25 +969,24 @@ xfarray_sort(
* than the pivot is on the right side of the range. * than the pivot is on the right side of the range.
*/ */
while (lo < hi) { while (lo < hi) {
void *p;
/* /*
* Decrement hi until it finds an a[hi] less than the * Decrement hi until it finds an a[hi] less than the
* pivot value. * pivot value.
*/ */
error = xfarray_sort_load_cached(si, hi, scratch); error = xfarray_sort_scan(si, hi, &p);
if (error) if (error)
goto out_free; goto out_free;
while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
lo < hi) {
hi--; hi--;
error = xfarray_sort_load_cached(si, hi, error = xfarray_sort_scan(si, hi, &p);
scratch);
if (error) if (error)
goto out_free; goto out_free;
} }
error = xfarray_sort_put_page(si); if (p != scratch)
if (error) memcpy(scratch, p, si->array->obj_size);
goto out_free; xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error)) if (xfarray_sort_terminated(si, &error))
goto out_free; goto out_free;
...@@ -1013,21 +1001,18 @@ xfarray_sort( ...@@ -1013,21 +1001,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than * Increment lo until it finds an a[lo] greater than
* the pivot value. * the pivot value.
*/ */
error = xfarray_sort_load_cached(si, lo, scratch); error = xfarray_sort_scan(si, lo, &p);
if (error) if (error)
goto out_free; goto out_free;
while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo < hi) {
lo++; lo++;
error = xfarray_sort_load_cached(si, lo, error = xfarray_sort_scan(si, lo, &p);
scratch);
if (error) if (error)
goto out_free; goto out_free;
} }
error = xfarray_sort_put_page(si); if (p != scratch)
if (error) memcpy(scratch, p, si->array->obj_size);
goto out_free; xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error)) if (xfarray_sort_terminated(si, &error))
goto out_free; goto out_free;
......
...@@ -105,8 +105,14 @@ struct xfarray_sortinfo { ...@@ -105,8 +105,14 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */ /* XFARRAY_SORT_* flags; see below. */
unsigned int flags; unsigned int flags;
/* Cache a page here for faster access. */ /* Cache a folio here for faster scanning for pivots */
struct xfile_page xfpage; struct folio *folio;
/* First array index in folio that is completely readable */
xfarray_idx_t first_folio_idx;
/* Last array index in folio that is completely readable */
xfarray_idx_t last_folio_idx;
#ifdef DEBUG #ifdef DEBUG
/* Performance statistics. */ /* Performance statistics. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment