Commit 9b247179 authored by Darrick J. Wong's avatar Darrick J. Wong

xfs: cache unlinked pointers in an rhashtable

Use a rhashtable to cache the unlinked list incore.  This should speed
up unlinked processing considerably when there are a lot of inodes on
the unlinked list because iunlink_remove no longer has to traverse an
entire bucket list to find which inode points to the one being removed.

The incore list structure records "X.next_unlinked = Y" relations, with
the rhashtable using Y to index the records.  This makes finding the
inode X that points to a inode Y very quick.  If our cache fails to find
anything we can always fall back on the old method.

FWIW this drastically reduces the amount of time it takes to remove
inodes from the unlinked list.  I wrote a program to open a lot of
O_TMPFILE files and then close them in the same order, which takes
a very long time if we have to traverse the unlinked lists.  With the
ptach, I see:

+ /d/t/tmpfile/tmpfile
Opened 193531 files in 6.33s.
Closed 193531 files in 5.86s

real    0m12.192s
user    0m0.064s
sys     0m11.619s
+ cd /
+ umount /mnt

real    0m0.050s
user    0m0.004s
sys     0m0.030s

And without the patch:

+ /d/t/tmpfile/tmpfile
Opened 193588 files in 6.35s.
Closed 193588 files in 751.61s

real    12m38.853s
user    0m0.084s
sys     12m34.470s
+ cd /
+ umount /mnt

real    0m0.086s
user    0m0.000s
sys     0m0.060s
Signed-off-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: default avatarBrian Foster <bfoster@redhat.com>
parent 4664c66c
......@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
#define XFS_ERRTAG_MAX 34
#define XFS_ERRTAG_IUNLINK_FALLBACK 34
#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
......@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
......@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
......@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
......@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
......
......@@ -1906,6 +1906,214 @@ xfs_inactive(
xfs_qm_dqdetach(ip);
}
/*
* In-Core Unlinked List Lookups
* =============================
*
* Every inode is supposed to be reachable from some other piece of metadata
* with the exception of the root directory. Inodes with a connection to a
* file descriptor but not linked from anywhere in the on-disk directory tree
* are collectively known as unlinked inodes, though the filesystem itself
* maintains links to these inodes so that on-disk metadata are consistent.
*
* XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
* header contains a number of buckets that point to an inode, and each inode
* record has a pointer to the next inode in the hash chain. This
* singly-linked list causes scaling problems in the iunlink remove function
* because we must walk that list to find the inode that points to the inode
* being removed from the unlinked hash bucket list.
*
* What if we modelled the unlinked list as a collection of records capturing
* "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
* have a fast way to look up unlinked list predecessors, which avoids the
* slow list walk. That's exactly what we do here (in-core) with a per-AG
* rhashtable.
*
* Because this is a backref cache, we ignore operational failures since the
* iunlink code can fall back to the slow bucket walk. The only errors that
* should bubble out are for obviously incorrect situations.
*
* All users of the backref cache MUST hold the AGI buffer lock to serialize
* access or have otherwise provided for concurrency control.
*/
/* Capture a "X.next_unlinked = Y" relationship. */
struct xfs_iunlink {
struct rhash_head iu_rhash_head;
xfs_agino_t iu_agino; /* X */
xfs_agino_t iu_next_unlinked; /* Y */
};
/* Unlinked list predecessor lookup hashtable construction */
static int
xfs_iunlink_obj_cmpfn(
struct rhashtable_compare_arg *arg,
const void *obj)
{
const xfs_agino_t *key = arg->key;
const struct xfs_iunlink *iu = obj;
if (iu->iu_next_unlinked != *key)
return 1;
return 0;
}
static const struct rhashtable_params xfs_iunlink_hash_params = {
.min_size = XFS_AGI_UNLINKED_BUCKETS,
.key_len = sizeof(xfs_agino_t),
.key_offset = offsetof(struct xfs_iunlink,
iu_next_unlinked),
.head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
.automatic_shrinking = true,
.obj_cmpfn = xfs_iunlink_obj_cmpfn,
};
/*
* Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
* relation is found.
*/
static xfs_agino_t
xfs_iunlink_lookup_backref(
struct xfs_perag *pag,
xfs_agino_t agino)
{
struct xfs_iunlink *iu;
iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
xfs_iunlink_hash_params);
return iu ? iu->iu_agino : NULLAGINO;
}
/*
* Take ownership of an iunlink cache entry and insert it into the hash table.
* If successful, the entry will be owned by the cache; if not, it is freed.
* Either way, the caller does not own @iu after this call.
*/
static int
xfs_iunlink_insert_backref(
struct xfs_perag *pag,
struct xfs_iunlink *iu)
{
int error;
error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
&iu->iu_rhash_head, xfs_iunlink_hash_params);
/*
* Fail loudly if there already was an entry because that's a sign of
* corruption of in-memory data. Also fail loudly if we see an error
* code we didn't anticipate from the rhashtable code. Currently we
* only anticipate ENOMEM.
*/
if (error) {
WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
kmem_free(iu);
}
/*
* Absorb any runtime errors that aren't a result of corruption because
* this is a cache and we can always fall back to bucket list scanning.
*/
if (error != 0 && error != -EEXIST)
error = 0;
return error;
}
/* Remember that @prev_agino.next_unlinked = @this_agino. */
static int
xfs_iunlink_add_backref(
struct xfs_perag *pag,
xfs_agino_t prev_agino,
xfs_agino_t this_agino)
{
struct xfs_iunlink *iu;
if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
return 0;
iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
iu->iu_agino = prev_agino;
iu->iu_next_unlinked = this_agino;
return xfs_iunlink_insert_backref(pag, iu);
}
/*
* Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
* If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
* wasn't any such entry then we don't bother.
*/
static int
xfs_iunlink_change_backref(
struct xfs_perag *pag,
xfs_agino_t agino,
xfs_agino_t next_unlinked)
{
struct xfs_iunlink *iu;
int error;
/* Look up the old entry; if there wasn't one then exit. */
iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
xfs_iunlink_hash_params);
if (!iu)
return 0;
/*
* Remove the entry. This shouldn't ever return an error, but if we
* couldn't remove the old entry we don't want to add it again to the
* hash table, and if the entry disappeared on us then someone's
* violated the locking rules and we need to fail loudly. Either way
* we cannot remove the inode because internal state is or would have
* been corrupt.
*/
error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
&iu->iu_rhash_head, xfs_iunlink_hash_params);
if (error)
return error;
/* If there is no new next entry just free our item and return. */
if (next_unlinked == NULLAGINO) {
kmem_free(iu);
return 0;
}
/* Update the entry and re-add it to the hash table. */
iu->iu_next_unlinked = next_unlinked;
return xfs_iunlink_insert_backref(pag, iu);
}
/* Set up the in-core predecessor structures. */
int
xfs_iunlink_init(
struct xfs_perag *pag)
{
return rhashtable_init(&pag->pagi_unlinked_hash,
&xfs_iunlink_hash_params);
}
/* Free the in-core predecessor structures. */
static void
xfs_iunlink_free_item(
void *ptr,
void *arg)
{
struct xfs_iunlink *iu = ptr;
bool *freed_anything = arg;
*freed_anything = true;
kmem_free(iu);
}
void
xfs_iunlink_destroy(
struct xfs_perag *pag)
{
bool freed_anything = false;
rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
xfs_iunlink_free_item, &freed_anything);
ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
}
/*
* Point the AGI unlinked bucket at an inode and log the results. The caller
* is responsible for validating the old value.
......@@ -2066,7 +2274,8 @@ xfs_iunlink(
return -EFSCORRUPTED;
if (next_agino != NULLAGINO) {
xfs_agino_t old_agino;
struct xfs_perag *pag;
xfs_agino_t old_agino;
/*
* There is already another inode in the bucket, so point this
......@@ -2077,6 +2286,16 @@ xfs_iunlink(
if (error)
return error;
ASSERT(old_agino == NULLAGINO);
/*
* agino has been unlinked, add a backref from the next inode
* back to agino.
*/
pag = xfs_perag_get(mp, agno);
error = xfs_iunlink_add_backref(pag, agino, next_agino);
xfs_perag_put(pag);
if (error)
return error;
}
/* Point the head of the list to point to this inode. */
......@@ -2133,7 +2352,8 @@ xfs_iunlink_map_prev(
xfs_agino_t *agino,
struct xfs_imap *imap,
struct xfs_dinode **dipp,
struct xfs_buf **bpp)
struct xfs_buf **bpp,
struct xfs_perag *pag)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_agino_t next_agino;
......@@ -2142,6 +2362,28 @@ xfs_iunlink_map_prev(
ASSERT(head_agino != target_agino);
*bpp = NULL;
/* See if our backref cache can find it faster. */
*agino = xfs_iunlink_lookup_backref(pag, target_agino);
if (*agino != NULLAGINO) {
error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
if (error)
return error;
if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
return 0;
/*
* If we get here the cache contents were corrupt, so drop the
* buffer and fall back to walking the bucket list.
*/
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
WARN_ON_ONCE(1);
}
trace_xfs_iunlink_map_prev_fallback(mp, agno);
/* Otherwise, walk the entire bucket until we find it. */
next_agino = head_agino;
while (next_agino != target_agino) {
xfs_agino_t unlinked_agino;
......@@ -2187,6 +2429,7 @@ xfs_iunlink_remove(
struct xfs_buf *agibp;
struct xfs_buf *last_ibp;
struct xfs_dinode *last_dip = NULL;
struct xfs_perag *pag = NULL;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino;
......@@ -2222,27 +2465,62 @@ xfs_iunlink_remove(
if (error)
return error;
/*
* If there was a backref pointing from the next inode back to this
* one, remove it because we've removed this inode from the list.
*
* Later, if this inode was in the middle of the list we'll update
* this inode's backref to point from the next inode.
*/
if (next_agino != NULLAGINO) {
pag = xfs_perag_get(mp, agno);
error = xfs_iunlink_change_backref(pag, next_agino,
NULLAGINO);
if (error)
goto out;
}
if (head_agino == agino) {
/* Point the head of the list to the next unlinked inode. */
error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
next_agino);
if (error)
return error;
goto out;
} else {
struct xfs_imap imap;
xfs_agino_t prev_agino;
if (!pag)
pag = xfs_perag_get(mp, agno);
/* We need to search the list for the inode being freed. */
error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
&prev_agino, &imap, &last_dip, &last_ibp);
&prev_agino, &imap, &last_dip, &last_ibp,
pag);
if (error)
return error;
goto out;
/* Point the previous inode on the list to the next inode. */
xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
last_dip, &imap, next_agino);
/*
* Now we deal with the backref for this inode. If this inode
* pointed at a real inode, change the backref that pointed to
* us to point to our old next. If this inode was the end of
* the list, delete the backref that pointed to us. Note that
* change_backref takes care of deleting the backref if
* next_agino is NULLAGINO.
*/
error = xfs_iunlink_change_backref(pag, agino, next_agino);
if (error)
goto out;
}
return 0;
out:
if (pag)
xfs_perag_put(pag);
return error;
}
/*
......
......@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
int xfs_iunlink_init(struct xfs_perag *pag);
void xfs_iunlink_destroy(struct xfs_perag *pag);
#endif /* __XFS_INODE_H__ */
......@@ -149,6 +149,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
......@@ -227,6 +228,9 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
error = xfs_iunlink_init(pag);
if (error)
goto out_hash_destroy;
}
index = xfs_set_inode_alloc(mp, agcount);
......@@ -249,6 +253,7 @@ xfs_initialize_perag(
if (!pag)
break;
xfs_buf_hash_destroy(pag);
xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
......
......@@ -396,6 +396,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
/*
* Unlinked inode information. This incore information reflects
* data stored in the AGI, so callers must hold the AGI buffer lock
* or have some other means to control concurrency.
*/
struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
......
......@@ -3447,6 +3447,7 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
TP_ARGS(ip))
DEFINE_AGINODE_EVENT(xfs_iunlink);
DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
#endif /* _TRACE_XFS_H */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment