Commit 8691fb83 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] radix-tree tags for selective lookup

Add radix-tree tagging so we can look up dirty or writeback pages in
O(log64(n)) time.

Each radix-tree node gains two bits for each slot: one for page dirtiness and
one for page writebackness.

If a tag bit is set on a leaf node, it indicates that item at the
corresponding slot is tagged (say, a dirty page).

If a tag bit is set in a non-leaf node it indicates that the same tag bit is
set in the subtree which lies under the corresponding slot.  ie: "there is a
dirty page under here somewhere, but you need to search down further to find
it".

A gang lookup function is provided which can walk the radix tree in
logarithmic time looking for items which are tagged, starting from a
specified offset.  We use this for in-order searches for dirty or writeback
pages.

There is a userspace test harness for this code at

http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz
parent e279bfef
......@@ -20,8 +20,7 @@
#define _LINUX_RADIX_TREE_H
#include <linux/preempt.h>
struct radix_tree_node;
#include <linux/types.h>
struct radix_tree_root {
unsigned int height;
......@@ -29,25 +28,40 @@ struct radix_tree_root {
struct radix_tree_node *rnode;
};
#define RADIX_TREE_INIT(mask) {0, (mask), NULL}
#define RADIX_TREE_INIT(mask) { \
.height = 0, \
.gfp_mask = (mask), \
.rnode = NULL, \
}
#define RADIX_TREE(name, mask) \
struct radix_tree_root name = RADIX_TREE_INIT(mask)
#define INIT_RADIX_TREE(root, mask) \
do { \
(root)->height = 0; \
(root)->gfp_mask = (mask); \
(root)->rnode = NULL; \
#define INIT_RADIX_TREE(root, mask) \
do { \
(root)->height = 0; \
(root)->gfp_mask = (mask); \
(root)->rnode = NULL; \
} while (0)
extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
extern unsigned int
int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items);
int radix_tree_preload(int gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, int tag);
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, int tag);
int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, int tag);
unsigned int
radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items, int tag);
int radix_tree_tagged(struct radix_tree_root *root, int tag);
static inline void radix_tree_preload_end(void)
{
......
......@@ -6,12 +6,12 @@
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2, or (at
* your option) any later version.
*
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
......@@ -28,21 +28,36 @@
#include <linux/cpu.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/bitops.h>
/*
* Radix tree node definition.
*
* RADIX_TREE_MAP_SHIFT must be >= log2(BITS_PER_LONG). Otherwise the tags
* array will have zero size and the set_tag() arithmetic will go wrong.
*/
#define RADIX_TREE_MAP_SHIFT 6
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
#ifdef __KERNEL__
#define RADIX_TREE_MAP_SHIFT 6
#else
#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
#endif
#define RADIX_TREE_TAGS 2
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
#define RADIX_TREE_TAG_LONGS \
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
struct radix_tree_node {
unsigned int count;
void *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
};
struct radix_tree_path {
struct radix_tree_node *node, **slot;
int offset;
};
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
......@@ -124,6 +139,22 @@ int radix_tree_preload(int gfp_mask)
return ret;
}
static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
{
if (!test_bit(offset, &node->tags[tag][0]))
__set_bit(offset, &node->tags[tag][0]);
}
static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
{
__clear_bit(offset, &node->tags[tag][0]);
}
static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
{
return test_bit(offset, &node->tags[tag][0]);
}
/*
* Return the maximum key which can be store into a
* radix tree with height HEIGHT.
......@@ -140,26 +171,53 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
{
struct radix_tree_node *node;
unsigned int height;
char tags[RADIX_TREE_TAGS];
int tag;
/* Figure out what the height should be. */
height = root->height + 1;
while (index > radix_tree_maxindex(height))
height++;
if (root->rnode) {
do {
if (!(node = radix_tree_node_alloc(root)))
return -ENOMEM;
/* Increase the height. */
node->slots[0] = root->rnode;
node->count = 1;
root->rnode = node;
root->height++;
} while (height > root->height);
} else
if (root->rnode == NULL) {
root->height = height;
goto out;
}
/*
* Prepare the tag status of the top-level node for propagation
* into the newly-pushed top-level node(s)
*/
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
int idx;
tags[tag] = 0;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (root->rnode->tags[tag][idx]) {
tags[tag] = 1;
break;
}
}
}
do {
if (!(node = radix_tree_node_alloc(root)))
return -ENOMEM;
/* Increase the height. */
node->slots[0] = root->rnode;
/* Propagate the aggregated tag info into the new root */
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
if (tags[tag])
tag_set(node, tag, 0);
}
node->count = 1;
root->rnode = node;
root->height++;
} while (height > root->height);
out:
return 0;
}
......@@ -171,23 +229,27 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
*
* Insert an item into the radix tree at position @index.
*/
int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
int radix_tree_insert(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node = NULL, *tmp, **slot;
unsigned int height, shift;
int offset;
int error;
/* Make sure the tree is high enough. */
if (index > radix_tree_maxindex(root->height)) {
if ((!index && !root->rnode) ||
index > radix_tree_maxindex(root->height)) {
error = radix_tree_extend(root, index);
if (error)
return error;
}
slot = &root->rnode;
height = root->height;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
offset = 0; /* uninitialised var warning */
while (height > 0) {
if (*slot == NULL) {
/* Have to add a child node. */
......@@ -198,18 +260,21 @@ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *i
node->count++;
}
/* Go a level down. */
/* Go a level down */
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
node = *slot;
slot = (struct radix_tree_node **)
(node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
slot = (struct radix_tree_node **)(node->slots + offset);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
if (*slot != NULL)
return -EEXIST;
if (node)
if (node) {
node->count++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
}
*slot = item;
return 0;
......@@ -221,7 +286,7 @@ EXPORT_SYMBOL(radix_tree_insert);
* @root: radix tree root
* @index: index key
*
* Lookup them item at the position @index in the radix tree @root.
* Lookup the item at the position @index in the radix tree @root.
*/
void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
......@@ -240,16 +305,174 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
return NULL;
slot = (struct radix_tree_node **)
((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
((*slot)->slots +
((index >> shift) & RADIX_TREE_MAP_MASK));
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
return (void *) *slot;
return *slot;
}
EXPORT_SYMBOL(radix_tree_lookup);
static /* inline */ unsigned int
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index
*
* Set the search tag corresponging to @index in the radix tree. From
* the root all the way down to the leaf node.
*
* Returns the address of the tagged item. Setting a tag on a not-present
* item is a bug.
*/
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, int tag)
{
unsigned int height, shift;
struct radix_tree_node **slot;
height = root->height;
if (index > radix_tree_maxindex(height))
return NULL;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = &root->rnode;
while (height > 0) {
int offset;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
tag_set(*slot, tag, offset);
slot = (struct radix_tree_node **)((*slot)->slots + offset);
BUG_ON(*slot == NULL);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
return *slot;
}
EXPORT_SYMBOL(radix_tree_tag_set);
/**
* radix_tree_tag_clear - clear a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index
*
* Clear the search tag corresponging to @index in the radix tree. If
* this causes the leaf node to have no tags set then clear the tag in the
* next-to-leaf node, etc.
*
* Returns the address of the tagged item on success, else NULL. ie:
* has the same return value and semantics as radix_tree_lookup().
*/
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, int tag)
{
struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
unsigned int height, shift;
void *ret = NULL;
height = root->height;
if (index > radix_tree_maxindex(height))
goto out;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL;
pathp->slot = &root->rnode;
while (height > 0) {
int offset;
if (*pathp->slot == NULL)
goto out;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
pathp[1].offset = offset;
pathp[1].node = *pathp[0].slot;
pathp[1].slot = (struct radix_tree_node **)
(pathp[1].node->slots + offset);
pathp++;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
ret = *pathp[0].slot;
if (ret == NULL)
goto out;
do {
int idx;
tag_clear(pathp[0].node, tag, pathp[0].offset);
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (pathp[0].node->tags[tag][idx])
goto out;
}
pathp--;
} while (pathp[0].node);
out:
return ret;
}
EXPORT_SYMBOL(radix_tree_tag_clear);
#ifndef __KERNEL__ /* Only the test harness uses this at present */
/**
* radix_tree_tag_get - get a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index
*
* Return the search tag corresponging to @index in the radix tree.
*
* Returns zero if the tag is unset, or if there is no corresponding item
* in the tree.
*/
int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, int tag)
{
unsigned int height, shift;
struct radix_tree_node **slot;
int saw_unset_tag = 0;
height = root->height;
if (index > radix_tree_maxindex(height))
return 0;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = &root->rnode;
for ( ; ; ) {
int offset;
if (*slot == NULL)
return 0;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
/*
* This is just a debug check. Later, we can bale as soon as
* we see an unset tag.
*/
if (!tag_get(*slot, tag, offset))
saw_unset_tag = 1;
if (height == 1) {
int ret = tag_get(*slot, tag, offset);
BUG_ON(ret && saw_unset_tag);
return ret;
}
slot = (struct radix_tree_node **)((*slot)->slots + offset);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
}
EXPORT_SYMBOL(radix_tree_tag_get);
#endif
static unsigned int
__lookup(struct radix_tree_root *root, void **results, unsigned long index,
unsigned int max_items, unsigned long *next_index)
{
......@@ -316,17 +539,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long cur_index = first_index;
unsigned int ret = 0;
if (root->rnode == NULL)
goto out;
if (max_index == 0) { /* Bah. Special case */
if (first_index == 0) {
if (max_items > 0) {
*results = root->rnode;
ret = 1;
}
}
goto out;
}
while (ret < max_items) {
unsigned int nr_found;
unsigned long next_index; /* Index of next search */
......@@ -340,11 +552,101 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
break;
cur_index = next_index;
}
out:
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);
/*
* FIXME: the two tag_get()s here should use find_next_bit() instead of
* open-coding the search.
*/
static unsigned int
__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
unsigned int max_items, unsigned long *next_index, int tag)
{
unsigned int nr_found = 0;
unsigned int shift;
unsigned int height = root->height;
struct radix_tree_node *slot;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = root->rnode;
while (height > 0) {
unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
if (tag_get(slot, tag, i)) {
BUG_ON(slot->slots[i] == NULL);
break;
}
index &= ~((1 << shift) - 1);
index += 1 << shift;
if (index == 0)
goto out; /* 32-bit wraparound */
}
if (i == RADIX_TREE_MAP_SIZE)
goto out;
height--;
if (height == 0) { /* Bottom level: grab some items */
unsigned long j = index & RADIX_TREE_MAP_MASK;
for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
index++;
if (tag_get(slot, tag, j)) {
BUG_ON(slot->slots[j] == NULL);
results[nr_found++] = slot->slots[j];
if (nr_found == max_items)
goto out;
}
}
}
shift -= RADIX_TREE_MAP_SHIFT;
slot = slot->slots[i];
}
out:
*next_index = index;
return nr_found;
}
/**
* radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
* based on a tag
* @root: radix tree root
* @results: where the results of the lookup are placed
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
* @tag: the tag index
*
* Performs an index-ascending scan of the tree for present items which
* have the tag indexed by @tag set. Places the items at *@results and
* returns the number of items which were placed at *@results.
*/
unsigned int
radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items, int tag)
{
const unsigned long max_index = radix_tree_maxindex(root->height);
unsigned long cur_index = first_index;
unsigned int ret = 0;
while (ret < max_items) {
unsigned int nr_found;
unsigned long next_index; /* Index of next search */
if (cur_index > max_index)
break;
nr_found = __lookup_tag(root, results + ret, cur_index,
max_items - ret, &next_index, tag);
ret += nr_found;
if (next_index == 0)
break;
cur_index = next_index;
}
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
/**
* radix_tree_delete - delete an item from a radix tree
* @root: radix tree root
......@@ -357,24 +659,31 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
struct radix_tree_path *orig_pathp;
unsigned int height, shift;
void *ret = NULL;
char tags[RADIX_TREE_TAGS];
int nr_cleared_tags;
height = root->height;
if (index > radix_tree_maxindex(height))
goto out;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL;
pathp->slot = &root->rnode;
while (height > 0) {
int offset;
if (*pathp->slot == NULL)
goto out;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
pathp[1].offset = offset;
pathp[1].node = *pathp[0].slot;
pathp[1].slot = (struct radix_tree_node **)
(pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
(pathp[1].node->slots + offset);
pathp++;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
......@@ -384,20 +693,67 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
if (ret == NULL)
goto out;
orig_pathp = pathp;
/*
* Clear all tags associated with the just-deleted item
*/
memset(tags, 0, sizeof(tags));
do {
int tag;
nr_cleared_tags = RADIX_TREE_TAGS;
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
int idx;
if (!tags[tag])
tag_clear(pathp[0].node, tag, pathp[0].offset);
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (pathp[0].node->tags[tag][idx]) {
tags[tag] = 1;
nr_cleared_tags--;
break;
}
}
}
pathp--;
} while (pathp[0].node && nr_cleared_tags);
pathp = orig_pathp;
*pathp[0].slot = NULL;
while (pathp[0].node && --pathp[0].node->count == 0) {
pathp--;
BUG_ON(*pathp[0].slot == NULL);
*pathp[0].slot = NULL;
radix_tree_node_free(pathp[1].node);
}
if (root->rnode == NULL)
root->height = 0; /* Empty tree, we can reset the height */
root->height = 0;
out:
return ret;
}
EXPORT_SYMBOL(radix_tree_delete);
/**
* radix_tree_tagged - test whether any items in the tree are tagged
* @root: radix tree root
* @tag: tag to test
*/
int radix_tree_tagged(struct radix_tree_root *root, int tag)
{
int idx;
if (!root->rnode)
return 0;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (root->rnode->tags[tag][idx])
return 1;
}
return 0;
}
EXPORT_SYMBOL(radix_tree_tagged);
static void
radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment