Commit 0fb3860a authored by Dan Magenheimer, committed by Greg Kroah-Hartman

staging: ramster: ramster-specific changes to zcache/tmem

In tmem.[ch], new "repatriate" (provoke async get) and "localify" (handle
incoming data resulting from an async get) routines combine with a handful
of changes to existing pamops interfaces to allow the generic tmem code
to support asynchronous operations.  Also, a new tmem_xhandle struct
groups together key information that must be passed to remote tmem stores.

Zcache-main.c is augmented with a large amount of ramster-specific code
to handle remote operations and "foreign" pages on both ends of the
"remotify" protocol.  New "foreign" pools are auto-created on demand.
A "selfshrinker" thread periodically repatriates remote persistent pages
when local memory conditions allow.  For certain operations, a queue is
necessary to guarantee strict ordering as out-of-order puts/flushes can
cause strange race conditions.  Pampd pointers now either point to local
memory OR describe a remote page; to allow the same 64-bits to describe
either, the LSB is used to differentiate.  Some acrobatics must be performed
to ensure local memory is available to handle a remote persistent get,
or deal with the data directly anyway if the malloc failed.  Lots
of ramster-specific statistics are available via sysfs.

Note: Some debug ifdefs left in for now.
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent b083e861
config ZCACHE config RAMSTER
tristate "Dynamic compression of swap pages and clean pagecache pages" tristate "Cross-machine RAM capacity sharing, aka peer-to-peer tmem"
depends on CLEANCACHE || FRONTSWAP depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS && !OCFS2_FS && !ZCACHE && !HIGHMEM
select XVMALLOC select XVMALLOC
select LZO_COMPRESS select LZO_COMPRESS
select LZO_DECOMPRESS select LZO_DECOMPRESS
default n default n
help help
Zcache doubles RAM efficiency while providing a significant RAMster allows RAM on other machines in a cluster to be utilized
performance boosts on many workloads. Zcache uses lzo1x dynamically and symmetrically instead of swapping to a local swap
compression and an in-kernel implementation of transcendent disk, thus improving performance on memory-constrained workloads
memory to store clean page cache pages and swap in RAM, while minimizing total RAM across the cluster. RAMster, like
providing a noticeable reduction in disk I/O. zcache, compresses swap pages into local RAM, but then remotifies
the compressed pages to another node in the RAMster cluster.
zcache-y := zcache-main.o tmem.o obj-$(CONFIG_RAMSTER) += zcache-main.o tmem.o
obj-$(CONFIG_RAMSTER) += ramster_o2net.o cluster/
obj-$(CONFIG_ZCACHE) += zcache.o
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#include <linux/delay.h>
#include "tmem.h" #include "tmem.h"
...@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) ...@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
} }
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
void *new_pampd) void *new_pampd, bool no_free)
{ {
struct tmem_objnode **slot; struct tmem_objnode **slot;
void *ret = NULL; void *ret = NULL;
...@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, ...@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
if ((slot != NULL) && (*slot != NULL)) { if ((slot != NULL) && (*slot != NULL)) {
void *old_pampd = *(void **)slot; void *old_pampd = *(void **)slot;
*(void **)slot = new_pampd; *(void **)slot = new_pampd;
(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0); if (!no_free)
(*tmem_pamops.free)(old_pampd, obj->pool,
NULL, 0, false);
ret = new_pampd; ret = new_pampd;
} }
return ret; return ret;
...@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj, ...@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,
if (ht == 1) { if (ht == 1) {
obj->pampd_count--; obj->pampd_count--;
(*tmem_pamops.free)(objnode->slots[i], (*tmem_pamops.free)(objnode->slots[i],
obj->pool, NULL, 0); obj->pool, NULL, 0, true);
objnode->slots[i] = NULL; objnode->slots[i] = NULL;
continue; continue;
} }
...@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) ...@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
return; return;
if (obj->objnode_tree_height == 0) { if (obj->objnode_tree_height == 0) {
obj->pampd_count--; obj->pampd_count--;
(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0); (*tmem_pamops.free)(obj->objnode_tree_root,
obj->pool, NULL, 0, true);
} else { } else {
tmem_objnode_node_destroy(obj, obj->objnode_tree_root, tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
obj->objnode_tree_height); obj->objnode_tree_height);
...@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) ...@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
* always flushes for simplicity. * always flushes for simplicity.
*/ */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
char *data, size_t size, bool raw, bool ephemeral) char *data, size_t size, bool raw, int ephemeral)
{ {
struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
void *pampd = NULL, *pampd_del = NULL; void *pampd = NULL, *pampd_del = NULL;
...@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, ...@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
/* if found, is a dup put, flush the old one */ /* if found, is a dup put, flush the old one */
pampd_del = tmem_pampd_delete_from_obj(obj, index); pampd_del = tmem_pampd_delete_from_obj(obj, index);
BUG_ON(pampd_del != pampd); BUG_ON(pampd_del != pampd);
(*tmem_pamops.free)(pampd, pool, oidp, index); (*tmem_pamops.free)(pampd, pool, oidp, index, true);
if (obj->pampd_count == 0) { if (obj->pampd_count == 0) {
objnew = obj; objnew = obj;
objfound = NULL; objfound = NULL;
...@@ -576,7 +580,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, ...@@ -576,7 +580,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
(void)tmem_pampd_delete_from_obj(obj, index); (void)tmem_pampd_delete_from_obj(obj, index);
free: free:
if (pampd) if (pampd)
(*tmem_pamops.free)(pampd, pool, NULL, 0); (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
if (objnew) { if (objnew) {
tmem_obj_free(objnew, hb); tmem_obj_free(objnew, hb);
(*tmem_hostops.obj_free)(objnew, pool); (*tmem_hostops.obj_free)(objnew, pool);
...@@ -586,6 +590,65 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, ...@@ -586,6 +590,65 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
return ret; return ret;
} }
/*
 * First half of a "localify" operation: find the pampd currently stored
 * for (oidp, index) in the given pool.
 *
 * The hashbucket selected by the oid is locked here and is NOT unlocked
 * before returning; the caller is expected to hand the bucket (returned
 * through saved_hb) back to tmem_localify_finish(), which drops the lock.
 *
 * @ret_obj:  receives the object found for oidp, or NULL if none exists
 * @saved_hb: receives the (still locked) hashbucket as an opaque pointer
 *
 * Returns the pampd found at index, or NULL if either the object or the
 * pampd does not exist.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *bucket = &pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *found;
	void *entry = NULL;

	spin_lock(&bucket->lock);
	found = tmem_obj_find(bucket, oidp);
	if (likely(found != NULL))
		entry = tmem_pampd_lookup_in_obj(found, index);

	/* hand both the object and the locked bucket back to the caller */
	*ret_obj = found;
	*saved_hb = (void *)bucket;

	/* intentionally returning with bucket->lock still held */
	return entry;
}
/*
 * Second half of a "localify" operation; must be called with the
 * hashbucket lock taken by tmem_localify_get_pampd() still held.
 *
 * @obj:      object returned by tmem_localify_get_pampd()
 * @index:    page index within the object
 * @pampd:    if non-NULL, the new pampd to install at index; the pampd
 *            previously stored there is replaced without being freed
 * @saved_hb: the locked hashbucket returned by tmem_localify_get_pampd()
 * @delete:   when pampd is NULL, remove the entry at index if true;
 *            if false the object is left untouched
 *
 * The hashbucket lock is released before returning.
 */
void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	/*
	 * Do not open-code BUG_ON(!spin_is_locked()): on uniprocessor
	 * builds without spinlock debugging spin_is_locked() is always 0,
	 * so that check would trigger unconditionally.
	 * assert_spin_locked() performs the same check where it is
	 * meaningful and compiles away where it is not.
	 */
	assert_spin_locked(&hb->lock);
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		/* no_free == true: the old pampd must not be freed here */
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, true);
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}
/*
 * Fetch a page through the pamops repatriate hooks on behalf of tmem_get(),
 * which calls this when is_remote() reports the looked-up pampd as remote.
 *
 * Must be entered with hb->lock held; the lock is ALWAYS released before
 * this function returns, on every path.
 *
 * @ppampd: slot holding the current pampd; may be overwritten with a
 *          new pampd obtained from repatriate_preload
 * @free, @data: passed through unchanged to the repatriate pamop
 *
 * Returns -EAGAIN if repatriate_preload reported the page as in transit
 * (the repatriate pamop is not called in that case); otherwise returns
 * whatever the repatriate pamop returns.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
struct tmem_pool *pool, struct tmem_oid *oidp,
uint32_t index, bool free, char *data)
{
void *old_pampd = *ppampd, *new_pampd = NULL;
bool intransit = false;
int ret = 0;
/* preload is only attempted for non-ephemeral pools; otherwise new_pampd stays NULL */
if (!is_ephemeral(pool))
new_pampd = (*tmem_pamops.repatriate_preload)(
old_pampd, pool, oidp, index, &intransit);
if (intransit)
ret = -EAGAIN;
else if (new_pampd != NULL)
/* install the preloaded pampd in the slot while the lock is still held */
*ppampd = new_pampd;
/* must release the hb->lock else repatriate can't sleep */
spin_unlock(&hb->lock);
/* runs unlocked; skipped entirely when the page is already in transit */
if (!intransit)
ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
oidp, index, free, data);
return ret;
}
/* /*
* "Get" a page, e.g. if one can be found, copy the tmem page with the * "Get" a page, e.g. if one can be found, copy the tmem page with the
* matching handle from PAM space to the kernel. By tmem definition, * matching handle from PAM space to the kernel. By tmem definition,
...@@ -607,14 +670,36 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, ...@@ -607,14 +670,36 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
int ret = -1; int ret = -1;
struct tmem_hashbucket *hb; struct tmem_hashbucket *hb;
bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
bool lock_held = false; bool lock_held = 0;
void **ppampd;
again:
hb = &pool->hashbucket[tmem_oid_hash(oidp)]; hb = &pool->hashbucket[tmem_oid_hash(oidp)];
spin_lock(&hb->lock); spin_lock(&hb->lock);
lock_held = true; lock_held = 1;
obj = tmem_obj_find(hb, oidp); obj = tmem_obj_find(hb, oidp);
if (obj == NULL) if (obj == NULL)
goto out; goto out;
ppampd = __tmem_pampd_lookup_in_obj(obj, index);
if (ppampd == NULL)
goto out;
if (tmem_pamops.is_remote(*ppampd)) {
ret = tmem_repatriate(ppampd, hb, pool, oidp,
index, free, data);
lock_held = 0; /* note hb->lock has been unlocked */
if (ret == -EAGAIN) {
/* rare I think, but should cond_resched()??? */
usleep_range(10, 1000);
goto again;
} else if (ret != 0) {
if (ret != -ENOENT)
pr_err("UNTESTED case in tmem_get, ret=%d\n",
ret);
ret = -1;
goto out;
}
goto out;
}
if (free) if (free)
pampd = tmem_pampd_delete_from_obj(obj, index); pampd = tmem_pampd_delete_from_obj(obj, index);
else else
...@@ -628,10 +713,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, ...@@ -628,10 +713,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
obj = NULL; obj = NULL;
} }
} }
if (tmem_pamops.is_remote(pampd)) {
lock_held = false;
spin_unlock(&hb->lock);
}
if (free) if (free)
ret = (*tmem_pamops.get_data_and_free)( ret = (*tmem_pamops.get_data_and_free)(
data, size, raw, pampd, pool, oidp, index); data, size, raw, pampd, pool, oidp, index);
...@@ -668,7 +749,7 @@ int tmem_flush_page(struct tmem_pool *pool, ...@@ -668,7 +749,7 @@ int tmem_flush_page(struct tmem_pool *pool,
pampd = tmem_pampd_delete_from_obj(obj, index); pampd = tmem_pampd_delete_from_obj(obj, index);
if (pampd == NULL) if (pampd == NULL)
goto out; goto out;
(*tmem_pamops.free)(pampd, pool, oidp, index); (*tmem_pamops.free)(pampd, pool, oidp, index, true);
if (obj->pampd_count == 0) { if (obj->pampd_count == 0) {
tmem_obj_free(obj, hb); tmem_obj_free(obj, hb);
(*tmem_hostops.obj_free)(obj, pool); (*tmem_hostops.obj_free)(obj, pool);
...@@ -682,8 +763,8 @@ int tmem_flush_page(struct tmem_pool *pool, ...@@ -682,8 +763,8 @@ int tmem_flush_page(struct tmem_pool *pool,
/* /*
* If a page in tmem matches the handle, replace the page so that any * If a page in tmem matches the handle, replace the page so that any
* subsequent "get" gets the new page. Returns 0 if * subsequent "get" gets the new page. Returns the new page if
* there was a page to replace, else returns -1. * there was a page to replace, else returns NULL.
*/ */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
uint32_t index, void *new_pampd) uint32_t index, void *new_pampd)
...@@ -697,7 +778,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, ...@@ -697,7 +778,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
obj = tmem_obj_find(hb, oidp); obj = tmem_obj_find(hb, oidp);
if (obj == NULL) if (obj == NULL)
goto out; goto out;
new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd); new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out: out:
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
#ifndef _TMEM_H_ #ifndef _TMEM_H_
#define _TMEM_H_ #define _TMEM_H_
#include <linux/types.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/hash.h> #include <linux/hash.h>
#include <linux/atomic.h> #include <linux/atomic.h>
...@@ -89,6 +88,31 @@ struct tmem_oid { ...@@ -89,6 +88,31 @@ struct tmem_oid {
uint64_t oid[3]; uint64_t oid[3];
}; };
/*
 * Extended tmem handle: groups together the key information that must be
 * passed to remote tmem stores in order to name a single page.
 */
struct tmem_xhandle {
uint8_t client_id; /* client the page belongs to */
uint8_t xh_data_cksum; /* presumably a checksum of the page data -- TODO confirm */
uint16_t xh_data_size; /* presumably the size of the page data -- TODO confirm units */
uint16_t pool_id; /* copied from the pool's pool_id */
struct tmem_oid oid; /* object id within the pool */
uint32_t index; /* page index within the object */
void *extra; /* opaque; use is not visible in this header */
};
static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id,
struct tmem_pool *pool,
struct tmem_oid *oidp,
uint32_t index)
{
struct tmem_xhandle xh;
xh.client_id = client_id;
xh.xh_data_cksum = (uint8_t)-1;
xh.xh_data_size = (uint16_t)-1;
xh.pool_id = pool->pool_id;
xh.oid = *oidp;
xh.index = index;
return xh;
}
static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
{ {
oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
...@@ -147,7 +171,11 @@ struct tmem_obj { ...@@ -147,7 +171,11 @@ struct tmem_obj {
unsigned int objnode_tree_height; unsigned int objnode_tree_height;
unsigned long objnode_count; unsigned long objnode_count;
long pampd_count; long pampd_count;
void *extra; /* for private use by pampd implementation */ /* for current design of ramster, all pages belonging to
* an object reside on the same remotenode and extra is
* used to record the number of the remotenode so a
* flush-object operation can specify it */
void *extra; /* for use by pampd implementation */
DECL_SENTINEL DECL_SENTINEL
}; };
...@@ -174,9 +202,14 @@ struct tmem_pamops { ...@@ -174,9 +202,14 @@ struct tmem_pamops {
int (*get_data_and_free)(char *, size_t *, bool, void *, int (*get_data_and_free)(char *, size_t *, bool, void *,
struct tmem_pool *, struct tmem_oid *, struct tmem_pool *, struct tmem_oid *,
uint32_t); uint32_t);
void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t); void (*free)(void *, struct tmem_pool *,
struct tmem_oid *, uint32_t, bool);
void (*free_obj)(struct tmem_pool *, struct tmem_obj *); void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
bool (*is_remote)(void *); bool (*is_remote)(void *);
void *(*repatriate_preload)(void *, struct tmem_pool *,
struct tmem_oid *, uint32_t, bool *);
int (*repatriate)(void *, void *, struct tmem_pool *,
struct tmem_oid *, uint32_t, bool, void *);
void (*new_obj)(struct tmem_obj *); void (*new_obj)(struct tmem_obj *);
int (*replace_in_obj)(void *, struct tmem_obj *); int (*replace_in_obj)(void *, struct tmem_obj *);
}; };
...@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m); ...@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m);
/* core tmem accessor functions */ /* core tmem accessor functions */
extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
char *, size_t, bool, bool); char *, size_t, bool, int);
extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
char *, size_t *, bool, int); char *, size_t *, bool, int);
extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
void *); void *);
extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *,
uint32_t index, struct tmem_obj **,
void **);
extern void tmem_localify_finish(struct tmem_obj *, uint32_t index,
void *, void *, bool);
extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
uint32_t index); uint32_t index);
extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment