Commit e7fd4179 authored by David Teigland, committed by Steven Whitehouse

[DLM] The core of the DLM for GFS2/CLVM

This is the core of the distributed lock manager, which is required
to use GFS2 as a cluster filesystem. It is also used by CLVM and
can be used as a standalone lock manager independently of either
of these two projects.

It implements VAX-style locking modes.
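As a hedged illustration of what "VAX-style" means (this is the classic
VMS compatibility table, not a copy of the in-tree matrix), a helper such
as dlm_modes_compat() encodes which two of the six modes may be held on
the same resource concurrently:

	/* Sketch of VAX/VMS lock mode compatibility: 1 means the two
	 * modes can coexist on one resource. The mode constants are
	 * DLM_LOCK_NL..DLM_LOCK_EX from <linux/dlm.h>. */
	static const int modes_compat_sketch[6][6] = {
		/*         NL CR CW PR PW EX */
		/* NL */ {  1, 1, 1, 1, 1, 1 },
		/* CR */ {  1, 1, 1, 1, 1, 0 },
		/* CW */ {  1, 1, 1, 0, 0, 0 },
		/* PR */ {  1, 1, 0, 1, 0, 0 },
		/* PW */ {  1, 1, 0, 0, 0, 0 },
		/* EX */ {  1, 0, 0, 0, 0, 0 },
	};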
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steve Whitehouse <swhiteho@redhat.com>
parent e4731420
@@ -1831,6 +1831,7 @@ source "fs/partitions/Kconfig"
endmenu
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
endmenu
@@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS) += sysfs/
obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
menu "Distributed Lock Manager"
depends on INET && EXPERIMENTAL
config DLM
tristate "Distributed Lock Manager (DLM)"
depends on SYSFS
depends on IPV6 || IPV6=n
select IP_SCTP
select CONFIGFS_FS
help
A general purpose distributed lock manager for kernel or userspace
applications.
config DLM_DEVICE
tristate "DLM device for userspace access"
depends on DLM
help
This module creates a misc device through which the dlm lockspace
and locking functions become available to userspace applications
(usually through the libdlm library).
config DLM_DEBUG
bool "DLM debugging"
depends on DLM
help
Under the debugfs mount point, the name of each lockspace will
appear as a file in the "dlm" directory. The output is the
list of resources and locks the local node knows about.
endmenu
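As a hedged sample of the DLM_DEBUG output described above (the pointer,
lock id, and name are invented; the layout follows print_resource() and
print_lock() in this patch):

	Resource ffff8100089f2d4 Name (len=6) "mydata"
	Master Copy
	Granted Queue
	00010001 EX
	Conversion Queue
	Waiting Queue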
obj-$(CONFIG_DLM) += dlm.o
obj-$(CONFIG_DLM_DEVICE) += dlm_device.o
dlm-y := ast.o \
config.o \
dir.o \
lock.o \
lockspace.o \
lowcomms.o \
main.o \
member.o \
memory.o \
midcomms.o \
rcom.o \
recover.o \
recoverd.o \
requestqueue.o \
util.o
dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
dlm_device-y := device.o
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lock.h"
#include "ast.h"
#define WAKE_ASTS 0
static struct list_head ast_queue;
static spinlock_t ast_queue_lock;
static struct task_struct * astd_task;
static unsigned long astd_wakeflags;
static struct semaphore astd_running;
void dlm_del_ast(struct dlm_lkb *lkb)
{
spin_lock(&ast_queue_lock);
if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
list_del(&lkb->lkb_astqueue);
spin_unlock(&ast_queue_lock);
}
void dlm_add_ast(struct dlm_lkb *lkb, int type)
{
spin_lock(&ast_queue_lock);
if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_astqueue, &ast_queue);
}
lkb->lkb_ast_type |= type;
spin_unlock(&ast_queue_lock);
set_bit(WAKE_ASTS, &astd_wakeflags);
wake_up_process(astd_task);
}
static void process_asts(void)
{
struct dlm_ls *ls = NULL;
struct dlm_rsb *r = NULL;
struct dlm_lkb *lkb;
void (*cast) (long param);
void (*bast) (long param, int mode);
int type = 0, found, bmode;
for (;;) {
found = FALSE;
spin_lock(&ast_queue_lock);
list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
r = lkb->lkb_resource;
ls = r->res_ls;
if (dlm_locking_stopped(ls))
continue;
list_del(&lkb->lkb_astqueue);
type = lkb->lkb_ast_type;
lkb->lkb_ast_type = 0;
found = TRUE;
break;
}
spin_unlock(&ast_queue_lock);
if (!found)
break;
cast = lkb->lkb_astaddr;
bast = lkb->lkb_bastaddr;
bmode = lkb->lkb_bastmode;
if ((type & AST_COMP) && cast)
cast(lkb->lkb_astparam);
/* FIXME: Is it safe to look at lkb_grmode here
without doing a lock_rsb() ?
Look at other checks in v1 to avoid basts. */
if ((type & AST_BAST) && bast)
if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
bast(lkb->lkb_astparam, bmode);
/* this removes the reference added by dlm_add_ast
and may result in the lkb being freed */
dlm_put_lkb(lkb);
schedule();
}
}
static inline int no_asts(void)
{
int ret;
spin_lock(&ast_queue_lock);
ret = list_empty(&ast_queue);
spin_unlock(&ast_queue_lock);
return ret;
}
static int dlm_astd(void *data)
{
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!test_bit(WAKE_ASTS, &astd_wakeflags))
schedule();
set_current_state(TASK_RUNNING);
down(&astd_running);
if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
process_asts();
up(&astd_running);
}
return 0;
}
void dlm_astd_wake(void)
{
if (!no_asts()) {
set_bit(WAKE_ASTS, &astd_wakeflags);
wake_up_process(astd_task);
}
}
int dlm_astd_start(void)
{
struct task_struct *p;
int error = 0;
INIT_LIST_HEAD(&ast_queue);
spin_lock_init(&ast_queue_lock);
init_MUTEX(&astd_running);
p = kthread_run(dlm_astd, NULL, "dlm_astd");
if (IS_ERR(p))
error = PTR_ERR(p);
else
astd_task = p;
return error;
}
void dlm_astd_stop(void)
{
kthread_stop(astd_task);
}
void dlm_astd_suspend(void)
{
down(&astd_running);
}
void dlm_astd_resume(void)
{
up(&astd_running);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
void dlm_add_ast(struct dlm_lkb *lkb, int type);
void dlm_del_ast(struct dlm_lkb *lkb);
void dlm_astd_wake(void);
int dlm_astd_start(void);
void dlm_astd_stop(void);
void dlm_astd_suspend(void);
void dlm_astd_resume(void);
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
#include <net/sock.h>
#include "config.h"
/*
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
* /config/dlm/<cluster>/comms/<comm>/nodeid
* /config/dlm/<cluster>/comms/<comm>/local
* /config/dlm/<cluster>/comms/<comm>/addr
* The <cluster> level is useless, but I haven't figured out how to avoid it.
*/
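/*
 * As a rough illustration (the cluster, lockspace and node names below
 * are made up), a userspace cluster manager populates this tree with
 * ordinary mkdir(2) and write(2) calls before a lockspace is used:
 *
 *	mkdir("/config/dlm/mycluster", 0755);
 *	mkdir("/config/dlm/mycluster/spaces/myls", 0755);
 *	mkdir("/config/dlm/mycluster/spaces/myls/nodes/1", 0755);
 *	fd = open("/config/dlm/mycluster/spaces/myls/nodes/1/nodeid",
 *		  O_WRONLY);
 *	write(fd, "1", 1);	(handled by node_nodeid_write() below)
 *
 * Comms entries are filled in the same way; "addr" expects a raw
 * struct sockaddr_storage (see comm_addr_write()).
 */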
static struct config_group *space_list;
static struct config_group *comm_list;
static struct comm *local_comm;
struct clusters;
struct cluster;
struct spaces;
struct space;
struct comms;
struct comm;
struct nodes;
struct node;
static struct config_group *make_cluster(struct config_group *, const char *);
static void drop_cluster(struct config_group *, struct config_item *);
static void release_cluster(struct config_item *);
static struct config_group *make_space(struct config_group *, const char *);
static void drop_space(struct config_group *, struct config_item *);
static void release_space(struct config_item *);
static struct config_item *make_comm(struct config_group *, const char *);
static void drop_comm(struct config_group *, struct config_item *);
static void release_comm(struct config_item *);
static struct config_item *make_node(struct config_group *, const char *);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);
static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
char *buf);
static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len);
static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
char *buf);
static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len);
static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
static ssize_t comm_local_read(struct comm *cm, char *buf);
static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
static ssize_t node_nodeid_read(struct node *nd, char *buf);
static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
static ssize_t node_weight_read(struct node *nd, char *buf);
static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
enum {
COMM_ATTR_NODEID = 0,
COMM_ATTR_LOCAL,
COMM_ATTR_ADDR,
};
struct comm_attribute {
struct configfs_attribute attr;
ssize_t (*show)(struct comm *, char *);
ssize_t (*store)(struct comm *, const char *, size_t);
};
static struct comm_attribute comm_attr_nodeid = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "nodeid",
.ca_mode = S_IRUGO | S_IWUSR },
.show = comm_nodeid_read,
.store = comm_nodeid_write,
};
static struct comm_attribute comm_attr_local = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "local",
.ca_mode = S_IRUGO | S_IWUSR },
.show = comm_local_read,
.store = comm_local_write,
};
static struct comm_attribute comm_attr_addr = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "addr",
.ca_mode = S_IRUGO | S_IWUSR },
.store = comm_addr_write,
};
static struct configfs_attribute *comm_attrs[] = {
[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
NULL,
};
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
};
struct node_attribute {
struct configfs_attribute attr;
ssize_t (*show)(struct node *, char *);
ssize_t (*store)(struct node *, const char *, size_t);
};
static struct node_attribute node_attr_nodeid = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "nodeid",
.ca_mode = S_IRUGO | S_IWUSR },
.show = node_nodeid_read,
.store = node_nodeid_write,
};
static struct node_attribute node_attr_weight = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "weight",
.ca_mode = S_IRUGO | S_IWUSR },
.show = node_weight_read,
.store = node_weight_write,
};
static struct configfs_attribute *node_attrs[] = {
[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
NULL,
};
struct clusters {
struct configfs_subsystem subsys;
};
struct cluster {
struct config_group group;
};
struct spaces {
struct config_group ss_group;
};
struct space {
struct config_group group;
struct list_head members;
struct semaphore members_lock;
int members_count;
};
struct comms {
struct config_group cs_group;
};
struct comm {
struct config_item item;
int nodeid;
int local;
int addr_count;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
struct nodes {
struct config_group ns_group;
};
struct node {
struct config_item item;
struct list_head list; /* space->members */
int nodeid;
int weight;
};
static struct configfs_group_operations clusters_ops = {
.make_group = make_cluster,
.drop_item = drop_cluster,
};
static struct configfs_item_operations cluster_ops = {
.release = release_cluster,
};
static struct configfs_group_operations spaces_ops = {
.make_group = make_space,
.drop_item = drop_space,
};
static struct configfs_item_operations space_ops = {
.release = release_space,
};
static struct configfs_group_operations comms_ops = {
.make_item = make_comm,
.drop_item = drop_comm,
};
static struct configfs_item_operations comm_ops = {
.release = release_comm,
.show_attribute = show_comm,
.store_attribute = store_comm,
};
static struct configfs_group_operations nodes_ops = {
.make_item = make_node,
.drop_item = drop_node,
};
static struct configfs_item_operations node_ops = {
.release = release_node,
.show_attribute = show_node,
.store_attribute = store_node,
};
static struct config_item_type clusters_type = {
.ct_group_ops = &clusters_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type cluster_type = {
.ct_item_ops = &cluster_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type spaces_type = {
.ct_group_ops = &spaces_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type space_type = {
.ct_item_ops = &space_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type comms_type = {
.ct_group_ops = &comms_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type comm_type = {
.ct_item_ops = &comm_ops,
.ct_attrs = comm_attrs,
.ct_owner = THIS_MODULE,
};
static struct config_item_type nodes_type = {
.ct_group_ops = &nodes_ops,
.ct_owner = THIS_MODULE,
};
static struct config_item_type node_type = {
.ct_item_ops = &node_ops,
.ct_attrs = node_attrs,
.ct_owner = THIS_MODULE,
};
static struct cluster *to_cluster(struct config_item *i)
{
return i ? container_of(to_config_group(i), struct cluster, group) : NULL;
}
static struct space *to_space(struct config_item *i)
{
return i ? container_of(to_config_group(i), struct space, group) : NULL;
}
static struct comm *to_comm(struct config_item *i)
{
return i ? container_of(i, struct comm, item) : NULL;
}
static struct node *to_node(struct config_item *i)
{
return i ? container_of(i, struct node, item) : NULL;
}
static struct config_group *make_cluster(struct config_group *g,
const char *name)
{
struct cluster *cl = NULL;
struct spaces *sps = NULL;
struct comms *cms = NULL;
void *gps = NULL;
cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
if (!cl || !gps || !sps || !cms)
goto fail;
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
cl->group.default_groups = gps;
cl->group.default_groups[0] = &sps->ss_group;
cl->group.default_groups[1] = &cms->cs_group;
cl->group.default_groups[2] = NULL;
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
return &cl->group;
fail:
kfree(cl);
kfree(gps);
kfree(sps);
kfree(cms);
return NULL;
}
static void drop_cluster(struct config_group *g, struct config_item *i)
{
struct cluster *cl = to_cluster(i);
struct config_item *tmp;
int j;
for (j = 0; cl->group.default_groups[j]; j++) {
tmp = &cl->group.default_groups[j]->cg_item;
cl->group.default_groups[j] = NULL;
config_item_put(tmp);
}
space_list = NULL;
comm_list = NULL;
config_item_put(i);
}
static void release_cluster(struct config_item *i)
{
struct cluster *cl = to_cluster(i);
kfree(cl->group.default_groups);
kfree(cl);
}
static struct config_group *make_space(struct config_group *g, const char *name)
{
struct space *sp = NULL;
struct nodes *nds = NULL;
void *gps = NULL;
sp = kzalloc(sizeof(struct space), GFP_KERNEL);
gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
if (!sp || !gps || !nds)
goto fail;
config_group_init_type_name(&sp->group, name, &space_type);
config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
sp->group.default_groups = gps;
sp->group.default_groups[0] = &nds->ns_group;
sp->group.default_groups[1] = NULL;
INIT_LIST_HEAD(&sp->members);
init_MUTEX(&sp->members_lock);
sp->members_count = 0;
return &sp->group;
fail:
kfree(sp);
kfree(gps);
kfree(nds);
return NULL;
}
static void drop_space(struct config_group *g, struct config_item *i)
{
struct space *sp = to_space(i);
struct config_item *tmp;
int j;
/* assert list_empty(&sp->members) */
for (j = 0; sp->group.default_groups[j]; j++) {
tmp = &sp->group.default_groups[j]->cg_item;
sp->group.default_groups[j] = NULL;
config_item_put(tmp);
}
config_item_put(i);
}
static void release_space(struct config_item *i)
{
struct space *sp = to_space(i);
kfree(sp->group.default_groups);
kfree(sp);
}
static struct config_item *make_comm(struct config_group *g, const char *name)
{
struct comm *cm;
cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
if (!cm)
return NULL;
config_item_init_type_name(&cm->item, name, &comm_type);
cm->nodeid = -1;
cm->local = 0;
cm->addr_count = 0;
return &cm->item;
}
static void drop_comm(struct config_group *g, struct config_item *i)
{
struct comm *cm = to_comm(i);
if (local_comm == cm)
local_comm = NULL;
while (cm->addr_count--)
kfree(cm->addr[cm->addr_count]);
config_item_put(i);
}
static void release_comm(struct config_item *i)
{
struct comm *cm = to_comm(i);
kfree(cm);
}
static struct config_item *make_node(struct config_group *g, const char *name)
{
struct space *sp = to_space(g->cg_item.ci_parent);
struct node *nd;
nd = kzalloc(sizeof(struct node), GFP_KERNEL);
if (!nd)
return NULL;
config_item_init_type_name(&nd->item, name, &node_type);
nd->nodeid = -1;
nd->weight = 1; /* default weight of 1 if none is set */
down(&sp->members_lock);
list_add(&nd->list, &sp->members);
sp->members_count++;
up(&sp->members_lock);
return &nd->item;
}
static void drop_node(struct config_group *g, struct config_item *i)
{
struct space *sp = to_space(g->cg_item.ci_parent);
struct node *nd = to_node(i);
down(&sp->members_lock);
list_del(&nd->list);
sp->members_count--;
up(&sp->members_lock);
config_item_put(i);
}
static void release_node(struct config_item *i)
{
struct node *nd = to_node(i);
kfree(nd);
}
static struct clusters clusters_root = {
.subsys = {
.su_group = {
.cg_item = {
.ci_namebuf = "dlm",
.ci_type = &clusters_type,
},
},
},
};
int dlm_config_init(void)
{
config_group_init(&clusters_root.subsys.su_group);
init_MUTEX(&clusters_root.subsys.su_sem);
return configfs_register_subsystem(&clusters_root.subsys);
}
void dlm_config_exit(void)
{
configfs_unregister_subsystem(&clusters_root.subsys);
}
/*
* Functions for user space to read/write attributes
*/
static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
char *buf)
{
struct comm *cm = to_comm(i);
struct comm_attribute *cma =
container_of(a, struct comm_attribute, attr);
return cma->show ? cma->show(cm, buf) : 0;
}
static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len)
{
struct comm *cm = to_comm(i);
struct comm_attribute *cma =
container_of(a, struct comm_attribute, attr);
return cma->store ? cma->store(cm, buf, len) : -EINVAL;
}
static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
{
return sprintf(buf, "%d\n", cm->nodeid);
}
static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
{
cm->nodeid = simple_strtol(buf, NULL, 0);
return len;
}
static ssize_t comm_local_read(struct comm *cm, char *buf)
{
return sprintf(buf, "%d\n", cm->local);
}
static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
{
cm->local = simple_strtol(buf, NULL, 0);
if (cm->local && !local_comm)
local_comm = cm;
return len;
}
static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
{
struct sockaddr_storage *addr;
if (len != sizeof(struct sockaddr_storage))
return -EINVAL;
if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
return -ENOSPC;
addr = kzalloc(sizeof(*addr), GFP_KERNEL);
if (!addr)
return -ENOMEM;
memcpy(addr, buf, len);
cm->addr[cm->addr_count++] = addr;
return len;
}
static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
char *buf)
{
struct node *nd = to_node(i);
struct node_attribute *nda =
container_of(a, struct node_attribute, attr);
return nda->show ? nda->show(nd, buf) : 0;
}
static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len)
{
struct node *nd = to_node(i);
struct node_attribute *nda =
container_of(a, struct node_attribute, attr);
return nda->store ? nda->store(nd, buf, len) : -EINVAL;
}
static ssize_t node_nodeid_read(struct node *nd, char *buf)
{
return sprintf(buf, "%d\n", nd->nodeid);
}
static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
{
nd->nodeid = simple_strtol(buf, NULL, 0);
return len;
}
static ssize_t node_weight_read(struct node *nd, char *buf)
{
return sprintf(buf, "%d\n", nd->weight);
}
static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
{
nd->weight = simple_strtol(buf, NULL, 0);
return len;
}
/*
* Functions for the dlm to get the info that's been configured
*/
static struct space *get_space(char *name)
{
if (!space_list)
return NULL;
return to_space(config_group_find_obj(space_list, name));
}
static void put_space(struct space *sp)
{
config_item_put(&sp->group.cg_item);
}
static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
{
struct config_item *i;
struct comm *cm = NULL;
int found = 0;
if (!comm_list)
return NULL;
down(&clusters_root.subsys.su_sem);
list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
cm = to_comm(i);
if (nodeid) {
if (cm->nodeid != nodeid)
continue;
found = 1;
break;
} else {
if (!cm->addr_count ||
memcmp(cm->addr[0], addr, sizeof(*addr)))
continue;
found = 1;
break;
}
}
up(&clusters_root.subsys.su_sem);
if (found)
config_item_get(i);
else
cm = NULL;
return cm;
}
static void put_comm(struct comm *cm)
{
config_item_put(&cm->item);
}
/* caller must free the returned ids array */
int dlm_nodeid_list(char *lsname, int **ids_out)
{
struct space *sp;
struct node *nd;
int i = 0, rv = 0;
int *ids;
sp = get_space(lsname);
if (!sp)
return -EEXIST;
down(&sp->members_lock);
if (!sp->members_count) {
rv = 0;
goto out;
}
ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
if (!ids) {
rv = -ENOMEM;
goto out;
}
rv = sp->members_count;
list_for_each_entry(nd, &sp->members, list)
ids[i++] = nd->nodeid;
if (rv != i)
printk("bad nodeid count %d %d\n", rv, i);
*ids_out = ids;
out:
up(&sp->members_lock);
put_space(sp);
return rv;
}
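/*
 * A minimal usage sketch (the lockspace name is made up): the ids
 * array is allocated in dlm_nodeid_list(), so the caller must
 * kfree() it when the return value is positive.
 *
 *	int *ids, count, i;
 *
 *	count = dlm_nodeid_list("myls", &ids);
 *	for (i = 0; i < count; i++)
 *		printk("member nodeid %d\n", ids[i]);
 *	if (count > 0)
 *		kfree(ids);
 */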
int dlm_node_weight(char *lsname, int nodeid)
{
struct space *sp;
struct node *nd;
int w = -EEXIST;
sp = get_space(lsname);
if (!sp)
goto out;
down(&sp->members_lock);
list_for_each_entry(nd, &sp->members, list) {
if (nd->nodeid != nodeid)
continue;
w = nd->weight;
break;
}
up(&sp->members_lock);
put_space(sp);
out:
return w;
}
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
{
struct comm *cm = get_comm(nodeid, NULL);
if (!cm)
return -EEXIST;
if (!cm->addr_count)
return -ENOENT;
memcpy(addr, cm->addr[0], sizeof(*addr));
put_comm(cm);
return 0;
}
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
struct comm *cm = get_comm(0, addr);
if (!cm)
return -EEXIST;
*nodeid = cm->nodeid;
put_comm(cm);
return 0;
}
int dlm_our_nodeid(void)
{
return local_comm ? local_comm->nodeid : 0;
}
/* num 0 is first addr, num 1 is second addr */
int dlm_our_addr(struct sockaddr_storage *addr, int num)
{
if (!local_comm)
return -1;
if (num + 1 > local_comm->addr_count)
return -1;
memcpy(addr, local_comm->addr[num], sizeof(*addr));
return 0;
}
/* Config file defaults */
#define DEFAULT_TCP_PORT 21064
#define DEFAULT_BUFFER_SIZE 4096
#define DEFAULT_RSBTBL_SIZE 256
#define DEFAULT_LKBTBL_SIZE 1024
#define DEFAULT_DIRTBL_SIZE 512
#define DEFAULT_RECOVER_TIMER 5
#define DEFAULT_TOSS_SECS 10
#define DEFAULT_SCAN_SECS 5
struct dlm_config_info dlm_config = {
.tcp_port = DEFAULT_TCP_PORT,
.buffer_size = DEFAULT_BUFFER_SIZE,
.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
.dirtbl_size = DEFAULT_DIRTBL_SIZE,
.recover_timer = DEFAULT_RECOVER_TIMER,
.toss_secs = DEFAULT_TOSS_SECS,
.scan_secs = DEFAULT_SCAN_SECS
};
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
#define DLM_MAX_ADDR_COUNT 3
struct dlm_config_info {
int tcp_port;
int buffer_size;
int rsbtbl_size;
int lkbtbl_size;
int dirtbl_size;
int recover_timer;
int toss_secs;
int scan_secs;
};
extern struct dlm_config_info dlm_config;
int dlm_config_init(void);
void dlm_config_exit(void);
int dlm_node_weight(char *lsname, int nodeid);
int dlm_nodeid_list(char *lsname, int **ids_out);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
#endif /* __CONFIG_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include "dlm_internal.h"
static struct dentry *dlm_root;
struct rsb_iter {
int entry;
struct dlm_ls *ls;
struct list_head *next;
struct dlm_rsb *rsb;
};
static char *print_lockmode(int mode)
{
switch (mode) {
case DLM_LOCK_IV:
return "--";
case DLM_LOCK_NL:
return "NL";
case DLM_LOCK_CR:
return "CR";
case DLM_LOCK_CW:
return "CW";
case DLM_LOCK_PR:
return "PR";
case DLM_LOCK_PW:
return "PW";
case DLM_LOCK_EX:
return "EX";
default:
return "??";
}
}
static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
struct dlm_rsb *res)
{
seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
if (lkb->lkb_status == DLM_LKSTS_CONVERT
|| lkb->lkb_status == DLM_LKSTS_WAITING)
seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
if (lkb->lkb_range) {
/* FIXME: this warns on Alpha */
if (lkb->lkb_status == DLM_LKSTS_CONVERT
|| lkb->lkb_status == DLM_LKSTS_GRANTED)
seq_printf(s, " %" PRIx64 "-%" PRIx64,
lkb->lkb_range[GR_RANGE_START],
lkb->lkb_range[GR_RANGE_END]);
if (lkb->lkb_status == DLM_LKSTS_CONVERT
|| lkb->lkb_status == DLM_LKSTS_WAITING)
seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
lkb->lkb_range[RQ_RANGE_START],
lkb->lkb_range[RQ_RANGE_END]);
}
if (lkb->lkb_nodeid) {
if (lkb->lkb_nodeid != res->res_nodeid)
seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
lkb->lkb_remid);
else
seq_printf(s, " Master: %08x", lkb->lkb_remid);
}
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
seq_printf(s, "\n");
}
static int print_resource(struct dlm_rsb *res, struct seq_file *s)
{
struct dlm_lkb *lkb;
int i, lvblen = res->res_ls->ls_lvblen;
seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
for (i = 0; i < res->res_length; i++) {
if (isprint(res->res_name[i]))
seq_printf(s, "%c", res->res_name[i]);
else
seq_printf(s, "%c", '.');
}
if (res->res_nodeid > 0)
seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
res->res_nodeid);
else if (res->res_nodeid == 0)
seq_printf(s, "\" \nMaster Copy\n");
else if (res->res_nodeid == -1)
seq_printf(s, "\" \nLooking up master (lkid %x)\n",
res->res_first_lkid);
else
seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
/* Print the LVB: */
if (res->res_lvbptr) {
seq_printf(s, "LVB: ");
for (i = 0; i < lvblen; i++) {
if (i == lvblen / 2)
seq_printf(s, "\n ");
seq_printf(s, "%02x ",
(unsigned char) res->res_lvbptr[i]);
}
if (rsb_flag(res, RSB_VALNOTVALID))
seq_printf(s, " (INVALID)");
seq_printf(s, "\n");
}
/* Print the locks attached to this resource */
seq_printf(s, "Granted Queue\n");
list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
print_lock(s, lkb, res);
seq_printf(s, "Conversion Queue\n");
list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
print_lock(s, lkb, res);
seq_printf(s, "Waiting Queue\n");
list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
print_lock(s, lkb, res);
return 0;
}
static int rsb_iter_next(struct rsb_iter *ri)
{
struct dlm_ls *ls = ri->ls;
int i;
if (!ri->next) {
top:
/* Find the next non-empty hash bucket */
for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
read_lock(&ls->ls_rsbtbl[i].lock);
if (!list_empty(&ls->ls_rsbtbl[i].list)) {
ri->next = ls->ls_rsbtbl[i].list.next;
read_unlock(&ls->ls_rsbtbl[i].lock);
break;
}
read_unlock(&ls->ls_rsbtbl[i].lock);
}
ri->entry = i;
if (ri->entry >= ls->ls_rsbtbl_size)
return 1;
} else {
i = ri->entry;
read_lock(&ls->ls_rsbtbl[i].lock);
ri->next = ri->next->next;
if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
/* End of list - move to next bucket */
ri->next = NULL;
ri->entry++;
read_unlock(&ls->ls_rsbtbl[i].lock);
goto top;
}
read_unlock(&ls->ls_rsbtbl[i].lock);
}
ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
return 0;
}
static void rsb_iter_free(struct rsb_iter *ri)
{
kfree(ri);
}
static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
{
struct rsb_iter *ri;
ri = kmalloc(sizeof *ri, GFP_KERNEL);
if (!ri)
return NULL;
ri->ls = ls;
ri->entry = 0;
ri->next = NULL;
if (rsb_iter_next(ri)) {
rsb_iter_free(ri);
return NULL;
}
return ri;
}
static void *seq_start(struct seq_file *file, loff_t *pos)
{
struct rsb_iter *ri;
loff_t n = *pos;
ri = rsb_iter_init(file->private);
if (!ri)
return NULL;
while (n--) {
if (rsb_iter_next(ri)) {
rsb_iter_free(ri);
return NULL;
}
}
return ri;
}
static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
{
struct rsb_iter *ri = iter_ptr;
(*pos)++;
if (rsb_iter_next(ri)) {
rsb_iter_free(ri);
return NULL;
}
return ri;
}
static void seq_stop(struct seq_file *file, void *iter_ptr)
{
/* nothing for now */
}
static int seq_show(struct seq_file *file, void *iter_ptr)
{
struct rsb_iter *ri = iter_ptr;
print_resource(ri->rsb, file);
return 0;
}
static struct seq_operations dlm_seq_ops = {
.start = seq_start,
.next = seq_next,
.stop = seq_stop,
.show = seq_show,
};
static int do_open(struct inode *inode, struct file *file)
{
struct seq_file *seq;
int ret;
ret = seq_open(file, &dlm_seq_ops);
if (ret)
return ret;
seq = file->private_data;
seq->private = inode->u.generic_ip;
return 0;
}
static struct file_operations dlm_fops = {
.owner = THIS_MODULE,
.open = do_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
};
int dlm_create_debug_file(struct dlm_ls *ls)
{
ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
S_IFREG | S_IRUGO,
dlm_root,
ls,
&dlm_fops);
return ls->ls_debug_dentry ? 0 : -ENOMEM;
}
void dlm_delete_debug_file(struct dlm_ls *ls)
{
if (ls->ls_debug_dentry)
debugfs_remove(ls->ls_debug_dentry);
}
int dlm_register_debugfs(void)
{
dlm_root = debugfs_create_dir("dlm", NULL);
return dlm_root ? 0 : -ENOMEM;
}
void dlm_unregister_debugfs(void)
{
debugfs_remove(dlm_root);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/*
* device.c
*
* This is the userland interface to the DLM.
*
* The locking is done via a misc char device (find the
* registered minor number in /proc/misc).
*
* User code should not use this interface directly but
* call the library routines in libdlm.a instead.
*
*/
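/*
 * As a hedged sketch of the protocol (libdlm wraps all of this), a
 * lockspace is created by writing a struct dlm_write_request, with
 * the name appended, to the control device; the device path and the
 * lockspace name here are illustrative:
 *
 *	char buf[sizeof(struct dlm_write_request) + 5];
 *	struct dlm_write_request *req = (struct dlm_write_request *)buf;
 *	int fd, minor;
 *
 *	memset(buf, 0, sizeof(buf));
 *	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
 *	req->version[1] = DLM_DEVICE_VERSION_MINOR;
 *	req->version[2] = DLM_DEVICE_VERSION_PATCH;
 *	req->cmd = DLM_USER_CREATE_LOCKSPACE;
 *	strcpy(req->i.lspace.name, "myls");
 *	fd = open("/dev/misc/dlm-control", O_WRONLY);
 *	minor = write(fd, buf, sizeof(buf));	(returns the new minor)
 */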
#include <linux/miscdevice.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>
#include "lvb_table.h"
static struct file_operations _dlm_fops;
static const char *name_prefix = "dlm";
static struct list_head user_ls_list;
static struct semaphore user_ls_lock;
/* Lock infos are stored in here indexed by lock ID */
static DEFINE_IDR(lockinfo_idr);
static rwlock_t lockinfo_lock;
/* Flags in li_flags */
#define LI_FLAG_COMPLETE 1
#define LI_FLAG_FIRSTLOCK 2
#define LI_FLAG_PERSISTENT 3
/* flags in ls_flags*/
#define LS_FLAG_DELETED 1
#define LS_FLAG_AUTOFREE 2
#define LOCKINFO_MAGIC 0x53595324
struct lock_info {
uint32_t li_magic;
uint8_t li_cmd;
int8_t li_grmode;
int8_t li_rqmode;
struct dlm_lksb li_lksb;
wait_queue_head_t li_waitq;
unsigned long li_flags;
void __user *li_castparam;
void __user *li_castaddr;
void __user *li_bastparam;
void __user *li_bastaddr;
void __user *li_pend_bastparam;
void __user *li_pend_bastaddr;
struct list_head li_ownerqueue;
struct file_info *li_file;
struct dlm_lksb __user *li_user_lksb;
struct semaphore li_firstlock;
};
/* A queued AST no less */
struct ast_info {
struct dlm_lock_result result;
struct list_head list;
uint32_t lvb_updated;
uint32_t progress; /* How much has been read */
};
/* One of these per userland lockspace */
struct user_ls {
void *ls_lockspace;
atomic_t ls_refcnt;
long ls_flags;
/* Passed into misc_register() */
struct miscdevice ls_miscinfo;
struct list_head ls_list;
};
/* misc_device info for the control device */
static struct miscdevice ctl_device;
/*
* Stuff we hang off the file struct.
* The first two are to cope with unlocking all the
locks held by a process when it dies.
*/
struct file_info {
struct list_head fi_li_list; /* List of active lock_infos */
spinlock_t fi_li_lock;
struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
spinlock_t fi_ast_lock;
wait_queue_head_t fi_wait;
struct user_ls *fi_ls;
atomic_t fi_refcnt; /* Number of users */
unsigned long fi_flags; /* Bit 1 means the device is open */
};
/* get and put ops for file_info.
Actually I don't really like "get" and "put", but everyone
else seems to use them and I can't think of anything
nicer at the moment */
static void get_file_info(struct file_info *f)
{
atomic_inc(&f->fi_refcnt);
}
static void put_file_info(struct file_info *f)
{
if (atomic_dec_and_test(&f->fi_refcnt))
kfree(f);
}
static void release_lockinfo(struct lock_info *li)
{
put_file_info(li->li_file);
write_lock(&lockinfo_lock);
idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
write_unlock(&lockinfo_lock);
if (li->li_lksb.sb_lvbptr)
kfree(li->li_lksb.sb_lvbptr);
kfree(li);
module_put(THIS_MODULE);
}
static struct lock_info *get_lockinfo(uint32_t lockid)
{
struct lock_info *li;
read_lock(&lockinfo_lock);
li = idr_find(&lockinfo_idr, lockid);
read_unlock(&lockinfo_lock);
return li;
}
static int add_lockinfo(struct lock_info *li)
{
int n;
int r;
int ret = -EINVAL;
write_lock(&lockinfo_lock);
if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
goto out_up;
ret = -ENOMEM;
r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
if (!r)
goto out_up;
r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
if (r)
goto out_up;
if (n != li->li_lksb.sb_lkid) {
idr_remove(&lockinfo_idr, n);
goto out_up;
}
ret = 0;
out_up:
write_unlock(&lockinfo_lock);
return ret;
}
static struct user_ls *__find_lockspace(int minor)
{
struct user_ls *lsinfo;
list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
if (lsinfo->ls_miscinfo.minor == minor)
return lsinfo;
}
return NULL;
}
/* Find a lockspace struct given the device minor number */
static struct user_ls *find_lockspace(int minor)
{
struct user_ls *lsinfo;
down(&user_ls_lock);
lsinfo = __find_lockspace(minor);
up(&user_ls_lock);
return lsinfo;
}
static void add_lockspace_to_list(struct user_ls *lsinfo)
{
down(&user_ls_lock);
list_add(&lsinfo->ls_list, &user_ls_list);
up(&user_ls_lock);
}
/* Register a lockspace with the DLM and create a misc
device for userland to access it */
static int register_lockspace(char *name, struct user_ls **ls, int flags)
{
struct user_ls *newls;
int status;
int namelen;
namelen = strlen(name)+strlen(name_prefix)+2;
newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
if (!newls)
return -ENOMEM;
memset(newls, 0, sizeof(struct user_ls));
newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
if (!newls->ls_miscinfo.name) {
kfree(newls);
return -ENOMEM;
}
status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
DLM_USER_LVB_LEN);
if (status != 0) {
kfree(newls->ls_miscinfo.name);
kfree(newls);
return status;
}
snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
name_prefix, name);
newls->ls_miscinfo.fops = &_dlm_fops;
newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
status = misc_register(&newls->ls_miscinfo);
if (status) {
printk(KERN_ERR "dlm: misc register failed for %s\n", name);
dlm_release_lockspace(newls->ls_lockspace, 0);
kfree(newls->ls_miscinfo.name);
kfree(newls);
return status;
}
if (flags & DLM_USER_LSFLG_AUTOFREE)
set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
add_lockspace_to_list(newls);
*ls = newls;
return 0;
}
/* Called with the user_ls_lock semaphore held */
static int unregister_lockspace(struct user_ls *lsinfo, int force)
{
int status;
status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
if (status)
return status;
status = misc_deregister(&lsinfo->ls_miscinfo);
if (status)
return status;
list_del(&lsinfo->ls_list);
set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
lsinfo->ls_lockspace = NULL;
if (atomic_read(&lsinfo->ls_refcnt) == 0) {
kfree(lsinfo->ls_miscinfo.name);
kfree(lsinfo);
}
return 0;
}
/* Add it to userland's AST queue */
static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
int lvb_updated)
{
struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
if (!ast)
return;
memset(ast, 0, sizeof(*ast));
ast->result.user_astparam = astparam;
ast->result.user_astaddr = astaddr;
ast->result.user_lksb = li->li_user_lksb;
memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
ast->lvb_updated = lvb_updated;
spin_lock(&li->li_file->fi_ast_lock);
list_add_tail(&ast->list, &li->li_file->fi_ast_list);
spin_unlock(&li->li_file->fi_ast_lock);
wake_up_interruptible(&li->li_file->fi_wait);
}
static void bast_routine(void *param, int mode)
{
struct lock_info *li = param;
if (li && li->li_bastaddr)
add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
}
/*
* This is the kernel's AST routine.
* All lock, unlock & query operations complete here.
* The only synchronous ops are those done during device close.
*/
static void ast_routine(void *param)
{
struct lock_info *li = param;
/* Param may be NULL if a persistent lock is unlocked by someone else */
if (!li)
return;
/* If this is a successful conversion then activate the blocking AST
* args from the conversion request */
if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
li->li_lksb.sb_status == 0) {
li->li_bastparam = li->li_pend_bastparam;
li->li_bastaddr = li->li_pend_bastaddr;
li->li_pend_bastaddr = NULL;
}
/* If it's an async request then post data to the user's AST queue. */
if (li->li_castaddr) {
int lvb_updated = 0;
/* See if the lvb has been updated */
if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
lvb_updated = 1;
if (li->li_lksb.sb_status == 0)
li->li_grmode = li->li_rqmode;
/* Only queue AST if the device is still open */
if (test_bit(1, &li->li_file->fi_flags))
add_to_astqueue(li, li->li_castaddr, li->li_castparam,
lvb_updated);
/* If it's a new lock operation that failed, then
* remove it from the owner queue and free the
* lock_info.
*/
if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
li->li_lksb.sb_status != 0) {
/* Wait till dlm_lock() has finished */
down(&li->li_firstlock);
up(&li->li_firstlock);
spin_lock(&li->li_file->fi_li_lock);
list_del(&li->li_ownerqueue);
spin_unlock(&li->li_file->fi_li_lock);
release_lockinfo(li);
return;
}
/* Free unlocks & queries */
if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
li->li_cmd == DLM_USER_QUERY) {
release_lockinfo(li);
}
} else {
/* Synchronous request, just wake up the caller */
set_bit(LI_FLAG_COMPLETE, &li->li_flags);
wake_up_interruptible(&li->li_waitq);
}
}
/*
* Wait for the lock op to complete and return the status.
*/
static int wait_for_ast(struct lock_info *li)
{
/* Wait for the AST routine to complete */
set_task_state(current, TASK_INTERRUPTIBLE);
while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
schedule();
set_task_state(current, TASK_RUNNING);
return li->li_lksb.sb_status;
}
/* Open on control device */
static int dlm_ctl_open(struct inode *inode, struct file *file)
{
file->private_data = NULL;
return 0;
}
/* Close on control device */
static int dlm_ctl_close(struct inode *inode, struct file *file)
{
return 0;
}
/* Open on lockspace device */
static int dlm_open(struct inode *inode, struct file *file)
{
struct file_info *f;
struct user_ls *lsinfo;
lsinfo = find_lockspace(iminor(inode));
if (!lsinfo)
return -ENOENT;
f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
if (!f)
return -ENOMEM;
atomic_inc(&lsinfo->ls_refcnt);
INIT_LIST_HEAD(&f->fi_li_list);
INIT_LIST_HEAD(&f->fi_ast_list);
spin_lock_init(&f->fi_li_lock);
spin_lock_init(&f->fi_ast_lock);
init_waitqueue_head(&f->fi_wait);
f->fi_ls = lsinfo;
f->fi_flags = 0;
get_file_info(f);
set_bit(1, &f->fi_flags);
file->private_data = f;
return 0;
}
/* Check the user's version matches ours */
static int check_version(struct dlm_write_request *req)
{
if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
(req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
"user (%d.%d.%d) kernel (%d.%d.%d)\n",
current->comm,
current->pid,
req->version[0],
req->version[1],
req->version[2],
DLM_DEVICE_VERSION_MAJOR,
DLM_DEVICE_VERSION_MINOR,
DLM_DEVICE_VERSION_PATCH);
return -EINVAL;
}
return 0;
}
/* Close on lockspace device */
static int dlm_close(struct inode *inode, struct file *file)
{
struct file_info *f = file->private_data;
struct lock_info li;
struct lock_info *old_li, *safe;
sigset_t tmpsig;
sigset_t allsigs;
struct user_ls *lsinfo;
DECLARE_WAITQUEUE(wq, current);
lsinfo = find_lockspace(iminor(inode));
if (!lsinfo)
return -ENOENT;
/* Mark this closed so that ASTs will not be delivered any more */
clear_bit(1, &f->fi_flags);
/* Block signals while we are doing this */
sigfillset(&allsigs);
sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
/* We use our own lock_info struct here, so that any
* outstanding "real" ASTs will be delivered with the
* corresponding "real" params, thus freeing the lock_info
* that belongs to the lock. This catches the corner case where
* a lock is BUSY when we try to unlock it here
*/
memset(&li, 0, sizeof(li));
clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
init_waitqueue_head(&li.li_waitq);
add_wait_queue(&li.li_waitq, &wq);
/*
* Free any outstanding locks; they are on the
* list in LIFO order, so there should be no problems
* with unlocking parents before children.
*/
list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
int status;
int flags = 0;
/* Don't unlock persistent locks, just mark them orphaned */
if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
list_del(&old_li->li_ownerqueue);
/* Update master copy */
/* TODO: Check locking core updates the local and
remote ORPHAN flags */
li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
status = dlm_lock(f->fi_ls->ls_lockspace,
old_li->li_grmode, &li.li_lksb,
DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
NULL, 0, 0, ast_routine, NULL,
NULL, NULL);
if (status != 0)
printk("dlm: Error orphaning lock %x: %d\n",
old_li->li_lksb.sb_lkid, status);
/* But tidy our references in it */
release_lockinfo(old_li);
continue;
}
clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
flags = DLM_LKF_FORCEUNLOCK;
if (old_li->li_grmode >= DLM_LOCK_PW)
flags |= DLM_LKF_IVVALBLK;
status = dlm_unlock(f->fi_ls->ls_lockspace,
old_li->li_lksb.sb_lkid, flags,
&li.li_lksb, &li);
/* Must wait for it to complete as the next lock could be its
* parent */
if (status == 0)
wait_for_ast(&li);
/* Unlock succeeded, free the lock_info struct. */
if (status == 0)
release_lockinfo(old_li);
}
remove_wait_queue(&li.li_waitq, &wq);
/*
* If this is the last reference to the lockspace
* then free the struct. If it's an AUTOFREE lockspace
* then free the whole thing.
*/
down(&user_ls_lock);
if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
if (lsinfo->ls_lockspace) {
if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
unregister_lockspace(lsinfo, 1);
}
} else {
kfree(lsinfo->ls_miscinfo.name);
kfree(lsinfo);
}
}
up(&user_ls_lock);
put_file_info(f);
/* Restore signals */
sigprocmask(SIG_SETMASK, &tmpsig, NULL);
recalc_sigpending();
return 0;
}
static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
struct dlm_lspace_params *kparams)
{
int status;
struct user_ls *lsinfo;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
/* If it succeeded then return the minor number */
if (status == 0)
status = lsinfo->ls_miscinfo.minor;
return status;
}
static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
struct dlm_lspace_params *kparams)
{
int status;
int force = 1;
struct user_ls *lsinfo;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
down(&user_ls_lock);
lsinfo = __find_lockspace(kparams->minor);
if (!lsinfo) {
up(&user_ls_lock);
return -EINVAL;
}
if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
force = 2;
status = unregister_lockspace(lsinfo, force);
up(&user_ls_lock);
return status;
}
/* Read call, might block if no ASTs are waiting.
* It will only ever return one message at a time, regardless
* of how many are pending.
*/
static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
loff_t *ppos)
{
struct file_info *fi = file->private_data;
struct ast_info *ast;
int data_size;
int offset;
DECLARE_WAITQUEUE(wait, current);
if (count < sizeof(struct dlm_lock_result))
return -EINVAL;
spin_lock(&fi->fi_ast_lock);
if (list_empty(&fi->fi_ast_list)) {
/* No waiting ASTs.
* Return EOF if the lockspace has been deleted.
*/
if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) {
spin_unlock(&fi->fi_ast_lock);
return 0;
}
if (file->f_flags & O_NONBLOCK) {
spin_unlock(&fi->fi_ast_lock);
return -EAGAIN;
}
add_wait_queue(&fi->fi_wait, &wait);
repeat:
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&fi->fi_ast_list) &&
!signal_pending(current)) {
spin_unlock(&fi->fi_ast_lock);
schedule();
spin_lock(&fi->fi_ast_lock);
goto repeat;
}
current->state = TASK_RUNNING;
remove_wait_queue(&fi->fi_wait, &wait);
if (signal_pending(current)) {
spin_unlock(&fi->fi_ast_lock);
return -ERESTARTSYS;
}
}
ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
list_del(&ast->list);
spin_unlock(&fi->fi_ast_lock);
/* Work out the size of the returned data */
data_size = sizeof(struct dlm_lock_result);
if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
data_size += DLM_USER_LVB_LEN;
offset = sizeof(struct dlm_lock_result);
/* Room for the extended data ? */
if (count >= data_size) {
if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
if (copy_to_user(buffer+offset,
ast->result.lksb.sb_lvbptr,
DLM_USER_LVB_LEN))
return -EFAULT;
ast->result.lvb_offset = offset;
offset += DLM_USER_LVB_LEN;
}
}
ast->result.length = data_size;
/* Copy the header now it has all the offsets in it */
if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
offset = -EFAULT;
/* If we only returned a header and there's more to come then put it
back on the list */
if (count < data_size) {
spin_lock(&fi->fi_ast_lock);
list_add(&ast->list, &fi->fi_ast_list);
spin_unlock(&fi->fi_ast_lock);
} else
kfree(ast);
return offset;
}
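/*
 * A rough sketch of the matching userspace consumer (normally buried
 * in libdlm): poll the lockspace fd, then pull one result per read
 * and dispatch the saved AST pointers carried in it.
 *
 *	struct pollfd pfd = { .fd = ls_fd, .events = POLLIN };
 *	char buf[sizeof(struct dlm_lock_result) + DLM_USER_LVB_LEN];
 *	struct dlm_lock_result *res = (struct dlm_lock_result *)buf;
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		if (read(ls_fd, buf, sizeof(buf)) <= 0)
 *			break;
 *		(call res->user_astaddr with res->user_astparam, etc.)
 *	}
 */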
static unsigned int dlm_poll(struct file *file, poll_table *wait)
{
struct file_info *fi = file->private_data;
poll_wait(file, &fi->fi_wait, wait);
spin_lock(&fi->fi_ast_lock);
if (!list_empty(&fi->fi_ast_list)) {
spin_unlock(&fi->fi_ast_lock);
return POLLIN | POLLRDNORM;
}
spin_unlock(&fi->fi_ast_lock);
return 0;
}
static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
struct dlm_lock_params *kparams)
{
struct lock_info *li;
if (!try_module_get(THIS_MODULE))
return NULL;
li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
if (li) {
li->li_magic = LOCKINFO_MAGIC;
li->li_file = fi;
li->li_cmd = cmd;
li->li_flags = 0;
li->li_grmode = -1;
li->li_rqmode = -1;
li->li_pend_bastparam = NULL;
li->li_pend_bastaddr = NULL;
li->li_castaddr = NULL;
li->li_castparam = NULL;
li->li_lksb.sb_lvbptr = NULL;
li->li_bastaddr = kparams->bastaddr;
li->li_bastparam = kparams->bastparam;
get_file_info(fi);
}
return li;
}
static int do_user_lock(struct file_info *fi, uint8_t cmd,
struct dlm_lock_params *kparams)
{
struct lock_info *li;
int status;
/*
* Validate things that we need to have correct.
*/
if (!kparams->castaddr)
return -EINVAL;
if (!kparams->lksb)
return -EINVAL;
/* Persistent child locks are not available yet */
if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
return -EINVAL;
/* For conversions, there should already be a lockinfo struct,
unless we are adopting an orphaned persistent lock */
if (kparams->flags & DLM_LKF_CONVERT) {
li = get_lockinfo(kparams->lkid);
/* If this is a persistent lock we will have to create a
lockinfo again */
if (!li && (kparams->flags & DLM_LKF_PERSISTENT)) {
li = allocate_lockinfo(fi, cmd, kparams);
if (!li)
return -ENOMEM;
li->li_lksb.sb_lkid = kparams->lkid;
li->li_castaddr = kparams->castaddr;
li->li_castparam = kparams->castparam;
/* OK, this isn't exactly a FIRSTLOCK but it is the
first time we've used this lockinfo, and if things
fail we want rid of it */
init_MUTEX_LOCKED(&li->li_firstlock);
set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
add_lockinfo(li);
/* TODO: do a query to get the current state ?? */
}
if (!li)
return -EINVAL;
if (li->li_magic != LOCKINFO_MAGIC)
return -EINVAL;
/* For conversions don't overwrite the current blocking AST
info so that:
a) if a blocking AST fires before the conversion is queued
it runs the current handler
b) if the conversion is cancelled, the original blocking AST
declaration is active
The pend_ info is made active when the conversion
completes.
*/
li->li_pend_bastaddr = kparams->bastaddr;
li->li_pend_bastparam = kparams->bastparam;
} else {
li = allocate_lockinfo(fi, cmd, kparams);
if (!li)
return -ENOMEM;
/* semaphore to allow us to complete our work before
the AST routine runs. In fact we only need (and use) this
when the initial lock fails */
init_MUTEX_LOCKED(&li->li_firstlock);
set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
}
li->li_user_lksb = kparams->lksb;
li->li_castaddr = kparams->castaddr;
li->li_castparam = kparams->castparam;
li->li_lksb.sb_lkid = kparams->lkid;
li->li_rqmode = kparams->mode;
if (kparams->flags & DLM_LKF_PERSISTENT)
set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
/* Copy in the value block */
if (kparams->flags & DLM_LKF_VALBLK) {
if (!li->li_lksb.sb_lvbptr) {
li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
GFP_KERNEL);
if (!li->li_lksb.sb_lvbptr) {
status = -ENOMEM;
goto out_err;
}
}
memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
}
/* Lock it ... */
status = dlm_lock(fi->fi_ls->ls_lockspace,
kparams->mode, &li->li_lksb,
kparams->flags,
kparams->name, kparams->namelen,
kparams->parent,
ast_routine,
li,
(li->li_pend_bastaddr || li->li_bastaddr) ?
bast_routine : NULL,
kparams->range.ra_end ? &kparams->range : NULL);
if (status)
goto out_err;
/* If it succeeded (this far) with a new lock then keep track of
it on the file's lockinfo list */
if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
spin_lock(&fi->fi_li_lock);
list_add(&li->li_ownerqueue, &fi->fi_li_list);
spin_unlock(&fi->fi_li_lock);
if (add_lockinfo(li))
printk(KERN_WARNING "Add lockinfo failed\n");
up(&li->li_firstlock);
}
/* Return the lockid as the user needs it /now/ */
return li->li_lksb.sb_lkid;
out_err:
if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
release_lockinfo(li);
return status;
}
static int do_user_unlock(struct file_info *fi, uint8_t cmd,
struct dlm_lock_params *kparams)
{
struct lock_info *li;
int status;
int convert_cancel = 0;
li = get_lockinfo(kparams->lkid);
if (!li) {
li = allocate_lockinfo(fi, cmd, kparams);
if (!li)
return -ENOMEM;
spin_lock(&fi->fi_li_lock);
list_add(&li->li_ownerqueue, &fi->fi_li_list);
spin_unlock(&fi->fi_li_lock);
}
if (li->li_magic != LOCKINFO_MAGIC)
return -EINVAL;
li->li_user_lksb = kparams->lksb;
li->li_castparam = kparams->castparam;
li->li_cmd = cmd;
/* Cancelling a conversion doesn't remove the lock...*/
if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
convert_cancel = 1;
/* dlm_unlock() passes a 0 for castaddr which means don't overwrite
the existing li_castaddr as that's the completion routine for
unlocks. dlm_unlock_wait() specifies a new AST routine to be
executed when the unlock completes. */
if (kparams->castaddr)
li->li_castaddr = kparams->castaddr;
/* Use existing lksb & astparams */
status = dlm_unlock(fi->fi_ls->ls_lockspace,
kparams->lkid,
kparams->flags, &li->li_lksb, li);
if (!status && !convert_cancel) {
spin_lock(&fi->fi_li_lock);
list_del(&li->li_ownerqueue);
spin_unlock(&fi->fi_li_lock);
}
return status;
}
/* Write call, submit a locking request */
static ssize_t dlm_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
struct file_info *fi = file->private_data;
struct dlm_write_request *kparams;
sigset_t tmpsig;
sigset_t allsigs;
int status;
/* -1 because lock name is optional */
if (count < sizeof(struct dlm_write_request)-1)
return -EINVAL;
/* Has the lockspace been deleted? */
if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
return -ENOENT;
kparams = kmalloc(count, GFP_KERNEL);
if (!kparams)
return -ENOMEM;
status = -EFAULT;
/* Get the command info */
if (copy_from_user(kparams, buffer, count))
goto out_free;
status = -EBADE;
if (check_version(kparams))
goto out_free;
/* Block signals while we are doing this */
sigfillset(&allsigs);
sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
status = -EINVAL;
switch (kparams->cmd)
{
case DLM_USER_LOCK:
if (!fi) goto out_sig;
status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
break;
case DLM_USER_UNLOCK:
if (!fi) goto out_sig;
status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
break;
case DLM_USER_CREATE_LOCKSPACE:
if (fi) goto out_sig;
status = do_user_create_lockspace(fi, kparams->cmd,
&kparams->i.lspace);
break;
case DLM_USER_REMOVE_LOCKSPACE:
if (fi) goto out_sig;
status = do_user_remove_lockspace(fi, kparams->cmd,
&kparams->i.lspace);
break;
default:
printk("Unknown command passed to DLM device : %d\n",
kparams->cmd);
break;
}
out_sig:
/* Restore signals */
sigprocmask(SIG_SETMASK, &tmpsig, NULL);
recalc_sigpending();
out_free:
kfree(kparams);
if (status == 0)
return count;
else
return status;
}
static struct file_operations _dlm_fops = {
.open = dlm_open,
.release = dlm_close,
.read = dlm_read,
.write = dlm_write,
.poll = dlm_poll,
.owner = THIS_MODULE,
};
static struct file_operations _dlm_ctl_fops = {
.open = dlm_ctl_open,
.release = dlm_ctl_close,
.write = dlm_write,
.owner = THIS_MODULE,
};
/*
* Create control device
*/
static int __init dlm_device_init(void)
{
int r;
INIT_LIST_HEAD(&user_ls_list);
init_MUTEX(&user_ls_lock);
rwlock_init(&lockinfo_lock);
ctl_device.name = "dlm-control";
ctl_device.fops = &_dlm_ctl_fops;
ctl_device.minor = MISC_DYNAMIC_MINOR;
r = misc_register(&ctl_device);
if (r) {
printk(KERN_ERR "dlm: misc_register failed for control dev\n");
return r;
}
return 0;
}
static void __exit dlm_device_exit(void)
{
misc_deregister(&ctl_device);
}
MODULE_DESCRIPTION("Distributed Lock Manager device interface");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
module_init(dlm_device_init);
module_exit(dlm_device_exit);
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "lowcomms.h"
#include "rcom.h"
#include "config.h"
#include "memory.h"
#include "recover.h"
#include "util.h"
#include "lock.h"
#include "dir.h"
static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
{
spin_lock(&ls->ls_recover_list_lock);
list_add(&de->list, &ls->ls_recover_list);
spin_unlock(&ls->ls_recover_list_lock);
}
static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
{
int found = FALSE;
struct dlm_direntry *de;
spin_lock(&ls->ls_recover_list_lock);
list_for_each_entry(de, &ls->ls_recover_list, list) {
if (de->length == len) {
list_del(&de->list);
de->master_nodeid = 0;
memset(de->name, 0, len);
found = TRUE;
break;
}
}
spin_unlock(&ls->ls_recover_list_lock);
if (!found)
de = allocate_direntry(ls, len);
return de;
}
void dlm_clear_free_entries(struct dlm_ls *ls)
{
struct dlm_direntry *de;
spin_lock(&ls->ls_recover_list_lock);
while (!list_empty(&ls->ls_recover_list)) {
de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
list);
list_del(&de->list);
free_direntry(de);
}
spin_unlock(&ls->ls_recover_list_lock);
}
/*
* We use the upper 16 bits of the hash value to select the directory node.
* Low bits are used for distribution of rsb's among hash buckets on each node.
*
* To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
* num_nodes to the hash value. This value in the desired range is used as an
 * offset into the sorted list of nodeid's to give the particular nodeid.
 * (A worked example follows the function below.)
 */
int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
{
struct list_head *tmp;
struct dlm_member *memb = NULL;
uint32_t node, n = 0;
int nodeid;
if (ls->ls_num_nodes == 1) {
nodeid = dlm_our_nodeid();
goto out;
}
if (ls->ls_node_array) {
node = (hash >> 16) % ls->ls_total_weight;
nodeid = ls->ls_node_array[node];
goto out;
}
/* make_member_array() failed to kmalloc ls_node_array... */
node = (hash >> 16) % ls->ls_num_nodes;
list_for_each(tmp, &ls->ls_nodes) {
if (n++ != node)
continue;
memb = list_entry(tmp, struct dlm_member, list);
break;
}
DLM_ASSERT(memb, printk("num_nodes=%u n=%u node=%u\n",
ls->ls_num_nodes, n, node););
nodeid = memb->nodeid;
out:
return nodeid;
}
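/* Illustrative sketch (not part of the DLM): the directory-node selection
   above for a hypothetical three-node lockspace with equal weights, where
   ls_node_array would hold the sorted nodeids {2, 5, 9}. */
#if 0
static int example_hash2nodeid(uint32_t hash)
{
static const int node_array[] = { 2, 5, 9 };
/* upper 16 bits of the hash, reduced to the range 0..2 */
return node_array[(hash >> 16) % 3]; /* 0x00020000 -> nodeid 9 */
}
#endif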
int dlm_dir_nodeid(struct dlm_rsb *r)
{
return dlm_hash2nodeid(r->res_ls, r->res_hash);
}
static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
{
uint32_t val;
val = jhash(name, len, 0);
val &= (ls->ls_dirtbl_size - 1);
return val;
}
static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
{
uint32_t bucket;
bucket = dir_hash(ls, de->name, de->length);
list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
}
static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
int namelen, uint32_t bucket)
{
struct dlm_direntry *de;
list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
if (de->length == namelen && !memcmp(name, de->name, namelen))
goto out;
}
de = NULL;
out:
return de;
}
void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
{
struct dlm_direntry *de;
uint32_t bucket;
bucket = dir_hash(ls, name, namelen);
write_lock(&ls->ls_dirtbl[bucket].lock);
de = search_bucket(ls, name, namelen, bucket);
if (!de) {
log_error(ls, "remove fr %u none", nodeid);
goto out;
}
if (de->master_nodeid != nodeid) {
log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
goto out;
}
list_del(&de->list);
free_direntry(de);
out:
write_unlock(&ls->ls_dirtbl[bucket].lock);
}
void dlm_dir_clear(struct dlm_ls *ls)
{
struct list_head *head;
struct dlm_direntry *de;
int i;
DLM_ASSERT(list_empty(&ls->ls_recover_list), );
for (i = 0; i < ls->ls_dirtbl_size; i++) {
write_lock(&ls->ls_dirtbl[i].lock);
head = &ls->ls_dirtbl[i].list;
while (!list_empty(head)) {
de = list_entry(head->next, struct dlm_direntry, list);
list_del(&de->list);
put_free_de(ls, de);
}
write_unlock(&ls->ls_dirtbl[i].lock);
}
}
int dlm_recover_directory(struct dlm_ls *ls)
{
struct dlm_member *memb;
struct dlm_direntry *de;
char *b, *last_name = NULL;
int error = -ENOMEM, last_len, count = 0;
uint16_t namelen;
log_debug(ls, "dlm_recover_directory");
if (dlm_no_directory(ls))
goto out_status;
dlm_dir_clear(ls);
last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
if (!last_name)
goto out;
list_for_each_entry(memb, &ls->ls_nodes, list) {
memset(last_name, 0, DLM_RESNAME_MAXLEN);
last_len = 0;
for (;;) {
error = dlm_recovery_stopped(ls);
if (error)
goto out_free;
error = dlm_rcom_names(ls, memb->nodeid,
last_name, last_len);
if (error)
goto out_free;
schedule();
/*
* pick namelen/name pairs out of received buffer
*/
b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
for (;;) {
memcpy(&namelen, b, sizeof(uint16_t));
namelen = be16_to_cpu(namelen);
b += sizeof(uint16_t);
/* namelen of 0xFFFF marks end of names for
   this node; namelen of 0 marks end of the
   buffer (format sketched after this function) */
if (namelen == 0xFFFF)
goto done;
if (!namelen)
break;
error = -ENOMEM;
de = get_free_de(ls, namelen);
if (!de)
goto out_free;
de->master_nodeid = memb->nodeid;
de->length = namelen;
last_len = namelen;
memcpy(de->name, b, namelen);
memcpy(last_name, b, namelen);
b += namelen;
add_entry_to_hash(ls, de);
count++;
}
}
done:
;
}
out_status:
error = 0;
dlm_set_recover_status(ls, DLM_RS_DIR);
log_debug(ls, "dlm_recover_directory %d entries", count);
out_free:
kfree(last_name);
out:
dlm_clear_free_entries(ls);
return error;
}
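/* Illustrative sketch (not part of the DLM): the byte layout of one names
   buffer as parsed above. Each entry is a big-endian uint16_t length
   followed by that many name bytes; a length of 0 ends the buffer (another
   dlm_rcom_names() round follows) and 0xFFFF ends the names for the node. */
#if 0
static const unsigned char example_names_buf[] = {
0x00, 0x05, 'a', 'l', 'p', 'h', 'a', /* entry "alpha" */
0x00, 0x02, 'b', 'r', /* entry "br" */
0x00, 0x00 /* end of buffer */
};
/* a final buffer would end with 0xFF, 0xFF instead */
#endif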
static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
int namelen, int *r_nodeid)
{
struct dlm_direntry *de, *tmp;
uint32_t bucket;
bucket = dir_hash(ls, name, namelen);
write_lock(&ls->ls_dirtbl[bucket].lock);
de = search_bucket(ls, name, namelen, bucket);
if (de) {
*r_nodeid = de->master_nodeid;
write_unlock(&ls->ls_dirtbl[bucket].lock);
if (*r_nodeid == nodeid)
return -EEXIST;
return 0;
}
write_unlock(&ls->ls_dirtbl[bucket].lock);
de = allocate_direntry(ls, namelen);
if (!de)
return -ENOMEM;
de->master_nodeid = nodeid;
de->length = namelen;
memcpy(de->name, name, namelen);
write_lock(&ls->ls_dirtbl[bucket].lock);
tmp = search_bucket(ls, name, namelen, bucket);
if (tmp) {
free_direntry(de);
de = tmp;
} else {
list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
}
*r_nodeid = de->master_nodeid;
write_unlock(&ls->ls_dirtbl[bucket].lock);
return 0;
}
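/* Note on get_entry() above: the bucket lock is dropped around the blocking
   allocate_direntry() call, so the bucket is searched a second time after
   reacquiring the lock in case another thread added the same entry in the
   meantime; the duplicate allocation is then freed. */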
int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
int *r_nodeid)
{
return get_entry(ls, nodeid, name, namelen, r_nodeid);
}
/* Copy the names of master rsb's into the buffer provided.
Only select names whose dir node is the given nodeid. */
void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
char *outbuf, int outlen, int nodeid)
{
struct list_head *list;
struct dlm_rsb *start_r = NULL, *r = NULL;
int offset = 0, start_namelen, error, dir_nodeid;
char *start_name;
uint16_t be_namelen;
/*
* Find the rsb where we left off (or start again)
*/
start_namelen = inlen;
start_name = inbuf;
if (start_namelen > 1) {
/*
* We could also use a find_rsb_root() function here that
* searched the ls_root_list.
*/
error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
&start_r);
DLM_ASSERT(!error && start_r,
printk("error %d\n", error););
DLM_ASSERT(!list_empty(&start_r->res_root_list),
dlm_print_rsb(start_r););
dlm_put_rsb(start_r);
}
/*
* Send rsb names for rsb's we're master of and whose directory node
* matches the requesting node.
*/
down_read(&ls->ls_root_sem);
if (start_r)
list = start_r->res_root_list.next;
else
list = ls->ls_root_list.next;
for (offset = 0; list != &ls->ls_root_list; list = list->next) {
r = list_entry(list, struct dlm_rsb, res_root_list);
if (r->res_nodeid)
continue;
dir_nodeid = dlm_dir_nodeid(r);
if (dir_nodeid != nodeid)
continue;
/*
* The block ends when we can't fit the following in the
* remaining buffer space:
* namelen (uint16_t) +
* name (r->res_length) +
* end-of-block record 0x0000 (uint16_t)
*/
if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
/* Write end-of-block record */
be_namelen = 0;
memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
offset += sizeof(uint16_t);
goto out;
}
be_namelen = cpu_to_be16(r->res_length);
memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
offset += sizeof(uint16_t);
memcpy(outbuf + offset, r->res_name, r->res_length);
offset += r->res_length;
}
/*
* If we've reached the end of the list (and there's room) write a
* terminating record.
*/
if ((list == &ls->ls_root_list) &&
(offset + sizeof(uint16_t) <= outlen)) {
be_namelen = 0xFFFF;
memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
offset += sizeof(uint16_t);
}
out:
up_read(&ls->ls_root_sem);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __DIR_DOT_H__
#define __DIR_DOT_H__
int dlm_dir_nodeid(struct dlm_rsb *rsb);
int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
void dlm_dir_clear(struct dlm_ls *ls);
void dlm_clear_free_entries(struct dlm_ls *ls);
int dlm_recover_directory(struct dlm_ls *ls);
int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
int *r_nodeid);
void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
char *outbuf, int outlen, int nodeid);
#endif /* __DIR_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __DLM_INTERNAL_DOT_H__
#define __DLM_INTERNAL_DOT_H__
/*
* This is the main header file to be included in each DLM source file.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/socket.h>
#include <linux/kthread.h>
#include <linux/kobject.h>
#include <linux/kref.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <asm/semaphore.h>
#include <asm/uaccess.h>
#include <linux/dlm.h>
#define DLM_LOCKSPACE_LEN 64
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#if (BITS_PER_LONG == 64)
#define PRIx64 "lx"
#else
#define PRIx64 "Lx"
#endif
/* Size of the temp buffer midcomms allocates on the stack.
We try to make this large enough so most messages fit.
FIXME: should sctp make this unnecessary? */
#define DLM_INBUF_LEN 148
struct dlm_ls;
struct dlm_lkb;
struct dlm_rsb;
struct dlm_member;
struct dlm_lkbtable;
struct dlm_rsbtable;
struct dlm_dirtable;
struct dlm_direntry;
struct dlm_recover;
struct dlm_header;
struct dlm_message;
struct dlm_rcom;
struct dlm_mhandle;
#define log_print(fmt, args...) \
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
#ifdef DLM_LOG_DEBUG
#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
#else
#define log_debug(ls, fmt, args...)
#endif
#define DLM_ASSERT(x, do) \
{ \
if (!(x)) \
{ \
printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
"DLM: assertion: \"%s\"\n" \
"DLM: time = %lu\n", \
__LINE__, __FILE__, #x, jiffies); \
{do} \
printk("\n"); \
BUG(); \
panic("DLM: Record message above and reboot.\n"); \
} \
}
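/* Example uses of DLM_ASSERT (both forms appear throughout this commit):
   the second argument is a statement block run before BUG()/panic() to
   dump context, and it may be left empty. */
#if 0
DLM_ASSERT(list_empty(&ls->ls_recover_list), );
DLM_ASSERT(!error && r, printk("error %d\n", error););
#endif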
struct dlm_direntry {
struct list_head list;
uint32_t master_nodeid;
uint16_t length;
char name[1];
};
struct dlm_dirtable {
struct list_head list;
rwlock_t lock;
};
struct dlm_rsbtable {
struct list_head list;
struct list_head toss;
rwlock_t lock;
};
struct dlm_lkbtable {
struct list_head list;
rwlock_t lock;
uint16_t counter;
};
/*
* Lockspace member (per node in a ls)
*/
struct dlm_member {
struct list_head list;
int nodeid;
int weight;
};
/*
* Save and manage recovery state for a lockspace.
*/
struct dlm_recover {
struct list_head list;
int *nodeids;
int node_count;
uint64_t seq;
};
/*
* Pass input args to second stage locking function.
*/
struct dlm_args {
uint32_t flags;
void *astaddr;
long astparam;
void *bastaddr;
int mode;
struct dlm_lksb *lksb;
struct dlm_range *range;
};
/*
* Lock block
*
* A lock can be one of three types:
*
* local copy lock is mastered locally
* (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
* process copy lock is mastered on a remote node
* (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
* master copy master node's copy of a lock owned by remote node
* (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
*
* lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
* dlm_unlock. The dlm does not modify these or use any private flags in
* this field; it only contains DLM_LKF_ flags from dlm.h. These flags
* are sent as-is to the remote master when the lock is remote.
*
* lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
* Some internal flags are shared between the master and process nodes;
* these shared flags are kept in the lower two bytes. One of these
* flags set on the master copy will be propagated to the process copy
 * and vice versa. Other internal flags are private to the master or process
* node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
*
* lkb_sbflags: status block flags. These flags are copied directly into
* the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
* ast. All defined in dlm.h with DLM_SBF_ prefix.
*
* lkb_status: the lock status indicates which rsb queue the lock is
 * on: grant, convert, or wait (DLM_LKSTS_GRANTED/CONVERT/WAITING).
*
* lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
* reply is needed. Only set when the lkb is on the lockspace waiters
* list awaiting a reply from a remote node.
*
* lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
 * is a master copy, nodeid specifies the remote lock holder; when the
* lkb is a process copy, the nodeid specifies the lock master.
*/
/* lkb_ast_type */
#define AST_COMP 1
#define AST_BAST 2
/* lkb_range[] */
#define GR_RANGE_START 0
#define GR_RANGE_END 1
#define RQ_RANGE_START 2
#define RQ_RANGE_END 3
/* lkb_status */
#define DLM_LKSTS_WAITING 1
#define DLM_LKSTS_GRANTED 2
#define DLM_LKSTS_CONVERT 3
/* lkb_flags */
#define DLM_IFL_MSTCPY 0x00010000
#define DLM_IFL_RESEND 0x00020000
#define DLM_IFL_RANGE 0x00000001
struct dlm_lkb {
struct dlm_rsb *lkb_resource; /* the rsb */
struct kref lkb_ref;
int lkb_nodeid; /* copied from rsb */
int lkb_ownpid; /* pid of lock owner */
uint32_t lkb_id; /* our lock ID */
uint32_t lkb_remid; /* lock ID on remote partner */
uint32_t lkb_exflags; /* external flags from caller */
uint32_t lkb_sbflags; /* lksb flags */
uint32_t lkb_flags; /* internal flags */
uint32_t lkb_lvbseq; /* lvb sequence number */
int8_t lkb_status; /* granted, waiting, convert */
int8_t lkb_rqmode; /* requested lock mode */
int8_t lkb_grmode; /* granted lock mode */
int8_t lkb_bastmode; /* requested mode */
int8_t lkb_highbast; /* highest mode bast sent for */
int8_t lkb_wait_type; /* type of reply waiting for */
int8_t lkb_ast_type; /* type of ast queued for */
struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
struct list_head lkb_statequeue; /* rsb g/c/w list */
struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
struct list_head lkb_wait_reply; /* waiting for remote reply */
struct list_head lkb_astqueue; /* need ast to be sent */
uint64_t *lkb_range; /* array of gr/rq ranges */
char *lkb_lvbptr;
struct dlm_lksb *lkb_lksb; /* caller's status block */
void *lkb_astaddr; /* caller's ast function */
void *lkb_bastaddr; /* caller's bast function */
long lkb_astparam; /* caller's ast arg */
};
struct dlm_rsb {
struct dlm_ls *res_ls; /* the lockspace */
struct kref res_ref;
struct semaphore res_sem;
unsigned long res_flags;
int res_length; /* length of rsb name */
int res_nodeid;
uint32_t res_lvbseq;
uint32_t res_hash;
uint32_t res_bucket; /* rsbtbl */
unsigned long res_toss_time;
uint32_t res_first_lkid;
struct list_head res_lookup; /* lkbs waiting on first */
struct list_head res_hashchain; /* rsbtbl */
struct list_head res_grantqueue;
struct list_head res_convertqueue;
struct list_head res_waitqueue;
struct list_head res_root_list; /* used for recovery */
struct list_head res_recover_list; /* used for recovery */
int res_recover_locks_count;
char *res_lvbptr;
char res_name[1];
};
/* find_rsb() flags */
#define R_MASTER 1 /* only return rsb if it's a master */
#define R_CREATE 2 /* create/add rsb if not found */
/* rsb_flags */
enum rsb_flags {
RSB_MASTER_UNCERTAIN,
RSB_VALNOTVALID,
RSB_VALNOTVALID_PREV,
RSB_NEW_MASTER,
RSB_NEW_MASTER2,
RSB_RECOVER_CONVERT,
};
static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
__set_bit(flag, &r->res_flags);
}
static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
__clear_bit(flag, &r->res_flags);
}
static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
return test_bit(flag, &r->res_flags);
}
/* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00020000
#define DLM_HEADER_MINOR 0x00000001
#define DLM_MSG 1
#define DLM_RCOM 2
struct dlm_header {
uint32_t h_version;
uint32_t h_lockspace;
uint32_t h_nodeid; /* nodeid of sender */
uint16_t h_length;
uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
uint8_t h_pad;
};
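/* For example, struct dlm_message and struct dlm_rcom below both begin
   with a struct dlm_header, so a receive path can switch on h_cmd
   (DLM_MSG or DLM_RCOM) before casting the buffer to the full type. */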
#define DLM_MSG_REQUEST 1
#define DLM_MSG_CONVERT 2
#define DLM_MSG_UNLOCK 3
#define DLM_MSG_CANCEL 4
#define DLM_MSG_REQUEST_REPLY 5
#define DLM_MSG_CONVERT_REPLY 6
#define DLM_MSG_UNLOCK_REPLY 7
#define DLM_MSG_CANCEL_REPLY 8
#define DLM_MSG_GRANT 9
#define DLM_MSG_BAST 10
#define DLM_MSG_LOOKUP 11
#define DLM_MSG_REMOVE 12
#define DLM_MSG_LOOKUP_REPLY 13
struct dlm_message {
struct dlm_header m_header;
uint32_t m_type; /* DLM_MSG_ */
uint32_t m_nodeid;
uint32_t m_pid;
uint32_t m_lkid; /* lkid on sender */
uint32_t m_remid; /* lkid on receiver */
uint32_t m_parent_lkid;
uint32_t m_parent_remid;
uint32_t m_exflags;
uint32_t m_sbflags;
uint32_t m_flags;
uint32_t m_lvbseq;
uint32_t m_hash;
int m_status;
int m_grmode;
int m_rqmode;
int m_bastmode;
int m_asts;
int m_result; /* 0 or -EXXX */
uint64_t m_range[2];
char m_extra[0]; /* name or lvb */
};
#define DLM_RS_NODES 0x00000001
#define DLM_RS_NODES_ALL 0x00000002
#define DLM_RS_DIR 0x00000004
#define DLM_RS_DIR_ALL 0x00000008
#define DLM_RS_LOCKS 0x00000010
#define DLM_RS_LOCKS_ALL 0x00000020
#define DLM_RS_DONE 0x00000040
#define DLM_RS_DONE_ALL 0x00000080
#define DLM_RCOM_STATUS 1
#define DLM_RCOM_NAMES 2
#define DLM_RCOM_LOOKUP 3
#define DLM_RCOM_LOCK 4
#define DLM_RCOM_STATUS_REPLY 5
#define DLM_RCOM_NAMES_REPLY 6
#define DLM_RCOM_LOOKUP_REPLY 7
#define DLM_RCOM_LOCK_REPLY 8
struct dlm_rcom {
struct dlm_header rc_header;
uint32_t rc_type; /* DLM_RCOM_ */
int rc_result; /* multi-purpose */
uint64_t rc_id; /* match reply with request */
char rc_buf[0];
};
struct rcom_config {
uint32_t rf_lvblen;
uint32_t rf_lsflags;
uint64_t rf_unused;
};
struct rcom_lock {
uint32_t rl_ownpid;
uint32_t rl_lkid;
uint32_t rl_remid;
uint32_t rl_parent_lkid;
uint32_t rl_parent_remid;
uint32_t rl_exflags;
uint32_t rl_flags;
uint32_t rl_lvbseq;
int rl_result;
int8_t rl_rqmode;
int8_t rl_grmode;
int8_t rl_status;
int8_t rl_asts;
uint16_t rl_wait_type;
uint16_t rl_namelen;
uint64_t rl_range[4];
char rl_name[DLM_RESNAME_MAXLEN];
char rl_lvb[0];
};
struct dlm_ls {
struct list_head ls_list; /* list of lockspaces */
uint32_t ls_global_id; /* global unique lockspace ID */
uint32_t ls_exflags;
int ls_lvblen;
int ls_count; /* reference count */
unsigned long ls_flags; /* LSFL_ */
struct kobject ls_kobj;
struct dlm_rsbtable *ls_rsbtbl;
uint32_t ls_rsbtbl_size;
struct dlm_lkbtable *ls_lkbtbl;
uint32_t ls_lkbtbl_size;
struct dlm_dirtable *ls_dirtbl;
uint32_t ls_dirtbl_size;
struct semaphore ls_waiters_sem;
struct list_head ls_waiters; /* lkbs needing a reply */
struct list_head ls_nodes; /* current nodes in ls */
struct list_head ls_nodes_gone; /* dead node list, recovery */
int ls_num_nodes; /* number of nodes in ls */
int ls_low_nodeid;
int ls_total_weight;
int *ls_node_array;
struct dlm_rsb ls_stub_rsb; /* for returning errors */
struct dlm_lkb ls_stub_lkb; /* for returning errors */
struct dlm_message ls_stub_ms; /* for faking a reply */
struct dentry *ls_debug_dentry; /* debugfs */
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
/* recovery related */
struct timer_list ls_timer;
struct task_struct *ls_recoverd_task;
struct semaphore ls_recoverd_active;
spinlock_t ls_recover_lock;
uint32_t ls_recover_status; /* DLM_RS_ */
uint64_t ls_recover_seq;
struct dlm_recover *ls_recover_args;
struct rw_semaphore ls_in_recovery; /* block local requests */
struct list_head ls_requestqueue;/* queue remote requests */
struct semaphore ls_requestqueue_lock;
char *ls_recover_buf;
struct list_head ls_recover_list;
spinlock_t ls_recover_list_lock;
int ls_recover_list_count;
wait_queue_head_t ls_wait_general;
struct list_head ls_root_list; /* root resources */
struct rw_semaphore ls_root_sem; /* protect root_list */
int ls_namelen;
char ls_name[1];
};
#define LSFL_WORK 0
#define LSFL_RUNNING 1
#define LSFL_RECOVERY_STOP 2
#define LSFL_RCOM_READY 3
#define LSFL_UEVENT_WAIT 4
static inline int dlm_locking_stopped(struct dlm_ls *ls)
{
return !test_bit(LSFL_RUNNING, &ls->ls_flags);
}
static inline int dlm_recovery_stopped(struct dlm_ls *ls)
{
return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
}
static inline int dlm_no_directory(struct dlm_ls *ls)
{
return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
}
#endif /* __DLM_INTERNAL_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/* Central locking logic has four stages:
dlm_lock()
dlm_unlock()
request_lock(ls, lkb)
convert_lock(ls, lkb)
unlock_lock(ls, lkb)
cancel_lock(ls, lkb)
_request_lock(r, lkb)
_convert_lock(r, lkb)
_unlock_lock(r, lkb)
_cancel_lock(r, lkb)
do_request(r, lkb)
do_convert(r, lkb)
do_unlock(r, lkb)
do_cancel(r, lkb)
Stage 1 (lock, unlock) is mainly about checking input args and
splitting into one of the four main operations:
dlm_lock = request_lock
dlm_lock+CONVERT = convert_lock
dlm_unlock = unlock_lock
dlm_unlock+CANCEL = cancel_lock
Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
provided to the next stage.
Stage 3, _xxxx_lock(), determines if the operation is local or remote.
When remote, it calls send_xxxx(), when local it calls do_xxxx().
Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
given rsb and lkb and queues callbacks.
For remote operations, send_xxxx() results in the corresponding do_xxxx()
function being executed on the remote node. The connecting send/receive
calls on local (L) and remote (R) nodes:
L: send_xxxx() -> R: receive_xxxx()
R: do_xxxx()
L: receive_xxxx_reply() <- R: send_xxxx_reply()
*/
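/* Illustrative sketch (not part of this commit): how stage 1 splits the
   two entry points into the four operations. The real request_lock(),
   convert_lock(), unlock_lock() and cancel_lock() below take more
   arguments than shown here. */
#if 0
static int stage1_dispatch(struct dlm_ls *ls, struct dlm_lkb *lkb,
uint32_t flags, int unlock)
{
if (!unlock)
return (flags & DLM_LKF_CONVERT) ? convert_lock(ls, lkb)
: request_lock(ls, lkb);
return (flags & DLM_LKF_CANCEL) ? cancel_lock(ls, lkb)
: unlock_lock(ls, lkb);
}
#endif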
#include "dlm_internal.h"
#include "memory.h"
#include "lowcomms.h"
#include "requestqueue.h"
#include "util.h"
#include "dir.h"
#include "member.h"
#include "lockspace.h"
#include "ast.h"
#include "lock.h"
#include "rcom.h"
#include "recover.h"
#include "lvb_table.h"
#include "config.h"
static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int send_remove(struct dlm_rsb *r);
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms);
static int receive_extralen(struct dlm_message *ms);
/*
 * Lock compatibility matrix - thanks Steve
* UN = Unlocked state. Not really a state, used as a flag
* PD = Padding. Used to make the matrix a nice power of two in size
* Other states are the same as the VMS DLM.
* Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
*/
static const int __dlm_compat_matrix[8][8] = {
/* UN NL CR CW PR PW EX PD */
{1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
{1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
{1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
{1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
{1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
{1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
{1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
{0, 0, 0, 0, 0, 0, 0, 0} /* PD */
};
/*
* This defines the direction of transfer of LVB data.
* Granted mode is the row; requested mode is the column.
* Usage: matrix[grmode+1][rqmode+1]
* 1 = LVB is returned to the caller
* 0 = LVB is written to the resource
* -1 = nothing happens to the LVB
*/
const int dlm_lvb_operations[8][8] = {
/* UN NL CR CW PR PW EX PD*/
{ -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
{ -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
{ -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
{ -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
{ -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
{ -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
{ -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
{ -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
};
EXPORT_SYMBOL_GPL(dlm_lvb_operations);
#define modes_compat(gr, rq) \
__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
int dlm_modes_compat(int mode1, int mode2)
{
return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
}
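/* Example matrix lookups: two protected-read locks are compatible, while
   protected read and exclusive are not:
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) == 1
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX) == 0 */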
/*
* Compatibility matrix for conversions with QUECVT set.
* Granted mode is the row; requested mode is the column.
* Usage: matrix[grmode+1][rqmode+1]
*/
static const int __quecvt_compat_matrix[8][8] = {
/* UN NL CR CW PR PW EX PD */
{0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
{0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
{0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
{0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
{0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
{0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
{0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
{0, 0, 0, 0, 0, 0, 0, 0} /* PD */
};
static void dlm_print_lkb(struct dlm_lkb *lkb)
{
printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
" status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
}
void dlm_print_rsb(struct dlm_rsb *r)
{
printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
r->res_nodeid, r->res_flags, r->res_first_lkid,
r->res_recover_locks_count, r->res_name);
}
/* Threads cannot use the lockspace while it's being recovered */
static inline void lock_recovery(struct dlm_ls *ls)
{
down_read(&ls->ls_in_recovery);
}
static inline void unlock_recovery(struct dlm_ls *ls)
{
up_read(&ls->ls_in_recovery);
}
static inline int lock_recovery_try(struct dlm_ls *ls)
{
return down_read_trylock(&ls->ls_in_recovery);
}
static inline int can_be_queued(struct dlm_lkb *lkb)
{
return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
}
static inline int force_blocking_asts(struct dlm_lkb *lkb)
{
return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
}
static inline int is_demoted(struct dlm_lkb *lkb)
{
return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
}
static inline int is_remote(struct dlm_rsb *r)
{
DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
return !!r->res_nodeid;
}
static inline int is_process_copy(struct dlm_lkb *lkb)
{
return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
}
static inline int is_master_copy(struct dlm_lkb *lkb)
{
if (lkb->lkb_flags & DLM_IFL_MSTCPY)
DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
}
static inline int middle_conversion(struct dlm_lkb *lkb)
{
if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
(lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
return TRUE;
return FALSE;
}
static inline int down_conversion(struct dlm_lkb *lkb)
{
return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
}
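/* Example: PR <-> CW are the "middle" conversions; neither mode is
   strictly stronger than the other even though the mode numbers differ,
   so they are handled specially. EX -> NL, by contrast, is a plain
   down-conversion. */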
static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
if (is_master_copy(lkb))
return;
DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
lkb->lkb_lksb->sb_status = rv;
lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
dlm_add_ast(lkb, AST_COMP);
}
static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
{
if (is_master_copy(lkb))
send_bast(r, lkb, rqmode);
else {
lkb->lkb_bastmode = rqmode;
dlm_add_ast(lkb, AST_BAST);
}
}
/*
* Basic operations on rsb's and lkb's
*/
static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
{
struct dlm_rsb *r;
r = allocate_rsb(ls, len);
if (!r)
return NULL;
r->res_ls = ls;
r->res_length = len;
memcpy(r->res_name, name, len);
init_MUTEX(&r->res_sem);
INIT_LIST_HEAD(&r->res_lookup);
INIT_LIST_HEAD(&r->res_grantqueue);
INIT_LIST_HEAD(&r->res_convertqueue);
INIT_LIST_HEAD(&r->res_waitqueue);
INIT_LIST_HEAD(&r->res_root_list);
INIT_LIST_HEAD(&r->res_recover_list);
return r;
}
static int search_rsb_list(struct list_head *head, char *name, int len,
unsigned int flags, struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
int error = 0;
list_for_each_entry(r, head, res_hashchain) {
if (len == r->res_length && !memcmp(name, r->res_name, len))
goto found;
}
return -ENOENT;
found:
if (r->res_nodeid && (flags & R_MASTER))
error = -ENOTBLK;
*r_ret = r;
return error;
}
static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
unsigned int flags, struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
int error;
error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
if (!error) {
kref_get(&r->res_ref);
goto out;
}
error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
if (error)
goto out;
list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
if (dlm_no_directory(ls))
goto out;
if (r->res_nodeid == -1) {
rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
r->res_first_lkid = 0;
} else if (r->res_nodeid > 0) {
rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
r->res_first_lkid = 0;
} else {
DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
}
out:
*r_ret = r;
return error;
}
static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
unsigned int flags, struct dlm_rsb **r_ret)
{
int error;
write_lock(&ls->ls_rsbtbl[b].lock);
error = _search_rsb(ls, name, len, b, flags, r_ret);
write_unlock(&ls->ls_rsbtbl[b].lock);
return error;
}
/*
* Find rsb in rsbtbl and potentially create/add one
*
* Delaying the release of rsb's has a similar benefit to applications keeping
* NL locks on an rsb, but without the guarantee that the cached master value
* will still be valid when the rsb is reused. Apps aren't always smart enough
* to keep NL locks on an rsb that they may lock again shortly; this can lead
* to excessive master lookups and removals if we don't delay the release.
*
* Searching for an rsb means looking through both the normal list and toss
* list. When found on the toss list the rsb is moved to the normal list with
* ref count of 1; when found on normal list the ref count is incremented.
*/
static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
unsigned int flags, struct dlm_rsb **r_ret)
{
struct dlm_rsb *r, *tmp;
uint32_t hash, bucket;
int error = 0;
if (dlm_no_directory(ls))
flags |= R_CREATE;
hash = jhash(name, namelen, 0);
bucket = hash & (ls->ls_rsbtbl_size - 1);
error = search_rsb(ls, name, namelen, bucket, flags, &r);
if (!error)
goto out;
if (error == -ENOENT && !(flags & R_CREATE))
goto out;
/* the rsb was found but wasn't a master copy */
if (error == -ENOTBLK)
goto out;
error = -ENOMEM;
r = create_rsb(ls, name, namelen);
if (!r)
goto out;
r->res_hash = hash;
r->res_bucket = bucket;
r->res_nodeid = -1;
kref_init(&r->res_ref);
/* With no directory, the master can be set immediately */
if (dlm_no_directory(ls)) {
int nodeid = dlm_dir_nodeid(r);
if (nodeid == dlm_our_nodeid())
nodeid = 0;
r->res_nodeid = nodeid;
}
write_lock(&ls->ls_rsbtbl[bucket].lock);
error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
if (!error) {
write_unlock(&ls->ls_rsbtbl[bucket].lock);
free_rsb(r);
r = tmp;
goto out;
}
list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
write_unlock(&ls->ls_rsbtbl[bucket].lock);
error = 0;
out:
*r_ret = r;
return error;
}
int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
unsigned int flags, struct dlm_rsb **r_ret)
{
return find_rsb(ls, name, namelen, flags, r_ret);
}
/* This is only called to add a reference when the code already holds
a valid reference to the rsb, so there's no need for locking. */
static inline void hold_rsb(struct dlm_rsb *r)
{
kref_get(&r->res_ref);
}
void dlm_hold_rsb(struct dlm_rsb *r)
{
hold_rsb(r);
}
static void toss_rsb(struct kref *kref)
{
struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
struct dlm_ls *ls = r->res_ls;
DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
kref_init(&r->res_ref);
list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
if (r->res_lvbptr) {
free_lvb(r->res_lvbptr);
r->res_lvbptr = NULL;
}
}
/* When all references to the rsb are gone it's transferred to
the tossed list for later disposal. */
static void put_rsb(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
uint32_t bucket = r->res_bucket;
write_lock(&ls->ls_rsbtbl[bucket].lock);
kref_put(&r->res_ref, toss_rsb);
write_unlock(&ls->ls_rsbtbl[bucket].lock);
}
void dlm_put_rsb(struct dlm_rsb *r)
{
put_rsb(r);
}
/* See comment for unhold_lkb */
static void unhold_rsb(struct dlm_rsb *r)
{
int rv;
rv = kref_put(&r->res_ref, toss_rsb);
DLM_ASSERT(!rv, dlm_print_rsb(r););
}
static void kill_rsb(struct kref *kref)
{
struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
/* All work is done after the return from kref_put() so we
can release the write_lock before the remove and free. */
DLM_ASSERT(list_empty(&r->res_lookup),);
DLM_ASSERT(list_empty(&r->res_grantqueue),);
DLM_ASSERT(list_empty(&r->res_convertqueue),);
DLM_ASSERT(list_empty(&r->res_waitqueue),);
DLM_ASSERT(list_empty(&r->res_root_list),);
DLM_ASSERT(list_empty(&r->res_recover_list),);
}
/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
The rsb must exist as long as any lkb's for it do. */
static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
hold_rsb(r);
lkb->lkb_resource = r;
}
static void detach_lkb(struct dlm_lkb *lkb)
{
if (lkb->lkb_resource) {
put_rsb(lkb->lkb_resource);
lkb->lkb_resource = NULL;
}
}
static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb, *tmp;
uint32_t lkid = 0;
uint16_t bucket;
lkb = allocate_lkb(ls);
if (!lkb)
return -ENOMEM;
lkb->lkb_nodeid = -1;
lkb->lkb_grmode = DLM_LOCK_IV;
kref_init(&lkb->lkb_ref);
get_random_bytes(&bucket, sizeof(bucket));
bucket &= (ls->ls_lkbtbl_size - 1);
write_lock(&ls->ls_lkbtbl[bucket].lock);
/* counter can roll over so we must verify lkid is not in use */
while (lkid == 0) {
lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
lkb_idtbl_list) {
if (tmp->lkb_id != lkid)
continue;
lkid = 0;
break;
}
}
lkb->lkb_id = lkid;
list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
write_unlock(&ls->ls_lkbtbl[bucket].lock);
*lkb_ret = lkb;
return 0;
}
static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
{
uint16_t bucket = lkid & 0xFFFF;
struct dlm_lkb *lkb;
list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
if (lkb->lkb_id == lkid)
return lkb;
}
return NULL;
}
static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
uint16_t bucket = lkid & 0xFFFF;
if (bucket >= ls->ls_lkbtbl_size)
return -EBADSLT;
read_lock(&ls->ls_lkbtbl[bucket].lock);
lkb = __find_lkb(ls, lkid);
if (lkb)
kref_get(&lkb->lkb_ref);
read_unlock(&ls->ls_lkbtbl[bucket].lock);
*lkb_ret = lkb;
return lkb ? 0 : -ENOENT;
}
static void kill_lkb(struct kref *kref)
{
struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
/* All work is done after the return from kref_put() so we
can release the write_lock before the detach_lkb */
DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}
static int put_lkb(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
uint16_t bucket = lkb->lkb_id & 0xFFFF;
write_lock(&ls->ls_lkbtbl[bucket].lock);
if (kref_put(&lkb->lkb_ref, kill_lkb)) {
list_del(&lkb->lkb_idtbl_list);
write_unlock(&ls->ls_lkbtbl[bucket].lock);
detach_lkb(lkb);
/* for local/process lkbs, lvbptr points to caller's lksb */
if (lkb->lkb_lvbptr && is_master_copy(lkb))
free_lvb(lkb->lkb_lvbptr);
if (lkb->lkb_range)
free_range(lkb->lkb_range);
free_lkb(lkb);
return 1;
} else {
write_unlock(&ls->ls_lkbtbl[bucket].lock);
return 0;
}
}
int dlm_put_lkb(struct dlm_lkb *lkb)
{
return put_lkb(lkb);
}
/* This is only called to add a reference when the code already holds
a valid reference to the lkb, so there's no need for locking. */
static inline void hold_lkb(struct dlm_lkb *lkb)
{
kref_get(&lkb->lkb_ref);
}
/* This is called when we need to remove a reference and are certain
it's not the last ref. e.g. del_lkb is always called between a
find_lkb/put_lkb and is always the inverse of a previous add_lkb.
put_lkb would work fine, but would involve unnecessary locking */
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
int rv;
rv = kref_put(&lkb->lkb_ref, kill_lkb);
DLM_ASSERT(!rv, dlm_print_lkb(lkb););
}
static void lkb_add_ordered(struct list_head *new, struct list_head *head,
int mode)
{
struct dlm_lkb *lkb = NULL, *iter;
/* keep the queue ordered from highest to lowest mode: insert the
   new entry before the first one whose mode is lower */
list_for_each_entry(iter, head, lkb_statequeue) {
if (iter->lkb_rqmode < mode) {
lkb = iter;
break;
}
}
/* the list_for_each_entry() cursor is never NULL after a full
   traversal, so a separate iterator is needed to detect that no
   lower-mode entry was found */
if (!lkb)
list_add_tail(new, head);
else
__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
}
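/* Worked example: adding a lock with mode PW to a queue whose entries
   compare as [EX, PR, NL] stops at PR, the first mode lower than PW,
   giving [EX, PW, PR, NL]; if no entry compares lower, the new lock is
   added at the tail. */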
/* add/remove lkb to rsb's grant/convert/wait queue */
static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
{
kref_get(&lkb->lkb_ref);
DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
lkb->lkb_status = status;
switch (status) {
case DLM_LKSTS_WAITING:
if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
else
list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
break;
case DLM_LKSTS_GRANTED:
/* convention says granted locks kept in order of grmode */
lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
lkb->lkb_grmode);
break;
case DLM_LKSTS_CONVERT:
if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
else
list_add_tail(&lkb->lkb_statequeue,
&r->res_convertqueue);
break;
default:
DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
}
}
static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
lkb->lkb_status = 0;
list_del(&lkb->lkb_statequeue);
unhold_lkb(lkb);
}
static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
hold_lkb(lkb);
del_lkb(r, lkb);
add_lkb(r, lkb, sts);
unhold_lkb(lkb);
}
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
down(&ls->ls_waiters_sem);
if (lkb->lkb_wait_type) {
log_print("add_to_waiters error %d", lkb->lkb_wait_type);
goto out;
}
lkb->lkb_wait_type = mstype;
kref_get(&lkb->lkb_ref);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
out:
up(&ls->ls_waiters_sem);
}
static int _remove_from_waiters(struct dlm_lkb *lkb)
{
int error = 0;
if (!lkb->lkb_wait_type) {
log_print("remove_from_waiters error");
error = -EINVAL;
goto out;
}
lkb->lkb_wait_type = 0;
list_del(&lkb->lkb_wait_reply);
unhold_lkb(lkb);
out:
return error;
}
static int remove_from_waiters(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
down(&ls->ls_waiters_sem);
error = _remove_from_waiters(lkb);
up(&ls->ls_waiters_sem);
return error;
}
static void dir_remove(struct dlm_rsb *r)
{
int to_nodeid;
if (dlm_no_directory(r->res_ls))
return;
to_nodeid = dlm_dir_nodeid(r);
if (to_nodeid != dlm_our_nodeid())
send_remove(r);
else
dlm_dir_remove_entry(r->res_ls, to_nodeid,
r->res_name, r->res_length);
}
/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
found since they are in order of newest to oldest? */
static int shrink_bucket(struct dlm_ls *ls, int b)
{
struct dlm_rsb *r;
int count = 0, found;
for (;;) {
found = FALSE;
write_lock(&ls->ls_rsbtbl[b].lock);
list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
res_hashchain) {
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.toss_secs * HZ))
continue;
found = TRUE;
break;
}
if (!found) {
write_unlock(&ls->ls_rsbtbl[b].lock);
break;
}
if (kref_put(&r->res_ref, kill_rsb)) {
list_del(&r->res_hashchain);
write_unlock(&ls->ls_rsbtbl[b].lock);
if (is_master(r))
dir_remove(r);
free_rsb(r);
count++;
} else {
write_unlock(&ls->ls_rsbtbl[b].lock);
log_error(ls, "tossed rsb in use %s", r->res_name);
}
}
return count;
}
void dlm_scan_rsbs(struct dlm_ls *ls)
{
int i;
if (dlm_locking_stopped(ls))
return;
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
shrink_bucket(ls, i);
cond_resched();
}
}
/* lkb is master or local copy */
static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int b, len = r->res_ls->ls_lvblen;
/* b=1 lvb returned to caller
b=0 lvb written to rsb or invalidated
b=-1 do nothing */
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
if (!lkb->lkb_lvbptr)
return;
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
return;
if (!r->res_lvbptr)
return;
memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
lkb->lkb_lvbseq = r->res_lvbseq;
} else if (b == 0) {
if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
rsb_set_flag(r, RSB_VALNOTVALID);
return;
}
if (!lkb->lkb_lvbptr)
return;
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
return;
if (!r->res_lvbptr)
r->res_lvbptr = allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
return;
memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
r->res_lvbseq++;
lkb->lkb_lvbseq = r->res_lvbseq;
rsb_clear_flag(r, RSB_VALNOTVALID);
}
if (rsb_flag(r, RSB_VALNOTVALID))
lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
}
static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
if (lkb->lkb_grmode < DLM_LOCK_PW)
return;
if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
rsb_set_flag(r, RSB_VALNOTVALID);
return;
}
if (!lkb->lkb_lvbptr)
return;
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
return;
if (!r->res_lvbptr)
r->res_lvbptr = allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
return;
memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
r->res_lvbseq++;
rsb_clear_flag(r, RSB_VALNOTVALID);
}
/* lkb is process copy (pc) */
static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
int b;
if (!lkb->lkb_lvbptr)
return;
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
return;
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
int len = receive_extralen(ms);
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
lkb->lkb_lvbseq = ms->m_lvbseq;
}
}
/* Manipulate lkb's on rsb's convert/granted/waiting queues
remove_lock -- used for unlock, removes lkb from granted
revert_lock -- used for cancel, moves lkb from convert to granted
grant_lock -- used for request and convert, adds lkb to granted or
moves lkb from convert or waiting to granted
Each of these is used for master or local copy lkb's. There is
also a _pc() variation used to make the corresponding change on
a process copy (pc) lkb. */
static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
del_lkb(r, lkb);
lkb->lkb_grmode = DLM_LOCK_IV;
/* this unhold undoes the original ref from create_lkb()
so this leads to the lkb being freed */
unhold_lkb(lkb);
}
static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
set_lvb_unlock(r, lkb);
_remove_lock(r, lkb);
}
static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
_remove_lock(r, lkb);
}
static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
lkb->lkb_rqmode = DLM_LOCK_IV;
switch (lkb->lkb_status) {
case DLM_LKSTS_CONVERT:
move_lkb(r, lkb, DLM_LKSTS_GRANTED);
break;
case DLM_LKSTS_WAITING:
del_lkb(r, lkb);
lkb->lkb_grmode = DLM_LOCK_IV;
/* this unhold undoes the original ref from create_lkb()
so this leads to the lkb being freed */
unhold_lkb(lkb);
break;
default:
log_print("invalid status for revert %d", lkb->lkb_status);
}
}
static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
revert_lock(r, lkb);
}
static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
if (lkb->lkb_grmode != lkb->lkb_rqmode) {
lkb->lkb_grmode = lkb->lkb_rqmode;
if (lkb->lkb_status)
move_lkb(r, lkb, DLM_LKSTS_GRANTED);
else
add_lkb(r, lkb, DLM_LKSTS_GRANTED);
}
lkb->lkb_rqmode = DLM_LOCK_IV;
if (lkb->lkb_range) {
lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
}
}
static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
set_lvb_lock(r, lkb);
_grant_lock(r, lkb);
lkb->lkb_highbast = 0;
}
static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
set_lvb_lock_pc(r, lkb, ms);
_grant_lock(r, lkb);
}
/* called by grant_pending_locks() which means an async grant message must
be sent to the requesting node in addition to granting the lock if the
lkb belongs to a remote node. */
static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
grant_lock(r, lkb);
if (is_master_copy(lkb))
send_grant(r, lkb);
else
queue_cast(r, lkb, 0);
}
static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
{
struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
lkb_statequeue);
if (lkb->lkb_id == first->lkb_id)
return TRUE;
return FALSE;
}
/* Return 1 if the locks' ranges overlap. If the lkb has no range then it is
assumed to cover 0-ffffffff.ffffffff */
static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
{
if (!lkb1->lkb_range || !lkb2->lkb_range)
return TRUE;
if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
return FALSE;
return TRUE;
}
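/* Example: a request for range [0, 99] does not overlap a grant of
   [100, 199], while [0, 150] and [100, 199] do overlap; an lkb with no
   range behaves as [0, 0xffffffffffffffff] and overlaps everything. */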
/* Check if the given lkb conflicts with another lkb on the queue. */
static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
{
struct dlm_lkb *this;
list_for_each_entry(this, head, lkb_statequeue) {
if (this == lkb)
continue;
if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
return TRUE;
}
return FALSE;
}
/*
* "A conversion deadlock arises with a pair of lock requests in the converting
* queue for one resource. The granted mode of each lock blocks the requested
* mode of the other lock."
*
* Part 2: if the granted mode of lkb is preventing the first lkb in the
* convert queue from being granted, then demote lkb (set grmode to NL).
* This second form requires that we check for conv-deadlk even when
* now == 0 in _can_be_granted().
*
* Example:
* Granted Queue: empty
* Convert Queue: NL->EX (first lock)
* PR->EX (second lock)
*
* The first lock can't be granted because of the granted mode of the second
* lock and the second lock can't be granted because it's not first in the
* list. We demote the granted mode of the second lock (the lkb passed to this
* function).
*
* After the resolution, the "grant pending" function needs to go back and try
* to grant locks on the convert queue again since the first lock can now be
* granted.
*/
static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
{
struct dlm_lkb *this, *first = NULL, *self = NULL;
list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
if (!first)
first = this;
if (this == lkb) {
self = lkb;
continue;
}
if (!ranges_overlap(lkb, this))
continue;
if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
return TRUE;
}
/* if lkb is on the convert queue and is preventing the first
from being granted, then there's deadlock and we demote lkb.
multiple converting locks may need to do this before the first
converting lock can be granted. */
if (self && self != first) {
if (!modes_compat(lkb, first) &&
!queue_conflict(&rsb->res_grantqueue, first))
return TRUE;
}
return FALSE;
}
/*
* Return 1 if the lock can be granted, 0 otherwise.
* Also detect and resolve conversion deadlocks.
*
* lkb is the lock to be granted
*
* now is 1 if the function is being called in the context of the
* immediate request, it is 0 if called later, after the lock has been
* queued.
*
* References are from chapter 6 of "VAXcluster Principles" by Roy Davis
*/
static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
{
int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
/*
* 6-10: Version 5.4 introduced an option to address the phenomenon of
 * a new request for an NL mode lock being blocked.
*
* 6-11: If the optional EXPEDITE flag is used with the new NL mode
* request, then it would be granted. In essence, the use of this flag
 * tells the Lock Manager to expedite this request by not considering
* what may be in the CONVERTING or WAITING queues... As of this
* writing, the EXPEDITE flag can be used only with new requests for NL
* mode locks. This flag is not valid for conversion requests.
*
* A shortcut. Earlier checks return an error if EXPEDITE is used in a
* conversion or used with a non-NL requested mode. We also know an
* EXPEDITE request is always granted immediately, so now must always
* be 1. The full condition to grant an expedite request: (now &&
* !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
* therefore be shortened to just checking the flag.
*/
if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
return TRUE;
/*
* A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
* added to the remaining conditions.
*/
if (queue_conflict(&r->res_grantqueue, lkb))
goto out;
/*
* 6-3: By default, a conversion request is immediately granted if the
* requested mode is compatible with the modes of all other granted
* locks
*/
if (queue_conflict(&r->res_convertqueue, lkb))
goto out;
/*
* 6-5: But the default algorithm for deciding whether to grant or
* queue conversion requests does not by itself guarantee that such
 * requests are serviced on a "first come first served" basis. This, in
 * turn, can lead to a phenomenon known as "indefinite postponement".
*
* 6-7: This issue is dealt with by using the optional QUECVT flag with
* the system service employed to request a lock conversion. This flag
* forces certain conversion requests to be queued, even if they are
* compatible with the granted modes of other locks on the same
* resource. Thus, the use of this flag results in conversion requests
 * being ordered on a "first come first served" basis.
*
* DCT: This condition is all about new conversions being able to occur
* "in place" while the lock remains on the granted queue (assuming
* nothing else conflicts.) IOW if QUECVT isn't set, a conversion
* doesn't _have_ to go onto the convert queue where it's processed in
* order. The "now" variable is necessary to distinguish converts
* being received and processed for the first time now, because once a
 * convert is moved to the conversion queue, the condition below applies,
 * requiring FIFO granting.
*/
if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
return TRUE;
/*
* When using range locks the NOORDER flag is set to avoid the standard
 * VMS rules on grant order.
*/
if (lkb->lkb_exflags & DLM_LKF_NOORDER)
return TRUE;
/*
* 6-3: Once in that queue [CONVERTING], a conversion request cannot be
* granted until all other conversion requests ahead of it are granted
* and/or canceled.
*/
if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
return TRUE;
/*
* 6-4: By default, a new request is immediately granted only if all
* three of the following conditions are satisfied when the request is
* issued:
* - The queue of ungranted conversion requests for the resource is
* empty.
* - The queue of ungranted new requests for the resource is empty.
* - The mode of the new request is compatible with the most
* restrictive mode of all granted locks on the resource.
*/
if (now && !conv && list_empty(&r->res_convertqueue) &&
list_empty(&r->res_waitqueue))
return TRUE;
/*
* 6-4: Once a lock request is in the queue of ungranted new requests,
* it cannot be granted until the queue of ungranted conversion
* requests is empty, all ungranted new requests ahead of it are
* granted and/or canceled, and it is compatible with the granted mode
* of the most restrictive lock granted on the resource.
*/
if (!now && !conv && list_empty(&r->res_convertqueue) &&
first_in_list(lkb, &r->res_waitqueue))
return TRUE;
out:
/*
* The following, enabled by CONVDEADLK, departs from VMS.
*/
if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
conversion_deadlock_detect(r, lkb)) {
lkb->lkb_grmode = DLM_LOCK_NL;
lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
}
return FALSE;
}
/*
* The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
* simple way to provide a big optimization to applications that can use them.
*/
static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
{
uint32_t flags = lkb->lkb_exflags;
int rv;
int8_t alt = 0, rqmode = lkb->lkb_rqmode;
rv = _can_be_granted(r, lkb, now);
if (rv)
goto out;
if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
goto out;
if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
alt = DLM_LOCK_PR;
else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
alt = DLM_LOCK_CW;
if (alt) {
lkb->lkb_rqmode = alt;
rv = _can_be_granted(r, lkb, now);
if (rv)
lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
else
lkb->lkb_rqmode = rqmode;
}
out:
return rv;
}
static int grant_pending_convert(struct dlm_rsb *r, int high)
{
struct dlm_lkb *lkb, *s;
int hi, demoted, quit, grant_restart, demote_restart;
quit = 0;
restart:
grant_restart = 0;
demote_restart = 0;
hi = DLM_LOCK_IV;
list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
demoted = is_demoted(lkb);
if (can_be_granted(r, lkb, FALSE)) {
grant_lock_pending(r, lkb);
grant_restart = 1;
} else {
hi = max_t(int, lkb->lkb_rqmode, hi);
if (!demoted && is_demoted(lkb))
demote_restart = 1;
}
}
if (grant_restart)
goto restart;
if (demote_restart && !quit) {
quit = 1;
goto restart;
}
return max_t(int, high, hi);
}
static int grant_pending_wait(struct dlm_rsb *r, int high)
{
struct dlm_lkb *lkb, *s;
list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
if (can_be_granted(r, lkb, FALSE))
grant_lock_pending(r, lkb);
else
high = max_t(int, lkb->lkb_rqmode, high);
}
return high;
}
static void grant_pending_locks(struct dlm_rsb *r)
{
struct dlm_lkb *lkb, *s;
int high = DLM_LOCK_IV;
DLM_ASSERT(is_master(r), dlm_print_rsb(r););
high = grant_pending_convert(r, high);
high = grant_pending_wait(r, high);
if (high == DLM_LOCK_IV)
return;
/*
* If there are locks left on the wait/convert queue then send blocking
* ASTs to granted locks based on the largest requested mode (high)
* found above. This can generate spurious blocking ASTs for range
* locks. FIXME: highbast < high comparison not valid for PR/CW.
*/
list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
!__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
queue_bast(r, lkb, high);
lkb->lkb_highbast = high;
}
}
}
static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
struct dlm_lkb *lkb)
{
struct dlm_lkb *gr;
list_for_each_entry(gr, head, lkb_statequeue) {
if (gr->lkb_bastaddr &&
gr->lkb_highbast < lkb->lkb_rqmode &&
ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
queue_bast(r, gr, lkb->lkb_rqmode);
gr->lkb_highbast = lkb->lkb_rqmode;
}
}
}
static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
send_bast_queue(r, &r->res_grantqueue, lkb);
}
static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
send_bast_queue(r, &r->res_grantqueue, lkb);
send_bast_queue(r, &r->res_convertqueue, lkb);
}
/* set_master(r, lkb) -- set the master nodeid of a resource
The purpose of this function is to set the nodeid field in the given
lkb using the nodeid field in the given rsb. If the rsb's nodeid is
known, it can just be copied to the lkb and the function will return
0. If the rsb's nodeid is _not_ known, it needs to be looked up
before it can be copied to the lkb.
When the rsb nodeid is being looked up remotely, the initial lkb
causing the lookup is kept on the ls_waiters list waiting for the
lookup reply. Other lkb's waiting for the same rsb lookup are kept
on the rsb's res_lookup list until the master is verified.
Return values:
0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1: the rsb master is not available and the lkb has been placed on
a wait queue
*/
static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
struct dlm_ls *ls = r->res_ls;
int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
r->res_first_lkid = lkb->lkb_id;
lkb->lkb_nodeid = r->res_nodeid;
return 0;
}
if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
return 1;
}
if (r->res_nodeid == 0) {
lkb->lkb_nodeid = 0;
return 0;
}
if (r->res_nodeid > 0) {
lkb->lkb_nodeid = r->res_nodeid;
return 0;
}
DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
dir_nodeid = dlm_dir_nodeid(r);
if (dir_nodeid != our_nodeid) {
r->res_first_lkid = lkb->lkb_id;
send_lookup(r, lkb);
return 1;
}
for (;;) {
/* It's possible for dlm_scand to remove an old rsb for
   this same resource from the toss list, for us then to
   create a new one, look up the master locally, and find
   it already exists just before dlm_scand does the
   dir_remove() on the previous rsb. */
error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
r->res_length, &ret_nodeid);
if (!error)
break;
log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
schedule();
}
if (ret_nodeid == our_nodeid) {
r->res_first_lkid = 0;
r->res_nodeid = 0;
lkb->lkb_nodeid = 0;
} else {
r->res_first_lkid = lkb->lkb_id;
r->res_nodeid = ret_nodeid;
lkb->lkb_nodeid = ret_nodeid;
}
return 0;
}
static void process_lookup_list(struct dlm_rsb *r)
{
struct dlm_lkb *lkb, *safe;
list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
list_del(&lkb->lkb_rsb_lookup);
_request_lock(r, lkb);
schedule();
}
}
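/* Background note: res_first_lkid identifies the lkb whose request is
   being used to verify a newly looked-up master. Until confirm_master()
   clears it, other requests for the same rsb wait on res_lookup (see
   set_master() above) and are restarted here by process_lookup_list(). */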
/* confirm_master -- confirm (or deny) an rsb's master nodeid */
static void confirm_master(struct dlm_rsb *r, int error)
{
struct dlm_lkb *lkb;
if (!r->res_first_lkid)
return;
switch (error) {
case 0:
case -EINPROGRESS:
r->res_first_lkid = 0;
process_lookup_list(r);
break;
case -EAGAIN:
/* the remote master didn't queue our NOQUEUE request;
make a waiting lkb the first_lkid */
r->res_first_lkid = 0;
if (!list_empty(&r->res_lookup)) {
lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
lkb_rsb_lookup);
list_del(&lkb->lkb_rsb_lookup);
r->res_first_lkid = lkb->lkb_id;
_request_lock(r, lkb);
} else
r->res_nodeid = -1;
break;
default:
log_error(r->res_ls, "confirm_master unknown error %d", error);
}
}
static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
int namelen, uint32_t parent_lkid, void *ast,
void *astarg, void *bast, struct dlm_range *range,
struct dlm_args *args)
{
int rv = -EINVAL;
/* check for invalid arg usage */
if (mode < 0 || mode > DLM_LOCK_EX)
goto out;
if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
goto out;
if (flags & DLM_LKF_CANCEL)
goto out;
if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
goto out;
if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
goto out;
if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
goto out;
if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
goto out;
if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
goto out;
if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
goto out;
if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
goto out;
if (!ast || !lksb)
goto out;
if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
goto out;
/* parent/child locks not yet supported */
if (parent_lkid)
goto out;
if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
goto out;
/* these args will be copied to the lkb in validate_lock_args;
   it cannot be done now because, when converting locks, fields in
   an active lkb cannot be modified before locking the rsb */
args->flags = flags;
args->astaddr = ast;
args->astparam = (long) astarg;
args->bastaddr = bast;
args->mode = mode;
args->lksb = lksb;
args->range = range;
rv = 0;
out:
return rv;
}
static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
{
if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
DLM_LKF_FORCEUNLOCK))
return -EINVAL;
args->flags = flags;
args->astparam = (long) astarg;
return 0;
}
static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
int rv = -EINVAL;
if (args->flags & DLM_LKF_CONVERT) {
if (lkb->lkb_flags & DLM_IFL_MSTCPY)
goto out;
if (args->flags & DLM_LKF_QUECVT &&
!__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
goto out;
rv = -EBUSY;
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
if (lkb->lkb_wait_type)
goto out;
}
lkb->lkb_exflags = args->flags;
lkb->lkb_sbflags = 0;
lkb->lkb_astaddr = args->astaddr;
lkb->lkb_astparam = args->astparam;
lkb->lkb_bastaddr = args->bastaddr;
lkb->lkb_rqmode = args->mode;
lkb->lkb_lksb = args->lksb;
lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
lkb->lkb_ownpid = (int) current->pid;
rv = 0;
if (!args->range)
goto out;
if (!lkb->lkb_range) {
rv = -ENOMEM;
lkb->lkb_range = allocate_range(ls);
if (!lkb->lkb_range)
goto out;
/* This is needed for conversions that contain ranges
where the original lock didn't but it's harmless for
new locks too. */
lkb->lkb_range[GR_RANGE_START] = 0LL;
lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
}
lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
lkb->lkb_flags |= DLM_IFL_RANGE;
rv = 0;
out:
return rv;
}
static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
{
int rv = -EINVAL;
if (lkb->lkb_flags & DLM_IFL_MSTCPY)
goto out;
if (args->flags & DLM_LKF_FORCEUNLOCK)
goto out_ok;
if (args->flags & DLM_LKF_CANCEL &&
lkb->lkb_status == DLM_LKSTS_GRANTED)
goto out;
if (!(args->flags & DLM_LKF_CANCEL) &&
lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
rv = -EBUSY;
if (lkb->lkb_wait_type)
goto out;
out_ok:
lkb->lkb_exflags = args->flags;
lkb->lkb_sbflags = 0;
lkb->lkb_astparam = args->astparam;
rv = 0;
out:
return rv;
}
/*
* Four stage 4 varieties:
* do_request(), do_convert(), do_unlock(), do_cancel()
* These are called on the master node for the given lock and
* from the central locking logic.
*/
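/* Putting the stages together, the call flow for a new request is
   (a sketch inferred from the functions in this file):

	dlm_lock()		stage 1, api entry
	  request_lock()	stage 2, find and lock the rsb
	    _request_lock()	stage 3, resolve the master nodeid
	      do_request()	stage 4, run locally if we are master, or
	      send_request()	forwarded to the remote master, where
				receive_request() calls do_request()
*/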
static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error = 0;
if (can_be_granted(r, lkb, TRUE)) {
grant_lock(r, lkb);
queue_cast(r, lkb, 0);
goto out;
}
if (can_be_queued(lkb)) {
error = -EINPROGRESS;
add_lkb(r, lkb, DLM_LKSTS_WAITING);
send_blocking_asts(r, lkb);
goto out;
}
error = -EAGAIN;
if (force_blocking_asts(lkb))
send_blocking_asts_all(r, lkb);
queue_cast(r, lkb, -EAGAIN);
out:
return error;
}
static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error = 0;
/* changing an existing lock may allow others to be granted */
if (can_be_granted(r, lkb, TRUE)) {
grant_lock(r, lkb);
queue_cast(r, lkb, 0);
grant_pending_locks(r);
goto out;
}
if (can_be_queued(lkb)) {
if (is_demoted(lkb))
grant_pending_locks(r);
error = -EINPROGRESS;
del_lkb(r, lkb);
add_lkb(r, lkb, DLM_LKSTS_CONVERT);
send_blocking_asts(r, lkb);
goto out;
}
error = -EAGAIN;
if (force_blocking_asts(lkb))
send_blocking_asts_all(r, lkb);
queue_cast(r, lkb, -EAGAIN);
out:
return error;
}
static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
remove_lock(r, lkb);
queue_cast(r, lkb, -DLM_EUNLOCK);
grant_pending_locks(r);
return -DLM_EUNLOCK;
}
static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
revert_lock(r, lkb);
queue_cast(r, lkb, -DLM_ECANCEL);
grant_pending_locks(r);
return -DLM_ECANCEL;
}
/*
* Four stage 3 varieties:
* _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
*/
/* add a new lkb to a possibly new rsb, called by requesting process */
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
/* set_master: sets lkb nodeid from r */
error = set_master(r, lkb);
if (error < 0)
goto out;
if (error) {
error = 0;
goto out;
}
if (is_remote(r))
/* receive_request() calls do_request() on remote node */
error = send_request(r, lkb);
else
error = do_request(r, lkb);
out:
return error;
}
/* change some property of an existing lkb, e.g. mode, range */
static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
if (is_remote(r))
/* receive_convert() calls do_convert() on remote node */
error = send_convert(r, lkb);
else
error = do_convert(r, lkb);
return error;
}
/* remove an existing lkb from the granted queue */
static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
if (is_remote(r))
/* receive_unlock() calls do_unlock() on remote node */
error = send_unlock(r, lkb);
else
error = do_unlock(r, lkb);
return error;
}
/* remove an existing lkb from the convert or wait queue */
static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
if (is_remote(r))
/* receive_cancel() calls do_cancel() on remote node */
error = send_cancel(r, lkb);
else
error = do_cancel(r, lkb);
return error;
}
/*
* Four stage 2 varieties:
* request_lock(), convert_lock(), unlock_lock(), cancel_lock()
*/
static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
int len, struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
error = validate_lock_args(ls, lkb, args);
if (error)
goto out;
error = find_rsb(ls, name, len, R_CREATE, &r);
if (error)
goto out;
lock_rsb(r);
attach_lkb(r, lkb);
lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
error = _request_lock(r, lkb);
unlock_rsb(r);
put_rsb(r);
out:
return error;
}
static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
error = validate_lock_args(ls, lkb, args);
if (error)
goto out;
error = _convert_lock(r, lkb);
out:
unlock_rsb(r);
put_rsb(r);
return error;
}
static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
error = validate_unlock_args(lkb, args);
if (error)
goto out;
error = _unlock_lock(r, lkb);
out:
unlock_rsb(r);
put_rsb(r);
return error;
}
static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
error = validate_unlock_args(lkb, args);
if (error)
goto out;
error = _cancel_lock(r, lkb);
out:
unlock_rsb(r);
put_rsb(r);
return error;
}
/*
* Two stage 1 varieties: dlm_lock() and dlm_unlock()
*/
int dlm_lock(dlm_lockspace_t *lockspace,
int mode,
struct dlm_lksb *lksb,
uint32_t flags,
void *name,
unsigned int namelen,
uint32_t parent_lkid,
void (*ast) (void *astarg),
void *astarg,
void (*bast) (void *astarg, int mode),
struct dlm_range *range)
{
struct dlm_ls *ls;
struct dlm_lkb *lkb;
struct dlm_args args;
int error, convert = flags & DLM_LKF_CONVERT;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
lock_recovery(ls);
if (convert)
error = find_lkb(ls, lksb->sb_lkid, &lkb);
else
error = create_lkb(ls, &lkb);
if (error)
goto out;
error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
astarg, bast, range, &args);
if (error)
goto out_put;
if (convert)
error = convert_lock(ls, lkb, &args);
else
error = request_lock(ls, lkb, name, namelen, &args);
if (error == -EINPROGRESS)
error = 0;
out_put:
if (convert || error)
put_lkb(lkb);
if (error == -EAGAIN)
error = 0;
out:
unlock_recovery(ls);
dlm_put_lockspace(ls);
return error;
}
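/* Illustrative (hypothetical) usage sketch -- my_lksb, my_ast and my_bast
   are example caller-supplied objects, not part of this file:

	struct dlm_lksb my_lksb;

	error = dlm_lock(ls, DLM_LOCK_EX, &my_lksb, 0, "myres", 5, 0,
			 my_ast, &my_lksb, my_bast, NULL);

   A zero return means the request was accepted; the completion ast
   (my_ast) later reports the final status in my_lksb.sb_status. */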
int dlm_unlock(dlm_lockspace_t *lockspace,
uint32_t lkid,
uint32_t flags,
struct dlm_lksb *lksb,
void *astarg)
{
struct dlm_ls *ls;
struct dlm_lkb *lkb;
struct dlm_args args;
int error;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
lock_recovery(ls);
error = find_lkb(ls, lkid, &lkb);
if (error)
goto out;
error = set_unlock_args(flags, astarg, &args);
if (error)
goto out_put;
if (flags & DLM_LKF_CANCEL)
error = cancel_lock(ls, lkb, &args);
else
error = unlock_lock(ls, lkb, &args);
if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
error = 0;
out_put:
put_lkb(lkb);
out:
unlock_recovery(ls);
dlm_put_lockspace(ls);
return error;
}
/*
* send/receive routines for remote operations and replies
*
* send_args
* send_common
* send_request receive_request
* send_convert receive_convert
* send_unlock receive_unlock
* send_cancel receive_cancel
* send_grant receive_grant
* send_bast receive_bast
* send_lookup receive_lookup
* send_remove receive_remove
*
* send_common_reply
* receive_request_reply send_request_reply
* receive_convert_reply send_convert_reply
* receive_unlock_reply send_unlock_reply
* receive_cancel_reply send_cancel_reply
* receive_lookup_reply send_lookup_reply
*/
static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
struct dlm_mhandle **mh_ret)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
char *mb;
int mb_len = sizeof(struct dlm_message);
switch (mstype) {
case DLM_MSG_REQUEST:
case DLM_MSG_LOOKUP:
case DLM_MSG_REMOVE:
mb_len += r->res_length;
break;
case DLM_MSG_CONVERT:
case DLM_MSG_UNLOCK:
case DLM_MSG_REQUEST_REPLY:
case DLM_MSG_CONVERT_REPLY:
case DLM_MSG_GRANT:
if (lkb && lkb->lkb_lvbptr)
mb_len += r->res_ls->ls_lvblen;
break;
}
/* get_buffer gives us a message handle (mh) that we need to
pass into lowcomms_commit and a message buffer (mb) that we
write our data into */
mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
if (!mh)
return -ENOBUFS;
memset(mb, 0, mb_len);
ms = (struct dlm_message *) mb;
ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
ms->m_header.h_lockspace = r->res_ls->ls_global_id;
ms->m_header.h_nodeid = dlm_our_nodeid();
ms->m_header.h_length = mb_len;
ms->m_header.h_cmd = DLM_MSG;
ms->m_type = mstype;
*mh_ret = mh;
*ms_ret = ms;
return 0;
}
/* further lowcomms enhancements or alternate implementations may make
the return value from this function useful at some point */
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
dlm_message_out(ms);
dlm_lowcomms_commit_buffer(mh);
return 0;
}
static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
ms->m_nodeid = lkb->lkb_nodeid;
ms->m_pid = lkb->lkb_ownpid;
ms->m_lkid = lkb->lkb_id;
ms->m_remid = lkb->lkb_remid;
ms->m_exflags = lkb->lkb_exflags;
ms->m_sbflags = lkb->lkb_sbflags;
ms->m_flags = lkb->lkb_flags;
ms->m_lvbseq = lkb->lkb_lvbseq;
ms->m_status = lkb->lkb_status;
ms->m_grmode = lkb->lkb_grmode;
ms->m_rqmode = lkb->lkb_rqmode;
ms->m_hash = r->res_hash;
/* m_result and m_bastmode are set from function args,
not from lkb fields */
if (lkb->lkb_bastaddr)
ms->m_asts |= AST_BAST;
if (lkb->lkb_astaddr)
ms->m_asts |= AST_COMP;
if (lkb->lkb_range) {
ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
}
if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
memcpy(ms->m_extra, r->res_name, r->res_length);
else if (lkb->lkb_lvbptr)
memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
}
static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
add_to_waiters(lkb, mstype);
to_nodeid = r->res_nodeid;
error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
if (error)
goto fail;
send_args(r, lkb, ms);
error = send_message(mh, ms);
if (error)
goto fail;
return 0;
fail:
remove_from_waiters(lkb);
return error;
}
static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
return send_common(r, lkb, DLM_MSG_REQUEST);
}
static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
int error;
error = send_common(r, lkb, DLM_MSG_CONVERT);
/* down conversions go without a reply from the master */
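	/* (a conversion to a less restrictive mode can always be granted
	   immediately, so the master's answer is known in advance; the
	   stub message below fakes the successful convert reply locally) */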
if (!error && down_conversion(lkb)) {
remove_from_waiters(lkb);
r->res_ls->ls_stub_ms.m_result = 0;
__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
}
return error;
}
/* FIXME: if this lkb is the only lock we hold on the rsb, then set
MASTER_UNCERTAIN to force the next request on the rsb to confirm
that the master is still correct. */
static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
return send_common(r, lkb, DLM_MSG_UNLOCK);
}
static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
return send_common(r, lkb, DLM_MSG_CANCEL);
}
static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
to_nodeid = lkb->lkb_nodeid;
error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
if (error)
goto out;
send_args(r, lkb, ms);
ms->m_result = 0;
error = send_message(mh, ms);
out:
return error;
}
static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
to_nodeid = lkb->lkb_nodeid;
error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
if (error)
goto out;
send_args(r, lkb, ms);
ms->m_bastmode = mode;
error = send_message(mh, ms);
out:
return error;
}
static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
add_to_waiters(lkb, DLM_MSG_LOOKUP);
to_nodeid = dlm_dir_nodeid(r);
error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
if (error)
goto fail;
send_args(r, lkb, ms);
error = send_message(mh, ms);
if (error)
goto fail;
return 0;
fail:
remove_from_waiters(lkb);
return error;
}
static int send_remove(struct dlm_rsb *r)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
to_nodeid = dlm_dir_nodeid(r);
error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
if (error)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
ms->m_hash = r->res_hash;
error = send_message(mh, ms);
out:
return error;
}
static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
int mstype, int rv)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
int to_nodeid, error;
to_nodeid = lkb->lkb_nodeid;
error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
if (error)
goto out;
send_args(r, lkb, ms);
ms->m_result = rv;
error = send_message(mh, ms);
out:
return error;
}
static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
}
static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
}
static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
}
static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
}
static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
int ret_nodeid, int rv)
{
struct dlm_rsb *r = &ls->ls_stub_rsb;
struct dlm_message *ms;
struct dlm_mhandle *mh;
int error, nodeid = ms_in->m_header.h_nodeid;
error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
if (error)
goto out;
ms->m_lkid = ms_in->m_lkid;
ms->m_result = rv;
ms->m_nodeid = ret_nodeid;
error = send_message(mh, ms);
out:
return error;
}
/* which args we save from a received message depends heavily on the type
of message, unlike the send side where we can safely send everything about
the lkb for any type of message */
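/* Note on the masking below: only the low 16 bits of lkb_flags travel
   in a message; the high 16 bits are node-local (e.g. DLM_IFL_MSTCPY,
   judging from the same masking in receive_rcom_lock_args) and are
   preserved across the update. */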
static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
{
lkb->lkb_exflags = ms->m_exflags;
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
(ms->m_flags & 0x0000FFFF);
}
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
lkb->lkb_sbflags = ms->m_sbflags;
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
(ms->m_flags & 0x0000FFFF);
}
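/* length of the variable-size data (resource name or lvb) appended
   after the fixed-size dlm_message struct */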
static int receive_extralen(struct dlm_message *ms)
{
return (ms->m_header.h_length - sizeof(struct dlm_message));
}
static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
if (lkb->lkb_flags & DLM_IFL_RANGE) {
if (!lkb->lkb_range)
lkb->lkb_range = allocate_range(ls);
if (!lkb->lkb_range)
return -ENOMEM;
lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
}
return 0;
}
static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
int len;
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
if (!lkb->lkb_lvbptr)
lkb->lkb_lvbptr = allocate_lvb(ls);
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
}
return 0;
}
static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
lkb->lkb_nodeid = ms->m_header.h_nodeid;
lkb->lkb_ownpid = ms->m_pid;
lkb->lkb_remid = ms->m_lkid;
lkb->lkb_grmode = DLM_LOCK_IV;
lkb->lkb_rqmode = ms->m_rqmode;
lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
if (receive_range(ls, lkb, ms))
return -ENOMEM;
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
return 0;
}
static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
log_error(ls, "convert_args nodeid %d %d lkid %x %x",
lkb->lkb_nodeid, ms->m_header.h_nodeid,
lkb->lkb_id, lkb->lkb_remid);
return -EINVAL;
}
if (!is_master_copy(lkb))
return -EINVAL;
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
return -EBUSY;
if (receive_range(ls, lkb, ms))
return -ENOMEM;
if (lkb->lkb_range) {
lkb->lkb_range[GR_RANGE_START] = 0LL;
lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
}
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
lkb->lkb_rqmode = ms->m_rqmode;
lkb->lkb_lvbseq = ms->m_lvbseq;
return 0;
}
static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
if (!is_master_copy(lkb))
return -EINVAL;
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
return 0;
}
/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
uses to send a reply and that the remote end uses to process the reply. */
static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb = &ls->ls_stub_lkb;
lkb->lkb_nodeid = ms->m_header.h_nodeid;
lkb->lkb_remid = ms->m_lkid;
}
static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, namelen;
error = create_lkb(ls, &lkb);
if (error)
goto fail;
receive_flags(lkb, ms);
lkb->lkb_flags |= DLM_IFL_MSTCPY;
error = receive_request_args(ls, lkb, ms);
if (error) {
put_lkb(lkb);
goto fail;
}
namelen = receive_extralen(ms);
error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
if (error) {
put_lkb(lkb);
goto fail;
}
lock_rsb(r);
attach_lkb(r, lkb);
error = do_request(r, lkb);
send_request_reply(r, lkb, error);
unlock_rsb(r);
put_rsb(r);
if (error == -EINPROGRESS)
error = 0;
if (error)
put_lkb(lkb);
return;
fail:
setup_stub_lkb(ls, ms);
send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, reply = TRUE;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error)
goto fail;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
receive_flags(lkb, ms);
error = receive_convert_args(ls, lkb, ms);
if (error)
goto out;
reply = !down_conversion(lkb);
error = do_convert(r, lkb);
out:
if (reply)
send_convert_reply(r, lkb, error);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
return;
fail:
setup_stub_lkb(ls, ms);
send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error)
goto fail;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
receive_flags(lkb, ms);
error = receive_unlock_args(ls, lkb, ms);
if (error)
goto out;
error = do_unlock(r, lkb);
out:
send_unlock_reply(r, lkb, error);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
return;
fail:
setup_stub_lkb(ls, ms);
send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error)
goto fail;
receive_flags(lkb, ms);
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
error = do_cancel(r, lkb);
send_cancel_reply(r, lkb, error);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
return;
fail:
setup_stub_lkb(ls, ms);
send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_grant no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
receive_flags_reply(lkb, ms);
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
}
static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_bast no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
queue_bast(r, lkb, ms->m_bastmode);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
}
static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
from_nodeid = ms->m_header.h_nodeid;
our_nodeid = dlm_our_nodeid();
len = receive_extralen(ms);
dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
if (dir_nodeid != our_nodeid) {
log_error(ls, "lookup dir_nodeid %d from %d",
dir_nodeid, from_nodeid);
error = -EINVAL;
ret_nodeid = -1;
goto out;
}
error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
/* Optimization: we're master so treat lookup as a request */
if (!error && ret_nodeid == our_nodeid) {
receive_request(ls, ms);
return;
}
out:
send_lookup_reply(ls, ms, ret_nodeid, error);
}
static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
{
int len, dir_nodeid, from_nodeid;
from_nodeid = ms->m_header.h_nodeid;
len = receive_extralen(ms);
dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
if (dir_nodeid != dlm_our_nodeid()) {
log_error(ls, "remove dir entry dir_nodeid %d from %d",
dir_nodeid, from_nodeid);
return;
}
dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
}
static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, mstype;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_request_reply no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
mstype = lkb->lkb_wait_type;
error = remove_from_waiters(lkb);
if (error) {
log_error(ls, "receive_request_reply not on waiters");
goto out;
}
/* this is the value returned from do_request() on the master */
error = ms->m_result;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
/* Optimization: the dir node was also the master, so it took our
lookup as a request and sent request reply instead of lookup reply */
if (mstype == DLM_MSG_LOOKUP) {
r->res_nodeid = ms->m_header.h_nodeid;
lkb->lkb_nodeid = r->res_nodeid;
}
switch (error) {
case -EAGAIN:
/* request would block (be queued) on remote master;
the unhold undoes the original ref from create_lkb()
so it leads to the lkb being freed */
queue_cast(r, lkb, -EAGAIN);
confirm_master(r, -EAGAIN);
unhold_lkb(lkb);
break;
case -EINPROGRESS:
case 0:
/* request was queued or granted on remote master */
receive_flags_reply(lkb, ms);
lkb->lkb_remid = ms->m_lkid;
if (error)
add_lkb(r, lkb, DLM_LKSTS_WAITING);
else {
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
}
confirm_master(r, error);
break;
case -ENOENT:
case -ENOTBLK:
/* find_rsb failed to find rsb or rsb wasn't master;
   resetting nodeid to -1 makes _request_lock() redo
   the master lookup */
r->res_nodeid = -1;
lkb->lkb_nodeid = -1;
_request_lock(r, lkb);
break;
default:
log_error(ls, "receive_request_reply error %d", error);
}
unlock_rsb(r);
put_rsb(r);
out:
put_lkb(lkb);
}
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
int error = ms->m_result;
/* this is the value returned from do_convert() on the master */
switch (error) {
case -EAGAIN:
/* convert would block (be queued) on remote master */
queue_cast(r, lkb, -EAGAIN);
break;
case -EINPROGRESS:
/* convert was queued on remote master */
del_lkb(r, lkb);
add_lkb(r, lkb, DLM_LKSTS_CONVERT);
break;
case 0:
/* convert was granted on remote master */
receive_flags_reply(lkb, ms);
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
break;
default:
log_error(r->res_ls, "receive_convert_reply error %d", error);
}
}
static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
struct dlm_rsb *r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
__receive_convert_reply(r, lkb, ms);
unlock_rsb(r);
put_rsb(r);
}
static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_convert_reply no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
error = remove_from_waiters(lkb);
if (error) {
log_error(ls, "receive_convert_reply not on waiters");
goto out;
}
_receive_convert_reply(lkb, ms);
out:
put_lkb(lkb);
}
static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
struct dlm_rsb *r = lkb->lkb_resource;
int error = ms->m_result;
hold_rsb(r);
lock_rsb(r);
/* this is the value returned from do_unlock() on the master */
switch (error) {
case -DLM_EUNLOCK:
receive_flags_reply(lkb, ms);
remove_lock_pc(r, lkb);
queue_cast(r, lkb, -DLM_EUNLOCK);
break;
default:
log_error(r->res_ls, "receive_unlock_reply error %d", error);
}
unlock_rsb(r);
put_rsb(r);
}
static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_unlock_reply no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
error = remove_from_waiters(lkb);
if (error) {
log_error(ls, "receive_unlock_reply not on waiters");
goto out;
}
_receive_unlock_reply(lkb, ms);
out:
put_lkb(lkb);
}
static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
struct dlm_rsb *r = lkb->lkb_resource;
int error = ms->m_result;
hold_rsb(r);
lock_rsb(r);
/* this is the value returned from do_cancel() on the master */
switch (error) {
case -DLM_ECANCEL:
receive_flags_reply(lkb, ms);
revert_lock_pc(r, lkb);
queue_cast(r, lkb, -DLM_ECANCEL);
break;
default:
log_error(r->res_ls, "receive_cancel_reply error %d", error);
}
unlock_rsb(r);
put_rsb(r);
}
static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, ms->m_remid, &lkb);
if (error) {
log_error(ls, "receive_cancel_reply no lkb");
return;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
error = remove_from_waiters(lkb);
if (error) {
log_error(ls, "receive_cancel_reply not on waiters");
goto out;
}
_receive_cancel_reply(lkb, ms);
out:
put_lkb(lkb);
}
static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, ret_nodeid;
error = find_lkb(ls, ms->m_lkid, &lkb);
if (error) {
log_error(ls, "receive_lookup_reply no lkb");
return;
}
error = remove_from_waiters(lkb);
if (error) {
log_error(ls, "receive_lookup_reply not on waiters");
goto out;
}
/* this is the value returned by dlm_dir_lookup on dir node
FIXME: will a non-zero error ever be returned? */
error = ms->m_result;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
ret_nodeid = ms->m_nodeid;
if (ret_nodeid == dlm_our_nodeid()) {
r->res_nodeid = 0;
ret_nodeid = 0;
r->res_first_lkid = 0;
} else {
/* set_master() will copy res_nodeid to lkb_nodeid */
r->res_nodeid = ret_nodeid;
}
_request_lock(r, lkb);
if (!ret_nodeid)
process_lookup_list(r);
unlock_rsb(r);
put_rsb(r);
out:
put_lkb(lkb);
}
int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
{
struct dlm_message *ms = (struct dlm_message *) hd;
struct dlm_ls *ls;
int error;
if (!recovery)
dlm_message_in(ms);
ls = dlm_find_lockspace_global(hd->h_lockspace);
if (!ls) {
log_print("drop message %d from %d for unknown lockspace %d",
ms->m_type, nodeid, hd->h_lockspace);
return -EINVAL;
}
/* recovery may have just ended leaving a bunch of backed-up requests
in the requestqueue; wait while dlm_recoverd clears them */
if (!recovery)
dlm_wait_requestqueue(ls);
/* recovery may have just started while there were a bunch of
in-flight requests -- save them in requestqueue to be processed
after recovery. we can't let dlm_recvd block on the recovery
lock. if dlm_recoverd is calling this function to clear the
requestqueue, it needs to be interrupted (-EINTR) if another
recovery operation is starting. */
while (1) {
if (dlm_locking_stopped(ls)) {
if (!recovery)
dlm_add_requestqueue(ls, nodeid, hd);
error = -EINTR;
goto out;
}
if (lock_recovery_try(ls))
break;
schedule();
}
switch (ms->m_type) {
/* messages sent to a master node */
case DLM_MSG_REQUEST:
receive_request(ls, ms);
break;
case DLM_MSG_CONVERT:
receive_convert(ls, ms);
break;
case DLM_MSG_UNLOCK:
receive_unlock(ls, ms);
break;
case DLM_MSG_CANCEL:
receive_cancel(ls, ms);
break;
/* messages sent from a master node (replies to above) */
case DLM_MSG_REQUEST_REPLY:
receive_request_reply(ls, ms);
break;
case DLM_MSG_CONVERT_REPLY:
receive_convert_reply(ls, ms);
break;
case DLM_MSG_UNLOCK_REPLY:
receive_unlock_reply(ls, ms);
break;
case DLM_MSG_CANCEL_REPLY:
receive_cancel_reply(ls, ms);
break;
/* messages sent from a master node (only two types of async msg) */
case DLM_MSG_GRANT:
receive_grant(ls, ms);
break;
case DLM_MSG_BAST:
receive_bast(ls, ms);
break;
/* messages sent to a dir node */
case DLM_MSG_LOOKUP:
receive_lookup(ls, ms);
break;
case DLM_MSG_REMOVE:
receive_remove(ls, ms);
break;
/* messages sent from a dir node (remove has no reply) */
case DLM_MSG_LOOKUP_REPLY:
receive_lookup_reply(ls, ms);
break;
default:
log_error(ls, "unknown message type %d", ms->m_type);
}
unlock_recovery(ls);
out:
dlm_put_lockspace(ls);
dlm_astd_wake();
return 0;
}
/*
* Recovery related
*/
static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
if (middle_conversion(lkb)) {
hold_lkb(lkb);
ls->ls_stub_ms.m_result = -EINPROGRESS;
_remove_from_waiters(lkb);
_receive_convert_reply(lkb, &ls->ls_stub_ms);
/* Same special case as in receive_rcom_lock_args() */
lkb->lkb_grmode = DLM_LOCK_IV;
rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
unhold_lkb(lkb);
} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
lkb->lkb_flags |= DLM_IFL_RESEND;
}
/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
conversions are async; there's no reply from the remote master */
}
/* A waiting lkb needs recovery if the master node has failed, or
the master node is changing (only when no directory is used) */
static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
if (dlm_is_removed(ls, lkb->lkb_nodeid))
return 1;
if (!dlm_no_directory(ls))
return 0;
if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
return 1;
return 0;
}
/* Recovery for locks that are waiting for replies from nodes that are now
gone. We can just complete unlocks and cancels by faking a reply from the
dead node. Requests and up-conversions we flag to be resent after
recovery. Down-conversions can just be completed with a fake reply like
unlocks. Conversions between PR and CW need special attention. */
void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
down(&ls->ls_waiters_sem);
list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
/* all outstanding lookups, regardless of destination,
   will be resent after recovery is done */
if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
lkb->lkb_flags |= DLM_IFL_RESEND;
continue;
}
if (!waiter_needs_recovery(ls, lkb))
continue;
switch (lkb->lkb_wait_type) {
case DLM_MSG_REQUEST:
lkb->lkb_flags |= DLM_IFL_RESEND;
break;
case DLM_MSG_CONVERT:
recover_convert_waiter(ls, lkb);
break;
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
_remove_from_waiters(lkb);
_receive_unlock_reply(lkb, &ls->ls_stub_ms);
put_lkb(lkb);
break;
case DLM_MSG_CANCEL:
hold_lkb(lkb);
ls->ls_stub_ms.m_result = -DLM_ECANCEL;
_remove_from_waiters(lkb);
_receive_cancel_reply(lkb, &ls->ls_stub_ms);
put_lkb(lkb);
break;
default:
log_error(ls, "invalid lkb wait_type %d",
lkb->lkb_wait_type);
}
}
up(&ls->ls_waiters_sem);
}
static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
int rv = 0;
down(&ls->ls_waiters_sem);
list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
if (lkb->lkb_flags & DLM_IFL_RESEND) {
rv = lkb->lkb_wait_type;
_remove_from_waiters(lkb);
lkb->lkb_flags &= ~DLM_IFL_RESEND;
break;
}
}
up(&ls->ls_waiters_sem);
if (!rv)
lkb = NULL;
*lkb_ret = lkb;
return rv;
}
/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
master or dir-node for r. Processing the lkb may result in it being placed
back on waiters. */
int dlm_recover_waiters_post(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error = 0, mstype;
while (1) {
if (dlm_locking_stopped(ls)) {
log_debug(ls, "recover_waiters_post aborted");
error = -EINTR;
break;
}
mstype = remove_resend_waiter(ls, &lkb);
if (!mstype)
break;
r = lkb->lkb_resource;
log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
switch (mstype) {
case DLM_MSG_LOOKUP:
hold_rsb(r);
lock_rsb(r);
_request_lock(r, lkb);
if (is_master(r))
confirm_master(r, 0);
unlock_rsb(r);
put_rsb(r);
break;
case DLM_MSG_REQUEST:
hold_rsb(r);
lock_rsb(r);
_request_lock(r, lkb);
unlock_rsb(r);
put_rsb(r);
break;
case DLM_MSG_CONVERT:
hold_rsb(r);
lock_rsb(r);
_convert_lock(r, lkb);
unlock_rsb(r);
put_rsb(r);
break;
default:
log_error(ls, "recover_waiters_post type %d", mstype);
}
}
return error;
}
static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
{
struct dlm_ls *ls = r->res_ls;
struct dlm_lkb *lkb, *safe;
list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
if (test(ls, lkb)) {
del_lkb(r, lkb);
/* this put should free the lkb */
if (!put_lkb(lkb))
log_error(ls, "purged lkb not released");
}
}
}
static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
}
static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
return is_master_copy(lkb);
}
static void purge_dead_locks(struct dlm_rsb *r)
{
purge_queue(r, &r->res_grantqueue, &purge_dead_test);
purge_queue(r, &r->res_convertqueue, &purge_dead_test);
purge_queue(r, &r->res_waitqueue, &purge_dead_test);
}
void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
{
purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
}
/* Get rid of locks held by nodes that are gone. */
int dlm_purge_locks(struct dlm_ls *ls)
{
struct dlm_rsb *r;
log_debug(ls, "dlm_purge_locks");
down_write(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
hold_rsb(r);
lock_rsb(r);
if (is_master(r))
purge_dead_locks(r);
unlock_rsb(r);
unhold_rsb(r);
schedule();
}
up_write(&ls->ls_root_sem);
return 0;
}
int dlm_grant_after_purge(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int i;
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
read_lock(&ls->ls_rsbtbl[i].lock);
list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
hold_rsb(r);
lock_rsb(r);
if (is_master(r)) {
grant_pending_locks(r);
confirm_master(r, 0);
}
unlock_rsb(r);
put_rsb(r);
}
read_unlock(&ls->ls_rsbtbl[i].lock);
}
return 0;
}
static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
uint32_t remid)
{
struct dlm_lkb *lkb;
list_for_each_entry(lkb, head, lkb_statequeue) {
if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
return lkb;
}
return NULL;
}
static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
uint32_t remid)
{
struct dlm_lkb *lkb;
lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
if (lkb)
return lkb;
lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
if (lkb)
return lkb;
lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
if (lkb)
return lkb;
return NULL;
}
static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_rsb *r, struct dlm_rcom *rc)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
int lvblen;
lkb->lkb_nodeid = rc->rc_header.h_nodeid;
lkb->lkb_ownpid = rl->rl_ownpid;
lkb->lkb_remid = rl->rl_lkid;
lkb->lkb_exflags = rl->rl_exflags;
lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
lkb->lkb_flags |= DLM_IFL_MSTCPY;
lkb->lkb_lvbseq = rl->rl_lvbseq;
lkb->lkb_rqmode = rl->rl_rqmode;
lkb->lkb_grmode = rl->rl_grmode;
/* don't set lkb_status because add_lkb wants to set it itself */
lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
if (lkb->lkb_flags & DLM_IFL_RANGE) {
lkb->lkb_range = allocate_range(ls);
if (!lkb->lkb_range)
return -ENOMEM;
memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
}
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
lkb->lkb_lvbptr = allocate_lvb(ls);
if (!lkb->lkb_lvbptr)
return -ENOMEM;
lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
sizeof(struct rcom_lock);
memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
}
/* Conversions between PR and CW (middle modes) need special handling.
The real granted mode of these converting locks cannot be determined
until all locks have been rebuilt on the rsb (recover_conversion) */
if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
rl->rl_status = DLM_LKSTS_CONVERT;
lkb->lkb_grmode = DLM_LOCK_IV;
rsb_set_flag(r, RSB_RECOVER_CONVERT);
}
return 0;
}
/* This lkb may have been recovered in a previous aborted recovery so we need
to check if the rsb already has an lkb with the given remote nodeid/lkid.
If so we just send back a standard reply. If not, we create a new lkb with
the given values and send back our lkid. We send back our lkid by sending
back the rcom_lock struct we got but with the remid field filled in. */
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
struct dlm_lkb *lkb;
int error;
if (rl->rl_parent_lkid) {
error = -EOPNOTSUPP;
goto out;
}
error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
if (error)
goto out;
lock_rsb(r);
lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
if (lkb) {
error = -EEXIST;
goto out_remid;
}
error = create_lkb(ls, &lkb);
if (error)
goto out_unlock;
error = receive_rcom_lock_args(ls, lkb, r, rc);
if (error) {
put_lkb(lkb);
goto out_unlock;
}
attach_lkb(r, lkb);
add_lkb(r, lkb, rl->rl_status);
error = 0;
out_remid:
/* this is the new value returned to the lock holder for
saving in its process-copy lkb */
rl->rl_remid = lkb->lkb_id;
out_unlock:
unlock_rsb(r);
put_rsb(r);
out:
if (error)
log_print("recover_master_copy %d %x", error, rl->rl_lkid);
rl->rl_result = error;
return error;
}
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
struct dlm_lkb *lkb;
int error;
error = find_lkb(ls, rl->rl_lkid, &lkb);
if (error) {
log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
return error;
}
DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
error = rl->rl_result;
r = lkb->lkb_resource;
hold_rsb(r);
lock_rsb(r);
switch (error) {
case -EEXIST:
log_debug(ls, "master copy exists %x", lkb->lkb_id);
/* fall through */
case 0:
lkb->lkb_remid = rl->rl_remid;
break;
default:
log_error(ls, "dlm_recover_process_copy unknown error %d %x",
error, lkb->lkb_id);
}
/* an ack for dlm_recover_locks() which waits for replies from
all the locks it sends to new masters */
dlm_recovered_lock(r);
unlock_rsb(r);
put_rsb(r);
put_lkb(lkb);
return 0;
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __LOCK_DOT_H__
#define __LOCK_DOT_H__
void dlm_print_rsb(struct dlm_rsb *r);
int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
int dlm_modes_compat(int mode1, int mode2);
int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
unsigned int flags, struct dlm_rsb **r_ret);
void dlm_put_rsb(struct dlm_rsb *r);
void dlm_hold_rsb(struct dlm_rsb *r);
int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_purge_locks(struct dlm_ls *ls);
void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
int dlm_grant_after_purge(struct dlm_ls *ls);
int dlm_recover_waiters_post(struct dlm_ls *ls);
void dlm_recover_waiters_pre(struct dlm_ls *ls);
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
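/* res_nodeid convention (see set_master() in lock.c): 0 means this node
   is the master, a positive value is the master's nodeid, and -1 means
   the master is not yet known. */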
static inline int is_master(struct dlm_rsb *r)
{
return !r->res_nodeid;
}
static inline void lock_rsb(struct dlm_rsb *r)
{
down(&r->res_sem);
}
static inline void unlock_rsb(struct dlm_rsb *r)
{
up(&r->res_sem);
}
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "recoverd.h"
#include "ast.h"
#include "dir.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
#ifdef CONFIG_DLM_DEBUG
int dlm_create_debug_file(struct dlm_ls *ls);
void dlm_delete_debug_file(struct dlm_ls *ls);
#else
static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
#endif
static int ls_count;
static struct semaphore ls_lock;
static struct list_head lslist;
static spinlock_t lslist_lock;
static struct task_struct * scand_task;
static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
{
ssize_t ret = len;
int n = simple_strtol(buf, NULL, 0);
switch (n) {
case 0:
dlm_ls_stop(ls);
break;
case 1:
dlm_ls_start(ls);
break;
default:
ret = -EINVAL;
}
return ret;
}
static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
{
ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
wake_up(&ls->ls_uevent_wait);
return len;
}
static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
{
return sprintf(buf, "%u\n", ls->ls_global_id);
}
static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
{
ls->ls_global_id = simple_strtoul(buf, NULL, 0);
return len;
}
struct dlm_attr {
struct attribute attr;
ssize_t (*show)(struct dlm_ls *, char *);
ssize_t (*store)(struct dlm_ls *, const char *, size_t);
};
static struct dlm_attr dlm_attr_control = {
.attr = {.name = "control", .mode = S_IWUSR},
.store = dlm_control_store
};
static struct dlm_attr dlm_attr_event = {
.attr = {.name = "event_done", .mode = S_IWUSR},
.store = dlm_event_store
};
static struct dlm_attr dlm_attr_id = {
.attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
.show = dlm_id_show,
.store = dlm_id_store
};
static struct attribute *dlm_attrs[] = {
&dlm_attr_control.attr,
&dlm_attr_event.attr,
&dlm_attr_id.attr,
NULL,
};
static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
return a->show ? a->show(ls, buf) : 0;
}
static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t len)
{
struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
return a->store ? a->store(ls, buf, len) : len;
}
static struct sysfs_ops dlm_attr_ops = {
.show = dlm_attr_show,
.store = dlm_attr_store,
};
static struct kobj_type dlm_ktype = {
.default_attrs = dlm_attrs,
.sysfs_ops = &dlm_attr_ops,
};
static struct kset dlm_kset = {
.subsys = &kernel_subsys,
.kobj = {.name = "dlm",},
.ktype = &dlm_ktype,
};
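/* With dlm_kset registered under kernel_subsys, each lockspace kobject
   should show up as /sys/kernel/dlm/<lockspace name>/ with the attribute
   files defined above, e.g. (illustrative, assuming a lockspace "alpha"):

	echo 0 > /sys/kernel/dlm/alpha/control		stop (dlm_ls_stop)
	echo 1 > /sys/kernel/dlm/alpha/control		start (dlm_ls_start)
	cat /sys/kernel/dlm/alpha/id			show the global id
*/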
static int kobject_setup(struct dlm_ls *ls)
{
char lsname[DLM_LOCKSPACE_LEN];
int error;
memset(lsname, 0, DLM_LOCKSPACE_LEN);
snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
if (error)
return error;
ls->ls_kobj.kset = &dlm_kset;
ls->ls_kobj.ktype = &dlm_ktype;
return 0;
}
static int do_uevent(struct dlm_ls *ls, int in)
{
int error;
if (in)
kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
else
kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
error = wait_event_interruptible(ls->ls_uevent_wait,
test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
if (error)
goto out;
error = ls->ls_uevent_result;
out:
return error;
}
int dlm_lockspace_init(void)
{
int error;
ls_count = 0;
init_MUTEX(&ls_lock);
INIT_LIST_HEAD(&lslist);
spin_lock_init(&lslist_lock);
error = kset_register(&dlm_kset);
if (error)
printk("dlm_lockspace_init: cannot register kset %d\n", error);
return error;
}
void dlm_lockspace_exit(void)
{
kset_unregister(&dlm_kset);
}
static int dlm_scand(void *data)
{
struct dlm_ls *ls;
while (!kthread_should_stop()) {
list_for_each_entry(ls, &lslist, ls_list)
dlm_scan_rsbs(ls);
schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
}
return 0;
}
static int dlm_scand_start(void)
{
struct task_struct *p;
int error = 0;
p = kthread_run(dlm_scand, NULL, "dlm_scand");
if (IS_ERR(p))
error = PTR_ERR(p);
else
scand_task = p;
return error;
}
static void dlm_scand_stop(void)
{
kthread_stop(scand_task);
}
static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
{
struct dlm_ls *ls;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_namelen == namelen &&
memcmp(ls->ls_name, name, namelen) == 0)
goto out;
}
ls = NULL;
out:
spin_unlock(&lslist_lock);
return ls;
}
struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
{
struct dlm_ls *ls;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_global_id == id) {
ls->ls_count++;
goto out;
}
}
ls = NULL;
out:
spin_unlock(&lslist_lock);
return ls;
}
struct dlm_ls *dlm_find_lockspace_local(void *id)
{
struct dlm_ls *ls = id;
spin_lock(&lslist_lock);
ls->ls_count++;
spin_unlock(&lslist_lock);
return ls;
}
void dlm_put_lockspace(struct dlm_ls *ls)
{
spin_lock(&lslist_lock);
ls->ls_count--;
spin_unlock(&lslist_lock);
}
static void remove_lockspace(struct dlm_ls *ls)
{
for (;;) {
spin_lock(&lslist_lock);
if (ls->ls_count == 0) {
list_del(&ls->ls_list);
spin_unlock(&lslist_lock);
return;
}
spin_unlock(&lslist_lock);
ssleep(1);
}
}
static int threads_start(void)
{
int error;
/* Thread which processes lock requests for all lockspaces */
error = dlm_astd_start();
if (error) {
log_print("cannot start dlm_astd thread %d", error);
goto fail;
}
error = dlm_scand_start();
if (error) {
log_print("cannot start dlm_scand thread %d", error);
goto astd_fail;
}
/* Thread for sending/receiving messages for all lockspaces */
error = dlm_lowcomms_start();
if (error) {
log_print("cannot start dlm lowcomms %d", error);
goto scand_fail;
}
return 0;
scand_fail:
dlm_scand_stop();
astd_fail:
dlm_astd_stop();
fail:
return error;
}
static void threads_stop(void)
{
dlm_scand_stop();
dlm_lowcomms_stop();
dlm_astd_stop();
}
static int new_lockspace(char *name, int namelen, void **lockspace,
uint32_t flags, int lvblen)
{
struct dlm_ls *ls;
int i, size, error = -ENOMEM;
if (namelen > DLM_LOCKSPACE_LEN)
return -EINVAL;
if (!lvblen || (lvblen % 8))
return -EINVAL;
if (!try_module_get(THIS_MODULE))
return -EINVAL;
ls = dlm_find_lockspace_name(name, namelen);
if (ls) {
*lockspace = ls;
module_put(THIS_MODULE);
return -EEXIST;
}
ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
if (!ls)
goto out;
memset(ls, 0, sizeof(struct dlm_ls) + namelen);
memcpy(ls->ls_name, name, namelen);
ls->ls_namelen = namelen;
ls->ls_exflags = flags;
ls->ls_lvblen = lvblen;
ls->ls_count = 0;
ls->ls_flags = 0;
size = dlm_config.rsbtbl_size;
ls->ls_rsbtbl_size = size;
ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
if (!ls->ls_rsbtbl)
goto out_lsfree;
for (i = 0; i < size; i++) {
INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
rwlock_init(&ls->ls_rsbtbl[i].lock);
}
size = dlm_config.lkbtbl_size;
ls->ls_lkbtbl_size = size;
ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
if (!ls->ls_lkbtbl)
goto out_rsbfree;
for (i = 0; i < size; i++) {
INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
rwlock_init(&ls->ls_lkbtbl[i].lock);
ls->ls_lkbtbl[i].counter = 1;
}
size = dlm_config.dirtbl_size;
ls->ls_dirtbl_size = size;
ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
if (!ls->ls_dirtbl)
goto out_lkbfree;
for (i = 0; i < size; i++) {
INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
rwlock_init(&ls->ls_dirtbl[i].lock);
}
INIT_LIST_HEAD(&ls->ls_waiters);
init_MUTEX(&ls->ls_waiters_sem);
INIT_LIST_HEAD(&ls->ls_nodes);
INIT_LIST_HEAD(&ls->ls_nodes_gone);
ls->ls_num_nodes = 0;
ls->ls_low_nodeid = 0;
ls->ls_total_weight = 0;
ls->ls_node_array = NULL;
memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
ls->ls_stub_rsb.res_ls = ls;
ls->ls_debug_dentry = NULL;
init_waitqueue_head(&ls->ls_uevent_wait);
ls->ls_uevent_result = 0;
ls->ls_recoverd_task = NULL;
init_MUTEX(&ls->ls_recoverd_active);
spin_lock_init(&ls->ls_recover_lock);
ls->ls_recover_status = 0;
ls->ls_recover_seq = 0;
ls->ls_recover_args = NULL;
init_rwsem(&ls->ls_in_recovery);
INIT_LIST_HEAD(&ls->ls_requestqueue);
init_MUTEX(&ls->ls_requestqueue_lock);
ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
if (!ls->ls_recover_buf)
goto out_dirfree;
INIT_LIST_HEAD(&ls->ls_recover_list);
spin_lock_init(&ls->ls_recover_list_lock);
ls->ls_recover_list_count = 0;
init_waitqueue_head(&ls->ls_wait_general);
INIT_LIST_HEAD(&ls->ls_root_list);
init_rwsem(&ls->ls_root_sem);
down_write(&ls->ls_in_recovery);
error = dlm_recoverd_start(ls);
if (error) {
log_error(ls, "can't start dlm_recoverd %d", error);
goto out_rcomfree;
}
spin_lock(&lslist_lock);
list_add(&ls->ls_list, &lslist);
spin_unlock(&lslist_lock);
dlm_create_debug_file(ls);
error = kobject_setup(ls);
if (error)
goto out_del;
error = kobject_register(&ls->ls_kobj);
if (error)
goto out_del;
error = do_uevent(ls, 1);
if (error)
goto out_unreg;
*lockspace = ls;
return 0;
out_unreg:
kobject_unregister(&ls->ls_kobj);
out_del:
dlm_delete_debug_file(ls);
spin_lock(&lslist_lock);
list_del(&ls->ls_list);
spin_unlock(&lslist_lock);
dlm_recoverd_stop(ls);
out_rcomfree:
kfree(ls->ls_recover_buf);
out_dirfree:
kfree(ls->ls_dirtbl);
out_lkbfree:
kfree(ls->ls_lkbtbl);
out_rsbfree:
kfree(ls->ls_rsbtbl);
out_lsfree:
kfree(ls);
out:
module_put(THIS_MODULE);
return error;
}
int dlm_new_lockspace(char *name, int namelen, void **lockspace,
uint32_t flags, int lvblen)
{
int error = 0;
down(&ls_lock);
if (!ls_count)
error = threads_start();
if (error)
goto out;
error = new_lockspace(name, namelen, lockspace, flags, lvblen);
if (!error)
ls_count++;
out:
up(&ls_lock);
return error;
}
/* Return 1 if the lockspace still has active remote locks,
* 2 if the lockspace still has active local locks.
*/
static int lockspace_busy(struct dlm_ls *ls)
{
int i, lkb_found = 0;
struct dlm_lkb *lkb;
/* NOTE: We check the lockidtbl here rather than the resource table.
This is because there may be LKBs queued as ASTs that have been
unlinked from their RSBs and are pending deletion once the AST has
been delivered */
for (i = 0; i < ls->ls_lkbtbl_size; i++) {
read_lock(&ls->ls_lkbtbl[i].lock);
if (!list_empty(&ls->ls_lkbtbl[i].list)) {
lkb_found = 1;
list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
lkb_idtbl_list) {
if (!lkb->lkb_nodeid) {
read_unlock(&ls->ls_lkbtbl[i].lock);
return 2;
}
}
}
read_unlock(&ls->ls_lkbtbl[i].lock);
}
return lkb_found;
}
static int release_lockspace(struct dlm_ls *ls, int force)
{
struct dlm_lkb *lkb;
struct dlm_rsb *rsb;
struct list_head *head;
int i;
int busy = lockspace_busy(ls);
if (busy > force)
return -EBUSY;
if (force < 3)
do_uevent(ls, 0);
dlm_recoverd_stop(ls);
remove_lockspace(ls);
dlm_delete_debug_file(ls);
dlm_astd_suspend();
kfree(ls->ls_recover_buf);
/*
* Free direntry structs.
*/
dlm_dir_clear(ls);
kfree(ls->ls_dirtbl);
/*
* Free all lkb's on lkbtbl[] lists.
*/
for (i = 0; i < ls->ls_lkbtbl_size; i++) {
head = &ls->ls_lkbtbl[i].list;
while (!list_empty(head)) {
lkb = list_entry(head->next, struct dlm_lkb,
lkb_idtbl_list);
list_del(&lkb->lkb_idtbl_list);
dlm_del_ast(lkb);
if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
free_lvb(lkb->lkb_lvbptr);
free_lkb(lkb);
}
}
dlm_astd_resume();
kfree(ls->ls_lkbtbl);
/*
* Free all rsb's on rsbtbl[] lists
*/
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
head = &ls->ls_rsbtbl[i].list;
while (!list_empty(head)) {
rsb = list_entry(head->next, struct dlm_rsb,
res_hashchain);
list_del(&rsb->res_hashchain);
free_rsb(rsb);
}
head = &ls->ls_rsbtbl[i].toss;
while (!list_empty(head)) {
rsb = list_entry(head->next, struct dlm_rsb,
res_hashchain);
list_del(&rsb->res_hashchain);
free_rsb(rsb);
}
}
kfree(ls->ls_rsbtbl);
/*
* Free structures on any other lists
*/
kfree(ls->ls_recover_args);
dlm_clear_free_entries(ls);
dlm_clear_members(ls);
dlm_clear_members_gone(ls);
kfree(ls->ls_node_array);
kobject_unregister(&ls->ls_kobj);
kfree(ls);
down(&ls_lock);
ls_count--;
if (!ls_count)
threads_stop();
up(&ls_lock);
module_put(THIS_MODULE);
return 0;
}
/*
* Called when a system has released all its locks and is not going to use the
* lockspace any longer. We free everything we're managing for this lockspace.
* Remaining nodes will go through the recovery process as if we'd died. The
* lockspace must continue to function as usual, participating in recoveries,
* until this returns.
*
* Force has 4 possible values:
 * 0 - don't destroy lockspace if it has any LKBs
* 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
* 2 - destroy lockspace regardless of LKBs
* 3 - destroy lockspace as part of a forced shutdown
*/
int dlm_release_lockspace(void *lockspace, int force)
{
struct dlm_ls *ls;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
dlm_put_lockspace(ls);
return release_lockspace(ls, force);
}
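/*
 * Illustrative sketch (not part of this commit): minimal use of the two
 * exported calls above from another kernel module.  The lockspace name
 * "example" and the 32-byte LVB length are arbitrary assumptions;
 * new_lockspace() only requires that lvblen be a non-zero multiple of 8.
 * Force 0 on release means the call fails with -EBUSY while any LKBs
 * remain, per lockspace_busy() above.
 */
#if 0
static void *example_ls;

static int example_setup(void)
{
	return dlm_new_lockspace("example", strlen("example"),
				 &example_ls, 0, 32);
}

static void example_teardown(void)
{
	dlm_release_lockspace(example_ls, 0);
}
#endif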
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __LOCKSPACE_DOT_H__
#define __LOCKSPACE_DOT_H__
int dlm_lockspace_init(void);
void dlm_lockspace_exit(void);
struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
struct dlm_ls *dlm_find_lockspace_local(void *id);
void dlm_put_lockspace(struct dlm_ls *ls);
#endif /* __LOCKSPACE_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/*
* lowcomms.c
*
* This is the "low-level" comms layer.
*
* It is responsible for sending/receiving messages
* from other nodes in the cluster.
*
 * Cluster nodes are referred to by their nodeids. nodeids are
 * simply 32 bit numbers to the locking module - if they need to
 * be expanded for the cluster infrastructure then that is its
 * responsibility. It is this layer's responsibility to resolve
 * these into IP addresses or whatever it needs for inter-node
 * communication.
*
* The comms level is two kernel threads that deal mainly with
* the receiving of messages from other nodes and passing them
* up to the mid-level comms layer (which understands the
* message format) for execution by the locking core, and
* a send thread which does all the setting up of connections
* to remote nodes and the sending of data. Threads are not allowed
* to send their own data because it may cause them to wait in times
* of high load. Also, this way, the sending thread can collect together
* messages bound for one node and send them in one block.
*
* I don't see any problem with the recv thread executing the locking
* code on behalf of remote processes as the locking code is
* short, efficient and never (well, hardly ever) waits.
*
*/
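/*
 * Illustrative summary of the flow described above (not part of the
 * original comment):
 *
 *	locking core -> dlm_lowcomms_get_buffer()/commit_buffer()
 *	             -> per-node writequeue -> dlm_sendd -> SCTP socket
 *	SCTP socket  -> dlm_recvd -> receive_from_sock()
 *	             -> dlm_process_incoming_buffer() -> locking core
 */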
#include <asm/ioctls.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/sctp/user.h>
#include <linux/pagemap.h>
#include <linux/socket.h>
#include <linux/idr.h>
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "midcomms.h"
static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
static int local_count;
static int local_nodeid;
/* One of these per connected node */
#define NI_INIT_PENDING 1
#define NI_WRITE_PENDING 2
struct nodeinfo {
spinlock_t lock;
sctp_assoc_t assoc_id;
unsigned long flags;
struct list_head write_list; /* nodes with pending writes */
struct list_head writequeue; /* outgoing writequeue_entries */
spinlock_t writequeue_lock;
int nodeid;
};
static DEFINE_IDR(nodeinfo_idr);
static struct rw_semaphore nodeinfo_lock;
static int max_nodeid;
struct cbuf {
unsigned base;
unsigned len;
unsigned mask;
};
/* Just the one of these, now. But this struct keeps
the connection-specific variables together */
#define CF_READ_PENDING 1
struct connection {
struct socket *sock;
unsigned long flags;
struct page *rx_page;
atomic_t waiting_requests;
struct cbuf cb;
int eagain_flag;
};
/* An entry waiting to be sent */
struct writequeue_entry {
struct list_head list;
struct page *page;
int offset;
int len;
int end;
int users;
struct nodeinfo *ni;
};
#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
#define CBUF_EMPTY(cb) ((cb)->len == 0)
#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
#define CBUF_INIT(cb, size) \
do { \
(cb)->base = (cb)->len = 0; \
(cb)->mask = ((size)-1); \
} while(0)
#define CBUF_EAT(cb, n) \
do { \
(cb)->len -= (n); \
(cb)->base += (n); \
(cb)->base &= (cb)->mask; \
} while(0)
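/*
 * Worked example of the macros above (illustrative, assuming a
 * 4096-byte page): CBUF_INIT(cb, 4096) gives base = len = 0 and
 * mask = 4095.  After CBUF_ADD(cb, 100), CBUF_DATA(cb) - the next free
 * offset - is 100.  CBUF_EAT(cb, 60) then consumes 60 bytes, leaving
 * base = 60 and len = 40.  The "& mask" in CBUF_DATA and CBUF_EAT is
 * what wraps both offsets back to the start of the page, which is why
 * the size passed to CBUF_INIT must be a power of two.
 */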
/* List of nodes which have writes pending */
static struct list_head write_nodes;
static spinlock_t write_nodes_lock;
/* Maximum number of incoming messages to process before
* doing a schedule()
*/
#define MAX_RX_MSG_COUNT 25
/* Manage daemons */
static struct task_struct *recv_task;
static struct task_struct *send_task;
static wait_queue_head_t lowcomms_recv_wait;
static atomic_t accepting;
/* The SCTP connection */
static struct connection sctp_con;
static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
{
struct sockaddr_storage addr;
int error;
if (!local_count)
return -1;
error = dlm_nodeid_to_addr(nodeid, &addr);
if (error)
return error;
if (local_addr[0]->ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
} else {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
memcpy(&ret6->sin6_addr, &in6->sin6_addr,
sizeof(in6->sin6_addr));
}
return 0;
}
static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
{
struct nodeinfo *ni;
int r;
int n;
down_read(&nodeinfo_lock);
ni = idr_find(&nodeinfo_idr, nodeid);
up_read(&nodeinfo_lock);
if (!ni && alloc) {
down_write(&nodeinfo_lock);
ni = idr_find(&nodeinfo_idr, nodeid);
if (ni)
goto out_up;
r = idr_pre_get(&nodeinfo_idr, alloc);
if (!r)
goto out_up;
ni = kmalloc(sizeof(struct nodeinfo), alloc);
if (!ni)
goto out_up;
r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
if (r) {
kfree(ni);
ni = NULL;
goto out_up;
}
if (n != nodeid) {
idr_remove(&nodeinfo_idr, n);
kfree(ni);
ni = NULL;
goto out_up;
}
memset(ni, 0, sizeof(struct nodeinfo));
spin_lock_init(&ni->lock);
INIT_LIST_HEAD(&ni->writequeue);
spin_lock_init(&ni->writequeue_lock);
ni->nodeid = nodeid;
if (nodeid > max_nodeid)
max_nodeid = nodeid;
out_up:
up_write(&nodeinfo_lock);
}
return ni;
}
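/*
 * Note (not in the original): the read-locked lookup followed by a
 * second idr_find() under the write lock above is the usual
 * double-checked allocation pattern - it stops two callers racing to
 * insert a nodeinfo for the same nodeid.
 */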
/* Don't call this too often... */
static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
{
int i;
struct nodeinfo *ni;
for (i = 1; i <= max_nodeid; i++) {
ni = nodeid2nodeinfo(i, 0);
if (ni && ni->assoc_id == assoc)
return ni;
}
return NULL;
}
/* Data or notification available on socket */
static void lowcomms_data_ready(struct sock *sk, int count_unused)
{
atomic_inc(&sctp_con.waiting_requests);
if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
return;
wake_up_interruptible(&lowcomms_recv_wait);
}
/* Add the port number to an IPv6 or IPv4 sockaddr and return the address
   length.  Also pad out the struct with zeros to make comparisons meaningful */
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
struct sockaddr_in *local4_addr;
struct sockaddr_in6 *local6_addr;
if (!local_count)
return;
if (!port) {
if (local_addr[0]->ss_family == AF_INET) {
local4_addr = (struct sockaddr_in *)local_addr[0];
port = be16_to_cpu(local4_addr->sin_port);
} else {
local6_addr = (struct sockaddr_in6 *)local_addr[0];
port = be16_to_cpu(local6_addr->sin6_port);
}
}
saddr->ss_family = local_addr[0]->ss_family;
if (local_addr[0]->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
sizeof(struct sockaddr_in));
*addr_len = sizeof(struct sockaddr_in);
} else {
struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
in6_addr->sin6_port = cpu_to_be16(port);
memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
sizeof(struct sockaddr_in6));
*addr_len = sizeof(struct sockaddr_in6);
}
}
/* Close the connection and tidy up */
static void close_connection(void)
{
if (sctp_con.sock) {
sock_release(sctp_con.sock);
sctp_con.sock = NULL;
}
if (sctp_con.rx_page) {
__free_page(sctp_con.rx_page);
sctp_con.rx_page = NULL;
}
}
/* We only send shutdown messages to nodes that are not part of the cluster */
static void send_shutdown(sctp_assoc_t associd)
{
static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
struct msghdr outmessage;
struct cmsghdr *cmsg;
struct sctp_sndrcvinfo *sinfo;
int ret;
outmessage.msg_name = NULL;
outmessage.msg_namelen = 0;
outmessage.msg_control = outcmsg;
outmessage.msg_controllen = sizeof(outcmsg);
outmessage.msg_flags = MSG_EOR;
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
cmsg->cmsg_type = SCTP_SNDRCV;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
outmessage.msg_controllen = cmsg->cmsg_len;
sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
sinfo->sinfo_flags |= MSG_EOF;
sinfo->sinfo_assoc_id = associd;
ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
if (ret != 0)
log_print("send EOF to node failed: %d", ret);
}
/* INIT failed but we don't know which node...
restart INIT on all pending nodes */
static void init_failed(void)
{
int i;
struct nodeinfo *ni;
for (i = 1; i <= max_nodeid; i++) {
ni = nodeid2nodeinfo(i, 0);
if (!ni)
continue;
if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
ni->assoc_id = 0;
if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
spin_lock_bh(&write_nodes_lock);
list_add_tail(&ni->write_list, &write_nodes);
spin_unlock_bh(&write_nodes_lock);
}
}
}
wake_up_process(send_task);
}
/* Something happened to an association */
static void process_sctp_notification(struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
switch (sn->sn_assoc_change.sac_state) {
case SCTP_COMM_UP:
case SCTP_RESTART:
{
/* Check that the new node is in the lockspace */
struct sctp_prim prim;
mm_segment_t fs;
int nodeid;
int prim_len, ret;
int addr_len;
struct nodeinfo *ni;
/* This seems to happen when we receive a connection
* too early... or something... anyway, it happens but
* we always seem to get a real message too, see
* receive_from_sock */
if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
log_print("COMM_UP for invalid assoc ID %d",
(int)sn->sn_assoc_change.sac_assoc_id);
init_failed();
return;
}
memset(&prim, 0, sizeof(struct sctp_prim));
prim_len = sizeof(struct sctp_prim);
prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
fs = get_fs();
set_fs(get_ds());
ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
(char*)&prim, &prim_len);
set_fs(fs);
if (ret < 0) {
struct nodeinfo *ni;
log_print("getsockopt/sctp_primary_addr on "
"new assoc %d failed : %d",
(int)sn->sn_assoc_change.sac_assoc_id, ret);
/* Retry INIT later */
ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
if (ni)
clear_bit(NI_INIT_PENDING, &ni->flags);
return;
}
make_sockaddr(&prim.ssp_addr, 0, &addr_len);
if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
log_print("reject connect from unknown addr");
send_shutdown(prim.ssp_assoc_id);
return;
}
ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
if (!ni)
return;
/* Save the assoc ID */
spin_lock(&ni->lock);
ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
spin_unlock(&ni->lock);
log_print("got new/restarted association %d nodeid %d",
(int)sn->sn_assoc_change.sac_assoc_id, nodeid);
/* Send any pending writes */
clear_bit(NI_INIT_PENDING, &ni->flags);
if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
spin_lock_bh(&write_nodes_lock);
list_add_tail(&ni->write_list, &write_nodes);
spin_unlock_bh(&write_nodes_lock);
}
wake_up_process(send_task);
}
break;
case SCTP_COMM_LOST:
case SCTP_SHUTDOWN_COMP:
{
struct nodeinfo *ni;
ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
if (ni) {
spin_lock(&ni->lock);
ni->assoc_id = 0;
spin_unlock(&ni->lock);
}
}
break;
/* We don't know which INIT failed, so clear the PENDING flags
 * on them all.  If assoc_id is zero, the INIT will be retried. */
case SCTP_CANT_STR_ASSOC:
{
log_print("Can't start SCTP association - retrying");
init_failed();
}
break;
default:
log_print("unexpected SCTP assoc change id=%d state=%d",
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
}
}
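/*
 * Illustrative summary (not in the original) of how the cases above
 * drive nodeinfo state:
 *
 *	COMM_UP/RESTART:	resolve the primary address to a nodeid,
 *				record the assoc_id and flush any writes
 *				queued while the INIT was pending;
 *	COMM_LOST/SHUTDOWN_COMP: clear assoc_id so the next send will
 *				initiate a fresh association;
 *	CANT_STR_ASSOC:		clear every INIT_PENDING flag and retry.
 */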
/* Data received from remote end */
static int receive_from_sock(void)
{
int ret = 0;
struct msghdr msg;
struct kvec iov[2];
unsigned len;
int r;
struct sctp_sndrcvinfo *sinfo;
struct cmsghdr *cmsg;
struct nodeinfo *ni;
/* These two are marginally too big for stack allocation, but this
* function is (currently) only called by dlm_recvd so static should be
* OK.
*/
static struct sockaddr_storage msgname;
static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
if (sctp_con.sock == NULL)
goto out;
if (sctp_con.rx_page == NULL) {
/*
* This doesn't need to be atomic, but I think it should
* improve performance if it is.
*/
sctp_con.rx_page = alloc_page(GFP_ATOMIC);
if (sctp_con.rx_page == NULL)
goto out_resched;
CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
}
memset(&incmsg, 0, sizeof(incmsg));
memset(&msgname, 0, sizeof(msgname));
msg.msg_name = &msgname;
msg.msg_namelen = sizeof(msgname);
msg.msg_flags = 0;
msg.msg_control = incmsg;
msg.msg_controllen = sizeof(incmsg);
msg.msg_iovlen = 1;
/* I don't see why this circular buffer stuff is necessary for SCTP
* which is a packet-based protocol, but the whole thing breaks under
* load without it! The overhead is minimal (and is in the TCP lowcomms
* anyway, of course) so I'll leave it in until I can figure out what's
* really happening.
*/
/*
* iov[0] is the bit of the circular buffer between the current end
* point (cb.base + cb.len) and the end of the buffer.
*/
iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
iov[0].iov_base = page_address(sctp_con.rx_page) +
CBUF_DATA(&sctp_con.cb);
iov[1].iov_len = 0;
/*
* iov[1] is the bit of the circular buffer between the start of the
* buffer and the start of the currently used section (cb.base)
*/
if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
iov[1].iov_len = sctp_con.cb.base;
iov[1].iov_base = page_address(sctp_con.rx_page);
msg.msg_iovlen = 2;
}
len = iov[0].iov_len + iov[1].iov_len;
/* msg_iovlen is 2 when the read wraps, so pass it as the vector count */
r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
MSG_NOSIGNAL | MSG_DONTWAIT);
if (ret <= 0)
goto out_close;
msg.msg_control = incmsg;
msg.msg_controllen = sizeof(incmsg);
cmsg = CMSG_FIRSTHDR(&msg);
sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
if (msg.msg_flags & MSG_NOTIFICATION) {
process_sctp_notification(&msg, page_address(sctp_con.rx_page));
return 0;
}
/* Is this a new association ? */
ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
if (ni) {
ni->assoc_id = sinfo->sinfo_assoc_id;
if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
spin_lock_bh(&write_nodes_lock);
list_add_tail(&ni->write_list, &write_nodes);
spin_unlock_bh(&write_nodes_lock);
}
wake_up_process(send_task);
}
}
/* INIT sends a message with length of 1 - ignore it */
if (r == 1)
return 0;
CBUF_ADD(&sctp_con.cb, ret);
ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
page_address(sctp_con.rx_page),
sctp_con.cb.base, sctp_con.cb.len,
PAGE_CACHE_SIZE);
if (ret < 0)
goto out_close;
CBUF_EAT(&sctp_con.cb, ret);
out:
ret = 0;
goto out_ret;
out_resched:
lowcomms_data_ready(sctp_con.sock->sk, 0);
ret = 0;
schedule();
goto out_ret;
out_close:
if (ret != -EAGAIN)
log_print("error reading from sctp socket: %d", ret);
out_ret:
return ret;
}
/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
{
mm_segment_t fs;
int result = 0;
fs = get_fs();
set_fs(get_ds());
if (num == 1)
result = sctp_con.sock->ops->bind(sctp_con.sock,
(struct sockaddr *) addr, addr_len);
else
result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
set_fs(fs);
if (result < 0)
log_print("Can't bind to port %d addr number %d",
dlm_config.tcp_port, num);
return result;
}
static void init_local(void)
{
struct sockaddr_storage sas, *addr;
int i;
local_nodeid = dlm_our_nodeid();
for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
if (dlm_our_addr(&sas, i))
break;
addr = kmalloc(sizeof(*addr), GFP_KERNEL);
if (!addr)
break;
memcpy(addr, &sas, sizeof(*addr));
local_addr[local_count++] = addr;
}
}
/* Initialise SCTP socket and bind to all interfaces */
static int init_sock(void)
{
mm_segment_t fs;
struct socket *sock = NULL;
struct sockaddr_storage localaddr;
struct sctp_event_subscribe subscribe;
int result = -EINVAL, num = 1, i, addr_len;
if (!local_count) {
init_local();
if (!local_count) {
log_print("no local IP address has been set");
goto out;
}
}
result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
IPPROTO_SCTP, &sock);
if (result < 0) {
log_print("Can't create comms socket, check SCTP is loaded");
goto out;
}
/* Listen for events */
memset(&subscribe, 0, sizeof(subscribe));
subscribe.sctp_data_io_event = 1;
subscribe.sctp_association_event = 1;
subscribe.sctp_send_failure_event = 1;
subscribe.sctp_shutdown_event = 1;
subscribe.sctp_partial_delivery_event = 1;
fs = get_fs();
set_fs(get_ds());
result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
(char *)&subscribe, sizeof(subscribe));
set_fs(fs);
if (result < 0) {
log_print("Failed to set SCTP_EVENTS on socket: result=%d",
result);
goto create_delsock;
}
/* Init con struct */
sock->sk->sk_user_data = &sctp_con;
sctp_con.sock = sock;
sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
/* Bind to all interfaces. */
for (i = 0; i < local_count; i++) {
memcpy(&localaddr, local_addr[i], sizeof(localaddr));
make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
result = add_bind_addr(&localaddr, addr_len, num);
if (result)
goto create_delsock;
++num;
}
result = sock->ops->listen(sock, 5);
if (result < 0) {
log_print("Can't set socket listening");
goto create_delsock;
}
return 0;
create_delsock:
sock_release(sock);
sctp_con.sock = NULL;
out:
return result;
}
static struct writequeue_entry *new_writequeue_entry(int allocation)
{
struct writequeue_entry *entry;
entry = kmalloc(sizeof(struct writequeue_entry), allocation);
if (!entry)
return NULL;
entry->page = alloc_page(allocation);
if (!entry->page) {
kfree(entry);
return NULL;
}
entry->offset = 0;
entry->len = 0;
entry->end = 0;
entry->users = 0;
return entry;
}
void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
{
struct writequeue_entry *e;
int offset = 0;
int users = 0;
struct nodeinfo *ni;
if (!atomic_read(&accepting))
return NULL;
ni = nodeid2nodeinfo(nodeid, allocation);
if (!ni)
return NULL;
spin_lock(&ni->writequeue_lock);
e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
if (((struct list_head *) e == &ni->writequeue) ||
(PAGE_CACHE_SIZE - e->end < len)) {
e = NULL;
} else {
offset = e->end;
e->end += len;
users = e->users++;
}
spin_unlock(&ni->writequeue_lock);
if (e) {
got_one:
if (users == 0)
kmap(e->page);
*ppc = page_address(e->page) + offset;
return e;
}
e = new_writequeue_entry(allocation);
if (e) {
spin_lock(&ni->writequeue_lock);
offset = e->end;
e->end += len;
e->ni = ni;
users = e->users++;
list_add_tail(&e->list, &ni->writequeue);
spin_unlock(&ni->writequeue_lock);
goto got_one;
}
return NULL;
}
void dlm_lowcomms_commit_buffer(void *arg)
{
struct writequeue_entry *e = (struct writequeue_entry *) arg;
int users;
struct nodeinfo *ni = e->ni;
if (!atomic_read(&accepting))
return;
spin_lock(&ni->writequeue_lock);
users = --e->users;
if (users)
goto out;
e->len = e->end - e->offset;
kunmap(e->page);
spin_unlock(&ni->writequeue_lock);
if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
spin_lock_bh(&write_nodes_lock);
list_add_tail(&ni->write_list, &write_nodes);
spin_unlock_bh(&write_nodes_lock);
wake_up_process(send_task);
}
return;
out:
spin_unlock(&ni->writequeue_lock);
return;
}
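/*
 * Illustrative sketch (not part of this commit) of the intended pairing
 * of the two calls above: reserve space on a node's writequeue, build
 * the message in place, then commit it for dlm_sendd to transmit.  The
 * length and the zero-fill are placeholders for a real message.
 */
#if 0
static int example_send(int nodeid, int len)
{
	void *mh;
	char *buf;

	mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &buf);
	if (!mh)
		return -ENOBUFS;
	memset(buf, 0, len);	/* build the real message here */
	dlm_lowcomms_commit_buffer(mh);
	return 0;
}
#endif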
static void free_entry(struct writequeue_entry *e)
{
__free_page(e->page);
kfree(e);
}
/* Initiate an SCTP association. In theory we could just use sendmsg() on
the first IP address and it should work, but this allows us to set up the
association before sending any valuable data that we can't afford to lose.
It also keeps the send path clean as it can now always use the association ID */
static void initiate_association(int nodeid)
{
struct sockaddr_storage rem_addr;
static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
struct msghdr outmessage;
struct cmsghdr *cmsg;
struct sctp_sndrcvinfo *sinfo;
int ret;
int addrlen;
char buf[1];
struct kvec iov[1];
struct nodeinfo *ni;
log_print("Initiating association with node %d", nodeid);
ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
if (!ni)
return;
if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
log_print("no address for nodeid %d", nodeid);
return;
}
make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
outmessage.msg_name = &rem_addr;
outmessage.msg_namelen = addrlen;
outmessage.msg_control = outcmsg;
outmessage.msg_controllen = sizeof(outcmsg);
outmessage.msg_flags = MSG_EOR;
iov[0].iov_base = buf;
iov[0].iov_len = 1;
/* Real INIT messages seem to cause trouble. Just send a 1 byte message
we can afford to lose */
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
cmsg->cmsg_type = SCTP_SNDRCV;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
if (ret < 0) {
log_print("send INIT to node failed: %d", ret);
/* Try again later */
clear_bit(NI_INIT_PENDING, &ni->flags);
}
}
/* Send a message */
static int send_to_sock(struct nodeinfo *ni)
{
int ret = 0;
struct writequeue_entry *e;
int len, offset;
struct msghdr outmsg;
static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
struct cmsghdr *cmsg;
struct sctp_sndrcvinfo *sinfo;
struct kvec iov;
/* See if we need to init an association before we start
sending precious messages */
spin_lock(&ni->lock);
if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
spin_unlock(&ni->lock);
initiate_association(ni->nodeid);
return 0;
}
spin_unlock(&ni->lock);
outmsg.msg_name = NULL; /* We use assoc_id */
outmsg.msg_namelen = 0;
outmsg.msg_control = outcmsg;
outmsg.msg_controllen = sizeof(outcmsg);
outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
cmsg = CMSG_FIRSTHDR(&outmsg);
cmsg->cmsg_level = IPPROTO_SCTP;
cmsg->cmsg_type = SCTP_SNDRCV;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
sinfo->sinfo_assoc_id = ni->assoc_id;
outmsg.msg_controllen = cmsg->cmsg_len;
spin_lock(&ni->writequeue_lock);
for (;;) {
if (list_empty(&ni->writequeue))
break;
e = list_entry(ni->writequeue.next, struct writequeue_entry,
list);
kmap(e->page);
len = e->len;
offset = e->offset;
BUG_ON(len == 0 && e->users == 0);
spin_unlock(&ni->writequeue_lock);
ret = 0;
if (len) {
iov.iov_base = page_address(e->page)+offset;
iov.iov_len = len;
ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
len);
if (ret == -EAGAIN) {
sctp_con.eagain_flag = 1;
goto out;
} else if (ret < 0)
goto send_error;
} else {
/* Don't starve people filling buffers */
schedule();
}
spin_lock(&ni->writequeue_lock);
e->offset += ret;
e->len -= ret;
if (e->len == 0 && e->users == 0) {
list_del(&e->list);
free_entry(e);
continue;
}
}
spin_unlock(&ni->writequeue_lock);
out:
return ret;
send_error:
log_print("Error sending to node %d %d", ni->nodeid, ret);
spin_lock(&ni->lock);
if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
ni->assoc_id = 0;
spin_unlock(&ni->lock);
initiate_association(ni->nodeid);
} else
spin_unlock(&ni->lock);
return ret;
}
/* Try to send any messages that are pending */
static void process_output_queue(void)
{
struct list_head *list;
struct list_head *temp;
spin_lock_bh(&write_nodes_lock);
list_for_each_safe(list, temp, &write_nodes) {
struct nodeinfo *ni =
list_entry(list, struct nodeinfo, write_list);
clear_bit(NI_WRITE_PENDING, &ni->flags);
list_del(&ni->write_list);
spin_unlock_bh(&write_nodes_lock);
send_to_sock(ni);
spin_lock_bh(&write_nodes_lock);
}
spin_unlock_bh(&write_nodes_lock);
}
/* Called after we've had -EAGAIN and been woken up */
static void refill_write_queue(void)
{
int i;
for (i = 1; i <= max_nodeid; i++) {
struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
if (ni) {
if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
spin_lock_bh(&write_nodes_lock);
list_add_tail(&ni->write_list, &write_nodes);
spin_unlock_bh(&write_nodes_lock);
}
}
}
}
static void clean_one_writequeue(struct nodeinfo *ni)
{
struct list_head *list;
struct list_head *temp;
spin_lock(&ni->writequeue_lock);
list_for_each_safe(list, temp, &ni->writequeue) {
struct writequeue_entry *e =
list_entry(list, struct writequeue_entry, list);
list_del(&e->list);
free_entry(e);
}
spin_unlock(&ni->writequeue_lock);
}
static void clean_writequeues(void)
{
int i;
for (i = 1; i <= max_nodeid; i++) {
struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
if (ni)
clean_one_writequeue(ni);
}
}
static void dealloc_nodeinfo(void)
{
int i;
for (i = 1; i <= max_nodeid; i++) {
struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
if (ni) {
idr_remove(&nodeinfo_idr, i);
kfree(ni);
}
}
}
static int write_list_empty(void)
{
int status;
spin_lock_bh(&write_nodes_lock);
status = list_empty(&write_nodes);
spin_unlock_bh(&write_nodes_lock);
return status;
}
static int dlm_recvd(void *data)
{
DECLARE_WAITQUEUE(wait, current);
while (!kthread_should_stop()) {
int count = 0;
set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&lowcomms_recv_wait, &wait);
if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
schedule();
remove_wait_queue(&lowcomms_recv_wait, &wait);
set_current_state(TASK_RUNNING);
if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
int ret;
do {
ret = receive_from_sock();
/* Don't starve out everyone else */
if (++count >= MAX_RX_MSG_COUNT) {
schedule();
count = 0;
}
} while (!kthread_should_stop() && ret >= 0);
}
schedule();
}
return 0;
}
static int dlm_sendd(void *data)
{
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
if (write_list_empty())
schedule();
set_current_state(TASK_RUNNING);
if (sctp_con.eagain_flag) {
sctp_con.eagain_flag = 0;
refill_write_queue();
}
process_output_queue();
}
remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
return 0;
}
static void daemons_stop(void)
{
kthread_stop(recv_task);
kthread_stop(send_task);
}
static int daemons_start(void)
{
struct task_struct *p;
int error;
p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
if (IS_ERR(p)) {
error = PTR_ERR(p);
log_print("can't start dlm_recvd %d", error);
return error;
}
recv_task = p;
p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
if (IS_ERR(p)) {
error = PTR_ERR(p);
log_print("can't start dlm_sendd %d", error);
kthread_stop(recv_task);
return error;
}
send_task = p;
return 0;
}
/*
* This is quite likely to sleep...
*/
int dlm_lowcomms_start(void)
{
int error;
spin_lock_init(&write_nodes_lock);
INIT_LIST_HEAD(&write_nodes);
init_rwsem(&nodeinfo_lock);
error = init_sock();
if (error)
goto fail_sock;
error = daemons_start();
if (error)
goto fail_sock;
atomic_set(&accepting, 1);
return 0;
fail_sock:
close_connection();
return error;
}
/* Set all the activity flags to prevent any socket activity. */
void dlm_lowcomms_stop(void)
{
atomic_set(&accepting, 0);
sctp_con.flags = 0x7;
daemons_stop();
clean_writequeues();
close_connection();
dealloc_nodeinfo();
max_nodeid = 0;
}
int dlm_lowcomms_init(void)
{
init_waitqueue_head(&lowcomms_recv_wait);
return 0;
}
void dlm_lowcomms_exit(void)
{
int i;
for (i = 0; i < local_count; i++)
kfree(local_addr[i]);
local_count = 0;
local_nodeid = 0;
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__
int dlm_lowcomms_init(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_start(void);
void dlm_lowcomms_stop(void);
void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
void dlm_lowcomms_commit_buffer(void *mh);
#endif /* __LOWCOMMS_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __LVB_TABLE_DOT_H__
#define __LVB_TABLE_DOT_H__
extern const int dlm_lvb_operations[8][8];
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "lock.h"
#include "memory.h"
#include "lowcomms.h"
#include "config.h"
#ifdef CONFIG_DLM_DEBUG
int dlm_register_debugfs(void);
void dlm_unregister_debugfs(void);
#else
static inline int dlm_register_debugfs(void) { return 0; }
static inline void dlm_unregister_debugfs(void) { }
#endif
static int __init init_dlm(void)
{
int error;
error = dlm_memory_init();
if (error)
goto out;
error = dlm_lockspace_init();
if (error)
goto out_mem;
error = dlm_config_init();
if (error)
goto out_lockspace;
error = dlm_register_debugfs();
if (error)
goto out_config;
error = dlm_lowcomms_init();
if (error)
goto out_debug;
printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
return 0;
out_debug:
dlm_unregister_debugfs();
out_config:
dlm_config_exit();
out_lockspace:
dlm_lockspace_exit();
out_mem:
dlm_memory_exit();
out:
return error;
}
static void __exit exit_dlm(void)
{
dlm_lowcomms_exit();
dlm_config_exit();
dlm_memory_exit();
dlm_lockspace_exit();
dlm_unregister_debugfs();
}
module_init(init_dlm);
module_exit(exit_dlm);
MODULE_DESCRIPTION("Distributed Lock Manager");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
EXPORT_SYMBOL_GPL(dlm_new_lockspace);
EXPORT_SYMBOL_GPL(dlm_release_lockspace);
EXPORT_SYMBOL_GPL(dlm_lock);
EXPORT_SYMBOL_GPL(dlm_unlock);
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "recoverd.h"
#include "recover.h"
#include "lowcomms.h"
#include "rcom.h"
#include "config.h"
/*
* Following called by dlm_recoverd thread
*/
static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
{
struct dlm_member *memb = NULL;
struct list_head *tmp;
struct list_head *newlist = &new->list;
struct list_head *head = &ls->ls_nodes;
list_for_each(tmp, head) {
memb = list_entry(tmp, struct dlm_member, list);
if (new->nodeid < memb->nodeid)
break;
}
if (!memb)
list_add_tail(newlist, head);
else {
/* FIXME: can use list macro here */
newlist->prev = tmp->prev;
newlist->next = tmp;
tmp->prev->next = newlist;
tmp->prev = newlist;
}
}
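/*
 * Re the FIXME above (illustrative, not part of this commit): the
 * open-coded pointer surgery is equivalent to
 * list_add_tail(newlist, tmp), which inserts newlist just before tmp.
 * The whole function could be written with list macros only:
 */
#if 0
static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
{
	struct dlm_member *memb;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		if (new->nodeid < memb->nodeid) {
			/* insert before the first larger nodeid */
			list_add_tail(&new->list, &memb->list);
			return;
		}
	}
	list_add_tail(&new->list, &ls->ls_nodes);
}
#endif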
static int dlm_add_member(struct dlm_ls *ls, int nodeid)
{
struct dlm_member *memb;
int w;
memb = kmalloc(sizeof(struct dlm_member), GFP_KERNEL);
if (!memb)
return -ENOMEM;
w = dlm_node_weight(ls->ls_name, nodeid);
if (w < 0) {
kfree(memb);
return w;
}
memb->nodeid = nodeid;
memb->weight = w;
add_ordered_member(ls, memb);
ls->ls_num_nodes++;
return 0;
}
static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
{
list_move(&memb->list, &ls->ls_nodes_gone);
ls->ls_num_nodes--;
}
static int dlm_is_member(struct dlm_ls *ls, int nodeid)
{
struct dlm_member *memb;
list_for_each_entry(memb, &ls->ls_nodes, list) {
if (memb->nodeid == nodeid)
return TRUE;
}
return FALSE;
}
int dlm_is_removed(struct dlm_ls *ls, int nodeid)
{
struct dlm_member *memb;
list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
if (memb->nodeid == nodeid)
return TRUE;
}
return FALSE;
}
static void clear_memb_list(struct list_head *head)
{
struct dlm_member *memb;
while (!list_empty(head)) {
memb = list_entry(head->next, struct dlm_member, list);
list_del(&memb->list);
kfree(memb);
}
}
void dlm_clear_members(struct dlm_ls *ls)
{
clear_memb_list(&ls->ls_nodes);
ls->ls_num_nodes = 0;
}
void dlm_clear_members_gone(struct dlm_ls *ls)
{
clear_memb_list(&ls->ls_nodes_gone);
}
static void make_member_array(struct dlm_ls *ls)
{
struct dlm_member *memb;
int i, w, x = 0, total = 0, all_zero = 0, *array;
kfree(ls->ls_node_array);
ls->ls_node_array = NULL;
list_for_each_entry(memb, &ls->ls_nodes, list) {
if (memb->weight)
total += memb->weight;
}
/* all nodes revert to weight of 1 if all have weight 0 */
if (!total) {
total = ls->ls_num_nodes;
all_zero = 1;
}
ls->ls_total_weight = total;
array = kmalloc(sizeof(int) * total, GFP_KERNEL);
if (!array)
return;
list_for_each_entry(memb, &ls->ls_nodes, list) {
if (!all_zero && !memb->weight)
continue;
if (all_zero)
w = 1;
else
w = memb->weight;
DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
for (i = 0; i < w; i++)
array[x++] = memb->nodeid;
}
ls->ls_node_array = array;
}
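/*
 * Worked example (illustrative): with members {nodeid 1, weight 2} and
 * {nodeid 2, weight 1}, total is 3 and the array becomes [1, 1, 2], so
 * sampling it uniformly picks node 1 twice as often as node 2.  If all
 * weights are 0, every node contributes a single slot instead.
 */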
/* send a status request to all members just to establish comms connections */
static void ping_members(struct dlm_ls *ls)
{
struct dlm_member *memb;
list_for_each_entry(memb, &ls->ls_nodes, list)
dlm_rcom_status(ls, memb->nodeid);
}
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
{
struct dlm_member *memb, *safe;
int i, error, found, pos = 0, neg = 0, low = -1;
/* move departed members from ls_nodes to ls_nodes_gone */
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
found = FALSE;
for (i = 0; i < rv->node_count; i++) {
if (memb->nodeid == rv->nodeids[i]) {
found = TRUE;
break;
}
}
if (!found) {
neg++;
dlm_remove_member(ls, memb);
log_debug(ls, "remove member %d", memb->nodeid);
}
}
/* add new members to ls_nodes */
for (i = 0; i < rv->node_count; i++) {
if (dlm_is_member(ls, rv->nodeids[i]))
continue;
dlm_add_member(ls, rv->nodeids[i]);
pos++;
log_debug(ls, "add member %d", rv->nodeids[i]);
}
list_for_each_entry(memb, &ls->ls_nodes, list) {
if (low == -1 || memb->nodeid < low)
low = memb->nodeid;
}
ls->ls_low_nodeid = low;
make_member_array(ls);
dlm_set_recover_status(ls, DLM_RS_NODES);
*neg_out = neg;
ping_members(ls);
error = dlm_recover_members_wait(ls);
log_debug(ls, "total members %d", ls->ls_num_nodes);
return error;
}
/*
* Following called from lockspace.c
*/
int dlm_ls_stop(struct dlm_ls *ls)
{
int new;
/*
* A stop cancels any recovery that's in progress (see RECOVERY_STOP,
* dlm_recovery_stopped()) and prevents any new locks from being
* processed (see RUNNING, dlm_locking_stopped()).
*/
spin_lock(&ls->ls_recover_lock);
set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
ls->ls_recover_seq++;
spin_unlock(&ls->ls_recover_lock);
/*
* This in_recovery lock does two things:
*
* 1) Keeps this function from returning until all threads are out
 * of locking routines and locking is truly stopped.
* 2) Keeps any new requests from being processed until it's unlocked
* when recovery is complete.
*/
if (new)
down_write(&ls->ls_in_recovery);
/*
* The recoverd suspend/resume makes sure that dlm_recoverd (if
* running) has noticed the clearing of RUNNING above and quit
* processing the previous recovery. This will be true for all nodes
* before any nodes start the new recovery.
*/
dlm_recoverd_suspend(ls);
ls->ls_recover_status = 0;
dlm_recoverd_resume(ls);
return 0;
}
int dlm_ls_start(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL, *rv_old;
int *ids = NULL;
int error, count;
rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
if (!rv)
return -ENOMEM;
memset(rv, 0, sizeof(struct dlm_recover));
error = count = dlm_nodeid_list(ls->ls_name, &ids);
if (error <= 0)
goto fail;
spin_lock(&ls->ls_recover_lock);
/* the lockspace needs to be stopped before it can be started */
if (!dlm_locking_stopped(ls)) {
spin_unlock(&ls->ls_recover_lock);
log_error(ls, "start ignored: lockspace running");
error = -EINVAL;
goto fail;
}
rv->nodeids = ids;
rv->node_count = count;
rv->seq = ++ls->ls_recover_seq;
rv_old = ls->ls_recover_args;
ls->ls_recover_args = rv;
spin_unlock(&ls->ls_recover_lock);
if (rv_old) {
kfree(rv_old->nodeids);
kfree(rv_old);
}
dlm_recoverd_kick(ls);
return 0;
fail:
kfree(rv);
kfree(ids);
return error;
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __MEMBER_DOT_H__
#define __MEMBER_DOT_H__
int dlm_ls_stop(struct dlm_ls *ls);
int dlm_ls_start(struct dlm_ls *ls);
void dlm_clear_members(struct dlm_ls *ls);
void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);
#endif /* __MEMBER_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "config.h"
#include "memory.h"
static kmem_cache_t *lkb_cache;
int dlm_memory_init(void)
{
int ret = 0;
lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
__alignof__(struct dlm_lkb), 0, NULL, NULL);
if (!lkb_cache)
ret = -ENOMEM;
return ret;
}
void dlm_memory_exit(void)
{
if (lkb_cache)
kmem_cache_destroy(lkb_cache);
}
char *allocate_lvb(struct dlm_ls *ls)
{
char *p;
p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
if (p)
memset(p, 0, ls->ls_lvblen);
return p;
}
void free_lvb(char *p)
{
kfree(p);
}
uint64_t *allocate_range(struct dlm_ls *ls)
{
int ralen = 4*sizeof(uint64_t);
uint64_t *p;
p = kmalloc(ralen, GFP_KERNEL);
if (p)
memset(p, 0, ralen);
return p;
}
void free_range(uint64_t *p)
{
kfree(p);
}
/* FIXME: have some minimal space built-in to rsb for the name and
kmalloc a separate name if needed, like dentries are done */
struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
{
struct dlm_rsb *r;
DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
if (r)
memset(r, 0, sizeof(*r) + namelen);
return r;
}
void free_rsb(struct dlm_rsb *r)
{
if (r->res_lvbptr)
free_lvb(r->res_lvbptr);
kfree(r);
}
struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
if (lkb)
memset(lkb, 0, sizeof(*lkb));
return lkb;
}
void free_lkb(struct dlm_lkb *lkb)
{
kmem_cache_free(lkb_cache, lkb);
}
struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
{
struct dlm_direntry *de;
DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
if (de)
memset(de, 0, sizeof(*de) + namelen);
return de;
}
void free_direntry(struct dlm_direntry *de)
{
kfree(de);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __MEMORY_DOT_H__
#define __MEMORY_DOT_H__
int dlm_memory_init(void);
void dlm_memory_exit(void);
struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
void free_rsb(struct dlm_rsb *r);
struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
void free_lkb(struct dlm_lkb *l);
struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
void free_direntry(struct dlm_direntry *de);
char *allocate_lvb(struct dlm_ls *ls);
void free_lvb(char *l);
uint64_t *allocate_range(struct dlm_ls *ls);
void free_range(uint64_t *l);
#endif /* __MEMORY_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/*
* midcomms.c
*
* This is the appallingly named "mid-level" comms layer.
*
 * Its purpose is to take buffers of data from the "real" comms
 * layer, split them up into complete messages and pass them to the
 * interested part of the locking mechanism.
*
* It also takes messages from the locking layer, formats them
* into packets and sends them to the comms layer.
*/
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "rcom.h"
#include "lock.h"
#include "midcomms.h"
static void copy_from_cb(void *dst, const void *base, unsigned offset,
unsigned len, unsigned limit)
{
unsigned copy = len;
if ((copy + offset) > limit)
copy = limit - offset;
memcpy(dst, base + offset, copy);
len -= copy;
if (len)
memcpy(dst + copy, base, len);
}
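/*
 * Worked example (illustrative): with limit = 4096, offset = 4090 and
 * len = 16, the first memcpy takes the 6 bytes up to the end of the
 * buffer and the second takes the remaining 10 from the start - the
 * wrap-around case dlm_process_incoming_buffer() below has to handle.
 */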
/*
* Called from the low-level comms layer to process a buffer of
* commands.
*
* Only complete messages are processed here, any "spare" bytes from
* the end of a buffer are saved and tacked onto the front of the next
* message that comes in. I doubt this will happen very often but we
* need to be able to cope with it and I don't want the task to be waiting
* for packets to come in when there is useful work to be done.
*/
int dlm_process_incoming_buffer(int nodeid, const void *base,
unsigned offset, unsigned len, unsigned limit)
{
unsigned char __tmp[DLM_INBUF_LEN];
struct dlm_header *msg = (struct dlm_header *) __tmp;
int ret = 0;
int err = 0;
uint16_t msglen;
uint32_t lockspace;
while (len > sizeof(struct dlm_header)) {
/* Copy just the header to check the total length. The
message may wrap around the end of the buffer back to the
start, so we need to use a temp buffer and copy_from_cb. */
copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
limit);
msglen = le16_to_cpu(msg->h_length);
lockspace = msg->h_lockspace;
err = -EINVAL;
if (msglen < sizeof(struct dlm_header))
break;
err = -E2BIG;
if (msglen > dlm_config.buffer_size) {
log_print("message size %d from %d too big, buf len %d",
msglen, nodeid, len);
break;
}
err = 0;
/* If only part of the full message is contained in this
buffer, then do nothing and wait for lowcomms to call
us again later with more data. We return 0 meaning
we've consumed none of the input buffer. */
if (msglen > len)
break;
/* Allocate a larger temp buffer if the full message won't fit
in the buffer on the stack (which should work for most
ordinary messages). */
if (msglen > sizeof(__tmp) &&
msg == (struct dlm_header *) __tmp) {
msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
if (msg == NULL)
return ret;
}
copy_from_cb(msg, base, offset, msglen, limit);
BUG_ON(lockspace != msg->h_lockspace);
ret += msglen;
offset += msglen;
offset &= (limit - 1);
len -= msglen;
switch (msg->h_cmd) {
case DLM_MSG:
dlm_receive_message(msg, nodeid, FALSE);
break;
case DLM_RCOM:
dlm_receive_rcom(msg, nodeid);
break;
default:
log_print("unknown msg type %x from %u: %u %u %u %u",
msg->h_cmd, nodeid, msglen, len, offset, ret);
}
}
if (msg != (struct dlm_header *) __tmp)
kfree(msg);
return err ? err : ret;
}
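/*
 * Worked example (illustrative): if len = 100 and the copied header
 * says h_length = 60, the first message is complete and consumed
 * (ret = 60, offset advances and wraps via "& (limit - 1)").  If the
 * next header's h_length exceeds the 40 bytes left, the loop breaks
 * with err = 0 and returns 60 - the leftover bytes stay in the
 * circular buffer until lowcomms calls in again with more data.
 */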
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
unsigned len, unsigned limit);
#endif /* __MIDCOMMS_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "lowcomms.h"
#include "midcomms.h"
#include "rcom.h"
#include "recover.h"
#include "dir.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
#include "util.h"
static int rcom_response(struct dlm_ls *ls)
{
return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
}
static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
char *mb;
int mb_len = sizeof(struct dlm_rcom) + len;
mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
if (!mh) {
log_print("create_rcom to %d type %d len %d ENOBUFS",
to_nodeid, type, len);
return -ENOBUFS;
}
memset(mb, 0, mb_len);
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
rc->rc_header.h_lockspace = ls->ls_global_id;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
rc->rc_type = type;
*mh_ret = mh;
*rc_ret = rc;
return 0;
}
static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
struct dlm_rcom *rc)
{
dlm_rcom_out(rc);
dlm_lowcomms_commit_buffer(mh);
}
/* When replying to a status request, a node also sends back its
configuration values. The requesting node then checks that the remote
node is configured the same way as itself. */
static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
{
rf->rf_lvblen = ls->ls_lvblen;
rf->rf_lsflags = ls->ls_exflags;
}
static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
{
if (rf->rf_lvblen != ls->ls_lvblen ||
rf->rf_lsflags != ls->ls_exflags) {
log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
ls->ls_lvblen, ls->ls_exflags,
nodeid, rf->rf_lvblen, rf->rf_lsflags);
return -EINVAL;
}
return 0;
}
int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error = 0;
memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
if (nodeid == dlm_our_nodeid()) {
rc = (struct dlm_rcom *) ls->ls_recover_buf;
rc->rc_result = dlm_recover_status(ls);
goto out;
}
error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
if (error)
goto out;
send_rcom(ls, mh, rc);
error = dlm_wait_function(ls, &rcom_response);
clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
if (error)
goto out;
rc = (struct dlm_rcom *) ls->ls_recover_buf;
if (rc->rc_result == -ESRCH) {
/* we pretend the remote lockspace exists with 0 status */
log_debug(ls, "remote node %d not ready", nodeid);
rc->rc_result = 0;
} else
error = check_config(ls, (struct rcom_config *) rc->rc_buf,
nodeid);
/* the caller looks at rc_result for the remote recovery status */
out:
return error;
}
static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error, nodeid = rc_in->rc_header.h_nodeid;
error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
sizeof(struct rcom_config), &rc, &mh);
if (error)
return;
rc->rc_result = dlm_recover_status(ls);
make_config(ls, (struct rcom_config *) rc->rc_buf);
send_rcom(ls, mh, rc);
}
static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
set_bit(LSFL_RCOM_READY, &ls->ls_flags);
wake_up(&ls->ls_wait_general);
}
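/*
 * Illustrative summary (not in the original): the status round trip is
 * dlm_rcom_status() sending DLM_RCOM_STATUS, the peer answering from
 * receive_rcom_status() with DLM_RCOM_STATUS_REPLY, and
 * receive_rcom_status_reply() above copying the reply into
 * ls_recover_buf and waking the waiter in dlm_wait_function().
 */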
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error = 0, len = sizeof(struct dlm_rcom);
memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
if (nodeid == dlm_our_nodeid()) {
dlm_copy_master_names(ls, last_name, last_len,
ls->ls_recover_buf + len,
dlm_config.buffer_size - len, nodeid);
goto out;
}
error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
if (error)
goto out;
memcpy(rc->rc_buf, last_name, last_len);
send_rcom(ls, mh, rc);
error = dlm_wait_function(ls, &rcom_response);
clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
out:
return error;
}
static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error, inlen, outlen;
int nodeid = rc_in->rc_header.h_nodeid;
uint32_t status = dlm_recover_status(ls);
/*
* We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
* dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
* It could only happen in rare cases where we get a late NAMES
* message from a previous instance of recovery.
*/
if (!(status & DLM_RS_NODES)) {
log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
return;
}
inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
if (error)
return;
dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
nodeid);
send_rcom(ls, mh, rc);
}
static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
set_bit(LSFL_RCOM_READY, &ls->ls_flags);
wake_up(&ls->ls_wait_general);
}
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
struct dlm_ls *ls = r->res_ls;
int error;
error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
&rc, &mh);
if (error)
goto out;
memcpy(rc->rc_buf, r->res_name, r->res_length);
rc->rc_id = (unsigned long) r;
send_rcom(ls, mh, rc);
out:
return error;
}
static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
if (error)
return;
error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
if (error)
ret_nodeid = error;
rc->rc_result = ret_nodeid;
rc->rc_id = rc_in->rc_id;
send_rcom(ls, mh, rc);
}
static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
dlm_recover_master_reply(ls, rc_in);
}
static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct rcom_lock *rl)
{
memset(rl, 0, sizeof(*rl));
rl->rl_ownpid = lkb->lkb_ownpid;
rl->rl_lkid = lkb->lkb_id;
rl->rl_exflags = lkb->lkb_exflags;
rl->rl_flags = lkb->lkb_flags;
rl->rl_lvbseq = lkb->lkb_lvbseq;
rl->rl_rqmode = lkb->lkb_rqmode;
rl->rl_grmode = lkb->lkb_grmode;
rl->rl_status = lkb->lkb_status;
rl->rl_wait_type = lkb->lkb_wait_type;
if (lkb->lkb_bastaddr)
rl->rl_asts |= AST_BAST;
if (lkb->lkb_astaddr)
rl->rl_asts |= AST_COMP;
if (lkb->lkb_range)
memcpy(rl->rl_range, lkb->lkb_range, 4*sizeof(uint64_t));
rl->rl_namelen = r->res_length;
memcpy(rl->rl_name, r->res_name, r->res_length);
/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
If so, receive_rcom_lock_args() won't take this copy. */
if (lkb->lkb_lvbptr)
memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
}
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
struct dlm_ls *ls = r->res_ls;
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
struct rcom_lock *rl;
int error, len = sizeof(struct rcom_lock);
if (lkb->lkb_lvbptr)
len += ls->ls_lvblen;
error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
if (error)
goto out;
rl = (struct rcom_lock *) rc->rc_buf;
pack_rcom_lock(r, lkb, rl);
rc->rc_id = (unsigned long) r;
send_rcom(ls, mh, rc);
out:
return error;
}
static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error, nodeid = rc_in->rc_header.h_nodeid;
dlm_recover_master_copy(ls, rc_in);
error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
sizeof(struct rcom_lock), &rc, &mh);
if (error)
return;
/* We send back the same rcom_lock struct we received, but
dlm_recover_master_copy() has filled in rl_remid and rl_result */
memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
rc->rc_id = rc_in->rc_id;
send_rcom(ls, mh, rc);
}
static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
uint32_t status = dlm_recover_status(ls);
if (!(status & DLM_RS_DIR)) {
log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
rc_in->rc_header.h_nodeid);
return;
}
dlm_recover_process_copy(ls, rc_in);
}
static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
char *mb;
int mb_len = sizeof(struct dlm_rcom);
mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
if (!mh)
return -ENOBUFS;
memset(mb, 0, mb_len);
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
rc->rc_type = DLM_RCOM_STATUS_REPLY;
rc->rc_result = -ESRCH;
dlm_rcom_out(rc);
dlm_lowcomms_commit_buffer(mh);
return 0;
}
/* Called by dlm_recvd; corresponds to dlm_receive_message() but handles
   the special recovery-only comms that are sent through here. */
void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
{
struct dlm_rcom *rc = (struct dlm_rcom *) hd;
struct dlm_ls *ls;
dlm_rcom_in(rc);
/* If the lockspace doesn't exist then still send a status message
back; it's possible that it just doesn't have its global_id yet. */
ls = dlm_find_lockspace_global(hd->h_lockspace);
if (!ls) {
log_print("lockspace %x from %d not found",
hd->h_lockspace, nodeid);
send_ls_not_ready(nodeid, rc);
return;
}
if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
log_error(ls, "ignoring recovery message %x from %d",
rc->rc_type, nodeid);
goto out;
}
if (nodeid != rc->rc_header.h_nodeid) {
log_error(ls, "bad rcom nodeid %d from %d",
rc->rc_header.h_nodeid, nodeid);
goto out;
}
switch (rc->rc_type) {
case DLM_RCOM_STATUS:
receive_rcom_status(ls, rc);
break;
case DLM_RCOM_NAMES:
receive_rcom_names(ls, rc);
break;
case DLM_RCOM_LOOKUP:
receive_rcom_lookup(ls, rc);
break;
case DLM_RCOM_LOCK:
receive_rcom_lock(ls, rc);
break;
case DLM_RCOM_STATUS_REPLY:
receive_rcom_status_reply(ls, rc);
break;
case DLM_RCOM_NAMES_REPLY:
receive_rcom_names_reply(ls, rc);
break;
case DLM_RCOM_LOOKUP_REPLY:
receive_rcom_lookup_reply(ls, rc);
break;
case DLM_RCOM_LOCK_REPLY:
receive_rcom_lock_reply(ls, rc);
break;
default:
DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
}
out:
dlm_put_lockspace(ls);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __RCOM_DOT_H__
#define __RCOM_DOT_H__
int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "dir.h"
#include "config.h"
#include "ast.h"
#include "memory.h"
#include "rcom.h"
#include "lock.h"
#include "lowcomms.h"
#include "member.h"
#include "recover.h"
/*
* Recovery waiting routines: these functions wait for a particular reply from
* a remote node, or for the remote node to report a certain status. They need
* to abort if the lockspace is stopped indicating a node has failed (perhaps
* the one being waited for).
*/
/*
* Wait until given function returns non-zero or lockspace is stopped
* (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
* function thinks it could have completed the waited-on task, it should wake
* up ls_wait_general to get an immediate response rather than waiting for the
* timer to detect the result. A timer wakes us up periodically while waiting
* to see if we should abort due to a node failure. This should only be called
* by the dlm_recoverd thread.
*/
static void dlm_wait_timer_fn(unsigned long data)
{
struct dlm_ls *ls = (struct dlm_ls *) data;
mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
wake_up(&ls->ls_wait_general);
}
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
{
int error = 0;
init_timer(&ls->ls_timer);
ls->ls_timer.function = dlm_wait_timer_fn;
ls->ls_timer.data = (long) ls;
ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
add_timer(&ls->ls_timer);
wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
del_timer_sync(&ls->ls_timer);
if (dlm_recovery_stopped(ls)) {
log_debug(ls, "dlm_wait_function aborted");
error = -EINTR;
}
return error;
}
/*
* An efficient way for all nodes to wait for all others to have a certain
* status. The node with the lowest nodeid polls all the others for their
* status (wait_status_all) and all the others poll the node with the low id
* for its accumulated result (wait_status_low). When all nodes have set
* status flag X, then status flag X_ALL will be set on the low nodeid.
*/
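/*
 * For reference, a sketch of the assumed status-flag layout (these values
 * live in dlm_internal.h, not in this file): wait_status() below computes
 * X_ALL as X << 1, so each DLM_RS_X flag is expected to sit one bit below
 * its DLM_RS_X_ALL companion, e.g.:
 *
 *	#define DLM_RS_NODES		0x00000001
 *	#define DLM_RS_NODES_ALL	0x00000002
 *	#define DLM_RS_DIR		0x00000004
 *	#define DLM_RS_DIR_ALL		0x00000008
 */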
uint32_t dlm_recover_status(struct dlm_ls *ls)
{
uint32_t status;
spin_lock(&ls->ls_recover_lock);
status = ls->ls_recover_status;
spin_unlock(&ls->ls_recover_lock);
return status;
}
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
{
spin_lock(&ls->ls_recover_lock);
ls->ls_recover_status |= status;
spin_unlock(&ls->ls_recover_lock);
}
static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
{
struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
struct dlm_member *memb;
int error = 0, delay;
list_for_each_entry(memb, &ls->ls_nodes, list) {
delay = 0;
for (;;) {
if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto out;
}
error = dlm_rcom_status(ls, memb->nodeid);
if (error)
goto out;
if (rc->rc_result & wait_status)
break;
if (delay < 1000)
delay += 20;
msleep(delay);
}
}
out:
return error;
}
static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
{
struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
for (;;) {
if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto out;
}
error = dlm_rcom_status(ls, nodeid);
if (error)
break;
if (rc->rc_result & wait_status)
break;
if (delay < 1000)
delay += 20;
msleep(delay);
}
out:
return error;
}
static int wait_status(struct dlm_ls *ls, uint32_t status)
{
uint32_t status_all = status << 1;
int error;
if (ls->ls_low_nodeid == dlm_our_nodeid()) {
error = wait_status_all(ls, status);
if (!error)
dlm_set_recover_status(ls, status_all);
} else
error = wait_status_low(ls, status_all);
return error;
}
int dlm_recover_members_wait(struct dlm_ls *ls)
{
return wait_status(ls, DLM_RS_NODES);
}
int dlm_recover_directory_wait(struct dlm_ls *ls)
{
return wait_status(ls, DLM_RS_DIR);
}
int dlm_recover_locks_wait(struct dlm_ls *ls)
{
return wait_status(ls, DLM_RS_LOCKS);
}
int dlm_recover_done_wait(struct dlm_ls *ls)
{
return wait_status(ls, DLM_RS_DONE);
}
/*
* The recover_list contains all the rsb's for which we've requested the new
* master nodeid. As replies are returned from the resource directories the
* rsb's are removed from the list. When the list is empty we're done.
*
* The recover_list is later similarly used for all rsb's for which we've sent
* new lkb's and need to receive new corresponding lkid's.
*
* We use the address of the rsb struct as a simple local identifier for the
* rsb so we can match an rcom reply with the rsb it was sent for.
*/
static int recover_list_empty(struct dlm_ls *ls)
{
int empty;
spin_lock(&ls->ls_recover_list_lock);
empty = list_empty(&ls->ls_recover_list);
spin_unlock(&ls->ls_recover_list_lock);
return empty;
}
static void recover_list_add(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
spin_lock(&ls->ls_recover_list_lock);
if (list_empty(&r->res_recover_list)) {
list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
ls->ls_recover_list_count++;
dlm_hold_rsb(r);
}
spin_unlock(&ls->ls_recover_list_lock);
}
static void recover_list_del(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
spin_lock(&ls->ls_recover_list_lock);
list_del_init(&r->res_recover_list);
ls->ls_recover_list_count--;
spin_unlock(&ls->ls_recover_list_lock);
dlm_put_rsb(r);
}
static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
{
struct dlm_rsb *r = NULL;
spin_lock(&ls->ls_recover_list_lock);
list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
if (id == (unsigned long) r)
goto out;
}
r = NULL;
out:
spin_unlock(&ls->ls_recover_list_lock);
return r;
}
static void recover_list_clear(struct dlm_ls *ls)
{
struct dlm_rsb *r, *s;
spin_lock(&ls->ls_recover_list_lock);
list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
list_del_init(&r->res_recover_list);
dlm_put_rsb(r);
ls->ls_recover_list_count--;
}
if (ls->ls_recover_list_count != 0) {
log_error(ls, "warning: recover_list_count %d",
ls->ls_recover_list_count);
ls->ls_recover_list_count = 0;
}
spin_unlock(&ls->ls_recover_list_lock);
}
/* Master recovery: find new master node for rsb's that were
mastered on nodes that have been removed.
dlm_recover_masters
recover_master
dlm_send_rcom_lookup -> receive_rcom_lookup
dlm_dir_lookup
receive_rcom_lookup_reply <-
dlm_recover_master_reply
set_new_master
set_master_lkbs
set_lock_master
*/
/*
* Set the lock master for all LKBs in a lock queue
* If we are the new master of the rsb, we may have received new
* MSTCPY locks from other nodes already which we need to ignore
* when setting the new nodeid.
*/
static void set_lock_master(struct list_head *queue, int nodeid)
{
struct dlm_lkb *lkb;
list_for_each_entry(lkb, queue, lkb_statequeue)
if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
lkb->lkb_nodeid = nodeid;
}
static void set_master_lkbs(struct dlm_rsb *r)
{
set_lock_master(&r->res_grantqueue, r->res_nodeid);
set_lock_master(&r->res_convertqueue, r->res_nodeid);
set_lock_master(&r->res_waitqueue, r->res_nodeid);
}
/*
* Propagate the new master nodeid to locks
* The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
* The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
*/
static void set_new_master(struct dlm_rsb *r, int nodeid)
{
lock_rsb(r);
r->res_nodeid = nodeid;
set_master_lkbs(r);
rsb_set_flag(r, RSB_NEW_MASTER);
rsb_set_flag(r, RSB_NEW_MASTER2);
unlock_rsb(r);
}
/*
* We do async lookups on rsb's that need new masters. The rsb's
* waiting for a lookup reply are kept on the recover_list.
*/
static int recover_master(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
dir_nodeid = dlm_dir_nodeid(r);
if (dir_nodeid == our_nodeid) {
error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
r->res_length, &ret_nodeid);
if (error)
log_error(ls, "recover dir lookup error %d", error);
if (ret_nodeid == our_nodeid)
ret_nodeid = 0;
set_new_master(r, ret_nodeid);
} else {
recover_list_add(r);
error = dlm_send_rcom_lookup(r, dir_nodeid);
}
return error;
}
/*
* When not using a directory, most resource names will hash to a new static
* master nodeid and the resource will need to be remastered.
*/
static int recover_master_static(struct dlm_rsb *r)
{
int master = dlm_dir_nodeid(r);
if (master == dlm_our_nodeid())
master = 0;
if (r->res_nodeid != master) {
if (is_master(r))
dlm_purge_mstcpy_locks(r);
set_new_master(r, master);
return 1;
}
return 0;
}
/*
* Go through local root resources and for each rsb which has a master which
* has departed, get the new master nodeid from the directory. The dir will
* assign mastery to the first node to look up the new master. That means
* we'll discover in this lookup if we're the new master of any rsb's.
*
* We fire off all the dir lookup requests individually and asynchronously to
* the correct dir node.
*/
int dlm_recover_masters(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int error = 0, count = 0;
log_debug(ls, "dlm_recover_masters");
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
if (dlm_recovery_stopped(ls)) {
up_read(&ls->ls_root_sem);
error = -EINTR;
goto out;
}
if (dlm_no_directory(ls))
count += recover_master_static(r);
else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
recover_master(r);
count++;
}
schedule();
}
up_read(&ls->ls_root_sem);
log_debug(ls, "dlm_recover_masters %d resources", count);
error = dlm_wait_function(ls, &recover_list_empty);
out:
if (error)
recover_list_clear(ls);
return error;
}
int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
{
struct dlm_rsb *r;
int nodeid;
r = recover_list_find(ls, rc->rc_id);
if (!r) {
log_error(ls, "dlm_recover_master_reply no id %"PRIx64"",
rc->rc_id);
goto out;
}
nodeid = rc->rc_result;
if (nodeid == dlm_our_nodeid())
nodeid = 0;
set_new_master(r, nodeid);
recover_list_del(r);
if (recover_list_empty(ls))
wake_up(&ls->ls_wait_general);
out:
return 0;
}
/* Lock recovery: rebuild the process-copy locks we hold on a
remastered rsb on the new rsb master.
dlm_recover_locks
recover_locks
recover_locks_queue
dlm_send_rcom_lock -> receive_rcom_lock
dlm_recover_master_copy
receive_rcom_lock_reply <-
dlm_recover_process_copy
*/
/*
* keep a count of the number of lkb's we send to the new master; when we get
* an equal number of replies then recovery for the rsb is done
*/
static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
{
struct dlm_lkb *lkb;
int error = 0;
list_for_each_entry(lkb, head, lkb_statequeue) {
error = dlm_send_rcom_lock(r, lkb);
if (error)
break;
r->res_recover_locks_count++;
}
return error;
}
static int all_queues_empty(struct dlm_rsb *r)
{
if (!list_empty(&r->res_grantqueue) ||
!list_empty(&r->res_convertqueue) ||
!list_empty(&r->res_waitqueue))
return FALSE;
return TRUE;
}
static int recover_locks(struct dlm_rsb *r)
{
int error = 0;
lock_rsb(r);
if (all_queues_empty(r))
goto out;
DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
error = recover_locks_queue(r, &r->res_grantqueue);
if (error)
goto out;
error = recover_locks_queue(r, &r->res_convertqueue);
if (error)
goto out;
error = recover_locks_queue(r, &r->res_waitqueue);
if (error)
goto out;
if (r->res_recover_locks_count)
recover_list_add(r);
else
rsb_clear_flag(r, RSB_NEW_MASTER);
out:
unlock_rsb(r);
return error;
}
int dlm_recover_locks(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int error, count = 0;
log_debug(ls, "dlm_recover_locks");
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
if (is_master(r)) {
rsb_clear_flag(r, RSB_NEW_MASTER);
continue;
}
if (!rsb_flag(r, RSB_NEW_MASTER))
continue;
if (dlm_recovery_stopped(ls)) {
error = -EINTR;
up_read(&ls->ls_root_sem);
goto out;
}
error = recover_locks(r);
if (error) {
up_read(&ls->ls_root_sem);
goto out;
}
count += r->res_recover_locks_count;
}
up_read(&ls->ls_root_sem);
log_debug(ls, "dlm_recover_locks %d locks", count);
error = dlm_wait_function(ls, &recover_list_empty);
out:
if (error)
recover_list_clear(ls);
else
dlm_set_recover_status(ls, DLM_RS_LOCKS);
return error;
}
void dlm_recovered_lock(struct dlm_rsb *r)
{
DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
r->res_recover_locks_count--;
if (!r->res_recover_locks_count) {
rsb_clear_flag(r, RSB_NEW_MASTER);
recover_list_del(r);
}
if (recover_list_empty(r->res_ls))
wake_up(&r->res_ls->ls_wait_general);
}
/*
* The lvb needs to be recovered on all master rsb's. This includes setting
* the VALNOTVALID flag if necessary, and determining the correct lvb contents
* based on the lvb's of the locks held on the rsb.
*
* RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
* was already set prior to recovery, it's not cleared, regardless of locks.
*
* The LVB contents are only considered for changing when this is a new master
* of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
* mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
* from the lkb with the largest lvb sequence number.
*/
static void recover_lvb(struct dlm_rsb *r)
{
struct dlm_lkb *lkb, *high_lkb = NULL;
uint32_t high_seq = 0;
int lock_lvb_exists = FALSE;
int big_lock_exists = FALSE;
int lvblen = r->res_ls->ls_lvblen;
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = TRUE;
if (lkb->lkb_grmode > DLM_LOCK_CR) {
big_lock_exists = TRUE;
goto setflag;
}
if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
high_lkb = lkb;
high_seq = lkb->lkb_lvbseq;
}
}
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = TRUE;
if (lkb->lkb_grmode > DLM_LOCK_CR) {
big_lock_exists = TRUE;
goto setflag;
}
if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
high_lkb = lkb;
high_seq = lkb->lkb_lvbseq;
}
}
setflag:
if (!lock_lvb_exists)
goto out;
if (!big_lock_exists)
rsb_set_flag(r, RSB_VALNOTVALID);
/* don't mess with the lvb unless we're the new master */
if (!rsb_flag(r, RSB_NEW_MASTER2))
goto out;
if (!r->res_lvbptr) {
r->res_lvbptr = allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
goto out;
}
if (big_lock_exists) {
r->res_lvbseq = lkb->lkb_lvbseq;
memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
} else if (high_lkb) {
r->res_lvbseq = high_lkb->lkb_lvbseq;
memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
} else {
r->res_lvbseq = 0;
memset(r->res_lvbptr, 0, lvblen);
}
out:
return;
}
/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
converting PR->CW or CW->PR need to have their lkb_grmode set. */
static void recover_conversion(struct dlm_rsb *r)
{
struct dlm_lkb *lkb;
int grmode = -1;
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
if (lkb->lkb_grmode == DLM_LOCK_PR ||
lkb->lkb_grmode == DLM_LOCK_CW) {
grmode = lkb->lkb_grmode;
break;
}
}
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
if (lkb->lkb_grmode != DLM_LOCK_IV)
continue;
if (grmode == -1)
lkb->lkb_grmode = lkb->lkb_rqmode;
else
lkb->lkb_grmode = grmode;
}
}
void dlm_recover_rsbs(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int count = 0;
log_debug(ls, "dlm_recover_rsbs");
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
lock_rsb(r);
if (is_master(r)) {
if (rsb_flag(r, RSB_RECOVER_CONVERT))
recover_conversion(r);
recover_lvb(r);
count++;
}
rsb_clear_flag(r, RSB_RECOVER_CONVERT);
unlock_rsb(r);
}
up_read(&ls->ls_root_sem);
log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
}
/* Create a single list of all root rsb's to be used during recovery */
int dlm_create_root_list(struct dlm_ls *ls)
{
struct dlm_rsb *r;
int i, error = 0;
down_write(&ls->ls_root_sem);
if (!list_empty(&ls->ls_root_list)) {
log_error(ls, "root list not empty");
error = -EINVAL;
goto out;
}
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
read_lock(&ls->ls_rsbtbl[i].lock);
list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
list_add(&r->res_root_list, &ls->ls_root_list);
dlm_hold_rsb(r);
}
read_unlock(&ls->ls_rsbtbl[i].lock);
}
out:
up_write(&ls->ls_root_sem);
return error;
}
void dlm_release_root_list(struct dlm_ls *ls)
{
struct dlm_rsb *r, *safe;
down_write(&ls->ls_root_sem);
list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
list_del_init(&r->res_root_list);
dlm_put_rsb(r);
}
up_write(&ls->ls_root_sem);
}
void dlm_clear_toss_list(struct dlm_ls *ls)
{
struct dlm_rsb *r, *safe;
int i;
for (i = 0; i < ls->ls_rsbtbl_size; i++) {
write_lock(&ls->ls_rsbtbl[i].lock);
list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
res_hashchain) {
list_del(&r->res_hashchain);
free_rsb(r);
}
write_unlock(&ls->ls_rsbtbl[i].lock);
}
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __RECOVER_DOT_H__
#define __RECOVER_DOT_H__
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
uint32_t dlm_recover_status(struct dlm_ls *ls);
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
int dlm_recover_members_wait(struct dlm_ls *ls);
int dlm_recover_directory_wait(struct dlm_ls *ls);
int dlm_recover_locks_wait(struct dlm_ls *ls);
int dlm_recover_done_wait(struct dlm_ls *ls);
int dlm_recover_masters(struct dlm_ls *ls);
int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_locks(struct dlm_ls *ls);
void dlm_recovered_lock(struct dlm_rsb *r);
int dlm_create_root_list(struct dlm_ls *ls);
void dlm_release_root_list(struct dlm_ls *ls);
void dlm_clear_toss_list(struct dlm_ls *ls);
void dlm_recover_rsbs(struct dlm_ls *ls);
#endif /* __RECOVER_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "dir.h"
#include "ast.h"
#include "recover.h"
#include "lowcomms.h"
#include "lock.h"
#include "requestqueue.h"
#include "recoverd.h"
/* If the start for which we're re-enabling locking (seq) has been superseded
by a newer stop (ls_recover_seq), we need to leave locking disabled. */
static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
int error = -EINTR;
spin_lock(&ls->ls_recover_lock);
if (ls->ls_recover_seq == seq) {
set_bit(LSFL_RUNNING, &ls->ls_flags);
up_write(&ls->ls_in_recovery);
error = 0;
}
spin_unlock(&ls->ls_recover_lock);
return error;
}
static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
unsigned long start;
int error, neg = 0;
log_debug(ls, "recover %"PRIx64"", rv->seq);
down(&ls->ls_recoverd_active);
/*
* Suspending and resuming dlm_astd ensures that no lkb's from this ls
* will be processed by dlm_astd during recovery.
*/
dlm_astd_suspend();
dlm_astd_resume();
/*
* This list of root rsb's will be the basis of most of the recovery
* routines.
*/
dlm_create_root_list(ls);
/*
* Free all the tossed rsb's so we don't have to recover them.
*/
dlm_clear_toss_list(ls);
/*
* Add or remove nodes from the lockspace's ls_nodes list.
* Also waits for all nodes to complete dlm_recover_members.
*/
error = dlm_recover_members(ls, rv, &neg);
if (error) {
log_error(ls, "recover_members failed %d", error);
goto fail;
}
start = jiffies;
/*
* Rebuild our own share of the directory by collecting from all other
* nodes their master rsb names that hash to us.
*/
error = dlm_recover_directory(ls);
if (error) {
log_error(ls, "recover_directory failed %d", error);
goto fail;
}
/*
* Purge directory-related requests that are saved in requestqueue.
* All dir requests from before recovery are invalid now due to the dir
* rebuild and will be resent by the requesting nodes.
*/
dlm_purge_requestqueue(ls);
/*
* Wait for all nodes to complete directory rebuild.
*/
error = dlm_recover_directory_wait(ls);
if (error) {
log_error(ls, "recover_directory_wait failed %d", error);
goto fail;
}
/*
* We may have outstanding operations that are waiting for a reply from
* a failed node. Mark these to be resent after recovery. Unlock and
* cancel ops can just be completed.
*/
dlm_recover_waiters_pre(ls);
error = dlm_recovery_stopped(ls);
if (error)
goto fail;
if (neg || dlm_no_directory(ls)) {
/*
* Clear lkb's for departed nodes.
*/
dlm_purge_locks(ls);
/*
* Get new master nodeid's for rsb's that were mastered on
* departed nodes.
*/
error = dlm_recover_masters(ls);
if (error) {
log_error(ls, "recover_masters failed %d", error);
goto fail;
}
/*
* Send our locks on remastered rsb's to the new masters.
*/
error = dlm_recover_locks(ls);
if (error) {
log_error(ls, "recover_locks failed %d", error);
goto fail;
}
error = dlm_recover_locks_wait(ls);
if (error) {
log_error(ls, "recover_locks_wait failed %d", error);
goto fail;
}
/*
* Finalize state in master rsb's now that all locks can be
* checked. This includes conversion resolution and lvb
* settings.
*/
dlm_recover_rsbs(ls);
}
dlm_release_root_list(ls);
dlm_set_recover_status(ls, DLM_RS_DONE);
error = dlm_recover_done_wait(ls);
if (error) {
log_error(ls, "recover_done_wait failed %d", error);
goto fail;
}
dlm_clear_members_gone(ls);
error = enable_locking(ls, rv->seq);
if (error) {
log_error(ls, "enable_locking failed %d", error);
goto fail;
}
error = dlm_process_requestqueue(ls);
if (error) {
log_error(ls, "process_requestqueue failed %d", error);
goto fail;
}
error = dlm_recover_waiters_post(ls);
if (error) {
log_error(ls, "recover_waiters_post failed %d", error);
goto fail;
}
dlm_grant_after_purge(ls);
dlm_astd_wake();
log_debug(ls, "recover %"PRIx64" done: %u ms", rv->seq,
jiffies_to_msecs(jiffies - start));
up(&ls->ls_recoverd_active);
return 0;
fail:
dlm_release_root_list(ls);
log_debug(ls, "recover %"PRIx64" error %d", rv->seq, error);
up(&ls->ls_recoverd_active);
return error;
}
static void do_ls_recovery(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL;
spin_lock(&ls->ls_recover_lock);
rv = ls->ls_recover_args;
ls->ls_recover_args = NULL;
clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
spin_unlock(&ls->ls_recover_lock);
if (rv) {
ls_recover(ls, rv);
kfree(rv->nodeids);
kfree(rv);
}
}
static int dlm_recoverd(void *arg)
{
struct dlm_ls *ls;
ls = dlm_find_lockspace_local(arg);
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!test_bit(LSFL_WORK, &ls->ls_flags))
schedule();
set_current_state(TASK_RUNNING);
if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
do_ls_recovery(ls);
}
dlm_put_lockspace(ls);
return 0;
}
void dlm_recoverd_kick(struct dlm_ls *ls)
{
set_bit(LSFL_WORK, &ls->ls_flags);
wake_up_process(ls->ls_recoverd_task);
}
int dlm_recoverd_start(struct dlm_ls *ls)
{
struct task_struct *p;
int error = 0;
p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
if (IS_ERR(p))
error = PTR_ERR(p);
else
ls->ls_recoverd_task = p;
return error;
}
void dlm_recoverd_stop(struct dlm_ls *ls)
{
kthread_stop(ls->ls_recoverd_task);
}
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
down(&ls->ls_recoverd_active);
}
void dlm_recoverd_resume(struct dlm_ls *ls)
{
up(&ls->ls_recoverd_active);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __RECOVERD_DOT_H__
#define __RECOVERD_DOT_H__
void dlm_recoverd_kick(struct dlm_ls *ls);
void dlm_recoverd_stop(struct dlm_ls *ls);
int dlm_recoverd_start(struct dlm_ls *ls);
void dlm_recoverd_suspend(struct dlm_ls *ls);
void dlm_recoverd_resume(struct dlm_ls *ls);
#endif /* __RECOVERD_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "member.h"
#include "lock.h"
#include "dir.h"
#include "config.h"
#include "requestqueue.h"
struct rq_entry {
struct list_head list;
int nodeid;
char request[1];
};
/*
* Requests received while the lockspace is in recovery get added to the
* request queue and processed when recovery is complete. This happens when
* the lockspace is suspended on some nodes before it is on others, or the
* lockspace is enabled on some while still suspended on others.
*/
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
{
struct rq_entry *e;
int length = hd->h_length;
if (dlm_is_removed(ls, nodeid))
return;
e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
if (!e) {
log_print("dlm_add_requestqueue: out of memory\n");
return;
}
e->nodeid = nodeid;
memcpy(e->request, hd, length);
down(&ls->ls_requestqueue_lock);
list_add_tail(&e->list, &ls->ls_requestqueue);
up(&ls->ls_requestqueue_lock);
}
int dlm_process_requestqueue(struct dlm_ls *ls)
{
struct rq_entry *e;
struct dlm_header *hd;
int error = 0;
down(&ls->ls_requestqueue_lock);
for (;;) {
if (list_empty(&ls->ls_requestqueue)) {
up(&ls->ls_requestqueue_lock);
error = 0;
break;
}
e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
up(&ls->ls_requestqueue_lock);
hd = (struct dlm_header *) e->request;
error = dlm_receive_message(hd, e->nodeid, TRUE);
if (error == -EINTR) {
/* entry is left on requestqueue */
log_debug(ls, "process_requestqueue abort eintr");
break;
}
down(&ls->ls_requestqueue_lock);
list_del(&e->list);
kfree(e);
if (dlm_locking_stopped(ls)) {
log_debug(ls, "process_requestqueue abort running");
up(&ls->ls_requestqueue_lock);
error = -EINTR;
break;
}
schedule();
}
return error;
}
/*
* After recovery is done, locking is resumed and dlm_recoverd takes all the
* saved requests and processes them as they would have been by dlm_recvd. At
* the same time, dlm_recvd will start receiving new requests from remote
* nodes. We want to delay dlm_recvd processing new requests until
* dlm_recoverd has finished processing the old saved requests.
*/
void dlm_wait_requestqueue(struct dlm_ls *ls)
{
for (;;) {
down(&ls->ls_requestqueue_lock);
if (list_empty(&ls->ls_requestqueue))
break;
if (dlm_locking_stopped(ls))
break;
up(&ls->ls_requestqueue_lock);
schedule();
}
up(&ls->ls_requestqueue_lock);
}
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
{
uint32_t type = ms->m_type;
if (dlm_is_removed(ls, nodeid))
return 1;
/* directory operations are always purged because the directory is
always rebuilt during recovery and the lookups resent */
if (type == DLM_MSG_REMOVE ||
type == DLM_MSG_LOOKUP ||
type == DLM_MSG_LOOKUP_REPLY)
return 1;
if (!dlm_no_directory(ls))
return 0;
/* with no directory, the master is likely to change as a part of
recovery; requests to/from the defunct master need to be purged */
switch (type) {
case DLM_MSG_REQUEST:
case DLM_MSG_CONVERT:
case DLM_MSG_UNLOCK:
case DLM_MSG_CANCEL:
/* we're no longer the master of this resource, the sender
will resend to the new master (see waiter_needs_recovery) */
if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
return 1;
break;
case DLM_MSG_REQUEST_REPLY:
case DLM_MSG_CONVERT_REPLY:
case DLM_MSG_UNLOCK_REPLY:
case DLM_MSG_CANCEL_REPLY:
case DLM_MSG_GRANT:
/* this reply is from the former master of the resource,
we'll resend to the new master if needed */
if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
return 1;
break;
}
return 0;
}
void dlm_purge_requestqueue(struct dlm_ls *ls)
{
struct dlm_message *ms;
struct rq_entry *e, *safe;
down(&ls->ls_requestqueue_lock);
list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
ms = (struct dlm_message *) e->request;
if (purge_request(ls, ms, e->nodeid)) {
list_del(&e->list);
kfree(e);
}
}
up(&ls->ls_requestqueue_lock);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __REQUESTQUEUE_DOT_H__
#define __REQUESTQUEUE_DOT_H__
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
int dlm_process_requestqueue(struct dlm_ls *ls);
void dlm_wait_requestqueue(struct dlm_ls *ls);
void dlm_purge_requestqueue(struct dlm_ls *ls);
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#include "dlm_internal.h"
#include "rcom.h"
#include "util.h"
static void header_out(struct dlm_header *hd)
{
hd->h_version = cpu_to_le32(hd->h_version);
hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
hd->h_length = cpu_to_le16(hd->h_length);
}
static void header_in(struct dlm_header *hd)
{
hd->h_version = le32_to_cpu(hd->h_version);
hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
hd->h_length = le16_to_cpu(hd->h_length);
}
void dlm_message_out(struct dlm_message *ms)
{
struct dlm_header *hd = (struct dlm_header *) ms;
header_out(hd);
ms->m_type = cpu_to_le32(ms->m_type);
ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
ms->m_pid = cpu_to_le32(ms->m_pid);
ms->m_lkid = cpu_to_le32(ms->m_lkid);
ms->m_remid = cpu_to_le32(ms->m_remid);
ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
ms->m_exflags = cpu_to_le32(ms->m_exflags);
ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
ms->m_flags = cpu_to_le32(ms->m_flags);
ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
ms->m_hash = cpu_to_le32(ms->m_hash);
ms->m_status = cpu_to_le32(ms->m_status);
ms->m_grmode = cpu_to_le32(ms->m_grmode);
ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
ms->m_asts = cpu_to_le32(ms->m_asts);
ms->m_result = cpu_to_le32(ms->m_result);
ms->m_range[0] = cpu_to_le64(ms->m_range[0]);
ms->m_range[1] = cpu_to_le64(ms->m_range[1]);
}
void dlm_message_in(struct dlm_message *ms)
{
struct dlm_header *hd = (struct dlm_header *) ms;
header_in(hd);
ms->m_type = le32_to_cpu(ms->m_type);
ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
ms->m_pid = le32_to_cpu(ms->m_pid);
ms->m_lkid = le32_to_cpu(ms->m_lkid);
ms->m_remid = le32_to_cpu(ms->m_remid);
ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
ms->m_exflags = le32_to_cpu(ms->m_exflags);
ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
ms->m_flags = le32_to_cpu(ms->m_flags);
ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
ms->m_hash = le32_to_cpu(ms->m_hash);
ms->m_status = le32_to_cpu(ms->m_status);
ms->m_grmode = le32_to_cpu(ms->m_grmode);
ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
ms->m_asts = le32_to_cpu(ms->m_asts);
ms->m_result = le32_to_cpu(ms->m_result);
ms->m_range[0] = le64_to_cpu(ms->m_range[0]);
ms->m_range[1] = le64_to_cpu(ms->m_range[1]);
}
static void rcom_lock_out(struct rcom_lock *rl)
{
rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
rl->rl_remid = cpu_to_le32(rl->rl_remid);
rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
rl->rl_flags = cpu_to_le32(rl->rl_flags);
rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
rl->rl_result = cpu_to_le32(rl->rl_result);
rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
rl->rl_range[0] = cpu_to_le64(rl->rl_range[0]);
rl->rl_range[1] = cpu_to_le64(rl->rl_range[1]);
rl->rl_range[2] = cpu_to_le64(rl->rl_range[2]);
rl->rl_range[3] = cpu_to_le64(rl->rl_range[3]);
}
static void rcom_lock_in(struct rcom_lock *rl)
{
rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
rl->rl_remid = le32_to_cpu(rl->rl_remid);
rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
rl->rl_flags = le32_to_cpu(rl->rl_flags);
rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
rl->rl_result = le32_to_cpu(rl->rl_result);
rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
rl->rl_range[0] = le64_to_cpu(rl->rl_range[0]);
rl->rl_range[1] = le64_to_cpu(rl->rl_range[1]);
rl->rl_range[2] = le64_to_cpu(rl->rl_range[2]);
rl->rl_range[3] = le64_to_cpu(rl->rl_range[3]);
}
static void rcom_config_out(struct rcom_config *rf)
{
rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
}
static void rcom_config_in(struct rcom_config *rf)
{
rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
}
void dlm_rcom_out(struct dlm_rcom *rc)
{
struct dlm_header *hd = (struct dlm_header *) rc;
int type = rc->rc_type;
header_out(hd);
rc->rc_type = cpu_to_le32(rc->rc_type);
rc->rc_result = cpu_to_le32(rc->rc_result);
rc->rc_id = cpu_to_le64(rc->rc_id);
if (type == DLM_RCOM_LOCK)
rcom_lock_out((struct rcom_lock *) rc->rc_buf);
else if (type == DLM_RCOM_STATUS_REPLY)
rcom_config_out((struct rcom_config *) rc->rc_buf);
}
void dlm_rcom_in(struct dlm_rcom *rc)
{
struct dlm_header *hd = (struct dlm_header *) rc;
header_in(hd);
rc->rc_type = le32_to_cpu(rc->rc_type);
rc->rc_result = le32_to_cpu(rc->rc_result);
rc->rc_id = le64_to_cpu(rc->rc_id);
if (rc->rc_type == DLM_RCOM_LOCK)
rcom_lock_in((struct rcom_lock *) rc->rc_buf);
else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
rcom_config_in((struct rcom_config *) rc->rc_buf);
}
/******************************************************************************
*******************************************************************************
**
** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__
void dlm_message_out(struct dlm_message *ms);
void dlm_message_in(struct dlm_message *ms);
void dlm_rcom_out(struct dlm_rcom *rc);
void dlm_rcom_in(struct dlm_rcom *rc);
#endif
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
#ifndef __DLM_DOT_H__
#define __DLM_DOT_H__
/*
* Interface to Distributed Lock Manager (DLM)
* routines and structures to use DLM lockspaces
*/
/*
* Lock Modes
*/
#define DLM_LOCK_IV -1 /* invalid */
#define DLM_LOCK_NL 0 /* null */
#define DLM_LOCK_CR 1 /* concurrent read */
#define DLM_LOCK_CW 2 /* concurrent write */
#define DLM_LOCK_PR 3 /* protected read */
#define DLM_LOCK_PW 4 /* protected write */
#define DLM_LOCK_EX 5 /* exclusive */
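/*
 * For reference (not part of the original header): the standard VAX-style
 * compatibility matrix these modes follow. Two locks can be granted on the
 * same resource only if their modes are compatible:
 *
 *		NL  CR  CW  PR  PW  EX
 *	NL	y   y   y   y   y   y
 *	CR	y   y   y   y   y   n
 *	CW	y   y   y   n   n   n
 *	PR	y   y   n   y   n   n
 *	PW	y   y   n   n   n   n
 *	EX	y   n   n   n   n   n
 */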
/*
* Maximum size in bytes of a dlm_lock name
*/
#define DLM_RESNAME_MAXLEN 64
/*
* Flags to dlm_lock
*
* DLM_LKF_NOQUEUE
*
* Do not queue the lock request on the wait queue if it cannot be granted
* immediately. If the lock cannot be granted because of this flag, DLM will
* either return -EAGAIN from the dlm_lock call or will return 0 from
* dlm_lock and -EAGAIN in the lock status block when the AST is executed.
*
* DLM_LKF_CANCEL
*
* Used to cancel a pending lock request or conversion. A converting lock is
* returned to its previously granted mode.
*
* DLM_LKF_CONVERT
*
* Indicates a lock conversion request. For conversions the name and namelen
* are ignored and the lock ID in the LKSB is used to identify the lock.
*
* DLM_LKF_VALBLK
*
* Requests DLM to return the current contents of the lock value block in the
* lock status block. When this flag is set in a lock conversion from PW or EX
* modes, DLM assigns the value specified in the lock status block to the lock
* value block of the lock resource. The LVB is a DLM_LVB_LEN size array
* containing application-specific information.
*
* DLM_LKF_QUECVT
*
* Force a conversion request to be queued, even if it is compatible with
* the granted modes of other locks on the same resource.
*
* DLM_LKF_IVVALBLK
*
* Invalidate the lock value block.
*
* DLM_LKF_CONVDEADLK
*
* Allows the dlm to resolve conversion deadlocks internally by demoting the
* granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
* returned for a conversion that's been effected by this.
*
* DLM_LKF_PERSISTENT
*
* Only relevant to locks originating in userspace. A persistent lock will not
* be removed if the process holding the lock exits.
*
* DLM_LKF_NODLCKWT
* DLM_LKF_NODLCKBLK
*
* not yet implemented
*
* DLM_LKF_EXPEDITE
*
* Used only with new requests for NL mode locks. Tells the lock manager
* to grant the lock, ignoring other locks in convert and wait queues.
*
* DLM_LKF_NOQUEUEBAST
*
* Send blocking AST's before returning -EAGAIN to the caller. It is only
* used along with the NOQUEUE flag. Blocking AST's are not sent for failed
* NOQUEUE requests otherwise.
*
* DLM_LKF_HEADQUE
*
* Add a lock to the head of the convert or wait queue rather than the tail.
*
* DLM_LKF_NOORDER
*
* Disregard the standard grant order rules and grant a lock as soon as it
* is compatible with other granted locks.
*
* DLM_LKF_ORPHAN
*
* not yet implemented
*
* DLM_LKF_ALTPR
*
* If the requested mode cannot be granted immediately, try to grant the lock
* in PR mode instead. If this alternate mode is granted instead of the
* requested mode, DLM_SBF_ALTMODE is returned in the lksb.
*
* DLM_LKF_ALTCW
*
* The same as ALTPR, but the alternate mode is CW.
*
* DLM_LKF_FORCEUNLOCK
*
* Unlock the lock even if it is converting or waiting or has sublocks.
* Only really for use by the userland device.c code.
*
*/
#define DLM_LKF_NOQUEUE 0x00000001
#define DLM_LKF_CANCEL 0x00000002
#define DLM_LKF_CONVERT 0x00000004
#define DLM_LKF_VALBLK 0x00000008
#define DLM_LKF_QUECVT 0x00000010
#define DLM_LKF_IVVALBLK 0x00000020
#define DLM_LKF_CONVDEADLK 0x00000040
#define DLM_LKF_PERSISTENT 0x00000080
#define DLM_LKF_NODLCKWT 0x00000100
#define DLM_LKF_NODLCKBLK 0x00000200
#define DLM_LKF_EXPEDITE 0x00000400
#define DLM_LKF_NOQUEUEBAST 0x00000800
#define DLM_LKF_HEADQUE 0x00001000
#define DLM_LKF_NOORDER 0x00002000
#define DLM_LKF_ORPHAN 0x00004000
#define DLM_LKF_ALTPR 0x00008000
#define DLM_LKF_ALTCW 0x00010000
#define DLM_LKF_FORCEUNLOCK 0x00020000
/*
* Some return codes that are not in errno.h
*/
#define DLM_ECANCEL 0x10001
#define DLM_EUNLOCK 0x10002
typedef void dlm_lockspace_t;
/*
* Lock range structure
*/
struct dlm_range {
uint64_t ra_start;
uint64_t ra_end;
};
/*
* Lock status block
*
* Use this structure to specify the contents of the lock value block. For a
* conversion request, this structure is used to specify the lock ID of the
* lock. DLM writes the status of the lock request and the lock ID assigned
* to the request in the lock status block.
*
* sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
* It is available when dlm_lock returns.
*
* sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
* shown for the DLM_LKF_VALBLK flag.
*
* sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
* it was first demoted to NL to avoid conversion deadlock.
* DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
*
* sb_status: the returned status of the lock request set prior to AST
* execution. Possible return values:
*
* 0 if lock request was successful
* -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
* -ENOMEM if there is no memory to process request
* -EINVAL if there are invalid parameters
* -DLM_EUNLOCK if unlock request was successful
* -DLM_ECANCEL if a cancel completed successfully
*/
#define DLM_SBF_DEMOTED 0x01
#define DLM_SBF_VALNOTVALID 0x02
#define DLM_SBF_ALTMODE 0x04
struct dlm_lksb {
int sb_status;
uint32_t sb_lkid;
char sb_flags;
char * sb_lvbptr;
};
#ifdef __KERNEL__
#define DLM_LSFL_NODIR 0x00000001
/*
* dlm_new_lockspace
*
* Starts a lockspace with the given name. If the named lockspace exists in
* the cluster, the calling node joins it.
*/
int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
uint32_t flags, int lvblen);
/*
* dlm_release_lockspace
*
* Stop a lockspace.
*/
int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
/*
* dlm_lock
*
* Make an asynchronous request to acquire or convert a lock on a named
* resource.
*
* lockspace: context for the request
* mode: the requested mode of the lock (DLM_LOCK_)
* lksb: lock status block for input and async return values
* flags: input flags (DLM_LKF_)
* name: name of the resource to lock, can be binary
* namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
* parent: the lock ID of a parent lock or 0 if none
* lockast: function DLM executes when it completes processing the request
* astarg: argument passed to lockast and bast functions
* bast: function DLM executes when this lock later blocks another request
*
* Returns:
* 0 if request is successfully queued for processing
* -EINVAL if any input parameters are invalid
* -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
* -ENOMEM if there is no memory to process request
* -ENOTCONN if there is a communication error
*
* If the call to dlm_lock returns an error then the operation has failed and
* the AST routine will not be called. If dlm_lock returns 0 it is still
* possible that the lock operation will fail. The AST routine will be called
* when the locking is complete and the status is returned in the lksb.
*
* If AST routines or the astarg parameter are passed to a conversion
* operation, they will overwrite those values that were passed to a
* previous dlm_lock call.
*
* AST routines should not block (at least not for long), but may make
* any locking calls they please.
*/
int dlm_lock(dlm_lockspace_t *lockspace,
int mode,
struct dlm_lksb *lksb,
uint32_t flags,
void *name,
unsigned int namelen,
uint32_t parent_lkid,
void (*lockast) (void *astarg),
void *astarg,
void (*bast) (void *astarg, int mode),
struct dlm_range *range);
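/*
 * Illustrative sketch only, not part of this header: one way a kernel
 * caller might combine dlm_new_lockspace() and dlm_lock() to request an
 * EX lock on a resource named "res1". The lockspace name, resource name
 * and 32-byte lvblen are arbitrary; error handling is abbreviated.
 */
static struct dlm_lksb example_lksb;

static void example_ast(void *astarg)
{
	/* request is complete: the result is in example_lksb.sb_status and
	   the lock ID for later converts/unlocks in example_lksb.sb_lkid */
}

static int example_take_lock(void)
{
	dlm_lockspace_t *ls;
	int error;

	error = dlm_new_lockspace("example", 7, &ls, 0, 32);
	if (error)
		return error;

	/* no parent lock, no blocking AST, no range */
	return dlm_lock(ls, DLM_LOCK_EX, &example_lksb, 0, "res1", 4,
			0, example_ast, NULL, NULL, NULL);
}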
/*
* dlm_unlock
*
* Asynchronously release a lock on a resource. The AST routine is called
* when the resource is successfully unlocked.
*
* lockspace: context for the request
* lkid: the lock ID as returned in the lksb
* flags: input flags (DLM_LKF_)
* lksb: if NULL the lksb parameter passed to last lock request is used
* astarg: the arg used with the completion ast for the unlock
*
* Returns:
* 0 if request is successfully queued for processing
* -EINVAL if any input parameters are invalid
* -ENOTEMPTY if the lock still has sublocks
* -EBUSY if the lock is waiting for a remote lock operation
* -ENOTCONN if there is a communication error
*/
int dlm_unlock(dlm_lockspace_t *lockspace,
uint32_t lkid,
uint32_t flags,
struct dlm_lksb *lksb,
void *astarg);
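/*
 * Illustrative sketch only, continuing the example above: releasing the
 * lock once the completion AST has delivered sb_lkid. The same AST runs
 * again with sb_status == -DLM_EUNLOCK when the unlock completes.
 */
static int example_drop_lock(dlm_lockspace_t *ls)
{
	return dlm_unlock(ls, example_lksb.sb_lkid, 0, &example_lksb, NULL);
}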
#endif /* __KERNEL__ */
#endif /* __DLM_DOT_H__ */
/******************************************************************************
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
** of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/
/* This is the device interface for dlm; most users will use a library
* interface.
*/
#define DLM_USER_LVB_LEN 32
/* Version of the device interface */
#define DLM_DEVICE_VERSION_MAJOR 3
#define DLM_DEVICE_VERSION_MINOR 0
#define DLM_DEVICE_VERSION_PATCH 0
/* struct passed to the lock write */
struct dlm_lock_params {
__u8 mode;
__u16 flags;
__u32 lkid;
__u32 parent;
struct dlm_range range;
__u8 namelen;
void __user *castparam;
void __user *castaddr;
void __user *bastparam;
void __user *bastaddr;
struct dlm_lksb __user *lksb;
char lvb[DLM_USER_LVB_LEN];
char name[1];
};
struct dlm_lspace_params {
__u32 flags;
__u32 minor;
char name[1];
};
struct dlm_write_request {
__u32 version[3];
__u8 cmd;
union {
struct dlm_lock_params lock;
struct dlm_lspace_params lspace;
} i;
};
/* struct read from the "device" fd,
consists mainly of userspace pointers for the library to use */
struct dlm_lock_result {
__u32 length;
void __user * user_astaddr;
void __user * user_astparam;
struct dlm_lksb __user * user_lksb;
struct dlm_lksb lksb;
__u8 bast_mode;
/* Offsets may be zero if no data is present */
__u32 lvb_offset;
};
/* Commands passed to the device */
#define DLM_USER_LOCK 1
#define DLM_USER_UNLOCK 2
#define DLM_USER_QUERY 3
#define DLM_USER_CREATE_LOCKSPACE 4
#define DLM_USER_REMOVE_LOCKSPACE 5
/* Arbitrary length restriction */
#define MAX_LS_NAME_LEN 64
/* Lockspace flags */
#define DLM_USER_LSFLG_AUTOFREE 1
#define DLM_USER_LSFLG_FORCEFREE 2
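/*
 * Illustrative sketch only (real users go through libdlm): building a
 * DLM_USER_CREATE_LOCKSPACE request and writing it to the misc device.
 * The device node path below is an assumption; the actual name is created
 * by device.c and udev, and the meaning of write()'s return value is
 * defined there, not in this header.
 */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int example_create_lockspace(const char *name)
{
	size_t len = sizeof(struct dlm_write_request) + strlen(name);
	struct dlm_write_request *req = calloc(1, len);
	int fd, rv = -1;

	if (!req)
		return -1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	req->i.lspace.flags = DLM_USER_LSFLG_AUTOFREE;
	strcpy(req->i.lspace.name, name);	/* name[1] already counts the NUL */

	fd = open("/dev/misc/dlm-control", O_RDWR);	/* assumed node path */
	if (fd >= 0) {
		rv = write(fd, req, len);
		close(fd);
	}
	free(req);
	return rv;
}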