Commit 42877042 authored by Davide Libenzi's avatar Davide Libenzi Committed by Linus Torvalds

[PATCH] epoll - just when you think it's over ...

This does:

- naming cleanup: ep_* -> eventpoll_* for non-static functions ( 2 )

- No more limit of 2 poll wait queues for each file*

Before, epoll used to have, inside its item struct, space for two wait
queues. This was driven by the fact that during an f_op->poll() each file
won't register more than one read and one write wait queue. Now, I'm not
sure if this is 100% true or not, but with the current implementation a
linked list of wait queues is kept, removing any such limit.
parent 5f4eb633
/* /*
* drivers/char/eventpoll.c ( Efficent event polling implementation ) * fs/eventpoll.c ( Efficent event polling implementation )
* Copyright (C) 2001,...,2002 Davide Libenzi * Copyright (C) 2001,...,2002 Davide Libenzi
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
...@@ -67,9 +67,6 @@ ...@@ -67,9 +67,6 @@
/* Minimum size of the hash in bits ( 2^N ) */ /* Minimum size of the hash in bits ( 2^N ) */
#define EP_MIN_HASH_BITS 9 #define EP_MIN_HASH_BITS 9
/* Maximum number of wait queue we can attach to */
#define EP_MAX_POLL_QUEUE 2
/* Number of hash entries ( "struct list_head" ) inside a page */ /* Number of hash entries ( "struct list_head" ) inside a page */
#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head)) #define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))
...@@ -86,6 +83,12 @@ ...@@ -86,6 +83,12 @@
/* Macro to free a "struct epitem" to the slab cache */ /* Macro to free a "struct epitem" to the slab cache */
#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p) #define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p)
/* Macro to allocate a "struct eppoll_entry" from the slab cache */
#define PWQ_MEM_ALLOC() (struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL)
/* Macro to free a "struct eppoll_entry" to the slab cache */
#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p)
/* Fast test to see if the file is an evenpoll file */ /* Fast test to see if the file is an evenpoll file */
#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops) #define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
...@@ -146,6 +149,9 @@ struct eventpoll { ...@@ -146,6 +149,9 @@ struct eventpoll {
/* Wait structure used by the poll hooks */ /* Wait structure used by the poll hooks */
struct eppoll_entry { struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct list_head llink;
/* The "base" pointer is set to the container "struct epitem" */ /* The "base" pointer is set to the container "struct epitem" */
void *base; void *base;
...@@ -173,8 +179,8 @@ struct epitem { ...@@ -173,8 +179,8 @@ struct epitem {
/* Number of active wait queue attached to poll operations */ /* Number of active wait queue attached to poll operations */
int nwait; int nwait;
/* Wait queue used to attach poll operations */ /* List containing poll wait queues */
struct eppoll_entry wait[EP_MAX_POLL_QUEUE]; struct list_head pwqlist;
/* The "container" of this item */ /* The "container" of this item */
struct eventpoll *ep; struct eventpoll *ep;
...@@ -242,13 +248,16 @@ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, ...@@ -242,13 +248,16 @@ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
* it has to be called from outside the lock, must be protected. * it has to be called from outside the lock, must be protected.
* This is read-held during the event transfer loop to userspace * This is read-held during the event transfer loop to userspace
* and it is write-held during the file cleanup path and the epoll * and it is write-held during the file cleanup path and the epoll
* exit code. * file exit code.
*/ */
struct rw_semaphore epsem; struct rw_semaphore epsem;
/* Slab cache used to allocate "struct epitem" */ /* Slab cache used to allocate "struct epitem" */
static kmem_cache_t *dpi_cache; static kmem_cache_t *dpi_cache;
/* Slab cache used to allocate "struct eppoll_entry" */
static kmem_cache_t *pwq_cache;
/* Virtual fs used to allocate inodes for eventpoll files */ /* Virtual fs used to allocate inodes for eventpoll files */
static struct vfsmount *eventpoll_mnt; static struct vfsmount *eventpoll_mnt;
...@@ -289,7 +298,7 @@ static unsigned int ep_get_hash_bits(unsigned int hintsize) ...@@ -289,7 +298,7 @@ static unsigned int ep_get_hash_bits(unsigned int hintsize)
/* Used to initialize the epoll bits inside the "struct file" */ /* Used to initialize the epoll bits inside the "struct file" */
void ep_init_file_struct(struct file *file) void eventpoll_init_file(struct file *file)
{ {
INIT_LIST_HEAD(&file->f_ep_links); INIT_LIST_HEAD(&file->f_ep_links);
...@@ -303,17 +312,34 @@ void ep_init_file_struct(struct file *file) ...@@ -303,17 +312,34 @@ void ep_init_file_struct(struct file *file)
* correctly files that are closed without being removed from the eventpoll * correctly files that are closed without being removed from the eventpoll
* interface. * interface.
*/ */
void ep_notify_file_close(struct file *file) void eventpoll_release(struct file *file)
{ {
struct list_head *lsthead = &file->f_ep_links; struct list_head *lsthead = &file->f_ep_links;
struct epitem *dpi; struct epitem *dpi;
/*
* Fast check to avoid the get/release of the semaphore. Since
* we're doing this outside the semaphore lock, it might return
* false negatives, but we don't care. It'll help in 99.99% of cases
* to avoid the semaphore lock. False positives simply cannot happen
* because the file in on the way to be removed and nobody ( but
* eventpoll ) has still a reference to this file.
*/
if (list_empty(lsthead))
return;
/*
* We don't want to get "file->f_ep_lock" because it is not
* necessary. It is not necessary because we're in the "struct file"
* cleanup path, and this means that noone is using this file anymore.
* The only hit might come from ep_free() but by holding the semaphore
* will correctly serialize the operation.
*/
down_write(&epsem); down_write(&epsem);
while (!list_empty(lsthead)) { while (!list_empty(lsthead)) {
dpi = list_entry(lsthead->next, struct epitem, fllink); dpi = list_entry(lsthead->next, struct epitem, fllink);
EP_LIST_DEL(&dpi->fllink); EP_LIST_DEL(&dpi->fllink);
ep_remove(dpi->ep, dpi); ep_remove(dpi->ep, dpi);
} }
up_write(&epsem); up_write(&epsem);
...@@ -710,8 +736,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -710,8 +736,7 @@ static void ep_free(struct eventpoll *ep)
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
* ep_notify_file_close() while we're freeing the * eventpoll_release() while we're freeing the "struct eventpoll".
* "struct eventpoll".
*/ */
down_write(&epsem); down_write(&epsem);
...@@ -815,19 +840,28 @@ static void ep_release_epitem(struct epitem *dpi) ...@@ -815,19 +840,28 @@ static void ep_release_epitem(struct epitem *dpi)
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt)
{ {
struct epitem *dpi = EP_ITEM_FROM_EPQUEUE(pt); struct epitem *dpi = EP_ITEM_FROM_EPQUEUE(pt);
struct eppoll_entry *pwq;
/* No more than EP_MAX_POLL_QUEUE wait queue are supported */
if (dpi->nwait < EP_MAX_POLL_QUEUE) { if (dpi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC()))
add_wait_queue(whead, &dpi->wait[dpi->nwait].wait); {
dpi->wait[dpi->nwait].whead = whead; init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = dpi;
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &dpi->pwqlist);
dpi->nwait++; dpi->nwait++;
} }
else
{
/* We have to signal that an error occured */
dpi->nwait = -1;
}
} }
static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfile) static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfile)
{ {
int error, i, revents; int error, revents;
unsigned long flags; unsigned long flags;
struct epitem *dpi; struct epitem *dpi;
struct ep_pqueue epq; struct ep_pqueue epq;
...@@ -840,16 +874,12 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil ...@@ -840,16 +874,12 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil
INIT_LIST_HEAD(&dpi->llink); INIT_LIST_HEAD(&dpi->llink);
INIT_LIST_HEAD(&dpi->rdllink); INIT_LIST_HEAD(&dpi->rdllink);
INIT_LIST_HEAD(&dpi->fllink); INIT_LIST_HEAD(&dpi->fllink);
INIT_LIST_HEAD(&dpi->pwqlist);
dpi->ep = ep; dpi->ep = ep;
dpi->file = tfile; dpi->file = tfile;
dpi->pfd = *pfd; dpi->pfd = *pfd;
atomic_set(&dpi->usecnt, 1); atomic_set(&dpi->usecnt, 1);
dpi->nwait = 0; dpi->nwait = 0;
for (i = 0; i < EP_MAX_POLL_QUEUE; i++) {
init_waitqueue_func_entry(&dpi->wait[i].wait, ep_poll_callback);
dpi->wait[i].whead = NULL;
dpi->wait[i].base = dpi;
}
/* Initialize the poll table using the queue callback */ /* Initialize the poll table using the queue callback */
epq.dpi = dpi; epq.dpi = dpi;
...@@ -864,6 +894,14 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil ...@@ -864,6 +894,14 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil
poll_freewait(&epq.pt); poll_freewait(&epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (dpi->nwait < 0)
goto eexit_2;
/* We have to drop the new item inside our item list to keep track of it */ /* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags); write_lock_irqsave(&ep->lock, flags);
...@@ -893,6 +931,19 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil ...@@ -893,6 +931,19 @@ static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfil
return 0; return 0;
eexit_2:
ep_unregister_pollwait(ep, dpi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (EP_IS_LINKED(&dpi->rdllink))
EP_LIST_DEL(&dpi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
DPI_MEM_FREE(dpi);
eexit_1: eexit_1:
return error; return error;
} }
...@@ -948,14 +999,23 @@ static int ep_modify(struct eventpoll *ep, struct epitem *dpi, unsigned int even ...@@ -948,14 +999,23 @@ static int ep_modify(struct eventpoll *ep, struct epitem *dpi, unsigned int even
*/ */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *dpi) static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *dpi)
{ {
int i, nwait; int nwait;
struct list_head *lsthead = &dpi->pwqlist;
struct eppoll_entry *pwq;
/* This is called without locks, so we need the atomic exchange */ /* This is called without locks, so we need the atomic exchange */
nwait = xchg(&dpi->nwait, 0); nwait = xchg(&dpi->nwait, 0);
/* Removes poll wait queue hooks */ if (nwait)
for (i = 0; i < nwait; i++) {
remove_wait_queue(dpi->wait[i].whead, &dpi->wait[i].wait); while (!list_empty(lsthead)) {
pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
EP_LIST_DEL(&pwq->llink);
remove_wait_queue(pwq->whead, &pwq->wait);
PWQ_MEM_FREE(pwq);
}
}
} }
...@@ -1136,14 +1196,6 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **adpi, in ...@@ -1136,14 +1196,6 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **adpi, in
/* Remove the item from the ready list */ /* Remove the item from the ready list */
EP_LIST_DEL(&dpi->rdllink); EP_LIST_DEL(&dpi->rdllink);
/*
* If the item is not linked to the main hash table this means that
* it's on the way to be removed and we don't want to send events
* for such file descriptor.
*/
if (!EP_IS_LINKED(&dpi->llink))
continue;
/* /*
* We need to increase the usage count of the "struct epitem" because * We need to increase the usage count of the "struct epitem" because
* another thread might call EP_CTL_DEL on this target and make the * another thread might call EP_CTL_DEL on this target and make the
...@@ -1218,9 +1270,9 @@ static int ep_events_transfer(struct eventpoll *ep, struct pollfd *events, int m ...@@ -1218,9 +1270,9 @@ static int ep_events_transfer(struct eventpoll *ep, struct pollfd *events, int m
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
* ep_notify_file_close() while we're transfering * eventpoll_release() while we're transfering
* events to userspace. Read-holding "epsem" will lock * events to userspace. Read-holding "epsem" will lock
* out ep_notify_file_close() during the whole * out eventpoll_release() during the whole
* transfer loop and this will garantie us that the * transfer loop and this will garantie us that the
* file will not vanish underneath our nose when * file will not vanish underneath our nose when
* we will call f_op->poll() from ep_send_events(). * we will call f_op->poll() from ep_send_events().
...@@ -1370,37 +1422,49 @@ static int __init eventpoll_init(void) ...@@ -1370,37 +1422,49 @@ static int __init eventpoll_init(void)
{ {
int error; int error;
/* Initialize the semaphore used to syncronize the file cleanup code */
init_rwsem(&epsem); init_rwsem(&epsem);
/* Allocates slab cache used to allocate "struct epitem" items */ /* Allocates slab cache used to allocate "struct epitem" items */
error = -ENOMEM; error = -ENOMEM;
dpi_cache = kmem_cache_create("eventpoll", dpi_cache = kmem_cache_create("eventpoll dpi",
sizeof(struct epitem), sizeof(struct epitem),
0, 0,
DPI_SLAB_DEBUG, NULL, NULL); SLAB_HWCACHE_ALIGN | DPI_SLAB_DEBUG, NULL, NULL);
if (!dpi_cache) if (!dpi_cache)
goto eexit_1; goto eexit_1;
/* Allocates slab cache used to allocate "struct eppoll_entry" */
error = -ENOMEM;
pwq_cache = kmem_cache_create("eventpoll pwq",
sizeof(struct eppoll_entry),
0,
DPI_SLAB_DEBUG, NULL, NULL);
if (!pwq_cache)
goto eexit_2;
/* /*
* Register the virtual file system that will be the source of inodes * Register the virtual file system that will be the source of inodes
* for the eventpoll files * for the eventpoll files
*/ */
error = register_filesystem(&eventpoll_fs_type); error = register_filesystem(&eventpoll_fs_type);
if (error) if (error)
goto eexit_2; goto eexit_3;
/* Mount the above commented virtual file system */ /* Mount the above commented virtual file system */
eventpoll_mnt = kern_mount(&eventpoll_fs_type); eventpoll_mnt = kern_mount(&eventpoll_fs_type);
error = PTR_ERR(eventpoll_mnt); error = PTR_ERR(eventpoll_mnt);
if (IS_ERR(eventpoll_mnt)) if (IS_ERR(eventpoll_mnt))
goto eexit_3; goto eexit_4;
printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current); printk(KERN_INFO "[%p] eventpoll: successfully initialized.\n", current);
return 0; return 0;
eexit_3: eexit_4:
unregister_filesystem(&eventpoll_fs_type); unregister_filesystem(&eventpoll_fs_type);
eexit_3:
kmem_cache_destroy(pwq_cache);
eexit_2: eexit_2:
kmem_cache_destroy(dpi_cache); kmem_cache_destroy(dpi_cache);
eexit_1: eexit_1:
...@@ -1414,6 +1478,7 @@ static void __exit eventpoll_exit(void) ...@@ -1414,6 +1478,7 @@ static void __exit eventpoll_exit(void)
/* Undo all operations done inside eventpoll_init() */ /* Undo all operations done inside eventpoll_init() */
unregister_filesystem(&eventpoll_fs_type); unregister_filesystem(&eventpoll_fs_type);
mntput(eventpoll_mnt); mntput(eventpoll_mnt);
kmem_cache_destroy(pwq_cache);
kmem_cache_destroy(dpi_cache); kmem_cache_destroy(dpi_cache);
} }
......
...@@ -52,7 +52,7 @@ struct file * get_empty_filp(void) ...@@ -52,7 +52,7 @@ struct file * get_empty_filp(void)
file_list_unlock(); file_list_unlock();
return NULL; return NULL;
} }
ep_init_file_struct(f); eventpoll_init_file(f);
atomic_set(&f->f_count,1); atomic_set(&f->f_count,1);
f->f_version = 0; f->f_version = 0;
f->f_uid = current->fsuid; f->f_uid = current->fsuid;
...@@ -97,7 +97,7 @@ struct file * get_empty_filp(void) ...@@ -97,7 +97,7 @@ struct file * get_empty_filp(void)
int init_private_file(struct file *filp, struct dentry *dentry, int mode) int init_private_file(struct file *filp, struct dentry *dentry, int mode)
{ {
memset(filp, 0, sizeof(*filp)); memset(filp, 0, sizeof(*filp));
ep_init_file_struct(filp); eventpoll_init_file(filp);
filp->f_mode = mode; filp->f_mode = mode;
atomic_set(&filp->f_count, 1); atomic_set(&filp->f_count, 1);
filp->f_dentry = dentry; filp->f_dentry = dentry;
...@@ -126,10 +126,10 @@ void __fput(struct file * file) ...@@ -126,10 +126,10 @@ void __fput(struct file * file)
struct inode * inode = dentry->d_inode; struct inode * inode = dentry->d_inode;
/* /*
* The function ep_notify_file_close() should be the first called * The function eventpoll_release() should be the first called
* in the file cleanup chain. * in the file cleanup chain.
*/ */
ep_notify_file_close(file); eventpoll_release(file);
locks_remove_flock(file); locks_remove_flock(file);
if (file->f_op && file->f_op->release) if (file->f_op && file->f_op->release)
......
...@@ -35,10 +35,10 @@ asmlinkage int sys_epoll_wait(int epfd, struct pollfd *events, int maxevents, ...@@ -35,10 +35,10 @@ asmlinkage int sys_epoll_wait(int epfd, struct pollfd *events, int maxevents,
int timeout); int timeout);
/* Used to initialize the epoll bits inside the "struct file" */ /* Used to initialize the epoll bits inside the "struct file" */
void ep_init_file_struct(struct file *file); void eventpoll_init_file(struct file *file);
/* Used in fs/file_table.c:__fput() to unlink files from the eventpoll interface */ /* Used in fs/file_table.c:__fput() to unlink files from the eventpoll interface */
void ep_notify_file_close(struct file *file); void eventpoll_release(struct file *file);
#endif /* #ifdef __KERNEL__ */ #endif /* #ifdef __KERNEL__ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment