Commit d0c736e8 authored by Davide Libenzi, committed by Dave Jones

[PATCH] epoll with selectable ET/LT behaviour ...

This patch adds selectable EdgeTriggered/LevelTriggered behaviour to
epoll.

It was widely discussed on lkml about two weeks ago and everyone
welcomed the change.  It has been even more widely discussed through
private emails with application developers who do not feel comfortable
posting on lkml.  The great value of the patch is that, by selecting the LT
behaviour, applications using poll/select can be ported very easily to
epoll, allowing existing apps to benefit from epoll scalability with very
short ETAs.

The API remains the same with the addition of an EPOLLET event flag that
selects the ET behaviour for that fd; without the flag the fd gets the LT
behaviour.
parent da017c5b
/* /*
* fs/eventpoll.c ( Efficent event polling implementation ) * fs/eventpoll.c ( Efficent event polling implementation )
* Copyright (C) 2001,...,2002 Davide Libenzi * Copyright (C) 2001,...,2003 Davide Libenzi
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
...@@ -117,11 +117,6 @@ ...@@ -117,11 +117,6 @@
*/ */
#define EP_MAX_BUF_EVENTS 32 #define EP_MAX_BUF_EVENTS 32
/*
* Used to optimize ready items collection by reducing the irqlock/irqunlock
* switching rate. This is kept in stack too, so do not go wild with this number.
*/
#define EP_MAX_COLLECT_ITEMS 64
/* /*
...@@ -223,6 +218,15 @@ struct epitem { ...@@ -223,6 +218,15 @@ struct epitem {
/* List header used to link this item to the "struct file" items list */ /* List header used to link this item to the "struct file" items list */
struct list_head fllink; struct list_head fllink;
/* List header used to link the item to the transfer list */
struct list_head txlink;
/*
* This is used during the collection/transfer of events to userspace
* to pin items empty events set.
*/
unsigned int revents;
}; };
/* Wrapper struct used by poll queueing */ /* Wrapper struct used by poll queueing */
...@@ -256,9 +260,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi); ...@@ -256,9 +260,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi);
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync); static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync);
static int ep_eventpoll_close(struct inode *inode, struct file *file); static int ep_eventpoll_close(struct inode *inode, struct file *file);
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait); static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, int maxepi); static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents);
static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi, static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
struct epoll_event *events); struct epoll_event *events);
static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents); static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents);
static int ep_poll(struct eventpoll *ep, struct epoll_event *events, int maxevents, static int ep_poll(struct eventpoll *ep, struct epoll_event *events, int maxevents,
long timeout); long timeout);
...@@ -340,13 +345,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) ...@@ -340,13 +345,14 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
unsigned long flags; unsigned long flags;
task_t *this_task = current; task_t *this_task = current;
struct list_head *lsthead = &psw->wake_task_list, *lnk; struct list_head *lsthead = &psw->wake_task_list, *lnk;
struct wake_task_node *tncur;
struct wake_task_node tnode; struct wake_task_node tnode;
spin_lock_irqsave(&psw->lock, flags); spin_lock_irqsave(&psw->lock, flags);
/* Try to see if the current task is already inside this wakeup call */ /* Try to see if the current task is already inside this wakeup call */
list_for_each(lnk, lsthead) { list_for_each(lnk, lsthead) {
struct wake_task_node *tncur = list_entry(lnk, struct wake_task_node, llink); tncur = list_entry(lnk, struct wake_task_node, llink);
if (tncur->task == this_task) { if (tncur->task == this_task) {
if (tncur->wq == wq || ++wake_nests > EP_MAX_POLLWAKE_NESTS) { if (tncur->wq == wq || ++wake_nests > EP_MAX_POLLWAKE_NESTS) {
...@@ -830,6 +836,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -830,6 +836,7 @@ static void ep_free(struct eventpoll *ep)
{ {
unsigned int i, hsize; unsigned int i, hsize;
struct list_head *lsthead, *lnk; struct list_head *lsthead, *lnk;
struct epitem *epi;
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
...@@ -844,7 +851,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -844,7 +851,7 @@ static void ep_free(struct eventpoll *ep)
lsthead = ep_hash_entry(ep, i); lsthead = ep_hash_entry(ep, i);
list_for_each(lnk, lsthead) { list_for_each(lnk, lsthead) {
struct epitem *epi = list_entry(lnk, struct epitem, llink); epi = list_entry(lnk, struct epitem, llink);
ep_unregister_pollwait(ep, epi); ep_unregister_pollwait(ep, epi);
} }
...@@ -860,7 +867,7 @@ static void ep_free(struct eventpoll *ep) ...@@ -860,7 +867,7 @@ static void ep_free(struct eventpoll *ep)
lsthead = ep_hash_entry(ep, i); lsthead = ep_hash_entry(ep, i);
while (!list_empty(lsthead)) { while (!list_empty(lsthead)) {
struct epitem *epi = list_entry(lsthead->next, struct epitem, llink); epi = list_entry(lsthead->next, struct epitem, llink);
ep_remove(ep, epi); ep_remove(ep, epi);
} }
...@@ -939,17 +946,14 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, po ...@@ -939,17 +946,14 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, po
struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt); struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
struct eppoll_entry *pwq; struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
{
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead; pwq->whead = whead;
pwq->base = epi; pwq->base = epi;
add_wait_queue(whead, &pwq->wait); add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist); list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++; epi->nwait++;
} } else {
else
{
/* We have to signal that an error occurred */ /* We have to signal that an error occurred */
epi->nwait = -1; epi->nwait = -1;
} }
...@@ -971,6 +975,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct fil ...@@ -971,6 +975,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct fil
INIT_LIST_HEAD(&epi->llink); INIT_LIST_HEAD(&epi->llink);
INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist); INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep; epi->ep = ep;
epi->file = tfile; epi->file = tfile;
...@@ -1077,9 +1082,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even ...@@ -1077,9 +1082,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
/* Copy the data member from inside the lock */ /* Copy the data member from inside the lock */
epi->event.data = event->data; epi->event.data = event->data;
/* If the file is already "ready" we drop it inside the ready list */ /*
if ((revents & event->events) && EP_IS_LINKED(&epi->llink) && * If the item is not linked to the hash it means that it's on its
!EP_IS_LINKED(&epi->rdllink)) { * way toward the removal. Do nothing in this case.
*/
if (EP_IS_LINKED(&epi->llink)) {
/*
* If the item is "hot" and it is not registered inside the ready
* list, push it inside. If the item is not "hot" and it is currently
* registered inside the ready list, unlink it.
*/
if (revents & event->events) {
if (!EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */ /* Notify waiting tasks that events are available */
...@@ -1088,6 +1102,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even ...@@ -1088,6 +1102,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
if (waitqueue_active(&ep->poll_wait)) if (waitqueue_active(&ep->poll_wait))
pwake++; pwake++;
} }
} else if (EP_IS_LINKED(&epi->rdllink))
EP_LIST_DEL(&epi->rdllink);
}
write_unlock_irqrestore(&ep->lock, flags); write_unlock_irqrestore(&ep->lock, flags);
...@@ -1113,8 +1130,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) ...@@ -1113,8 +1130,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
/* This is called without locks, so we need the atomic exchange */ /* This is called without locks, so we need the atomic exchange */
nwait = xchg(&epi->nwait, 0); nwait = xchg(&epi->nwait, 0);
if (nwait) if (nwait) {
{
while (!list_empty(lsthead)) { while (!list_empty(lsthead)) {
pwq = list_entry(lsthead->next, struct eppoll_entry, llink); pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
...@@ -1142,6 +1158,13 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) ...@@ -1142,6 +1158,13 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
if (!EP_IS_LINKED(&epi->llink)) if (!EP_IS_LINKED(&epi->llink))
goto eexit_1; goto eexit_1;
/*
* Clear the event mask for the unlinked item. This will avoid item
* notifications to be sent after the unlink operation from inside
* the kernel->userspace event transfer loop.
*/
epi->event.events = 0;
/* /*
* At this point is safe to do the job, unlink the item from our list. * At this point is safe to do the job, unlink the item from our list.
* This operation togheter with the above check closes the door to * This operation togheter with the above check closes the door to
...@@ -1295,20 +1318,22 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) ...@@ -1295,20 +1318,22 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
* during the f_op->poll() call, we try to collect the maximum number of items * during the f_op->poll() call, we try to collect the maximum number of items
* by reducing the irqlock/irqunlock switching rate. * by reducing the irqlock/irqunlock switching rate.
*/ */
static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, int maxepi) static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{ {
int nepi; int nepi;
unsigned long flags; unsigned long flags;
struct list_head *lsthead = &ep->rdllist; struct list_head *lsthead = &ep->rdllist, *lnk;
struct epitem *epi;
write_lock_irqsave(&ep->lock, flags); write_lock_irqsave(&ep->lock, flags);
for (nepi = 0; nepi < maxepi && !list_empty(lsthead);) { for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
struct epitem *epi = list_entry(lsthead->next, struct epitem, rdllink); epi = list_entry(lnk, struct epitem, rdllink);
/* Remove the item from the ready list */ lnk = lnk->next;
EP_LIST_DEL(&epi->rdllink);
/* If this file is already in the ready list we exit soon */
if (!EP_IS_LINKED(&epi->txlink)) {
/* /*
* We need to increase the usage count of the "struct epitem" because * We need to increase the usage count of the "struct epitem" because
* another thread might call EPOLL_CTL_DEL on this target and make the * another thread might call EPOLL_CTL_DEL on this target and make the
...@@ -1316,7 +1341,22 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, in ...@@ -1316,7 +1341,22 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, in
*/ */
ep_use_epitem(epi); ep_use_epitem(epi);
aepi[nepi++] = epi; /*
* This is initialized in this way so that the default
* behaviour of the reinjecting code will be to push back
* the item inside the ready list.
*/
epi->revents = epi->event.events;
/* Link the ready item into the transfer list */
list_add(&epi->txlink, txlist);
nepi++;
/*
* Unlink the item from the ready list.
*/
EP_LIST_DEL(&epi->rdllink);
}
} }
write_unlock_irqrestore(&ep->lock, flags); write_unlock_irqrestore(&ep->lock, flags);
...@@ -1330,36 +1370,40 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, in ...@@ -1330,36 +1370,40 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, in
* __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
* because of the way poll() is traditionally implemented in Linux. * because of the way poll() is traditionally implemented in Linux.
*/ */
static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi, static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
struct epoll_event *events) struct epoll_event *events)
{ {
int i, eventcnt, eventbuf, revents; int eventcnt = 0, eventbuf = 0;
unsigned int revents;
struct list_head *lnk;
struct epitem *epi; struct epitem *epi;
struct epoll_event event[EP_MAX_BUF_EVENTS]; struct epoll_event event[EP_MAX_BUF_EVENTS];
for (i = 0, eventcnt = 0, eventbuf = 0; i < nepi; i++, aepi++) { list_for_each(lnk, txlist) {
epi = *aepi; epi = list_entry(lnk, struct epitem, txlink);
/* Get the ready file event set */ /* Get the ready file event set */
revents = epi->file->f_op->poll(epi->file, NULL); revents = epi->file->f_op->poll(epi->file, NULL);
if (revents & epi->event.events) { /*
* Set the return event set for the current file descriptor.
* Note that only the task that was successfully able to link
* the item to its "txlist" will write this field.
*/
epi->revents = revents & epi->event.events;
if (epi->revents) {
event[eventbuf] = epi->event; event[eventbuf] = epi->event;
event[eventbuf].events &= revents; event[eventbuf].events &= revents;
eventbuf++; eventbuf++;
if (eventbuf == EP_MAX_BUF_EVENTS) { if (eventbuf == EP_MAX_BUF_EVENTS) {
if (__copy_to_user(&events[eventcnt], event, if (__copy_to_user(&events[eventcnt], event,
eventbuf * sizeof(struct epoll_event))) { eventbuf * sizeof(struct epoll_event)))
for (; i < nepi; i++, aepi++)
ep_release_epitem(*aepi);
return -EFAULT; return -EFAULT;
}
eventcnt += eventbuf; eventcnt += eventbuf;
eventbuf = 0; eventbuf = 0;
} }
} }
ep_release_epitem(epi);
} }
if (eventbuf) { if (eventbuf) {
...@@ -1373,13 +1417,69 @@ static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi, ...@@ -1373,13 +1417,69 @@ static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi,
} }
/*
* Walk through the transfer list we collected with ep_collect_ready_items()
* and push back into the ready list every item that 1) is still "alive"
* ( linked to the interest set ), 2) does not have the EPOLLET flag set,
* 3) has a non-empty returned event set ( epi->revents ) and 4) is not
* already linked to the ready list.
*
* Every item is unlinked from the transfer list and released with
* ep_release_epitem(), whether or not it gets pushed back, so "txlist" is
* empty when this function returns. If at least one item was pushed back,
* tasks waiting on the eventpoll wait queues are woken up.
*
* @ep:     the eventpoll instance whose ready list receives the items
* @txlist: head of the transfer list to drain
*/
static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
{
/*
* ricnt counts the items pushed back into the ready list; pwake records
* that the ->poll() wait list must be woken up once the lock is dropped.
*/
int ricnt = 0, pwake = 0;
unsigned long flags;
struct epitem *epi;
/* The ready list and the link fields are manipulated under ep->lock */
write_lock_irqsave(&ep->lock, flags);
/* Drain the transfer list from its head until nothing is left */
while (!list_empty(txlist)) {
epi = list_entry(txlist->next, struct epitem, txlink);
/* Unlink the current item from the transfer list */
EP_LIST_DEL(&epi->txlink);
/*
* If the item is no more linked to the interest set, we don't
* have to push it inside the ready list because the following
* ep_release_epitem() is going to drop it. Also, if the current
* item is set to have an Edge Triggered behaviour, we don't have
* to push it back either. Items whose returned event set does not
* intersect the requested one, or that are already linked to the
* ready list, are skipped too.
*/
if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
(epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ricnt++;
}
/* Pairs with the ep_use_epitem() done in ep_collect_ready_items() */
ep_release_epitem(epi);
}
if (ricnt) {
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list. The first one is woken up here, the second one is only
* flagged and handled after the lock is released.
*/
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/*
* We have to call this outside the lock.
* NOTE(review): "psw" is not declared in this function; presumably a
* file-scope object defined elsewhere in eventpoll.c - confirm.
*/
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
}
/* /*
* Perform the transfer of events to user space. * Perform the transfer of events to user space.
*/ */
static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents) static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents)
{ {
int eventcnt, nepi, sepi, maxepi; int eventcnt = 0;
struct epitem *aepi[EP_MAX_COLLECT_ITEMS]; struct list_head txlist;
INIT_LIST_HEAD(&txlist);
/* /*
* We need to lock this because we could be hit by * We need to lock this because we could be hit by
...@@ -1392,25 +1492,13 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, ...@@ -1392,25 +1492,13 @@ static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events,
*/ */
down_read(&epsem); down_read(&epsem);
for (eventcnt = 0; eventcnt < maxevents;) {
/* Maximum items we can extract this time */
maxepi = min(EP_MAX_COLLECT_ITEMS, maxevents - eventcnt);
/* Collect/extract ready items */ /* Collect/extract ready items */
nepi = ep_collect_ready_items(ep, aepi, maxepi); if (ep_collect_ready_items(ep, &txlist, maxevents)) {
/* Build result set in userspace */
eventcnt = ep_send_events(ep, &txlist, events);
if (nepi) { /* Reinject ready items into the ready list */
/* Send events to userspace */ ep_reinject_items(ep, &txlist);
sepi = ep_send_events(ep, aepi, nepi, &events[eventcnt]);
if (sepi < 0) {
up_read(&epsem);
return sepi;
}
eventcnt += sepi;
}
if (nepi < maxepi)
break;
} }
up_read(&epsem); up_read(&epsem);
......
/* /*
* include/linux/eventpoll.h ( Efficent event polling implementation ) * include/linux/eventpoll.h ( Efficent event polling implementation )
* Copyright (C) 2001,...,2002 Davide Libenzi * Copyright (C) 2001,...,2003 Davide Libenzi
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
...@@ -20,6 +20,9 @@ ...@@ -20,6 +20,9 @@
#define EPOLL_CTL_DEL 2 #define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3 #define EPOLL_CTL_MOD 3
/* Set the Edge Triggered behaviour for the target file descriptor */
#define EPOLLET (1 << 31)
struct epoll_event { struct epoll_event {
__u32 events; __u32 events;
__u64 data; __u64 data;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment