Commit ff6a1798 authored by Anton Ivanov's avatar Anton Ivanov Committed by Richard Weinberger

Epoll based IRQ controller

1. Removes the need to walk the IRQ/Device list to determine
who triggered the IRQ.
2. Improves scalability (up to several times performance
improvement for cases with 10s of devices).
3. Improves UML baseline IO performance for one disk + one NIC
use case by up to 10%.
4. Introduces write poll triggered IRQs.
5. Prerequisite for introducing high performance mmesg family
of functions in network IO.
6. Fixes RNG shutdown which was leaking a file descriptor
Signed-off-by: default avatarAnton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: default avatarRichard Weinberger <richard@nod.at>
parent 4d1a535b
......@@ -171,56 +171,19 @@ int enable_chan(struct line *line)
return err;
}
/* Items are added in IRQ context, when free_irq can't be called, and
* removed in process context, when it can.
* This handles interrupt sources which disappear, and which need to
* be permanently disabled. This is discovered in IRQ context, but
* the freeing of the IRQ must be done later.
*/
static DEFINE_SPINLOCK(irqs_to_free_lock);
static LIST_HEAD(irqs_to_free);
void free_irqs(void)
{
struct chan *chan;
LIST_HEAD(list);
struct list_head *ele;
unsigned long flags;
spin_lock_irqsave(&irqs_to_free_lock, flags);
list_splice_init(&irqs_to_free, &list);
spin_unlock_irqrestore(&irqs_to_free_lock, flags);
list_for_each(ele, &list) {
chan = list_entry(ele, struct chan, free_list);
if (chan->input && chan->enabled)
um_free_irq(chan->line->driver->read_irq, chan);
if (chan->output && chan->enabled)
um_free_irq(chan->line->driver->write_irq, chan);
chan->enabled = 0;
}
}
static void close_one_chan(struct chan *chan, int delay_free_irq)
{
unsigned long flags;
if (!chan->opened)
return;
if (delay_free_irq) {
spin_lock_irqsave(&irqs_to_free_lock, flags);
list_add(&chan->free_list, &irqs_to_free);
spin_unlock_irqrestore(&irqs_to_free_lock, flags);
}
else {
if (chan->input && chan->enabled)
um_free_irq(chan->line->driver->read_irq, chan);
if (chan->output && chan->enabled)
um_free_irq(chan->line->driver->write_irq, chan);
chan->enabled = 0;
}
/* we can safely call free now - it will be marked
* as free and freed once the IRQ stopped processing
*/
if (chan->input && chan->enabled)
um_free_irq(chan->line->driver->read_irq, chan);
if (chan->output && chan->enabled)
um_free_irq(chan->line->driver->write_irq, chan);
chan->enabled = 0;
if (chan->ops->close != NULL)
(*chan->ops->close)(chan->fd, chan->data);
......
......@@ -284,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
if (err)
return err;
if (output)
err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
line_write_interrupt, IRQF_SHARED,
driver->write_irq_name, data);
return err;
......
......@@ -13,6 +13,7 @@
#include <linux/miscdevice.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <init.h>
#include <irq_kern.h>
#include <os.h>
......@@ -154,7 +155,14 @@ static int __init rng_init (void)
/*
* rng_cleanup - shutdown RNG module
*/
static void __exit rng_cleanup (void)
static void cleanup(void)
{
free_irq_by_fd(random_fd);
os_close_file(random_fd);
}
static void __exit rng_cleanup(void)
{
os_close_file(random_fd);
misc_deregister (&rng_miscdev);
......@@ -162,6 +170,7 @@ static void __exit rng_cleanup (void)
module_init (rng_init);
module_exit (rng_cleanup);
__uml_exitcall(cleanup);
MODULE_DESCRIPTION("UML Host Random Number Generator (RNG) driver");
MODULE_LICENSE("GPL");
......@@ -1587,11 +1587,11 @@ int io_thread(void *arg)
do {
res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
if (res > 0) {
if (res >= 0) {
written += res;
} else {
if (res != -EAGAIN) {
printk("io_thread - read failed, fd = %d, "
printk("io_thread - write failed, fd = %d, "
"err = %d\n", kernel_fd, -n);
}
}
......
......@@ -7,6 +7,7 @@
#define __IRQ_USER_H__
#include <sysdep/ptrace.h>
#include <stdbool.h>
struct irq_fd {
struct irq_fd *next;
......@@ -15,10 +16,17 @@ struct irq_fd {
int type;
int irq;
int events;
int current_events;
bool active;
bool pending;
bool purge;
};
enum { IRQ_READ, IRQ_WRITE };
#define IRQ_READ 0
#define IRQ_WRITE 1
#define IRQ_NONE 2
#define MAX_IRQ_TYPE (IRQ_NONE + 1)
struct siginfo;
extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
......
......@@ -290,15 +290,16 @@ extern void halt_skas(void);
extern void reboot_skas(void);
/* irq.c */
extern int os_waiting_for_events(struct irq_fd *active_fds);
extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
extern void os_free_irq_later(struct irq_fd *active_fds,
int irq, void *dev_id);
extern int os_get_pollfd(int i);
extern void os_set_pollfd(int i, int fd);
extern int os_waiting_for_events_epoll(void);
extern void *os_epoll_get_data_pointer(int index);
extern int os_epoll_triggered(int index, int events);
extern int os_event_mask(int irq_type);
extern int os_setup_epoll(void);
extern int os_add_epoll_fd(int events, int fd, void *data);
extern int os_mod_epoll_fd(int events, int fd, void *data);
extern int os_del_epoll_fd(int fd);
extern void os_set_ioignore(void);
extern void os_close_epoll_fd(void);
/* sigio.c */
extern int add_sigio_fd(int fd);
......
This diff is collapsed.
/*
* Copyright (C) 2017 - Cambridge Greys Ltd
* Copyright (C) 2011 - 2014 Cisco Systems Inc
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#include <stdlib.h>
#include <errno.h>
#include <poll.h>
#include <sys/epoll.h>
#include <signal.h>
#include <string.h>
#include <irq_user.h>
#include <os.h>
#include <um_malloc.h>
/* Epoll support */
static int epollfd = -1;
#define MAX_EPOLL_EVENTS 64
static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
/* Helper to return an Epoll data pointer from an epoll event structure.
* We need to keep this one on the userspace side to keep includes separate
*/
void *os_epoll_get_data_pointer(int index)
{
return epoll_events[index].data.ptr;
}
/* Helper to compare events versus the events in the epoll structure.
* Same as above - needs to be on the userspace side
*/
int os_epoll_triggered(int index, int events)
{
return epoll_events[index].events & events;
}
/* Helper to set the event mask.
* The event mask is opaque to the kernel side, because it does not have
* access to the right includes/defines for EPOLL constants.
*/
int os_event_mask(int irq_type)
{
if (irq_type == IRQ_READ)
return EPOLLIN | EPOLLPRI;
if (irq_type == IRQ_WRITE)
return EPOLLOUT;
return 0;
}
/*
* Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd
* and os_free_irq_by_cb, which are called under irq_lock.
* Initial Epoll Setup
*/
static struct pollfd *pollfds = NULL;
static int pollfds_num = 0;
static int pollfds_size = 0;
int os_setup_epoll(void)
{
epollfd = epoll_create(MAX_EPOLL_EVENTS);
return epollfd;
}
int os_waiting_for_events(struct irq_fd *active_fds)
/*
* Helper to run the actual epoll_wait
*/
int os_waiting_for_events_epoll(void)
{
struct irq_fd *irq_fd;
int i, n, err;
int n, err;
n = poll(pollfds, pollfds_num, 0);
n = epoll_wait(epollfd,
(struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0);
if (n < 0) {
err = -errno;
if (errno != EINTR)
printk(UM_KERN_ERR "os_waiting_for_events:"
" poll returned %d, errno = %d\n", n, errno);
printk(
UM_KERN_ERR "os_waiting_for_events:"
" epoll returned %d, error = %s\n", n,
strerror(errno)
);
return err;
}
if (n == 0)
return 0;
irq_fd = active_fds;
for (i = 0; i < pollfds_num; i++) {
if (pollfds[i].revents != 0) {
irq_fd->current_events = pollfds[i].revents;
pollfds[i].fd = -1;
}
irq_fd = irq_fd->next;
}
return n;
}
int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
{
if (pollfds_num == pollfds_size) {
if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
/* return min size needed for new pollfds area */
return (pollfds_size + 1) * sizeof(pollfds[0]);
}
if (pollfds != NULL) {
memcpy(tmp_pfd, pollfds,
sizeof(pollfds[0]) * pollfds_size);
/* remove old pollfds */
kfree(pollfds);
}
pollfds = tmp_pfd;
pollfds_size++;
} else
kfree(tmp_pfd); /* remove not used tmp_pfd */
pollfds[pollfds_num] = ((struct pollfd) { .fd = fd,
.events = events,
.revents = 0 });
pollfds_num++;
return 0;
}
void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
/*
* Helper to add a fd to epoll
*/
int os_add_epoll_fd(int events, int fd, void *data)
{
struct irq_fd **prev;
int i = 0;
prev = &active_fds;
while (*prev != NULL) {
if ((*test)(*prev, arg)) {
struct irq_fd *old_fd = *prev;
if ((pollfds[i].fd != -1) &&
(pollfds[i].fd != (*prev)->fd)) {
printk(UM_KERN_ERR "os_free_irq_by_cb - "
"mismatch between active_fds and "
"pollfds, fd %d vs %d\n",
(*prev)->fd, pollfds[i].fd);
goto out;
}
pollfds_num--;
/*
* This moves the *whole* array after pollfds[i]
* (though it doesn't spot as such)!
*/
memmove(&pollfds[i], &pollfds[i + 1],
(pollfds_num - i) * sizeof(pollfds[0]));
if (*last_irq_ptr2 == &old_fd->next)
*last_irq_ptr2 = prev;
*prev = (*prev)->next;
if (old_fd->type == IRQ_WRITE)
ignore_sigio_fd(old_fd->fd);
kfree(old_fd);
continue;
}
prev = &(*prev)->next;
i++;
}
out:
return;
struct epoll_event event;
int result;
event.data.ptr = data;
event.events = events | EPOLLET;
result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
if ((result) && (errno == EEXIST))
result = os_mod_epoll_fd(events, fd, data);
if (result)
printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
return result;
}
int os_get_pollfd(int i)
/*
* Helper to mod the fd event mask and/or data backreference
*/
int os_mod_epoll_fd(int events, int fd, void *data)
{
return pollfds[i].fd;
struct epoll_event event;
int result;
event.data.ptr = data;
event.events = events;
result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
if (result)
printk(UM_KERN_ERR
"epollctl mod err fd %d, %s\n", fd, strerror(errno));
return result;
}
void os_set_pollfd(int i, int fd)
/*
* Helper to delete the epoll fd
*/
int os_del_epoll_fd(int fd)
{
pollfds[i].fd = fd;
struct epoll_event event;
int result;
/* This is quiet as we use this as IO ON/OFF - so it is often
* invoked on a non-existent fd
*/
result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
return result;
}
void os_set_ioignore(void)
{
signal(SIGIO, SIG_IGN);
}
void os_close_epoll_fd(void)
{
/* Needed so we do not leak an fd when rebooting */
os_close_file(epollfd);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment