Commit 49da7e64, authored by Anton Ivanov, committed by Richard Weinberger

High Performance UML Vector Network Driver

1. Provides infrastructure for vector IO using recvmmsg/sendmmsg.
    1.1. Multi-message read.
    1.2. Multi-message write.
    1.3. Optimized queue support for multi-packet enqueue/dequeue.
    1.4. BQL/DQL support.
2. Implements several transports as well as support
for direct wiring of PWEs to NIC. Allows direct connection of VMs
to host, other VMs and network devices with no switch in use.
    2.1. Raw socket >4 times higher PPS and 10 times higher tcp RX
    than existing pcap based transport (> 4Gbit)
    2.2. New tap transport using socket RX and tap xmit. Similar
    performance improvements (>4Gbit)
    2.3. GRE transport - direct wiring to GRE PWE
    2.4. L2TPv3 transport - direct wiring to L2TPv3 PWE
3. Tuning, performance and offload related setting support via ethtool.
4. Initial BPF support - used in tap/raw to avoid software looping
5. Scatter Gather support.
6. VNET and checksum offload support for raw socket transport.
7. TSO/GSO support where applicable or available
8. Migrates all error messages to netdevice_*() and rate limits
them where needed.
Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
parent ff6a1798
...@@ -109,6 +109,17 @@ config UML_NET_DAEMON ...@@ -109,6 +109,17 @@ config UML_NET_DAEMON
more than one without conflict. If you don't need UML networking, more than one without conflict. If you don't need UML networking,
say N. say N.
config UML_NET_VECTOR
bool "Vector I/O high performance network devices"
depends on UML_NET
help
This User-Mode Linux network driver uses multi-message send
and receive functions. The host running the UML guest must have
a linux kernel version above 3.0 and a libc version > 2.13.
This driver provides tap, raw, gre and l2tpv3 network transports
with up to 4 times higher network throughput than the UML network
drivers.
config UML_NET_VDE config UML_NET_VDE
bool "VDE transport" bool "VDE transport"
depends on UML_NET depends on UML_NET
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
slip-objs := slip_kern.o slip_user.o slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o daemon-objs := daemon_kern.o daemon_user.o
vector-objs := vector_kern.o vector_user.o vector_transports.o
umcast-objs := umcast_kern.o umcast_user.o umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o net-objs := net_kern.o net_user.o
mconsole-objs := mconsole_kern.o mconsole_user.o mconsole-objs := mconsole_kern.o mconsole_user.o
...@@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o ...@@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
obj-$(CONFIG_UML_NET_VECTOR) += vector.o
obj-$(CONFIG_UML_NET_VDE) += vde.o obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o obj-$(CONFIG_UML_NET_PCAP) += pcap.o
...@@ -61,7 +63,7 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o ...@@ -61,7 +63,7 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o
obj-$(CONFIG_UML_RANDOM) += random.o obj-$(CONFIG_UML_RANDOM) += random.o
# pcap_user.o must be added explicitly. # pcap_user.o must be added explicitly.
USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o
CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH)
include arch/um/scripts/Makefile.rules include arch/um/scripts/Makefile.rules
...@@ -288,7 +288,7 @@ static void uml_net_user_timer_expire(struct timer_list *t) ...@@ -288,7 +288,7 @@ static void uml_net_user_timer_expire(struct timer_list *t)
#endif #endif
} }
static void setup_etheraddr(struct net_device *dev, char *str) void uml_net_setup_etheraddr(struct net_device *dev, char *str)
{ {
unsigned char *addr = dev->dev_addr; unsigned char *addr = dev->dev_addr;
char *end; char *end;
...@@ -412,7 +412,7 @@ static void eth_configure(int n, void *init, char *mac, ...@@ -412,7 +412,7 @@ static void eth_configure(int n, void *init, char *mac,
*/ */
snprintf(dev->name, sizeof(dev->name), "eth%d", n); snprintf(dev->name, sizeof(dev->name), "eth%d", n);
setup_etheraddr(dev, mac); uml_net_setup_etheraddr(dev, mac);
printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr); printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr);
......
This diff is collapsed.
/*
* Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#ifndef __UM_VECTOR_KERN_H
#define __UM_VECTOR_KERN_H
#include <linux/netdevice.h>
#include <linux/platform_device.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include "vector_user.h"
/* Queue structure specially adapted for multiple enqueue/dequeue
 * in a recvmmsg/sendmmsg context
 */
/* Dequeue method */
#define QUEUE_SENDMSG 0
#define QUEUE_SENDMMSG 1
/* Single-bit flags (bits 0, 1 and 2), so they can be combined with bitwise OR.
 * NOTE(review): presumably stored in the "options" members below - confirm
 * against the driver code.
 */
#define VECTOR_RX 1
#define VECTOR_TX (1 << 1)
#define VECTOR_BPF (1 << 2)
/* Packet size constants.
 * NOTE(review): ETH_MAX_PACKET looks like the payload size in bytes and
 * ETH_HEADER_OTHER like extra room for L2 headers - confirm units at the
 * point of use.
 */
#define ETH_MAX_PACKET 1500
#define ETH_HEADER_OTHER 32 /* just in case someone decides to go mad on QnQ */
/*
 * Multi-message queue. Holds one vector of struct mmsghdr entries and a
 * second vector of opaque pointers (skbuff_vector), plus the indices and
 * limits describing the queue state. Head and tail each have their own
 * spinlock.
 */
struct vector_queue {
	/* message header vector */
	struct mmsghdr *mmsg_vector;
	/* NOTE(review): presumably one buffer pointer per mmsg_vector slot */
	void **skbuff_vector;

	/* backlink to device which owns us */
	struct net_device *dev;

	spinlock_t head_lock;
	spinlock_t tail_lock;

	/* queue state and limits */
	int queue_depth;
	int head;
	int tail;
	int max_depth;
	int max_iov_frags;

	short options;
};
/*
 * Extended per-device counters. An instance is embedded in
 * struct vector_private as "estats", where it is labelled as the
 * ethtool stats.
 * All members are 64-bit unsigned values; how each one is updated is not
 * visible in this header.
 */
struct vector_estats {
/* RX/TX queue occupancy figures */
uint64_t rx_queue_max;
uint64_t rx_queue_running_average;
uint64_t tx_queue_max;
uint64_t tx_queue_running_average;
/* error and TX event counters */
uint64_t rx_encaps_errors;
uint64_t tx_timeout_count;
uint64_t tx_restart_queue;
uint64_t tx_kicks;
uint64_t tx_flow_control_xon;
uint64_t tx_flow_control_xoff;
/* checksum offload results on RX */
uint64_t rx_csum_offload_good;
uint64_t rx_csum_offload_errors;
/* scatter-gather outcome counters */
uint64_t sg_ok;
uint64_t sg_linearized;
};
/* Header verification result codes: negative for failure, zero or positive
 * otherwise.
 * NOTE(review): presumably the return values of the verify_header() callback
 * in struct vector_private - confirm against the transport implementations.
 */
#define VERIFY_HEADER_NOK -1
#define VERIFY_HEADER_OK 0
#define VERIFY_CSUM_OK 1
/*
 * Per-device private state of a vector network device.
 *
 * Ties together the owning net_device, the rx/tx file descriptors (fds),
 * the rx and tx vector queues, the IRQ numbers, the parsed argument list,
 * transport-specific data and header callbacks, and the extended
 * statistics.
 */
struct vector_private {
	struct list_head list;
	spinlock_t lock;
	struct net_device *dev;

	int unit;

	/* Timeout timer in TX */
	struct timer_list tl;

	/* Scheduled "remove device" work */
	struct work_struct reset_tx;
	struct vector_fds *fds;

	struct vector_queue *rx_queue;
	struct vector_queue *tx_queue;

	int rx_irq;
	int tx_irq;

	struct arglist *parsed;

	void *transport_data; /* transport specific params if needed */

	int max_packet;
	int req_size; /* different from max packet - used for TSO */
	int headroom;

	int options;

	/*
	 * NOTE(review): the comment originally placed here described a
	 * remote address, but this struct has no such member; the remote
	 * address pointer lives in struct vector_fds (remote_addr).
	 */
	int header_size;
	int rx_header_size;
	int coalesce;

	void *header_rxbuffer;
	void *header_txbuffer;

	/* Transport header callbacks; see the VERIFY_* codes above for the
	 * values verify_header is presumably expected to return (confirm).
	 */
	int (*form_header)(uint8_t *header,
		struct sk_buff *skb, struct vector_private *vp);
	int (*verify_header)(uint8_t *header,
		struct sk_buff *skb, struct vector_private *vp);

	spinlock_t stats_lock;

	struct tasklet_struct tx_poll;
	bool rexmit_scheduled;
	bool opened;
	bool in_write_poll;

	/* ethtool stats */
	struct vector_estats estats;
	void *bpf;

	/*
	 * Trailing variable-length area. Declared as a C99 flexible array
	 * member rather than a GNU zero-length array so that compilers can
	 * diagnose misuse; sizeof(struct vector_private) is unchanged.
	 * Must remain the last member.
	 */
	char user[];
};
/* Defined outside this header. Takes the device private state and returns
 * an int.
 * NOTE(review): presumably fills in vp->transport_data and the header
 * callbacks for the selected transport, with a negative return on failure -
 * confirm against the definition.
 */
extern int build_transport_data(struct vector_private *vp);
#endif
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#ifndef __UM_VECTOR_USER_H
#define __UM_VECTOR_USER_H
/* Capacity of the token/value arrays in struct arglist */
#define MAXVARGS	20

#define TOKEN_IFNAME "ifname"

/*
 * Transport name strings and their lengths.
 * Each *_LEN macro must measure its own transport string: TRANS_GRE_LEN
 * previously measured TRANS_RAW, which only worked because both names are
 * three characters long.
 */
#define TRANS_RAW "raw"
#define TRANS_RAW_LEN strlen(TRANS_RAW)

#define TRANS_TAP "tap"
#define TRANS_TAP_LEN strlen(TRANS_TAP)

#define TRANS_GRE "gre"
#define TRANS_GRE_LEN strlen(TRANS_GRE)

#define TRANS_L2TPV3 "l2tpv3"
#define TRANS_L2TPV3_LEN strlen(TRANS_L2TPV3)
/* Fallback definition for when the included headers do not provide
 * IPPROTO_GRE (0x2F == 47)
 */
#ifndef IPPROTO_GRE
#define IPPROTO_GRE 0x2F
#endif
/* GRE flag bits: each value occupies one bit of the top nibble of a 16-bit
 * word and is converted to big-endian with cpu_to_be16()
 */
#define GRE_MODE_CHECKSUM cpu_to_be16(8 << 12) /* checksum */
#define GRE_MODE_RESERVED cpu_to_be16(4 << 12) /* unused */
#define GRE_MODE_KEY cpu_to_be16(2 << 12) /* KEY present */
#define GRE_MODE_SEQUENCE cpu_to_be16(1 << 12) /* sequence */
/* NOTE(review): 0x6558 is the EtherType for Transparent Ethernet Bridging,
 * presumably written to the GRE protocol field - confirm at the point of use
 */
#define GRE_IRB cpu_to_be16(0x6558)
/* NOTE(review): presumably the leading header word marking an L2TPv3 data
 * packet - confirm against the transport code
 */
#define L2TPV3_DATA_PACKET 0x30000
/* IANA-assigned IP protocol ID for L2TPv3 */
#ifndef IPPROTO_L2TP
#define IPPROTO_L2TP 0x73
#endif
/* Argument list: two parallel arrays of string pointers, each with room for
 * MAXVARGS entries, plus a count.
 * NOTE(review): numargs presumably holds the number of populated
 * token/value pairs - confirm against uml_parse_vector_ifspec().
 */
struct arglist {
int numargs;
char *tokens[MAXVARGS];
char *values[MAXVARGS];
};
/* Separating read and write FDs allows us to have different
 * rx and tx method. Example - read tap via raw socket using
 * recvmmsg, write using legacy tap write calls
 */
struct vector_fds {
/* file descriptor used for receive */
int rx_fd;
/* file descriptor used for transmit */
int tx_fd;
/* opaque remote address and its size.
 * NOTE(review): presumably a sockaddr and its length in bytes, possibly
 * NULL for transports without a peer address - confirm against the
 * code that opens the descriptors.
 */
void *remote_addr;
int remote_addr_size;
};
/*
 * Single-bit flags, one bit per flag so that they can be OR-ed together
 * and tested independently.
 *
 * These must be built with the shift operator: the previous "(1 < 1)" and
 * "(1 < 2)" were comparisons, which made VECTOR_WRITE evaluate to 0 and
 * VECTOR_HEADERS evaluate to 1 (the same value as VECTOR_READ).
 */
#define VECTOR_READ	1
#define VECTOR_WRITE	(1 << 1)
#define VECTOR_HEADERS	(1 << 2)
/*
 * Helper entry points declared for the vector driver. Only the prototypes
 * are visible in this header; the notes below go no further than the
 * signatures, and anything beyond that is marked as an assumption.
 */

/* Takes a string and returns a struct arglist pointer.
 * NOTE(review): presumably NULL on parse/allocation failure - confirm.
 */
extern struct arglist *uml_parse_vector_ifspec(char *arg);
/* Takes a unit number and an argument list; returns a struct vector_fds
 * pointer.
 * NOTE(review): presumably NULL on failure - confirm.
 */
extern struct vector_fds *uml_vector_user_open(
int unit,
struct arglist *parsed
);
/* Takes an argument list and a token string; returns a string pointer.
 * NOTE(review): presumably the value paired with the token, or NULL if the
 * token is absent - confirm.
 */
extern char *uml_vector_fetch_arg(
struct arglist *ifspec,
char *token
);
/* Single-message I/O on a file descriptor. The header is passed as an
 * opaque pointer.
 * NOTE(review): hdr is presumably a struct msghdr (an iovec array for
 * writev) - confirm.
 */
extern int uml_vector_recvmsg(int fd, void *hdr, int flags);
extern int uml_vector_sendmsg(int fd, void *hdr, int flags);
extern int uml_vector_writev(int fd, void *hdr, int iovcount);
/* Multi-message I/O on a file descriptor: msgvec is an opaque pointer and
 * vlen an element count.
 * NOTE(review): msgvec is presumably an array of struct mmsghdr as used by
 * struct vector_queue - confirm.
 */
extern int uml_vector_sendmmsg(
int fd, void *msgvec,
unsigned int vlen,
unsigned int flags
);
extern int uml_vector_recvmmsg(
int fd,
void *msgvec,
unsigned int vlen,
unsigned int flags
);
/* BPF helpers: the first returns an opaque pointer for a descriptor and a
 * MAC pointer, the second takes an opaque program pointer and its length.
 * NOTE(review): ownership of the returned pointer is not visible here.
 */
extern void *uml_vector_default_bpf(int fd, void *mac);
extern int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len);
/* Per-descriptor vnet header enablement for raw and tap descriptors.
 * NOTE(review): presumably true on success - confirm.
 */
extern bool uml_raw_enable_vnet_headers(int fd);
extern bool uml_tap_enable_vnet_headers(int fd);
#endif
...@@ -18,7 +18,19 @@ ...@@ -18,7 +18,19 @@
#define XTERM_IRQ 13 #define XTERM_IRQ 13
#define RANDOM_IRQ 14 #define RANDOM_IRQ 14
#ifdef CONFIG_UML_NET_VECTOR
#define VECTOR_BASE_IRQ 15
#define VECTOR_IRQ_SPACE 8
#define LAST_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ)
#else
#define LAST_IRQ RANDOM_IRQ #define LAST_IRQ RANDOM_IRQ
#endif
#define NR_IRQS (LAST_IRQ + 1) #define NR_IRQS (LAST_IRQ + 1)
#endif #endif
...@@ -65,5 +65,7 @@ extern int tap_setup_common(char *str, char *type, char **dev_name, ...@@ -65,5 +65,7 @@ extern int tap_setup_common(char *str, char *type, char **dev_name,
char **mac_out, char **gate_addr); char **mac_out, char **gate_addr);
extern void register_transport(struct transport *new); extern void register_transport(struct transport *new);
extern unsigned short eth_protocol(struct sk_buff *skb); extern unsigned short eth_protocol(struct sk_buff *skb);
extern void uml_net_setup_etheraddr(struct net_device *dev, char *str);
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment