Commit bbd6ef87, authored by Patrick McHardy, committed by David S. Miller

packet: support extensible, 64 bit clean mmaped ring structure

The tpacket_hdr is not 64 bit clean due to use of an unsigned long
and can't be extended because the following struct sockaddr_ll needs
to be at a fixed offset.

Add support for a version 2 tpacket protocol that removes these
limitations.

Userspace can query the header size through a new getsockopt option
and change the protocol version through a setsockopt option. The
changes needed to switch to the new protocol version are:

1. replace struct tpacket_hdr by struct tpacket2_hdr
2. query header len and save
3. set protocol version to 2
 - set up ring as usual
4. for getting the sockaddr_ll, use (void *)hdr + TPACKET_ALIGN(hdrlen)
   instead of (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))

Steps 2 and 4 can be omitted if the struct sockaddr_ll isn't needed.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent bc1d0411
...@@ -43,6 +43,8 @@ struct sockaddr_ll ...@@ -43,6 +43,8 @@ struct sockaddr_ll
#define PACKET_COPY_THRESH 7 #define PACKET_COPY_THRESH 7
#define PACKET_AUXDATA 8 #define PACKET_AUXDATA 8
#define PACKET_ORIGDEV 9 #define PACKET_ORIGDEV 9
#define PACKET_VERSION 10
#define PACKET_HDRLEN 11
struct tpacket_stats struct tpacket_stats
{ {
...@@ -79,6 +81,25 @@ struct tpacket_hdr ...@@ -79,6 +81,25 @@ struct tpacket_hdr
#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) #define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
#define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll)) #define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll))
/*
 * Version 2 per-frame header of the mmapped packet ring.
 *
 * Every member has a fixed width, so unlike struct tpacket_hdr (which
 * contains an unsigned long) the layout does not depend on the word size
 * of the process mapping the ring.
 */
struct tpacket2_hdr
{
__u32 tp_status; /* frame status word, written last by the receive path */
__u32 tp_len; /* length of the packet (skb->len) */
__u32 tp_snaplen; /* number of bytes actually copied into the frame */
__u16 tp_mac; /* offset from the frame start at which the copied data begins */
__u16 tp_net; /* offset from the frame start of the network header */
__u32 tp_sec; /* timestamp, seconds */
__u32 tp_nsec; /* timestamp, nanoseconds */
};
/*
 * Space reserved at the start of a version 2 frame: the aligned header
 * followed by the struct sockaddr_ll that the receive path stores at
 * TPACKET_ALIGN(sizeof(struct tpacket2_hdr)).
 */
#define TPACKET2_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
/*
 * Ring header formats selectable through the PACKET_VERSION socket option.
 * The numeric values are what userspace passes in, so they are spelled out
 * explicitly and must not be renumbered.
 */
enum tpacket_versions {
	TPACKET_V1 = 0,	/* frames start with struct tpacket_hdr */
	TPACKET_V2 = 1,	/* frames start with struct tpacket2_hdr */
};
/* /*
Frame structure: Frame structure:
......
...@@ -186,6 +186,8 @@ struct packet_sock { ...@@ -186,6 +186,8 @@ struct packet_sock {
unsigned int pg_vec_order; unsigned int pg_vec_order;
unsigned int pg_vec_pages; unsigned int pg_vec_pages;
unsigned int pg_vec_len; unsigned int pg_vec_len;
enum tpacket_versions tp_version;
unsigned int tp_hdrlen;
#endif #endif
}; };
...@@ -201,14 +203,52 @@ struct packet_skb_cb { ...@@ -201,14 +203,52 @@ struct packet_skb_cb {
#ifdef CONFIG_PACKET_MMAP #ifdef CONFIG_PACKET_MMAP
static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position) static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
int status)
{ {
unsigned int pg_vec_pos, frame_offset; unsigned int pg_vec_pos, frame_offset;
union {
struct tpacket_hdr *h1;
struct tpacket2_hdr *h2;
void *raw;
} h;
pg_vec_pos = position / po->frames_per_block; pg_vec_pos = position / po->frames_per_block;
frame_offset = position % po->frames_per_block; frame_offset = position % po->frames_per_block;
return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
switch (po->tp_version) {
case TPACKET_V1:
if (status != h.h1->tp_status ? TP_STATUS_USER :
TP_STATUS_KERNEL)
return NULL;
break;
case TPACKET_V2:
if (status != h.h2->tp_status ? TP_STATUS_USER :
TP_STATUS_KERNEL)
return NULL;
break;
}
return h.raw;
}
/*
 * Write @status into the tp_status member of the ring frame at @frame.
 *
 * The frame is interpreted as struct tpacket_hdr or struct tpacket2_hdr
 * according to po->tp_version; for any other version value nothing is
 * written.
 */
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	if (po->tp_version == TPACKET_V1) {
		struct tpacket_hdr *hdr_v1 = frame;

		hdr_v1->tp_status = status;
	} else if (po->tp_version == TPACKET_V2) {
		struct tpacket2_hdr *hdr_v2 = frame;

		hdr_v2->tp_status = status;
	}
}
#endif #endif
...@@ -551,14 +591,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe ...@@ -551,14 +591,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
struct sock *sk; struct sock *sk;
struct packet_sock *po; struct packet_sock *po;
struct sockaddr_ll *sll; struct sockaddr_ll *sll;
struct tpacket_hdr *h; union {
struct tpacket_hdr *h1;
struct tpacket2_hdr *h2;
void *raw;
} h;
u8 * skb_head = skb->data; u8 * skb_head = skb->data;
int skb_len = skb->len; int skb_len = skb->len;
unsigned int snaplen, res; unsigned int snaplen, res;
unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
unsigned short macoff, netoff; unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL; struct sk_buff *copy_skb = NULL;
struct timeval tv; struct timeval tv;
struct timespec ts;
if (skb->pkt_type == PACKET_LOOPBACK) if (skb->pkt_type == PACKET_LOOPBACK)
goto drop; goto drop;
...@@ -590,10 +635,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe ...@@ -590,10 +635,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
snaplen = res; snaplen = res;
if (sk->sk_type == SOCK_DGRAM) { if (sk->sk_type == SOCK_DGRAM) {
macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16;
} else { } else {
unsigned maclen = skb_network_offset(skb); unsigned maclen = skb_network_offset(skb);
netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen)); netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen));
macoff = netoff - maclen; macoff = netoff - maclen;
} }
...@@ -616,9 +662,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe ...@@ -616,9 +662,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
} }
spin_lock(&sk->sk_receive_queue.lock); spin_lock(&sk->sk_receive_queue.lock);
h = packet_lookup_frame(po, po->head); h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
if (!h.raw)
if (h->tp_status)
goto ring_is_full; goto ring_is_full;
po->head = po->head != po->frame_max ? po->head+1 : 0; po->head = po->head != po->frame_max ? po->head+1 : 0;
po->stats.tp_packets++; po->stats.tp_packets++;
...@@ -630,20 +675,40 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe ...@@ -630,20 +675,40 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
status &= ~TP_STATUS_LOSING; status &= ~TP_STATUS_LOSING;
spin_unlock(&sk->sk_receive_queue.lock); spin_unlock(&sk->sk_receive_queue.lock);
skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen); skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
h->tp_len = skb->len; switch (po->tp_version) {
h->tp_snaplen = snaplen; case TPACKET_V1:
h->tp_mac = macoff; h.h1->tp_len = skb->len;
h->tp_net = netoff; h.h1->tp_snaplen = snaplen;
if (skb->tstamp.tv64) h.h1->tp_mac = macoff;
tv = ktime_to_timeval(skb->tstamp); h.h1->tp_net = netoff;
else if (skb->tstamp.tv64)
do_gettimeofday(&tv); tv = ktime_to_timeval(skb->tstamp);
h->tp_sec = tv.tv_sec; else
h->tp_usec = tv.tv_usec; do_gettimeofday(&tv);
h.h1->tp_sec = tv.tv_sec;
h.h1->tp_usec = tv.tv_usec;
hdrlen = sizeof(*h.h1);
break;
case TPACKET_V2:
h.h2->tp_len = skb->len;
h.h2->tp_snaplen = snaplen;
h.h2->tp_mac = macoff;
h.h2->tp_net = netoff;
if (skb->tstamp.tv64)
ts = ktime_to_timespec(skb->tstamp);
else
getnstimeofday(&ts);
h.h2->tp_sec = ts.tv_sec;
h.h2->tp_nsec = ts.tv_nsec;
hdrlen = sizeof(*h.h2);
break;
default:
BUG();
}
sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); sll = h.raw + TPACKET_ALIGN(hdrlen);
sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
sll->sll_family = AF_PACKET; sll->sll_family = AF_PACKET;
sll->sll_hatype = dev->type; sll->sll_hatype = dev->type;
...@@ -654,14 +719,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe ...@@ -654,14 +719,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
else else
sll->sll_ifindex = dev->ifindex; sll->sll_ifindex = dev->ifindex;
h->tp_status = status; __packet_set_status(po, h.raw, status);
smp_mb(); smp_mb();
{ {
struct page *p_start, *p_end; struct page *p_start, *p_end;
u8 *h_end = (u8 *)h + macoff + snaplen - 1; u8 *h_end = h.raw + macoff + snaplen - 1;
p_start = virt_to_page(h); p_start = virt_to_page(h.raw);
p_end = virt_to_page(h_end); p_end = virt_to_page(h_end);
while (p_start <= p_end) { while (p_start <= p_end) {
flush_dcache_page(p_start); flush_dcache_page(p_start);
...@@ -1362,6 +1427,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv ...@@ -1362,6 +1427,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
pkt_sk(sk)->copy_thresh = val; pkt_sk(sk)->copy_thresh = val;
return 0; return 0;
} }
/* Select the ring header format (one of enum tpacket_versions). */
case PACKET_VERSION:
{
int val;
/* The option value must be exactly one int. */
if (optlen != sizeof(val))
return -EINVAL;
/* Refuse to switch formats while po->pg_vec is set. */
if (po->pg_vec)
return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
/* Only known versions are accepted; anything else is -EINVAL. */
switch (val) {
case TPACKET_V1:
case TPACKET_V2:
po->tp_version = val;
return 0;
default:
return -EINVAL;
}
}
#endif #endif
case PACKET_AUXDATA: case PACKET_AUXDATA:
{ {
...@@ -1437,6 +1521,31 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ...@@ -1437,6 +1521,31 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
data = &val; data = &val;
break; break;
#ifdef CONFIG_PACKET_MMAP
	/* Report the ring header format currently selected on the socket. */
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	/*
	 * Report the size of the frame header for a given version.  The
	 * version to query is read from the caller's buffer, the header
	 * size is handed back through the same buffer.
	 */
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		/*
		 * A shorter buffer would fill only part of val, and the
		 * switch below would then act on whatever the remaining
		 * bytes happened to contain.
		 */
		if (len < sizeof(int))
			return -EINVAL;
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
#endif
default: default:
return -ENOPROTOOPT; return -ENOPROTOOPT;
} }
...@@ -1570,11 +1679,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock, ...@@ -1570,11 +1679,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
spin_lock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_receive_queue.lock);
if (po->pg_vec) { if (po->pg_vec) {
unsigned last = po->head ? po->head-1 : po->frame_max; unsigned last = po->head ? po->head-1 : po->frame_max;
struct tpacket_hdr *h;
h = packet_lookup_frame(po, last);
if (h->tp_status) if (packet_lookup_frame(po, last, TP_STATUS_USER))
mask |= POLLIN | POLLRDNORM; mask |= POLLIN | POLLRDNORM;
} }
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
...@@ -1669,11 +1775,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing ...@@ -1669,11 +1775,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
if (unlikely(po->pg_vec)) if (unlikely(po->pg_vec))
return -EBUSY; return -EBUSY;
switch (po->tp_version) {
case TPACKET_V1:
po->tp_hdrlen = TPACKET_HDRLEN;
break;
case TPACKET_V2:
po->tp_hdrlen = TPACKET2_HDRLEN;
break;
}
if (unlikely((int)req->tp_block_size <= 0)) if (unlikely((int)req->tp_block_size <= 0))
return -EINVAL; return -EINVAL;
if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
return -EINVAL; return -EINVAL;
if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) if (unlikely(req->tp_frame_size < po->tp_hdrlen))
return -EINVAL; return -EINVAL;
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
return -EINVAL; return -EINVAL;
...@@ -1692,13 +1807,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing ...@@ -1692,13 +1807,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
goto out; goto out;
for (i = 0; i < req->tp_block_nr; i++) { for (i = 0; i < req->tp_block_nr; i++) {
char *ptr = pg_vec[i]; void *ptr = pg_vec[i];
struct tpacket_hdr *header;
int k; int k;
for (k = 0; k < po->frames_per_block; k++) { for (k = 0; k < po->frames_per_block; k++) {
header = (struct tpacket_hdr *) ptr; __packet_set_status(po, ptr, TP_STATUS_KERNEL);
header->tp_status = TP_STATUS_KERNEL;
ptr += req->tp_frame_size; ptr += req->tp_frame_size;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment