Commit 9b8dd5e5 authored by Bailey Forrest's avatar Bailey Forrest Committed by David S. Miller

gve: DQO: Add RX path

The RX queue has an array of `gve_rx_buf_state_dqo` objects. All
allocated pages have an associated buf_state object. When a buffer is
posted on the RX buffer queue, the buffer ID will be the buf_state's
index into the RX queue's array.

On packet reception, the RX queue will have one descriptor for each
buffer associated with a received packet. Each RX descriptor will have
a buffer_id that was posted on the buffer queue.

Notable mentions:

- We use a default buffer size of 2048 bytes. Based on page size, we
  may post separate sections of a single page as separate buffers.

- The driver holds an extra reference on pages passed up the receive
  path with an skb and keeps these pages on a list. When posting new
  buffers to the NIC, we check if any of these pages has only our
  reference, or another buffer sized segment of the page has no
  references. If so, it is free to reuse. This page recycling approach
  is a common netdev optimization that reduces page alloc/free calls.

- Pages in the free list have a page_count bias in order to avoid an
  atomic increment of pagecount every time we attempt to reuse a page.
  # references = page_count() - bias

- In order to track when a page is safe to reuse, we keep track of the
  last offset which had a single SKB reference. When this occurs, it
  implies that every single other offset is reusable. Otherwise, we
  don't know if offsets can be safely reused.

- We maintain two free lists of pages. List #1 (recycled_buf_states)
  contains pages we know can be reused right away. List #2
  (used_buf_states) contains pages which cannot be used right away. We
  only attempt to get pages from list #2 when list #1 is empty. We only
  attempt to use a small fixed number pages from list #2 before giving
  up and allocating a new page. Both lists are FIFOs in hope that by the
  time we attempt to reuse a page, the references were dropped.
Signed-off-by: default avatarBailey Forrest <bcf@google.com>
Reviewed-by: default avatarWillem de Bruijn <willemb@google.com>
Reviewed-by: default avatarCatherine Sullivan <csully@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent a57e5de4
...@@ -59,6 +59,7 @@ struct gve_rx_slot_page_info { ...@@ -59,6 +59,7 @@ struct gve_rx_slot_page_info {
struct page *page; struct page *page;
void *page_address; void *page_address;
u32 page_offset; /* offset to write to in page */ u32 page_offset; /* offset to write to in page */
int pagecnt_bias; /* expected pagecnt if only the driver has a ref */
u8 can_flip; u8 can_flip;
}; };
......
...@@ -16,9 +16,161 @@ ...@@ -16,9 +16,161 @@
#include <net/ipv6.h> #include <net/ipv6.h>
#include <net/tcp.h> #include <net/tcp.h>
static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs)
{
return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias;
}
static void gve_free_page_dqo(struct gve_priv *priv, static void gve_free_page_dqo(struct gve_priv *priv,
struct gve_rx_buf_state_dqo *bs) struct gve_rx_buf_state_dqo *bs)
{ {
page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1);
gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr,
DMA_FROM_DEVICE);
bs->page_info.page = NULL;
}
static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
{
struct gve_rx_buf_state_dqo *buf_state;
s16 buffer_id;
buffer_id = rx->dqo.free_buf_states;
if (unlikely(buffer_id == -1))
return NULL;
buf_state = &rx->dqo.buf_states[buffer_id];
/* Remove buf_state from free list */
rx->dqo.free_buf_states = buf_state->next;
/* Point buf_state to itself to mark it as allocated */
buf_state->next = buffer_id;
return buf_state;
}
static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx,
struct gve_rx_buf_state_dqo *buf_state)
{
s16 buffer_id = buf_state - rx->dqo.buf_states;
return buf_state->next == buffer_id;
}
static void gve_free_buf_state(struct gve_rx_ring *rx,
struct gve_rx_buf_state_dqo *buf_state)
{
s16 buffer_id = buf_state - rx->dqo.buf_states;
buf_state->next = rx->dqo.free_buf_states;
rx->dqo.free_buf_states = buffer_id;
}
static struct gve_rx_buf_state_dqo *
gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list)
{
struct gve_rx_buf_state_dqo *buf_state;
s16 buffer_id;
buffer_id = list->head;
if (unlikely(buffer_id == -1))
return NULL;
buf_state = &rx->dqo.buf_states[buffer_id];
/* Remove buf_state from list */
list->head = buf_state->next;
if (buf_state->next == -1)
list->tail = -1;
/* Point buf_state to itself to mark it as allocated */
buf_state->next = buffer_id;
return buf_state;
}
static void gve_enqueue_buf_state(struct gve_rx_ring *rx,
struct gve_index_list *list,
struct gve_rx_buf_state_dqo *buf_state)
{
s16 buffer_id = buf_state - rx->dqo.buf_states;
buf_state->next = -1;
if (list->head == -1) {
list->head = buffer_id;
list->tail = buffer_id;
} else {
int tail = list->tail;
rx->dqo.buf_states[tail].next = buffer_id;
list->tail = buffer_id;
}
}
static struct gve_rx_buf_state_dqo *
gve_get_recycled_buf_state(struct gve_rx_ring *rx)
{
struct gve_rx_buf_state_dqo *buf_state;
int i;
/* Recycled buf states are immediately usable. */
buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states);
if (likely(buf_state))
return buf_state;
if (unlikely(rx->dqo.used_buf_states.head == -1))
return NULL;
/* Used buf states are only usable when ref count reaches 0, which means
* no SKBs refer to them.
*
* Search a limited number before giving up.
*/
for (i = 0; i < 5; i++) {
buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
if (gve_buf_ref_cnt(buf_state) == 0)
return buf_state;
gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
}
/* If there are no free buf states discard an entry from
* `used_buf_states` so it can be used.
*/
if (unlikely(rx->dqo.free_buf_states == -1)) {
buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
if (gve_buf_ref_cnt(buf_state) == 0)
return buf_state;
gve_free_page_dqo(rx->gve, buf_state);
gve_free_buf_state(rx, buf_state);
}
return NULL;
}
static int gve_alloc_page_dqo(struct gve_priv *priv,
struct gve_rx_buf_state_dqo *buf_state)
{
int err;
err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page,
&buf_state->addr, DMA_FROM_DEVICE);
if (err)
return err;
buf_state->page_info.page_offset = 0;
buf_state->page_info.page_address =
page_address(buf_state->page_info.page);
buf_state->last_single_ref_offset = 0;
/* The page already has 1 ref. */
page_ref_add(buf_state->page_info.page, INT_MAX - 1);
buf_state->page_info.pagecnt_bias = INT_MAX;
return 0;
} }
static void gve_rx_free_ring_dqo(struct gve_priv *priv, int idx) static void gve_rx_free_ring_dqo(struct gve_priv *priv, int idx)
...@@ -137,6 +289,14 @@ static int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) ...@@ -137,6 +289,14 @@ static int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx)
return -ENOMEM; return -ENOMEM;
} }
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
const struct gve_rx_ring *rx = &priv->rx[queue_idx];
u64 index = be32_to_cpu(rx->q_resources->db_index);
iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}
int gve_rx_alloc_rings_dqo(struct gve_priv *priv) int gve_rx_alloc_rings_dqo(struct gve_priv *priv)
{ {
int err = 0; int err = 0;
...@@ -171,11 +331,433 @@ void gve_rx_free_rings_dqo(struct gve_priv *priv) ...@@ -171,11 +331,433 @@ void gve_rx_free_rings_dqo(struct gve_priv *priv)
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{ {
struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
struct gve_priv *priv = rx->gve;
u32 num_avail_slots;
u32 num_full_slots;
u32 num_posted = 0;
num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
num_avail_slots = bufq->mask - num_full_slots;
num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
while (num_posted < num_avail_slots) {
struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
struct gve_rx_buf_state_dqo *buf_state;
buf_state = gve_get_recycled_buf_state(rx);
if (unlikely(!buf_state)) {
buf_state = gve_alloc_buf_state(rx);
if (unlikely(!buf_state))
break;
if (unlikely(gve_alloc_page_dqo(priv, buf_state))) {
u64_stats_update_begin(&rx->statss);
rx->rx_buf_alloc_fail++;
u64_stats_update_end(&rx->statss);
gve_free_buf_state(rx, buf_state);
break;
}
}
desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
desc->buf_addr = cpu_to_le64(buf_state->addr +
buf_state->page_info.page_offset);
bufq->tail = (bufq->tail + 1) & bufq->mask;
complq->num_free_slots--;
num_posted++;
if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
gve_rx_write_doorbell_dqo(priv, rx->q_num);
}
rx->fill_cnt += num_posted;
}
static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx,
struct gve_rx_buf_state_dqo *buf_state)
{
const int data_buffer_size = priv->data_buffer_size_dqo;
int pagecount;
/* Can't reuse if we only fit one buffer per page */
if (data_buffer_size * 2 > PAGE_SIZE)
goto mark_used;
pagecount = gve_buf_ref_cnt(buf_state);
/* Record the offset when we have a single remaining reference.
*
* When this happens, we know all of the other offsets of the page are
* usable.
*/
if (pagecount == 1) {
buf_state->last_single_ref_offset =
buf_state->page_info.page_offset;
}
/* Use the next buffer sized chunk in the page. */
buf_state->page_info.page_offset += data_buffer_size;
buf_state->page_info.page_offset &= (PAGE_SIZE - 1);
/* If we wrap around to the same offset without ever dropping to 1
* reference, then we don't know if this offset was ever freed.
*/
if (buf_state->page_info.page_offset ==
buf_state->last_single_ref_offset) {
goto mark_used;
}
gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
return;
mark_used:
gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
}
static void gve_rx_skb_csum(struct sk_buff *skb,
const struct gve_rx_compl_desc_dqo *desc,
struct gve_ptype ptype)
{
skb->ip_summed = CHECKSUM_NONE;
/* HW did not identify and process L3 and L4 headers. */
if (unlikely(!desc->l3_l4_processed))
return;
if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
return;
} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
/* Checksum should be skipped if this flag is set. */
if (unlikely(desc->ipv6_ex_add))
return;
}
if (unlikely(desc->csum_l4_err))
return;
switch (ptype.l4_type) {
case GVE_L4_TYPE_TCP:
case GVE_L4_TYPE_UDP:
case GVE_L4_TYPE_ICMP:
case GVE_L4_TYPE_SCTP:
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
default:
break;
}
}
static void gve_rx_skb_hash(struct sk_buff *skb,
const struct gve_rx_compl_desc_dqo *compl_desc,
struct gve_ptype ptype)
{
enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;
if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
hash_type = PKT_HASH_TYPE_L4;
else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
hash_type = PKT_HASH_TYPE_L3;
skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}
static void gve_rx_free_skb(struct gve_rx_ring *rx)
{
if (!rx->skb_head)
return;
dev_kfree_skb_any(rx->skb_head);
rx->skb_head = NULL;
rx->skb_tail = NULL;
}
/* Chains multi skbs for single rx packet.
* Returns 0 if buffer is appended, -1 otherwise.
*/
static int gve_rx_append_frags(struct napi_struct *napi,
struct gve_rx_buf_state_dqo *buf_state,
u16 buf_len, struct gve_rx_ring *rx,
struct gve_priv *priv)
{
int num_frags = skb_shinfo(rx->skb_tail)->nr_frags;
if (unlikely(num_frags == MAX_SKB_FRAGS)) {
struct sk_buff *skb;
skb = napi_alloc_skb(napi, 0);
if (!skb)
return -1;
skb_shinfo(rx->skb_tail)->frag_list = skb;
rx->skb_tail = skb;
num_frags = 0;
}
if (rx->skb_tail != rx->skb_head) {
rx->skb_head->len += buf_len;
rx->skb_head->data_len += buf_len;
rx->skb_head->truesize += priv->data_buffer_size_dqo;
}
skb_add_rx_frag(rx->skb_tail, num_frags,
buf_state->page_info.page,
buf_state->page_info.page_offset,
buf_len, priv->data_buffer_size_dqo);
gve_dec_pagecnt_bias(&buf_state->page_info);
return 0;
}
/* Returns 0 if descriptor is completed successfully.
* Returns -EINVAL if descriptor is invalid.
* Returns -ENOMEM if data cannot be copied to skb.
*/
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
const struct gve_rx_compl_desc_dqo *compl_desc,
int queue_idx)
{
const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
const bool eop = compl_desc->end_of_packet != 0;
struct gve_rx_buf_state_dqo *buf_state;
struct gve_priv *priv = rx->gve;
u16 buf_len;
if (unlikely(buffer_id > rx->dqo.num_buf_states)) {
net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
priv->dev->name, buffer_id);
return -EINVAL;
}
buf_state = &rx->dqo.buf_states[buffer_id];
if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
priv->dev->name, buffer_id);
return -EINVAL;
}
if (unlikely(compl_desc->rx_error)) {
gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
buf_state);
return -EINVAL;
}
buf_len = compl_desc->packet_len;
/* Page might have not been used for awhile and was likely last written
* by a different thread.
*/
prefetch(buf_state->page_info.page);
/* Sync the portion of dma buffer for CPU to read. */
dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
buf_state->page_info.page_offset,
buf_len, DMA_FROM_DEVICE);
/* Append to current skb if one exists. */
if (rx->skb_head) {
if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
priv)) != 0) {
goto error;
}
gve_try_recycle_buf(priv, rx, buf_state);
return 0;
}
/* Prefetch the payload header. */
prefetch((char *)buf_state->addr + buf_state->page_info.page_offset);
#if L1_CACHE_BYTES < 128
prefetch((char *)buf_state->addr + buf_state->page_info.page_offset +
L1_CACHE_BYTES);
#endif
if (eop && buf_len <= priv->rx_copybreak) {
rx->skb_head = gve_rx_copy(priv->dev, napi,
&buf_state->page_info, buf_len, 0);
if (unlikely(!rx->skb_head))
goto error;
rx->skb_tail = rx->skb_head;
u64_stats_update_begin(&rx->statss);
rx->rx_copied_pkt++;
rx->rx_copybreak_pkt++;
u64_stats_update_end(&rx->statss);
gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
buf_state);
return 0;
}
rx->skb_head = napi_get_frags(napi);
if (unlikely(!rx->skb_head))
goto error;
rx->skb_tail = rx->skb_head;
skb_add_rx_frag(rx->skb_head, 0, buf_state->page_info.page,
buf_state->page_info.page_offset, buf_len,
priv->data_buffer_size_dqo);
gve_dec_pagecnt_bias(&buf_state->page_info);
gve_try_recycle_buf(priv, rx, buf_state);
return 0;
error:
gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
return -ENOMEM;
}
static int gve_rx_complete_rsc(struct sk_buff *skb,
const struct gve_rx_compl_desc_dqo *desc,
struct gve_ptype ptype)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Only TCP is supported right now. */
if (ptype.l4_type != GVE_L4_TYPE_TCP)
return -EINVAL;
switch (ptype.l3_type) {
case GVE_L3_TYPE_IPV4:
shinfo->gso_type = SKB_GSO_TCPV4;
break;
case GVE_L3_TYPE_IPV6:
shinfo->gso_type = SKB_GSO_TCPV6;
break;
default:
return -EINVAL;
}
shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
return 0;
}
/* Returns 0 if skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
const struct gve_rx_compl_desc_dqo *desc,
netdev_features_t feat)
{
struct gve_ptype ptype =
rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
int err;
skb_record_rx_queue(rx->skb_head, rx->q_num);
if (feat & NETIF_F_RXHASH)
gve_rx_skb_hash(rx->skb_head, desc, ptype);
if (feat & NETIF_F_RXCSUM)
gve_rx_skb_csum(rx->skb_head, desc, ptype);
/* RSC packets must set gso_size otherwise the TCP stack will complain
* that packets are larger than MTU.
*/
if (desc->rsc) {
err = gve_rx_complete_rsc(rx->skb_head, desc, ptype);
if (err < 0)
return err;
}
if (skb_headlen(rx->skb_head) == 0)
napi_gro_frags(napi);
else
napi_gro_receive(napi, rx->skb_head);
return 0;
} }
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{ {
struct napi_struct *napi = &block->napi;
netdev_features_t feat = napi->dev->features;
struct gve_rx_ring *rx = block->rx;
struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
u32 work_done = 0; u32 work_done = 0;
u64 bytes = 0;
int err;
while (work_done < budget) {
struct gve_rx_compl_desc_dqo *compl_desc =
&complq->desc_ring[complq->head];
u32 pkt_bytes;
/* No more new packets */
if (compl_desc->generation == complq->cur_gen_bit)
break;
/* Prefetch the next two descriptors. */
prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);
/* Do not read data until we own the descriptor */
dma_rmb();
err = gve_rx_dqo(napi, rx, compl_desc, rx->q_num);
if (err < 0) {
gve_rx_free_skb(rx);
u64_stats_update_begin(&rx->statss);
if (err == -ENOMEM)
rx->rx_skb_alloc_fail++;
else if (err == -EINVAL)
rx->rx_desc_err_dropped_pkt++;
u64_stats_update_end(&rx->statss);
}
complq->head = (complq->head + 1) & complq->mask;
complq->num_free_slots++;
/* When the ring wraps, the generation bit is flipped. */
complq->cur_gen_bit ^= (complq->head == 0);
/* Receiving a completion means we have space to post another
* buffer on the buffer queue.
*/
{
struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
bufq->head = (bufq->head + 1) & bufq->mask;
}
/* Free running counter of completed descriptors */
rx->cnt++;
if (!rx->skb_head)
continue;
if (!compl_desc->end_of_packet)
continue;
work_done++;
pkt_bytes = rx->skb_head->len;
/* The ethernet header (first ETH_HLEN bytes) is snipped off
* by eth_type_trans.
*/
if (skb_headlen(rx->skb_head))
pkt_bytes += ETH_HLEN;
/* gve_rx_complete_skb() will consume skb if successful */
if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
gve_rx_free_skb(rx);
u64_stats_update_begin(&rx->statss);
rx->rx_desc_err_dropped_pkt++;
u64_stats_update_end(&rx->statss);
continue;
}
bytes += pkt_bytes;
rx->skb_head = NULL;
rx->skb_tail = NULL;
}
gve_rx_post_buffers_dqo(rx);
u64_stats_update_begin(&rx->statss);
rx->rpackets += work_done;
rx->rbytes += bytes;
u64_stats_update_end(&rx->statss);
return work_done; return work_done;
} }
...@@ -64,3 +64,18 @@ struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, ...@@ -64,3 +64,18 @@ struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi,
return skb; return skb;
} }
void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info)
{
page_info->pagecnt_bias--;
if (page_info->pagecnt_bias == 0) {
int pagecount = page_count(page_info->page);
/* If we have run out of bias - set it back up to INT_MAX
* minus the existing refs.
*/
page_info->pagecnt_bias = INT_MAX - pagecount;
/* Set pagecount back up to max. */
page_ref_add(page_info->page, INT_MAX - pagecount);
}
}
...@@ -21,5 +21,8 @@ struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, ...@@ -21,5 +21,8 @@ struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi,
struct gve_rx_slot_page_info *page_info, u16 len, struct gve_rx_slot_page_info *page_info, u16 len,
u16 pad); u16 pad);
/* Decrement pagecnt_bias. Set it back to INT_MAX if it reached zero. */
void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info);
#endif /* _GVE_UTILS_H */ #endif /* _GVE_UTILS_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment