Commit 30129cf2 authored by David S. Miller

Merge branch 'for-davem' of git://git.kernel.org/pub/scm/linux/kernel/git/bwh/sfc-next

Ben Hutchings says:

====================
1. Merge sfc changes (only) accepted for 3.9.

2. PTP improvements from Laurence Evans.

3. Overhaul of RX buffer management:
- Always allocate pages, and enable scattering where possible
- Fit as many buffers as will fit into a page, rather than limiting to 2
- Introduce recycle rings to reduce the need for IOMMU mapping and
  unmapping (a simplified sketch of the recycle-ring idea follows the
  commit metadata below)

4. PCI error recovery (AER and EEH) implementation.

5. Fix a bug in RX filter replacement.

6. Fix configuration with 1 RX queue in the PF and multiple RX queues in
VFs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e1733de2 1648a23f
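Before the diff itself, here is a minimal sketch of the recycle-ring idea mentioned in item 3 above. It is an illustration only, with hypothetical names and simplified logic; the driver's real mechanism is the page_ring/page_add/page_remove machinery added to struct efx_rx_queue later in this diff.

#include <linux/gfp.h>
#include <linux/mm.h>

struct page_recycle_ring {
	struct page **ring;	/* pages kept around for reuse */
	unsigned int add;	/* write counter */
	unsigned int remove;	/* read counter */
	unsigned int mask;	/* ring size - 1 (size is a power of two) */
};

/* Take a previously used page from the ring if the kernel has dropped
 * its reference; otherwise fall back to a fresh allocation.  Reusing a
 * page lets a driver keep its existing DMA mapping instead of unmapping
 * and remapping through the IOMMU on every refill.
 */
static struct page *recycle_get_page(struct page_recycle_ring *r)
{
	while (r->remove != r->add) {
		struct page *page = r->ring[r->remove & r->mask];

		r->ring[r->remove & r->mask] = NULL;
		r->remove++;
		if (page_count(page) == 1)
			return page;	/* sole owner: safe to reuse */
		put_page(page);		/* still referenced elsewhere: drop it */
	}
	return alloc_pages(GFP_ATOMIC, 0);
}

/* On the free path, keep the page for later instead of releasing it. */
static void recycle_put_page(struct page_recycle_ring *r, struct page *page)
{
	if (r->add - r->remove <= r->mask) {
		r->ring[r->add & r->mask] = page;
		r->add++;
	} else {
		put_page(page);		/* ring full: give the page back */
	}
}

The point of the ring is simply that a page the network stack has already released can be handed straight back to the NIC without another allocation and IOMMU mapping round trip.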
@@ -21,7 +21,9 @@
 #include <linux/ethtool.h>
 #include <linux/topology.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 #include <linux/cpu_rmap.h>
+#include <linux/aer.h>
 #include "net_driver.h"
 #include "efx.h"
 #include "nic.h"
@@ -71,21 +73,21 @@ const char *const efx_loopback_mode_names[] = {
 const unsigned int efx_reset_type_max = RESET_TYPE_MAX;
 const char *const efx_reset_type_names[] = {
 	[RESET_TYPE_INVISIBLE]          = "INVISIBLE",
 	[RESET_TYPE_ALL]                = "ALL",
+	[RESET_TYPE_RECOVER_OR_ALL]     = "RECOVER_OR_ALL",
 	[RESET_TYPE_WORLD]              = "WORLD",
+	[RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE",
 	[RESET_TYPE_DISABLE]            = "DISABLE",
 	[RESET_TYPE_TX_WATCHDOG]        = "TX_WATCHDOG",
 	[RESET_TYPE_INT_ERROR]          = "INT_ERROR",
 	[RESET_TYPE_RX_RECOVERY]        = "RX_RECOVERY",
 	[RESET_TYPE_RX_DESC_FETCH]      = "RX_DESC_FETCH",
 	[RESET_TYPE_TX_DESC_FETCH]      = "TX_DESC_FETCH",
 	[RESET_TYPE_TX_SKIP]            = "TX_SKIP",
 	[RESET_TYPE_MC_FAILURE]         = "MC_FAILURE",
 };
 
-#define EFX_MAX_MTU (9 * 1024)
-
 /* Reset workqueue. If any NIC has a hardware failure then a reset will be
  * queued onto this work queue. This is not a per-nic work queue, because
  * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised.
@@ -117,9 +119,12 @@ MODULE_PARM_DESC(separate_tx_channels,
 static int napi_weight = 64;
 
 /* This is the time (in jiffies) between invocations of the hardware
- * monitor.  On Falcon-based NICs, this will:
+ * monitor.
+ * On Falcon-based NICs, this will:
  * - Check the on-board hardware monitor;
  * - Poll the link state and reconfigure the hardware as necessary.
+ * On Siena-based NICs for power systems with EEH support, this will give EEH a
+ * chance to start.
  */
 static unsigned int efx_monitor_interval = 1 * HZ;
@@ -203,13 +208,14 @@ static void efx_stop_all(struct efx_nic *efx);
 
 #define EFX_ASSERT_RESET_SERIALISED(efx)		\
 	do {						\
 		if ((efx->state == STATE_READY) ||	\
+		    (efx->state == STATE_RECOVERY) ||	\
 		    (efx->state == STATE_DISABLED))	\
 			ASSERT_RTNL();			\
 	} while (0)
 
 static int efx_check_disabled(struct efx_nic *efx)
 {
-	if (efx->state == STATE_DISABLED) {
+	if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) {
 		netif_err(efx, drv, efx->net_dev,
 			  "device is disabled due to earlier errors\n");
 		return -EIO;
@@ -242,15 +248,9 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 		struct efx_rx_queue *rx_queue =
 			efx_channel_get_rx_queue(channel);
 
-		/* Deliver last RX packet. */
-		if (channel->rx_pkt) {
-			__efx_rx_packet(channel, channel->rx_pkt);
-			channel->rx_pkt = NULL;
-		}
-		if (rx_queue->enabled) {
-			efx_rx_strategy(channel);
+		efx_rx_flush_packet(channel);
+		if (rx_queue->enabled)
 			efx_fast_push_rx_descriptors(rx_queue);
-		}
 	}
 
 	return spent;
@@ -625,20 +625,51 @@ static int efx_probe_channels(struct efx_nic *efx)
  */
 static void efx_start_datapath(struct efx_nic *efx)
 {
+	bool old_rx_scatter = efx->rx_scatter;
 	struct efx_tx_queue *tx_queue;
 	struct efx_rx_queue *rx_queue;
 	struct efx_channel *channel;
+	size_t rx_buf_len;
 
 	/* Calculate the rx buffer allocation parameters required to
 	 * support the current MTU, including padding for header
 	 * alignment and overruns.
 	 */
-	efx->rx_buffer_len = (max(EFX_PAGE_IP_ALIGN, NET_IP_ALIGN) +
-			      EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
-			      efx->type->rx_buffer_hash_size +
-			      efx->type->rx_buffer_padding);
-	efx->rx_buffer_order = get_order(efx->rx_buffer_len +
-					 sizeof(struct efx_rx_page_state));
+	efx->rx_dma_len = (efx->type->rx_buffer_hash_size +
+			   EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
+			   efx->type->rx_buffer_padding);
+	rx_buf_len = (sizeof(struct efx_rx_page_state) +
+		      EFX_PAGE_IP_ALIGN + efx->rx_dma_len);
+	if (rx_buf_len <= PAGE_SIZE) {
+		efx->rx_scatter = false;
+		efx->rx_buffer_order = 0;
+	} else if (efx->type->can_rx_scatter) {
+		BUILD_BUG_ON(sizeof(struct efx_rx_page_state) +
+			     EFX_PAGE_IP_ALIGN + EFX_RX_USR_BUF_SIZE >
+			     PAGE_SIZE / 2);
+		efx->rx_scatter = true;
+		efx->rx_dma_len = EFX_RX_USR_BUF_SIZE;
+		efx->rx_buffer_order = 0;
+	} else {
+		efx->rx_scatter = false;
+		efx->rx_buffer_order = get_order(rx_buf_len);
+	}
+
+	efx_rx_config_page_split(efx);
+	if (efx->rx_buffer_order)
+		netif_dbg(efx, drv, efx->net_dev,
+			  "RX buf len=%u; page order=%u batch=%u\n",
+			  efx->rx_dma_len, efx->rx_buffer_order,
+			  efx->rx_pages_per_batch);
+	else
+		netif_dbg(efx, drv, efx->net_dev,
+			  "RX buf len=%u step=%u bpp=%u; page batch=%u\n",
+			  efx->rx_dma_len, efx->rx_page_buf_step,
+			  efx->rx_bufs_per_page, efx->rx_pages_per_batch);
+
+	/* RX filters also have scatter-enabled flags */
+	if (efx->rx_scatter != old_rx_scatter)
+		efx_filter_update_rx_scatter(efx);
 
 	/* We must keep at least one descriptor in a TX ring empty.
 	 * We could avoid this when the queue size does not exactly
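A rough illustration of how the three sizing branches above behave (illustrative reasoning, not text from the patch):

- MTU 1500: rx_dma_len (hash prefix + maximum frame length + padding) is well
  under 4 KiB, so rx_buf_len <= PAGE_SIZE and the driver takes the first
  branch: rx_scatter = false, rx_buffer_order = 0.
- MTU 9000 on a NIC with can_rx_scatter set (e.g. Falcon B0, as set further
  down in this diff): rx_buf_len exceeds PAGE_SIZE, so rx_dma_len is clamped to
  EFX_RX_USR_BUF_SIZE and the frame is delivered as several order-0 fragments.
- MTU 9000 on a NIC without can_rx_scatter (Falcon A1): the driver falls back
  to a single buffer of order get_order(rx_buf_len), i.e. a multi-page
  allocation.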
@@ -655,16 +686,12 @@ static void efx_start_datapath(struct efx_nic *efx)
 		efx_for_each_channel_tx_queue(tx_queue, channel)
 			efx_init_tx_queue(tx_queue);
 
-		/* The rx buffer allocation strategy is MTU dependent */
-		efx_rx_strategy(channel);
-
 		efx_for_each_channel_rx_queue(rx_queue, channel) {
 			efx_init_rx_queue(rx_queue);
 			efx_nic_generate_fill_event(rx_queue);
 		}
 
-		WARN_ON(channel->rx_pkt != NULL);
-		efx_rx_strategy(channel);
+		WARN_ON(channel->rx_pkt_n_frags);
 	}
 
 	if (netif_device_present(efx->net_dev))
@@ -683,7 +710,7 @@ static void efx_stop_datapath(struct efx_nic *efx)
 	BUG_ON(efx->port_enabled);
 
 	/* Only perform flush if dma is enabled */
-	if (dev->is_busmaster) {
+	if (dev->is_busmaster && efx->state != STATE_RECOVERY) {
 		rc = efx_nic_flush_queues(efx);
 
 		if (rc && EFX_WORKAROUND_7803(efx)) {
@@ -1596,13 +1623,15 @@ static void efx_start_all(struct efx_nic *efx)
 	efx_start_port(efx);
 	efx_start_datapath(efx);
 
-	/* Start the hardware monitor if there is one. Otherwise (we're link
-	 * event driven), we have to poll the PHY because after an event queue
-	 * flush, we could have a missed a link state change */
-	if (efx->type->monitor != NULL) {
+	/* Start the hardware monitor if there is one */
+	if (efx->type->monitor != NULL)
 		queue_delayed_work(efx->workqueue, &efx->monitor_work,
 				   efx_monitor_interval);
-	} else {
+
+	/* If link state detection is normally event-driven, we have
+	 * to poll now because we could have missed a change
+	 */
+	if (efx_nic_rev(efx) >= EFX_REV_SIENA_A0) {
 		mutex_lock(&efx->mac_lock);
 		if (efx->phy_op->poll(efx))
 			efx_link_status_changed(efx);
@@ -2309,7 +2338,9 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)
  out:
 	/* Leave device stopped if necessary */
-	disabled = rc || method == RESET_TYPE_DISABLE;
+	disabled = rc ||
+		method == RESET_TYPE_DISABLE ||
+		method == RESET_TYPE_RECOVER_OR_DISABLE;
 	rc2 = efx_reset_up(efx, method, !disabled);
 	if (rc2) {
 		disabled = true;
@@ -2328,13 +2359,48 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)
 	return rc;
 }
/* Try recovery mechanisms.
* For now only EEH is supported.
* Returns 0 if the recovery mechanisms are unsuccessful.
* Returns a non-zero value otherwise.
*/
static int efx_try_recovery(struct efx_nic *efx)
{
#ifdef CONFIG_EEH
/* A PCI error can occur and not be seen by EEH because nothing
* happens on the PCI bus. In this case the driver may fail and
* schedule a 'recover or reset', leading to this recovery handler.
* Manually call the eeh failure check function.
*/
struct eeh_dev *eehdev =
of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
if (eeh_dev_check_failure(eehdev)) {
/* The EEH mechanisms will handle the error and reset the
* device if necessary.
*/
return 1;
}
#endif
return 0;
}
 /* The worker thread exists so that code that cannot sleep can
  * schedule a reset for later.
  */
 static void efx_reset_work(struct work_struct *data)
 {
 	struct efx_nic *efx = container_of(data, struct efx_nic, reset_work);
-	unsigned long pending = ACCESS_ONCE(efx->reset_pending);
+	unsigned long pending;
+	enum reset_type method;
+
+	pending = ACCESS_ONCE(efx->reset_pending);
+	method = fls(pending) - 1;
+
+	if ((method == RESET_TYPE_RECOVER_OR_DISABLE ||
+	     method == RESET_TYPE_RECOVER_OR_ALL) &&
+	    efx_try_recovery(efx))
+		return;
 
 	if (!pending)
 		return;
 
@@ -2346,7 +2412,7 @@ static void efx_reset_work(struct work_struct *data)
 	 * it cannot change again.
 	 */
 	if (efx->state == STATE_READY)
-		(void)efx_reset(efx, fls(pending) - 1);
+		(void)efx_reset(efx, method);
 
 	rtnl_unlock();
 }
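As background (an explanatory note, not text from the patch): efx->reset_pending is a bitmask indexed by enum reset_type, and because the reset types are numbered in order of increasing scope, fls(pending) - 1 selects the largest-scope reset that has been requested. For example, with the renumbered enum introduced further down:

	pending = BIT(RESET_TYPE_ALL) | BIT(RESET_TYPE_RECOVER_OR_DISABLE);
					/* (1 << 2) | (1 << 4) = 0x14 */
	method = fls(pending) - 1;	/* 5 - 1 = 4 == RESET_TYPE_RECOVER_OR_DISABLE */

so efx_reset_work() first tries efx_try_recovery() for the RECOVER_OR_* methods and only falls back to a real reset if recovery is not possible.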
@@ -2355,11 +2421,20 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
 {
 	enum reset_type method;
 
+	if (efx->state == STATE_RECOVERY) {
+		netif_dbg(efx, drv, efx->net_dev,
+			  "recovering: skip scheduling %s reset\n",
+			  RESET_TYPE(type));
+		return;
+	}
+
 	switch (type) {
 	case RESET_TYPE_INVISIBLE:
 	case RESET_TYPE_ALL:
+	case RESET_TYPE_RECOVER_OR_ALL:
 	case RESET_TYPE_WORLD:
 	case RESET_TYPE_DISABLE:
+	case RESET_TYPE_RECOVER_OR_DISABLE:
 		method = type;
 		netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n",
 			  RESET_TYPE(method));
@@ -2569,6 +2644,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_fini_struct(efx);
 	pci_set_drvdata(pci_dev, NULL);
 	free_netdev(efx->net_dev);
+
+	pci_disable_pcie_error_reporting(pci_dev);
 };
 
 /* NIC VPD information
@@ -2741,6 +2818,11 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
 		netif_warn(efx, probe, efx->net_dev,
 			   "failed to create MTDs (%d)\n", rc);
 
+	rc = pci_enable_pcie_error_reporting(pci_dev);
+	if (rc && rc != -EINVAL)
+		netif_warn(efx, probe, efx->net_dev,
+			   "pci_enable_pcie_error_reporting failed (%d)\n", rc);
+
 	return 0;
 
  fail4:
@@ -2865,12 +2947,112 @@ static const struct dev_pm_ops efx_pm_ops = {
 	.restore	= efx_pm_resume,
 };
/* A PCI error affecting this device was detected.
* At this point MMIO and DMA may be disabled.
* Stop the software path and request a slot reset.
*/
pci_ers_result_t efx_io_error_detected(struct pci_dev *pdev,
enum pci_channel_state state)
{
pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
struct efx_nic *efx = pci_get_drvdata(pdev);
if (state == pci_channel_io_perm_failure)
return PCI_ERS_RESULT_DISCONNECT;
rtnl_lock();
if (efx->state != STATE_DISABLED) {
efx->state = STATE_RECOVERY;
efx->reset_pending = 0;
efx_device_detach_sync(efx);
efx_stop_all(efx);
efx_stop_interrupts(efx, false);
status = PCI_ERS_RESULT_NEED_RESET;
} else {
/* If the interface is disabled we don't want to do anything
* with it.
*/
status = PCI_ERS_RESULT_RECOVERED;
}
rtnl_unlock();
pci_disable_device(pdev);
return status;
}
/* Fake a successful reset, which will be performed later in efx_io_resume. */
pci_ers_result_t efx_io_slot_reset(struct pci_dev *pdev)
{
struct efx_nic *efx = pci_get_drvdata(pdev);
pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
int rc;
if (pci_enable_device(pdev)) {
netif_err(efx, hw, efx->net_dev,
"Cannot re-enable PCI device after reset.\n");
status = PCI_ERS_RESULT_DISCONNECT;
}
rc = pci_cleanup_aer_uncorrect_error_status(pdev);
if (rc) {
netif_err(efx, hw, efx->net_dev,
"pci_cleanup_aer_uncorrect_error_status failed (%d)\n", rc);
/* Non-fatal error. Continue. */
}
return status;
}
/* Perform the actual reset and resume I/O operations. */
static void efx_io_resume(struct pci_dev *pdev)
{
struct efx_nic *efx = pci_get_drvdata(pdev);
int rc;
rtnl_lock();
if (efx->state == STATE_DISABLED)
goto out;
rc = efx_reset(efx, RESET_TYPE_ALL);
if (rc) {
netif_err(efx, hw, efx->net_dev,
"efx_reset failed after PCI error (%d)\n", rc);
} else {
efx->state = STATE_READY;
netif_dbg(efx, hw, efx->net_dev,
"Done resetting and resuming IO after PCI error.\n");
}
out:
rtnl_unlock();
}
/* For simplicity and reliability, we always require a slot reset and try to
* reset the hardware when a pci error affecting the device is detected.
* We leave both the link_reset and mmio_enabled callback unimplemented:
* with our request for slot reset the mmio_enabled callback will never be
* called, and the link_reset callback is not used by AER or EEH mechanisms.
*/
static struct pci_error_handlers efx_err_handlers = {
.error_detected = efx_io_error_detected,
.slot_reset = efx_io_slot_reset,
.resume = efx_io_resume,
};
 static struct pci_driver efx_pci_driver = {
 	.name		= KBUILD_MODNAME,
 	.id_table	= efx_pci_table,
 	.probe		= efx_pci_probe,
 	.remove		= efx_pci_remove,
 	.driver.pm	= &efx_pm_ops,
+	.err_handler	= &efx_err_handlers,
 };
 
 /**************************************************************************
......
@@ -33,17 +33,22 @@ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
 extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 
 /* RX */
+extern void efx_rx_config_page_split(struct efx_nic *efx);
 extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
-extern void efx_rx_strategy(struct efx_channel *channel);
 extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);
 extern void efx_rx_slow_fill(unsigned long context);
-extern void __efx_rx_packet(struct efx_channel *channel,
-			    struct efx_rx_buffer *rx_buf);
-extern void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
-			  unsigned int len, u16 flags);
+extern void __efx_rx_packet(struct efx_channel *channel);
+extern void efx_rx_packet(struct efx_rx_queue *rx_queue,
+			  unsigned int index, unsigned int n_frags,
+			  unsigned int len, u16 flags);
+static inline void efx_rx_flush_packet(struct efx_channel *channel)
+{
+	if (channel->rx_pkt_n_frags)
+		__efx_rx_packet(channel);
+}
 extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 
 #define EFX_MAX_DMAQ_SIZE 4096UL
@@ -67,6 +72,7 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 extern int efx_probe_filters(struct efx_nic *efx);
 extern void efx_restore_filters(struct efx_nic *efx);
 extern void efx_remove_filters(struct efx_nic *efx);
+extern void efx_filter_update_rx_scatter(struct efx_nic *efx);
 extern s32 efx_filter_insert_filter(struct efx_nic *efx,
 				    struct efx_filter_spec *spec,
 				    bool replace);
@@ -171,9 +177,9 @@ static inline void efx_device_detach_sync(struct efx_nic *efx)
 	 * TX scheduler is stopped when we're done and before
 	 * netif_device_present() becomes false.
 	 */
-	netif_tx_lock(dev);
+	netif_tx_lock_bh(dev);
 	netif_device_detach(dev);
-	netif_tx_unlock(dev);
+	netif_tx_unlock_bh(dev);
 }
 
 #endif /* EFX_EFX_H */
@@ -137,8 +137,12 @@ enum efx_loopback_mode {
  * Reset methods are numbered in order of increasing scope.
  *
  * @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only)
+ * @RESET_TYPE_RECOVER_OR_ALL: Try to recover. Apply RESET_TYPE_ALL
+ *	if unsuccessful.
  * @RESET_TYPE_ALL: Reset datapath, MAC and PHY
  * @RESET_TYPE_WORLD: Reset as much as possible
+ * @RESET_TYPE_RECOVER_OR_DISABLE: Try to recover. Apply RESET_TYPE_DISABLE if
+ *	unsuccessful.
  * @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled
  * @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog
  * @RESET_TYPE_INT_ERROR: reset due to internal error
@@ -150,9 +154,11 @@ enum efx_loopback_mode {
  */
 enum reset_type {
 	RESET_TYPE_INVISIBLE = 0,
-	RESET_TYPE_ALL = 1,
-	RESET_TYPE_WORLD = 2,
-	RESET_TYPE_DISABLE = 3,
+	RESET_TYPE_RECOVER_OR_ALL = 1,
+	RESET_TYPE_ALL = 2,
+	RESET_TYPE_WORLD = 3,
+	RESET_TYPE_RECOVER_OR_DISABLE = 4,
+	RESET_TYPE_DISABLE = 5,
 	RESET_TYPE_MAX_METHOD,
 	RESET_TYPE_TX_WATCHDOG,
 	RESET_TYPE_INT_ERROR,
......
@@ -154,6 +154,7 @@ static const struct efx_ethtool_stat efx_ethtool_stats[] = {
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err),
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch),
 	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc),
+	EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_nodesc_trunc),
 };
 
 /* Number of ethtool statistics */
@@ -978,7 +979,8 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx,
 			 rule->m_ext.data[1]))
 		return -EINVAL;
 
-	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, 0,
+	efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL,
+			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
 			   (rule->ring_cookie == RX_CLS_FLOW_DISC) ?
 			   0xfff : rule->ring_cookie);
......
@@ -1546,10 +1546,6 @@ static int falcon_probe_nic(struct efx_nic *efx)
 
 static void falcon_init_rx_cfg(struct efx_nic *efx)
 {
-	/* Prior to Siena the RX DMA engine will split each frame at
-	 * intervals of RX_USR_BUF_SIZE (32-byte units). We set it to
-	 * be so large that that never happens. */
-	const unsigned huge_buf_size = (3 * 4096) >> 5;
 	/* RX control FIFO thresholds (32 entries) */
 	const unsigned ctrl_xon_thr = 20;
 	const unsigned ctrl_xoff_thr = 25;
@@ -1557,10 +1553,15 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
 	efx_reado(efx, &reg, FR_AZ_RX_CFG);
 	if (efx_nic_rev(efx) <= EFX_REV_FALCON_A1) {
-		/* Data FIFO size is 5.5K */
+		/* Data FIFO size is 5.5K.  The RX DMA engine only
+		 * supports scattering for user-mode queues, but will
+		 * split DMA writes at intervals of RX_USR_BUF_SIZE
+		 * (32-byte units) even for kernel-mode queues.  We
+		 * set it to be so large that that never happens.
+		 */
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_DESC_PUSH_EN, 0);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_USR_BUF_SIZE,
-				    huge_buf_size);
+				    (3 * 4096) >> 5);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_MAC_TH, 512 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XOFF_MAC_TH, 2048 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_TX_TH, ctrl_xon_thr);
@@ -1569,7 +1570,7 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
 		/* Data FIFO size is 80K; register fields moved */
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_DESC_PUSH_EN, 0);
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_USR_BUF_SIZE,
-				    huge_buf_size);
+				    EFX_RX_USR_BUF_SIZE >> 5);
 		/* Send XON and XOFF at ~3 * max MTU away from empty/full */
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XON_MAC_TH, 27648 >> 8);
 		EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XOFF_MAC_TH, 54272 >> 8);
@@ -1815,6 +1816,7 @@ const struct efx_nic_type falcon_a1_nic_type = {
 	.evq_rptr_tbl_base = FR_AA_EVQ_RPTR_KER,
 	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
 	.rx_buffer_padding = 0x24,
+	.can_rx_scatter = false,
 	.max_interrupt_mode = EFX_INT_MODE_MSI,
 	.phys_addr_channels = 4,
 	.timer_period_max = 1 << FRF_AB_TC_TIMER_VAL_WIDTH,
@@ -1865,6 +1867,7 @@ const struct efx_nic_type falcon_b0_nic_type = {
 	.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
 	.rx_buffer_hash_size = 0x10,
 	.rx_buffer_padding = 0,
+	.can_rx_scatter = true,
 	.max_interrupt_mode = EFX_INT_MODE_MSIX,
 	.phys_addr_channels = 32,	/* Hardware limit is 64, but the legacy
 					 * interrupt handler only supports 32
......
@@ -66,6 +66,10 @@ struct efx_filter_state {
 #endif
 };
 
+static void efx_filter_table_clear_entry(struct efx_nic *efx,
+					 struct efx_filter_table *table,
+					 unsigned int filter_idx);
+
 /* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit
  * key derived from the n-tuple. The initial LFSR state is 0xffff. */
 static u16 efx_filter_hash(u32 key)
@@ -168,6 +172,25 @@ static void efx_filter_push_rx_config(struct efx_nic *efx)
 			filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED,
 			!!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
 			   EFX_FILTER_FLAG_RX_RSS));
+
+		/* There is a single bit to enable RX scatter for all
+		 * unmatched packets.  Only set it if scatter is
+		 * enabled in both filter specs.
+		 */
+		EFX_SET_OWORD_FIELD(
+			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
+			!!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags &
+			   table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
+			   EFX_FILTER_FLAG_RX_SCATTER));
+	} else if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) {
+		/* We don't expose 'default' filters because unmatched
+		 * packets always go to the queue number found in the
+		 * RSS table.  But we still need to set the RX scatter
+		 * bit here.
+		 */
+		EFX_SET_OWORD_FIELD(
+			filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
+			efx->rx_scatter);
 	}
 
 	efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL);
@@ -409,9 +432,18 @@ static void efx_filter_reset_rx_def(struct efx_nic *efx, unsigned filter_idx)
 	struct efx_filter_state *state = efx->filter_state;
 	struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF];
 	struct efx_filter_spec *spec = &table->spec[filter_idx];
+	enum efx_filter_flags flags = 0;
+
+	/* If there's only one channel then disable RSS for non VF
+	 * traffic, thereby allowing VFs to use RSS when the PF can't.
+	 */
+	if (efx->n_rx_channels > 1)
+		flags |= EFX_FILTER_FLAG_RX_RSS;
 
-	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL,
-			   EFX_FILTER_FLAG_RX_RSS, 0);
+	if (efx->rx_scatter)
+		flags |= EFX_FILTER_FLAG_RX_SCATTER;
+
+	efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, flags, 0);
 	spec->type = EFX_FILTER_UC_DEF + filter_idx;
 	table->used_bitmap[0] |= 1 << filter_idx;
 }
@@ -463,13 +495,6 @@ static u32 efx_filter_build(efx_oword_t *filter, struct efx_filter_spec *spec)
 		break;
 	}
 
-	case EFX_FILTER_TABLE_RX_DEF:
-		/* One filter spec per type */
-		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
-		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
-			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
-		return spec->type - EFX_FILTER_UC_DEF;
-
 	case EFX_FILTER_TABLE_RX_MAC: {
 		bool is_wild = spec->type == EFX_FILTER_MAC_WILD;
 		EFX_POPULATE_OWORD_7(
@@ -520,42 +545,6 @@ static bool efx_filter_equal(const struct efx_filter_spec *left,
 	return true;
 }
 
-static int efx_filter_search(struct efx_filter_table *table,
-			     struct efx_filter_spec *spec, u32 key,
-			     bool for_insert, unsigned int *depth_required)
-{
-	unsigned hash, incr, filter_idx, depth, depth_max;
-
-	hash = efx_filter_hash(key);
-	incr = efx_filter_increment(key);
-
-	filter_idx = hash & (table->size - 1);
-	depth = 1;
-	depth_max = (for_insert ?
-		     (spec->priority <= EFX_FILTER_PRI_HINT ?
-		      FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX) :
-		     table->search_depth[spec->type]);
-
-	for (;;) {
-		/* Return success if entry is used and matches this spec
-		 * or entry is unused and we are trying to insert.
-		 */
-		if (test_bit(filter_idx, table->used_bitmap) ?
-		    efx_filter_equal(spec, &table->spec[filter_idx]) :
-		    for_insert) {
-			*depth_required = depth;
-			return filter_idx;
-		}
-
-		/* Return failure if we reached the maximum search depth */
-		if (depth == depth_max)
-			return for_insert ? -EBUSY : -ENOENT;
-
-		filter_idx = (filter_idx + incr) & (table->size - 1);
-		++depth;
-	}
-}
-
 /*
  * Construct/deconstruct external filter IDs.  At least the RX filter
  * IDs must be ordered by matching priority, for RX NFC semantics.
@@ -650,44 +639,111 @@ u32 efx_filter_get_rx_id_limit(struct efx_nic *efx)
  * efx_filter_insert_filter - add or replace a filter
  * @efx: NIC in which to insert the filter
  * @spec: Specification for the filter
- * @replace: Flag for whether the specified filter may replace a filter
- *	with an identical match expression and equal or lower priority
+ * @replace_equal: Flag for whether the specified filter may replace an
+ *	existing filter with equal priority
  *
  * On success, return the filter ID.
  * On failure, return a negative error code.
+ *
+ * If an existing filter has equal match values to the new filter
+ * spec, then the new filter might replace it, depending on the
+ * relative priorities.  If the existing filter has lower priority, or
+ * if @replace_equal is set and it has equal priority, then it is
+ * replaced.  Otherwise the function fails, returning -%EPERM if
+ * the existing filter has higher priority or -%EEXIST if it has
+ * equal priority.
  */
 s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
-			     bool replace)
+			     bool replace_equal)
 {
 	struct efx_filter_state *state = efx->filter_state;
 	struct efx_filter_table *table = efx_filter_spec_table(state, spec);
-	struct efx_filter_spec *saved_spec;
 	efx_oword_t filter;
-	unsigned int filter_idx, depth = 0;
-	u32 key;
+	int rep_index, ins_index;
+	unsigned int depth = 0;
 	int rc;
 
 	if (!table || table->size == 0)
 		return -EINVAL;
 
-	key = efx_filter_build(&filter, spec);
-
 	netif_vdbg(efx, hw, efx->net_dev,
 		   "%s: type %d search_depth=%d", __func__, spec->type,
 		   table->search_depth[spec->type]);
 
-	spin_lock_bh(&state->lock);
+	if (table->id == EFX_FILTER_TABLE_RX_DEF) {
+		/* One filter spec per type */
+		BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
+		BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
+			     EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
+		rep_index = spec->type - EFX_FILTER_INDEX_UC_DEF;
+		ins_index = rep_index;
 
-	rc = efx_filter_search(table, spec, key, true, &depth);
-	if (rc < 0)
-		goto out;
-	filter_idx = rc;
-	BUG_ON(filter_idx >= table->size);
-	saved_spec = &table->spec[filter_idx];
-
-	if (test_bit(filter_idx, table->used_bitmap)) {
-		/* Should we replace the existing filter? */
-		if (!replace) {
+		spin_lock_bh(&state->lock);
+	} else {
+		/* Search concurrently for
+		 * (1) a filter to be replaced (rep_index): any filter
+		 *	with the same match values, up to the current
+		 *	search depth for this type, and
+		 * (2) the insertion point (ins_index): (1) or any
+		 *	free slot before it or up to the maximum search
+		 *	depth for this priority
+		 * We fail if we cannot find (2).
+		 *
+		 * We can stop once either
+		 * (a) we find (1), in which case we have definitely
+		 *	found (2) as well; or
+		 * (b) we have searched exhaustively for (1), and have
+		 *	either found (2) or searched exhaustively for it
+		 */
+		u32 key = efx_filter_build(&filter, spec);
+		unsigned int hash = efx_filter_hash(key);
+		unsigned int incr = efx_filter_increment(key);
+		unsigned int max_rep_depth = table->search_depth[spec->type];
+		unsigned int max_ins_depth =
+			spec->priority <= EFX_FILTER_PRI_HINT ?
+			FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX;
+		unsigned int i = hash & (table->size - 1);
+
+		ins_index = -1;
+		depth = 1;
+
+		spin_lock_bh(&state->lock);
+
+		for (;;) {
+			if (!test_bit(i, table->used_bitmap)) {
+				if (ins_index < 0)
+					ins_index = i;
+			} else if (efx_filter_equal(spec, &table->spec[i])) {
+				/* Case (a) */
+				if (ins_index < 0)
+					ins_index = i;
+				rep_index = i;
+				break;
+			}
+
+			if (depth >= max_rep_depth &&
+			    (ins_index >= 0 || depth >= max_ins_depth)) {
+				/* Case (b) */
+				if (ins_index < 0) {
+					rc = -EBUSY;
+					goto out;
+				}
+				rep_index = -1;
+				break;
+			}
+
+			i = (i + incr) & (table->size - 1);
+			++depth;
+		}
+	}
+
+	/* If we found a filter to be replaced, check whether we
+	 * should do so
+	 */
+	if (rep_index >= 0) {
+		struct efx_filter_spec *saved_spec = &table->spec[rep_index];
+
+		if (spec->priority == saved_spec->priority && !replace_equal) {
 			rc = -EEXIST;
 			goto out;
 		}
@@ -695,11 +751,14 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 			rc = -EPERM;
 			goto out;
 		}
-	} else {
-		__set_bit(filter_idx, table->used_bitmap);
+	}
+
+	/* Insert the filter */
+	if (ins_index != rep_index) {
+		__set_bit(ins_index, table->used_bitmap);
 		++table->used;
 	}
-	*saved_spec = *spec;
+	table->spec[ins_index] = *spec;
 
 	if (table->id == EFX_FILTER_TABLE_RX_DEF) {
 		efx_filter_push_rx_config(efx);
@@ -713,13 +772,19 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
 		}
 		efx_writeo(efx, &filter,
-			   table->offset + table->step * filter_idx);
+			   table->offset + table->step * ins_index);
+
+		/* If we were able to replace a filter by inserting
+		 * at a lower depth, clear the replaced filter
+		 */
+		if (ins_index != rep_index && rep_index >= 0)
+			efx_filter_table_clear_entry(efx, table, rep_index);
 	}
 
 	netif_vdbg(efx, hw, efx->net_dev,
 		   "%s: filter type %d index %d rxq %u set",
-		   __func__, spec->type, filter_idx, spec->dmaq_id);
-	rc = efx_filter_make_id(spec, filter_idx);
+		   __func__, spec->type, ins_index, spec->dmaq_id);
+	rc = efx_filter_make_id(spec, ins_index);
 
 out:
 	spin_unlock_bh(&state->lock);
@@ -1060,6 +1125,50 @@ void efx_remove_filters(struct efx_nic *efx)
 	kfree(state);
 }
/* Update scatter enable flags for filters pointing to our own RX queues */
void efx_filter_update_rx_scatter(struct efx_nic *efx)
{
struct efx_filter_state *state = efx->filter_state;
enum efx_filter_table_id table_id;
struct efx_filter_table *table;
efx_oword_t filter;
unsigned int filter_idx;
spin_lock_bh(&state->lock);
for (table_id = EFX_FILTER_TABLE_RX_IP;
table_id <= EFX_FILTER_TABLE_RX_DEF;
table_id++) {
table = &state->table[table_id];
for (filter_idx = 0; filter_idx < table->size; filter_idx++) {
if (!test_bit(filter_idx, table->used_bitmap) ||
table->spec[filter_idx].dmaq_id >=
efx->n_rx_channels)
continue;
if (efx->rx_scatter)
table->spec[filter_idx].flags |=
EFX_FILTER_FLAG_RX_SCATTER;
else
table->spec[filter_idx].flags &=
~EFX_FILTER_FLAG_RX_SCATTER;
if (table_id == EFX_FILTER_TABLE_RX_DEF)
/* Pushed by efx_filter_push_rx_config() */
continue;
efx_filter_build(&filter, &table->spec[filter_idx]);
efx_writeo(efx, &filter,
table->offset + table->step * filter_idx);
}
}
efx_filter_push_rx_config(efx);
spin_unlock_bh(&state->lock);
}
 #ifdef CONFIG_RFS_ACCEL
 int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
......
@@ -553,6 +553,7 @@
 #define MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */
 #define MC_CMD_PTP_MODE_V2 0x2 /* enum */
 #define MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */
+#define MC_CMD_PTP_MODE_V2_ENHANCED 0x4 /* enum */
 
 /* MC_CMD_PTP_IN_DISABLE msgrequest */
 #define MC_CMD_PTP_IN_DISABLE_LEN 8
......
@@ -69,6 +69,12 @@
 #define EFX_TXQ_TYPES		4
 #define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
 
+/* Maximum possible MTU the driver supports */
+#define EFX_MAX_MTU (9 * 1024)
+
+/* Size of an RX scatter buffer.  Small enough to pack 2 into a 4K page. */
+#define EFX_RX_USR_BUF_SIZE 1824
+
 /* Forward declare Precision Time Protocol (PTP) support structure. */
 struct efx_ptp_data;
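A quick check on the 1824-byte figure (illustrative arithmetic, not from the patch): two scatter buffers occupy 2 * 1824 = 3648 bytes, leaving 4096 - 3648 = 448 bytes of a 4 KiB page for the struct efx_rx_page_state header and per-buffer EFX_PAGE_IP_ALIGN padding, so two buffers do indeed fit in one page. This is also what the BUILD_BUG_ON() added to efx_start_datapath() earlier in this diff asserts.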
@@ -206,25 +212,23 @@ struct efx_tx_queue {
 /**
  * struct efx_rx_buffer - An Efx RX data buffer
  * @dma_addr: DMA base address of the buffer
- * @skb: The associated socket buffer. Valid iff !(@flags & %EFX_RX_BUF_PAGE).
- *	Will be %NULL if the buffer slot is currently free.
- * @page: The associated page buffer. Valif iff @flags & %EFX_RX_BUF_PAGE.
+ * @page: The associated page buffer.
  *	Will be %NULL if the buffer slot is currently free.
- * @page_offset: Offset within page. Valid iff @flags & %EFX_RX_BUF_PAGE.
- * @len: Buffer length, in bytes.
- * @flags: Flags for buffer and packet state.
+ * @page_offset: If pending: offset in @page of DMA base address.
+ *	If completed: offset in @page of Ethernet header.
+ * @len: If pending: length for DMA descriptor.
+ *	If completed: received length, excluding hash prefix.
+ * @flags: Flags for buffer and packet state.  These are only set on the
+ *	first buffer of a scattered packet.
  */
 struct efx_rx_buffer {
 	dma_addr_t dma_addr;
-	union {
-		struct sk_buff *skb;
-		struct page *page;
-	} u;
+	struct page *page;
 	u16 page_offset;
 	u16 len;
 	u16 flags;
 };
-#define EFX_RX_BUF_PAGE		0x0001
+#define EFX_RX_BUF_LAST_IN_PAGE	0x0001
 #define EFX_RX_PKT_CSUMMED	0x0002
 #define EFX_RX_PKT_DISCARD	0x0004
@@ -260,14 +264,23 @@ struct efx_rx_page_state {
  * @added_count: Number of buffers added to the receive queue.
  * @notified_count: Number of buffers given to NIC (<= @added_count).
  * @removed_count: Number of buffers removed from the receive queue.
+ * @scatter_n: Number of buffers used by current packet
+ * @page_ring: The ring to store DMA mapped pages for reuse.
+ * @page_add: Counter to calculate the write pointer for the recycle ring.
+ * @page_remove: Counter to calculate the read pointer for the recycle ring.
+ * @page_recycle_count: The number of pages that have been recycled.
+ * @page_recycle_failed: The number of pages that couldn't be recycled because
+ *	the kernel still held a reference to them.
+ * @page_recycle_full: The number of pages that were released because the
+ *	recycle ring was full.
+ * @page_ptr_mask: The number of pages in the RX recycle ring minus 1.
  * @max_fill: RX descriptor maximum fill level (<= ring size)
  * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill
  *	(<= @max_fill)
  * @min_fill: RX descriptor minimum non-zero fill level.
  *	This records the minimum fill level observed when a ring
  *	refill was triggered.
- * @alloc_page_count: RX allocation strategy counter.
- * @alloc_skb_count: RX allocation strategy counter.
+ * @recycle_count: RX buffer recycle counter.
  * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
  */
 struct efx_rx_queue {
@@ -279,15 +292,22 @@ struct efx_rx_queue {
 	bool enabled;
 	bool flush_pending;
 
-	int added_count;
-	int notified_count;
-	int removed_count;
+	unsigned int added_count;
+	unsigned int notified_count;
+	unsigned int removed_count;
+	unsigned int scatter_n;
+	struct page **page_ring;
+	unsigned int page_add;
+	unsigned int page_remove;
+	unsigned int page_recycle_count;
+	unsigned int page_recycle_failed;
+	unsigned int page_recycle_full;
+	unsigned int page_ptr_mask;
 	unsigned int max_fill;
 	unsigned int fast_fill_trigger;
 	unsigned int min_fill;
 	unsigned int min_overfill;
-	unsigned int alloc_page_count;
-	unsigned int alloc_skb_count;
+	unsigned int recycle_count;
 	struct timer_list slow_fill;
 	unsigned int slow_fill_count;
 };
@@ -336,10 +356,6 @@ enum efx_rx_alloc_method {
  * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
  * @irq_count: Number of IRQs since last adaptive moderation decision
  * @irq_mod_score: IRQ moderation score
- * @rx_alloc_level: Watermark based heuristic counter for pushing descriptors
- *	and diagnostic counters
- * @rx_alloc_push_pages: RX allocation method currently in use for pushing
- *	descriptors
  * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
  * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
  * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
@@ -347,6 +363,12 @@ enum efx_rx_alloc_method {
  * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors
  * @n_rx_overlength: Count of RX_OVERLENGTH errors
  * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun
+ * @n_rx_nodesc_trunc: Number of RX packets truncated and then dropped due to
+ *	lack of descriptors
+ * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by
+ *	__efx_rx_packet(), or zero if there is none
+ * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
+ *	by __efx_rx_packet(), if @rx_pkt_n_frags != 0
  * @rx_queue: RX queue for this channel
  * @tx_queue: TX queues for this channel
  */
@@ -371,9 +393,6 @@ struct efx_channel {
 	unsigned int rfs_filters_added;
 #endif
 
-	int rx_alloc_level;
-	int rx_alloc_push_pages;
-
 	unsigned n_rx_tobe_disc;
 	unsigned n_rx_ip_hdr_chksum_err;
 	unsigned n_rx_tcp_udp_chksum_err;
@@ -381,11 +400,10 @@ struct efx_channel {
 	unsigned n_rx_frm_trunc;
 	unsigned n_rx_overlength;
 	unsigned n_skbuff_leaks;
+	unsigned int n_rx_nodesc_trunc;
 
-	/* Used to pipeline received packets in order to optimise memory
-	 * access with prefetches.
-	 */
-	struct efx_rx_buffer *rx_pkt;
+	unsigned int rx_pkt_n_frags;
+	unsigned int rx_pkt_index;
 
 	struct efx_rx_queue rx_queue;
 	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
@@ -410,7 +428,7 @@ struct efx_channel_type {
 	void (*post_remove)(struct efx_channel *);
 	void (*get_name)(struct efx_channel *, char *buf, size_t len);
 	struct efx_channel *(*copy)(const struct efx_channel *);
-	void (*receive_skb)(struct efx_channel *, struct sk_buff *);
+	bool (*receive_skb)(struct efx_channel *, struct sk_buff *);
 	bool keep_eventq;
 };
@@ -446,6 +464,7 @@ enum nic_state {
 	STATE_UNINIT = 0,	/* device being probed/removed or is frozen */
 	STATE_READY = 1,	/* hardware ready and netdev registered */
 	STATE_DISABLED = 2,	/* device disabled due to hardware errors */
+	STATE_RECOVERY = 3,	/* device recovering from PCI error */
 };
 
 /*
@@ -684,10 +703,13 @@ struct vfdi_status;
  * @n_channels: Number of channels in use
  * @n_rx_channels: Number of channels used for RX (= number of RX queues)
  * @n_tx_channels: Number of channels used for TX
- * @rx_buffer_len: RX buffer length
+ * @rx_dma_len: Current maximum RX DMA length
  * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
+ * @rx_buffer_truesize: Amortised allocation size of an RX buffer,
+ *	for use in sk_buff::truesize
  * @rx_hash_key: Toeplitz hash key for RSS
  * @rx_indir_table: Indirection table for RSS
+ * @rx_scatter: Scatter mode enabled for receives
  * @int_error_count: Number of internal errors seen recently
  * @int_error_expire: Time at which error count will be expired
  * @irq_status: Interrupt status buffer
@@ -800,10 +822,15 @@ struct efx_nic {
 	unsigned rss_spread;
 	unsigned tx_channel_offset;
 	unsigned n_tx_channels;
-	unsigned int rx_buffer_len;
+	unsigned int rx_dma_len;
 	unsigned int rx_buffer_order;
+	unsigned int rx_buffer_truesize;
+	unsigned int rx_page_buf_step;
+	unsigned int rx_bufs_per_page;
+	unsigned int rx_pages_per_batch;
 	u8 rx_hash_key[40];
 	u32 rx_indir_table[128];
+	bool rx_scatter;
 
 	unsigned int_error_count;
 	unsigned long int_error_expire;
@@ -934,8 +961,9 @@ static inline unsigned int efx_port_num(struct efx_nic *efx)
 * @evq_ptr_tbl_base: Event queue pointer table base address
 * @evq_rptr_tbl_base: Event queue read-pointer table base address
 * @max_dma_mask: Maximum possible DMA mask
- * @rx_buffer_hash_size: Size of hash at start of RX buffer
- * @rx_buffer_padding: Size of padding at end of RX buffer
+ * @rx_buffer_hash_size: Size of hash at start of RX packet
+ * @rx_buffer_padding: Size of padding at end of RX packet
+ * @can_rx_scatter: NIC is able to scatter packet to multiple buffers
 * @max_interrupt_mode: Highest capability interrupt mode supported
 *	from &enum efx_init_mode.
 * @phys_addr_channels: Number of channels with physically addressed
@@ -983,6 +1011,7 @@ struct efx_nic_type {
 	u64 max_dma_mask;
 	unsigned int rx_buffer_hash_size;
 	unsigned int rx_buffer_padding;
+	bool can_rx_scatter;
 	unsigned int max_interrupt_mode;
 	unsigned int phys_addr_channels;
 	unsigned int timer_period_max;
......
@@ -591,12 +591,22 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
 	struct efx_nic *efx = rx_queue->efx;
 	bool is_b0 = efx_nic_rev(efx) >= EFX_REV_FALCON_B0;
 	bool iscsi_digest_en = is_b0;
+	bool jumbo_en;
+
+	/* For kernel-mode queues in Falcon A1, the JUMBO flag enables
+	 * DMA to continue after a PCIe page boundary (and scattering
+	 * is not possible).  In Falcon B0 and Siena, it enables
+	 * scatter.
+	 */
+	jumbo_en = !is_b0 || efx->rx_scatter;
 
 	netif_dbg(efx, hw, efx->net_dev,
 		  "RX queue %d ring in special buffers %d-%d\n",
 		  efx_rx_queue_index(rx_queue), rx_queue->rxd.index,
 		  rx_queue->rxd.index + rx_queue->rxd.entries - 1);
 
+	rx_queue->scatter_n = 0;
+
 	/* Pin RX descriptor ring */
 	efx_init_special_buffer(efx, &rx_queue->rxd);
@@ -613,8 +623,7 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
 			      FRF_AZ_RX_DESCQ_SIZE,
 			      __ffs(rx_queue->rxd.entries),
 			      FRF_AZ_RX_DESCQ_TYPE, 0 /* kernel queue */ ,
-			      /* For >=B0 this is scatter so disable */
-			      FRF_AZ_RX_DESCQ_JUMBO, !is_b0,
+			      FRF_AZ_RX_DESCQ_JUMBO, jumbo_en,
 			      FRF_AZ_RX_DESCQ_EN, 1);
 	efx_writeo_table(efx, &rx_desc_ptr, efx->type->rxd_ptr_tbl_base,
 			 efx_rx_queue_index(rx_queue));
@@ -968,13 +977,24 @@ static u16 efx_handle_rx_not_ok(struct efx_rx_queue *rx_queue,
 		 EFX_RX_PKT_DISCARD : 0;
 }
 
-/* Handle receive events that are not in-order. */
-static void
+/* Handle receive events that are not in-order. Return true if this
+ * can be handled as a partial packet discard, false if it's more
+ * serious.
+ */
+static bool
 efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
 {
+	struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
 	struct efx_nic *efx = rx_queue->efx;
 	unsigned expected, dropped;
 
+	if (rx_queue->scatter_n &&
+	    index == ((rx_queue->removed_count + rx_queue->scatter_n - 1) &
+		      rx_queue->ptr_mask)) {
+		++channel->n_rx_nodesc_trunc;
+		return true;
+	}
+
 	expected = rx_queue->removed_count & rx_queue->ptr_mask;
 	dropped = (index - expected) & rx_queue->ptr_mask;
 	netif_info(efx, rx_err, efx->net_dev,
@@ -983,6 +1003,7 @@ efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
 
 	efx_schedule_reset(efx, EFX_WORKAROUND_5676(efx) ?
 			   RESET_TYPE_RX_RECOVERY : RESET_TYPE_DISABLE);
+	return false;
 }
/* Handle a packet received event /* Handle a packet received event
...@@ -998,7 +1019,7 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) ...@@ -998,7 +1019,7 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt; unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt;
unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt; unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt;
unsigned expected_ptr; unsigned expected_ptr;
bool rx_ev_pkt_ok; bool rx_ev_pkt_ok, rx_ev_sop, rx_ev_cont;
u16 flags; u16 flags;
struct efx_rx_queue *rx_queue; struct efx_rx_queue *rx_queue;
struct efx_nic *efx = channel->efx; struct efx_nic *efx = channel->efx;
...@@ -1006,21 +1027,56 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) ...@@ -1006,21 +1027,56 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
if (unlikely(ACCESS_ONCE(efx->reset_pending))) if (unlikely(ACCESS_ONCE(efx->reset_pending)))
return; return;
/* Basic packet information */ rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT);
rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT); rx_ev_sop = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP);
rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT));
WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP) != 1);
WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) != WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) !=
channel->channel); channel->channel);
rx_queue = efx_channel_get_rx_queue(channel); rx_queue = efx_channel_get_rx_queue(channel);
rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR); rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR);
expected_ptr = rx_queue->removed_count & rx_queue->ptr_mask; expected_ptr = ((rx_queue->removed_count + rx_queue->scatter_n) &
if (unlikely(rx_ev_desc_ptr != expected_ptr)) rx_queue->ptr_mask);
efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr);
/* Check for partial drops and other errors */
if (unlikely(rx_ev_desc_ptr != expected_ptr) ||
unlikely(rx_ev_sop != (rx_queue->scatter_n == 0))) {
if (rx_ev_desc_ptr != expected_ptr &&
!efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr))
return;
/* Discard all pending fragments */
if (rx_queue->scatter_n) {
efx_rx_packet(
rx_queue,
rx_queue->removed_count & rx_queue->ptr_mask,
rx_queue->scatter_n, 0, EFX_RX_PKT_DISCARD);
rx_queue->removed_count += rx_queue->scatter_n;
rx_queue->scatter_n = 0;
}
/* Return if there is no new fragment */
if (rx_ev_desc_ptr != expected_ptr)
return;
/* Discard new fragment if not SOP */
if (!rx_ev_sop) {
efx_rx_packet(
rx_queue,
rx_queue->removed_count & rx_queue->ptr_mask,
1, 0, EFX_RX_PKT_DISCARD);
++rx_queue->removed_count;
return;
}
}
++rx_queue->scatter_n;
if (rx_ev_cont)
return;
rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT);
rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
if (likely(rx_ev_pkt_ok)) { if (likely(rx_ev_pkt_ok)) {
/* If packet is marked as OK and packet type is TCP/IP or /* If packet is marked as OK and packet type is TCP/IP or
...@@ -1048,7 +1104,11 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) ...@@ -1048,7 +1104,11 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
channel->irq_mod_score += 2; channel->irq_mod_score += 2;
/* Handle received packet */ /* Handle received packet */
efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt, flags); efx_rx_packet(rx_queue,
rx_queue->removed_count & rx_queue->ptr_mask,
rx_queue->scatter_n, rx_ev_byte_cnt, flags);
rx_queue->removed_count += rx_queue->scatter_n;
rx_queue->scatter_n = 0;
} }
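To summarise the scatter handling above: the RX queue tracks how many descriptors the in-progress packet has consumed (scatter_n), checks each event's descriptor pointer against removed_count + scatter_n, and only completes the packet once the JUMBO_CONT flag is clear. A minimal stand-alone model of that bookkeeping follows; it is illustrative only, not the driver code, and the several discard paths are collapsed into a single outcome.

```c
#include <stdbool.h>

/* Minimal model of the scatter bookkeeping above (illustrative only).
 * "removed_count + scatter_n" is where the next fragment must land, and
 * SOP is only legal when no fragments are pending.
 */
struct rxq_model {
	unsigned int removed_count;	/* descriptors already completed */
	unsigned int scatter_n;		/* fragments of the in-progress packet */
	unsigned int ptr_mask;		/* ring size - 1 */
};

enum rx_action { RX_ACCUMULATE, RX_COMPLETE, RX_DISCARD };

static enum rx_action rx_event_model(struct rxq_model *q,
				     unsigned int desc_ptr,
				     bool sop, bool cont)
{
	unsigned int expected =
		(q->removed_count + q->scatter_n) & q->ptr_mask;

	/* Out-of-order descriptor, or SOP/continuation mismatch: the driver
	 * discards the pending fragments (and possibly this one too).
	 */
	if (desc_ptr != expected || sop != (q->scatter_n == 0))
		return RX_DISCARD;

	++q->scatter_n;			/* this fragment joins the packet */
	if (cont)
		return RX_ACCUMULATE;	/* wait for the final fragment */

	q->removed_count += q->scatter_n;
	q->scatter_n = 0;
	return RX_COMPLETE;		/* whole packet handed to efx_rx_packet() */
}
```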
/* If this flush done event corresponds to a &struct efx_tx_queue, then /* If this flush done event corresponds to a &struct efx_tx_queue, then
......
...@@ -99,6 +99,9 @@ ...@@ -99,6 +99,9 @@
#define PTP_V2_VERSION_LENGTH 1 #define PTP_V2_VERSION_LENGTH 1
#define PTP_V2_VERSION_OFFSET 29 #define PTP_V2_VERSION_OFFSET 29
#define PTP_V2_UUID_LENGTH 8
#define PTP_V2_UUID_OFFSET 48
/* Although PTP V2 UUIDs are comprised of a ClockIdentity (8) and PortNumber (2), /* Although PTP V2 UUIDs are comprised of a ClockIdentity (8) and PortNumber (2),
* the MC only captures the last six bytes of the clock identity. These values * the MC only captures the last six bytes of the clock identity. These values
* reflect those, not the ones used in the standard. The standard permits * reflect those, not the ones used in the standard. The standard permits
...@@ -429,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf, ...@@ -429,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
unsigned number_readings = (response_length / unsigned number_readings = (response_length /
MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN); MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
unsigned i; unsigned i;
unsigned min;
unsigned min_set = 0;
unsigned total; unsigned total;
unsigned ngood = 0; unsigned ngood = 0;
unsigned last_good = 0; unsigned last_good = 0;
struct efx_ptp_data *ptp = efx->ptp_data; struct efx_ptp_data *ptp = efx->ptp_data;
bool min_valid = false;
u32 last_sec; u32 last_sec;
u32 start_sec; u32 start_sec;
struct timespec delta; struct timespec delta;
...@@ -443,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf, ...@@ -443,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
if (number_readings == 0) if (number_readings == 0)
return -EAGAIN; return -EAGAIN;
/* Find minimum value in this set of results, discarding clearly /* Read the set of results and increment stats for any results that
* erroneous results. * appear to be erroneous.
*/ */
for (i = 0; i < number_readings; i++) { for (i = 0; i < number_readings; i++) {
efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]); efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN; synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
if (min_valid) {
if (ptp->timeset[i].window < min_set)
min_set = ptp->timeset[i].window;
} else {
min_valid = true;
min_set = ptp->timeset[i].window;
}
}
}
if (min_valid) {
if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
min = ptp->base_sync_ns;
else
min = min_set;
} else {
min = SYNCHRONISATION_GRANULARITY_NS;
} }
/* Discard excessively long synchronise durations. The MC times /* Find the last good host-MC synchronization result. The MC times
* when it finishes reading the host time so the corrected window * when it finishes reading the host time so the corrected window time
* time should be fairly constant for a given platform. * should be fairly constant for a given platform.
*/ */
total = 0; total = 0;
for (i = 0; i < number_readings; i++) for (i = 0; i < number_readings; i++)
...@@ -489,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf, ...@@ -489,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
if (ngood == 0) { if (ngood == 0) {
netif_warn(efx, drv, efx->net_dev, netif_warn(efx, drv, efx->net_dev,
"PTP no suitable synchronisations %dns %dns\n", "PTP no suitable synchronisations %dns\n",
ptp->base_sync_ns, min_set); ptp->base_sync_ns);
return -EAGAIN; return -EAGAIN;
} }
...@@ -1006,43 +988,53 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb) ...@@ -1006,43 +988,53 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
* the receive timestamp from the MC - this will probably occur after the * the receive timestamp from the MC - this will probably occur after the
* packet arrival because of the processing in the MC. * packet arrival because of the processing in the MC.
*/ */
static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
{ {
struct efx_nic *efx = channel->efx; struct efx_nic *efx = channel->efx;
struct efx_ptp_data *ptp = efx->ptp_data; struct efx_ptp_data *ptp = efx->ptp_data;
struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb; struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
u8 *data; u8 *match_data_012, *match_data_345;
unsigned int version; unsigned int version;
match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS); match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
/* Correct version? */ /* Correct version? */
if (ptp->mode == MC_CMD_PTP_MODE_V1) { if (ptp->mode == MC_CMD_PTP_MODE_V1) {
if (skb->len < PTP_V1_MIN_LENGTH) { if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) {
netif_receive_skb(skb); return false;
return;
} }
version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]); version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
if (version != PTP_VERSION_V1) { if (version != PTP_VERSION_V1) {
netif_receive_skb(skb); return false;
return;
} }
/* PTP V1 uses all six bytes of the UUID to match the packet
* to the timestamp
*/
match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
} else { } else {
if (skb->len < PTP_V2_MIN_LENGTH) { if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) {
netif_receive_skb(skb); return false;
return;
} }
version = skb->data[PTP_V2_VERSION_OFFSET]; version = skb->data[PTP_V2_VERSION_OFFSET];
BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) { if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
netif_receive_skb(skb); return false;
return; }
/* The original V2 implementation uses bytes 2-7 of
* the UUID to match the packet to the timestamp. This
* discards two of the bytes of the MAC address used
* to create the UUID (SF bug 33070). The PTP V2
* enhanced mode fixes this issue and uses bytes 0-2
* and bytes 5-7 of the UUID.
*/
match_data_345 = skb->data + PTP_V2_UUID_OFFSET + 5;
if (ptp->mode == MC_CMD_PTP_MODE_V2) {
match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 2;
} else {
match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 0;
BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);
} }
} }
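The comment above boils down to a choice of byte offsets within the received packet. A small sketch of that choice follows (not driver code; only the two V2 modes are shown, and MODEL_V2_UUID_OFFSET mirrors the PTP_V2_UUID_OFFSET value defined earlier in this file):

```c
#include <stdbool.h>

/* Illustrative only: offsets (from skb->data) of the UUID bytes used for
 * timestamp matching in the two PTP V2 modes handled above.
 */
#define MODEL_V2_UUID_OFFSET 48		/* PTP_V2_UUID_OFFSET above */

struct ptp_match_offsets {
	unsigned int off_012;		/* first three matched bytes */
	unsigned int off_345;		/* last three matched bytes */
};

static struct ptp_match_offsets ptp_v2_match_offsets(bool enhanced)
{
	struct ptp_match_offsets o;

	/* Original firmware matches UUID bytes 2-7 (SF bug 33070);
	 * enhanced mode matches bytes 0-2 and 5-7 instead.
	 */
	o.off_012 = MODEL_V2_UUID_OFFSET + (enhanced ? 0 : 2);
	o.off_345 = MODEL_V2_UUID_OFFSET + 5;
	return o;
}
```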
...@@ -1056,14 +1048,19 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) ...@@ -1056,14 +1048,19 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
timestamps = skb_hwtstamps(skb); timestamps = skb_hwtstamps(skb);
memset(timestamps, 0, sizeof(*timestamps)); memset(timestamps, 0, sizeof(*timestamps));
/* We expect the sequence number to be in the same position in
* the packet for PTP V1 and V2
*/
BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
/* Extract UUID/Sequence information */ /* Extract UUID/Sequence information */
data = skb->data + PTP_V1_UUID_OFFSET; match->words[0] = (match_data_012[0] |
match->words[0] = (data[0] | (match_data_012[1] << 8) |
(data[1] << 8) | (match_data_012[2] << 16) |
(data[2] << 16) | (match_data_345[0] << 24));
(data[3] << 24)); match->words[1] = (match_data_345[1] |
match->words[1] = (data[4] | (match_data_345[2] << 8) |
(data[5] << 8) |
(skb->data[PTP_V1_SEQUENCE_OFFSET + (skb->data[PTP_V1_SEQUENCE_OFFSET +
PTP_V1_SEQUENCE_LENGTH - 1] << PTP_V1_SEQUENCE_LENGTH - 1] <<
16)); 16));
...@@ -1073,6 +1070,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) ...@@ -1073,6 +1070,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
skb_queue_tail(&ptp->rxq, skb); skb_queue_tail(&ptp->rxq, skb);
queue_work(ptp->workwq, &ptp->work); queue_work(ptp->workwq, &ptp->work);
return true;
} }
/* Transmit a PTP packet. This has to be transmitted by the MC /* Transmit a PTP packet. This has to be transmitted by the MC
...@@ -1167,7 +1166,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init) ...@@ -1167,7 +1166,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
* timestamped * timestamped
*/ */
init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
new_mode = MC_CMD_PTP_MODE_V2; new_mode = MC_CMD_PTP_MODE_V2_ENHANCED;
enable_wanted = true; enable_wanted = true;
break; break;
case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_EVENT:
...@@ -1186,7 +1185,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init) ...@@ -1186,7 +1185,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
if (init->tx_type != HWTSTAMP_TX_OFF) if (init->tx_type != HWTSTAMP_TX_OFF)
enable_wanted = true; enable_wanted = true;
/* Old versions of the firmware do not support the improved
* UUID filtering option (SF bug 33070). If the firmware does
* not accept the enhanced mode, fall back to the standard PTP
* v2 UUID filtering.
*/
rc = efx_ptp_change_mode(efx, enable_wanted, new_mode); rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
if ((rc != 0) && (new_mode == MC_CMD_PTP_MODE_V2_ENHANCED))
rc = efx_ptp_change_mode(efx, enable_wanted, MC_CMD_PTP_MODE_V2);
if (rc != 0) if (rc != 0)
return rc; return rc;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/udp.h> #include <linux/udp.h>
#include <linux/prefetch.h> #include <linux/prefetch.h>
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
#include <linux/iommu.h>
#include <net/ip.h> #include <net/ip.h>
#include <net/checksum.h> #include <net/checksum.h>
#include "net_driver.h" #include "net_driver.h"
...@@ -24,85 +25,39 @@ ...@@ -24,85 +25,39 @@
#include "selftest.h" #include "selftest.h"
#include "workarounds.h" #include "workarounds.h"
/* Number of RX descriptors pushed at once. */ /* Preferred number of descriptors to fill at once */
#define EFX_RX_BATCH 8 #define EFX_RX_PREFERRED_BATCH 8U
/* Maximum size of a buffer sharing a page */ /* Number of RX buffers to recycle pages for. When creating the RX page recycle
#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state)) * ring, this number is divided by the number of buffers per page to calculate
* the number of pages to store in the RX page recycle ring.
*/
#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)
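As a worked example of the sizing rule described above, assuming two RX buffers fit in each page (a typical split for 4 KiB pages; the buffers-per-page value is an assumption for illustration, not taken from this patch):

```c
#include <stdio.h>

/* Worked example of the recycle-ring sizing described above. */
static unsigned int roundup_pow_of_two_u(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int bufs_iommu = 4096;		/* EFX_RECYCLE_RING_SIZE_IOMMU */
	unsigned int bufs_noiommu = 2 * 8;	/* EFX_RECYCLE_RING_SIZE_NOIOMMU */
	unsigned int bufs_per_page = 2;		/* assumed */

	printf("IOMMU:    %u pages\n",
	       roundup_pow_of_two_u(bufs_iommu / bufs_per_page));   /* 2048 */
	printf("no IOMMU: %u pages\n",
	       roundup_pow_of_two_u(bufs_noiommu / bufs_per_page)); /* 8 */
	return 0;
}
```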
/* Size of buffer allocated for skb header area. */ /* Size of buffer allocated for skb header area. */
#define EFX_SKB_HEADERS 64u #define EFX_SKB_HEADERS 64u
/*
* rx_alloc_method - RX buffer allocation method
*
* This driver supports two methods for allocating and using RX buffers:
* each RX buffer may be backed by an skb or by an order-n page.
*
* When GRO is in use then the second method has a lower overhead,
* since we don't have to allocate then free skbs on reassembled frames.
*
* Values:
* - RX_ALLOC_METHOD_AUTO = 0
* - RX_ALLOC_METHOD_SKB = 1
* - RX_ALLOC_METHOD_PAGE = 2
*
* The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count
* controlled by the parameters below.
*
* - Since pushing and popping descriptors are separated by the rx_queue
* size, so the watermarks should be ~rxd_size.
* - The performance win by using page-based allocation for GRO is less
* than the performance hit of using page-based allocation of non-GRO,
* so the watermarks should reflect this.
*
* Per channel we maintain a single variable, updated by each channel:
*
* rx_alloc_level += (gro_performed ? RX_ALLOC_FACTOR_GRO :
* RX_ALLOC_FACTOR_SKB)
* Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which
* limits the hysteresis), and update the allocation strategy:
*
* rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_GRO ?
* RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB)
*/
static int rx_alloc_method = RX_ALLOC_METHOD_AUTO;
#define RX_ALLOC_LEVEL_GRO 0x2000
#define RX_ALLOC_LEVEL_MAX 0x3000
#define RX_ALLOC_FACTOR_GRO 1
#define RX_ALLOC_FACTOR_SKB (-2)
/* This is the percentage fill level below which new RX descriptors /* This is the percentage fill level below which new RX descriptors
* will be added to the RX descriptor ring. * will be added to the RX descriptor ring.
*/ */
static unsigned int rx_refill_threshold; static unsigned int rx_refill_threshold;
/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
EFX_RX_USR_BUF_SIZE)
/* /*
* RX maximum head room required. * RX maximum head room required.
* *
* This must be at least 1 to prevent overflow and at least 2 to allow * This must be at least 1 to prevent overflow, plus one packet-worth
* pipelined receives. * to allow pipelined receives.
*/ */
#define EFX_RXD_HEAD_ROOM 2 #define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)
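For a rough sense of scale, with assumed values of roughly 9 KiB for the maximum frame and roughly 1.8 KiB for the user buffer size (the real macro uses EFX_MAX_FRAME_LEN(EFX_MAX_MTU), which adds header and padding overhead, so treat these as ballpark figures only):

```c
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Assumed values for illustration only. */
	unsigned int max_frame_len = 9 * 1024;
	unsigned int usr_buf_size = 1824;
	unsigned int max_frags = DIV_ROUND_UP(max_frame_len, usr_buf_size);

	printf("EFX_RX_MAX_FRAGS  = %u\n", max_frags);		/* 6 */
	printf("EFX_RXD_HEAD_ROOM = %u\n", 1 + max_frags);	/* 7 */
	return 0;
}
```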
/* Offset of ethernet header within page */ static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx,
struct efx_rx_buffer *buf)
{ {
return buf->page_offset + efx->type->rx_buffer_hash_size; return page_address(buf->page) + buf->page_offset;
}
static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
{
return PAGE_SIZE << efx->rx_buffer_order;
}
static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
{
if (buf->flags & EFX_RX_BUF_PAGE)
return page_address(buf->u.page) + efx_rx_buf_offset(efx, buf);
else
return (u8 *)buf->u.skb->data + efx->type->rx_buffer_hash_size;
} }
static inline u32 efx_rx_buf_hash(const u8 *eh) static inline u32 efx_rx_buf_hash(const u8 *eh)
...@@ -119,66 +74,81 @@ static inline u32 efx_rx_buf_hash(const u8 *eh) ...@@ -119,66 +74,81 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)
#endif #endif
} }
/** static inline struct efx_rx_buffer *
* efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
* {
* @rx_queue: Efx RX queue if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask)))
* return efx_rx_buffer(rx_queue, 0);
* This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a else
* struct efx_rx_buffer for each one. Return a negative error code or 0 return rx_buf + 1;
* on success. May fail having only inserted fewer than EFX_RX_BATCH }
* buffers.
*/ static inline void efx_sync_rx_buffer(struct efx_nic *efx,
static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue) struct efx_rx_buffer *rx_buf,
unsigned int len)
{
dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len,
DMA_FROM_DEVICE);
}
void efx_rx_config_page_split(struct efx_nic *efx)
{
efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + EFX_PAGE_IP_ALIGN,
L1_CACHE_BYTES);
efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
efx->rx_page_buf_step);
efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
efx->rx_bufs_per_page;
efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
efx->rx_bufs_per_page);
}
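The page-split arithmetic above can be sanity-checked with assumed values (4 KiB order-0 pages, a roughly 1.8 KiB DMA length, 64-byte cache lines, a 2-byte IP alignment offset and an approximately 16-byte page-state header); none of these constants are taken from this patch:

```c
#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Assumed values for illustration only. */
	unsigned int page_size = 4096;
	unsigned int state_size = 16;		/* ~sizeof(struct efx_rx_page_state) */
	unsigned int rx_dma_len = 1824;
	unsigned int ip_align = 2;		/* EFX_PAGE_IP_ALIGN */
	unsigned int cache_line = 64;		/* L1_CACHE_BYTES */
	unsigned int preferred_batch = 8;	/* EFX_RX_PREFERRED_BATCH */

	unsigned int buf_step = ALIGN_UP(rx_dma_len + ip_align, cache_line);
	unsigned int bufs_per_page = (page_size - state_size) / buf_step;
	unsigned int truesize = page_size / bufs_per_page;
	unsigned int pages_per_batch = DIV_ROUND_UP(preferred_batch,
						    bufs_per_page);

	/* With these numbers: step=1856, bufs/page=2, truesize=2048,
	 * pages/batch=4.
	 */
	printf("step=%u bufs/page=%u truesize=%u pages/batch=%u\n",
	       buf_step, bufs_per_page, truesize, pages_per_batch);
	return 0;
}
```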
/* Check the RX page recycle ring for a page that can be reused. */
static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
{ {
struct efx_nic *efx = rx_queue->efx; struct efx_nic *efx = rx_queue->efx;
struct net_device *net_dev = efx->net_dev; struct page *page;
struct efx_rx_buffer *rx_buf; struct efx_rx_page_state *state;
struct sk_buff *skb; unsigned index;
int skb_len = efx->rx_buffer_len;
unsigned index, count;
for (count = 0; count < EFX_RX_BATCH; ++count) { index = rx_queue->page_remove & rx_queue->page_ptr_mask;
index = rx_queue->added_count & rx_queue->ptr_mask; page = rx_queue->page_ring[index];
rx_buf = efx_rx_buffer(rx_queue, index); if (page == NULL)
return NULL;
rx_buf->u.skb = skb = netdev_alloc_skb(net_dev, skb_len);
if (unlikely(!skb)) rx_queue->page_ring[index] = NULL;
return -ENOMEM; /* page_remove cannot exceed page_add. */
if (rx_queue->page_remove != rx_queue->page_add)
/* Adjust the SKB for padding */ ++rx_queue->page_remove;
skb_reserve(skb, NET_IP_ALIGN);
rx_buf->len = skb_len - NET_IP_ALIGN;
rx_buf->flags = 0;
rx_buf->dma_addr = dma_map_single(&efx->pci_dev->dev,
skb->data, rx_buf->len,
DMA_FROM_DEVICE);
if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
rx_buf->dma_addr))) {
dev_kfree_skb_any(skb);
rx_buf->u.skb = NULL;
return -EIO;
}
++rx_queue->added_count; /* If page_count is 1 then we hold the only reference to this page. */
++rx_queue->alloc_skb_count; if (page_count(page) == 1) {
++rx_queue->page_recycle_count;
return page;
} else {
state = page_address(page);
dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
PAGE_SIZE << efx->rx_buffer_order,
DMA_FROM_DEVICE);
put_page(page);
++rx_queue->page_recycle_failed;
} }
return 0; return NULL;
} }
/** /**
* efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers * efx_init_rx_buffers - create EFX_RX_PREFERRED_BATCH page-based RX buffers
* *
* @rx_queue: Efx RX queue * @rx_queue: Efx RX queue
* *
* This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA, * This allocates a batch of pages, maps them for DMA, and populates
* and populates struct efx_rx_buffers for each one. Return a negative error * struct efx_rx_buffers for each one. Return a negative error code or
* code or 0 on success. If a single page can be split between two buffers, * 0 on success. If a single page can be used for multiple buffers,
* then the page will either be inserted fully, or not at at all. * then the page will either be inserted fully, or not at all.
*/ */
static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue) static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
{ {
struct efx_nic *efx = rx_queue->efx; struct efx_nic *efx = rx_queue->efx;
struct efx_rx_buffer *rx_buf; struct efx_rx_buffer *rx_buf;
...@@ -188,150 +158,140 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue) ...@@ -188,150 +158,140 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
dma_addr_t dma_addr; dma_addr_t dma_addr;
unsigned index, count; unsigned index, count;
/* We can split a page between two buffers */ count = 0;
BUILD_BUG_ON(EFX_RX_BATCH & 1); do {
page = efx_reuse_page(rx_queue);
for (count = 0; count < EFX_RX_BATCH; ++count) { if (page == NULL) {
page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
efx->rx_buffer_order); efx->rx_buffer_order);
if (unlikely(page == NULL)) if (unlikely(page == NULL))
return -ENOMEM; return -ENOMEM;
dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0, dma_addr =
efx_rx_buf_size(efx), dma_map_page(&efx->pci_dev->dev, page, 0,
DMA_FROM_DEVICE); PAGE_SIZE << efx->rx_buffer_order,
if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) { DMA_FROM_DEVICE);
__free_pages(page, efx->rx_buffer_order); if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
return -EIO; dma_addr))) {
__free_pages(page, efx->rx_buffer_order);
return -EIO;
}
state = page_address(page);
state->dma_addr = dma_addr;
} else {
state = page_address(page);
dma_addr = state->dma_addr;
} }
state = page_address(page);
state->refcnt = 0;
state->dma_addr = dma_addr;
dma_addr += sizeof(struct efx_rx_page_state); dma_addr += sizeof(struct efx_rx_page_state);
page_offset = sizeof(struct efx_rx_page_state); page_offset = sizeof(struct efx_rx_page_state);
split: do {
index = rx_queue->added_count & rx_queue->ptr_mask; index = rx_queue->added_count & rx_queue->ptr_mask;
rx_buf = efx_rx_buffer(rx_queue, index); rx_buf = efx_rx_buffer(rx_queue, index);
rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN; rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
rx_buf->u.page = page; rx_buf->page = page;
rx_buf->page_offset = page_offset; rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN; rx_buf->len = efx->rx_dma_len;
rx_buf->flags = EFX_RX_BUF_PAGE; rx_buf->flags = 0;
++rx_queue->added_count; ++rx_queue->added_count;
++rx_queue->alloc_page_count;
++state->refcnt;
if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
/* Use the second half of the page */
get_page(page); get_page(page);
dma_addr += (PAGE_SIZE >> 1); dma_addr += efx->rx_page_buf_step;
page_offset += (PAGE_SIZE >> 1); page_offset += efx->rx_page_buf_step;
++count; } while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);
goto split;
} rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
} } while (++count < efx->rx_pages_per_batch);
return 0; return 0;
} }
/* Unmap a DMA-mapped page. This function is only called for the final RX
* buffer in a page.
*/
static void efx_unmap_rx_buffer(struct efx_nic *efx, static void efx_unmap_rx_buffer(struct efx_nic *efx,
struct efx_rx_buffer *rx_buf, struct efx_rx_buffer *rx_buf)
unsigned int used_len)
{ {
if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { struct page *page = rx_buf->page;
struct efx_rx_page_state *state;
if (page) {
state = page_address(rx_buf->u.page); struct efx_rx_page_state *state = page_address(page);
if (--state->refcnt == 0) { dma_unmap_page(&efx->pci_dev->dev,
dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
state->dma_addr, PAGE_SIZE << efx->rx_buffer_order,
efx_rx_buf_size(efx), DMA_FROM_DEVICE);
DMA_FROM_DEVICE);
} else if (used_len) {
dma_sync_single_for_cpu(&efx->pci_dev->dev,
rx_buf->dma_addr, used_len,
DMA_FROM_DEVICE);
}
} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
dma_unmap_single(&efx->pci_dev->dev, rx_buf->dma_addr,
rx_buf->len, DMA_FROM_DEVICE);
} }
} }
static void efx_free_rx_buffer(struct efx_nic *efx, static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf)
struct efx_rx_buffer *rx_buf)
{ {
if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { if (rx_buf->page) {
__free_pages(rx_buf->u.page, efx->rx_buffer_order); put_page(rx_buf->page);
rx_buf->u.page = NULL; rx_buf->page = NULL;
} else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
dev_kfree_skb_any(rx_buf->u.skb);
rx_buf->u.skb = NULL;
} }
} }
static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, /* Attempt to recycle the page if there is an RX recycle ring; the page can
struct efx_rx_buffer *rx_buf) * only be added if this is the final RX buffer, to prevent pages being used in
* the descriptor ring and appearing in the recycle ring simultaneously.
*/
static void efx_recycle_rx_page(struct efx_channel *channel,
struct efx_rx_buffer *rx_buf)
{ {
efx_unmap_rx_buffer(rx_queue->efx, rx_buf, 0); struct page *page = rx_buf->page;
efx_free_rx_buffer(rx_queue->efx, rx_buf); struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
} struct efx_nic *efx = rx_queue->efx;
unsigned index;
/* Attempt to resurrect the other receive buffer that used to share this page, /* Only recycle the page after processing the final buffer. */
* which had previously been passed up to the kernel and freed. */ if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
struct efx_rx_buffer *rx_buf)
{
struct efx_rx_page_state *state = page_address(rx_buf->u.page);
struct efx_rx_buffer *new_buf;
unsigned fill_level, index;
/* +1 because efx_rx_packet() incremented removed_count. +1 because
* we'd like to insert an additional descriptor whilst leaving
* EFX_RXD_HEAD_ROOM for the non-recycle path */
fill_level = (rx_queue->added_count - rx_queue->removed_count + 2);
if (unlikely(fill_level > rx_queue->max_fill)) {
/* We could place "state" on a list, and drain the list in
* efx_fast_push_rx_descriptors(). For now, this will do. */
return; return;
}
++state->refcnt; index = rx_queue->page_add & rx_queue->page_ptr_mask;
get_page(rx_buf->u.page); if (rx_queue->page_ring[index] == NULL) {
unsigned read_index = rx_queue->page_remove &
rx_queue->page_ptr_mask;
index = rx_queue->added_count & rx_queue->ptr_mask; /* The next slot in the recycle ring is available, but
new_buf = efx_rx_buffer(rx_queue, index); * increment page_remove if the read pointer currently
new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1); * points here.
new_buf->u.page = rx_buf->u.page; */
new_buf->len = rx_buf->len; if (read_index == index)
new_buf->flags = EFX_RX_BUF_PAGE; ++rx_queue->page_remove;
++rx_queue->added_count; rx_queue->page_ring[index] = page;
++rx_queue->page_add;
return;
}
++rx_queue->page_recycle_full;
efx_unmap_rx_buffer(efx, rx_buf);
put_page(rx_buf->page);
} }
/* Recycle the given rx buffer directly back into the rx_queue. There is static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
* always room to add this buffer, because we've just popped a buffer. */ struct efx_rx_buffer *rx_buf)
static void efx_recycle_rx_buffer(struct efx_channel *channel,
struct efx_rx_buffer *rx_buf)
{ {
struct efx_nic *efx = channel->efx; /* Release the page reference we hold for the buffer. */
struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); if (rx_buf->page)
struct efx_rx_buffer *new_buf; put_page(rx_buf->page);
unsigned index;
/* If this is the last buffer in a page, unmap and free it. */
rx_buf->flags &= EFX_RX_BUF_PAGE; if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
if ((rx_buf->flags & EFX_RX_BUF_PAGE) && efx_free_rx_buffer(rx_buf);
efx->rx_buffer_len <= EFX_RX_HALF_PAGE && }
page_count(rx_buf->u.page) == 1) rx_buf->page = NULL;
efx_resurrect_rx_buffer(rx_queue, rx_buf); }
index = rx_queue->added_count & rx_queue->ptr_mask; /* Recycle the pages that are used by buffers that have just been received. */
new_buf = efx_rx_buffer(rx_queue, index); static void efx_recycle_rx_buffers(struct efx_channel *channel,
struct efx_rx_buffer *rx_buf,
unsigned int n_frags)
{
struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
memcpy(new_buf, rx_buf, sizeof(*new_buf)); do {
rx_buf->u.page = NULL; efx_recycle_rx_page(channel, rx_buf);
++rx_queue->added_count; rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
} while (--n_frags);
} }
/** /**
...@@ -348,8 +308,8 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel, ...@@ -348,8 +308,8 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
*/ */
void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue) void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
{ {
struct efx_channel *channel = efx_rx_queue_channel(rx_queue); struct efx_nic *efx = rx_queue->efx;
unsigned fill_level; unsigned int fill_level, batch_size;
int space, rc = 0; int space, rc = 0;
/* Calculate current fill level, and exit if we don't need to fill */ /* Calculate current fill level, and exit if we don't need to fill */
...@@ -364,28 +324,26 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue) ...@@ -364,28 +324,26 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
rx_queue->min_fill = fill_level; rx_queue->min_fill = fill_level;
} }
batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
space = rx_queue->max_fill - fill_level; space = rx_queue->max_fill - fill_level;
EFX_BUG_ON_PARANOID(space < EFX_RX_BATCH); EFX_BUG_ON_PARANOID(space < batch_size);
netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
"RX queue %d fast-filling descriptor ring from" "RX queue %d fast-filling descriptor ring from"
" level %d to level %d using %s allocation\n", " level %d to level %d\n",
efx_rx_queue_index(rx_queue), fill_level, efx_rx_queue_index(rx_queue), fill_level,
rx_queue->max_fill, rx_queue->max_fill);
channel->rx_alloc_push_pages ? "page" : "skb");
do { do {
if (channel->rx_alloc_push_pages) rc = efx_init_rx_buffers(rx_queue);
rc = efx_init_rx_buffers_page(rx_queue);
else
rc = efx_init_rx_buffers_skb(rx_queue);
if (unlikely(rc)) { if (unlikely(rc)) {
/* Ensure that we don't leave the rx queue empty */ /* Ensure that we don't leave the rx queue empty */
if (rx_queue->added_count == rx_queue->removed_count) if (rx_queue->added_count == rx_queue->removed_count)
efx_schedule_slow_fill(rx_queue); efx_schedule_slow_fill(rx_queue);
goto out; goto out;
} }
} while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH); } while ((space -= batch_size) >= batch_size);
netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
"RX queue %d fast-filled descriptor ring " "RX queue %d fast-filled descriptor ring "
...@@ -408,7 +366,7 @@ void efx_rx_slow_fill(unsigned long context) ...@@ -408,7 +366,7 @@ void efx_rx_slow_fill(unsigned long context)
static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
struct efx_rx_buffer *rx_buf, struct efx_rx_buffer *rx_buf,
int len, bool *leak_packet) int len)
{ {
struct efx_nic *efx = rx_queue->efx; struct efx_nic *efx = rx_queue->efx;
unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding; unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
...@@ -428,11 +386,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, ...@@ -428,11 +386,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
"RX event (0x%x > 0x%x+0x%x). Leaking\n", "RX event (0x%x > 0x%x+0x%x). Leaking\n",
efx_rx_queue_index(rx_queue), len, max_len, efx_rx_queue_index(rx_queue), len, max_len,
efx->type->rx_buffer_padding); efx->type->rx_buffer_padding);
/* If this buffer was skb-allocated, then the meta
* data at the end of the skb will be trashed. So
* we have no choice but to leak the fragment.
*/
*leak_packet = !(rx_buf->flags & EFX_RX_BUF_PAGE);
efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY); efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
} else { } else {
if (net_ratelimit()) if (net_ratelimit())
...@@ -448,212 +401,238 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, ...@@ -448,212 +401,238 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
/* Pass a received packet up through GRO. GRO can handle pages /* Pass a received packet up through GRO. GRO can handle pages
* regardless of checksum state and skbs with a good checksum. * regardless of checksum state and skbs with a good checksum.
*/ */
static void efx_rx_packet_gro(struct efx_channel *channel, static void
struct efx_rx_buffer *rx_buf, efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
const u8 *eh) unsigned int n_frags, u8 *eh)
{ {
struct napi_struct *napi = &channel->napi_str; struct napi_struct *napi = &channel->napi_str;
gro_result_t gro_result; gro_result_t gro_result;
struct efx_nic *efx = channel->efx;
struct sk_buff *skb;
if (rx_buf->flags & EFX_RX_BUF_PAGE) { skb = napi_get_frags(napi);
struct efx_nic *efx = channel->efx; if (unlikely(!skb)) {
struct page *page = rx_buf->u.page; while (n_frags--) {
struct sk_buff *skb; put_page(rx_buf->page);
rx_buf->page = NULL;
rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
}
return;
}
rx_buf->u.page = NULL; if (efx->net_dev->features & NETIF_F_RXHASH)
skb->rxhash = efx_rx_buf_hash(eh);
skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
for (;;) {
skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
rx_buf->page, rx_buf->page_offset,
rx_buf->len);
rx_buf->page = NULL;
skb->len += rx_buf->len;
if (skb_shinfo(skb)->nr_frags == n_frags)
break;
rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
}
skb = napi_get_frags(napi); skb->data_len = skb->len;
if (!skb) { skb->truesize += n_frags * efx->rx_buffer_truesize;
put_page(page);
return; skb_record_rx_queue(skb, channel->rx_queue.core_index);
}
gro_result = napi_gro_frags(napi);
if (gro_result != GRO_DROP)
channel->irq_mod_score += 2;
}
if (efx->net_dev->features & NETIF_F_RXHASH) /* Allocate and construct an SKB around page fragments */
skb->rxhash = efx_rx_buf_hash(eh); static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
struct efx_rx_buffer *rx_buf,
unsigned int n_frags,
u8 *eh, int hdr_len)
{
struct efx_nic *efx = channel->efx;
struct sk_buff *skb;
skb_fill_page_desc(skb, 0, page, /* Allocate an SKB to store the headers */
efx_rx_buf_offset(efx, rx_buf), rx_buf->len); skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN);
if (unlikely(skb == NULL))
return NULL;
skb->len = rx_buf->len; EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
skb->data_len = rx_buf->len;
skb->truesize += rx_buf->len;
skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
skb_record_rx_queue(skb, channel->rx_queue.core_index); skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
memcpy(__skb_put(skb, hdr_len), eh, hdr_len);
gro_result = napi_gro_frags(napi); /* Append the remaining page(s) onto the frag list */
} else { if (rx_buf->len > hdr_len) {
struct sk_buff *skb = rx_buf->u.skb; rx_buf->page_offset += hdr_len;
rx_buf->len -= hdr_len;
EFX_BUG_ON_PARANOID(!(rx_buf->flags & EFX_RX_PKT_CSUMMED)); for (;;) {
rx_buf->u.skb = NULL; skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
skb->ip_summed = CHECKSUM_UNNECESSARY; rx_buf->page, rx_buf->page_offset,
rx_buf->len);
rx_buf->page = NULL;
skb->len += rx_buf->len;
skb->data_len += rx_buf->len;
if (skb_shinfo(skb)->nr_frags == n_frags)
break;
gro_result = napi_gro_receive(napi, skb); rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
}
} else {
__free_pages(rx_buf->page, efx->rx_buffer_order);
rx_buf->page = NULL;
n_frags = 0;
} }
if (gro_result == GRO_NORMAL) { skb->truesize += n_frags * efx->rx_buffer_truesize;
channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
} else if (gro_result != GRO_DROP) { /* Move past the ethernet header */
channel->rx_alloc_level += RX_ALLOC_FACTOR_GRO; skb->protocol = eth_type_trans(skb, efx->net_dev);
channel->irq_mod_score += 2;
} return skb;
} }
void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
unsigned int len, u16 flags) unsigned int n_frags, unsigned int len, u16 flags)
{ {
struct efx_nic *efx = rx_queue->efx; struct efx_nic *efx = rx_queue->efx;
struct efx_channel *channel = efx_rx_queue_channel(rx_queue); struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
struct efx_rx_buffer *rx_buf; struct efx_rx_buffer *rx_buf;
bool leak_packet = false;
rx_buf = efx_rx_buffer(rx_queue, index); rx_buf = efx_rx_buffer(rx_queue, index);
rx_buf->flags |= flags; rx_buf->flags |= flags;
/* This allows the refill path to post another buffer. /* Validate the number of fragments and completed length */
* EFX_RXD_HEAD_ROOM ensures that the slot we are using if (n_frags == 1) {
* isn't overwritten yet. efx_rx_packet__check_len(rx_queue, rx_buf, len);
*/ } else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) ||
rx_queue->removed_count++; unlikely(len <= (n_frags - 1) * EFX_RX_USR_BUF_SIZE) ||
unlikely(len > n_frags * EFX_RX_USR_BUF_SIZE) ||
/* Validate the length encoded in the event vs the descriptor pushed */ unlikely(!efx->rx_scatter)) {
efx_rx_packet__check_len(rx_queue, rx_buf, len, &leak_packet); /* If this isn't an explicit discard request, either
* the hardware or the driver is broken.
*/
WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD));
rx_buf->flags |= EFX_RX_PKT_DISCARD;
}
netif_vdbg(efx, rx_status, efx->net_dev, netif_vdbg(efx, rx_status, efx->net_dev,
"RX queue %d received id %x at %llx+%x %s%s\n", "RX queue %d received ids %x-%x len %d %s%s\n",
efx_rx_queue_index(rx_queue), index, efx_rx_queue_index(rx_queue), index,
(unsigned long long)rx_buf->dma_addr, len, (index + n_frags - 1) & rx_queue->ptr_mask, len,
(rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "", (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",
(rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : ""); (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : "");
/* Discard packet, if instructed to do so */ /* Discard packet, if instructed to do so. Process the
* previous receive first.
*/
if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) { if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
if (unlikely(leak_packet)) efx_rx_flush_packet(channel);
channel->n_skbuff_leaks++; put_page(rx_buf->page);
else efx_recycle_rx_buffers(channel, rx_buf, n_frags);
efx_recycle_rx_buffer(channel, rx_buf); return;
/* Don't hold off the previous receive */
rx_buf = NULL;
goto out;
} }
/* Release and/or sync DMA mapping - assumes all RX buffers if (n_frags == 1)
* consumed in-order per RX queue rx_buf->len = len;
/* Release and/or sync the DMA mapping - assumes all RX buffers
* consumed in-order per RX queue.
*/ */
efx_unmap_rx_buffer(efx, rx_buf, len); efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
/* Prefetch nice and early so data will (hopefully) be in cache by /* Prefetch nice and early so data will (hopefully) be in cache by
* the time we look at it. * the time we look at it.
*/ */
prefetch(efx_rx_buf_eh(efx, rx_buf)); prefetch(efx_rx_buf_va(rx_buf));
rx_buf->page_offset += efx->type->rx_buffer_hash_size;
rx_buf->len -= efx->type->rx_buffer_hash_size;
if (n_frags > 1) {
/* Release/sync DMA mapping for additional fragments.
* Fix length for last fragment.
*/
unsigned int tail_frags = n_frags - 1;
for (;;) {
rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
if (--tail_frags == 0)
break;
efx_sync_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE);
}
rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE;
efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
}
/* All fragments have been DMA-synced, so recycle buffers and pages. */
rx_buf = efx_rx_buffer(rx_queue, index);
efx_recycle_rx_buffers(channel, rx_buf, n_frags);
/* Pipeline receives so that we give time for packet headers to be /* Pipeline receives so that we give time for packet headers to be
* prefetched into cache. * prefetched into cache.
*/ */
rx_buf->len = len - efx->type->rx_buffer_hash_size; efx_rx_flush_packet(channel);
out: channel->rx_pkt_n_frags = n_frags;
if (channel->rx_pkt) channel->rx_pkt_index = index;
__efx_rx_packet(channel, channel->rx_pkt);
channel->rx_pkt = rx_buf;
} }
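The last few lines implement a one-packet pipeline: efx_rx_packet() records the new packet's index and fragment count, while efx_rx_flush_packet() completes the previously recorded one, giving the prefetch issued above time to land. A stand-alone model of the pattern follows (deliver() and prefetch_headers() are placeholders, not driver functions):

```c
/* Stand-alone model of the one-packet receive pipeline; illustrative only. */
struct chan_model {
	unsigned int held_index;
	unsigned int held_n_frags;	/* 0 => nothing pending */
};

static void deliver(unsigned int index, unsigned int n_frags)
{
	/* Placeholder for __efx_rx_packet(): touches the packet payload. */
	(void)index;
	(void)n_frags;
}

static void prefetch_headers(unsigned int index)
{
	/* Placeholder for prefetch(efx_rx_buf_va(...)). */
	(void)index;
}

static void flush_held(struct chan_model *ch)	/* ~efx_rx_flush_packet() */
{
	if (ch->held_n_frags) {
		deliver(ch->held_index, ch->held_n_frags);
		ch->held_n_frags = 0;
	}
}

static void rx_packet_model(struct chan_model *ch, unsigned int index,
			    unsigned int n_frags)
{
	prefetch_headers(index);	/* start fetching the new headers... */
	flush_held(ch);			/* ...while completing the previous packet */
	ch->held_index = index;		/* hold this packet until the next event */
	ch->held_n_frags = n_frags;
}
```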
static void efx_rx_deliver(struct efx_channel *channel, static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
struct efx_rx_buffer *rx_buf) struct efx_rx_buffer *rx_buf,
unsigned int n_frags)
{ {
struct sk_buff *skb; struct sk_buff *skb;
u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);
/* We now own the SKB */ skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
skb = rx_buf->u.skb; if (unlikely(skb == NULL)) {
rx_buf->u.skb = NULL; efx_free_rx_buffer(rx_buf);
return;
}
skb_record_rx_queue(skb, channel->rx_queue.core_index);
/* Set the SKB flags */ /* Set the SKB flags */
skb_checksum_none_assert(skb); skb_checksum_none_assert(skb);
/* Record the rx_queue */
skb_record_rx_queue(skb, channel->rx_queue.core_index);
/* Pass the packet up */
if (channel->type->receive_skb) if (channel->type->receive_skb)
channel->type->receive_skb(channel, skb); if (channel->type->receive_skb(channel, skb))
else return;
netif_receive_skb(skb);
/* Update allocation strategy method */ /* Pass the packet up */
channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; netif_receive_skb(skb);
} }
/* Handle a received packet. Second half: Touches packet payload. */ /* Handle a received packet. Second half: Touches packet payload. */
void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf) void __efx_rx_packet(struct efx_channel *channel)
{ {
struct efx_nic *efx = channel->efx; struct efx_nic *efx = channel->efx;
u8 *eh = efx_rx_buf_eh(efx, rx_buf); struct efx_rx_buffer *rx_buf =
efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index);
u8 *eh = efx_rx_buf_va(rx_buf);
/* If we're in loopback test, then pass the packet directly to the /* If we're in loopback test, then pass the packet directly to the
* loopback layer, and free the rx_buf here * loopback layer, and free the rx_buf here
*/ */
if (unlikely(efx->loopback_selftest)) { if (unlikely(efx->loopback_selftest)) {
efx_loopback_rx_packet(efx, eh, rx_buf->len); efx_loopback_rx_packet(efx, eh, rx_buf->len);
efx_free_rx_buffer(efx, rx_buf); efx_free_rx_buffer(rx_buf);
return; goto out;
}
if (!(rx_buf->flags & EFX_RX_BUF_PAGE)) {
struct sk_buff *skb = rx_buf->u.skb;
prefetch(skb_shinfo(skb));
skb_reserve(skb, efx->type->rx_buffer_hash_size);
skb_put(skb, rx_buf->len);
if (efx->net_dev->features & NETIF_F_RXHASH)
skb->rxhash = efx_rx_buf_hash(eh);
/* Move past the ethernet header. rx_buf->data still points
* at the ethernet header */
skb->protocol = eth_type_trans(skb, efx->net_dev);
skb_record_rx_queue(skb, channel->rx_queue.core_index);
} }
if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM))) if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) && if (!channel->type->receive_skb)
!channel->type->receive_skb) efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh);
efx_rx_packet_gro(channel, rx_buf, eh);
else else
efx_rx_deliver(channel, rx_buf); efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags);
} out:
channel->rx_pkt_n_frags = 0;
void efx_rx_strategy(struct efx_channel *channel)
{
enum efx_rx_alloc_method method = rx_alloc_method;
if (channel->type->receive_skb) {
channel->rx_alloc_push_pages = false;
return;
}
/* Only makes sense to use page based allocation if GRO is enabled */
if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
method = RX_ALLOC_METHOD_SKB;
} else if (method == RX_ALLOC_METHOD_AUTO) {
/* Constrain the rx_alloc_level */
if (channel->rx_alloc_level < 0)
channel->rx_alloc_level = 0;
else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX)
channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX;
/* Decide on the allocation method */
method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_GRO) ?
RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB);
}
/* Push the option */
channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE);
} }
int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
...@@ -683,9 +662,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) ...@@ -683,9 +662,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
kfree(rx_queue->buffer); kfree(rx_queue->buffer);
rx_queue->buffer = NULL; rx_queue->buffer = NULL;
} }
return rc; return rc;
} }
void efx_init_rx_recycle_ring(struct efx_nic *efx,
struct efx_rx_queue *rx_queue)
{
unsigned int bufs_in_recycle_ring, page_ring_size;
/* Set the RX recycle ring size */
#ifdef CONFIG_PPC64
bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
#else
if (efx->pci_dev->dev.iommu_group)
bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
else
bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
#endif /* CONFIG_PPC64 */
page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
efx->rx_bufs_per_page);
rx_queue->page_ring = kcalloc(page_ring_size,
sizeof(*rx_queue->page_ring), GFP_KERNEL);
rx_queue->page_ptr_mask = page_ring_size - 1;
}
void efx_init_rx_queue(struct efx_rx_queue *rx_queue) void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
{ {
struct efx_nic *efx = rx_queue->efx; struct efx_nic *efx = rx_queue->efx;
...@@ -699,10 +701,18 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) ...@@ -699,10 +701,18 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
rx_queue->notified_count = 0; rx_queue->notified_count = 0;
rx_queue->removed_count = 0; rx_queue->removed_count = 0;
rx_queue->min_fill = -1U; rx_queue->min_fill = -1U;
efx_init_rx_recycle_ring(efx, rx_queue);
rx_queue->page_remove = 0;
rx_queue->page_add = rx_queue->page_ptr_mask + 1;
rx_queue->page_recycle_count = 0;
rx_queue->page_recycle_failed = 0;
rx_queue->page_recycle_full = 0;
/* Initialise limit fields */ /* Initialise limit fields */
max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
max_trigger = max_fill - EFX_RX_BATCH; max_trigger =
max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
if (rx_refill_threshold != 0) { if (rx_refill_threshold != 0) {
trigger = max_fill * min(rx_refill_threshold, 100U) / 100U; trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
if (trigger > max_trigger) if (trigger > max_trigger)
...@@ -722,6 +732,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) ...@@ -722,6 +732,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
{ {
int i; int i;
struct efx_nic *efx = rx_queue->efx;
struct efx_rx_buffer *rx_buf; struct efx_rx_buffer *rx_buf;
netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
...@@ -733,13 +744,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) ...@@ -733,13 +744,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
del_timer_sync(&rx_queue->slow_fill); del_timer_sync(&rx_queue->slow_fill);
efx_nic_fini_rx(rx_queue); efx_nic_fini_rx(rx_queue);
/* Release RX buffers NB start at index 0 not current HW ptr */ /* Release RX buffers from the current read ptr to the write ptr */
if (rx_queue->buffer) { if (rx_queue->buffer) {
for (i = 0; i <= rx_queue->ptr_mask; i++) { for (i = rx_queue->removed_count; i < rx_queue->added_count;
rx_buf = efx_rx_buffer(rx_queue, i); i++) {
unsigned index = i & rx_queue->ptr_mask;
rx_buf = efx_rx_buffer(rx_queue, index);
efx_fini_rx_buffer(rx_queue, rx_buf); efx_fini_rx_buffer(rx_queue, rx_buf);
} }
} }
/* Unmap and release the pages in the recycle ring. Remove the ring. */
for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
struct page *page = rx_queue->page_ring[i];
struct efx_rx_page_state *state;
if (page == NULL)
continue;
state = page_address(page);
dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
PAGE_SIZE << efx->rx_buffer_order,
DMA_FROM_DEVICE);
put_page(page);
}
kfree(rx_queue->page_ring);
rx_queue->page_ring = NULL;
} }
void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
...@@ -754,9 +784,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) ...@@ -754,9 +784,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
} }
module_param(rx_alloc_method, int, 0644);
MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers");
module_param(rx_refill_threshold, uint, 0444); module_param(rx_refill_threshold, uint, 0444);
MODULE_PARM_DESC(rx_refill_threshold, MODULE_PARM_DESC(rx_refill_threshold,
"RX descriptor ring refill threshold (%)"); "RX descriptor ring refill threshold (%)");
......
...@@ -202,7 +202,7 @@ static int siena_test_chip(struct efx_nic *efx, struct efx_self_tests *tests) ...@@ -202,7 +202,7 @@ static int siena_test_chip(struct efx_nic *efx, struct efx_self_tests *tests)
static enum reset_type siena_map_reset_reason(enum reset_type reason) static enum reset_type siena_map_reset_reason(enum reset_type reason)
{ {
return RESET_TYPE_ALL; return RESET_TYPE_RECOVER_OR_ALL;
} }
static int siena_map_reset_flags(u32 *flags) static int siena_map_reset_flags(u32 *flags)
...@@ -245,6 +245,22 @@ static int siena_reset_hw(struct efx_nic *efx, enum reset_type method) ...@@ -245,6 +245,22 @@ static int siena_reset_hw(struct efx_nic *efx, enum reset_type method)
return efx_mcdi_reset_port(efx); return efx_mcdi_reset_port(efx);
} }
#ifdef CONFIG_EEH
/* When a PCI device is isolated from the bus, a subsequent MMIO read is
* required for the kernel EEH mechanisms to notice. As the Solarflare driver
* was written to minimise MMIO reads (for latency), a periodic call to check
* the EEH status of the device is required so that device recovery can happen
* in a timely fashion.
*/
static void siena_monitor(struct efx_nic *efx)
{
struct eeh_dev *eehdev =
of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
eeh_dev_check_failure(eehdev);
}
#endif
static int siena_probe_nvconfig(struct efx_nic *efx) static int siena_probe_nvconfig(struct efx_nic *efx)
{ {
u32 caps = 0; u32 caps = 0;
...@@ -398,6 +414,8 @@ static int siena_init_nic(struct efx_nic *efx) ...@@ -398,6 +414,8 @@ static int siena_init_nic(struct efx_nic *efx)
EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1); EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1);
EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1); EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1);
EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1); EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1);
EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_USR_BUF_SIZE,
EFX_RX_USR_BUF_SIZE >> 5);
efx_writeo(efx, &temp, FR_AZ_RX_CFG); efx_writeo(efx, &temp, FR_AZ_RX_CFG);
/* Set hash key for IPv4 */ /* Set hash key for IPv4 */
...@@ -665,7 +683,11 @@ const struct efx_nic_type siena_a0_nic_type = { ...@@ -665,7 +683,11 @@ const struct efx_nic_type siena_a0_nic_type = {
.init = siena_init_nic, .init = siena_init_nic,
.dimension_resources = siena_dimension_resources, .dimension_resources = siena_dimension_resources,
.fini = efx_port_dummy_op_void, .fini = efx_port_dummy_op_void,
#ifdef CONFIG_EEH
.monitor = siena_monitor,
#else
.monitor = NULL, .monitor = NULL,
#endif
.map_reset_reason = siena_map_reset_reason, .map_reset_reason = siena_map_reset_reason,
.map_reset_flags = siena_map_reset_flags, .map_reset_flags = siena_map_reset_flags,
.reset = siena_reset_hw, .reset = siena_reset_hw,
...@@ -698,6 +720,7 @@ const struct efx_nic_type siena_a0_nic_type = { ...@@ -698,6 +720,7 @@ const struct efx_nic_type siena_a0_nic_type = {
.max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH), .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
.rx_buffer_hash_size = 0x10, .rx_buffer_hash_size = 0x10,
.rx_buffer_padding = 0, .rx_buffer_padding = 0,
.can_rx_scatter = true,
.max_interrupt_mode = EFX_INT_MODE_MSIX, .max_interrupt_mode = EFX_INT_MODE_MSIX,
.phys_addr_channels = 32, /* Hardware limit is 64, but the legacy .phys_addr_channels = 32, /* Hardware limit is 64, but the legacy
* interrupt handler only supports 32 * interrupt handler only supports 32
......