Commit ae27e886 authored by Greg Kroah-Hartman's avatar Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2022-11-23' of...

Merge tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains habanalabs driver changes for v6.2:

- New feature of graceful hard-reset. Instead of immediately killing the
  user-process when a command submission times out, we wait a bit and give
  the user-process notification and let it try to close things gracefully,
  with the ability to retrieve debug information.

- Enhance the EventFD mechanism. Add new events such as access to illegal
  address (RAZWI), page fault, device unavailable. In addition, change the
  event workqueue to be handled in a single-threaded workqueue.

- Allow the control device to work during reset of the ASIC, to enable
  monitoring applications to continue getting the data.

- Add handling for Gaudi2 with PCI revision 2.

- Reduce severity of prints due to power/thermal events.

- Change how we use the h/w to perform memory scrubbing in Gaudi2.

- Multiple bug fixes, refactors and renames.

* tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (63 commits)
  habanalabs: fix VA range calculation
  habanalabs: fail driver load if EEPROM errors detected
  habanalabs: make print of engines idle mask more readable
  habanalabs: clear non-released encapsulated signals
  habanalabs: don't put context in hl_encaps_handle_do_release_sob()
  habanalabs: print context refcount value if hard reset fails
  habanalabs: add RMWREG32_SHIFTED to set a val within a mask
  habanalabs: fix rc when new CPUCP opcodes are not supported
  habanalabs/gaudi2: added memset for the cq_size register
  habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd()
  habanalabs: increase the size of busy engines mask
  habanalabs/gaudi2: change memory scrub mechanism
  habanalabs: extend process wait timeout in device fine
  habanalabs: check schedule_hard_reset correctly
  habanalabs: reset device if still in use when released
  habanalabs/gaudi2: return to reset upon SM SEI BRESP error
  habanalabs/gaudi2: don't enable entries in the MSIX_GW table
  habanalabs/gaudi2: remove redundant firmware version check
  habanalabs/gaudi: fix print for firmware-alive event
  habanalabs: fix print for out-of-sync and pkt-failure events
  ...
parents 449ef8fb 19a17a9f
...@@ -91,6 +91,13 @@ Description: Enables the root user to set the device to specific state. ...@@ -91,6 +91,13 @@ Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume". Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values User can read this property to see the valid values
What: /sys/kernel/debug/habanalabs/hl<n>/device_release_watchdog_timeout
Date: Oct 2022
KernelVersion: 6.2
Contact: ttayar@habana.ai
Description: The watchdog timeout value in seconds for a device release upon
certain error cases, after which the device is reset.
What: /sys/kernel/debug/habanalabs/hl<n>/dma_size What: /sys/kernel/debug/habanalabs/hl<n>/dma_size
Date: Apr 2021 Date: Apr 2021
KernelVersion: 5.13 KernelVersion: 5.13
......
...@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref) ...@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
*/ */
if (hl_cs_cmpl->encaps_signals) if (hl_cs_cmpl->encaps_signals)
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount, kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
hl_encaps_handle_do_release); hl_encaps_release_handle_and_put_ctx);
} }
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
&& cs->encaps_signals) kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
kref_put(&cs->encaps_sig_hdl->refcount,
hl_encaps_handle_do_release);
out: out:
/* Must be called before hl_ctx_put because inside we use ctx to get /* Must be called before hl_ctx_put because inside we use ctx to get
...@@ -798,7 +796,7 @@ static void cs_do_release(struct kref *ref) ...@@ -798,7 +796,7 @@ static void cs_do_release(struct kref *ref)
static void cs_timedout(struct work_struct *work) static void cs_timedout(struct work_struct *work)
{ {
struct hl_device *hdev; struct hl_device *hdev;
u64 event_mask; u64 event_mask = 0x0;
int rc; int rc;
struct hl_cs *cs = container_of(work, struct hl_cs, struct hl_cs *cs = container_of(work, struct hl_cs,
work_tdr.work); work_tdr.work);
...@@ -830,11 +828,7 @@ static void cs_timedout(struct work_struct *work) ...@@ -830,11 +828,7 @@ static void cs_timedout(struct work_struct *work)
if (rc) { if (rc) {
hdev->captured_err_info.cs_timeout.timestamp = ktime_get(); hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
hdev->captured_err_info.cs_timeout.seq = cs->sequence; hdev->captured_err_info.cs_timeout.seq = cs->sequence;
event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
hl_notifier_event_send_all(hdev, event_mask);
} }
switch (cs->type) { switch (cs->type) {
...@@ -869,8 +863,12 @@ static void cs_timedout(struct work_struct *work) ...@@ -869,8 +863,12 @@ static void cs_timedout(struct work_struct *work)
cs_put(cs); cs_put(cs);
if (device_reset) if (device_reset) {
hl_device_reset(hdev, HL_DRV_RESET_TDR); event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
} else if (event_mask) {
hl_notifier_event_send_all(hdev, event_mask);
}
} }
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
...@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) ...@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
hl_complete_job(hdev, job); hl_complete_job(hdev, job);
} }
/*
* release_reserved_encaps_signals() - release reserved encapsulated signals.
* @hdev: pointer to habanalabs device structure
*
* Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
* encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
 * For these signals, we also need to put the refcount of the H/W SOB which was taken at the
 * reservation.
*/
static void release_reserved_encaps_signals(struct hl_device *hdev)
{
struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
struct hl_cs_encaps_sig_handle *handle;
struct hl_encaps_signals_mgr *mgr;
u32 id;
if (!ctx)
return;
mgr = &ctx->sig_mgr;
idr_for_each_entry(&mgr->handles, handle, id)
if (handle->cs_seq == ULLONG_MAX)
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
hl_ctx_put(ctx);
}
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{ {
int i; int i;
...@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) ...@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
} }
force_complete_multi_cs(hdev); force_complete_multi_cs(hdev);
release_reserved_encaps_signals(hdev);
} }
static void static void
...@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, ...@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
*/ */
handle->pre_sob_val = prop->next_sob_val - handle->count; handle->pre_sob_val = prop->next_sob_val - handle->count;
handle->cs_seq = ULLONG_MAX;
*signals_count = prop->next_sob_val; *signals_count = prop->next_sob_val;
hdev->asic_funcs->hw_queues_unlock(hdev); hdev->asic_funcs->hw_queues_unlock(hdev);
...@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, ...@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
/* We finished with the CS in this function, so put the ref */ /* We finished with the CS in this function, so put the ref */
cs_put(cs); cs_put(cs);
free_cs_chunk_array: free_cs_chunk_array:
if (!wait_cs_submitted && cs_encaps_signals && handle_found && if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
is_wait_cs) kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
kref_put(&encaps_sig_hdl->refcount,
hl_encaps_handle_do_release);
kfree(cs_chunk_array); kfree(cs_chunk_array);
out: out:
return rc; return rc;
......
...@@ -9,38 +9,46 @@ ...@@ -9,38 +9,46 @@
#include <linux/slab.h> #include <linux/slab.h>
void hl_encaps_handle_do_release(struct kref *ref) static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
bool put_ctx)
{ {
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
if (put_hw_sob)
hw_sob_put(handle->hw_sob);
spin_lock(&mgr->lock); spin_lock(&mgr->lock);
idr_remove(&mgr->handles, handle->id); idr_remove(&mgr->handles, handle->id);
spin_unlock(&mgr->lock); spin_unlock(&mgr->lock);
if (put_ctx)
hl_ctx_put(handle->ctx); hl_ctx_put(handle->ctx);
kfree(handle); kfree(handle);
} }
static void hl_encaps_handle_do_release_sob(struct kref *ref) void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
{ {
struct hl_cs_encaps_sig_handle *handle = struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount); container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
/* if we're here, then there was a signals reservation but cs with encaps_handle_do_release(handle, false, true);
* encaps signals wasn't submitted, so need to put refcount }
* to hw_sob taken at the reservation.
*/
hw_sob_put(handle->hw_sob);
spin_lock(&mgr->lock); static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
idr_remove(&mgr->handles, handle->id); {
spin_unlock(&mgr->lock); struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
hl_ctx_put(handle->ctx); encaps_handle_do_release(handle, true, false);
kfree(handle); }
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
{
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
encaps_handle_do_release(handle, true, true);
} }
static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
...@@ -49,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) ...@@ -49,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
idr_init(&mgr->handles); idr_init(&mgr->handles);
} }
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
struct hl_encaps_signals_mgr *mgr)
{ {
struct hl_cs_encaps_sig_handle *handle; struct hl_cs_encaps_sig_handle *handle;
struct idr *idp; struct idr *idp;
...@@ -58,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, ...@@ -58,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
idp = &mgr->handles; idp = &mgr->handles;
/* The IDR is expected to be empty at this stage, because any left signal should have been
* released as part of CS roll-back.
*/
if (!idr_is_empty(idp)) { if (!idr_is_empty(idp)) {
dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); dev_warn(hdev->dev,
"device released while some encaps signals handles are still allocated\n");
idr_for_each_entry(idp, handle, id) idr_for_each_entry(idp, handle, id)
kref_put(&handle->refcount, kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
hl_encaps_handle_do_release_sob);
} }
idr_destroy(&mgr->handles); idr_destroy(&mgr->handles);
......
...@@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) ...@@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry, dev_entry,
&hl_timeout_locked_fops); &hl_timeout_locked_fops);
debugfs_create_u32("device_release_watchdog_timeout",
0644,
dev_entry->root,
&hdev->device_release_watchdog_timeout_sec);
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name, debugfs_create_file(hl_debugfs_list[i].name,
0444, 0444,
......
This diff is collapsed.
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <linux/crc32.h> #include <linux/crc32.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/vmalloc.h>
#define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */ #define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */
...@@ -323,6 +324,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, ...@@ -323,6 +324,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
if (!prop->supports_advanced_cpucp_rc) { if (!prop->supports_advanced_cpucp_rc) {
dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode); dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode);
rc = -EIO;
goto scrub_descriptor; goto scrub_descriptor;
} }
...@@ -615,16 +617,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, ...@@ -615,16 +617,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
/* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
dev_warn(hdev->dev, dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
"Device boot warning - EEPROM failure detected, default settings applied\n"); err_exists = true;
/* This is a warning so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL;
} }
/* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
dev_warn(hdev->dev, dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n"); "Device boot warning - Skipped DRAM initialization\n");
...@@ -1782,6 +1780,8 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, ...@@ -1782,6 +1780,8 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
/* first send clear command to clean former commands */ /* first send clear command to clean former commands */
rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader);
if (rc)
return rc;
/* send the actual command */ /* send the actual command */
hl_fw_dynamic_send_cmd(hdev, fw_loader, cmd, size); hl_fw_dynamic_send_cmd(hdev, fw_loader, cmd, size);
...@@ -1988,10 +1988,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, ...@@ -1988,10 +1988,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
struct fw_load_mgr *fw_loader) struct fw_load_mgr *fw_loader)
{ {
struct lkd_fw_comms_desc *fw_desc; struct lkd_fw_comms_desc *fw_desc;
void __iomem *src, *temp_fw_desc;
struct pci_mem_region *region; struct pci_mem_region *region;
struct fw_response *response; struct fw_response *response;
u16 fw_data_size;
enum pci_region region_id; enum pci_region region_id;
void __iomem *src;
int rc; int rc;
fw_desc = &fw_loader->dynamic_loader.comm_desc; fw_desc = &fw_loader->dynamic_loader.comm_desc;
...@@ -2018,9 +2019,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, ...@@ -2018,9 +2019,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
fw_loader->dynamic_loader.fw_desc_valid = false; fw_loader->dynamic_loader.fw_desc_valid = false;
src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
response->ram_offset; response->ram_offset;
/*
* We do the copy of the fw descriptor in 2 phases:
* 1. copy the header + data info according to our lkd_fw_comms_desc definition.
* then we're able to read the actual data size provided by fw.
 * this is needed for cases where data in the descriptor was changed (add/remove)
 * in the embedded specs header file before updating the lkd copy of the header file
* 2. copy descriptor to temporary buffer with aligned size and send it to validation
*/
memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
fw_data_size = le16_to_cpu(fw_desc->header.size);
temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size);
if (!temp_fw_desc)
return -ENOMEM;
return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc); memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size);
rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader,
(struct lkd_fw_comms_desc *) temp_fw_desc);
vfree(temp_fw_desc);
return rc;
} }
/** /**
...@@ -2507,7 +2528,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, ...@@ -2507,7 +2528,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
struct fw_load_mgr *fw_loader) struct fw_load_mgr *fw_loader)
{ {
struct cpu_dyn_regs *dyn_regs; struct cpu_dyn_regs *dyn_regs;
int rc; int rc, fw_error_rc;
dev_info(hdev->dev, dev_info(hdev->dev,
"Loading %sfirmware to device, may take some time...\n", "Loading %sfirmware to device, may take some time...\n",
...@@ -2607,14 +2628,17 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, ...@@ -2607,14 +2628,17 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hl_fw_dynamic_update_linux_interrupt_if(hdev); hl_fw_dynamic_update_linux_interrupt_if(hdev);
return 0;
protocol_err: protocol_err:
if (fw_loader->dynamic_loader.fw_desc_valid) if (fw_loader->dynamic_loader.fw_desc_valid) {
fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), fw_error_rc = fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
le32_to_cpu(dyn_regs->cpu_boot_err1), le32_to_cpu(dyn_regs->cpu_boot_err1),
le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
if (fw_error_rc)
return fw_error_rc;
}
return rc; return rc;
} }
...@@ -2983,7 +3007,7 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void ...@@ -2983,7 +3007,7 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void
int rc; int rc;
req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, &req_dma_addr); req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, &req_dma_addr);
if (!data) { if (!req_cpu_addr) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id); "Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id);
return -ENOMEM; return -ENOMEM;
......
This diff is collapsed.
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#define pr_fmt(fmt) "habanalabs: " fmt #define pr_fmt(fmt) "habanalabs: " fmt
#include "habanalabs.h" #include "habanalabs.h"
#include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/aer.h> #include <linux/aer.h>
...@@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids); ...@@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids);
/* /*
* get_asic_type - translate device id to asic type * get_asic_type - translate device id to asic type
* *
* @device: id of the PCI device * @hdev: pointer to habanalabs device structure.
* *
* Translate device id to asic type. * Translate device id and revision id to asic type.
* In case of unidentified device, return -1 * In case of unidentified device, return -1
*/ */
static enum hl_asic_type get_asic_type(u16 device) static enum hl_asic_type get_asic_type(struct hl_device *hdev)
{ {
enum hl_asic_type asic_type; struct pci_dev *pdev = hdev->pdev;
enum hl_asic_type asic_type = ASIC_INVALID;
switch (device) { switch (pdev->device) {
case PCI_IDS_GOYA: case PCI_IDS_GOYA:
asic_type = ASIC_GOYA; asic_type = ASIC_GOYA;
break; break;
...@@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device) ...@@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device)
asic_type = ASIC_GAUDI_SEC; asic_type = ASIC_GAUDI_SEC;
break; break;
case PCI_IDS_GAUDI2: case PCI_IDS_GAUDI2:
switch (pdev->revision) {
case REV_ID_A:
asic_type = ASIC_GAUDI2; asic_type = ASIC_GAUDI2;
break; break;
case REV_ID_B:
asic_type = ASIC_GAUDI2B;
break;
default:
break;
}
break;
default: default:
asic_type = ASIC_INVALID;
break; break;
} }
...@@ -212,7 +222,8 @@ int hl_device_open(struct inode *inode, struct file *filp) ...@@ -212,7 +222,8 @@ int hl_device_open(struct inode *inode, struct file *filp)
hl_debugfs_add_file(hpriv); hl_debugfs_add_file(hpriv);
atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0);
atomic_set(&hdev->captured_err_info.pgf_info_recorded, 0);
hdev->captured_err_info.undef_opcode.write_enable = true; hdev->captured_err_info.undef_opcode.write_enable = true;
hdev->open_counter++; hdev->open_counter++;
...@@ -270,9 +281,9 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) ...@@ -270,9 +281,9 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
mutex_lock(&hdev->fpriv_ctrl_list_lock); mutex_lock(&hdev->fpriv_ctrl_list_lock);
if (!hl_device_operational(hdev, NULL)) { if (!hl_ctrl_device_operational(hdev, NULL)) {
dev_dbg_ratelimited(hdev->dev_ctrl, dev_dbg_ratelimited(hdev->dev_ctrl,
"Can't open %s because it is disabled or in reset\n", "Can't open %s because it is disabled\n",
dev_name(hdev->dev_ctrl)); dev_name(hdev->dev_ctrl));
rc = -EPERM; rc = -EPERM;
goto out_err; goto out_err;
...@@ -415,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) ...@@ -415,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
/* First, we must find out which ASIC are we handling. This is needed /* First, we must find out which ASIC are we handling. This is needed
* to configure the behavior of the driver (kernel parameters) * to configure the behavior of the driver (kernel parameters)
*/ */
hdev->asic_type = get_asic_type(pdev->device); hdev->asic_type = get_asic_type(hdev);
if (hdev->asic_type == ASIC_INVALID) { if (hdev->asic_type == ASIC_INVALID) {
dev_err(&pdev->dev, "Unsupported ASIC\n"); dev_err(&pdev->dev, "Unsupported ASIC\n");
rc = -ENODEV; rc = -ENODEV;
...@@ -594,15 +605,16 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) ...@@ -594,15 +605,16 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
switch (state) { switch (state) {
case pci_channel_io_normal: case pci_channel_io_normal:
dev_warn(hdev->dev, "PCI normal state error detected\n");
return PCI_ERS_RESULT_CAN_RECOVER; return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen: case pci_channel_io_frozen:
dev_warn(hdev->dev, "frozen state error detected\n"); dev_warn(hdev->dev, "PCI frozen state error detected\n");
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
break; break;
case pci_channel_io_perm_failure: case pci_channel_io_perm_failure:
dev_warn(hdev->dev, "failure state error detected\n"); dev_warn(hdev->dev, "PCI failure state error detected\n");
result = PCI_ERS_RESULT_DISCONNECT; result = PCI_ERS_RESULT_DISCONNECT;
break; break;
...@@ -638,6 +650,10 @@ static void hl_pci_err_resume(struct pci_dev *pdev) ...@@ -638,6 +650,10 @@ static void hl_pci_err_resume(struct pci_dev *pdev)
*/ */
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{ {
struct hl_device *hdev = pci_get_drvdata(pdev);
dev_warn(hdev->dev, "PCI slot reset detected\n");
return PCI_ERS_RESULT_RECOVERED; return PCI_ERS_RESULT_RECOVERED;
} }
......
...@@ -10,10 +10,11 @@ ...@@ -10,10 +10,11 @@
#include <uapi/misc/habanalabs.h> #include <uapi/misc/habanalabs.h>
#include "habanalabs.h" #include "habanalabs.h"
#include <linux/kernel.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/uaccess.h> #include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
...@@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) ...@@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.edma_enabled_mask = prop->edma_enabled_mask; hw_ip.edma_enabled_mask = prop->edma_enabled_mask;
hw_ip.server_type = prop->server_type; hw_ip.server_type = prop->server_type;
hw_ip.security_enabled = prop->fw_security_enabled; hw_ip.security_enabled = prop->fw_security_enabled;
hw_ip.revision_id = hdev->pdev->revision;
return copy_to_user(out, &hw_ip, return copy_to_user(out, &hw_ip,
min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
...@@ -121,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, ...@@ -121,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
return -EINVAL; return -EINVAL;
arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size); arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size);
if (!arr) {
dev_err(hdev->dev, "Events info not supported\n");
return -EOPNOTSUPP;
}
return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
} }
...@@ -603,20 +609,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) ...@@ -603,20 +609,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{ {
struct hl_device *hdev = hpriv->hdev; struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size; u32 max_size = args->return_size;
struct hl_info_razwi_event info = {0}; struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi;
void __user *out = (void __user *) (uintptr_t) args->return_pointer; void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out)) if ((!max_size) || (!out))
return -EINVAL; return -EINVAL;
info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp); return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event)))
info.addr = hdev->captured_err_info.razwi.addr; ? -EFAULT : 0;
info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1;
info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2;
info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator;
info.error_type = hdev->captured_err_info.razwi.type;
return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
} }
static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args) static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
...@@ -784,6 +784,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args) ...@@ -784,6 +784,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return rc; return rc;
} }
static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size;
struct hl_page_fault_info *info = &hdev->captured_err_info.pgf_info.pgf;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info)))
? -EFAULT : 0;
}
static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
u32 user_buf_size = args->return_size;
struct hl_device *hdev = hpriv->hdev;
struct page_fault_info *pgf_info;
u64 actual_size;
pgf_info = &hdev->captured_err_info.pgf_info;
args->array_size = pgf_info->num_of_user_mappings;
if (!out)
return -EINVAL;
actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping);
if (user_buf_size < actual_size)
return -ENOMEM;
return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size))
? -EFAULT : 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev) struct device *dev)
{ {
...@@ -843,6 +879,15 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, ...@@ -843,6 +879,15 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_GET_EVENTS: case HL_INFO_GET_EVENTS:
return events_info(hpriv, args); return events_info(hpriv, args);
case HL_INFO_PAGE_FAULT_EVENT:
return page_fault_info(hpriv, args);
case HL_INFO_USER_MAPPINGS:
return user_mappings_info(hpriv, args);
case HL_INFO_UNREGISTER_EVENTFD:
return eventfd_unregister(hpriv, args);
default: default:
break; break;
} }
...@@ -899,9 +944,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, ...@@ -899,9 +944,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_REGISTER_EVENTFD: case HL_INFO_REGISTER_EVENTFD:
return eventfd_register(hpriv, args); return eventfd_register(hpriv, args);
case HL_INFO_UNREGISTER_EVENTFD:
return eventfd_unregister(hpriv, args);
case HL_INFO_ENGINE_STATUS: case HL_INFO_ENGINE_STATUS:
return engine_status_info(hpriv, args); return engine_status_info(hpriv, args);
......
...@@ -1689,7 +1689,7 @@ static int hl_dmabuf_attach(struct dma_buf *dmabuf, ...@@ -1689,7 +1689,7 @@ static int hl_dmabuf_attach(struct dma_buf *dmabuf,
hl_dmabuf = dmabuf->priv; hl_dmabuf = dmabuf->priv;
hdev = hl_dmabuf->ctx->hdev; hdev = hl_dmabuf->ctx->hdev;
rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true); rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true);
if (rc < 0) if (rc < 0)
attachment->peer2peer = false; attachment->peer2peer = false;
...@@ -2109,7 +2109,7 @@ static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args) ...@@ -2109,7 +2109,7 @@ static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
/* Allocate the internal kernel buffer */ /* Allocate the internal kernel buffer */
size = num_elements * sizeof(struct hl_user_pending_interrupt); size = num_elements * sizeof(struct hl_user_pending_interrupt);
p = vmalloc(size); p = vzalloc(size);
if (!p) if (!p)
goto free_user_buff; goto free_user_buff;
...@@ -2508,24 +2508,20 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges, ...@@ -2508,24 +2508,20 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges,
/* /*
* PAGE_SIZE alignment * PAGE_SIZE alignment
* it is the callers responsibility to align the addresses if the * it is the caller's responsibility to align the addresses if the
* page size is not a power of 2 * page size is not a power of 2
*/ */
if (is_power_of_2(page_size)) { if (is_power_of_2(page_size)) {
if (start & (PAGE_SIZE - 1)) { start = round_up(start, page_size);
start &= PAGE_MASK;
start += PAGE_SIZE;
}
/* /*
* The end of the range is inclusive, hence we need to align it * The end of the range is inclusive, hence we need to align it
* to the end of the last full page in the range. For example if * to the end of the last full page in the range. For example if
* end = 0x3ff5 with page size 0x1000, we need to align it to * end = 0x3ff5 with page size 0x1000, we need to align it to
* 0x2fff. The remainig 0xff5 bytes do not form a full page. * 0x2fff. The remaining 0xff5 bytes do not form a full page.
*/ */
if ((end + 1) & (PAGE_SIZE - 1)) end = round_down(end + 1, page_size) - 1;
end = ((end + 1) & PAGE_MASK) - 1;
} }
if (start >= end) { if (start >= end) {
......
...@@ -635,7 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) ...@@ -635,7 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
break; break;
case ASIC_GAUDI2: case ASIC_GAUDI2:
case ASIC_GAUDI2_SEC: case ASIC_GAUDI2B:
/* MMUs in Gaudi2 are always host resident */ /* MMUs in Gaudi2 are always host resident */
hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
break; break;
...@@ -699,7 +699,7 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, ...@@ -699,7 +699,7 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
static void hl_mmu_prefetch_work_function(struct work_struct *work) static void hl_mmu_prefetch_work_function(struct work_struct *work)
{ {
struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work); struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, prefetch_work);
struct hl_ctx *ctx = pfw->ctx; struct hl_ctx *ctx = pfw->ctx;
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
...@@ -723,25 +723,25 @@ static void hl_mmu_prefetch_work_function(struct work_struct *work) ...@@ -723,25 +723,25 @@ static void hl_mmu_prefetch_work_function(struct work_struct *work)
int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size) int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size)
{ {
struct hl_prefetch_work *handle_pf_work; struct hl_prefetch_work *handle_prefetch_work;
handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL); handle_prefetch_work = kmalloc(sizeof(*handle_prefetch_work), GFP_KERNEL);
if (!handle_pf_work) if (!handle_prefetch_work)
return -ENOMEM; return -ENOMEM;
INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function); INIT_WORK(&handle_prefetch_work->prefetch_work, hl_mmu_prefetch_work_function);
handle_pf_work->ctx = ctx; handle_prefetch_work->ctx = ctx;
handle_pf_work->va = va; handle_prefetch_work->va = va;
handle_pf_work->size = size; handle_prefetch_work->size = size;
handle_pf_work->flags = flags; handle_prefetch_work->flags = flags;
handle_pf_work->asid = asid; handle_prefetch_work->asid = asid;
/* /*
* as actual prefetch is done in a WQ we must get the context (and put it * as actual prefetch is done in a WQ we must get the context (and put it
* at the end of the work function) * at the end of the work function)
*/ */
hl_ctx_get(ctx); hl_ctx_get(ctx);
queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work); queue_work(ctx->hdev->prefetch_wq, &handle_prefetch_work->prefetch_work);
return 0; return 0;
} }
......
...@@ -248,8 +248,8 @@ static ssize_t device_type_show(struct device *dev, ...@@ -248,8 +248,8 @@ static ssize_t device_type_show(struct device *dev,
case ASIC_GAUDI2: case ASIC_GAUDI2:
str = "GAUDI2"; str = "GAUDI2";
break; break;
case ASIC_GAUDI2_SEC: case ASIC_GAUDI2B:
str = "GAUDI2 SEC"; str = "GAUDI2B";
break; break;
default: default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n", dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
......
This diff is collapsed.
This diff is collapsed.
...@@ -23,8 +23,6 @@ ...@@ -23,8 +23,6 @@
#define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */
#define GAUDI2_FPGA_CPU_TIMEOUT 100000000 /* 100s */
#define NUMBER_OF_PDMA_QUEUES 2 #define NUMBER_OF_PDMA_QUEUES 2
#define NUMBER_OF_EDMA_QUEUES 8 #define NUMBER_OF_EDMA_QUEUES 8
#define NUMBER_OF_MME_QUEUES 4 #define NUMBER_OF_MME_QUEUES 4
......
...@@ -1764,6 +1764,7 @@ static const struct range gaudi2_pb_nic0_qm_arc_aux0_unsecured_regs[] = { ...@@ -1764,6 +1764,7 @@ static const struct range gaudi2_pb_nic0_qm_arc_aux0_unsecured_regs[] = {
{mmNIC0_QM_ARC_AUX0_CLUSTER_NUM, mmNIC0_QM_ARC_AUX0_WAKE_UP_EVENT}, {mmNIC0_QM_ARC_AUX0_CLUSTER_NUM, mmNIC0_QM_ARC_AUX0_WAKE_UP_EVENT},
{mmNIC0_QM_ARC_AUX0_ARC_RST_REQ, mmNIC0_QM_ARC_AUX0_CID_OFFSET_7}, {mmNIC0_QM_ARC_AUX0_ARC_RST_REQ, mmNIC0_QM_ARC_AUX0_CID_OFFSET_7},
{mmNIC0_QM_ARC_AUX0_SCRATCHPAD_0, mmNIC0_QM_ARC_AUX0_INFLIGHT_LBU_RD_CNT}, {mmNIC0_QM_ARC_AUX0_SCRATCHPAD_0, mmNIC0_QM_ARC_AUX0_INFLIGHT_LBU_RD_CNT},
{mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN},
{mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN}, {mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN},
{mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_BASE_ADDR_0, mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_ALERT_MSG}, {mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_BASE_ADDR_0, mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_ALERT_MSG},
{mmNIC0_QM_ARC_AUX0_DCCM_Q_PUSH_FIFO_CNT, mmNIC0_QM_ARC_AUX0_QMAN_ARC_CQ_SHADOW_CI}, {mmNIC0_QM_ARC_AUX0_DCCM_Q_PUSH_FIFO_CNT, mmNIC0_QM_ARC_AUX0_QMAN_ARC_CQ_SHADOW_CI},
......
...@@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev, ...@@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev,
{ {
struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
} }
static void goya_print_irq_info(struct hl_device *hdev, u16 event_type, static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
......
...@@ -957,6 +957,7 @@ enum gaudi2_async_event_id { ...@@ -957,6 +957,7 @@ enum gaudi2_async_event_id {
GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG0 = 1317, GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG0 = 1317,
GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318, GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318,
GAUDI2_EVENT_ARC_DCCM_FULL = 1319, GAUDI2_EVENT_ARC_DCCM_FULL = 1319,
GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320,
GAUDI2_EVENT_SIZE, GAUDI2_EVENT_SIZE,
}; };
......
/* SPDX-License-Identifier: GPL-2.0 /* SPDX-License-Identifier: GPL-2.0
* *
* Copyright 2018-2021 HabanaLabs, Ltd. * Copyright 2018-2022 HabanaLabs, Ltd.
* All Rights Reserved. * All Rights Reserved.
* *
*/ */
...@@ -2663,6 +2663,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { ...@@ -2663,6 +2663,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
.msg = 1, .reset = 0, .name = "STATUS_NIC11_ENG1" }, .msg = 1, .reset = 0, .name = "STATUS_NIC11_ENG1" },
{ .fc_id = 1319, .cpu_id = 625, .valid = 1, { .fc_id = 1319, .cpu_id = 625, .valid = 1,
.msg = 1, .reset = 0, .name = "ARC_DCCM_FULL" }, .msg = 1, .reset = 0, .name = "ARC_DCCM_FULL" },
{ .fc_id = 1320, .cpu_id = 626, .valid = 1,
.msg = 1, .reset = 1, .name = "FP32_NOT_SUPPORTED" },
}; };
#endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */ #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */
...@@ -20,4 +20,11 @@ ...@@ -20,4 +20,11 @@
#define PCI_CONFIG_ELBI_STS_MASK (PCI_CONFIG_ELBI_STS_ERR | \ #define PCI_CONFIG_ELBI_STS_MASK (PCI_CONFIG_ELBI_STS_ERR | \
PCI_CONFIG_ELBI_STS_DONE) PCI_CONFIG_ELBI_STS_DONE)
enum hl_revision_id {
/* PCI revision ID 0 is not legal */
REV_ID_INVALID = 0x00,
REV_ID_A = 0x01,
REV_ID_B = 0x02,
};
#endif /* INCLUDE_PCI_GENERAL_H_ */ #endif /* INCLUDE_PCI_GENERAL_H_ */
...@@ -597,6 +597,10 @@ enum gaudi2_engine_id { ...@@ -597,6 +597,10 @@ enum gaudi2_engine_id {
GAUDI2_ENGINE_ID_NIC10_1, GAUDI2_ENGINE_ID_NIC10_1,
GAUDI2_ENGINE_ID_NIC11_0, GAUDI2_ENGINE_ID_NIC11_0,
GAUDI2_ENGINE_ID_NIC11_1, GAUDI2_ENGINE_ID_NIC11_1,
GAUDI2_ENGINE_ID_PCIE,
GAUDI2_ENGINE_ID_PSOC,
GAUDI2_ENGINE_ID_ARC_FARM,
GAUDI2_ENGINE_ID_KDMA,
GAUDI2_ENGINE_ID_SIZE GAUDI2_ENGINE_ID_SIZE
}; };
...@@ -717,6 +721,8 @@ enum hl_server_type { ...@@ -717,6 +721,8 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
*/ */
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
...@@ -725,6 +731,8 @@ enum hl_server_type { ...@@ -725,6 +731,8 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4) #define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
/* Opcode for management ioctl /* Opcode for management ioctl
* *
...@@ -778,6 +786,9 @@ enum hl_server_type { ...@@ -778,6 +786,9 @@ enum hl_server_type {
* HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
* HL_INFO_GET_EVENTS - Retrieve the last occurred events * HL_INFO_GET_EVENTS - Retrieve the last occurred events
* HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information. * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
* HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic.
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
*/ */
#define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1 #define HL_INFO_HW_EVENTS 1
...@@ -809,6 +820,8 @@ enum hl_server_type { ...@@ -809,6 +820,8 @@ enum hl_server_type {
#define HL_INFO_GET_EVENTS 30 #define HL_INFO_GET_EVENTS 30
#define HL_INFO_UNDEFINED_OPCODE_EVENT 31 #define HL_INFO_UNDEFINED_OPCODE_EVENT 31
#define HL_INFO_ENGINE_STATUS 32 #define HL_INFO_ENGINE_STATUS 32
#define HL_INFO_PAGE_FAULT_EVENT 33
#define HL_INFO_USER_MAPPINGS 34
#define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16 #define HL_INFO_CARD_NAME_MAX_LEN 16
...@@ -859,6 +872,7 @@ enum hl_server_type { ...@@ -859,6 +872,7 @@ enum hl_server_type {
* @number_of_user_interrupts: The number of interrupts that are available to the userspace * @number_of_user_interrupts: The number of interrupts that are available to the userspace
* application to use. Relevant for Gaudi2 and later. * application to use. Relevant for Gaudi2 and later.
* @device_mem_alloc_default_page_size: default page size used in device memory allocation. * @device_mem_alloc_default_page_size: default page size used in device memory allocation.
* @revision_id: PCI revision ID of the ASIC.
*/ */
struct hl_info_hw_ip_info { struct hl_info_hw_ip_info {
__u64 sram_base_address; __u64 sram_base_address;
...@@ -889,6 +903,12 @@ struct hl_info_hw_ip_info { ...@@ -889,6 +903,12 @@ struct hl_info_hw_ip_info {
__u16 pad2; __u16 pad2;
__u64 reserved4; __u64 reserved4;
__u64 device_mem_alloc_default_page_size; __u64 device_mem_alloc_default_page_size;
__u64 reserved5;
__u64 reserved6;
__u32 reserved7;
__u8 reserved8;
__u8 revision_id;
__u8 pad[2];
}; };
struct hl_info_dram_usage { struct hl_info_dram_usage {
...@@ -896,7 +916,7 @@ struct hl_info_dram_usage { ...@@ -896,7 +916,7 @@ struct hl_info_dram_usage {
__u64 ctx_dram_mem; __u64 ctx_dram_mem;
}; };
#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2 #define HL_BUSY_ENGINES_MASK_EXT_SIZE 4
struct hl_info_hw_idle { struct hl_info_hw_idle {
__u32 is_idle; __u32 is_idle;
...@@ -1071,31 +1091,44 @@ struct hl_info_cs_timeout_event { ...@@ -1071,31 +1091,44 @@ struct hl_info_cs_timeout_event {
__u64 seq; __u64 seq;
}; };
#define HL_RAZWI_PAGE_FAULT 0 #define HL_RAZWI_NA_ENG_ID U16_MAX
#define HL_RAZWI_MMU_ACCESS_ERROR 1 #define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128
#define HL_RAZWI_READ BIT(0)
#define HL_RAZWI_WRITE BIT(1)
#define HL_RAZWI_LBW BIT(2)
#define HL_RAZWI_HBW BIT(3)
#define HL_RAZWI_RR BIT(4)
#define HL_RAZWI_ADDR_DEC BIT(5)
/** /**
* struct hl_info_razwi_event - razwi information. * struct hl_info_razwi_event - razwi information.
* @timestamp: timestamp of razwi. * @timestamp: timestamp of razwi.
* @addr: address which accessing it caused razwi. * @addr: address which accessing it caused razwi.
* @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not * @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not
* have engine id it will be set to U16_MAX. * have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible
* @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible * engines which caused the razwi, it will hold all of them.
* engines which one them caused the razwi. In that case, it will contain the * @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication
* second possible engine id, otherwise it will be set to U16_MAX. * might be common for several engines and there is no way to get the
* @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1, * exact engine. In this way, engine_id array will be filled with all
* otherwise 0. * possible engines caused this razwi. Also, there might be possibility
* @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. * in gaudi, where we don't indication on specific engine, in that case
* @pad: padding to 64 bit. * the value of this parameter will be zero.
* @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation
* HL_RAZWI_WRITE - razwi caused by write operation
* HL_RAZWI_LBW - razwi caused by lbw fabric transaction
* HL_RAZWI_HBW - razwi caused by hbw fabric transaction
* HL_RAZWI_RR - razwi caused by range register
* HL_RAZWI_ADDR_DEC - razwi caused by address decode error
* Note: this data is not supported by all asics, in that case the relevant bits will not
* be set.
*/ */
struct hl_info_razwi_event { struct hl_info_razwi_event {
__s64 timestamp; __s64 timestamp;
__u64 addr; __u64 addr;
__u16 engine_id_1; __u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR];
__u16 engine_id_2; __u16 num_of_possible_engines;
__u8 no_engine_id; __u8 flags;
__u8 error_type; __u8 pad[5];
__u8 pad[2];
}; };
#define MAX_QMAN_STREAMS_INFO 4 #define MAX_QMAN_STREAMS_INFO 4
...@@ -1174,6 +1207,29 @@ struct hl_info_sec_attest { ...@@ -1174,6 +1207,29 @@ struct hl_info_sec_attest {
__u8 pad0[2]; __u8 pad0[2];
}; };
/**
* struct hl_page_fault_info - page fault information.
* @timestamp: timestamp of page fault.
* @addr: address which accessing it caused page fault.
* @engine_id: engine id which caused the page fault, supported only in gaudi3.
*/
struct hl_page_fault_info {
__s64 timestamp;
__u64 addr;
__u16 engine_id;
__u8 pad[6];
};
/**
* struct hl_user_mapping - user mapping information.
* @dev_va: device virtual address.
* @size: virtual address mapping size.
*/
struct hl_user_mapping {
__u64 dev_va;
__u64 size;
};
enum gaudi_dcores { enum gaudi_dcores {
HL_GAUDI_WS_DCORE, HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE, HL_GAUDI_WN_DCORE,
...@@ -1200,6 +1256,8 @@ enum gaudi_dcores { ...@@ -1200,6 +1256,8 @@ enum gaudi_dcores {
* needed, hence updating this variable so user will know the exact amount * needed, hence updating this variable so user will know the exact amount
* of bytes copied by the kernel to the buffer. * of bytes copied by the kernel to the buffer.
* @sec_attest_nonce: Nonce number used for attestation report. * @sec_attest_nonce: Nonce number used for attestation report.
* @array_size: Number of array members copied to user buffer.
* Relevant for HL_INFO_USER_MAPPINGS info ioctl.
* @pad: Padding to 64 bit. * @pad: Padding to 64 bit.
*/ */
struct hl_info_args { struct hl_info_args {
...@@ -1215,6 +1273,7 @@ struct hl_info_args { ...@@ -1215,6 +1273,7 @@ struct hl_info_args {
__u32 eventfd; __u32 eventfd;
__u32 user_buffer_actual_size; __u32 user_buffer_actual_size;
__u32 sec_attest_nonce; __u32 sec_attest_nonce;
__u32 array_size;
}; };
__u32 pad; __u32 pad;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment