Merge tag 'drm-habanalabs-next-2023-03-20' of...

Merge tag 'drm-habanalabs-next-2023-03-20' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next This tag contains habanalabs driver and accel changes for v6.4: - uAPI changes: - Add opcodes to the CS ioctl to allow user to stall/resume specific engines inside Gaudi2. This is to allow the user to perform power testing/measurements when training different topologies. - Expose in the INFO ioctl the amount of device memory that the driver and f/w reserve for themselves. - Expose in the INFO ioctl a bit-mask of the available rotator engines in Gaudi2. This is to align with other engines that are already exposed. - Expose in the INFO ioctl the register's address of the f/w that should be used to trigger interrupts from within the user's code running in the compute engines. - Add a critical-event bit in the eventfd bitmask so the user will know the event that was received was critical, and a reset will now occur - Expose in the INFO ioctl two new opcodes to fetch information on h/w and f/w events. The events recorded are the events that were reported in the eventfd. - New features and improvements: - Add a dedicated interrupt ID in MSI-X in the device to the notification of an unexpected user-related event in Gaudi2. Handle it in the driver by reporting this event. - Allow the user to fetch the device memory current usage even when the device is undergoing compute-reset (a reset type that only clears the compute engines). - Enable graceful reset mechanism for compute-reset. This will give the user a few seconds before the device is reset. For example, the user can, during that time, perform certain device operations (dump data for debug) or close the device in an orderly fashion. - Align the decoder with the rest of the engines in regard to notification to the user about interrupts and in regard to performing graceful reset when needed (instead of immediate reset). - Add support for assert interrupt from the TPC engine. 
- Get the reset type that is necessary to perform per event from the auto-generated irq_map array. - Print the specific reason why a device is still in use when notifying the user about it (after the user closed the device's FD). - Move to threaded IRQ when handling interrupts of workload completions. - Firmware related fixes: - Fix RAZWI event handler to match the newest f/w version. - Read error cause register in dma core events because the f/w doesn't do that. - Increase maximum time to wait for completion of Gaudi2 reset due to f/w bug. - Align to the latest firmware specs. - Enforce the release order of the compute device and dma-buf, i.e., increment the device file refcount for any dma-buf that was exported for that device. This will make sure the compute device release function won't be called until the user closes all the FDs of the relevant dma-bufs. Without this change, closing the device's FD before/without closing the dma-buf's FD would always lead to a hard-reset of the device. - Fix a link in the drm documentation to correctly point to the accel section. 
- Compilation warnings cleanups - Misc bug fixes and code cleanups Signed-off-by: Dave Airlie <airlied@redhat.com> # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCgAdFiEE7TEboABC71LctBLFZR1NuKta54AFAmQYfcAACgkQZR1NuKta # 54DB4Af/SuiHZkVXwr+yHPv9El726rz9ZQD7mQtzNmehWGonwAvz15yqocNMUSbF # JbqE/vrZjvbXrP1Uv5UrlRVdnFHSPV18VnHU4BMS/WOm19SsR6vZ0QOXOoa6/AUb # w+kF3D//DbFI4/mTGfpH5/pzwu51ti8aVktosPFlHIa8iI8CB4/4IV+ivQ8UW4oK # HyDRkIvHdRmER7vGOfhwhsr4zdqSlJBYrv3C3Z1dkSYBPW/5ICbiM1UlKycwdYKI # cajQBSdUQwUCWnI+i8RmSy3kjNO6OE4XRUvTv89F2bQeyK/1rJLG2m2xZR/Ml/o5 # 7Cgvbn0hWZyeqe7OObYiBlSOBSehCA== # =wclm # -----END PGP SIGNATURE----- # gpg: Signature made Tue 21 Mar 2023 01:37:36 AEST # gpg: using RSA key ED311BA00042EF52DCB412C5651D4DB8AB5AE780 # gpg: Can't check signature: No public key From: Oded Gabbay <ogabbay@kernel.org> Link: https://patchwork.freedesktop.org/patch/msgid/20230320154026.GA766126@ogabbay-vm-u20.habana-labs.com

Merge tag 'drm-habanalabs-next-2023-03-20' of...
Merge tag 'drm-habanalabs-next-2023-03-20' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next This tag contains habanalabs driver and accel changes for v6.4: - uAPI changes: - Add opcodes to the CS ioctl to allow user to stall/resume specific engines inside Gaudi2. This is to allow the user to perform power testing/measurements when training different topologies. - Expose in the INFO ioctl the amount of device memory that the driver and f/w reserve for themselves. - Expose in the INFO ioctl a bit-mask of the available rotator engines in Gaudi2. This is to align with other engines that are already exposed. - Expose in the INFO ioctl the register's address of the f/w that should be used to trigger interrupts from within the user's code running in the compute engines. - Add a critical-event bit in the eventfd bitmask so the user will know the event that was received was critical, and a reset will now occur - Expose in the INFO ioctl two new opcodes to fetch information on h/w and f/w events. The events recorded are the events that were reported in the eventfd. - New features and improvements: - Add a dedicated interrupt ID in MSI-X in the device to the notification of an unexpected user-related event in Gaudi2. Handle it in the driver by reporting this event. - Allow the user to fetch the device memory current usage even when the device is undergoing compute-reset (a reset type that only clears the compute engines). - Enable graceful reset mechanism for compute-reset. This will give the user a few seconds before the device is reset. For example, the user can, during that time, perform certain device operations (dump data for debug) or close the device in an orderly fashion. - Align the decoder with the rest of the engines in regard to notification to the user about interrupts and in regard to performing graceful reset when needed (instead of immediate reset). - Add support for assert interrupt from the TPC engine. 
- Get the reset type that is necessary to perform per event from the auto-generated irq_map array. - Print the specific reason why a device is still in use when notifying the user about it (after the user closed the device's FD). - Move to threaded IRQ when handling interrupts of workload completions. - Firmware related fixes: - Fix RAZWI event handler to match the newest f/w version. - Read error cause register in dma core events because the f/w doesn't do that. - Increase maximum time to wait for completion of Gaudi2 reset due to f/w bug. - Align to the latest firmware specs. - Enforce the release order of the compute device and dma-buf, i.e., increment the device file refcount for any dma-buf that was exported for that device. This will make sure the compute device release function won't be called until the user closes all the FDs of the relevant dma-bufs. Without this change, closing the device's FD before/without closing the dma-buf's FD would always lead to a hard-reset of the device. - Fix a link in the drm documentation to correctly point to the accel section. 
- Compilation warnings cleanups - Misc bug fixes and code cleanups Signed-off-by: Dave Airlie <airlied@redhat.com> # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCgAdFiEE7TEboABC71LctBLFZR1NuKta54AFAmQYfcAACgkQZR1NuKta # 54DB4Af/SuiHZkVXwr+yHPv9El726rz9ZQD7mQtzNmehWGonwAvz15yqocNMUSbF # JbqE/vrZjvbXrP1Uv5UrlRVdnFHSPV18VnHU4BMS/WOm19SsR6vZ0QOXOoa6/AUb # w+kF3D//DbFI4/mTGfpH5/pzwu51ti8aVktosPFlHIa8iI8CB4/4IV+ivQ8UW4oK # HyDRkIvHdRmER7vGOfhwhsr4zdqSlJBYrv3C3Z1dkSYBPW/5ICbiM1UlKycwdYKI # cajQBSdUQwUCWnI+i8RmSy3kjNO6OE4XRUvTv89F2bQeyK/1rJLG2m2xZR/Ml/o5 # 7Cgvbn0hWZyeqe7OObYiBlSOBSehCA== # =wclm # -----END PGP SIGNATURE----- # gpg: Signature made Tue 21 Mar 2023 01:37:36 AEST # gpg: using RSA key ED311BA00042EF52DCB412C5651D4DB8AB5AE780 # gpg: Can't check signature: No public key From: Oded Gabbay <ogabbay@kernel.org> Link: https://patchwork.freedesktop.org/patch/msgid/20230320154026.GA766126@ogabbay-vm-u20.habana-labs.com
d36d68fd · Dave Airlie · d240daa2 · 75b44575 · d36d68fd · d36d68fd
Commit d36d68fd authored Mar 22, 2023 by Dave Airlie
30 changed files
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -258,7 +258,7 @@ static int vm_show(struct seq_file *s, void *data)
 	if (!dev_entry->hdev->mmu_enable)
 		return 0;

-	spin_lock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_lock(&dev_entry->ctx_mem_hash_mutex);

 	list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) {
 		once = false;
@@ -329,7 +329,7 @@ static int vm_show(struct seq_file *s, void *data)

 	}

-	spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_unlock(&dev_entry->ctx_mem_hash_mutex);

 	ctx = hl_get_compute_ctx(dev_entry->hdev);
 	if (ctx) {
@@ -1583,209 +1583,216 @@ static const struct file_operations hl_debugfs_fops = {
 	.release = single_release,
 };

-static void add_secured_nodes(struct hl_dbg_device_entry *dev_entry)
+static void add_secured_nodes(struct hl_dbg_device_entry *dev_entry, struct dentry *root)
 {
 	debugfs_create_u8("i2c_bus",
 				0644,
-				dev_entry->root,
+				root,
 				&dev_entry->i2c_bus);

 	debugfs_create_u8("i2c_addr",
 				0644,
-				dev_entry->root,
+				root,
 				&dev_entry->i2c_addr);

 	debugfs_create_u8("i2c_reg",
 				0644,
-				dev_entry->root,
+				root,
 				&dev_entry->i2c_reg);

 	debugfs_create_u8("i2c_len",
 				0644,
-				dev_entry->root,
+				root,
 				&dev_entry->i2c_len);

 	debugfs_create_file("i2c_data",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_i2c_data_fops);

 	debugfs_create_file("led0",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_led0_fops);

 	debugfs_create_file("led1",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_led1_fops);

 	debugfs_create_file("led2",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_led2_fops);
 }

-void hl_debugfs_add_device(struct hl_device *hdev)
+static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_entry *dev_entry,
+				struct dentry *root)
 {
-	struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
 	int count = ARRAY_SIZE(hl_debugfs_list);
 	struct hl_debugfs_entry *entry;
 	int i;

-	dev_entry->hdev = hdev;
-	dev_entry->entry_arr = kmalloc_array(count,
-					sizeof(struct hl_debugfs_entry),
-					GFP_KERNEL);
-	if (!dev_entry->entry_arr)
-		return;
-
-	dev_entry->data_dma_blob_desc.size = 0;
-	dev_entry->data_dma_blob_desc.data = NULL;
-	dev_entry->mon_dump_blob_desc.size = 0;
-	dev_entry->mon_dump_blob_desc.data = NULL;
-
-	INIT_LIST_HEAD(&dev_entry->file_list);
-	INIT_LIST_HEAD(&dev_entry->cb_list);
-	INIT_LIST_HEAD(&dev_entry->cs_list);
-	INIT_LIST_HEAD(&dev_entry->cs_job_list);
-	INIT_LIST_HEAD(&dev_entry->userptr_list);
-	INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list);
-	mutex_init(&dev_entry->file_mutex);
-	init_rwsem(&dev_entry->state_dump_sem);
-	spin_lock_init(&dev_entry->cb_spinlock);
-	spin_lock_init(&dev_entry->cs_spinlock);
-	spin_lock_init(&dev_entry->cs_job_spinlock);
-	spin_lock_init(&dev_entry->userptr_spinlock);
-	spin_lock_init(&dev_entry->ctx_mem_hash_spinlock);
-
-	dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
-						hl_debug_root);
-
 	debugfs_create_x64("memory_scrub_val",
 				0644,
-				dev_entry->root,
+				root,
 				&hdev->memory_scrub_val);

 	debugfs_create_file("memory_scrub",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_mem_scrub_fops);

 	debugfs_create_x64("addr",
 				0644,
-				dev_entry->root,
+				root,
 				&dev_entry->addr);

 	debugfs_create_file("data32",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_data32b_fops);

 	debugfs_create_file("data64",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_data64b_fops);

 	debugfs_create_file("set_power_state",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_power_fops);

 	debugfs_create_file("device",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_device_fops);

 	debugfs_create_file("clk_gate",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_clk_gate_fops);

 	debugfs_create_file("stop_on_err",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_stop_on_err_fops);

 	debugfs_create_file("dump_security_violations",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_security_violations_fops);

 	debugfs_create_file("dump_razwi_events",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_razwi_check_fops);

 	debugfs_create_file("dma_size",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_dma_size_fops);

 	debugfs_create_blob("data_dma",
 				0400,
-				dev_entry->root,
+				root,
 				&dev_entry->data_dma_blob_desc);

 	debugfs_create_file("monitor_dump_trig",
 				0200,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_monitor_dump_fops);

 	debugfs_create_blob("monitor_dump",
 				0400,
-				dev_entry->root,
+				root,
 				&dev_entry->mon_dump_blob_desc);

 	debugfs_create_x8("skip_reset_on_timeout",
 				0644,
-				dev_entry->root,
+				root,
 				&hdev->reset_info.skip_reset_on_timeout);

 	debugfs_create_file("state_dump",
 				0600,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_state_dump_fops);

 	debugfs_create_file("timeout_locked",
 				0644,
-				dev_entry->root,
+				root,
 				dev_entry,
 				&hl_timeout_locked_fops);

 	debugfs_create_u32("device_release_watchdog_timeout",
 				0644,
-				dev_entry->root,
+				root,
 				&hdev->device_release_watchdog_timeout_sec);

 	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 		debugfs_create_file(hl_debugfs_list[i].name,
 					0444,
-					dev_entry->root,
+					root,
 					entry,
 					&hl_debugfs_fops);
 		entry->info_ent = &hl_debugfs_list[i];
 		entry->dev_entry = dev_entry;
 	}
+}
+
+void hl_debugfs_add_device(struct hl_device *hdev)
+{
+	struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
+	int count = ARRAY_SIZE(hl_debugfs_list);
+
+	dev_entry->hdev = hdev;
+	dev_entry->entry_arr = kmalloc_array(count,
+					sizeof(struct hl_debugfs_entry),
+					GFP_KERNEL);
+	if (!dev_entry->entry_arr)
+		return;
+
+	dev_entry->data_dma_blob_desc.size = 0;
+	dev_entry->data_dma_blob_desc.data = NULL;
+	dev_entry->mon_dump_blob_desc.size = 0;
+	dev_entry->mon_dump_blob_desc.data = NULL;
+
+	INIT_LIST_HEAD(&dev_entry->file_list);
+	INIT_LIST_HEAD(&dev_entry->cb_list);
+	INIT_LIST_HEAD(&dev_entry->cs_list);
+	INIT_LIST_HEAD(&dev_entry->cs_job_list);
+	INIT_LIST_HEAD(&dev_entry->userptr_list);
+	INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list);
+	mutex_init(&dev_entry->file_mutex);
+	init_rwsem(&dev_entry->state_dump_sem);
+	spin_lock_init(&dev_entry->cb_spinlock);
+	spin_lock_init(&dev_entry->cs_spinlock);
+	spin_lock_init(&dev_entry->cs_job_spinlock);
+	spin_lock_init(&dev_entry->userptr_spinlock);
+	mutex_init(&dev_entry->ctx_mem_hash_mutex);
+
+	dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
+						hl_debug_root);

+	add_files_to_device(hdev, dev_entry, dev_entry->root);
 	if (!hdev->asic_prop.fw_security_enabled)
-		add_secured_nodes(dev_entry);
+		add_secured_nodes(dev_entry, dev_entry->root);
 }

 void hl_debugfs_remove_device(struct hl_device *hdev)
@@ -1795,6 +1802,7 @@ void hl_debugfs_remove_device(struct hl_device *hdev)

 	debugfs_remove_recursive(entry->root);

+	mutex_destroy(&entry->ctx_mem_hash_mutex);
 	mutex_destroy(&entry->file_mutex);

 	vfree(entry->data_dma_blob_desc.data);
@@ -1901,18 +1909,18 @@ void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
 {
 	struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;

-	spin_lock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_lock(&dev_entry->ctx_mem_hash_mutex);
 	list_add(&ctx->debugfs_list, &dev_entry->ctx_mem_hash_list);
-	spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_unlock(&dev_entry->ctx_mem_hash_mutex);
 }

 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
 {
 	struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;

-	spin_lock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_lock(&dev_entry->ctx_mem_hash_mutex);
 	list_del(&ctx->debugfs_list);
-	spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
+	mutex_unlock(&dev_entry->ctx_mem_hash_mutex);
 }

 /**

--- a/drivers/accel/habanalabs/common/decoder.c
+++ b/drivers/accel/habanalabs/common/decoder.c
@@ -46,7 +46,7 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
 static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
 {
 	bool reset_required = false;
-	u32 irq_status;
+	u32 irq_status, event_mask;

 	irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);

@@ -54,17 +54,27 @@ static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_

 	dec_print_abnrm_intr_source(hdev, irq_status);

-	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK)
-		reset_required = true;
-
 	/* Clear the interrupt */
 	WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);

 	/* Flush the interrupt clear */
 	RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);

-	if (reset_required)
-		hl_device_reset(hdev, HL_DRV_RESET_HARD);
+	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
+		reset_required = true;
+		event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+	} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
+		event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+	} else {
+		event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+	}
+
+	if (reset_required) {
+		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+		hl_device_cond_reset(hdev, 0, event_mask);
+	} else {
+		hl_notifier_event_send_all(hdev, event_mask);
+	}
 }

 static void dec_completion_abnrm(struct work_struct *work)

--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -3152,7 +3152,7 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
 int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
 						dma_addr_t buff, u32 *size)
 {
-	struct cpucp_packet pkt = {0};
+	struct cpucp_packet pkt = {};
 	u64 result;
 	int rc = 0;


--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -12,7 +12,6 @@
 #include "../include/hw_ip/pci/pci_general.h"

 #include <linux/pci.h>
-#include <linux/aer.h>
 #include <linux/module.h>

 #define CREATE_TRACE_POINTS
@@ -221,12 +220,9 @@ int hl_device_open(struct inode *inode, struct file *filp)

 	hl_debugfs_add_file(hpriv);

+	memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
-	atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
-	atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
 	hdev->captured_err_info.undef_opcode.write_enable = true;
-	hdev->captured_err_info.razwi_info.razwi_info_available = false;
-	hdev->captured_err_info.page_fault_info.page_fault_info_available = false;

 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
@@ -237,6 +233,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 out_err:
 	mutex_unlock(&hdev->fpriv_list_lock);
 	hl_mem_mgr_fini(&hpriv->mem_mgr);
+	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	filp->private_data = NULL;
 	mutex_destroy(&hpriv->ctx_lock);
@@ -324,6 +321,7 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev)
 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);

 	hdev->major = hl_major;
+	hdev->hclass = hl_class;
 	hdev->memory_scrub = memory_scrub;
 	hdev->reset_on_lockup = reset_on_lockup;
 	hdev->boot_error_status_mask = boot_error_status_mask;
@@ -550,9 +548,7 @@ static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)

 	pci_set_drvdata(pdev, hdev);

-	pci_enable_pcie_error_reporting(pdev);
-
-	rc = hl_device_init(hdev, hl_class);
+	rc = hl_device_init(hdev);
 	if (rc) {
 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
 		rc = -ENODEV;
@@ -562,7 +558,6 @@ static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;

 disable_device:
-	pci_disable_pcie_error_reporting(pdev);
 	pci_set_drvdata(pdev, NULL);
 	destroy_hdev(hdev);

@@ -585,7 +580,6 @@ static void hl_pci_remove(struct pci_dev *pdev)
 		return;

 	hl_device_fini(hdev);
-	pci_disable_pcie_error_reporting(pdev);
 	pci_set_drvdata(pdev, NULL);
 	destroy_hdev(hdev);
 }

--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -102,11 +102,15 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.mme_master_slave_mode = prop->mme_master_slave_mode;
 	hw_ip.first_available_interrupt_id = prop->first_available_user_interrupt;
 	hw_ip.number_of_user_interrupts = prop->user_interrupt_count;
+	hw_ip.tpc_interrupt_id = prop->tpc_interrupt_id;

 	hw_ip.edma_enabled_mask = prop->edma_enabled_mask;
 	hw_ip.server_type = prop->server_type;
 	hw_ip.security_enabled = prop->fw_security_enabled;
 	hw_ip.revision_id = hdev->pdev->revision;
+	hw_ip.rotator_enabled_mask = prop->rotator_enabled_mask;
+	hw_ip.engine_core_interrupt_reg_addr = prop->engine_core_interrupt_reg_addr;
+	hw_ip.reserved_dram_size = dram_kmd_size;

 	return copy_to_user(out, &hw_ip,
 		min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
@@ -830,6 +834,50 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0;
 }

+static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+	struct hl_device *hdev = hpriv->hdev;
+	u32 user_buf_size = args->return_size;
+	struct hw_err_info *info;
+	int rc;
+
+	if ((!user_buf_size) || (!user_buf))
+		return -EINVAL;
+
+	if (user_buf_size < sizeof(struct hl_info_hw_err_event))
+		return -ENOMEM;
+
+	info = &hdev->captured_err_info.hw_err;
+	if (!info->event_info_available)
+		return -ENOENT;
+
+	rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event));
+	return rc ? -EFAULT : 0;
+}
+
+static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+	struct hl_device *hdev = hpriv->hdev;
+	u32 user_buf_size = args->return_size;
+	struct fw_err_info *info;
+	int rc;
+
+	if ((!user_buf_size) || (!user_buf))
+		return -EINVAL;
+
+	if (user_buf_size < sizeof(struct hl_info_fw_err_event))
+		return -ENOMEM;
+
+	info = &hdev->captured_err_info.fw_err;
+	if (!info->event_info_available)
+		return -ENOENT;
+
+	rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event));
+	return rc ? -EFAULT : 0;
+}
+
 static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
 {
 	void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@@ -950,6 +998,14 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_UNREGISTER_EVENTFD:
 		return eventfd_unregister(hpriv, args);

+	case HL_INFO_HW_ERR_EVENT:
+		return hw_err_info(hpriv, args);
+
+	case HL_INFO_FW_ERR_EVENT:
+		return fw_err_info(hpriv, args);
+
+	case HL_INFO_DRAM_USAGE:
+		return dram_usage_info(hpriv, args);
 	default:
 		break;
 	}
@@ -962,10 +1018,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	}

 	switch (args->op) {
-	case HL_INFO_DRAM_USAGE:
-		rc = dram_usage_info(hpriv, args);
-		break;
-
 	case HL_INFO_HW_IDLE:
 		rc = hw_idle(hdev, args);
 		break;

--- a/drivers/accel/habanalabs/common/irq.c
+++ b/drivers/accel/habanalabs/common/irq.c
@@ -280,7 +280,6 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
 	struct list_head *ts_reg_free_list_head = NULL;
 	struct timestamp_reg_work_obj *job;
 	bool reg_node_handle_fail = false;
-	ktime_t now = ktime_get();
 	int rc;

 	/* For registration nodes:
@@ -303,13 +302,13 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
 			if (pend->ts_reg_info.buf) {
 				if (!reg_node_handle_fail) {
 					rc = handle_registration_node(hdev, pend,
-								&ts_reg_free_list_head, now);
+							&ts_reg_free_list_head, intr->timestamp);
 					if (rc)
 						reg_node_handle_fail = true;
 				}
 			} else {
 				/* Handle wait target value node */
-				pend->fence.timestamp = now;
+				pend->fence.timestamp = intr->timestamp;
 				complete_all(&pend->fence.completion);
 			}
 		}
@@ -326,6 +325,26 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
 	}
 }

+static void handle_tpc_interrupt(struct hl_device *hdev)
+{
+	u64 event_mask;
+	u32 flags;
+
+	event_mask = HL_NOTIFIER_EVENT_TPC_ASSERT |
+		HL_NOTIFIER_EVENT_USER_ENGINE_ERR |
+		HL_NOTIFIER_EVENT_DEVICE_RESET;
+
+	flags = HL_DRV_RESET_DELAY;
+
+	dev_err_ratelimited(hdev->dev, "Received TPC assert\n");
+	hl_device_cond_reset(hdev, flags, event_mask);
+}
+
+static void handle_unexpected_user_interrupt(struct hl_device *hdev)
+{
+	dev_err_ratelimited(hdev->dev, "Received unexpected user error interrupt\n");
+}
+
 /**
 * hl_irq_handler_user_interrupt - irq handler for user interrupts
 *
@@ -334,6 +353,23 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru
 *
 */
 irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
+{
+	struct hl_user_interrupt *user_int = arg;
+
+	user_int->timestamp = ktime_get();
+
+	return IRQ_WAKE_THREAD;
+}
+
+/**
+ * hl_irq_user_interrupt_thread_handler - irq thread handler for user interrupts.
+ * This function is invoked by threaded irq mechanism
+ *
+ * @irq: irq number
+ * @arg: pointer to user interrupt structure
+ *
+ */
+irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
 {
 	struct hl_user_interrupt *user_int = arg;
 	struct hl_device *hdev = user_int->hdev;
@@ -351,6 +387,12 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 		/* Handle decoder interrupt registered on this specific irq */
 		handle_user_interrupt(hdev, user_int);
 		break;
+	case HL_USR_INTERRUPT_TPC:
+		handle_tpc_interrupt(hdev);
+		break;
+	case HL_USR_INTERRUPT_UNEXPECTED:
+		handle_unexpected_user_interrupt(hdev);
+		break;
 	default:
 		break;
 	}
@@ -358,24 +400,6 @@ irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg)
 	return IRQ_HANDLED;
 }

-/**
- * hl_irq_handler_default - default irq handler
- *
- * @irq: irq number
- * @arg: pointer to user interrupt structure
- *
- */
-irqreturn_t hl_irq_handler_default(int irq, void *arg)
-{
-	struct hl_user_interrupt *user_interrupt = arg;
-	struct hl_device *hdev = user_interrupt->hdev;
-	u32 interrupt_id = user_interrupt->interrupt_id;
-
-	dev_err(hdev->dev, "got invalid user interrupt %u", interrupt_id);
-
-	return IRQ_HANDLED;
-}
-
 /**
 * hl_irq_handler_eq - irq handler for event queue
 *
@@ -405,11 +429,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)

 		cur_eqe_index = FIELD_GET(EQ_CTL_INDEX_MASK, cur_eqe);
 		if ((hdev->event_queue.check_eqe_index) &&
-				(((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK)
-							!= cur_eqe_index)) {
+				(((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK) != cur_eqe_index)) {
 			dev_dbg(hdev->dev,
-				"EQE 0x%x in queue is ready but index does not match %d!=%d",
-				eq_base[eq->ci].hdr.ctl,
+				"EQE %#x in queue is ready but index does not match %d!=%d",
+				cur_eqe,
 				((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK),
 				cur_eqe_index);
 			break;

--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -235,10 +235,8 @@ static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
 	}

 	rc = hl_pin_host_memory(hdev, addr, size, userptr);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to pin host memory\n");
+	if (rc)
 		goto pin_err;
-	}

 	userptr->dma_mapped = true;
 	userptr->dir = DMA_BIDIRECTIONAL;
@@ -1097,10 +1095,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
 			huge_page_size = hdev->asic_prop.pmmu_huge.page_size;

 		rc = dma_map_host_va(hdev, addr, size, &userptr);
-		if (rc) {
-			dev_err(hdev->dev, "failed to get userptr from va\n");
+		if (rc)
 			return rc;
-		}

 		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
 				&phys_pg_pack, false);
@@ -1270,6 +1266,18 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
 	return rc;
 }

+/* Should be called while the context's mem_hash_lock is taken */
+static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)
+{
+	struct hl_vm_hash_node *hnode;
+
+	hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
+		if (vaddr == hnode->vaddr)
+			return hnode;
+
+	return NULL;
+}
+
 /**
 * unmap_device_va() - unmap the given device virtual address.
 * @ctx: pointer to the context structure.
@@ -1285,10 +1293,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 {
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
 	u64 vaddr = args->unmap.device_virt_addr;
-	struct hl_vm_hash_node *hnode = NULL;
 	struct asic_fixed_properties *prop;
 	struct hl_device *hdev = ctx->hdev;
 	struct hl_userptr *userptr = NULL;
+	struct hl_vm_hash_node *hnode;
 	struct hl_va_range *va_range;
 	enum vm_type *vm_type;
 	bool is_userptr;
@@ -1298,15 +1306,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,

 	/* protect from double entrance */
 	mutex_lock(&ctx->mem_hash_lock);
-	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
-		if (vaddr == hnode->vaddr)
-			break;
-
+	hnode = get_vm_hash_node_locked(ctx, vaddr);
 	if (!hnode) {
 		mutex_unlock(&ctx->mem_hash_lock);
-		dev_err(hdev->dev,
-			"unmap failed, no mem hnode for vaddr 0x%llx\n",
-			vaddr);
+		dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
 		return -EINVAL;
 	}

@@ -1779,6 +1782,44 @@ static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
 	kfree(sgt);
 }

+static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm_hash_node *hnode;
+
+	/* get the memory handle */
+	mutex_lock(&ctx->mem_hash_lock);
+	hnode = get_vm_hash_node_locked(ctx, addr);
+	if (!hnode) {
+		mutex_unlock(&ctx->mem_hash_lock);
+		dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (upper_32_bits(hnode->handle)) {
+		mutex_unlock(&ctx->mem_hash_lock);
+		dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
+				hnode->handle, addr);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * node found, increase export count so this memory cannot be unmapped
+	 * and the hash node cannot be deleted.
+	 */
+	hnode->export_cnt++;
+	mutex_unlock(&ctx->mem_hash_lock);
+
+	return hnode;
+}
+
+static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
+{
+	mutex_lock(&ctx->mem_hash_lock);
+	hnode->export_cnt--;
+	mutex_unlock(&ctx->mem_hash_lock);
+}
+
 static void hl_release_dmabuf(struct dma_buf *dmabuf)
 {
 	struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
@@ -1789,13 +1830,15 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)

 	ctx = hl_dmabuf->ctx;

-	if (hl_dmabuf->memhash_hnode) {
-		mutex_lock(&ctx->mem_hash_lock);
-		hl_dmabuf->memhash_hnode->export_cnt--;
-		mutex_unlock(&ctx->mem_hash_lock);
-	}
+	if (hl_dmabuf->memhash_hnode)
+		memhash_node_export_put(ctx, hl_dmabuf->memhash_hnode);

+	atomic_dec(&ctx->hdev->dmabuf_export_cnt);
 	hl_ctx_put(ctx);
+
+	/* Paired with get_file() in export_dmabuf() */
+	fput(ctx->hpriv->filp);
+
 	kfree(hl_dmabuf);
 }

@@ -1834,6 +1877,13 @@ static int export_dmabuf(struct hl_ctx *ctx,

 	hl_dmabuf->ctx = ctx;
 	hl_ctx_get(hl_dmabuf->ctx);
+	atomic_inc(&ctx->hdev->dmabuf_export_cnt);
+
+	/* Get compute device file to enforce release order, such that all exported dma-buf will be
+	 * released first and only then the compute device.
+	 * Paired with fput() in hl_release_dmabuf().
+	 */
+	get_file(ctx->hpriv->filp);

 	*dmabuf_fd = fd;

@@ -1933,47 +1983,6 @@ static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 s
 	return 0;
 }

-static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct hl_vm_hash_node *hnode;
-
-	/* get the memory handle */
-	mutex_lock(&ctx->mem_hash_lock);
-	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)addr)
-		if (addr == hnode->vaddr)
-			break;
-
-	if (!hnode) {
-		mutex_unlock(&ctx->mem_hash_lock);
-		dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (upper_32_bits(hnode->handle)) {
-		mutex_unlock(&ctx->mem_hash_lock);
-		dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
-				hnode->handle, addr);
-		return ERR_PTR(-EINVAL);
-	}
-
-	/*
-	 * node found, increase export count so this memory cannot be unmapped
-	 * and the hash node cannot be deleted.
-	 */
-	hnode->export_cnt++;
-	mutex_unlock(&ctx->mem_hash_lock);
-
-	return hnode;
-}
-
-static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
-{
-	mutex_lock(&ctx->mem_hash_lock);
-	hnode->export_cnt--;
-	mutex_unlock(&ctx->mem_hash_lock);
-}
-
 static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
 							struct hl_vm_hash_node *hnode)
 {
@@ -2221,11 +2230,11 @@ static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
 * allocate_timestamps_buffers() - allocate timestamps buffers
 * This function will allocate ts buffer that will later on be mapped to the user
 * in order to be able to read the timestamp.
- * in additon it'll allocate an extra buffer for registration management.
+ * in addition it'll allocate an extra buffer for registration management.
 * since we cannot fail during registration for out-of-memory situation, so
 * we'll prepare a pool which will be used as user interrupt nodes and instead
 * of dynamically allocating nodes while registration we'll pick the node from
- * this pool. in addtion it'll add node to the mapping hash which will be used
+ * this pool. in addition it'll add node to the mapping hash which will be used
 * to map user ts buffer to the internal kernel ts buffer.
 * @hpriv: pointer to the private data of the fd
 * @args: ioctl input

--- a/drivers/accel/habanalabs/common/memory_mgr.c
+++ b/drivers/accel/habanalabs/common/memory_mgr.c
@@ -275,7 +275,7 @@ int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,

 	if (atomic_cmpxchg(&buf->mmap, 0, 1)) {
 		dev_err(mmg->dev,
-			"%s, Memory mmap failed, already mmaped to user\n",
+			"%s, Memory mmap failed, already mapped to user\n",
 			buf->behavior->topic);
 		rc = -EINVAL;
 		goto put_mem;
@@ -341,8 +341,19 @@ void hl_mem_mgr_fini(struct hl_mem_mgr *mmg)
 				"%s: Buff handle %u for CTX is still alive\n",
 				topic, id);
 	}
+}

-	/* TODO: can it happen that some buffer is still in use at this point? */
+/**
+ * hl_mem_mgr_idr_destroy() - destroy memory manager IDR.
+ * @mmg: parent unified memory manager
+ *
+ * Destroy the memory manager IDR.
+ * Shall be called when IDR is empty and no memory buffers are in use.
+ */
+void hl_mem_mgr_idr_destroy(struct hl_mem_mgr *mmg)
+{
+	if (!idr_is_empty(&mmg->handles))
+		dev_crit(mmg->dev, "memory manager IDR is destroyed while it is not empty!\n");

 	idr_destroy(&mmg->handles);
 }
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -540,8 +540,8 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 		u32 page_off;

 		/*
-		 * Bit arithmetics cannot be used for non power of two page
-		 * sizes. In addition, since bit arithmetics is not used,
+		 * Bit arithmetic cannot be used for non power of two page
+		 * sizes. In addition, since bit arithmetic is not used,
 		 * we cannot ignore dram base. All that shall be considered.
 		 */

@@ -757,7 +757,7 @@ u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
 * @mmu_prop: MMU properties.
 * @hop_idx: HOP index.
 * @hop_addr: HOP address.
- * @virt_addr: virtual address fro the translation.
+ * @virt_addr: virtual address for the translation.
 *
 * @return the matching PTE value on success, otherwise U64_MAX.
 */

--- a/drivers/accel/habanalabs/common/security.c
+++ b/drivers/accel/habanalabs/common/security.c
@@ -502,7 +502,7 @@ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		u32 num_instances, u32 instance_offset,
 		const u32 pb_blocks[], u32 blocks_array_size,
-		const struct range *regs_range_array, u32 regs_range_array_size)
+		const struct range *user_regs_range_array, u32 user_regs_range_array_size)
 {
 	int i;
 	struct hl_block_glbl_sec *glbl_sec;
@@ -514,8 +514,8 @@ int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset,
 		return -ENOMEM;

 	hl_secure_block(hdev, glbl_sec, blocks_array_size);
-	hl_unsecure_registers_range(hdev, regs_range_array,
-			regs_range_array_size, 0, pb_blocks, glbl_sec,
+	hl_unsecure_registers_range(hdev, user_regs_range_array,
+			user_regs_range_array_size, 0, pb_blocks, glbl_sec,
 			blocks_array_size);

 	/* Fill all blocks with the same configuration */

--- a/drivers/accel/habanalabs/common/security.h
+++ b/drivers/accel/habanalabs/common/security.h
@@ -10,7 +10,7 @@

 #include <linux/io-64-nonatomic-lo-hi.h>

-extern struct hl_device *hdev;
+struct hl_device;

 /* special blocks */
 #define HL_MAX_NUM_OF_GLBL_ERR_CAUSE		10

--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -656,6 +656,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->cfg_size = CFG_SIZE;
 	prop->max_asid = MAX_ASID;
 	prop->num_of_events = GAUDI_EVENT_SIZE;
+	prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE;
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;

 	set_default_power_values(hdev);
@@ -679,6 +680,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 			(num_sync_stream_queues * HL_RSVD_MONS);

 	prop->first_available_user_interrupt = USHRT_MAX;
+	prop->tpc_interrupt_id = USHRT_MAX;

 	for (i = 0 ; i < HL_MAX_DCORES ; i++)
 		prop->first_available_cq[i] = USHRT_MAX;
@@ -867,13 +869,18 @@ static int gaudi_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}

 	if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}

 	return 0;
@@ -3718,7 +3725,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to set hop0 addr for asid %d\n", i);
-			goto err;
+			return rc;
 		}
 	}

@@ -3729,7 +3736,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	/* mem cache invalidation */
 	WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);

-	hl_mmu_invalidate_cache(hdev, true, 0);
+	rc = hl_mmu_invalidate_cache(hdev, true, 0);
+	if (rc)
+		return rc;

 	WREG32(mmMMU_UP_MMU_ENABLE, 1);
 	WREG32(mmMMU_UP_SPI_MASK, 0xF);
@@ -3745,9 +3754,6 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	gaudi->hw_cap_initialized |= HW_CAP_MMU;

 	return 0;
-
-err:
-	return rc;
 }

 static int gaudi_load_firmware_to_device(struct hl_device *hdev)
@@ -4068,7 +4074,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
 	return rc;
 }

-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+static int gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	struct cpu_dyn_regs *dyn_regs =
 			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4078,7 +4084,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset

 	if (!hard_reset) {
 		dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
-		return;
+		return 0;
 	}

 	if (hdev->pldm) {
@@ -4199,10 +4205,10 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
 	msleep(reset_timeout_ms);

 	status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM);
-	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK)
-		dev_err(hdev->dev,
-			"Timeout while waiting for device to reset 0x%x\n",
-			status);
+	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) {
+		dev_err(hdev->dev, "Timeout while waiting for device to reset 0x%x\n", status);
+		return -ETIMEDOUT;
+	}

 	if (gaudi) {
 		gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | HW_CAP_HBM |
@@ -4215,6 +4221,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset

 		hdev->device_cpu_is_halted = false;
 	}
+	return 0;
 }

 static int gaudi_suspend(struct hl_device *hdev)
@@ -7297,7 +7304,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
 }

 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
-					bool razwi, u64 *event_mask)
+					bool check_razwi, u64 *event_mask)
 {
 	bool is_read = false, is_write = false;
 	u16 engine_id[2], num_of_razwi_eng = 0;
@@ -7316,7 +7323,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 	dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
 		event_type, desc);

-	if (razwi) {
+	if (check_razwi) {
 		gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
 						&is_write);
 		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask);
@@ -7333,8 +7340,9 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 				num_of_razwi_eng = 1;
 		}

-		hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
-				event_mask);
+		if (razwi_flags)
+			hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng,
+					razwi_flags, event_mask);
 	}
 }

@@ -7633,6 +7641,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type,
 static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct hl_info_fw_err_info fw_err_info;
 	u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0;
 	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
 	u32 fw_fatal_err_flag = 0, flags = 0;
@@ -7911,7 +7920,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	case GAUDI_EVENT_FW_ALIVE_S:
 		gaudi_print_irq_info(hdev, event_type, false, &event_mask);
 		gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
-		event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+		fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR;
+		fw_err_info.event_id = event_type;
+		fw_err_info.event_mask = &event_mask;
+		hl_handle_fw_err(hdev, &fw_err_info);
 		goto reset_device;

 	default:
@@ -7942,6 +7954,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
 	}

 	if (reset_required) {
+		/* escalate general hw errors to critical/fatal error */
+		if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
+			hl_handle_critical_hw_err(hdev, event_type, &event_mask);
+
 		hl_device_cond_reset(hdev, flags, event_mask);
 	} else {
 		hl_fw_unmask_irq(hdev, event_type);
@@ -8403,19 +8419,26 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
 	}

 	mutex_lock(&hdev->mmu_lock);
+
 	rc = hl_mmu_map_contiguous(ctx, hdev->internal_cb_va_base,
 			hdev->internal_cb_pool_dma_addr,
 			HOST_SPACE_INTERNAL_CB_SZ);
-
-	hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
-	mutex_unlock(&hdev->mmu_lock);
-
 	if (rc)
 		goto unreserve_internal_cb_pool;

+	rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
+	if (rc)
+		goto unmap_internal_cb_pool;
+
+	mutex_unlock(&hdev->mmu_lock);
+
 	return 0;

+unmap_internal_cb_pool:
+	hl_mmu_unmap_contiguous(ctx, hdev->internal_cb_va_base,
+			HOST_SPACE_INTERNAL_CB_SZ);
 unreserve_internal_cb_pool:
+	mutex_unlock(&hdev->mmu_lock);
 	hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base,
 			HOST_SPACE_INTERNAL_CB_SZ);
 destroy_internal_cb_pool:

--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -387,6 +387,8 @@ enum gaudi2_edma_id {
 * We have 64 CQ's per dcore, CQ0 in dcore 0 is reserved for legacy mode
 */
 #define GAUDI2_NUM_USER_INTERRUPTS 255
+#define GAUDI2_NUM_RESERVED_INTERRUPTS 1
+#define GAUDI2_TOTAL_USER_INTERRUPTS (GAUDI2_NUM_USER_INTERRUPTS + GAUDI2_NUM_RESERVED_INTERRUPTS)

 enum gaudi2_irq_num {
 	GAUDI2_IRQ_NUM_EVENT_QUEUE = GAUDI2_EVENT_QUEUE_MSIX_IDX,
@@ -410,12 +412,15 @@ enum gaudi2_irq_num {
 	GAUDI2_IRQ_NUM_SHARED_DEC0_ABNRM,
 	GAUDI2_IRQ_NUM_SHARED_DEC1_NRM,
 	GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM,
+	GAUDI2_IRQ_NUM_DEC_LAST = GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM,
 	GAUDI2_IRQ_NUM_COMPLETION,
 	GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
 	GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
+	GAUDI2_IRQ_NUM_TPC_ASSERT,
 	GAUDI2_IRQ_NUM_RESERVED_FIRST,
-	GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_NUM_USER_INTERRUPTS - 1),
-	GAUDI2_IRQ_NUM_USER_FIRST,
+	GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
+	GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
+	GAUDI2_IRQ_NUM_USER_FIRST = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR + 1,
 	GAUDI2_IRQ_NUM_USER_LAST = (GAUDI2_IRQ_NUM_USER_FIRST + GAUDI2_NUM_USER_INTERRUPTS - 1),
 	GAUDI2_IRQ_NUM_LAST = (GAUDI2_MSIX_ENTRIES - 1)
 };

--- a/drivers/accel/habanalabs/gaudi2/gaudi2_coresight.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2_coresight.c
@@ -2657,7 +2657,7 @@ int gaudi2_coresight_init(struct hl_device *hdev)
 	/*
 	 * Mask out all the disabled binned offsets.
 	 * so when user request to configure a binned or masked out component,
-	 * driver will ignore programing it ( happens when offset value is set to 0x0 )
+	 * driver will ignore programming it ( happens when offset value is set to 0x0 )
 	 * this is being set in gaudi2_coresight_set_disabled_components
 	 */


--- a/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h
@@ -79,7 +79,6 @@
 			DCORE0_MME_CTRL_LO_ARCH_STATUS_QM_RDY_MASK)

 #define TPC_IDLE_MASK	(DCORE0_TPC0_CFG_STATUS_SCALAR_PIPE_EMPTY_MASK | \
-			DCORE0_TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK | \
 			DCORE0_TPC0_CFG_STATUS_IQ_EMPTY_MASK | \
 			DCORE0_TPC0_CFG_STATUS_SB_EMPTY_MASK | \
 			DCORE0_TPC0_CFG_STATUS_QM_IDLE_MASK | \
@@ -87,6 +86,8 @@

 #define DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK 0x100

+#define DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK 0x40
+
 /* CGM_IDLE_MASK is valid for all engines CGM idle check */
 #define CGM_IDLE_MASK	DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK


--- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c
@@ -1595,6 +1595,7 @@ static const u32 gaudi2_pb_dcr0_tpc0_unsecured_regs[] = {
 	mmDCORE0_TPC0_CFG_KERNEL_SRF_30,
 	mmDCORE0_TPC0_CFG_KERNEL_SRF_31,
 	mmDCORE0_TPC0_CFG_TPC_SB_L0CD,
+	mmDCORE0_TPC0_CFG_TPC_ID,
 	mmDCORE0_TPC0_CFG_QM_KERNEL_ID_INC,
 	mmDCORE0_TPC0_CFG_QM_TID_BASE_SIZE_HIGH_DIM_0,
 	mmDCORE0_TPC0_CFG_QM_TID_BASE_SIZE_HIGH_DIM_1,

--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -472,6 +472,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	prop->max_pending_cs = GOYA_MAX_PENDING_CS;

 	prop->first_available_user_interrupt = USHRT_MAX;
+	prop->tpc_interrupt_id = USHRT_MAX;

 	for (i = 0 ; i < HL_MAX_DCORES ; i++)
 		prop->first_available_cq[i] = USHRT_MAX;
@@ -668,13 +669,18 @@ static int goya_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}

 	if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}

 	if (!hdev->pldm) {
@@ -2782,7 +2788,7 @@ static int goya_hw_init(struct hl_device *hdev)
 	return rc;
 }

-static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+static int goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 reset_timeout_ms, cpu_timeout_ms, status;
@@ -2828,17 +2834,17 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 	msleep(reset_timeout_ms);

 	status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM);
-	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK)
-		dev_err(hdev->dev,
-			"Timeout while waiting for device to reset 0x%x\n",
-			status);
+	if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) {
+		dev_err(hdev->dev, "Timeout while waiting for device to reset 0x%x\n", status);
+		return -ETIMEDOUT;
+	}

 	if (!hard_reset && goya) {
 		goya->hw_cap_initialized &= ~(HW_CAP_DMA | HW_CAP_MME |
 						HW_CAP_GOLDEN | HW_CAP_TPC);
 		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
 				GOYA_ASYNC_EVENT_ID_SOFT_RESET);
-		return;
+		return 0;
 	}

 	/* Chicken bit to re-initiate boot sequencer flow */
@@ -2857,6 +2863,7 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)

 		memset(goya->events_stat, 0, sizeof(goya->events_stat));
 	}
+	return 0;
 }

 int goya_suspend(struct hl_device *hdev)

--- a/drivers/accel/habanalabs/include/common/cpucp_if.h
+++ b/drivers/accel/habanalabs/include/common/cpucp_if.h
@@ -357,6 +357,7 @@ struct hl_eq_addr_dec_intr_data {
 struct hl_eq_entry {
 	struct hl_eq_header hdr;
 	union {
+		__le64 data_placeholder;
 		struct hl_eq_ecc_data ecc_data;
 		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Gaudi1 HBM */
 		struct hl_eq_sm_sei_data sm_sei_data;
@@ -661,6 +662,9 @@ enum pq_init_status {
 * CPUCP_PACKET_ACTIVE_STATUS_SET -
 *       LKD sends FW indication whether device is free or in use, this indication is reported
 *       also to the BMC.
+ *
+ * CPUCP_PACKET_REGISTER_INTERRUPTS -
+ *       Packet to register interrupts indicating LKD is ready to receive events from FW.
 */

 enum cpucp_packet_id {
@@ -725,6 +729,8 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_RESERVED9,			/* not used */
 	CPUCP_PACKET_RESERVED10,		/* not used */
 	CPUCP_PACKET_RESERVED11,		/* not used */
+	CPUCP_PACKET_RESERVED12,		/* internal */
+	CPUCP_PACKET_REGISTER_INTERRUPTS,	/* internal */
 	CPUCP_PACKET_ID_MAX			/* must be last */
 };

@@ -1127,6 +1133,7 @@ struct cpucp_security_info {
 *                     (0 = functional 1 = binned)
 * @interposer_version: Interposer version programmed in eFuse
 * @substrate_version: Substrate version programmed in eFuse
+ * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
 * @fw_os_version: Firmware OS Version
 */
 struct cpucp_info {
@@ -1154,7 +1161,7 @@ struct cpucp_info {
 	__u8 substrate_version;
 	__u8 reserved2;
 	struct cpucp_security_info sec_info;
-	__le32 reserved3;
+	__le32 fw_hbm_region_size;
 	__u8 pll_map[PLL_MAP_LEN];
 	__le64 mme_binning_mask;
 	__u8 fw_os_version[VERSION_MAX_LEN];

--- a/drivers/accel/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/accel/habanalabs/include/common/hl_boot_if.h
@@ -770,15 +770,23 @@ enum hl_components {
 	HL_COMPONENTS_ARMCP,
 	HL_COMPONENTS_CPLD,
 	HL_COMPONENTS_UBOOT,
+	HL_COMPONENTS_FUSE,
 	HL_COMPONENTS_MAX_NUM = 16
 };

+#define NAME_MAX_LEN	32 /* bytes */
+struct hl_module_data {
+	__u8 name[NAME_MAX_LEN];
+	__u8 version[VERSION_MAX_LEN];
+};
+
 /**
 * struct hl_component_versions - versions associated with hl component.
 * @struct_size: size of all the struct (including dynamic size of modules).
 * @modules_offset: offset of the modules field in this struct.
 * @component: version of the component itself.
 * @fw_os: Firmware OS Version.
+ * @comp_name: Name of the component.
 * @modules_mask: i'th bit (from LSB) is a flag - on if module i in enum
 *              hl_modules is used.
 * @modules_counter: number of set bits in modules_mask.
@@ -791,45 +799,14 @@ struct hl_component_versions {
 	__le16 modules_offset;
 	__u8 component[VERSION_MAX_LEN];
 	__u8 fw_os[VERSION_MAX_LEN];
+	__u8 comp_name[NAME_MAX_LEN];
 	__le16 modules_mask;
 	__u8 modules_counter;
 	__u8 reserved[1];
-	__u8 modules[][VERSION_MAX_LEN];
-};
-
-/**
- * struct hl_fw_versions - all versions (fuse, cpucp's components with their
- *              modules)
- * @struct_size: size of all the struct (including dynamic size of components).
- * @components_offset: offset of the components field in this struct.
- * @fuse: silicon production FUSE information.
- * @components_mask: i'th bit (from LSB) is a flag - on if component i in enum
- *              hl_components is used.
- * @components_counter: number of set bits in components_mask.
- * @reserved: reserved for future use.
- * @components: versions of hl components. Index i corresponds to the i'th bit
- *              that is *on* in components_mask. For example, if
- *              components_mask=0b101, then *components represents arcpid and
- *              *(hl_component_versions*)((char*)components + 1') represents
- *              preboot, where 1' = components[0].struct_size.
- */
-struct hl_fw_versions {
-	__le16 struct_size;
-	__le16 components_offset;
-	__u8 fuse[VERSION_MAX_LEN];
-	__le16 components_mask;
-	__u8 components_counter;
-	__u8 reserved[1];
-	struct hl_component_versions components[];
+	struct hl_module_data modules[];
 };

-/* Max size of struct hl_component_versions */
-#define HL_COMPONENT_VERSIONS_MAX_SIZE \
-	(sizeof(struct hl_component_versions) + HL_MODULES_MAX_NUM * \
-	 VERSION_MAX_LEN)
-
-/* Max size of struct hl_fw_versions */
-#define HL_FW_VERSIONS_MAX_SIZE (sizeof(struct hl_fw_versions) + \
-		HL_COMPONENTS_MAX_NUM * HL_COMPONENT_VERSIONS_MAX_SIZE)
+/* Max size of fit size */
+#define HL_FW_VERSIONS_FIT_SIZE	4096

 #endif /* HL_BOOT_IF_H */
--- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -164,6 +164,8 @@

 #define mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR	0x4800040

+#define mmDCORE0_TPC0_EML_CFG_DBG_CNT		0x40000
+
 #define SM_OBJS_PROT_BITS_OFFS			0x14000

 #define DCORE_OFFSET			(mmDCORE1_TPC0_QM_BASE - mmDCORE0_TPC0_QM_BASE)
@@ -185,7 +187,10 @@
 #define TPC_CFG_STALL_ON_ERR_OFFSET	(mmDCORE0_TPC0_CFG_STALL_ON_ERR - mmDCORE0_TPC0_CFG_BASE)
 #define TPC_CFG_TPC_INTR_MASK_OFFSET	(mmDCORE0_TPC0_CFG_TPC_INTR_MASK - mmDCORE0_TPC0_CFG_BASE)
 #define TPC_CFG_MSS_CONFIG_OFFSET	(mmDCORE0_TPC0_CFG_MSS_CONFIG - mmDCORE0_TPC0_CFG_BASE)
+#define TPC_EML_CFG_DBG_CNT_OFFSET	(mmDCORE0_TPC0_EML_CFG_DBG_CNT - mmDCORE0_TPC0_EML_CFG_BASE)

+#define EDMA_CORE_CFG_STALL_OFFSET	(mmDCORE0_EDMA0_CORE_CFG_1 - mmDCORE0_EDMA0_CORE_BASE)
+#define MME_CTRL_LO_QM_STALL_OFFSET	(mmDCORE0_MME_CTRL_LO_QM_STALL - mmDCORE0_MME_CTRL_LO_BASE)
 #define MME_ACC_INTR_MASK_OFFSET	(mmDCORE0_MME_ACC_INTR_MASK - mmDCORE0_MME_ACC_BASE)
 #define MME_ACC_WR_AXI_AGG_COUT0_OFFSET	(mmDCORE0_MME_ACC_WR_AXI_AGG_COUT0 - mmDCORE0_MME_ACC_BASE)
 #define MME_ACC_WR_AXI_AGG_COUT1_OFFSET	(mmDCORE0_MME_ACC_WR_AXI_AGG_COUT1 - mmDCORE0_MME_ACC_BASE)

--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2.h
@@ -63,6 +63,8 @@
 #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_START	0xFFF0F80000000000ull
 #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_END	0xFFF0FFFFFFFFFFFFull

+#define RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT	256
+
 #define GAUDI2_MSIX_ENTRIES	512

 #define QMAN_PQ_ENTRY_SIZE	16			/* Bytes */

--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_events.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_events.h
 /* SPDX-License-Identifier: GPL-2.0
 *
- * Copyright 2018-2021 HabanaLabs, Ltd.
+ * Copyright 2018-2022 HabanaLabs, Ltd.
 * All Rights Reserved.
 *
 */
@@ -958,7 +958,7 @@ enum gaudi2_async_event_id {
 	GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318,
 	GAUDI2_EVENT_ARC_DCCM_FULL = 1319,
 	GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320,
-	GAUDI2_EVENT_DEV_RESET_REQ = 1321,
+	GAUDI2_EVENT_CPU_DEV_RESET_REQ = 1321,
 	GAUDI2_EVENT_SIZE,
 };


--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
@@ -63,7 +63,10 @@ struct gaudi2_cold_rst_data {
 			u32 fake_sig_validation_en : 1;
 			u32 bist_skip_enable : 1;
 			u32 bist_need_iatu_config : 1;
-			u32 reserved : 24;
+			u32 fake_bis_compliant : 1;
+			u32 wd_rst_cause_arm : 1;
+			u32 wd_rst_cause_arcpid : 1;
+			u32 reserved : 21;
 		};
 		__le32 data;
 	};

--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -408,7 +408,8 @@ static inline bool drm_is_render_client(const struct drm_file *file_priv)
 * Returns true if this is an open file of the compute acceleration node, i.e.
 * &drm_file.minor of @file_priv is a accel minor.
 *
- * See also the :ref:`section on accel nodes <drm_accel_node>`.
+ * See also :doc:`Introduction to compute accelerators subsystem
+ * </accel/introduction>`.
 */
 static inline bool drm_is_accel_client(const struct drm_file *file_priv)
 {

--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -723,6 +723,10 @@ enum hl_server_type {
 * HL_NOTIFIER_EVENT_GENERAL_HW_ERR     - Indicates device HW error
 * HL_NOTIFIER_EVENT_RAZWI              - Indicates razwi happened
 * HL_NOTIFIER_EVENT_PAGE_FAULT         - Indicates page fault happened
+ * HL_NOTIFIER_EVENT_CRITICL_HW_ERR     - Indicates a HW error that requires SW abort and
+ *                                        HW reset
+ * HL_NOTIFIER_EVENT_CRITICL_FW_ERR     - Indicates a FW error that requires SW abort and
+ *                                        HW reset
 */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
@@ -733,6 +737,8 @@ enum hl_server_type {
 #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR	(1ULL << 6)
 #define HL_NOTIFIER_EVENT_RAZWI			(1ULL << 7)
 #define HL_NOTIFIER_EVENT_PAGE_FAULT		(1ULL << 8)
+#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR	(1ULL << 9)
+#define HL_NOTIFIER_EVENT_CRITICL_FW_ERR	(1ULL << 10)

 /* Opcode for management ioctl
 *
@@ -790,6 +796,8 @@ enum hl_server_type {
 * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
 * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
 * HL_INFO_FW_GENERIC_REQ - Send generic request to FW.
+ * HL_INFO_HW_ERR_EVENT   - Retrieve information on the reported HW error.
+ * HL_INFO_FW_ERR_EVENT   - Retrieve information on the reported FW error.
 */
 #define HL_INFO_HW_IP_INFO			0
 #define HL_INFO_HW_EVENTS			1
@@ -824,6 +832,8 @@ enum hl_server_type {
 #define HL_INFO_PAGE_FAULT_EVENT		33
 #define HL_INFO_USER_MAPPINGS			34
 #define HL_INFO_FW_GENERIC_REQ			35
+#define HL_INFO_HW_ERR_EVENT			36
+#define HL_INFO_FW_ERR_EVENT			37

 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -875,6 +885,12 @@ enum hl_server_type {
 *                             application to use. Relevant for Gaudi2 and later.
 * @device_mem_alloc_default_page_size: default page size used in device memory allocation.
 * @revision_id: PCI revision ID of the ASIC.
+ * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
+ * @rotator_enabled_mask: Bit-mask that represents which rotators are enabled.
+ *                        Relevant for Gaudi3 and later.
+ * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use
+ *                                  in order to raise events toward FW.
+ * @reserved_dram_size: DRAM size reserved for driver and firmware.
 */
 struct hl_info_hw_ip_info {
 	__u64 sram_base_address;
@@ -902,15 +918,20 @@ struct hl_info_hw_ip_info {
 	__u64 dram_page_size;
 	__u32 edma_enabled_mask;
 	__u16 number_of_user_interrupts;
-	__u16 pad2;
-	__u64 reserved4;
+	__u8 reserved1;
+	__u8 reserved2;
+	__u64 reserved3;
 	__u64 device_mem_alloc_default_page_size;
+	__u64 reserved4;
 	__u64 reserved5;
-	__u64 reserved6;
-	__u32 reserved7;
-	__u8 reserved8;
+	__u32 reserved6;
+	__u8 reserved7;
 	__u8 revision_id;
-	__u8 pad[2];
+	__u16 tpc_interrupt_id;
+	__u32 rotator_enabled_mask;
+	__u32 reserved9;
+	__u64 engine_core_interrupt_reg_addr;
+	__u64 reserved_dram_size;
 };

 struct hl_info_dram_usage {
@@ -1161,6 +1182,39 @@ struct hl_info_undefined_opcode_event {
 	__u32 stream_id;
 };

+/**
+ * struct hl_info_hw_err_event - info about HW error
+ * @timestamp: timestamp of error occurrence
+ * @event_id: The async event ID (specific to each device type).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_hw_err_event {
+	__s64 timestamp;
+	__u16 event_id;
+	__u16 pad[3];
+};
+
+/* FW error definition for err_type in struct hl_info_fw_err_event */
+enum hl_info_fw_err_type {
+	HL_INFO_FW_HEARTBEAT_ERR,
+	HL_INFO_FW_REPORTED_ERR,
+};
+
+/**
+ * struct hl_info_fw_err_event - info about FW error
+ * @timestamp: time-stamp of error occurrence
+ * @err_type: The type of event as defined in hl_info_fw_err_type.
+ * @event_id: The async event ID (specific to each device type, applicable only when err_type is
+ *             HL_INFO_FW_REPORTED_ERR).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_fw_err_event {
+	__s64 timestamp;
+	__u16 err_type;
+	__u16 event_id;
+	__u32 pad;
+};
+
 /**
 * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
 * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
@@ -1486,17 +1540,31 @@ struct hl_cs_chunk {
 */
 #define HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES	0x8000

+/*
+ * The engines CS is merged into the existing CS ioctls.
+ * Use it to control engines modes.
+ */
+#define HL_CS_FLAGS_ENGINES_COMMAND		0x10000
+
 #define HL_CS_STATUS_SUCCESS		0

 #define HL_MAX_JOBS_PER_CS		512

-/* HL_ENGINE_CORE_ values
+/*
+ * enum hl_engine_command - engine command
 *
- * HL_ENGINE_CORE_HALT: engine core halt
- * HL_ENGINE_CORE_RUN:  engine core run
+ * @HL_ENGINE_CORE_HALT: engine core halt
+ * @HL_ENGINE_CORE_RUN: engine core run
+ * @HL_ENGINE_STALL: user engine/s stall
+ * @HL_ENGINE_RESUME: user engine/s resume
 */
-#define HL_ENGINE_CORE_HALT	(1 << 0)
-#define HL_ENGINE_CORE_RUN	(1 << 1)
+enum hl_engine_command {
+	HL_ENGINE_CORE_HALT = 1,
+	HL_ENGINE_CORE_RUN = 2,
+	HL_ENGINE_STALL = 3,
+	HL_ENGINE_RESUME = 4,
+	HL_ENGINE_COMMAND_MAX
+};

 struct hl_cs_in {

@@ -1520,6 +1588,18 @@ struct hl_cs_in {
 			/* the core command to be sent towards engine cores */
 			__u32 core_command;
 		};
+
+		/* Valid only when HL_CS_FLAGS_ENGINES_COMMAND is set */
+		struct {
+			/* this holds address of array of uint32 for engines */
+			__u64 engines;
+
+			/* number of engines in engines array */
+			__u32 num_engines;
+
+			/* the engine command to be sent towards engines */
+			__u32 engine_command;
+		};
 	};

 	union {