Commit a94de2e7 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2019-07-04' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.3:

- Change the way the device's CPU accesses host memory. This allows the
  driver to use the kernel's standard API for setting the DMA mask (calling
  it once).

- Add a new debugfs entry to show the status of the internal DMA and
  compute engines. This is very helpful for debugging in case a command
  submission gets stuck.

- Return to the user a mask of the internal engines indicating their busy
  state (a user-space sketch of reading this mask follows this list).

- Make sure to restore registers that can be modified by the user to their
  default values. This applies only to registers that are initialized by the
  driver.

- Eliminate redundant and dead code.

- Support memset of the device's memory with sizes larger than 4GB

- Force the user to set the device to debug mode before configuring the
  device's coresight infrastructure

- Improve error printing in case of interrupts from the device
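
Below is a minimal user-space sketch (illustrative only, not part of this pull
request) of reading the new busy-engines mask through the INFO ioctl. It
assumes the habanalabs uapi header is installed as <misc/habanalabs.h> and
that the device exposes /dev/hl0; both are assumptions, and error handling is
kept to a minimum.

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <misc/habanalabs.h>	/* assumed install path of the uapi header */

	int main(void)
	{
		struct hl_info_hw_idle hw_idle;
		struct hl_info_args args;
		int fd;

		fd = open("/dev/hl0", O_RDWR);	/* device node name is an assumption */
		if (fd < 0)
			return 1;

		memset(&args, 0, sizeof(args));
		memset(&hw_idle, 0, sizeof(hw_idle));
		args.op = HL_INFO_HW_IDLE;
		args.return_pointer = (__u64) (uintptr_t) &hw_idle;
		args.return_size = sizeof(hw_idle);

		/* On success, is_idle and busy_engines_mask are filled by the driver */
		if (ioctl(fd, HL_IOCTL_INFO, &args) == 0)
			printf("is_idle=%u busy_engines_mask=0x%x\n",
			       hw_idle.is_idle, hw_idle.busy_engines_mask);

		close(fd);
		return 0;
	}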

* tag 'misc-habanalabs-next-2019-07-04' of git://people.freedesktop.org/~gabbayo/linux: (31 commits)
  habanalabs: Add busy engines bitmask to HW idle IOCTL
  habanalabs: Add debugfs node for engines status
  habanalabs: Update the device idle check
  habanalabs: Allow accessing host mapped addresses via debugfs
  habanalabs: add WARN in case of bad MMU mapping
  habanalabs: remove DMA mask hack for Goya
  habanalabs: set Goya CPU to use ASIC MMU
  habanalabs: add MMU mappings for Goya CPU
  habanalabs: initialize MMU context for driver
  habanalabs: de-couple MMU and VM module initialization
  habanalabs: initialize device CPU queues after MMU init
  docs/habanalabs: update text for some entries in sysfs
  habanalabs: add rate-limit to an error message
  habanalabs: remove simulator dedicated code
  habanalabs: restore unsecured registers default values
  habanalabs: clear sobs and monitors in context switch
  habanalabs: make tpc registers secured
  habanalabs: don't limit packet size for device CPU
  habanalabs: support device memory memset > 4GB
  habanalabs: print event name for fatal and non-RAZWI events
  ...
parents 60e8523e e8960ca0
......@@ -3,7 +3,10 @@ Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the device address to be used for read or write through
PCI bar. The acceptable value is a string that starts with "0x"
PCI bar, or the device VA of a host mapped memory to be read or
written directly from the host. The latter option is allowed
only when the IOMMU is disabled.
The acceptable value is a string that starts with "0x"
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date: Jan 2019
......@@ -33,10 +36,12 @@ Contact: oded.gabbay@gmail.com
Description: Allows the root user to read or write directly through the
device's PCI bar. Writing to this file generates a write
transaction while reading from the file generates a read
transcation. This custom interface is needed (instead of using
transaction. This custom interface is needed (instead of using
the generic Linux user-space PCI mapping) because the DDR bar
is very small compared to the DDR memory and only the driver can
move the bar before and after the transaction
move the bar before and after the transaction.
If the IOMMU is disabled, it also allows the root user to read
or write from the host a device VA of a host mapped memory
What: /sys/kernel/debug/habanalabs/hl<n>/device
Date: Jan 2019
......@@ -46,6 +51,13 @@ Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values
What: /sys/kernel/debug/habanalabs/hl<n>/engines
Date: Jul 2019
KernelVersion: 5.3
Contact: oded.gabbay@gmail.com
Description: Displays the status registers values of the device engines and
their derived idle status
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
Date: Jan 2019
KernelVersion: 5.1
......
......@@ -62,18 +62,20 @@ What: /sys/class/habanalabs/hl<n>/ic_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
Interconnect fabric. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device IC clock might be set to lower value then the
Description: Allows the user to set the maximum clock frequency, in Hz, of
the Interconnect fabric. Writes to this parameter affect the
device only when the power management profile is set to "manual"
mode. The device IC clock might be set to lower value than the
maximum. The user should read the ic_clk_curr to see the actual
frequency value of the IC
frequency value of the IC. This property is valid only for the
Goya ASIC family
What: /sys/class/habanalabs/hl<n>/ic_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the Interconnect fabric
Description: Displays the current clock frequency, in Hz, of the Interconnect
fabric. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/infineon_ver
Date: Jan 2019
......@@ -92,18 +94,20 @@ What: /sys/class/habanalabs/hl<n>/mme_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
MME compute engine. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device MME clock might be set to lower value then the
Description: Allows the user to set the maximum clock frequency, in Hz, of
the MME compute engine. Writes to this parameter affect the
device only when the power management profile is set to "manual"
mode. The device MME clock might be set to lower value than the
maximum. The user should read the mme_clk_curr to see the actual
frequency value of the MME
frequency value of the MME. This property is valid only for the
Goya ASIC family
What: /sys/class/habanalabs/hl<n>/mme_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the MME compute engine
Description: Displays the current clock frequency, in Hz, of the MME compute
engine. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/pci_addr
Date: Jan 2019
......@@ -163,18 +167,20 @@ What: /sys/class/habanalabs/hl<n>/tpc_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Allows the user to set the maximum clock frequency of the
TPC compute engines. Writes to this parameter affect the device
only when the power management profile is set to "manual" mode.
The device TPC clock might be set to lower value then the
Description: Allows the user to set the maximum clock frequency, in Hz, of
the TPC compute engines. Writes to this parameter affect the
device only when the power management profile is set to "manual"
mode. The device TPC clock might be set to lower value than the
maximum. The user should read the tpc_clk_curr to see the actual
frequency value of the TPC
frequency value of the TPC. This property is valid only for
Goya ASIC family
What: /sys/class/habanalabs/hl<n>/tpc_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Displays the current clock frequency of the TPC compute engines
Description: Displays the current clock frequency, in Hz, of the TPC compute
engines. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/uboot_ver
Date: Jan 2019
......
......@@ -18,7 +18,7 @@ int hl_asid_init(struct hl_device *hdev)
mutex_init(&hdev->asid_mutex);
/* ASID 0 is reserved for KMD */
/* ASID 0 is reserved for KMD and device CPU */
set_bit(0, hdev->asid_bitmap);
return 0;
......
......@@ -682,14 +682,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
u32 tmp;
rc = hl_poll_timeout_memory(hdev,
(u64) (uintptr_t) &ctx->thread_ctx_switch_wait_token,
jiffies_to_usecs(hdev->timeout_jiffies),
&tmp);
&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
100, jiffies_to_usecs(hdev->timeout_jiffies));
if (rc || !tmp) {
if (rc == -ETIMEDOUT) {
dev_err(hdev->dev,
"context switch phase didn't finish in time\n");
rc = -ETIMEDOUT;
"context switch phase timeout (%d)\n", tmp);
goto out;
}
}
......
......@@ -31,9 +31,13 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
* Coresight might be still working by accessing addresses
* related to the stopped engines. Hence stop it explicitly.
*/
hdev->asic_funcs->halt_coresight(hdev);
if (hdev->in_debug)
hl_device_set_debug_mode(hdev, false);
hl_vm_ctx_fini(ctx);
hl_asid_free(hdev, ctx->asid);
} else {
hl_mmu_ctx_fini(ctx);
}
}
......@@ -117,6 +121,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
rc = hl_mmu_ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "Failed to init mmu ctx module\n");
goto mem_ctx_err;
}
} else {
ctx->asid = hl_asid_alloc(hdev);
if (!ctx->asid) {
......
......@@ -355,7 +355,7 @@ static int mmu_show(struct seq_file *s, void *data)
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
struct hl_ctx *ctx = hdev->user_ctx;
struct hl_ctx *ctx;
u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
......@@ -367,6 +367,11 @@ static int mmu_show(struct seq_file *s, void *data)
if (!hdev->mmu_enable)
return 0;
if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
ctx = hdev->kernel_ctx;
else
ctx = hdev->user_ctx;
if (!ctx) {
dev_err(hdev->dev, "no ctx available\n");
return 0;
......@@ -495,6 +500,36 @@ static ssize_t mmu_write(struct file *file, const char __user *buf,
return -EINVAL;
}
static int engines_show(struct seq_file *s, void *data)
{
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
hdev->asic_funcs->is_device_idle(hdev, NULL, s);
return 0;
}
static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
if (!hdev->mmu_enable)
goto out;
if (hdev->dram_supports_virtual_memory &&
addr >= prop->va_space_dram_start_address &&
addr < prop->va_space_dram_end_address)
return true;
if (addr >= prop->va_space_host_start_address &&
addr < prop->va_space_host_end_address)
return true;
out:
return false;
}
static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
u64 *phys_addr)
{
......@@ -568,7 +603,6 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
{
struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
struct hl_device *hdev = entry->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
char tmp_buf[32];
u64 addr = entry->addr;
u32 val;
......@@ -577,11 +611,8 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
if (*ppos)
return 0;
if (addr >= prop->va_space_dram_start_address &&
addr < prop->va_space_dram_end_address &&
hdev->mmu_enable &&
hdev->dram_supports_virtual_memory) {
rc = device_va_to_pa(hdev, entry->addr, &addr);
if (hl_is_device_va(hdev, addr)) {
rc = device_va_to_pa(hdev, addr, &addr);
if (rc)
return rc;
}
......@@ -602,7 +633,6 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
{
struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
struct hl_device *hdev = entry->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 addr = entry->addr;
u32 value;
ssize_t rc;
......@@ -611,11 +641,8 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
if (rc)
return rc;
if (addr >= prop->va_space_dram_start_address &&
addr < prop->va_space_dram_end_address &&
hdev->mmu_enable &&
hdev->dram_supports_virtual_memory) {
rc = device_va_to_pa(hdev, entry->addr, &addr);
if (hl_is_device_va(hdev, addr)) {
rc = device_va_to_pa(hdev, addr, &addr);
if (rc)
return rc;
}
......@@ -877,6 +904,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
{"userptr", userptr_show, NULL},
{"vm", vm_show, NULL},
{"mmu", mmu_show, mmu_write},
{"engines", engines_show, NULL}
};
static int hl_debugfs_open(struct inode *inode, struct file *file)
......
......@@ -231,6 +231,7 @@ static int device_early_init(struct hl_device *hdev)
mutex_init(&hdev->fd_open_cnt_lock);
mutex_init(&hdev->send_cpu_message_lock);
mutex_init(&hdev->debug_lock);
mutex_init(&hdev->mmu_cache_lock);
INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
spin_lock_init(&hdev->hw_queues_mirror_lock);
......@@ -262,6 +263,7 @@ static int device_early_init(struct hl_device *hdev)
static void device_early_fini(struct hl_device *hdev)
{
mutex_destroy(&hdev->mmu_cache_lock);
mutex_destroy(&hdev->debug_lock);
mutex_destroy(&hdev->send_cpu_message_lock);
hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
......@@ -324,7 +326,15 @@ static int device_late_init(struct hl_device *hdev)
{
int rc;
INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
if (hdev->asic_funcs->late_init) {
rc = hdev->asic_funcs->late_init(hdev);
if (rc) {
dev_err(hdev->dev,
"failed late initialization for the H/W\n");
return rc;
}
}
hdev->high_pll = hdev->asic_prop.high_pll;
/* force setting to low frequency */
......@@ -335,15 +345,7 @@ static int device_late_init(struct hl_device *hdev)
else
hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
if (hdev->asic_funcs->late_init) {
rc = hdev->asic_funcs->late_init(hdev);
if (rc) {
dev_err(hdev->dev,
"failed late initialization for the H/W\n");
return rc;
}
}
INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
schedule_delayed_work(&hdev->work_freq,
usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
......@@ -420,6 +422,52 @@ int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
return 1;
}
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
{
int rc = 0;
mutex_lock(&hdev->debug_lock);
if (!enable) {
if (!hdev->in_debug) {
dev_err(hdev->dev,
"Failed to disable debug mode because device was not in debug mode\n");
rc = -EFAULT;
goto out;
}
hdev->asic_funcs->halt_coresight(hdev);
hdev->in_debug = 0;
goto out;
}
if (hdev->in_debug) {
dev_err(hdev->dev,
"Failed to enable debug mode because device is already in debug mode\n");
rc = -EFAULT;
goto out;
}
mutex_lock(&hdev->fd_open_cnt_lock);
if (atomic_read(&hdev->fd_open_cnt) > 1) {
dev_err(hdev->dev,
"Failed to enable debug mode. More then a single user is using the device\n");
rc = -EPERM;
goto unlock_fd_open_lock;
}
hdev->in_debug = 1;
unlock_fd_open_lock:
mutex_unlock(&hdev->fd_open_cnt_lock);
out:
mutex_unlock(&hdev->debug_lock);
return rc;
}
/*
* hl_device_suspend - initiate device suspend
*
......@@ -647,13 +695,6 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
hdev->hard_reset_pending = true;
if (!hdev->pdev) {
dev_err(hdev->dev,
"Reset action is NOT supported in simulator\n");
rc = -EINVAL;
goto out_err;
}
device_reset_work = kzalloc(sizeof(*device_reset_work),
GFP_ATOMIC);
if (!device_reset_work) {
......@@ -704,6 +745,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
if (hard_reset) {
hl_vm_fini(hdev);
hl_mmu_fini(hdev);
hl_eq_reset(hdev, &hdev->event_queue);
}
......@@ -731,6 +773,13 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
goto out_err;
}
rc = hl_mmu_init(hdev);
if (rc) {
dev_err(hdev->dev,
"Failed to initialize MMU S/W after hard reset\n");
goto out_err;
}
/* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
GFP_KERNEL);
......@@ -902,11 +951,18 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
goto cq_fini;
}
/* MMU S/W must be initialized before kernel context is created */
rc = hl_mmu_init(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
goto eq_fini;
}
/* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
if (!hdev->kernel_ctx) {
rc = -ENOMEM;
goto eq_fini;
goto mmu_fini;
}
hdev->user_ctx = NULL;
......@@ -954,8 +1010,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
goto out_disabled;
}
/* After test_queues, KMD can start sending messages to device CPU */
rc = device_late_init(hdev);
if (rc) {
dev_err(hdev->dev, "Failed late initialization\n");
......@@ -1001,6 +1055,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
"kernel ctx is still alive on initialization failure\n");
free_ctx:
kfree(hdev->kernel_ctx);
mmu_fini:
hl_mmu_fini(hdev);
eq_fini:
hl_eq_fini(hdev, &hdev->event_queue);
cq_fini:
......@@ -1105,6 +1161,8 @@ void hl_device_fini(struct hl_device *hdev)
hl_vm_fini(hdev);
hl_mmu_fini(hdev);
hl_eq_fini(hdev, &hdev->event_queue);
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
......@@ -1125,95 +1183,6 @@ void hl_device_fini(struct hl_device *hdev)
pr_info("removed device successfully\n");
}
/*
* hl_poll_timeout_memory - Periodically poll a host memory address
* until it is not zero or a timeout occurs
* @hdev: pointer to habanalabs device structure
* @addr: Address to poll
* @timeout_us: timeout in us
* @val: Variable to read the value into
*
* Returns 0 on success and -ETIMEDOUT upon a timeout. In either
* case, the last read value at @addr is stored in @val. Must not
* be called from atomic context if sleep_us or timeout_us are used.
*
* The function sleeps for 100us with timeout value of
* timeout_us
*/
int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
u32 timeout_us, u32 *val)
{
/*
* address in this function points always to a memory location in the
* host's (server's) memory. That location is updated asynchronously
* either by the direct access of the device or by another core
*/
u32 *paddr = (u32 *) (uintptr_t) addr;
ktime_t timeout;
/* timeout should be longer when working with simulator */
if (!hdev->pdev)
timeout_us *= 10;
timeout = ktime_add_us(ktime_get(), timeout_us);
might_sleep();
for (;;) {
/*
* Flush CPU read/write buffers to make sure we read updates
* done by other cores or by the device
*/
mb();
*val = *paddr;
if (*val)
break;
if (ktime_compare(ktime_get(), timeout) > 0) {
*val = *paddr;
break;
}
usleep_range((100 >> 2) + 1, 100);
}
return *val ? 0 : -ETIMEDOUT;
}
/*
* hl_poll_timeout_devicememory - Periodically poll a device memory address
* until it is not zero or a timeout occurs
* @hdev: pointer to habanalabs device structure
* @addr: Device address to poll
* @timeout_us: timeout in us
* @val: Variable to read the value into
*
* Returns 0 on success and -ETIMEDOUT upon a timeout. In either
* case, the last read value at @addr is stored in @val. Must not
* be called from atomic context if sleep_us or timeout_us are used.
*
* The function sleeps for 100us with timeout value of
* timeout_us
*/
int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
u32 timeout_us, u32 *val)
{
ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
might_sleep();
for (;;) {
*val = readl(addr);
if (*val)
break;
if (ktime_compare(ktime_get(), timeout) > 0) {
*val = readl(addr);
break;
}
usleep_range((100 >> 2) + 1, 100);
}
return *val ? 0 : -ETIMEDOUT;
}
/*
* MMIO register access helper functions.
*/
......
......@@ -29,13 +29,13 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name,
rc = request_firmware(&fw, fw_name, hdev->dev);
if (rc) {
dev_err(hdev->dev, "Failed to request %s\n", fw_name);
dev_err(hdev->dev, "Firmware file %s is not found!\n", fw_name);
goto out;
}
fw_size = fw->size;
if ((fw_size % 4) != 0) {
dev_err(hdev->dev, "illegal %s firmware size %zu\n",
dev_err(hdev->dev, "Illegal %s firmware size %zu\n",
fw_name, fw_size);
rc = -EINVAL;
goto out;
......@@ -85,12 +85,6 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
u32 tmp;
int rc = 0;
if (len > HL_CPU_CB_SIZE) {
dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n",
len);
return -ENOMEM;
}
pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
&pkt_dma_addr);
if (!pkt) {
......@@ -117,34 +111,29 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
goto out;
}
rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) &pkt->fence,
timeout, &tmp);
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == ARMCP_PACKET_FENCE_VAL), 1000, timeout);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
if (rc == -ETIMEDOUT) {
dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
hdev->device_cpu_disabled = true;
goto out;
}
if (tmp == ARMCP_PACKET_FENCE_VAL) {
u32 ctl = le32_to_cpu(pkt->ctl);
tmp = le32_to_cpu(pkt->ctl);
rc = (ctl & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
if (rc) {
dev_err(hdev->dev,
"F/W ERROR %d for CPU packet %d\n",
rc, (ctl & ARMCP_PKT_CTL_OPCODE_MASK)
dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
rc,
(tmp & ARMCP_PKT_CTL_OPCODE_MASK)
>> ARMCP_PKT_CTL_OPCODE_SHIFT);
rc = -EINVAL;
rc = -EIO;
} else if (result) {
*result = (long) le64_to_cpu(pkt->result);
}
} else {
dev_err(hdev->dev, "CPU packet wrong fence value\n");
rc = -EINVAL;
}
out:
mutex_unlock(&hdev->send_cpu_message_lock);
......@@ -186,9 +175,6 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
{
u64 kernel_addr;
/* roundup to HL_CPU_PKT_SIZE */
size = (size + (HL_CPU_PKT_SIZE - 1)) & HL_CPU_PKT_MASK;
kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);
*dma_handle = hdev->cpu_accessible_dma_address +
......@@ -200,9 +186,6 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr)
{
/* roundup to HL_CPU_PKT_SIZE */
size = (size + (HL_CPU_PKT_SIZE - 1)) & HL_CPU_PKT_MASK;
gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr,
size);
}
......@@ -256,7 +239,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to send armcp info pkt, error %d\n", rc);
"Failed to send ArmCP info pkt, error %d\n", rc);
goto out;
}
......@@ -291,7 +274,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
max_size, &eeprom_info_dma_addr);
if (!eeprom_info_cpu_addr) {
dev_err(hdev->dev,
"Failed to allocate DMA memory for EEPROM info packet\n");
"Failed to allocate DMA memory for ArmCP EEPROM packet\n");
return -ENOMEM;
}
......@@ -307,7 +290,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
if (rc) {
dev_err(hdev->dev,
"Failed to send armcp EEPROM pkt, error %d\n", rc);
"Failed to send ArmCP EEPROM packet, error %d\n", rc);
goto out;
}
......
......@@ -126,6 +126,12 @@
#define VA_DDR_SPACE_SIZE (VA_DDR_SPACE_END - \
VA_DDR_SPACE_START) /* 128GB */
#if (HL_CPU_ACCESSIBLE_MEM_SIZE != SZ_2M)
#error "HL_CPU_ACCESSIBLE_MEM_SIZE must be exactly 2MB to enable MMU mapping"
#endif
#define VA_CPU_ACCESSIBLE_MEM_ADDR 0x8000000000ull
#define DMA_MAX_TRANSFER_SIZE U32_MAX
#define HW_CAP_PLL 0x00000001
......@@ -157,6 +163,7 @@ struct goya_device {
u64 ddr_bar_cur_addr;
u32 events_stat[GOYA_ASYNC_EVENT_ID_SIZE];
u32 hw_cap_initialized;
u8 device_cpu_mmu_mappings_done;
};
void goya_get_fixed_properties(struct hl_device *hdev);
......@@ -204,18 +211,14 @@ int goya_armcp_info_get(struct hl_device *hdev);
int goya_debug_coresight(struct hl_device *hdev, void *data);
void goya_halt_coresight(struct hl_device *hdev);
void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
int goya_mmu_clear_pgt_range(struct hl_device *hdev);
int goya_mmu_set_dram_default_page(struct hl_device *hdev);
int goya_suspend(struct hl_device *hdev);
int goya_resume(struct hl_device *hdev);
void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry);
void *goya_get_events_stat(struct hl_device *hdev, u32 *size);
void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
u32 cq_val, u32 msix_vec);
void goya_add_end_of_cb_packets(struct hl_device *hdev, u64 kernel_address,
u32 len, u64 cq_addr, u32 cq_val, u32 msix_vec);
int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser);
void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
dma_addr_t *dma_handle, u16 *queue_len);
......@@ -225,5 +228,6 @@ void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle);
void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
#endif /* GOYAP_H_ */
......@@ -677,6 +677,17 @@ static void goya_init_tpc_protection_bits(struct hl_device *hdev)
goya_pb_set_block(hdev, mmTPC0_RD_REGULATOR_BASE);
goya_pb_set_block(hdev, mmTPC0_WR_REGULATOR_BASE);
pb_addr = (mmTPC0_CFG_SEMAPHORE & ~0xFFF) + PROT_BITS_OFFS;
word_offset = ((mmTPC0_CFG_SEMAPHORE & PROT_BITS_OFFS) >> 7) << 2;
mask = 1 << ((mmTPC0_CFG_SEMAPHORE & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_VFLAGS & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_SFLAGS & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_LFSR_POLYNOM & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_STATUS & 0x7F) >> 2);
WREG32(pb_addr + word_offset, ~mask);
pb_addr = (mmTPC0_CFG_CFG_BASE_ADDRESS_HIGH & ~0xFFF) + PROT_BITS_OFFS;
word_offset = ((mmTPC0_CFG_CFG_BASE_ADDRESS_HIGH &
PROT_BITS_OFFS) >> 7) << 2;
......@@ -684,6 +695,11 @@ static void goya_init_tpc_protection_bits(struct hl_device *hdev)
mask |= 1 << ((mmTPC0_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_SM_BASE_ADDRESS_LOW & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_SM_BASE_ADDRESS_HIGH & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_TPC_STALL & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_MSS_CONFIG & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_TPC_INTR_CAUSE & 0x7F) >> 2);
mask |= 1 << ((mmTPC0_CFG_TPC_INTR_MASK & 0x7F) >> 2);
WREG32(pb_addr + word_offset, ~mask);
......
......@@ -34,6 +34,8 @@
#define HL_ARMCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_ARMCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
#define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */
#define HL_MAX_QUEUES 128
#define HL_MAX_JOBS_PER_CS 64
......@@ -123,7 +125,7 @@ enum hl_device_hw_state {
/**
* struct asic_fixed_properties - ASIC specific immutable properties.
* @hw_queues_props: H/W queues properties.
* @armcp_info: received various information from ArmCP regarding the H/W. e.g.
* @armcp_info: received various information from ArmCP regarding the H/W, e.g.
* available sensors.
* @uboot_ver: F/W U-boot version.
* @preboot_ver: F/W Preboot version.
......@@ -318,18 +320,8 @@ struct hl_cs_job;
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
#define HL_CPU_PKT_SHIFT 5
#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT)
#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1))
#define HL_CPU_MAX_PKTS_IN_CB 32
#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \
HL_CPU_MAX_PKTS_IN_CB)
#define HL_CPU_CB_QUEUE_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE)
/* KMD <-> ArmCP shared memory size (EQ + PQ + CPU CB queue) */
#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_EQ_SIZE_IN_BYTES + \
HL_QUEUE_SIZE_IN_BYTES + \
HL_CPU_CB_QUEUE_SIZE)
/* KMD <-> ArmCP shared memory size */
#define HL_CPU_ACCESSIBLE_MEM_SIZE SZ_2M
/**
* struct hl_hw_queue - describes a H/W transport queue.
......@@ -543,8 +535,9 @@ struct hl_asic_funcs {
enum dma_data_direction dir);
u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
struct sg_table *sgt);
void (*add_end_of_cb_packets)(u64 kernel_address, u32 len, u64 cq_addr,
u32 cq_val, u32 msix_num);
void (*add_end_of_cb_packets)(struct hl_device *hdev,
u64 kernel_address, u32 len,
u64 cq_addr, u32 cq_val, u32 msix_num);
void (*update_eq_ci)(struct hl_device *hdev, u32 val);
int (*context_switch)(struct hl_device *hdev, u32 asid);
void (*restore_phase_topology)(struct hl_device *hdev);
......@@ -564,7 +557,8 @@ struct hl_asic_funcs {
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
bool (*is_device_idle)(struct hl_device *hdev, char *buf, size_t size);
bool (*is_device_idle)(struct hl_device *hdev, u32 *mask,
struct seq_file *s);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
......@@ -1065,12 +1059,59 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
(cond) ? 0 : -ETIMEDOUT; \
})
/*
* address in this macro points always to a memory location in the
* host's (server's) memory. That location is updated asynchronously
* either by the direct access of the device or by another core
*/
#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us) \
({ \
ktime_t __timeout; \
/* timeout should be longer when working with simulator */ \
if (hdev->pdev) \
__timeout = ktime_add_us(ktime_get(), timeout_us); \
else \
__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
might_sleep_if(sleep_us); \
for (;;) { \
/* Verify we read updates done by other cores or by device */ \
mb(); \
(val) = *((u32 *) (uintptr_t) (addr)); \
if (cond) \
break; \
if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
(val) = *((u32 *) (uintptr_t) (addr)); \
break; \
} \
if (sleep_us) \
usleep_range((sleep_us >> 2) + 1, sleep_us); \
} \
(cond) ? 0 : -ETIMEDOUT; \
})
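/*
 * Illustrative use only (not part of this patch), mirroring the converted
 * call sites above: poll a host-resident fence until it reads 1, sleeping
 * about 100us between reads, with the CS timeout as the deadline.
 *
 *	rc = hl_poll_timeout_memory(hdev, &fence, tmp, (tmp == 1), 100,
 *				jiffies_to_usecs(hdev->timeout_jiffies));
 *	if (rc == -ETIMEDOUT)
 *		dev_err(hdev->dev, "fence poll timed out (0x%x)\n", tmp);
 */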
#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \
if (buf) \
snprintf(buf, size, fmt, ##__VA_ARGS__); \
false; \
})
#define hl_poll_timeout_device_memory(hdev, addr, val, cond, sleep_us, \
timeout_us) \
({ \
ktime_t __timeout; \
/* timeout should be longer when working with simulator */ \
if (hdev->pdev) \
__timeout = ktime_add_us(ktime_get(), timeout_us); \
else \
__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
might_sleep_if(sleep_us); \
for (;;) { \
(val) = readl(addr); \
if (cond) \
break; \
if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
(val) = readl(addr); \
break; \
} \
if (sleep_us) \
usleep_range((sleep_us >> 2) + 1, sleep_us); \
} \
(cond) ? 0 : -ETIMEDOUT; \
})
struct hwmon_chip_info;
......@@ -1117,6 +1158,7 @@ struct hl_device_reset_work {
* lock here so we can flush user processes which are opening
* the device while we are trying to hard reset it
* @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue.
* @debug_lock: protects critical section of setting debug mode for device
* @asic_prop: ASIC specific immutable properties.
* @asic_funcs: ASIC specific functions.
* @asic_specific: ASIC specific information to use only from ASIC files.
......@@ -1159,6 +1201,8 @@ struct hl_device_reset_work {
* @mmu_enable: is MMU enabled.
* @device_cpu_disabled: is the device CPU disabled (due to timeouts)
* @dma_mask: the dma mask that was set for this device
* @in_debug: is device under debug. This, together with fd_open_cnt, enforces
* that only a single user is configuring the debug infrastructure.
*/
struct hl_device {
struct pci_dev *pdev;
......@@ -1188,6 +1232,7 @@ struct hl_device {
/* TODO: remove fd_open_cnt_lock for multiple process support */
struct mutex fd_open_cnt_lock;
struct mutex send_cpu_message_lock;
struct mutex debug_lock;
struct asic_fixed_properties asic_prop;
const struct hl_asic_funcs *asic_funcs;
void *asic_specific;
......@@ -1230,6 +1275,7 @@ struct hl_device {
u8 init_done;
u8 device_cpu_disabled;
u8 dma_mask;
u8 in_debug;
/* Parameters for bring-up */
u8 mmu_enable;
......@@ -1325,13 +1371,10 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
int hl_device_open(struct inode *inode, struct file *filp);
bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
enum hl_device_status hl_device_status(struct hl_device *hdev);
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
enum hl_asic_type asic_type, int minor);
void destroy_hdev(struct hl_device *hdev);
int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, u32 timeout_us,
u32 *val);
int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
u32 timeout_us, u32 *val);
int hl_hw_queues_create(struct hl_device *hdev);
void hl_hw_queues_destroy(struct hl_device *hdev);
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
......
......@@ -105,9 +105,17 @@ int hl_device_open(struct inode *inode, struct file *filp)
return -EPERM;
}
if (hdev->in_debug) {
dev_err_ratelimited(hdev->dev,
"Can't open %s because it is being debugged by another user\n",
dev_name(hdev->dev));
mutex_unlock(&hdev->fd_open_cnt_lock);
return -EPERM;
}
if (atomic_read(&hdev->fd_open_cnt)) {
dev_info_ratelimited(hdev->dev,
"Device %s is already attached to application\n",
"Can't open %s because another user is working on it\n",
dev_name(hdev->dev));
mutex_unlock(&hdev->fd_open_cnt_lock);
return -EBUSY;
......@@ -164,6 +172,17 @@ int hl_device_open(struct inode *inode, struct file *filp)
return rc;
}
static void set_driver_behavior_per_device(struct hl_device *hdev)
{
hdev->mmu_enable = 1;
hdev->cpu_enable = 1;
hdev->fw_loading = 1;
hdev->cpu_queues_enable = 1;
hdev->heartbeat = 1;
hdev->reset_pcilink = 0;
}
/*
* create_hdev - create habanalabs device instance
*
......@@ -188,29 +207,25 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
if (!hdev)
return -ENOMEM;
/* First, we must find out which ASIC are we handling. This is needed
* to configure the behavior of the driver (kernel parameters)
*/
if (pdev) {
hdev->asic_type = get_asic_type(pdev->device);
if (hdev->asic_type == ASIC_INVALID) {
dev_err(&pdev->dev, "Unsupported ASIC\n");
rc = -ENODEV;
goto free_hdev;
}
} else {
hdev->asic_type = asic_type;
}
hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
/* Parameters for bring-up - set them to defaults */
hdev->mmu_enable = 1;
hdev->cpu_enable = 1;
hdev->reset_pcilink = 0;
hdev->cpu_queues_enable = 1;
hdev->fw_loading = 1;
hdev->pldm = 0;
hdev->heartbeat = 1;
/* If CPU is disabled, no point in loading FW */
if (!hdev->cpu_enable)
hdev->fw_loading = 0;
/* If we don't load FW, no need to initialize CPU queues */
if (!hdev->fw_loading)
hdev->cpu_queues_enable = 0;
/* If CPU queues not enabled, no way to do heartbeat */
if (!hdev->cpu_queues_enable)
hdev->heartbeat = 0;
set_driver_behavior_per_device(hdev);
if (timeout_locked)
hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
......@@ -220,17 +235,6 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
hdev->disabled = true;
hdev->pdev = pdev; /* can be NULL in case of simulator device */
if (pdev) {
hdev->asic_type = get_asic_type(pdev->device);
if (hdev->asic_type == ASIC_INVALID) {
dev_err(&pdev->dev, "Unsupported ASIC\n");
rc = -ENODEV;
goto free_hdev;
}
} else {
hdev->asic_type = asic_type;
}
/* Set default DMA mask to 32 bits */
hdev->dma_mask = 32;
......
......@@ -119,7 +119,8 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
if ((!max_size) || (!out))
return -EINVAL;
hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, NULL, 0);
hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
&hw_idle.busy_engines_mask, NULL);
return copy_to_user(out, &hw_idle,
min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
......@@ -254,10 +255,18 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data)
case HL_DEBUG_OP_BMON:
case HL_DEBUG_OP_SPMU:
case HL_DEBUG_OP_TIMESTAMP:
if (!hdev->in_debug) {
dev_err_ratelimited(hdev->dev,
"Rejecting debug configuration request because device not in debug mode\n");
return -EFAULT;
}
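/*
 * Coresight configuration ops proceed past this point only after user
 * space has enabled debug mode through the HL_DEBUG_OP_SET_MODE op
 * handled below.
 */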
args->input_size =
min(args->input_size, hl_debug_struct_size[args->op]);
rc = debug_coresight(hdev, args);
break;
case HL_DEBUG_OP_SET_MODE:
rc = hl_device_set_debug_mode(hdev, (bool) args->enable);
break;
default:
dev_err(hdev->dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
......
......@@ -265,7 +265,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
cq = &hdev->completion_queue[q->hw_queue_id];
cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len,
cq_addr,
__le32_to_cpu(cq_pkt.data),
q->hw_queue_id);
......
......@@ -88,6 +88,7 @@
#include "psoc_global_conf_masks.h"
#include "dma_macro_masks.h"
#include "dma_qm_0_masks.h"
#include "dma_ch_0_masks.h"
#include "tpc0_qm_masks.h"
#include "tpc0_cmdq_masks.h"
#include "mme_qm_masks.h"
......
......@@ -1657,17 +1657,10 @@ int hl_vm_init(struct hl_device *hdev)
struct hl_vm *vm = &hdev->vm;
int rc;
rc = hl_mmu_init(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to init MMU\n");
return rc;
}
vm->dram_pg_pool = gen_pool_create(__ffs(prop->dram_page_size), -1);
if (!vm->dram_pg_pool) {
dev_err(hdev->dev, "Failed to create dram page pool\n");
rc = -ENOMEM;
goto pool_create_err;
return -ENOMEM;
}
kref_init(&vm->dram_pg_pool_refcount);
......@@ -1693,8 +1686,6 @@ int hl_vm_init(struct hl_device *hdev)
pool_add_err:
gen_pool_destroy(vm->dram_pg_pool);
pool_create_err:
hl_mmu_fini(hdev);
return rc;
}
......@@ -1724,7 +1715,5 @@ void hl_vm_fini(struct hl_device *hdev)
dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
__func__);
hl_mmu_fini(hdev);
vm->init_done = false;
}
......@@ -241,8 +241,9 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
hop2_pte_addr, hop3_pte_addr, pte_val;
int rc, i, j, hop3_allocated = 0;
if (!hdev->dram_supports_virtual_memory ||
!hdev->dram_default_page_mapping)
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return 0;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
......@@ -340,8 +341,9 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
hop2_pte_addr, hop3_pte_addr;
int i, j;
if (!hdev->dram_supports_virtual_memory ||
!hdev->dram_default_page_mapping)
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
......@@ -385,12 +387,8 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
* @hdev: habanalabs device structure.
*
* This function does the following:
* - Allocate max_asid zeroed hop0 pgts so no mapping is available.
* - Enable MMU in H/W.
* - Invalidate the MMU cache.
* - Create a pool of pages for pgt_infos.
*
* This function depends on DMA QMAN to be working!
* - Create a shadow table for pgt
*
* Return: 0 for success, non-zero for failure.
*/
......@@ -915,6 +913,10 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
return -EFAULT;
}
WARN_ONCE((phys_addr & (real_page_size - 1)),
"Mapping 0x%llx with page size of 0x%x is erroneous! Address must be divisible by page size",
phys_addr, real_page_size);
npages = page_size / real_page_size;
real_virt_addr = virt_addr;
real_phys_addr = phys_addr;
......
......@@ -10,6 +10,8 @@
#include <linux/pci.h>
#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10)
/**
* hl_pci_bars_map() - Map PCI BARs.
* @hdev: Pointer to hl_device structure.
......@@ -88,8 +90,14 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
{
struct pci_dev *pdev = hdev->pdev;
ktime_t timeout;
u64 msec;
u32 val;
if (hdev->pldm)
msec = HL_PLDM_PCI_ELBI_TIMEOUT_MSEC;
else
msec = HL_PCI_ELBI_TIMEOUT_MSEC;
/* Clear previous status */
pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0);
......@@ -98,7 +106,7 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL,
PCI_CONFIG_ELBI_CTRL_WRITE);
timeout = ktime_add_ms(ktime_get(), 10);
timeout = ktime_add_ms(ktime_get(), msec);
for (;;) {
pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val);
if (val & PCI_CONFIG_ELBI_STS_MASK)
......
......@@ -328,10 +328,6 @@ static ssize_t pci_addr_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
/* Use dummy, fixed address for simulator */
if (!hdev->pdev)
return sprintf(buf, "0000:%02d:00.0\n", hdev->id);
return sprintf(buf, "%04x:%02x:%02x.%x\n",
pci_domain_nr(hdev->pdev->bus),
hdev->pdev->bus->number,
......
......@@ -45,6 +45,30 @@ enum goya_queue_id {
GOYA_QUEUE_ID_SIZE
};
/*
* Engine Numbering
*
* Used in the "busy_engines_mask" field in `struct hl_info_hw_idle'
*/
enum goya_engine_id {
GOYA_ENGINE_ID_DMA_0 = 0,
GOYA_ENGINE_ID_DMA_1,
GOYA_ENGINE_ID_DMA_2,
GOYA_ENGINE_ID_DMA_3,
GOYA_ENGINE_ID_DMA_4,
GOYA_ENGINE_ID_MME_0,
GOYA_ENGINE_ID_TPC_0,
GOYA_ENGINE_ID_TPC_1,
GOYA_ENGINE_ID_TPC_2,
GOYA_ENGINE_ID_TPC_3,
GOYA_ENGINE_ID_TPC_4,
GOYA_ENGINE_ID_TPC_5,
GOYA_ENGINE_ID_TPC_6,
GOYA_ENGINE_ID_TPC_7,
GOYA_ENGINE_ID_SIZE
};
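/*
 * Example (illustrative, not part of this patch): after an HL_INFO_HW_IDLE
 * query, user space can test an individual engine bit, e.g.
 *
 *	if (hw_idle.busy_engines_mask & (1 << GOYA_ENGINE_ID_MME_0))
 *		printf("MME is busy\n");
 */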
enum hl_device_status {
HL_DEVICE_STATUS_OPERATIONAL,
HL_DEVICE_STATUS_IN_RESET,
......@@ -86,7 +110,11 @@ struct hl_info_dram_usage {
struct hl_info_hw_idle {
__u32 is_idle;
__u32 pad;
/*
* Bitmask of busy engines.
* Bits definition is according to `enum <chip>_engine_id'.
*/
__u32 busy_engines_mask;
};
struct hl_info_device_status {
......