Commit 9e072793 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.10-rc1:

- Stop using DRM's dma-fence module and instead use kernel completions
  (see the sketch after this list).
- Support PCIe AER.
- Use dma_mmap_coherent for memory allocated using dma_alloc_coherent.
- Use the smallest possible alignment when allocating virtual addresses in our
  MMU driver.
- Refactor the MMU driver code to be device-oriented.
- Allow the user to check CS status without sleeping.
- Add an option to map a Command Buffer to the device's MMU.
- Expose sync manager resource allocation to the user through the INFO IOCTL.
- Convert code to use the standard BIT(), GENMASK() and FIELD_PREP() macros
  (illustrated after this list).
- Many small fixes (casting, better error messages, removal of unused
  defines, h/w configuration fixes, etc.).
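
As a reference for the first item above, here is a minimal sketch of the
completion-based fence pattern this series moves to. The names follow the
driver code in the diff below, but the struct layout and release path are
simplified for illustration, not copied verbatim:

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/completion.h>
    #include <linux/slab.h>

    /* Driver-private fence: a kref plus a kernel completion instead of dma_fence. */
    struct hl_fence {
            struct kref refcount;           /* lifetime of the fence object */
            struct completion completion;   /* completed when the CS finishes */
            int error;                      /* 0, -ETIMEDOUT, -EIO or -EBUSY */
    };

    static void hl_fence_release(struct kref *kref)
    {
            struct hl_fence *fence = container_of(kref, struct hl_fence, refcount);

            kfree(fence);
    }

    void hl_fence_put(struct hl_fence *fence)
    {
            if (fence)
                    kref_put(&fence->refcount, hl_fence_release);
    }

    void hl_fence_get(struct hl_fence *fence)
    {
            if (fence)
                    kref_get(&fence->refcount);
    }

    static void hl_fence_init(struct hl_fence *fence)
    {
            kref_init(&fence->refcount);
            fence->error = 0;
            init_completion(&fence->completion);
    }

Waiters call wait_for_completion_interruptible_timeout() on fence->completion
and then inspect fence->error, while the release path signals with
complete_all(), as seen in the command_submission.c hunks below.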
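The BIT()/GENMASK()/FIELD_PREP() conversion is the usual mechanical cleanup.
A short illustrative sketch follows; the mask names come from the hw_queue.c
hunk below, but the exact bit positions here are example values, not the
driver's real register layout:

    #include <linux/bits.h>
    #include <linux/bitfield.h>
    #include <linux/types.h>

    #define CQ_ENTRY_SHADOW_INDEX_MASK              GENMASK(13, 0)  /* example, was 0x3FFF-style literal */
    #define CQ_ENTRY_SHADOW_INDEX_VALID_MASK        BIT(30)         /* example, was 1 << VALID_SHIFT */
    #define CQ_ENTRY_READY_MASK                     BIT(31)         /* example, was 1 << READY_SHIFT */

    static u32 build_cq_entry(u32 pi)
    {
            /* Old style: ((pi << SHIFT) & MASK) | (1 << VALID_SHIFT) | (1 << READY_SHIFT) */
            return FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_MASK, pi) |
                   FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
                   FIELD_PREP(CQ_ENTRY_READY_MASK, 1);
    }

For fields converted to FIELD_PREP(), the shift amount is derived from the
mask itself, so open-coded shift-and-mask expressions disappear from the call
sites.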

* tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux: (46 commits)
  habanalabs: update scratchpad register map
  habanalabs: add indication of security-enabled F/W
  habanalabs/gaudi: fix DMA completions max outstanding to 15
  habanalabs/gaudi: remove axi drain support
  habanalabs: update firmware interface file
  habanalabs: Add an option to map CB to device MMU
  habanalabs: Save context in a command buffer object
  habanalabs: no need for DMA_SHARED_BUFFER
  habanalabs: allow to wait on CS without sleep
  habanalabs/gaudi: increase timeout for boot fit load
  habanalabs: add debugfs support for MMU with 6 HOPs
  habanalabs: add num_hops to hl_mmu_properties
  habanalabs: refactor MMU as device-oriented
  habanalabs: rename mmu.c to mmu_v1.c
  habanalabs: use smallest possible alignment for virtual addresses
  habanalabs: check flag before reset because of f/w event
  habanalabs: increase PQ COMP_OFFSET by one nibble
  habanalabs: Fix alignment issue in cpucp_info structure
  habanalabs: remove unused define
  habanalabs: remove unused ASIC function pointer
  ...
parents e82ed736 f279e5cd
......@@ -2,13 +2,17 @@ What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
Description: Version of the Linux kernel running on the device's CPU.
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_kernel_ver
What: /sys/class/habanalabs/hl<n>/armcp_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_ver
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
Date: Jun 2019
......@@ -33,6 +37,18 @@ KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's CPLD F/W
What: /sys/class/habanalabs/hl<n>/cpucp_kernel_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
What: /sys/class/habanalabs/hl<n>/cpucp_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
What: /sys/class/habanalabs/hl<n>/device_type
Date: Jan 2019
KernelVersion: 5.1
......
......@@ -7,7 +7,6 @@ config HABANA_AI
tristate "HabanaAI accelerators (habanalabs)"
depends on PCI && HAS_IOMEM
select FRAME_VECTOR
select DMA_SHARED_BUFFER
select GENERIC_ALLOCATOR
select HWMON
help
......
......@@ -3,5 +3,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
common/asid.o common/habanalabs_ioctl.o \
common/command_buffer.o common/hw_queue.o common/irq.o \
common/sysfs.o common/hwmon.o common/memory.o \
common/command_submission.o common/mmu.o common/firmware_if.o \
common/pci.o
common/command_submission.o common/mmu.o common/mmu_v1.o \
common/firmware_if.o common/pci.o
......@@ -13,6 +13,131 @@
#include <linux/uaccess.h>
#include <linux/genalloc.h>
static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_vm_va_block *va_block, *tmp;
dma_addr_t bus_addr;
u64 virt_addr;
u32 page_size = prop->pmmu.page_size;
s32 offset;
int rc;
if (!hdev->supports_cb_mapping) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because no VA range is allocated for CB mapping\n");
return -EINVAL;
}
if (!hdev->mmu_enable) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because MMU is disabled\n");
return -EINVAL;
}
INIT_LIST_HEAD(&cb->va_block_list);
for (bus_addr = cb->bus_address;
bus_addr < cb->bus_address + cb->size;
bus_addr += page_size) {
virt_addr = (u64) gen_pool_alloc(ctx->cb_va_pool, page_size);
if (!virt_addr) {
dev_err(hdev->dev,
"Failed to allocate device virtual address for CB\n");
rc = -ENOMEM;
goto err_va_pool_free;
}
va_block = kzalloc(sizeof(*va_block), GFP_KERNEL);
if (!va_block) {
rc = -ENOMEM;
gen_pool_free(ctx->cb_va_pool, virt_addr, page_size);
goto err_va_pool_free;
}
va_block->start = virt_addr;
va_block->end = virt_addr + page_size;
va_block->size = page_size;
list_add_tail(&va_block->node, &cb->va_block_list);
}
mutex_lock(&ctx->mmu_lock);
bus_addr = cb->bus_address;
offset = 0;
list_for_each_entry(va_block, &cb->va_block_list, node) {
rc = hl_mmu_map(ctx, va_block->start, bus_addr, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list));
if (rc) {
dev_err(hdev->dev, "Failed to map VA %#llx to CB\n",
va_block->start);
goto err_va_umap;
}
bus_addr += va_block->size;
offset += va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
cb->is_mmu_mapped = true;
return 0;
err_va_umap:
list_for_each_entry(va_block, &cb->va_block_list, node) {
if (offset <= 0)
break;
hl_mmu_unmap(ctx, va_block->start, va_block->size,
offset <= va_block->size);
offset -= va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
err_va_pool_free:
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
return rc;
}
static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct hl_vm_va_block *va_block, *tmp;
mutex_lock(&ctx->mmu_lock);
list_for_each_entry(va_block, &cb->va_block_list, node)
if (hl_mmu_unmap(ctx, va_block->start, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list)))
dev_warn_ratelimited(hdev->dev,
"Failed to unmap CB's va 0x%llx\n",
va_block->start);
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
}
static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
if (cb->is_internal)
......@@ -47,6 +172,11 @@ static void cb_release(struct kref *ref)
hl_debugfs_remove_cb(cb);
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
}
......@@ -107,11 +237,12 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
}
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
bool map_cb, u64 *handle)
{
struct hl_cb *cb;
bool alloc_new_cb = true;
int rc;
int rc, ctx_id = ctx->asid;
/*
* Can't use generic function to check this because of special case
......@@ -163,7 +294,21 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
}
cb->hdev = hdev;
cb->ctx_id = ctx_id;
cb->ctx = ctx;
hl_ctx_get(hdev, cb->ctx);
if (map_cb) {
if (ctx_id == HL_KERNEL_ASID_ID) {
dev_err(hdev->dev,
"CB mapping is not supported for kernel context\n");
rc = -EINVAL;
goto release_cb;
}
rc = cb_map_mem(ctx, cb);
if (rc)
goto release_cb;
}
spin_lock(&mgr->cb_lock);
rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
......@@ -171,10 +316,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
if (rc < 0) {
dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
goto release_cb;
goto unmap_mem;
}
cb->id = rc;
cb->id = (u64) rc;
kref_init(&cb->refcount);
spin_lock_init(&cb->lock);
......@@ -183,14 +328,18 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
* idr is 32-bit so we can safely OR it with a mask that is above
* 32 bit
*/
*handle = cb->id | HL_MMAP_CB_MASK;
*handle = cb->id | HL_MMAP_TYPE_CB;
*handle <<= PAGE_SHIFT;
hl_debugfs_add_cb(cb);
return 0;
unmap_mem:
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
release_cb:
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
out_err:
*handle = 0;
......@@ -250,9 +399,10 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
args->in.cb_size, HL_MAX_CB_SIZE);
rc = -EINVAL;
} else {
rc = hl_cb_create(hdev, &hpriv->cb_mgr,
args->in.cb_size, &handle,
hpriv->ctx->asid, false);
rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx,
args->in.cb_size, false,
!!(args->in.flags & HL_CB_FLAGS_MAP),
&handle);
}
memset(args, 0, sizeof(*args));
......@@ -300,11 +450,14 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_cb *cb;
phys_addr_t address;
u32 handle, user_cb_size;
int rc;
/* We use the page offset to hold the idr and thus we need to clear
* it before doing the mmap itself
*/
handle = vma->vm_pgoff;
vma->vm_pgoff = 0;
/* reference was taken here */
cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
......@@ -356,12 +509,8 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
vma->vm_private_data = cb;
/* Calculate address for CB */
address = virt_to_phys((void *) (uintptr_t) cb->kernel_address);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
address, cb->size);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, (void *) cb->kernel_address,
cb->bus_address, cb->size);
if (rc) {
spin_lock(&cb->lock);
cb->mmap = false;
......@@ -425,7 +574,7 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
if (kref_put(&cb->refcount, cb_release) != 1)
dev_err(hdev->dev,
"CB %d for CTX ID %d is still alive\n",
id, cb->ctx_id);
id, cb->ctx->asid);
}
idr_destroy(&mgr->cb_handles);
......@@ -438,8 +587,8 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
struct hl_cb *cb;
int rc;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
HL_KERNEL_ASID_ID, internal_cb);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx, cb_size,
internal_cb, false, &cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate CB for the kernel driver %d\n", rc);
......@@ -495,3 +644,45 @@ int hl_cb_pool_fini(struct hl_device *hdev)
return 0;
}
int hl_cb_va_pool_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
if (!hdev->supports_cb_mapping)
return 0;
ctx->cb_va_pool = gen_pool_create(__ffs(prop->pmmu.page_size), -1);
if (!ctx->cb_va_pool) {
dev_err(hdev->dev,
"Failed to create VA gen pool for CB mapping\n");
return -ENOMEM;
}
rc = gen_pool_add(ctx->cb_va_pool, prop->cb_va_start_addr,
prop->cb_va_end_addr - prop->cb_va_start_addr, -1);
if (rc) {
dev_err(hdev->dev,
"Failed to add memory to VA gen pool for CB mapping\n");
goto err_pool_destroy;
}
return 0;
err_pool_destroy:
gen_pool_destroy(ctx->cb_va_pool);
return rc;
}
void hl_cb_va_pool_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (!hdev->supports_cb_mapping)
return;
gen_pool_destroy(ctx->cb_va_pool);
}
......@@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
hw_sob->q_idx, hw_sob->sob_id);
}
static const char *hl_fence_get_driver_name(struct dma_fence *fence)
{
return "HabanaLabs";
}
static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
{
struct hl_cs_compl *hl_cs_compl =
container_of(fence, struct hl_cs_compl, base_fence);
return dev_name(hl_cs_compl->hdev->dev);
}
static bool hl_fence_enable_signaling(struct dma_fence *fence)
{
return true;
}
static void hl_fence_release(struct dma_fence *fence)
static void hl_fence_release(struct kref *kref)
{
struct hl_fence *fence =
container_of(kref, struct hl_fence, refcount);
struct hl_cs_compl *hl_cs_cmpl =
container_of(fence, struct hl_cs_compl, base_fence);
struct hl_device *hdev = hl_cs_cmpl->hdev;
......@@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
}
free:
kfree_rcu(hl_cs_cmpl, base_fence.rcu);
kfree(hl_cs_cmpl);
}
static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
.release = hl_fence_release
};
void hl_fence_put(struct hl_fence *fence)
{
if (fence)
kref_put(&fence->refcount, hl_fence_release);
}
void hl_fence_get(struct hl_fence *fence)
{
if (fence)
kref_get(&fence->refcount);
}
static void hl_fence_init(struct hl_fence *fence)
{
kref_init(&fence->refcount);
fence->error = 0;
init_completion(&fence->completion);
}
static void cs_get(struct hl_cs *cs)
{
......@@ -256,6 +252,8 @@ static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
ctx->cs_counters.parsing_drop_cnt;
hdev->aggregated_cs_counters.queue_full_drop_cnt +=
ctx->cs_counters.queue_full_drop_cnt;
hdev->aggregated_cs_counters.max_cs_in_flight_drop_cnt +=
ctx->cs_counters.max_cs_in_flight_drop_cnt;
}
static void cs_do_release(struct kref *ref)
......@@ -336,7 +334,7 @@ static void cs_do_release(struct kref *ref)
* In case the wait for signal CS was submitted, the put occurs
* in init_signal_wait_cs() right before hanging on the PQ.
*/
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
}
/*
......@@ -348,19 +346,18 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx);
/* We need to mark an error for not submitted because in that case
* the dma fence release flow is different. Mainly, we don't need
* the hl fence release flow is different. Mainly, we don't need
* to handle hw_sob for signal/wait
*/
if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT);
cs->fence->error = -ETIMEDOUT;
else if (cs->aborted)
dma_fence_set_error(cs->fence, -EIO);
cs->fence->error = -EIO;
else if (!cs->submitted)
dma_fence_set_error(cs->fence, -EBUSY);
dma_fence_signal(cs->fence);
dma_fence_put(cs->fence);
cs->fence->error = -EBUSY;
complete_all(&cs->fence->completion);
hl_fence_put(cs->fence);
cs_counters_aggregate(hdev, cs->ctx);
kfree(cs->jobs_in_queue_cnt);
......@@ -401,7 +398,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
enum hl_cs_type cs_type, struct hl_cs **cs_new)
{
struct hl_cs_compl *cs_cmpl;
struct dma_fence *other = NULL;
struct hl_fence *other = NULL;
struct hl_cs *cs;
int rc;
......@@ -434,9 +431,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs_cmpl->cs_seq = ctx->cs_sequence;
other = ctx->cs_pending[cs_cmpl->cs_seq &
(hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
dev_dbg(hdev->dev,
if (other && !completion_done(&other->completion)) {
dev_dbg_ratelimited(hdev->dev,
"Rejecting CS because of too many in-flights CS\n");
ctx->cs_counters.max_cs_in_flight_drop_cnt++;
rc = -EAGAIN;
goto free_fence;
}
......@@ -448,8 +447,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
goto free_fence;
}
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
ctx->asid, ctx->cs_sequence);
/* init hl_fence */
hl_fence_init(&cs_cmpl->base_fence);
cs->sequence = cs_cmpl->cs_seq;
......@@ -458,9 +457,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
&cs_cmpl->base_fence;
ctx->cs_sequence++;
dma_fence_get(&cs_cmpl->base_fence);
hl_fence_get(&cs_cmpl->base_fence);
dma_fence_put(other);
hl_fence_put(other);
spin_unlock(&ctx->cs_lock);
......@@ -690,8 +689,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
rc = -ENOMEM;
if (is_kernel_allocated_cb)
goto release_cb;
else
goto free_cs_object;
goto free_cs_object;
}
job->id = i + 1;
......@@ -773,7 +772,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_ctx *ctx = hpriv->ctx;
struct hl_cs_chunk *cs_chunk_array, *chunk;
struct hw_queue_properties *hw_queue_prop;
struct dma_fence *sig_fence = NULL;
struct hl_fence *sig_fence = NULL;
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
......@@ -883,14 +882,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
dev_err(hdev->dev,
"CS seq 0x%llx is not of a signal CS\n",
signal_seq);
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = -EINVAL;
goto free_signal_seq_array;
}
if (dma_fence_is_signaled(sig_fence)) {
if (completion_done(&sig_fence->completion)) {
/* signal CS already finished */
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = 0;
goto free_signal_seq_array;
}
......@@ -902,7 +901,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
rc = allocate_cs(hdev, ctx, cs_type, &cs);
if (rc) {
if (cs_type == CS_TYPE_WAIT)
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
hl_ctx_put(ctx);
goto free_signal_seq_array;
}
......@@ -1162,7 +1161,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq)
{
struct dma_fence *fence;
struct hl_fence *fence;
unsigned long timeout;
long rc;
......@@ -1181,12 +1180,18 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
"Can't wait on CS %llu because current CS is at seq %llu\n",
seq, ctx->cs_sequence);
} else if (fence) {
rc = dma_fence_wait_timeout(fence, true, timeout);
if (!timeout_us)
rc = completion_done(&fence->completion);
else
rc = wait_for_completion_interruptible_timeout(
&fence->completion, timeout);
if (fence->error == -ETIMEDOUT)
rc = -ETIMEDOUT;
else if (fence->error == -EIO)
rc = -EIO;
dma_fence_put(fence);
hl_fence_put(fence);
} else {
dev_dbg(hdev->dev,
"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
......
......@@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
*/
for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);
hl_fence_put(ctx->cs_pending[i]);
kfree(ctx->cs_pending);
......@@ -37,6 +37,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
hl_device_set_debug_mode(hdev, false);
hl_cb_va_pool_fini(ctx);
hl_vm_ctx_fini(ctx);
hl_asid_free(hdev, ctx->asid);
} else {
......@@ -128,7 +129,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
sizeof(struct dma_fence *),
sizeof(struct hl_fence *),
GFP_KERNEL);
if (!ctx->cs_pending)
return -ENOMEM;
......@@ -155,15 +156,24 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
goto err_asid_free;
}
rc = hl_cb_va_pool_init(ctx);
if (rc) {
dev_err(hdev->dev,
"Failed to init VA pool for mapped CB\n");
goto err_vm_ctx_fini;
}
rc = hdev->asic_funcs->ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "ctx_init failed\n");
goto err_vm_ctx_fini;
goto err_cb_va_pool_fini;
}
}
return 0;
err_cb_va_pool_fini:
hl_cb_va_pool_fini(ctx);
err_vm_ctx_fini:
hl_vm_ctx_fini(ctx);
err_asid_free:
......@@ -184,10 +194,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release);
}
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;
struct hl_fence *fence;
spin_lock(&ctx->cs_lock);
......@@ -201,8 +211,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return NULL;
}
fence = dma_fence_get(
ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
hl_fence_get(fence);
spin_unlock(&ctx->cs_lock);
return fence;
......
......@@ -21,7 +21,7 @@ static struct dentry *hl_debug_root;
static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, long *val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
......@@ -29,8 +29,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_RD <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
......@@ -47,7 +47,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, u32 val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
......@@ -55,8 +55,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_WR <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
......@@ -73,7 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
......@@ -81,8 +81,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_LED_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_LED_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.led_index = cpu_to_le32(led);
pkt.value = cpu_to_le64(state);
......@@ -110,8 +110,8 @@ static int command_buffers_show(struct seq_file *s, void *data)
seq_puts(s, "---------------------------------------------------------------\n");
}
seq_printf(s,
" %03d %d 0x%08x %d %d %d\n",
cb->id, cb->ctx_id, cb->size,
" %03llu %d 0x%08x %d %d %d\n",
cb->id, cb->ctx->asid, cb->size,
kref_read(&cb->refcount),
cb->mmap, cb->cs_cnt);
}
......@@ -354,6 +354,14 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
mmu_specs->hop4_shift);
}
static inline u64 get_hop5_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop5_mask,
mmu_specs->hop5_shift);
}
static inline u64 get_next_hop_addr(u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
......@@ -377,6 +385,7 @@ static int mmu_show(struct seq_file *s, void *data)
hop2_addr = 0, hop2_pte_addr = 0, hop2_pte = 0,
hop3_addr = 0, hop3_pte_addr = 0, hop3_pte = 0,
hop4_addr = 0, hop4_pte_addr = 0, hop4_pte = 0,
hop5_addr = 0, hop5_pte_addr = 0, hop5_pte = 0,
virt_addr = dev_entry->mmu_addr;
if (!hdev->mmu_enable)
......@@ -428,20 +437,49 @@ static int mmu_show(struct seq_file *s, void *data)
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
} else {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & LAST_MASK)) {
hop5_addr = get_next_hop_addr(hop4_pte);
if (hop5_addr == ULLONG_MAX)
goto not_mapped;
hop5_pte_addr = get_hop5_pte_addr(ctx, mmu_prop,
hop5_addr, virt_addr);
hop5_pte = hdev->asic_funcs->read_pte(hdev,
hop5_pte_addr);
if (!(hop5_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
}
seq_printf(s, "asid: %u, virt_addr: 0x%llx\n",
......@@ -463,10 +501,22 @@ static int mmu_show(struct seq_file *s, void *data)
seq_printf(s, "hop3_pte_addr: 0x%llx\n", hop3_pte_addr);
seq_printf(s, "hop3_pte: 0x%llx\n", hop3_pte);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
}
} else {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
if (!(hop4_pte & LAST_MASK)) {
seq_printf(s, "hop5_addr: 0x%llx\n", hop5_addr);
seq_printf(s, "hop5_pte_addr: 0x%llx\n", hop5_pte_addr);
seq_printf(s, "hop5_pte: 0x%llx\n", hop5_pte);
}
}
goto out;
......
......@@ -123,9 +123,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct hl_fpriv *hpriv = filp->private_data;
unsigned long vm_pgoff;
if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
vma->vm_pgoff ^= HL_MMAP_CB_MASK;
vm_pgoff = vma->vm_pgoff;
vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
case HL_MMAP_TYPE_CB:
return hl_cb_mmap(hpriv, vma);
}
......@@ -286,7 +290,7 @@ static int device_early_init(struct hl_device *hdev)
}
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
snprintf(workq_name, 32, "hl-free-jobs-%u", i);
snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
......@@ -317,6 +321,10 @@ static int device_early_init(struct hl_device *hdev)
goto free_chip_info;
}
rc = hl_mmu_if_set_funcs(hdev);
if (rc)
goto free_idle_busy_ts_arr;
hl_cb_mgr_init(&hdev->kernel_cb_mgr);
mutex_init(&hdev->send_cpu_message_lock);
......@@ -330,6 +338,8 @@ static int device_early_init(struct hl_device *hdev)
return 0;
free_idle_busy_ts_arr:
kfree(hdev->idle_busy_ts_arr);
free_chip_info:
kfree(hdev->hl_chip_info);
free_eq_wq:
......@@ -871,7 +881,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
* so this message won't be sent
*/
if (hl_fw_send_pci_access_msg(hdev,
ARMCP_PACKET_DISABLE_PCI_ACCESS))
CPUCP_PACKET_DISABLE_PCI_ACCESS))
dev_warn(hdev->dev,
"Failed to disable PCI access by F/W\n");
}
......
......@@ -11,6 +11,7 @@
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/module.h>
#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
......@@ -408,6 +409,8 @@ static int hl_pci_probe(struct pci_dev *pdev,
pci_set_drvdata(pdev, hdev);
pci_enable_pcie_error_reporting(pdev);
rc = hl_device_init(hdev, hl_class);
if (rc) {
dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
......@@ -440,22 +443,93 @@ static void hl_pci_remove(struct pci_dev *pdev)
return;
hl_device_fini(hdev);
pci_disable_pcie_error_reporting(pdev);
pci_set_drvdata(pdev, NULL);
destroy_hdev(hdev);
}
/**
* hl_pci_err_detected - a PCI bus error detected on this device
*
* @pdev: pointer to pci device
* @state: PCI error type
*
* Called by the PCI subsystem whenever a non-correctable
* PCI bus error is detected
*/
static pci_ers_result_t
hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
enum pci_ers_result result;
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
dev_warn(hdev->dev, "frozen state error detected\n");
result = PCI_ERS_RESULT_NEED_RESET;
break;
case pci_channel_io_perm_failure:
dev_warn(hdev->dev, "failure state error detected\n");
result = PCI_ERS_RESULT_DISCONNECT;
break;
default:
result = PCI_ERS_RESULT_NONE;
}
hdev->asic_funcs->halt_engines(hdev, true);
return result;
}
/**
* hl_pci_err_resume - resume after a PCI slot reset
*
* @pdev: pointer to pci device
*
*/
static void hl_pci_err_resume(struct pci_dev *pdev)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
hl_device_resume(hdev);
}
/**
* hl_pci_err_slot_reset - a PCI slot reset has just happened
*
* @pdev: pointer to pci device
*
* Determine if the driver can recover from the PCI slot reset
*/
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{
return PCI_ERS_RESULT_RECOVERED;
}
static const struct dev_pm_ops hl_pm_ops = {
.suspend = hl_pmops_suspend,
.resume = hl_pmops_resume,
};
static const struct pci_error_handlers hl_pci_err_handler = {
.error_detected = hl_pci_err_detected,
.slot_reset = hl_pci_err_slot_reset,
.resume = hl_pci_err_resume,
};
static struct pci_driver hl_pci_driver = {
.name = HL_NAME,
.id_table = ids,
.probe = hl_pci_probe,
.remove = hl_pci_remove,
.driver.pm = &hl_pm_ops,
.err_handler = &hl_pci_err_handler,
};
/*
......
......@@ -8,6 +8,7 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
......@@ -64,14 +65,14 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.dram_enabled = 1;
hw_ip.num_of_events = prop->num_of_events;
memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
memcpy(hw_ip.card_name, prop->armcp_info.card_name,
memcpy(hw_ip.card_name, prop->cpucp_info.card_name,
min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->armcp_info.card_location);
hw_ip.cpld_version = le32_to_cpu(prop->cpucp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->cpucp_info.card_location);
hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
......@@ -131,7 +132,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
return -EINVAL;
hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
&hw_idle.busy_engines_mask, NULL);
&hw_idle.busy_engines_mask_ext, NULL);
return copy_to_user(out, &hw_idle,
min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
......@@ -276,10 +277,45 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}
static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_pci_counters pci_counters = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_pci_counters_get(hdev, &pci_counters);
if (rc)
return rc;
return copy_to_user(out, &pci_counters,
min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
}
static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_clk_throttle clk_throttle = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
return copy_to_user(out, &clk_throttle,
min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
}
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_cs_counters cs_counters = {0};
struct hl_info_cs_counters cs_counters = { {0} };
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
......@@ -297,6 +333,51 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
}
static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_info_sync_manager sm_info = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
if (args->dcore_id >= HL_MAX_DCORES)
return -EINVAL;
sm_info.first_available_sync_object =
prop->first_available_user_sob[args->dcore_id];
sm_info.first_available_monitor =
prop->first_available_user_mon[args->dcore_id];
return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
sizeof(sm_info))) ? -EFAULT : 0;
}
static int total_energy_consumption_info(struct hl_fpriv *hpriv,
struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_energy total_energy = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_total_energy_get(hdev,
&total_energy.total_energy_consumption);
if (rc)
return rc;
return copy_to_user(out, &total_energy,
min((size_t) max_size, sizeof(total_energy))) ? -EFAULT : 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
......@@ -360,6 +441,18 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_CS_COUNTERS:
return cs_counters_info(hpriv, args);
case HL_INFO_PCI_COUNTERS:
return pci_counters_info(hpriv, args);
case HL_INFO_CLK_THROTTLE_REASON:
return clk_throttle_info(hpriv, args);
case HL_INFO_SYNC_MANAGER:
return sync_manager_info(hpriv, args);
case HL_INFO_TOTAL_ENERGY:
return total_energy_consumption_info(hpriv, args);
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
......
......@@ -288,10 +288,10 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
ptr = cb->bus_address;
cq_pkt.data = cpu_to_le32(
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
(1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT) |
(1 << CQ_ENTRY_READY_SHIFT));
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
FIELD_PREP(CQ_ENTRY_READY_MASK, 1));
/*
* No need to protect pi_offset because scheduling to the
......@@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
* wait CS was submitted.
*/
mb();
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
cs->signal_fence = NULL;
}
}
......
......@@ -13,7 +13,7 @@
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct armcp_sensor *sensors_arr)
struct cpucp_sensor *sensors_arr)
{
u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
......@@ -24,7 +24,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
enum hwmon_sensor_types type;
int rc, i, j;
for (i = 0 ; i < ARMCP_MAX_SENSORS ; i++) {
for (i = 0 ; i < CPUCP_MAX_SENSORS ; i++) {
type = le32_to_cpu(sensors_arr[i].type);
if ((type == 0) && (sensors_arr[i].flags == 0))
......@@ -311,13 +311,13 @@ static const struct hwmon_ops hl_hwmon_ops = {
int hl_get_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
......@@ -337,13 +337,13 @@ int hl_get_temperature(struct hl_device *hdev,
int hl_set_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
......@@ -362,13 +362,13 @@ int hl_set_temperature(struct hl_device *hdev,
int hl_get_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
......@@ -388,13 +388,13 @@ int hl_get_voltage(struct hl_device *hdev,
int hl_get_current(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
......@@ -414,13 +414,13 @@ int hl_get_current(struct hl_device *hdev,
int hl_get_fan_speed(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FAN_SPEED_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
......@@ -440,13 +440,13 @@ int hl_get_fan_speed(struct hl_device *hdev,
int hl_get_pwm_info(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
......@@ -466,13 +466,13 @@ int hl_get_pwm_info(struct hl_device *hdev,
void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = cpu_to_le64(value);
......@@ -489,13 +489,13 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
int hl_set_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
......@@ -514,13 +514,13 @@ int hl_set_voltage(struct hl_device *hdev,
int hl_set_current(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
......@@ -549,7 +549,7 @@ int hl_hwmon_init(struct hl_device *hdev)
hdev->hl_chip_info->ops = &hl_hwmon_ops;
hdev->hwmon_dev = hwmon_device_register_with_info(dev,
prop->armcp_info.card_name, hdev,
prop->cpucp_info.card_name, hdev,
hdev->hl_chip_info, NULL);
if (IS_ERR(hdev->hwmon_dev)) {
rc = PTR_ERR(hdev->hwmon_dev);
......
......@@ -11,7 +11,7 @@
/**
* struct hl_eqe_work - This structure is used to schedule work of EQ
* entry and armcp_reset event
* entry and cpucp_reset event
*
* @eq_work: workqueue object to run when EQ entry is received
* @hdev: pointer to device structure
......
......@@ -505,41 +505,32 @@ static inline int add_va_block(struct hl_device *hdev,
}
/*
* get_va_block - get a virtual block with the requested size
*
* @hdev : pointer to the habanalabs device structure
* @va_range : pointer to the virtual addresses range
* @size : requested block size
* @hint_addr : hint for request address by the user
* @is_userptr : is host or DRAM memory
* get_va_block() - get a virtual block for the given size and alignment.
* @hdev: pointer to the habanalabs device structure.
* @va_range: pointer to the virtual addresses range.
* @size: requested block size.
* @hint_addr: hint for requested address by the user.
* @va_block_align: required alignment of the virtual block start address.
*
* This function does the following:
* - Iterate on the virtual block list to find a suitable virtual block for the
* requested size
* - Reserve the requested block and update the list
* - Return the start address of the virtual block
* given size and alignment.
* - Reserve the requested block and update the list.
* - Return the start address of the virtual block.
*/
static u64 get_va_block(struct hl_device *hdev,
struct hl_va_range *va_range, u64 size, u64 hint_addr,
bool is_userptr)
static u64 get_va_block(struct hl_device *hdev, struct hl_va_range *va_range,
u64 size, u64 hint_addr, u32 va_block_align)
{
struct hl_vm_va_block *va_block, *new_va_block = NULL;
u64 valid_start, valid_size, prev_start, prev_end, page_mask,
u64 valid_start, valid_size, prev_start, prev_end, align_mask,
res_valid_start = 0, res_valid_size = 0;
u32 page_size;
bool add_prev = false;
if (is_userptr)
/*
* We cannot know if the user allocated memory with huge pages
* or not, hence we continue with the biggest possible
* granularity.
*/
page_size = hdev->asic_prop.pmmu_huge.page_size;
else
page_size = hdev->asic_prop.dmmu.page_size;
align_mask = ~((u64)va_block_align - 1);
page_mask = ~((u64)page_size - 1);
/* check if hint_addr is aligned */
if (hint_addr & (va_block_align - 1))
hint_addr = 0;
mutex_lock(&va_range->lock);
......@@ -549,9 +540,9 @@ static u64 get_va_block(struct hl_device *hdev,
/* calc the first possible aligned addr */
valid_start = va_block->start;
if (valid_start & (page_size - 1)) {
valid_start &= page_mask;
valid_start += page_size;
if (valid_start & (va_block_align - 1)) {
valid_start &= align_mask;
valid_start += va_block_align;
if (valid_start > va_block->end)
continue;
}
......@@ -863,7 +854,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
struct hl_va_range *va_range;
enum vm_type_t *vm_type;
u64 ret_vaddr, hint_addr;
u32 handle = 0;
u32 handle = 0, va_block_align;
int rc;
bool is_userptr = args->flags & HL_MEM_USERPTR;
......@@ -873,6 +864,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
if (is_userptr) {
u64 addr = args->map_host.host_virt_addr,
size = args->map_host.mem_size;
u32 page_size = hdev->asic_prop.pmmu.page_size,
huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
rc = dma_map_host_va(hdev, addr, size, &userptr);
if (rc) {
......@@ -892,6 +885,27 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) userptr;
hint_addr = args->map_host.hint_addr;
handle = phys_pg_pack->handle;
/* get required alignment */
if (phys_pg_pack->page_size == page_size) {
va_range = ctx->host_va_range;
/*
* huge page alignment may be needed in case of regular
* page mapping, depending on the host VA alignment
*/
if (addr & (huge_page_size - 1))
va_block_align = page_size;
else
va_block_align = huge_page_size;
} else {
/*
* huge page alignment is needed in case of huge page
* mapping
*/
va_range = ctx->host_huge_va_range;
va_block_align = huge_page_size;
}
} else {
handle = lower_32_bits(args->map_device.handle);
......@@ -912,6 +926,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) phys_pg_pack;
hint_addr = args->map_device.hint_addr;
/* DRAM VA alignment is the same as the DRAM page size */
va_range = ctx->dram_va_range;
va_block_align = hdev->asic_prop.dmmu.page_size;
}
/*
......@@ -933,16 +951,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto hnode_err;
}
if (is_userptr)
if (phys_pg_pack->page_size == hdev->asic_prop.pmmu.page_size)
va_range = ctx->host_va_range;
else
va_range = ctx->host_huge_va_range;
else
va_range = ctx->dram_va_range;
ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
hint_addr, is_userptr);
hint_addr, va_block_align);
if (!ret_vaddr) {
dev_err(hdev->dev, "no available va block for handle %u\n",
handle);
......
......@@ -9,7 +9,6 @@
#include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h>
#include <linux/bitfield.h>
#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10)
......@@ -339,12 +338,17 @@ static int hl_pci_set_dma_mask(struct hl_device *hdev)
/**
* hl_pci_init() - PCI initialization code.
* @hdev: Pointer to hl_device structure.
* @cpu_boot_status_reg: status register of the device's CPU
* @boot_err0_reg: boot error register of the device's CPU
* @preboot_ver_timeout: how much to wait before bailing out on reading
* the preboot version
*
* Set DMA masks, initialize the PCI controller and map the PCI BARs.
*
* Return: 0 on success, non-zero for failure.
*/
int hl_pci_init(struct hl_device *hdev)
int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 preboot_ver_timeout)
{
struct pci_dev *pdev = hdev->pdev;
int rc;
......@@ -376,6 +380,15 @@ int hl_pci_init(struct hl_device *hdev)
if (rc)
goto unmap_pci_bars;
/* Before continuing in the initialization, we need to read the preboot
* version to determine whether we run with a security-enabled firmware
* The check will be done in each ASIC's specific code
*/
rc = hl_fw_read_preboot_ver(hdev, cpu_boot_status_reg, boot_err0_reg,
preboot_ver_timeout);
if (rc)
goto unmap_pci_bars;
return 0;
unmap_pci_bars:
......
......@@ -11,18 +11,18 @@
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
if (curr)
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_CURR_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
else
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
......@@ -40,13 +40,13 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
pkt.value = cpu_to_le64(freq);
......@@ -61,14 +61,14 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
u64 hl_get_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);
......@@ -83,13 +83,13 @@ u64 hl_get_max_power(struct hl_device *hdev)
void hl_set_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(hdev->max_power);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
......@@ -112,7 +112,7 @@ static ssize_t armcp_kernel_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.kernel_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
......@@ -120,7 +120,7 @@ static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.armcp_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
......@@ -129,7 +129,23 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%08x\n",
hdev->asic_prop.armcp_info.cpld_version);
hdev->asic_prop.cpucp_info.cpld_version);
}
static ssize_t cpucp_kernel_ver_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t cpucp_ver_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t infineon_ver_show(struct device *dev,
......@@ -138,7 +154,7 @@ static ssize_t infineon_ver_show(struct device *dev,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%04x\n",
hdev->asic_prop.armcp_info.infineon_version);
hdev->asic_prop.cpucp_info.infineon_version);
}
static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
......@@ -146,7 +162,7 @@ static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.fuse_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.fuse_version);
}
static ssize_t thermal_ver_show(struct device *dev,
......@@ -154,7 +170,7 @@ static ssize_t thermal_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.thermal_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.thermal_version);
}
static ssize_t preboot_btl_ver_show(struct device *dev,
......@@ -356,6 +372,8 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj,
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
static DEVICE_ATTR_RO(cpucp_kernel_ver);
static DEVICE_ATTR_RO(cpucp_ver);
static DEVICE_ATTR_RO(device_type);
static DEVICE_ATTR_RO(fuse_ver);
static DEVICE_ATTR_WO(hard_reset);
......@@ -380,6 +398,8 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_armcp_kernel_ver.attr,
&dev_attr_armcp_ver.attr,
&dev_attr_cpld_ver.attr,
&dev_attr_cpucp_kernel_ver.attr,
&dev_attr_cpucp_ver.attr,
&dev_attr_device_type.attr,
&dev_attr_fuse_ver.attr,
&dev_attr_hard_reset.attr,
......
......@@ -35,8 +35,6 @@
#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
#endif
#define QMAN_FENCE_TIMEOUT_USEC 10000 /* 10 ms */
#define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */
#define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */
......@@ -44,7 +42,7 @@
#define MAX_POWER_DEFAULT_PCI 200000 /* 200W */
#define MAX_POWER_DEFAULT_PMC 350000 /* 350W */
#define GAUDI_CPU_TIMEOUT_USEC 15000000 /* 15s */
#define GAUDI_CPU_TIMEOUT_USEC 30000000 /* 30s */
#define TPC_ENABLED_MASK 0xFF
......@@ -142,28 +140,28 @@
#define VA_HOST_SPACE_SIZE (VA_HOST_SPACE_END - \
VA_HOST_SPACE_START) /* 767TB */
#define HW_CAP_PLL 0x00000001
#define HW_CAP_HBM 0x00000002
#define HW_CAP_MMU 0x00000004
#define HW_CAP_MME 0x00000008
#define HW_CAP_CPU 0x00000010
#define HW_CAP_PCI_DMA 0x00000020
#define HW_CAP_MSI 0x00000040
#define HW_CAP_CPU_Q 0x00000080
#define HW_CAP_HBM_DMA 0x00000100
#define HW_CAP_CLK_GATE 0x00000200
#define HW_CAP_SRAM_SCRAMBLER 0x00000400
#define HW_CAP_HBM_SCRAMBLER 0x00000800
#define HW_CAP_TPC0 0x01000000
#define HW_CAP_TPC1 0x02000000
#define HW_CAP_TPC2 0x04000000
#define HW_CAP_TPC3 0x08000000
#define HW_CAP_TPC4 0x10000000
#define HW_CAP_TPC5 0x20000000
#define HW_CAP_TPC6 0x40000000
#define HW_CAP_TPC7 0x80000000
#define HW_CAP_TPC_MASK 0xFF000000
#define HW_CAP_PLL BIT(0)
#define HW_CAP_HBM BIT(1)
#define HW_CAP_MMU BIT(2)
#define HW_CAP_MME BIT(3)
#define HW_CAP_CPU BIT(4)
#define HW_CAP_PCI_DMA BIT(5)
#define HW_CAP_MSI BIT(6)
#define HW_CAP_CPU_Q BIT(7)
#define HW_CAP_HBM_DMA BIT(8)
#define HW_CAP_CLK_GATE BIT(9)
#define HW_CAP_SRAM_SCRAMBLER BIT(10)
#define HW_CAP_HBM_SCRAMBLER BIT(11)
#define HW_CAP_TPC0 BIT(24)
#define HW_CAP_TPC1 BIT(25)
#define HW_CAP_TPC2 BIT(26)
#define HW_CAP_TPC3 BIT(27)
#define HW_CAP_TPC4 BIT(28)
#define HW_CAP_TPC5 BIT(29)
#define HW_CAP_TPC6 BIT(30)
#define HW_CAP_TPC7 BIT(31)
#define HW_CAP_TPC_MASK GENMASK(31, 24)
#define HW_CAP_TPC_SHIFT 24
#define GAUDI_CPU_PCI_MSB_ADDR(addr) (((addr) & GENMASK_ULL(49, 39)) >> 39)
......@@ -216,7 +214,7 @@ struct gaudi_internal_qman_info {
/**
* struct gaudi_device - ASIC specific manage structure.
* @armcp_info_get: get information on device from ArmCP
* @cpucp_info_get: get information on device from CPU-CP
* @hw_queues_lock: protects the H/W queues from concurrent access.
* @clk_gate_mutex: protects code areas that require clock gating to be disabled
* temporarily
......@@ -239,7 +237,7 @@ struct gaudi_internal_qman_info {
* 8-bit value so use u8.
*/
struct gaudi_device {
int (*armcp_info_get)(struct hl_device *hdev);
int (*cpucp_info_get)(struct hl_device *hdev);
/* TODO: remove hw_queues_lock after moving to scheduler code */
spinlock_t hw_queues_lock;
......
......@@ -207,7 +207,7 @@ void goya_set_max_power(struct hl_device *hdev, u64 value);
void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
void goya_add_device_attr(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
int goya_armcp_info_get(struct hl_device *hdev);
int goya_cpucp_info_get(struct hl_device *hdev);
int goya_debug_coresight(struct hl_device *hdev, void *data);
void goya_halt_coresight(struct hl_device *hdev);
......
......@@ -40,7 +40,7 @@ struct hl_bd {
*/
#define BD_CTL_COMP_OFFSET_SHIFT 16
#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
#define BD_CTL_COMP_OFFSET_MASK 0x0FFF0000
#define BD_CTL_COMP_DATA_SHIFT 0
#define BD_CTL_COMP_DATA_MASK 0x0000FFFF
......
......@@ -44,6 +44,8 @@
#define MME_NUMBER_OF_MASTER_ENGINES 2
#define MME_NUMBER_OF_SLAVE_ENGINES 2
#define TPC_NUMBER_OF_ENGINES 8
#define DMA_NUMBER_OF_CHANNELS 8
......
......@@ -12,6 +12,7 @@
* PSOC scratch-pad registers
*/
#define mmHW_STATE mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25
......
......@@ -22,6 +22,7 @@
#define mmCPU_CQ_BASE_ADDR_LOW mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
#define mmCPU_CQ_BASE_ADDR_HIGH mmPSOC_GLOBAL_CONF_SCRATCHPAD_9
#define mmCPU_CQ_LENGTH mmPSOC_GLOBAL_CONF_SCRATCHPAD_10
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25
......
......@@ -29,6 +29,8 @@
#define HOP3_SHIFT 21
#define HOP4_SHIFT 12
#define MMU_ARCH_5_HOPS 5
#define HOP_PHYS_ADDR_MASK (~FLAGS_MASK)
#define HL_PTE_SIZE sizeof(u64)
......