Commit b78cda79 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.5:

- MMU code improvements that include:
  - Distinguish between "normal" unmapping and unmapping that is done as
    part of the tear-down of a user process. This improves performance of
    unmapping during reset of the device.
  - Add future ASIC support in generic MMU code.

- Improve device reset code by adding more protection around accessing the
  device during the reset process.

- Add new H/W queue type for future ASIC support.

- Add more information to be retrieved by users through the INFO IOCTL (see the usage sketch after this list):
  - clock rate
  - board name
  - reset counters

- Small bug fixes and minor improvements to code.
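
As a usage illustration for the new INFO IOCTL opcodes in this merge (HL_INFO_CLK_RATE and HL_INFO_RESET_COUNT, see the uapi changes further down), below is a minimal user-space sketch. It assumes the HL_IOCTL_INFO ioctl number from the same uapi header, a /dev/hl0 device node, and that the header is installed as <misc/habanalabs.h>; those details are not shown in this diff, so treat them as assumptions rather than part of the change.

/* Minimal sketch; HL_IOCTL_INFO, /dev/hl0 and the header path are assumed. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <misc/habanalabs.h>

int main(void)
{
	struct hl_info_clk_rate clk = {0};
	struct hl_info_reset_count resets = {0};
	struct hl_info_args args;
	int fd = open("/dev/hl0", O_RDWR);

	if (fd < 0)
		return 1;

	/* HL_INFO_CLK_RATE (opcode 8): current and maximum clock rate in MHz */
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_CLK_RATE;
	args.return_pointer = (__u64) (uintptr_t) &clk;
	args.return_size = sizeof(clk);
	if (!ioctl(fd, HL_IOCTL_INFO, &args))
		printf("clk: cur %u MHz, max %u MHz\n",
		       clk.cur_clk_rate_mhz, clk.max_clk_rate_mhz);

	/* HL_INFO_RESET_COUNT (opcode 9): resets since the driver was loaded */
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_RESET_COUNT;
	args.return_pointer = (__u64) (uintptr_t) &resets;
	args.return_size = sizeof(resets);
	if (!ioctl(fd, HL_IOCTL_INFO, &args))
		printf("resets: hard %u, soft %u\n",
		       resets.hard_reset_cnt, resets.soft_reset_cnt);

	close(fd);
	return 0;
}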

* tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux: (31 commits)
  habanalabs: add more protection of device during reset
  habanalabs: flush EQ workers in hard reset
  habanalabs: make the reset code more consistent
  habanalabs: expose reset counters via existing INFO IOCTL
  habanalabs: make code more concise
  habanalabs: use defines for F/W files
  habanalabs: remove prints on successful device initialization
  habanalabs: remove unnecessary checks
  habanalabs: invalidate MMU cache only once
  habanalabs: skip VA block list update in reset flow
  habanalabs: optimize MMU unmap
  habanalabs: prevent read/write from/to the device during hard reset
  habanalabs: split MMU properties to PCI/DRAM
  habanalabs: re-factor MMU masks and documentation
  habanalabs: type specific MMU cache invalidation
  habanalabs: re-factor memory module code
  habanalabs: export uapi defines to user-space
  habanalabs: don't print error when queues are full
  habanalabs: increase max jobs number to 512
  habanalabs: set ETR as non-secured
  ...
parents 599ea01c 5feccddc
......@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
kref_put(&cs->refcount, cs_do_release);
}
static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
{
/*
* Patched CB is created for external queues jobs, and for H/W queues
* jobs if the user CB was allocated by driver and MMU is disabled.
*/
return (job->queue_type == QUEUE_TYPE_EXT ||
(job->queue_type == QUEUE_TYPE_HW &&
job->is_kernel_allocated_cb &&
!hdev->mmu_enable));
}
/*
* cs_parser - parse the user command submission
*
......@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
parser.patched_cb = NULL;
parser.user_cb = job->user_cb;
parser.user_cb_size = job->user_cb_size;
parser.ext_queue = job->ext_queue;
parser.queue_type = job->queue_type;
parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
job->patched_cb = NULL;
rc = hdev->asic_funcs->cs_parser(hdev, &parser);
if (job->ext_queue) {
if (is_cb_patched(hdev, job)) {
if (!rc) {
job->patched_cb = parser.patched_cb;
job->job_cb_size = parser.patched_cb_size;
......@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
{
struct hl_cs *cs = job->cs;
if (job->ext_queue) {
if (is_cb_patched(hdev, job)) {
hl_userptr_delete_list(hdev, &job->userptr_list);
/*
......@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
}
}
/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
* enabled, the user CB isn't released in cs_parser() and thus should be
* released here.
*/
if (job->queue_type == QUEUE_TYPE_HW &&
job->is_kernel_allocated_cb && hdev->mmu_enable) {
spin_lock(&job->user_cb->lock);
job->user_cb->cs_cnt--;
spin_unlock(&job->user_cb->lock);
hl_cb_put(job->user_cb);
}
/*
* This is the only place where there can be multiple threads
* modifying the list at the same time
......@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
hl_debugfs_remove_job(hdev, job);
if (job->ext_queue)
if (job->queue_type == QUEUE_TYPE_EXT ||
job->queue_type == QUEUE_TYPE_HW)
cs_put(cs);
kfree(job);
......@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
free_job(hdev, job);
}
static struct hl_cb *validate_queue_index(struct hl_device *hdev,
struct hl_cb_mgr *cb_mgr,
struct hl_cs_chunk *chunk,
bool *ext_queue)
static int validate_queue_index(struct hl_device *hdev,
struct hl_cs_chunk *chunk,
enum hl_queue_type *queue_type,
bool *is_kernel_allocated_cb)
{
struct asic_fixed_properties *asic = &hdev->asic_prop;
struct hw_queue_properties *hw_queue_prop;
u32 cb_handle;
struct hl_cb *cb;
/* Assume external queue */
*ext_queue = true;
hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
......@@ -406,20 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
(hw_queue_prop->type == QUEUE_TYPE_NA)) {
dev_err(hdev->dev, "Queue index %d is invalid\n",
chunk->queue_index);
return NULL;
return -EINVAL;
}
if (hw_queue_prop->driver_only) {
dev_err(hdev->dev,
"Queue index %d is restricted for the kernel driver\n",
chunk->queue_index);
return NULL;
} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
*ext_queue = false;
return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
return -EINVAL;
}
/* Retrieve CB object */
*queue_type = hw_queue_prop->type;
*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
return 0;
}
static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
struct hl_cb_mgr *cb_mgr,
struct hl_cs_chunk *chunk)
{
struct hl_cb *cb;
u32 cb_handle;
cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
cb = hl_cb_get(hdev, cb_mgr, cb_handle);
......@@ -444,7 +476,8 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
return NULL;
}
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
{
struct hl_cs_job *job;
......@@ -452,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
if (!job)
return NULL;
job->ext_queue = ext_queue;
job->queue_type = queue_type;
job->is_kernel_allocated_cb = is_kernel_allocated_cb;
if (job->ext_queue) {
if (is_cb_patched(hdev, job))
INIT_LIST_HEAD(&job->userptr_list);
if (job->queue_type == QUEUE_TYPE_EXT)
INIT_WORK(&job->finish_work, job_wq_completion);
}
return job;
}
......@@ -470,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
bool ext_queue_present = false;
bool int_queues_only = true;
u32 size_to_copy;
int rc, i, parse_cnt;
......@@ -514,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
/* Validate ALL the CS chunks before submitting the CS */
for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
struct hl_cs_chunk *chunk = &cs_chunk_array[i];
bool ext_queue;
enum hl_queue_type queue_type;
bool is_kernel_allocated_cb;
rc = validate_queue_index(hdev, chunk, &queue_type,
&is_kernel_allocated_cb);
if (rc)
goto free_cs_object;
cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
&ext_queue);
if (ext_queue) {
ext_queue_present = true;
if (is_kernel_allocated_cb) {
cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
if (!cb) {
rc = -EINVAL;
goto free_cs_object;
}
} else {
cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
}
job = hl_cs_allocate_job(hdev, ext_queue);
if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
int_queues_only = false;
job = hl_cs_allocate_job(hdev, queue_type,
is_kernel_allocated_cb);
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
if (ext_queue)
if (is_kernel_allocated_cb)
goto release_cb;
else
goto free_cs_object;
......@@ -540,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
job->cs = cs;
job->user_cb = cb;
job->user_cb_size = chunk->cb_size;
if (job->ext_queue)
if (is_kernel_allocated_cb)
job->job_cb_size = cb->size;
else
job->job_cb_size = chunk->cb_size;
......@@ -553,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
/*
* Increment CS reference. When CS reference is 0, CS is
* done and can be signaled to user and free all its resources
* Only increment for JOB on external queues, because only
* for those JOBs we get completion
* Only increment for JOB on external or H/W queues, because
* only for those JOBs we get completion
*/
if (job->ext_queue)
if (job->queue_type == QUEUE_TYPE_EXT ||
job->queue_type == QUEUE_TYPE_HW)
cs_get(cs);
hl_debugfs_add_job(hdev, job);
......@@ -570,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
}
}
if (!ext_queue_present) {
if (int_queues_only) {
dev_err(hdev->dev,
"Reject CS %d.%llu because no external queues jobs\n",
"Reject CS %d.%llu because only internal queues jobs are present\n",
cs->ctx->asid, cs->sequence);
rc = -EINVAL;
goto free_cs_object;
......@@ -580,9 +626,10 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
rc = hl_hw_queue_schedule_cs(cs);
if (rc) {
dev_err(hdev->dev,
"Failed to submit CS %d.%llu to H/W queues, error %d\n",
cs->ctx->asid, cs->sequence, rc);
if (rc != -EAGAIN)
dev_err(hdev->dev,
"Failed to submit CS %d.%llu to H/W queues, error %d\n",
cs->ctx->asid, cs->sequence, rc);
goto free_cs_object;
}
......
......@@ -307,45 +307,57 @@ static inline u64 get_hop0_addr(struct hl_ctx *ctx)
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr)
static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr, u64 mask, u64 shift)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
((virt_addr & mask) >> shift);
}
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr)
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop0_mask,
mmu_specs->hop0_shift);
}
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr)
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop1_mask,
mmu_specs->hop1_shift);
}
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr)
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop2_mask,
mmu_specs->hop2_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr)
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop3_mask,
mmu_specs->hop3_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop4_mask,
mmu_specs->hop4_shift);
}
static inline u64 get_next_hop_addr(u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & PHYS_ADDR_MASK;
return curr_pte & HOP_PHYS_ADDR_MASK;
else
return ULLONG_MAX;
}
......@@ -355,7 +367,10 @@ static int mmu_show(struct seq_file *s, void *data)
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
struct hl_ctx *ctx;
bool is_dram_addr;
u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
......@@ -377,33 +392,39 @@ static int mmu_show(struct seq_file *s, void *data)
return 0;
}
is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
mutex_lock(&ctx->mmu_lock);
/* the following lookup is copied from unmap() in mmu.c */
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
hop0_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
hop1_addr = get_next_hop_addr(hop0_pte);
if (hop1_addr == ULLONG_MAX)
goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
hop1_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
hop2_addr = get_next_hop_addr(hop1_pte);
if (hop2_addr == ULLONG_MAX)
goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
hop2_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
hop3_addr = get_next_hop_addr(hop2_pte);
if (hop3_addr == ULLONG_MAX)
goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
if (!(hop3_pte & LAST_MASK)) {
......@@ -412,7 +433,8 @@ static int mmu_show(struct seq_file *s, void *data)
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
......@@ -506,6 +528,12 @@ static int engines_show(struct seq_file *s, void *data)
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
if (atomic_read(&hdev->in_reset)) {
dev_warn_ratelimited(hdev->dev,
"Can't check device idle during reset\n");
return 0;
}
hdev->asic_funcs->is_device_idle(hdev, NULL, s);
return 0;
......@@ -534,41 +562,50 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
u64 *phys_addr)
{
struct hl_ctx *ctx = hdev->compute_ctx;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop_addr, hop_pte_addr, hop_pte;
u64 offset_mask = HOP4_MASK | OFFSET_MASK;
u64 offset_mask = HOP4_MASK | FLAGS_MASK;
int rc = 0;
bool is_dram_addr;
if (!ctx) {
dev_err(hdev->dev, "no ctx available\n");
return -EINVAL;
}
is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
mutex_lock(&ctx->mmu_lock);
/* hop 0 */
hop_addr = get_hop0_addr(ctx);
hop_pte_addr = get_hop0_pte_addr(ctx, hop_addr, virt_addr);
hop_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
/* hop 1 */
hop_addr = get_next_hop_addr(hop_pte);
if (hop_addr == ULLONG_MAX)
goto not_mapped;
hop_pte_addr = get_hop1_pte_addr(ctx, hop_addr, virt_addr);
hop_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
/* hop 2 */
hop_addr = get_next_hop_addr(hop_pte);
if (hop_addr == ULLONG_MAX)
goto not_mapped;
hop_pte_addr = get_hop2_pte_addr(ctx, hop_addr, virt_addr);
hop_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
/* hop 3 */
hop_addr = get_next_hop_addr(hop_pte);
if (hop_addr == ULLONG_MAX)
goto not_mapped;
hop_pte_addr = get_hop3_pte_addr(ctx, hop_addr, virt_addr);
hop_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
if (!(hop_pte & LAST_MASK)) {
......@@ -576,10 +613,11 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
hop_addr = get_next_hop_addr(hop_pte);
if (hop_addr == ULLONG_MAX)
goto not_mapped;
hop_pte_addr = get_hop4_pte_addr(ctx, hop_addr, virt_addr);
hop_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop_addr,
virt_addr);
hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
offset_mask = OFFSET_MASK;
offset_mask = FLAGS_MASK;
}
if (!(hop_pte & PAGE_PRESENT_MASK))
......@@ -608,6 +646,11 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
u32 val;
ssize_t rc;
if (atomic_read(&hdev->in_reset)) {
dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
return 0;
}
if (*ppos)
return 0;
......@@ -637,6 +680,11 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
u32 value;
ssize_t rc;
if (atomic_read(&hdev->in_reset)) {
dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
return 0;
}
rc = kstrtouint_from_user(buf, count, 16, &value);
if (rc)
return rc;
......
......@@ -42,12 +42,10 @@ static void hpriv_release(struct kref *ref)
{
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct hl_ctx *ctx;
hpriv = container_of(ref, struct hl_fpriv, refcount);
hdev = hpriv->hdev;
ctx = hpriv->ctx;
put_pid(hpriv->taskpid);
......@@ -889,13 +887,19 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
/* Go over all the queues, release all CS and their jobs */
hl_cs_rollback_all(hdev);
/* Kill processes here after CS rollback. This is because the process
* can't really exit until all its CSs are done, which is what we
* do in cs rollback
*/
if (from_hard_reset_thread)
if (hard_reset) {
/* Kill processes here after CS rollback. This is because the
* process can't really exit until all its CSs are done, which
* is what we do in cs rollback
*/
device_kill_open_processes(hdev);
/* Flush the Event queue workers to make sure no other thread is
* reading or writing to registers during the reset
*/
flush_workqueue(hdev->eq_wq);
}
/* Release kernel context */
if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
hdev->kernel_ctx = NULL;
......
......@@ -143,10 +143,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
if (!rc) {
if (result == ARMCP_PACKET_FENCE_VAL)
dev_info(hdev->dev,
"queue test on CPU queue succeeded\n");
else
if (result != ARMCP_PACKET_FENCE_VAL)
dev_err(hdev->dev,
"CPU queue test failed (0x%08lX)\n", result);
} else {
......
......@@ -72,6 +72,9 @@
*
*/
#define GOYA_UBOOT_FW_FILE "habanalabs/goya/goya-u-boot.bin"
#define GOYA_LINUX_FW_FILE "habanalabs/goya/goya-fit.itb"
#define GOYA_MMU_REGS_NUM 63
#define GOYA_DMA_POOL_BLK_SIZE 0x100 /* 256 bytes */
......@@ -337,17 +340,20 @@ void goya_get_fixed_properties(struct hl_device *hdev)
for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 1;
}
for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
prop->hw_queues_props[i].requires_kernel_cb = 0;
}
for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
NUMBER_OF_INT_HW_QUEUES; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 0;
}
for (; i < HL_MAX_QUEUES; i++)
......@@ -377,6 +383,23 @@ void goya_get_fixed_properties(struct hl_device *hdev)
prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->dmmu.hop0_shift = HOP0_SHIFT;
prop->dmmu.hop1_shift = HOP1_SHIFT;
prop->dmmu.hop2_shift = HOP2_SHIFT;
prop->dmmu.hop3_shift = HOP3_SHIFT;
prop->dmmu.hop4_shift = HOP4_SHIFT;
prop->dmmu.hop0_mask = HOP0_MASK;
prop->dmmu.hop1_mask = HOP1_MASK;
prop->dmmu.hop2_mask = HOP2_MASK;
prop->dmmu.hop3_mask = HOP3_MASK;
prop->dmmu.hop4_mask = HOP4_MASK;
prop->dmmu.huge_page_size = PAGE_SIZE_2MB;
/* No difference between PMMU and DMMU except of page size */
memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
prop->dmmu.page_size = PAGE_SIZE_2MB;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->va_space_host_start_address = VA_HOST_SPACE_START;
prop->va_space_host_end_address = VA_HOST_SPACE_END;
prop->va_space_dram_start_address = VA_DDR_SPACE_START;
......@@ -393,6 +416,9 @@ void goya_get_fixed_properties(struct hl_device *hdev)
prop->tpc_enabled_mask = TPC_ENABLED_MASK;
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
}
/*
......@@ -1454,6 +1480,9 @@ static void goya_init_golden_registers(struct hl_device *hdev)
1 << TPC0_NRTR_SCRAMB_EN_VAL_SHIFT);
WREG32(mmTPC0_NRTR_NON_LIN_SCRAMB + offset,
1 << TPC0_NRTR_NON_LIN_SCRAMB_EN_SHIFT);
WREG32_FIELD(TPC0_CFG_MSS_CONFIG, offset,
ICACHE_FETCH_LINE_NUM, 2);
}
WREG32(mmDMA_NRTR_SCRAMB_EN, 1 << DMA_NRTR_SCRAMB_EN_VAL_SHIFT);
......@@ -1533,7 +1562,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
u32 mtr_base_lo, mtr_base_hi;
u32 so_base_lo, so_base_hi;
u32 gic_base_lo, gic_base_hi;
u64 qman_base_addr;
mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
......@@ -1545,9 +1573,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
gic_base_hi =
upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
qman_base_addr = hdev->asic_prop.sram_base_address +
MME_QMAN_BASE_OFFSET;
WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO, so_base_lo);
......@@ -2141,13 +2166,11 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
*/
static int goya_push_uboot_to_device(struct hl_device *hdev)
{
char fw_name[200];
void __iomem *dst;
snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
return hl_fw_push_fw_to_device(hdev, fw_name, dst);
return hl_fw_push_fw_to_device(hdev, GOYA_UBOOT_FW_FILE, dst);
}
/*
......@@ -2160,13 +2183,11 @@ static int goya_push_uboot_to_device(struct hl_device *hdev)
*/
static int goya_push_linux_to_device(struct hl_device *hdev)
{
char fw_name[200];
void __iomem *dst;
snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
return hl_fw_push_fw_to_device(hdev, fw_name, dst);
return hl_fw_push_fw_to_device(hdev, GOYA_LINUX_FW_FILE, dst);
}
static int goya_pldm_init_cpu(struct hl_device *hdev)
......@@ -2291,6 +2312,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
10000,
cpu_timeout);
/* Read U-Boot version now in case we will later fail */
goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
if (rc) {
dev_err(hdev->dev, "Error in ARM u-boot!");
switch (status) {
......@@ -2328,6 +2353,11 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
"ARM status %d - u-boot stopped by user\n",
status);
break;
case CPU_BOOT_STATUS_TS_INIT_FAIL:
dev_err(hdev->dev,
"ARM status %d - Thermal Sensor initialization failed\n",
status);
break;
default:
dev_err(hdev->dev,
"ARM status %d - Invalid status code\n",
......@@ -2337,10 +2367,6 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
return -EIO;
}
/* Read U-Boot version now in case we will later fail */
goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
if (!hdev->fw_loading) {
dev_info(hdev->dev, "Skip loading FW\n");
goto out;
......@@ -2453,7 +2479,8 @@ int goya_mmu_init(struct hl_device *hdev)
WREG32_AND(mmSTLB_STLB_FEATURE_EN,
(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
WREG32(mmMMU_MMU_ENABLE, 1);
WREG32(mmMMU_SPI_MASK, 0xF);
......@@ -2978,9 +3005,6 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
"H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
hw_queue_id, (unsigned long long) fence_dma_addr, tmp);
rc = -EIO;
} else {
dev_info(hdev->dev, "queue test on H/W queue %d succeeded\n",
hw_queue_id);
}
free_pkt:
......@@ -3925,7 +3949,7 @@ static int goya_parse_cb_no_ext_queue(struct hl_device *hdev,
return 0;
dev_err(hdev->dev,
"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
"Internal CB address 0x%px + 0x%x is not in SRAM nor in DRAM\n",
parser->user_cb, parser->user_cb_size);
return -EFAULT;
......@@ -3935,7 +3959,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
{
struct goya_device *goya = hdev->asic_specific;
if (!parser->ext_queue)
if (parser->queue_type == QUEUE_TYPE_INT)
return goya_parse_cb_no_ext_queue(hdev, parser);
if (goya->hw_cap_initialized & HW_CAP_MMU)
......@@ -4606,7 +4630,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
lin_dma_pkt++;
} while (--lin_dma_pkts_cnt);
job = hl_cs_allocate_job(hdev, true);
job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
......@@ -4835,13 +4859,15 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
}
static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct goya_device *goya = hdev->asic_specific;
u32 status, timeout_usec;
int rc;
if (!(goya->hw_cap_initialized & HW_CAP_MMU))
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
/* no need in L1 only invalidation in Goya */
......@@ -4880,7 +4906,8 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
u32 status, timeout_usec, inv_data, pi;
int rc;
if (!(goya->hw_cap_initialized & HW_CAP_MMU))
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
/* no need in L1 only invalidation in Goya */
......@@ -5137,7 +5164,8 @@ static const struct hl_asic_funcs goya_funcs = {
.init_iatu = goya_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,
.halt_coresight = goya_halt_coresight
.halt_coresight = goya_halt_coresight,
.get_clk_rate = goya_get_clk_rate
};
/*
......
......@@ -233,4 +233,6 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
#endif /* GOYAP_H_ */
......@@ -8,6 +8,7 @@
#include "goyaP.h"
#include "include/goya/goya_coresight.h"
#include "include/goya/asic_reg/goya_regs.h"
#include "include/goya/asic_reg/goya_masks.h"
#include <uapi/misc/habanalabs.h>
......@@ -377,33 +378,32 @@ static int goya_config_etr(struct hl_device *hdev,
struct hl_debug_params *params)
{
struct hl_debug_params_etr *input;
u64 base_reg = mmPSOC_ETR_BASE - CFG_BASE;
u32 val;
int rc;
WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
WREG32(mmPSOC_ETR_LAR, CORESIGHT_UNLOCK);
val = RREG32(base_reg + 0x304);
val = RREG32(mmPSOC_ETR_FFCR);
val |= 0x1000;
WREG32(base_reg + 0x304, val);
WREG32(mmPSOC_ETR_FFCR, val);
val |= 0x40;
WREG32(base_reg + 0x304, val);
WREG32(mmPSOC_ETR_FFCR, val);
rc = goya_coresight_timeout(hdev, base_reg + 0x304, 6, false);
rc = goya_coresight_timeout(hdev, mmPSOC_ETR_FFCR, 6, false);
if (rc) {
dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
params->enable ? "enable" : "disable", rc);
return rc;
}
rc = goya_coresight_timeout(hdev, base_reg + 0xC, 2, true);
rc = goya_coresight_timeout(hdev, mmPSOC_ETR_STS, 2, true);
if (rc) {
dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
params->enable ? "enable" : "disable", rc);
return rc;
}
WREG32(base_reg + 0x20, 0);
WREG32(mmPSOC_ETR_CTL, 0);
if (params->enable) {
input = params->input;
......@@ -423,25 +423,26 @@ static int goya_config_etr(struct hl_device *hdev,
return -EINVAL;
}
WREG32(base_reg + 0x34, 0x3FFC);
WREG32(base_reg + 0x4, input->buffer_size);
WREG32(base_reg + 0x28, input->sink_mode);
WREG32(base_reg + 0x110, 0x700);
WREG32(base_reg + 0x118,
WREG32(mmPSOC_ETR_BUFWM, 0x3FFC);
WREG32(mmPSOC_ETR_RSZ, input->buffer_size);
WREG32(mmPSOC_ETR_MODE, input->sink_mode);
WREG32(mmPSOC_ETR_AXICTL,
0x700 | PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT);
WREG32(mmPSOC_ETR_DBALO,
lower_32_bits(input->buffer_address));
WREG32(base_reg + 0x11C,
WREG32(mmPSOC_ETR_DBAHI,
upper_32_bits(input->buffer_address));
WREG32(base_reg + 0x304, 3);
WREG32(base_reg + 0x308, 0xA);
WREG32(base_reg + 0x20, 1);
WREG32(mmPSOC_ETR_FFCR, 3);
WREG32(mmPSOC_ETR_PSCR, 0xA);
WREG32(mmPSOC_ETR_CTL, 1);
} else {
WREG32(base_reg + 0x34, 0);
WREG32(base_reg + 0x4, 0x400);
WREG32(base_reg + 0x118, 0);
WREG32(base_reg + 0x11C, 0);
WREG32(base_reg + 0x308, 0);
WREG32(base_reg + 0x28, 0);
WREG32(base_reg + 0x304, 0);
WREG32(mmPSOC_ETR_BUFWM, 0);
WREG32(mmPSOC_ETR_RSZ, 0x400);
WREG32(mmPSOC_ETR_DBALO, 0);
WREG32(mmPSOC_ETR_DBAHI, 0);
WREG32(mmPSOC_ETR_PSCR, 0);
WREG32(mmPSOC_ETR_MODE, 0);
WREG32(mmPSOC_ETR_FFCR, 0);
if (params->output_size >= sizeof(u64)) {
u32 rwp, rwphi;
......@@ -451,8 +452,8 @@ static int goya_config_etr(struct hl_device *hdev,
* the buffer is set in the RWP register (lower 32
* bits), and in the RWPHI register (upper 8 bits).
*/
rwp = RREG32(base_reg + 0x18);
rwphi = RREG32(base_reg + 0x3c) & 0xff;
rwp = RREG32(mmPSOC_ETR_RWP);
rwphi = RREG32(mmPSOC_ETR_RWPHI) & 0xff;
*(u64 *) params->output = ((u64) rwphi << 32) | rwp;
}
}
......
......@@ -32,6 +32,37 @@ void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq)
}
}
int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk)
{
long value;
if (hl_device_disabled_or_in_reset(hdev))
return -ENODEV;
value = hl_get_frequency(hdev, MME_PLL, false);
if (value < 0) {
dev_err(hdev->dev, "Failed to retrieve device max clock %ld\n",
value);
return value;
}
*max_clk = (value / 1000 / 1000);
value = hl_get_frequency(hdev, MME_PLL, true);
if (value < 0) {
dev_err(hdev->dev,
"Failed to retrieve device current clock %ld\n",
value);
return value;
}
*cur_clk = (value / 1000 / 1000);
return 0;
}
static ssize_t mme_clk_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
......
......@@ -60,11 +60,16 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask;
hw_ip.sram_size = prop->sram_size - sram_kmd_size;
hw_ip.dram_size = prop->dram_size - dram_kmd_size;
if (hw_ip.dram_size > 0)
if (hw_ip.dram_size > PAGE_SIZE)
hw_ip.dram_enabled = 1;
hw_ip.num_of_events = prop->num_of_events;
memcpy(hw_ip.armcp_version,
prop->armcp_info.armcp_version, VERSION_MAX_LEN);
memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
memcpy(hw_ip.card_name, prop->armcp_info.card_name,
min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
......@@ -179,17 +184,14 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
goto out;
}
if (output) {
if (copy_to_user((void __user *) (uintptr_t) args->output_ptr,
output,
args->output_size)) {
dev_err(hdev->dev,
"copy to user failed in debug ioctl\n");
rc = -EFAULT;
goto out;
}
if (output && copy_to_user((void __user *) (uintptr_t) args->output_ptr,
output, args->output_size)) {
dev_err(hdev->dev, "copy to user failed in debug ioctl\n");
rc = -EFAULT;
goto out;
}
out:
kfree(params);
kfree(output);
......@@ -221,6 +223,41 @@ static int device_utilization(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0;
}
static int get_clk_rate(struct hl_device *hdev, struct hl_info_args *args)
{
struct hl_info_clk_rate clk_rate = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hdev->asic_funcs->get_clk_rate(hdev, &clk_rate.cur_clk_rate_mhz,
&clk_rate.max_clk_rate_mhz);
if (rc)
return rc;
return copy_to_user(out, &clk_rate,
min((size_t) max_size, sizeof(clk_rate))) ? -EFAULT : 0;
}
static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args)
{
struct hl_info_reset_count reset_count = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
reset_count.hard_reset_cnt = hdev->hard_reset_cnt;
reset_count.soft_reset_cnt = hdev->soft_reset_cnt;
return copy_to_user(out, &reset_count,
min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
......@@ -239,6 +276,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_DEVICE_STATUS:
return device_status_info(hdev, args);
case HL_INFO_RESET_COUNT:
return get_reset_count(hdev, args);
default:
break;
}
......@@ -271,6 +311,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
rc = hw_events_info(hdev, true, args);
break;
case HL_INFO_CLK_RATE:
rc = get_clk_rate(hdev, args);
break;
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
......@@ -406,9 +450,8 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
retcode = func(hpriv, kdata);
if (cmd & IOC_OUT)
if (copy_to_user((void __user *)arg, kdata, usize))
retcode = -EFAULT;
if ((cmd & IOC_OUT) && copy_to_user((void __user *)arg, kdata, usize))
retcode = -EFAULT;
out_err:
if (retcode)
......
......@@ -260,4 +260,6 @@
#define DMA_QM_3_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
#define DMA_QM_4_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
#define PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT 1
#endif /* ASIC_REG_GOYA_MASKS_H_ */
......@@ -84,6 +84,7 @@
#include "tpc6_rtr_regs.h"
#include "tpc7_nrtr_regs.h"
#include "tpc0_eml_cfg_regs.h"
#include "psoc_etr_regs.h"
#include "psoc_global_conf_masks.h"
#include "dma_macro_masks.h"
......
/* SPDX-License-Identifier: GPL-2.0
*
* Copyright 2016-2018 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
/************************************
** This is an auto-generated file **
** DO NOT EDIT BELOW **
************************************/
#ifndef ASIC_REG_PSOC_ETR_REGS_H_
#define ASIC_REG_PSOC_ETR_REGS_H_
/*
*****************************************
* PSOC_ETR (Prototype: ETR)
*****************************************
*/
#define mmPSOC_ETR_RSZ 0x2C43004
#define mmPSOC_ETR_STS 0x2C4300C
#define mmPSOC_ETR_RRD 0x2C43010
#define mmPSOC_ETR_RRP 0x2C43014
#define mmPSOC_ETR_RWP 0x2C43018
#define mmPSOC_ETR_TRG 0x2C4301C
#define mmPSOC_ETR_CTL 0x2C43020
#define mmPSOC_ETR_RWD 0x2C43024
#define mmPSOC_ETR_MODE 0x2C43028
#define mmPSOC_ETR_LBUFLEVEL 0x2C4302C
#define mmPSOC_ETR_CBUFLEVEL 0x2C43030
#define mmPSOC_ETR_BUFWM 0x2C43034
#define mmPSOC_ETR_RRPHI 0x2C43038
#define mmPSOC_ETR_RWPHI 0x2C4303C
#define mmPSOC_ETR_AXICTL 0x2C43110
#define mmPSOC_ETR_DBALO 0x2C43118
#define mmPSOC_ETR_DBAHI 0x2C4311C
#define mmPSOC_ETR_FFSR 0x2C43300
#define mmPSOC_ETR_FFCR 0x2C43304
#define mmPSOC_ETR_PSCR 0x2C43308
#define mmPSOC_ETR_ITMISCOP0 0x2C43EE0
#define mmPSOC_ETR_ITTRFLIN 0x2C43EE8
#define mmPSOC_ETR_ITATBDATA0 0x2C43EEC
#define mmPSOC_ETR_ITATBCTR2 0x2C43EF0
#define mmPSOC_ETR_ITATBCTR1 0x2C43EF4
#define mmPSOC_ETR_ITATBCTR0 0x2C43EF8
#define mmPSOC_ETR_ITCTRL 0x2C43F00
#define mmPSOC_ETR_CLAIMSET 0x2C43FA0
#define mmPSOC_ETR_CLAIMCLR 0x2C43FA4
#define mmPSOC_ETR_LAR 0x2C43FB0
#define mmPSOC_ETR_LSR 0x2C43FB4
#define mmPSOC_ETR_AUTHSTATUS 0x2C43FB8
#define mmPSOC_ETR_DEVID 0x2C43FC8
#define mmPSOC_ETR_DEVTYPE 0x2C43FCC
#define mmPSOC_ETR_PERIPHID4 0x2C43FD0
#define mmPSOC_ETR_PERIPHID5 0x2C43FD4
#define mmPSOC_ETR_PERIPHID6 0x2C43FD8
#define mmPSOC_ETR_PERIPHID7 0x2C43FDC
#define mmPSOC_ETR_PERIPHID0 0x2C43FE0
#define mmPSOC_ETR_PERIPHID1 0x2C43FE4
#define mmPSOC_ETR_PERIPHID2 0x2C43FE8
#define mmPSOC_ETR_PERIPHID3 0x2C43FEC
#define mmPSOC_ETR_COMPID0 0x2C43FF0
#define mmPSOC_ETR_COMPID1 0x2C43FF4
#define mmPSOC_ETR_COMPID2 0x2C43FF8
#define mmPSOC_ETR_COMPID3 0x2C43FFC
#endif /* ASIC_REG_PSOC_ETR_REGS_H_ */
......@@ -20,6 +20,8 @@ enum cpu_boot_status {
CPU_BOOT_STATUS_DRAM_INIT_FAIL,
CPU_BOOT_STATUS_FIT_CORRUPTED,
CPU_BOOT_STATUS_UBOOT_NOT_READY,
CPU_BOOT_STATUS_RESERVED,
CPU_BOOT_STATUS_TS_INIT_FAIL,
};
enum kmd_msg {
......
......@@ -12,18 +12,16 @@
#define PAGE_SHIFT_2MB 21
#define PAGE_SIZE_2MB (_AC(1, UL) << PAGE_SHIFT_2MB)
#define PAGE_SIZE_4KB (_AC(1, UL) << PAGE_SHIFT_4KB)
#define PAGE_MASK_2MB (~(PAGE_SIZE_2MB - 1))
#define PAGE_PRESENT_MASK 0x0000000000001ull
#define SWAP_OUT_MASK 0x0000000000004ull
#define LAST_MASK 0x0000000000800ull
#define PHYS_ADDR_MASK 0xFFFFFFFFFFFFF000ull
#define HOP0_MASK 0x3000000000000ull
#define HOP1_MASK 0x0FF8000000000ull
#define HOP2_MASK 0x0007FC0000000ull
#define HOP3_MASK 0x000003FE00000ull
#define HOP4_MASK 0x00000001FF000ull
#define OFFSET_MASK 0x0000000000FFFull
#define FLAGS_MASK 0x0000000000FFFull
#define HOP0_SHIFT 48
#define HOP1_SHIFT 39
......@@ -31,8 +29,7 @@
#define HOP3_SHIFT 21
#define HOP4_SHIFT 12
#define PTE_PHYS_ADDR_SHIFT 12
#define PTE_PHYS_ADDR_MASK ~OFFSET_MASK
#define HOP_PHYS_ADDR_MASK (~FLAGS_MASK)
#define HL_PTE_SIZE sizeof(u64)
#define HOP_TABLE_SIZE PAGE_SIZE_4KB
......
......@@ -23,6 +23,8 @@ struct hl_bd {
#define HL_BD_SIZE sizeof(struct hl_bd)
/*
* S/W CTL FIELDS.
*
* BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
* valid. 1 means the repeat field is valid, 0 means not-valid,
* i.e. repeat == 1
......@@ -33,6 +35,16 @@ struct hl_bd {
#define BD_CTL_SHADOW_INDEX_SHIFT 0
#define BD_CTL_SHADOW_INDEX_MASK 0x00000FFF
/*
* H/W CTL FIELDS
*/
#define BD_CTL_COMP_OFFSET_SHIFT 16
#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
#define BD_CTL_COMP_DATA_SHIFT 0
#define BD_CTL_COMP_DATA_MASK 0x0000FFFF
/*
* COMPLETION QUEUE
*/
......
......@@ -88,13 +88,19 @@ enum hl_device_status {
* internal engine.
* HL_INFO_DEVICE_STATUS - Retrieve the device's status. This opcode doesn't
* require an open context.
* HL_INFO_DEVICE_UTILIZATION - Retrieve the total utilization of the device
* over the last period specified by the user.
* The period can be between 100ms to 1s, in
* resolution of 100ms. The return value is a
* percentage of the utilization rate.
* HL_INFO_DEVICE_UTILIZATION - Retrieve the total utilization of the device
* over the last period specified by the user.
* The period can be between 100ms to 1s, in
* resolution of 100ms. The return value is a
* percentage of the utilization rate.
* HL_INFO_HW_EVENTS_AGGREGATE - Receive an array describing how many times each
* event occurred since the driver was loaded.
* HL_INFO_CLK_RATE - Retrieve the current and maximum clock rate
* of the device in MHz. The maximum clock rate is
* configurable via sysfs parameter
* HL_INFO_RESET_COUNT - Retrieve the counts of the soft and hard reset
* operations performed on the device since the last
* time the driver was loaded.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
......@@ -103,8 +109,11 @@ enum hl_device_status {
#define HL_INFO_DEVICE_STATUS 4
#define HL_INFO_DEVICE_UTILIZATION 6
#define HL_INFO_HW_EVENTS_AGGREGATE 7
#define HL_INFO_CLK_RATE 8
#define HL_INFO_RESET_COUNT 9
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
struct hl_info_hw_ip_info {
__u64 sram_base_address;
......@@ -123,6 +132,7 @@ struct hl_info_hw_ip_info {
__u8 dram_enabled;
__u8 pad[2];
__u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
};
struct hl_info_dram_usage {
......@@ -149,6 +159,16 @@ struct hl_info_device_utilization {
__u32 pad;
};
struct hl_info_clk_rate {
__u32 cur_clk_rate_mhz;
__u32 max_clk_rate_mhz;
};
struct hl_info_reset_count {
__u32 hard_reset_cnt;
__u32 soft_reset_cnt;
};
struct hl_info_args {
/* Location of relevant struct in userspace */
__u64 return_pointer;
......@@ -181,13 +201,15 @@ struct hl_info_args {
/* Opcode to destroy previously created command buffer */
#define HL_CB_OP_DESTROY 1
#define HL_MAX_CB_SIZE 0x200000 /* 2MB */
struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */
__u64 cb_handle;
/* HL_CB_OP_* */
__u32 op;
/* Size of CB. Maximum size is 2MB. The minimum size that will be
* allocated, regardless of this parameter's value, is PAGE_SIZE
/* Size of CB. Maximum size is HL_MAX_CB_SIZE. The minimum size that
* will be allocated, regardless of this parameter's value, is PAGE_SIZE
*/
__u32 cb_size;
/* Context ID - Currently not in use */
......@@ -233,6 +255,8 @@ struct hl_cs_chunk {
#define HL_CS_STATUS_SUCCESS 0
#define HL_MAX_JOBS_PER_CS 512
struct hl_cs_in {
/* this holds address of array of hl_cs_chunk for restore phase */
__u64 chunks_restore;
......@@ -242,9 +266,13 @@ struct hl_cs_in {
* Currently not in use
*/
__u64 chunks_store;
/* Number of chunks in restore phase array */
/* Number of chunks in restore phase array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_restore;
/* Number of chunks in execution array */
/* Number of chunks in execution array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_execute;
/* Number of chunks in restore phase array - Currently not in use */
__u32 num_chunks_store;
......@@ -589,7 +617,7 @@ struct hl_debug_args {
*
* The user can call this IOCTL with a handle it received from the CS IOCTL
* to wait until the handle's CS has finished executing. The user will wait
* inside the kernel until the CS has finished or until the user-requeusted
* inside the kernel until the CS has finished or until the user-requested
* timeout has expired.
*
* The return value of the IOCTL is a standard Linux error code. The possible
......