drm/amd/pm: Add special handling for RAS messages

When a RAS fatal error is detected, PMFW will only process priority messages. Other messages won't be taken up for processing and therefore won't get any response in such a state. Add logic to filter out non-priority messages when RAS error is detected. Also, don't poll response response status register before sending priority messages. Use firmware capability flag to determine whether to filter priority messages. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amd/pm: Add special handling for RAS messages
When a RAS fatal error is detected, PMFW will only process priority messages. Other messages won't be taken up for processing and therefore won't get any response in such a state. Add logic to filter out non-priority messages when RAS error is detected. Also, don't poll response response status register before sending priority messages. Use firmware capability flag to determine whether to filter priority messages. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
301661b0 · Lijo Lazar · Alex Deucher · 7b8081ea · 301661b0
Commit 301661b0 authored Mar 21, 2024 by Lijo Lazar Committed by Alex Deucher Apr 09, 2024
Show whitespace changes
Inline Side-by-side

Showing with 59 additions and 6 deletions

drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +59 -6

No files found.
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -235,6 +235,50 @@ static void __smu_cmn_send_msg(struct smu_context *smu,
 	WREG32(smu->msg_reg, msg);
 }

+static inline uint32_t __smu_cmn_get_msg_flags(struct smu_context *smu,
+					       enum smu_message_type msg)
+{
+	return smu->message_map[msg].flags;
+}
+
+static int __smu_cmn_ras_filter_msg(struct smu_context *smu,
+				    enum smu_message_type msg, bool *poll)
+{
+	struct amdgpu_device *adev = smu->adev;
+	uint32_t flags, resp;
+	bool fed_status;
+
+	flags = __smu_cmn_get_msg_flags(smu, msg);
+	*poll = true;
+
+	/* When there is RAS fatal error, FW won't process non-RAS priority
+	 * messages. Don't allow any messages other than RAS priority messages.
+	 */
+	fed_status = amdgpu_ras_get_fed_status(adev);
+	if (fed_status) {
+		if (!(flags & SMU_MSG_RAS_PRI)) {
+			dev_dbg(adev->dev,
+				"RAS error detected, skip sending %s",
+				smu_get_message_name(smu, msg));
+			return -EACCES;
+		}
+
+		/* FW will ignore non-priority messages when a RAS fatal error
+		 * is detected. Hence it is possible that a previous message
+		 * wouldn't have got response. Allow to continue without polling
+		 * for response status for priority messages.
+		 */
+		resp = RREG32(smu->resp_reg);
+		dev_dbg(adev->dev,
+			"Sending RAS priority message %s response status: %x",
+			smu_get_message_name(smu, msg), resp);
+		if (resp == 0)
+			*poll = false;
+	}
+
+	return 0;
+}
+
 static int __smu_cmn_send_debug_msg(struct smu_context *smu,
 			       u32 msg,
 			       u32 param)
@@ -354,6 +398,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 {
 	struct amdgpu_device *adev = smu->adev;
 	int res, index;
+	bool poll = true;
 	u32 reg;

 	if (adev->no_hw_access)
@@ -366,13 +411,21 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 		return index == -EACCES ? 0 : index;

 	mutex_lock(&smu->message_lock);
+
+	if (smu->smc_fw_caps & SMU_FW_CAP_RAS_PRI) {
+		res = __smu_cmn_ras_filter_msg(smu, msg, &poll);
+		if (res)
+			goto Out;
+	}
+
+	if (poll) {
 		reg = __smu_cmn_poll_stat(smu);
 		res = __smu_cmn_reg2errno(smu, reg);
-	if (reg == SMU_RESP_NONE ||
-	    res == -EREMOTEIO) {
+		if (reg == SMU_RESP_NONE || res == -EREMOTEIO) {
 			__smu_cmn_reg_print_error(smu, reg, index, param, msg);
 			goto Out;
 		}
+	}
 	__smu_cmn_send_msg(smu, (uint16_t) index, param);
 	reg = __smu_cmn_poll_stat(smu);
 	res = __smu_cmn_reg2errno(smu, reg);