Commit ed1e1e42 authored by YiPeng Chai, committed by Alex Deucher

drm/amdgpu: Support passing poison consumption ras block to SRIOV

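Support passing the poison consumption RAS block to SRIOV.
The ras_poison_handler virt op and the UMC/KFD poison handlers now take
an enum amdgpu_ras_block argument; xgpu_nv forwards it as the first
parameter of the IDH_RAS_POISON request when the UMC IP version is
12.0.0 or newer, and sends the request without parameters otherwise.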
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 1630c6de
...@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev) ...@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
amdgpu_device_flush_hdp(adev, NULL); amdgpu_device_flush_hdp(adev, NULL);
} }
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset)
{ {
amdgpu_umc_poison_handler(adev, reset); amdgpu_umc_poison_handler(adev, block, reset);
} }
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
......
...@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); ...@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config); struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool reset); enum amdgpu_ras_block block, bool reset);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p); void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p); int amdgpu_amdkfd_criu_resume(void *p);
......
...@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * ...@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
} }
} }
amdgpu_umc_poison_handler(adev, false); amdgpu_umc_poison_handler(adev, obj->head.block, false);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
......
...@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, ...@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
return 0; return 0;
} }
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset)
{ {
int ret = AMDGPU_RAS_SUCCESS; int ret = AMDGPU_RAS_SUCCESS;
...@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) ...@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
} }
} else { } else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler) if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev); adev->virt.ops->ras_poison_handler(adev, block);
else else
dev_warn(adev->dev, dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV!\n"); "No ras_poison_handler interface in SRIOV!\n");
......
...@@ -102,7 +102,8 @@ struct amdgpu_umc { ...@@ -102,7 +102,8 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset); int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry);
......
...@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev, ...@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data); amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else { } else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler) if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev); adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else else
dev_warn(adev->dev, dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for VCN!\n"); "No ras_poison_handler interface in SRIOV for VCN!\n");
......
...@@ -88,7 +88,8 @@ struct amdgpu_virt_ops { ...@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
int (*wait_reset)(struct amdgpu_device *adev); int (*wait_reset)(struct amdgpu_device *adev);
void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req, void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
u32 data1, u32 data2, u32 data3); u32 data1, u32 data2, u32 data3);
void (*ras_poison_handler)(struct amdgpu_device *adev); void (*ras_poison_handler)(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
}; };
/* /*
......
...@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev, ...@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data); amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else { } else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler) if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev); adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else else
dev_warn(adev->dev, dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name); "No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name);
......
...@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev) ...@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
} }
static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev) static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{ {
xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
} }
......
...@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev, ...@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
xgpu_nv_mailbox_set_valid(adev, false); xgpu_nv_mailbox_set_valid(adev, false);
} }
static int xgpu_nv_send_access_requests(struct amdgpu_device *adev, static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
enum idh_request req) enum idh_request req, u32 data1, u32 data2, u32 data3)
{ {
int r, retry = 1; int r, retry = 1;
enum idh_event event = -1; enum idh_event event = -1;
send_request: send_request:
xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0); xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);
switch (req) { switch (req) {
case IDH_REQ_GPU_INIT_ACCESS: case IDH_REQ_GPU_INIT_ACCESS:
...@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev, ...@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
return 0; return 0;
} }
static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
enum idh_request req)
{
return xgpu_nv_send_access_requests_with_param(adev,
req, 0, 0, 0);
}
static int xgpu_nv_request_reset(struct amdgpu_device *adev) static int xgpu_nv_request_reset(struct amdgpu_device *adev)
{ {
int ret, i = 0; int ret, i = 0;
...@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) ...@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
} }
static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev) static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{ {
xgpu_nv_send_access_requests(adev, IDH_RAS_POISON); if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
} else {
xgpu_nv_send_access_requests_with_param(adev,
IDH_RAS_POISON, block, 0, 0);
}
} }
const struct amdgpu_virt_ops xgpu_nv_virt_ops = { const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
......
...@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { ...@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
static void event_interrupt_poison_consumption(struct kfd_node *dev, static void event_interrupt_poison_consumption(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) uint16_t pasid, uint16_t client_id)
{ {
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL; int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
...@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2: case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid); ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
break; break;
case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2: case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4: case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
break; break;
default: default:
break; break;
...@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n", "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
} else { } else {
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n", "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} }
} }
......
...@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) ...@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
uint16_t pasid, uint16_t source_id) uint16_t pasid, uint16_t source_id)
{ {
enum amdgpu_ras_block block = 0;
int ret = -EINVAL; int ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
...@@ -210,9 +211,11 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, ...@@ -210,9 +211,11 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
case SOC15_INTSRC_SQ_INTERRUPT_MSG: case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues) if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
break; break;
case SOC21_INTSRC_SDMA_ECC: case SOC21_INTSRC_SDMA_ECC:
default: default:
block = AMDGPU_RAS_BLOCK__GFX;
break; break;
} }
...@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, ...@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset /* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */ resetting queue fails, fallback to gpu reset solution */
if (!ret) if (!ret)
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
else else
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} }
static bool event_interrupt_isr_v11(struct kfd_node *dev, static bool event_interrupt_isr_v11(struct kfd_node *dev,
......
...@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { ...@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) uint16_t pasid, uint16_t client_id)
{ {
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL; int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
...@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2: case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid); ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
break; break;
case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2: case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4: case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
break; break;
default: default:
break; break;
...@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n", "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
} else { } else {
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n", "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment