Commit 47f21981, authored by Hidetoshi Seto and committed by Tony Luck

[IA64] Recovery from user-mode memory error

This is the latest, updated OS_MCA handler, which tries to recover
from multibit-ECC/poisoned memory-read errors in user-land.
(Thank you very much for comments, Keith and Grant!)

I would still appreciate it if anyone with a good test environment
could apply my patch and report how it works.
(Reports from non-Tiger/non-Intel platforms are especially welcome.)
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
parent e720b735
...@@ -280,6 +280,9 @@ config COMPAT ...@@ -280,6 +280,9 @@ config COMPAT
depends on IA32_SUPPORT depends on IA32_SUPPORT
default y default y
config IA64_MCA_RECOVERY
tristate "MCA recovery from errors other than TLB."
config PERFMON config PERFMON
bool "Performance monitor support" bool "Performance monitor support"
help help
......
...@@ -17,6 +17,8 @@ obj-$(CONFIG_MODULES) += module.o ...@@ -17,6 +17,8 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
mca_recovery-y += mca_drv.o mca_drv_asm.o
# The gate DSO image is built using a special linker script. # The gate DSO image is built using a special linker script.
targets += gate.so gate-syms.o targets += gate.so gate-syms.o
......
...@@ -82,11 +82,6 @@ ...@@ -82,11 +82,6 @@
# define IA64_MCA_DEBUG(fmt...) # define IA64_MCA_DEBUG(fmt...)
#endif #endif
typedef struct ia64_fptr {
unsigned long fp;
unsigned long gp;
} ia64_fptr_t;
/* Used by mca_asm.S */ /* Used by mca_asm.S */
ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state; ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state;
ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state; ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state;
...@@ -831,6 +826,31 @@ ia64_return_to_sal_check(int recover) ...@@ -831,6 +826,31 @@ ia64_return_to_sal_check(int recover)
} }
/*
 * Function pointer for extra MCA recovery.
 *
 * NULL until a handler is installed with ia64_reg_MCA_extension(); set back
 * to NULL by ia64_unreg_MCA_extension().  When it is non-NULL and the
 * TLB-only check in ia64_mca_ucmc_handler() has not already allowed
 * recovery, the handler calls it with the current SAL MCA log buffer and
 * the SAL-to-OS / OS-to-SAL handoff states, and treats a non-zero return
 * value as "recoverable".
 */
int (*ia64_mca_ucmc_extension)
(void*,ia64_mca_sal_to_os_state_t*,ia64_mca_os_to_sal_state_t*)
= NULL;
/*
 * ia64_reg_MCA_extension
 *
 *	Install fn as the extra MCA recovery handler.
 *
 *	Only one handler can be installed at a time: if one is already
 *	present it is left untouched.
 *
 *  Inputs  :   fn  (handler to store in ia64_mca_ucmc_extension)
 *  Outputs :   0 if fn was installed, 1 if a handler was already present
 */
int
ia64_reg_MCA_extension(void *fn)
{
	int already_registered = (ia64_mca_ucmc_extension != NULL);

	if (!already_registered)
		ia64_mca_ucmc_extension = fn;

	return already_registered;
}
/*
 * ia64_unreg_MCA_extension
 *
 *	Remove the extra MCA recovery handler, if any.  After this call
 *	ia64_mca_ucmc_extension is NULL regardless of its previous value.
 *
 *  Inputs  :   None
 *  Outputs :   None
 */
void
ia64_unreg_MCA_extension(void)
{
	/* Storing NULL over NULL is harmless, so no prior check is needed. */
	ia64_mca_ucmc_extension = NULL;
}
EXPORT_SYMBOL(ia64_reg_MCA_extension);
EXPORT_SYMBOL(ia64_unreg_MCA_extension);
/* /*
* ia64_mca_ucmc_handler * ia64_mca_ucmc_handler
* *
...@@ -852,11 +872,20 @@ ia64_mca_ucmc_handler(void) ...@@ -852,11 +872,20 @@ ia64_mca_ucmc_handler(void)
{ {
pal_processor_state_info_t *psp = (pal_processor_state_info_t *) pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
&ia64_sal_to_os_handoff_state.proc_state_param; &ia64_sal_to_os_handoff_state.proc_state_param;
int recover = psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc); int recover;
/* Get the MCA error record and log it */ /* Get the MCA error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
/* TLB error is only exist in this SAL error record */
recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
/* other error recovery */
|| (ia64_mca_ucmc_extension
&& ia64_mca_ucmc_extension(
IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
&ia64_sal_to_os_handoff_state,
&ia64_os_to_sal_handoff_state));
/* /*
* Wakeup all the processors which are spinning in the rendezvous * Wakeup all the processors which are spinning in the rendezvous
* loop. * loop.
......
This diff is collapsed.
/*
* File: mca_drv.h
* Purpose: Define helpers for Generic MCA handling
*
* Copyright (C) 2004 FUJITSU LIMITED
* Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
*/
/*
* Processor error section:
*
* +-sal_log_processor_info_t *info-------------+
* | sal_log_section_hdr_t header; |
* | ... |
* | sal_log_mod_error_info_t info[0]; |
* +-+----------------+-------------------------+
* | CACHE_CHECK | ^ num_cache_check v
* +----------------+
* | TLB_CHECK | ^ num_tlb_check v
* +----------------+
* | BUS_CHECK | ^ num_bus_check v
* +----------------+
* | REG_FILE_CHECK | ^ num_reg_file_check v
* +----------------+
* | MS_CHECK | ^ num_ms_check v
* +-struct sal_cpuid_info *id------------------+
* | regs[5]; |
* | reserved; |
* +-sal_processor_static_info_t *regs----------+
* | valid; |
* | ... |
* | fr[128]; |
* +--------------------------------------------+
*/
/*
 * peidx: index of processor error section.
 *
 * Holds one pointer to each of the three parts of a processor error section
 * shown in the layout diagram above; the peidx_head(), peidx_mid() and
 * peidx_bottom() macros read these members.
 */
typedef struct peidx_table {
/* head: section header followed by the info[] array of check entries */
sal_log_processor_info_t *info;
/* middle: CPUID information */
struct sal_cpuid_info *id;
/* bottom: static register information */
sal_processor_static_info_t *regs;
} peidx_table_t;
/*
 * Accessors for the three parts of a processor error section indexed by a
 * peidx_table_t pointer p: head (info), middle (id) and bottom (regs).
 */
#define peidx_head(p)	((p)->info)
#define peidx_mid(p)	((p)->id)
#define peidx_bottom(p)	((p)->regs)

/* Addresses of selected members of the head and bottom parts. */
#define peidx_psp(p)		(&(peidx_head(p)->proc_state_parameter))
#define peidx_field_valid(p)	(&(peidx_head(p)->valid))
#define peidx_minstate_area(p)	(&(peidx_bottom(p)->min_state_area))

/* Number of entries of each check type, read from the head's valid member. */
#define peidx_cache_check_num(p)	(peidx_head(p)->valid.num_cache_check)
#define peidx_tlb_check_num(p)		(peidx_head(p)->valid.num_tlb_check)
#define peidx_bus_check_num(p)		(peidx_head(p)->valid.num_bus_check)
#define peidx_reg_file_check_num(p)	(peidx_head(p)->valid.num_reg_file_check)
#define peidx_ms_check_num(p)		(peidx_head(p)->valid.num_ms_check)

/*
 * Index into the head's info[] array of the n-th entry of each check type.
 *
 * The entries are laid out in the order cache, TLB, bus, register file, MS,
 * so the first index of a type is the first index of the preceding type
 * plus that type's count.
 *
 * n is parenthesized in every expansion so that an expression argument
 * (e.g. "i & 1" or "c ? a : b") is added as a whole instead of being
 * regrouped by operator precedence.
 */
#define peidx_cache_check_idx(p, n)	(n)
#define peidx_tlb_check_idx(p, n)	(peidx_cache_check_idx(p, peidx_cache_check_num(p)) + (n))
#define peidx_bus_check_idx(p, n)	(peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + (n))
#define peidx_reg_file_check_idx(p, n)	(peidx_bus_check_idx(p, peidx_bus_check_num(p)) + (n))
#define peidx_ms_check_idx(p, n)	(peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + (n))
/*
 * peidx_mod_error_info(p, name, n)
 *
 * Yield a pointer to the n-th entry of check type <name> (cache_check,
 * tlb_check, bus_check, reg_file_check or ms_check) in the info[] array of
 * the processor error section indexed by p, or NULL when n is negative or
 * the section holds n or fewer entries of that type.
 *
 * n is evaluated exactly once, so an argument with side effects (e.g. i++)
 * is safe, and an expression argument cannot be regrouped by operator
 * precedence inside the bounds check.
 *
 * NOTE(review): the bounds check previously carried an unexplained "BUG"
 * marker; confirm whether it referred to the double evaluation of n / the
 * missing lower bound handled here, or to something else.
 */
#define peidx_mod_error_info(p, name, n) \
({	int __peidx_n = (n); \
	int __idx = peidx_##name##_idx(p, __peidx_n); \
	sal_log_mod_error_info_t *__ret = NULL; \
	if (__peidx_n >= 0 && peidx_##name##_num(p) > __peidx_n) \
		__ret = &(peidx_head(p)->info[__idx]); \
	__ret; })
/*
 * Per-type shorthands for peidx_mod_error_info(): each yields a pointer to
 * the n-th entry of the named check type in the processor error section
 * indexed by p, or NULL when the section holds no such entry.
 */
#define peidx_cache_check(p, n) peidx_mod_error_info(p, cache_check, n)
#define peidx_tlb_check(p, n) peidx_mod_error_info(p, tlb_check, n)
#define peidx_bus_check(p, n) peidx_mod_error_info(p, bus_check, n)
#define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n)
#define peidx_ms_check(p, n) peidx_mod_error_info(p, ms_check, n)
/*
 * peidx_check_info(proc, name, n)
 *
 * Yield, as a u64, the check_info word of the n-th entry of check type
 * <name> in the processor error section indexed by proc.  The result is 0
 * when peidx_mod_error_info() finds no such entry or when the entry's
 * valid.check_info flag is clear.
 */
#define peidx_check_info(proc, name, n) \
({ \
	sal_log_mod_error_info_t *__mod = peidx_mod_error_info(proc, name, n); \
	u64 __word = 0; \
	if (__mod && __mod->valid.check_info) \
		__word = __mod->check_info; \
	__word; })
/*
 * slidx: index of SAL log error record.
 *
 * One node of the index: it links a single section header into one of the
 * per-type lists of a slidx_table_t.
 */
typedef struct slidx_list {
/* linkage used by slidx_foreach_entry() / slidx_first_entry() */
struct list_head list;
/* the section header this node refers to */
sal_log_section_hdr_t *hdr;
} slidx_list_t;
/*
 * Index of one SAL log error record: the record header plus one list of
 * slidx_list_t nodes per kind of section.
 */
typedef struct slidx_table {
/* header of the indexed record */
sal_log_record_header_t *header;
int n_sections; /* # of section headers */
/*
 * One list head per section kind; slidx_count(slidx, <member>) counts the
 * nodes on one of them.
 * NOTE(review): the member names presumably follow the SAL error section
 * types (processor, memory device, SEL device, PCI bus, ...) - confirm.
 */
struct list_head proc_err;
struct list_head mem_dev_err;
struct list_head sel_dev_err;
struct list_head pci_bus_err;
struct list_head smbios_dev_err;
struct list_head pci_comp_err;
struct list_head plat_specific_err;
struct list_head host_ctlr_err;
struct list_head plat_bus_err;
struct list_head unsupported; /* list of unsupported sections */
} slidx_table_t;
/*
 * Iterate pos (a slidx_list_t pointer) over every node on the list at head,
 * following each node's "list" member.
 */
#define slidx_foreach_entry(pos, head) \
list_for_each_entry(pos, head, list)
/*
 * Yield the first slidx_list_t on the list at head, or NULL when the list
 * is empty (head->next points back at head).
 */
#define slidx_first_entry(head) \
	(((head) == (head)->next) ? NULL : list_entry((head)->next, slidx_list_t, list))
/*
 * slidx_count(slidx, sec)
 *
 * Yield, as an int, the number of nodes on the list named sec (a list-head
 * member of the table) of the slidx table pointed to by slidx.
 */
#define slidx_count(slidx, sec) \
({	slidx_list_t *__cursor; \
	int __total = 0; \
	slidx_foreach_entry(__cursor, &((slidx)->sec)) \
		++__total; \
	__total; })
/*
* File: mca_drv_asm.S
* Purpose: Assembly portion of Generic MCA handling
*
* Copyright (C) 2004 FUJITSU LIMITED
* Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
*/
#include <linux/config.h>
#include <linux/threads.h>
#include <asm/asmmacro.h>
#include <asm/processor.h>
/*
 * mca_handler_bhhook
 *
 * Trampoline that calls the C function mca_handler_bh with the value found
 * in r8 on entry (the poisoned address) as its only argument.
 *
 * Before the call it invalidates the ALAT (invala), covers the current
 * register frame (cover), clears the register rename bases (clrrrb),
 * allocates a new frame with 2 locals and 1 output, and loads r13 and r12
 * from the current task (task pointer and stack pointer).
 *
 * The call is not expected to return.  The instructions after it restore
 * ar.pfs and rp from loc0/loc1 and return with r8 = 0.
 */
GLOBAL_ENTRY(mca_handler_bhhook)
invala // clear RSE ?
;; //
cover //
;; //
clrrrb //
;;
// 0 inputs, 2 locals (loc0, loc1), 1 output (out0), 0 rotating;
// the previous ar.pfs is captured in r16
alloc r16=ar.pfs,0,2,1,0 // make a new frame
;;
mov r13=IA64_KR(CURRENT) // current task pointer
;;
// r12 = address of the word at IA64_TASK_THREAD_KSP_OFFSET in the task
adds r12=IA64_TASK_THREAD_KSP_OFFSET,r13
;;
ld8 r12=[r12] // stack pointer
;;
// keep the previous ar.pfs in loc0 for the epilogue
mov loc0=r16
movl loc1=mca_handler_bh // recovery C function
;;
mov out0=r8 // poisoned address
// branch target for the call below
mov b6=loc1
;;
// loc1 is free again now that b6 holds the target: reuse it to save rp
mov loc1=rp
;;
br.call.sptk.many rp=b6 // not return ...
;;
// reached only if mca_handler_bh returns: undo the frame set-up
mov ar.pfs=loc0
mov rp=loc1
;;
// return value 0
mov r8=r0
br.ret.sptk.many rp
;;
END(mca_handler_bhhook)
...@@ -22,6 +22,11 @@ ...@@ -22,6 +22,11 @@
#define IA64_MCA_RENDEZ_TIMEOUT (20 * 1000) /* value in milliseconds - 20 seconds */ #define IA64_MCA_RENDEZ_TIMEOUT (20 * 1000) /* value in milliseconds - 20 seconds */
/*
 * Pair of unsigned long values named fp and gp.
 * NOTE(review): presumably an IA-64 function descriptor (entry address plus
 * global pointer) - confirm against the code that uses ia64_fptr_t.
 */
typedef struct ia64_fptr {
unsigned long fp;
unsigned long gp;
} ia64_fptr_t;
typedef union cmcv_reg_u { typedef union cmcv_reg_u {
u64 cmcv_regval; u64 cmcv_regval;
struct { struct {
...@@ -114,6 +119,7 @@ extern void ia64_mca_ucmc_handler(void); ...@@ -114,6 +119,7 @@ extern void ia64_mca_ucmc_handler(void);
extern void ia64_monarch_init_handler(void); extern void ia64_monarch_init_handler(void);
extern void ia64_slave_init_handler(void); extern void ia64_slave_init_handler(void);
extern void ia64_mca_cmc_vector_setup(void); extern void ia64_mca_cmc_vector_setup(void);
/*
 * NOTE(review): the recovery hook defined in mca.c by this change is named
 * ia64_mca_ucmc_extension and has this same signature; this declaration
 * uses a different name - confirm which name is intended.
 */
extern int (*ia64_mca_ucmc_other_recover_fp)(void *,ia64_mca_sal_to_os_state_t *,ia64_mca_os_to_sal_state_t *);
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
#endif /* _ASM_IA64_MCA_H */ #endif /* _ASM_IA64_MCA_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment