Commit e98e03d0 authored by Linus Torvalds

Merge tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux

Pull more s390 updates from Vasily Gorbik:

 - Fix preempt_count initialization.

 - Rework call_on_stack() macro to add proper type handling and avoid
   possible register corruption (a usage sketch follows this list).

 - More removal of error-prone "register asm" constructs, plus related
   fixes.

 - Fix syscall restarting when multiple signals are coming in. This adds
   minimalistic trampolines to the vdso so we can return from a signal
   without using the stack, which would otherwise require pgm check
   handler hacks when NX is enabled.

 - Remove HAVE_IRQ_EXIT_ON_IRQ_STACK since this is no longer true after
   the switch to generic entry.

 - Fix protected virtualization secure storage access exception
   handling.

 - Make machine check C handler always enter with DAT enabled and move
   register validation to C code.

 - Fix tinyconfig boot problem by avoiding MONITOR CALL without
   CONFIG_BUG.

 - Increase asm symbols alignment to 16 to make it consistent with
   compilers.

 - Enable concurrent access to the CPU Measurement Counter Facility.

 - Add support for dynamic AP bus size limit and rework ap_dqap to deal
   with messages greater than the receive buffer.
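
   A rough usage sketch of the reworked call_on_stack() macro from the
   second item above ("some_fn", "addr" and "flags" are hypothetical,
   for illustration only):

       rc = call_on_stack(2, S390_lowcore.nodat_stack, int, some_fn,
                          unsigned long, addr, int, flags);

   Each type/argument pair is typechecked against some_fn's prototype,
   and the arguments are properly sign/zero extended into registers
   %r2/%r3 before the stack switch.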

* tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (41 commits)
  s390: preempt: Fix preempt_count initialization
  s390/linkage: increase asm symbols alignment to 16
  s390: rename CALL_ON_STACK_NORETURN() to call_on_stack_noreturn()
  s390: add type checking to CALL_ON_STACK_NORETURN() macro
  s390: remove old CALL_ON_STACK() macro
  s390/softirq: use call_on_stack() macro
  s390/lib: use call_on_stack() macro
  s390/smp: use call_on_stack() macro
  s390/kexec: use call_on_stack() macro
  s390/irq: use call_on_stack() macro
  s390/mm: use call_on_stack() macro
  s390: introduce proper type handling call_on_stack() macro
  s390/irq: simplify on_async_stack()
  s390/irq: inline do_softirq_own_stack()
  s390/irq: simplify do_softirq_own_stack()
  s390/ap: get rid of register asm in ap_dqap()
  s390: rename PIF_SYSCALL_RESTART to PIF_EXECVE_PGSTE_RESTART
  s390: move restart of execve() syscall
  s390/signal: remove sigreturn on stack
  s390/signal: switch to using vdso for sigreturn and syscall restart
  ...
parents 379cf80a 6a942f57
......@@ -163,7 +163,6 @@ config S390
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO
select HAVE_IOREMAP_PROT if PCI
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZ4
......@@ -438,6 +437,7 @@ config COMPAT
select COMPAT_OLD_SIGACTION
select HAVE_UID16
depends on MULTIUSER
depends on !CC_IS_CLANG
help
Select this option if you want to enable your system kernel to
handle system-calls from ELF binaries for 31 bit ESA. This option
......
......@@ -166,6 +166,19 @@ archheaders:
archprepare:
$(Q)$(MAKE) $(build)=$(syscalls) kapi
$(Q)$(MAKE) $(build)=$(tools) kapi
ifeq ($(KBUILD_EXTMOD),)
# We need to generate vdso-offsets.h before compiling certain files in kernel/.
# In order to do that, we should use the archprepare target, but we can't since
# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
# Therefore we need to generate the header after prepare0 has been made, hence
# this hack.
prepare: vdso_prepare
vdso_prepare: prepare0
$(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
$(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
endif
# Don't use tabs in echo arguments
define archhelp
......
......@@ -23,6 +23,7 @@ unsigned long __bootdata_preserved(vmemmap_size);
unsigned long __bootdata_preserved(MODULES_VADDR);
unsigned long __bootdata_preserved(MODULES_END);
unsigned long __bootdata(ident_map_size);
int __bootdata(is_full_image) = 1;
u64 __bootdata_preserved(stfle_fac_list[16]);
u64 __bootdata_preserved(alt_stfle_fac_list[16]);
......
......@@ -36,6 +36,7 @@ void uv_query_info(void)
uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
uv_info.uv_feature_indications = uvcb.uv_feature_indications;
}
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
......
......@@ -53,18 +53,20 @@ struct ap_queue_status {
*/
static inline bool ap_instructions_available(void)
{
register unsigned long reg0 asm ("0") = AP_MKQID(0, 0);
register unsigned long reg1 asm ("1") = 0;
register unsigned long reg2 asm ("2") = 0;
unsigned long reg0 = AP_MKQID(0, 0);
unsigned long reg1 = 0;
asm volatile(
" lgr 0,%[reg0]\n" /* qid into gr0 */
" lghi 1,0\n" /* 0 into gr1 */
" lghi 2,0\n" /* 0 into gr2 */
" .long 0xb2af0000\n" /* PQAP(TAPQ) */
"0: la %0,1\n"
"0: la %[reg1],1\n" /* 1 into reg1 */
"1:\n"
EX_TABLE(0b, 1b)
: "+d" (reg1), "+d" (reg2)
: "d" (reg0)
: "cc");
: [reg1] "+&d" (reg1)
: [reg0] "d" (reg0)
: "cc", "0", "1", "2");
return reg1 != 0;
}
......@@ -77,14 +79,18 @@ static inline bool ap_instructions_available(void)
*/
static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info)
{
register unsigned long reg0 asm ("0") = qid;
register struct ap_queue_status reg1 asm ("1");
register unsigned long reg2 asm ("2");
asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */
: "=d" (reg1), "=d" (reg2)
: "d" (reg0)
: "cc");
struct ap_queue_status reg1;
unsigned long reg2;
asm volatile(
" lgr 0,%[qid]\n" /* qid into gr0 */
" lghi 2,0\n" /* 0 into gr2 */
" .long 0xb2af0000\n" /* PQAP(TAPQ) */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
" lgr %[reg2],2\n" /* gr2 into reg2 */
: [reg1] "=&d" (reg1), [reg2] "=&d" (reg2)
: [qid] "d" (qid)
: "cc", "0", "1", "2");
if (info)
*info = reg2;
return reg1;
......@@ -115,14 +121,16 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid,
*/
static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
{
register unsigned long reg0 asm ("0") = qid | (1UL << 24);
register struct ap_queue_status reg1 asm ("1");
unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */
struct ap_queue_status reg1;
asm volatile(
".long 0xb2af0000" /* PQAP(RAPQ) */
: "=d" (reg1)
: "d" (reg0)
: "cc");
" lgr 0,%[reg0]\n" /* qid arg into gr0 */
" .long 0xb2af0000\n" /* PQAP(RAPQ) */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
: [reg1] "=&d" (reg1)
: [reg0] "d" (reg0)
: "cc", "0", "1");
return reg1;
}
......@@ -134,14 +142,16 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
*/
static inline struct ap_queue_status ap_zapq(ap_qid_t qid)
{
register unsigned long reg0 asm ("0") = qid | (2UL << 24);
register struct ap_queue_status reg1 asm ("1");
unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */
struct ap_queue_status reg1;
asm volatile(
".long 0xb2af0000" /* PQAP(ZAPQ) */
: "=d" (reg1)
: "d" (reg0)
: "cc");
" lgr 0,%[reg0]\n" /* qid arg into gr0 */
" .long 0xb2af0000\n" /* PQAP(ZAPQ) */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
: [reg1] "=&d" (reg1)
: [reg0] "d" (reg0)
: "cc", "0", "1");
return reg1;
}
......@@ -172,18 +182,20 @@ struct ap_config_info {
*/
static inline int ap_qci(struct ap_config_info *config)
{
register unsigned long reg0 asm ("0") = 4UL << 24;
register unsigned long reg1 asm ("1") = -EOPNOTSUPP;
register struct ap_config_info *reg2 asm ("2") = config;
unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */
unsigned long reg1 = -EOPNOTSUPP;
struct ap_config_info *reg2 = config;
asm volatile(
".long 0xb2af0000\n" /* PQAP(QCI) */
"0: la %0,0\n"
" lgr 0,%[reg0]\n" /* QCI fc into gr0 */
" lgr 2,%[reg2]\n" /* ptr to config into gr2 */
" .long 0xb2af0000\n" /* PQAP(QCI) */
"0: la %[reg1],0\n" /* good case, QCI fc available */
"1:\n"
EX_TABLE(0b, 1b)
: "+d" (reg1)
: "d" (reg0), "d" (reg2)
: "cc", "memory");
: [reg1] "+&d" (reg1)
: [reg0] "d" (reg0), [reg2] "d" (reg2)
: "cc", "memory", "0", "2");
return reg1;
}
......@@ -220,21 +232,25 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid,
struct ap_qirq_ctrl qirqctrl,
void *ind)
{
register unsigned long reg0 asm ("0") = qid | (3UL << 24);
register union {
unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */
union {
unsigned long value;
struct ap_qirq_ctrl qirqctrl;
struct ap_queue_status status;
} reg1 asm ("1");
register void *reg2 asm ("2") = ind;
} reg1;
void *reg2 = ind;
reg1.qirqctrl = qirqctrl;
asm volatile(
".long 0xb2af0000" /* PQAP(AQIC) */
: "+d" (reg1)
: "d" (reg0), "d" (reg2)
: "cc");
" lgr 0,%[reg0]\n" /* qid param into gr0 */
" lgr 1,%[reg1]\n" /* irq ctrl into gr1 */
" lgr 2,%[reg2]\n" /* ni addr into gr2 */
" .long 0xb2af0000\n" /* PQAP(AQIC) */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
: [reg1] "+&d" (reg1)
: [reg0] "d" (reg0), [reg2] "d" (reg2)
: "cc", "0", "1", "2");
return reg1.status;
}
......@@ -268,21 +284,24 @@ union ap_qact_ap_info {
static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit,
union ap_qact_ap_info *apinfo)
{
register unsigned long reg0 asm ("0") = qid | (5UL << 24)
| ((ifbit & 0x01) << 22);
register union {
unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22);
union {
unsigned long value;
struct ap_queue_status status;
} reg1 asm ("1");
register unsigned long reg2 asm ("2");
} reg1;
unsigned long reg2;
reg1.value = apinfo->val;
asm volatile(
".long 0xb2af0000" /* PQAP(QACT) */
: "+d" (reg1), "=d" (reg2)
: "d" (reg0)
: "cc");
" lgr 0,%[reg0]\n" /* qid param into gr0 */
" lgr 1,%[reg1]\n" /* qact in info into gr1 */
" .long 0xb2af0000\n" /* PQAP(QACT) */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
" lgr %[reg2],2\n" /* qact out info into reg2 */
: [reg1] "+&d" (reg1), [reg2] "=&d" (reg2)
: [reg0] "d" (reg0)
: "cc", "0", "1", "2");
apinfo->val = reg2;
return reg1.status;
}
......@@ -303,19 +322,24 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
unsigned long long psmid,
void *msg, size_t length)
{
register unsigned long reg0 asm ("0") = qid | 0x40000000UL;
register struct ap_queue_status reg1 asm ("1");
register unsigned long reg2 asm ("2") = (unsigned long) msg;
register unsigned long reg3 asm ("3") = (unsigned long) length;
register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32);
register unsigned long reg5 asm ("5") = psmid & 0xffffffff;
unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */
union register_pair nqap_r1, nqap_r2;
struct ap_queue_status reg1;
nqap_r1.even = (unsigned int)(psmid >> 32);
nqap_r1.odd = psmid & 0xffffffff;
nqap_r2.even = (unsigned long)msg;
nqap_r2.odd = (unsigned long)length;
asm volatile (
"0: .long 0xb2ad0042\n" /* NQAP */
" brc 2,0b"
: "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3)
: "d" (reg4), "d" (reg5)
: "cc", "memory");
" lgr 0,%[reg0]\n" /* qid param in gr0 */
"0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n"
" brc 2,0b\n" /* handle partial completion */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1),
[nqap_r2] "+&d" (nqap_r2.pair)
: [nqap_r1] "d" (nqap_r1.pair)
: "cc", "memory", "0", "1");
return reg1;
}
......@@ -325,6 +349,8 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
* @psmid: Pointer to program supplied message identifier
* @msg: The message text
* @length: The message length
* @reslength: Residual length on return
* @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content
*
* Returns AP queue status structure.
* Condition code 1 on DQAP means the receive has taken place
......@@ -336,27 +362,65 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
* Note that gpr2 is used by the DQAP instruction to keep track of
* any 'residual' length, in case the instruction gets interrupted.
* Hence it gets zeroed before the instruction.
* If the message does not fit into the buffer, this function will
* return with a truncated message and the reply in the firmware queue
* is not removed. This is indicated to the caller with an
* ap_queue_status response_code value of all bits on (0xFF) and (if
* the reslength ptr is given) the remaining length is stored in
* *reslength and (if the resgr0 ptr is given) the updated gr0 value
* for further processing of this msg entry is stored in *resgr0. The
* caller needs to detect this situation and should invoke ap_dqap
* with a valid resgr0 ptr and a value in there != 0 to indicate that
* *resgr0 is to be used instead of qid to further process this entry.
*/
static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
unsigned long long *psmid,
void *msg, size_t length)
void *msg, size_t length,
size_t *reslength,
unsigned long *resgr0)
{
register unsigned long reg0 asm("0") = qid | 0x80000000UL;
register struct ap_queue_status reg1 asm ("1");
register unsigned long reg2 asm("2") = 0UL;
register unsigned long reg4 asm("4") = (unsigned long) msg;
register unsigned long reg5 asm("5") = (unsigned long) length;
register unsigned long reg6 asm("6") = 0UL;
register unsigned long reg7 asm("7") = 0UL;
unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL;
struct ap_queue_status reg1;
unsigned long reg2;
union register_pair rp1, rp2;
rp1.even = 0UL;
rp1.odd = 0UL;
rp2.even = (unsigned long)msg;
rp2.odd = (unsigned long)length;
asm volatile(
"0: .long 0xb2ae0064\n" /* DQAP */
" brc 6,0b\n"
: "+d" (reg0), "=d" (reg1), "+d" (reg2),
"+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7)
: : "cc", "memory");
*psmid = (((unsigned long long) reg6) << 32) + reg7;
" lgr 0,%[reg0]\n" /* qid param into gr0 */
" lghi 2,0\n" /* 0 into gr2 (res length) */
"0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */
" jz 2f\n" /* go out if buf len is 0 */
"1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n"
" brc 6,0b\n" /* handle partial complete */
"2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */
" lgr %[reg1],1\n" /* gr1 (status) into reg1 */
" lgr %[reg2],2\n" /* gr2 (res length) into reg2 */
: [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2),
[rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair)
:
: "cc", "memory", "0", "1", "2");
if (reslength)
*reslength = reg2;
if (reg2 != 0 && rp2.odd == 0) {
/*
* Partially complete, status in gr1 is not set.
* Signal the caller that this dqap is only partially received
* with a special status response code 0xFF and *resgr0 updated
*/
reg1.response_code = 0xFF;
if (resgr0)
*resgr0 = reg0;
} else {
*psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd;
if (resgr0)
*resgr0 = 0;
}
return reg1;
}
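
/*
* Hedged caller sketch (illustrative only, not part of this patch) for
* the truncated reply protocol documented above; enlarge_buffer() is a
* hypothetical helper:
*
*    unsigned long resgr0 = 0;
*    size_t reslen;
*
*    status = ap_dqap(qid, &psmid, buf, buflen, &reslen, &resgr0);
*    while (status.response_code == 0xFF && resgr0) {
*        buf = enlarge_buffer(buf, &buflen, reslen);
*        status = ap_dqap(qid, &psmid, buf, buflen, &reslen, &resgr0);
*    }
*/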
......
......@@ -32,39 +32,22 @@ static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
[CPUMF_CTR_SET_MT_DIAG] = 0x20,
};
static inline void ctr_set_enable(u64 *state, int ctr_set)
{
*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
}
static inline void ctr_set_disable(u64 *state, int ctr_set)
{
*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
}
static inline void ctr_set_start(u64 *state, int ctr_set)
{
*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
}
static inline void ctr_set_stop(u64 *state, int ctr_set)
{
*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
}
static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
static inline void ctr_set_enable(u64 *state, u64 ctrsets)
{
*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
}
static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
static inline void ctr_set_disable(u64 *state, u64 ctrsets)
{
*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
}
static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
static inline void ctr_set_start(u64 *state, u64 ctrsets)
{
*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
}
static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
static inline void ctr_set_stop(u64 *state, u64 ctrsets)
{
*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
}
......@@ -92,8 +75,15 @@ struct cpu_cf_events {
struct cpumf_ctr_info info;
atomic_t ctr_set[CPUMF_CTR_SET_MAX];
atomic64_t alert;
u64 state;
u64 state; /* For perf_event_open SVC */
u64 dev_state; /* For /dev/hwctr */
unsigned int flags;
size_t used; /* Bytes used in data */
size_t usedss; /* Bytes used in start/stop */
unsigned char start[PAGE_SIZE]; /* Counter set at event add */
unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */
unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */
unsigned int sets; /* # Counter set saved in memory */
};
DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
......@@ -124,4 +114,6 @@ static inline int stccm_avail(void)
size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
struct cpumf_ctr_info *info);
int cfset_online_cpu(unsigned int cpu);
int cfset_offline_cpu(unsigned int cpu);
#endif /* _ASM_S390_CPU_MCF_H */
......@@ -21,8 +21,6 @@
#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57)
#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58)
#define CR2_GUARDED_STORAGE BIT(63 - 59)
#define CR14_UNUSED_32 BIT(63 - 32)
#define CR14_UNUSED_33 BIT(63 - 33)
#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35)
......
......@@ -144,10 +144,6 @@ typedef s390_compat_regs compat_elf_gregset_t;
#include <linux/sched/mm.h> /* for task_struct */
#include <asm/mmu_context.h>
#include <asm/vdso.h>
extern unsigned int vdso_enabled;
/*
* This is used to ensure we don't load something for the wrong architecture.
*/
......@@ -176,7 +172,7 @@ struct arch_elf_state {
!current->mm->context.alloc_pgste) { \
set_thread_flag(TIF_PGSTE); \
set_pt_regs_flag(task_pt_regs(current), \
PIF_SYSCALL_RESTART); \
PIF_EXECVE_PGSTE_RESTART); \
_state->rc = -EAGAIN; \
} \
_state->rc; \
......@@ -270,7 +266,6 @@ do { \
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
#define ARCH_DLINFO \
do { \
if (vdso_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
(unsigned long)current->mm->context.vdso_base); \
} while (0)
......
......@@ -14,7 +14,6 @@
#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
void do_per_trap(struct pt_regs *regs);
void do_syscall(struct pt_regs *regs);
#ifdef CONFIG_DEBUG_ENTRY
static __always_inline void arch_check_user_regs(struct pt_regs *regs)
......
......@@ -5,7 +5,7 @@
#include <asm/asm-const.h>
#include <linux/stringify.h>
#define __ALIGN .align 4, 0x07
#define __ALIGN .align 16, 0x07
#define __ALIGN_STR __stringify(__ALIGN)
/*
......
......@@ -23,12 +23,16 @@
#define MCCK_CODE_SYSTEM_DAMAGE BIT(63)
#define MCCK_CODE_EXT_DAMAGE BIT(63 - 5)
#define MCCK_CODE_CP BIT(63 - 9)
#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
#define MCCK_CODE_STG_ERROR BIT(63 - 16)
#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18)
#define MCCK_CODE_STG_DEGRAD BIT(63 - 19)
#define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20)
#define MCCK_CODE_PSW_IA_VALID BIT(63 - 23)
#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24)
#define MCCK_CODE_CR_VALID BIT(63 - 29)
#define MCCK_CODE_GS_VALID BIT(63 - 36)
#define MCCK_CODE_FC_VALID BIT(63 - 43)
#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
#ifndef __ASSEMBLY__
......
......@@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc)
old, new) != old);
}
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
{
__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
......@@ -88,12 +82,6 @@ static inline void preempt_count_set(int pc)
S390_lowcore.preempt_count = pc;
}
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
S390_lowcore.preempt_count = PREEMPT_DISABLED; \
} while (0)
static inline void set_preempt_need_resched(void)
{
}
......@@ -130,6 +118,10 @@ static inline bool should_resched(int preempt_offset)
#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
#define init_task_preempt_count(p) do { } while (0)
/* Deferred to CPU bringup time */
#define init_idle_preempt_count(p, cpu) do { } while (0)
#ifdef CONFIG_PREEMPTION
extern void preempt_schedule(void);
#define __preempt_schedule() preempt_schedule()
......
......@@ -12,12 +12,12 @@
#include <asm/tpi.h>
#define PIF_SYSCALL 0 /* inside a system call */
#define PIF_SYSCALL_RESTART 1 /* restart the current system call */
#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */
#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */
#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
#define _PIF_SYSCALL BIT(PIF_SYSCALL)
#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART)
#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART)
#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET)
#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
......@@ -162,6 +162,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
return !!(regs->flags & (1UL << flag));
}
static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
{
int ret = test_pt_regs_flag(regs, flag);
clear_pt_regs_flag(regs, flag);
return ret;
}
/*
* These are defined as per linux/ptrace.h, which see.
*/
......
......@@ -159,6 +159,8 @@ static inline unsigned long kaslr_offset(void)
return __kaslr_offset;
}
extern int is_full_image;
static inline u32 gen_lpswe(unsigned long addr)
{
BUILD_BUG_ON(addr > 0xfff);
......
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef __ASM_S390_SOFTIRQ_STACK_H
#define __ASM_S390_SOFTIRQ_STACK_H
#include <asm/lowcore.h>
#include <asm/stacktrace.h>
static inline void do_softirq_own_stack(void)
{
call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
}
#endif /* __ASM_S390_SOFTIRQ_STACK_H */
......@@ -74,23 +74,6 @@ struct stack_frame {
((unsigned long)__builtin_frame_address(0) - \
offsetof(struct stack_frame, back_chain))
#define CALL_ARGS_0() \
register unsigned long r2 asm("2")
#define CALL_ARGS_1(arg1) \
register unsigned long r2 asm("2") = (unsigned long)(arg1)
#define CALL_ARGS_2(arg1, arg2) \
CALL_ARGS_1(arg1); \
register unsigned long r3 asm("3") = (unsigned long)(arg2)
#define CALL_ARGS_3(arg1, arg2, arg3) \
CALL_ARGS_2(arg1, arg2); \
register unsigned long r4 asm("4") = (unsigned long)(arg3)
#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \
CALL_ARGS_3(arg1, arg2, arg3); \
register unsigned long r4 asm("5") = (unsigned long)(arg4)
#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \
CALL_ARGS_4(arg1, arg2, arg3, arg4); \
register unsigned long r4 asm("6") = (unsigned long)(arg5)
/*
* To keep this simple mark register 2-6 as being changed (volatile)
* by the called function, even though register 6 is saved/nonvolatile.
......@@ -109,34 +92,113 @@ struct stack_frame {
#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3"
#define CALL_CLOBBER_0 CALL_CLOBBER_1
#define CALL_ON_STACK(fn, stack, nr, args...) \
#define CALL_LARGS_0(...) \
long dummy = 0
#define CALL_LARGS_1(t1, a1) \
long arg1 = (long)(t1)(a1)
#define CALL_LARGS_2(t1, a1, t2, a2) \
CALL_LARGS_1(t1, a1); \
long arg2 = (long)(t2)(a2)
#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \
CALL_LARGS_2(t1, a1, t2, a2); \
long arg3 = (long)(t3)(a3)
#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \
CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \
long arg4 = (long)(t4)(a4)
#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \
CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \
long arg5 = (long)(t5)(a5)
#define CALL_REGS_0 \
register long r2 asm("2") = dummy
#define CALL_REGS_1 \
register long r2 asm("2") = arg1
#define CALL_REGS_2 \
CALL_REGS_1; \
register long r3 asm("3") = arg2
#define CALL_REGS_3 \
CALL_REGS_2; \
register long r4 asm("4") = arg3
#define CALL_REGS_4 \
CALL_REGS_3; \
register long r5 asm("5") = arg4
#define CALL_REGS_5 \
CALL_REGS_4; \
register long r6 asm("6") = arg5
#define CALL_TYPECHECK_0(...)
#define CALL_TYPECHECK_1(t, a, ...) \
typecheck(t, a)
#define CALL_TYPECHECK_2(t, a, ...) \
CALL_TYPECHECK_1(__VA_ARGS__); \
typecheck(t, a)
#define CALL_TYPECHECK_3(t, a, ...) \
CALL_TYPECHECK_2(__VA_ARGS__); \
typecheck(t, a)
#define CALL_TYPECHECK_4(t, a, ...) \
CALL_TYPECHECK_3(__VA_ARGS__); \
typecheck(t, a)
#define CALL_TYPECHECK_5(t, a, ...) \
CALL_TYPECHECK_4(__VA_ARGS__); \
typecheck(t, a)
#define CALL_PARM_0(...) void
#define CALL_PARM_1(t, a, ...) t
#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__)
#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__)
#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__)
#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__)
#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__)
/*
* Use call_on_stack() to call a function switching to a specified
* stack. Proper sign and zero extension of function arguments is
* done. Usage:
*
* rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...)
*
* - nr specifies the number of function arguments of fn.
* - stack specifies the stack to be used.
* - fn is the function to be called.
* - rettype is the return type of fn.
* - t1, a1, ... are pairs, where t1 must match the type of the first
* argument of fn, t2 the second, etc. a1 is the corresponding
* first function argument (not name), etc.
*/
#define call_on_stack(nr, stack, rettype, fn, ...) \
({ \
rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \
unsigned long frame = current_frame_address(); \
CALL_ARGS_##nr(args); \
unsigned long __stack = stack; \
unsigned long prev; \
CALL_LARGS_##nr(__VA_ARGS__); \
CALL_REGS_##nr; \
\
CALL_TYPECHECK_##nr(__VA_ARGS__); \
asm volatile( \
" la %[_prev],0(15)\n" \
" lgr %[_prev],15\n" \
" lg 15,%[_stack]\n" \
" stg %[_frame],%[_bc](15)\n" \
" brasl 14,%[_fn]\n" \
" la 15,0(%[_prev])\n" \
: [_prev] "=&a" (prev), CALL_FMT_##nr \
: [_stack] "R" (stack), \
" lgr 15,%[_prev]\n" \
: [_prev] "=&d" (prev), CALL_FMT_##nr \
: [_stack] "R" (__stack), \
[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
[_frame] "d" (frame), \
[_fn] "X" (fn) : CALL_CLOBBER_##nr); \
r2; \
[_fn] "X" (__fn) : CALL_CLOBBER_##nr); \
(rettype)r2; \
})
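
/*
* Illustrative expansion (fn, a1 and a2 are hypothetical): with the
* scheme documented above,
*
*    rc = call_on_stack(2, stack, int, fn, unsigned long, a1, int, a2);
*
* loads a1/a2 into registers 2 and 3, saves the current r15, switches
* r15 to "stack", chains the back pointer, calls fn and restores r15.
*/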
#define CALL_ON_STACK_NORETURN(fn, stack) \
#define call_on_stack_noreturn(fn, stack) \
({ \
void (*__fn)(void) = fn; \
\
asm volatile( \
" la 15,0(%[_stack])\n" \
" xc %[_bc](8,15),%[_bc](15)\n" \
" brasl 14,%[_fn]\n" \
::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
[_stack] "a" (stack), [_fn] "X" (fn)); \
[_stack] "a" (stack), [_fn] "X" (__fn)); \
BUG(); \
})
......
......@@ -73,6 +73,10 @@ enum uv_cmds_inst {
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
};
enum uv_feat_ind {
BIT_UV_FEAT_MISC = 0,
};
struct uv_cb_header {
u16 len;
u16 cmd; /* Command Code */
......@@ -97,7 +101,8 @@ struct uv_cb_qui {
u64 max_guest_stor_addr;
u8 reserved88[158 - 136];
u16 max_guest_cpu_id;
u8 reserveda0[200 - 160];
u64 uv_feature_indications;
u8 reserveda0[200 - 168];
} __packed __aligned(8);
/* Initialize Ultravisor */
......@@ -274,6 +279,7 @@ struct uv_info {
unsigned long max_sec_stor_addr;
unsigned int max_num_sec_conf;
unsigned short max_guest_cpu_id;
unsigned long uv_feature_indications;
};
extern struct uv_info uv_info;
......
......@@ -4,18 +4,31 @@
#include <vdso/datapage.h>
/* Default link address for the vDSO */
#define VDSO64_LBASE 0
#ifndef __ASSEMBLY__
#define __VVAR_PAGES 2
#include <generated/vdso64-offsets.h>
#ifdef CONFIG_COMPAT
#include <generated/vdso32-offsets.h>
#endif
#define VDSO_VERSION_STRING LINUX_2.6.29
#ifndef __ASSEMBLY__
#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
#ifdef CONFIG_COMPAT
#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
#else
#define VDSO32_SYMBOL(tsk, name) (-1UL)
#endif
extern struct vdso_data *vdso_data;
int vdso_getcpu_init(void);
#endif /* __ASSEMBLY__ */
/* Default link address for the vDSO */
#define VDSO_LBASE 0
#define __VVAR_PAGES 2
#define VDSO_VERSION_STRING LINUX_2.6.29
#endif /* __S390_VDSO_H__ */
......@@ -8,7 +8,6 @@
#include <asm/timex.h>
#include <asm/unistd.h>
#include <asm/vdso.h>
#include <linux/compiler.h>
#define vdso_calc_delta __arch_vdso_calc_delta
......
......@@ -71,10 +71,10 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o
obj-$(CONFIG_TRACEPOINTS) += trace.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
# vdso
obj-y += vdso64/
obj-$(CONFIG_COMPAT) += vdso32/
......@@ -14,8 +14,6 @@
#include <linux/pgtable.h>
#include <asm/idle.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/setup.h>
#include <asm/stacktrace.h>
int main(void)
......@@ -108,7 +106,6 @@ int main(void)
OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
OFFSET(__LC_CURRENT, lowcore, current_task);
OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
......@@ -145,9 +142,6 @@ int main(void)
OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
BLANK();
/* extended machine check save area */
OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
BLANK();
/* gmap/sie offsets */
OFFSET(__GMAP_ASCE, gmap, asce);
OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
......
......@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
#include <asm/vdso.h>
#include "compat_linux.h"
#include "compat_ptrace.h"
#include "entry.h"
......@@ -118,7 +119,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu);
clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
return 0;
}
......@@ -304,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
/* Signal frames without vectors registers are short ! */
__u16 __user *svc = (void __user *) frame + frame_size - 2;
if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
return -EFAULT;
restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
restorer = VDSO32_SYMBOL(current, sigreturn);
}
/* Set up registers for signal handler */
......@@ -371,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
__u16 __user *svc = &frame->svc_insn;
if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
return -EFAULT;
restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
restorer = VDSO32_SYMBOL(current, rt_sigreturn);
}
/* Create siginfo on the signal stack */
......
......@@ -33,6 +33,8 @@
#include <asm/switch_to.h>
#include "entry.h"
int __bootdata(is_full_image);
static void __init reset_tod_clock(void)
{
union tod_clock clk;
......@@ -279,7 +281,7 @@ static void __init setup_boot_command_line(void)
static void __init check_image_bootable(void)
{
if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
if (is_full_image)
return;
sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
......
......@@ -14,7 +14,6 @@
#include <asm/alternative-asm.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/ctl_reg.h>
#include <asm/dwarf.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
......@@ -129,6 +128,24 @@ _LPP_OFFSET = __LC_LPP
"jnz .+8; .long 0xb2e8d000", 82
.endm
/*
* The CHKSTG macro jumps to the provided label in case the
* machine check interruption code reports one of the unrecoverable
* storage errors:
* - Storage error uncorrected
* - Storage key error uncorrected
* - Storage degradation with Failing-storage-address validity
*/
.macro CHKSTG errlabel
TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
jnz \errlabel
TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
jz oklabel\@
TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
jnz \errlabel
oklabel\@:
.endm
#if IS_ENABLED(CONFIG_KVM)
/*
* The OUTSIDE macro jumps to the provided label in case the value
......@@ -148,6 +165,13 @@ _LPP_OFFSET = __LC_LPP
clgr %r14,%r13
jhe \outside_label
.endm
.macro SIEEXIT
lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
larl %r9,sie_exit # skip forward to sie_exit
.endm
#endif
GEN_BR_THUNK %r14
......@@ -235,7 +259,6 @@ ENTRY(sie64a)
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
# Other instructions between sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
# See also .Lcleanup_sie
.Lrewind_pad6:
nopr 7
.Lrewind_pad4:
......@@ -341,10 +364,7 @@ ENTRY(pgm_check_handler)
#if IS_ENABLED(CONFIG_KVM)
# cleanup critical section for program checks in sie64a
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
larl %r9,sie_exit # skip forward to sie_exit
SIEEXIT
lghi %r10,_PIF_GUEST_FAULT
#endif
1: tmhh %r8,0x4000 # PER bit set in old PSW ?
......@@ -410,7 +430,8 @@ ENTRY(\name)
jnz 1f
#if IS_ENABLED(CONFIG_KVM)
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f
brasl %r14,.Lcleanup_sie
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
SIEEXIT
#endif
0: CHECK_STACK __LC_SAVE_AREA_ASYNC
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
......@@ -484,8 +505,6 @@ ENTRY(mcck_int_handler)
BPOFF
la %r1,4095 # validate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
sckc __LC_CLOCK_COMPARATOR # validate comparator
lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
lg %r12,__LC_CURRENT
lmg %r8,%r9,__LC_MCK_OLD_PSW
......@@ -496,41 +515,7 @@ ENTRY(mcck_int_handler)
la %r14,4095
lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
ptlb
lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area
nill %r11,0xfc00 # MCESA_ORIGIN_MASK
TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
jno 0f
TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID
jno 0f
.insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID
jo 0f
sr %r14,%r14
0: sfpc %r14
TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
jo 0f
lghi %r14,__LC_FPREGS_SAVE_AREA
ld %f0,0(%r14)
ld %f1,8(%r14)
ld %f2,16(%r14)
ld %f3,24(%r14)
ld %f4,32(%r14)
ld %f5,40(%r14)
ld %f6,48(%r14)
ld %f7,56(%r14)
ld %f8,64(%r14)
ld %f9,72(%r14)
ld %f10,80(%r14)
ld %f11,88(%r14)
ld %f12,96(%r14)
ld %f13,104(%r14)
ld %f14,112(%r14)
ld %f15,120(%r14)
j 1f
0: VLM %v0,%v15,0,%r11
VLM %v16,%v31,256,%r11
1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA
lghi %r14,__LC_CPU_TIMER_SAVE_AREA
mvc __LC_MCCK_ENTER_TIMER(8),0(%r14)
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
jo 3f
......@@ -546,24 +531,29 @@ ENTRY(mcck_int_handler)
3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
jno .Lmcck_panic
tmhh %r8,0x0001 # interrupting from user ?
jnz 4f
jnz 6f
TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
jno .Lmcck_panic
4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off
tmhh %r8,0x0001 # interrupting from user ?
jnz .Lmcck_user
#if IS_ENABLED(CONFIG_KVM)
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
OUTSIDE %r9,.Lsie_entry,.Lsie_skip,5f
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f
OUTSIDE %r9,.Lsie_entry,.Lsie_skip,4f
oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
5: brasl %r14,.Lcleanup_sie
#endif
j 5f
4: CHKSTG .Lmcck_panic
5: larl %r14,.Lstosm_tmp
stosm 0(%r14),0x04 # turn dat on, keep irqs off
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
SIEEXIT
j .Lmcck_stack
.Lmcck_user:
#endif
6: CHKSTG .Lmcck_panic
larl %r14,.Lstosm_tmp
stosm 0(%r14),0x04 # turn dat on, keep irqs off
tmhh %r8,0x0001 # interrupting from user ?
jz .Lmcck_stack
BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
.Lmcck_stack:
lg %r15,__LC_MCCK_STACK
.Lmcck_skip:
la %r11,STACK_FRAME_OVERHEAD(%r15)
stctg %c1,%c1,__PT_CR1(%r11)
lctlg %c1,%c1,__LC_KERNEL_ASCE
......@@ -605,8 +595,33 @@ ENTRY(mcck_int_handler)
b __LC_RETURN_MCCK_LPSWE
.Lmcck_panic:
lg %r15,__LC_NODAT_STACK
j .Lmcck_skip
/*
* Iterate over all possible CPU addresses in the range 0..0xffff
* and stop each CPU using signal processor. Use compare and swap
* to allow just one CPU-stopper and prevent concurrent CPUs from
* stopping each other while leaving the others running.
*/
lhi %r5,0
lhi %r6,1
larl %r7,.Lstop_lock
cs %r5,%r6,0(%r7) # single CPU-stopper only
jnz 4f
larl %r7,.Lthis_cpu
stap 0(%r7) # this CPU address
lh %r4,0(%r7)
nilh %r4,0
lhi %r0,1
sll %r0,16 # CPU counter
lhi %r3,0 # next CPU address
0: cr %r3,%r4
je 2f
1: sigp %r1,%r3,SIGP_STOP # stop next CPU
brc SIGP_CC_BUSY,1b
2: ahi %r3,1
brct %r0,0b
3: sigp %r1,%r4,SIGP_STOP # stop this CPU
brc SIGP_CC_BUSY,3b
4: j 4b
ENDPROC(mcck_int_handler)
#
......@@ -657,15 +672,11 @@ ENTRY(stack_overflow)
ENDPROC(stack_overflow)
#endif
#if IS_ENABLED(CONFIG_KVM)
.Lcleanup_sie:
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_KERNEL_ASCE
larl %r9,sie_exit # skip forward to sie_exit
BR_EX %r14,%r13
#endif
.section .data, "aw"
.align 4
.Lstop_lock: .long 0
.Lthis_cpu: .short 0
.Lstosm_tmp: .byte 0
.section .rodata, "a"
#define SYSCALL(esame,emu) .quad __s390x_ ## esame
.globl sys_call_table
......
......@@ -110,15 +110,17 @@ static int on_async_stack(void)
{
unsigned long frame = current_frame_address();
return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER));
return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
}
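
/*
* Note on the check above (assuming THREAD_SIZE aligned stacks): two
* addresses lie on the same stack exactly when they agree in all bits
* above the in-stack offset, so XOR-ing them and masking with
* ~(THREAD_SIZE - 1) yields zero in precisely that case.
*/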
static void do_irq_async(struct pt_regs *regs, int irq)
{
if (on_async_stack())
if (on_async_stack()) {
do_IRQ(regs, irq);
else
CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq);
} else {
call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
struct pt_regs *, regs, int, irq);
}
}
static int irq_pending(struct pt_regs *regs)
......@@ -265,24 +267,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
return from < NR_IRQS_BASE ? NR_IRQS_BASE : from;
}
/*
* Switch to the asynchronous interrupt stack for softirq execution.
*/
void do_softirq_own_stack(void)
{
unsigned long old, new;
old = current_stack_pointer();
/* Check against async. stack address range. */
new = S390_lowcore.async_stack;
if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
CALL_ON_STACK(__do_softirq, new, 0);
} else {
/* We are already on the async stack. */
__do_softirq();
}
}
/*
* ext_int_hash[index] is the list head for all external interrupts that hash
* to this index.
......
......@@ -92,11 +92,6 @@ static void copy_instruction(struct kprobe *p)
}
NOKPROBE_SYMBOL(copy_instruction);
static inline int is_kernel_addr(void *addr)
{
return addr < (void *)_end;
}
static int s390_get_insn_slot(struct kprobe *p)
{
/*
......@@ -105,7 +100,7 @@ static int s390_get_insn_slot(struct kprobe *p)
* field can be patched and executed within the insn slot.
*/
p->ainsn.insn = NULL;
if (is_kernel_addr(p->addr))
if (is_kernel((unsigned long)p->addr))
p->ainsn.insn = get_s390_insn_slot();
else if (is_module_addr(p->addr))
p->ainsn.insn = get_insn_slot();
......@@ -117,7 +112,7 @@ static void s390_free_insn_slot(struct kprobe *p)
{
if (!p->ainsn.insn)
return;
if (is_kernel_addr(p->addr))
if (is_kernel((unsigned long)p->addr))
free_s390_insn_slot(p->ainsn.insn, 0);
else
free_insn_slot(p->ainsn.insn, 0);
......
......@@ -132,7 +132,8 @@ static bool kdump_csum_valid(struct kimage *image)
int rc;
preempt_disable();
rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump,
unsigned long, (unsigned long)image);
preempt_enable();
return rc == 0;
#else
......
......@@ -189,12 +189,16 @@ void noinstr s390_handle_mcck(void)
* returns 0 if all required registers are available
* returns 1 otherwise
*/
static int notrace s390_check_registers(union mci mci, int umode)
static int notrace s390_validate_registers(union mci mci, int umode)
{
struct mcesa *mcesa;
void *fpt_save_area;
union ctlreg2 cr2;
int kill_task;
u64 zero;
kill_task = 0;
zero = 0;
if (!mci.gr) {
/*
......@@ -205,14 +209,6 @@ static int notrace s390_check_registers(union mci mci, int umode)
s390_handle_damage();
kill_task = 1;
}
/* Check control registers */
if (!mci.cr) {
/*
* Control registers have unknown contents.
* Can't recover and therefore stopping machine.
*/
s390_handle_damage();
}
if (!mci.fp) {
/*
* Floating point registers can't be restored. If the
......@@ -225,35 +221,89 @@ static int notrace s390_check_registers(union mci mci, int umode)
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
}
fpt_save_area = &S390_lowcore.floating_pt_save_area;
if (!mci.fc) {
/*
* Floating point control register can't be restored.
* If the kernel currently uses the floating pointer
* registers and needs the FPC register the system is
* stopped. If the process has its floating pointer
* registers loaded it is terminated.
* registers loaded it is terminated. Otherwise the
* FPC is just validated.
*/
if (S390_lowcore.fpu_flags & KERNEL_FPC)
s390_handle_damage();
asm volatile(
" lfpc %0\n"
:
: "Q" (zero));
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
} else {
asm volatile(
" lfpc %0\n"
:
: "Q" (S390_lowcore.fpt_creg_save_area));
}
if (MACHINE_HAS_VX) {
mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
if (!MACHINE_HAS_VX) {
/* Validate floating point registers */
asm volatile(
" ld 0,0(%0)\n"
" ld 1,8(%0)\n"
" ld 2,16(%0)\n"
" ld 3,24(%0)\n"
" ld 4,32(%0)\n"
" ld 5,40(%0)\n"
" ld 6,48(%0)\n"
" ld 7,56(%0)\n"
" ld 8,64(%0)\n"
" ld 9,72(%0)\n"
" ld 10,80(%0)\n"
" ld 11,88(%0)\n"
" ld 12,96(%0)\n"
" ld 13,104(%0)\n"
" ld 14,112(%0)\n"
" ld 15,120(%0)\n"
:
: "a" (fpt_save_area)
: "memory");
} else {
/* Validate vector registers */
union ctlreg0 cr0;
if (!mci.vr) {
/*
* Vector registers can't be restored. If the kernel
* currently uses vector registers the system is
* stopped. If the process has its vector registers
* loaded it is terminated.
* loaded it is terminated. Otherwise just validate
* the registers.
*/
if (S390_lowcore.fpu_flags & KERNEL_VXR)
s390_handle_damage();
if (!test_cpu_flag(CIF_FPU))
kill_task = 1;
}
cr0.val = S390_lowcore.cregs_save_area[0];
cr0.afp = cr0.vx = 1;
__ctl_load(cr0.val, 0, 0);
asm volatile(
" la 1,%0\n"
" .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
" .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
:
: "Q" (*(struct vx_array *)mcesa->vector_save_area)
: "1");
__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
}
/* Check if access registers are valid */
/* Validate access registers */
asm volatile(
" lam 0,15,0(%0)\n"
:
: "a" (&S390_lowcore.access_regs_save_area)
: "memory");
if (!mci.ar) {
/*
* Access registers have unknown contents.
......@@ -261,7 +311,7 @@ static int notrace s390_check_registers(union mci mci, int umode)
*/
kill_task = 1;
}
/* Check guarded storage registers */
/* Validate guarded storage registers */
cr2.val = S390_lowcore.cregs_save_area[2];
if (cr2.gse) {
if (!mci.gs) {
......@@ -271,31 +321,26 @@ static int notrace s390_check_registers(union mci mci, int umode)
* It has to be terminated.
*/
kill_task = 1;
} else {
load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
}
/* Check if old PSW is valid */
if (!mci.wp) {
/*
* Can't tell if we come from user or kernel mode
* -> stopping machine.
*/
s390_handle_damage();
}
/* Check for invalid kernel instruction address */
if (!mci.ia && !umode) {
/*
* The instruction address got lost while running
* in the kernel -> stopping machine.
* The getcpu vdso syscall reads the CPU number from the programmable
* field of the TOD clock. Disregard the TOD programmable register
* validity bit and load the CPU number into the TOD programmable
* field unconditionally.
*/
s390_handle_damage();
}
set_tod_programmable_field(raw_smp_processor_id());
/* Validate clock comparator register */
set_clock_comparator(S390_lowcore.clock_comparator);
if (!mci.ms || !mci.pm || !mci.ia)
kill_task = 1;
return kill_task;
}
NOKPROBE_SYMBOL(s390_check_registers);
NOKPROBE_SYMBOL(s390_validate_registers);
/*
* Backup the guest's machine check info to its description block
......@@ -353,11 +398,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
mci.val = S390_lowcore.mcck_interruption_code;
mcck = this_cpu_ptr(&cpu_mcck);
if (mci.sd) {
/* System damage -> stopping machine */
s390_handle_damage();
}
/*
* Reinject the instruction processing damages' machine checks
* including Delayed Access Exception into the guest
......@@ -398,7 +438,7 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
s390_handle_damage();
}
}
if (s390_check_registers(mci, user_mode(regs))) {
if (s390_validate_registers(mci, user_mode(regs))) {
/*
* Couldn't restore all register contents for the
* user space process -> mark task for termination.
......@@ -428,21 +468,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs)
mcck_pending = 1;
}
/*
* Reinject storage related machine checks into the guest if they
* happen when the guest is running.
*/
if (!test_cpu_flag(CIF_MCCK_GUEST)) {
if (mci.se)
/* Storage error uncorrected */
s390_handle_damage();
if (mci.ke)
/* Storage key-error uncorrected */
s390_handle_damage();
if (mci.ds && mci.fa)
/* Storage degradation */
s390_handle_damage();
}
if (mci.cp) {
/* Channel report word pending */
mcck->channel_report = 1;
......
......@@ -2,8 +2,9 @@
/*
* Performance event support for s390x - CPU-measurement Counter Facility
*
* Copyright IBM Corp. 2012, 2019
* Copyright IBM Corp. 2012, 2021
* Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
* Thomas Richter <tmricht@linux.ibm.com>
*/
#define KMSG_COMPONENT "cpum_cf"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
......@@ -14,7 +15,223 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/miscdevice.h>
#include <asm/cpu_mcf.h>
#include <asm/hwctrset.h>
#include <asm/debug.h>
static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */
static debug_info_t *cf_dbg;
#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
/* interval in seconds */
/* Counter sets are stored as a data stream in a page-sized memory buffer and
* exported to user space via raw data attached to the event sample data.
* Each counter set starts with an eight byte header consisting of:
* - a two byte eye catcher (0xfeef)
* - a two byte counter set number
* - a two byte counter set size (indicates the number of counters in this set)
* - a two byte reserved value (must be zero) to make the header the same
* size as a counter value.
* All counter values are eight bytes in size.
*
* All counter sets are followed by a 64 byte trailer.
* The trailer consists of a:
* - flag field indicating valid fields when corresponding bit set
* - the counter facility first and second version number
* - the CPU speed if nonzero
* - the time stamp the counter sets have been collected
* - the time of day (TOD) base value
* - the machine type.
*
* The counter sets are saved when the process is prepared to be executed on a
* CPU and saved again when the process is going to be removed from a CPU.
* The difference of both counter sets is calculated and stored in the event
* sample data area.
*/
struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
unsigned int def:16; /* 0-15 Data Entry Format */
unsigned int set:16; /* 16-31 Counter set identifier */
unsigned int ctr:16; /* 32-47 Number of stored counters */
unsigned int res1:16; /* 48-63 Reserved */
};
struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
/* 0 - 7 */
union {
struct {
unsigned int clock_base:1; /* TOD clock base set */
unsigned int speed:1; /* CPU speed set */
/* Measurement alerts */
unsigned int mtda:1; /* Loss of MT ctr. data alert */
unsigned int caca:1; /* Counter auth. change alert */
unsigned int lcda:1; /* Loss of counter data alert */
};
unsigned long flags; /* 0-63 All indicators */
};
/* 8 - 15 */
unsigned int cfvn:16; /* 64-79 Ctr First Version */
unsigned int csvn:16; /* 80-95 Ctr Second Version */
unsigned int cpu_speed:32; /* 96-127 CPU speed */
/* 16 - 23 */
unsigned long timestamp; /* 128-191 Timestamp (TOD) */
/* 24 - 55 */
union {
struct {
unsigned long progusage1;
unsigned long progusage2;
unsigned long progusage3;
unsigned long tod_base;
};
unsigned long progusage[4];
};
/* 56 - 63 */
unsigned int mach_type:16; /* Machine type */
unsigned int res1:16; /* Reserved */
unsigned int res2:32; /* Reserved */
};
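
/*
* Hedged sketch (not part of this patch) of walking the stream described
* above, e.g. in a consumer of the raw sample data, where "data" and
* "size" are the raw buffer and its length:
*
*    size_t offset = 0;
*    struct cf_ctrset_entry *e;
*
*    while (offset + sizeof(*e) <= size) {
*        e = (struct cf_ctrset_entry *)(data + offset);
*        if (e->def != CF_DIAG_CTRSET_DEF)
*            break;
*        offset += sizeof(*e) + e->ctr * sizeof(u64);
*    }
*
* The loop stops once the eye catcher no longer matches, which happens
* at the trailing 64 byte cf_trailer_entry.
*/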
/* Create the trailer data at the end of a page. */
static void cfdiag_trailer(struct cf_trailer_entry *te)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cpuid cpuid;
te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */
te->csvn = cpuhw->info.csvn;
get_cpu_id(&cpuid); /* Machine type */
te->mach_type = cpuid.machine;
te->cpu_speed = cfdiag_cpu_speed;
if (te->cpu_speed)
te->speed = 1;
te->clock_base = 1; /* Save clock base */
te->tod_base = tod_clock_base.tod;
te->timestamp = get_tod_clock_fast();
}
/* Read a counter set. The counter set number determines the counter set and
* the CPUM-CF first and second version number determine the number of
* available counters in each counter set.
* Each counter set starts with a header containing the counter set number and
* the number of eight byte counters.
*
* The function returns the number of bytes occupied by this counter set
* including the header.
* If there is no counter in the counter set, this counter set is useless and
* zero is returned in this case.
*
* Note that the counter sets may not be enabled or active and the stcctm
* instruction might return error 3. Depending on the error_ok value this is
* ok, for example when called from the cpumf_pmu_start() callback function.
*/
static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
size_t room, bool error_ok)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
size_t ctrset_size, need = 0;
int rc = 3; /* Assume write failure */
ctrdata->def = CF_DIAG_CTRSET_DEF;
ctrdata->set = ctrset;
ctrdata->res1 = 0;
ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
if (ctrset_size) { /* Save data */
need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
if (need <= room) {
rc = ctr_stcctm(ctrset, ctrset_size,
(u64 *)(ctrdata + 1));
}
if (rc != 3 || error_ok)
ctrdata->ctr = ctrset_size;
else
need = 0;
}
debug_sprintf_event(cf_dbg, 3,
"%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
" need %zd rc %d\n", __func__, ctrset, ctrset_size,
cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
return need;
}
/* Read out all counter sets and save them in the provided data buffer.
* The last 64 bytes host an artificial trailer entry.
*/
static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
bool error_ok)
{
struct cf_trailer_entry *trailer;
size_t offset = 0, done;
int i;
memset(data, 0, sz);
sz -= sizeof(*trailer); /* Always room for trailer */
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
struct cf_ctrset_entry *ctrdata = data + offset;
if (!(auth & cpumf_ctr_ctl[i]))
continue; /* Counter set not authorized */
done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
offset += done;
}
trailer = data + offset;
cfdiag_trailer(trailer);
return offset + sizeof(*trailer);
}
/* Calculate the difference for each counter in a counter set. */
static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
{
for (; --counters >= 0; ++pstart, ++pstop)
if (*pstop >= *pstart)
*pstop -= *pstart;
else
*pstop = *pstart - *pstop + 1;
}
/* Scan the counter sets and calculate the difference of each counter
* in each set. The result is the increment of each counter during the
* period the counter set has been activated.
*
* Return true on success.
*/
static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
{
struct cf_trailer_entry *trailer_start, *trailer_stop;
struct cf_ctrset_entry *ctrstart, *ctrstop;
size_t offset = 0;
auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
do {
ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
pr_err_once("cpum_cf_diag counter set compare error "
"in set %i\n", ctrstart->set);
return 0;
}
auth &= ~cpumf_ctr_ctl[ctrstart->set];
if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
cfdiag_diffctrset((u64 *)(ctrstart + 1),
(u64 *)(ctrstop + 1), ctrstart->ctr);
offset += ctrstart->ctr * sizeof(u64) +
sizeof(*ctrstart);
}
} while (ctrstart->def && auth);
/* Save time_stamp from start of event in stop's trailer */
trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
trailer_stop->progusage[0] = trailer_start->timestamp;
return 1;
}
static enum cpumf_ctr_set get_counter_set(u64 event)
{
......@@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
return set;
}
static int validate_ctr_version(const struct hw_perf_event *hwc)
static int validate_ctr_version(const struct hw_perf_event *hwc,
enum cpumf_ctr_set set)
{
struct cpu_cf_events *cpuhw;
int err = 0;
......@@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
cpuhw = &get_cpu_var(cpu_cf_events);
/* check required version for counter sets */
switch (hwc->config_base) {
switch (set) {
case CPUMF_CTR_SET_BASIC:
case CPUMF_CTR_SET_USER:
if (cpuhw->info.cfvn < 1)
......@@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
(cpuhw->info.act_ctl & mtdiag_ctl)))
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_MAX:
err = -EOPNOTSUPP;
}
put_cpu_var(cpu_cf_events);
......@@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
static int validate_ctr_auth(const struct hw_perf_event *hwc)
{
struct cpu_cf_events *cpuhw;
u64 ctrs_state;
int err = 0;
cpuhw = &get_cpu_var(cpu_cf_events);
......@@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
* return with -ENOENT in order to fall back to other
* PMUs that might suffice the event request.
*/
ctrs_state = cpumf_ctr_ctl[hwc->config_base];
if (!(ctrs_state & cpuhw->info.auth_ctl))
if (!(hwc->config_base & cpuhw->info.auth_ctl))
err = -ENOENT;
put_cpu_var(cpu_cf_events);
......@@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu)
if (cpuhw->flags & PMU_F_ENABLED)
return;
err = lcctl(cpuhw->state);
err = lcctl(cpuhw->state | cpuhw->dev_state);
if (err) {
pr_err("Enabling the performance measuring unit "
"failed with rc=%x\n", err);
......@@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu)
return;
inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
inactive |= cpuhw->dev_state;
err = lcctl(inactive);
if (err) {
pr_err("Disabling the performance measuring unit "
......@@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = {
[PERF_COUNT_HW_BUS_CYCLES] = -1,
};
static void cpumf_hw_inuse(void)
{
mutex_lock(&pmc_reserve_mutex);
if (atomic_inc_return(&num_events) == 1)
__kernel_cpumcf_begin();
mutex_unlock(&pmc_reserve_mutex);
}
static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
{
struct perf_event_attr *attr = &event->attr;
......@@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
/*
* Use the hardware perf event structure to store the
* counter number in the 'config' member and the counter
* set number in the 'config_base'. The counter set number
* is then later used to enable/disable the counter(s).
* set number in the 'config_base' as bit mask.
* It is later used to enable/disable the counter(s).
*/
hwc->config = ev;
hwc->config_base = set;
hwc->config_base = cpumf_ctr_ctl[set];
break;
case CPUMF_CTR_SET_MAX:
/* The counter could not be associated to a counter set */
......@@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
}
/* Initialize for using the CPU-measurement counter facility */
if (!atomic_inc_not_zero(&num_events)) {
mutex_lock(&pmc_reserve_mutex);
if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
err = -EBUSY;
else
atomic_inc(&num_events);
mutex_unlock(&pmc_reserve_mutex);
}
if (err)
return err;
cpumf_hw_inuse();
event->destroy = hw_perf_event_destroy;
/* Finally, validate version and authorization of the counter set */
err = validate_ctr_auth(hwc);
if (!err)
err = validate_ctr_version(hwc);
err = validate_ctr_version(hwc, set);
return err;
}
......@@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct hw_perf_event *hwc = &event->hw;
int i;
if (!(hwc->state & PERF_HES_STOPPED))
return;
......@@ -376,28 +595,91 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
* needs to be synchronized. At this point, the counter set can be in
* the inactive or disabled state.
*/
if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
cpuhw->usedss = cfdiag_getctr(cpuhw->start,
sizeof(cpuhw->start),
hwc->config_base, true);
} else {
hw_perf_event_reset(event);
}
/* Increment refcount for counter sets */
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
if ((hwc->config_base & cpumf_ctr_ctl[i]))
atomic_inc(&cpuhw->ctr_set[i]);
}
/* increment refcount for this counter set */
atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
/* Create perf event sample with the counter sets as raw data. The sample
* is then pushed to the event subsystem and the function checks for
* possible event overflows. If an event overflow occurs, the PMU is
* stopped.
*
* Return non-zero if an event overflow occurred.
*/
static int cfdiag_push_sample(struct perf_event *event,
struct cpu_cf_events *cpuhw)
{
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
int overflow;
/* Setup perf sample */
perf_sample_data_init(&data, 0, event->hw.last_period);
memset(&regs, 0, sizeof(regs));
memset(&raw, 0, sizeof(raw));
if (event->attr.sample_type & PERF_SAMPLE_CPU)
data.cpu_entry.cpu = event->cpu;
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = cpuhw->usedss;
raw.frag.data = cpuhw->stop;
raw.size = raw.frag.size;
data.raw = &raw;
}
overflow = perf_event_overflow(event, &data, &regs);
debug_sprintf_event(cf_dbg, 3,
"%s event %#llx sample_type %#llx raw %d ov %d\n",
__func__, event->hw.config,
event->attr.sample_type, raw.size, overflow);
if (overflow)
event->pmu->stop(event, 0);
perf_event_update_userpage(event);
return overflow;
}
static void cpumf_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct hw_perf_event *hwc = &event->hw;
int i;
if (!(hwc->state & PERF_HES_STOPPED)) {
/* Decrement the reference count for each counter set and, if this
* is the last event using the set, clear the activation
* control and set the counter set state to inactive.
*/
if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
ctr_set_stop(&cpuhw->state, hwc->config_base);
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
if (!(hwc->config_base & cpumf_ctr_ctl[i]))
continue;
if (!atomic_dec_return(&cpuhw->ctr_set[i]))
ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
}
hwc->state |= PERF_HES_STOPPED;
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
local64_inc(&event->count);
cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
sizeof(cpuhw->stop),
event->hw.config_base,
false);
if (cfdiag_diffctr(cpuhw, event->hw.config_base))
cfdiag_push_sample(event, cpuhw);
} else
hw_perf_event_update(event);
hwc->state |= PERF_HES_UPTODATE;
}
......@@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
static void cpumf_pmu_del(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
int i;
cpumf_pmu_stop(event, PERF_EF_UPDATE);
......@@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
* clear enable control and resets all counters in a set. Therefore,
* cpumf_pmu_start() always has to reenable a counter set.
*/
if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
ctr_set_disable(&cpuhw->state, event->hw.config_base);
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
if (!atomic_read(&cpuhw->ctr_set[i]))
ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
}
/* Performance monitoring unit for s390x */
......@@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = {
.read = cpumf_pmu_read,
};
static int cfset_init(void);
static int __init cpumf_pmu_init(void)
{
int rc;
......@@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void)
if (!kernel_cpumcf_avail())
return -ENODEV;
/* Setup s390dbf facility */
cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
if (!cf_dbg) {
pr_err("Registration of s390dbf(cpum_cf) failed\n");
return -ENOMEM;
}
debug_register_view(cf_dbg, &debug_sprintf_view);
cpumf_pmu.attr_groups = cpumf_cf_event_group();
rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
if (rc)
if (rc) {
debug_unregister_view(cf_dbg, &debug_sprintf_view);
debug_unregister(cf_dbg);
pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
} else if (stccm_avail()) { /* Setup counter set device */
cfset_init();
}
return rc;
}
/* Support for the CPU Measurement Facility counter set extraction using
* device /dev/hwctr. This allows user space programs to extract complete
* counter sets via normal file operations.
*/
static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */
static DEFINE_MUTEX(cfset_ctrset_mutex); /* Synchronize access to hardware */
struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */
unsigned int sets; /* Counter set bit mask */
atomic_t cpus_ack; /* # CPUs successfully executed func */
};
static struct cfset_request { /* CPUs and counter set bit mask */
unsigned long ctrset; /* Bit mask of counter set to read */
cpumask_t mask; /* CPU mask to read from */
} cfset_request;
static void cfset_ctrset_clear(void)
{
cpumask_clear(&cfset_request.mask);
cfset_request.ctrset = 0;
}
/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device
* access path is currently in use.
* The cpu_cf_events::dev_state is used to denote counter sets in use by this
* interface. It is always or'ed in. If this interface is not active, its
* value is zero and no additional counter sets will be included.
*
* The cpu_cf_events::state is used by the perf_event_open SVC and remains
* unchanged.
*
* perf_pmu_enable() and perf_pmu_disable() and their callbacks
* cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
* performance measurement subsystem to enable the per process
* CPU Measurement counter facility.
* On x86 the XXX_enable() and XXX_disable() callbacks are used to turn off
* the performance monitoring interrupt (PMI) during scheduling.
* s390 uses these calls to temporarily stop and resume the active CPU
* counter sets during scheduling.
*
* We do allow concurrent use of the perf_event_open() SVC and /dev/hwctr
* device access. The perf_event_open() SVC interface makes a lot of effort
* to only run the counters while the calling process is actively scheduled
* to run.
* When the /dev/hwctr interface is also used at the same time, the counter
* sets will keep running, even when the process is scheduled off a CPU.
* However, this is not a problem and does not lead to wrong counter values
* for the perf_event_open() SVC. The current counter value will be recorded
* during schedule-in. At schedule-out time the current counter value is
* extracted again and the delta is calculated and added to the event.
*/
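A minimal sketch of the state merge described above; the control word values are made up for illustration:

static void state_merge_sketch(void)
{
	u64 state = 0x02;	/* perf_event_open path: one set active */
	u64 dev_state = 0x08;	/* /dev/hwctr path: another set active */

	lcctl(state | dev_state);	/* Both paths' counter sets run */
	lcctl(state);			/* Device sets stopped, perf kept */
}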
/* Stop all counter sets via ioctl interface */
static void cfset_ioctl_off(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cfset_call_on_cpu_parm *p = parm;
int rc;
cpuhw->dev_state = 0;
for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
if ((p->sets & cpumf_ctr_ctl[rc]))
atomic_dec(&cpuhw->ctr_set[rc]);
rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */
if (rc)
pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
cpuhw->state, S390_HWCTR_DEVICE, rc);
cpuhw->flags &= ~PMU_F_IN_USE;
debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
__func__, rc, cpuhw->state, cpuhw->dev_state);
}
/* Start counter sets on particular CPU */
static void cfset_ioctl_on(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cfset_call_on_cpu_parm *p = parm;
int rc;
cpuhw->flags |= PMU_F_IN_USE;
ctr_set_enable(&cpuhw->dev_state, p->sets);
ctr_set_start(&cpuhw->dev_state, p->sets);
for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
if ((p->sets & cpumf_ctr_ctl[rc]))
atomic_inc(&cpuhw->ctr_set[rc]);
rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */
if (!rc)
atomic_inc(&p->cpus_ack);
else
pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
__func__, rc, cpuhw->state, cpuhw->dev_state);
}
static void cfset_release_cpu(void *p)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
int rc;
debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
__func__, cpuhw->state, cpuhw->dev_state);
rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */
if (rc)
pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
cpuhw->state, S390_HWCTR_DEVICE, rc);
cpuhw->dev_state = 0;
}
/* The release function is also called when the application gets terminated
* without doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
*/
static int cfset_release(struct inode *inode, struct file *file)
{
on_each_cpu(cfset_release_cpu, NULL, 1);
hw_perf_event_destroy(NULL);
cfset_ctrset_clear();
atomic_set(&cfset_opencnt, 0);
return 0;
}
static int cfset_open(struct inode *inode, struct file *file)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* Only one user space program can open /dev/hwctr */
if (atomic_xchg(&cfset_opencnt, 1))
return -EBUSY;
cpumf_hw_inuse();
file->private_data = NULL;
/* nonseekable_open() never fails */
return nonseekable_open(inode, file);
}
static int cfset_all_stop(void)
{
struct cfset_call_on_cpu_parm p = {
.sets = cfset_request.ctrset,
};
cpumask_var_t mask;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
free_cpumask_var(mask);
return 0;
}
static int cfset_all_start(void)
{
struct cfset_call_on_cpu_parm p = {
.sets = cfset_request.ctrset,
.cpus_ack = ATOMIC_INIT(0),
};
cpumask_var_t mask;
int rc = 0;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
rc = -EIO;
debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
}
free_cpumask_var(mask);
return rc;
}
subsys_initcall(cpumf_pmu_init);
/* Return the maximum required space for all possible CPUs in case one
* CPU will be onlined during the START, READ, STOP cycles.
* To find out the size of the counter sets, any one CPU will do. They
* all have the same counter sets.
*/
static size_t cfset_needspace(unsigned int sets)
{
struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
size_t bytes = 0;
int i;
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
if (!(sets & cpumf_ctr_ctl[i]))
continue;
bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
sizeof(((struct s390_ctrset_setdata *)0)->set) +
sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
}
bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
put_cpu_ptr(&cpu_cf_events);
return bytes;
}
static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
{
struct s390_ctrset_read __user *ctrset_read;
unsigned int cpu, cpus, rc;
void __user *uptr;
ctrset_read = (struct s390_ctrset_read __user *)arg;
uptr = ctrset_read->data;
for_each_cpu(cpu, mask) {
struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
struct s390_ctrset_cpudata __user *ctrset_cpudata;
ctrset_cpudata = uptr;
rc = put_user(cpu, &ctrset_cpudata->cpu_nr);
rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
cpuhw->used);
if (rc)
return -EFAULT;
uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
cond_resched();
}
cpus = cpumask_weight(mask);
if (put_user(cpus, &ctrset_read->no_cpus))
return -EFAULT;
debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
uptr - (void __user *)ctrset_read->data);
return 0;
}
static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
int ctrset_size, size_t room)
{
size_t need = 0;
int rc = -1;
need = sizeof(*p) + sizeof(u64) * ctrset_size;
if (need <= room) {
p->set = cpumf_ctr_ctl[ctrset];
p->no_cnts = ctrset_size;
rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
if (rc == 3) /* Nothing stored */
need = 0;
}
return need;
}
/* Read all counter sets. */
static void cfset_cpu_read(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cfset_call_on_cpu_parm *p = parm;
int set, set_size;
size_t space;
/* No data saved yet */
cpuhw->used = 0;
cpuhw->sets = 0;
memset(cpuhw->data, 0, sizeof(cpuhw->data));
/* Scan the counter sets */
for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
cpuhw->used;
if (!(p->sets & cpumf_ctr_ctl[set]))
continue; /* Counter set not in list */
set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
space = sizeof(cpuhw->data) - cpuhw->used;
space = cfset_cpuset_read(sp, set, set_size, space);
if (space) {
cpuhw->used += space;
cpuhw->sets += 1;
}
}
debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
cpuhw->sets, cpuhw->used);
}
static int cfset_all_read(unsigned long arg)
{
struct cfset_call_on_cpu_parm p;
cpumask_var_t mask;
int rc;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
p.sets = cfset_request.ctrset;
cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
rc = cfset_all_copy(arg, mask);
free_cpumask_var(mask);
return rc;
}
static long cfset_ioctl_read(unsigned long arg)
{
struct s390_ctrset_read read;
int ret = 0;
if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
return -EFAULT;
ret = cfset_all_read(arg);
return ret;
}
static long cfset_ioctl_stop(void)
{
int ret = -ENXIO;
if (cfset_request.ctrset) {
ret = cfset_all_stop();
cfset_ctrset_clear();
}
return ret;
}
static long cfset_ioctl_start(unsigned long arg)
{
struct s390_ctrset_start __user *ustart;
struct s390_ctrset_start start;
void __user *umask;
unsigned int len;
int ret = 0;
size_t need;
if (cfset_request.ctrset)
return -EBUSY;
ustart = (struct s390_ctrset_start __user *)arg;
if (copy_from_user(&start, ustart, sizeof(start)))
return -EFAULT;
if (start.version != S390_HWCTR_START_VERSION)
return -EINVAL;
if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
return -EINVAL; /* Invalid counter set */
if (!start.counter_sets)
return -EINVAL; /* No counter set at all? */
cpumask_clear(&cfset_request.mask);
len = min_t(u64, start.cpumask_len, cpumask_size());
umask = (void __user *)start.cpumask;
if (copy_from_user(&cfset_request.mask, umask, len))
return -EFAULT;
if (cpumask_empty(&cfset_request.mask))
return -EINVAL;
need = cfset_needspace(start.counter_sets);
if (put_user(need, &ustart->data_bytes))
ret = -EFAULT;
if (ret)
goto out;
cfset_request.ctrset = start.counter_sets;
ret = cfset_all_start();
out:
if (ret)
cfset_ctrset_clear();
debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n",
__func__, cfset_request.ctrset, need, ret);
return ret;
}
/* Entry point to the /dev/hwctr device interface.
* The ioctl system call supports three subcommands:
* S390_HWCTR_START: Start the specified counter sets on a CPU list. The
* counter sets keep running until explicitly stopped. Returns the number
* of bytes needed to store the counter values. If another S390_HWCTR_START
* ioctl subcommand is called without a previous S390_HWCTR_STOP
* subcommand, -EBUSY is returned.
* S390_HWCTR_READ: Read the counter set values from the CPU list given
* with the S390_HWCTR_START command.
* S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
* previous S390_HWCTR_START subcommand.
*/
static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
int ret;
get_online_cpus();
mutex_lock(&cfset_ctrset_mutex);
switch (cmd) {
case S390_HWCTR_START:
ret = cfset_ioctl_start(arg);
break;
case S390_HWCTR_STOP:
ret = cfset_ioctl_stop();
break;
case S390_HWCTR_READ:
ret = cfset_ioctl_read(arg);
break;
default:
ret = -ENOTTY;
break;
}
mutex_unlock(&cfset_ctrset_mutex);
put_online_cpus();
return ret;
}
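To make the protocol described above concrete, a minimal user space sketch of one START/READ/STOP cycle. It assumes the uapi definitions from <asm/hwctrset.h> and elides all error handling; the counter_sets value and the cpumask size are illustrative assumptions:

#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/hwctrset.h>

int main(void)
{
	uint64_t cpumask[16];			/* One bit per CPU (sketch) */
	struct s390_ctrset_start start;
	struct s390_ctrset_read *buf;
	int fd = open("/dev/hwctr", O_RDWR);

	memset(cpumask, 0xff, sizeof(cpumask));	/* All CPUs */
	memset(&start, 0, sizeof(start));
	start.version = S390_HWCTR_START_VERSION;
	start.counter_sets = 2;			/* Assumed: basic set bit */
	start.cpumask_len = sizeof(cpumask);
	start.cpumask = cpumask;
	ioctl(fd, S390_HWCTR_START, &start);	/* Fills start.data_bytes */

	buf = malloc(start.data_bytes);		/* Worst-case size from START */
	ioctl(fd, S390_HWCTR_READ, buf);	/* Per-CPU counter values */
	ioctl(fd, S390_HWCTR_STOP);		/* Release the counter sets */

	free(buf);
	close(fd);
	return 0;
}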
static const struct file_operations cfset_fops = {
.owner = THIS_MODULE,
.open = cfset_open,
.release = cfset_release,
.unlocked_ioctl = cfset_ioctl,
.compat_ioctl = cfset_ioctl,
.llseek = no_llseek
};
static struct miscdevice cfset_dev = {
.name = S390_HWCTR_DEVICE,
.minor = MISC_DYNAMIC_MINOR,
.fops = &cfset_fops,
};
int cfset_online_cpu(unsigned int cpu)
{
struct cfset_call_on_cpu_parm p;
mutex_lock(&cfset_ctrset_mutex);
if (cfset_request.ctrset) {
p.sets = cfset_request.ctrset;
cfset_ioctl_on(&p);
cpumask_set_cpu(cpu, &cfset_request.mask);
}
mutex_unlock(&cfset_ctrset_mutex);
return 0;
}
int cfset_offline_cpu(unsigned int cpu)
{
struct cfset_call_on_cpu_parm p;
mutex_lock(&cfset_ctrset_mutex);
if (cfset_request.ctrset) {
p.sets = cfset_request.ctrset;
cfset_ioctl_off(&p);
cpumask_clear_cpu(cpu, &cfset_request.mask);
}
mutex_unlock(&cfset_ctrset_mutex);
return 0;
}
static void cfdiag_read(struct perf_event *event)
{
debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
event->attr.config, local64_read(&event->count));
}
static int get_authctrsets(void)
{
struct cpu_cf_events *cpuhw;
unsigned long auth = 0;
enum cpumf_ctr_set i;
cpuhw = &get_cpu_var(cpu_cf_events);
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
auth |= cpumf_ctr_ctl[i];
}
put_cpu_var(cpu_cf_events);
return auth;
}
/* Setup the event. Test for authorized counter sets and only include counter
* sets which are authorized at the time of the setup. Including unauthorized
* counter sets results in a specification exception (and panic).
*/
static int cfdiag_event_init2(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
int err = 0;
/* Set sample_period to indicate sampling */
event->hw.config = attr->config;
event->hw.sample_period = attr->sample_period;
local64_set(&event->hw.period_left, event->hw.sample_period);
local64_set(&event->count, 0);
event->hw.last_period = event->hw.sample_period;
/* Add all authorized counter sets to config_base. The
* hardware init function is either called per CPU or just once
* for all CPUs (event->cpu == -1). This depends on whether
* counting is started for all CPUs or on a per-workload basis where
* the perf event moves from one CPU to another CPU.
* Checking the authorization on any CPU is fine as the hardware
* applies the same authorization settings to all CPUs.
*/
event->hw.config_base = get_authctrsets();
/* No authorized counter sets, nothing to count/sample */
if (!event->hw.config_base)
err = -EINVAL;
debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
__func__, err, event->hw.config_base);
return err;
}
static int cfdiag_event_init(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
int err = -ENOENT;
if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
event->attr.type != event->pmu->type)
goto out;
/* Raw events are used to access counters directly,
* hence do not permit excludes.
* This event is useless without PERF_SAMPLE_RAW to return counter set
* values as raw data.
*/
if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
!(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
err = -EOPNOTSUPP;
goto out;
}
/* Initialize for using the CPU-measurement counter facility */
cpumf_hw_inuse();
event->destroy = hw_perf_event_destroy;
err = cfdiag_event_init2(event);
if (unlikely(err))
event->destroy(event);
out:
return err;
}
/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
* to collect the complete counter sets for a scheduled process. The goal is
* to attach the complete counter sets as raw data to the artificial event.
* This results in complete counter sets available when a process is
* scheduled. Contains the delta of every counter while the process was
* running.
*/
CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
static struct attribute *cfdiag_events_attr[] = {
CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
NULL,
};
PMU_FORMAT_ATTR(event, "config:0-63");
static struct attribute *cfdiag_format_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group cfdiag_events_group = {
.name = "events",
.attrs = cfdiag_events_attr,
};
static struct attribute_group cfdiag_format_group = {
.name = "format",
.attrs = cfdiag_format_attr,
};
static const struct attribute_group *cfdiag_attr_groups[] = {
&cfdiag_events_group,
&cfdiag_format_group,
NULL,
};
/* Performance monitoring unit for event CF_DIAG. Since this event
* is also started and stopped via the perf_event_open() system call, use
* the same event enable/disable callback functions. They do not
* have a pointer to the perf_event structure as first parameter.
*
* The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
* Reuse them and distinguish the event (always first parameter) via
* 'config' member.
*/
static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
.event_init = cfdiag_event_init,
.pmu_enable = cpumf_pmu_enable,
.pmu_disable = cpumf_pmu_disable,
.add = cpumf_pmu_add,
.del = cpumf_pmu_del,
.start = cpumf_pmu_start,
.stop = cpumf_pmu_stop,
.read = cfdiag_read,
.attr_groups = cfdiag_attr_groups
};
/* Calculate memory needed to store all counter sets together with header and
* trailer data. This is independent of the counter set authorization which
* can vary depending on the configuration.
*/
static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
{
size_t max_size = sizeof(struct cf_trailer_entry);
enum cpumf_ctr_set i;
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
size_t size = cpum_cf_ctrset_size(i, info);
if (size)
max_size += size * sizeof(u64) +
sizeof(struct cf_ctrset_entry);
}
return max_size;
}
/* Get the CPU speed, try sampling facility first and CPU attributes second. */
static void cfdiag_get_cpu_speed(void)
{
if (cpum_sf_avail()) { /* Sampling facility first */
struct hws_qsi_info_block si;
memset(&si, 0, sizeof(si));
if (!qsi(&si)) {
cfdiag_cpu_speed = si.cpu_speed;
return;
}
}
/* Fallback: extract the static part of the CPU speed. Used in case
* the CPU Measurement Sampling Facility is turned off.
*/
if (test_facility(34)) {
unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
if (mhz != -1UL)
cfdiag_cpu_speed = mhz & 0xffffffff;
}
}
static int cfset_init(void)
{
struct cpumf_ctr_info info;
size_t need;
int rc;
if (qctri(&info))
return -ENODEV;
cfdiag_get_cpu_speed();
/* Make sure the counter set data fits into predefined buffer. */
need = cfdiag_maxsize(&info);
if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
need);
return -ENOMEM;
}
rc = misc_register(&cfset_dev);
if (rc) {
pr_err("Registration of /dev/%s failed rc=%i\n",
cfset_dev.name, rc);
goto out;
}
rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
if (rc) {
misc_deregister(&cfset_dev);
pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
rc);
}
out:
return rc;
}
device_initcall(cpumf_pmu_init);
......@@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
},
.alert = ATOMIC64_INIT(0),
.state = 0,
.dev_state = 0,
.flags = 0,
.used = 0,
.usedss = 0,
.sets = 0
};
/* Indicator whether the CPU-Measurement Counter Facility Support is ready */
static bool cpum_cf_initalized;
......@@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void)
}
EXPORT_SYMBOL(kernel_cpumcf_avail);
/* Reserve/release functions for sharing perf hardware */
static DEFINE_SPINLOCK(cpumcf_owner_lock);
static void *cpumcf_owner;
/* Initialize the CPU-measurement counter facility */
int __kernel_cpumcf_begin(void)
{
int flags = PMC_INIT;
int err = 0;
spin_lock(&cpumcf_owner_lock);
if (cpumcf_owner)
err = -EBUSY;
else
cpumcf_owner = __builtin_return_address(0);
spin_unlock(&cpumcf_owner_lock);
if (err)
return err;
on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
......@@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void)
on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
spin_lock(&cpumcf_owner_lock);
cpumcf_owner = NULL;
spin_unlock(&cpumcf_owner_lock);
}
EXPORT_SYMBOL(__kernel_cpumcf_end);
......@@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags)
static int cpum_cf_online_cpu(unsigned int cpu)
{
return cpum_cf_setup(cpu, PMC_INIT);
cpum_cf_setup(cpu, PMC_INIT);
return cfset_online_cpu(cpu);
}
static int cpum_cf_offline_cpu(unsigned int cpu)
{
cfset_offline_cpu(cpu);
return cpum_cf_setup(cpu, PMC_RELEASE);
}
......
// SPDX-License-Identifier: GPL-2.0
/*
* Performance event support for s390x - CPU-measurement Counter Sets
*
* Copyright IBM Corp. 2019, 2021
* Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
* Thomas Richter <tmricht@linux.ibm.com>
*/
#define KMSG_COMPONENT "cpum_cf_diag"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/processor.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <asm/ctl_reg.h>
#include <asm/irq.h>
#include <asm/cpu_mcf.h>
#include <asm/timex.h>
#include <asm/debug.h>
#include <asm/hwctrset.h>
#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
/* interval in seconds */
static unsigned int cf_diag_cpu_speed;
static debug_info_t *cf_diag_dbg;
struct cf_diag_csd { /* Counter set data per CPU */
size_t used; /* Bytes used in data/start */
unsigned char start[PAGE_SIZE]; /* Counter set at event start */
unsigned char data[PAGE_SIZE]; /* Counter set at event delete */
unsigned int sets; /* # Counter set saved in data */
};
static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
/* Counter sets are stored as data stream in a page sized memory buffer and
* exported to user space via raw data attached to the event sample data.
* Each counter set starts with an eight byte header consisting of:
* - a two byte eye catcher (0xfeef)
* - a two byte counter set number
* - a two byte counter set size (indicates the number of counters in this set)
* - a two byte reserved value (must be zero) to make the header the same
* size as a counter value.
* All counter values are eight byte in size.
*
* All counter sets are followed by a 64 byte trailer.
* The trailer consists of:
* - a flag field indicating valid fields when the corresponding bit is set
* - the counter facility first and second version number
* - the CPU speed if nonzero
* - the time stamp the counter sets have been collected
* - the time of day (TOD) base value
* - the machine type.
*
* The counter sets are saved when the process is prepared to be executed on a
* CPU and saved again when the process is going to be removed from a CPU.
* The difference of both counter sets is calculated and stored in the event
* sample data area.
*/
struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
unsigned int def:16; /* 0-15 Data Entry Format */
unsigned int set:16; /* 16-31 Counter set identifier */
unsigned int ctr:16; /* 32-47 Number of stored counters */
unsigned int res1:16; /* 48-63 Reserved */
};
struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
/* 0 - 7 */
union {
struct {
unsigned int clock_base:1; /* TOD clock base set */
unsigned int speed:1; /* CPU speed set */
/* Measurement alerts */
unsigned int mtda:1; /* Loss of MT ctr. data alert */
unsigned int caca:1; /* Counter auth. change alert */
unsigned int lcda:1; /* Loss of counter data alert */
};
unsigned long flags; /* 0-63 All indicators */
};
/* 8 - 15 */
unsigned int cfvn:16; /* 64-79 Ctr First Version */
unsigned int csvn:16; /* 80-95 Ctr Second Version */
unsigned int cpu_speed:32; /* 96-127 CPU speed */
/* 16 - 23 */
unsigned long timestamp; /* 128-191 Timestamp (TOD) */
/* 24 - 55 */
union {
struct {
unsigned long progusage1;
unsigned long progusage2;
unsigned long progusage3;
unsigned long tod_base;
};
unsigned long progusage[4];
};
/* 56 - 63 */
unsigned int mach_type:16; /* Machine type */
unsigned int res1:16; /* Reserved */
unsigned int res2:32; /* Reserved */
};
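As a consumer side illustration of the stream layout described above, a minimal walker; it assumes a well-formed buffer as produced by this file's getctr path and GNU void pointer arithmetic:

/* Step through the stream header by header until the eye catcher is
 * gone; the 64 byte trailer follows at the final offset. */
static const struct cf_trailer_entry *walk_ctrsets(const void *data,
						   size_t size)
{
	size_t offset = 0;

	while (offset + sizeof(struct cf_ctrset_entry) <= size) {
		const struct cf_ctrset_entry *hdr = data + offset;

		if (hdr->def != CF_DIAG_CTRSET_DEF)
			break;	/* No header mark: counter sets done */
		/* hdr->ctr eight byte counter values follow the header */
		offset += sizeof(*hdr) + hdr->ctr * sizeof(u64);
	}
	return data + offset;
}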
/* Create the trailer data at the end of a page. */
static void cf_diag_trailer(struct cf_trailer_entry *te)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cpuid cpuid;
te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */
te->csvn = cpuhw->info.csvn;
get_cpu_id(&cpuid); /* Machine type */
te->mach_type = cpuid.machine;
te->cpu_speed = cf_diag_cpu_speed;
if (te->cpu_speed)
te->speed = 1;
te->clock_base = 1; /* Save clock base */
te->tod_base = tod_clock_base.tod;
te->timestamp = get_tod_clock_fast();
}
/*
* Change the CPUMF state to active.
* Enable and activate the CPU-counter sets according
* to the per-cpu control state.
*/
static void cf_diag_enable(struct pmu *pmu)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
int err;
debug_sprintf_event(cf_diag_dbg, 5,
"%s pmu %p cpu %d flags %#x state %#llx\n",
__func__, pmu, smp_processor_id(), cpuhw->flags,
cpuhw->state);
if (cpuhw->flags & PMU_F_ENABLED)
return;
err = lcctl(cpuhw->state);
if (err) {
pr_err("Enabling the performance measuring unit "
"failed with rc=%x\n", err);
return;
}
cpuhw->flags |= PMU_F_ENABLED;
}
/*
* Change the CPUMF state to inactive.
* Disable and enable (inactive) the CPU-counter sets according
* to the per-cpu control state.
*/
static void cf_diag_disable(struct pmu *pmu)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
u64 inactive;
int err;
debug_sprintf_event(cf_diag_dbg, 5,
"%s pmu %p cpu %d flags %#x state %#llx\n",
__func__, pmu, smp_processor_id(), cpuhw->flags,
cpuhw->state);
if (!(cpuhw->flags & PMU_F_ENABLED))
return;
inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
err = lcctl(inactive);
if (err) {
pr_err("Disabling the performance measuring unit "
"failed with rc=%x\n", err);
return;
}
cpuhw->flags &= ~PMU_F_ENABLED;
}
/* Number of perf events counting hardware events */
static atomic_t cf_diag_events = ATOMIC_INIT(0);
/* Used to avoid races in calling reserve/release_cpumf_hardware */
static DEFINE_MUTEX(cf_diag_reserve_mutex);
/* Release the PMU if event is the last perf event */
static void cf_diag_perf_event_destroy(struct perf_event *event)
{
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d cf_diag_events %d\n",
__func__, event, smp_processor_id(),
atomic_read(&cf_diag_events));
if (atomic_dec_return(&cf_diag_events) == 0)
__kernel_cpumcf_end();
}
static int get_authctrsets(void)
{
struct cpu_cf_events *cpuhw;
unsigned long auth = 0;
enum cpumf_ctr_set i;
cpuhw = &get_cpu_var(cpu_cf_events);
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
auth |= cpumf_ctr_ctl[i];
}
put_cpu_var(cpu_cf_events);
return auth;
}
/* Setup the event. Test for authorized counter sets and only include counter
* sets which are authorized at the time of the setup. Including unauthorized
* counter sets results in a specification exception (and panic).
*/
static int __hw_perf_event_init(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
int err = 0;
debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
event, event->cpu);
event->hw.config = attr->config;
/* Add all authorized counter sets to config_base. The
* hardware init function is either called per CPU or just once
* for all CPUs (event->cpu == -1). This depends on whether
* counting is started for all CPUs or on a per-workload basis where
* the perf event moves from one CPU to another CPU.
* Checking the authorization on any CPU is fine as the hardware
* applies the same authorization settings to all CPUs.
*/
event->hw.config_base = get_authctrsets();
/* No authorized counter sets, nothing to count/sample */
if (!event->hw.config_base) {
err = -EINVAL;
goto out;
}
/* Set sample_period to indicate sampling */
event->hw.sample_period = attr->sample_period;
local64_set(&event->hw.period_left, event->hw.sample_period);
event->hw.last_period = event->hw.sample_period;
out:
debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
__func__, err, event->hw.config_base);
return err;
}
/* Return a non-negative value if the CPU-measurement counter facility
* is currently free and a negative error code otherwise.
*/
static int cf_diag_perf_event_inuse(void)
{
int err = 0;
if (!atomic_inc_not_zero(&cf_diag_events)) {
mutex_lock(&cf_diag_reserve_mutex);
if (atomic_read(&cf_diag_events) == 0 &&
__kernel_cpumcf_begin())
err = -EBUSY;
else
err = atomic_inc_return(&cf_diag_events);
mutex_unlock(&cf_diag_reserve_mutex);
}
return err;
}
static int cf_diag_event_init(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
int err = -ENOENT;
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d config %#llx type:%u "
"sample_type %#llx cf_diag_events %d\n", __func__,
event, event->cpu, attr->config, event->pmu->type,
attr->sample_type, atomic_read(&cf_diag_events));
if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
event->attr.type != event->pmu->type)
goto out;
/* Raw events are used to access counters directly,
* hence do not permit excludes.
* This event is useless without PERF_SAMPLE_RAW to return counter set
* values as raw data.
*/
if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
!(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
err = -EOPNOTSUPP;
goto out;
}
/* Initialize for using the CPU-measurement counter facility */
err = cf_diag_perf_event_inuse();
if (err < 0)
goto out;
event->destroy = cf_diag_perf_event_destroy;
err = __hw_perf_event_init(event);
if (unlikely(err))
event->destroy(event);
out:
debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
return err;
}
static void cf_diag_read(struct perf_event *event)
{
debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
}
/* Calculate memory needed to store all counter sets together with header and
* trailer data. This is independent of the counter set authorization which
* can vary depending on the configuration.
*/
static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
{
size_t max_size = sizeof(struct cf_trailer_entry);
enum cpumf_ctr_set i;
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
size_t size = cpum_cf_ctrset_size(i, info);
if (size)
max_size += size * sizeof(u64) +
sizeof(struct cf_ctrset_entry);
}
debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
max_size);
return max_size;
}
/* Read a counter set. The counter set number determines which counter set and
* the CPUM-CF first and second version number determine the number of
* available counters in this counter set.
* Each counter set starts with a header containing the counter set number and
* the number of 8 byte counters.
*
* The function returns the number of bytes occupied by this counter set
* including the header.
* If there is no counter in the counter set, this counter set is useless and
* zero is returned in this case.
*/
static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
size_t room)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
size_t ctrset_size, need = 0;
int rc = 3; /* Assume write failure */
ctrdata->def = CF_DIAG_CTRSET_DEF;
ctrdata->set = ctrset;
ctrdata->res1 = 0;
ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
if (ctrset_size) { /* Save data */
need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
if (need <= room)
rc = ctr_stcctm(ctrset, ctrset_size,
(u64 *)(ctrdata + 1));
if (rc != 3)
ctrdata->ctr = ctrset_size;
else
need = 0;
}
debug_sprintf_event(cf_diag_dbg, 6,
"%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
" need %zd rc %d\n",
__func__, ctrset, ctrset_size, cpuhw->info.cfvn,
cpuhw->info.csvn, need, rc);
return need;
}
/* Read out all counter sets and save them in the provided data buffer.
* The last 64 bytes hold an artificial trailer entry.
*/
static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
{
struct cf_trailer_entry *trailer;
size_t offset = 0, done;
int i;
memset(data, 0, sz);
sz -= sizeof(*trailer); /* Always room for trailer */
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
struct cf_ctrset_entry *ctrdata = data + offset;
if (!(auth & cpumf_ctr_ctl[i]))
continue; /* Counter set not authorized */
done = cf_diag_getctrset(ctrdata, i, sz - offset);
offset += done;
debug_sprintf_event(cf_diag_dbg, 6,
"%s ctrset %d offset %zu done %zu\n",
__func__, i, offset, done);
}
trailer = data + offset;
cf_diag_trailer(trailer);
return offset + sizeof(*trailer);
}
/* Calculate the difference for each counter in a counter set. */
static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
{
for (; --counters >= 0; ++pstart, ++pstop)
if (*pstop >= *pstart)
*pstop -= *pstart;
else
*pstop = *pstart - *pstop;
}
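A short worked example of the helper above; note the absolute difference fallback on the second counter:

static void diffctrset_example(void)
{
	u64 start[2] = { 10, 7 }, stop[2] = { 25, 3 };

	/* Leaves stop[] = { 15, 4 }: a direct delta when the counter
	 * grew, the absolute difference otherwise. */
	cf_diag_diffctrset(start, stop, 2);
}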
/* Scan the counter sets and calculate the difference of each counter
* in each set. The result is the increment of each counter during the
* period the counter set has been activated.
*
* Return true on success.
*/
static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
{
struct cf_trailer_entry *trailer_start, *trailer_stop;
struct cf_ctrset_entry *ctrstart, *ctrstop;
size_t offset = 0;
auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
do {
ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
pr_err("cpum_cf_diag counter set compare error "
"in set %i\n", ctrstart->set);
return 0;
}
auth &= ~cpumf_ctr_ctl[ctrstart->set];
if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
cf_diag_diffctrset((u64 *)(ctrstart + 1),
(u64 *)(ctrstop + 1), ctrstart->ctr);
offset += ctrstart->ctr * sizeof(u64) +
sizeof(*ctrstart);
}
debug_sprintf_event(cf_diag_dbg, 6,
"%s set %d ctr %d offset %zu auth %lx\n",
__func__, ctrstart->set, ctrstart->ctr,
offset, auth);
} while (ctrstart->def && auth);
/* Save time_stamp from start of event in stop's trailer */
trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
trailer_stop->progusage[0] = trailer_start->timestamp;
return 1;
}
/* Create perf event sample with the counter sets as raw data. The sample
* is then pushed to the event subsystem and the function checks for
* possible event overflows. If an event overflow occurs, the PMU is
* stopped.
*
* Return non-zero if an event overflow occurred.
*/
static int cf_diag_push_sample(struct perf_event *event,
struct cf_diag_csd *csd)
{
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
int overflow;
/* Setup perf sample */
perf_sample_data_init(&data, 0, event->hw.last_period);
memset(&regs, 0, sizeof(regs));
memset(&raw, 0, sizeof(raw));
if (event->attr.sample_type & PERF_SAMPLE_CPU)
data.cpu_entry.cpu = event->cpu;
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = csd->used;
raw.frag.data = csd->data;
raw.size = csd->used;
data.raw = &raw;
}
overflow = perf_event_overflow(event, &data, &regs);
debug_sprintf_event(cf_diag_dbg, 6,
"%s event %p cpu %d sample_type %#llx raw %d "
"ov %d\n", __func__, event, event->cpu,
event->attr.sample_type, raw.size, overflow);
if (overflow)
event->pmu->stop(event, 0);
perf_event_update_userpage(event);
return overflow;
}
static void cf_diag_start(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
struct hw_perf_event *hwc = &event->hw;
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d flags %#x hwc-state %#x\n",
__func__, event, event->cpu, flags, hwc->state);
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
/* (Re-)enable and activate all counter sets */
lcctl(0); /* Reset counter sets */
hwc->state = 0;
ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
lcctl(cpuhw->state); /* Enable counter sets */
csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
event->hw.config_base);
ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
/* Function cf_diag_enable() starts the counter sets. */
}
static void cf_diag_stop(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
struct hw_perf_event *hwc = &event->hw;
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d flags %#x hwc-state %#x\n",
__func__, event, event->cpu, flags, hwc->state);
/* Deactivate all counter sets */
ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
local64_inc(&event->count);
csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
event->hw.config_base);
if (cf_diag_diffctr(csd, event->hw.config_base))
cf_diag_push_sample(event, csd);
hwc->state |= PERF_HES_STOPPED;
}
static int cf_diag_add(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
int err = 0;
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d flags %#x cpuhw %p\n",
__func__, event, event->cpu, flags, cpuhw);
if (cpuhw->flags & PMU_F_IN_USE) {
err = -EAGAIN;
goto out;
}
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
cpuhw->flags |= PMU_F_IN_USE;
if (flags & PERF_EF_START)
cf_diag_start(event, PERF_EF_RELOAD);
out:
debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
return err;
}
static void cf_diag_del(struct perf_event *event, int flags)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
debug_sprintf_event(cf_diag_dbg, 5,
"%s event %p cpu %d flags %#x\n",
__func__, event, event->cpu, flags);
cf_diag_stop(event, PERF_EF_UPDATE);
ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
cpuhw->flags &= ~PMU_F_IN_USE;
}
/* Default counter set events and format attribute groups */
CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
static struct attribute *cf_diag_events_attr[] = {
CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
NULL,
};
PMU_FORMAT_ATTR(event, "config:0-63");
static struct attribute *cf_diag_format_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group cf_diag_events_group = {
.name = "events",
.attrs = cf_diag_events_attr,
};
static struct attribute_group cf_diag_format_group = {
.name = "format",
.attrs = cf_diag_format_attr,
};
static const struct attribute_group *cf_diag_attr_groups[] = {
&cf_diag_events_group,
&cf_diag_format_group,
NULL,
};
/* Performance monitoring unit for s390x */
static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
.pmu_enable = cf_diag_enable,
.pmu_disable = cf_diag_disable,
.event_init = cf_diag_event_init,
.add = cf_diag_add,
.del = cf_diag_del,
.start = cf_diag_start,
.stop = cf_diag_stop,
.read = cf_diag_read,
.attr_groups = cf_diag_attr_groups
};
/* Get the CPU speed, try sampling facility first and CPU attributes second. */
static void cf_diag_get_cpu_speed(void)
{
if (cpum_sf_avail()) { /* Sampling facility first */
struct hws_qsi_info_block si;
memset(&si, 0, sizeof(si));
if (!qsi(&si)) {
cf_diag_cpu_speed = si.cpu_speed;
return;
}
}
if (test_facility(34)) { /* CPU speed extract static part */
unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
if (mhz != -1UL)
cf_diag_cpu_speed = mhz & 0xffffffff;
}
}
/* Code to create device and file I/O operations */
static atomic_t ctrset_opencnt = ATOMIC_INIT(0); /* Excl. access */
static int cf_diag_open(struct inode *inode, struct file *file)
{
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (atomic_xchg(&ctrset_opencnt, 1))
return -EBUSY;
/* Avoid concurrent access with perf_event_open() system call */
mutex_lock(&cf_diag_reserve_mutex);
if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
err = -EBUSY;
mutex_unlock(&cf_diag_reserve_mutex);
if (err) {
atomic_set(&ctrset_opencnt, 0);
return err;
}
file->private_data = NULL;
debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
/* nonseekable_open() never fails */
return nonseekable_open(inode, file);
}
/* Variables for ioctl() interface support */
static DEFINE_MUTEX(cf_diag_ctrset_mutex);
static struct cf_diag_ctrset {
unsigned long ctrset; /* Bit mask of counter set to read */
cpumask_t mask; /* CPU mask to read from */
} cf_diag_ctrset;
static void cf_diag_ctrset_clear(void)
{
cpumask_clear(&cf_diag_ctrset.mask);
cf_diag_ctrset.ctrset = 0;
}
static void cf_diag_release_cpu(void *p)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
smp_processor_id());
lcctl(0); /* Reset counter sets */
cpuhw->state = 0; /* Save state in CPU hardware state */
}
/* The release function is also called when the application gets terminated
* without doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
* Since only one application is allowed to open the device, simply stop all
* CPU counter sets.
*/
static int cf_diag_release(struct inode *inode, struct file *file)
{
on_each_cpu(cf_diag_release_cpu, NULL, 1);
cf_diag_ctrset_clear();
atomic_set(&ctrset_opencnt, 0);
__kernel_cpumcf_end();
debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
return 0;
}
struct cf_diag_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */
unsigned int sets; /* Counter set bit mask */
atomic_t cpus_ack; /* # CPUs successfully executed func */
};
static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
{
struct s390_ctrset_read __user *ctrset_read;
unsigned int cpu, cpus, rc;
void __user *uptr;
ctrset_read = (struct s390_ctrset_read __user *)arg;
uptr = ctrset_read->data;
for_each_cpu(cpu, mask) {
struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
struct s390_ctrset_cpudata __user *ctrset_cpudata;
ctrset_cpudata = uptr;
debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
__func__, cpu, csd->used);
rc = put_user(cpu, &ctrset_cpudata->cpu_nr);
rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
if (rc)
return -EFAULT;
uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
cond_resched();
}
cpus = cpumask_weight(mask);
if (put_user(cpus, &ctrset_read->no_cpus))
return -EFAULT;
debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
__func__, uptr - (void __user *)ctrset_read->data);
return 0;
}
static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
int ctrset_size, size_t room)
{
size_t need = 0;
int rc = -1;
need = sizeof(*p) + sizeof(u64) * ctrset_size;
debug_sprintf_event(cf_diag_dbg, 5,
"%s room %zd need %zd set %#x set_size %d\n",
__func__, room, need, ctrset, ctrset_size);
if (need <= room) {
p->set = cpumf_ctr_ctl[ctrset];
p->no_cnts = ctrset_size;
rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
if (rc == 3) /* Nothing stored */
need = 0;
}
debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
need, rc);
return need;
}
/* Read all counter sets. Since the perf_event_open() system call with
* event cpum_cf_diag/.../ is blocked when this interface is active, reuse
* the perf_event_open() data buffer to store the counter sets.
*/
static void cf_diag_cpu_read(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
struct cf_diag_call_on_cpu_parm *p = parm;
int set, set_size;
size_t space;
debug_sprintf_event(cf_diag_dbg, 5,
"%s new %#x flags %#x state %#llx\n",
__func__, p->sets, cpuhw->flags,
cpuhw->state);
/* No data saved yet */
csd->used = 0;
csd->sets = 0;
memset(csd->data, 0, sizeof(csd->data));
/* Scan the counter sets */
for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
if (!(p->sets & cpumf_ctr_ctl[set]))
continue; /* Counter set not in list */
set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
space = sizeof(csd->data) - csd->used;
space = cf_diag_cpuset_read(sp, set, set_size, space);
if (space) {
csd->used += space;
csd->sets += 1;
}
debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
__func__, sp, space);
}
debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
csd->sets, csd->used);
}
static int cf_diag_all_read(unsigned long arg)
{
struct cf_diag_call_on_cpu_parm p;
cpumask_var_t mask;
int rc;
debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
p.sets = cf_diag_ctrset.ctrset;
cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
rc = cf_diag_all_copy(arg, mask);
free_cpumask_var(mask);
debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
return rc;
}
/* Stop all counter sets via ioctl interface */
static void cf_diag_ioctl_off(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cf_diag_call_on_cpu_parm *p = parm;
int rc;
debug_sprintf_event(cf_diag_dbg, 5,
"%s new %#x flags %#x state %#llx\n",
__func__, p->sets, cpuhw->flags,
cpuhw->state);
ctr_set_multiple_disable(&cpuhw->state, p->sets);
ctr_set_multiple_stop(&cpuhw->state, p->sets);
rc = lcctl(cpuhw->state); /* Stop counter sets */
if (!cpuhw->state)
cpuhw->flags &= ~PMU_F_IN_USE;
debug_sprintf_event(cf_diag_dbg, 5,
"%s rc %d flags %#x state %#llx\n", __func__,
rc, cpuhw->flags, cpuhw->state);
}
/* Start counter sets on particular CPU */
static void cf_diag_ioctl_on(void *parm)
{
struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
struct cf_diag_call_on_cpu_parm *p = parm;
int rc;
debug_sprintf_event(cf_diag_dbg, 5,
"%s new %#x flags %#x state %#llx\n",
__func__, p->sets, cpuhw->flags,
cpuhw->state);
if (!(cpuhw->flags & PMU_F_IN_USE))
cpuhw->state = 0;
cpuhw->flags |= PMU_F_IN_USE;
rc = lcctl(cpuhw->state); /* Reset unused counter sets */
ctr_set_multiple_enable(&cpuhw->state, p->sets);
ctr_set_multiple_start(&cpuhw->state, p->sets);
rc |= lcctl(cpuhw->state); /* Start counter sets */
if (!rc)
atomic_inc(&p->cpus_ack);
debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
__func__, rc, cpuhw->state);
}
static int cf_diag_all_stop(void)
{
struct cf_diag_call_on_cpu_parm p = {
.sets = cf_diag_ctrset.ctrset,
};
cpumask_var_t mask;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
free_cpumask_var(mask);
return 0;
}
static int cf_diag_all_start(void)
{
struct cf_diag_call_on_cpu_parm p = {
.sets = cf_diag_ctrset.ctrset,
.cpus_ack = ATOMIC_INIT(0),
};
cpumask_var_t mask;
int rc = 0;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
rc = -EIO;
}
free_cpumask_var(mask);
return rc;
}
/* Return the maximum required space for all possible CPUs in case one
* CPU will be onlined during the START, READ, STOP cycles.
* To find out the size of the counter sets, any one CPU will do. They
* all have the same counter sets.
*/
static size_t cf_diag_needspace(unsigned int sets)
{
struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
size_t bytes = 0;
int i;
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
if (!(sets & cpumf_ctr_ctl[i]))
continue;
bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
sizeof(((struct s390_ctrset_setdata *)0)->set) +
sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
}
bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
bytes);
put_cpu_ptr(&cpu_cf_events);
return bytes;
}
static long cf_diag_ioctl_read(unsigned long arg)
{
struct s390_ctrset_read read;
int ret = 0;
debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
return -EFAULT;
ret = cf_diag_all_read(arg);
debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
return ret;
}
static long cf_diag_ioctl_stop(void)
{
int ret;
debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
ret = cf_diag_all_stop();
cf_diag_ctrset_clear();
debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
return ret;
}
static long cf_diag_ioctl_start(unsigned long arg)
{
struct s390_ctrset_start __user *ustart;
struct s390_ctrset_start start;
void __user *umask;
unsigned int len;
int ret = 0;
size_t need;
if (cf_diag_ctrset.ctrset)
return -EBUSY;
ustart = (struct s390_ctrset_start __user *)arg;
if (copy_from_user(&start, ustart, sizeof(start)))
return -EFAULT;
if (start.version != S390_HWCTR_START_VERSION)
return -EINVAL;
if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
return -EINVAL; /* Invalid counter set */
if (!start.counter_sets)
return -EINVAL; /* No counter set at all? */
cpumask_clear(&cf_diag_ctrset.mask);
len = min_t(u64, start.cpumask_len, cpumask_size());
umask = (void __user *)start.cpumask;
if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
return -EFAULT;
if (cpumask_empty(&cf_diag_ctrset.mask))
return -EINVAL;
need = cf_diag_needspace(start.counter_sets);
if (put_user(need, &ustart->data_bytes))
ret = -EFAULT;
if (ret)
goto out;
cf_diag_ctrset.ctrset = start.counter_sets;
ret = cf_diag_all_start();
out:
if (ret)
cf_diag_ctrset_clear();
debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
__func__, cf_diag_ctrset.ctrset, need, ret);
return ret;
}
static long cf_diag_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
int ret;
debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
cmd, arg);
get_online_cpus();
mutex_lock(&cf_diag_ctrset_mutex);
switch (cmd) {
case S390_HWCTR_START:
ret = cf_diag_ioctl_start(arg);
break;
case S390_HWCTR_STOP:
ret = cf_diag_ioctl_stop();
break;
case S390_HWCTR_READ:
ret = cf_diag_ioctl_read(arg);
break;
default:
ret = -ENOTTY;
break;
}
mutex_unlock(&cf_diag_ctrset_mutex);
put_online_cpus();
debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
return ret;
}
static const struct file_operations cf_diag_fops = {
.owner = THIS_MODULE,
.open = cf_diag_open,
.release = cf_diag_release,
.unlocked_ioctl = cf_diag_ioctl,
.compat_ioctl = cf_diag_ioctl,
.llseek = no_llseek
};
static struct miscdevice cf_diag_dev = {
.name = S390_HWCTR_DEVICE,
.minor = MISC_DYNAMIC_MINOR,
.fops = &cf_diag_fops,
};
static int cf_diag_online_cpu(unsigned int cpu)
{
struct cf_diag_call_on_cpu_parm p;
mutex_lock(&cf_diag_ctrset_mutex);
if (!cf_diag_ctrset.ctrset)
goto out;
p.sets = cf_diag_ctrset.ctrset;
cf_diag_ioctl_on(&p);
out:
mutex_unlock(&cf_diag_ctrset_mutex);
return 0;
}
static int cf_diag_offline_cpu(unsigned int cpu)
{
struct cf_diag_call_on_cpu_parm p;
mutex_lock(&cf_diag_ctrset_mutex);
if (!cf_diag_ctrset.ctrset)
goto out;
p.sets = cf_diag_ctrset.ctrset;
cf_diag_ioctl_off(&p);
out:
mutex_unlock(&cf_diag_ctrset_mutex);
return 0;
}
/* Initialize the counter set PMU to generate complete counter set data as
* event raw data. This relies on the CPU Measurement Counter Facility device
* already being loaded and initialized.
*/
static int __init cf_diag_init(void)
{
struct cpumf_ctr_info info;
size_t need;
int rc;
if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
return -ENODEV;
cf_diag_get_cpu_speed();
/* Make sure the counter set data fits into predefined buffer. */
need = cf_diag_ctrset_maxsize(&info);
if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
need);
return -ENOMEM;
}
rc = misc_register(&cf_diag_dev);
if (rc) {
pr_err("Registration of /dev/" S390_HWCTR_DEVICE
"failed rc=%d\n", rc);
goto out;
}
/* Setup s390dbf facility */
cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
if (!cf_diag_dbg) {
pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
rc = -ENOMEM;
goto out_dbf;
}
debug_register_view(cf_diag_dbg, &debug_sprintf_view);
rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
if (rc) {
pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
rc);
goto out_perf;
}
rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
"perf/s390/cfd:online",
cf_diag_online_cpu, cf_diag_offline_cpu);
if (!rc)
goto out;
pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
rc);
perf_pmu_unregister(&cf_diag);
out_perf:
debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
debug_unregister(cf_diag_dbg);
out_dbf:
misc_deregister(&cf_diag_dev);
out:
return rc;
}
device_initcall(cf_diag_init);
......@@ -166,6 +166,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
p->thread.acrs[1] = (unsigned int)tls;
}
}
/*
* s390 stores the svc return address in arch_data when calling
* sigreturn()/restart_syscall() via vdso. 1 means no valid address
* stored.
*/
p->restart_block.arch_data = 1;
return 0;
}
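The sentinel works because s390 instructions are halfword aligned: any real return address is even, so 1 can never collide with one. Both ends of the handshake appear in this series; condensed, with the producer from arch_do_signal_or_restart() and the consumer from do_syscall() further down:

/* Producer: signal delivery sees -ERESTART_RESTARTBLOCK */
current->restart_block.arch_data = regs->psw.addr;	/* even address */
regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);

/* Consumer: syscall entry; odd arch_data means nothing stored */
if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
	regs->psw.addr = current->restart_block.arch_data;
	current->restart_block.arch_data = 1;
}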
......
......@@ -354,7 +354,7 @@ void __init arch_call_rest_init(void)
set_task_stack_end_magic(current);
stack += STACK_INIT_OFFSET;
S390_lowcore.kernel_stack = stack;
CALL_ON_STACK_NORETURN(rest_init, stack);
call_on_stack_noreturn(rest_init, stack);
}
static void __init setup_lowcore_dat_off(void)
......@@ -442,6 +442,7 @@ static void __init setup_lowcore_dat_off(void)
lc->br_r1_trampoline = 0x07f1; /* br %r1 */
lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
lc->preempt_count = PREEMPT_DISABLED;
set_prefix((u32)(unsigned long) lc);
lowcore_ptr[0] = lc;
......
......@@ -32,6 +32,7 @@
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
#include <asm/vdso.h>
#include "entry.h"
/*
......@@ -171,7 +172,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
fpregs_load(&user_sregs.fpregs, &current->thread.fpu);
clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
return 0;
}
......@@ -334,15 +334,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
if (ka->sa.sa_flags & SA_RESTORER) {
if (ka->sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ka->sa.sa_restorer;
} else {
/* Signal frames without vector registers are short! */
__u16 __user *svc = (void __user *) frame + frame_size - 2;
if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
return -EFAULT;
restorer = (unsigned long) svc;
}
else
restorer = VDSO64_SYMBOL(current, sigreturn);
/* Set up registers for signal handler */
regs->gprs[14] = restorer;
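VDSO64_SYMBOL() itself is defined outside this diff. A plausible sketch, assuming it simply adds the offset from the generated vdso64-offsets.h (produced by the Makefile rules below) to the per-process vdso mapping base; the vdso_base field name is an assumption:

/* hypothetical sketch, not the definition from asm/vdso.h */
#include <generated/vdso64-offsets.h>

#define VDSO64_SYMBOL(tsk, name)					\
	((unsigned long)(tsk)->mm->context.vdso_base + vdso64_offset_##name)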
......@@ -397,14 +392,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
if (ksig->ka.sa.sa_flags & SA_RESTORER) {
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ksig->ka.sa.sa_restorer;
} else {
__u16 __user *svc = &frame->svc_insn;
if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
return -EFAULT;
restorer = (unsigned long) svc;
}
else
restorer = VDSO64_SYMBOL(current, rt_sigreturn);
/* Create siginfo on the signal stack */
if (copy_siginfo_to_user(&frame->info, &ksig->info))
......@@ -501,7 +492,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
}
/* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL);
clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
rseq_signal_deliver(&ksig, regs);
if (is_compat_task())
handle_signal32(&ksig, oldset, regs);
......@@ -517,14 +508,20 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
switch (regs->gprs[2]) {
case -ERESTART_RESTARTBLOCK:
/* Restart with sys_restart_syscall */
regs->int_code = __NR_restart_syscall;
fallthrough;
regs->gprs[2] = regs->orig_gpr2;
current->restart_block.arch_data = regs->psw.addr;
if (is_compat_task())
regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
else
regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
if (test_thread_flag(TIF_SINGLE_STEP))
clear_thread_flag(TIF_PER_TRAP);
break;
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
/* Restart system call with magic TIF bit. */
regs->gprs[2] = regs->orig_gpr2;
set_pt_regs_flag(regs, PIF_SYSCALL_RESTART);
regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
if (test_thread_flag(TIF_SINGLE_STEP))
clear_thread_flag(TIF_PER_TRAP);
break;
......
......@@ -210,6 +210,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
lc->br_r1_trampoline = 0x07f1; /* br %r1 */
lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
lc->preempt_count = PREEMPT_DISABLED;
if (nmi_alloc_per_cpu(lc))
goto out;
lowcore_ptr[cpu] = lc;
......@@ -300,24 +301,28 @@ static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
}
typedef void (pcpu_delegate_fn)(void *);
/*
* Call function via PSW restart on pcpu and stop the current cpu.
*/
static void __pcpu_delegate(void (*func)(void*), void *data)
static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
{
func(data); /* should not return */
}
static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
void (*func)(void *),
pcpu_delegate_fn *func,
void *data, unsigned long stack)
{
struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
unsigned long source_cpu = stap();
__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
if (pcpu->address == source_cpu)
CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
if (pcpu->address == source_cpu) {
call_on_stack(2, stack, void, __pcpu_delegate,
pcpu_delegate_fn *, func, void *, data);
}
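The macro's argument list is: number of arguments, target stack, return type, function, then alternating type/value pairs, which is what lets the compiler type-check every argument instead of funnelling everything through unsigned long as the old CALL_ON_STACK() did. As a generic sketch (helper name and types are illustrative, not from this series):

/* ret = call_on_stack(nr_args, stack, ret_type, fn,
 *                     arg1_type, arg1, ..., argN_type, argN); */
rc = call_on_stack(2, S390_lowcore.nodat_stack, int,
		   my_helper, struct foo *, f, unsigned long, flags);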
/* Stop target cpu (if func returns this stops the current cpu). */
pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
/* Restart func on the target cpu and stop the current cpu. */
......@@ -898,7 +903,7 @@ static void __no_sanitize_address smp_start_secondary(void *cpuvoid)
S390_lowcore.restart_source = -1UL;
__ctl_load(S390_lowcore.cregs_save_area, 0, 15);
__load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack);
}
/* Upping and downing of CPUs */
......
......@@ -108,7 +108,7 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}
void do_syscall(struct pt_regs *regs)
static void do_syscall(struct pt_regs *regs)
{
unsigned long nr;
......@@ -121,6 +121,10 @@ void do_syscall(struct pt_regs *regs)
regs->gprs[2] = nr;
if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
regs->psw.addr = current->restart_block.arch_data;
current->restart_block.arch_data = 1;
}
nr = syscall_enter_from_user_mode_work(regs, nr);
/*
......@@ -130,13 +134,16 @@ void do_syscall(struct pt_regs *regs)
* work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
* and if set, the syscall will be skipped.
*/
if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) {
if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
goto out;
regs->gprs[2] = -ENOSYS;
if (likely(nr < NR_syscalls))
if (likely(nr >= NR_syscalls))
goto out;
do {
regs->gprs[2] = current->thread.sys_call_table[nr](regs);
} else {
clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET);
}
} while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
out:
syscall_exit_to_user_mode_work(regs);
}
......@@ -154,13 +161,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
if (per_trap)
set_thread_flag(TIF_PER_TRAP);
for (;;) {
regs->flags = 0;
set_pt_regs_flag(regs, PIF_SYSCALL);
do_syscall(regs);
if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART))
break;
local_irq_enable();
}
exit_to_user_mode();
}
......@@ -277,6 +277,8 @@ static void __init test_monitor_call(void)
{
int val = 1;
if (!IS_ENABLED(CONFIG_BUG))
return;
asm volatile(
" mc 0,0\n"
"0: xgr %0,%0\n"
......@@ -299,10 +301,9 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
void noinstr __do_pgm_check(struct pt_regs *regs)
{
unsigned long last_break = S390_lowcore.breaking_event_addr;
unsigned int trapnr, syscall_redirect = 0;
unsigned int trapnr;
irqentry_state_t state;
add_random_kstack_offset();
regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc;
regs->int_parm_long = S390_lowcore.trans_exc_code;
......@@ -344,18 +345,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
trapnr = regs->int_code & PGM_INT_CODE_MASK;
if (trapnr)
pgm_check_table[trapnr](regs);
syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL);
out:
local_irq_disable();
irqentry_exit(regs, state);
if (syscall_redirect) {
enter_from_user_mode(regs);
local_irq_enable();
regs->orig_gpr2 = regs->gprs[2];
do_syscall(regs);
exit_to_user_mode();
}
}
/*
......
......@@ -358,6 +358,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
static struct kobj_attribute uv_query_facilities_attr =
__ATTR(facilities, 0444, uv_query_facilities, NULL);
static ssize_t uv_query_feature_indications(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
}
static struct kobj_attribute uv_query_feature_indications_attr =
__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
......@@ -390,6 +399,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr =
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
&uv_query_max_guest_cpus_attr.attr,
&uv_query_max_guest_vms_attr.attr,
&uv_query_max_guest_addr_attr.attr,
......
......@@ -20,7 +20,7 @@
#include <asm/vdso.h>
extern char vdso64_start[], vdso64_end[];
static unsigned int vdso_pages;
extern char vdso32_start[], vdso32_end[];
static struct vm_special_mapping vvar_mapping;
......@@ -37,18 +37,6 @@ enum vvar_pages {
VVAR_NR_PAGES,
};
unsigned int __read_mostly vdso_enabled = 1;
static int __init vdso_setup(char *str)
{
bool enabled;
if (!kstrtobool(str, &enabled))
vdso_enabled = enabled;
return 1;
}
__setup("vdso=", vdso_setup);
#ifdef CONFIG_TIME_NS
struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
......@@ -155,7 +143,12 @@ static struct vm_special_mapping vvar_mapping = {
.fault = vvar_fault,
};
static struct vm_special_mapping vdso_mapping = {
static struct vm_special_mapping vdso64_mapping = {
.name = "[vdso]",
.mremap = vdso_mremap,
};
static struct vm_special_mapping vdso32_mapping = {
.name = "[vdso]",
.mremap = vdso_mremap,
};
......@@ -171,16 +164,22 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
unsigned long vdso_text_len, vdso_mapping_len;
unsigned long vvar_start, vdso_text_start;
struct vm_special_mapping *vdso_mapping;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int rc;
BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
if (!vdso_enabled || is_compat_task())
return 0;
if (mmap_write_lock_killable(mm))
return -EINTR;
vdso_text_len = vdso_pages << PAGE_SHIFT;
if (is_compat_task()) {
vdso_text_len = vdso32_end - vdso32_start;
vdso_mapping = &vdso32_mapping;
} else {
vdso_text_len = vdso64_end - vdso64_start;
vdso_mapping = &vdso64_mapping;
}
vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE;
vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
rc = vvar_start;
......@@ -198,7 +197,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
&vdso_mapping);
vdso_mapping);
if (IS_ERR(vma)) {
do_munmap(mm, vvar_start, PAGE_SIZE, NULL);
rc = PTR_ERR(vma);
......@@ -211,21 +210,25 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
return rc;
}
static int __init vdso_init(void)
static struct page ** __init vdso_setup_pages(void *start, void *end)
{
struct page **pages;
int pages = (end - start) >> PAGE_SHIFT;
struct page **pagelist;
int i;
vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT;
pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
vdso_enabled = 0;
return -ENOMEM;
}
for (i = 0; i < vdso_pages; i++)
pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE);
pages[vdso_pages] = NULL;
vdso_mapping.pages = pages;
pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
if (!pagelist)
panic("%s: Cannot allocate page list for VDSO", __func__);
for (i = 0; i < pages; i++)
pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
return pagelist;
}
static int __init vdso_init(void)
{
vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
if (IS_ENABLED(CONFIG_COMPAT))
vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
return 0;
}
arch_initcall(vdso_init);
# SPDX-License-Identifier: GPL-2.0-only
vdso32.lds
# SPDX-License-Identifier: GPL-2.0
# List of files in the vdso
KCOV_INSTRUMENT := n
ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
include $(srctree)/lib/vdso/Makefile
obj-vdso32 = vdso_user_wrapper-32.o note-32.o
# Build rules
targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
KBUILD_AFLAGS += -DBUILD_VDSO
KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
KBUILD_AFLAGS_32 += -m31 -s
KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \
--hash-style=both --build-id=sha1 -melf_s390 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
obj-y += vdso32_wrapper.o
CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
# Disable gcov profiling, ubsan and kasan for VDSO code
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
# Force dependency (incbin is bad)
$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
$(call if_changed,ld)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
$(obj-vdso32): %-32.o: %.S FORCE
$(call if_changed_dep,vdso32as)
# actual build commands
quiet_cmd_vdso32as = VDSO32A $@
cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
quiet_cmd_vdso32cc = VDSO32C $@
cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
# install commands for the unstripped file
quiet_cmd_vdso_install = INSTALL $@
cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
vdso32.so: $(obj)/vdso32.so.dbg
@mkdir -p $(MODLIB)/vdso
$(call cmd,vdso_install)
vdso_install: vdso32.so
# Generate VDSO offsets using helper script
gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
quiet_cmd_vdsosym = VDSOSYM $@
cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
$(call if_changed,vdsosym)
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Match symbols in the DSO that look like VDSO_*; produce a header file
# of constant offsets into the shared object.
#
# Doing this inside the Makefile will break the $(filter-out) function,
# causing Kbuild to rebuild the vdso-offsets header file every time.
#
# Inspired by arm64 version.
#
LC_ALL=C
sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
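For example, an $(NM) output line for a symbol at a hypothetical offset of 0x458 turns into one macro:

    00000458 T __kernel_compat_sigreturn
becomes
    #define vdso32_offset_sigreturn	0x00000458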
/* SPDX-License-Identifier: GPL-2.0 */
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/uts.h>
#include <linux/version.h>
#include <linux/elfnote.h>
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
/* SPDX-License-Identifier: GPL-2.0 */
/*
* This is the infamous ld script for the 31 bit compat vdso
* library
*/
#include <asm/page.h>
#include <asm/vdso.h>
OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
OUTPUT_ARCH(s390:31-bit)
ENTRY(_start)
SECTIONS
{
PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
#ifdef CONFIG_TIME_NS
PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
#endif
. = VDSO_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.note : { *(.note.*) } :text :note
. = ALIGN(16);
.text : {
*(.text .stub .text.* .gnu.linkonce.t.*)
} :text
PROVIDE(__etext = .);
PROVIDE(_etext = .);
PROVIDE(etext = .);
/*
* Other stuff is appended to the text segment:
*/
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.dynamic : { *(.dynamic) } :text :dynamic
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
.gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
.rela.dyn ALIGN(8) : { *(.rela.dyn) }
.got ALIGN(8) : { *(.got .toc) }
_end = .;
PROVIDE(end = .);
/*
* Stabs debugging sections are here too.
*/
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
/*
* DWARF debug sections.
* Symbols in the DWARF debugging sections are relative to the
* beginning of the section so we begin them at 0.
*/
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* SGI/MIPS DWARF 2 extensions */
.debug_weaknames 0 : { *(.debug_weaknames) }
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : {
*(.note.GNU-stack)
*(.branch_lt)
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
}
}
/*
* Very old versions of ld do not recognize this name token; use the constant.
*/
#define PT_GNU_EH_FRAME 0x6474e550
/*
* We must supply the ELF program headers explicitly to get just one
* PT_LOAD segment, and set the flags explicitly to make segments read-only.
*/
PHDRS
{
text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
note PT_NOTE FLAGS(4); /* PF_R */
eh_frame_hdr PT_GNU_EH_FRAME;
}
/*
* This controls what symbols we export from the DSO.
*/
VERSION
{
VDSO_VERSION_STRING {
global:
/*
* Has to be there for the kernel to find
*/
__kernel_compat_restart_syscall;
__kernel_compat_rt_sigreturn;
__kernel_compat_sigreturn;
local: *;
};
}
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/page.h>
__PAGE_ALIGNED_DATA
.globl vdso32_start, vdso32_end
.balign PAGE_SIZE
vdso32_start:
.incbin "arch/s390/kernel/vdso32/vdso32.so"
.balign PAGE_SIZE
vdso32_end:
.previous
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/unistd.h>
#include <asm/dwarf.h>
.macro vdso_syscall func,syscall
.globl __kernel_compat_\func
.type __kernel_compat_\func,@function
.align 8
__kernel_compat_\func:
CFI_STARTPROC
svc \syscall
/* Make sure we notice when a syscall returns, which shouldn't happen */
.word 0
CFI_ENDPROC
.size __kernel_compat_\func,.-__kernel_compat_\func
.endm
vdso_syscall restart_syscall,__NR_restart_syscall
vdso_syscall sigreturn,__NR_sigreturn
vdso_syscall rt_sigreturn,__NR_rt_sigreturn
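Expanded by hand, the first invocation above is equivalent to the following (a purely mechanical rewrite of the macro body; the .word 0 is an invalid instruction that traps if the syscall ever returns):

.globl __kernel_compat_restart_syscall
.type __kernel_compat_restart_syscall,@function
.align 8
__kernel_compat_restart_syscall:
	CFI_STARTPROC
	svc __NR_restart_syscall
	.word 0
	CFI_ENDPROC
	.size __kernel_compat_restart_syscall,.-__kernel_compat_restart_syscall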
......@@ -74,3 +74,11 @@ vdso64.so: $(obj)/vdso64.so.dbg
$(call cmd,vdso_install)
vdso_install: vdso64.so
# Generate VDSO offsets using helper script
gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
quiet_cmd_vdsosym = VDSOSYM $@
cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
$(call if_changed,vdsosym)
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Match symbols in the DSO that look like VDSO_*; produce a header file
# of constant offsets into the shared object.
#
# Doing this inside the Makefile will break the $(filter-out) function,
# causing Kbuild to rebuild the vdso-offsets header file every time.
#
# Inspired by arm64 version.
#
LC_ALL=C
sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
......@@ -17,7 +17,7 @@ SECTIONS
#ifdef CONFIG_TIME_NS
PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
#endif
. = VDSO64_LBASE + SIZEOF_HEADERS;
. = VDSO_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
......@@ -137,6 +137,9 @@ VERSION
__kernel_clock_gettime;
__kernel_clock_getres;
__kernel_getcpu;
__kernel_restart_syscall;
__kernel_rt_sigreturn;
__kernel_sigreturn;
local: *;
};
}
......@@ -37,3 +37,20 @@ vdso_func gettimeofday
vdso_func clock_getres
vdso_func clock_gettime
vdso_func getcpu
.macro vdso_syscall func,syscall
.globl __kernel_\func
.type __kernel_\func,@function
.align 8
__kernel_\func:
CFI_STARTPROC
svc \syscall
/* Make sure we notice when a syscall returns, which shouldn't happen */
.word 0
CFI_ENDPROC
.size __kernel_\func,.-__kernel_\func
.endm
vdso_syscall restart_syscall,__NR_restart_syscall
vdso_syscall sigreturn,__NR_sigreturn
vdso_syscall rt_sigreturn,__NR_rt_sigreturn
......@@ -162,7 +162,7 @@ char *strcat(char *dest, const char *src)
" jo 0b\n"
"1: mvst %[dummy],%[src]\n"
" jo 1b\n"
: [dummy] "=&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
: [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
:
: "cc", "memory", "0");
return ret;
......
......@@ -120,7 +120,7 @@ static struct unwindme *unwindme;
#define UWM_REGS 0x2 /* Pass regs to test_unwind(). */
#define UWM_SP 0x4 /* Pass sp to test_unwind(). */
#define UWM_CALLER 0x8 /* Unwind starting from caller. */
#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */
#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */
#define UWM_IRQ 0x20 /* Unwind from irq context. */
#define UWM_PGM 0x40 /* Unwind from program check handler. */
......@@ -211,7 +211,8 @@ static noinline int unwindme_func2(struct unwindme *u)
if (u->flags & UWM_SWITCH_STACK) {
local_irq_save(flags);
local_mcck_disable();
rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u);
rc = call_on_stack(1, S390_lowcore.nodat_stack,
int, unwindme_func3, struct unwindme *, u);
local_mcck_enable();
local_irq_restore(flags);
return rc;
......
......@@ -224,7 +224,7 @@ static inline unsigned long copy_in_user_mvcos(void __user *to, const void __use
EX_TABLE(0b,3b)
: "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
: [spec] "d" (0x810081UL)
: "cc", "memory");
: "cc", "memory", "0");
return size;
}
......
......@@ -285,26 +285,6 @@ static noinline void do_sigbus(struct pt_regs *regs)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
static noinline int signal_return(struct pt_regs *regs)
{
u16 instruction;
int rc;
rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
if (rc)
return rc;
if (instruction == 0x0a77) {
set_pt_regs_flag(regs, PIF_SYSCALL);
regs->int_code = 0x00040077;
return 0;
} else if (instruction == 0x0aad) {
set_pt_regs_flag(regs, PIF_SYSCALL);
regs->int_code = 0x000400ad;
return 0;
}
return -EACCES;
}
static noinline void do_fault_error(struct pt_regs *regs, int access,
vm_fault_t fault)
{
......@@ -312,9 +292,6 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
switch (fault) {
case VM_FAULT_BADACCESS:
if (access == VM_EXEC && signal_return(regs) == 0)
break;
fallthrough;
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
......@@ -792,6 +769,32 @@ void do_secure_storage_access(struct pt_regs *regs)
struct page *page;
int rc;
/*
* Bit 61 tells us whether the address is valid: if it is not
* set, we either have a major problem and must stop the kernel,
* or we send a SIGSEGV to the process. Unfortunately, bit 61 is
* not reliable without the misc UV feature, so we also need to
* check for that.
*/
if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
!test_bit_inv(61, &regs->int_parm_long)) {
/*
* When this happens, userspace did something that it
* was not supposed to do, e.g. branching into secure
* memory. Trigger a segmentation fault.
*/
if (user_mode(regs)) {
send_sig(SIGSEGV, current, 0);
return;
}
/*
* The kernel should never run into this case and we
* have no way out of this situation.
*/
panic("Unexpected PGM 0x3d with TEID bit 61=0");
}
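test_bit_inv() uses s390's inverted (MSB-0) bit numbering, conventionally implemented as nr ^ (BITS_PER_LONG - 1), so TEID "bit 61" is a low-order bit of the word rather than 1UL << 61. A minimal sanity check of the mapping, under that assumption:

/* MSB-0: inverted bit n of a 64-bit word is plain bit 63 - n */
static inline int teid_bit_61(unsigned long teid)
{
	return (teid >> (63 - 61)) & 1;		/* mask 0x4 */
}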
switch (get_fault_type(regs)) {
case USER_FAULT:
mm = current->mm;
......
......@@ -125,12 +125,18 @@ static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
*/
int memcpy_real(void *dest, void *src, size_t count)
{
unsigned long _dest = (unsigned long)dest;
unsigned long _src = (unsigned long)src;
unsigned long _count = (unsigned long)count;
int rc;
if (S390_lowcore.nodat_stack != 0) {
preempt_disable();
rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
dest, src, count);
rc = call_on_stack(3, S390_lowcore.nodat_stack,
unsigned long, _memcpy_real,
unsigned long, _dest,
unsigned long, _src,
unsigned long, _count);
preempt_enable();
return rc;
}
......@@ -139,8 +145,7 @@ int memcpy_real(void *dest, void *src, size_t count)
* not set up yet. Just call _memcpy_real on the early boot
* stack
*/
return _memcpy_real((unsigned long) dest,(unsigned long) src,
(unsigned long) count);
return _memcpy_real(_dest, _src, _count);
}
/*
......
......@@ -61,6 +61,9 @@ static char *aqm_str;
module_param_named(aqmask, aqm_str, charp, 0440);
MODULE_PARM_DESC(aqmask, "AP bus domain mask.");
atomic_t ap_max_msg_size = ATOMIC_INIT(AP_DEFAULT_MAX_MSG_SIZE);
EXPORT_SYMBOL(ap_max_msg_size);
static struct device *ap_root_device;
/* Hashtable of all queue devices on the AP bus */
......@@ -316,11 +319,24 @@ EXPORT_SYMBOL(ap_test_config_ctrl_domain);
* Returns true if TAPQ succeeded and the info is filled or
* false otherwise.
*/
static bool ap_queue_info(ap_qid_t qid, int *q_type,
unsigned int *q_fac, int *q_depth, bool *q_decfg)
static bool ap_queue_info(ap_qid_t qid, int *q_type, unsigned int *q_fac,
int *q_depth, int *q_ml, bool *q_decfg)
{
struct ap_queue_status status;
unsigned long info = 0;
union {
unsigned long value;
struct {
unsigned int fac : 32; /* facility bits */
unsigned int at : 8; /* ap type */
unsigned int _res1 : 8;
unsigned int _res2 : 4;
unsigned int ml : 4; /* apxl ml */
unsigned int _res3 : 4;
unsigned int qd : 4; /* queue depth */
} tapq_gr2;
} tapq_info;
tapq_info.value = 0;
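/*
 * MSB-0 layout encoded by the bitfield above: bits 0-31
 * facilities, 32-39 AP type, 52-55 ml (max message length in
 * 4 KiB chunks, see AP_TAPQ_ML_FIELD_CHUNK_SIZE), 60-63 queue
 * depth; all remaining bits are reserved.
 */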
/* make sure we don't run into a specification exception */
if (AP_QID_CARD(qid) > ap_max_adapter_id ||
......@@ -328,7 +344,7 @@ static bool ap_queue_info(ap_qid_t qid, int *q_type,
return false;
/* call TAPQ on this APQN */
status = ap_test_queue(qid, ap_apft_available(), &info);
status = ap_test_queue(qid, ap_apft_available(), &tapq_info.value);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
case AP_RESPONSE_RESET_IN_PROGRESS:
......@@ -340,11 +356,12 @@ static bool ap_queue_info(ap_qid_t qid, int *q_type,
* info should be filled. All bits 0 is not possible as
* there is at least one of the mode bits set.
*/
if (WARN_ON_ONCE(!info))
if (WARN_ON_ONCE(!tapq_info.value))
return false;
*q_type = (int)((info >> 24) & 0xff);
*q_fac = (unsigned int)(info >> 32);
*q_depth = (int)(info & 0xff);
*q_type = tapq_info.tapq_gr2.at;
*q_fac = tapq_info.tapq_gr2.fac;
*q_depth = tapq_info.tapq_gr2.qd;
*q_ml = tapq_info.tapq_gr2.ml;
*q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED;
switch (*q_type) {
/* For CEX2 and CEX3 the available functions
......@@ -1516,7 +1533,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
unsigned int func;
struct device *dev;
struct ap_queue *aq;
int rc, dom, depth, type;
int rc, dom, depth, type, ml;
/*
* Go through the configuration for the domains and compare them
......@@ -1540,7 +1557,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
continue;
}
/* domain is valid, get info from this APQN */
if (!ap_queue_info(qid, &type, &func, &depth, &decfg)) {
if (!ap_queue_info(qid, &type, &func, &depth, &ml, &decfg)) {
if (aq) {
AP_DBF_INFO(
"%s(%d,%d) ap_queue_info() not successful, rm queue device\n",
......@@ -1639,7 +1656,7 @@ static inline void ap_scan_adapter(int ap)
unsigned int func;
struct device *dev;
struct ap_card *ac;
int rc, dom, depth, type, comp_type;
int rc, dom, depth, type, comp_type, ml;
/* Is there currently a card device for this adapter ? */
dev = bus_find_device(&ap_bus_type, NULL,
......@@ -1668,7 +1685,8 @@ static inline void ap_scan_adapter(int ap)
for (dom = 0; dom <= ap_max_domain_id; dom++)
if (ap_test_config_usage_domain(dom)) {
qid = AP_MKQID(ap, dom);
if (ap_queue_info(qid, &type, &func, &depth, &decfg))
if (ap_queue_info(qid, &type, &func,
&depth, &ml, &decfg))
break;
}
if (dom > ap_max_domain_id) {
......@@ -1737,7 +1755,7 @@ static inline void ap_scan_adapter(int ap)
__func__, ap, type);
return;
}
ac = ap_card_create(ap, depth, type, comp_type, func);
ac = ap_card_create(ap, depth, type, comp_type, func, ml);
if (!ac) {
AP_DBF_WARN("%s(%d) ap_card_create() failed\n",
__func__, ap);
......@@ -1748,6 +1766,12 @@ static inline void ap_scan_adapter(int ap)
dev->bus = &ap_bus_type;
dev->parent = ap_root_device;
dev_set_name(dev, "card%02x", ap);
/* maybe enlarge ap_max_msg_size to support this card */
if (ac->maxmsgsize > atomic_read(&ap_max_msg_size)) {
atomic_set(&ap_max_msg_size, ac->maxmsgsize);
AP_DBF_INFO("%s(%d) ap_max_msg_size update to %d byte\n",
__func__, ap, atomic_read(&ap_max_msg_size));
}
/* Register the new card device with AP bus */
rc = device_register(dev);
if (rc) {
......
......@@ -25,8 +25,11 @@
#define AP_RESET_TIMEOUT (HZ*0.7) /* Time in ticks for reset timeouts. */
#define AP_CONFIG_TIME 30 /* Time in seconds between AP bus rescans. */
#define AP_POLL_TIME 1 /* Time in ticks between receive polls. */
#define AP_DEFAULT_MAX_MSG_SIZE (12 * 1024)
#define AP_TAPQ_ML_FIELD_CHUNK_SIZE (4096)
extern int ap_domain_index;
extern atomic_t ap_max_msg_size;
extern DECLARE_HASHTABLE(ap_queues, 8);
extern spinlock_t ap_queues_lock;
......@@ -167,6 +170,7 @@ struct ap_card {
unsigned int functions; /* AP device function bitfield. */
int queue_depth; /* AP queue depth.*/
int id; /* AP card number. */
unsigned int maxmsgsize; /* AP msg limit for this card */
bool config; /* configured state */
atomic64_t total_request_count; /* # requests ever for this AP device.*/
};
......@@ -228,7 +232,8 @@ struct ap_message {
struct list_head list; /* Request queueing. */
unsigned long long psmid; /* Message id. */
void *msg; /* Pointer to message buffer. */
unsigned int len; /* Message length. */
unsigned int len; /* actual msg len in msg buffer */
unsigned int bufsize; /* allocated msg buffer size */
u16 flags; /* Flags, see AP_MSG_FLAG_xxx */
struct ap_fi fi; /* Failure Injection cmd */
int rc; /* Return code for this message */
......@@ -290,8 +295,8 @@ void ap_queue_prepare_remove(struct ap_queue *aq);
void ap_queue_remove(struct ap_queue *aq);
void ap_queue_init_state(struct ap_queue *aq);
struct ap_card *ap_card_create(int id, int queue_depth, int raw_device_type,
int comp_device_type, unsigned int functions);
struct ap_card *ap_card_create(int id, int queue_depth, int raw_type,
int comp_type, unsigned int functions, int ml);
struct ap_perms {
unsigned long ioctlm[BITS_TO_LONGS(AP_IOCTLS)];
......
......@@ -174,6 +174,16 @@ static ssize_t config_store(struct device *dev,
static DEVICE_ATTR_RW(config);
static ssize_t max_msg_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ap_card *ac = to_ap_card(dev);
return scnprintf(buf, PAGE_SIZE, "%u\n", ac->maxmsgsize);
}
static DEVICE_ATTR_RO(max_msg_size);
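Since AP card devices register on ap_bus_type and are named card%02x (see ap_scan_adapter() above), the new attribute is expected to show up as /sys/bus/ap/devices/cardXX/max_msg_size and to read back a plain decimal byte count.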
static struct attribute *ap_card_dev_attrs[] = {
&dev_attr_hwtype.attr,
&dev_attr_raw_hwtype.attr,
......@@ -184,6 +194,7 @@ static struct attribute *ap_card_dev_attrs[] = {
&dev_attr_pendingq_count.attr,
&dev_attr_modalias.attr,
&dev_attr_config.attr,
&dev_attr_max_msg_size.attr,
NULL
};
......@@ -209,7 +220,7 @@ static void ap_card_device_release(struct device *dev)
}
struct ap_card *ap_card_create(int id, int queue_depth, int raw_type,
int comp_type, unsigned int functions)
int comp_type, unsigned int functions, int ml)
{
struct ap_card *ac;
......@@ -223,5 +234,8 @@ struct ap_card *ap_card_create(int id, int queue_depth, int raw_type,
ac->queue_depth = queue_depth;
ac->functions = functions;
ac->id = id;
ac->maxmsgsize = ml > 0 ?
ml * AP_TAPQ_ML_FIELD_CHUNK_SIZE : AP_DEFAULT_MAX_MSG_SIZE;
return ac;
}
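With AP_TAPQ_ML_FIELD_CHUNK_SIZE fixed at 4096, the ml nibble scales the limit in 4 KiB steps: a card reporting ml = 4 gets maxmsgsize = 4 * 4096 = 16384 bytes, while ml = 0 (a card that reports no limit) falls back to AP_DEFAULT_MAX_MSG_SIZE, i.e. 12 * 1024 = 12288 bytes.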
......@@ -101,7 +101,7 @@ int ap_recv(ap_qid_t qid, unsigned long long *psmid, void *msg, size_t length)
if (msg == NULL)
return -EINVAL;
status = ap_dqap(qid, psmid, msg, length);
status = ap_dqap(qid, psmid, msg, length, NULL, NULL);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
return 0;
......@@ -136,9 +136,24 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
struct ap_queue_status status;
struct ap_message *ap_msg;
bool found = false;
size_t reslen;
unsigned long resgr0 = 0;
int parts = 0;
/*
* Loop on DQAP until the response code and resgr0 indicate that
* the msg has been received completely. As the very same buffer
* is reused, each invocation overwrites the previous msg; that
* is intended, and in that case the receiver of the msg is
* informed with a msg rc code of -EMSGSIZE.
*/
do {
status = ap_dqap(aq->qid, &aq->reply->psmid,
aq->reply->msg, aq->reply->len);
aq->reply->msg, aq->reply->bufsize,
&reslen, &resgr0);
parts++;
} while (status.response_code == 0xFF && resgr0 != 0);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
aq->queue_count = max_t(int, 0, aq->queue_count - 1);
......@@ -150,7 +165,12 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
continue;
list_del_init(&ap_msg->list);
aq->pendingq_count--;
if (parts > 1) {
ap_msg->rc = -EMSGSIZE;
ap_msg->receive(aq, ap_msg, NULL);
} else {
ap_msg->receive(aq, ap_msg, aq->reply);
}
found = true;
break;
}
......
......@@ -900,6 +900,9 @@ static long _zcrypt_send_cprb(bool userspace, struct ap_perms *perms,
if (xcRB->user_defined != AUTOSELECT &&
xcRB->user_defined != zc->card->id)
continue;
/* check if request size exceeds card max msg size */
if (ap_msg.len > zc->card->maxmsgsize)
continue;
/* check if device node has admission for this card */
if (!zcrypt_check_card(perms, zc->card->id))
continue;
......@@ -1068,6 +1071,9 @@ static long _zcrypt_send_ep11_cprb(bool userspace, struct ap_perms *perms,
if (targets &&
!is_desired_ep11_card(zc->card->id, target_num, targets))
continue;
/* check if request size exceeds card max msg size */
if (ap_msg.len > zc->card->maxmsgsize)
continue;
/* check if device node has admission for this card */
if (!zcrypt_check_card(perms, zc->card->id))
continue;
......
......@@ -28,9 +28,6 @@
#define CEX4C_MIN_MOD_SIZE 16 /* 256 bits */
#define CEX4C_MAX_MOD_SIZE 512 /* 4096 bits */
#define CEX4A_MAX_MESSAGE_SIZE MSGTYPE50_CRB3_MAX_MSG_SIZE
#define CEX4C_MAX_MESSAGE_SIZE MSGTYPE06_MAX_MSG_SIZE
/* Waiting time for requests to be processed.
* Currently there are some types of request which are not deterministic.
* But the maximum time limit managed by the stomper code is set to 60sec.
......@@ -605,19 +602,19 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev)
int rc;
if (ap_test_bit(&aq->card->functions, AP_FUNC_ACCEL)) {
zq = zcrypt_queue_alloc(CEX4A_MAX_MESSAGE_SIZE);
zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
if (!zq)
return -ENOMEM;
zq->ops = zcrypt_msgtype(MSGTYPE50_NAME,
MSGTYPE50_VARIANT_DEFAULT);
} else if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) {
zq = zcrypt_queue_alloc(CEX4C_MAX_MESSAGE_SIZE);
zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
if (!zq)
return -ENOMEM;
zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
MSGTYPE06_VARIANT_DEFAULT);
} else if (ap_test_bit(&aq->card->functions, AP_FUNC_EP11)) {
zq = zcrypt_queue_alloc(CEX4C_MAX_MESSAGE_SIZE);
zq = zcrypt_queue_alloc(aq->card->maxmsgsize);
if (!zq)
return -ENOMEM;
zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
......
......@@ -442,11 +442,13 @@ static void zcrypt_cex2a_receive(struct ap_queue *aq,
goto out; /* ap_msg->rc indicates the error */
t80h = reply->msg;
if (t80h->type == TYPE80_RSP_CODE) {
if (aq->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A)
len = min_t(int, CEX2A_MAX_RESPONSE_SIZE, t80h->len);
else
len = min_t(int, CEX3A_MAX_RESPONSE_SIZE, t80h->len);
len = t80h->len;
if (len > reply->bufsize || len > msg->bufsize) {
msg->rc = -EMSGSIZE;
} else {
memcpy(msg->msg, reply->msg, len);
msg->len = len;
}
} else
memcpy(msg->msg, reply->msg, sizeof(error_reply));
out:
......@@ -469,10 +471,9 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq,
struct completion work;
int rc;
if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
ap_msg->msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
else
ap_msg->msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->receive = zcrypt_cex2a_receive;
......@@ -515,10 +516,9 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq,
struct completion work;
int rc;
if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
ap_msg->msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
else
ap_msg->msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->receive = zcrypt_cex2a_receive;
......
......@@ -403,7 +403,7 @@ static int XCRB_msg_to_type6CPRB_msgX(bool userspace, struct ap_message *ap_msg,
} __packed * msg = ap_msg->msg;
int rcblen = CEIL4(xcRB->request_control_blk_length);
int replylen, req_sumlen, resp_sumlen;
int req_sumlen, resp_sumlen;
char *req_data = ap_msg->msg + sizeof(struct type6_hdr) + rcblen;
char *function_code;
......@@ -415,7 +415,7 @@ static int XCRB_msg_to_type6CPRB_msgX(bool userspace, struct ap_message *ap_msg,
ap_msg->len = sizeof(struct type6_hdr) +
CEIL4(xcRB->request_control_blk_length) +
xcRB->request_data_length;
if (ap_msg->len > MSGTYPE06_MAX_MSG_SIZE)
if (ap_msg->len > ap_msg->bufsize)
return -EINVAL;
/*
......@@ -435,12 +435,6 @@ static int XCRB_msg_to_type6CPRB_msgX(bool userspace, struct ap_message *ap_msg,
xcRB->reply_control_blk_length)
return -EINVAL; /* overflow after alignment*/
replylen = sizeof(struct type86_fmt2_msg) +
CEIL4(xcRB->reply_control_blk_length) +
xcRB->reply_data_length;
if (replylen > MSGTYPE06_MAX_MSG_SIZE)
return -EINVAL;
/*
* Overflow check
* sum must be greater (or equal) than the largest operand
......@@ -530,18 +524,13 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(bool userspace, struct ap_message *ap
return -EINVAL; /* overflow after alignment*/
/* length checks */
ap_msg->len = sizeof(struct type6_hdr) + xcRB->req_len;
if (CEIL4(xcRB->req_len) > MSGTYPE06_MAX_MSG_SIZE -
(sizeof(struct type6_hdr)))
ap_msg->len = sizeof(struct type6_hdr) + CEIL4(xcRB->req_len);
if (ap_msg->len > ap_msg->bufsize)
return -EINVAL;
if (CEIL4(xcRB->resp_len) < xcRB->resp_len)
return -EINVAL; /* overflow after alignment*/
if (CEIL4(xcRB->resp_len) > MSGTYPE06_MAX_MSG_SIZE -
(sizeof(struct type86_fmt2_msg)))
return -EINVAL;
/* prepare type6 header */
msg->hdr = static_type6_ep11_hdr;
msg->hdr.ToCardLen1 = xcRB->req_len;
......@@ -952,13 +941,21 @@ static void zcrypt_msgtype6_receive(struct ap_queue *aq,
switch (resp_type->type) {
case CEXXC_RESPONSE_TYPE_ICA:
len = sizeof(struct type86x_reply) + t86r->length - 2;
len = min_t(int, CEXXC_MAX_ICA_RESPONSE_SIZE, len);
if (len > reply->bufsize || len > msg->bufsize) {
msg->rc = -EMSGSIZE;
} else {
memcpy(msg->msg, reply->msg, len);
msg->len = len;
}
break;
case CEXXC_RESPONSE_TYPE_XCRB:
len = t86r->fmt2.offset2 + t86r->fmt2.count2;
len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
if (len > reply->bufsize || len > msg->bufsize) {
msg->rc = -EMSGSIZE;
} else {
memcpy(msg->msg, reply->msg, len);
msg->len = len;
}
break;
default:
memcpy(msg->msg, &error_reply, sizeof(error_reply));
......@@ -999,8 +996,12 @@ static void zcrypt_msgtype6_receive_ep11(struct ap_queue *aq,
switch (resp_type->type) {
case CEXXC_RESPONSE_TYPE_EP11:
len = t86r->fmt2.offset1 + t86r->fmt2.count1;
len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
if (len > reply->bufsize || len > msg->bufsize) {
msg->rc = -EMSGSIZE;
} else {
memcpy(msg->msg, reply->msg, len);
msg->len = len;
}
break;
default:
memcpy(msg->msg, &error_reply, sizeof(error_reply));
......@@ -1033,6 +1034,7 @@ static long zcrypt_msgtype6_modexpo(struct zcrypt_queue *zq,
ap_msg->msg = (void *) get_zeroed_page(GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->bufsize = PAGE_SIZE;
ap_msg->receive = zcrypt_msgtype6_receive;
ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
atomic_inc_return(&zcrypt_step);
......@@ -1080,6 +1082,7 @@ static long zcrypt_msgtype6_modexpo_crt(struct zcrypt_queue *zq,
ap_msg->msg = (void *) get_zeroed_page(GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->bufsize = PAGE_SIZE;
ap_msg->receive = zcrypt_msgtype6_receive;
ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
atomic_inc_return(&zcrypt_step);
......@@ -1124,7 +1127,8 @@ unsigned int get_cprb_fc(bool userspace, struct ica_xcRB *xcRB,
.type = CEXXC_RESPONSE_TYPE_XCRB,
};
ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
ap_msg->bufsize = atomic_read(&ap_max_msg_size);
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->receive = zcrypt_msgtype6_receive;
......@@ -1181,7 +1185,8 @@ unsigned int get_ep11cprb_fc(bool userspace, struct ep11_urb *xcrb,
.type = CEXXC_RESPONSE_TYPE_EP11,
};
ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
ap_msg->bufsize = atomic_read(&ap_max_msg_size);
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->receive = zcrypt_msgtype6_receive_ep11;
......@@ -1277,7 +1282,8 @@ unsigned int get_rng_fc(struct ap_message *ap_msg, int *func_code,
.type = CEXXC_RESPONSE_TYPE_XCRB,
};
ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
ap_msg->bufsize = AP_DEFAULT_MAX_MSG_SIZE;
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
ap_msg->receive = zcrypt_msgtype6_receive;
......
......@@ -19,8 +19,6 @@
#define MSGTYPE06_VARIANT_NORNG 1
#define MSGTYPE06_VARIANT_EP11 2
#define MSGTYPE06_MAX_MSG_SIZE (12*1024)
/**
* The type 6 message family is associated with CEXxC/CEXxP cards.
*
......
......@@ -111,17 +111,17 @@ bool zcrypt_queue_force_online(struct zcrypt_queue *zq, int online)
return false;
}
struct zcrypt_queue *zcrypt_queue_alloc(size_t max_response_size)
struct zcrypt_queue *zcrypt_queue_alloc(size_t reply_buf_size)
{
struct zcrypt_queue *zq;
zq = kzalloc(sizeof(struct zcrypt_queue), GFP_KERNEL);
if (!zq)
return NULL;
zq->reply.msg = kmalloc(max_response_size, GFP_KERNEL);
zq->reply.msg = kmalloc(reply_buf_size, GFP_KERNEL);
if (!zq->reply.msg)
goto out_free;
zq->reply.len = max_response_size;
zq->reply.bufsize = reply_buf_size;
INIT_LIST_HEAD(&zq->list);
kref_init(&zq->refcount);
return zq;
......
......@@ -171,7 +171,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_X86_IDXD_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_CFD_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
CPUHP_AP_PERF_ARM_CCN_ONLINE,
......