Commit c68dc1b5, authored by Oliver Upton, committed by Paolo Bonzini

KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK

Handling the migration of TSCs correctly is difficult, in part because
Linux does not provide userspace with the ability to retrieve a (TSC,
realtime) clock pair for a single instant in time. In lieu of a more
convenient facility, KVM can report similar information in the kvm_clock
structure.

Provide userspace with a host TSC & realtime pair iff the realtime clock
is based on the TSC. If userspace provides KVM_SET_CLOCK with a valid
realtime value, advance the KVM clock by the amount of elapsed time. Do
not step the KVM clock backwards, though, as it is a monotonic
oscillator.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Oliver Upton <oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210916181538.968978-5-oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 3d5e7a28
......@@ -1010,20 +1010,37 @@ such as migration.
When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the
set of bits that KVM can return in struct kvm_clock_data's flag member.
The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned
value is the exact kvmclock value seen by all VCPUs at the instant
when KVM_GET_CLOCK was called. If clear, the returned value is simply
CLOCK_MONOTONIC plus a constant offset; the offset can be modified
with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock,
but the exact value read by each VCPU could differ, because the host
TSC is not stable.
The following flags are defined:
KVM_CLOCK_TSC_STABLE
If set, the returned value is the exact kvmclock
value seen by all VCPUs at the instant when KVM_GET_CLOCK was called.
If clear, the returned value is simply CLOCK_MONOTONIC plus a constant
offset; the offset can be modified with KVM_SET_CLOCK. KVM will try
to make all VCPUs follow this clock, but the exact value read by each
VCPU could differ, because the host TSC is not stable.
KVM_CLOCK_REALTIME
If set, the `realtime` field in the kvm_clock_data
structure is populated with the value of the host's real time
clocksource at the instant when KVM_GET_CLOCK was called. If clear,
the `realtime` field does not contain a value.
KVM_CLOCK_HOST_TSC
If set, the `host_tsc` field in the kvm_clock_data
structure is populated with the value of the host's timestamp counter (TSC)
at the instant when KVM_GET_CLOCK was called. If clear, the `host_tsc` field
does not contain a value.
::
struct kvm_clock_data {
__u64 clock; /* kvmclock current value */
__u32 flags;
__u32 pad[9];
__u32 pad0;
__u64 realtime;
__u64 host_tsc;
__u32 pad[4];
};
......@@ -1040,12 +1057,25 @@ Sets the current timestamp of kvmclock to the value specified in its parameter.
In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
such as migration.
The following flags can be passed:
KVM_CLOCK_REALTIME
If set, KVM will compare the value of the `realtime` field
with the value of the host's real time clocksource at the instant when
KVM_SET_CLOCK was called. The difference in elapsed time is added to the final
kvmclock value that will be provided to guests.
Other flags returned by ``KVM_GET_CLOCK`` are accepted but ignored.
::
struct kvm_clock_data {
__u64 clock; /* kvmclock current value */
__u32 flags;
__u32 pad[9];
__u32 pad0;
__u64 realtime;
__u64 host_tsc;
__u32 pad[4];
};
......
......@@ -1942,4 +1942,7 @@ int kvm_cpu_dirty_log_size(void);
int alloc_all_memslots_rmaps(struct kvm *kvm);
#define KVM_CLOCK_VALID_FLAGS \
(KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
#endif /* _ASM_X86_KVM_HOST_H */
......@@ -2787,6 +2787,7 @@ static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;
data->flags = 0;
spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
......@@ -2803,10 +2804,20 @@ static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
get_cpu();
if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
struct timespec64 ts;
if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
} else
#endif
data->host_tsc = rdtsc();
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
data->clock = __pvclock_read_cycles(&hv_clock, rdtsc());
data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
} else {
data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
}
......@@ -2818,12 +2829,6 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_clock_data data;
/*
* Zero flags as it's accessed RMW, leave everything else uninitialized
* as clock is always written and no other fields are consumed.
*/
data.flags = 0;
get_kvmclock(kvm, &data);
return data.clock;
}
......@@ -4050,7 +4055,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_TSC_STABLE;
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
......@@ -5847,12 +5852,16 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
{
struct kvm_arch *ka = &kvm->arch;
struct kvm_clock_data data;
u64 now_ns;
u64 now_raw_ns;
if (copy_from_user(&data, argp, sizeof(data)))
return -EFAULT;
if (data.flags)
/*
* Only KVM_CLOCK_REALTIME is used, but allow passing the
* result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
*/
if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
return -EINVAL;
kvm_hv_invalidate_tsc_page(kvm);
......@@ -5866,11 +5875,21 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
* is slightly ahead) here we risk going negative on unsigned
* 'system_time' when 'data.clock' is very small.
*/
if (kvm->arch.use_master_clock)
now_ns = ka->master_kernel_ns;
if (data.flags & KVM_CLOCK_REALTIME) {
u64 now_real_ns = ktime_get_real_ns();
/*
* Avoid stepping the kvmclock backwards.
*/
if (now_real_ns > data.realtime)
data.clock += now_real_ns - data.realtime;
}
if (ka->use_master_clock)
now_raw_ns = ka->master_kernel_ns;
else
now_ns = get_kvmclock_base_ns();
ka->kvmclock_offset = data.clock - now_ns;
now_raw_ns = get_kvmclock_base_ns();
ka->kvmclock_offset = data.clock - now_raw_ns;
kvm_end_pvclock_update(kvm);
return 0;
}
......
......@@ -1231,11 +1231,16 @@ struct kvm_irqfd {
/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */
#define KVM_CLOCK_TSC_STABLE 2
#define KVM_CLOCK_REALTIME (1 << 2)
#define KVM_CLOCK_HOST_TSC (1 << 3)
struct kvm_clock_data {
__u64 clock;
__u32 flags;
__u32 pad[9];
__u32 pad0;
__u64 realtime;
__u64 host_tsc;
__u32 pad[4];
};
/* For KVM_CAP_SW_TLB */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment