Commit 92525be5 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] monotonic clock source for hangcheck timer

From: john stultz <johnstul@us.ibm.com>

This patch, written with the advice of Joel Becker, addresses a problem with
the hangcheck-timer.

The basic problem is that the hangcheck-timer code (required for Oracle)
needs an accurate hardware clock which can be used to detect OS stalls (due
to udelay() or PCI bus hangs) that would cause system time to skew (it's
sort of a sanity check that ensures the system's notion of time is accurate).
However, currently it uses get_cycles() to fetch the CPU's TSC register,
so it does not work on systems without a synced TSC.

As suggested by Andi Kleen (see thread here:
http://www.uwsg.iu.edu/hypermail/linux/kernel/0302.0/1234.html ) I've worked
with Joel and others to implement the monotonic_clock() interface.  Some of
the major considerations made when writing this patch were:

o Needs to be able to return accurate time in the absence of multiple timer
  interrupts

o Needs to be abstracted out from the hardware

o Avoids impacting gettimeofday() performance

This interface returns an unsigned long long representing the number of
nanoseconds that have passed since time_init().
parent 68fa8120
...@@ -138,6 +138,17 @@ void do_settimeofday(struct timeval *tv) ...@@ -138,6 +138,17 @@ void do_settimeofday(struct timeval *tv)
clock_was_set(); clock_was_set();
} }
/* monotonic_clock(): returns the number of nanoseconds elapsed since
 * time_init().
 *
 * Note: this function is required to return accurate time even in the
 * absence of multiple timer ticks (e.g. when interrupts were blocked for
 * longer than a jiffy), so it must read the hardware counter rather than
 * relying solely on tick accounting.
 *
 * Dispatches to the monotonic_clock method of the currently selected
 * timer source (struct timer_opts).  Exported so that modules such as
 * the hangcheck timer can use it.
 */
unsigned long long monotonic_clock(void)
{
return timer->monotonic_clock();
}
EXPORT_SYMBOL(monotonic_clock);
/* /*
* In order to set the CMOS clock precisely, set_rtc_mmss has to be * In order to set the CMOS clock precisely, set_rtc_mmss has to be
* called 500 ms after the second nowtime has started, because when * called 500 ms after the second nowtime has started, because when
......
...@@ -28,27 +28,46 @@ static int delay_at_last_interrupt; ...@@ -28,27 +28,46 @@ static int delay_at_last_interrupt;
#define CYCLONE_MPMC_OFFSET 0x51D0 #define CYCLONE_MPMC_OFFSET 0x51D0
#define CYCLONE_MPCS_OFFSET 0x51A8 #define CYCLONE_MPCS_OFFSET 0x51A8
#define CYCLONE_TIMER_FREQ 100000000 #define CYCLONE_TIMER_FREQ 100000000
#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
int use_cyclone = 0; int use_cyclone = 0;
static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
static u32 last_cyclone_timer; static u32 last_cyclone_low;
static u32 last_cyclone_high;
static unsigned long long monotonic_base;
static rwlock_t monotonic_lock = RW_LOCK_UNLOCKED;
/* Atomically read the 40-bit cyclone counter, which is split across two
 * 32-bit MMIO words: re-read the high word until it is unchanged across
 * the low-word read, so a carry between the two reads is never missed.
 *
 * Note: no trailing semicolon after the while() — the caller supplies it,
 * keeping the macro a single statement that is safe inside if/else.
 */
#define read_cyclone_counter(low,high) \
	do { \
		high = cyclone_timer[1]; \
		low = cyclone_timer[0]; \
	} while (high != cyclone_timer[1])
static void mark_offset_cyclone(void) static void mark_offset_cyclone(void)
{ {
int count; int count;
unsigned long long this_offset, last_offset;
write_lock(&monotonic_lock);
last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
spin_lock(&i8253_lock); spin_lock(&i8253_lock);
/* quickly read the cyclone timer */ read_cyclone_counter(last_cyclone_low,last_cyclone_high);
if(cyclone_timer)
last_cyclone_timer = cyclone_timer[0];
/* calculate delay_at_last_interrupt */ /* read values for delay_at_last_interrupt */
outb_p(0x00, 0x43); /* latch the count ASAP */ outb_p(0x00, 0x43); /* latch the count ASAP */
count = inb_p(0x40); /* read the latched count */ count = inb_p(0x40); /* read the latched count */
count |= inb(0x40) << 8; count |= inb(0x40) << 8;
spin_unlock(&i8253_lock); spin_unlock(&i8253_lock);
/* update the monotonic base value */
this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
write_unlock(&monotonic_lock);
/* calculate delay_at_last_interrupt */
count = ((LATCH-1) - count) * TICK_SIZE; count = ((LATCH-1) - count) * TICK_SIZE;
delay_at_last_interrupt = (count + LATCH/2) / LATCH; delay_at_last_interrupt = (count + LATCH/2) / LATCH;
} }
...@@ -64,7 +83,7 @@ static unsigned long get_offset_cyclone(void) ...@@ -64,7 +83,7 @@ static unsigned long get_offset_cyclone(void)
offset = cyclone_timer[0]; offset = cyclone_timer[0];
/* .. relative to previous jiffy */ /* .. relative to previous jiffy */
offset = offset - last_cyclone_timer; offset = offset - last_cyclone_low;
/* convert cyclone ticks to microseconds */ /* convert cyclone ticks to microseconds */
/* XXX slow, can we speed this up? */ /* XXX slow, can we speed this up? */
...@@ -74,6 +93,27 @@ static unsigned long get_offset_cyclone(void) ...@@ -74,6 +93,27 @@ static unsigned long get_offset_cyclone(void)
return delay_at_last_interrupt + offset; return delay_at_last_interrupt + offset;
} }
/* monotonic_clock_cyclone(): nanoseconds since time_init(), built from
 * the accumulated monotonic_base plus the cyclone counter delta since
 * the last mark_offset.
 *
 * Fix: use read_lock_irqsave()/read_unlock_irqrestore() instead of
 * read_lock_irq()/read_unlock_irq().  The _irq variants unconditionally
 * re-enable interrupts on unlock, which is wrong when this function is
 * called with interrupts already disabled; the save/restore variants
 * preserve the caller's irq state.
 */
static unsigned long long monotonic_clock_cyclone(void)
{
	u32 now_low, now_high;
	unsigned long long last_offset, this_offset, base;
	unsigned long long ret;
	unsigned long flags;

	/* atomically read monotonic base & last_offset */
	read_lock_irqsave(&monotonic_lock, flags);
	last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
	base = monotonic_base;
	read_unlock_irqrestore(&monotonic_lock, flags);

	/* Read the cyclone counter */
	read_cyclone_counter(now_low,now_high);
	this_offset = ((unsigned long long)now_high<<32)|now_low;

	/* convert to nanoseconds; mask handles 40-bit counter wraparound */
	ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK);
	return ret * (1000000000 / CYCLONE_TIMER_FREQ);
}
static int __init init_cyclone(char* override) static int __init init_cyclone(char* override)
{ {
u32* reg; u32* reg;
...@@ -194,5 +234,6 @@ struct timer_opts timer_cyclone = { ...@@ -194,5 +234,6 @@ struct timer_opts timer_cyclone = {
.init = init_cyclone, .init = init_cyclone,
.mark_offset = mark_offset_cyclone, .mark_offset = mark_offset_cyclone,
.get_offset = get_offset_cyclone, .get_offset = get_offset_cyclone,
.monotonic_clock = monotonic_clock_cyclone,
.delay = delay_cyclone, .delay = delay_cyclone,
}; };
...@@ -16,6 +16,11 @@ static unsigned long get_offset_none(void) ...@@ -16,6 +16,11 @@ static unsigned long get_offset_none(void)
return 0; return 0;
} }
/* Stub monotonic clock for the "none" timer source: there is no hardware
 * counter to read, so no monotonic time can be provided — always 0.
 */
static unsigned long long monotonic_clock_none(void)
{
	return 0ULL;
}
static void delay_none(unsigned long loops) static void delay_none(unsigned long loops)
{ {
int d0; int d0;
...@@ -34,5 +39,6 @@ struct timer_opts timer_none = { ...@@ -34,5 +39,6 @@ struct timer_opts timer_none = {
.init = init_none, .init = init_none,
.mark_offset = mark_offset_none, .mark_offset = mark_offset_none,
.get_offset = get_offset_none, .get_offset = get_offset_none,
.monotonic_clock = monotonic_clock_none,
.delay = delay_none, .delay = delay_none,
}; };
...@@ -31,6 +31,11 @@ static void mark_offset_pit(void) ...@@ -31,6 +31,11 @@ static void mark_offset_pit(void)
/* nothing needed */ /* nothing needed */
} }
/* The PIT source has no wide free-running counter to derive a monotonic
 * clock from, so this implementation simply reports 0.
 */
static unsigned long long monotonic_clock_pit(void)
{
	return 0ULL;
}
static void delay_pit(unsigned long loops) static void delay_pit(unsigned long loops)
{ {
int d0; int d0;
...@@ -145,5 +150,6 @@ struct timer_opts timer_pit = { ...@@ -145,5 +150,6 @@ struct timer_opts timer_pit = {
.init = init_pit, .init = init_pit,
.mark_offset = mark_offset_pit, .mark_offset = mark_offset_pit,
.get_offset = get_offset_pit, .get_offset = get_offset_pit,
.monotonic_clock = monotonic_clock_pit,
.delay = delay_pit, .delay = delay_pit,
}; };
...@@ -24,6 +24,38 @@ static int use_tsc; ...@@ -24,6 +24,38 @@ static int use_tsc;
static int delay_at_last_interrupt; static int delay_at_last_interrupt;
static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
static unsigned long long monotonic_base;
static rwlock_t monotonic_lock = RW_LOCK_UNLOCKED;
/* Cycles (64 bit) -> nanoseconds (64 bit) conversion using scaled
 * integer math so the hot path needs no division:
 *
 *   ns = cycles * (ns_per_sec / freq)
 *      = cycles * (10^9 / (cpu_mhz * 10^6))
 *      = cycles * (10^3 / cpu_mhz)
 *
 * Pre-multiplying the constant by 2^CYC2NS_SCALE_FACTOR (scaling trick
 * suggested by george@mvista.com) turns the per-call divide into a
 * right shift:
 *
 *   ns = cycles * (10^3 * SC / cpu_mhz) / SC
 *      = cycles * cyc2ns_scale >> CYC2NS_SCALE_FACTOR
 *
 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
static unsigned long cyc2ns_scale;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

/* Recompute the cached scale factor; call whenever cpu_mhz is (re)measured. */
static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
{
	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
}

/* Fast conversion of a cycle count to nanoseconds (shift, no divide). */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
/* Cached *multiplier* to convert TSC counts to microseconds. /* Cached *multiplier* to convert TSC counts to microseconds.
* (see the equation below). * (see the equation below).
...@@ -61,11 +93,32 @@ static unsigned long get_offset_tsc(void) ...@@ -61,11 +93,32 @@ static unsigned long get_offset_tsc(void)
return delay_at_last_interrupt + edx; return delay_at_last_interrupt + edx;
} }
/* monotonic_clock_tsc(): nanoseconds since time_init(), built from the
 * accumulated monotonic_base plus the TSC delta since the last
 * mark_offset_tsc().
 *
 * Fix: use read_lock_irqsave()/read_unlock_irqrestore() instead of
 * read_lock_irq()/read_unlock_irq().  The _irq variants unconditionally
 * re-enable interrupts on unlock, which is wrong when this function is
 * called with interrupts already disabled; the save/restore variants
 * preserve the caller's irq state.
 */
static unsigned long long monotonic_clock_tsc(void)
{
	unsigned long long last_offset, this_offset, base;
	unsigned long flags;

	/* atomically read monotonic base & last_offset */
	read_lock_irqsave(&monotonic_lock, flags);
	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
	base = monotonic_base;
	read_unlock_irqrestore(&monotonic_lock, flags);

	/* Read the Time Stamp Counter */
	rdtscll(this_offset);

	/* return the value in ns */
	return base + cycles_2_ns(this_offset - last_offset);
}
static void mark_offset_tsc(void) static void mark_offset_tsc(void)
{ {
int count; int count;
int countmp; int countmp;
static int count1=0, count2=LATCH; static int count1=0, count2=LATCH;
unsigned long long this_offset, last_offset;
write_lock(&monotonic_lock);
last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
/* /*
* It is important that these two operations happen almost at * It is important that these two operations happen almost at
* the same time. We do the RDTSC stuff first, since it's * the same time. We do the RDTSC stuff first, since it's
...@@ -80,7 +133,7 @@ static void mark_offset_tsc(void) ...@@ -80,7 +133,7 @@ static void mark_offset_tsc(void)
/* read Pentium cycle counter */ /* read Pentium cycle counter */
rdtscl(last_tsc_low); rdtsc(last_tsc_low, last_tsc_high);
spin_lock(&i8253_lock); spin_lock(&i8253_lock);
outb_p(0x00, 0x43); /* latch the count ASAP */ outb_p(0x00, 0x43); /* latch the count ASAP */
...@@ -103,6 +156,12 @@ static void mark_offset_tsc(void) ...@@ -103,6 +156,12 @@ static void mark_offset_tsc(void)
} }
} }
/* update the monotonic base value */
this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
monotonic_base += cycles_2_ns(this_offset - last_offset);
write_unlock(&monotonic_lock);
/* calculate delay_at_last_interrupt */
count = ((LATCH-1) - count) * TICK_SIZE; count = ((LATCH-1) - count) * TICK_SIZE;
delay_at_last_interrupt = (count + LATCH/2) / LATCH; delay_at_last_interrupt = (count + LATCH/2) / LATCH;
} }
...@@ -301,6 +360,7 @@ static int __init init_tsc(char* override) ...@@ -301,6 +360,7 @@ static int __init init_tsc(char* override)
"0" (eax), "1" (edx)); "0" (eax), "1" (edx));
printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
} }
set_cyc2ns_scale(cpu_khz/1000);
return 0; return 0;
} }
} }
...@@ -334,5 +394,6 @@ struct timer_opts timer_tsc = { ...@@ -334,5 +394,6 @@ struct timer_opts timer_tsc = {
.init = init_tsc, .init = init_tsc,
.mark_offset = mark_offset_tsc, .mark_offset = mark_offset_tsc,
.get_offset = get_offset_tsc, .get_offset = get_offset_tsc,
.monotonic_clock = monotonic_clock_tsc,
.delay = delay_tsc, .delay = delay_tsc,
}; };
...@@ -78,11 +78,13 @@ static void hangcheck_fire(unsigned long); ...@@ -78,11 +78,13 @@ static void hangcheck_fire(unsigned long);
static struct timer_list hangcheck_ticktock = static struct timer_list hangcheck_ticktock =
TIMER_INITIALIZER(hangcheck_fire, 0, 0); TIMER_INITIALIZER(hangcheck_fire, 0, 0);
extern unsigned long long monotonic_clock(void);
static void hangcheck_fire(unsigned long data) static void hangcheck_fire(unsigned long data)
{ {
unsigned long long cur_tsc, tsc_diff; unsigned long long cur_tsc, tsc_diff;
cur_tsc = get_cycles(); cur_tsc = monotonic_clock();
if (cur_tsc > hangcheck_tsc) if (cur_tsc > hangcheck_tsc)
tsc_diff = cur_tsc - hangcheck_tsc; tsc_diff = cur_tsc - hangcheck_tsc;
...@@ -98,7 +100,7 @@ static void hangcheck_fire(unsigned long data) ...@@ -98,7 +100,7 @@ static void hangcheck_fire(unsigned long data)
} }
} }
mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
hangcheck_tsc = get_cycles(); hangcheck_tsc = monotonic_clock();
} }
...@@ -108,10 +110,10 @@ static int __init hangcheck_init(void) ...@@ -108,10 +110,10 @@ static int __init hangcheck_init(void)
VERSION_STR, hangcheck_tick, hangcheck_margin); VERSION_STR, hangcheck_tick, hangcheck_margin);
hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick; hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick;
hangcheck_tsc_margin *= HZ; hangcheck_tsc_margin *= 1000000000;
hangcheck_tsc_margin *= current_cpu_data.loops_per_jiffy;
hangcheck_tsc = get_cycles(); hangcheck_tsc = monotonic_clock();
mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
return 0; return 0;
......
...@@ -14,6 +14,7 @@ struct timer_opts{ ...@@ -14,6 +14,7 @@ struct timer_opts{
int (*init)(char *override); int (*init)(char *override);
void (*mark_offset)(void); void (*mark_offset)(void);
unsigned long (*get_offset)(void); unsigned long (*get_offset)(void);
unsigned long long (*monotonic_clock)(void);
void (*delay)(unsigned long); void (*delay)(unsigned long);
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment