/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing* */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}
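
/*
 * Illustration of the residual-carry pattern above (editor's sketch,
 * assuming HZ=250 so that NS_PER_TICK is 4,000,000):
 *
 *	u64 ns = 9000000;	// 9ms of stolen time pending
 *	ticks = iter_div_u64_rem(ns, NS_PER_TICK, &ns);
 *	// ticks == 2, ns == 1000000
 *
 * Two whole ticks are accounted immediately; the 1ms remainder is
 * parked in residual_stolen (or residual_blocked) and folded into the
 * next interrupt's total, so sub-tick amounts are never dropped.
 */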
/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, i.e. the nanoseconds the VCPU spent in the
 * RUNNING+BLOCKED states.
 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}

/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
	u64 xen_khz = 1000000ULL << 32;
	const struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(xen_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		xen_khz <<= -info->tsc_shift;
	else
		xen_khz >>= info->tsc_shift;

	return xen_khz;
}

cycle_t xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;

	src = &get_cpu_var(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	put_cpu_var(xen_vcpu);
	return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);
	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
 * Xen clockevent implementation
 *
 * Xen has two clockevent implementations:
 *
 * The old timer_op one works with all released versions of Xen prior
 * to version 3.0.4.  This version of the hypervisor provides a
 * single-shot timer with nanosecond resolution.  However, sharing the
 * same event channel is a 100Hz tick which is emulated as a periodic
 * timer.
 *
 * The new vcpu_op-based timer interface allows the tick timer period
 * to be changed or turned off.  The tick timer is not useful as a
 * periodic timer because events are only delivered to running vcpus.
 * The one-shot timer can report when a timeout is in the past, so
 * set_next_event is capable of returning -ETIME when appropriate.
 */

/* convert a delta from now into an absolute timeout in the
   clocksource's nanosecond timebase */
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);	/* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}
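
/*
 * Note on the .mult/.shift values below: the clockevents core
 * converts a nanosecond delta into device cycles as
 * (delta * mult) >> shift before calling set_next_event(), so
 * mult = 1, shift = 0 makes that conversion an identity.  Both Xen
 * timer interfaces are programmed directly in nanoseconds, so no
 * scaling is wanted.
 */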
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};

static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}
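
/*
 * Usage sketch (an assumption based on the Xen SMP bringup path, not
 * something defined in this file): secondary VCPUs are wired up the
 * same way xen_time_init() wires up the boot CPU --
 *
 *	xen_setup_timer(cpu);		// from the CPU-up path
 *	...				// then, on the new CPU itself:
 *	xen_setup_cpu_clockevents();	// with preemption disabled
 *
 * xen_setup_timer() may run on a different CPU than `cpu`, but
 * xen_setup_cpu_clockevents() registers the per-cpu clockevent via
 * __get_cpu_var() and so must run on the CPU it serves, hence its
 * BUG_ON(preemptible()).
 */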