gettimeofday(2) は VDSO によりユーザー空間で実行される - ablog
と違って Oracle Linux 6.7 on OracleVM 3.2 (Xenベース) だと strace で gettimeofday が出るという話を聞いたので、
strace のログを見てみます。Non Blocking I/O の
システムコール(io_submit/io_getevents) が 出ているやで 彡(゚)(゚)★LGWRプロセスのシステムコールを確認 cat lgwr_strace_async_true.log.5657 : gettimeofday({1448245846, 974961}, NULL) = 0 io_submit(139669291659264, 1, {{data:0x7f07479db320, pwrite, filedes:258, … ★ gettimeofday({1448245846, 975302}, NULL) = 0 io_getevents(139669291659264, 1, 128, {{0x7f07479db320, 0x7f07479db320, … ★ gettimeofday({144… ASM使用時 に DISK_ASYNCH_IOパラメータ を変えて、LGWR/DBWRプロセスのシステムコールを見てみる。 - ねら~ITエンジニア雑記
調べてみたところ、Xen(Paravirtualization) のゲスト(DomU)で Linux を動かしていて clocksource が xen(pv clock) になっている場合、gettimeofday は vDSO 経由にならない(通常のシステムコールとして実行される)ようです。
P.24
Time Keeping Explained
- Time keeping in an instance is deceptively hard
- gettimeofday(), clock_gettime(), QueryPerformanceCounter()
- The TSC
- CPU counter, accessible from userspace
- Requires calibration, vDSO
- Invariant on Sandy Bridge+ processors
- Xen pvclock; does not support vDSO
- On current generation instances, use TSC as clocksource
RHEL7(kernel 3.10)では pv clock の vDSO 対応が入ってるぽいです。
x86: vdso: pvclock gettime support
author Marcelo Tosatti <mtosatti@redhat.com> 2012-11-28 01:28:57 (GMT) committer Marcelo Tosatti <mtosatti@redhat.com> 2012-11-28 01:29:11 (GMT) commit 51c19b4f5927f5a646e93d69f73c7e89ea14e737 (patch) tree a48486f72dc433fd516684d4441022650429d333 /arch/x86/vdso/vclock_gettime.c parent 3dc4f7cfb7441e5e0fed3a02fc81cdaabd28300a (diff) x86: vdso: pvclock gettime support Improve performance of time system calls when using Linux pvclock, by reading time info from fixmap visible copy of pvclock data. Originally from Jeremy Fitzhardinge. Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> ... diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 4df6c37..205ad32 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,7 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> +#include <asm/pvclock.h> #define gtod (&VVAR(vsyscall_gtod_data)) @@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void) return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); } +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. 
+ */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + migrate_count = pvti->migrate_count; + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. + */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) } -notrace static inline u64 vgetsns(void) +notrace static inline u64 vgetsns(int *mode) { long v; cycles_t cycles; @@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void) cycles = vread_tsc(); else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif else return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; @@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); @@ -127,7 +202,7 @@ notrace static int 
do_monotonic(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); timespec_add_ns(ts, ns);