Oracle Linux 6 on OracleVM では gettimeofday が vDSO でない

gettimeofday(2) は VDSO によりユーザー空間で実行される - ablog
と違って Oracle Linux 6.7 on OracleVM 3.2 (Xenベース) だと strace で gettimeofday が出るという話を聞いたので、

strace のログを見てみます。Non Blocking I/O の
システムコール(io_submit/io_getevents) が出ているやで彡(ﾟ)(ﾟ)
★LGWRプロセスのシステムコールを確認
cat lgwr_strace_async_true.log.5657
：
gettimeofday({1448245846, 974961}, NULL) = 0
io_submit(139669291659264, 1, {{data:0x7f07479db320, pwrite, filedes:258, … ★
gettimeofday({1448245846, 975302}, NULL) = 0
io_getevents(139669291659264, 1, 128, {{0x7f07479db320, 0x7f07479db320,  … ★
gettimeofday({144
ASM使用時に DISK_ASYNCH_IOパラメータを変えて、LGWR／DBWRプロセスのシステムコールを見てみる。 - ねら～ITエンジニア雑記

調べてみたところ、Xen(Paravirtualization) のゲスト(DomU)で Linux を使っている場合、clocksource が xen(pv clock) の場合は vDSO にならないようです。

(CMP402) Amazon EC2 Instances Deep Dive from Amazon Web Services

P.24

Time Keeping Explained

Time keeping in an instance is deceptively hard

gettimeofday(), clock_gettime(), QueryPerformanceCounter()

The TSC

CPU counter, accessible from userspace

Requires calibration, vDSO

Invariant on Sandy Bridge+ processors

Xen pvclock; does not support vDSO

On current generation instances, use TSC as clocksource

RHEL7(kernel 3.10)では pv clock の vDSO 対応が入ってるぽいです。

x86: vdso: pvclock gettime support

author	Marcelo Tosatti <mtosatti@redhat.com>	2012-11-28 01:28:57 (GMT)
committer	Marcelo Tosatti <mtosatti@redhat.com>	2012-11-28 01:29:11 (GMT)
commit	51c19b4f5927f5a646e93d69f73c7e89ea14e737 (patch)
tree	a48486f72dc433fd516684d4441022650429d333 /arch/x86/vdso/vclock_gettime.c
parent	3dc4f7cfb7441e5e0fed3a02fc81cdaabd28300a (diff)
x86: vdso: pvclock gettime support
Improve performance of time system calls when using Linux pvclock,
by reading time info from fixmap visible copy of pvclock data.

Originally from Jeremy Fitzhardinge.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

...

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..205ad32 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);