ablog

不器用で落着きのない技術者のメモ

gettimeofday(2) は VDSO によりユーザー空間で実行される

gettimeofday(2) はシステムコールなので、大量に発行すると%sysが上がると思っていたが、VDSOという仕組みでユーザー空間で実行されるので%userが上がるらしい。時刻取得みたいなちょっとした処理でシステムコールを発行してコンテキストスイッチするのって無駄が多いなって思ってたけど、そこはちゃんと考えられているんですね。

多くのアプリケーション負荷 (特にデータベースおよび財務サービスアプリケーション) は gettimeofday または類似の時間機能コールを非常に頻繁に実行します。 このコールの効率性を最適化すると、 大きな利点があります。
VDSO (Virtual Dynamic Shared Object) は、 ユーザースペースのアプリケーションがシステムコールよりも少ないオーバーヘッドで一部のカーネルアクションを実行できるようにする共有ライブラリです。 多くの場合、 VDSO は gettimeofday システムコールデータへの高速なアクセスを提供するために使用されます。
VDSO を有効にすると、 カーネルはユーザースペース共有ライブラリ (特に glibc) にあるものではなく VDSO のシンボルの定義を使用するよう指示されます。 VDSO の有効化の影響はシステム全体に及びます。 すべてのプロセスが使用するか、 またはどのプロセスも使用しないかのどちらかになります。
有効化されたら、 VDSO は gettimeofday の glibc 定義をその独自の定義でオーバーライドします。 これにより、 システムコールのオーバーヘッドが取り除かれます (コールは glibc ではなくカーネルメモリに直接行われるため)。

https://lists.openshift.redhat.com/docs/ja-JP/Red_Hat_Enterprise_MRG/1.2/html/Realtime_Tuning_Guide/sect-Realtime_Tuning_Guide-General_System_Tuning-gettimeofday_speedup.html

The "vDSO" (virtual dynamic shared object) is a small shared library that the kernel automatically maps into the address space of all user-space applications. Applications usually do not need to concern themselves with these details as the vDSO is most commonly called by the C library. This way you can code in the normal way using standard functions and the C library will take care of using any functionality that is available via the vDSO.
(中略)
One frequently used system call is gettimeofday(2). This system call is called both directly by user-space applications as well as indirectly by the C library. Think timestamps or timing loops or polling—all of these frequently need to know what time it is right now. This information is also not secret—any application in any privilege mode (root or any unprivileged user) will get the same answer. Thus the kernel arranges for the information required to answer this question to be placed in memory the process can access. Now a call to gettimeofday(2) changes from a system call to a normal function call and a few memory accesses.

vdso(7) - Linux manual page


無限ループしながら gettimeofday を実行するプログラムを書いて、

  • gettimeofday.c
#include <stdio.h>
#include <sys/time.h>

/*
 * Call gettimeofday() in an endless loop so that the process's user/system
 * CPU time can be observed from outside (time, dstat, strace, perf, ...).
 * The loop never terminates on its own; the process must be interrupted.
 */
int main(void)
{
	struct timeval now;

	for (;;) {
		gettimeofday(&now, NULL);
	}

	/* Not reached: the loop above has no exit. */
	return 1;
}

コンパイルして実行してみると、ユーザーモードで使った時間がほとんどで、カーネルモードで使った時間は僅かしかない。

$ gcc -o gettimeofday gettimeofday.c 
$ ldd gettimeofday
	linux-vdso.so.1 =>  (0x00007fff22715000)
	libc.so.6 => /lib64/libc.so.6 (0x0000003ba6600000)
	/lib64/ld-linux-x86-64.so.2 (0x0000003ba6200000)
$ time ./gettimeofday 
^C

real	12m6.745s
user	12m1.298s <-- ユーザーモードの時間が長く
sys	0m0.369s <-- カーネルモードの時間は短い


以下はこのプログラムを実行中に取得した情報。

  • dstat
    • cpu3 の usr が 100% で sys は 0%!
$ dstat -f 5
-------cpu0-usage--------------cpu1-usage--------------cpu2-usage--------------cpu3-usage------
usr sys idl wai hiq siq:usr sys idl wai hiq siq:usr sys idl wai hiq siq:usr sys idl wai hiq siq
 27  61  12   0   0   1: 26  51  22   0   0   0: 16  63  21   0   0   0: 17  75   8   1   0   0
  3   5  92   0   0   0:  2   5  93   0   0   0:  1   2  97   0   0   0:1000   0   0   0   0
  3   4  93   0   0   0:  2   4  93   0   0   0:  1   4  95   0   0   0:1000   0   0   0   0
  4   5  91   0   0   0:  3   5  92   0   0   0:  1   2  97   0   0   0:1000   0   0   0   0
  • strace
    • むむっ、何も出力されない
$ strace -p `pgrep gettimeofday`
Process 25792 attached - interrupt to quit
  • ltrace
    • おおっ、ひたすら gettimeofday が出力される
$ ltrace -p `pgrep gettimeofday`  
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0 
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
gettimeofday(0x7fff18b60a60, NULL)                                                                     = 0
  • pstack
    • ふむ
$ pstack `pgrep gettimeofday`
#0  0x00007fff18bff82b in ?? ()
#1  0x00007fff18bff8ad in gettimeofday ()
#2  0x0000003ba669ca8a in gettimeofday () from /lib64/libc.so.6
#3  0x00000000004004dd in main ()
  • perf top
    • gettimeofday(libcのユーザー空間の) がトップ。
# perf top
   PerfTop:    1359 irqs/sec  kernel:17.7%  exact:  0.0% [1000Hz cycles],  (all, 4 CPUs)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------

samples  pcnt function                             DSO
_______ _____ ____________________________________ ______________________________________________________________________________

740.00 26.1% gettimeofday                         /lib64/libc-2.12.so                          
492.00 17.3% main                                 /home/yazekats/Documents/work/gettimeofday
229.00  8.1% gettimeofday@plt                     /home/yazekats/Documents/work/gettimeofday
118.00  4.2% supdrvTracerInit                     [vboxdrv]
73.00  2.6% intel_idle                           [kernel.kallsyms]
67.00  2.4% PDMCritSectLeave                     /usr/lib/virtualbox/VBoxVMM.so
64.00  2.3% RTTimeNanoTSLFenceSync               /usr/lib/virtualbox/VBoxRT.so
60.00  2.1% __ticket_spin_lock                   [kernel.kallsyms]
51.00  1.8% PDMCritSectEnter                     /usr/lib/virtualbox/VBoxVMM.so
50.00  1.8% fget_light                           [kernel.kallsyms]
38.00  1.3% pthread_mutex_lock                   /lib64/libpthread-2.12.so
35.00  1.2% __schedule                           [kernel.kallsyms]
29.00  1.0% __memmove_ssse3_back                 /opt/google/chrome/lib/libc.so.6
28.00  1.0% __pthread_mutex_unlock_internal      /lib64/libpthread-2.12.so
27.00  1.0% native_write_msr_safe                [kernel.kallsyms]
26.00  0.9% memcpy                               /lib64/libc-2.12.so
22.00  0.8% drm_rmmap                            /lib/modules/2.6.39-400.17.1.el6uek.x86_64/kernel/drivers/gpu/drm/drm.ko
17.00  0.6% __pthread_getspecific_internal       /lib64/libpthread-2.12.so
17.00  0.6% fput                                 [kernel.kallsyms]
16.00  0.6% find_busiest_group                   [kernel.kallsyms]
15.00  0.5% inflate_fast                         /lib64/libz.so.1.2.3
15.00  0.5% unix_poll                            [kernel.kallsyms]
14.00  0.5% kfree                                [kernel.kallsyms]
14.00  0.5% ioread32                             [kernel.kallsyms]
13.00  0.5% apic_timer_interrupt                 [kernel.kallsyms]
13.00  0.5% supdrvGipCpuIndexFromCpuId           [vboxdrv]
12.00  0.4% i8042_interrupt                      [kernel.kallsyms]                                    
12.00  0.4% notify_ring                          /lib/modules/2.6.39-400.17.1.el6uek.x86_64/kernel/drivers/gpu/drm/i915/i915.ko
  • /proc/[pid]/maps
$ cat /proc/4272/maps
00400000-00401000 r-xp 00000000 fc:03 263652                             /home/yazekats/Documents/work/gettimeofday
00600000-00601000 rw-p 00000000 fc:03 263652                             /home/yazekats/Documents/work/gettimeofday
3ba6200000-3ba6220000 r-xp 00000000 fc:01 1966108                        /lib64/ld-2.12.so
3ba641f000-3ba6420000 r--p 0001f000 fc:01 1966108                        /lib64/ld-2.12.so
3ba6420000-3ba6421000 rw-p 00020000 fc:01 1966108                        /lib64/ld-2.12.so
3ba6421000-3ba6422000 rw-p 00000000 00:00 0 
3ba6600000-3ba678b000 r-xp 00000000 fc:01 1966110                        /lib64/libc-2.12.so
3ba678b000-3ba698a000 ---p 0018b000 fc:01 1966110                        /lib64/libc-2.12.so
3ba698a000-3ba698e000 r--p 0018a000 fc:01 1966110                        /lib64/libc-2.12.so
3ba698e000-3ba698f000 rw-p 0018e000 fc:01 1966110                        /lib64/libc-2.12.so
3ba698f000-3ba6994000 rw-p 00000000 00:00 0 
7f58e4ba3000-7f58e4ba6000 rw-p 00000000 00:00 0 
7f58e4bc0000-7f58e4bc1000 rw-p 00000000 00:00 0 
7fff4eb19000-7fff4eb3b000 rw-p 00000000 00:00 0                          [stack]
7fff4eb9a000-7fff4eb9b000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]

ソースコードはこのへんぽい。

  • kernel-2.6.39/linux-2.6.39.x86_64/arch/x86/vdso/vclock_gettime.c
/*
 * Copyright 2006 Andi Kleen, SUSE Labs.
 * Subject to the GNU Public License, v.2
 *
 * Fast user context implementation of clock_gettime, gettimeofday, and time.
 *
 * The code should have no internal unresolved relocations.
 * Check with readelf after changing.
 * Also alternative() doesn't work.
 */

/* Disable profiling for userspace code: */
#define DISABLE_BRANCH_PROFILING

#include <linux/kernel.h>
#include <linux/posix-timers.h>
#include <linux/time.h>
#include <linux/string.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>

#define gtod (&VVAR(vsyscall_gtod_data))

notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
        long ret;
        asm("syscall" : "=a" (ret) :
            "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
        return ret;
}

notrace static inline long vgetns(void)
{
        long v;
        cycles_t (*vread)(void);
        vread = gtod->clock.vread;
        v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
        return (v * gtod->clock.mult) >> gtod->clock.shift;
}

/*
 * Fill *ts with the wall-clock time: the wall_time_sec/wall_time_nsec base
 * stored in gtod plus the nanoseconds reported by vgetns().
 * Always returns 0.
 */
notrace static noinline int do_realtime(struct timespec *ts)
{
        unsigned long seq, ns;
        do {
                /* Take a sequence snapshot; the reads below are repeated
                 * whenever read_seqretry() reports that it is stale. */
                seq = read_seqbegin(&gtod->lock);
                ts->tv_sec = gtod->wall_time_sec;
                ts->tv_nsec = gtod->wall_time_nsec;
                ns = vgetns();
        } while (unlikely(read_seqretry(&gtod->lock, seq)));
        /* Add the elapsed nanoseconds outside the retry loop. */
        timespec_add_ns(ts, ns);
        return 0;
}

/*
 * Fill *ts with the monotonic time: wall time (base plus vgetns()) shifted
 * by gtod->wall_to_monotonic. Always returns 0.
 */
notrace static noinline int do_monotonic(struct timespec *ts)
{
        unsigned long seq, ns, secs;
        do {
                /* All gtod reads happen between read_seqbegin() and
                 * read_seqretry() and are redone if the check fails. */
                seq = read_seqbegin(&gtod->lock);
                secs = gtod->wall_time_sec;
                ns = gtod->wall_time_nsec + vgetns();
                secs += gtod->wall_to_monotonic.tv_sec;
                ns += gtod->wall_to_monotonic.tv_nsec;
        } while (unlikely(read_seqretry(&gtod->lock, seq)));

        /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
         * are all guaranteed to be nonnegative.
         */
        /* Carry whole seconds out of ns; a loop because three terms were summed. */
        while (ns >= NSEC_PER_SEC) {
                ns -= NSEC_PER_SEC;
                ++secs;
        }
        ts->tv_sec = secs;
        ts->tv_nsec = ns;

        return 0;
}

/*
 * Fill *ts with gtod->wall_time_coarse as stored, without calling vgetns().
 * Always returns 0.
 */
notrace static noinline int do_realtime_coarse(struct timespec *ts)
{
        unsigned long seq;
        do {
                /* Copy both fields under the same sequence snapshot; retried
                 * if read_seqretry() reports the snapshot is stale. */
                seq = read_seqbegin(&gtod->lock);
                ts->tv_sec = gtod->wall_time_coarse.tv_sec;
                ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
        } while (unlikely(read_seqretry(&gtod->lock, seq)));
        return 0;
}

/*
 * Fill *ts with the coarse monotonic time: gtod->wall_time_coarse shifted by
 * gtod->wall_to_monotonic, without calling vgetns(). Always returns 0.
 */
notrace static noinline int do_monotonic_coarse(struct timespec *ts)
{
        unsigned long seq, ns, secs;
        do {
                /* All gtod reads happen between read_seqbegin() and
                 * read_seqretry() and are redone if the check fails. */
                seq = read_seqbegin(&gtod->lock);
                secs = gtod->wall_time_coarse.tv_sec;
                ns = gtod->wall_time_coarse.tv_nsec;
                secs += gtod->wall_to_monotonic.tv_sec;
                ns += gtod->wall_to_monotonic.tv_nsec;
        } while (unlikely(read_seqretry(&gtod->lock, seq)));

        /* wall_time_nsec and wall_to_monotonic.tv_nsec are
         * guaranteed to be between 0 and NSEC_PER_SEC.
         */
        /* Only two terms were summed, so at most one carry is performed. */
        if (ns >= NSEC_PER_SEC) {
                ns -= NSEC_PER_SEC;
                ++secs;
        }
        ts->tv_sec = secs;
        ts->tv_nsec = ns;

        return 0;
}

/*
 * vDSO entry point for clock_gettime().
 *
 * Handles CLOCK_REALTIME and CLOCK_MONOTONIC in user space when
 * gtod->clock.vread is set, and the two *_COARSE clocks unconditionally.
 * Everything else - gtod->sysctl_enabled clear, no vread function, or any
 * other clock id - is passed to vdso_fallback_gettime().
 */
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
        if (unlikely(!gtod->sysctl_enabled)) {
                return vdso_fallback_gettime(clock, ts);
        }

        switch (clock) {
        case CLOCK_REALTIME:
                if (likely(gtod->clock.vread)) {
                        return do_realtime(ts);
                }
                break;
        case CLOCK_MONOTONIC:
                if (likely(gtod->clock.vread)) {
                        return do_monotonic(ts);
                }
                break;
        case CLOCK_REALTIME_COARSE:
                return do_realtime_coarse(ts);
        case CLOCK_MONOTONIC_COARSE:
                return do_monotonic_coarse(ts);
        default:
                break;
        }

        return vdso_fallback_gettime(clock, ts);
}
int clock_gettime(clockid_t, struct timespec *)
        __attribute__((weak, alias("__vdso_clock_gettime")));

notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
        long ret;
        if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
                if (likely(tv != NULL)) {
                        BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
                                     offsetof(struct timespec, tv_nsec) ||
                                     sizeof(*tv) != sizeof(struct timespec));
                        do_realtime((struct timespec *)tv);
                        tv->tv_usec /= 1000;
                }
                if (unlikely(tz != NULL)) {
                        /* Avoid memcpy. Some old compilers fail to inline it */
                        tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
                        tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
                }
                return 0;
        }
        asm("syscall" : "=a" (ret) :
            "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
        return ret;
}
int gettimeofday(struct timeval *, struct timezone *)
        __attribute__((weak, alias("__vdso_gettimeofday")));

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */

/*
 * Issue the real __NR_time system call with t in RDI and return the value
 * the kernel leaves in RAX. The asm declares the flags, R11, RCX and memory
 * as clobbered.
 */
static __always_inline long time_syscall(long *t)
{
        long secs;
        asm volatile("syscall"
                     : "=a" (secs)
                     : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
        return secs;
}

/*
 * vDSO entry point for time().
 *
 * Returns gtod->wall_time_sec and, when t is non-NULL, also stores it in *t.
 * If gtod->sysctl_enabled is clear the real system call is made through
 * time_syscall() instead.
 */
notrace time_t __vdso_time(time_t *t)
{
        time_t result;

        if (unlikely(!gtod->sysctl_enabled))
                return time_syscall(t);

        /* This is atomic on x86_64 so we don't need any locks. */
        result = ACCESS_ONCE(gtod->wall_time_sec);

        if (t)
                *t = result;
        return result;
}
/* Declared with the same return type as __vdso_time so the alias and its
 * target have matching types. */
time_t time(time_t *t)
        __attribute__((weak, alias("__vdso_time")));

参考


追記(2015/12/01):

P.24
Xen(Paravirtualization) のゲスト(DomU)で Linux を使っていて、clocksource が xen(pv clock) の場合は vDSO にならないようですね。

Time Keeping Explained

  • Time keeping in an instance is deceptively hard
  • gettimeofday(), clock_gettime(), QueryPerformanceCounter()
  • The TSC
    • CPU counter, accessible from userspace
    • Requires calibration, vDSO
    • Invariant on Sandy Bridge+ processors
  • Xen pvclock; does not support vDSO
  • On current generation instances, use TSC as clocksource

改良されてそうですね。RHEL7(kernel 3.10)はこの改良が入ってます。
x86: vdso: pvclock gettime support

author	Marcelo Tosatti <mtosatti@redhat.com>	2012-11-28 01:28:57 (GMT)
committer	Marcelo Tosatti <mtosatti@redhat.com>	2012-11-28 01:29:11 (GMT)
commit	51c19b4f5927f5a646e93d69f73c7e89ea14e737 (patch)
tree	a48486f72dc433fd516684d4441022650429d333 /arch/x86/vdso/vclock_gettime.c
parent	3dc4f7cfb7441e5e0fed3a02fc81cdaabd28300a (diff)
x86: vdso: pvclock gettime support
Improve performance of time system calls when using Linux pvclock,
by reading time info from fixmap visible copy of pvclock data.

Originally from Jeremy Fitzhardinge.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

...

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..205ad32 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);