iostat の見方 - ablog
Linux の iostat の出力結果を銀行のATMに例えて説明してみる - ablog
の続編。Linux の iostat が %util をどう算出しているか調べてみた(途中)。
環境
History for iostat.c - sysstat/sysstat · GitHub を見てると、iostat は結構更新されてるので契約したてのさくらのVPSに入っているバージョンで調べてみることにした。
$ cat /etc/issue
CentOS release 6.4 (Final)
Kernel \r on an \m

$ uname -r
2.6.32-358.18.1.el6.x86_64
$ rpm -q sysstat
sysstat-9.0.4-20.el6.x86_64
$ iostat -V
sysstat version 9.0.4
(C) Sebastien Godard (sysstat <at> orange.fr)
マニュアル
man には以下の通り書かれている。
$ man iostat
...
%util
    Percentage of CPU time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100%.
ソースコード
入手
- sysstat version 9.0.4 のソースコードをダウンロードする。
- sysstat-9.0.4.tar.gz を解凍する。
iostat
- iostat.c#write_ext_stat
/* *************************************************************************** * Display extended stats, read from /proc/{diskstats,partitions} or /sys. * * IN: * @curr Index in array for current sample statistics. * @itv Interval of time. * @fctr Conversion factor. * @shi Structures describing the devices and partitions. * @ioi Current sample statistics. * @ioj Previous sample statistics. *************************************************************************** */ void write_ext_stat(int curr, unsigned long long itv, int fctr, struct io_hdr_stats *shi, struct io_stats *ioi, struct io_stats *ioj) { struct stats_disk sdc, sdp; struct ext_disk_stats xds; /* * Counters overflows are possible, but don't need to be handled in * a special way: the difference is still properly calculated if the * result is of the same type as the two values. * Exception is field rq_ticks which is incremented by the number of * I/O in progress times the number of milliseconds spent doing I/O. * But the number of I/O in progress (field ios_pgr) happens to be * sometimes negative... 
*/ sdc.nr_ios = ioi->rd_ios + ioi->wr_ios; sdp.nr_ios = ioj->rd_ios + ioj->wr_ios; sdc.tot_ticks = ioi->tot_ticks; sdp.tot_ticks = ioj->tot_ticks; sdc.rd_ticks = ioi->rd_ticks; sdp.rd_ticks = ioj->rd_ticks; sdc.wr_ticks = ioi->wr_ticks; sdp.wr_ticks = ioj->wr_ticks; sdc.rd_sect = ioi->rd_sectors; sdp.rd_sect = ioj->rd_sectors; sdc.wr_sect = ioi->wr_sectors; sdp.wr_sect = ioj->wr_sectors; compute_ext_disk_stats(&sdc, &sdp, itv, &xds); ★ /* DEV rrq/s wrq/s r/s w/s rsec wsec rqsz qusz await svctm %util */ printf("%-13s %8.2f %8.2f %7.2f %7.2f %8.2f %8.2f %8.2f %8.2f %7.2f %6.2f %6.2f\n", shi->name, S_VALUE(ioj->rd_merges, ioi->rd_merges, itv), S_VALUE(ioj->wr_merges, ioi->wr_merges, itv), S_VALUE(ioj->rd_ios, ioi->rd_ios, itv), S_VALUE(ioj->wr_ios, ioi->wr_ios, itv), ll_s_value(ioj->rd_sectors, ioi->rd_sectors, itv) / fctr, ll_s_value(ioj->wr_sectors, ioi->wr_sectors, itv) / fctr, xds.arqsz, S_VALUE(ioj->rq_ticks, ioi->rq_ticks, itv) / 1000.0, xds.await, /* The ticks output is biased to output 1000 ticks per second */ xds.svctm, /* Again: Ticks in milliseconds */ xds.util / 10.0); ★ }
- common.c#compute_ext_disk_stats
/* *************************************************************************** * Compute "extended" device statistics (service time, etc.). * * IN: * @sdc Structure with current device statistics. * @sdp Structure with previous device statistics. * @itv Interval of time in jiffies. * * OUT: * @xds Structure with extended statistics. *************************************************************************** */ void compute_ext_disk_stats(struct stats_disk *sdc, struct stats_disk *sdp, unsigned long long itv, struct ext_disk_stats *xds) { double tput = ((double) (sdc->nr_ios - sdp->nr_ios)) * HZ / itv; xds->util = S_VALUE(sdp->tot_ticks, sdc->tot_ticks, itv); ★ xds->svctm = tput ? xds->util / tput : 0.0; /* * Kernel gives ticks already in milliseconds for all platforms * => no need for further scaling. */ xds->await = (sdc->nr_ios - sdp->nr_ios) ? ((sdc->rd_ticks - sdp->rd_ticks) + (sdc->wr_ticks - sdp->wr_ticks)) / ((double) (sdc->nr_ios - sdp->nr_ios)) : 0.0; xds->arqsz = (sdc->nr_ios - sdp->nr_ios) ? ((sdc->rd_sect - sdp->rd_sect) + (sdc->wr_sect - sdp->wr_sect)) / ((double) (sdc->nr_ios - sdp->nr_ios)) : 0.0; }
- common.h
/* * Macros used to display statistics values. * * NB: Define SP_VALUE() to normalize to %; * HZ is 1024 on IA64 and % should be normalized to 100. */ #define S_VALUE(m,n,p) (((double) ((n) - (m))) / (p) * HZ) ★ #define SP_VALUE(m,n,p) (((double) ((n) - (m))) / (p) * 100)
- iostat.h
/* * Structures for I/O stats. * The number of structures allocated corresponds to the number of devices * present in the system, plus a preallocation number to handle those * that can be registered dynamically. * The number of devices is found by using /sys filesystem (if mounted), * or the number of "disk_io:" entries in /proc/stat (2.4 kernels), * else the default value is 4 (for old kernels, which maintained stats * for the first four devices in /proc/stat). * For each io_stats structure allocated corresponds a io_hdr_stats structure. * A io_stats structure is considered as unused or "free" (containing no stats * for a particular device) if the 'major' field of the io_hdr_stats * structure is set to 0. */ struct io_stats { /* # of sectors read */ unsigned long long rd_sectors __attribute__ ((aligned (8))); /* # of sectors written */ unsigned long long wr_sectors __attribute__ ((packed)); /* # of read operations issued to the device */ unsigned long rd_ios __attribute__ ((packed)); /* # of read requests merged */ unsigned long rd_merges __attribute__ ((packed)); /* Time of read requests in queue */ unsigned long rd_ticks __attribute__ ((packed)); /* # of write operations issued to the device */ unsigned long wr_ios __attribute__ ((packed)); /* # of write requests merged */ unsigned long wr_merges __attribute__ ((packed)); /* Time of write requests in queue */ unsigned long wr_ticks __attribute__ ((packed)); /* # of I/Os in progress */ unsigned long ios_pgr __attribute__ ((packed)); /* # of ticks total (for this device) for I/O */ unsigned long tot_ticks __attribute__ ((packed)); ★ /* # of ticks requests spent in queue */ unsigned long rq_ticks __attribute__ ((packed)); /* # of I/O done since last reboot */ unsigned long dk_drive __attribute__ ((packed)); /* # of blocks read */ unsigned long dk_drive_rblk __attribute__ ((packed)); /* # of blocks written */ unsigned long dk_drive_wblk __attribute__ ((packed)); };
- iostat.c#write_stats
/* *************************************************************************** * Print everything now (stats and uptime). * * IN: * @curr Index in array for current sample statistics. * @rectime Current date and time. *************************************************************************** */ void write_stats(int curr, struct tm *rectime) { int dev, i, fctr = 1; unsigned long long itv; struct io_hdr_stats *shi; struct io_dlist *st_dev_list_i; /* Test stdout */ TEST_STDOUT(STDOUT_FILENO); /* Print time stamp */ if (DISPLAY_TIMESTAMP(flags)) { if (DISPLAY_ISO(flags)) { strftime(timestamp, sizeof(timestamp), "%FT%T%z", rectime); } else { strftime(timestamp, sizeof(timestamp), "%x %X", rectime); } printf("%s\n", timestamp); } /* Interval is multiplied by the number of processors */ itv = get_interval(uptime[!curr], uptime[curr]); if (DISPLAY_CPU(flags)) { /* Display CPU utilization */ write_cpu_stat(curr, itv); } if (cpu_nr > 1) { /* On SMP machines, reduce itv to one processor (see note above) */ itv = get_interval(uptime0[!curr], uptime0[curr]); } if (DISPLAY_DISK(flags)) { struct io_stats *ioi, *ioj; ★ shi = st_hdr_iodev; /* Display disk stats header */ write_disk_stat_header(&fctr); if (DISPLAY_EXTENDED(flags) && (HAS_OLD_KERNEL(flags) || HAS_PLAIN_KERNEL24(flags))) { /* No extended stats with old 2.2-2.4 kernels */ printf("\n"); return; } for (i = 0; i < iodev_nr; i++, shi++) { if (shi->used) { if (dlist_idx && !HAS_SYSFS(flags)) { /* * With sysfs, only stats for the requested * devices are read. * With /proc/{diskstats,partitions}, stats for * every device are read. Thus we need to check * if stats for current device are to be displayed. 
*/ for (dev = 0; dev < dlist_idx; dev++) { st_dev_list_i = st_dev_list + dev; if (!strcmp(shi->name, st_dev_list_i->dev_name)) break; } if (dev == dlist_idx) /* Device not found in list: Don't display it */ continue; } ioi = st_iodev[curr] + i; ★ ioj = st_iodev[!curr] + i; ★ if (!DISPLAY_UNFILTERED(flags)) { if (HAS_OLD_KERNEL(flags) || HAS_PLAIN_KERNEL24(flags)) { if (!ioi->dk_drive) continue; } else { if (!ioi->rd_ios && !ioi->wr_ios) continue; } } if (DISPLAY_ZERO_OMIT(flags)) { if (HAS_OLD_KERNEL(flags) || HAS_PLAIN_KERNEL24(flags)) { if (ioi->dk_drive == ioj->dk_drive) /* No activity: Ignore it */ continue; } else { if ((ioi->rd_ios == ioj->rd_ios) && (ioi->wr_ios == ioj->wr_ios)) /* No activity: Ignore it */ continue; } } if (DISPLAY_EXTENDED(flags)) { write_ext_stat(curr, itv, fctr, shi, ioi, ioj); ★ } else { write_basic_stat(curr, itv, fctr, shi, ioi, ioj); } } } printf("\n"); } if (DISPLAY_NFS(flags)) { struct io_nfs_stats *ioni, *ionj; shi = st_hdr_ionfs; /* Display NFS stats header */ write_nfs_stat_header(&fctr); if (!HAS_NFS(flags)) { /* No NFS stats */ printf("\n"); return; } for (i = 0; i < ionfs_nr; i++, shi++) { if (shi->used) { ioni = st_ionfs[curr] + i; ionj = st_ionfs[!curr] + i; write_nfs_stat(curr, itv, fctr, shi, ioni, ionj); } } printf("\n"); } }
- common.h
/* Files */ #define STAT "/proc/stat" #define UPTIME "/proc/uptime" #define PPARTITIONS "/proc/partitions" #define DISKSTATS "/proc/diskstats" #define INTERRUPTS "/proc/interrupts" #define MEMINFO "/proc/meminfo" #define SYSFS_BLOCK "/sys/block" ★ #define SYSFS_DEVCPU "/sys/devices/system/cpu" #define NFSMOUNTSTATS "/proc/self/mountstats" #define S_STAT "stat" ★ #define DEVMAP_DIR "/dev/mapper"
Linuxカーネル
- include/linux/genhd.h
struct disk_stats { unsigned long sectors[2]; /* READs and WRITEs */ unsigned long ios[2]; unsigned long merges[2]; unsigned long ticks[2]; unsigned long io_ticks; unsigned long time_in_queue; };
- block/blk-core.c#part_round_stats_single
static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { if (now == part->stamp) return; if (part_in_flight(part)) { __part_stat_add(cpu, part, time_in_queue, part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; }
参考
Ticklessカーネルとクロックソースに関するお話 - めもめも
- 一定の時間間隔(1000Hz)でタイマ割り込みが入る。
- タイマ割り込みごとにjiffies変数を1増やす(つまり、jiffiesは、システム起動後にタイマ割り込みが入った回数を表す。)
- jiffiesの増加分に合わせて、システム時刻(変数xtime)をアップデートする。
システムのタイマー割り込み間隔は、1 秒間に発生するタイマー割り込み回数として HZ というマクロで定義されています。HZ は asm/param.h で定義されています。 この値はプラットフォームによって異なるので、特定値であるとは思わない様にして下さい。 なお、システムが起動してからの総タイマー割り込み回数は jiffies というグローバル変数に格納されています。jiffies は linux/sched.h 内で宣言されています。
http://homepage3.nifty.com/rio_i/lab/driver24/003timer.html
※3 つい最近まで1 tick = 10msecでしたが、最近はCPUやメモリが急速に速くなってきているためデフォルトがこれよりも短くなっているようです。簡単なプログラム書いて試してみましたが、kernel 2.6.19 CentOS5では4msecになっているようです。一方、kernel 2.6.18 x86_64 RHEL5(64bit)では2msecでした
Linuxの時間関数について | 日々雑感、覚書
大抵のOSはtick単位でタイマー割り込みを発生させ、この割り込みのタイミングで時間に関する処理を実行する。Linuxでは、このタイマー割り込みごとにjiffiesが加算される。多くのシステムでは、その間隔は10ミリ秒であり、秒間100回の割り込みがあるのでHZ=100として定義されている。
tick単位 ‐ 通信用語の基礎知識
Block layer statistics in /sys/block/<dev>/stat
===============================================

This file documents the contents of the /sys/block/<dev>/stat file.

The stat file provides several statistics about the state of block device <dev>.

Q. Why are there multiple statistics in a single file? Doesn't sysfs normally contain a single value per file?
A. By having a single file, the kernel can guarantee that the statistics represent a consistent snapshot of the state of the device. If the statistics were exported as multiple files containing one statistic each, it would be impossible to guarantee that a set of readings represent a single point in time.

The stat file consists of a single line of text containing 11 decimal values separated by whitespace. The fields are summarized in the following table, and described in more detail below.

Name            units         description
----            -----         -----------
read I/Os       requests      number of read I/Os processed
read merges     requests      number of read I/Os merged with in-queue I/O
read sectors    sectors       number of sectors read
read ticks      milliseconds  total wait time for read requests
write I/Os      requests      number of write I/Os processed
write merges    requests      number of write I/Os merged with in-queue I/O
write sectors   sectors       number of sectors written
write ticks     milliseconds  total wait time for write requests
in_flight       requests      number of I/Os currently in flight
io_ticks        milliseconds  total time this block device has been active
time_in_queue   milliseconds  total wait time for all requests

...

time_in_queue
=============

This value counts the number of milliseconds that I/O requests have waited on this block device. If there are multiple I/O requests waiting, this value will increase as the product of the number of milliseconds times the number of requests waiting (see "read ticks" above for an example).