ablog

不器用で落着きのない技術者のメモ

When graphing CPU usage as a stacked area chart, which fields add up to the total?


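Answer: user + nice + sys + irq + softirq + steal (do not add guest or guest_nice on top: the kernel already counts guest time in user and nice, as the code below shows)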

  • User space: user + nice
  • Kernel space: sys + irq + softirq
  • Idle: idle + iowait
  • Steal: Stolen by other guest VMs or Hypervisor.

The field itself means the time the VM CPU has to wait for other VMs (virtual machines) finishing their turn (slice), or for a task of the hypervisor itself. ... It’s the time the hypervisor scheduled something else to run instead of something within your VM. This might be time for another VM, or for the Hypervisor host itself. If no time were stolen, this time would be used to run your CPU workload or your idle thread.

linux - iostat - What does the 'steal' field mean? - Unix & Linux Stack Exchange
/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 *
 * Guest time is added to cpustat twice: once to the regular user/nice
 * bucket and once to the matching guest bucket.  Anyone summing the
 * cpustat fields must therefore leave GUEST and GUEST_NICE out of the
 * sum, otherwise the guest time is double-counted.
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		/* double-counted: the same cputime lands in NICE and GUEST_NICE */
		cpustat[CPUTIME_NICE] += cputime;
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		/* double-counted: the same cputime lands in USER and GUEST */
		cpustat[CPUTIME_USER] += cputime;
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

Is nice part of user?: Yes

vmstat
  • vmstat reads /proc/stat to get cpu time
$ strace -e open vmstat 1 1
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
open("/lib64/libproc-3.2.8.so", O_RDONLY|O_CLOEXEC) = 3
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
open("/proc/self/auxv", O_RDONLY)       = 3
open("/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 3
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
open("/proc/meminfo", O_RDONLY)         = 3
open("/proc/stat", O_RDONLY)            = 4 <--- vmstat reads /proc/stat to get cpu time
open("/proc/vmstat", O_RDONLY)          = 5
 0  0      0 238889648 134392 765980    0    0    76     4    1    1  0  0 99  0  0	
+++ exited with 0 +++
static void new_format(void)
{

...

	getstat(cpu_use, cpu_nic, cpu_sys, cpu_idl, cpu_iow, cpu_xxx, cpu_yyy,
		cpu_zzz, pgpgin, pgpgout, pswpin, pswpout, intr, ctxt, &running,
		&blocked, &dummy_1, &dummy_2);

...

	duse = *cpu_use + *cpu_nic; <--- user + nice in /proc/stat
	dsys = *cpu_sys + *cpu_xxx + *cpu_yyy; <--- system + irq + softirq in /proc/stat
	didl = *cpu_idl;
	diow = *cpu_iow;
	dstl = *cpu_zzz;
void getstat(jiff *restrict cuse, jiff *restrict cice, jiff *restrict csys, jiff *restrict cide, jiff *restrict ciow, jiff *restrict cxxx, jiff *restrict cyyy, jiff *restrict czzz,
	     unsigned long *restrict pin, unsigned long *restrict pout, unsigned long *restrict s_in, unsigned long *restrict sout,
	     unsigned *restrict intr, unsigned *restrict ctxt,
	     unsigned int *restrict running, unsigned int *restrict blocked,
	     unsigned int *restrict btime, unsigned int *restrict processes) {
  static int fd;
  unsigned long long llbuf = 0;
  int need_vmstat_file = 0;
  int need_proc_scan = 0;
  const char* b;
  memset(buff, '\0', BUFFSIZE);  /* ensure null termination in buffer */

  if(fd){
    lseek(fd, 0L, SEEK_SET);
  }else{
    fd = open("/proc/stat", O_RDONLY, 0); <--- vmstat reads /proc/stat to get cpu time
    if(fd == -1) crash("/proc/stat");
  }
  read(fd,buff,BUFFSIZE-1);
  *intr = 0;
  *ciow = 0;  /* not separated out until the 2.5.41 kernel */
  *cxxx = 0;  /* not separated out until the 2.6.0-test4 kernel */ <--- irq in /proc/stat
  *cyyy = 0;  /* not separated out until the 2.6.0-test4 kernel */ <--- softirq in /proc/stat
  *czzz = 0;  /* not separated out until the 2.6.11 kernel */ <---steal in /proc/stat

  b = strstr(buff, "cpu ");
  if(b) sscanf(b,  "cpu  %llu %llu %llu %llu %llu %llu %llu %llu", cuse, cice, csys, cide, ciow, cxxx, cyyy, czzz);
/proc/stat
              kernel/system statistics.  Varies with architecture.  Common
              entries include:

              cpu 10132153 290696 3084719 46828483 16683 0 25195 0 175628 0
              cpu0 1393280 32966 572056 13343292 6130 0 17875 0 23933 0
                     The amount of time, measured in units of USER_HZ
                     (1/100ths of a second on most architectures, use
                     sysconf(_SC_CLK_TCK) to obtain the right value), that
                     the system ("cpu" line) or the specific CPU ("cpuN"
                     line) spent in various states:

                     user   (1) Time spent in user mode.

                     nice   (2) Time spent in user mode with low priority
                            (nice).

                     system (3) Time spent in system mode.

                     idle   (4) Time spent in the idle task.  This value
                            should be USER_HZ times the second entry in the
                            /proc/uptime pseudo-file.

                     iowait (since Linux 2.5.41)
                            (5) Time waiting for I/O to complete.  This
                            value is not reliable, for the following rea‐
                            sons:

                            1. The CPU will not wait for I/O to complete;
                               iowait is the time that a task is waiting for
                               I/O to complete.  When a CPU goes into idle
                               state for outstanding task I/O, another task
                               will be scheduled on this CPU.

                            2. On a multi-core CPU, the task waiting for I/O
                               to complete is not running on any CPU, so the
                               iowait of each CPU is difficult to calculate.

                            3. The value in this field may decrease in cer‐
                               tain conditions.

                     irq (since Linux 2.6.0) <--- cxxx in getstat() (procps source file proc/sysinfo.c)
                            (6) Time servicing interrupts.

                     softirq (since Linux 2.6.0) <--- cyyy in getstat() (procps source file proc/sysinfo.c)
                            (7) Time servicing softirqs.

                     steal (since Linux 2.6.11)  <--- czzz in getstat() (procps source file proc/sysinfo.c)
                            (8) Stolen time, which is the time spent in
                            other operating systems when running in a virtu‐
                            alized environment

                     guest (since Linux 2.6.24)
                            (9) Time spent running a virtual CPU for guest
                            operating systems under the control of the Linux
                            kernel.

                     guest_nice (since Linux 2.6.33)
                            (10) Time spent running a niced guest (virtual
                            CPU for guest operating systems under the con‐
                            trol of the Linux kernel).
mpstat
  • mpstat reads /proc/stat to get cpu time
$ strace -e open mpstat -P ALL  1 1
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
open("/proc/interrupts", O_RDONLY)      = 3
open("/etc/localtime", O_RDONLY|O_CLOEXEC) = 3
Linux 4.14.146-93.123.amzn1.x86_64 (ip-172-31-10-8) 	12/21/19 	_x86_64_	(32 CPU)
open("/proc/uptime", O_RDONLY)          = 3
open("/proc/stat", O_RDONLY)            = 3  <--- mpstat reads /proc/stat to get cpu time
--- SIGALRM {si_signo=SIGALRM, si_code=SI_KERNEL, si_value={int=194, ptr=0xc2}} ---
open("/proc/uptime", O_RDONLY)          = 3
open("/proc/stat", O_RDONLY)            = 3

17:49:18     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
17:49:19     all    0.03    0.00    0.03    0.00    0.00    0.00    0.00    0.00   99.94
17:49:19       0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
17:49:19       1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
17:49:19       2    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
int main(int argc, char **argv)

...

	/* Get system name, release number and hostname */
	__uname(&header);
	print_gal_header(&(mp_tstamp[0]), header.sysname, header.release,
			 header.nodename, header.machine, get_cpu_nr(~0, FALSE),
			 DISPLAY_JSON_OUTPUT(flags));

	/* Main loop */
	rw_mpstat_loop(dis_hdr, rows); <--- Read stats and display them.
/*
 ***************************************************************************
 * Main loop: Read stats from the relevant sources, and display them.
 *
 * IN:
 * @dis_hdr	Set to TRUE if the header line must always be printed.
 * @rows	Number of rows of screen.
 ***************************************************************************
 */
void rw_mpstat_loop(int dis_hdr, int rows)
{
	struct stats_cpu *scc;
	int i;
	int curr = 1, dis = 1;
	unsigned long lines = rows;

	/* Dont buffer data if redirected to a pipe */
	setbuf(stdout, NULL);

	/* Read system uptime and CPU stats */
	read_uptime(&(uptime_cs[0]));
	read_stat_cpu(st_cpu[0], cpu_nr + 1); <--- Read cpu stats.

	/*
	 * Calculate global CPU stats as the sum of individual ones.
	 * Done only on SMP machines. On UP machines, we keep the values
	 * read from /proc/stat for global CPU stats.
	 */
	if (cpu_nr > 1) {
		memset(st_cpu[0], 0, STATS_CPU_SIZE);

		for (i = 1; i <= cpu_nr; i++) {
			scc = st_cpu[0] + i;

			st_cpu[0]->cpu_user += scc->cpu_user;
			st_cpu[0]->cpu_nice += scc->cpu_nice;
			st_cpu[0]->cpu_sys += scc->cpu_sys;
			st_cpu[0]->cpu_idle += scc->cpu_idle;
			st_cpu[0]->cpu_iowait += scc->cpu_iowait;
			st_cpu[0]->cpu_hardirq += scc->cpu_hardirq;
			st_cpu[0]->cpu_steal += scc->cpu_steal;
			st_cpu[0]->cpu_softirq += scc->cpu_softirq;
			st_cpu[0]->cpu_guest += scc->cpu_guest;
			st_cpu[0]->cpu_guest_nice += scc->cpu_guest_nice;
		}
	}
/*
 ***************************************************************************
 * Read CPU statistics.
 * Remember that this function is used by several sysstat commands!
 *
 * IN:
 * @st_cpu	Buffer where structures containing stats will be saved.
 * @nr_alloc	Total number of structures allocated. Value is >= 1.
 *
 * OUT:
 * @st_cpu	Buffer with statistics.
 *
 * RETURNS:
 * Highest CPU number(*) for which statistics have been read.
 * 1 means CPU "all", 2 means CPU 0, 3 means CPU 1, etc.
 * Or -1 if the buffer was too small and needs to be reallocated.
 *
 * (*)This doesn't account for all processors in the machine in the case
 * where some CPU are offline and located at the end of the list.
 ***************************************************************************
 */
__nr_t read_stat_cpu(struct stats_cpu *st_cpu, __nr_t nr_alloc)
{
	FILE *fp;
	struct stats_cpu *st_cpu_i;
	struct stats_cpu sc;
	char line[8192];
	int proc_nr;
	__nr_t cpu_read = 0;

	if ((fp = fopen(STAT, "r")) == NULL) { <--- mpstat reads /proc/stat to get cpu time
		fprintf(stderr, _("Cannot open %s: %s\n"), STAT, strerror(errno));
		exit(2);
	}
/*
 * Files
 *
 * Paths of the kernel statistics files.  Each one is the string literal
 * PRE concatenated with the path, so PRE must expand to a string literal.
 */
#define STAT			PRE "/proc/stat"	/* opened by read_stat_cpu() to get cpu time */
#define UPTIME			PRE "/proc/uptime"
#define DISKSTATS		PRE "/proc/diskstats"
#define INTERRUPTS		PRE "/proc/interrupts"
#define MEMINFO			PRE "/proc/meminfo"
/proc/stat
/proc/stat
              kernel/system statistics.  Varies with architecture.  Common
              entries include:

              cpu 10132153 290696 3084719 46828483 16683 0 25195 0 175628 0
              cpu0 1393280 32966 572056 13343292 6130 0 17875 0 23933 0
                     The amount of time, measured in units of USER_HZ
                     (1/100ths of a second on most architectures, use
                     sysconf(_SC_CLK_TCK) to obtain the right value), that
                     the system ("cpu" line) or the specific CPU ("cpuN"
                     line) spent in various states:

                     user   (1) Time spent in user mode.

                     nice   (2) Time spent in user mode with low priority
                            (nice).

                     system (3) Time spent in system mode.

                     idle   (4) Time spent in the idle task.  This value
                            should be USER_HZ times the second entry in the
                            /proc/uptime pseudo-file.

                     iowait (since Linux 2.5.41)
                            (5) Time waiting for I/O to complete.  This
                            value is not reliable, for the following rea‐
                            sons:

                            1. The CPU will not wait for I/O to complete;
                               iowait is the time that a task is waiting for
                               I/O to complete.  When a CPU goes into idle
                               state for outstanding task I/O, another task
                               will be scheduled on this CPU.

                            2. On a multi-core CPU, the task waiting for I/O
                               to complete is not running on any CPU, so the
                               iowait of each CPU is difficult to calculate.

                            3. The value in this field may decrease in cer‐
                               tain conditions.

                     irq (since Linux 2.6.0) 
                            (6) Time servicing interrupts.

                     softirq (since Linux 2.6.0) 
                            (7) Time servicing softirqs.

                     steal (since Linux 2.6.11) 
                            (8) Stolen time, which is the time spent in
                            other operating systems when running in a virtu‐
                            alized environment

                     guest (since Linux 2.6.24)
                            (9) Time spent running a virtual CPU for guest
                            operating systems under the control of the Linux
                            kernel.

                     guest_nice (since Linux 2.6.33)
                            (10) Time spent running a niced guest (virtual
                            CPU for guest operating systems under the con‐
                            trol of the Linux kernel).
/*
 * Write the statistics text into seq_file @p: one aggregate "cpu" line,
 * one "cpuN" line per online CPU, then the "intr", "ctxt", "btime",
 * "processes", "procs_running", "procs_blocked" and "softirq" lines.
 * Presumably this is the seq_file show handler behind /proc/stat; the
 * registration is not visible here.
 * @p: seq_file the output is written to
 * @v: not used by this function
 *
 * Every CPU line carries ten values in this fixed order:
 * user nice system idle iowait irq softirq steal guest guest_nice.
 * Always returns 0.
 */
static int show_stat(struct seq_file *p, void *v)
{
	int i, j;
	u64 user, nice, system, idle, iowait, irq, softirq, steal;
	u64 guest, guest_nice;
	u64 sum = 0;
	u64 sum_softirq = 0;
	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
	struct timespec64 boottime;

	user = nice = system = idle = iowait =
		irq = softirq = steal = 0;
	guest = guest_nice = 0;
	getboottime64(&boottime);

	/*
	 * Build the totals for the aggregate "cpu" line.  Note that this
	 * walks every possible CPU, while the per-CPU lines further down
	 * only cover online CPUs.
	 */
	for_each_possible_cpu(i) {
		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
		/* idle and iowait go through helpers, not cpustat directly */
		idle += get_idle_time(i);
		iowait += get_iowait_time(i);
		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
		/* sum accumulates the interrupt count printed on "intr" */
		sum += kstat_cpu_irqs_sum(i);
		sum += arch_irq_stat_cpu(i);

		/* Per-type softirq counts plus their grand total. */
		for (j = 0; j < NR_SOFTIRQS; j++) {
			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);

			per_softirq_sums[j] += softirq_stat;
			sum_softirq += softirq_stat;
		}
	}
	sum += arch_irq_stat();

	/*
	 * Aggregate line.  Each value is passed through nsec_to_clock_t()
	 * before printing (presumably nanoseconds to USER_HZ ticks, going
	 * by the helper's name).
	 */
	seq_put_decimal_ull(p, "cpu  ", nsec_to_clock_t(user));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
	seq_putc(p, '\n');

	/* One "cpuN" line per online CPU, same field order as above. */
	for_each_online_cpu(i) {
		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
		idle = get_idle_time(i);
		iowait = get_iowait_time(i);
		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
		seq_printf(p, "cpu%d", i);
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
		seq_putc(p, '\n');
	}
	/* "intr" line: the total first, then one value per irq number. */
	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);

	/* sum again ? it could be updated? */
	for_each_irq_nr(j)
		seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));

	/* The leading "\n" in the format terminates the "intr" line. */
	seq_printf(p,
		"\nctxt %llu\n"
		"btime %llu\n"
		"processes %lu\n"
		"procs_running %lu\n"
		"procs_blocked %lu\n",
		nr_context_switches(),
		(unsigned long long)boottime.tv_sec,
		total_forks,
		nr_running(),
		nr_iowait());

	/* "softirq" line: the grand total, then one count per softirq type. */
	seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);

	for (i = 0; i < NR_SOFTIRQS; i++)
		seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
	seq_putc(p, '\n');

	return 0;
}
/*
 * Charge user-mode cpu time to a task.
 * @p: the task the time is charged to
 * @cputime: the user-space cpu time accumulated since the last update
 *
 * Tasks with task_nice() > 0 are booked under CPUTIME_NICE, all others
 * under CPUTIME_USER.
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int field = CPUTIME_USER;

	/* Per-task and per-thread-group user time. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	/* Pick the cpustat bucket from the task's nice value. */
	if (task_nice(p) > 0)
		field = CPUTIME_NICE;

	task_group_account_field(p, field, cputime);

	/* Finally let the accounting hook see the updated user time. */
	acct_account_cputime(p);
}

...

/*
 * Charge guest (virtual machine) cpu time to a task.
 * @p: the task the time is charged to
 * @cputime: the cpu time spent running the guest since the last update
 *
 * The time is booked both as user time (utime) and as guest time (gtime)
 * of the task, and into two cpustat buckets: the regular user/nice one
 * and the matching guest one.
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	int host_field;
	int guest_field;

	/* Per-task and per-thread-group bookkeeping. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Niced tasks use the NICE pair, everything else the USER pair. */
	if (task_nice(p) > 0) {
		host_field = CPUTIME_NICE;
		guest_field = CPUTIME_GUEST_NICE;
	} else {
		host_field = CPUTIME_USER;
		guest_field = CPUTIME_GUEST;
	}

	/* The same cputime goes into both buckets. */
	cpustat[host_field] += cputime;
	cpustat[guest_field] += cputime;
}

...

/*
 * Charge kernel-mode cpu time to a task.
 * @p: the task the time is charged to
 * @hardirq_offset: value subtracted from hardirq_count() / irq_count()
 * @cputime: the kernel-space cpu time accumulated since the last update
 *
 * Time of a PF_VCPU task outside of interrupt context is handed to
 * account_guest_time().  Everything else is booked as CPUTIME_IRQ,
 * CPUTIME_SOFTIRQ or CPUTIME_SYSTEM, checked in that order.
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int bucket;

	if (!(p->flags & PF_VCPU) || (irq_count() - hardirq_offset != 0)) {
		/* Not guest time: classify by the current context. */
		if (hardirq_count() - hardirq_offset)
			bucket = CPUTIME_IRQ;
		else if (in_serving_softirq())
			bucket = CPUTIME_SOFTIRQ;
		else
			bucket = CPUTIME_SYSTEM;

		account_system_index_time(p, cputime, bucket);
		return;
	}

	/* A vCPU task outside interrupt context: this is guest time. */
	account_guest_time(p, cputime);
}

/*
 * Book idle cpu time on the current CPU.
 * @cputime: the cpu time spent idle since the last update
 *
 * The time counts as CPUTIME_IOWAIT while the run queue's nr_iowait
 * counter is above zero, and as CPUTIME_IDLE otherwise.
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();
	int field = (atomic_read(&rq->nr_iowait) > 0) ? CPUTIME_IOWAIT
						      : CPUTIME_IDLE;

	cpustat[field] += cputime;
}

Is irq part of sys?: Yes

/*
 * Charge kernel-mode cpu time to a task.
 * @p: the task the time is charged to
 * @hardirq_offset: value subtracted from hardirq_count() / irq_count()
 * @cputime: the kernel-space cpu time accumulated since the last update
 *
 * Time of a PF_VCPU task outside of interrupt context is handed to
 * account_guest_time().  Everything else is booked as CPUTIME_IRQ,
 * CPUTIME_SOFTIRQ or CPUTIME_SYSTEM, checked in that order.
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int bucket;

	if (!(p->flags & PF_VCPU) || (irq_count() - hardirq_offset != 0)) {
		/* Not guest time: classify by the current context. */
		if (hardirq_count() - hardirq_offset)
			bucket = CPUTIME_IRQ;
		else if (in_serving_softirq())
			bucket = CPUTIME_SOFTIRQ;
		else
			bucket = CPUTIME_SYSTEM;

		account_system_index_time(p, cputime, bucket);
		return;
	}

	/* A vCPU task outside interrupt context: this is guest time. */
	account_guest_time(p, cputime);
}