/*
 * Linux proc/<pid>/{stat,statm,status,...} Clusters
 *
 * Copyright (c) 2013-2020 Red Hat.
 * Copyright (c) 2000,2004,2006 Silicon Graphics, Inc.  All Rights Reserved.
 * Copyright (c) 2010 Aconex.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 */

#include "pmapi.h"
#include "libpcp.h"
#include "pmda.h"
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <pwd.h>
#include <grp.h>
#include "proc_pid.h"
#include "proc_runq.h"
#include "indom.h"
#include "cgroups.h"
#include "hotproc.h"

/*
 * Pid list filled by refresh_proc_pid() for the regular proc metrics;
 * proc_open()/proc_opendir() consult its "threads" flag.
 */
static proc_pid_list_t procpids; /* previous pids list that the proc pmda uses */
static void refresh_proc_pidlist(proc_pid_t *, proc_pid_list_t *);


/* Hotproc variables */

/*
 * PIDs that we are keeping track of as POSSIBLE "hot" candidates.
 * A separate list is needed since it is generated by the timer update.
 */
static proc_pid_list_t hotpids;
/* Hold a pointer to this since we need it for the timer */
static proc_pid_t *hotproc_poss_pid;

/* Initial capacity (in entries) of the hot_active_list and hotproc_list arrays */
#define INIT_HOTPROC_MAX 200

/*
 * Actual processes that are hot based on the current configuration.
 * Filled in by hotproc_eval_procs() via add_hot_active_list().
 */
static pid_t *hot_active_list;

/* number of entries used in hot_active_list, and its allocated capacity */
static int hot_numactive;
static int hot_maxactive = INIT_HOTPROC_MAX;

/* array size allocated (one per hotproc_list slot) */
static int hot_maxprocs[2] = {INIT_HOTPROC_MAX, INIT_HOTPROC_MAX};

/* number of procs used in list (<= hot_maxprocs) */
static int hot_numprocs[2] = {0, 0};

/*
 * Current and previous list of processes that we are considering for
 * "hot" inclusion.  Updated by the timer callback; keeps the raw stats
 * that are used to decide whether a process is hot.
 */
static process_t *hotproc_list[2] = {NULL, NULL};

/* various cpu time totals, reported through get_hot_totals() */
static int num_cpus;
static int hot_have_totals;
static double hot_total_transient;
static double hot_total_cpuidle;
static double hot_total_active;
static double hot_total_inactive;

/* number of completed hotproc_eval_procs() passes */
static unsigned long hot_refresh_count;

/* index into hotproc_list etc.. (the two are swapped on every evaluation) */
static int current;
static int previous = 1;

/* interval handed to __pmAFregister(), and the timer id it returned (-1: none yet) */
struct timeval   hotproc_update_interval;
int     hotproc_timer_id = -1;

/*
 * Hand back the CPU-time totals computed by the most recent hotproc
 * evaluation.
 *
 * ta  - receives the active total
 * ti  - receives the inactive total
 * tt  - receives the transient total
 * tci - receives the cpu idle total
 *
 * Returns 1 when totals are available (all four outputs are written),
 * otherwise 0 and the outputs are left untouched.
 */
int
get_hot_totals(double * ta, double * ti, double * tt, double * tci )
{
    if (!hot_have_totals)
	return 0;

    *ta = hot_total_active;
    *ti = hot_total_inactive;
    *tt = hot_total_transient;
    *tci = hot_total_cpuidle;
    return 1;
}

/*
 * qsort(3) comparator ordering int pids ascending.
 *
 * Returns a negative, zero or positive value when *pa is less than,
 * equal to or greater than *pb.  The result is built from comparisons
 * rather than a subtraction so it cannot overflow for any pair of ints.
 */
static int
compare_pid(const void *pa, const void *pb)
{
    const int a = *(const int *)pa;
    const int b = *(const int *)pb;

    return (a > b) - (a < b);
}

/*
 * Append one pid to a pid list, growing the array by 64 entries
 * whenever it is full.
 *
 * If the array cannot be grown the list is emptied (as before), but the
 * old array is now released instead of being leaked, and the recorded
 * size is only changed once the new allocation is known to exist.
 */
static void
pidlist_append_pid(int pid, proc_pid_list_t *pids)
{
    if (pids->count >= pids->size) {
	int *grown = (int *)realloc(pids->pids, (pids->size + 64) * sizeof(int));

	if (grown == NULL) {
	    perror("pidlist_append: out of memory");
	    free(pids->pids);
	    pids->pids = NULL;
	    pids->size = pids->count = 0;
	    return;	/* soldier on bravely */
	}
	pids->pids = grown;
	pids->size += 64;
    }
    pids->pids[pids->count++] = pid;
}

/*
 * Convert a decimal pid string (a /proc directory entry name) with
 * atoi() and append the resulting pid to the list.
 */
static void
pidlist_append(const char *pidname, proc_pid_list_t *pids)
{
    int pid = atoi(pidname);

    pidlist_append_pid(pid, pids);
}

/*
 * Append every task id found under <statspath>/proc/<pid>/task to the
 * pid list, except the entry that matches the process id itself (the
 * caller has already added it).
 *
 * Failure to open the task directory is not an error; it is only
 * reported when desperate libpmda debugging is enabled.
 */
static void
tasklist_append(const char *pid, proc_pid_list_t *pids)
{
    DIR *taskdirp;
    struct dirent *tdp;
    char taskpath[1024];

    pmsprintf(taskpath, sizeof(taskpath), "%s/proc/%s/task", proc_statspath, pid);
    if ((taskdirp = opendir(taskpath)) == NULL) {
	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char ebuf[1024];
	    fprintf(stderr, "tasklist_append: opendir(\"%s\") failed: %s\n", taskpath, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
	}
	return;
    }

    while ((tdp = readdir(taskdirp)) != NULL) {
	/* <ctype.h> functions need an unsigned char value, never a negative char */
	if (!isdigit((unsigned char)tdp->d_name[0]))
	    continue;
	if (strcmp(pid, tdp->d_name) == 0)
	    continue;
	pidlist_append(tdp->d_name, pids);
    }
    closedir(taskdirp);
}

/*
 * Rebuild the pid list from the membership file of one cgroup.
 *
 * want_threads - non-zero to list threads rather than processes
 * runq_stats   - optional run queue accounting, updated per pid
 * pids         - list to refill (count is reset first)
 * cgroup       - cgroup path, appended to proc_statspath
 *
 * Always returns 0; a membership file that cannot be opened simply
 * leaves the list empty.
 */
static int
refresh_cgroup_pidlist(int want_threads, proc_runq_t *runq_stats, proc_pid_list_t *pids, const char *cgroup)
{
    char path[MAXPATHLEN];
    FILE *fp;
    int pid;

    pids->count = 0;
    pids->threads = want_threads;

    /*
     * We're running in cgroups mode where a subset of the processes is
     * going to be returned based on the cgroup specified earlier via a
     * store into the proc.control.{all,perclient}.cgroups metric.
     *
     * Use the "cgroup.procs" file, or when threads are wanted the
     * "tasks" (cgroups v1) or "cgroup.threads" (cgroups v2) file.
     * Note that these files are already sorted, ascending numeric.
     */
    if (cgroup_version == 0)
	refresh_cgroup_filesys();

    if (!want_threads)
	pmsprintf(path, sizeof(path), "%s%s/container/cgroup.procs", proc_statspath, cgroup);
    else if (cgroup_version == 1)
	pmsprintf(path, sizeof(path), "%s%s/tasks", proc_statspath, cgroup);
    else if (cgroup_version > 1)
	pmsprintf(path, sizeof(path), "%s%s/container/cgroup.threads", proc_statspath, cgroup);
    else
	pmsprintf(path, sizeof(path), "%s%s/container/cgroup.procs", proc_statspath, cgroup);

    fp = fopen(path, "r");
    if (fp == NULL) {
	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char ebuf[1024];
	    fprintf(stderr, "refresh_cgroup_pidlist: fopen(\"%s\", \"r\") failed: %s\n", path, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
	}
	return 0;
    }

    while (fscanf(fp, "%d\n", &pid) == 1) {
	pidlist_append_pid(pid, pids);
	if (runq_stats)
	    proc_runq_append_pid(pid, runq_stats);
    }
    fclose(fp);
    return 0;
}

/*
 * Rebuild the pid list from a scan of <statspath>/proc.
 *
 * want_threads - non-zero to also append the tasks of every process
 * runq_stats   - optional run queue accounting, updated per process
 * pids         - list to refill (count is reset first); sorted
 *                ascending on return
 *
 * Returns 0 on success, or the negated error code from opendir().
 */
static int
refresh_global_pidlist(int want_threads, proc_runq_t *runq_stats, proc_pid_list_t *pids)
{
    DIR *dirp;
    struct dirent *dp;
    char path[MAXPATHLEN];

    pids->count = 0;
    pids->threads = want_threads;

    pmsprintf(path, sizeof(path), "%s/proc", proc_statspath);
    if ((dirp = opendir(path)) == NULL) {
	/* capture the error now: the diagnostic below may overwrite errno */
	int sts = -oserror();

	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char ebuf[1024];
	    fprintf(stderr, "refresh_global_pidlist: opendir(\"%s\") failed: %s\n", path, pmErrStr_r(sts, ebuf, sizeof(ebuf)));
	}
	return sts;
    }

    /* note: readdir on /proc ignores threads */
    while ((dp = readdir(dirp)) != NULL) {
	/* <ctype.h> functions need an unsigned char value, never a negative char */
	if (!isdigit((unsigned char)dp->d_name[0]))
	    continue;
	pidlist_append(dp->d_name, pids);
	if (want_threads)
	    tasklist_append(dp->d_name, pids);
	if (runq_stats)
	    proc_runq_append(dp->d_name, runq_stats);
    }
    closedir(dirp);

    /* nothing to sort for 0 or 1 entries, and the array may be NULL when empty */
    if (pids->count > 1)
	qsort(pids->pids, pids->count, sizeof(int), compare_pid);
    return 0;
}

/*
 * Linear search of the hot active list.
 * Returns 1 when pid is currently listed as hot, otherwise 0.
 */
static int
in_hot_active_list(pid_t pid)
{
    int idx = 0;

    while (idx < hot_numactive) {
	if (hot_active_list[idx] == pid)
	    return 1;
	idx++;
    }
    return 0;
}

/*
 * Decide whether the process named by the decimal string cpid is on the
 * hot active list.
 *
 * Returns 1 if it is, 0 if it is not or if cpid does not start with a
 * number.  sscanf() returns EOF (not 0) for an empty string, so the
 * result is compared against the one conversion expected; otherwise the
 * pid would be used uninitialised.
 */
static int
check_if_hot(char *cpid)
{
    int mypid;

    if (sscanf(cpid, "%d", &mypid) != 1)
	return 0;
    return in_hot_active_list(mypid);
}

/*
 * Append to pids every process under <statspath>/proc that is on the
 * hot active list, plus its tasks when pids->threads is set, then sort
 * the list ascending.  The caller is expected to have reset pids->count.
 *
 * The directory scanned honours proc_statspath, like the other pid list
 * scanners and like tasklist_append() which is called from here.
 *
 * Returns 0 on success, or the negated error code from opendir().
 */
static int
refresh_hotproc_pidlist(proc_pid_list_t *pids)
{
    DIR *dirp;
    struct dirent *dp;
    char path[MAXPATHLEN];

    pmsprintf(path, sizeof(path), "%s/proc", proc_statspath);
    if ((dirp = opendir(path)) == NULL)
	return -oserror();

    /* note: readdir on /proc ignores threads */
    while ((dp = readdir(dirp)) != NULL) {
	/* <ctype.h> functions need an unsigned char value, never a negative char */
	if (!isdigit((unsigned char)dp->d_name[0]))
	    continue;
	if (!check_if_hot(dp->d_name))
	    continue;
	pidlist_append(dp->d_name, pids);
	if (pids->threads)
	    tasklist_append(dp->d_name, pids);
    }
    closedir(dirp);

    /* nothing to sort for 0 or 1 entries, and the array may be NULL when empty */
    if (pids->count > 1)
	qsort(pids->pids, pids->count, sizeof(int), compare_pid);
    return 0;
}

/*
 * Allocate the hot active list and both hotproc process lists with
 * their initial capacity of INIT_HOTPROC_MAX entries.
 *
 * Returns 0 on success.  If any allocation fails, whatever was
 * allocated is released again, all three pointers are left NULL and a
 * negative error code is returned.
 */
static int
init_hotproc_list(void)
{
    hot_active_list = (pid_t *)malloc(INIT_HOTPROC_MAX * sizeof(pid_t));
    hotproc_list[0] = (process_t *)malloc(INIT_HOTPROC_MAX * sizeof(process_t));
    hotproc_list[1] = (process_t *)malloc(INIT_HOTPROC_MAX * sizeof(process_t));

    if (hotproc_list[0] == NULL || hotproc_list[1] == NULL || hot_active_list == NULL) {
	int sts = -oserror();

	/* never report success for a failed allocation */
	if (sts >= 0)
	    sts = -ENOMEM;

	free(hot_active_list);
	free(hotproc_list[0]);
	free(hotproc_list[1]);
	hot_active_list = NULL;
	hotproc_list[0] = hotproc_list[1] = NULL;
	return sts;
    }
    return 0;
}

/*
 * Empty the hot active list.  Only the used-entry count is reset; the
 * array itself stays allocated for reuse.
 */
static void
init_hot_active_list(void)
{
    hot_numactive = 0;
}

/*
 * add_hot_active_list:
 * Evaluate the hotproc predicate tree against vars and, when it
 * matches, record node->pid in the hot active list.
 *
 * - If unsuccessful in add - due to memory then return neg status.
 * - If member of active list return 1
 * - If non-member of active list return 0
 *
 * The recorded capacity is only updated after the array has really
 * been grown, so a failed realloc() cannot leave hot_maxactive larger
 * than the allocation (which would let a later call write past it).
 */
static int
add_hot_active_list(process_t *node, config_vars *vars)
{
    if (eval_tree(vars) == 0)
	return 0;

    if (hot_active_list == NULL || hot_numactive >= hot_maxactive) {
	int newmax = hot_maxactive;
	pid_t *res;

	/* grow only when full; a missing array is allocated at current capacity */
	if (hot_numactive >= newmax)
	    newmax = (newmax > 0) ? newmax * 2 : INIT_HOTPROC_MAX;

	res = (pid_t *)realloc(hot_active_list, (size_t)newmax * sizeof(pid_t));
	if (res == NULL)
	    return -1;
	hot_active_list = res;
	hot_maxactive = newmax;
    }
    hot_active_list[hot_numactive++] = node->pid;
    return 1;
}

static int
compare_pids(const void *n1, const void *n2)
{
    return ((process_t*)n2)->pid - ((process_t*)n1)->pid;
}

/*
 * Binary-search one of the two hotproc lists for pid.
 *
 * curr_prev - index of the list to search (current or previous)
 * pid       - process id to look for
 *
 * Returns the matching entry, or NULL if the list is empty or has no
 * entry for pid.  Relies on the list being ordered by compare_pids().
 */
static process_t *
lookup_node(int curr_prev, pid_t pid)
{
    process_t key;

    if (hot_numprocs[curr_prev] <= 0)
	return NULL;

    key.pid = pid;
    return bsearch(&key, hotproc_list[curr_prev], hot_numprocs[curr_prev],
		   sizeof(process_t), compare_pids);
}

/*
 * Look up pid in the current hotproc list.
 * Returns the entry, or NULL if there is none (see lookup_node()).
 */
static process_t *
lookup_curr_node(pid_t pid)
{
    return lookup_node(current, pid);
}

/*
 * Difference between two samples of a counter, with wrap compensation.
 *
 * current, previous - the newer and older sample values
 * pmtype            - PM_TYPE_* of the underlying counter
 *
 * A negative difference is taken to mean the counter wrapped: for the
 * 32-bit types UINT_MAX+1 is added, for the 64-bit types
 * ULONGLONG_MAX+1.  Other types are returned uncorrected.
 */
static double
diff_counter(double current, double previous, int pmtype)
{
    double      delta = current - previous;

    if (!(delta < 0.0))
	return delta;

    if (pmtype == PM_TYPE_32 || pmtype == PM_TYPE_U32)
	delta += (double)UINT_MAX+1;
    else if (pmtype == PM_TYPE_64 || pmtype == PM_TYPE_U64)
	delta += (double)ULONGLONG_MAX+1;

    return delta;
}

/*
 * Fetch the current hotproc entry for a process that is on the hot
 * active list.
 *
 * pid     - process id of interest
 * getnode - receives the entry, or NULL when pid is not hot or has no
 *           entry in the current list
 *
 * Returns 1 when an entry was found, otherwise 0.
 */
int
get_hotproc_node(pid_t pid, process_t **getnode)
{
    process_t *found = NULL;

    if (in_hot_active_list(pid))
	found = lookup_curr_node(pid);

    *getnode = found;
    return (found != NULL);
}

/*
 * Read the aggregate idle time (fourth value on the "cpu" line) from
 * <statspath>/proc/stat.  The idea of this is copied from
 * linux/proc_stat.c
 *
 * Returns the idle tick count, or 0 when the file cannot be opened or
 * parsed.  The return type is unsigned, so a negated error code must
 * not be returned here: it would read as an enormous idle time.
 */
static unsigned long long
get_idle_time(void)
{
    FILE *fp;
    unsigned long long idle_time = 0;
    char buf[MAXPATHLEN];

    pmsprintf(buf, sizeof(buf), "%s/proc/stat", proc_statspath);
    if ((fp = fopen(buf, "r")) == NULL)
	return 0;

    if (fscanf(fp, "cpu %*u %*u %*u %llu %*u %*u %*u %*u %*u", &idle_time) != 1)
	idle_time = 0;
    fclose(fp);

    return idle_time;
}

/*
 * For each pid, compute stats and store in hotpid array
 * (called by the timer)
 */
static int
hotproc_eval_procs(void)
{
    pid_t pid;
    struct timeval ts;
    int sts;
    char                *f;
    unsigned long       ul;
    unsigned long long  ull;
    char                *tail;
    process_t *oldnode = NULL;      
    process_t *newnode = NULL;      
    int np = 0;                    
    struct timeval p_timestamp = {0};   
    config_vars vars;
    proc_pid_entry_t    *statentry;
    proc_pid_entry_t    *statusentry;
    proc_pid_entry_t    *ioentry;
    proc_pid_entry_t    *schedstatentry;
    __pmHashNode *node;
    int i;

    /* Still need to compute some of these */
    static double refresh_time[2];  /* timestamp after refresh */
    static time_t sysidle[2];       /* sys idle from /proc/stat */
    double sysidle_delta;           /* system idle delta time since last refresh */
    double actual_delta;            /* actual delta time since last refresh */
    double transient_delta;         /* calculated delta time of transient procs */
    double cputime_delta;           /* delta cpu time for a process */
    //double syscalls_delta;          /* delta num of syscalls for a process */
    double vctx_delta;              /* delta num of vol ctx switches for a process */
    double ictx_delta;              /* delta num of invol ctx switches for a process */
    double bread_delta;             /* delta num of bytes read */
    double bwrit_delta;             /* delta num of bytes written */
    double bwtime_delta;            /* delta num of microsec for waiting for blocked io */
    double qwtime_delta;            /* delta num of nanosec waiting on run queue */
    double timestamp_delta;         /* real time delta b/w refreshes for process */
    double total_cputime = 0;       /* total of cputime_deltas for each process */
    double total_activetime = 0;    /* total of cputime_deltas for active processes */
    double total_inactivetime = 0;  /* total of cputime_deltas for inactive processes */

    if (num_cpus == 0) {
	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    }

    if (current == 0) {
        current = 1; previous = 0;
    }
    else {
        current = 0; previous = 1;
    }

    init_hot_active_list();

    memset(&vars, 0, sizeof(config_vars));

    hotpids.count = 0;
    hotpids.threads = 0;

    /* Whats running right now */
    refresh_global_pidlist(0, NULL, &hotpids);
    refresh_proc_pidlist(hotproc_poss_pid, &hotpids);

    for (i=0; i < hotpids.count; i++) {

	pid = hotpids.pids[i];

        node = __pmHashSearch(hotpids.pids[i], &hotproc_poss_pid->pidhash);

	if (node == NULL) {
	    fprintf(stderr,"hotproc : Hash search failed for Proc %d!\n", i);
	    continue;
	}

	pmtimevalNow(&p_timestamp);

	/* Collect all the stat/status/statm info */
	statentry = fetch_proc_pid_stat(pid, hotproc_poss_pid, &sts);
	statusentry = fetch_proc_pid_status(pid, hotproc_poss_pid, &sts);
	ioentry = fetch_proc_pid_io(pid, hotproc_poss_pid, &sts);
	schedstatentry = fetch_proc_pid_schedstat(pid, hotproc_poss_pid, &sts);

        /* Note: /proc/pid/schedstat and /proc/pid/io not on all platforms */
	if (!statentry || !statusentry /*|| !ioentry || !schedstatentry */) {
	    /* Can happen if the process was exiting during
	     * refresh_proc_pidlist then the above fetch's will fail.
	     * Would be best if they were not in the list at all
	     * "np" is used from now on, so hopefully can just continue
	     */
	    continue;
	}

	if (np == hot_maxprocs[current]) {
	    process_t *res;
	    hot_maxprocs[current] = np*2;
	    res = (process_t *)realloc(hotproc_list[current],
		    hot_maxprocs[current] * sizeof(process_t));
	    if (res == NULL)
		return -oserror();
	    hotproc_list[current] = res;
	}

	newnode = &hotproc_list[current][np++];
        newnode->pid = pid;

	/* Calc the stats we will need */
	/* CPU Time is sum of U & S time */
	
	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_UTIME)) == NULL)
	    newnode->r_cputime = 0;
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	    newnode->r_cputime      = (double)ul / (double)hz;
	}

	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_STIME)) == NULL){
	    /* Nothing */
	}
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	    newnode->r_cputime      += (double)ul / (double)hz;
	}

	newnode->r_cputimestamp = p_timestamp.tv_sec + p_timestamp.tv_usec / 1000000;

	/* Context Switches : vol and invol */

        if ((f = _pm_getfield(statusentry->status_lines.vctxsw, 1)) == NULL)
	    newnode->r_vctx = 0;
	else
	    newnode->r_vctx = (__uint32_t)strtoul(f, &tail, 0);

	if ((f = _pm_getfield(statusentry->status_lines.nvctxsw, 1)) == NULL)
	    newnode->r_ictx = 0;
	else
	    newnode->r_ictx = (__uint32_t)strtoul(f, &tail, 0);

	/* IO demand */
	/* Read */
	
        if( !ioentry ) /* ioentry is not enabled on all kernels */
            ull = 0;
	else if ((f = _pm_getfield(ioentry->io_lines.readb, 1)) == NULL)
	    ull = 0;
	else
	    ull = (__uint64_t)strtoull(f, &tail, 0);
		
	newnode->r_bread = ull;

	/* Write */

        if( !ioentry ) /* ioentry is not enabled on all kernels */
            ull = 0;
	else if ((f = _pm_getfield(ioentry->io_lines.writeb, 1)) == NULL)
	    ull = 0;
	else
	    ull = (__uint64_t)strtoull(f, &tail, 0);
		
	newnode->r_bwrit = ull;
	
	/* Block IO wait (delayacct_blkio_ticks) */

	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_DELAYACCT_BLKIO_TICKS - 3)) == NULL)  /* Note the offset */
	    ul = 0;
	else
	    ul = (__uint32_t)strtoul(f, &tail, 0);
		
	newnode->r_bwtime = (double)ul / hz;

	/* Schedwait (run_delay) */
        if( !schedstatentry ) /* schedstat is not enabled on all kernels */
            ull =0;
        else if ((f = _pm_getfield(schedstatentry->schedstat_buf, 1)) == NULL)
	    ull = 0;
	else
	    ull  = (__uint64_t)strtoull(f, &tail, 0);

	newnode->r_qwtime = ull;


	/* This is not the first time through, so we can generate rate stats */
	if ((oldnode = lookup_node(previous, pid)) != NULL) {

	    /* CPU */
	    cputime_delta = diff_counter(newnode->r_cputime, oldnode->r_cputime, PM_TYPE_64);
	    timestamp_delta = diff_counter(newnode->r_cputimestamp, oldnode->r_cputimestamp, PM_TYPE_64);

	    newnode->r_cpuburn = cputime_delta / timestamp_delta;
	    vars.cpuburn = newnode->r_cpuburn;

	    /* IO */
	    bread_delta = diff_counter((double)newnode->r_bread,
                                   (double)oldnode->r_bread, PM_TYPE_64);
	    bwrit_delta = diff_counter((double)newnode->r_bwrit,
                                    (double)oldnode->r_bwrit, PM_TYPE_64);
	    vars.preds.iodemand = (
                                 (double)bread_delta  +
                                 (double)bwrit_delta ) /
                                timestamp_delta;

	    /* ctx switches */
	    vctx_delta = diff_counter((double)newnode->r_vctx,
                                    (double)oldnode->r_vctx, PM_TYPE_64);
	    ictx_delta = diff_counter((double)newnode->r_ictx,
                                    (double)oldnode->r_ictx, PM_TYPE_64);
	    vars.preds.ctxswitch = (vctx_delta + ictx_delta) / timestamp_delta;

	    /* IO wait */
	    bwtime_delta = diff_counter((double)newnode->r_bwtime,
                                    (double)oldnode->r_bwtime, PM_TYPE_64);

	    vars.preds.iowait = bwtime_delta / timestamp_delta;

	    /* schedwait */
	    qwtime_delta = diff_counter((double)newnode->r_qwtime,
		    (double)oldnode->r_qwtime, PM_TYPE_64);
	    vars.preds.schedwait = qwtime_delta / (timestamp_delta * 1000000000); /* run_delay in nsec */

	}
        else {
	    newnode->r_cpuburn = 0;
	    memset(&newnode->preds, 0, sizeof(newnode->preds));
	    vars.cpuburn = 0;
	    //vars.preds.syscalls = 0;
	    vars.preds.ctxswitch = 0;
	    vars.preds.iowait = 0;
	    vars.preds.schedwait = 0;
	    vars.preds.iodemand = 0;
	    cputime_delta = 0;
        }

        total_cputime += cputime_delta;

	/* Command */

	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_CMD)) == NULL) {
	    strcpy(vars.fname, "Unknown");
	}
	else {
	    char *cmd = f + 1;	/* skip enclosing parentheses */
	    size_t len = strlen(cmd);

	    strncpy(vars.fname, cmd, sizeof(vars.fname));
	    if (len < sizeof(vars.fname))
		vars.fname[len-1] = '\0'; /* Skip the closing parenthesis */
	    vars.fname[sizeof(vars.fname) - 1] = '\0';
	}

	/* PS Args */
	strncpy(vars.psargs, statentry->name+7, sizeof(vars.psargs));
	vars.psargs[sizeof(vars.psargs)-1]='\0';

	/* UID and GID */
	if ((f = _pm_getfield(statusentry->status_lines.uid, 1)) == NULL) {
	    ul = 0;
	}
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	}
	
	vars.uid = ul;

	if ((f = _pm_getfield(statusentry->status_lines.gid, 1)) == NULL) {
	    ul = 0;
	}
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	}

	vars.gid = ul;

	/* uname and gname */

	struct passwd *pwe;		

	if ((pwe = getpwuid((uid_t)vars.uid)) != NULL) {
	    strncpy(vars.uname, pwe->pw_name, sizeof(vars.uname));
	    vars.uname[sizeof(vars.uname)-1] = '\0';
	}
	else {
	    strcpy(vars.uname, "UNKNOWN");
	}

	struct group *gre;

	if ((gre = getgrgid((gid_t)vars.gid)) != NULL) {
	    strncpy(vars.gname, gre->gr_name, sizeof(vars.gname));
	    vars.gname[sizeof(vars.gname)-1] = '\0';
	} 
	else {
	    strcpy(vars.gname, "UNKNOWN");
	}

	/* VSIZE from stat */

	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_VSIZE)) == NULL) {
	    ul = 0;
	}
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	    ul /= 1024;
	}

	vars.preds.virtualsize = ul;

	/* RSS from stat */

	if ((f = _pm_getfield(statentry->stat_buf, PROC_PID_STAT_RSS)) == NULL) {
	    ul = 0;
	}
	else {
	    ul = (__uint32_t)strtoul(f, &tail, 0);
	    ul *= getpagesize() / 1024;
	}

	vars.preds.residentsize = ul;

	//  Struct copy.  I think it was a bug before.  Copy should be after rss and vm calcs
	newnode->preds = vars.preds;

	if ((sts = add_hot_active_list(newnode, &vars)) < 0) {
	    return sts;
       	}

       	if (sts == 0)
	    total_inactivetime += cputime_delta;
	else
	    total_activetime += cputime_delta;

    }

    hot_numprocs[current] = np;

    pmtimevalNow(&ts);
    refresh_time[current] = ts.tv_sec + ts.tv_usec / 1000000;

    double hptime = (ts.tv_sec - p_timestamp.tv_sec) + (ts.tv_usec - p_timestamp.tv_usec)/1000000.0;

    if (pmDebugOptions.libpmda)
	fprintf(stderr, "Hotproc Update took %f time\n", hptime);

    /* Idle */
    sysidle[current] = get_idle_time();

    /* Handle rollover */
    hot_refresh_count++;
    if (hot_refresh_count == 0)
        hot_refresh_count = 2;

    if (hot_refresh_count > 1) {
	sysidle_delta = diff_counter(sysidle[current], sysidle[previous], PM_TYPE_64) / (double)HZ;
	actual_delta = diff_counter(refresh_time[current], refresh_time[previous], PM_TYPE_64);
	transient_delta = num_cpus * actual_delta - (total_cputime + sysidle_delta);
	if (transient_delta < 0) /* sanity check */
	    transient_delta = 0;

        hot_have_totals = 1;
        hot_total_transient = transient_delta / actual_delta;
        hot_total_cpuidle = sysidle_delta / actual_delta;
        hot_total_active = total_activetime / actual_delta;
        hot_total_inactive = total_inactivetime / actual_delta;
    }

    qsort(hotproc_list[current], hot_numprocs[current],
          sizeof(process_t), compare_pids);

    return 0;
}

/*
 * Timer callback registered with __pmAFregister() by
 * reset_hotproc_timer(): runs one hotproc evaluation pass.
 * Neither argument is used, and the result of the pass is discarded.
 */
static void
hotproc_timer(int sig, void *ptr)
{
    hotproc_eval_procs();
}

/*
 * One-time hotproc setup: remember the proc_pid table used for hotproc
 * candidates, set the update interval to 10 seconds, allocate the
 * hotproc lists and (re)arm the evaluation timer.
 *
 * If the lists cannot be allocated the failure is logged and the timer
 * is not armed here, rather than silently carrying on.
 */
void
init_hotproc_pid(proc_pid_t *_hotproc_poss_pid)
{
    int sts;

    hotproc_poss_pid = _hotproc_poss_pid;
    hotproc_update_interval.tv_sec = 10;

    if ((sts = init_hotproc_list()) < 0) {
	pmNotifyErr(LOG_ERR, "error allocating hotproc lists: %s",
			pmErrStr(sts));
	return;
    }
    reset_hotproc_timer();
}

/*
 * (Re)register the hotproc evaluation timer using the current
 * hotproc_update_interval.  Does nothing unless a valid configuration
 * is present (conf_gen non-zero).  A registration failure is logged
 * and terminates the process.
 */
void
reset_hotproc_timer(void)
{
    int	afid;

    /* Only reset/enable timer when a valid configuration is present. */
    if (conf_gen) {
	__pmAFunregister(hotproc_timer_id);
	afid = __pmAFregister(&hotproc_update_interval, NULL, hotproc_timer);
	if (afid < 0) {
	    pmNotifyErr(LOG_ERR, "error registering hotproc timer: %s",
			    pmErrStr(afid));
	    exit(1);
	}
	hotproc_timer_id = afid;
    }
}

void
disable_hotproc(void)
{
    /* Clear out the hotlist */
    init_hot_active_list();
    /* Disable the timer */
    __pmAFunregister(hotproc_timer_id);
    conf_gen = 0;
}

/*
 * Bring a proc_pid table (pid hash plus instance domain) in line with
 * a freshly gathered pid list.
 *
 * proc_pid - table to update
 * pids     - pids that are running right now
 *
 * Works in three passes: mark every hash entry invalid; walk the pid
 * list, creating entries (with name and instance name) for new pids
 * and marking every listed pid valid; then free the entries that are
 * still invalid (exited processes) and rebuild the indom from what is
 * left.
 *
 * Allocation failures are tolerated: a pid whose entry or names cannot
 * be allocated is left out of this refresh, and if the indom array
 * cannot be resized the indom is published with no instances.
 */
static void
refresh_proc_pidlist(proc_pid_t *proc_pid, proc_pid_list_t *pids)
{
    int i, numinst, idx;
    int fd;
    char *p;
    char buf[MAXPATHLEN];
    __pmHashNode *node, *next, *prev;
    proc_pid_entry_t *ep;
    pmdaIndom *indomp = proc_pid->indom;
    pmdaInstid *instset;

    /*
     * invalidate all entries so we can harvest pids that have exited
     */
    for (i=0; i < proc_pid->pidhash.hsize; i++) {
	for (node=proc_pid->pidhash.hash[i]; node != NULL; node = node->next) {
	    ep = (proc_pid_entry_t *)node->data;
	    ep->flags = 0;
	}
    }

    /*
     * walk pid list and add new pids to the hash table,
     * marking entries valid as we go ...
     */
    for (i=0; i < pids->count; i++) {
	node = __pmHashSearch(pids->pids[i], &proc_pid->pidhash);
	if (node)
	    ep = (proc_pid_entry_t *)node->data;
	else {
	    int k = 0;

	    ep = (proc_pid_entry_t *)calloc(1, sizeof(proc_pid_entry_t));
	    if (ep == NULL)
		continue;	/* out of memory: leave this pid out for now */

	    ep->id = pids->pids[i];

	    pmsprintf(buf, sizeof(buf), "%s/proc/%d/cmdline", proc_statspath, pids->pids[i]);
	    if ((fd = open(buf, O_RDONLY)) >= 0) {
		int numlen = pmsprintf(buf, sizeof(buf), "%06d ", pids->pids[i]);
		if ((k = read(fd, buf+numlen, sizeof(buf)-numlen)) > 0) {
		    p = buf + k + numlen;
		    if (p - buf >= sizeof(buf))
			p--;
		    *p-- = '\0';
		    /* Skip trailing nils, i.e. don't replace them */
		    while (buf+numlen < p) {
			if (*p-- != '\0') {
				break;
			}
		    }
		    /* Remove NULL terminators from cmdline string array */
		    while (buf+numlen < p) {
			if (*p == '\0') *p = ' ';
			p--;
		    }
		}
		close(fd);
	    }
	    else {
		if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
		    char ebuf[1024];
		    fprintf(stderr, "refresh_proc_pidlist: open(\"%s\", O_RDONLY) failed: %s\n", buf, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
		}
	    }
	    if (k == 0) {
		/*
		 * If a process is swapped out, /proc/<pid>/cmdline
		 * returns an empty string so we have to get it
		 * from /proc/<pid>/status or /proc/<pid>/stat
		 */
		pmsprintf(buf, sizeof(buf), "%s/proc/%d/status", proc_statspath, pids->pids[i]);
		if ((fd = open(buf, O_RDONLY)) >= 0) {
		    /* We engage in a bit of a hanky-panky here:
		     * the string should look like "123456 (name)",
		     * we get it from /proc/XX/status as "Name:   name\n...",
		     * to fit the 6 digits of PID and opening parenthesis,
		     * save 2 bytes at the start of the buffer.
		     * And don't forget to leave 2 bytes for the trailing
		     * parenthesis and the nil. Here is
		     * an example of what we're trying to achieve:
		     * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+
		     * |  |  | N| a| m| e| :|\t| i| n| i| t|\n| S|...
		     * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+
		     * | 0| 0| 0| 0| 0| 1|  | (| i| n| i| t| )|\0|...
		     * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+ */
		    if ((k = read(fd, buf+2, sizeof(buf)-4)) > 0) {
			int bc;

			/* read() does not terminate: do so before searching */
			buf[2+k] = '\0';
			if ((p = strchr(buf+2, '\n')) == NULL)
			    p = buf+2+k;	/* the data starts at buf+2 */
			p[0] = ')';
			p[1] = '\0';
			bc = pmsprintf(buf, sizeof(buf), "%06d ", pids->pids[i]);
			buf[bc] = '(';
		    }
		    close(fd);
		}
		else {
		    if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
			char ebuf[1024];
			fprintf(stderr, "refresh_proc_pidlist: open(\"%s\", O_RDONLY) failed: %s\n", buf, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
		    }
		}
	    }

	    if (k <= 0) {
		/* hmm .. must be exiting */
		pmsprintf(buf, sizeof(buf), "%06d <exiting>", pids->pids[i]);
	    }

	    if ((ep->name = strdup(buf)) == NULL) {
		free(ep);
		continue;	/* out of memory: leave this pid out for now */
	    }

	    if (__pmHashAdd(pids->pids[i], (void *)ep, &proc_pid->pidhash) < 0) {
		/* not in the table, so nothing else would ever free it */
		free(ep->name);
		free(ep);
		continue;
	    }
	}

	if (ep->instname == NULL) {
	    /*
	     * The external instance name is the pid followed by
	     * a copy of the psargs truncated at the first space.
	     * e.g. "012345 /path/to/command". Command line args,
	     * if any, are truncated. The full command line is
	     * available in the proc.psinfo.psargs metric.
	     */
	    if ((p = strchr(ep->name, ' ')) != NULL) {
		if ((p = strchr(p+1, ' ')) != NULL) {
		    int len = p - ep->name;
		    if (len > PROC_PID_STAT_CMD_MAXLEN)
			len = PROC_PID_STAT_CMD_MAXLEN;
		    ep->instname = strndup(ep->name, len);
		}
	    }
	    if (ep->instname == NULL) /* no spaces found, so use the full name */
		ep->instname = strndup(ep->name, PROC_PID_STAT_CMD_MAXLEN);
	    if (ep->instname == NULL)
		continue;	/* no name: stays invalid and is harvested below */
	}

	/* mark pid as valid (new or still running) */
	ep->flags |= PROC_PID_FLAG_VALID;
    }

    /*
     * harvest pids that have exit'ed
     */
    numinst = 0;
    for (i=0; i < proc_pid->pidhash.hsize; i++) {
	for (prev=NULL, node=proc_pid->pidhash.hash[i]; node != NULL;) {
	    next = node->next;
	    ep = (proc_pid_entry_t *)node->data;
	    if (ep->flags & PROC_PID_FLAG_VALID) {
		numinst++;
		prev = node;
	    }
	    else {
		/* This process has exited; free(NULL) is a no-op */
		free(ep->instname);
		free(ep->name);
		free(ep->stat_buf);
		free(ep->status_buf);
		free(ep->statm_buf);
		free(ep->maps_buf);
		free(ep->smaps_buf);
		free(ep->schedstat_buf);
		free(ep->io_buf);
		free(ep->wchan_buf);
		free(ep->environ_buf);

		if (prev == NULL)
		    proc_pid->pidhash.hash[i] = node->next;
		else
		    prev->next = node->next;
		free(ep);
		free(node);
	    }
	    if ((node = next) == NULL)
		break;
	}
    }

    /*
     * At this point, the hash table contains only valid pids. Refresh the indom table,
     * based on the updated process hash table. Indom table instance names are shared
     * with their hash table entry, do not free!
     */
    if (numinst == 0) {
	/* avoid realloc(ptr, 0), whose result is implementation-defined */
	free(indomp->it_set);
	indomp->it_set = NULL;
	indomp->it_numinst = 0;
	return;
    }

    instset = (pmdaInstid *)realloc(indomp->it_set, numinst * sizeof(pmdaInstid));
    if (instset == NULL) {
	/*
	 * The old array is still allocated but may name entries that
	 * were just freed, so publish no instances until next refresh.
	 */
	indomp->it_numinst = 0;
	return;
    }
    indomp->it_set = instset;
    indomp->it_numinst = numinst;

    for (idx=0, i=0; i < proc_pid->pidhash.hsize; i++) {
	for (node=proc_pid->pidhash.hash[i]; node != NULL; node=node->next, idx++) {
	    ep = (proc_pid_entry_t *)node->data;
	    indomp->it_set[idx].i_inst = ep->id; /* internal instid is pid */
	    indomp->it_set[idx].i_name = ep->instname; /* ptr ref, do not free */
	}
    }
}

/*
 * Refresh the main proc_pid table.
 *
 * proc_pid     - table to refresh
 * proc_runq    - optional run queue stats; zeroed, then re-accumulated
 * want_threads - non-zero to include threads
 * cgroups      - optional cgroup to restrict the process set to
 * container    - optional container name; when given, the cgroup
 *                filter comes from cgroup_container_path() instead
 * namelen      - not used here
 *
 * Returns 0 on success or the negative status from gathering the pids.
 */
int
refresh_proc_pid(proc_pid_t *proc_pid, proc_runq_t *proc_runq,
		 int want_threads, const char *cgroups,
		 const char *container, int namelen)
{
    char path[MAXPATHLEN];
    const char *filter = cgroups;
    int want_cgroups = container || (cgroups && cgroups[0] != '\0');
    int sts;

    /*
     * For containers we asked pmdaroot for a cgroup name for the container;
     * next find a matching filesystem path we can use to look up processes.
     */
    if (container)
	filter = cgroup_container_path(path, sizeof(path), container);

    /* Reset accounting of the runqueue metrics, initially all zeroes */
    if (proc_runq)
	memset(proc_runq, 0, sizeof(proc_runq_t));

    if (want_cgroups)
	sts = refresh_cgroup_pidlist(want_threads, proc_runq, &procpids, filter);
    else
	sts = refresh_global_pidlist(want_threads, proc_runq, &procpids);
    if (sts < 0)
	return sts;

    if (pmDebugOptions.libpmda)
	fprintf(stderr,
		"refresh_proc_pid: %d pids (threads=%d, %s=\"%s\")\n",
		procpids.count, procpids.threads,
		container ? "container" : "cgroups", filter ? filter : "");

    refresh_proc_pidlist(proc_pid, &procpids);
    return 0;
}

/*
 * Refresh the hotproc proc_pid table so that it holds only the
 * processes currently on the hot active list.
 *
 * proc_pid - table to refresh
 * threads  - non-zero to include the threads of hot processes
 * cgroups  - not used here
 *
 * Returns 0 on success or the negative status from gathering the pids.
 */
int
refresh_hotproc_pid(proc_pid_t *proc_pid, int threads, const char *cgroups)
{
    int sts;

    hotpids.count = 0;
    hotpids.threads = threads;

    if ((sts = refresh_hotproc_pidlist(&hotpids)) < 0)
	return sts;

    refresh_proc_pidlist(proc_pid, &hotpids);
    return 0;
}



/*
 * Open a proc file, taking into account that we may want thread info
 * rather than process information.
 *
 * We make (ab)use of some obscure Linux procfs mechanisms here!
 * Even though readdir(/proc) does not contain tasks, we can still open
 * taskid directory files; on top of that, the tasks sub-directory in a
 * task group has all (peer) tasks in that group, even for "children".
 *
 * base - file name below the per-process directory (e.g. "stat")
 * ep   - entry whose id selects the process or task
 *
 * Returns an open read-only file descriptor, or a negative value if
 * neither the task path nor the plain /proc path could be opened.
 *
 * The path buffer is MAXPATHLEN bytes, as elsewhere in this file, so a
 * long proc_statspath prefix cannot silently truncate the path.
 */
static int
proc_open(const char *base, proc_pid_entry_t *ep)
{
    int fd;
    char buf[MAXPATHLEN];

    if (procpids.threads) {
	pmsprintf(buf, sizeof(buf), "%s/proc/%d/task/%d/%s",
			proc_statspath, ep->id, ep->id, base);
	fd = open(buf, O_RDONLY);
	if (fd < 0) {
	    if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
		char ebuf[1024];
		fprintf(stderr, "proc_open: open(\"%s\", O_RDONLY) failed: %s\n",
			    buf, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
	    }
	    /* fallback to /proc path if task path open fails */
	} else {
	    if (pmDebugOptions.libpmda && pmDebugOptions.desperate)
		fprintf(stderr, "proc_open: thread: %s -> fd=%d\n", buf, fd);
	    return fd;
	}
    }
    pmsprintf(buf, sizeof(buf), "%s/proc/%d/%s", proc_statspath, ep->id, base);
    fd = open(buf, O_RDONLY);
    if (fd < 0) {
	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char ebuf[1024];
	    fprintf(stderr, "proc_open: open(\"%s\", O_RDONLY) failed: %s\n",
			    buf, pmErrStr_r(-oserror(), ebuf, sizeof(ebuf)));
	}
    }
    if (pmDebugOptions.libpmda && pmDebugOptions.desperate)
	fprintf(stderr, "proc_open: %s -> fd=%d\n", buf, fd);
    return fd;
}

/*
 * Directory flavour of proc_open(): open the directory named base for
 * ep->id, trying /proc/<id>/task/<id>/<base> first when thread
 * information has been requested and falling back to the plain
 * /proc/<id>/<base> path.
 *
 * Returns the open directory stream, or NULL if the final opendir()
 * failed.
 */
static DIR *
proc_opendir(const char *base, proc_pid_entry_t *ep)
{
    char	path[128];
    char	errmsg[1024];
    int		verbose = (pmDebugOptions.libpmda && pmDebugOptions.desperate);
    DIR		*dirp;

    if (procpids.threads) {
	pmsprintf(path, sizeof(path), "%s/proc/%d/task/%d/%s", proc_statspath, ep->id, ep->id, base);
	dirp = opendir(path);
	if (dirp != NULL)
	    return dirp;
	if (verbose)
	    fprintf(stderr, "proc_opendir: opendir(\"%s\") failed: %s\n", path, pmErrStr_r(-oserror(), errmsg, sizeof(errmsg)));
	/* task path could not be opened, try the plain /proc path next */
    }

    pmsprintf(path, sizeof(path), "%s/proc/%d/%s", proc_statspath, ep->id, base);
    dirp = opendir(path);
    if (dirp == NULL && verbose)
	fprintf(stderr, "proc_opendir: opendir(\"%s\") failed: %s\n", path, pmErrStr_r(-oserror(), errmsg, sizeof(errmsg)));
    return dirp;
}

/*
 * Map the current oserror() value to a status for the fetch routines:
 *   EACCES, EINVAL => 0, i.e. no values (don't disclose anything else)
 *   ENOENT         => PM_ERR_APPVERSION
 *   anything else  => the negated error code
 */
static int
maperr(void)
{
    int		err = oserror();

    switch (err) {
    case EACCES:
    case EINVAL:
	return 0;
    case ENOENT:
	return PM_ERR_APPVERSION;
    default:
	break;
    }
    return -err;
}

/*
 * Read everything available from fd into a reusable heap buffer.
 *
 * fd    open descriptor, read in 1024 byte chunks until EOF or error
 * lenp  in/out: the content length the buffer has been sized for so
 *       far; whenever more is needed this routine grows the buffer
 *       to the new length + 1 bytes and updates *lenp (it never
 *       shrinks either)
 * bufp  in/out: the buffer itself, may be NULL when *lenp is zero
 *
 * Returns
 *   0         data was read; *bufp holds it, NUL-terminated
 *   -ENODATA  end-of-file on the very first read
 *   -ENOMEM   the buffer could not be grown; the previous buffer is
 *             kept and reset to an empty string
 *   maperr()  the very first read failed (note this is 0 for the
 *             errors maperr() maps to "no values", in which case
 *             the buffer has not been written to)
 * A read error after some data has already arrived is not reported,
 * the data collected so far is returned.
 */
static int
read_proc_entry(int fd, int *lenp, char **bufp)
{
    int sts = 0;
    int n, len = 0;
    char *p = *bufp;
    char *tmp;
    char buf[1024];

    for (;;) {
	if ((n = read(fd, buf, sizeof(buf))) <= 0)
	    break;
	len += n;
	if (*lenp < len) {
	    /* grow via a temporary so a failed realloc loses nothing */
	    if ((tmp = (char *)realloc(*bufp, len+1)) == NULL) {
		if (*bufp != NULL)
		    (*bufp)[0] = '\0';	/* don't expose a partial copy */
		return -ENOMEM;
	    }
	    *bufp = tmp;
	    *lenp = len;
	    /* the buffer may have moved, re-derive the append position */
	    p = *bufp + len - n;
	}
	memcpy(p, buf, n);
	p += n;
    }

    if (len > 0)
	*p = '\0';
    else {
	/* invalid read */
	if (n < 0)
	    sts = maperr();
	else if (n == 0) {
	    sts = -ENODATA;
	    if (pmDebugOptions.libpmda && pmDebugOptions.desperate)
		fprintf(stderr, "read_proc_entry: fd=%d: no data\n", fd);
	}
    }

    return sts;
}

/*
 * fetch a proc/<pid>/stat entry for pid
 *
 * Looks id up in proc_pid->pidhash and, for each of stat, wchan and
 * environ that has not yet been fetched (per the entry's flags),
 * reads the corresponding proc file into the entry's buffers.
 *
 * Only the outcome of the stat read is reported through *sts:
 * - wchan is best effort (open or read failures are ignored,
 *   backwards compat);
 * - environ is best effort too, a failure just leaves it empty.
 * Previously the wchan and environ steps overwrote *sts, which hid
 * any failure to read the stat file itself.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * stat file could not be read (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_stat(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    char		*p;
    int			fd;

    *sts = 0;
    if (!ep)
	return NULL;

    if (!(ep->flags & PROC_PID_FLAG_STAT_FETCHED)) {
	if (ep->stat_buflen > 0)
	    ep->stat_buf[0] = '\0';
	if ((fd = proc_open("stat", ep)) < 0)
	    *sts = maperr();
	else {
	    *sts = read_proc_entry(fd, &ep->stat_buflen, &ep->stat_buf);
	    close(fd);
	}
	ep->flags |= PROC_PID_FLAG_STAT_FETCHED;
    }

    if (!(ep->flags & PROC_PID_FLAG_WCHAN_FETCHED)) {
	if (ep->wchan_buflen > 0)
	    ep->wchan_buf[0] = '\0';
	if ((fd = proc_open("wchan", ep)) >= 0) {
	    /* ignore failure here, backwards compat */
	    (void)read_proc_entry(fd, &ep->wchan_buflen, &ep->wchan_buf);
	    close(fd);
	}
	ep->flags |= PROC_PID_FLAG_WCHAN_FETCHED;
    }

    if (!(ep->flags & PROC_PID_FLAG_ENVIRON_FETCHED)) {
	/* kept apart from *sts so the stat outcome is not clobbered */
	int	envsts = -ENODATA;

	if (ep->environ_buflen > 0)
	    ep->environ_buf[0] = '\0';
	/*
	 * have seen Permission denied errors from open(),
	 * especially for /proc/<N>/environ ... envsts then
	 * keeps its initial failure value
	 */
	if ((fd = proc_open("environ", ep)) >= 0) {
	    envsts = read_proc_entry(fd, &ep->environ_buflen, &ep->environ_buf);
	    close(fd);
	}
	if (envsts == 0) {
	    /* Replace nulls with spaces */
	    if (ep->environ_buf) {
		for (p=ep->environ_buf; p < ep->environ_buf + ep->environ_buflen; p++) {
		    if (*p == '\0')
			*p = ' ';
		}
		ep->environ_buf[ep->environ_buflen-1] = '\0';
	    }
	}
	else {
	    /*
	     * open failed, or probably EOF on first read, especially
	     * for /proc/<N>/environ ... treat as an empty environment
	     */
	    ep->environ_buflen = 0;
	}
	ep->flags |= PROC_PID_FLAG_ENVIRON_FETCHED;
    }

    if (*sts < 0)
	return NULL;
    return ep;
}

/*
 * Skip an initial colon-terminated header and whitespace, then comma-separate
 * the remainder of the line by overwriting any whitespace.
 *
 * On entry *buf points at the start of a line.  The returned pointer is
 * the first non-whitespace character after the header; the line is
 * modified in place, each whitespace character becomes a ',' and the
 * terminating newline becomes a NUL.  On return *buf points just past
 * that newline, or, when the line has no newline, at the same place as
 * the returned pointer.
 */
static char *
commasep(char **buf)
{
    char *start, *p = *buf;

    for (; *p && *p != ':'; p++);	/* skip header */
    if (*p) p++;
    /* <ctype.h> routines need an unsigned char value, not a plain char */
    for (; *p && isspace((unsigned char)*p); p++);	/* skip initial whitespace */
    start = *buf = p;
    for (; *p; p++) {
	if (*p == '\n') {
	    *p = '\0';	/* replace end of line */
	    *buf = p + 1;
	    break;
	}
	if (isspace((unsigned char)*p)) *p = ',';	/* replace whitespace */
    }
    return start;
}

/*
 * fetch a proc/<pid>/status entry for pid
 *
 * Looks id up in proc_pid->pidhash and, unless the status file has
 * already been fetched successfully, reads it into ep->status_buf
 * and points the members of ep->status_lines at the lines of
 * interest within that buffer (the buffer is split in place, each
 * recognised line ends up NUL-terminated).
 *
 * Members for lines that are not present in this read are left NULL;
 * not every line is always reported (the Vm* lines for instance) and
 * a pointer kept from an earlier read would refer to contents that
 * have since been overwritten, or to a buffer that has been moved.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * file could not be read (*sts is negative).  The fetched flag is only
 * set after a successful read, so a failed read is retried next time.
 */
proc_pid_entry_t *
fetch_proc_pid_status(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;

    *sts = 0;
    if (!ep)
	return NULL;

    if (!(ep->flags & PROC_PID_FLAG_STATUS_FETCHED)) {
	int	fd;
	char	*curline;

	/* forget line pointers from any earlier read of this buffer */
	memset(&ep->status_lines, 0, sizeof(ep->status_lines));

	if (ep->status_buflen > 0)
	    ep->status_buf[0] = '\0';
	if ((fd = proc_open("status", ep)) < 0)
	    *sts = maperr();
	else {
	    *sts = read_proc_entry(fd, &ep->status_buflen, &ep->status_buf);
	    close(fd);
	}

	if (*sts == 0) {
	    /* assign pointers to individual lines in buffer */
	    curline = ep->status_buf;
	    /*
	     * Expecting something like ...
	     *
	     * Name:	bash
	     * State:	S (sleeping)
	     * Tgid:	21374
	     * Pid:	21374
	     * PPid:	21373
	     * TracerPid:	0
	     * Uid:	1000	1000	1000	1000
	     * Gid:	1000	1000	1000	1000
	     * FDSize:	256
	     * Groups:	24 25 27 29 30 44 46 105 110 112 1000 
	     * VmPeak:	   22388 kB
	     * VmSize:	   22324 kB
	     * VmLck:	       0 kB
	     * VmPin:	       0 kB
	     * VmHWM:	    5200 kB
	     * VmRSS:	    5200 kB
	     * VmData:	    3280 kB
	     * VmStk:	     136 kB
	     * VmExe:	     916 kB
	     * VmLib:	    2024 kB
	     * VmPTE:	      60 kB
	     * VmSwap:	       0 kB
	     * Threads:	1
	     * SigQ:	0/47779
	     * SigPnd:	0000000000000000
	     * ShdPnd:	0000000000000000
	     * SigBlk:	0000000000010000
	     * SigIgn:	0000000000384004
	     * SigCgt:	000000004b813efb
	     * CapInh:	0000000000000000
	     * CapPrm:	0000000000000000
	     * CapEff:	0000000000000000
	     * CapBnd:	ffffffffffffffff
	     * Cpus_allowed:	3
	     * Cpus_allowed_list:	0-1
	     * Mems_allowed:	00000000,00000001
	     * Mems_allowed_list:	0
	     * voluntary_ctxt_switches:	225
	     * nonvoluntary_ctxt_switches:	56
	     */
	    while (curline) {
		/* small optimization ... peek at first character */
		switch (*curline) {
		    case 'U':
			if (strncmp(curline, "Uid:", 4) == 0)
			    ep->status_lines.uid = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'G':
			if (strncmp(curline, "Gid:", 4) == 0)
			    ep->status_lines.gid = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'V':
			if (strncmp(curline, "VmPeak:", 7) == 0)
			    ep->status_lines.vmpeak = strsep(&curline, "\n");
			else if (strncmp(curline, "VmSize:", 7) == 0)
			    ep->status_lines.vmsize = strsep(&curline, "\n");
			else if (strncmp(curline, "VmLck:", 6) == 0)
			    ep->status_lines.vmlck = strsep(&curline, "\n");
			else if (strncmp(curline, "VmPin:", 6) == 0)
			    ep->status_lines.vmpin = strsep(&curline, "\n");
			else if (strncmp(curline, "VmHWM:", 6) == 0)
			    ep->status_lines.vmhwm = strsep(&curline, "\n");
			else if (strncmp(curline, "VmRSS:", 6) == 0)
			    ep->status_lines.vmrss = strsep(&curline, "\n");
			else if (strncmp(curline, "VmData:", 7) == 0)
			    ep->status_lines.vmdata = strsep(&curline, "\n");
			else if (strncmp(curline, "VmStk:", 6) == 0)
			    ep->status_lines.vmstk = strsep(&curline, "\n");
			else if (strncmp(curline, "VmExe:", 6) == 0)
			    ep->status_lines.vmexe = strsep(&curline, "\n");
			else if (strncmp(curline, "VmLib:", 6) == 0)
			    ep->status_lines.vmlib = strsep(&curline, "\n");
			else if (strncmp(curline, "VmPTE:", 6) == 0)
			    ep->status_lines.vmpte = strsep(&curline, "\n");
			else if (strncmp(curline, "VmSwap:", 7) == 0)
			    ep->status_lines.vmswap = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'T':
			if (strncmp(curline, "Threads:", 8) == 0)
			    ep->status_lines.threads = strsep(&curline, "\n");
			else if (strncmp(curline, "Tgid:", 5) == 0)
			    ep->status_lines.tgid = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'S':
			if (strncmp(curline, "SigPnd:", 7) == 0)
			    ep->status_lines.sigpnd = strsep(&curline, "\n");
			else if (strncmp(curline, "SigBlk:", 7) == 0)
			    ep->status_lines.sigblk = strsep(&curline, "\n");
			else if (strncmp(curline, "SigIgn:", 7) == 0)
			    ep->status_lines.sigign = strsep(&curline, "\n");
			else if (strncmp(curline, "SigCgt:", 7) == 0)
			    ep->status_lines.sigcgt = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'v':
			if (strncmp(curline, "voluntary_ctxt_switches:", 24) == 0)
			    ep->status_lines.vctxsw = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'N':
			/* the NS* lines are multi-valued, hence commasep */
			if (strncmp(curline, "Ngid:", 5) == 0)
			    ep->status_lines.ngid = strsep(&curline, "\n");
			else if (strncmp(curline, "NStgid:", 7) == 0)
			    ep->status_lines.nstgid = commasep(&curline);
			else if (strncmp(curline, "NSpid:", 6) == 0)
			    ep->status_lines.nspid = commasep(&curline);
			else if (strncmp(curline, "NSpgid:", 7) == 0)
			    ep->status_lines.nspgid = commasep(&curline);
			else if (strncmp(curline, "NSsid:", 6) == 0)
			    ep->status_lines.nssid = commasep(&curline);
			else
			    goto nomatch;
			break;
		    case 'n':
			if (strncmp(curline, "nonvoluntary_ctxt_switches:", 27) == 0)
			    ep->status_lines.nvctxsw = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'C':
			if (strncmp(curline, "Cpus_allowed_list:", 18) == 0)
			    ep->status_lines.cpusallowed = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    case 'e':
			if (strncmp(curline, "envID:", 6) == 0)
			    ep->status_lines.envid = strsep(&curline, "\n");
			else
			    goto nomatch;
			break;
		    default:
nomatch:
			/* not a line we keep, step over it */
			if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
			    char	*p;
			    fprintf(stderr, "fetch_proc_pid_status: skip ");
			    for (p = curline; *p && *p != '\n'; p++)
				fputc(*p, stderr);
			    fputc('\n', stderr);
			}
			curline = strchr(curline, '\n');
			if (curline != NULL) curline++;
		}
	    }
	    ep->flags |= PROC_PID_FLAG_STATUS_FETCHED;
	}
    }

    return (*sts < 0) ? NULL : ep;
}

/*
 * fetch a proc/<pid>/statm entry for pid
 *
 * The file is read into ep->statm_buf at most once per entry until the
 * STATM_FETCHED flag is cleared again; the flag is set whether or not
 * the read worked.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * read failed (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_statm(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep;
    int			fd;

    *sts = 0;
    if (node == NULL || (ep = (proc_pid_entry_t *)node->data) == NULL)
	return NULL;

    /* already fetched, nothing further to do */
    if (ep->flags & PROC_PID_FLAG_STATM_FETCHED)
	return ep;

    if (ep->statm_buflen > 0)
	ep->statm_buf[0] = '\0';

    fd = proc_open("statm", ep);
    if (fd >= 0) {
	*sts = read_proc_entry(fd, &ep->statm_buflen, &ep->statm_buf);
	close(fd);
    }
    else {
	*sts = maperr();
    }
    ep->flags |= PROC_PID_FLAG_STATM_FETCHED;

    if (*sts < 0)
	return NULL;
    return ep;
}


/*
 * fetch a proc/<pid>/maps entry for pid
 * WARNING: This can be very large!  Only ask for it if you really need it.
 *
 * The file is read into ep->maps_buf once per entry until the
 * MAPS_FETCHED flag is cleared again.  A process without any maps
 * gets a zero length string rather than an error.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0), the maps
 * file could not be opened, or no buffer could be allocated (*sts is
 * negative in the latter cases).
 */
proc_pid_entry_t *
fetch_proc_pid_maps(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    int			fd;

    *sts = 0;
    if (!ep)
	return NULL;

    if (!(ep->flags & PROC_PID_FLAG_MAPS_FETCHED)) {
	if (ep->maps_buflen > 0)
	    ep->maps_buf[0] = '\0';
	if ((fd = proc_open("maps", ep)) < 0)
	    *sts = maperr();
	else {
	    *sts = read_proc_entry(fd, &ep->maps_buflen, &ep->maps_buf);
	    close(fd);

	    /*
	     * If there are no maps, make maps_buf a zero length string.
	     * read_proc_entry() treats a buffer of recorded length N as
	     * having N+1 bytes behind it, so a recorded length of 1
	     * needs 2 bytes - otherwise a later 1 byte read would put
	     * its terminator beyond the end of the allocation.
	     */
	    if (ep->maps_buflen == 0) {
		ep->maps_buflen = 1;
		ep->maps_buf = (char *)malloc(2);
	    }
	    if (ep->maps_buf) {
		ep->maps_buf[ep->maps_buflen - 1] = '\0';
		*sts = 0; /* clear -ENODATA */
	    }
	    else
		ep->maps_buflen = 0;
	}

	ep->flags |= PROC_PID_FLAG_MAPS_FETCHED;
    }

    return (*sts < 0) ? NULL : ep;
}

/*
 * fetch a proc/<pid>/schedstat entry for pid
 *
 * The file is read into ep->schedstat_buf at most once per entry until
 * the SCHEDSTAT_FETCHED flag is cleared again; the flag is set whether
 * or not the read worked.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * read failed (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_schedstat(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep;
    int			fd;

    *sts = 0;
    if (node == NULL || (ep = (proc_pid_entry_t *)node->data) == NULL)
	return NULL;

    /* already fetched, nothing further to do */
    if (ep->flags & PROC_PID_FLAG_SCHEDSTAT_FETCHED)
	return ep;

    if (ep->schedstat_buflen > 0)
	ep->schedstat_buf[0] = '\0';

    fd = proc_open("schedstat", ep);
    if (fd >= 0) {
	*sts = read_proc_entry(fd, &ep->schedstat_buflen, &ep->schedstat_buf);
	close(fd);
    }
    else {
	*sts = maperr();
    }
    ep->flags |= PROC_PID_FLAG_SCHEDSTAT_FETCHED;

    if (*sts < 0)
	return NULL;
    return ep;
}

/*
 * If the current line starts with tag, cut it out of the buffer (the
 * newline becomes a NUL), remember it in the named io_lines member
 * and move on to the next line.
 */
#define PROC_IO_LINE(tag, member) \
	if (strncmp(line, tag, sizeof(tag) - 1) == 0) { \
	    ep->io_lines.member = strsep(&line, "\n"); \
	    continue; \
	}

/*
 * fetch a proc/<pid>/io entry for pid
 *
 * Depends on kernel built with CONFIG_TASK_IO_ACCOUNTING=y
 * which means the following must also be set:
 * CONFIG_TASKSTATS=y
 * CONFIG_TASK_DELAY_ACCT=y
 * CONFIG_TASK_XACCT=y
 *
 * On a successful read the members of ep->io_lines are pointed at the
 * matching lines inside ep->io_buf and the IO_FETCHED flag is set; the
 * flag is left clear when the read does not yield a zero status.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * read failed (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_io(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    char		*line;
    int			fd;

    *sts = 0;
    if (ep == NULL)
	return NULL;
    if (ep->flags & PROC_PID_FLAG_IO_FETCHED)
	return ep;

    if (ep->io_buflen > 0)
	ep->io_buf[0] = '\0';
    if ((fd = proc_open("io", ep)) >= 0) {
	*sts = read_proc_entry(fd, &ep->io_buflen, &ep->io_buf);
	close(fd);
    }
    else
	*sts = maperr();

    if (*sts != 0)
	return (*sts < 0) ? NULL : ep;

    /*
     * Walk the buffer a line at a time, the expected content is
     *     rchar: 714415843
     *     wchar: 101078796
     *     syscr: 780339
     *     syscw: 493583
     *     read_bytes: 209099776
     *     write_bytes: 118263808
     *     cancelled_write_bytes: 102301696
     */
    line = ep->io_buf;
    while (line != NULL) {
	PROC_IO_LINE("rchar:", rchar)
	PROC_IO_LINE("wchar:", wchar)
	PROC_IO_LINE("syscr:", syscr)
	PROC_IO_LINE("syscw:", syscw)
	PROC_IO_LINE("read_bytes:", readb)
	PROC_IO_LINE("write_bytes:", writeb)
	PROC_IO_LINE("cancelled_write_bytes:", cancel)

	/* not a line we keep, step over it */
	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char	*p;
	    fprintf(stderr, "fetch_proc_pid_io: skip ");
	    for (p = line; *p && *p != '\n'; p++)
		fputc(*p, stderr);
	    fputc('\n', stderr);
	}
	line = index(line, '\n');
	if (line != NULL)
	    line++;
    }
    ep->flags |= PROC_PID_FLAG_IO_FETCHED;

    return ep;
}

#undef PROC_IO_LINE

/*
 * If the current line starts with tag, cut it out of the buffer (the
 * newline becomes a NUL), remember it in the named smaps_lines member
 * and move on to the next line.
 */
#define PROC_SMAPS_LINE(tag, member) \
	if (strncmp(line, tag, sizeof(tag) - 1) == 0) { \
	    ep->smaps_lines.member = strsep(&line, "\n"); \
	    continue; \
	}

/*
 * fetch a proc/<pid>/smaps_rollup entry for pid
 *
 * On a successful read the members of ep->smaps_lines are pointed at
 * the matching lines inside ep->smaps_buf and the SMAPS_FETCHED flag
 * is set; the flag is left clear when the read does not yield a zero
 * status.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or the
 * read failed (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_smaps(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    char		*line;
    int			fd;

    *sts = 0;
    if (ep == NULL)
	return NULL;
    if (ep->flags & PROC_PID_FLAG_SMAPS_FETCHED)
	return ep;

    if (ep->smaps_buflen > 0)
	ep->smaps_buf[0] = '\0';
    if ((fd = proc_open("smaps_rollup", ep)) >= 0) {
	*sts = read_proc_entry(fd, &ep->smaps_buflen, &ep->smaps_buf);
	close(fd);
    }
    else
	*sts = maperr();

    if (*sts != 0)
	return (*sts < 0) ? NULL : ep;

    /*
     * Walk the buffer a line at a time, the expected content is
     *     Rss:                1860 kB
     *     Pss:                 354 kB
     *     Pss_Anon:             92 kB
     *     Pss_File:            262 kB
     *     [...]
     *     Locked:                0 kB
     */
    line = ep->smaps_buf;
    while (line != NULL) {
	PROC_SMAPS_LINE("Rss:", rss)
	PROC_SMAPS_LINE("Pss:", pss)
	PROC_SMAPS_LINE("Pss_Anon:", pss_anon)
	PROC_SMAPS_LINE("Pss_File:", pss_file)
	PROC_SMAPS_LINE("Pss_Shmem:", pss_shmem)
	PROC_SMAPS_LINE("Shared_Clean:", shared_clean)
	PROC_SMAPS_LINE("Shared_Dirty:", shared_dirty)
	PROC_SMAPS_LINE("Private_Clean:", private_clean)
	PROC_SMAPS_LINE("Private_Dirty:", private_dirty)
	PROC_SMAPS_LINE("Referenced:", referenced)
	PROC_SMAPS_LINE("Anonymous:", anonymous)
	PROC_SMAPS_LINE("LazyFree:", lazyfree)
	PROC_SMAPS_LINE("AnonHugePages:", anonhugepages)
	PROC_SMAPS_LINE("ShmemPmdMapped:", shmempmdmapped)
	PROC_SMAPS_LINE("FilePmdMapped:", filepmdmapped)
	PROC_SMAPS_LINE("Shared_Hugetlb:", shared_hugetlb)
	PROC_SMAPS_LINE("Private_Hugetlb:", private_hugetlb)
	PROC_SMAPS_LINE("Swap:", swap)
	PROC_SMAPS_LINE("SwapPss:", swappss)
	PROC_SMAPS_LINE("Locked:", locked)

	/* not a line we keep, step over it */
	if (pmDebugOptions.libpmda && pmDebugOptions.desperate) {
	    char	*p;
	    fprintf(stderr, "fetch_proc_pid_smaps: skip ");
	    for (p = line; *p && *p != '\n'; p++)
		fputc(*p, stderr);
	    fputc('\n', stderr);
	}
	line = index(line, '\n');
	if (line != NULL)
	    line++;
    }
    ep->flags |= PROC_PID_FLAG_SMAPS_FETCHED;

    return ep;
}

#undef PROC_SMAPS_LINE

/*
 * fetch a proc/<pid>/fd entry for pid
 *
 * Counts the entries of the fd directory and stores the count, less
 * two for the "." and ".." entries, in ep->fd_count.  The FD_FETCHED
 * flag is only set once the directory has been read.
 *
 * Returns the entry, or NULL when id is unknown or the directory
 * could not be opened (*sts is then the maperr() status).
 */
proc_pid_entry_t *
fetch_proc_pid_fd(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    uint32_t		nentries;
    DIR			*dirp;

    *sts = 0;
    if (ep == NULL)
	return NULL;
    if (ep->flags & PROC_PID_FLAG_FD_FETCHED)
	return ep;

    if ((dirp = proc_opendir("fd", ep)) == NULL) {
	*sts = maperr();
	return NULL;
    }
    for (nentries = 0; readdir(dirp) != NULL; nentries++)
	;
    closedir(dirp);

    ep->fd_count = nentries - 2; /* subtract cwd and parent entries */
    ep->flags |= PROC_PID_FLAG_FD_FETCHED;
    return ep;
}

/*
 * Attempt to extract a container ID from one kernel proc cgroup file
 * entry.  Only entries starting with "cpuset:" are considered; what
 * follows that prefix is handed to cgroup_container_search() along
 * with the cid buffer.  The buflen argument is not used.
 *
 * Returns NULL for any other entry, otherwise whatever
 * cgroup_container_search() returns.
 */
static char *
proc_container_search(const char *buf, int buflen, char *cid, int cidlen)
{
    static const char	prefix[] = "cpuset:";
    const size_t	skip = sizeof(prefix) - 1;

    if (strncmp(buf, prefix, skip) == 0)
	return cgroup_container_search(buf + skip, cid, cidlen);
    return NULL;
}

/*
 * From the kernel format for a single process cgroup set:
 *     2:cpu:/
 *     1:cpuset:/
 *
 * Produce the same one-line format string that "ps" uses:
 *     "cpu:/;cpuset:/"
 *
 * buf, buflen   the kernel text; scanning stops at the first NUL or
 *               after buflen bytes, whichever comes first
 * fmt, fmtlen   receives the reformatted string, always NUL-terminated
 *               and never more than fmtlen bytes; lines that no longer
 *               fit (separator included) are dropped.  Nothing is
 *               written if fmt is NULL or fmtlen is not positive.
 * cid, cidlen   set to an empty string, then passed with each copied
 *               line to proc_container_search() until that returns
 *               non-NULL
 */
static void
proc_cgroup_reformat(char *buf, int buflen, char *fmt, int fmtlen, char *cid, int cidlen)
{
    char	*p, *s = NULL, *c = NULL;
    int		off = 0, len, sep;

    *cid = '\0';
    if (fmt == NULL || fmtlen <= 0)
	return;
    *fmt = '\0';

    for (p = buf; p - buf < buflen; p++) {
	if (*p == '\0')
	    break;
	if (*p == ':' && !s)	/* position "s" at start */
	    s = p + 1;
	if (*p != '\n' || !s)	/* find end of this line */
	    continue;
	/* have a complete cgroup line now, check that it will fit */
	len = p - s;
	sep = (off > 0);	/* not the first cgroup? */
	if (off + sep + len >= fmtlen)
	    break;
	/* (first try out container name heuristics) */
	if (!c)
	    c = proc_container_search(s, len, cid, cidlen);
	/* "off" always indexes the terminator, so just write there */
	if (sep)
	    fmt[off++] = ';';
	memcpy(fmt + off, s, len);
	off += len;
	fmt[off] = '\0';
	s = NULL;		/* reset it for new line */
    }
}

/*
 * fetch a proc/<pid>/cgroup entry for pid
 *
 * The kernel text is read into a scratch buffer shared by all calls,
 * reformatted into a second shared scratch buffer, and the resulting
 * cgroup string and any container ID found are stored in the entry
 * as proc_strings_insert() handles (cgroup_id and container_id).
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or when
 * the file could not be read or the scratch buffer could not be
 * allocated (*sts is negative).
 */
proc_pid_entry_t *
fetch_proc_pid_cgroup(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep = node ? (proc_pid_entry_t *)node->data : NULL;
    static char		*cbuf1, *cbuf2;
    static int		clen1, clen2;

    *sts = 0;
    if (!ep)
	return NULL;

    if (!(ep->flags & PROC_PID_FLAG_CGROUP_FETCHED)) {
	char	cid[72], *tmp;
	int	fd;

	if ((fd = proc_open("cgroup", ep)) < 0)
	    *sts = maperr();
	else {
	    /*
	     * the read buffer is shared between processes, make sure a
	     * read that delivers nothing cannot leave the text of some
	     * earlier process behind
	     */
	    if (clen1 > 0)
		cbuf1[0] = '\0';
	    *sts = read_proc_entry(fd, &clen1, &cbuf1);
	    close(fd);
	    if (*sts >= 0) {
		if (clen1 > clen2) {
		    if ((tmp = realloc(cbuf2, clen1)) != NULL) {
			clen2 = clen1;
			cbuf2 = tmp;
		    }
		}
		if (cbuf1 != NULL && cbuf2 != NULL) {
		    /* reformat the buffer to match "ps" output format and */
		    /* try any container name heuristics, then hash (both) */
		    proc_cgroup_reformat(cbuf1, clen1, cbuf2, clen2, cid, sizeof(cid));
		    ep->container_id = proc_strings_insert(cid);
		    ep->cgroup_id = proc_strings_insert(cbuf2);
		}
		else if (cbuf1 != NULL) {
		    /* data was read but there is nowhere to reformat it */
		    *sts = -ENOMEM;
		}
		/* else nothing has ever been read, leave the ids alone */
	    }
	}
	ep->flags |= PROC_PID_FLAG_CGROUP_FETCHED;
    }

    return (*sts < 0) ? NULL : ep;
}

/*
 * fetch a proc/<pid>/attr/current entry for pid
 *
 * A single read of up to 1024 bytes is done; the final byte read is
 * replaced by a NUL and the resulting string is stored in the entry
 * as a proc_strings_insert() handle (label_id).  The LABEL_FETCHED
 * flag is set whether or not this worked.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or when
 * the file could not be opened or read, or was empty (*sts negative).
 */
proc_pid_entry_t *
fetch_proc_pid_label(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep;
    char		text[1024];
    int			fd, nbytes;

    *sts = 0;
    if (node == NULL || (ep = (proc_pid_entry_t *)node->data) == NULL)
	return NULL;
    if (ep->flags & PROC_PID_FLAG_LABEL_FETCHED)
	return ep;

    fd = proc_open("attr/current", ep);
    if (fd < 0) {
	*sts = maperr();
    }
    else {
	nbytes = read(fd, text, sizeof(text));
	if (nbytes < 0)
	    *sts = maperr();
	else if (nbytes == 0)
	    *sts = -ENODATA;
	else {
	    /* buffer matches "ps" output format, direct hash */
	    text[nbytes-1] = '\0';
	    ep->label_id = proc_strings_insert(text);
	}
	close(fd);
    }
    ep->flags |= PROC_PID_FLAG_LABEL_FETCHED;

    if (*sts < 0)
	return NULL;
    return ep;
}

/*
 * fetch the proc/<pid>/oom_score value for pid
 *
 * A single read of up to 64 bytes is done; the final byte read is
 * replaced by a NUL and the text is converted with strtoul() into
 * ep->oom_score.  The OOM_SCORE_FETCHED flag is set whether or not
 * this worked.
 *
 * Returns the entry, or NULL when id is unknown (*sts is 0) or when
 * the file could not be opened or read, or was empty (*sts negative).
 */
proc_pid_entry_t *
fetch_proc_pid_oom_score(int id, proc_pid_t *proc_pid, int *sts)
{
    __pmHashNode	*node = __pmHashSearch(id, &proc_pid->pidhash);
    proc_pid_entry_t	*ep;
    char		text[64];
    int			fd, nbytes;

    *sts = 0;
    if (node == NULL || (ep = (proc_pid_entry_t *)node->data) == NULL)
	return NULL;
    if (ep->flags & PROC_PID_FLAG_OOM_SCORE_FETCHED)
	return ep;

    fd = proc_open("oom_score", ep);
    if (fd < 0) {
	*sts = maperr();
    }
    else {
	nbytes = read(fd, text, sizeof(text));
	if (nbytes < 0)
	    *sts = maperr();
	else if (nbytes == 0)
	    *sts = -ENODATA;
	else {
	    text[nbytes-1] = '\0';
	    ep->oom_score = (__uint32_t)strtoul(text, NULL, 0);
	}
	close(fd);
    }
    ep->flags |= PROC_PID_FLAG_OOM_SCORE_FETCHED;

    if (*sts < 0)
	return NULL;
    return ep;
}

/*
 * Extract the ith (space separated) field from a char buffer.
 * The first field starts at zero.  There is a special case we
 * have to deal with - brace-enclosed command name may contain
 * embedded whitespace, so a field starting with '(' extends up
 * to and including the first ')' that follows.
 * A field also ends at a newline or at the end of the buffer; asking
 * for a field beyond the last one yields an empty string.
 *
 * Returns NULL if buf is NULL or memory for the copy cannot be
 * allocated.
 * BEWARE: return copy is in a static buffer, it is overwritten by
 * the next call.
 */
char *
_pm_getfield(char *buf, int field)
{
    static int	retbuflen = 0;
    static char	*retbuf = NULL;
    char	*p, *rp;
    int		i;

    if (buf == NULL)
	return NULL;

    /*
     * Note: the <ctype.h> routines are only defined for unsigned char
     * values (and EOF), and command names may well contain bytes with
     * the top bit set - hence the unsigned char casts below.
     */
    for (p = buf, i=0; i < field; i++) {
	/* if brace-enclosed, skip to the closing brace */
	if (*p == '(')
	    for (; *p && *p != ')'; p++) {;}

	/* skip to the next space */
	for (; *p && !isspace((unsigned char)*p); p++) {;}

	/* skip to the next word */
	for (; *p && isspace((unsigned char)*p); p++) {;}
    }

    /* return a null terminated copy of the field */
    for (i=0; ; i++) {
	if (p[i] == '\0' || p[i] == '\n')
	    break;
	if (p[0] == '(' && i > 0 && p[i-1] == ')')
	    break;
	if (p[0] != '(' && isspace((unsigned char)p[i]))
	    break;
    }

    if (i >= retbuflen) {
	if ((rp = (char *)realloc(retbuf, i + 4)) == NULL)
	    return NULL;
	retbuf = rp;
	retbuflen = i+4;
    }
    memcpy(retbuf, p, i);
    retbuf[i] = '\0';

    return retbuf;
}
