psproc源码阅读 - 3

接着上一篇的来,首先这个ARG_SYSV的花括号是真的很风骚,我也是第一次看到把case放在if的花括号里面的。anyway,先看这两个分支共同会到达的部分parse_bsd_option。

    case ARG_SYSV:
        if(!force_bsd) {  /* else go past case ARG_BSD */
            err = parse_sysv_option();
            break;

case ARG_BSD:
                if(force_bsd && !(personality & PER_FORCE_BSD)) return _("way bad");
        }
        prefer_bsd_defaults = 1;
        err = parse_bsd_option();
        break;

BSD options的处理由一个大函数完成,这也是我们用ps时常用的语法。开头几句它检查命令的格式是否和设置冲突,然后对flag中每一个字符进行switch case来处理。

/************************* parse BSD options **********************/
static const char *parse_bsd_option(void) {
    const char *arg;
    const char *err;

    flagptr = ps_argv[thisarg];  /* assume we _have_ a '-' */
    if(flagptr[0]=='-') {
        if(!force_bsd) return _("cannot happen - problem #1");
    } else {
        flagptr--; /* off beginning, will increment before use */
        if(personality & PER_FORCE_BSD) {
            if(!force_bsd) return _("cannot happen - problem #2");
        } else {
            if(force_bsd) return _("second chance parse failed, not BSD or SysV");
        }
    }

    while(*++flagptr) {
        switch(*flagptr) {
        case '0' ... '9': /* end */

这里的处理没什么新意,挑几个之前没出现过的函数来读一下。

    case 'O': /* end */
        trace("O like o + defaults, add new columns after PID, also sort\n");
        arg=get_opt_arg();
        if(!arg) return _("format or sort specification must follow O");
        defer_sf_option(arg, SF_B_O);
        return NULL; /* can't have any more options */
        break;

首先是defer_sf_option。其实出现过了,但是当时我懒得看,现在看一下它在做什么。defer_sf_option是一个中等长度的函数,开头依然是熟悉的初始化,初始化要用到的sf_node结构体。支持的sort & format一共7种,在common.h中定义。

/* sorting & formatting */
/* U,B,G is Unix,BSD,Gnu and then there is the option itself */
#define SF_U_O      1
#define SF_U_o      2
#define SF_B_O      3
#define SF_B_o      4
#define SF_B_m      5       /* overloaded: threads, sort, format */
#define SF_G_sort   6
#define SF_G_format 7

/************ Main parser calls this to save lists for later **********/
/* store data for later and return 1 if arg looks non-standard */
int defer_sf_option(const char *arg, int source) {
    sf_node *sfn;
    char buf[16];
    int dist;
    const format_struct *fs;
    int need_item = 1;

    sfn = xmalloc(sizeof(sf_node));
    sfn->sf = strdup(arg);
    sfn->sf_code = source;
    sfn->s_cooked = NULL;
    sfn->f_cooked = NULL;
    sfn->next = sf_list;
    sf_list = sfn;

    if(source == SF_G_sort) have_gnu_sort = 1;

    /* Now try to find an excuse to ignore broken Unix98 parsing. */
    if(source != SF_U_o) return 1;    /* Wonderful! Already non-Unix98. */
    do {
        switch(*arg) {
        case ' ':
        case ',':
        case '\0':  /* no \t\n\r support in Unix98 */
            if(need_item) return 1;       /* something wrong */
            need_item=1;
            break;
        case '=':
            if(need_item) return 1;       /* something wrong */
            return 0;                     /* broken Unix98 parsing is required */
        default:
            if(!need_item) break;
            need_item=0;
            dist = strcspn(arg,", =");
            if(dist>15) return 1;         /* something wrong, sort maybe? */
            strncpy(buf,arg,dist);   /* no '\0' on end */
            buf[dist] = '\0';        /* fix that problem */
            fs = search_format_array(buf);
            if(!fs) return 1;             /* invalid spec, macro or sort maybe? */
            if(fs->vendor) return 1;      /* Wonderful! Legal non-Unix98 spec. */
        }
    } while (*++arg);

    return 0;                         /* boring, Unix98 is no change */
}

说是有这么多SF格式,实际上它只当场处理SF_G_sort和SF_U_o。其余的直接初始化完sf_node,加到sf_list(全局变量)链表中便结束了。
对SF_U_o,仍然是经典的列表处理,找到项目后,使用search_format_array来搜索对应处理项,如果没找到,或者有fs->vendor(只有U98是0,其余的都是>0的值)返回1。

/*
 * Look up a format specifier by name.
 *
 * Runs bsearch over format_array (format_array_count entries) with
 * compare_format_structs, using a probe in which only .spec is set.
 * Returns the matching entry, or NULL when bsearch finds none.
 * NOTE(review): bsearch needs format_array sorted consistently with
 * compare_format_structs -- confirm against the table definition.
 */
const format_struct *search_format_array(const char *findme) {
    format_struct probe;
    const format_struct *hit;

    probe.spec = findme;
    hit = bsearch(&probe, format_array, format_array_count,
                  sizeof(format_struct), compare_format_structs);
    return hit;
}


/* Note: upon conversion to the <pids> API the numerous former sort provisions
         for otherwise non-printable fields (pr_nop) have been retained. And,
         since the new library can sort on any item, many previously printable
         but unsortable fields have now been made sortable. */
/* there are about 211 listed */
/* Many of these are placeholders for unsupported options. */
static const format_struct format_array[] = { /*
 .spec        .head      .pr               .sr                   .width .vendor .flags  */
{"%cpu",      "%CPU",    pr_pcpu,          PIDS_extra,               4,    BSD,  ET|RIGHT}, /*pcpu*/
{"%mem",      "%MEM",    pr_pmem,          PIDS_VM_RSS,              4,    BSD,  PO|RIGHT}, /*pmem*/
{"_left",     "LLLLLLLL", pr_t_left,       PIDS_noop,                8,    TST,  ET|LEFT},
{"_left2",    "L2L2L2L2", pr_t_left2,      PIDS_noop,                8,    TST,  ET|LEFT},
{"_right",    "RRRRRRRRRRR", pr_t_right,   PIDS_noop,                11,   TST,  ET|RIGHT},

回到parse_bsd_option中。其余的选项大多数是在设置标记位,不再重复了。

    case 'X':
        trace("X old Linux i386 register format\n");
        format_flags |= FF_LX;
        break;
    case 'Z':  /* FreeBSD does MAC like SGI's Irix does it */
        trace("Z print security label for Mandatory Access Control.\n");
        format_modifiers |= FM_M;
        break;
    case 'a':
        trace("a select all w/tty, including other users\n");
        simple_select |= SS_B_a;
        break;
    case 'c':
        trace("c true command name\n");
        bsd_c_option = 1;
        break;

目前只剩下来最后一个,parse_sysv_option(),想必也不会有什么惊喜。

/***************** parse SysV options, including Unix98  *****************/
static const char *parse_sysv_option(void) {
    const char *arg;
    const char *err;

    flagptr = ps_argv[thisarg];
    while(*++flagptr) {
        switch(*flagptr) {
        case 'A':
            trace("-A selects all processes\n");
            all_processes = 1;
            break;
        case 'C': /* end */
            trace("-C select by process name\n");  /* Why only HP/UX and us? */
            arg=get_opt_arg();
            if(!arg) return _("list of command names must follow -C");
            err=parse_list(arg, parse_cmd);
            if(err) return err;
            selection_list->typecode = SEL_COMM;
            return NULL; /* can't have any more options */
        case 'F':  /* DYNIX/ptx -f plus sz,rss,psr=ENG between c and stime */
            trace("-F does fuller listing\n");
            format_modifiers |= FM_F;
            format_flags |= FF_Uf;
            unix_f_option = 1; /* does this matter? */
            break;

事实也确实如此,这里都是我们看过的函数,不再过多介绍了。
终于看完了parse_all_options的所有内容,回到上一层arg_parse。

int arg_parse(int argc, char *argv[]) {
    const char *err = NULL;
    const char *err2 = NULL;
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;

    if(personality & PER_FORCE_BSD) goto try_bsd;

    err = parse_all_options();  //<----------
    if(err) goto try_bsd;
    err = thread_option_check();
    if(err) goto try_bsd;
    err = process_sf_options();
    if(err) goto try_bsd;
    err = select_bits_setup();
    if(err) goto try_bsd;

看来我们只走了一小步,thread_option_check全是在处理thread_flags这个全局变量,根据之前parse_all_options传入的内容对其进行设置。process_sf_options对之前由defer_sf_option暂存在sf_list中的排序/格式选项(o、O等)进行处理,代码比较复杂,我们单独拖出来看一看。

首先其注释表示这个功能是遗留下来的坑,前人挖坑埋后人系列。

/**************************************************************************
 * Used to parse option O lists. Option O is shared between
 * sorting and formatting. Users may expect one or the other.
 * The "broken" flag enables a really bad Unix98 misfeature.
 */
const char *process_sf_options(void) {
    sf_node *sf_walk;

    if(sf_list) {
        const char *err;
        err = parse_O_option(sf_list);
        if(err) return err;
    }

    if(format_list) catastrophic_failure(__FILE__, __LINE__, _("bug: must reset the list first"));

第一阶段的代码调用parse_O_option,这玩意儿也是个中型函数。没办法,看一看它是什么。

/*
 * Used to parse option O lists. Option O is shared between
 * sorting and formatting. Users may expect one or the other.
 * Recursion is to preserve original order.
 *
 * Walks the sf_node chain starting at sfn. The tail (sfn->next) is
 * handled before sfn itself, so nodes are processed from the end of
 * the chain back to the head; an error from the tail stops everything.
 * Each node is dispatched on sf_code to the format parser, a sort
 * parser, or (for SF_B_O) both in turn.
 *
 * Returns NULL on success or an error message string.
 * Reads and updates the already_parsed_format / already_parsed_sort
 * flags and reads have_gnu_sort.
 */
static const char *parse_O_option(sf_node *sfn) {
    const char *err;     /* error code that could or did happen */

    /* Handle everything after this node first. */
    if(sfn->next) {
        err = parse_O_option(sfn->next);
        if(err) return err;
    }

    switch(sfn->sf_code) {
    case SF_B_o:
    case SF_G_format:
    case SF_U_o: /*** format ***/
        /* Pure format options: the flag is only set when parsing succeeds. */
        err = format_parse(sfn);
        if(!err) already_parsed_format = 1;
        break;
    case SF_U_O:                                /*** format ***/
        /* Can have -l -f f u... set already_parsed_format like DEC does */
        if(already_parsed_format) return _("option -O can not follow other format options");
        err = format_parse(sfn);
        if(err) return err;
        already_parsed_format = 1;
        O_wrap(sfn,'u'); /* must wrap user format in default */
        break;
    case SF_B_O:                                /***  both  ***/
        /* Try the argument as a short sort key list first. */
        if(have_gnu_sort || already_parsed_sort) err = _("multiple sort options");
        else err = verify_short_sort(sfn->sf);
        if(!err) { /* success as sorting code */
            short_sort_parse(sfn);
            already_parsed_sort = 1;
            return NULL;
        }
        /* Not usable as a sort: fall back to treating it as a format. */
        if(already_parsed_format) {
            err = _("option O is neither first format nor sort order");
            break;
        }
        if(!format_parse(sfn)) { /* if success as format code */
            already_parsed_format = 1;
            O_wrap(sfn,'b'); /* must wrap user format in default */
            return NULL;
        }
        /* Both attempts failed: err still holds the message from the
           sort attempt; format_parse's own message is not kept. */
        break;
    case SF_G_sort:
    case SF_B_m:                 /***  sort  ***/
        if(already_parsed_sort) err = _("multiple sort options");
        else err = long_sort_parse(sfn);
        /* The flag is set whether or not the parse succeeded. */
        already_parsed_sort = 1;
        break;
    default:                                    /***  junk  ***/
        /* NOTE(review): err may be unset if control gets past this call;
           presumably catastrophic_failure does not return -- confirm. */
        catastrophic_failure(__FILE__, __LINE__, _("please report this bug"));
    }
    return err; /* could be NULL */
}

很不妙,一上来就是另一个parser。对B_o、G_format、U_o三个情况(即SF_B_o、SF_G_format、SF_U_o)而言,进入format_parse。

    case SF_B_o:
    case SF_G_format:
    case SF_U_o: /*** format ***/
        err = format_parse(sfn);
        if(!err) already_parsed_format = 1;

下面贴出的是处理AIX风格(带%)描述符的aix_format_parse(应当是format_parse遇到%时转交的函数),由一个大型状态机构成。前面的状态机只是检查是否符合语法并统计项目数,并不做其他事情。通过后,开始处理。

/******************************************************************
 * Used to parse option AIX field descriptors.
 * Put each completed format_node onto the list starting at ->f_cooked
 */
static const char *aix_format_parse(sf_node *sfn) {
    char *buf;                   /* temp copy of arg to hack on */
    char *walk;
    int items;

    /*** sanity check and count items ***/
    items = 0;
    walk = sfn->sf;
    /* state machine */ {
        int c;
initial:
        c = *walk++;
        if(c=='%')    goto get_desc;
        if(!c)        goto looks_ok;
        /* get_text: */
        items++;
get_more_text:
        c = *walk++;
        if(c=='%')    goto get_desc;
        if(c)         goto get_more_text;
        goto looks_ok;
get_desc:
        items++;
        c = *walk++;
        if(c)         goto initial;
        return _("improper AIX field descriptor");
looks_ok:
        ;
    }

处理阶段,复制一份sfn->sf,这是带%的命令行。只要不是%%,就交给search_aix_array去搜索对应的列。

    /*** sanity check passed ***/
    buf = strdup(sfn->sf);
    walk = sfn->sf;

    while(items--) {
        format_node *fnode;  /* newly allocated */
        format_node *endp;   /* for list manipulation */

        if(*walk == '%') {
            const aix_struct *aix;
            walk++;
            if(*walk == '%') goto double_percent;
            aix = search_aix_array(*walk);

search_aix_array的定义如下:

/*
 * Find the aix_array entry whose .desc equals findme.
 *
 * Scans the table linearly and stops at the entry whose .desc is '~',
 * which acts as the end marker and is never returned.
 * Returns the matching entry, or NULL when nothing matches.
 */
const aix_struct *search_aix_array(const int findme) {
    const aix_struct *entry;

    for(entry = aix_array; entry->desc != '~'; entry++) {
        if(entry->desc == findme) return entry;
    }
    return NULL;
}

它搜索的是这样一个数组。

/*************************** AIX formats ********************/
/* Convert AIX format codes to normal format specifiers. */
/*
 * Each row is one AIX "%x" descriptor. The three columns are presumably
 * .desc (the single descriptor character), .spec (the normal format
 * specifier name) and .head (the column header text), in that order --
 * confirm against the aix_struct declaration.
 * The final '~' row is the end marker that table scans stop at.
 */
static const aix_struct aix_array[] = {
    {'C', "pcpu",   "%CPU"},
    {'G', "group",  "GROUP"},
    {'P', "ppid",   "PPID"},
    {'U', "user",   "USER"},
    {'a', "args",   "COMMAND"},
    {'c', "comm",   "COMMAND"},
    {'g', "rgroup", "RGROUP"},
    {'n', "nice",   "NI"},
    {'p', "pid",    "PID"},
    {'r', "pgid",   "PGID"},
    {'t', "etime",  "ELAPSED"},
    {'u', "ruser",  "RUSER"},
    {'x', "time",   "TIME"},
    {'y', "tty",    "TTY"},
    {'z', "vsz",    "VSZ"},
    {'~', "~",      "~"} /* end marker; NULL would ruin alphabetical order */
};

回到之前的函数。如果找到了,则调用do_one_spec来处理对应的规范和表头。

            walk++;
            if(!aix) {
                free(buf);
                return _("unknown AIX field descriptor");
            }
            fnode =  do_one_spec(aix->spec, aix->head);

do_one_spec也是个大函数,定义如下:

/****************  Parse single format specifier *******************/
static format_node *do_one_spec(const char *spec, const char *override) {
    const format_struct *fs;
    const macro_struct *ms;

    fs = search_format_array(spec);

函数一上来就在format_array中找对应的spec(参数1)。之前已经见识过这个format_array了:

static const format_struct format_array[] = { /*
 .spec        .head      .pr               .sr                   .width .vendor .flags  */
{"%cpu",      "%CPU",    pr_pcpu,          PIDS_extra,               4,    BSD,  ET|RIGHT}, /*pcpu*/
{"%mem",      "%MEM",    pr_pmem,          PIDS_VM_RSS,              4,    BSD,  PO|RIGHT}, /*pmem*/

如果找到,则对其进行处理,生成format_node并返回。

    if(fs) {
        int w1, w2;
        format_node *thisnode;
        thisnode = xmalloc(sizeof(format_node));
        if(fs->flags & CF_PIDMAX) {
            w1 = (int)procps_pid_length();
            w2 = strlen(fs->head);
            if(w2>w1) w1=w2; // FIXME w/ separate header/body column sizing
        } else {
            w1 = fs->width;
        }
        if(override) {
            w2 = strlen(override);
            thisnode->width = (w1>w2)?w1:w2;
            thisnode->name = strdup(override);
        } else {
            thisnode->width = w1;
            thisnode->name = strdup(fs->head);
        }
        thisnode->pr = fs->pr;
        thisnode->vendor = fs->vendor;
        thisnode->flags = fs->flags;
        thisnode->next = NULL;
        return thisnode;
    }

format_node的各项解释如下:
1) .pr ,处理函数,处理函数由format_array定义,各项形如:

/* normal %CPU in ##.# format. */
static int pr_pcpu(char *restrict const outbuf, const proc_t *restrict const pp) {
    unsigned long long total_time;   /* jiffies used by this process */
    unsigned pcpu;                   /* scaled %cpu, 999 means 99.9% */
    unsigned long long seconds;      /* seconds of process life */
    setREL3(TICS_ALL,TICS_ALL_C,TIME_ELAPSED)
    pcpu = 0;
    if(include_dead_children) total_time = rSv(TICS_ALL_C, ull_int, pp);
    else total_time = rSv(TICS_ALL, ull_int, pp);
    seconds = rSv(TIME_ELAPSED, ull_int, pp);
    if(seconds) pcpu = (total_time * 1000ULL / Hertz) / seconds;
    if (pcpu > 999U)
        return snprintf(outbuf, COLWID, "%u", pcpu/10U);
    return snprintf(outbuf, COLWID, "%u.%u", pcpu/10U, pcpu%10U);
}

2) .vendor ,哪个系统引入的功能。
3) .flags,预设的flag。
4) .next,与其关联的下一个节点(链表)。

如果在format_array中没有找到对应的spec,则尝试按macro再次查找。macro array是一组对应的字符映射关系,很像C的宏:

static const macro_struct macro_array[] = {
    {"DFMT",     "pid,tname,state,cputime,cmd"},         /* Digital's default */
    {"DefBSD",   "pid,tname,stat,bsdtime,args"},               /* Our BSD default */
    {"DefSysV",  "pid,tname,time,cmd"},                     /* Our SysV default */

将macro展开后,对macro中每个section,调用自己再解析一次。

    /* That failed, so try it as a macro. */
    ms = search_macro_array(spec);
    if(ms) {
        format_node *list = NULL;
        format_node *newnode;
        const char *walk;
        int dist;
        char buf[16]; /* trust strings will be short (from above, not user) */
        walk = ms->head;
        while(*walk) {
            dist = strcspn(walk, ", ");
            strncpy(buf,walk,dist);
            buf[dist] = '\0';
            newnode = do_one_spec(buf,override); /* call self, assume success */
            newnode->next = list;
            list = newnode;
            walk += dist;
            if(*walk) walk++;
        }
        return list;
    }
    return NULL;   /* bad, spec not found */
}

继续回到上层,把%之前的内容dump出来,保存在fnode中。检查最后一个节点,保存到sfn->f_cooked中,然后退出。

            if(!fnode) {
                free(buf);
                return _("AIX field descriptor processing bug");
            }
        } else {
            size_t len;
            len = strcspn(walk, "%");
            memcpy(buf,walk,len);
            if(0) {
double_percent:
                len = 1;
                buf[0] = '%';
            }
            buf[len] = '\0';
            walk += len;
            fnode = xmalloc(sizeof(format_node));
            fnode->width = len < INT_MAX ? len : INT_MAX;
            fnode->name = strdup(buf);
            fnode->pr = NULL;     /* checked for */
            fnode->vendor = AIX;
            fnode->flags = CF_PRINT_EVERY_TIME;
            fnode->next = NULL;
        }

        endp = fnode;
        while(endp->next) endp = endp->next;  /* find end */
        endp->next = sfn->f_cooked;
        sfn->f_cooked = fnode;
    }
    free(buf);
    already_parsed_format = 1;
    return NULL;
}

再回到最外面的那层。后面就比较简单了,分别维护两个链表,一个是format_list,一个是sort_list,将二者分类放到不同的链表中。

    /* merge formatting info of sf_list into format_list here */
    sf_walk = sf_list;
    while(sf_walk) {
        format_node *fmt_walk;
        fmt_walk = sf_walk->f_cooked;
        sf_walk->f_cooked = NULL;
        while(fmt_walk) {  /* put any nodes onto format_list in opposite way */
            format_node *travler;
            travler = fmt_walk;
            fmt_walk = fmt_walk->next;
            travler->next = format_list;
            format_list = travler;
        }
        sf_walk = sf_walk->next;
    }

    /* merge sorting info of sf_list into sort_list here */
    sf_walk = sf_list;
    while(sf_walk) {
        sort_node *srt_walk;
        srt_walk = sf_walk->s_cooked;
        sf_walk->s_cooked = NULL;
        if (srt_walk) {
            sort_node *travler = srt_walk;
            while (travler->next) travler = travler->next;
            travler->next = sort_list;
            sort_list = srt_walk;
        }
        sf_walk = sf_walk->next;
    }

并在接下来处理PS_FORMAT环境变量(format_parse),然后重复放到format_list的步骤。

// Get somebody to explain how -L/-T is supposed to interact
// with sorting. Do the threads remain grouped, with sorting
// by process, or do the threads get sorted by themselves?
if(sort_list && (thread_flags&TF_no_sort)) {
    return _("tell <procps@freelists.org> what you expected");
}

// If nothing else, try to use $PS_FORMAT before the default.
if(!format_flags && !format_modifiers && !format_list) {
    char *tmp;
    tmp = getenv("PS_FORMAT");  /* user override kills default */
    if(tmp && *tmp) {
        const char *err;
        sf_node sfn;
        if(thread_flags&TF_must_use) return _("tell <procps@freelists.org> what you want (-L/-T, -m/m/H, and $PS_FORMAT)");
        sfn.sf = tmp;
        sfn.f_cooked = NULL;
        err = format_parse(&sfn);
        if(!err) {
            format_node *fmt_walk;
            fmt_walk = sfn.f_cooked;
            while(fmt_walk) {  /* put any nodes onto format_list in opposite way */
                format_node *travler;
                travler = fmt_walk;
                fmt_walk = fmt_walk->next;
                travler->next = format_list;
                format_list = travler;
            }
            return NULL;
        }
        // FIXME: prove that this won't be hit on valid bogus-BSD options
        fprintf(stderr, _("warning: $PS_FORMAT ignored. (%s)\n"), err);
    }
}

如果有指定format_flags,则同样处理它。

if(format_list) {
    if(format_flags) return _("conflicting format options");
    if(format_modifiers) return _("can not use output modifiers with user-defined output");
    if(thread_flags&TF_must_use) return _("-L/-T with H/m/-m and -o/-O/o/O is nonsense");
    return NULL;
}

do {
    const char *spec;
    switch(format_flags) {

    default:
        return _("conflicting format options");

    /* These can be NULL, which enables SysV list generation code. */
    case 0:
        spec=NULL;
        break;
           ……
    case FF_Lm:
        spec="OL_m";
        break;

    /* This is the sole FLASK security option. */
    case FF_Fc:
        spec="FLASK_context";
        break;

    }  /* end switch(format_flags) */

    // not just for case 0, since sysv_l_format and such may be NULL
    if(!spec) return generate_sysv_list();

    do {
        format_node *fmt_walk;
        fmt_walk = do_one_spec(spec, NULL); /* use override "" for no headers */
        while(fmt_walk) {  /* put any nodes onto format_list in opposite way */
            format_node *travler;
            travler = fmt_walk;
            fmt_walk = fmt_walk->next;
            travler->next = format_list;
            format_list = travler;
        }
    } while(0);
} while(0);

接下来,对format_modifiers进行处理。fmt_add_after、fmt_delete将字符串与format_list的项目name属性做对比,并添加项目/删除项目。

    do {
        format_node *fn;
        if(format_modifiers & FM_j) {
            fn = do_one_spec("pgid", NULL);
            if(!fmt_add_after("PPID", fn)) if(!fmt_add_after("PID", fn))
                    catastrophic_failure(__FILE__, __LINE__, _("internal error: no PID or PPID for -j option"));
            fn = do_one_spec("sid", NULL);
            if(!fmt_add_after("PGID", fn)) return _("lost my PGID");
        }
        if(format_modifiers & FM_y) {
            /* TODO: check for failure to do something, and complain if so */
            fmt_delete("F");
            fn = do_one_spec("rss", NULL);
            if(fmt_add_after("ADDR", fn)) fmt_delete("ADDR");
        }
        if(format_modifiers & FM_c) {
            fmt_delete("%CPU");
            fmt_delete("CPU");
            fmt_delete("CP");
            fmt_delete("C");
            fmt_delete("NI");
            fn = do_one_spec("class", NULL);
            if(!fmt_add_after("PRI", fn))
                catastrophic_failure(__FILE__, __LINE__, _("internal error: no PRI for -c option"));
            fmt_delete("PRI"); /* we want a different one */
            fn = do_one_spec("pri", NULL);
            if(!fmt_add_after("CLS", fn)) return _("lost my CLS");
        }
        if(thread_flags & TF_U_T) {
            fn = do_one_spec("spid", NULL);
            if(!fmt_add_after("PID", fn) && (thread_flags&TF_must_use))
                return _("-T with H/-m/m but no PID for SPID to follow");
        }
        if(thread_flags & TF_U_L) {
            fn = do_one_spec("lwp", NULL);
            if(fmt_add_after("SID",  fn)) goto did_lwp;
            if(fmt_add_after("SESS", fn)) goto did_lwp;
            if(fmt_add_after("PGID", fn)) goto did_lwp;
            if(fmt_add_after("PGRP", fn)) goto did_lwp;
            if(fmt_add_after("PPID", fn)) goto did_lwp;
            if(fmt_add_after("PID",  fn)) goto did_lwp;
            if(thread_flags&TF_must_use)
                return _("-L with H/-m/m but no PID/PGID/SID/SESS for NLWP to follow");
did_lwp:
            fn = do_one_spec("nlwp", NULL);
            fmt_add_after("%CPU",  fn);
        }
        if(format_modifiers & FM_M) {   // Mandatory Access Control, IRIX style
            fn = do_one_spec("label", NULL);
            fn->next=format_list;
            format_list=fn;
        }
        /* Do personality-specific translations not covered by format_flags.
         * Generally, these only get hit when personality overrides unix output.
         * That (mostly?) means the Digital and Debian personalities.
         */
        if((personality & PER_ZAP_ADDR) && (format_flags & FF_Ul)) {
            fn = do_one_spec("sgi_p", NULL);
            if(fmt_add_after("ADDR", fn)) fmt_delete("ADDR");
        }
        if((personality & PER_SANE_USER) && (format_flags & FF_Uf)) {
            fn = do_one_spec("user", NULL);
            if(fmt_add_after("UID", fn)) fmt_delete("UID");
        }
    } while(0);

    return NULL;
}

终于回到最开始的arg_parse。

err = parse_all_options();
if(err) goto try_bsd;
err = thread_option_check();
if(err) goto try_bsd;
err = process_sf_options(); //<------
if(err) goto try_bsd;
err = select_bits_setup(); 
if(err) goto try_bsd;

我们接下来是select_bits_setup。这是一个设置select_bits全局变量的函数,不介绍了,全是魔法数字,后面碰着再看。这个函数调用完成后,arg_parse也结束了。回到最初的起点main()。

reset_global();  /* must be before parser */
arg_parse(argc,argv);  //<--------在这里

/* check for invalid combination of arguments */
arg_check_conflicts();

/*  arg_show(); */
trace("screen is %ux%u\n",screen_cols,screen_rows);
/*  printf("sizeof(proc_t) is %d.\n", sizeof(proc_t)); */
trace("======= ps output follows =======\n");

init_output(); /* must be between parser and output */

接下来的arg_check_conflicts没有什么惊喜,只是检查有没有冲突的参数。进入init_output。

/*
 * Set up the output buffer used for printing.
 *
 * Maps an anonymous private region large enough for
 * OUTBUF_SIZE+SPACE_AMOUNT bytes (rounded up to whole pages) plus one
 * extra page. The first SPACE_AMOUNT bytes are filled with spaces, the
 * extra page at the high end is made inaccessible as a guard, and
 * saved_outbuf is pointed just past the space prefix. Also records the
 * current time in seconds_since_1970 and calls check_header_width().
 */
void init_output(void)
{
    int n_pages;      /* whole pages of usable buffer, guard page excluded */
    char *region;     /* start of the mapping */

    // add page_size-1 to round up
    n_pages = (OUTBUF_SIZE+SPACE_AMOUNT+page_size-1)/page_size;

    // 1 more page than n_pages, for the guard page at high addresses
    region = mmap(0, page_size * (n_pages+1), PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if(region == MAP_FAILED) {
        catastrophic_failure(__FILE__, __LINE__, _("please report this bug"));
    }

    memset(region, ' ', SPACE_AMOUNT);
    // the space prefix can only be write-protected when it fills a page exactly
    if(SPACE_AMOUNT==page_size) {
        mprotect(region, page_size, PROT_READ);
    }
    mprotect(region + page_size*n_pages, page_size, PROT_NONE); // guard page

    // available space:  page_size*n_pages-SPACE_AMOUNT
    saved_outbuf = region + SPACE_AMOUNT;
    seconds_since_1970 = time(NULL);

    check_header_width();
}

OUTBUF_SIZE是2 * 64 * 1024, SPACE_AMOUNT是144,page_size是分页大小,一般认为是4096。所以outbuf实际上是由33页构成(如果page_size=4096)。mmap申请34页(139,264字节)。前SPACE_AMOUNT字节设置为空格,最后一页无权限。

回到main中,还剩最后一点点代码:

    lists_and_needs(); //<===
    finalize_stacks();

    if(forest_type || sort_list) fancy_spew(); /* sort or forest */
    else simple_spew(); /* no sort, no forest */
    show_one_proc((proc_t *)-1,format_list); /* no output yet? */

    procps_pids_unref(&Pids_info);
    return 0;
}

继续看lists_and_needs。check_headers用于检查有多少个header(遍历format_list中有name的、有pr项的并计数)。然后后面的代码用于对列表中的项目需求进行处理,并修改一部分类型节点的pr值。

/***** munge lists and determine final needs */
/*
 * Derive proc_format_list and task_format_list from format_list.
 *
 * Unless TF_show_both is set in thread_flags, both simply alias
 * format_list. Otherwise two parallel lists are built: the process
 * list reuses the original nodes and the task list is made of copies.
 * Thread-only columns get pr_nop in the process list, and process-only
 * columns get pr_nop in the task list.
 */
static void lists_and_needs(void) {
    format_node proc_head, task_head;  /* junk heads; only .next is used */
    format_node *proc_tail = &proc_head;
    format_node *task_tail = &task_head;
    format_node *cur;

    check_headers();

    /* only care about the difference when showing both */
    if(!(thread_flags & TF_show_both)) {
        proc_format_list = format_list;
        task_format_list = format_list;
        return;
    }

    for(cur = format_list; cur; cur = cur->next) {
        format_node *dup = xmalloc(sizeof(format_node));
        memcpy(dup, cur, sizeof(format_node));

        /* append the original to the process list, the copy to the task list */
        proc_tail->next = cur;
        task_tail->next = dup;
        proc_tail = cur;
        task_tail = dup;

        switch(cur->flags & CF_PRINT_MASK) {
        case CF_PRINT_AS_NEEDED:
        case CF_PRINT_EVERY_TIME:
            break;
        case CF_PRINT_THREAD_ONLY:
            proc_tail->pr = pr_nop;
            break;
        case CF_PRINT_PROCESS_ONLY:
            task_tail->pr = pr_nop;
            break;
        default:
            catastrophic_failure(__FILE__, __LINE__, _("please report this bug"));
            break;
        }
    }

    task_tail->next = NULL;
    proc_tail->next = NULL;
    proc_format_list = proc_head.next;
    task_format_list = task_head.next;
}

回到main,阅读finalize_stacks()。

psproc源码阅读 - 2

main中接下来的函数都比较重要,所以这里就分段来介绍了。

arg_parse(argc,argv);

/* check for invalid combination of arguments */
arg_check_conflicts();

首先是arg_parse。

/* Run the four parsing stages in order; return the first error, or NULL. */
static const char *run_parse_stages(void) {
    const char *msg;

    msg = parse_all_options();
    if(!msg) msg = thread_option_check();
    if(!msg) msg = process_sf_options();
    if(!msg) msg = select_bits_setup();
    return msg;
}

/*
 * Parse the command line, making up to two attempts.
 *
 * The first attempt is skipped when personality has PER_FORCE_BSD set.
 * If it is skipped or any stage fails, all parser state is reset and a
 * second attempt is made with force_bsd set. Returns 0 as soon as an
 * attempt succeeds. If the second attempt also fails, an error is
 * printed to stderr (the second attempt's message when PER_FORCE_BSD
 * is set, otherwise the first attempt's) and do_help() is called.
 * NOTE(review): nothing is returned after do_help(); presumably it
 * does not return -- confirm.
 */
int arg_parse(int argc, char *argv[]) {
    const char *err = NULL;
    const char *err2 = NULL;

    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;

    if(!(personality & PER_FORCE_BSD)) {
        err = run_parse_stages();
        if(!err) {
            choose_dimensions();
            return 0;
        }
    }

    /* second chance: start over and parse as BSD */
    trace("--------- now try BSD ------\n");

    reset_global();
    reset_parser();
    reset_sortformat();
    format_flags = 0;
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;
    /* no need to reset flagptr */
    force_bsd=1;
    prefer_bsd_defaults=1;
    if(!( (PER_OLD_m|PER_BSD_m) & personality )) /* if default m setting... */
        personality |= PER_OLD_m; /* Prefer old Linux over true BSD. */
    /* Do not set PER_FORCE_BSD! It is tested below. */

    err2 = run_parse_stages();
    if(!err2) {
        choose_dimensions();
        return 0;
    }

    /* total failure: both attempts were rejected */
    reset_parser();
    if(personality & PER_FORCE_BSD) fprintf(stderr, _("error: %s\n"), err2);
    else fprintf(stderr, _("error: %s\n"), err);
    do_help(NULL, EXIT_FAILURE);
}

先看第一部分,全局变量personality由set_personality设置,大体就是根据不同的操作系统和架构,来设置不同的参数。

int arg_parse(int argc, char *argv[]) {
    const char *err = NULL;
    const char *err2 = NULL;
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;

    if(personality & PER_FORCE_BSD) goto try_bsd;

对BSD而言,其personality是包含PER_FORCE_BSD位的,但是对linux则没有。因此如果有这个位,则优先尝试bsd。

case_bsd:
    personality = PER_FORCE_BSD | PER_BSD_h | PER_BSD_m;
    prefer_bsd_defaults = 1;
    bsd_j_format = "FB_j";
    bsd_l_format = "FB_l";
    /* bsd_s_format not used */
    bsd_u_format = "FB_u";
    bsd_v_format = "FB_v";
    return NULL;

否则继续执行parse_all_options()。

err = parse_all_options();

开始阅读parse_all_options函数,对每个当前的参数,调用arg_type(ps_argv[thisarg])获取其类型。

/*
 * Classify every remaining command-line argument with arg_type() and
 * hand it to the matching sub-parser. SysV is assumed first, because
 * that is the POSIX and Unix98 standard.
 *
 * Dashed (ARG_SYSV) arguments go to the SysV parser unless force_bsd
 * is set, in which case they are parsed as BSD. Undashed (ARG_BSD)
 * arguments always go to the BSD parser, but are rejected when
 * force_bsd is set without PER_FORCE_BSD in personality.
 *
 * Returns NULL when all arguments were consumed, or the first error
 * message encountered.
 */
static const char *parse_all_options(void) {
    const char *err = NULL;
    int kind;

    while(++thisarg < ps_argc) {
        trace("parse_all_options calling arg_type for \"%s\"\n", ps_argv[thisarg]);
        kind = arg_type(ps_argv[thisarg]);
        trace("ps_argv[thisarg] is %s\n", ps_argv[thisarg]);

        switch(kind) {
        case ARG_GNU:
            err = parse_gnu_option();
            break;

        case ARG_SYSV:
            if(!force_bsd) {
                err = parse_sysv_option();
                break;
            }
            /* forced BSD: a dashed argument is handed to the BSD parser */
            prefer_bsd_defaults = 1;
            err = parse_bsd_option();
            break;

        case ARG_BSD:
            if(force_bsd && !(personality & PER_FORCE_BSD)) return _("way bad");
            prefer_bsd_defaults = 1;
            err = parse_bsd_option();
            break;

        case ARG_PGRP:
        case ARG_SESS:
        case ARG_PID:
            prefer_bsd_defaults = 1;
            err = parse_trailing_pids();
            break;

        case ARG_END:
        case ARG_FAIL:
            trace("              FAIL/END on [%s]\n",ps_argv[thisarg]);
            return _("garbage option");

        default:
            printf("                  ?    %s\n",ps_argv[thisarg]);
            return _("something broke");
        }

        if(err) return err;
    }
    return NULL;
}

arg_type的定义如下,即:如果是字母开头,则认为是BSD风格的参数。如果是数字开头,则认为是PID,如果是+开头的,则认为是ARG_SESS类型。如果是其他情况且非-开头,认为是非法符号。然后再看下一个字符,如果是字母开头的,认为是SYSV参数(例如-a),如果是数字开头的,则认为是PGRP,如果其他字符且不是-,认为非法。如果是--开头的,再看第三个字符,是不是字母,如果是的话则认为是GNU参数。

/*
 * Classify one command-line word by its first (up to three) characters.
 *
 *   letter...      -> ARG_BSD       digit...   -> ARG_PID     +...  -> ARG_SESS
 *   -letter...     -> ARG_SYSV      -digit...  -> ARG_PGRP
 *   --letter...    -> ARG_GNU       "--" alone -> ARG_END
 *
 * Anything else yields ARG_FAIL.  Only ASCII letters and digits count.
 */
static int arg_type(const char *str) {
    int dashes;   /* number of leading '-' characters seen so far */

    for (dashes = 0; dashes < 3; dashes++) {
        const int ch = str[dashes];
        const int letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
        const int digit = (ch >= '0' && ch <= '9');

        switch (dashes) {
        case 0:
            if (letter)    return ARG_BSD;
            if (digit)     return ARG_PID;
            if (ch == '+') return ARG_SESS;
            break;
        case 1:
            if (letter)    return ARG_SYSV;
            if (digit)     return ARG_PGRP;
            break;
        default:
            /* third character: no further dash is accepted */
            if (letter)     return ARG_GNU;
            if (ch == '\0') return ARG_END;
            return ARG_FAIL;
        }
        /* only a dash lets us look at the next character */
        if (ch != '-') return ARG_FAIL;
    }
    return ARG_FAIL;   /* not reached: the loop always returns at depth 2 */
}

对比ps的man,即可理解:

DESCRIPTION
       ps displays information about a selection of the active processes.  If you want a
       repetitive update of the selection and the displayed information, use top(1) instead.

       This version of ps accepts several kinds of options:

       1   UNIX options, which may be grouped and must be preceded by a dash.
       2   BSD options, which may be grouped and must not be used with a dash.
       3   GNU long options, which are preceded by two dashes.

总之为了保证兼容性,ps的命令行非常混乱。

加号和数字的作用:

   --sort spec
          Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].  Choose a
          multi-letter key from the STANDARD FORMAT SPECIFIERS section.  The "+" is
          optional since default direction is increasing numerical or lexicographic
          order.  Identical to k.  For example: ps jax --sort=uid,-ppid,+pid

PROCESS SELECTION BY LIST
       These options accept a single argument in the form of a blank-separated or
       comma-separated list.  They can be used multiple times.  For example:
       ps -p "1 2" -p 3,4

       -123   Identical to --pid 123.

       123    Identical to --pid 123.

在所有类型中,ARG_END、ARG_FAIL、default会导致直接退出。

    case ARG_PGRP:
    case ARG_SESS:
    case ARG_PID:
        prefer_bsd_defaults = 1;
        err = parse_trailing_pids();
        break;

PGRP、SESS、PID会使ps进一步解析后面的pid。

/*************** process trailing PIDs  **********************/
/*
 * Treat every word from ps_argv[thisarg] to the end of the command line as
 * an ID: a leading '-' marks a process group, a leading '+' marks a
 * session, anything else is a plain PID.  Each value is validated with
 * parse_pid().
 *
 * On success the non-empty groups are pushed onto selection_list (PID
 * first, then PGRP, then SESS) and NULL is returned; thisarg is left on the
 * last word.  On failure the error message from parse_pid() is returned.
 *
 * The three nodes are only linked into selection_list at the very end, so
 * any node that is not linked (an empty one, or all of them on the error
 * path) is released here rather than leaked.
 */
static const char *parse_trailing_pids(void) {
    selection_node *pidnode;  /* pid */
    selection_node *grpnode;  /* process group */
    selection_node *sidnode;  /* session */
    char **argp;     /* pointer to pointer to text of PID */
    const char *err = NULL;   /* error code that could or did happen */
    int i;

    i = ps_argc - thisarg;  /* how many trailing PIDs, SIDs, PGRPs?? */
    argp = ps_argv + thisarg;
    thisarg = ps_argc - 1;   /* we must be at the end now */

    /* each array is sized for the worst case: all words of one kind */
    pidnode = xmalloc(sizeof(selection_node));
    pidnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    pidnode->n = 0;

    grpnode = xmalloc(sizeof(selection_node));
    grpnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    grpnode->n = 0;

    sidnode = xmalloc(sizeof(selection_node));
    sidnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    sidnode->n = 0;

    while(i--) {
        char *data;
        data = *(argp++);
        switch(*data) {
        default:
            err = parse_pid(  data, pidnode->u + pidnode->n++);
            break;
        case '-':
            err = parse_pid(++data, grpnode->u + grpnode->n++);
            break;
        case '+':
            err = parse_pid(++data, sidnode->u + sidnode->n++);
            break;
        }
        if(err) goto fail;      /* nothing is linked yet: free it all */
    }

    if(pidnode->n) {
        pidnode->next = selection_list;
        selection_list = pidnode;
        selection_list->typecode = SEL_PID;
    } else {
        free(pidnode->u);
        free(pidnode);
    }

    if(grpnode->n) {
        grpnode->next = selection_list;
        selection_list = grpnode;
        selection_list->typecode = SEL_PGRP;
    } else {
        free(grpnode->u);
        free(grpnode);
    }

    if(sidnode->n) {
        sidnode->next = selection_list;
        selection_list = sidnode;
        selection_list->typecode = SEL_SESS;
    } else {
        free(sidnode->u);
        free(sidnode);
    }

    return NULL;

fail:
    free(pidnode->u);
    free(pidnode);
    free(grpnode->u);
    free(grpnode);
    free(sidnode->u);
    free(sidnode);
    return err;
}

解析时要求它是一个1~0x7fffffff的正整数,并按+、-、默认三种情况分别放入sidnode、grpnode、pidnode中。

/*
 * Convert one ID string into ret->pid.
 *
 * The text is read with strtoul() in base 0, so decimal, 0x-prefixed hex
 * and 0-prefixed octal are all accepted.  The whole string must be
 * consumed and the value must lie in 1..0x7fffffff.
 *
 * Returns NULL on success, or a translated error message.
 */
static const char *parse_pid(char *str, sel_union *ret) {
    char *tail;
    const unsigned long value = strtoul(str, &tail, 0);

    if (*tail != '\0')
        return _("process ID list syntax error");
    if (value == 0 || value > 0x7fffffffUL)
        return _("process ID out of range");

    ret->pid = value;
    return NULL;
}

对ARG_GNU而言,处理函数是parse_gnu_option

parser.c:parse_all_options

    case ARG_GNU:
        err = parse_gnu_option();
        break;

parse_gnu_option的开头列出了一组支持的参数。

static const gnu_table_struct gnu_table[] = {
    {"Group",         &&case_Group},       /* rgid */
    {"User",          &&case_User},        /* ruid */
    {"cols",          &&case_cols},
    {"columns",       &&case_columns},
    {"context",       &&case_context},
    {"cumulative",    &&case_cumulative},
    {"deselect",      &&case_deselect},    /* -N */
    {"forest",        &&case_forest},      /* f -H */

这里的case_Group之类的不是什么全局变量,而是本地标签,第一次看到能这么用,很神奇……

        {"version",       &&case_version},
        {"width",         &&case_width},
    };
    const int gnu_table_count = sizeof(gnu_table)/sizeof(gnu_table_struct);

    s = ps_argv[thisarg]+2;
    sl = strcspn(s,":=");
    if(sl > 15) return _("unknown gnu long option");
    strncpy(buf, s, sl);
    buf[sl] = '\0';
    flagptr = s+sl;

    found = bsearch(&findme, gnu_table, gnu_table_count,
                    sizeof(gnu_table_struct), compare_gnu_table_structs
                   );

    if(!found) {
        if (!strcmp(buf, the_word_help))
            goto case_help;
        return _("unknown gnu long option");
    }

    goto *(found->jump);    /* See gcc extension info.  :-)   */

case_Group:
    trace("--Group\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real groups must follow --Group");
    err=parse_list(arg, parse_gid);
    if(err) return err;
    selection_list->typecode = SEL_RGID;
    return NULL;
case_User:
    trace("--User\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real users must follow --User");
    err=parse_list(arg, parse_uid);
    if(err) return err;
    selection_list->typecode = SEL_RUID;
    return NULL;

先逐行读一下代码。s是argv[i] + 2,这是因为ARG_GNU是“--”开头的,跳过前两个字符。sl是:=前的字符数。然后将:=前的内容拷贝到buf中。buf定义为buf[16]所以限制长度不能大于15。在这之后,flagptr就是:=开始的字符。

然后,通过bsearch库函数在gnu_table中搜索findme={buf, NULL}。如果找到就直接跳到对应标签上,这语法也是很离谱。

先看看man手册中对这些参数的定义:

   --cols n
          Set screen width.

   --columns n
          Set screen width.

   --cumulative
          Include some dead child process data (as a sum with the parent).

有但不是全有,比如--Group就不在主词条里面(但在其他词条的描述里有提到)。挑几个比较有特点的读一下好了。

首先是它们的一个通用工具函数grab_gnu_arg,它在所有需要额外参数的,比如--cols n中被使用。

/*
 * Fetch the argument of the GNU long option currently being parsed.
 *
 * flagptr points just past the option name inside ps_argv[thisarg]:
 *   - "=value" or ":value": return the text after the separator
 *     (NULL when that text is empty);
 *   - end of word: use the next command-line word, provided one exists
 *     and is not the empty string, and advance thisarg onto it;
 *   - anything else: NULL.
 */
static const char *grab_gnu_arg(void) {
    const char sep = *flagptr;

    if (sep == '=' || sep == ':') {
        /* argument is part of ps_argv[thisarg] */
        flagptr++;
        return *flagptr ? flagptr : NULL;
    }
    if (sep != '\0')
        return NULL;                       /* something bad */

    /* argument, if any, follows ps_argv[thisarg] */
    if (ps_argc < thisarg + 2)
        return NULL;                       /* there is nothing left */
    if (ps_argv[thisarg + 1][0] == '\0')
        return NULL;
    thisarg++;
    return ps_argv[thisarg];
}
//
//<---->
//
case_cols:
case_width:
case_columns:
    trace("--cols\n");
    arg = grab_gnu_arg();
    if(arg && *arg) {
        long t;
        char *endptr;
        t = strtol(arg, &endptr, 0);
        if(!*endptr && (t>0) && (t<2000000000)) {
            screen_cols = (int)t;
            return NULL;
        }
    }
    return _("number of columns must follow --cols, --width, or --columns");

如果指定的是例如--cols=2、--cols:2,则返回=或:之后的部分(如果该部分为空则返回NULL)。如果当前参数已经到头(\0),则看看下一个参数是否存在且不是空串,如果是,则将thisarg加1并返回下一个参数。

以--cols为例,这里要求参数是一个大于0且小于2000000000的整数,并把它设置为screen_cols(用0x7fffffff不好吗……)。

再挑一个典型。

case_Group:
    trace("--Group\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real groups must follow --Group");
    err=parse_list(arg, parse_gid);
    if(err) return err;
    selection_list->typecode = SEL_RGID;
    return NULL;

--Group这样后面跟一个list的,还需要parse_list来处理列表。

/*
 * Used to parse lists in a generic way. (function pointers)
 *
 * arg is a list of items separated by exactly one blank, tab or comma.
 * Empty items (leading, trailing or doubled separators, or an empty
 * string) are rejected with "improper list".  Each item is passed to
 * parse_fn, which fills one sel_union; the items end up in node->u in
 * reverse order of appearance.
 *
 * On success a new node holding all items is pushed onto selection_list
 * and NULL is returned; the caller is expected to set its typecode.
 * On failure nothing is linked, all temporaries are freed, and the error
 * message is returned.
 */
static const char *parse_list(const char *arg, const char *(*parse_fn)(char *, sel_union *) ) {
    selection_node *node;
    char *buf;                      /* temp copy of arg to hack on */
    char *sep_loc;                  /* separator location: " \t," */
    char *walk;
    size_t arg_size;                /* bytes in arg including the '\0' */
    int items;
    int need_item;
    const char *err;       /* error code that could or did happen */
    /*** prepare to operate ***/
    node = xmalloc(sizeof(selection_node));
    /* one slot per character is an upper bound on the number of items */
    node->u = xmalloc(strlen(arg)*sizeof(sel_union)); /* waste is insignificant */
    node->n = 0;
    /* Copy through xmalloc instead of strdup so the copy is obtained the
       same way as the other allocations here (strdup's result was used
       without a NULL check).  NOTE(review): xmalloc is used unchecked
       throughout, presumably because it never returns NULL -- confirm. */
    arg_size = strlen(arg) + 1;
    buf = xmalloc(arg_size);
    memcpy(buf, arg, arg_size);
    /*** sanity check and count items ***/
    need_item = 1; /* true */
    items = 0;
    walk = buf;
    err = _("improper list");
    do {
        switch(*walk) {
        case ' ':
        case ',':
        case '\t':
        case '\0':
            /* a separator (or the end of an empty string) with no item before it */
            if(need_item) goto parse_error;
            need_item=1;
            break;
        default:
            if(need_item) items++;
            need_item=0;
        }
    } while (*++walk);
    if(need_item) goto parse_error;     /* list ended on a separator */
    node->n = items;
    /*** actually parse the list ***/
    walk = buf;
    while(items--) {
        sep_loc = strpbrk(walk," ,\t");
        if(sep_loc) *sep_loc = '\0';
        if(( err=(parse_fn)(walk, node->u+items) )) goto parse_error;
        /* No separator means this was the last item; stop here rather
           than forming NULL + 1, which is undefined pointer arithmetic. */
        if(!sep_loc) break;
        walk = sep_loc + 1; /* point to next item */
    }
    free(buf);
    node->next = selection_list;
    selection_list = node;
    return NULL;
parse_error:
    free(buf);
    free(node->u);
    free(node);
    return err;
}

逐字查找,如果是逗号或者空白,则标记need_item,它们后面必须跟其他字符,统计一共有多少段。
然后设置node->n为计算到的总数。从头开始扫描空白或逗号,并对扫出来的每一段调用parse_fn。parse_fn是传入的参数之一,看一下典型的parse_fn,这个是"C"参数传入的parse_fn,它将str拷贝到ret->cmd(即(node->u + items)->cmd)中,最多拷贝sizeof ret->cmd个字节。

/*
 * Store a command name into ret->cmd, truncating it to fit.
 * The buffer is zero-padded after the name and its last byte is always
 * '\0'.  Always succeeds (returns NULL).
 */
static const char *parse_cmd(char *str, sel_union *ret) {
    const size_t last = sizeof(ret->cmd) - 1;

    /* strncpy zero-fills whatever it does not copy into */
    strncpy(ret->cmd, str, last);
    ret->cmd[last] = '\0';
    return NULL;
}

关于sel_union->cmd,在common.h中有定义。sizeof ret->cmd必然也就是64了。在parse_list的开头注意有:“node->u = xmalloc(strlen(arg)*sizeof(sel_union));”,strlen(arg)是整个列表字符串的长度(字符数),它一定不小于列表的项数,因此这里会按“字符串长度×sizeof(sel_union)”(每项至少64字节)分配数组给node->u。

/*
 * One selection value.  All members share the same storage; presumably the
 * typecode of the owning selection_node tells which member is meaningful
 * (verify against the code that reads the list).
 */
typedef union sel_union {
    pid_t pid;
    pid_t ppid;
    uid_t uid;
    gid_t gid;
    dev_t tty;
    /* NOTE(review): parse_cmd() does force a '\0' into the last byte, so
       the remark below may describe how readers compare it -- confirm. */
    char  cmd[64];  /* this is _not_ \0 terminated */
} sel_union;

/*
 * One entry of the singly linked selection list (head: selection_list).
 * Each node carries an array of n values of one kind, identified by
 * typecode (e.g. SEL_PID, SEL_PGRP, SEL_SESS, SEL_RGID, SEL_RUID).
 */
typedef struct selection_node {
    struct selection_node *next;  /* next node, or NULL at the end of the list */
    sel_union *u;  /* used if selection type has a list of values */
    int n;         /* used if selection type has a list of values */
    int typecode;  /* SEL_* constant; set by the code that links the node */
} selection_node;

这篇已经足够长了,而且已经看完了parse_gnu_option部分。下一篇看看剩余的两个 sysv_option 和 bsd_option。

    case ARG_SYSV:
        if(!force_bsd) {  /* else go past case ARG_BSD */
            err = parse_sysv_option();
            break;

            case ARG_BSD:
                if(force_bsd && !(personality & PER_FORCE_BSD)) return _("way bad");
        }
        prefer_bsd_defaults = 1;
        err = parse_bsd_option();
        break;

psproc源码阅读 - 1

好久没有写文章了(2021年一年都没写……),随便开点新坑,从简单的代码来读起。
从psproc工程下的ps代码开始。

display.c:

    /***** no comment */
int main(int argc, char *argv[]) {  
    atexit(close_stdout);
    myname = strrchr(*argv, '/');
    if (myname) ++myname;
    else myname = *argv;
    Hertz = procps_hertz_get();

    setlocale (LC_ALL, "");
    bindtextdomain(PACKAGE, LOCALEDIR);
    textdomain(PACKAGE);
    setenv("TZ", ":/etc/localtime", 0);

先从main看起,首先,atexit函数设置close_stdout为其退出时的处理函数,这个库函数确实少见。

ATEXIT(3)                         Linux Programmer's Manual                        ATEXIT(3)
NAME
       atexit - register a function to be called at normal process termination
SYNOPSIS
       #include <stdlib.h>
       int atexit(void (*function)(void));

然后搜索argv[0],并找到"/"之后的内容作为自己的文件名,如果没有就直接用argv[0]。
procps_hertz_get用于获取系统每秒的时钟滴答数(sysconf(_SC_CLK_TCK),注意这不是CPU的主频),如果获取失败返回100。
然后设置区域信息,并设置环境变量TZ为/etc/localtime。

然后是一段信号处理的代码。将黑名单以外的信号都交给signal_handler处理。

#ifdef DEBUG
    init_stack_trace(argv[0]);
#else
    do {
        struct sigaction sa;
        int i = 32;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = signal_handler;
        sigfillset(&sa.sa_mask);
        while(i--) switch(i) {
            default:
                sigaction(i,&sa,NULL);
            case 0:
            case SIGCONT:
            case SIGINT:   /* ^C */
            case SIGTSTP:  /* ^Z */
            case SIGTTOU:  /* see stty(1) man page */
            case SIGQUIT:  /* ^\ */
            case SIGPROF:  /* profiling */
            case SIGKILL:  /* can not catch */
            case SIGSTOP:  /* can not catch */
            case SIGWINCH: /* don't care if window size changes */
            case SIGURG:   /* Urgent condition on socket (4.2BSD) */
                ;
            }
    } while (0);
#endif

接下来是几个相对比较重要的处理代码。

reset_global();  /* must be before parser */
arg_parse(argc,argv);

/* check for invalid combination of arguments */
arg_check_conflicts();

/*  arg_show(); */
trace("screen is %ux%u\n",screen_cols,screen_rows);
/*  printf("sizeof(proc_t) is %d.\n", sizeof(proc_t)); */
trace("======= ps output follows =======\n");

首先是reset_global()。global.c:reset_global用于初始化所有的全局变量。我们依次阅读它的代码。

global.c

/************ Call this to reinitialize everything ***************/
void reset_global(void) {
    proc_t *p;
    int i;

    reset_selection_list();

>>

/*
 * Empty the global selection list.
 *
 * 0xdeadbeef is the sentinel the list head holds before first use; in that
 * case there is nothing to free.  Otherwise every node and its value array
 * are released.  Either way the head ends up NULL.
 */
static void reset_selection_list(void) {
    selection_node *cur;
    selection_node *following;

    if (selection_list != (selection_node *)0xdeadbeef) {
        for (cur = selection_list; cur; cur = following) {
            following = cur->next;   /* grab the link before freeing */
            free(cur->u);
            free(cur);
        }
    }
    selection_list = NULL;
}

它首先调用reset_selection_list,如果selection_list无效(0xdeadbeef)则置空,如果有内容,则挨个释放并将其置空。这个值并不是编译器或者内存管理器置的,而是它自己初始化的时候设置的:

selection_node *selection_list = (selection_node *)0xdeadbeef;

回到reset_global中,

/************ Call this to reinitialize everything ***************/
void reset_global(void) {
    proc_t *p;
    int i;

    reset_selection_list();

// --- <pids> interface --------------------------------------------------
    if (!Pids_items)
        Pids_items = xcalloc(PIDSITEMS, sizeof(enum pids_item));

    for (i = 0; i < PIDSITEMS; i++)
        Pids_items[i] = PIDS_noop;

    if (!Pids_info) {
        if (procps_pids_new(&Pids_info, Pids_items, i)) {
            fprintf(stderr, _("fatal library error, context\n"));
            exit(EXIT_FAILURE);
        }
    }

    Pids_items[0] = PIDS_TTY;
    procps_pids_reset(Pids_info, Pids_items, 1);
    if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
        fprintf(stderr, _("fatal library error, lookup self\n"));
        exit(EXIT_FAILURE);
    }

接下来是另一个重要的结构,Pids_items,它是一个全局变量,类型为“enum pids_item*”。 xcalloc是psproc自己的wrap,就是calloc加了个检查,可以认为二者相同。PIDSITEMS为70,注释里写道70是拍脑袋的数字。
因此首先,它为Pids_items分配70个pids_item,然后将其初始化为“PIDS_noop”。PIDS_noop是enum pids_item的第一项。

接下来是对Pids_info的初始化。pids.c:procps_pids_new用来初始化Pids_info结构体。这也是一个大型函数,我们把它抽出来看:

pids.c

/*
 * procps_pids_new():
 *
 * Allocate and initialise a new pids_info context and store it in *info.
 *
 * info      address of a pointer that must currently be NULL
 * items     optional array of items to track (may be NULL)
 * numitems  number of entries in items
 *
 * Returns 0 on success, -EINVAL when info is NULL, *info is already set or
 * the items fail validation, and -ENOMEM when an allocation fails.  On any
 * failure everything allocated here is released and *info is left untouched.
 */
PROCPS_EXPORT int procps_pids_new (
    struct pids_info **info,
    enum pids_item *items,
    int numitems)
{
    struct pids_info *p;
    double uptime_secs;
    int pgsz;

#ifdef ITEMTABLE_DEBUG
    ... (Removed) ...
#endif

    if (info == NULL || *info != NULL)
        return -EINVAL;
    /* calloc: every field of the context starts out zero / NULL */
    if (!(p = calloc(1, sizeof(struct pids_info))))
        return -ENOMEM;

    /* if we're without items or numitems, a later call to
       procps_pids_reset() will become mandatory */
    if (items && numitems) {
        if (pids_items_check_failed(items, numitems)) {
            free(p);
            return -EINVAL;
        }
        // allow for our PIDS_logical_end
        p->maxitems = numitems + 1;
        if (!(p->items = calloc(p->maxitems, sizeof(enum pids_item)))) {
            free(p);
            return -ENOMEM;
        }
        /* private copy of the caller's items, terminated by the sentinel */
        memcpy(p->items, items, sizeof(enum pids_item) * numitems);
        p->items[numitems] = PIDS_logical_end;
        p->curitems = p->maxitems;
        pids_libflags_set(p);
    }

    /* history bookkeeping: one header plus two arrays of NEWOLD_INIT
       entries; the || chain stops at the first failed allocation */
    if (!(p->hist = calloc(1, sizeof(struct history_info)))
            || (!(p->hist->PHist_new = calloc(NEWOLD_INIT, sizeof(HST_t))))
            || (!(p->hist->PHist_sav = calloc(NEWOLD_INIT, sizeof(HST_t))))) {
        free(p->items);   /* NULL when no items were supplied above */
        if (p->hist) {
            free(p->hist->PHist_sav);  // this & next might be NULL ...
            free(p->hist->PHist_new);
            free(p->hist);
        }
        free(p);
        return -ENOMEM;
    }
    p->hist->HHist_siz = NEWOLD_INIT;
    pids_config_history(p);

    /* count how many times the page size must be halved to reach 1024,
       i.e. the shift that converts a page count into KiB */
    pgsz = getpagesize();
    while (pgsz > 1024) {
        pgsz >>= 1;
        p->pgs2k_shift++;
    }
    p->hertz = procps_hertz_get();

    // in case 'fatal_proc_unmounted' wasn't called and /proc isn't mounted
    /* NOTE(review): this assigns when procps_uptime() returns 0 OR a
       negative value; if negative means failure, uptime_secs would be
       read uninitialised -- confirm the return convention. */
    if (0 >= procps_uptime(&uptime_secs, NULL))
        p->boot_seconds = uptime_secs;

    numa_init();

    p->fetch.results.counts = &p->fetch.counts;

    /* hand the finished context to the caller with a single reference */
    p->refcount = 1;
    *info = p;
    return 0;
} // end: procps_pids_new

首先是为struct pids_info *p;赋予初始值的p = calloc(1, sizeof(struct pids_info)),然后对传入的items、numitems进行处理。

        if (pids_items_check_failed(items, numitems)) {
            free(p);
            return -EINVAL;
        }

调用的pids_items_check_failed用于检查传入的item是否合法。传入的如果不是enum pids_item*指针(而是enum的值),则在这里返回错误(<0x8000的值认为非法)。合法的话,检查是不是每项都在enum范围内。

/*
 * Validate an items array before it is copied into a context.
 * Returns 1 when the request must be rejected, 0 when it is acceptable.
 */
static inline int pids_items_check_failed (
    enum pids_item *items,
    int numitems)
{
    int idx;

    if (numitems < 1)
        return 1;

    /* A caller who passes an enumerator itself rather than the address of
     * an array of them gets no diagnostic from gcc (clang does warn about
     * the int-to-pointer conversion).  Such a bogus "pointer" has a tiny
     * numeric value, so anything below 0x8000 -- twice as big as our
     * largest enum -- is treated as that mistake.
     */
    if ((void *)items < (void *)0x8000)
        return 1;

    for (idx = 0; idx < numitems; idx++) {
        // the < 0 half only matters should pids_item ever become signed
        if (items[idx] < 0 || items[idx] >= PIDS_logical_end)
            return 1;
    }
    return 0;
} // end: pids_items_check_failed

检查通过以后,分配对应的项目并将值复制到p中。

        // allow for our PIDS_logical_end
        p->maxitems = numitems + 1;
        if (!(p->items = calloc(p->maxitems, sizeof(enum pids_item)))) {
            free(p);
            return -ENOMEM;
        }
        memcpy(p->items, items, sizeof(enum pids_item) * numitems);
        p->items[numitems] = PIDS_logical_end;
        p->curitems = p->maxitems;
        pids_libflags_set(p);

然后是另一部分的初始化。如果hist的任何一部分初始化失败了,则释放里面已申请的内容。pids_config_history用于初始化HHash_one和HHash_two(初始化为HHash_nul),并修改PHash_save为HHash_one,PHash_new为HHash_two。

if (!(p->hist = calloc(1, sizeof(struct history_info)))
        || (!(p->hist->PHist_new = calloc(NEWOLD_INIT, sizeof(HST_t))))
        || (!(p->hist->PHist_sav = calloc(NEWOLD_INIT, sizeof(HST_t))))) {
    free(p->items);
    if (p->hist) {
        free(p->hist->PHist_sav);  // this & next might be NULL ...
        free(p->hist->PHist_new);
        free(p->hist);
    }
    free(p);
    return -ENOMEM;
}
p->hist->HHist_siz = NEWOLD_INIT;
pids_config_history(p);

pgsz = getpagesize();
while (pgsz > 1024) {
    pgsz >>= 1;
    p->pgs2k_shift++;
}
p->hertz = procps_hertz_get();

最后是一些收尾的。procps_uptime用于读取/proc/uptime来获取系统的uptime和idle time。numa_init用于初始化numa(Non Uniform Memory Access, libnuma.so/libnuma.so.1)。

    // in case 'fatal_proc_unmounted' wasn't called and /proc isn't mounted
    if (0 >= procps_uptime(&uptime_secs, NULL))
        p->boot_seconds = uptime_secs;

    numa_init();

    p->fetch.results.counts = &p->fetch.counts;

    p->refcount = 1;
    *info = p;
    return 0;
} // end: procps_pids_new

这个大函数终于结束了。回到我们最开始的reset_global的后半部分中:

Pids_items[0] = PIDS_TTY;
procps_pids_reset(Pids_info, Pids_items, 1);
if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
    fprintf(stderr, _("fatal library error, lookup self\n"));
    exit(EXIT_FAILURE);
}

不过我们只能在这里短暂停留,因为procps_pids_reset也是一个大函数。

pids.c:

PROCPS_EXPORT int procps_pids_reset (
    struct pids_info *info,
    enum pids_item *newitems,
    int newnumitems)
{
    if (info == NULL || newitems == NULL)
        return -EINVAL;
    if (pids_items_check_failed(newitems, newnumitems))
        return -EINVAL;

    pids_cleanup_stacks_all(info);

pids_cleanup_stacks_all函数的定义如下,它对info->extents链表中的每个ext,遍历其stacks数组并调用pids_cleanup_stack。目标是info->extents->stacks[i]->head。 stacks顾名思义是一个栈结构。pids_cleanup_stack对每个项目查找Item_table中对应的freefunc,并使用freefunc来释放它们。

/*
 * Run pids_cleanup_stack() on the head of every stack in every extent
 * chained from info->extents.  Each extent's stacks array is walked up to
 * its NULL terminator.
 */
static inline void pids_cleanup_stacks_all (
    struct pids_info *info)
{
    struct stacks_extent *extent;

    for (extent = info->extents; extent; extent = extent->next) {
        int n = 0;

        while (extent->stacks[n]) {
            pids_cleanup_stack(extent->stacks[n]->head);
            n++;
        }
    }
} // end: pids_cleanup_stacks_all

>>>

/*
 * Reset every result in one stack, stopping at the first entry whose item
 * is PIDS_logical_end or beyond.  For each entry the item's freefunc, if
 * it has one, is called first; then the stored value is zeroed.
 */
static inline void pids_cleanup_stack (
    struct pids_result *this)
{
    enum pids_item item;

    while ((item = this->item) < PIDS_logical_end) {
        if (Item_table[item].freefunc)
            Item_table[item].freefunc(this);
        this->result.ull_int = 0;
        ++this;
    }
} // end: pids_cleanup_stack

Item_table的内容类似:

static struct {
    SET_t    setsfunc;            // the actual result setting routine
#ifdef ITEMTABLE_DEBUG
    int      enumnumb;            // enumerator (must match position!)
    char    *enum2str;            // enumerator name as a char* string
#endif
    unsigned oldflags;            // PROC_FILLxxxx flags for this item
    FRE_t    freefunc;            // free function for strings storage
    QSR_t    sortfunc;            // sort cmp func for a specific type
    int      needhist;            // a result requires history support
    char    *type2str;            // the result type as a string value
} Item_table[] = {
    /*    setsfunc               oldflags    freefunc   sortfunc       needhist  type2str
          ---------------------  ----------  ---------  -------------  --------  ----------- */
    { RS(noop),              0,          NULL,      QS(noop),      0,        TS_noop     }, // user only, never altered
    { RS(extra),             0,          NULL,      QS(ull_int),   0,        TS_noop     }, // user only, reset to zero

    { RS(ADDR_CODE_END),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CODE_START),   f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CURR_EIP),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CURR_ESP),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_STACK_START),  f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },

这些freefunc实际上也就是对不同类型的东西调用其free。其实这里就是用c实现了一套接口,谁叫这不是用c++写的呢。

/* Release the string held by a result; free() ignores a NULL pointer. */
static void freNAME(str) (struct pids_result *R) {
    free(R->result.str);
}

/* Release the block that the first vector slot points to, when a vector
   exists; only *strv is freed here, not the vector itself. */
static void freNAME(strv) (struct pids_result *R) {
    if (R->result.strv)
        free(*R->result.strv);   /* free() ignores a NULL first slot */
}

再回到procps_pids_reset中:

    /* shame on this caller, they didn't change anything. and unless they have
       altered the depth of the stacks we're not gonna change anything either! */
    if (info->curitems == newnumitems + 1
            && !memcmp(info->items, newitems, sizeof(enum pids_item) * newnumitems))
        return 0;

    if (info->maxitems < newnumitems + 1) {
        while (info->extents) {
            struct stacks_extent *p = info->extents;
            info->extents = p->next;
            free(p);
        };
        if (info->get_ext) {
            pids_oldproc_close(&info->get_PT);
            info->get_ext = NULL;
        }
        if (info->fetch.anchor) {
            free(info->fetch.anchor);
            info->fetch.anchor = NULL;
        }
        // allow for our PIDS_logical_end
        info->maxitems = newnumitems + 1;
        if (!(info->items = realloc(info->items, sizeof(enum pids_item) * info->maxitems)))
            return -ENOMEM;
    }

    memcpy(info->items, newitems, sizeof(enum pids_item) * newnumitems);
    info->items[newnumitems] = PIDS_logical_end;
    // account for above PIDS_logical_end
    info->curitems = newnumitems + 1;

    // if extents were freed above, this next guy will have no effect
    // so we'll rely on pids_stacks_alloc() to itemize ...
    pids_itemize_stacks_all(info);
    pids_libflags_set(info);

    return 0;
} // end: procps_pids_reset

剩余的代码相对就没那么复杂了。作者抽风写的注释也能解释很多,首先是第一个if判断,当调用时items的数量没有变,且内容也没有变的时候就什么都不做。如果当前的容量已经不够了,把栈区多余的内容释放,停止扫描进程表,释放fetch.anchor,并扩展max_items,拷贝newitems到原始的内容中。最后,调用pids_itemize_stacks_all。老实说这个函数在干什么我暂时也不太清楚,先留着坑后面再看看(可能是给top用的,不是给ps用的)。最后,设置flags并完成函数功能。

回到reset_global,继续看下一个大函数fatal_proc_unmounted。

if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
    fprintf(stderr, _("fatal library error, lookup self\n"));
    exit(EXIT_FAILURE);
}

fatal_proc_unmounted为每个pids结构分配一个栈结构,并初始化相关结构体。不细看了,后面碰到有用相关结构的时候再回头看看。在pids接口的相关内容处理完成后,reset_global接下来的内容比较轻松:

    set_screen_size();
    set_personality();

    all_processes         = 0;
    bsd_c_option          = 0;
    bsd_e_option          = 0;
    cached_euid           = geteuid();
    cached_tty            = PIDS_VAL(0, s_int, p, Pids_info);
    /* forest_prefix must be all zero because of POSIX */
    forest_type           = 0;
    format_flags          = 0;   /* -l -f l u s -j... */
    format_list           = NULL; /* digested formatting options */
    format_modifiers      = 0;   /* -c -j -y -P -L... */
    header_gap            = -1;  /* send lines_to_next_header to -infinity */
    header_type           = HEAD_SINGLE;
    include_dead_children = 0;
    lines_to_next_header  = 1;
    negate_selection      = 0;
    page_size             = getpagesize();
    running_only          = 0;
    selection_list        = NULL;
    simple_select         = 0;
    sort_list             = NULL;
    thread_flags          = 0;
    unix_f_option         = 0;
    user_is_number        = 0;
    wchan_is_number       = 0;
    /* Translation Note:
       . The following translatable word will be used to recognize the
       . user's request for help text.  In other words, the translation
       . you provide will alter program behavior.
       .
       . It must be limited to 15 characters or less.
       */
    the_word_help         = _("help");
}

基本就是把全局变量都给初始化了。psproc的这些全局变量命名长得和局部变量一样挺讨厌的,好在软件标注看起来还不那么难受。

还记得我们是从main过来的么……难受的reset_global看完了以后,回到main中继续阅读剩余的代码。

跟踪qemu-kvm下的磁盘写入

傻了,上一个调试的时候没加-enable-kvm,而且电脑的虚拟化也是关着的。假装无事发生过,一切调整就绪后,重新在KVM模式下调试。终于在另一台linux老爷机上装好了qemu和各种软件,继续从这里来,qcow2_pre_write_overlap_check下个断点,这里的栈和TCG模式一样,继续操作,b blk_aio_prwv

(gdb) bt
#0  qcow2_pre_write_overlap_check (bs=0x558eef1841a0, ign=0, offset=1670656, 
    size=4096, data_file=true) at block/qcow2-refcount.c:2817
#1  0x0000558eedcb12e6 in qcow2_co_pwritev_part (bs=0x558eef1841a0, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/qcow2.c:2513
#2  0x0000558eedcfe0de in bdrv_driver_pwritev (bs=0x558eef1841a0, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/io.c:1171
#3  0x0000558eedd000a5 in bdrv_aligned_pwritev (child=0x558eef191900, 
    req=0x7fa0b8acae10, offset=1879080448, bytes=4096, align=1, 
    qiov=0x7fa0e4236760, qiov_offset=0, flags=0) at block/io.c:1980
#4  0x0000558eedd0087f in bdrv_co_pwritev_part (child=0x558eef191900, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/io.c:2137
#5  0x0000558eedce6f6d in blk_co_pwritev_part (blk=0x558eef183e40, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/block-backend.c:1211
#6  0x0000558eedce6fbf in blk_co_pwritev (blk=0x558eef183e40, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, flags=0)
    at block/block-backend.c:1221
#7  0x0000558eedce7795 in blk_aio_write_entry (opaque=0x7fa0e4238780)
    at block/block-backend.c:1415
#8  0x0000558eedddcc2f in coroutine_trampoline (i0=-467430144, i1=32672)
    at util/coroutine-ucontext.c:115
#9  0x00007fa0f56c8000 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007fa0e9cbad90 in ?? ()
#11 0x0000000000000000 in ?? ()

断下来以后,可以看到,除了上层的消息循环变成了kvm的,后面都是一样,通过直接向ioport写数据,然后转移到对应的后端处理函数中。差不多就调试完了,后面开设了一个网站督促自己读代码,qemu.world,等我想起来就更新。

(gdb) bt
#0  blk_aio_prwv (blk=0x558eef183e40, offset=0, bytes=0, iobuf=0x0, 
    co_entry=0x558eedce7a28 <blk_aio_flush_entry>, flags=0, 
    cb=0x558eedaad47c <ide_flush_cb>, opaque=0x558eefc24730)
    at block/block-backend.c:1360
#1  0x0000558eedce7ab1 in blk_aio_flush (blk=0x558eef183e40, 
    cb=0x558eedaad47c <ide_flush_cb>, opaque=0x558eefc24730)
    at block/block-backend.c:1503
#2  0x0000558eedaad5da in ide_flush_cache (s=0x558eefc24730)
    at hw/ide/core.c:1088
#3  0x0000558eedaae5b3 in cmd_flush_cache (s=0x558eefc24730, cmd=231 '\347')
    at hw/ide/core.c:1554
#4  0x0000558eedaaf8c5 in ide_exec_cmd (bus=0x558eefc246b0, val=231)
    at hw/ide/core.c:2085
#5  0x0000558eedaaddef in ide_ioport_write (opaque=0x558eefc246b0, addr=503, 
    val=231) at hw/ide/core.c:1294
#6  0x0000558eed85cd3f in portio_write (opaque=0x558eefcbff30, addr=7, 
    data=231, size=1) at /home/leon/qemu-4.2.0/ioport.c:201
#7  0x0000558eed861fbc in memory_region_write_accessor (mr=0x558eefcbff30, 
    addr=7, value=0x7fa0e9cbb818, size=1, shift=0, mask=255, attrs=...)
    at /home/leon/qemu-4.2.0/memory.c:483
#8  0x0000558eed8621a6 in access_with_adjusted_size (addr=7, 
    value=0x7fa0e9cbb818, size=1, access_size_min=1, access_size_max=4, 
    access_fn=0x558eed861efc <memory_region_write_accessor>, 
    mr=0x558eefcbff30, attrs=...) at /home/leon/qemu-4.2.0/memory.c:544
#9  0x0000558eed8650d7 in memory_region_dispatch_write (mr=0x558eefcbff30, addr=7, data=231, op=MO_8, attrs=...) at /home/leon/qemu-4.2.0/memory.c:1475
#10 0x0000558eed803386 in flatview_write_continue (fv=0x7fa0e410c970, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1, addr1=7, l=1, mr=0x558eefcbff30) at /home/leon/qemu-4.2.0/exec.c:3129
#11 0x0000558eed8034cb in flatview_write (fv=0x7fa0e410c970, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1) at /home/leon/qemu-4.2.0/exec.c:3169
#12 0x0000558eed803818 in address_space_write (as=0x558eee7a4b60 <address_space_io>, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1) at /home/leon/qemu-4.2.0/exec.c:3259
#13 0x0000558eed803885 in address_space_rw (as=0x558eee7a4b60 <address_space_io>, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1, is_write=true) at /home/leon/qemu-4.2.0/exec.c:3269
#14 0x0000558eed87cf9f in kvm_handle_io (port=503, attrs=..., data=0x7fa0f86ac000, direction=1, size=1, count=1) at /home/leon/qemu-4.2.0/accel/kvm/kvm-all.c:2104
#15 0x0000558eed87d737 in kvm_cpu_exec (cpu=0x558eef1b29b0) at /home/leon/qemu-4.2.0/accel/kvm/kvm-all.c:2350
#16 0x0000558eed853017 in qemu_kvm_cpu_thread_fn (arg=0x558eef1b29b0) at /home/leon/qemu-4.2.0/cpus.c:1318
#17 0x0000558eeddc042b in qemu_thread_start (args=0x558eef1da7e0) at util/qemu-thread-posix.c:519
#18 0x00007fa0f5a2a4a4 in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#19 0x00007fa0f576cd0f in clone () from /lib/x86_64-linux-gnu/libc.so.6

调试qemu 硬盘io的过程

好久没有水文章了……在家无聊,正好最近也是在研究虚拟化相关的东西,就调一调qemu中文件写入的流程吧。

这里说的写入是指:qemu启动的虚拟机中如果发生了文件IO,qemu是如何知道要更新对应的虚拟磁盘文件的?qemu这方面我比较菜,说实话,刚接触不到1周,感觉能水的文章还是挺多的。而且本篇大概率会有错误……反正不管,先从这个开始吧。

先粘一下编译选项,后面换机器不用再找了……直接复制
./configure --target-list=x86_64-softmmu --enable-kvm --enable-debug --enable-debug-info --enable-modules --enable-vnc --disable-strip

为了方便调试,我将qemu启动的虚拟机设置成为TinyCore Linux(http://www.tinycorelinux.net/)。毕竟现在我还在老家,搞不到Linux电脑,实际的调试环境是Windows上跑一个VirtualBox,里面跑个Linux,Linux再跑Qemu,如果是比较完整的Linux,估计我这台老爷机得卡死,所以一切最简化,用这个Linux安装一个命令行版的就可以了。

(后记:因为我启动参数配置错误,整个虚拟机跑在tcg模式下,性能依旧很慢,不过先不管这些,直接看看tcg下是如何通知到硬盘写入操作的,是否和kvm不同。)

我为虚拟机设置的磁盘格式是qcow2格式,然而问题来了,我该从哪里下手,换言之,我该断哪个函数?众所周知,也可能不知,与块设备相关的文件大部分位于block/下面。于是直接在block/下搜索qcow2 AND write,很快,发现几个函数,其中一个是qcow2_pre_write_overlap_check,看起来是一个很有用的校验函数。gdb挂上qemu后下个断点,很快地,就能断到它。

Thread 5 (Thread 0x7f8f31d33700 (LWP 23615)):
#0  0x0000562359abf4f0 in qcow2_pre_write_overlap_check (bs=0x56235abb8280, ign=0, offset=359936, size=4096, data_file=true) at block/qcow2-refcount.c:2817
#1  0x0000562359ab132a in qcow2_co_pwritev_part (bs=0x56235abb8280, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/qcow2.c:2513
#2  0x0000562359afe694 in bdrv_driver_pwritev (bs=0x56235abb8280, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:1171
#3  0x0000562359b0066a in bdrv_aligned_pwritev (child=0x56235aa76db0, req=0x7f8f183e9e10, offset=32256, bytes=4096, align=1, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:1980
#4  0x0000562359b00e44 in bdrv_co_pwritev_part (child=0x56235aa76db0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:2137
#5  0x0000562359ae736b in blk_co_pwritev_part (blk=0x56235aaa6ed0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/block-backend.c:1211
#6  0x0000562359ae73bd in blk_co_pwritev (blk=0x56235aaa6ed0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, flags=0) at block/block-backend.c:1221
#7  0x0000562359ae7b93 in blk_aio_write_entry (opaque=0x7f8f14024650) at block/block-backend.c:1415
#8  0x0000562359beafcb in coroutine_trampoline (i0=335845504, i1=32655) at util/coroutine-ucontext.c:115
#9  0x00007f8f504286b0 in __start_context () at /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007f8f31d2ef80 in  ()
#11 0x0000000000000000 in  ()

coroutine_trampoline是qemu实现协程的主要函数,而这个协程的入口函数则是blk_aio_write_entry。

搜索对blk_aio_write_entry的引用,可以发现仅有这两处引用:

block-backend.c
1424    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1428                        blk_aio_write_entry, flags, cb, opaque);

分别位于

1424:
blk_aio_pwrite_zeroes -> blk_aio_prwv

1428:
blk_aio_pwritev -> blk_aio_prwv

而在blk_aio_prwv中,可以明显地看到这个协程的创建过程。

/*
 * Start an asynchronous request on @blk by running @co_entry in a new
 * coroutine.
 *
 * @blk:      backend the request is issued on
 * @offset:   stored in the request's rwco.offset
 * @bytes:    stored in acb->bytes
 * @iobuf:    stored in rwco.iobuf; the callers quoted in this post pass
 *            either NULL or a qiov here
 * @co_entry: coroutine entry function; it receives the acb as its argument
 * @flags:    stored in rwco.flags
 * @cb:       passed to blk_aio_get() together with @opaque
 * @opaque:   passed to blk_aio_get() together with @cb
 *
 * Returns a pointer to the 'common' member of the acb obtained from
 * blk_aio_get().
 */
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
                                void *iobuf, CoroutineEntry co_entry,
                                BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque) {
    BlkAioEmAIOCB *acb;
    Coroutine *co;

    /* NOTE(review): presumably balanced by a matching decrement on the
     * completion path, which is not visible here -- confirm. */
    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    /* Compound literal: any BlkRwCo member not named below is zeroed. */
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
        .iobuf  = iobuf,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
    acb->bytes = bytes;
    /* Cleared before the coroutine is entered and set again right after
     * bdrv_coroutine_enter() returns; presumably the completion code uses
     * it to tell whether this function is still on the stack -- confirm. */
    acb->has_returned = false;

    /* HERE: the coroutine is created, with co_entry as its entry point
     * and acb as its argument, and is then entered. */
    co = qemu_coroutine_create(co_entry, acb);
    bdrv_coroutine_enter(blk_bs(blk), co);

    acb->has_returned = true;
    /* ret no longer holds the NOT_DONE sentinel set above, so a result was
     * stored while the coroutine ran; schedule blk_aio_complete_bh on the
     * backend's AioContext as a one-shot bottom half. */
    if (acb->rwco.ret != NOT_DONE) {
        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
                                         blk_aio_complete_bh, acb);
    }

    return &acb->common; }

协程非常类似于线程。但是协程是协作式多任务的,而线程通常是抢占式多任务的。这意味着协程提供的是并发性而非并行性。
知道协程的创建位置就好办了,继续往上层的blk_aio_prwv挂断点。

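很快,我们可以拿到这样的栈,而且是带消息循环的栈,大致就能知道断点下对了。(注意:下面这次断下来的其实是一次读请求,co_entry是blk_aio_read_entry,上层是blk_aio_preadv和dma_blk_read;写请求应当走类似的路径,只是入口换成对应的write函数。)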

#0  blk_aio_prwv (blk=0x55a4a09c5800, offset=0, bytes=4096, iobuf=0x7f1dc8036c60, co_entry=0x55a49e41d9d0 <blk_aio_read_entry>, flags=0, cb=0x55a49e0ddbc2 <dma_blk_cb>, opaque=0x7f1dc8036c00)
    at block/block-backend.c:1360
#1  0x000055a49e41ddc5 in blk_aio_preadv (blk=0x55a4a09c5800, offset=0, qiov=0x7f1dc8036c60, flags=0, cb=0x55a49e0ddbc2 <dma_blk_cb>, opaque=0x7f1dc8036c00) at block/block-backend.c:1479
#2  0x000055a49e0de16a in dma_blk_read_io_func (offset=0, iov=0x7f1dc8036c60, cb=0x55a49e0ddbc2 <dma_blk_cb>, cb_opaque=0x7f1dc8036c00, opaque=0x55a4a09c5800) at dma-helpers.c:243
#3  0x000055a49e0dde9a in dma_blk_cb (opaque=0x7f1dc8036c00, ret=0) at dma-helpers.c:168
#4  0x000055a49e0de119 in dma_blk_io (ctx=0x55a4a08876d0, sg=0x55a4a171b788, offset=0, align=512, io_func=0x55a49e0de11f <dma_blk_read_io_func>, io_func_opaque=0x55a4a09c5800, 
    cb=0x55a49e1cadf1 <ide_dma_cb>, opaque=0x55a4a171b460, dir=DMA_DIRECTION_FROM_DEVICE) at dma-helpers.c:232
#5  0x000055a49e0de1c7 in dma_blk_read (blk=0x55a4a09c5800, sg=0x55a4a171b788, offset=0, align=512, cb=0x55a49e1cadf1 <ide_dma_cb>, opaque=0x55a4a171b460) at dma-helpers.c:250
#6  0x000055a49e1cb11f in ide_dma_cb (opaque=0x55a4a171b460, ret=0) at hw/ide/core.c:915
#7  0x000055a49e1d4d79 in bmdma_cmd_writeb (bm=0x55a4a171c5b0, val=9) at hw/ide/pci.c:306
#8  0x000055a49e1d5aad in bmdma_write (opaque=0x55a4a171c5b0, addr=0, val=9, size=1) at hw/ide/piix.c:75
#9  0x000055a49df42831 in memory_region_write_accessor (mr=0x55a4a171c700, addr=0, value=0x7f1dd8ea5a48, size=1, shift=0, mask=255, attrs=...) at /home/leon/qemu-4.2.0/memory.c:483
#10 0x000055a49df42a18 in access_with_adjusted_size (addr=0, value=0x7f1dd8ea5a48, size=1, access_size_min=1, access_size_max=4, access_fn=0x55a49df42771 <memory_region_write_accessor>, 
    mr=0x55a4a171c700, attrs=...) at /home/leon/qemu-4.2.0/memory.c:544
#11 0x000055a49df459c2 in memory_region_dispatch_write (mr=0x55a4a171c700, addr=0, data=9, op=MO_8, attrs=...) at /home/leon/qemu-4.2.0/memory.c:1475
#12 0x000055a49dee5a07 in address_space_stb (as=0x55a49eeac0e0 <address_space_io>, addr=49216, val=9, attrs=..., result=0x0) at /home/leon/qemu-4.2.0/memory_ldst.inc.c:378
#13 0x000055a49e0a7d16 in helper_outb (env=0x55a4a0bfa3e0, port=49216, data=9) at /home/leon/qemu-4.2.0/target/i386/misc_helper.c:33
#14 0x00007f1dbd998d65 in code_gen_buffer ()
#15 0x000055a49df7ad63 in cpu_tb_exec (cpu=0x55a4a0bf1b80, itb=0x7f1dbde60980 <code_gen_buffer+31852886>) at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:172
#16 0x000055a49df7bc47 in cpu_loop_exec_tb (cpu=0x55a4a0bf1b80, tb=0x7f1dbde60980 <code_gen_buffer+31852886>, last_tb=0x7f1dd8ea6078, tb_exit=0x7f1dd8ea6070)
    at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:618
#17 0x000055a49df7bf61 in cpu_exec (cpu=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:731
#18 0x000055a49df33eb8 in tcg_cpu_exec (cpu=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/cpus.c:1473
#19 0x000055a49df3470e in qemu_tcg_cpu_thread_fn (arg=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/cpus.c:1781
#20 0x000055a49e50488c in qemu_thread_start (args=0x55a4a0956070) at util/qemu-thread-posix.c:519
#21 0x00007f1df39476db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#22 0x00007f1df366988f in clone () from /lib/x86_64-linux-gnu/libc.so.6

基本就是ioport直接写的方式。通过这种直接操作硬件端口的方式,向IDE控制器的bmdma寄存器写数据(从栈上看bmdma_write位于hw/ide/piix.c,也就是说这里的设备是PIIX IDE,而不是cmd646),从而触发bmdma_write以及后面的一系列函数。具体的后面再看,等过段时间我去linux机器上再确认KVM的通知方式是否不一样,虽然感觉应该是一样的。