nul.pw

psproc源码阅读 - 2

作者：leonwxqian
时间：January 10, 2022
分类：技术分享
评论

main中接下来的函数都比较重要，所以这里就分段来介绍了。

arg_parse(argc,argv);

/* check for invalid combination of arguments */
arg_check_conflicts();

首先是arg_parse。

/*
 * Parse the command line into ps's global option state.
 *
 * A first pass runs parse_all_options(), thread_option_check(),
 * process_sf_options() and select_bits_setup() in that order; the first
 * step that returns an error string aborts the pass.  If that pass fails,
 * or if personality already has PER_FORCE_BSD set, the parser state is
 * reset and the same four steps are run again with force_bsd set.
 *
 * Returns 0 as soon as one pass succeeds.  If the second pass fails too,
 * an error is printed to stderr and do_help(NULL, EXIT_FAILURE) is called;
 * no return statement follows that call.
 */
int arg_parse(int argc, char *argv[]) {
    const char *err = NULL;   /* error from the first pass */
    const char *err2 = NULL;  /* error from the forced-BSD pass */
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;

    /* Personality demands BSD syntax: skip the first pass entirely. */
    if(personality & PER_FORCE_BSD) goto try_bsd;

    err = parse_all_options();
    if(err) goto try_bsd;
    err = thread_option_check();
    if(err) goto try_bsd;
    err = process_sf_options();
    if(err) goto try_bsd;
    err = select_bits_setup();
    if(err) goto try_bsd;

    choose_dimensions();
    return 0;

try_bsd:
    trace("--------- now try BSD ------\n");

    /* Discard whatever the first pass left behind and start over. */
    reset_global();
    reset_parser();
    reset_sortformat();
    format_flags = 0;
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;
    /* no need to reset flagptr */
    force_bsd=1;
    prefer_bsd_defaults=1;
    if(!( (PER_OLD_m|PER_BSD_m) & personality )) /* if default m setting... */
        personality |= PER_OLD_m; /* Prefer old Linux over true BSD. */
    /* Do not set PER_FORCE_BSD! It is tested below. */

    err2 = parse_all_options();
    if(err2) goto total_failure;
    err2 = thread_option_check();
    if(err2) goto total_failure;
    err2 = process_sf_options();
    if(err2) goto total_failure;
    err2 = select_bits_setup();
    if(err2) goto total_failure;

    choose_dimensions();
    return 0;

total_failure:
    reset_parser();
    /* With PER_FORCE_BSD the first pass never ran, so err is still NULL and
     * only err2 is meaningful; otherwise report the first pass's error. */
    if(personality & PER_FORCE_BSD) fprintf(stderr, _("error: %s\n"), err2);
    else fprintf(stderr, _("error: %s\n"), err);
    do_help(NULL, EXIT_FAILURE);
}

先看第一部分，全局变量personality由set_personality设置，大体就是根据不同的操作系统和架构，来设置不同的参数。

int arg_parse(int argc, char *argv[]) {
    const char *err = NULL;
    const char *err2 = NULL;
    ps_argc = argc;
    ps_argv = argv;
    thisarg = 0;

    if(personality & PER_FORCE_BSD) goto try_bsd;

对BSD而言，其personality是包含PER_FORCE_BSD位的，但是对linux则没有。因此如果有这个位，则优先尝试bsd。

case_bsd:
    personality = PER_FORCE_BSD | PER_BSD_h | PER_BSD_m;
    prefer_bsd_defaults = 1;
    bsd_j_format = "FB_j";
    bsd_l_format = "FB_l";
    /* bsd_s_format not used */
    bsd_u_format = "FB_u";
    bsd_v_format = "FB_v";
    return NULL;

否则继续执行parse_all_options()。

err = parse_all_options();

开始阅读parse_all_options函数，对每个当前的参数，调用arg_type(ps_argv[thisarg])获取其类型。

/* First assume sysv, because that is the POSIX and Unix98 standard. */
/*
 * Walk the remaining words of ps_argv, classify each one with arg_type()
 * and hand it to the matching sub-parser.
 *
 * Returns NULL when every word was accepted, otherwise the first error
 * string produced by a sub-parser or by this function itself.
 */
static const char *parse_all_options(void) {
    const char *err = NULL;
    int at;
    /* Pre-increment: arg_parse() sets thisarg to 0 before calling, so
     * ps_argv[0] is skipped.  Sub-parsers may advance thisarg further. */
    while(++thisarg < ps_argc) {
        trace("parse_all_options calling arg_type for \"%s\"\n", ps_argv[thisarg]);
        at = arg_type(ps_argv[thisarg]);
        trace("ps_argv[thisarg] is %s\n", ps_argv[thisarg]);
        switch(at) {
        case ARG_GNU:
            err = parse_gnu_option();
            break;
        case ARG_SYSV:
            /* The if-body below also holds the ARG_BSD label, so the switch
             * can jump into the middle of it.  Resulting paths:
             *   SYSV, !force_bsd -> parse_sysv_option(), break
             *   SYSV,  force_bsd -> skip the if-body, parse as BSD below
             *   BSD              -> sanity check, then parse as BSD below */
            if(!force_bsd) {  /* else go past case ARG_BSD */
                err = parse_sysv_option();
                break;

                case ARG_BSD:
                    /* force_bsd set without PER_FORCE_BSD is rejected here */
                    if(force_bsd && !(personality & PER_FORCE_BSD)) return _("way bad");
            }
            prefer_bsd_defaults = 1;
            err = parse_bsd_option();
            break;
        case ARG_PGRP:
        case ARG_SESS:
        case ARG_PID:
            prefer_bsd_defaults = 1;
            err = parse_trailing_pids();
            break;
        case ARG_END:
        case ARG_FAIL:
            trace("              FAIL/END on [%s]\n",ps_argv[thisarg]);
            return _("garbage option");
            break;  /* not reached: follows a return */
        default:
            /* arg_type() returned a value this switch does not know */
            printf("                  ?    %s\n",ps_argv[thisarg]);
            return _("something broke");
        } /* switch */
        if(err) return err;
    } /* while */
    return NULL;
}

arg_type的定义如下，即：如果是字母开头，则认为是BSD风格的参数。如果是数字开头，则认为是PID，如果是+开头的，则认为是ARG_SESS类型。如果是其他情况且非-开头，认为是非法符号。然后再看第二个字符，如果是字母，认为是SYSV参数（例如-a），如果是数字，则认为是PGRP，如果是其他字符且不是-，认为非法。如果是--开头的，再看第三个字符，是字母的话则认为是GNU参数；如果参数恰好只有“--”（第三个字符为\0），则返回ARG_END；其余情况一律认为非法。

/*
 * Classify one command-line word by looking at up to its first three
 * characters.  Only plain ASCII ranges are tested.
 *
 *   letter...      -> ARG_BSD        digit...    -> ARG_PID
 *   +...           -> ARG_SESS       -letter...  -> ARG_SYSV
 *   -digit...      -> ARG_PGRP       --letter... -> ARG_GNU
 *   exactly "--"   -> ARG_END        otherwise   -> ARG_FAIL
 */
static int arg_type(const char *str) {
    int ch = str[0];

    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
        return ARG_BSD;
    if (ch >= '0' && ch <= '9')
        return ARG_PID;
    if (ch == '+')
        return ARG_SESS;
    if (ch != '-')
        return ARG_FAIL;

    /* one leading dash so far */
    ch = str[1];
    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
        return ARG_SYSV;
    if (ch >= '0' && ch <= '9')
        return ARG_PGRP;
    if (ch != '-')
        return ARG_FAIL;

    /* two leading dashes */
    ch = str[2];
    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
        return ARG_GNU;
    return (ch == '\0') ? ARG_END : ARG_FAIL;
}

对比ps的man，即可理解：

DESCRIPTION
       ps displays information about a selection of the active processes.  If you want a
       repetitive update of the selection and the displayed information, use top(1) instead.

       This version of ps accepts several kinds of options:

       1   UNIX options, which may be grouped and must be preceded by a dash.
       2   BSD options, which may be grouped and must not be used with a dash.
       3   GNU long options, which are preceded by two dashes.

总之为了保证兼容性，ps的命令行非常混乱。

加号和数字的作用：

   --sort spec
          Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].  Choose a
          multi-letter key from the STANDARD FORMAT SPECIFIERS section.  The "+" is
          optional since default direction is increasing numerical or lexicographic
          order.  Identical to k.  For example: ps jax --sort=uid,-ppid,+pid

PROCESS SELECTION BY LIST
       These options accept a single argument in the form of a blank-separated or
       comma-separated list.  They can be used multiple times.  For example:
       ps -p "1 2" -p 3,4

       -123   Identical to --pid 123.

       123    Identical to --pid 123.

在所有类型中，ARG_END、ARG_FAIL、default会使parse_all_options直接返回错误信息，不再继续解析。

    case ARG_PGRP:
    case ARG_SESS:
    case ARG_PID:
        prefer_bsd_defaults = 1;
        err = parse_trailing_pids();
        break;

PGRP、SESS、PID会使ps进一步解析后面的pid。

/*************** process trailing PIDs  **********************/
/*
 * Treat ps_argv[thisarg] and every word after it as a trailing ID list.
 *
 * Each word is parsed with parse_pid(): a leading '-' files the number as a
 * process group, a leading '+' as a session, anything else as a plain PID.
 * Every category that received at least one ID is pushed onto the front of
 * selection_list (PIDs first, then groups, then sessions) with the matching
 * SEL_* typecode.
 *
 * Returns NULL on success or the error string from parse_pid().  thisarg is
 * moved to the last argument in either case.  Nodes that are not linked
 * into selection_list (empty categories, or all three on error) are freed
 * here, since nothing else holds a pointer to them.
 */
static const char *parse_trailing_pids(void) {
    selection_node *pidnode;  /* pid */
    selection_node *grpnode;  /* process group */
    selection_node *sidnode;  /* session */
    char **argp;     /* pointer to pointer to text of PID */
    const char *err = NULL;  /* error code that could or did happen */
    int i;

    i = ps_argc - thisarg;  /* how many trailing PIDs, SIDs, PGRPs?? */
    argp = ps_argv + thisarg;
    thisarg = ps_argc - 1;   /* we must be at the end now */

    /* Each array is sized for the worst case: every word in one category. */
    pidnode = xmalloc(sizeof(selection_node));
    pidnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    pidnode->n = 0;

    grpnode = xmalloc(sizeof(selection_node));
    grpnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    grpnode->n = 0;

    sidnode = xmalloc(sizeof(selection_node));
    sidnode->u = xmalloc(i*sizeof(sel_union)); /* waste is insignificant */
    sidnode->n = 0;

    while(i--) {
        char *data;
        data = *(argp++);
        switch(*data) {
        default:
            err = parse_pid(  data, pidnode->u + pidnode->n++);
            break;
        case '-':
            err = parse_pid(++data, grpnode->u + grpnode->n++);
            break;
        case '+':
            err = parse_pid(++data, sidnode->u + sidnode->n++);
            break;
        }
        if(err) break;   /* fall through to the cleanup below */
    }

    if(!err && pidnode->n) {
        pidnode->next = selection_list;
        selection_list = pidnode;
        selection_list->typecode = SEL_PID;
    } else {
        free(pidnode->u);
        free(pidnode);
    }

    if(!err && grpnode->n) {
        grpnode->next = selection_list;
        selection_list = grpnode;
        selection_list->typecode = SEL_PGRP;
    } else {
        free(grpnode->u);
        free(grpnode);
    }

    if(!err && sidnode->n) {
        sidnode->next = selection_list;
        selection_list = sidnode;
        selection_list->typecode = SEL_SESS;
    } else {
        free(sidnode->u);
        free(sidnode);
    }

    return err;
}

解析时要求它是一个1~0x7fffffff的正整数，并按+、-、默认的情况分别放置在sidnode/grpnode/pidnode中。

/*
 * Convert one ID string and store it in ret->pid.
 *
 * The text is read by strtoul() with base 0, so the usual C prefixes
 * select octal or hexadecimal.  The whole string must be consumed and the
 * value must lie in 1..0x7fffffff.
 *
 * Returns NULL on success, otherwise a translated error string; ret is
 * left untouched on error.
 */
static const char *parse_pid(char *str, sel_union *ret) {
    char *tail;
    unsigned long value = strtoul(str, &tail, 0);

    if (*tail != '\0')
        return _("process ID list syntax error");
    if (value < 1 || value > 0x7fffffffUL)
        return _("process ID out of range");

    ret->pid = value;
    return NULL;
}

对ARG_GNU而言，处理函数是parse_gnu_option

parser.c:parse_all_options

    case ARG_GNU:
        err = parse_gnu_option();
        break;

parse_gnu_option的开头列出了一组支持的参数。

static const gnu_table_struct gnu_table[] = {
    {"Group",         &&case_Group},       /* rgid */
    {"User",          &&case_User},        /* ruid */
    {"cols",          &&case_cols},
    {"columns",       &&case_columns},
    {"context",       &&case_context},
    {"cumulative",    &&case_cumulative},
    {"deselect",      &&case_deselect},    /* -N */
    {"forest",        &&case_forest},      /* f -H */

这里的case_Group之类的不是什么全局变量，而是本地标签，第一次看到能这么用，很神奇……

        {"version",       &&case_version},
        {"width",         &&case_width},
    };
    const int gnu_table_count = sizeof(gnu_table)/sizeof(gnu_table_struct);

    s = ps_argv[thisarg]+2;
    sl = strcspn(s,":=");
    if(sl > 15) return _("unknown gnu long option");
    strncpy(buf, s, sl);
    buf[sl] = '\0';
    flagptr = s+sl;

    found = bsearch(&findme, gnu_table, gnu_table_count,
                    sizeof(gnu_table_struct), compare_gnu_table_structs
                   );

    if(!found) {
        if (!strcmp(buf, the_word_help))
            goto case_help;
        return _("unknown gnu long option");
    }

    goto *(found->jump);    /* See gcc extension info.  :-)   */

case_Group:
    trace("--Group\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real groups must follow --Group");
    err=parse_list(arg, parse_gid);
    if(err) return err;
    selection_list->typecode = SEL_RGID;
    return NULL;
case_User:
    trace("--User\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real users must follow --User");
    err=parse_list(arg, parse_uid);
    if(err) return err;
    selection_list->typecode = SEL_RUID;
    return NULL;

先逐行读一下代码。s是argv[i] + 2，这是因为ARG_GNU是“--”开头的，跳过前两个字符。sl是:=前的字符数。然后将:=前的内容拷贝到buf中。buf定义为buf[16]所以限制长度不能大于15。在这之后，flagptr就是:=开始的字符。

然后，通过bsearch库函数在gnu_table中搜索findme={buf, NULL}。如果找到就直接跳到对应标签上，这语法也是很离谱。

先看看man手册中对这些参数的定义：

   --cols n
          Set screen width.

   --columns n
          Set screen width.

   --cumulative
          Include some dead child process data (as a sum with the parent).

有但不是全有，比如--Group就不在主词条里面（但在其他词条的描述里有提到）。挑几个比较有特点的读一下好了。

首先是它们的一个通用工具函数grab_gnu_arg，它在所有需要额外参数的，比如--cols n中被使用。

/*
 * Fetch the argument of the GNU long option currently being parsed.
 *
 * flagptr points just past the option name inside ps_argv[thisarg]:
 *   - at '=' or ':'  -> the argument is the rest of the same word
 *                       (NULL if nothing follows the separator);
 *   - at '\0'        -> the argument is the next word, which is consumed
 *                       by advancing thisarg (NULL if there is no next
 *                       word or it is empty);
 *   - anything else  -> NULL.
 */
static const char *grab_gnu_arg(void) {
    const char sep = *flagptr;

    if (sep == '=' || sep == ':') {
        /* argument is part of ps_argv[thisarg] */
        ++flagptr;
        return *flagptr ? flagptr : NULL;
    }
    if (sep != '\0')
        return NULL;                       /* something bad */

    /* argument, if any, follows ps_argv[thisarg] */
    if (thisarg + 2 > ps_argc)
        return NULL;                       /* there is nothing left */
    if (ps_argv[thisarg + 1][0] == '\0')
        return NULL;
    thisarg++;
    return ps_argv[thisarg];
}
//
//<---->
//
case_cols:
case_width:
case_columns:
    trace("--cols\n");
    arg = grab_gnu_arg();
    if(arg && *arg) {
        long t;
        char *endptr;
        t = strtol(arg, &endptr, 0);
        if(!*endptr && (t>0) && (t<2000000000)) {
            screen_cols = (int)t;
            return NULL;
        }
    }
    return _("number of columns must follow --cols, --width, or --columns");

如果指定的是例如--cols=2、--cols:2，则返回=或:之后的部分（若其后为空则返回NULL）。如果当前参数已经到头（\0），则看看下一个参数是否存在且非空（首字符不是"\0"），如果是，则将thisarg加1并返回这个参数；否则返回NULL。

以--cols为例，这里设置screen_cols为一个大于0且小于2000000000的整数（用0x7fffffff不好吗……）。

再挑一个典型。

case_Group:
    trace("--Group\n");
    arg = grab_gnu_arg();
    if(!arg) return _("list of real groups must follow --Group");
    err=parse_list(arg, parse_gid);
    if(err) return err;
    selection_list->typecode = SEL_RGID;
    return NULL;

--Group这样后面跟一个list的，还需要parse_list来处理列表。

/*
 * Used to parse lists in a generic way. (function pointers)
 *
 * arg is a list of items separated by exactly one space, tab or comma, with
 * no leading, trailing or doubled separators.  Each item is passed to
 * parse_fn, which fills one sel_union.  On success a new selection_node
 * holding the items is pushed onto the front of selection_list; the caller
 * is expected to set its typecode.  Items are stored in reverse order of
 * appearance (the first item lands at the highest index).
 *
 * Returns NULL on success, _("improper list") for a malformed list, or the
 * error string returned by parse_fn.  On error nothing is linked into
 * selection_list and all temporary storage is released.
 */
static const char *parse_list(const char *arg, const char *(*parse_fn)(char *, sel_union *) ) {
    selection_node *node;
    char *buf;                      /* temp copy of arg to hack on */
    char *sep_loc;                  /* separator location: " \t," */
    char *walk;
    int items;
    int need_item;
    const char *err;       /* error code that could or did happen */
    /*** prepare to operate ***/
    node = xmalloc(sizeof(selection_node));
    node->u = xmalloc(strlen(arg)*sizeof(sel_union)); /* waste is insignificant */
    node->n = 0;
    /* Copy through xmalloc rather than strdup so that the copy follows the
     * same out-of-memory policy as the two allocations above instead of
     * handing a NULL buffer to the scanner below. */
    buf = xmalloc(strlen(arg) + 1);
    strcpy(buf, arg);
    /*** sanity check and count items ***/
    need_item = 1; /* true */
    items = 0;
    walk = buf;
    err = _("improper list");
    do {
        switch(*walk) {
        case ' ':
        case ',':
        case '\t':
        case '\0':   /* only reachable when arg is the empty string */
            if(need_item) goto parse_error;
            need_item=1;
            break;
        default:
            if(need_item) items++;
            need_item=0;
        }
    } while (*++walk);
    if(need_item) goto parse_error;   /* list ended on a separator */
    node->n = items;
    /*** actually parse the list ***/
    walk = buf;
    while(items--) {
        sep_loc = strpbrk(walk," ,\t");
        if(sep_loc) *sep_loc = '\0';
        if(( err=(parse_fn)(walk, node->u+items) )) goto parse_error;
        /* sep_loc is NULL only for the last item; do not do pointer
         * arithmetic on NULL in that case. */
        if(sep_loc) walk = sep_loc + 1; /* point to next item, if any */
    }
    free(buf);
    node->next = selection_list;
    selection_list = node;
    return NULL;
parse_error:
    free(buf);
    free(node->u);
    free(node);
    return err;
}

逐字查找，如果是逗号或者空白，则标记need_item，它们后面必须跟其他字符，统计一共有多少段。
然后设置node->n为计算到的总数。从头开始扫描空白或逗号，并对扫出来的部分调用parse_fn。parse_fn是传入的参数之一，看一下典型的parse_fn，这个是"C"参数传入的parse_fn，它将向ret->cmd（即(node->u + items)->cmd）内拷贝长度至多为sizeof ret->cmd的字符串。

/*
 * Store a command name into ret->cmd.
 * The copy is truncated to fit and the final byte of the array is always
 * forced to '\0'.  Always returns NULL (no error).
 */
static const char *parse_cmd(char *str, sel_union *ret) {
    const size_t last = sizeof(ret->cmd) - 1;

    strncpy(ret->cmd, str, sizeof ret->cmd);  /* zero-pads short input */
    ret->cmd[last] = '\0';                    /* terminate long input */
    return NULL;
}

关于sel_union->cmd，在common.h中有定义。sizeof ret->cmd必然也就是64了。在parse_list的开头注意有：“node->u = xmalloc(strlen(arg)*sizeof(sel_union));”，strlen(arg)就是列表字符串的长度，因此这里会试图分配“列表字符串长度 * sizeof(sel_union)”（sel_union至少64字节）大小的数组给node->u。

/*
 * One value in a selection list.  Only one member is meaningful for a
 * given entry; presumably the owning selection_node's typecode says which
 * (e.g. parse_pid() writes .pid for SEL_PID lists) -- confirm against the
 * code that reads these lists.
 */
typedef union sel_union {
    pid_t pid;
    pid_t ppid;
    uid_t uid;
    gid_t gid;
    dev_t tty;
    char  cmd[64];  /* this is _not_ \0 terminated */
} sel_union;

/*
 * One entry of the singly linked selection_list.  Each node owns a
 * heap-allocated array of values (u) that is freed together with the node.
 */
typedef struct selection_node {
    struct selection_node *next;  /* next node in the list, or NULL */
    sel_union *u;  /* used if selection type has a list of values */
    int n;         /* used if selection type has a list of values */
    int typecode;  /* a SEL_* constant set by whoever links the node */
} selection_node;

这篇已经足够长了，而且已经看完了parse_gnu_option部分。下一篇看看剩余的两个 sysv_option 和 bsd_option。

    case ARG_SYSV:
        if(!force_bsd) {  /* else go past case ARG_BSD */
            err = parse_sysv_option();
            break;

            case ARG_BSD:
                if(force_bsd && !(personality & PER_FORCE_BSD)) return _("way bad");
        }
        prefer_bsd_defaults = 1;
        err = parse_bsd_option();
        break;

psproc源码阅读 - 1

作者：leonwxqian
时间：January 10, 2022
分类：技术分享
评论

好久没有写文章了（2021年一年都没写……），随便开点新坑，从简单的代码来读起。
从psproc工程下的ps代码开始。

display.c:

    /***** no comment */
int main(int argc, char *argv[]) {  
    atexit(close_stdout);
    myname = strrchr(*argv, '/');
    if (myname) ++myname;
    else myname = *argv;
    Hertz = procps_hertz_get();

    setlocale (LC_ALL, "");
    bindtextdomain(PACKAGE, LOCALEDIR);
    textdomain(PACKAGE);
    setenv("TZ", ":/etc/localtime", 0);

先从main看起，首先，atexit函数设置close_stdout为其退出时的处理函数，这个库函数确实少见。

ATEXIT(3)                         Linux Programmer's Manual                        ATEXIT(3)
NAME
       atexit - register a function to be called at normal process termination
SYNOPSIS
       #include <stdlib.h>
       int atexit(void (*function)(void));

然后搜索argv[0]，并找到"/"之后的内容作为自己的文件名，如果没有就直接用argv[0]。
procps_hertz_get用于获取系统时钟每秒的节拍数（clock ticks per second，即sysconf(_SC_CLK_TCK)，并不是CPU的主频），如果获取失败返回100。
然后设置区域信息，并设置环境变量TZ为/etc/localtime。

然后是一段信号处理的代码。将一些黑名单信号以外的信号传递给signal_handler。

/*
 * Signal setup.  In DEBUG builds init_stack_trace() is called instead.
 * Otherwise signal_handler is installed, with every signal blocked while
 * it runs (sigfillset), for each signal number from 31 down to 1 that is
 * not named in the case list below.
 */
#ifdef DEBUG
    init_stack_trace(argv[0]);
#else
    do {
        struct sigaction sa;
        int i = 32;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = signal_handler;
        sigfillset(&sa.sa_mask);
        /* The switch lists default first: unlisted numbers install the
         * handler and then fall through to the empty statement, while the
         * listed cases jump straight to it and do nothing. */
        while(i--) switch(i) {
            default:
                sigaction(i,&sa,NULL);
                /* fallthrough */
            case 0:
            case SIGCONT:
            case SIGINT:   /* ^C */
            case SIGTSTP:  /* ^Z */
            case SIGTTOU:  /* see stty(1) man page */
            case SIGQUIT:  /* ^\ */
            case SIGPROF:  /* profiling */
            case SIGKILL:  /* can not catch */
            case SIGSTOP:  /* can not catch */
            case SIGWINCH: /* don't care if window size changes */
            case SIGURG:   /* Urgent condition on socket (4.2BSD) */
                ;
            }
    } while (0);
#endif

接下来是几个相对比较重要的处理代码。

reset_global();  /* must be before parser */
arg_parse(argc,argv);

/* check for invalid combination of arguments */
arg_check_conflicts();

/*  arg_show(); */
trace("screen is %ux%u\n",screen_cols,screen_rows);
/*  printf("sizeof(proc_t) is %d.\n", sizeof(proc_t)); */
trace("======= ps output follows =======\n");

首先是reset_global()。global.c:reset_global用于初始化所有的全局变量。我们依次阅读它的代码。

global.c

/************ Call this to reinitialize everything ***************/
void reset_global(void) {
    proc_t *p;
    int i;

    reset_selection_list();

>>

/*
 * Empty selection_list.
 * If the list still holds the 0xdeadbeef sentinel it is only reset to
 * NULL; otherwise every node and its value array are freed first.
 */
static void reset_selection_list(void) {
    selection_node *cur;
    selection_node *following;

    if (selection_list != (selection_node *)0xdeadbeef) {
        for (cur = selection_list; cur; cur = following) {
            following = cur->next;   /* read before cur is freed */
            free(cur->u);
            free(cur);
        }
    }
    selection_list = NULL;
}

它首先调用reset_selection_list，如果selection_list无效（0xdeadbeef）则置空，如果有内容，则挨个释放并将其置空。这个值并不是编译器或者内存管理器置的，而是它自己初始化的时候设置的：

selection_node *selection_list = (selection_node *)0xdeadbeef;

回到reset_global中，

/************ Call this to reinitialize everything ***************/
void reset_global(void) {
    proc_t *p;
    int i;

    reset_selection_list();

// --- <pids> interface --------------------------------------------------
    if (!Pids_items)
        Pids_items = xcalloc(PIDSITEMS, sizeof(enum pids_item));

    for (i = 0; i < PIDSITEMS; i++)
        Pids_items[i] = PIDS_noop;

    if (!Pids_info) {
        if (procps_pids_new(&Pids_info, Pids_items, i)) {
            fprintf(stderr, _("fatal library error, context\n"));
            exit(EXIT_FAILURE);
        }
    }

    Pids_items[0] = PIDS_TTY;
    procps_pids_reset(Pids_info, Pids_items, 1);
    if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
        fprintf(stderr, _("fatal library error, lookup self\n"));
        exit(EXIT_FAILURE);
    }

接下来是另一个重要的结构，Pids_items，它是一个全局变量，类型为“enum pids_item*”。 xcalloc是psproc自己的wrap，就是calloc加了个检查，可以认为二者相同。PIDSITEMS为70，注释里写道70是拍脑袋的数字。
因此首先，它为Pids_items分配70个pids_item，然后将其初始化为“PIDS_noop”。PIDS_noop是enum pids_item的第一项。

接下来是对Pids_info的初始化。pids.c:procps_pids_new用来初始化Pids_info结构体。这也是一个大型函数，我们把它抽出来看：

pids.c

/*
 * Allocate and initialize a new pids_info context and store it in *info.
 *
 * info      must be non-NULL and must point to a NULL pointer.
 * items     optional array of numitems item enumerators; when both are
 *           given they are validated and copied, followed by a
 *           PIDS_logical_end terminator.
 *
 * Returns 0 on success, -EINVAL for a bad info pointer or an item list that
 * fails pids_items_check_failed(), and -ENOMEM when an allocation fails.
 * Everything allocated so far is freed on the error paths shown here.
 */
PROCPS_EXPORT int procps_pids_new (
    struct pids_info **info,
    enum pids_item *items,
    int numitems)
{
    struct pids_info *p;
    double uptime_secs;
    int pgsz;

#ifdef ITEMTABLE_DEBUG
    ... (Removed) ...
#endif

    if (info == NULL || *info != NULL)
        return -EINVAL;
    /* calloc: every field of *p starts out zero / NULL */
    if (!(p = calloc(1, sizeof(struct pids_info))))
        return -ENOMEM;

    /* if we're without items or numitems, a later call to
       procps_pids_reset() will become mandatory */
    if (items && numitems) {
        if (pids_items_check_failed(items, numitems)) {
            free(p);
            return -EINVAL;
        }
        // allow for our PIDS_logical_end
        p->maxitems = numitems + 1;
        if (!(p->items = calloc(p->maxitems, sizeof(enum pids_item)))) {
            free(p);
            return -ENOMEM;
        }
        memcpy(p->items, items, sizeof(enum pids_item) * numitems);
        p->items[numitems] = PIDS_logical_end;
        p->curitems = p->maxitems;
        pids_libflags_set(p);
    }

    /* The || chain stops at the first failed calloc, so on failure any of
     * hist, PHist_new and PHist_sav may still be NULL. */
    if (!(p->hist = calloc(1, sizeof(struct history_info)))
            || (!(p->hist->PHist_new = calloc(NEWOLD_INIT, sizeof(HST_t))))
            || (!(p->hist->PHist_sav = calloc(NEWOLD_INIT, sizeof(HST_t))))) {
        free(p->items);
        if (p->hist) {
            free(p->hist->PHist_sav);  // this & next might be NULL ...
            free(p->hist->PHist_new);
            free(p->hist);
        }
        free(p);
        return -ENOMEM;
    }
    p->hist->HHist_siz = NEWOLD_INIT;
    pids_config_history(p);

    /* Count how many times the page size halves before reaching 1024,
     * i.e. log2(page size / 1024) for power-of-two page sizes. */
    pgsz = getpagesize();
    while (pgsz > 1024) {
        pgsz >>= 1;
        p->pgs2k_shift++;
    }
    p->hertz = procps_hertz_get();

    // in case 'fatal_proc_unmounted' wasn't called and /proc isn't mounted
    /* NOTE(review): boot_seconds is set whenever procps_uptime() returns
     * <= 0 -- confirm that function's return convention. */
    if (0 >= procps_uptime(&uptime_secs, NULL))
        p->boot_seconds = uptime_secs;

    numa_init();

    p->fetch.results.counts = &p->fetch.counts;

    p->refcount = 1;
    *info = p;
    return 0;
} // end: procps_pids_new

首先是为struct pids_info *p;赋予初始值的p = calloc(1, sizeof(struct pids_info))，然后对传入的items、numitems进行处理。

        if (pids_items_check_failed(items, numitems)) {
            free(p);
            return -EINVAL;
        }

调用的pids_items_check_failed用于检查传入的item是否合法。传入的如果不是enum pids_item*指针（而是enum的值），则在这里返回错误（<0x8000的值认为非法）。合法的话，检查是不是每项都在enum范围内。

/*
 * Validate a caller-supplied item list.
 * Returns 1 (failed) when numitems is below 1, when the items pointer is
 * numerically below 0x8000, or when any entry is negative or not below
 * PIDS_logical_end; returns 0 when the list is acceptable.
 */
static inline int pids_items_check_failed (
    enum pids_item *items,
    int numitems)
{
    int idx;

    if (numitems < 1)
        return 1;

    /* if an enum is passed instead of an address of one or more enums, ol' gcc
     * will silently convert it to an address (possibly NULL).  only clang will
     * offer any sort of warning like the following:
     *
     * warning: incompatible integer to pointer conversion passing 'int' to parameter of type 'enum pids_item *'
     * if (procps_pids_new(&info, PIDS_noop, 3) < 0)
     *                            ^~~~~~~~~~~~~~~~
     */
    if ((void *)items < (void *)0x8000)      // twice as big as our largest enum
        return 1;

    for (idx = 0; idx < numitems; idx++) {
        // a pids_item is currently unsigned, but we'll protect our future
        if (items[idx] < 0 || items[idx] >= PIDS_logical_end)
            return 1;
    }
    return 0;
} // end: pids_items_check_failed

检查通过以后，分配对应的项目并将值复制到p中。

        // allow for our PIDS_logical_end
        p->maxitems = numitems + 1;
        if (!(p->items = calloc(p->maxitems, sizeof(enum pids_item)))) {
            free(p);
            return -ENOMEM;
        }
        memcpy(p->items, items, sizeof(enum pids_item) * numitems);
        p->items[numitems] = PIDS_logical_end;
        p->curitems = p->maxitems;
        pids_libflags_set(p);

然后是另一部分的初始化。如果hist的任何一部分初始化失败了，则释放里面已申请的内容。pids_config_history用于初始化HHash_one和HHash_two（初始化为HHash_nul），并修改PHash_save为HHash_one，PHash_new为HHash_two。

if (!(p->hist = calloc(1, sizeof(struct history_info)))
        || (!(p->hist->PHist_new = calloc(NEWOLD_INIT, sizeof(HST_t))))
        || (!(p->hist->PHist_sav = calloc(NEWOLD_INIT, sizeof(HST_t))))) {
    free(p->items);
    if (p->hist) {
        free(p->hist->PHist_sav);  // this & next might be NULL ...
        free(p->hist->PHist_new);
        free(p->hist);
    }
    free(p);
    return -ENOMEM;
}
p->hist->HHist_siz = NEWOLD_INIT;
pids_config_history(p);

pgsz = getpagesize();
while (pgsz > 1024) {
    pgsz >>= 1;
    p->pgs2k_shift++;
}
p->hertz = procps_hertz_get();

最后是一些收尾的。procps_uptime用于读取/proc/uptime来获取系统的uptime和idle time。numa_init用于初始化numa（Non Uniform Memory Access, libnuma.so/libnuma.so.1）。

    // in case 'fatal_proc_unmounted' wasn't called and /proc isn't mounted
    if (0 >= procps_uptime(&uptime_secs, NULL))
        p->boot_seconds = uptime_secs;

    numa_init();

    p->fetch.results.counts = &p->fetch.counts;

    p->refcount = 1;
    *info = p;
    return 0;
} // end: procps_pids_new

这个大函数终于结束了。回到我们最开始的reset_global的后半部分中：

Pids_items[0] = PIDS_TTY;
procps_pids_reset(Pids_info, Pids_items, 1);
if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
    fprintf(stderr, _("fatal library error, lookup self\n"));
    exit(EXIT_FAILURE);
}

不过我们只能在这里短暂停留，因为procps_pids_reset也是一个大函数。

pids.c:

PROCPS_EXPORT int procps_pids_reset (
    struct pids_info *info,
    enum pids_item *newitems,
    int newnumitems)
{
    if (info == NULL || newitems == NULL)
        return -EINVAL;
    if (pids_items_check_failed(newitems, newnumitems))
        return -EINVAL;

    pids_cleanup_stacks_all(info);

pids_cleanup_stacks_all函数的定义如下，它对info->extents这个ext链表中的每个项目都调用pids_cleanup_stack。目标是链表中每个ext的stacks[i]->head。 stacks顾名思义是一个栈结构。pids_cleanup_stack对每个项目查找Item_table中对应的freefunc，并使用freefunc来释放它们。

/*
 * Run pids_cleanup_stack() on the head of every stack in every extent
 * linked from info->extents.  Each extent's stacks array is walked until
 * its first NULL entry.
 */
static inline void pids_cleanup_stacks_all (
    struct pids_info *info)
{
    struct stacks_extent *extent;
    int n;

    for (extent = info->extents; extent; extent = extent->next) {
        n = 0;
        while (extent->stacks[n]) {
            pids_cleanup_stack(extent->stacks[n]->head);
            n++;
        }
    }
} // end: pids_cleanup_stacks_all

>>>

/*
 * Reset every result in one stack, stopping at the first entry whose item
 * is PIDS_logical_end or beyond.  For each entry the item's freefunc from
 * Item_table is called when one is set, then the result is zeroed.
 */
static inline void pids_cleanup_stack (
    struct pids_result *this)
{
    for (; this->item < PIDS_logical_end; ++this) {
        if (Item_table[this->item].freefunc)
            Item_table[this->item].freefunc(this);
        this->result.ull_int = 0;
    }
} // end: pids_cleanup_stack

Item_table的内容类似：

static struct {
    SET_t    setsfunc;            // the actual result setting routine
#ifdef ITEMTABLE_DEBUG
    int      enumnumb;            // enumerator (must match position!)
    char    *enum2str;            // enumerator name as a char* string
#endif
    unsigned oldflags;            // PROC_FILLxxxx flags for this item
    FRE_t    freefunc;            // free function for strings storage
    QSR_t    sortfunc;            // sort cmp func for a specific type
    int      needhist;            // a result requires history support
    char    *type2str;            // the result type as a string value
} Item_table[] = {
    /*    setsfunc               oldflags    freefunc   sortfunc       needhist  type2str
          ---------------------  ----------  ---------  -------------  --------  ----------- */
    { RS(noop),              0,          NULL,      QS(noop),      0,        TS_noop     }, // user only, never altered
    { RS(extra),             0,          NULL,      QS(ull_int),   0,        TS_noop     }, // user only, reset to zero

    { RS(ADDR_CODE_END),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CODE_START),   f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CURR_EIP),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_CURR_ESP),     f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },
    { RS(ADDR_STACK_START),  f_stat,     NULL,      QS(ul_int),    0,        TS(ul_int)  },

这些freefunc实际上也就是对不同类型的东西调用其free。其实这里就是用c实现了一套接口，谁叫这不是用c++写的呢。

/* Free the string held in R->result.str; free() ignores a NULL pointer. */
static void freNAME(str) (struct pids_result *R) {
    free(R->result.str);
}

/*
 * Free the storage pointed to by the first element of R->result.strv.
 * Nothing is done when the vector pointer itself is NULL; a NULL first
 * element is handed to free(), which ignores it.
 */
static void freNAME(strv) (struct pids_result *R) {
    if (R->result.strv)
        free(*R->result.strv);
}

再回到procps_pids_reset中：

    /* shame on this caller, they didn't change anything. and unless they have
       altered the depth of the stacks we're not gonna change anything either! */
    if (info->curitems == newnumitems + 1
            && !memcmp(info->items, newitems, sizeof(enum pids_item) * newnumitems))
        return 0;

    if (info->maxitems < newnumitems + 1) {
        while (info->extents) {
            struct stacks_extent *p = info->extents;
            info->extents = p->next;
            free(p);
        };
        if (info->get_ext) {
            pids_oldproc_close(&info->get_PT);
            info->get_ext = NULL;
        }
        if (info->fetch.anchor) {
            free(info->fetch.anchor);
            info->fetch.anchor = NULL;
        }
        // allow for our PIDS_logical_end
        info->maxitems = newnumitems + 1;
        if (!(info->items = realloc(info->items, sizeof(enum pids_item) * info->maxitems)))
            return -ENOMEM;
    }

    memcpy(info->items, newitems, sizeof(enum pids_item) * newnumitems);
    info->items[newnumitems] = PIDS_logical_end;
    // account for above PIDS_logical_end
    info->curitems = newnumitems + 1;

    // if extents were freed above, this next guy will have no effect
    // so we'll rely on pids_stacks_alloc() to itemize ...
    pids_itemize_stacks_all(info);
    pids_libflags_set(info);

    return 0;
} // end: procps_pids_reset

剩余的代码相对就没那么复杂了。作者抽风写的注释也能解释很多，首先是第一个if判断，当调用时items的数量没有变，且内容也没有变的时候就什么都不做。如果当前的容量已经不够了，把栈区多余的内容释放，停止扫描进程表，释放fetch.anchor，并扩展maxitems，拷贝newitems到原始的内容中。最后，调用pids_itemize_stacks_all。老实说这个函数在干什么我暂时也不太清楚，先留着坑后面再看看（可能是给top用的，不是给ps用的）。最后，设置flags并完成函数功能。

回到reset_global，继续看下一个大函数fatal_proc_unmounted。

if (!(p = fatal_proc_unmounted(Pids_info, 1))) {
    fprintf(stderr, _("fatal library error, lookup self\n"));
    exit(EXIT_FAILURE);
}

fatal_proc_unmounted为每个pids结构分配一个栈结构，并初始化相关结构体。不细看了，后面碰到有用相关结构的时候再回头看看。在pids接口的相关内容处理完成后，reset_global接下来的内容比较轻松：

    set_screen_size();
    set_personality();

    all_processes         = 0;
    bsd_c_option          = 0;
    bsd_e_option          = 0;
    cached_euid           = geteuid();
    cached_tty            = PIDS_VAL(0, s_int, p, Pids_info);
    /* forest_prefix must be all zero because of POSIX */
    forest_type           = 0;
    format_flags          = 0;   /* -l -f l u s -j... */
    format_list           = NULL; /* digested formatting options */
    format_modifiers      = 0;   /* -c -j -y -P -L... */
    header_gap            = -1;  /* send lines_to_next_header to -infinity */
    header_type           = HEAD_SINGLE;
    include_dead_children = 0;
    lines_to_next_header  = 1;
    negate_selection      = 0;
    page_size             = getpagesize();
    running_only          = 0;
    selection_list        = NULL;
    simple_select         = 0;
    sort_list             = NULL;
    thread_flags          = 0;
    unix_f_option         = 0;
    user_is_number        = 0;
    wchan_is_number       = 0;
    /* Translation Note:
       . The following translatable word will be used to recognize the
       . user's request for help text.  In other words, the translation
       . you provide will alter program behavior.
       .
       . It must be limited to 15 characters or less.
       */
    the_word_help         = _("help");
}

基本就是把全局变量都给初始化了。psproc的这些全局变量命名长得和局部变量一样挺讨厌的，好在软件标注看起来还不那么难受。

还记得我们是从main过来的么……难受的reset_global看完了以后，回到main中继续阅读剩余的代码。

跟踪qemu-kvm下的磁盘写入

作者：leonwxqian
时间：February 6, 2020
分类：技术分享
评论

傻了，上一个调试的时候没加-enable-kvm，而且电脑的虚拟化也是关着的。假装无事发生过，一切调整就绪后，重新在KVM模式下调试。终于在另一台linux老爷机上装好了qemu和各种软件，继续从这里来，qcow2_pre_write_overlap_check下个断点，这里的栈和TCG模式一样，继续操作，b blk_aio_prwv。

(gdb) bt
#0  qcow2_pre_write_overlap_check (bs=0x558eef1841a0, ign=0, offset=1670656, 
    size=4096, data_file=true) at block/qcow2-refcount.c:2817
#1  0x0000558eedcb12e6 in qcow2_co_pwritev_part (bs=0x558eef1841a0, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/qcow2.c:2513
#2  0x0000558eedcfe0de in bdrv_driver_pwritev (bs=0x558eef1841a0, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/io.c:1171
#3  0x0000558eedd000a5 in bdrv_aligned_pwritev (child=0x558eef191900, 
    req=0x7fa0b8acae10, offset=1879080448, bytes=4096, align=1, 
    qiov=0x7fa0e4236760, qiov_offset=0, flags=0) at block/io.c:1980
#4  0x0000558eedd0087f in bdrv_co_pwritev_part (child=0x558eef191900, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/io.c:2137
#5  0x0000558eedce6f6d in blk_co_pwritev_part (blk=0x558eef183e40, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, qiov_offset=0, flags=0)
    at block/block-backend.c:1211
#6  0x0000558eedce6fbf in blk_co_pwritev (blk=0x558eef183e40, 
    offset=1879080448, bytes=4096, qiov=0x7fa0e4236760, flags=0)
    at block/block-backend.c:1221
#7  0x0000558eedce7795 in blk_aio_write_entry (opaque=0x7fa0e4238780)
    at block/block-backend.c:1415
#8  0x0000558eedddcc2f in coroutine_trampoline (i0=-467430144, i1=32672)
    at util/coroutine-ucontext.c:115
#9  0x00007fa0f56c8000 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007fa0e9cbad90 in ?? ()
#11 0x0000000000000000 in ?? ()

断下来以后，可以看到，除了上层的消息循环变成了kvm的，后面都是一样，通过直接向ioport写数据，然后转移到对应的后端处理函数中。差不多就调试完了，后面开设了一个网站督促自己读代码，qemu.world，等我想起来就更新。

(gdb) bt
#0  blk_aio_prwv (blk=0x558eef183e40, offset=0, bytes=0, iobuf=0x0, 
    co_entry=0x558eedce7a28 <blk_aio_flush_entry>, flags=0, 
    cb=0x558eedaad47c <ide_flush_cb>, opaque=0x558eefc24730)
    at block/block-backend.c:1360
#1  0x0000558eedce7ab1 in blk_aio_flush (blk=0x558eef183e40, 
    cb=0x558eedaad47c <ide_flush_cb>, opaque=0x558eefc24730)
    at block/block-backend.c:1503
#2  0x0000558eedaad5da in ide_flush_cache (s=0x558eefc24730)
    at hw/ide/core.c:1088
#3  0x0000558eedaae5b3 in cmd_flush_cache (s=0x558eefc24730, cmd=231 '\347')
    at hw/ide/core.c:1554
#4  0x0000558eedaaf8c5 in ide_exec_cmd (bus=0x558eefc246b0, val=231)
    at hw/ide/core.c:2085
#5  0x0000558eedaaddef in ide_ioport_write (opaque=0x558eefc246b0, addr=503, 
    val=231) at hw/ide/core.c:1294
#6  0x0000558eed85cd3f in portio_write (opaque=0x558eefcbff30, addr=7, 
    data=231, size=1) at /home/leon/qemu-4.2.0/ioport.c:201
#7  0x0000558eed861fbc in memory_region_write_accessor (mr=0x558eefcbff30, 
    addr=7, value=0x7fa0e9cbb818, size=1, shift=0, mask=255, attrs=...)
    at /home/leon/qemu-4.2.0/memory.c:483
#8  0x0000558eed8621a6 in access_with_adjusted_size (addr=7, 
    value=0x7fa0e9cbb818, size=1, access_size_min=1, access_size_max=4, 
    access_fn=0x558eed861efc <memory_region_write_accessor>, 
    mr=0x558eefcbff30, attrs=...) at /home/leon/qemu-4.2.0/memory.c:544
#9  0x0000558eed8650d7 in memory_region_dispatch_write (mr=0x558eefcbff30, addr=7, data=231, op=MO_8, attrs=...) at /home/leon/qemu-4.2.0/memory.c:1475
#10 0x0000558eed803386 in flatview_write_continue (fv=0x7fa0e410c970, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1, addr1=7, l=1, mr=0x558eefcbff30) at /home/leon/qemu-4.2.0/exec.c:3129
#11 0x0000558eed8034cb in flatview_write (fv=0x7fa0e410c970, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1) at /home/leon/qemu-4.2.0/exec.c:3169
#12 0x0000558eed803818 in address_space_write (as=0x558eee7a4b60 <address_space_io>, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1) at /home/leon/qemu-4.2.0/exec.c:3259
#13 0x0000558eed803885 in address_space_rw (as=0x558eee7a4b60 <address_space_io>, addr=503, attrs=..., buf=0x7fa0f86ac000 "\347\200\354\036", len=1, is_write=true) at /home/leon/qemu-4.2.0/exec.c:3269
#14 0x0000558eed87cf9f in kvm_handle_io (port=503, attrs=..., data=0x7fa0f86ac000, direction=1, size=1, count=1) at /home/leon/qemu-4.2.0/accel/kvm/kvm-all.c:2104
#15 0x0000558eed87d737 in kvm_cpu_exec (cpu=0x558eef1b29b0) at /home/leon/qemu-4.2.0/accel/kvm/kvm-all.c:2350
#16 0x0000558eed853017 in qemu_kvm_cpu_thread_fn (arg=0x558eef1b29b0) at /home/leon/qemu-4.2.0/cpus.c:1318
#17 0x0000558eeddc042b in qemu_thread_start (args=0x558eef1da7e0) at util/qemu-thread-posix.c:519
#18 0x00007fa0f5a2a4a4 in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#19 0x00007fa0f576cd0f in clone () from /lib/x86_64-linux-gnu/libc.so.6

调试qemu 硬盘io的过程

作者：leonwxqian
时间：February 5, 2020
分类：技术分享
评论

好久没有水文章了……在家无聊，正好最近也是在研究虚拟化相关的东西，就调一调qemu中文件写入的流程吧。

这里说的写入是指，qemu启动的虚拟机，虚拟机中如果发生文件IO，那么qemu如何知道要更新对应的虚拟磁盘文件呢？qemu这方面我比较菜，说实话，刚接触不到1周，感觉能水的文章还是挺多的。而且本篇大概率会有错误……反正不管，先从这个开始吧。

先粘一下编译选项，后面换机器不用再找了……直接复制
./configure --target-list=x86_64-softmmu --enable-kvm --enable-debug --enable-debug-info --enable-modules --enable-vnc --disable-strip

为了方便调试，我将qemu启动的虚拟机设置成为TinyCore Linux（http://www.tinycorelinux.net/）。毕竟现在我还在老家，搞不到Linux电脑，实际的调试环境是Windows上跑一个VirtualBox，里面跑个Linux，Linux再跑Qemu，如果是比较完整的Linux，估计我这台老爷机得卡死，所以一切最简化，用这个Linux安装一个命令行版的就可以了。

（后记：因为我启动参数配置错误，整个虚拟机跑在tcg模式下，性能依旧很慢，不过先不管这些，直接看看tcg下是如何通知到硬盘写入操作的，是否和kvm不同。）

我为虚拟机设置的磁盘格式是qcow2格式，然而问题来了，我该从哪里下手，换言之，我该断哪个函数？众所周知，也可能不知，与块设备相关的文件大部分位于block/下面。于是直接在block/下搜索qcow2 AND write，很快，发现几个函数，其中一个是qcow2_pre_write_overlap_check，看起来是一个很有用的校验函数。gdb挂上qemu后下个断点，很快地，就能断到它。

Thread 5 (Thread 0x7f8f31d33700 (LWP 23615)):
#0  0x0000562359abf4f0 in qcow2_pre_write_overlap_check (bs=0x56235abb8280, ign=0, offset=359936, size=4096, data_file=true) at block/qcow2-refcount.c:2817
#1  0x0000562359ab132a in qcow2_co_pwritev_part (bs=0x56235abb8280, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/qcow2.c:2513
#2  0x0000562359afe694 in bdrv_driver_pwritev (bs=0x56235abb8280, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:1171
#3  0x0000562359b0066a in bdrv_aligned_pwritev (child=0x56235aa76db0, req=0x7f8f183e9e10, offset=32256, bytes=4096, align=1, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:1980
#4  0x0000562359b00e44 in bdrv_co_pwritev_part (child=0x56235aa76db0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/io.c:2137
#5  0x0000562359ae736b in blk_co_pwritev_part (blk=0x56235aaa6ed0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, qiov_offset=0, flags=0) at block/block-backend.c:1211
#6  0x0000562359ae73bd in blk_co_pwritev (blk=0x56235aaa6ed0, offset=32256, bytes=4096, qiov=0x7f8f14136db0, flags=0) at block/block-backend.c:1221
#7  0x0000562359ae7b93 in blk_aio_write_entry (opaque=0x7f8f14024650) at block/block-backend.c:1415
#8  0x0000562359beafcb in coroutine_trampoline (i0=335845504, i1=32655) at util/coroutine-ucontext.c:115
#9  0x00007f8f504286b0 in __start_context () at /lib/x86_64-linux-gnu/libc.so.6
#10 0x00007f8f31d2ef80 in  ()
#11 0x0000000000000000 in  ()

coroutine_trampoline是qemu实现协程的主要函数，而进入的入口则是blk_aio_write_entry。

搜索对blk_aio_write_entry的引用，可以发现仅有这两处引用：

block-backend.c
1424    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1428                        blk_aio_write_entry, flags, cb, opaque);

分别位于

1424：
blk_aio_pwrite_zeroes -> blk_aio_prwv

1428：
blk_aio_pwritev -> blk_aio_prwv

而在blk_aio_prwv中，可以明显的看到这个协程的创建过程。

/*
 * Start an asynchronous block-backend request by running co_entry inside a
 * freshly created coroutine.
 *
 * blk      - backend the request is issued on; its in-flight counter is
 *            incremented before anything else happens.
 * offset, iobuf, flags - stored in acb->rwco for the coroutine to use.
 * bytes    - stored in acb->bytes.
 * co_entry - coroutine entry function; it receives acb as its argument.
 * cb, opaque - passed to blk_aio_get() when the AIOCB is allocated
 *            (presumably the completion callback and its argument - confirm
 *            against blk_aio_get()).
 *
 * Returns a pointer to the common part of the allocated AIOCB.
 */
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
                                void *iobuf, CoroutineEntry co_entry,
                                BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque) {
    BlkAioEmAIOCB *acb;
    Coroutine *co;

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    /* Record the request; ret starts as NOT_DONE and is re-checked below. */
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
        .iobuf  = iobuf,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
    acb->bytes = bytes;
    /* Cleared before the coroutine is entered and set once control is back
     * here. NOTE(review): presumably co_entry reads this flag to decide who
     * performs completion - confirm in the entry functions. */
    acb->has_returned = false;

    /* Coroutine creation point: co_entry is the entry, acb its argument. */
    /* HERE */co = qemu_coroutine_create(co_entry, acb);
    bdrv_coroutine_enter(blk_bs(blk), co);

    acb->has_returned = true;
    /* If a result has already been stored by the time the coroutine hands
     * control back, schedule blk_aio_complete_bh as a one-shot event on the
     * backend's AioContext instead of completing inline. */
    if (acb->rwco.ret != NOT_DONE) {
        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
                                         blk_aio_complete_bh, acb);
    }

    return &acb->common; }

协程非常类似于线程。但是协程是协作式多任务的，而线程典型是抢占式多任务的。这意味着协程提供并发性而非并行性。
知道协程的创建位置就好办了，继续往上层的blk_aio_prwv挂断点。

很快，我们可以拿到这样的栈，而且是带消息循环的栈，大致就能知道断点下对了。

#0  blk_aio_prwv (blk=0x55a4a09c5800, offset=0, bytes=4096, iobuf=0x7f1dc8036c60, co_entry=0x55a49e41d9d0 <blk_aio_read_entry>, flags=0, cb=0x55a49e0ddbc2 <dma_blk_cb>, opaque=0x7f1dc8036c00)
    at block/block-backend.c:1360
#1  0x000055a49e41ddc5 in blk_aio_preadv (blk=0x55a4a09c5800, offset=0, qiov=0x7f1dc8036c60, flags=0, cb=0x55a49e0ddbc2 <dma_blk_cb>, opaque=0x7f1dc8036c00) at block/block-backend.c:1479
#2  0x000055a49e0de16a in dma_blk_read_io_func (offset=0, iov=0x7f1dc8036c60, cb=0x55a49e0ddbc2 <dma_blk_cb>, cb_opaque=0x7f1dc8036c00, opaque=0x55a4a09c5800) at dma-helpers.c:243
#3  0x000055a49e0dde9a in dma_blk_cb (opaque=0x7f1dc8036c00, ret=0) at dma-helpers.c:168
#4  0x000055a49e0de119 in dma_blk_io (ctx=0x55a4a08876d0, sg=0x55a4a171b788, offset=0, align=512, io_func=0x55a49e0de11f <dma_blk_read_io_func>, io_func_opaque=0x55a4a09c5800, 
    cb=0x55a49e1cadf1 <ide_dma_cb>, opaque=0x55a4a171b460, dir=DMA_DIRECTION_FROM_DEVICE) at dma-helpers.c:232
#5  0x000055a49e0de1c7 in dma_blk_read (blk=0x55a4a09c5800, sg=0x55a4a171b788, offset=0, align=512, cb=0x55a49e1cadf1 <ide_dma_cb>, opaque=0x55a4a171b460) at dma-helpers.c:250
#6  0x000055a49e1cb11f in ide_dma_cb (opaque=0x55a4a171b460, ret=0) at hw/ide/core.c:915
#7  0x000055a49e1d4d79 in bmdma_cmd_writeb (bm=0x55a4a171c5b0, val=9) at hw/ide/pci.c:306
#8  0x000055a49e1d5aad in bmdma_write (opaque=0x55a4a171c5b0, addr=0, val=9, size=1) at hw/ide/piix.c:75
#9  0x000055a49df42831 in memory_region_write_accessor (mr=0x55a4a171c700, addr=0, value=0x7f1dd8ea5a48, size=1, shift=0, mask=255, attrs=...) at /home/leon/qemu-4.2.0/memory.c:483
#10 0x000055a49df42a18 in access_with_adjusted_size (addr=0, value=0x7f1dd8ea5a48, size=1, access_size_min=1, access_size_max=4, access_fn=0x55a49df42771 <memory_region_write_accessor>, 
    mr=0x55a4a171c700, attrs=...) at /home/leon/qemu-4.2.0/memory.c:544
#11 0x000055a49df459c2 in memory_region_dispatch_write (mr=0x55a4a171c700, addr=0, data=9, op=MO_8, attrs=...) at /home/leon/qemu-4.2.0/memory.c:1475
#12 0x000055a49dee5a07 in address_space_stb (as=0x55a49eeac0e0 <address_space_io>, addr=49216, val=9, attrs=..., result=0x0) at /home/leon/qemu-4.2.0/memory_ldst.inc.c:378
#13 0x000055a49e0a7d16 in helper_outb (env=0x55a4a0bfa3e0, port=49216, data=9) at /home/leon/qemu-4.2.0/target/i386/misc_helper.c:33
#14 0x00007f1dbd998d65 in code_gen_buffer ()
#15 0x000055a49df7ad63 in cpu_tb_exec (cpu=0x55a4a0bf1b80, itb=0x7f1dbde60980 <code_gen_buffer+31852886>) at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:172
#16 0x000055a49df7bc47 in cpu_loop_exec_tb (cpu=0x55a4a0bf1b80, tb=0x7f1dbde60980 <code_gen_buffer+31852886>, last_tb=0x7f1dd8ea6078, tb_exit=0x7f1dd8ea6070)
    at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:618
#17 0x000055a49df7bf61 in cpu_exec (cpu=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/accel/tcg/cpu-exec.c:731
#18 0x000055a49df33eb8 in tcg_cpu_exec (cpu=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/cpus.c:1473
#19 0x000055a49df3470e in qemu_tcg_cpu_thread_fn (arg=0x55a4a0bf1b80) at /home/leon/qemu-4.2.0/cpus.c:1781
#20 0x000055a49e50488c in qemu_thread_start (args=0x55a4a0956070) at util/qemu-thread-posix.c:519
#21 0x00007f1df39476db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#22 0x00007f1df366988f in clone () from /lib/x86_64-linux-gnu/libc.so.6

基本就是ioport直接写的方式。通过这种直接操作硬件端口的方式，向IDE控制器的bus master DMA端口写数据（从栈上看，bmdma_write位于hw/ide/piix.c，也就是PIIX的IDE控制器，而不是cmd646），来触发bmdma_write及后面一系列函数。具体的后面再看，等过段时间我去linux机器上再确认KVM的通知方式是否不一样，虽然感觉应该是一样的。

psproc源码阅读 - 3

psproc源码阅读 - 2

psproc源码阅读 - 1

跟踪qemu-kvm下的磁盘写入

调试qemu 硬盘io的过程

最新文章

最近回复

分类

归档

其它

友情链接