perf_event Source Code Analysis
Preface
In short, perf is a performance monitoring tool. It first programs the performance counters provided by the processor, setting a counter threshold and the event to count; the counter is then incremented each time the configured event occurs, until its value reaches the threshold. Different architectures extract the counter value in different ways. On MIPS, a hardware interrupt is registered, so a counter overflow raises an interrupt and the value is recorded in the interrupt handler. On x86, the notifier-chain mechanism is used instead: the overflow handler is registered on the die_chain notifier, and whenever any hardware interrupt fires, it checks whether a performance counter has overflowed and, if so, records the value. This implementation avoids registering a dedicated hardware interrupt just for counter overflow.
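To make the overflow mechanism concrete from the user side, here is a minimal sketch built purely on the documented perf_event_open(2) ABI (illustrative code, not taken from the perf source): the counter is programmed with a period (the "threshold"), and the kernel notifies us with SIGIO at each overflow instead of us handling the PMU interrupt ourselves.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static volatile sig_atomic_t overflows;
static int event_fd;

static void on_overflow(int sig)
{
    overflows++;
    ioctl(event_fd, PERF_EVENT_IOC_REFRESH, 1);  /* re-arm for one more overflow */
}

int main(void)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size          = sizeof(attr);
    attr.type          = PERF_TYPE_HARDWARE;
    attr.config        = PERF_COUNT_HW_CPU_CYCLES;
    attr.sample_period = 10 * 1000 * 1000;  /* the "threshold": overflow every 10M cycles */
    attr.disabled      = 1;

    /* pid = 0, cpu = -1: count this thread on any CPU */
    event_fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    if (event_fd < 0) { perror("perf_event_open"); return 1; }

    signal(SIGIO, on_overflow);
    fcntl(event_fd, F_SETFL, O_ASYNC);      /* async notification on overflow */
    fcntl(event_fd, F_SETSIG, SIGIO);
    fcntl(event_fd, F_SETOWN, getpid());

    ioctl(event_fd, PERF_EVENT_IOC_REFRESH, 1);  /* enable + arm one overflow */
    for (volatile long i = 0; i < 200 * 1000 * 1000; i++)
        ;                                   /* burn cycles */
    ioctl(event_fd, PERF_EVENT_IOC_DISABLE, 0);

    printf("overflow notifications: %d\n", (int)overflows);
    return 0;
}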
The perf source is split into a user-space part and a kernel part. The user-space code gives the user a command line for specifying events and sampling modes, and a major strength of perf is precisely its rich user-space tooling; one could say the kernel side merely provides the sampling engine, while the user side is the essence of perf. The user-space code lives under tools/perf and amounts to roughly 13,000 lines of C, plus a large number of scripts. The kernel code is divided into architecture-independent code (mainly kernel/events/core.c) and architecture-dependent code (under arch/x86/kernel/cpu/ for x86).
Here is the overall roadmap: we start from system boot and initialization (the perf-init work), then look at how user space specifies events, enters the kernel through the system call, and performs the sampling; the sampled data is returned to user space through a memory mapping, where the user-space tools do the higher-level analysis and display.
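Before diving into the source, here is a minimal self-contained sketch of that round trip for the counting case (essentially what perf stat boils down to). Everything here is the documented perf_event_open(2) ABI, not perf tool code; the sampling/mmap path is sketched later.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
    struct perf_event_attr attr;
    long long count;

    /* 1. user space describes the event ... */
    memset(&attr, 0, sizeof(attr));
    attr.size     = sizeof(attr);
    attr.type     = PERF_TYPE_HARDWARE;
    attr.config   = PERF_COUNT_HW_CPU_CYCLES;
    attr.disabled = 1;

    /* 2. ... and enters the kernel through the system call
     *    (pid = 0, cpu = -1: this thread, any CPU) */
    int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) { perror("perf_event_open"); return 1; }

    /* 3. the kernel counts while the workload runs ... */
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    for (volatile long i = 0; i < 10 * 1000 * 1000; i++)
        ;
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    /* 4. ... and user space pulls the result back and displays it */
    read(fd, &count, sizeof(count));
    printf("cycles: %lld\n", count);
    close(fd);
    return 0;
}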
perf_event Source Code Analysis (1): cmd_record
perf's main entry
tools/perf/perf.c

static struct cmd_struct commands[] = {
    { "buildid-cache", cmd_buildid_cache, 0 },
    { "buildid-list", cmd_buildid_list, 0 },
    { "diff", cmd_diff, 0 },
    { "evlist", cmd_evlist, 0 },
    { "help", cmd_help, 0 },
    { "list", cmd_list, 0 },
    { "record", cmd_record, 0 },
    { "report", cmd_report, 0 },
    { "bench", cmd_bench, 0 },
    { "stat", cmd_stat, 0 },
    { "timechart", cmd_timechart, 0 },
    { "top", cmd_top, 0 },
    { "annotate", cmd_annotate, 0 },
    { "version", cmd_version, 0 },
    { "script", cmd_script, 0 },
    { "sched", cmd_sched, 0 },
#ifdef HAVE_LIBELF_SUPPORT
    { "probe", cmd_probe, 0 },
#endif
    { "kmem", cmd_kmem, 0 },
    { "lock", cmd_lock, 0 },
    { "kvm", cmd_kvm, 0 },
    { "test", cmd_test, 0 },
#ifdef HAVE_LIBAUDIT_SUPPORT
    { "trace", cmd_trace, 0 },
#endif
    { "inject", cmd_inject, 0 },
    { "mem", cmd_mem, 0 },
    { "data", cmd_data, 0 },
};
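Dispatch over this table is a straight name lookup on argv. A simplified sketch of the idea (the real code is handle_internal_command()/run_builtin() in tools/perf/perf.c; my_handle_command below is illustrative and the argument handling differs in detail):

#include <stdio.h>
#include <string.h>

struct cmd_struct {
    const char *cmd;
    int (*fn)(int, const char **, const char *);
    int option;
};

static int my_handle_command(struct cmd_struct *cmds, int n,
                             int argc, const char **argv)
{
    const char *cmd = argv[0];   /* e.g. "record", "stat" */
    int i;

    for (i = 0; i < n; i++) {
        if (strcmp(cmds[i].cmd, cmd))
            continue;
        /* hand the remaining arguments to cmd_record(), cmd_stat(), ... */
        return cmds[i].fn(argc, argv, NULL);
    }
    fprintf(stderr, "perf: '%s' is not a perf command\n", cmd);
    return -1;
}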
perf record's CALL CHAIN:
cmd_record                                  // allocates a struct record "rec" and an evlist in rec->evlist
    perf_evlist__new
    perf_config
    __cmd_record(&record, argc, argv);      // fills out "struct record"
        perf_session__new(file, false, tool);   // creates a session for this rec, rec->session; note: file is a "struct perf_data_file *", i.e. &rec->file
            machines__init(&session->machines);
            ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
            perf_data_file__open(file)
                check_pipe(file)
                file->path = "perf.data"    // if no name was specified, fill out file->path
                open_file(file);
                    fd = perf_data_file__is_read(file) ? open_file_read(file) : open_file_write(file);
                    file->fd = fd;
            perf_session__create_kernel_maps(session)
        fd = perf_data_file__fd(file);      // get rec's fd, rec->file->fd
        record__init_features(rec);
            perf_header__set_feat           // fills out the session's header for this rec, rec->session->header
        record__open(rec)
            perf_evlist__config(evlist, opts);      // perf_evlist
                perf_evsel__config(evsel, opts);    // perf_evsel
        perf_header__clear_feat
        perf_header__write_pipe / perf_session__write_header
        perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine);
        perf_event__synthesize_modules(tool, process_synthesized_event, machine);
        machines__process_guests(&session->machines, perf_event__synthesize_guest_os, tool);
        __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, process_synthesized_event, opts->sample_address);
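The sampled data that __cmd_record collects comes back through the ring buffer mmap'ed on the event fd: page 0 holds a struct perf_event_mmap_page header, and the following 2^n pages form a ring of perf_event_header records. A hedged stand-alone sketch of that ABI (illustrative code, not from the perf source; for simplicity it ignores records that physically wrap around the ring edge, which the real tool handles by copying them out):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#define RING_PAGES 8   /* must be a power of two */

int main(void)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size          = sizeof(attr);
    attr.type          = PERF_TYPE_SOFTWARE;
    attr.config        = PERF_COUNT_SW_CPU_CLOCK;
    attr.sample_period = 100000;
    attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

    int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) { perror("perf_event_open"); return 1; }

    size_t page = sysconf(_SC_PAGESIZE);
    size_t len  = (1 + RING_PAGES) * page;   /* header page + data pages */
    void *base  = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }

    struct perf_event_mmap_page *meta = base;
    char *data = (char *)base + page;

    for (volatile long i = 0; i < 50 * 1000 * 1000; i++)
        ;                                    /* generate some samples */

    /* Consume whole records between data_tail and data_head. */
    __sync_synchronize();                    /* read barrier for data_head */
    __u64 head = meta->data_head;
    __u64 tail = meta->data_tail;
    while (tail < head) {
        struct perf_event_header *hdr =
            (void *)(data + (tail & (RING_PAGES * page - 1)));
        if (hdr->type == PERF_RECORD_SAMPLE)
            printf("sample, size %u\n", hdr->size);
        tail += hdr->size;
    }
    meta->data_tail = tail;                  /* tell the kernel we consumed it */

    munmap(base, len);
    close(fd);
    return 0;
}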
tools/perf/builtin-record.c

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
    int err = -ENOMEM;
    struct record *rec = &record;
    char errbuf[BUFSIZ];

    rec->evlist = perf_evlist__new();
    if (rec->evlist == NULL)
        return -ENOMEM;

    perf_config(perf_record_config, rec);   // parse the config, tools/perf/util/config.c

    argc = parse_options(argc, argv, record_options, record_usage,
                         PARSE_OPT_STOP_AT_NON_OPTION);
    if (!argc && target__none(&rec->opts.target))
        usage_with_options(record_usage, record_options);

    if (nr_cgroups && !rec->opts.target.system_wide) {
        ui__error("cgroup monitoring only available in"
                  " system-wide mode\n");
        usage_with_options(record_usage, record_options);
    }
}
tools/perf/util/parse-events.c

setup_events            // tools/perf/builtin-stat.c
    parse_events        // tools/perf/util/parse-events.c
        parse_events    // tools/perf/util/parse-events.c

int parse_events(struct perf_evlist *evlist, const char *str)
{
    struct parse_events_evlist data = {
        .list = LIST_HEAD_INIT(data.list),
        .idx  = evlist->nr_entries,
    };
    int ret;

    ret = parse_events__scanner(str, &data, PE_START_EVENTS);
    perf_pmu__parse_cleanup();
    if (!ret) {
        int entries = data.idx - evlist->nr_entries;

        perf_evlist__splice_list_tail(evlist, &data.list, entries);
        evlist->nr_groups += data.nr_groups;
        return 0;
    }

    /*
     * There are 2 users - builtin-record and builtin-test objects.
     * Both call perf_evlist__delete in case of error, so we dont
     * need to bother.
     */
    return ret;
}
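What parsing ultimately produces is one perf_evsel per event, with a filled-in perf_event_attr. As a rough illustration of that end result (fill_attr_from_name is hypothetical; the real scanner/parser also handles PMU syntax, modifiers, groups, and tracepoints), an event name like "cycles" boils down to:

#include <string.h>
#include <linux/perf_event.h>

/* Illustrative only: maps a few hardware event names to attr fields,
 * which is the net effect of parse_events__scanner for these names. */
static int fill_attr_from_name(const char *name, struct perf_event_attr *attr)
{
    memset(attr, 0, sizeof(*attr));
    attr->size = sizeof(*attr);
    attr->type = PERF_TYPE_HARDWARE;

    if (!strcmp(name, "cycles"))
        attr->config = PERF_COUNT_HW_CPU_CYCLES;
    else if (!strcmp(name, "instructions"))
        attr->config = PERF_COUNT_HW_INSTRUCTIONS;
    else if (!strcmp(name, "cache-misses"))
        attr->config = PERF_COUNT_HW_CACHE_MISSES;
    else if (!strcmp(name, "branch-misses"))
        attr->config = PERF_COUNT_HW_BRANCH_MISSES;
    else
        return -1;   /* unknown name: the real parser tries PMUs, etc. */
    return 0;
}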
struct introduction
tools/perf/util/target.h

struct target {
    const char *pid;
    const char *tid;
    const char *cpu_list;
    const char *uid_str;
    uid_t       uid;
    bool        system_wide;
    bool        uses_mmap;
    bool        default_per_cpu;
    bool        per_thread;
};

===

tools/perf/util/data.h

struct perf_data_file {
    const char          *path;
    int                  fd;
    bool                 is_pipe;
    bool                 force;
    unsigned long        size;
    enum perf_data_mode  mode;
};

===

tools/perf/util/session.h

struct perf_session {
    struct perf_header     header;
    struct machines        machines;
    struct perf_evlist    *evlist;
    struct trace_event     tevent;
    bool                   repipe;
    bool                   one_mmap;
    void                  *one_mmap_addr;
    u64                    one_mmap_offset;
    struct ordered_events  ordered_events;
    struct perf_data_file *file;
    struct perf_tool      *tool;
};

===

tools/perf/util/evlist.h

struct perf_evlist {
    struct list_head   entries;
    struct hlist_head  heads[PERF_EVLIST__HLIST_SIZE];
    int                nr_entries;
    int                nr_groups;
    int                nr_mmaps;
    size_t             mmap_len;
    int                id_pos;
    int                is_pos;
    u64                combined_sample_type;
    struct {
        int   cork_fd;
        pid_t pid;
    } workload;
    bool               overwrite;
    struct fdarray     pollfd;
    struct perf_mmap  *mmap;
    struct thread_map *threads;   // threads
    struct cpu_map    *cpus;      // cpus
    struct perf_evsel *selected;
    struct events_stats stats;
};

===

tools/perf/util/evsel.h

/** struct perf_evsel - event selector **/
Each event passed in from user space maps to one perf_evsel struct.

struct perf_evsel {
    struct list_head       node;
    struct perf_event_attr attr;
    char                  *filter;
    struct xyarray        *fd;
    struct xyarray        *sample_id;
    u64                   *id;
    struct perf_counts    *counts;
    struct perf_counts    *prev_raw_counts;
    int                    idx;
    u32                    ids;
    char                  *name;
    double                 scale;
    const char            *unit;
    bool                   snapshot;
    struct event_format   *tp_format;
    ... ...
    struct perf_evsel     *leader;
};

===

tools/perf/builtin-record.c

struct record {
    struct perf_tool      tool;
    struct record_opts    opts;
    u64                   bytes_written;
    struct perf_data_file file;
    struct perf_evlist   *evlist;
    struct perf_session  *session;
    const char           *progname;
    int                   realtime_prio;
    bool                  no_buildid;
    bool                  no_buildid_cache;
    long                  samples;
};

===

This part is important: struct perf_stat holds an array of three "struct stats" elements, which get initialized by

    for (i = 0; i < 3; i++)
        init_stats(&ps->res_stats[i]);

struct perf_stat {
    struct stats res_stats[3];
};

tools/perf/util/stat.h

struct stats {
    double n, mean, M2;
    u64 max, min;
};

===

tools/perf/util/evsel.h

struct perf_counts_values {
    union {
        struct {
            u64 val;
            u64 ena;
            u64 run;
        };
        u64 values[3];
    };
};

struct perf_counts {
    s8                        scaled;
    struct perf_counts_values aggr;
    struct perf_counts_values cpu[];
};
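The n/mean/M2 fields of struct stats implement Welford's online mean/variance algorithm, which is what init_stats() and update_stats() in tools/perf/util/stat.c do: the mean and the sum of squared deviations (M2) are updated incrementally per value, so no value history needs to be kept. A self-contained sketch (my_* names are illustrative):

#include <math.h>

typedef unsigned long long u64;

struct stats { double n, mean, M2; u64 max, min; };   /* as above */

static void my_init_stats(struct stats *s)
{
    s->n = s->mean = s->M2 = 0.0;
    s->min = (u64)-1;      /* so the first value always becomes the minimum */
    s->max = 0;
}

/* Welford's online update, one value at a time */
static void my_update_stats(struct stats *s, u64 val)
{
    double delta;

    s->n++;
    delta    = val - s->mean;
    s->mean += delta / s->n;              /* running mean */
    s->M2   += delta * (val - s->mean);   /* running sum of squared deviations */
    if (val > s->max) s->max = val;
    if (val < s->min) s->min = val;
}

static double my_stddev_stats(struct stats *s)
{
    if (s->n < 2)
        return 0.0;
    return sqrt(s->M2 / (s->n - 1));      /* sample standard deviation */
}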
perf stat's CALL CHAIN
CALL CHAIN:

commands                                    // tools/perf/perf.c
    cmd_stat                                // tools/perf/builtin-stat.c
        parse_events_option                 // for "perf stat -e xxx": validates the specified event name
            parse_events
                parse_events__scanner       // checks the events
                    parse_events_lex_init_extra
                    parse_events__scan_string
                    parse_events_parse
                    parse_events__flush_buffer
                    parse_events__delete_buffer
                    parse_events_lex_destroy
                perf_pmu__parse_cleanup
        perf_evlist__new();
            perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads)   // evlist->cpus, evlist->threads
                perf_evlist__set_maps
        parse_options
        parse_options_usage
        add_default_attributes()
        target__validate(&target);
        perf_evlist__create_maps(evsel_list, &target)   // fills out evlist->threads (a thread_map)
            evlist->threads = thread_map__new_str(target->pid, target->tid, target->uid);   // evlist->threads = [tid, tid, tid, ...]
            target__uses_dummy_map(target)
                evlist->cpus = cpu_map__dummy_new()     // evlist->cpus
                evlist->cpus = cpu_map__new(target->cpu_list)
        perf_evlist__alloc_stats(evsel_list, interval)
            evlist__for_each(evlist, evsel) {           // traverse all evsels
                perf_evsel__alloc_stat_priv(evsel)      // allocates evsel->priv = zalloc(sizeof(struct perf_stat));
                    perf_evsel__reset_stat_priv(evsel)
                        init_stats                      // fills out "struct perf_stat": three elements of "struct stats"
                perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel))   // allocates evsel->counts
                alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel)         // allocates evsel->prev_raw_counts
            }
        perf_stat_init_aggr_mode()
            cpu_map__build_socket_map
                cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
                    cpu_map__get_socket
            cpu_map__build_core_map
                cpu_map__build_map(cpus, corep, cpu_map__get_core);
                    cpu_map__get_core
                    cpu_map__get_socket
        run_perf_stat(argc, argv);
            __run_perf_stat(argc, argv);
                perf_evlist__prepare_workload(evsel_list, &target, argv, false, workload_exec_failed_signal)
                perf_evlist__set_leader(evsel_list);    // evlist->nr_groups = 1 or 0,
                                                        // decided by whether evlist->nr_entries > 1
                    __perf_evlist__set_leader(&evlist->entries);
                evlist__for_each(evsel_list, evsel) {   // traverse all evsels
                    create_perf_stat_counter(evsel)
                        struct perf_event_attr *attr = &evsel->attr;
                        attr->xxx = xxx
                        perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel))
                        perf_evsel__is_group_leader(evsel)
                        perf_evsel__open_per_thread(evsel, evsel_list->threads)
                            // important: __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads)
                            __perf_evsel__open(evsel, &empty_cpu_map.map, threads)
                                // perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); if system_wide: nthreads = 1
                                perf_evsel__alloc_fd(evsel, cpus->nr, nthreads)
                                    evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
                                for (cpu = 0; cpu < cpus->nr; cpu++) {
                                    for (thread = 0; thread < nthreads; thread++) {
                                        group_fd = get_group_fd(evsel, cpu, thread);
                                        sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu], group_fd, flags);
                                    }
                                }
                }
                perf_evlist__apply_filters(evsel_list, &counter)
                    evlist__for_each(evlist, evsel) {
                        perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);
                    }
                t0 = rdclock();
                clock_gettime(CLOCK_MONOTONIC, &ref_time);
                if (forks) {
                    perf_evlist__start_workload(evsel_list);
                    handle_initial_delay();
                    if (interval)
                        print_interval();
                } else {
                    handle_initial_delay();
                    print_interval();
                }
                t1 = rdclock();
                update_stats(&walltime_nsecs_stats, t1 - t0);
                // now read the counts for each evsel
                if (aggr_mode == AGGR_GLOBAL) {
                    evlist__for_each(evsel_list, counter) {
                        // reads into a "struct perf_counts_values", stored in the evsel at &counter->counts->aggr (here the evsel is called "counter");
                        // there is also a "struct perf_stat" at counter->priv
                        read_counter_aggr(counter);
                            aggr->val = aggr->ena = aggr->run = 0;   // here, the whole perf_counts_values aggr is reset to zero
                            read_counter(counter)                    // how is this event read? iterate over every thread and cpu
                                int nthreads = thread_map__nr(evsel_list->threads);
                                int ncpus = perf_evsel__nr_cpus(counter);
                                int cpu, thread;
                                for (thread = 0; thread < nthreads; thread++) {
                                    for (cpu = 0; cpu < ncpus; cpu++) {
                                        // read via the (thread, cpu) two-dimensional array into a "struct perf_counts_values count"
                                        process_per_cpu(struct perf_evsel *evsel, int cpu, int thread)
                                            perf_evsel__read_cb(evsel, cpu, thread, &count)
                                                memset(count, 0, sizeof(*count));
                                                FD(evsel, cpu, thread)
                                                readn(FD(evsel, cpu, thread), count, sizeof(*count))
                                                    ion(true, fd, buf, n);
                                                        read(fd, buf, left)
                                                read_cb(evsel, cpu, thread, tmp);
                                                    switch (aggr_mode) {
                                                    case AGGR_CORE:
                                                    case AGGR_SOCKET:
                                                    case AGGR_NONE:
                                                        perf_evsel__compute_deltas(evsel, cpu, count);
                                                        perf_counts_values__scale(count, scale, NULL);
                                                        update_shadow_stats(evsel, count->values, cpu);
                                                    }
                                    }
                                }
                        perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), thread_map__nr(evsel_list->threads));
                    }
                } else {
                    evlist__for_each(evsel_list, counter) {
                        read_counter(counter);
                        perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
                    }
                }
                print_stat
                    print_aggr                          // AGGR_CORE, AGGR_SOCKET
                    print_counter_aggr(evsel, NULL);    // AGGR_GLOBAL
                    print_counter(evsel, NULL)          // AGGR_NONE
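To see what a single read_counter() read amounts to per fd, here is a hedged stand-alone sketch (illustrative, not perf tool code): with PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING set in read_format, read() returns exactly the {val, ena, run} triple of struct perf_counts_values, and the scaling done by perf_counts_values__scale extrapolates val when the counter was multiplexed on the PMU (run < ena).

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* same layout as perf's struct perf_counts_values */
struct counts { unsigned long long val, ena, run; };

int main(void)
{
    struct perf_event_attr attr;
    struct counts c;

    memset(&attr, 0, sizeof(attr));
    attr.size        = sizeof(attr);
    attr.type        = PERF_TYPE_HARDWARE;
    attr.config      = PERF_COUNT_HW_INSTRUCTIONS;
    attr.disabled    = 1;
    /* ask the kernel for the {value, enabled, running} triple */
    attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                       PERF_FORMAT_TOTAL_TIME_RUNNING;

    int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) { perror("perf_event_open"); return 1; }

    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    for (volatile long i = 0; i < 10 * 1000 * 1000; i++)
        ;                                   /* workload */
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    if (read(fd, &c, sizeof(c)) != sizeof(c)) { perror("read"); return 1; }

    /* perf_counts_values__scale-style extrapolation: if the counter was
     * scheduled on the PMU only part of the time (run < ena), scale up */
    unsigned long long scaled = c.val;
    if (c.run > 0 && c.run < c.ena)
        scaled = (unsigned long long)((double)c.val * c.ena / c.run);

    printf("instructions: %llu (raw %llu, ena %llu, run %llu)\n",
           scaled, c.val, c.ena, c.run);
    close(fd);
    return 0;
}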