linuxftrace_如何选择一个 Linux Tracer

A. red hat 内核2.6.32的kernel/fork.c在哪里

部分核心的代码在arch/*/kernel目录
*
 2 * linux/kernel/fork.c
 3 *
 4 * Copyright (C) 1991, 1992 Linus Torvalds
 5 */
 6 
 7 /*
 8 * 'fork.c' contains the help-routines for the 'fork' system call
 9 * (see also entry.S and others).
 10 * Fork is rather simple, once you get the hang of it, but the memory
 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 12 */
 13 
 14 #include <linux/slab.h>
 15 #include <linux/init.h>
 16 #include <linux/unistd.h>
 17 #include <linux/module.h>
 18 #include <linux/vmalloc.h>
 19 #include <linux/completion.h>
 20 #include <linux/personality.h>
 21 #include <linux/mempolicy.h>
 22 #include <linux/sem.h>
 23 #include <linux/file.h>
 24 #include <linux/fdtable.h>
 25 #include <linux/iocontext.h>
 26 #include <linux/key.h>
 27 #include <linux/binfmts.h>
 28 #include <linux/mman.h>
 29 #include <linux/mmu_notifier.h>
 30 #include <linux/fs.h>
 31 #include <linux/nsproxy.h>
 32 #include <linux/capability.h>
 33 #include <linux/cpu.h>
 34 #include <linux/cgroup.h>
 35 #include <linux/security.h>
 36 #include <linux/hugetlb.h>
 37 #include <linux/seccomp.h>
 38 #include <linux/swap.h>
 39 #include <linux/syscalls.h>
 40 #include <linux/jiffies.h>
 41 #include <linux/futex.h>
 42 #include <linux/compat.h>
 43 #include <linux/kthread.h>
 44 #include <linux/task_io_accounting_ops.h>
 45 #include <linux/rcupdate.h>
 46 #include <linux/ptrace.h>
 47 #include <linux/mount.h>
 48 #include <linux/audit.h>
 49 #include <linux/memcontrol.h>
 50 #include <linux/ftrace.h>
 51 #include <linux/proc_fs.h>
 52 #include <linux/profile.h>
 53 #include <linux/rmap.h>
 54 #include <linux/ksm.h>
 55 #include <linux/acct.h>
 56 #include <linux/tsacct_kern.h>
 57 #include <linux/cn_proc.h>
 58 #include <linux/freezer.h>
 59 #include <linux/delayacct.h>
 60 #include <linux/taskstats_kern.h>
 61 #include <linux/random.h>
 62 #include <linux/tty.h>
 63 #include <linux/blkdev.h>
 64 #include <linux/fs_struct.h>
 65 #include <linux/magic.h>
 66 #include <linux/perf_event.h>
 67 #include <linux/posix-timers.h>
 68 #include <linux/user-return-notifier.h>
 69 #include <linux/oom.h>
 70 #include <linux/khugepaged.h>
 71 #include <linux/signalfd.h>
 72 #include <linux/uprobes.h>
 73 #include <linux/aio.h>
 74 
 75 #include <asm/pgtable.h>
 76 #include <asm/pgalloc.h>
 77 #include <asm/uaccess.h>
 78 #include <asm/mmu_context.h>
 79 #include <asm/cacheflush.h>
 80 #include <asm/tlbflush.h>
 81 
 82 #include <trace/events/sched.h>
 83 
 84 #define CREATE_TRACE_POINTS
 85 #include <trace/events/task.h>
 86 
 87 /*
 88 * Protected counters by write_lock_irq(&tasklist_lock)
 89 */
 90 unsigned long total_forks;   /* Handle normal Linux uptimes. */
 91 int nr_threads;         /* The idle threads do not count.. */
 92 
 93 int max_threads;        /* tunable limit on nr_threads */
 94 
 95 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 96 
 97 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
 98 
 99 #ifdef CONFIG_PROVE_RCU
100 int lockdep_tasklist_lock_is_held(void)
101 {
102     return lockdep_is_held(&tasklist_lock);
103 }
104 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
105 #endif /* #ifdef CONFIG_PROVE_RCU */
106 
107 int nr_processes(void)
108 {
109     int cpu;
110     int total = 0;
111 
112     for_each_possible_cpu(cpu)
113         total += per_cpu(process_counts, cpu);
114 
115     return total;
116 }
117 
118 void __weak arch_release_task_struct(struct task_struct *tsk)
119 {
120 }
121 
122 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
123 static struct kmem_cache *task_struct_cachep;
124 
125 static inline struct task_struct *alloc_task_struct_node(int node)
126 {
127     return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
128 }
129 
130 static inline void free_task_struct(struct task_struct *tsk)
131 {
132     kmem_cache_free(task_struct_cachep, tsk);
133 }
134 #endif
135 
136 void __weak arch_release_thread_info(struct thread_info *ti)
137 {
138 }
139 
140 #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
141 
142 /*
143 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
144 * kmemcache based allocator.
145 */
146 # if THREAD_SIZE >= PAGE_SIZE
147 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
148                          int node)
149 {
150     struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
151                       THREAD_SIZE_ORDER);
152 
153     return page ? page_address(page) : NULL;
154 }
155 
156 static inline void free_thread_info(struct thread_info *ti)
157 {
158     free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
159 }
160 # else
161 static struct kmem_cache *thread_info_cache;
162 
163 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164                          int node)
165 {
166     return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
167 }
168 
169 static void free_thread_info(struct thread_info *ti)
170 {
171     kmem_cache_free(thread_info_cache, ti);
172 }
173 
174 void thread_info_cache_init(void)
175 {
176     thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
177                        THREAD_SIZE, 0, NULL);
178     BUG_ON(thread_info_cache == NULL);
179 }
180 # endif
181 #endif
182 
183 /* SLAB cache for signal_struct structures (tsk->signal) */
184 static struct kmem_cache *signal_cachep;
185 
186 /* SLAB cache for sighand_struct structures (tsk->sighand) */
187 struct kmem_cache *sighand_cachep;
188 
189 /* SLAB cache for files_struct structures (tsk->files) */
190 struct kmem_cache *files_cachep;

B. 编译linux 2.6.30内核提示错误

make clean

C. 用C语言结构状态描述进程控制块的信息

以下来自linux内核2.6.35.7版本的代码，在头文件linux/sched.h中定义
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;

int lock_depth; /* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
int oncpu;
#endif
#endif

int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif

/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif

unsigned int policy;
cpumask_t cpus_allowed;

#ifdef CONFIG_TREE_PREEMPT_RCU
int rcu_read_lock_nesting;
char rcu_read_unlock_special;
struct rcu_node *rcu_blocked_node;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif

struct list_head tasks;
struct plist_node pushable_tasks;

struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;

/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;

pid_t pid;
pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif

/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct *real_parent; /* real parent process */
struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;

/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;

struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

struct task_cputime cputime_expires;
struct list_head cpu_timers[3];

/* process credentials */
const struct cred *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred *cred; /* effective (overridable) subjective task
* credentials (COW) */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace) */
struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */

char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
uid_t loginuid;
unsigned int sessionid;
#endif
seccomp_t seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;

#ifdef CONFIG_GENERIC_HARDIRQS
/* IRQ handler threads */
struct irqaction *irqaction;
#endif

/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio_list *bio_list;

/* VM state */
struct reclaim_state *reclaim_state;

struct backing_dev_info *backing_dev_info;

struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
int mems_allowed_change_disable;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;

/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;

struct list_head *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
*/
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
struct memcg_batch_info {
int do_batch; /* incremented when batch uncharge started */
struct mem_cgroup *memcg; /* target memcg of uncharge */
unsigned long bytes; /* uncharged usage */
unsigned long memsw_bytes; /* uncharged mem+swap usage */
} memcg_batch;
#endif
};

D. 《Mastering Linux Programming》txt下载在线阅读全文,求百度网盘云资源

《Mastering Embedded Linux Programming - Second Edition》（Chris Simmonds）电子书网盘下载免费在线阅读

链接: https://pan.baidu.com/s/1YaMA1KDcVCte49Usibsk4A

pdf" data_size="5.14M" data_filelogo="https://gss0.bdstatic.com//yun-file-logo/file-logo-6.png" data_number="1" data_sharelink="https://pan..com/s/1YaMA1KDcVCte49Usibsk4A" data_code="i8sb">

提取码: i8sb

书名：Mastering Embedded Linux Programming - Second Edition

作者：Chris Simmonds

出版社：Packt Publishing

出版年份：2017-6-30

页数：478

内容简介：

Key Features

- Discover how to build and configure reliable embedded Linux devices
- This book has been updated to include Linux 4.9 and Yocto Project 2.2 (Morty)
- This comprehensive guide covers the remote update of devices in the field and power management

Book Description

Embedded Linux runs many of the devices we use every day, from smart TVs to WiFi routers, test equipment to industrial controllers - all of them have Linux at their heart. Linux is a core technology in the implementation of the inter-connected world of the Internet of Things.

The comprehensive guide shows you the technologies and techniques required to build Linux into embedded systems. You will begin by learning about the fundamental elements that underpin all embedded Linux projects: the toolchain, the bootloader, the kernel, and the root filesystem. You'll see how to create each of these elements from scratch, and how to automate the process using Buildroot and the Yocto Project.

Moving on, you'll find out how to implement an effective storage strategy for flash memory chips, and how to install updates to the device remotely once it is deployed. You'll also get to know the key aspects of writing code for embedded Linux, such as how to access hardware from applications, the implications of writing multi-threaded code, and techniques to manage memory in an efficient way. The final chapters show you how to debug your code, both in applications and in the Linux kernel, and how to profile the system so that you can look out for performance bottlenecks.

By the end of the book, you will have a complete overview of the steps required to create a successful embedded Linux system.

What you will learn

- Evaluate the Board Support Packages offered by most manufacturers of a system on chip or embedded module
- Use Buildroot and the Yocto Project to create embedded Linux systems quickly and efficiently
- Update IoT devices in the field without compromising security
- Reduce the power budget of devices to make batteries last longer
- Interact with the hardware without having to write kernel device drivers
- Debug devices remotely using GDB, and see how to measure the performance of the systems using powerful tools such as perf, ftrace, and valgrind
- Find out how to configure Linux as a real-time operating system

About the Author

Chris Simmonds is a software consultant and trainer living in southern England. He has almost two decades of experience in designing and building open-source embedded systems. He is the founder and chief consultant at 2net Ltd, which provides professional training and mentoring services in embedded Linux, Linux device drivers, and android platform development. He has trained engineers at many of the biggest companies in the embedded world, including ARM, Qualcomm, Intel, Ericsson, and General Dynamics. He is a frequent presenter at open source and embedded conferences, including the Embedded Linux Conference and Embedded World. You can see some of his work on the Inner Penguin blog.

Table of Contents

1. Starting out
2. Learning about Toolchains
3. All about Bootloaders
4. Porting and Configuring the Kernel
5. Building a Root filesystem
6. Selecting a Build System
7. Creating a storage strategy
8. Updating software in the field
9. Interfacing with Device Drivers
10. Starting up: the init program
11. Power management
12. Learning about processes and threads
13. Managing Memory
14. Debugging with GDB
15. Profiling and tracing
16. Real time programming

作者简介：

About the Author

Chris Simmonds

Chris Simmonds is a software consultant and trainer living in southern England. He has almost two decades of experience in designing and building open-source embedded systems. He is the founder and chief consultant at 2net Ltd, which provides professional training and mentoring services in embedded Linux, Linux device drivers, and Android platform development. He has trained engineers at many of the biggest companies in the embedded world, including ARM, Qualcomm, Intel, Ericsson, and General Dynamics. He is a frequent presenter at open source and embedded conferences, including the Embedded Linux Conference and Embedded World. You can see some of his work on the Inner Penguin blog.

E. linux 2.6 和 3.10 的不同

3.10是比较新的内核，2.6大多数用于服务器。3.10更改/升级了更多2.6已经发现的BUG，但没有在SERVER系统中大规模应用。具体更改成百上千，似乎只有开发人员才知道

F. 如何查看android systrace 分析

Systrace的介绍

Systrace跟踪代码
Systrace运行方式
Systrace数据分析
Systrace使用示例
TraceView的介绍
TraceView运行方式
TraceView数据分析
TraceView使用示例
总结

内容：

1.Systrace的介绍

>Systrace是Android4.1中新增的性能数据采样和分析工具。它可帮助开发者收集Android关键子系统（如Surfaceflinger、WindowManagerService等Framework部分关键模块、服务）的运行信息，从而帮助开发者更直观的分析系统瓶颈，改进性能。

Systrace的功能包括跟踪系统的I/O操作、内核工作队列、CPU负载以及Android各个子系统的运行状况等。在Android平台中，它主要由3部分组成：

1.内核部分：Systrace利用了Linux Kernel中的ftrace功能。所以，如果要使用Systrace的话，必须开启kernel中和ftrace相关的模块。

2.数据采集部分：Android定义了一个Trace类。应用程序可利用该类把统计信息输出给ftrace。同时，Android还有一个atrace程序，它可以从ftrace中读取统计信息然后交给数据分析工具来处理。

3.数据分析工具：Android提供一个systrace.py（python脚本文件，位于Android SDK目录/tools/systrace中，其内部将调用atrace程序）用来配置数据采集的方式（如采集数据的标签、输出文件名等）和收集 ftrace统计数据并生成一个结果网页文件供用户查看。

从本质上说，Systrace是对Linux Kernel中ftrace的封装。应用进程需要利用Android提供的Trace类来使用Systrace。

G. 如何linux内核报告问题

Linux Kernel BUG:soft lockup CPU#1 stuck分析
1.线上内核bug日志
kernel: Delta way too big! 18428729675200069867 ts=18446743954022816244 write stamp = 18014278822746377
kernel:------------[ cut here ]------------
kernel:WARNING: at kernel/trace/ring_buffer.c:1988 rb_reserve_next_event+0x2ce/0x370()(Not tainted)
kernel:Hardware name: ProLiant DL360 G7
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel: Pid:5483, comm: master Not tainted 2.6.32-220.el6.x86_64 #1
kernel: CallTrace:
kernel:[<ffffffff81069b77>] ? warn_slowpath_common+0x87/0xc0
kernel:[<ffffffff81069bca>] ? warn_slowpath_null+0x1a/0x20
kernel:[<ffffffff810ea8ae>] ? rb_reserve_next_event+0x2ce/0x370
kernel:[<ffffffff810eab02>] ? ring_buffer_lock_reserve+0xa2/0x160
kernel:[<ffffffff810ec97c>] ? trace_buffer_lock_reserve+0x2c/0x70
kernel:[<ffffffff810ecb16>] ? trace_current_buffer_lock_reserve+0x16/0x20
kernel:[<ffffffff8107ae1e>] ? ftrace_raw_event_hrtimer_cancel+0x4e/0xb0
kernel:[<ffffffff81095e7a>] ? hrtimer_try_to_cancel+0xba/0xd0
kernel:[<ffffffff8106f634>] ? do_setitimer+0xd4/0x220
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
kernel: ---[end trace 4d0a1ef2e62cb1a2 ]---
abrt-dump-oops: Reported 1 kernel oopses to Abrt
kernel: BUG: softlockup - CPU#11 stuck for 4278190091s! [qmgr:5492]
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel: CPU 11
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel:
kernel: Pid:5492, comm: qmgr Tainted: G W ---------------- 2.6.32-220.el6.x86_64 #1 HPProLiant DL360 G7
kernel: RIP:0010:[<ffffffff8106f730>] [<ffffffff8106f730>]do_setitimer+0x1d0/0x220
kernel: RSP:0018:ffff88080a661ef8 EFLAGS: 00000286
kernel: RAX:ffff88080b175a08 RBX: ffff88080a661f18 RCX: 0000000000000000
kernel: RDX:0000000000000000 RSI: 0000000000000082 RDI: ffff88080c8c4c40
kernel: RBP:ffffffff8100bc0e R08: 0000000000000000 R09: 0099d7270e01c3f1
kernel: R10:0000000000000000 R11: 0000000000000246 R12: ffffffff810ef9a3
kernel: R13:ffff88080a661e88 R14: 0000000000000000 R15: ffff88080a65a544
kernel: FS:00007f10b245f7c0(0000) GS:ffff88083c4a0000(0000) knlGS:0000000000000000
kernel: CS:0010 DS: 0000 ES: 0000 CR0: 000000008005003b
kernel: CR2:00007ff955977380 CR3: 000000100a80b000 CR4: 00000000000006e0
kernel: DR0:0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
kernel: DR3:0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
kernel:Process qmgr (pid: 5492, threadinfo ffff88080a660000, task ffff880809577500)
kernel: Stack:
kernel:00007f10b323def0 00007f10b248ead0 00007f10b26d0f78 00007f10b248ede0
kernel:<0> ffff88080a661f68 ffffffff8106f88a 0000000000000000 0000000000000000
kernel:<0> 000000000000014c 00000000000f423d 0000000000000000 0000000000000000
kernel: CallTrace:
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
kernel: Code:89 ef e8 74 66 02 00 83 3d 15 69 b5 00 00 75 37 49 8b 84 24 70 07 00 00 48 0508 08 00 00 66 ff 00 66 66 90 fb 66 0f 1f 44 00 00 <31> c0 e9 64 fe ff ff49 8b 84 24 68 07 00 00 48 c7 80 d0 00 00
kernel: CallTrace:
kernel:[<ffffffff8106f769>] ? do_setitimer+0x209/0x220
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
abrt-dump-oops: Reported 1 kernel oopses to Abrt

2.内核软死锁（soft lockup）bug原因分析
Soft lockup名称解释：所谓，soft lockup就是说，这个bug没有让系统彻底死机，但是若干个进程（或者kernel thread）被锁死在了某个状态（一般在内核区域），很多情况下这个是由于内核锁的使用的问题。
Linux内核对于每一个cpu都有一个监控进程，在技术界这个叫做watchdog（看门狗）。通过ps -ef | grep watchdog能够看见，进程名称大概是watchdog/X（数字：cpu逻辑编号1/2/3/4之类的）。这个进程或者线程每一秒钟运行一次，否则会睡眠和待机。这个进程运行会收集每一个cpu运行时使用数据的时间并且存放到属于每个cpu自己的内核数据结构。在内核中有很多特定的中断函数。这些中断函数会调用soft lockup计数，他会使用当前的时间戳与特定（对应的）cpu的内核数据结构中保存的时间对比，如果发现当前的时间戳比对应cpu保存的时间大于设定的阈值，他就假设监测进程或看门狗线程在一个相当可观的时间还没有执行。Cpu软锁为什么会产生，是怎么产生的？如果linux内核是经过精心设计安排的CPU调度访问，那么怎么会产生cpu软死锁？那么只能说由于用户开发的或者第三方软件引入，看我们服务器内核panic的原因就是qmgr进程引起。因为每一个无限的循环都会一直有一个cpu的执行流程（qmgr进程是一个后台邮件的消息队列服务进程），并且拥有一定的优先级。Cpu调度器调度一个驱动程序来运行，如果这个驱动程序有问题并且没有被检测到，那么这个驱动程序将会占用cpu很长时间。根据前面的描述，看门狗进程会抓住（catch）这一点并且抛出一个软死锁（soft lockup）错误。软死锁会挂起cpu使你的系统不可用。
如果是用户空间的进程或线程引起的问题backtrace是不会有内容的，如果内核线程那么在soft lockup消息中会显示出backtrace信息。
3.根据linux内核源码分析错误
根据我们第一部分内核抛出的错误信息和call trace（linux内核的跟踪子系统）来分析产生的具体原因。
首先根据我们的centos版本安装相应的linux内核源码，具体步骤如下：
（1）下载源码的rpm包kernel-2.6.32-220.17.1.el6.src.rpm
（2）安装相应的依赖库，命令：yum install rpm-build redhat-rpm-config asciidoc newt-devel
（3）安装源码包：rpm -i kernel-2.6.32-220.17.1.el6.src.rpm
（4）进入建立源码的目录：cd ~/rpmbuild/SPECS
（5）建立生成源码目录：rpmbuild -bp --target=`uname -m` kernel.spec

下面开始真正的根据内核bug日志分析源码：
（1）第一阶段内核错误日志分析（时间在Dec 4 14:03:34这个阶段的日志输出代码分析，其实这部分代码不会导致cpu软死锁，主要是第二阶段错误日志显示导致cpu软死锁）
我们首先通过日志定位到相关源代码：看下面日志：Dec 4 14:03:34 BP-YZH-1-xxxx kernel: WARNING: at kernel/trace/ring_buffer.c:1988 rb_reserve_next_event+0x2ce/0x370() (Not tainted)
根据日志内容我们可以很容易的定位到kernel/trace/ring_buffer.c这个文件的1988行代码如下：WARN_ON(1)。
先简单解释一下WARN_ON的作用：WARN_ON只是打印出当前栈信息，不会panic。所以会看到后面有一大堆的栈信息。这个宏定义如下：
#ifndef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN(); \
unlikely(__ret_warn_on); \
})
#endif
这个宏很简单保证传递进来的条件值为0或者1（两次逻辑非操作的结果），然后使用分支预测技术（保证执行概率大的分支紧邻上面的指令）判断是否需要调用__WARN()宏定义。如果满足条件执行了__WARN()宏定义也接着执行一条空指令;。上面调用WARN_ON宏是传递的1，所以会执行__WARN()。下面继续看一下__WARN()宏定义如下：
#define __WARN() warn_slowpath_null(__FILE__,__LINE__)
从接下来的call trace信息中我们也确实发现调用了warn_slowpath_null这个函数。通过在linux内核源代码中搜索这个函数的实现，发现在panic.c（内核恐慌时的相关功能实现）中实现如下：
void warn_slowpath_null(const char *file, int line)
{
warn_slowpath_common(file, line,__builtin_return_address(0),
TAINT_WARN, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);//导出这个符号，让其他模块可以使用这个函数
同样的我们看到了warn_slowpath_common这个函数，而在call trace当中这个函数在warn_slowpath_null函数之前打印出来，再次印证了这个流程是正确的。同样在panic.c这个文件中我发现了warn_slowpath_common这个函数的实现如下：
static void warn_slowpath_common(const char *file, int line, void *caller,
unsigned taint, struct slowpath_args *args)
{
const char *board;

printk(KERN_WARNING "------------[ cut here]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %pS()(%s)\n",
file, line, caller, print_tainted());
board = dmi_get_system_info(DMI_PRODUCT_NAME);//得到dmi系统信息
if (board)
printk(KERN_WARNING "Hardware name:%s\n", board);//通过我们的日志信息可以发现我们硬件名称是ProLiant DL360 G7

if (args)
vprintk(args->fmt, args->args);

print_modules();//打印系统模块信息
dump_stack();//dump信息输出（call trace开始）
print_oops_end_marker();//打印oops结束
add_taint(taint);
}
分析这个函数的实现不难发现我们的很多日志信息从这里开始输出，包括打印一些系统信息，就不继续深入分析了（请看代码注释，里面调用相关函数打印对应信息，通过我分析这些函数的实现和我们的日志信息完全能够对应，其中dump_stack是与cpu体系结构相关的，我们的服务器应该是属于x86体系）。这里再继续分析一下dump_stack函数的实现，因为这个是与cpu体系结构相关的，而且这个函数直接反映出导致内核panic的相关进程。这个函数实现如下：
/*
* The architecture-independent dump_stack generator
*/
void dump_stack(void)
{
unsigned long stack;

printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm,print_tainted(),
init_utsname()->release,
(int

H. 如何选择一个 Linux Tracer

tracer 是一个高级的性能分析和诊断工具，但是不要让这名词唬住你，如果你使用过 strace 和 tcpdump，其实你就已经使用过 tracer 了。系统 tracer 可以获取更多的系统调用和数据包。它们通常能跟踪任意的内核和应用程序。
有太多的 linux tracer 可以选择。每一种都有其官方的（或非官方的）的卡通的独角兽吉祥物，足够撑起一台"儿童剧"了。

那么我们应该使用哪个 tracer 呢？
我会为两类读者回答这个问题，大部分人和性能/内核工程师。过一段时间这些可能会发生变化，我会持续跟进并补充，大概会一年更新一次。
多数人
多数人 (开发者，系统管理员，开发管理者，运维人员，评测人员，等等) 不关心系统追踪器的细节。下面是对于追踪器你应该知道和做的：
1. 使用perf_events分析CPU性能

使用 perf_events 做 CPU 性能分析。性能指标可以使用flame graph 等工具做可视化。
git clone --depth 1 https://github.com/brendangregg/FlameGraph
perf record -F 99 -a -g -- sleep 30
perf script | ./FlameGraph/stackcollapse-perf.pl | ./FlameGraph/flamegraph.pl > perf.svg

Linux perf_events (又称 "perf"，同命令名) 是 Linux 用户的官方追踪器和性能分析器。内置于内核代码，有很好维护（近来获得快速增强），通常通过 linux 命令行工具包添加。
perf 有很多功能，如果只能推荐一个，我选择 CPU 性能分析。尽管这只是采样，而不是从技术上追踪事件。最难的部分是获取完整的栈和信息，我为 java 和 node.js 做的一个演讲 Linux Profiling at Netflix中已经说过这个问题
2.了解其他的Tracer
正如我一个朋友说的：“你不需要知道如何操作 X 射线机器，但是一旦你吞了一枚硬币，你得知道这得去做 X 射线”，你应该了解各种 tracer 都能做什么，这样就能在你工作中真正需要 tracer 的时候，你既可以选择稍后学习使用，也可以雇相应的人来完成。
简短来说：几乎所有的东西都可以使用 tracer 来进行分析和跟踪。如，文件系统，网络处理器，硬件驱动器，全部的应用程序。可以看一下我的个人网站上关于 ftrace的文章，还有我写的关于perf_events 文档介绍，可以做为一个追踪(或者性能分析)的例子。
3. 寻求前端支持工具

如果你正想买一个能支持跟踪 Linux 的性能分析工具（有许多卖这类工具的公司）。想象一下，只需要直接点击一下界面就能“洞察”整个系统内核，包括隐藏的不同堆栈位置的热图，我在Monitorama talk 中介绍了一个这样带图形界面的工具。
我开源了一些我自己开发的前端工具，尽管只是 CLI （命令行界面）而不是（图形界面）。这些工具也会让人们更加快速容易的使用 tracer。比如下面的例子，用我的 perf_tool，跟踪一个新进程:
# ./execsnoop
Tracing exec()s. Ctrl-C to end.
PID PPID ARGS
22898 22004 man ls
22905 22898 preconv -e UTF-8
22908 22898 pager -s
22907 22898 nroff -mandoc -rLL=164n -rLT=164n -Tutf8
[...]

在 Netflix 上，我们创建了一个 Vector，一个分析工具的实例同时也是 Linux 上的 tracer 的最终前端。
致性能或内核工程师

我们的工作变的越来越困难，很多的人会问我们怎么样去追踪，哪种路径可以用！为了正确理解一个路径，你经常需要花上至少100个小时才能做到。理解所有的 linux 路径去做出理性的决定是一个浩大的工程。（我可能是唯一一个接近做到这件事情的人）
这里是我的建议，可以二选其一：
A) 选中一个全能的路径，并且使它标准化，这将涉及花费大量的时间去弄清楚它在测试环境中的细微差别和安全性。我现在推荐 SystemTap 的最新版本（ie，从源代码构建）。我知道有些公司已经选用 LTTng，而且他们用的很好，尽管它不是非常的强大（虽然它更安全）。Sysdig 可以成为另一个候选如果它可以增加追踪点或者 kprobes。
B) 遵循我上面提供的流程图，它将意味着尽可能更多的使用 ftrace 或者 perf_event， eBPF 会得到整合，之后其他的路径像 SystemTap/LTTng 会去填补这个空白。这就是我目前在 Netflix 做的工作。
tracer 的评论：
1. ftrace
我喜欢用 ftrace，它是内核 hacker 的首选，内置于系统内核，可以使用跟踪点(静态检查点)，能调用内核 kprobes 和 uprobes 调试工具。并且提供几个这样的功能：带可选过滤器和参数的事件追踪功能；在内核中进行统计的事件计数和定时功能；还有函数流程遍历的功能。可以看一下内核代码中 ftrace.txt 例子了解一下。ftrace 由 /sys 控制，仅支持单一的 root 用户使用（但是你可以通过缓冲区实例破解以支持多用户）。某些时候 Ftrace 的操作界面非常繁琐，但是的确非常“hack”，而且它有前端界面。Steven Rostedt，ftrace 的主要作者，创建了 trace-cmd 命令工具，而我创建了 perf-tools 工具集。我对这个工具最大的不满就是它不可编程。举例来说，你不能保存和获取时间戳，不能计算延迟，不能把这些计算结果保存成直方图的形式。你需要转储事件至用户级别，并且花一些时间去处理结果。ftrace 可以通过 eBPF 变成可编程的。
2.perf_events

perf_events 是 Linux 用户的主要跟踪工具，它内置在内核源码中，通常通过 linux-tools-common 加入。也称“perf”，同前端工具名称，通常用来跟踪和转储信息到一个叫做 perf.data 的文件中，perf.data 文件相当于一个动态的缓冲区，用来保存之后需要处理的结果。ftrace 能做到的，perf_events 大都也可以做到，perf_events 不能做函数流程遍历，少了一点儿“hack”劲儿（但是对于安全/错误检查有更好的支持）。它可以进行 CPU 分析和性能统计，用户级堆栈解析，也可以使用对于跟踪每行局部变量产生的调试信息。它也支持多用户并发操作。和 ftrace 一样也不支持可编程。如果要我只推荐一款 tracer，那一定是 perf 了。它能解决众多问题，并且它相对较安全。
3. eBPF

extended Berkeley Packet Filter（eBPF）是一个可以在事件上运行程序的高效内核虚拟机（JIT）。它可能最终会提供 ftrace 和 perf_events 的内核编程，并强化其他的 tracer。这是 Alexei Starovoitov 目前正在开发的，还没有完全集成，但是从4.1开始已经对一些优秀的工具有足够的内核支持了，如块设备I/O的延迟热图。可参考其主要作者 Alexei Starovoitov 的BPF slides和eBPF samples。
4. SystemTap

SystemTap 是最强大的tracer。它能做所有事情，如概要分析，跟踪点，探针，uprobes（来自SystemTap），USDT和内核编程等。它将程序编译为内核模块，然后加载，这是一种获取安全的巧妙做法。它也是从tree发展而来，在过去有很多问题（多的可怕）。很多不是 SystemTap 本身的错——它常常是第一个使用内核追踪功能，也是第一个碰到 bug 的。SystemTap 的最新版本好多了（必须由源代码编译），但是很多人仍然会被早期版本吓到。如果你想用它，可先在测试环境中使用，并与irc.freenode.net上的 #systemtap 开发人员交流。（Netflix 有容错机制，我们已经使用了 SystemTap，但是可能我们考虑的安全方面的问题比你们少。）我最大的不满是，它似乎认为你有常常没有的内核 debug 信息。实际上没有它也能做很多事情，但是缺少文档和例子（我必须自己全靠自己开始学习）。
5. LTTng
LTTng 优化了事件采集，这比其他 tracers 做得好。它从 tree 发展而来，它的核心很简单：通过一组小规模的固定指令集将事件写入追踪缓冲区，这种方式使它安全、快速，缺点是它没有内核编码的简单途径。我一直听说这不是一个大问题，因为尽管需要后期处理，它也已经优化到可以充分的度量。此外，它还首创了一个不同的分析技术，更多对所有关注事件的黑盒记录将稍后以 GUI 的方式进行研究。我关心的是前期没有考虑到要录制的事件缺失问题如何解决，但我真正要做的是花更多时间来看它在实践中用的怎么样。这是我花的时间最少的一个 tracer（没有什么特殊原因）。
6. Ktap
ktap 在过去是一款前景很好的 tracer，它使用内核中的 lua 虚拟机处理，在没有调试信息的情况下在嵌入式设备上运行的很好。它分为几个步骤，并在有一段时间似乎超过了 Linux 上所有的追踪器。然后 eBPF 开始进行内核集成，而 ktap 的集成在它可以使用 eBPF 替代它自己的虚拟机后才开始。因为 eBPF 仍将持续集成几个月，ktap 开发者要继续等上一段时间。我希望今年晚些时候它能重新开发。
7. dtrace4linux

dtrace4linux 主要是 Paul Fox 一个人在业余时间完成的，它是 Sun DTrace 的 Linux 版本。它引人瞩目，还有一些 provider 可以运行，但是从某种程度上来说还不完整，更多的是一种实验性的工具（不安全）。我认为，顾忌到许可问题，人们会小心翼翼的为 dtrace4linux 贡献代码：由于当年 Sun 开源DTrace 使用的是 CDDL 协议，而 dtrace4linux 也不大可能最终进入 Linux kernel。Paul 的方法很可能会使其成为一个 add-on。我很乐意看到 Linux 平台上的 DTrace 和这个项目的完成，我认为当我加入 Netflix 后将会花些时间来协助完成这个项目。然而，我还是要继续使用内置的 tracers，如 ftrace 和 perf_events。
8.OL DTrace

Oracle Linux DTrace为了将 DTrace 引入 Linux，特别是 Oracle Linux，做出了很大的努力。这些年来发布的多个版本表明了它的稳定进展。开发者们以一种对这个项目的前景看好的态度谈论着改进 DTrace 测试套件。很多有用的 provider 已经完成了，如：syscall, profile, sdt, proc, sched 以及 USDT。我很期待 fbt（function boundary tracing, 用于内核动态跟踪）的完成，它是 Linux 内核上非常棒的 provider。OL DTrace 最终的成功将取决于人们对运行 Oracle Linux（为技术支持付费）有多大兴趣，另一方面取决于它是否完全开源：它的内核元件是开源的，而我没有看到它的用户级别代码。
9. sysdig

sysdig是一个使用类tcpdump语法来操作系统事件的新tracer，它使用lua提交进程。它很优秀，它见证了系统跟踪领域的变革。它的局限性在于它只在当前进行系统调用，在提交进行时将所有事件转储为用户级别。你可以使用系统调用做很多事情，然而我还是很希望它能支持跟踪点、kprobe和uprobe。我还期待它能支持eBPF做内核摘要。目前，sysdig开发者正在增加容器支持。留意这些内容。
延伸阅读

我关于 tracer 的工作包括：
ftrace：我的 perf-tools工具集（参考实例目录）；我在 lwn.net 上的关于ftrace的文章；LISA14的发言；还有帖子：函数计数， iosnoop，opensnoop，execsnoop，TCP转发， uprobes 以及USDT。
perf_events：我的网页 perf_events实例；SCALE上的发言Netflix的Linux性能分析；还有帖子CPU采样，静态追踪点，热点图，计数，内核行追踪，off-CPU时间图。
eBPF：帖子eBPF：迈出一小步，和一些BPF工具（我需要发布更多）。
SystemTap：我很久以前写了一篇有点过期的帖子使用SystemTap。最近，我发布了一些工具systemtap-lwtools来演示如何在没有内核诊断信息的情况下使用SystemTap。
LTTng：我只花了一点时间，还不足以发表任何内容。
ktap：我的网页ktap实例包含一些早期版本的单行小程序和脚本。
dtrace4linux：我在系统性能一书中给出了一些实例，并曾经开发了一些小的修复程序，如timestamps。
OL DTrace：由于它直接由DTrace转变而来，很多我早期关于DTrace的工作都有相关性（如果在这里给出链接的话就太多了，可以在我的主页上搜索）。当它更完善时，我会开发一些特殊工具。
sysdig：我向 fileslower 和 subsecond offset spectrogram 贡献了代码。
其他：我写了关于strace 的注意事项。
请不要有更多的 tracer！如果你想知道为什么 Linux 不仅仅只有一个 tracer，或者只用本身的DTrace，你可以在我的演讲稿从DTrace到Linux中找到答案，从28张幻灯片开始。
感谢Deirdré Straughan的编辑，以及与 General Zoi 的小马宝莉创作者一起创作的 tracing 小马。

导航:首页 > 操作系统 > linuxftrace

linuxftrace

与linuxftrace相关的资料