linuxftrace_如何選擇一個 Linux Tracer

A. red hat 內核2.6.32的kernel/fork.c在哪裡

部分核心的代碼在arch/*/kernel目錄
 1 /*
 2 * linux/kernel/fork.c
 3 *
 4 * Copyright (C) 1991, 1992 Linus Torvalds
 5 */
 6 
 7 /*
 8 * 'fork.c' contains the help-routines for the 'fork' system call
 9 * (see also entry.S and others).
 10 * Fork is rather simple, once you get the hang of it, but the memory
 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 12 */
 13 
 14 #include <linux/slab.h>
 15 #include <linux/init.h>
 16 #include <linux/unistd.h>
 17 #include <linux/module.h>
 18 #include <linux/vmalloc.h>
 19 #include <linux/completion.h>
 20 #include <linux/personality.h>
 21 #include <linux/mempolicy.h>
 22 #include <linux/sem.h>
 23 #include <linux/file.h>
 24 #include <linux/fdtable.h>
 25 #include <linux/iocontext.h>
 26 #include <linux/key.h>
 27 #include <linux/binfmts.h>
 28 #include <linux/mman.h>
 29 #include <linux/mmu_notifier.h>
 30 #include <linux/fs.h>
 31 #include <linux/nsproxy.h>
 32 #include <linux/capability.h>
 33 #include <linux/cpu.h>
 34 #include <linux/cgroup.h>
 35 #include <linux/security.h>
 36 #include <linux/hugetlb.h>
 37 #include <linux/seccomp.h>
 38 #include <linux/swap.h>
 39 #include <linux/syscalls.h>
 40 #include <linux/jiffies.h>
 41 #include <linux/futex.h>
 42 #include <linux/compat.h>
 43 #include <linux/kthread.h>
 44 #include <linux/task_io_accounting_ops.h>
 45 #include <linux/rcupdate.h>
 46 #include <linux/ptrace.h>
 47 #include <linux/mount.h>
 48 #include <linux/audit.h>
 49 #include <linux/memcontrol.h>
 50 #include <linux/ftrace.h>
 51 #include <linux/proc_fs.h>
 52 #include <linux/profile.h>
 53 #include <linux/rmap.h>
 54 #include <linux/ksm.h>
 55 #include <linux/acct.h>
 56 #include <linux/tsacct_kern.h>
 57 #include <linux/cn_proc.h>
 58 #include <linux/freezer.h>
 59 #include <linux/delayacct.h>
 60 #include <linux/taskstats_kern.h>
 61 #include <linux/random.h>
 62 #include <linux/tty.h>
 63 #include <linux/blkdev.h>
 64 #include <linux/fs_struct.h>
 65 #include <linux/magic.h>
 66 #include <linux/perf_event.h>
 67 #include <linux/posix-timers.h>
 68 #include <linux/user-return-notifier.h>
 69 #include <linux/oom.h>
 70 #include <linux/khugepaged.h>
 71 #include <linux/signalfd.h>
 72 #include <linux/uprobes.h>
 73 #include <linux/aio.h>
 74 
 75 #include <asm/pgtable.h>
 76 #include <asm/pgalloc.h>
 77 #include <asm/uaccess.h>
 78 #include <asm/mmu_context.h>
 79 #include <asm/cacheflush.h>
 80 #include <asm/tlbflush.h>
 81 
 82 #include <trace/events/sched.h>
 83 
 84 #define CREATE_TRACE_POINTS
 85 #include <trace/events/task.h>
 86 
 87 /*
 88 * Protected counters by write_lock_irq(&tasklist_lock)
 89 */
 90 unsigned long total_forks;   /* Handle normal Linux uptimes. */
 91 int nr_threads;         /* The idle threads do not count.. */
 92 
 93 int max_threads;        /* tunable limit on nr_threads */
 94 
 95 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 96 
 97 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
 98 
 99 #ifdef CONFIG_PROVE_RCU
100 int lockdep_tasklist_lock_is_held(void)
101 {
102     return lockdep_is_held(&tasklist_lock);
103 }
104 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
105 #endif /* #ifdef CONFIG_PROVE_RCU */
106 
107 int nr_processes(void)
108 {
109     int cpu;
110     int total = 0;
111 
112     for_each_possible_cpu(cpu)
113         total += per_cpu(process_counts, cpu);
114 
115     return total;
116 }
117 
118 void __weak arch_release_task_struct(struct task_struct *tsk)
119 {
120 }
121 
122 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
123 static struct kmem_cache *task_struct_cachep;
124 
125 static inline struct task_struct *alloc_task_struct_node(int node)
126 {
127     return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
128 }
129 
130 static inline void free_task_struct(struct task_struct *tsk)
131 {
132     kmem_cache_free(task_struct_cachep, tsk);
133 }
134 #endif
135 
136 void __weak arch_release_thread_info(struct thread_info *ti)
137 {
138 }
139 
140 #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
141 
142 /*
143 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
144 * kmemcache based allocator.
145 */
146 # if THREAD_SIZE >= PAGE_SIZE
147 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
148                          int node)
149 {
150     struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
151                       THREAD_SIZE_ORDER);
152 
153     return page ? page_address(page) : NULL;
154 }
155 
156 static inline void free_thread_info(struct thread_info *ti)
157 {
158     free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
159 }
160 # else
161 static struct kmem_cache *thread_info_cache;
162 
163 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164                          int node)
165 {
166     return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
167 }
168 
169 static void free_thread_info(struct thread_info *ti)
170 {
171     kmem_cache_free(thread_info_cache, ti);
172 }
173 
174 void thread_info_cache_init(void)
175 {
176     thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
177                        THREAD_SIZE, 0, NULL);
178     BUG_ON(thread_info_cache == NULL);
179 }
180 # endif
181 #endif
182 
183 /* SLAB cache for signal_struct structures (tsk->signal) */
184 static struct kmem_cache *signal_cachep;
185 
186 /* SLAB cache for sighand_struct structures (tsk->sighand) */
187 struct kmem_cache *sighand_cachep;
188 
189 /* SLAB cache for files_struct structures (tsk->files) */
190 struct kmem_cache *files_cachep;

B. 編譯linux 2.6.30內核提示錯誤

make clean

C. 用C語言結構狀態描述進程式控制制塊的信息

以下來自linux內核2.6.35.7版本的代碼，在頭文件linux/sched.h中定義
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;

int lock_depth; /* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
int oncpu;
#endif
#endif

int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif

/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif

unsigned int policy;
cpumask_t cpus_allowed;

#ifdef CONFIG_TREE_PREEMPT_RCU
int rcu_read_lock_nesting;
char rcu_read_unlock_special;
struct rcu_node *rcu_blocked_node;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif

struct list_head tasks;
struct plist_node pushable_tasks;

struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;

/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;

pid_t pid;
pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif

/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct *real_parent; /* real parent process */
struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;

/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;

struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

struct task_cputime cputime_expires;
struct list_head cpu_timers[3];

/* process credentials */
const struct cred *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred *cred; /* effective (overridable) subjective task
* credentials (COW) */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace) */
struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */

char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
uid_t loginuid;
unsigned int sessionid;
#endif
seccomp_t seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;

#ifdef CONFIG_GENERIC_HARDIRQS
/* IRQ handler threads */
struct irqaction *irqaction;
#endif

/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio_list *bio_list;

/* VM state */
struct reclaim_state *reclaim_state;

struct backing_dev_info *backing_dev_info;

struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
int mems_allowed_change_disable;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;

/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;

struct list_head *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
*/
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
struct memcg_batch_info {
int do_batch; /* incremented when batch uncharge started */
struct mem_cgroup *memcg; /* target memcg of uncharge */
unsigned long bytes; /* uncharged usage */
unsigned long memsw_bytes; /* uncharged mem+swap usage */
} memcg_batch;
#endif
};

D. 《Mastering Linux Programming》txt下載在線閱讀全文,求百度網盤雲資源

《Mastering Embedded Linux Programming - Second Edition》（Chris Simmonds）電子書網盤下載免費在線閱讀

鏈接: https://pan.baidu.com/s/1YaMA1KDcVCte49Usibsk4A

pdf" data_size="5.14M" data_filelogo="https://gss0.bdstatic.com//yun-file-logo/file-logo-6.png" data_number="1" data_sharelink="https://pan..com/s/1YaMA1KDcVCte49Usibsk4A" data_code="i8sb">

提取碼: i8sb

書名：Mastering Embedded Linux Programming - Second Edition

作者：Chris Simmonds

出版社：Packt Publishing

出版年份：2017-6-30

頁數：478

內容簡介：

Key Features

- Discover how to build and configure reliable embedded Linux devices
- This book has been updated to include Linux 4.9 and Yocto Project 2.2 (Morty)
- This comprehensive guide covers the remote update of devices in the field and power management

Book Description

Embedded Linux runs many of the devices we use every day, from smart TVs to WiFi routers, test equipment to industrial controllers - all of them have Linux at their heart. Linux is a core technology in the implementation of the inter-connected world of the Internet of Things.

The comprehensive guide shows you the technologies and techniques required to build Linux into embedded systems. You will begin by learning about the fundamental elements that underpin all embedded Linux projects: the toolchain, the bootloader, the kernel, and the root filesystem. You'll see how to create each of these elements from scratch, and how to automate the process using Buildroot and the Yocto Project.

Moving on, you'll find out how to implement an effective storage strategy for flash memory chips, and how to install updates to the device remotely once it is deployed. You'll also get to know the key aspects of writing code for embedded Linux, such as how to access hardware from applications, the implications of writing multi-threaded code, and techniques to manage memory in an efficient way. The final chapters show you how to debug your code, both in applications and in the Linux kernel, and how to profile the system so that you can look out for performance bottlenecks.

By the end of the book, you will have a complete overview of the steps required to create a successful embedded Linux system.

What you will learn

- Evaluate the Board Support Packages offered by most manufacturers of a system on chip or embedded module
- Use Buildroot and the Yocto Project to create embedded Linux systems quickly and efficiently
- Update IoT devices in the field without compromising security
- Reduce the power budget of devices to make batteries last longer
- Interact with the hardware without having to write kernel device drivers
- Debug devices remotely using GDB, and see how to measure the performance of the systems using powerful tools such as perf, ftrace, and valgrind
- Find out how to configure Linux as a real-time operating system

About the Author

Chris Simmonds is a software consultant and trainer living in southern England. He has almost two decades of experience in designing and building open-source embedded systems. He is the founder and chief consultant at 2net Ltd, which provides professional training and mentoring services in embedded Linux, Linux device drivers, and android platform development. He has trained engineers at many of the biggest companies in the embedded world, including ARM, Qualcomm, Intel, Ericsson, and General Dynamics. He is a frequent presenter at open source and embedded conferences, including the Embedded Linux Conference and Embedded World. You can see some of his work on the Inner Penguin blog.

Table of Contents

1. Starting out
2. Learning about Toolchains
3. All about Bootloaders
4. Porting and Configuring the Kernel
5. Building a Root filesystem
6. Selecting a Build System
7. Creating a storage strategy
8. Updating software in the field
9. Interfacing with Device Drivers
10. Starting up: the init program
11. Power management
12. Learning about processes and threads
13. Managing Memory
14. Debugging with GDB
15. Profiling and tracing
16. Real time programming

作者簡介：

About the Author

Chris Simmonds

Chris Simmonds is a software consultant and trainer living in southern England. He has almost two decades of experience in designing and building open-source embedded systems. He is the founder and chief consultant at 2net Ltd, which provides professional training and mentoring services in embedded Linux, Linux device drivers, and Android platform development. He has trained engineers at many of the biggest companies in the embedded world, including ARM, Qualcomm, Intel, Ericsson, and General Dynamics. He is a frequent presenter at open source and embedded conferences, including the Embedded Linux Conference and Embedded World. You can see some of his work on the Inner Penguin blog.

E. linux 2.6 和 3.10 的不同

3.10是比較新的內核，2.6大多數用於伺服器。3.10更改/升級了更多2.6已經發現的BUG，但沒有在SERVER系統中大規模應用。具體更改成百上千，似乎只有開發人員才知道

F. 如何查看android systrace 分析

Systrace的介紹

Systrace跟蹤代碼
Systrace運行方式
Systrace數據分析
Systrace使用示例
TraceView的介紹
TraceView運行方式
TraceView數據分析
TraceView使用示例
總結

內容：

1.Systrace的介紹

>Systrace是Android4.1中新增的性能數據采樣和分析工具。它可幫助開發者收集Android關鍵子系統（如Surfaceflinger、WindowManagerService等Framework部分關鍵模塊、服務）的運行信息，從而幫助開發者更直觀的分析系統瓶頸，改進性能。

Systrace的功能包括跟蹤系統的I/O操作、內核工作隊列、CPU負載以及Android各個子系統的運行狀況等。在Android平台中，它主要由3部分組成：

1.內核部分：Systrace利用了Linux Kernel中的ftrace功能。所以，如果要使用Systrace的話，必須開啟kernel中和ftrace相關的模塊。

2.數據採集部分：Android定義了一個Trace類。應用程序可利用該類把統計信息輸出給ftrace。同時，Android還有一個atrace程序，它可以從ftrace中讀取統計信息然後交給數據分析工具來處理。

3.數據分析工具：Android提供一個systrace.py（python腳本文件，位於Android SDK目錄/tools/systrace中，其內部將調用atrace程序）用來配置數據採集的方式（如採集數據的標簽、輸出文件名等）和收集 ftrace統計數據並生成一個結果網頁文件供用戶查看。

從本質上說，Systrace是對Linux Kernel中ftrace的封裝。應用進程需要利用Android提供的Trace類來使用Systrace。

G. 如何linux內核報告問題

Linux Kernel BUG:soft lockup CPU#1 stuck分析
1.線上內核bug日誌
kernel: Delta way too big! 18428729675200069867 ts=18446743954022816244 write stamp = 18014278822746377
kernel:------------[ cut here ]------------
kernel:WARNING: at kernel/trace/ring_buffer.c:1988 rb_reserve_next_event+0x2ce/0x370()(Not tainted)
kernel:Hardware name: ProLiant DL360 G7
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel: Pid:5483, comm: master Not tainted 2.6.32-220.el6.x86_64 #1
kernel: CallTrace:
kernel:[<ffffffff81069b77>] ? warn_slowpath_common+0x87/0xc0
kernel:[<ffffffff81069bca>] ? warn_slowpath_null+0x1a/0x20
kernel:[<ffffffff810ea8ae>] ? rb_reserve_next_event+0x2ce/0x370
kernel:[<ffffffff810eab02>] ? ring_buffer_lock_reserve+0xa2/0x160
kernel:[<ffffffff810ec97c>] ? trace_buffer_lock_reserve+0x2c/0x70
kernel:[<ffffffff810ecb16>] ? trace_current_buffer_lock_reserve+0x16/0x20
kernel:[<ffffffff8107ae1e>] ? ftrace_raw_event_hrtimer_cancel+0x4e/0xb0
kernel:[<ffffffff81095e7a>] ? hrtimer_try_to_cancel+0xba/0xd0
kernel:[<ffffffff8106f634>] ? do_setitimer+0xd4/0x220
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
kernel: ---[end trace 4d0a1ef2e62cb1a2 ]---
abrt-dump-oops: Reported 1 kernel oopses to Abrt
kernel: BUG: softlockup - CPU#11 stuck for 4278190091s! [qmgr:5492]
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel: CPU 11
kernel:Modules linked in: fuse ipv6 power_meter bnx2 sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
kernel:
kernel: Pid:5492, comm: qmgr Tainted: G W ---------------- 2.6.32-220.el6.x86_64 #1 HPProLiant DL360 G7
kernel: RIP:0010:[<ffffffff8106f730>] [<ffffffff8106f730>]do_setitimer+0x1d0/0x220
kernel: RSP:0018:ffff88080a661ef8 EFLAGS: 00000286
kernel: RAX:ffff88080b175a08 RBX: ffff88080a661f18 RCX: 0000000000000000
kernel: RDX:0000000000000000 RSI: 0000000000000082 RDI: ffff88080c8c4c40
kernel: RBP:ffffffff8100bc0e R08: 0000000000000000 R09: 0099d7270e01c3f1
kernel: R10:0000000000000000 R11: 0000000000000246 R12: ffffffff810ef9a3
kernel: R13:ffff88080a661e88 R14: 0000000000000000 R15: ffff88080a65a544
kernel: FS:00007f10b245f7c0(0000) GS:ffff88083c4a0000(0000) knlGS:0000000000000000
kernel: CS:0010 DS: 0000 ES: 0000 CR0: 000000008005003b
kernel: CR2:00007ff955977380 CR3: 000000100a80b000 CR4: 00000000000006e0
kernel: DR0:0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
kernel: DR3:0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
kernel:Process qmgr (pid: 5492, threadinfo ffff88080a660000, task ffff880809577500)
kernel: Stack:
kernel:00007f10b323def0 00007f10b248ead0 00007f10b26d0f78 00007f10b248ede0
kernel:<0> ffff88080a661f68 ffffffff8106f88a 0000000000000000 0000000000000000
kernel:<0> 000000000000014c 00000000000f423d 0000000000000000 0000000000000000
kernel: CallTrace:
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
kernel: Code:89 ef e8 74 66 02 00 83 3d 15 69 b5 00 00 75 37 49 8b 84 24 70 07 00 00 48 0508 08 00 00 66 ff 00 66 66 90 fb 66 0f 1f 44 00 00 <31> c0 e9 64 fe ff ff49 8b 84 24 68 07 00 00 48 c7 80 d0 00 00
kernel: CallTrace:
kernel:[<ffffffff8106f769>] ? do_setitimer+0x209/0x220
kernel:[<ffffffff8106f88a>] ? alarm_setitimer+0x3a/0x60
kernel:[<ffffffff8107c27e>] ? sys_alarm+0xe/0x20
kernel:[<ffffffff8100b308>] ? tracesys+0xd9/0xde
abrt-dump-oops: Reported 1 kernel oopses to Abrt

2.內核軟死鎖（soft lockup）bug原因分析
Soft lockup名稱解釋：所謂，soft lockup就是說，這個bug沒有讓系統徹底死機，但是若干個進程（或者kernel thread）被鎖死在了某個狀態（一般在內核區域），很多情況下這個是由於內核鎖的使用的問題。
Linux內核對於每一個cpu都有一個監控進程，在技術界這個叫做watchdog（看門狗）。通過ps -ef | grep watchdog能夠看見，進程名稱大概是watchdog/X（數字：cpu邏輯編號1/2/3/4之類的）。這個進程或者線程每一秒鐘運行一次，否則會睡眠和待機。這個進程運行會收集每一個cpu運行時使用數據的時間並且存放到屬於每個cpu自己的內核數據結構。在內核中有很多特定的中斷函數。這些中斷函數會調用soft lockup計數，他會使用當前的時間戳與特定（對應的）cpu的內核數據結構中保存的時間對比，如果發現當前的時間戳比對應cpu保存的時間大於設定的閾值，他就假設監測進程或看門狗線程在一個相當可觀的時間還沒有執行。Cpu軟鎖為什麼會產生，是怎麼產生的？如果linux內核是經過精心設計安排的CPU調度訪問，那麼怎麼會產生cpu軟死鎖？那麼只能說由於用戶開發的或者第三方軟體引入，看我們伺服器內核panic的原因就是qmgr進程引起。因為每一個無限的循環都會一直有一個cpu的執行流程（qmgr進程是一個後台郵件的消息隊列服務進程），並且擁有一定的優先順序。Cpu調度器調度一個驅動程序來運行，如果這個驅動程序有問題並且沒有被檢測到，那麼這個驅動程序將會佔用cpu的很長時間。根據前面的描述，看門狗進程會抓住（catch）這一點並且拋出一個軟死鎖（soft lockup）錯誤。軟死鎖會掛起cpu使你的系統不可用。
如果是用戶空間的進程或線程引起的問題backtrace是不會有內容的，如果內核線程那麼在soft lockup消息中會顯示出backtrace信息。
3.根據linux內核源碼分析錯誤
根據我們第一部分內核拋出的錯誤信息和call trace（linux內核的跟蹤子系統）來分析產生的具體原因。
首先根據我們的centos版本安裝相應的linux內核源碼，具體步驟如下：
（1）下載源碼的rpm包kernel-2.6.32-220.17.1.el6.src.rpm
（2）安裝相應的依賴庫，命令：yum install rpm-build redhat-rpm-config asciidoc newt-devel
（3）安裝源碼包：rpm -i kernel-2.6.32-220.17.1.el6.src.rpm
（4）進入建立源碼的目錄：cd ~/rpmbuild/SPECS
（5）建立生成源碼目錄：rpmbuild -bp --target=`uname -m` kernel.spec

下面開始真正的根據內核bug日誌分析源碼：
（1）第一階段內核錯誤日誌分析（時間在Dec 4 14:03:34這個階段的日誌輸出代碼分析，其實這部分代碼不會導致cpu軟死鎖，主要是第二階段錯誤日誌顯示導致cpu軟死鎖）
我們首先通過日誌定位到相關源代碼：看下面日誌：Dec 4 14:03:34 BP-YZH-1-xxxx kernel: WARNING: at kernel/trace/ring_buffer.c:1988 rb_reserve_next_event+0x2ce/0x370() (Not tainted)
根據日誌內容我們可以很容易的定位到kernel/trace/ring_buffer.c這個文件的1988行代碼如下：WARN_ON(1)。
先簡單解釋一下WARN_ON的作用：WARN_ON只是列印出當前棧信息，不會panic。所以會看到後面有一大堆的棧信息。這個宏定義如下：
#ifndef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN(); \
unlikely(__ret_warn_on); \
})
#endif
這個宏很簡單保證傳遞進來的條件值為0或者1（兩次邏輯非操作的結果），然後使用分支預測技術（保證執行概率大的分支緊鄰上面的指令）判斷是否需要調用__WARN()宏定義。如果滿足條件執行了__WARN()宏定義也接著執行一條空指令;。上面調用WARN_ON宏是傳遞的1，所以會執行__WARN()。下面繼續看一下__WARN()宏定義如下：
#define __WARN() warn_slowpath_null(__FILE__,__LINE__)
從接下來的call trace信息中我們也確實發現調用了warn_slowpath_null這個函數。通過在linux內核源代碼中搜索這個函數的實現，發現在panic.c（內核恐慌時的相關功能實現）中實現如下：
void warn_slowpath_null(const char *file, int line)
{
warn_slowpath_common(file, line,__builtin_return_address(0),
TAINT_WARN, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);//導出這個符號，讓其他模塊可以使用這個函數
同樣的我們看到了warn_slowpath_common這個函數，而在call trace當中這個函數在warn_slowpath_null函數之前列印出來，再次印證了這個流程是正確的。同樣在panic.c這個文件中我發現了warn_slowpath_common這個函數的實現如下：
static void warn_slowpath_common(const char *file, int line, void *caller,
unsigned taint, struct slowpath_args *args)
{
const char *board;

printk(KERN_WARNING "------------[ cut here]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %pS()(%s)\n",
file, line, caller, print_tainted());
board = dmi_get_system_info(DMI_PRODUCT_NAME);//得到dmi系統信息
if (board)
printk(KERN_WARNING "Hardware name:%s\n", board);//通過我們的日誌信息可以發現我們硬體名稱是ProLiant DL360 G7

if (args)
vprintk(args->fmt, args->args);

print_modules();//列印系統模塊信息
dump_stack();//dump信息輸出（call trace開始）
print_oops_end_marker();//列印oops結束
add_taint(taint);
}
分析這個函數的實現不難發現我們的很多日誌信息從這里開始輸出，包括列印一些系統信息，就不繼續深入分析了（請看代碼注釋，裡面調用相關函數列印對應信息，通過我分析這些函數的實現和我們的日誌信息完全能夠對應，其中dump_stack是與cpu體系結構相關的，我們的伺服器應該是屬於x86體系）。這里再繼續分析一下dump_stack函數的實現，因為這個是與cpu體系結構相關的，而且這個函數直接反映出導致內核panic的相關進程。這個函數實現如下：
/*
* The architecture-independent dump_stack generator
*/
void dump_stack(void)
{
unsigned long stack;

printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm,print_tainted(),
init_utsname()->release,
(int

H. 如何選擇一個 Linux Tracer

tracer 是一個高級的性能分析和診斷工具，但是不要讓這名詞唬住你，如果你使用過 strace 和 tcpdump，其實你就已經使用過 tracer 了。系統 tracer 可以獲取更多的系統調用和數據包。它們通常能跟蹤任意的內核和應用程序。
有太多的 linux tracer 可以選擇。每一種都有其官方的（或非官方的）的卡通的獨角獸吉祥物，足夠撐起一台"兒童劇"了。

那麼我們應該使用哪個 tracer 呢？
我會為兩類讀者回答這個問題，大部分人和性能/內核工程師。過一段時間這些可能會發生變化，我會持續跟進並補充，大概會一年更新一次。
多數人
多數人 (開發者，系統管理員，開發管理者，運維人員，評測人員，等等) 不關心系統追蹤器的細節。下面是對於追蹤器你應該知道和做的：
1. 使用perf_events分析CPU性能

使用 perf_events 做 CPU 性能分析。性能指標可以使用flame graph 等工具做可視化。
git clone --depth 1 https://github.com/brendangregg/FlameGraph
perf record -F 99 -a -g -- sleep 30
perf script | ./FlameGraph/stackcollapse-perf.pl | ./FlameGraph/flamegraph.pl > perf.svg

Linux perf_events (又稱 "perf"，同命令名) 是 Linux 用戶的官方追蹤器和性能分析器。內置於內核代碼，有很好維護（近來獲得快速增強），通常通過 linux 命令行工具包添加。
perf 有很多功能，如果只能推薦一個，我選擇 CPU 性能分析。盡管這只是采樣，而不是從技術上追蹤事件。最難的部分是獲取完整的棧和信息，我為 java 和 node.js 做的一個演講 Linux Profiling at Netflix中已經說過這個問題
2.了解其他的Tracer
正如我一個朋友說的：「你不需要知道如何操作 X 射線機器，但是一旦你吞了一枚硬幣，你得知道這得去做 X 射線」，你應該了解各種 tracer 都能做什麼，這樣就能在你工作中真正需要 tracer 的時候，你既可以選擇稍後學習使用，也可以雇相應的人來完成。
簡短來說：幾乎所有的東西都可以使用 tracer 來進行分析和跟蹤。如，文件系統，網路處理器，硬體驅動器，全部的應用程序。可以看一下我的個人網站上關於 ftrace的文章，還有我寫的關於perf_events 文檔介紹，可以做為一個追蹤(或者性能分析)的例子。
3. 尋求前端支持工具

如果你正想買一個能支持跟蹤 Linux 的性能分析工具（有許多賣這類工具的公司）。想像一下，只需要直接點擊一下界面就能「洞察」整個系統內核，包括隱藏的不同堆棧位置的熱圖，我在Monitorama talk 中介紹了一個這樣帶圖形界面的工具。
我開源了一些我自己開發的前端工具，盡管只是 CLI （命令行界面）而不是（圖形界面）。這些工具也會讓人們更加快速容易的使用 tracer。比如下面的例子，用我的 perf_tool，跟蹤一個新進程:
# ./execsnoop
Tracing exec()s. Ctrl-C to end.
PID PPID ARGS
22898 22004 man ls
22905 22898 preconv -e UTF-8
22908 22898 pager -s
22907 22898 nroff -mandoc -rLL=164n -rLT=164n -Tutf8
[...]

在 Netflix 上，我們創建了一個 Vector，一個分析工具的實例同時也是 Linux 上的 tracer 的最終前端。
致性能或內核工程師

我們的工作變的越來越困難，很多的人會問我們怎麼樣去追蹤，哪種路徑可以用！為了正確理解一個路徑，你經常需要花上至少100個小時才能做到。理解所有的 linux 路徑去做出理性的決定是一個浩大的工程。（我可能是唯一一個接近做到這件事情的人）
這里是我的建議，可以二選其一：
A) 選中一個全能的路徑，並且使它標准化，這將涉及花費大量的時間去弄清楚它在測試環境中的細微差別和安全性。我現在推薦 SystemTap 的最新版本（ie，從源代碼構建）。我知道有些公司已經選用 LTTng，而且他們用的很好，盡管它不是非常的強大（雖然它更安全）。Sysdig 可以成為另一個候選如果它可以增加追蹤點或者 kprobes。
B) 遵循我上面提供的流程圖，它將意味著盡可能更多的使用 ftrace 或者 perf_event， eBPF 會得到整合，之後其他的路徑像 SystemTap/LTTng 會去填補這個空白。這就是我目前在 Netflix 做的工作。
tracer 的評論：
1. ftrace
我喜歡用 ftrace，它是內核 hacker 的首選，內置於系統內核，可以使用跟蹤點(靜態檢查點)，能調用內核 kprobes 和 uprobes 調試工具。並且提供幾個這樣的功能：帶可選過濾器和參數的事件追蹤功能；在內核中進行統計的事件計數和定時功能；還有函數流程遍歷的功能。可以看一下內核代碼中 ftrace.txt 例子了解一下。ftrace 由 /sys 控制，僅支持單一的 root 用戶使用（但是你可以通過緩沖區實例破解以支持多用戶）。某些時候 Ftrace 的操作界面非常繁瑣，但是的確非常「hack」，而且它有前端界面。Steven Rostedt，ftrace 的主要作者，創建了 trace-cmd 命令工具，而我創建了 perf-tools 工具集。我對這個工具最大的不滿就是它不可編程。舉例來說，你不能保存和獲取時間戳，不能計算延遲，不能把這些計算結果保存成直方圖的形式。你需要轉儲事件至用戶級別，並且花一些時間去處理結果。ftrace 可以通過 eBPF 變成可編程的。
2.perf_events

perf_events 是 Linux 用戶的主要跟蹤工具，它內置在內核源碼中，通常通過 linux-tools-common 加入。也稱「perf」，同前端工具名稱，通常用來跟蹤和轉儲信息到一個叫做 perf.data 的文件中，perf.data 文件相當於一個動態的緩沖區，用來保存之後需要處理的結果。ftrace 能做到的，perf_events 大都也可以做到，perf_events 不能做函數流程遍歷，少了一點兒「hack」勁兒（但是對於安全/錯誤檢查有更好的支持）。它可以進行 CPU 分析和性能統計，用戶級堆棧解析，也可以使用對於跟蹤每行局部變數產生的調試信息。它也支持多用戶並發操作。和 ftrace 一樣也不支持可編程。如果要我只推薦一款 tracer，那一定是 perf 了。它能解決眾多問題，並且它相對較安全。
3. eBPF

extended Berkeley Packet Filter（eBPF）是一個可以在事件上運行程序的高效內核虛擬機（JIT）。它可能最終會提供 ftrace 和 perf_events 的內核編程，並強化其他的 tracer。這是 Alexei Starovoitov 目前正在開發的，還沒有完全集成，但是從4.1開始已經對一些優秀的工具有足夠的內核支持了，如塊設備I/O的延遲熱圖。可參考其主要作者 Alexei Starovoitov 的BPF slides和eBPF samples。
4. SystemTap

SystemTap 是最強大的tracer。它能做所有事情，如概要分析，跟蹤點，探針，uprobes（來自SystemTap），USDT和內核編程等。它將程序編譯為內核模塊，然後載入，這是一種獲取安全的巧妙做法。它也是從tree發展而來，在過去有很多問題（多的可怕）。很多不是 SystemTap 本身的錯——它常常是第一個使用內核追蹤功能，也是第一個碰到 bug 的。SystemTap 的最新版本好多了（必須由源代碼編譯），但是很多人仍然會被早期版本嚇到。如果你想用它，可先在測試環境中使用，並與irc.freenode.net上的 #systemtap 開發人員交流。（Netflix 有容錯機制，我們已經使用了 SystemTap，但是可能我們考慮的安全方面的問題比你們少。）我最大的不滿是，它似乎認為你有常常沒有的內核 debug 信息。實際上沒有它也能做很多事情，但是缺少文檔和例子（我必須自己全靠自己開始學習）。
5. LTTng
LTTng 優化了事件採集，這比其他 tracers 做得好。它從 tree 發展而來，它的核心很簡單：通過一組小規模的固定指令集將事件寫入追蹤緩沖區，這種方式使它安全、快速，缺點是它沒有內核編碼的簡單途徑。我一直聽說這不是一個大問題，因為盡管需要後期處理，它也已經優化到可以充分的度量。此外，它還首創了一個不同的分析技術，更多對所有關注事件的黑盒記錄將稍後以 GUI 的方式進行研究。我關心的是前期沒有考慮到要錄制的事件缺失問題如何解決，但我真正要做的是花更多時間來看它在實踐中用的怎麼樣。這是我花的時間最少的一個 tracer（沒有什麼特殊原因）。
6. Ktap
ktap 在過去是一款前景很好的 tracer，它使用內核中的 lua 虛擬機處理，在沒有調試信息的情況下在嵌入式設備上運行的很好。它分為幾個步驟，並在有一段時間似乎超過了 Linux 上所有的追蹤器。然後 eBPF 開始進行內核集成，而 ktap 的集成在它可以使用 eBPF 替代它自己的虛擬機後才開始。因為 eBPF 仍將持續集成幾個月，ktap 開發者要繼續等上一段時間。我希望今年晚些時候它能重新開發。
7. dtrace4linux

dtrace4linux 主要是 Paul Fox 一個人在業余時間完成的，它是 Sun DTrace 的 Linux 版本。它引人矚目，還有一些 provider 可以運行，但是從某種程度上來說還不完整，更多的是一種實驗性的工具（不安全）。我認為，顧忌到許可問題，人們會小心翼翼的為 dtrace4linux 貢獻代碼：由於當年 Sun 開源DTrace 使用的是 CDDL 協議，而 dtrace4linux 也不大可能最終進入 Linux kernel。Paul 的方法很可能會使其成為一個 add-on。我很樂意看到 Linux 平台上的 DTrace 和這個項目的完成，我認為當我加入 Netflix 後將會花些時間來協助完成這個項目。然而，我還是要繼續使用內置的 tracers，如 ftrace 和 perf_events。
8.OL DTrace

Oracle Linux DTrace為了將 DTrace 引入 Linux，特別是 Oracle Linux，做出了很大的努力。這些年來發布的多個版本表明了它的穩定進展。開發者們以一種對這個項目的前景看好的態度談論著改進 DTrace 測試套件。很多有用的 provider 已經完成了，如：syscall, profile, sdt, proc, sched 以及 USDT。我很期待 fbt（function boundary tracing, 用於內核動態跟蹤）的完成，它是 Linux 內核上非常棒的 provider。OL DTrace 最終的成功將取決於人們對運行 Oracle Linux（為技術支持付費）有多大興趣，另一方面取決於它是否完全開源：它的內核元件是開源的，而我沒有看到它的用戶級別代碼。
9. sysdig

sysdig是一個使用類tcpdump語法來操作系統事件的新tracer，它使用lua提交進程。它很優秀，它見證了系統跟蹤領域的變革。它的局限性在於它只在當前進行系統調用，在提交進行時將所有事件轉儲為用戶級別。你可以使用系統調用做很多事情，然而我還是很希望它能支持跟蹤點、kprobe和uprobe。我還期待它能支持eBPF做內核摘要。目前，sysdig開發者正在增加容器支持。留意這些內容。
延伸閱讀

我關於 tracer 的工作包括：
ftrace：我的 perf-tools工具集（參考實例目錄）；我在 lwn.net 上的關於ftrace的文章；LISA14的發言；還有帖子：函數計數， iosnoop，opensnoop，execsnoop，TCP轉發， uprobes 以及USDT。
perf_events：我的網頁 perf_events實例；SCALE上的發言Netflix的Linux性能分析；還有帖子CPU采樣，靜態追蹤點，熱點圖，計數，內核行追蹤，off-CPU時間圖。
eBPF：帖子eBPF：邁出一小步，和一些BPF工具（我需要發布更多）。
SystemTap：我很久以前寫了一篇有點過期的帖子使用SystemTap。最近，我發布了一些工具systemtap-lwtools來演示如何在沒有內核診斷信息的情況下使用SystemTap。
LTTng：我只花了一點時間，還不足以發表任何內容。
ktap：我的網頁ktap實例包含一些早期版本的單行小程序和腳本。
dtrace4linux：我在系統性能一書中給出了一些實例，並曾經開發了一些小的修復程序，如timestamps。
OL DTrace：由於它直接由DTrace轉變而來，很多我早期關於DTrace的工作都有相關性（如果在這里給出鏈接的話就太多了，可以在我的主頁上搜索）。當它更完善時，我會開發一些特殊工具。
sysdig：我向 fileslower 和 subsecond offset spectrogram 貢獻了代碼。
其他：我寫了關於strace 的注意事項。
請不要有更多的 tracer！如果你想知道為什麼 Linux 不僅僅只有一個 tracer，或者只用本身的DTrace，你可以在我的演講稿從DTrace到Linux中找到答案，從28張幻燈片開始。
感謝Deirdré Straughan的編輯，以及與 General Zoi 的小馬寶莉創作者一起創作的 tracing 小馬。

導航:首頁 > 操作系統 > linuxftrace

linuxftrace

與linuxftrace相關的資料