目录
1.前言
本文主要是根据阅码场《Linux内核tracers的实现原理与应用》视频课程在aarch64上所做的实践记录。通过观察钩子函数的创建过程以及替换过程,理解trace的原理。本文同样以blk_update_request函数为例,说明kretprobe的工作原理;此处的kretprobe基于trace event实现,同时使用了ftrace的框架。
kernel版本:5.10
平台:arm64
2. kretprobe领域模型
同 trace系列3 - kretprobe学习笔记
kretprobe_instance : 记录了原始的返回地址,以及所属的kretprobe,作为kretprobe实例连入kretprobe的free_instances链表;当kretprobe_instance被初始化后,它将从free_instances链表移除,重新连入全局kretprobe_inst_table哈希链表
3. kretprobe创建
在执行如下指令时,会完成kretprobe的创建:
#echo 'r:blk_update blk_update_request $retval' > /sys/kernel/debug/tracing/kprobe_events
此过程主要通过调用create_or_delete_trace_kprobe完成,其中最主要的是设置pre_handler为pre_handler_kretprobe,同时设置打印格式,并完成trace_kprobe的注册。与kprobe创建时的主要区别在于:rp->kp.pre_handler的初始化和kretprobe.handler的初始化
|- -rp->kp.pre_handler初始化
create_or_delete_trace_kprobe ->
trace_kprobe_create ->
register_trace_kprobe ->
__register_trace_kprobe
会调用register_kretprobe,它初始化了pre_handler为pre_handler_kretprobe
/*
 * register_kretprobe - prepare a return probe and register its entry kprobe.
 * @rp: the kretprobe to register.
 *
 * Returns 0 on success, -EINVAL when the location is rejected by
 * kprobe_on_func_entry() or matches a kretprobe_blacklist entry, the error
 * encoded by kprobe_addr() when address resolution fails, -ENOMEM when an
 * instance cannot be allocated, or the result of register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
	struct kretprobe_instance *ri;
	void *probe_addr;
	int idx;
	int err;

	/* Reject the probe unless kprobe_on_func_entry() accepts its location. */
	if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
		return -EINVAL;

	if (kretprobe_blacklist_size) {
		probe_addr = kprobe_addr(&rp->kp);
		if (IS_ERR(probe_addr))
			return PTR_ERR(probe_addr);

		/* The blacklist table is terminated by an entry whose name is NULL. */
		for (idx = 0; kretprobe_blacklist[idx].name; idx++) {
			if (probe_addr == kretprobe_blacklist[idx].addr)
				return -EINVAL;
		}
	}

	/* Install the kretprobe-specific pre_handler callback. */
	rp->kp.pre_handler = pre_handler_kretprobe;
	rp->kp.post_handler = NULL;
	rp->kp.fault_handler = NULL;

	/* Pre-allocate memory for max kretprobe instances */
	if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
		/* Evaluates to 10 in this article's setup. */
		rp->maxactive = max_t(unsigned int, 10, 2 * num_possible_cpus());
#else
		rp->maxactive = num_possible_cpus();
#endif
	}

	raw_spin_lock_init(&rp->lock);
	INIT_HLIST_HEAD(&rp->free_instances);

	/*
	 * With maxactive == 10 in this example, ten kretprobe_instance objects
	 * are created and linked onto rp->free_instances; a single kretprobe
	 * can therefore own several kretprobe_instance objects.
	 */
	for (idx = 0; idx < rp->maxactive; idx++) {
		/* Each instance is followed by rp->data_size extra bytes. */
		ri = kmalloc(sizeof(*ri) + rp->data_size, GFP_KERNEL);
		if (!ri) {
			free_rp_inst(rp);
			return -ENOMEM;
		}
		INIT_HLIST_NODE(&ri->hlist);
		hlist_add_head(&ri->hlist, &rp->free_instances);
	}

	rp->nmissed = 0;

	/* Establish function entry probe point */
	err = register_kprobe(&rp->kp);
	if (err)
		free_rp_inst(rp);

	return err;
}
|- -kretprobe.handler初始化
create_or_delete_trace_kprobe ->
trace_kprobe_create ->
alloc_trace_kprobe
- alloc_trace_kprobe:为trace_kprobe分配空间,主要初始化了kretprobe的handler为kretprobe_dispatcher
4. kretprobe brk指令替换
先来看下未替换指令前blk_update_request的反汇编:
Dump of assembler code for function blk_update_request:
0xffff8000104ec1f0 <+0>: sub sp, sp, #0x60
0xffff8000104ec1f4 <+4>: stp x29, x30, [sp,#16]
0xffff8000104ec1f8 <+8>: add x29, sp, #0x10
0xffff8000104ec1fc <+12>: stp x19, x20, [sp,#32]
0xffff8000104ec200 <+16>: stp x21, x22, [sp,#48]
0xffff8000104ec204 <+20>: stp x23, x24, [sp,#64]
0xffff8000104ec208 <+24>: str x25, [sp,#80]
0xffff8000104ec20c <+28>: mov x22, x0
0xffff8000104ec210 <+32>: uxtb w24, w1
0xffff8000104ec214 <+36>: mov w21, w2
0xffff8000104ec218 <+40>: mov x0, x30
0xffff8000104ec21c <+44>: nop
......
在执行如下命令后
# echo 1 >/sys/kernel/debug/tracing/events/kprobes/blk_update/enable
我们可以看到,在执行如上操作后,blk_update_request的入口处的指令
sub sp, sp, #0x60
被替换为:
0xffff8000104ec1f0 <+0>: brk #0x4
很奇怪居然与kprobe是一致的,主要调用了如下的函数,enable_kprobe与使能kprobe是一致的
/*
 * enable_kretprobe - enable a return probe.
 * @rp: the kretprobe to enable.
 *
 * Forwards to enable_kprobe() on the kprobe embedded in @rp and returns
 * its result unchanged.
 */
static inline int enable_kretprobe(struct kretprobe *rp)
{
	struct kprobe *entry_probe = &rp->kp;

	return enable_kprobe(entry_probe);
}
5. kretprobe钩子函数的执行
与前述kprobe的执行路径相同,当触发kretprobe执行时会按如下的执行路径,区别是执行的pre_handler不同:
#0 kprobe_handler (regs=0xffff80001253bcf0) at arch/arm64/kernel/probes/kprobes.c:352
#1 kprobe_breakpoint_handler (regs=0xffff80001253bcf0, esr=<optimized out>) at arch/arm64/kernel/probes/kprobes.c:404
#2 0xffff8000100148c4 in call_break_hook (regs=regs@entry=0xffff80001253bcf0, esr=esr@entry=4060086276) at arch/arm64/kernel/debug-monitors.c:322
#3 0xffff800010014a00 in brk_handler (unused=<optimized out>, esr=4060086276, regs=0xffff80001253bcf0) at arch/arm64/kernel/debug-monitors.c:329
#4 0xffff800010036180 in do_debug_exception (addr_if_watchpoint=addr_if_watchpoint@entry=5651652, esr=esr@entry=4060086276, regs=regs@entry=0xffff80001253bcf0) at arch/arm64/mm/fault.c:848
#5 0xffff800010cad220 in el1_dbg (regs=0xffff80001253bcf0, esr=4060086276) at arch/arm64/kernel/entry-common.c:190
#6 0xffff800010cad468 in el1_sync_handler (regs=<optimized out>) at arch/arm64/kernel/entry-common.c:227
#7 0xffff8000100119bc in el1_sync () at arch/arm64/kernel/entry.S:627
|- -pre_handler_kretprobe
对于kretprobe则会执行pre_handler_kretprobe回调:
int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
|--struct kretprobe_instance *ri = NULL, *last = NULL;
|--struct kretprobe *rp = container_of(p, struct kretprobe, kp);
|--hash = hash_ptr(current, KPROBE_HASH_BITS);
|--if (!hlist_empty(&rp->free_instances))
// 从kretprobe->free_instances的实例链表中,找到空闲的kretprobe_instance实例
ri = hlist_entry(rp->free_instances.first,struct kretprobe_instance, hlist);
//从kretprobe->free_instances的实例链表中,删除此实例
hlist_del(&ri->hlist);
//初始化找到的空闲kretprobe_instance实例
ri->rp = rp;
ri->task = current;
arch_prepare_kretprobe(ri, regs);
INIT_HLIST_NODE(&ri->hlist);
//将初始化的kretprobe_instance实例连入全局kretprobe_inst_table哈希链表
hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
pre_handler_kretprobe在执行过程中会调用arch_prepare_kretprobe,它主要保存了原始返回地址用于恢复,并将返回地址临时设置为kretprobe_trampoline,用于执行kretprobe的功能
arch_prepare_kretprobe(struct kretprobe_instance *ri,struct pt_regs *regs)
| // 初始化kretprobe_instance实例为原始的返回地址,即blk_mq_end_request的返回地址,
| // 当从kretprobe_trampoline返回时用于恢复原有执行路径
|--ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
| //初始化栈帧
|--ri->fp = (void *)kernel_stack_pointer(regs);
| /* replace return addr (x30) with trampoline */
| //更新了返回地址,这样在从blk_update_request返回时会执行kretprobe_trampoline函数
|--regs->regs[30] = (long)&kretprobe_trampoline;
|- -setup_singlestep
setup_singlestep(p, regs, kcb, 0)
|--unsigned long slot;
|--kcb->kprobe_status = KPROBE_HIT_SS;
|--if (p->ainsn.api.insn)
//slot存放了blk_update_request的入口指令:sub sp, sp, #0x60
slot = (unsigned long)p->ainsn.api.insn;
set_ss_context(kcb, slot);
|--kcb->ss_ctx.ss_pending = true;
| //slot(kcb->ss_ctx.match_addr)同时存放了指令: brk #0x6
|--kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t);
kprobes_save_local_irqflag(kcb, regs);
instruction_pointer_set(regs, slot);
| //将regs->pc赋值为val, 此处val就是slot, 它对应指令为sub sp, sp, #0x60
|--regs->pc = val
instruction_pointer_set设置了断点指令brk #0x4异常返回后要执行的pc值,它指向blk_update_request原始的入口指令;当断点指令brk #0x4异常返回后,将执行blk_update_request的原始入口指令(注意:它位于另一个内存地址p->ainsn.api.insn,非原始内存地址)。由于slot槽中同时还有一条断点指令brk #0x6,因此会继续执行断点指令brk #0x6
|- -brk #0x6
0xffff800012533000 sub sp, sp, #0x60
0xffff800012533004 brk #0x6
执行slot槽中的指令后,将再次陷入断点异常;从断点异常退出后,将继续沿着blk_update_request原有的执行路径执行,这部分执行与kprobe没有任何区别,直到执行到函数返回处。由于pre_handler_kretprobe -> arch_prepare_kretprobe函数中替换了返回地址,因此从blk_update_request函数返回后将不会按照原有的返回地址执行,而是会执行设置的返回地址,即kretprobe_trampoline
|- -kretprobe_trampoline
// kretprobe_trampoline: temporary return target installed in x30 by
// arch_prepare_kretprobe(). It calls trampoline_probe_handler() with a
// pointer to the saved registers and then returns to the address that
// handler hands back in x0.
SYM_CODE_START(kretprobe_trampoline)
	// The trampoline runs where blk_update_request would have returned,
	// taking over the stack at that point (like a cuckoo in another
	// bird's nest). Reserve room on it for a struct pt_regs.
	sub sp, sp, #S_FRAME_SIZE

	// Save the registers into the pt_regs area.
	save_all_base_regs

	// Pass the top of the stack, i.e. the struct pt_regs pointer, in x0.
	mov x0, sp
	bl trampoline_probe_handler
	/*
	 * Replace trampoline address in lr with actual orig_ret_addr return
	 * address.
	 */
	mov lr, x0

	restore_all_base_regs

	add sp, sp, #S_FRAME_SIZE
	ret
	// The symbol closed here must match the one opened by SYM_CODE_START.
SYM_CODE_END(kretprobe_trampoline)
/*
 * trampoline_probe_handler - C entry point called from kretprobe_trampoline.
 * @regs: register state saved by the trampoline.
 *
 * Passes the trampoline address and the kernel stack pointer taken from
 * @regs to kretprobe_trampoline_handler() and returns its result as a
 * pointer; the trampoline then moves that value into lr.
 */
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
	void *frame = (void *)kernel_stack_pointer(regs);
	unsigned long orig_ret_addr;

	orig_ret_addr = kretprobe_trampoline_handler(regs, &kretprobe_trampoline,
						     frame);

	return (void *)orig_ret_addr;
}
/*
 * kretprobe_trampoline_handler - run the generic trampoline handler between
 * kprobe_busy_begin() and kprobe_busy_end().
 * @regs: register state saved by the trampoline.
 * @trampoline_address: address of the arch trampoline.
 * @frame_pointer: value used to match the kretprobe_instance for this return.
 *
 * Returns whatever __kretprobe_trampoline_handler() returns.
 */
static nokprobe_inline
unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
					   void *trampoline_address,
					   void *frame_pointer)
{
	unsigned long orig_ret_addr;

	/*
	 * Install a dummy kprobe so that kretprobe handling cannot recurse.
	 * A kretprobe never runs inside a kprobe handler, so no kprobe can
	 * be running when we get here.
	 */
	kprobe_busy_begin();
	orig_ret_addr = __kretprobe_trampoline_handler(regs, trampoline_address,
						       frame_pointer);
	kprobe_busy_end();

	return orig_ret_addr;
}
__kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer)
|--struct kretprobe_instance *ri = NULL, *last = NULL;
| struct hlist_head *head;
|--kprobe_opcode_t *correct_ret_addr = NULL;
|--kretprobe_hash_lock(current, &head, &flags);
| | //kretprobe_inst_table哈希上链接了初始化的kretprobe_instance实例
| |--*head = &kretprobe_inst_table[hash];
| //遍历kretprobe_inst_table哈希上链接了初始化的kretprobe_instance实例
|--hlist_for_each_entry(ri, head, hlist)
| //找到与blk_mq_end_request帧指针相同的kretprobe_instance实例
| if (ri->fp != frame_pointer)
| skipped = true;
| continue;
| //获取原始的返回地址, 用于trampoline_address执行完毕后返回到blk_mq_end_request
| correct_ret_addr = ri->ret_addr;
| if (correct_ret_addr != trampoline_address)
| break;
|--last = ri
|--hlist_for_each_entry_safe(ri, tmp, head, hlist)
| if (ri->task != current)
| continue;
| if (ri->fp != frame_pointer)
| continue;
| if (ri->rp && ri->rp->handler)
| struct kprobe *prev = kprobe_running()
| ri->ret_addr = correct_ret_addr;
| ri->rp->handler(ri, regs);
| recycle_rp_inst(ri);
\--return (unsigned long)correct_ret_addr;
根据前面初始化,ri->rp->handler为kretprobe_dispatcher
/*
 * kretprobe_dispatcher - kretprobe handler used by trace_kprobe events.
 * @ri: the kretprobe instance for the return being handled.
 * @regs: register state passed in by the trampoline handler.
 *
 * Bumps the per-CPU hit counter of the owning trace_kprobe, then calls the
 * trace and (with CONFIG_PERF_EVENTS) perf callbacks whose flag is set.
 * Always returns 0.
 */
static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct trace_kprobe *probe;

	/* ri->rp is the kretprobe embedded in the owning trace_kprobe. */
	probe = container_of(ri->rp, struct trace_kprobe, rp);

	raw_cpu_inc(*probe->nhit);

	if (trace_probe_test_flag(&probe->tp, TP_FLAG_TRACE)) {
		kretprobe_trace_func(probe, ri, regs);
	}
#ifdef CONFIG_PERF_EVENTS
	if (trace_probe_test_flag(&probe->tp, TP_FLAG_PROFILE)) {
		kretprobe_perf_func(probe, ri, regs);
	}
#endif
	/* Nothing in the kernel is altered here, so always report 0. */
	return 0;
}
static void
kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tk->tp)
__kretprobe_trace_func(tk, ri, regs, link->file);
}
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
struct trace_event_file *trace_file)
{
struct kretprobe_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
struct trace_event_call *call = trace_probe_event_call(&tk->tp);
int dsize;
WARN_ON(call != trace_file->event_call);
if (trace_trigger_soft_disabled(trace_file))
return;
local_save_flags(fbuffer.flags);
fbuffer.pc = preempt_count();
fbuffer.trace_file = trace_file;
dsize = __get_data_size(&tk->tp, regs);
fbuffer.event =
trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
call->event.type,
sizeof(*entry) + tk->tp.size + dsize,
fbuffer.flags, fbuffer.pc);
if (!fbuffer.event)
return;
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
//设置返回地址为原始的返回地址
entry->ret_ip = (unsigned long)ri->ret_addr;
//存储参数值
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
//写入ring buffer
trace_event_buffer_commit(&fbuffer);
}
由于trampoline_probe_handler最终返回的是原始的返回地址,会被保存在x0中,回到kretprobe_trampoline中,
mov lr, x0 用来恢复链接寄存器,从kretprobe_trampoline返回后将返回到blk_mq_end_request的原始路径执行
6. 总结
我们再来简单总结kretprobe的工作流程:
- 首先要注册kretprobe
这主要是通过向/sys/kernel/debug/tracing/kprobe_events节点写入命令完成,这个过程将会:
(1)完成kretprobe的注册,这其中最重要的是初始化pre_handler回调为pre_handler_kretprobe,它将在brk #0x4断点处理函数中被调用,它主要保存从blk_update_request返回的原始的返回地址,同时设置了临时的返回函数为kretprobe_trampoline用于执行kretprobe功能;
(2)保存被探测函数入口的原始指令,再加上一条brk #0x6断点指令,它们会被保存到slot中,将来被替换的brk #0x4返回后将首先执行此slot中原始的指令代码;
(3)同时也会记录探测点的后一条指令地址,将来从brk #0x6返回时将执行此指令,从而恢复原始的指令执行路径;
- 断点指令插入
主要通过echo 1 > /sys/kernel/debug/tracing/events/kprobes/blk_update/enable完成。它将会将被探测函数探测点的指令替换为brk #0x4。
注:brk #0x4和brk #0x6将对应不同的断点处理回调
- 执行kretprobe回调
当进入被探测函数探测点时,会执行brk #0x4断点指令引发断点异常,根据参数0x4将执行对应的断点处理回调,最终执行pre_handler_kretprobe回调,它主要用于保存blk_update_request的原始返回地址,并将返回地址临时替换为kretprobe_trampoline;之后将执行第1步初始化好的slot槽中的指令,slot槽的第一条指令就是被探测函数入口处的原始指令,接着执行brk #0x6再次陷入断点异常,此时根据参数0x6将执行单步断点异常处理函数,它会用第1步(3)中记录的指令地址恢复PC,这样从brk #0x6返回时,将继续沿着被探测函数探测点之后的指令路径执行,恢复正常的指令执行路径。在blk_update_request返回处,会跳转到临时返回地址kretprobe_trampoline,完成kretprobe的功能,之后将返回地址恢复为blk_update_request原始的返回地址,这样从kretprobe_trampoline返回后,将回到blk_mq_end_request中原始的返回地址处继续执行。
执行结果如下:
/ # cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
# entries-in-buffer/entries-written: 17/17 #P:2
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
<idle>-0 [000] d.s3 16.343746: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
<idle>-0 [000] d.s3 16.345540: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
<idle>-0 [000] d.s4 16.346275: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s2 16.348317: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s3 16.348719: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
kworker/u4:0-7 [000] d.s3 35.025022: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
kworker/u4:0-7 [000] d.s3 40.146858: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
<idle>-0 [000] d.s3 45.536322: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s2 45.566258: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s2 45.599543: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] dNs3 45.612681: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
cat-116 [000] dNs3 51.593771: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
<idle>-0 [000] d.s3 56.762610: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
kworker/0:1H-97 [000] d.s3 56.794490: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
jbd2/vda-8-103 [000] d.s4 56.814426: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s2 56.825756: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
ksoftirqd/0-9 [000] d.s3 56.826898: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
附录
/*
 * struct kretprobe - a function-return probe.
 *
 * Embeds the kprobe placed at the function entry together with the pool
 * of kretprobe_instance objects pre-allocated by register_kretprobe().
 */
struct kretprobe {
	/* Entry kprobe; register_kretprobe() sets its pre_handler. */
	struct kprobe kp;
	/* Called from the trampoline handler for each matching instance. */
	kretprobe_handler_t handler;
	/* NOTE(review): presumably an optional entry-time callback - confirm. */
	kretprobe_handler_t entry_handler;
	/* Number of instances to pre-allocate; defaulted when <= 0. */
	int maxactive;
	/* Reset to 0 at registration; presumably counts missed hits - confirm. */
	int nmissed;
	/* Extra bytes allocated after each kretprobe_instance. */
	size_t data_size;
	/* Instances that are currently unused. */
	struct hlist_head free_instances;
	/* Initialised at registration; presumably guards free_instances. */
	raw_spinlock_t lock;
};
// One in-flight (or idle) return-probe record. Instances are allocated by
// register_kretprobe(), wait on kretprobe.free_instances, and are moved to
// the kretprobe_inst_table hash list when the probed function is entered.
struct kretprobe_instance {
union {
// List linkage: either the owner's free list or the global hash list.
struct hlist_node hlist;
// NOTE(review): presumably used for deferred freeing - confirm.
struct rcu_head rcu;
};
// The kretprobe this instance belongs to.
struct kretprobe *rp;
// Holds the original return address (x30 at function entry on arm64).
kprobe_opcode_t *ret_addr;
// Task that entered the probed function.
struct task_struct *task;
// Kernel stack pointer recorded at entry; matched against on return.
void *fp;
// Trailing per-instance area of rp->data_size bytes.
char data[];
};