linux kprobe使用
使用场景
- 监控某个内核函数是否被调用
- 获取某个内核函数耗费的时间
- 获取某个内核函数的入参
- 获取某个内核函数的调用栈(`dump_stack()`)
- 获取某个内核函数的返回值
参数传递规则
x86平台对pt_regs的定义
arch/x86/include/asm/ptrace.h
/*
 * Register frame layouts as defined in arch/x86/include/asm/ptrace.h.
 * The __i386__ branch is the 32-bit x86 layout; the #else branch is the
 * 64-bit x86 (x86_64) layout.
 */
#ifdef __i386__

/* 32-bit x86 (i386) register frame. */
struct pt_regs {
	/*
	 * NB: 32-bit x86 CPUs are inconsistent as what happens in the
	 * following cases (where %seg represents a segment register):
	 *
	 * - pushl %seg: some do a 16-bit write and leave the high
	 *   bits alone
	 * - movl %seg, [mem]: some do a 16-bit write despite the movl
	 * - IDT entry: some (e.g. 486) will leave the high bits of CS
	 *   and (if applicable) SS undefined.
	 *
	 * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
	 * so we can just treat all of the segment registers as 16-bit
	 * values.
	 */
	unsigned long bx;
	unsigned long cx;
	unsigned long dx;
	unsigned long si;
	unsigned long di;
	unsigned long bp;
	unsigned long ax;
	unsigned short ds;
	unsigned short __dsh;
	unsigned short es;
	unsigned short __esh;
	unsigned short fs;
	unsigned short __fsh;
	/* On interrupt, gs and __gsh store the vector number. */
	unsigned short gs;
	unsigned short __gsh;
	/* On interrupt, this is the error code. */
	unsigned long orig_ax;
	unsigned long ip;
	unsigned short cs;
	unsigned short __csh;
	unsigned long flags;
	unsigned long sp;
	unsigned short ss;
	unsigned short __ssh;
};

#else /* __i386__ */

/* 64-bit x86 (x86_64) register frame; this is not IA-64. */
struct pt_regs {
	/*
	 * C ABI says these regs are callee-preserved. They aren't saved on
	 * kernel entry unless syscall needs a complete, fully filled
	 * "struct pt_regs".
	 */
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;

	/* These regs are callee-clobbered. Always saved on kernel entry. */
	unsigned long r11;
	unsigned long r10;
	unsigned long r9;
	unsigned long r8;
	unsigned long ax;
	unsigned long cx;
	unsigned long dx;
	unsigned long si;
	unsigned long di;

	/*
	 * On syscall entry, this is syscall#. On CPU exception, this is
	 * error code. On hw interrupt, it's IRQ number:
	 */
	unsigned long orig_ax;

	/* Return frame for iretq */
	unsigned long ip;
	unsigned long cs;
	unsigned long flags;
	unsigned long sp;
	unsigned long ss;
	/* top of stack page */
};

#endif /* !__i386__ */
从 4.18 内核版本中 bpf 的相关源码 `tools/testing/selftests/bpf/bpf_helpers.h` 中,可以窥探 x86 架构和 arm64 架构的函数参数传递规则。
/*
 * Accessors that map "argument N", "return address", "frame pointer",
 * "return value", "stack pointer" and "instruction pointer" onto the
 * fields of struct pt_regs for the selected target.  (x) is a pointer to
 * struct pt_regs.  Exactly one bpf_target_* macro is expected to be
 * defined before this point; only the x86 and arm64 mappings are shown
 * in this excerpt.
 */
#if defined(bpf_target_x86)

#define PT_REGS_PARM1(x) ((x)->di)
#define PT_REGS_PARM2(x) ((x)->si)
#define PT_REGS_PARM3(x) ((x)->dx)
#define PT_REGS_PARM4(x) ((x)->cx)
#define PT_REGS_PARM5(x) ((x)->r8)
#define PT_REGS_RET(x) ((x)->sp)
#define PT_REGS_FP(x) ((x)->bp)
#define PT_REGS_RC(x) ((x)->ax)
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->ip)

#elif defined(bpf_target_arm64)

#define PT_REGS_PARM1(x) ((x)->regs[0])
#define PT_REGS_PARM2(x) ((x)->regs[1])
#define PT_REGS_PARM3(x) ((x)->regs[2])
#define PT_REGS_PARM4(x) ((x)->regs[3])
#define PT_REGS_PARM5(x) ((x)->regs[4])
#define PT_REGS_RET(x) ((x)->regs[30])
#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */
#define PT_REGS_RC(x) ((x)->regs[0])
#define PT_REGS_SP(x) ((x)->sp)
#define PT_REGS_IP(x) ((x)->pc)

#endif /* bpf_target_x86 / bpf_target_arm64 */
/samples/bpf/test_overhead_kprobe_kern.c
/*
 * Usage example: a kprobe program attached to __set_task_comm().
 *
 * Reads the task pointer from the first argument register and the new
 * name from the second one, then copies out the pid, the current and new
 * comm strings and the oom_score_adj value.  Nothing is done with the
 * values that were read; the program always returns 0.
 */
SEC("kprobe/__set_task_comm")
int prog(struct pt_regs *ctx)
{
	char old_name[16] = {};
	char new_name[16] = {};
	/* Argument 1 of the probed function: the task being renamed. */
	struct task_struct *task = (void *)PT_REGS_PARM1(ctx);
	struct signal_struct *sig;
	u32 task_pid;
	u16 adj;

	task_pid = _(task->pid);
	bpf_probe_read(old_name, sizeof(old_name), &task->comm);
	/* Argument 2 of the probed function: the new name. */
	bpf_probe_read(new_name, sizeof(new_name),
		       (void *)PT_REGS_PARM2(ctx));

	sig = _(task->signal);
	adj = _(sig->oom_score_adj);

	return 0;
}

/* Definition of the probed kernel function: */
/*
 * __set_task_comm - overwrite the command name stored in a task.
 * @tsk:  task whose comm field is replaced
 * @buf:  new name; copied with strlcpy(), bounded by sizeof(tsk->comm)
 * @exec: forwarded unchanged to perf_event_comm()
 *
 * trace_task_rename() and the copy both run between task_lock() and
 * task_unlock(); perf_event_comm() is called after the lock is dropped.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
	task_lock(tsk);
	trace_task_rename(tsk, buf);
	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
	task_unlock(tsk);

	perf_event_comm(tsk, exec);
}
- x86架构寄存器约定与函数参数传递
在 X86_64 架构中,寄存器的约定如上:调用一个函数时,RDI 寄存器用于传递第一个参数,RSI 寄存器用于传递第二个参数,依次类推(RDX、RCX、R8 分别传递第三、四、五个参数),R9 寄存器传递第六个参数,函数返回值保存在 RAX 寄存器中。那么如果函数的参数超过六个,多余的参数如何传递?在 X86_64 架构中,第六个之后的参数通过堆栈进行传递。
其中RDI对应pt_regs结构体中的di,其他寄存器依次类推。
- ARM64 架构寄存器约定与函数参数传递
在 ARM64 架构中,使用 X0-X7 寄存器传递参数:第一个参数通过 X0 寄存器传递,第二个参数通过 X1 寄存器传递,以此类推。返回值存储在 X0 寄存器中。
使用实例
/samples/kprobes/kprobe_example.c
/*
 * Sample kernel module showing the use of kprobes to dump a stack trace
 * and selected registers when _do_fork() is called.
 *
 * NOTE: the handlers in this file print architecture-specific registers
 * for x86, PowerPC, MIPS, ARM64 and s390.
 *
 * For more information on theory of operation of kprobes, see
 * Documentation/kprobes.txt
 *
 * You will see the trace data in /var/log/messages and on the console
 * whenever _do_fork() is invoked to create a new process.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#define MAX_SYMBOL_LEN 64

/*
 * Name of the symbol to probe.  Defaults to "_do_fork" and is exposed as
 * the "symbol" module parameter with mode 0644.
 */
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
	.symbol_name = symbol,
};

/* kprobe pre_handler: called just before the probed instruction is executed */
/*
 * handler_pre - log the probe address and a few registers.
 * @p:    the kprobe that fired
 * @regs: register state passed to the handler
 *
 * Each architecture branch prints the symbol name, the probe address and
 * that architecture's instruction-pointer and status/flags registers.
 * Every #ifdef must sit on its own line: when the directive is fused with
 * the following statement it tests a nonexistent macro name.
 *
 * Return: always 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
		" pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
	pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->psw.addr, regs->flags);
#endif

	/* A dump_stack() here will give a stack backtrace */
	return 0;
}

/* kprobe post_handler: called after the probed instruction is executed */
/*
 * handler_post - log the probe address and the status/flags register.
 * @p:     the kprobe that fired
 * @regs:  register state passed to the handler
 * @flags: not used by this handler
 *
 * Each architecture branch prints the symbol name, the probe address and
 * that architecture's status/flags register.  The s390 branch is tagged
 * "post_handler" like the others; it previously logged "pre_handler",
 * which made its output indistinguishable from the pre handler's.
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
#ifdef CONFIG_X86
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
#ifdef CONFIG_S390
	pr_info("<%s> post_handler: p->addr, 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
}

/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 */
/*
 * handler_fault - log the probe address and trap number.
 * @p:      the kprobe associated with the fault
 * @regs:   register state passed to the handler (unused here)
 * @trapnr: trap number to report
 *
 * Return: always 0, because the fault is not handled here.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* The message ends in "\n"; it used to end in a literal 'n'. */
	pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}

/*
 * kprobe_init - module entry point: install the three handlers on kp and
 * register the probe.
 *
 * Return: 0 on success, or the negative value returned by
 * register_kprobe() on failure.
 */
static int __init kprobe_init(void)
{
	int ret;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	ret = register_kprobe(&kp);
	if (ret < 0) {
		pr_err("register_kprobe failed, returned %d\n", ret);
		return ret;
	}
	pr_info("Planted kprobe at %p\n", kp.addr);
	return 0;
}

/* kprobe_exit - module exit point: unregister the probe and log it. */
static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);
	pr_info("kprobe at %p unregistered\n", kp.addr);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
# Out-of-tree build of the kprobe example: produces kprobe.ko from
# kprobe_example.o using the build tree of the running kernel.
obj-m := kprobe.o
kprobe-y += kprobe_example.o

# Kernel build directory; override on the command line if needed.
BASEINCLUDE ?= /lib/modules/`uname -r`/build

all:
	$(MAKE) -C $(BASEINCLUDE) M=$(PWD) modules;

clean:
	$(MAKE) -C $(BASEINCLUDE) M=$(PWD) clean;
	rm -f *.ko;
```