eBPF 之 ProgramType、AttachType和InputContext
1. ProgramType 定义
定义在 include/uapi/linux/bpf.h 文件中,不同 Linux 版本会有变化,以下是 Linux 5.19 版本定义:
/*
 * BPF program types.
 *
 * Enumerators carry no explicit initializers other than the first, so they
 * are numbered consecutively from 0 in declaration order.
 */
enum bpf_prog_type {
	BPF_PROG_TYPE_UNSPEC = 0,
	BPF_PROG_TYPE_SOCKET_FILTER,
	BPF_PROG_TYPE_KPROBE,
	BPF_PROG_TYPE_SCHED_CLS,
	BPF_PROG_TYPE_SCHED_ACT,
	BPF_PROG_TYPE_TRACEPOINT,
	BPF_PROG_TYPE_XDP,
	BPF_PROG_TYPE_PERF_EVENT,
	BPF_PROG_TYPE_CGROUP_SKB,
	BPF_PROG_TYPE_CGROUP_SOCK,
	BPF_PROG_TYPE_LWT_IN,
	BPF_PROG_TYPE_LWT_OUT,
	BPF_PROG_TYPE_LWT_XMIT,
	BPF_PROG_TYPE_SOCK_OPS,
	BPF_PROG_TYPE_SK_SKB,
	BPF_PROG_TYPE_CGROUP_DEVICE,
	BPF_PROG_TYPE_SK_MSG,
	BPF_PROG_TYPE_RAW_TRACEPOINT,
	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
	BPF_PROG_TYPE_LWT_SEG6LOCAL,
	BPF_PROG_TYPE_LIRC_MODE2,
	BPF_PROG_TYPE_SK_REUSEPORT,
	BPF_PROG_TYPE_FLOW_DISSECTOR,
	BPF_PROG_TYPE_CGROUP_SYSCTL,
	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
	BPF_PROG_TYPE_CGROUP_SOCKOPT,
	BPF_PROG_TYPE_TRACING,
	BPF_PROG_TYPE_STRUCT_OPS,
	BPF_PROG_TYPE_EXT,
	BPF_PROG_TYPE_LSM,
	BPF_PROG_TYPE_SK_LOOKUP,
	BPF_PROG_TYPE_SYSCALL,	/* a program that can execute syscalls */
};
2. AttachType 定义
定义在 include/uapi/linux/bpf.h 文件中,不同 Linux 版本会有变化,以下是 Linux 5.19 版本定义:
/*
 * BPF attach types.
 *
 * Enumerators are numbered consecutively from 0 in declaration order.
 * __MAX_BPF_ATTACH_TYPE must stay last: its value equals the number of
 * attach types declared before it.
 */
enum bpf_attach_type {
	BPF_CGROUP_INET_INGRESS,
	BPF_CGROUP_INET_EGRESS,
	BPF_CGROUP_INET_SOCK_CREATE,
	BPF_CGROUP_SOCK_OPS,
	BPF_SK_SKB_STREAM_PARSER,
	BPF_SK_SKB_STREAM_VERDICT,
	BPF_CGROUP_DEVICE,
	BPF_SK_MSG_VERDICT,
	BPF_CGROUP_INET4_BIND,
	BPF_CGROUP_INET6_BIND,
	BPF_CGROUP_INET4_CONNECT,
	BPF_CGROUP_INET6_CONNECT,
	BPF_CGROUP_INET4_POST_BIND,
	BPF_CGROUP_INET6_POST_BIND,
	BPF_CGROUP_UDP4_SENDMSG,
	BPF_CGROUP_UDP6_SENDMSG,
	BPF_LIRC_MODE2,
	BPF_FLOW_DISSECTOR,
	BPF_CGROUP_SYSCTL,
	BPF_CGROUP_UDP4_RECVMSG,
	BPF_CGROUP_UDP6_RECVMSG,
	BPF_CGROUP_GETSOCKOPT,
	BPF_CGROUP_SETSOCKOPT,
	BPF_TRACE_RAW_TP,
	BPF_TRACE_FENTRY,
	BPF_TRACE_FEXIT,
	BPF_MODIFY_RETURN,
	BPF_LSM_MAC,
	BPF_TRACE_ITER,
	BPF_CGROUP_INET4_GETPEERNAME,
	BPF_CGROUP_INET6_GETPEERNAME,
	BPF_CGROUP_INET4_GETSOCKNAME,
	BPF_CGROUP_INET6_GETSOCKNAME,
	BPF_XDP_DEVMAP,
	BPF_CGROUP_INET_SOCK_RELEASE,
	BPF_XDP_CPUMAP,
	BPF_SK_LOOKUP,
	BPF_XDP,
	BPF_SK_SKB_VERDICT,
	BPF_SK_REUSEPORT_SELECT,
	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
	BPF_PERF_EVENT,
	/*
	 * Attach type used by multi-kprobe links (BPF_PROG_TYPE_KPROBE).
	 * It was missing from the listing although the text states this is
	 * the Linux 5.19 definition.
	 */
	BPF_TRACE_KPROBE_MULTI,
	__MAX_BPF_ATTACH_TYPE
};
3. ProgramType、AttachType 和 InputContext 的关系
在 Linux 源码 kernel/bpf/syscall.c 文件的 attach_type_to_prog_type 函数中有 ProgramType 与 AttachType 的映射关系,同时在 Linux 源码 include/linux/bpf_types.h 中定义了 ProgramType 与 InputContext 的映射关系。
整理后的映射关系如下:
ProgramType | AttachType | InputContext |
BPF_PROG_TYPE_SOCKET_FILTER | None | struct __sk_buff |
BPF_PROG_TYPE_KPROBE | BPF_TRACE_KPROBE_MULTI | struct pt_regs |
BPF_PROG_TYPE_SCHED_CLS | None | struct __sk_buff |
BPF_PROG_TYPE_SCHED_ACT | None | struct __sk_buff |
BPF_PROG_TYPE_TRACEPOINT | None | __u64 |
BPF_PROG_TYPE_XDP | BPF_XDP_DEVMAP BPF_XDP_CPUMAP BPF_XDP | struct xdp_md |
BPF_PROG_TYPE_PERF_EVENT | None | struct bpf_perf_event_data |
BPF_PROG_TYPE_CGROUP_SKB | BPF_CGROUP_INET_INGRESS BPF_CGROUP_INET_EGRESS | struct __sk_buff |
BPF_PROG_TYPE_CGROUP_SOCK | BPF_CGROUP_INET_SOCK_CREATE BPF_CGROUP_INET_SOCK_RELEASE BPF_CGROUP_INET4_POST_BIND BPF_CGROUP_INET6_POST_BIND | struct bpf_sock |
BPF_PROG_TYPE_LWT_IN | None | struct __sk_buff |
BPF_PROG_TYPE_LWT_OUT | None | struct __sk_buff |
BPF_PROG_TYPE_LWT_XMIT | None | struct __sk_buff |
BPF_PROG_TYPE_SOCK_OPS | BPF_CGROUP_SOCK_OPS | struct bpf_sock_ops |
BPF_PROG_TYPE_SK_SKB | BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT BPF_SK_SKB_VERDICT | struct __sk_buff |
BPF_PROG_TYPE_CGROUP_DEVICE | BPF_CGROUP_DEVICE | struct bpf_cgroup_dev_ctx |
BPF_PROG_TYPE_SK_MSG | BPF_SK_MSG_VERDICT | struct sk_msg_md |
BPF_PROG_TYPE_RAW_TRACEPOINT | None | struct bpf_raw_tracepoint_args |
BPF_PROG_TYPE_CGROUP_SOCK_ADDR | BPF_CGROUP_INET4_BIND BPF_CGROUP_INET6_BIND BPF_CGROUP_INET4_CONNECT BPF_CGROUP_INET6_CONNECT BPF_CGROUP_UDP4_SENDMSG BPF_CGROUP_UDP6_SENDMSG BPF_CGROUP_UDP4_RECVMSG BPF_CGROUP_UDP6_RECVMSG BPF_CGROUP_INET4_GETPEERNAME BPF_CGROUP_INET6_GETPEERNAME BPF_CGROUP_INET4_GETSOCKNAME BPF_CGROUP_INET6_GETSOCKNAME | struct bpf_sock_addr |
BPF_PROG_TYPE_LWT_SEG6LOCAL | None | struct __sk_buff |
BPF_PROG_TYPE_LIRC_MODE2 | BPF_LIRC_MODE2 | __u32 |
BPF_PROG_TYPE_SK_REUSEPORT | BPF_SK_REUSEPORT_SELECT BPF_SK_REUSEPORT_SELECT_OR_MIGRATE | struct sk_reuseport_md |
BPF_PROG_TYPE_FLOW_DISSECTOR | BPF_FLOW_DISSECTOR | struct __sk_buff |
BPF_PROG_TYPE_CGROUP_SYSCTL | BPF_CGROUP_SYSCTL | struct bpf_sysctl |
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE | None | struct bpf_raw_tracepoint_args |
BPF_PROG_TYPE_CGROUP_SOCKOPT | BPF_CGROUP_GETSOCKOPT BPF_CGROUP_SETSOCKOPT | struct bpf_sockopt |
BPF_PROG_TYPE_TRACING | BPF_TRACE_RAW_TP BPF_TRACE_FENTRY BPF_TRACE_FEXIT BPF_MODIFY_RETURN BPF_TRACE_ITER | void * |
BPF_PROG_TYPE_STRUCT_OPS | None | void * |
BPF_PROG_TYPE_EXT | None | void * |
BPF_PROG_TYPE_LSM | BPF_LSM_MAC | void * |
BPF_PROG_TYPE_SK_LOOKUP | BPF_SK_LOOKUP | struct bpf_sk_lookup |
BPF_PROG_TYPE_SYSCALL | None | void * |
注:也参考了 github.com/cilium/ebpf/elf_reader.go 文件定义的映射关系。
4. InputContext 详细定义
4.1. struct xdp_md
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_XDP programs. */
struct xdp_md {
	__u32 data;
	__u32 data_end;
	__u32 data_meta;

	/* Accesses to the fields below go through struct xdp_rxq_info. */
	__u32 ingress_ifindex;	/* rxq->dev->ifindex */
	__u32 rx_queue_index;	/* rxq->queue_index */

	__u32 egress_ifindex;	/* txq->dev->ifindex */
};
4.2. struct pt_regs
struct pt_regs 的定义与系统架构相关。以 x86 为例,可以在 /usr/src/linux-headers-$(uname -r)/arch/x86/include/uapi/asm/ptrace.h 文件中找到,下面是 x86_64 架构下的定义:
/*
 * x86_64 register snapshot.
 *
 * Every member is an unsigned long; the members are laid out in the order
 * listed below, grouped by how they are saved.
 */
struct pt_regs {
	/*
	 * Callee-preserved registers according to the C ABI. They are not
	 * saved on kernel entry unless the syscall needs a complete, fully
	 * filled "struct pt_regs".
	 */
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long rbp;
	unsigned long rbx;

	/* Callee-clobbered registers. Always saved on kernel entry. */
	unsigned long r11;
	unsigned long r10;
	unsigned long r9;
	unsigned long r8;
	unsigned long rax;
	unsigned long rcx;
	unsigned long rdx;
	unsigned long rsi;
	unsigned long rdi;

	/*
	 * Syscall number on syscall entry, error code on CPU exception,
	 * IRQ number on hardware interrupt.
	 */
	unsigned long orig_rax;

	/* Return frame for iretq. */
	unsigned long rip;
	unsigned long cs;
	unsigned long eflags;
	unsigned long rsp;
	unsigned long ss;
	/* top of stack page */
};
4.3. struct __sk_buff
在 include/uapi/linux/bpf.h 文件下定义:
/* user accessible mirror of in-kernel sk_buff.* new fields can only be added to the end of this structure*/
struct __sk_buff {__u32 len; __u32 pkt_type;__u32 mark;__u32 queue_mapping;__u32 protocol;__u32 vlan_present;__u32 vlan_tci;__u32 vlan_proto;__u32 priority;__u32 ingress_ifindex;__u32 ifindex;__u32 tc_index;__u32 cb[5];__u32 hash;__u32 tc_classid;__u32 data;__u32 data_end;__u32 napi_id;/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */__u32 family;__u32 remote_ip4; /* Stored in network byte order */__u32 local_ip4; /* Stored in network byte order */__u32 remote_ip6[4]; /* Stored in network byte order */__u32 local_ip6[4]; /* Stored in network byte order */__u32 remote_port; /* Stored in network byte order */__u32 local_port; /* stored in host byte order *//* ... here. */__u32 data_meta;__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);__u64 tstamp;__u32 wire_len;__u32 gso_segs;__bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size;
};
4.4. struct bpf_perf_event_data
在 include/uapi/linux/bpf_perf_event.h 文件下定义:
/* Alias for the register set embedded in the perf-event context. */
typedef struct pt_regs bpf_user_pt_regs_t;

/* Input context of BPF_PROG_TYPE_PERF_EVENT programs. */
struct bpf_perf_event_data {
	bpf_user_pt_regs_t regs;
	__u64 sample_period;
	__u64 addr;
};
4.5. struct bpf_sock
在 include/uapi/linux/bpf.h 文件下定义:
/* Socket view exposed to BPF programs. */
struct bpf_sock {
	__u32 bound_dev_if;
	__u32 family;
	__u32 type;
	__u32 protocol;
	__u32 mark;
	__u32 priority;

	/* IP address fields additionally allow 1- and 2-byte access. */
	__u32 src_ip4;
	__u32 src_ip6[4];
	__u32 src_port;		/* host byte order */
	__be16 dst_port;	/* network byte order */
	__u16 :16;		/* zero padding */
	__u32 dst_ip4;
	__u32 dst_ip6[4];

	__u32 state;
	__s32 rx_queue_mapping;
};
4.6. struct bpf_sock_ops
在 include/uapi/linux/bpf.h 文件下定义:
struct bpf_sock_ops {__u32 op;union {__u32 args[4]; /* Optionally passed to bpf program */__u32 reply; /* Returned by bpf program */__u32 replylong[4]; /* Optionally returned by bpf prog */};__u32 family;__u32 remote_ip4; /* Stored in network byte order */__u32 local_ip4; /* Stored in network byte order */__u32 remote_ip6[4]; /* Stored in network byte order */__u32 local_ip6[4]; /* Stored in network byte order */__u32 remote_port; /* Stored in network byte order */__u32 local_port; /* stored in host byte order */__u32 is_fullsock; /* Some TCP fields are only valid if* there is a full socket. If not, the* fields read as zero.*/__u32 snd_cwnd;__u32 srtt_us; /* Averaged RTT << 3 in usecs */__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */__u32 state;__u32 rtt_min;__u32 snd_ssthresh;__u32 rcv_nxt;__u32 snd_nxt;__u32 snd_una;__u32 mss_cache;__u32 ecn_flags;__u32 rate_delivered;__u32 rate_interval_us;__u32 packets_out;__u32 retrans_out;__u32 total_retrans;__u32 segs_in;__u32 data_segs_in;__u32 segs_out;__u32 data_segs_out;__u32 lost_out;__u32 sacked_out;__u32 sk_txhash;__u64 bytes_received;__u64 bytes_acked;__bpf_md_ptr(struct bpf_sock *, sk);/* [skb_data, skb_data_end) covers the whole TCP header.** BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received* BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the* header has not been written.* BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have* been written so far.* BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes* the 3WHS.* BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes* the 3WHS.** bpf_load_hdr_opt() can also be used to read a particular option.*/__bpf_md_ptr(void *, skb_data);__bpf_md_ptr(void *, skb_data_end);__u32 skb_len; /* The total length of a packet.* It includes the header, options,* and payload.*/__u32 skb_tcp_flags; /* tcp_flags of the header. 
It provides* an easy way to check for tcp_flags* without parsing skb_data.** In particular, the skb_tcp_flags* will still be available in* BPF_SOCK_OPS_HDR_OPT_LEN even though* the outgoing header has not* been written yet.*/
};
4.7. struct bpf_cgroup_dev_ctx
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_CGROUP_DEVICE programs. */
struct bpf_cgroup_dev_ctx {
	/* Encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_*. */
	__u32 access_type;
	__u32 major;
	__u32 minor;
};
4.8. struct sk_msg_md
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_SK_MSG programs. */
struct sk_msg_md {
	__bpf_md_ptr(void *, data);
	__bpf_md_ptr(void *, data_end);

	__u32 family;
	__u32 remote_ip4;	/* network byte order */
	__u32 local_ip4;	/* network byte order */
	__u32 remote_ip6[4];	/* network byte order */
	__u32 local_ip6[4];	/* network byte order */
	__u32 remote_port;	/* network byte order */
	__u32 local_port;	/* host byte order */
	__u32 size;		/* total size of sk_msg */

	__bpf_md_ptr(struct bpf_sock *, sk);	/* current socket */
};
4.9. struct sk_reuseport_md
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_SK_REUSEPORT programs. */
struct sk_reuseport_md {
	/* Start of directly accessible data; begins at the tcp/udp header. */
	__bpf_md_ptr(void *, data);
	/* End of directly accessible data. */
	__bpf_md_ptr(void *, data_end);
	/*
	 * Total packet length, counted from the tcp/udp header.
	 * The directly accessible bytes (data_end - data) may be fewer than
	 * this "len"; the remainder can be read indirectly with the helper
	 * "bpf_skb_load_bytes()".
	 */
	__u32 len;
	/*
	 * Eth protocol from the mac header (network byte order),
	 * e.g. ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD).
	 */
	__u32 eth_protocol;
	__u32 ip_protocol;	/* IP protocol, e.g. IPPROTO_TCP, IPPROTO_UDP */
	__u32 bind_inany;	/* Is sock bound to an INANY address? */
	__u32 hash;		/* A hash of the packet 4 tuples */
	/*
	 * When reuse->migrating_sk is NULL, a sk is being selected for a new
	 * incoming connection request (e.g. a listen sk for the received SYN
	 * in the TCP case). reuse->sk is one of the sk in the reuseport
	 * group, and the bpf prog can use it to learn the local listening
	 * ip/port without looking into the skb.
	 *
	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
	 * reuse->migrating_sk is the socket that needs to be migrated to
	 * another listening socket. migrating_sk could be a fullsock sk that
	 * is fully established or a reqsk that is in the middle of the
	 * 3-way handshake.
	 */
	__bpf_md_ptr(struct bpf_sock *, sk);
	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
};
4.10. struct bpf_raw_tracepoint_args
在 include/uapi/linux/bpf.h 文件下定义:
/*
 * Input context of raw tracepoint programs.
 * The only member is a zero-length trailing array of __u64 values.
 */
struct bpf_raw_tracepoint_args {
	__u64 args[0];
};
4.11. struct bpf_sock_addr
在 include/uapi/linux/bpf.h 文件下定义:
struct bpf_sock_addr {__u32 user_family; /* Allows 4-byte read, but no write. */__u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.* Stored in network byte order.*/__u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.* Stored in network byte order.*/__u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.* Stored in network byte order*/__u32 family; /* Allows 4-byte read, but no write */__u32 type; /* Allows 4-byte read, but no write */__u32 protocol; /* Allows 4-byte read, but no write */__u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write.* Stored in network byte order.*/__u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.* Stored in network byte order.*/__bpf_md_ptr(struct bpf_sock *, sk);
};
4.12. struct bpf_sysctl
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_CGROUP_SYSCTL programs. */
struct bpf_sysctl {
	/*
	 * 0 when the sysctl is being read, 1 when it is being written.
	 * Allows 1,2,4-byte read, but no write.
	 */
	__u32 write;
	/*
	 * Sysctl file position to read from or write to.
	 * Allows 1,2,4-byte read and 4-byte write.
	 */
	__u32 file_pos;
};
4.13. struct bpf_sockopt
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_CGROUP_SOCKOPT programs. */
struct bpf_sockopt {
	__bpf_md_ptr(struct bpf_sock *, sk);
	__bpf_md_ptr(void *, optval);
	__bpf_md_ptr(void *, optval_end);

	__s32 level;
	__s32 optname;
	__s32 optlen;
	__s32 retval;
};
4.14. struct bpf_sk_lookup
在 include/uapi/linux/bpf.h 文件下定义:
/* Input context of BPF_PROG_TYPE_SK_LOOKUP programs. */
struct bpf_sk_lookup {
	union {
		/* Selected socket */
		__bpf_md_ptr(struct bpf_sock *, sk);
		/* Non-zero if socket was selected in PROG_TEST_RUN */
		__u64 cookie;
	};

	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
	__u32 remote_ip4;	/* Network byte order */
	__u32 remote_ip6[4];	/* Network byte order */
	__be16 remote_port;	/* Network byte order */
	__u16 :16;		/* Zero padding */
	__u32 local_ip4;	/* Network byte order */
	__u32 local_ip6[4];	/* Network byte order */
	__u32 local_port;	/* Host byte order */
	/* The arriving interface. Determined by inet_iif. */
	__u32 ingress_ifindex;
};