用 eBPF 打通 Go 堆外内存黑盒：uprobe 与 kprobe 的协同追踪实战

2026/4/11 06:41:46 164 0 0 0

问题背景：当 pprof 遇到堆外内存

Go 的内存分析工具 pprof 在排查纯 Go 堆内存泄漏时表现出色，但在面对以下场景时往往力不从心：

CGO 调用：C 库通过 malloc 申请的内存不在 Go heap 管理范围内
网络栈：高并发场景下内核协议栈的 skbuff 积压
运行时缓存：sync.Pool 或 runtime 内部结构的异常持有

这些内存分配路径绕过 runtime.mallocgc，导致堆快照无法反映真实内存压力。我们需要一种能在用户态拦截 Go 内存申请、同时在内核态监控缺页异常的双层追踪方案。

架构设计：双探针协同机制

核心思路

通过 uprobe 在 runtime.mallocgc 入口处埋点捕获 Go 堆分配事件，同时利用 kprobe 监控 handle_mm_fault（或 do_page_fault）获取进程级缺页异常。将两类事件通过 PID + 时间窗口关联，构建"分配-访问-泄漏"的完整证据链。

┌─────────────────────────────────────────────────────────────┐
│                        用户态 (Go App)                       │
│  ┌──────────────┐          ┌──────────────┐                │
│  │ runtime.mallocgc │◄─────│   uprobe     │  捕获分配大小    │
│  └──────────────┘          └──────────────┘                │
└────────────────────┬────────────────────────────────────────┘
                     │
┌────────────────────┼────────────────────────────────────────┐
│               内核态 (eBPF)                                  │
│  ┌──────────────┐  │        ┌──────────────┐                │
│  │ handle_mm_fault │◄──────│   kprobe     │  捕获缺页地址    │
│  └──────────────┘           └──────────────┘                │
│         │                                                    │
│         ▼                                                    │
│  ┌─────────────────────────────────────────┐                 │
│  │  Ring Buffer / BPF Maps                 │  事件关联与聚合  │
│  │  - pid_alloc_map: 记录分配事件           │                 │
│  │  - page_fault_map: 记录缺页事件          │                 │
│  └─────────────────────────────────────────┘                 │
└─────────────────────────────────────────────────────────────┘

为什么选择 page_fault 而非直接追踪 libc malloc？

通过 uprobe 直接拦截 libc malloc 只在 CGO 场景下有效；纯 Go 代码不经过 libc，而是由 runtime 通过 mmap 直接向内核申请内存（大对象分配同样走这条路径）。page_fault 作为内存访问的"最后一公里"，能捕获所有类型的内存请求（包括匿名页、文件映射页），且对应用性能影响更小。

eBPF 实现：内核态程序

1. 数据结构定义

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define MAX_STACK_DEPTH 32
#define ALLOC_TRACK_MAX 10240

/*
 * One tracked allocation. Stored as the value of the active_allocs map,
 * keyed by the allocation's address, and filled in by the mallocgc uprobe.
 */
struct alloc_event {
    u32 pid;        /* upper 32 bits of bpf_get_current_pid_tgid() */
    u64 timestamp;  /* bpf_ktime_get_ns() when the probe fired */
    u64 size;       /* requested allocation size in bytes */
    u64 stack_id;   /* bpf_get_stackid() result for the user stack (stack_traces map) */
    u8  comm[16];   /* task name copied by bpf_get_current_comm() */
};

/*
 * Sample pushed to user space through the `events` perf buffer by the
 * handle_mm_fault kprobe. The user-space decoder reads pid at byte offset 0,
 * timestamp at offset 8 and addr at offset 16, so the field order and types
 * here are part of the wire format.
 */
struct page_fault_event {
    u32 pid;        /* upper 32 bits of bpf_get_current_pid_tgid() */
    u64 timestamp;  /* bpf_ktime_get_ns() when the fault was observed */
    u64 addr;       /* faulting address (second argument of handle_mm_fault) */
    u64 stack_id;   /* bpf_get_stackid() result with flags 0 (stack_traces map) */
};

/*
 * Allocations currently being tracked: allocation address -> alloc_event.
 * Holds at most ALLOC_TRACK_MAX entries.
 * NOTE(review): nothing in the visible code deletes entries; confirm how
 * freed/collected objects are evicted before relying on this as "live" state.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, ALLOC_TRACK_MAX);
    __type(key, u64);    // pointer address
    __type(value, struct alloc_event);
} active_allocs SEC(".maps");

/*
 * Perf event array used to stream page_fault_event samples to user space.
 * The kprobe writes to it with BPF_F_CURRENT_CPU.
 */
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(u32));
    __uint(value_size, sizeof(u32));
} events SEC(".maps");

/*
 * Stack trace storage shared by both probes: bpf_get_stackid() stores a
 * trace here and returns its id. Up to 4096 distinct stacks, each holding
 * at most MAX_STACK_DEPTH frame addresses.
 */
struct {
    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
    __uint(max_entries, 4096);
    __type(key, u32);
    __type(value, u64[MAX_STACK_DEPTH]);
} stack_traces SEC(".maps");

2. uprobe 拦截 mallocgc

Go 1.17+ 在 amd64 上启用了基于寄存器的调用约定（ABIInternal），为 runtime.mallocgc 挂载探针时需要按该约定读取参数和返回值。以下代码仅针对 x86_64 平台：

SEC("uprobe/runtime_mallocgc")
int trace_mallocgc(struct pt_regs *ctx)
{
    u64 ptr = PT_REGS_RC(ctx);  // 返回值：分配地址
    u64 size = PT_REGS_PARM1(ctx);  // 第一个参数：分配大小
    
    // 过滤小对象（< 32KB），减少噪声
    if (size < 32768)
        return 0;
        
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    
    struct alloc_event event = {};
    event.pid = pid;
    event.timestamp = bpf_ktime_get_ns();
    event.size = size;
    event.stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
    bpf_get_current_comm(&event.comm, sizeof(event.comm));
    
    // 存储活跃分配
    bpf_map_update_elem(&active_allocs, &ptr, &event, BPF_ANY);
    
    return 0;
}

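关键点：我们要记录的是实际分配到的内存地址，也就是 mallocgc 的返回值，而非入参。返回值只有在函数返回时才能从返回值寄存器（x86_64 上为 RAX，即 PT_REGS_RC）读到；在函数入口处，Go 1.17+ 的寄存器 ABI 下 RAX 存放的是第一个参数 size。因此读取返回值的探针必须挂在 mallocgc 各条 RET 指令的偏移上。由于 Go 运行时会移动 goroutine 栈，uretprobe 可能导致目标进程崩溃，不应使用。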

3. kprobe 监控缺页异常

/*
 * Upper bound used to discard kernel-space fault addresses.
 * NOTE(review): this is 2^47 - 1, which matches x86_64 with 4-level paging;
 * confirm for other architectures or 5-level paging.
 */
#define USER_ADDR_MAX 0x7fffffffffffULL

/*
 * kprobe on handle_mm_fault: for processes present in pid_filter, emit a
 * page_fault_event (pid, timestamp, faulting address, kernel stack id)
 * through the `events` perf buffer.
 *
 * The faulting address is taken from the second argument, which assumes the
 * handle_mm_fault(vma, address, flags, ...) signature.
 * pid_filter is expected to be declared elsewhere and populated from user
 * space with the target pids.
 */
SEC("kprobe/handle_mm_fault")
int trace_page_fault(struct pt_regs *ctx)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    /* Only trace processes explicitly selected by user space. */
    if (!bpf_map_lookup_elem(&pid_filter, &pid))
        return 0;

    u64 addr = PT_REGS_PARM2(ctx);

    /* Drop faults on kernel addresses. */
    if (addr > USER_ADDR_MAX)
        return 0;

    struct page_fault_event event = {};
    event.pid = pid;
    event.timestamp = bpf_ktime_get_ns();
    event.addr = addr;
    /* flags == 0 selects the kernel stack; a negative errno is stored as-is
     * when the stack cannot be captured. */
    event.stack_id = bpf_get_stackid(ctx, &stack_traces, 0);

    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));

    return 0;
}

用户态程序：Go 实现

用户态需要完成符号解析、事件关联和泄漏判定：

package main

import (
    "encoding/binary"
    "fmt"
    "os"
    "time"
    
    "github.com/cilium/ebpf"
    "github.com/cilium/ebpf/link"
    "github.com/cilium/ebpf/perf"
)

// AllocInfo is the user-space record of one tracked allocation.
type AllocInfo struct {
    Size      uint64 // length of the allocation in bytes; used for address-range matching
    Timestamp uint64 // allocation time in nanoseconds (DetectLeaks passes it to time.Unix(0, ...))
    StackID   uint32 // stack trace id; not read by the code shown here
    Comm      string // task name; not read by the code shown here
}

// LeakDetector holds the loaded BPF objects, the probe attachments, the perf
// reader and the user-space bookkeeping used to decide which allocations
// look leaked.
type LeakDetector struct {
    objs       *bpfObjects  // generated by bpf2go
    uprobeLink link.Link
    kprobeLink link.Link
    reader     *perf.Reader
    
    // Tracked allocations: start address -> AllocInfo
    activeAllocs map[uint64]*AllocInfo
    // Access log: allocation start address -> time of the last observed page fault
    accessLog    map[uint64]time.Time
}

// handlePageFault decodes one raw page_fault_event sample and, if the faulting
// address lies inside a tracked allocation, stamps that allocation as accessed
// "now" in accessLog.
//
// Only the addr field (bytes 16..24 of the record) is needed. The record is
// decoded as little-endian, i.e. it assumes the BPF program ran on a
// little-endian host. Samples too short to contain addr are ignored.
func (d *LeakDetector) handlePageFault(raw []byte) {
    const (
        addrStart = 16
        addrEnd   = 24
    )
    if len(raw) < addrEnd {
        // Truncated sample: slicing would panic, so drop it.
        return
    }
    addr := binary.LittleEndian.Uint64(raw[addrStart:addrEnd])

    if d.accessLog == nil {
        // Writing to a nil map panics; allocate lazily.
        d.accessLog = make(map[uint64]time.Time)
    }

    // Linear scan over tracked allocations; stop at the first range that
    // contains the faulting address.
    for ptr, info := range d.activeAllocs {
        // addr-ptr cannot wrap once ptr <= addr, unlike ptr+info.Size.
        if ptr <= addr && addr-ptr < info.Size {
            d.accessLog[ptr] = time.Now()
            break
        }
    }
}

// DetectLeaks returns the start addresses of tracked allocations that have
// been idle for longer than threshold.
//
// "Idle" is measured from the last access recorded in accessLog, or from the
// allocation timestamp when no access was ever recorded (which may mean
// pre-allocation rather than a leak). The order of the returned addresses
// follows map iteration and is therefore unspecified. The result is nil when
// nothing qualifies.
//
// NOTE(review): info.Timestamp is interpreted as nanoseconds since the Unix
// epoch. If it is filled from bpf_ktime_get_ns() (a monotonic clock), it must
// be rebased to wall-clock time first — confirm against the code that
// populates activeAllocs.
func (d *LeakDetector) DetectLeaks(threshold time.Duration) []uint64 {
    var stale []uint64
    current := time.Now()

    for addr, alloc := range d.activeAllocs {
        // Reference instant: last observed access, else the allocation time.
        reference, seen := d.accessLog[addr]
        if !seen {
            reference = time.Unix(0, int64(alloc.Timestamp))
        }
        if current.Sub(reference) > threshold {
            stale = append(stale, addr)
        }
    }
    return stale
}

工程优化：从原型到生产

1. 符号解析的偏移计算

Go 二进制默认保留 .symtab 符号表；但生产构建常用 -ldflags="-s -w" 或 strip 去除符号，此时需通过 .gopclntab 段恢复函数地址。对于 runtime.mallocgc 的 uprobe 挂载：

// mallocgcUprobeOffset returns the file offset of runtime.mallocgc inside the
// ELF binary at path. A uprobe is attached by file offset, not by virtual
// address, so the symbol's address is translated through the executable
// PT_LOAD segment that contains it:
//
//     offset = sym.Value - segment.Vaddr + segment.Off
//
// path must point at the traced program (for example /proc/<pid>/exe), not at
// the tracer itself. Only .symtab is consulted; a binary built with
// -ldflags="-s -w" has no .symtab and yields an error here (recovering the
// address from .gopclntab needs debug/gosym instead).
func mallocgcUprobeOffset(path string) (uint64, error) {
    const target = "runtime.mallocgc"

    file, err := elf.Open(path)
    if err != nil {
        return 0, fmt.Errorf("open %s: %w", path, err)
    }
    defer file.Close()

    syms, err := file.Symbols()
    if err != nil {
        return 0, fmt.Errorf("read symbol table of %s: %w", path, err)
    }

    for _, sym := range syms {
        if sym.Name != target {
            continue
        }
        for _, prog := range file.Progs {
            if prog.Type != elf.PT_LOAD || prog.Flags&elf.PF_X == 0 {
                continue
            }
            if sym.Value >= prog.Vaddr && sym.Value-prog.Vaddr < prog.Memsz {
                return sym.Value - prog.Vaddr + prog.Off, nil
            }
        }
        return 0, fmt.Errorf("%s at %#x is not inside an executable segment of %s", target, sym.Value, path)
    }
    return 0, fmt.Errorf("%s not found in %s", target, path)
}

2. 采样策略降低开销

全量追踪在大对象分配（> 32KB）场景下仍有 5-8% 的性能损耗。建议实施自适应采样：

/*
 * Fixed-rate sampling in eBPF: trace one mallocgc call out of every
 * sample_threshold calls, counted per process.
 */
static u64 sample_threshold = 1000;  /* sample 1 out of every 1000 allocations */

/*
 * Sampled variant of the mallocgc entry probe.
 *
 * sample_counter is expected to be a map declared elsewhere; it is looked up
 * and updated here with the process id as the key. The first call seen for
 * a process only seeds the counter. The increment is not atomic, so under
 * concurrent allocations the sampling rate is approximate.
 */
SEC("uprobe/runtime_mallocgc")
int trace_mallocgc_sampled(struct pt_regs *ctx)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    u64 *count = bpf_map_lookup_elem(&sample_counter, &pid);
    if (!count) {
        u64 init = 1;
        bpf_map_update_elem(&sample_counter, &pid, &init, BPF_ANY);
        return 0;
    }

    if (++(*count) % sample_threshold != 0)
        return 0;

    /* Run the tracing logic here ... */

    return 0;
}

3. 与 pprof 的互补使用

该方案不应替代 pprof，而是作为堆外内存专项诊断工具：

开发阶段：使用 pprof 分析 Go 堆分配热点
生产排障：当 RSS 增长但 Heap Inuse 平稳时，启用本方案检测 CGO/内核级泄漏

局限性与改进方向

Go 版本兼容性：runtime.mallocgc 的函数签名在 Go 1.16/1.17/1.20 间有变化，需维护多版本符号表
ARM64 支持：需适配 struct pt_regs 的寄存器布局差异
Off-CPU 分析：当前方案仅追踪内存分配与访问，结合 offwaketime 可进一步分析内存持有者的阻塞原因

总结

通过 uprobe 与 kprobe 的协同，我们构建了一个穿透用户态与内核态边界的内存追踪方案。相比传统工具，它能捕获 CGO 内存泄漏、内核网络栈积压等"隐形"内存消耗，为云原生环境下的 Go 应用提供了更完整的可观测性覆盖。

完整代码实现可参考 github.com/example/go-ebpf-memtrack（示例仓库，实际部署需根据内核版本调整 BPF CO-RE 配置）。

内核探险家 eBPF Go性能优化内存泄漏排查