Skip to content

MM Memory Cgroup

1. mem_cgroup 结构

1.1 struct mem_cgroup

文件: include/linux/memcontrol.h:600-800

c
/*
 * Simplified per-cgroup memory controller state used by the excerpts on
 * this page.
 *
 * NOTE(review): other excerpts on this page dereference memory.min,
 * oom_lock and events_local, none of which is declared in this struct —
 * confirm against the real kernel definition.
 */
struct mem_cgroup {
    struct cgroup_subsys_state css;
    struct mem_cgroup_lruvec __lruvec;
    struct lruvec_stat __lruvec_stat;
    atomic_long_t memory_events_local[VM_EVENTS_MAX];
    unsigned long flags;

    /* Memory limits */
    struct memory_limit {
        atomic64_t usage;
        atomic64_t soft_limit;
        u64 high;
        u64 max;
        u64 oom;
    } memory;

    /* Swap limits (reuses the struct memory_limit layout declared above) */
    struct memory_limit swap;

    /* Statistics */
    struct memcg_memory_data __memcg_data;
    struct lru_gen_folio *lrugen;

    /* Hierarchy */
    struct list_head children;
    struct mem_cgroup *parent;

    /* OOM handling */
    wait_queue_head_t oom_waitq;
    struct oom_control oom;
};

1.2 memory_stat 统计

文件: mm/memcontrol.c:2000-2100

c
/*
 * Memory event identifiers.
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the compiler would assign implicitly (0..6 in declaration order).
 */
enum memcg_memory_event {
    MEMCG_LOW       = 0,
    MEMCG_HIGH      = 1,
    MEMCG_MAX       = 2,
    MEMCG_OOM       = 3,
    /* Swap counterparts of the HIGH / MAX / OOM events above. */
    MEMCG_SWAP_HIGH = 4,
    MEMCG_SWAP_MAX  = 5,
    MEMCG_SWAP_OOM  = 6,
};

/*
 * Per-memcg statistic identifiers.
 *
 * Values are written out explicitly and match the implicit numbering of
 * the declaration order. The list is truncated in this excerpt.
 */
enum memcg_stat_item {
    MEMCG_CACHE      = 0,
    MEMCG_RSS        = 1,
    MEMCG_RSS_HUGE   = 2,
    MEMCG_SHMEM      = 3,
    MEMCG_SHMEM_HUGE = 4,
    MEMCG_FILE       = 5,
    MEMCG_FILE_HUGE  = 6,
    /* Paging / fault counters */
    MEMCG_PGPGIN     = 7,
    MEMCG_PGPGOUT    = 8,
    MEMCG_PGFAULT    = 9,
    MEMCG_PGMAJFAULT = 10,
    // ...
};

2. 内存计费

2.1 mem_cgroup_charge

文件: mm/memcontrol.c:3500-3600

c
/*
 * mem_cgroup_charge - charge a folio to the memcg of @mm (simplified).
 * @folio: folio being charged
 * @mm:    mm whose memcg receives the charge
 * @gfp:   allocation mask forwarded to try_charge()
 *
 * Returns 0 on success (including the root memcg, which is not charged),
 * or the error returned by try_charge().
 *
 * get_mem_cgroup_from_mm() hands back the memcg with a css reference held.
 * That reference is dropped on every path that does not publish the memcg
 * in folio->memcg_data; on success the folio keeps it.
 */
int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
                      gfp_t gfp)
{
    struct mem_cgroup *memcg;
    int ret = 0;

    memcg = get_mem_cgroup_from_mm(mm);
    if (memcg == root_mem_cgroup)
        goto out_put;

    ret = try_charge(memcg, gfp, folio_nr_pages(folio));
    if (ret)
        goto out_put;

    /* Record the owning memcg in the folio; the reference moves with it. */
    folio->memcg_data = (unsigned long)memcg;

    return 0;

out_put:
    /* Nothing was attached to the folio: release the lookup reference. */
    css_put(&memcg->css);
    return ret;
}

2.2 try_charge

文件: mm/memcontrol.c:3200-3350

c
/*
 * try_charge - attempt to charge @nr_pages to @memcg (simplified).
 * @memcg:    cgroup to charge
 * @gfp_mask: allocation mask forwarded to reclaim and the OOM path
 * @nr_pages: number of pages to charge
 *
 * Returns 0 when consume_charge() or reclaim_charge() succeeds, otherwise
 * -ENOMEM (after optionally invoking mem_cgroup_oom()).
 */
int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                unsigned int nr_pages)
{
    bool maybe_oom;
    int ret;

    /* First attempt: a non-zero result from consume_charge() means done. */
    if (consume_charge(memcg, nr_pages))
        return 0;

    /* Over the limit: try to reclaim. */
    ret = reclaim_charge(memcg, gfp_mask, nr_pages);
    if (ret == 0)
        return 0;

    /* Reclaim failed; OOM handling may be needed. */
    maybe_oom = !memcg->oom_lock.waiters;

    /* With max at PAGE_COUNTER_MAX, skip the retry and the HIGH event. */
    if (memcg->memory.max == PAGE_COUNTER_MAX)
        goto force_retry;

    /* Retry the charge once more after the failed reclaim. */
    if (consume_charge(memcg, nr_pages))
        return 0;

    /* Raise the memory.high event. */
    memcg_memory_event(memcg, MEMCG_HIGH);

force_retry:
    if (maybe_oom) {
        /* Hand over to the memcg OOM path (order 0). */
        mem_cgroup_oom(memcg, gfp_mask, 0);
    }

    return -ENOMEM;
}

3. 内存回收

3.1 memcg_reclaim

文件: mm/vmscan.c:5500-5600

c
static int memcg_reclaim(struct mem_cgroup *memcg, int nr_to_reclaim,
                         unsigned long *nr_reclaimed)
{
    struct scan_control sc = {
        .nr_to_reclaim = nr_to_reclaim,
        .gfp_mask = GFP_KERNEL,
        .may_writepage = 1,
        .may_unmap = 1,
        .may_swap = 1,
        .target_mem_cgroup = memcg,
    };

    return try_to_free_mem_cgroup_pages(memcg, &sc, nr_reclaimed);
}

3.2 try_to_free_mem_cgroup_pages

文件: mm/vmscan.c:5450-5500

c
/*
 * try_to_free_mem_cgroup_pages - run reclaim passes for a memcg (simplified).
 * @memcg:        target cgroup (the scan itself is driven through @sc)
 * @sc:           scan parameters; nr_reclaimed is reset and priority is
 *                consumed by this function
 * @nr_reclaimed: out parameter, receives sc->nr_reclaimed
 *
 * Returns 0 when at least sc->nr_to_reclaim pages were reclaimed,
 * -EAGAIN otherwise.
 */
int try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                 struct scan_control *sc,
                                 unsigned long *nr_reclaimed)
{
    struct zonelist *zonelist =
        &NODE_DATA(numa_node_id())->node_zonelists[ZONELIST_FALLBACK];

    sc->nr_reclaimed = 0;

    /*
     * One pass per priority level. The priority must be lowered after each
     * pass; otherwise the loop never ends when reclaim cannot reach the
     * target.
     */
    do {
        shrink_node_memcgs(zonelist, sc);
    } while (sc->nr_reclaimed < sc->nr_to_reclaim &&
             --sc->priority >= 0);

    *nr_reclaimed = sc->nr_reclaimed;
    return sc->nr_reclaimed >= sc->nr_to_reclaim ? 0 : -EAGAIN;
}

4. Memory Pressure

4.1 memory_pressure_handle

文件: mm/memcontrol.c:4000-4100

c
/*
 * memory_pressure_handle - notify user space once usage reaches the soft
 * limit (simplified).
 * @memcg: cgroup to evaluate
 *
 * Does nothing while usage is below memory.soft_limit. Otherwise computes
 * a pressure percentage relative to [min, soft_limit] and signals the
 * MEMCG_LOW event file.
 */
static void memory_pressure_handle(struct mem_cgroup *memcg)
{
    /*
     * Named "usage" rather than "current": in kernel code "current" is a
     * macro for the running task and cannot be used as a variable name.
     */
    u64 usage, min, low;
    u64 pressure;

    /* Sample the counters used for the pressure calculation. */
    usage = atomic64_read(&memcg->memory.usage);
    min = atomic64_read(&memcg->memory.min);
    low = atomic64_read(&memcg->memory.soft_limit);

    if (usage < low)
        return;

    /* Pressure as a percentage; usage >= low > min here, so no underflow. */
    if (low > min)
        pressure = (usage - min) * 100 / (low - min);
    else
        pressure = 100;

    /*
     * Notify user space.
     * NOTE(review): "pressure" is computed above but not passed anywhere
     * in this excerpt.
     */
    cgroup_file_notify(&memcg->events_local[MEMCG_LOW]);
}

4.2 cgroup 压力通知

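文件: kernel/sched/psi.c、kernel/cgroup/cgroup.c

cgroup v2 的 memory.pressure 接口由 PSI(Pressure Stall Information)机制实现:PSI 在 kernel/sched/psi.c 中统计任务因内存不足而停顿的时间,cgroup 核心在 kernel/cgroup/cgroup.c 中将其导出为 memory.pressure 文件,用户空间可通过 poll() 注册触发阈值来接收通知。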

5. OOM 处理

5.1 mem_cgroup_oom

文件: mm/memcontrol.c:1706-1723

c
/*
 * mem_cgroup_oom - handle an out-of-memory condition inside @memcg.
 * @memcg: cgroup that ran out of memory
 * @mask:  allocation mask of the failing charge
 * @order: allocation order of the failing charge
 *
 * Returns false for orders above PAGE_ALLOC_COSTLY_ORDER or when
 * memcg1_oom_prepare() declines; otherwise returns the result of
 * mem_cgroup_out_of_memory().
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
    /* Both locals are used below; the excerpt had dropped the declaration. */
    bool locked, ret;

    if (order > PAGE_ALLOC_COSTLY_ORDER)
        return false;

    memcg_memory_event(memcg, MEMCG_OOM);

    if (!memcg1_oom_prepare(memcg, &locked))
        return false;

    ret = mem_cgroup_out_of_memory(memcg, mask, order);

    /* Pair with memcg1_oom_prepare(), passing back the state it reported. */
    memcg1_oom_finish(memcg, locked);

    return ret;
}

5.2 mem_cgroup_out_of_memory

文件: mm/memcontrol.c:1673-1700

c
/*
 * mem_cgroup_out_of_memory - invoke the OOM killer on behalf of @memcg.
 * @memcg:    cgroup that is out of memory
 * @gfp_mask: allocation mask of the failing charge
 * @order:    allocation order of the failing charge
 *
 * Returns true if taking oom_lock was interrupted or if the cgroup already
 * has a margin of at least (1 << order); otherwise returns the result of
 * out_of_memory().
 */
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg,
                                     gfp_t gfp_mask, int order)
{
    struct oom_control oc = {
        .zonelist = NULL,
        .nodemask = NULL,
        .memcg = memcg,
        .gfp_mask = gfp_mask,
        .order = order,
    };
    bool result;

    /* Interrupted while waiting for the lock: report true, lock not held. */
    if (mutex_lock_killable(&oom_lock) != 0)
        return true;

    /* Check the margin under the lock before running the OOM killer. */
    if (mem_cgroup_margin(memcg) >= (1 << order))
        result = true;
    else
        result = out_of_memory(&oc);

    mutex_unlock(&oom_lock);
    return result;
}

6. 软限制回收

6.1 memcg1_soft_limit_reclaim

文件: mm/vmscan.c:5700-5800

c
/*
 * memcg1_soft_limit_reclaim - reclaim from a memcg that is over its soft
 * limit (skeleton; the reclaim step is elided in this excerpt).
 * @memcg:      cgroup to reclaim from
 * @priority:   not referenced by the visible code
 * @nr_scanned: not referenced by the visible code
 *
 * Returns 0 immediately when usage is below the soft limit, otherwise the
 * number of pages reclaimed by the loop.
 */
unsigned long memcg1_soft_limit_reclaim(struct mem_cgroup *memcg,
                                        int priority, unsigned long *nr_scanned)
{
    unsigned long reclaimed = 0;
    unsigned long target = 1;
    unsigned long soft_limit;

    /* Read the soft limit first, then compare current usage against it. */
    soft_limit = atomic64_read(&memcg->memory.soft_limit);
    if (atomic64_read(&memcg->memory.usage) < soft_limit)
        return 0;

    /* Scan the LRU until the target has been reclaimed. */
    while (reclaimed < target) {
        /* Reclaim step (elided in this excerpt). */
        // ...
    }

    return reclaimed;
}

7. 层级限制继承

c
// 子 cgroup 继承父 cgroup 的限制
static void memcg_update_limits(struct mem_cgroup *memcg)
{
    struct mem_cgroup *parent = memcg->parent;

    if (parent) {
        // 子 cgroup 的 max 不能超过父
        if (memcg->memory.max > parent->memory.max)
            memcg->memory.max = parent->memory.max;

        // 子 cgroup 的 soft_limit 不能超过 max
        if (memcg->memory.soft_limit > memcg->memory.max)
            memcg->memory.soft_limit = memcg->memory.max;
    }
}

8. 关键源码位置

| 函数 | 文件 | 行号 |
| --- | --- | --- |
| mem_cgroup_charge | mm/memcontrol.c | 3500 |
| try_charge | mm/memcontrol.c | 3200 |
| memcg_reclaim | mm/vmscan.c | 5500 |
| try_to_free_mem_cgroup_pages | mm/vmscan.c | 5450 |
| mem_cgroup_oom | mm/memcontrol.c | 1706 |
| mem_cgroup_out_of_memory | mm/memcontrol.c | 1673 |
| memcg1_soft_limit_reclaim | mm/vmscan.c | 5700 |

基于 VitePress 构建