Home linux Slab
Post
Cancel

linux Slab

一、slab

1
2
3
4
https://zhuanlan.zhihu.com/p/166649492 Linux内存管理:slub分配器
https://www.jianshu.com/p/95d68389fbd1 slab分配器
细节拉满,80 张图带你一步一步推演 slab 内存池的设计与实现
https://segmentfault.com/a/1190000043626203  

​ 内核中的物理内存由伙伴系统(buddy system)进行管理,它的分配粒度是以物理页帧(page)为单位的,但内核中有大量的数据结构只需要若干bytes的空间,倘若仍按页来分配,势必会造成大量的内存被浪费掉。slab分配器的出现就是为了解决内核中这些小块内存分配与管理的难题。slab分配器是基于buddy页分配器,在它上面实现了一层面向对象的缓存管理机制。

1) 主要数据结构

1 struct kmem_cache

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/* Descriptor of a single slab cache (abridged: only selected members are shown). */
struct kmem_cache {
        /*
         * Per-CPU variable that implements a slab cache on each CPU. Benefits:
         * 1. cpu_slab->freelist can be accessed without a lock, which avoids
         *    contention and speeds up allocation.
         * 2. Objects handed out from the local CPU cache are accessed by the same
         *    CPU, which improves the TLB hit rate for objects (a page holds
         *    several objects and they all share the same PTE).
         */
        struct kmem_cache_cpu __percpu *cpu_slab;
        slab_flags_t flags;        /* The members below are set when the kmem_cache is initialised.
         flags: flags consulted during allocation */
        unsigned long min_partial;/* Minimum number of slabs that is kept when kmem_cache_shrink
        trims the partial slabs. Set via set_min_partial(s, ilog2(s->size)/2). */
        unsigned int size; /* Actual size of an object, including metadata and alignment padding */
        unsigned int object_size;	/* Size of the object's payload, i.e. the real size of the target data structure */
        unsigned int offset;/* Every free object stores the address of the next free object,
        not at the object's start address but at start address + offset */
        struct kmem_cache_order_objects oo; /* This struct is really just an unsigned int:
         (page_order << 16) | (slab_object_num & 0xFFFF),
         e.g. x = 0x30013 encodes order 3 and 0x13 (19) objects */
        /* Allocation and freeing of slabs */
        struct kmem_cache_order_objects max;
        struct kmem_cache_order_objects min;
        gfp_t allocflags;       /* gfp flags to use on each alloc; the standard gfp mask
        used when pages are allocated from the buddy allocator */
        int refcount;           /* Refcount for slab cache destroy */
        void (*ctor)(void *); 	/* Object constructor, usually unused */
        unsigned int inuse;    	/* Offset within the object to the metadata */
        unsigned int align;    	/* Alignment. Note: slab usually aligns in one of two ways:
        1. to the processor word size; 2. to the cache line size. */
        unsigned int red_left_pad;      /* Left redzone padding size;
         only meaningful when REDZONE is set in flags */
    	const char *name;/* Cache name, e.g. mm_struct, task_struct */
    	struct list_head list; 	/* List node that links this kmem_cache onto the slab_caches list */
		/* The next two members describe a region inside the object whose contents userspace may access. See the implementation of kmem_cache_create_usercopy for details. */
        unsigned int useroffset;   /* e.g. offsetof(struct ext4_inode_info, i_data); ext4_inode_info->i_data is accessible from user space */
        unsigned int usersize;    /* sizeof_field(struct ext4_inode_info, i_data) */
        struct kmem_cache_node *node[MAX_NUMNODES];	/* One array entry per node; kmem_cache_node holds the partial slab list */
};
/* Per-CPU state of a slab cache (referenced through kmem_cache.cpu_slab). */
struct kmem_cache_cpu {
        void **freelist;      /* Points to the first free object in the slab referenced by `page` below */
    	/* Globally unique transaction id */
        unsigned long tid;      
        struct page *page; /* Slab that is currently in use */     
        struct page *partial; /* Partial slab list of the local (per-CPU) slab cache pool */
};

/* Per-node state of a slab cache (one entry of kmem_cache.node[]). */
struct kmem_cache_node {
        spinlock_t list_lock;   	/* Spinlock protecting this kmem_cache_node; it may be accessed from multiple cores */
        unsigned long nr_partial;    	/* Number of slabs in the node; presumably those on the partial list below, going by the name */
        struct list_head partial;    	/* Head of the partial slab list */
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
crash> struct kmem_cache -x ffff8001f8e80800
struct kmem_cache {
  cpu_slab = 0xfffefdfe40f4d3d0, 
  flags = 0x40002000, 
  min_partial = 0x5, 
  size = 0x680, 
  object_size = 0x640, 
  offset = 0x640, 
  cpu_partial = 0x6, 
  oo = {
    x = 0x30013
  }, 
  max = {
    x = 0x30013
  }, 
  min = {
    x = 0x2
  }, 
  allocflags = 0x4000, 
  refcount = 0x1, 
  ctor = 0xffff000000aba000, 
  inuse = 0x640, 
  align = 0x40, 
  red_left_pad = 0x0, 
  name = 0xffff8001facd7d00 "slab_test_cachep", 
  list = {
    next = 0xffff8000fee71260, 
    prev = 0xffff000081308bb0 <slab_caches>
  }, 
  kobj = {
    name = 0xffff8001facd7c00 "slab_test_cachep", 
    entry = {
      next = 0xffff8000fc8ba800, 
      prev = 0xffff8000fee71278
    }, 
    parent = 0xffff8000fc8ba818, 
    kset = 0xffff8000fc8ba800, 
    ktype = 0xffff000081311898 <slab_ktype>, 
    sd = 0xffff8001f94cba18, 
    kref = {
      refcount = {
        refs = {
          counter = 0x1
        }
      }
    }, 
    state_initialized = 0x1, 
    state_in_sysfs = 0x1, 
    state_add_uevent_sent = 0x1, 
    state_remove_uevent_sent = 0x0, 
    uevent_suppress = 0x0, 
    kabi_reserved1 = 0x0, 
    kabi_reserved2 = 0x0, 
    kabi_reserved3 = 0x0, 
    kabi_reserved4 = 0x0
  }, 
  kobj_remove_work = {
    data = {
      counter = 0xfffffffe0
    }, 
    entry = {
      next = 0xffff8001f8e808d8, 
      prev = 0xffff8001f8e808d8
    }, 
    func = 0xffff00008033f4d8 <sysfs_slab_remove_workfn>, 
    kabi_reserved1 = 0x0, 
    kabi_reserved2 = 0x0, 
    kabi_reserved3 = 0x0, 
    kabi_reserved4 = 0x0
  }, 
  memcg_params = {
    root_cache = 0x0, 
    {
      {
        memcg_caches = 0x0, 
        __root_caches_node = {
          next = 0xffff8000fee71320, 
          prev = 0xffff000081308ba0 <slab_root_caches>
        }, 
        children = {
          next = 0xffff8001f8e80930, 
          prev = 0xffff8001f8e80930
        }, 
        dying = 0x0
      }, 
      {
        memcg = 0x0, 
        children_node = {
          next = 0xffff8000fee71320, 
          prev = 0xffff000081308ba0 <slab_root_caches>
        }, 
        kmem_caches_node = {
          next = 0xffff8001f8e80930, 
          prev = 0xffff8001f8e80930
        }, 
        deact_fn = 0x0, 
        {
          deact_rcu_head = {
            next = 0x0, 
            func = 0x0
          }, 
          deact_work = {
            data = {
              counter = 0x0
            }, 
            entry = {
              next = 0x0, 
              prev = 0x0
            }, 
            func = 0x0, 
            kabi_reserved1 = 0x0, 
            kabi_reserved2 = 0x0, 
            kabi_reserved3 = 0x0, 
            kabi_reserved4 = 0x0
          }
        }
      }
    }
  }, 
  max_attr_size = 0x0, 
  memcg_kset = 0x0, 
  remote_node_defrag_ratio = 0x3e8, 
  random_seq = 0xffff8001facd7800, 
  useroffset = 0x0, 
  usersize = 0x0, 
  node = {0xffff8000fbaab180, 0xffff8001fb802580, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff000080f980a0, 0x400c0000, 0x5, 0x8000000088, 0x1e00000080, 0x1e0000001e}
}
1
2
3
4
PS:
每个物理页都对应一个struct page结构体,结构体中有个联合体,其中定义了一些slab分配器要用到的成员。若
page用于slab,则下面成员将生效并被使用,代码如下。需要注意的是这里也有个freelist,它指向所属slab
第一个free object, 不能和kmem_cache_cpu中的freelist混淆。

2) 常用函数

1 kmem_cache_create

创建一个缓存管理描述符kmem_cache;

1
2
3
4
*name是一個字符串,存放kmem_cache緩存的名字;size是緩存所存放的對象的大小;align是對象所要求的
對齊大小;flags是可選的配置項,用來控制緩存的行爲。最後一個參數ctor是對象的構造函數,一般是不需要的,以
NULL來代替。kmem_cache_create()成功執行之後會返回一個指向所創建的緩存的指針,否則返回NULL。
kmem_cache_create()可能會引起阻塞(睡眠),因此不能在中斷上下文中使用。
1
2
3
crash> kmem -s slab_test_cachep
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff8001fb843600     1600          0         0      0    32k  slab_test_cachep
2 kmem_cache_alloc申请object
1
2
3
4
//1个slabs,申请8x4K, 1个slab有19个object, 目前被申请走2个, object的真实大小为1600bytes.
crash> kmem -s slab_test_cachep
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff8001f8e80800     1600          2        19      1    32k  slab_test_cachep

object的分配通过kmem_cache_alloc()接口,实际分配object的过程会存在以下几种情形:

1> fast path

即可直接从本地cpu缓存中的freelist拿到可用object

1
2
3
4
5
6
7
8
kmem_cache_alloc
  slab_alloc
    slab_alloc_node
      -->object = c->freelist                                  //本地cpu缓存的freelist有可用的object
      -->void *next_object=get_freepointer_safe(s, object);    //获取next object的地址,用于后面更新freelist
      -->this_cpu_cmpxchg_double                               //更新cpu_slab->freelist和cpu_slab->tid
      -->prefetch_freepointer(s, next_object);                 //优化语句,将next object的地址放入cacheline,提高后面用到时的命中率
      -->stat(s, ALLOC_FASTPATH);                              //设置状态为ALLOC_FASTPATH
2> slow path

本地cpu缓存中的freelist为NULL,但本地cpu缓存中的partial中有未满的slab

1
2
3
4
5
6
7
8
9
kmem_cache_alloc
  slab_alloc
    slab_alloc_node
      __slab_alloc                                                                //分配过程关闭了本地中断
        ___slab_alloc
          -->page = c->page为NULL的情况下               //即本地cpu缓存中当前在使用的slab的free object已经分完
          -->goto new_slab;                             //跳转到new_slab,从本地缓存池的partial取一个slab赋给page,并跳转到redo
          -->freelist = get_freelist(s, page)           //获取page中的freelist(注意:此freelist为struct page中的,并非本地cpu缓存的freelist)
          -->c->freelist = get_freepointer(s, freelist) //将freelist重新赋给kmem_cache_cpu中的freelist     
3> very slow path

本地cpu缓存中的freelist为NULL,且本地cpu缓存中的partial也无slab可用。

1
2
3
4
5
6
7
8
9
10
11
kmem_cache_alloc
  slab_alloc
    slab_alloc_node
      __slab_alloc         //分配过程关闭了本地中断
        ___slab_alloc
          -->page = c->page为NULL的情况下  //即本地cpu缓存中当前在使用的slab的free object已经分完
          -->goto new_slab;   //跳转到new_slab,通过slub_percpu_partial(c)检查到本地cpu缓存池中partial无slab可用。
          -->freelist = new_slab_objects(s, gfpflags, node, &c); //此函数中会出现两种情况:情况 
          //1.当前node对应的kmem_cache_node中有可用partial slab,并从中获取slab分给本地cpu缓冲池。
          // 情况2.当前node对应的kmem_cache_node无可用的partial slab,通过new_slab->allocate_slab->alloc_slab_page->alloc_pages从buddy分配器申请内存并创建新的 slab。两种情况最终都会返回一个可用的freelist
          -->c->freelist = get_freepointer(s, freelist)  //将freelist重新赋给kmem_cache_cpu中的freelist 
3 kmem_cache_free释放object
4 kmem_cache_destroy销毁kmem_cache
1
2
3
調用kmem_cache_destroy()之前應該滿足下面幾個條件:首先,cachep所指向的緩存中所有slab都爲空閒,否則
的話是不可以撤銷的;其次在調用kmem_cache_destroy()過程中以及調用之後,調用者需要確保不會再訪問這個緩
存;最後,該函數也可能會引起阻塞,因此不能在中斷上下文中使用。

二、slab_debug/trace

1
2
Reference:
http://www.wowotech.net/memory_management/427.html SLUB DEBUG机制

1 slab_debug机制

1
2
3
CONFIG_SLUB=y
CONFIG_SLUB_DEBUG=y
CONFIG_SLUB_DEBUG_ON=y

SLUB内存检测功能在某些情况下不能立刻检测出来,必须主动触发,因此我们需要借助slabinfo命令触发SLUB allocator检测功能.

CONFIG_SLUB_DEBUG_ON:

1
kmem_cache->flags = SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER

SLUB DEBUG关闭的情况下, free pointer是内嵌在object之中的, 但是SLUB DEBUG打开之后, free pointer是在object之外(位于object之后)。之所以将FP后移, 是为了检测use-after-free问题: 当free object时会将object填充为magic num(0x6b), 如果不后移的话, 岂不是破坏了object之间的单链表关系。

1
2
3
4
5
6
/* Red-zone fill bytes; judging by the names, one value for inactive and one for active objects */
#define SLUB_RED_INACTIVE   0xbb
#define SLUB_RED_ACTIVE     0xcc
/* ...and for poisoning */
#define POISON_INUSE         0x5a    /* for use-uninitialised poisoning */
#define POISON_FREE          0x6b    /* for use-after-free poisoning */
#define POISON_END           0xa5    /* end-byte of poisoning */

2 slab debug使用

1
2
3
Reference:
http://linuxperf.com/?p=184 如何诊断SLUB问题
https://blog.csdn.net/thwack/article/details/79865758  slab_trace
1) slab trace
1>观察slabinfo

启动后记录下slabinfo。运行一段时间,再观察slabinfo。找到增长比较大的slab。

1
cat /proc/slabinfo
2>打开slab trace
1
2
3
echo 1 > /sys/kernel/slab/<leaking_slab>/trace
sleep 60
echo 0 > /sys/kernel/slab/<leaking_slab>/trace
3> 打开以后slab trace会向console打印(串口/dmesg日志输出)

如果console是串口的话很有可能把系统打的无响应。最好写一个脚本,运行一段时间后关闭slab trace。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
[ 2144.128477] INFO: Slab 0x000000004afbdf16 objects=19 used=1 fp=0x00000000e8006aea flags=0x7ffff0000008100
[ 2144.128483] CPU: 3 PID: 31658 Comm: rmmod Tainted: G    B      OE     4.19.90-vhulk2103.1.0.h469.eulerosv2r10.aarch64 #1
[ 2144.128485] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 2144.128487] Call trace:
[ 2144.128496]  dump_backtrace+0x0/0x198
[ 2144.128499]  show_stack+0x24/0x30
[ 2144.128504]  dump_stack+0xb0/0x100
[ 2144.128510]  slab_err+0xc0/0xe8
[ 2144.128513]  __kmem_cache_shutdown+0x1c0/0x408
[ 2144.128517]  shutdown_cache+0x20/0x1d8
[ 2144.128519]  kmem_cache_destroy+0x26c/0x2e0
[ 2144.128527]  kmem_cache_create_exit+0x14/0xfd8 [kmem_create]
[ 2144.128532]  __arm64_sys_delete_module+0x1a4/0x2b8
[ 2144.128535]  el0_svc_common+0x80/0x1c0
[ 2144.128538]  el0_svc_handler+0x78/0xe0
[ 2144.128541]  el0_svc+0x10/0x260
[ 2144.128546] INFO: Object 0x0000000076a78344 @offset=4992
[ 2144.128549] kmem_cache_destroy slab_test_cachep: Slab cache still has objects
2) slab debug
3) slab merge

三、kmemleak

This post is licensed under CC BY 4.0 by the author.